- 2010-05-31 10:50:17||||||http://www.360buy.com/product/201185.html
2010-05-31 10:50:17||||||http://www.360buy.com/product/201185.html
日志的字段分隔符是“|||”，这样做是为了尽可能避免日志正文中出现与分隔符相同的字符而导致数据混淆。
Hive 的内部字段分隔符是“\001”，所以在读入数据时需要把“|||”转换成“\001”。
- package com.jd.cloud.clickstore;
- import java.io.IOException;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.FileSplit;
- import org.apache.hadoop.mapred.InputSplit;
- import org.apache.hadoop.mapred.JobConf;
- import org.apache.hadoop.mapred.JobConfigurable;
- import org.apache.hadoop.mapred.RecordReader;
- import org.apache.hadoop.mapred.Reporter;
- import org.apache.hadoop.mapred.TextInputFormat;
- public class ClickstreamInputFormat extends TextInputFormat implements
- JobConfigurable {
- public RecordReader<LongWritable, Text> getRecordReader(
- InputSplit genericSplit, JobConf job, Reporter reporter)
- throws IOException {
- reporter.setStatus(genericSplit.toString());
- return new ClickstreamRecordReader(job, (FileSplit) genericSplit);
- }
- }
package com.jd.cloud.clickstore;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
* 自定义hadoop的 org.apache.hadoop.mapred.InputFormat
* @author winston
public class ClickstreamInputFormat extends TextInputFormat implements
JobConfigurable {
public RecordReader<LongWritable, Text> getRecordReader(
InputSplit genericSplit, JobConf job, Reporter reporter)
throws IOException {
return new ClickstreamRecordReader(job, (FileSplit) genericSplit);
- public synchronized boolean next(LongWritable key, Text value)
- throws IOException {
- while (pos < end) {
- key.set(pos);
- int newSize = in.readLine(value, maxLineLength,
- Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
- maxLineLength));
- String strReplace = value.toString().toLowerCase().replaceAll("\\|\\|\\|" , "\001" );
- Text txtReplace = new Text();
- txtReplace.set(strReplace );
- value.set(txtReplace.getBytes(), 0, txtReplace.getLength());
- if (newSize == 0) {
- return false;
- }
- pos += newSize;
- if (newSize < maxLineLength) {
- return true;
- }
- LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
- }
- return false;
- }
/** Read a line. */
public synchronized boolean next(LongWritable key, Text value)
throws IOException {
while (pos < end) {
int newSize = in.readLine(value, maxLineLength,
Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
String strReplace = value.toString().toLowerCase().replaceAll("\\|\\|\\|" , "\001" );
Text txtReplace = new Text();
txtReplace.set(strReplace );
value.set(txtReplace.getBytes(), 0, txtReplace.getLength());
if (newSize == 0) {
return false;
pos += newSize;
if (newSize < maxLineLength) {
return true;
// line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
return false;
-- Step 1: create a Hive table with three string columns (time, ip, url) whose rows are
-- read through the custom ClickstreamInputFormat and written through
-- HiveIgnoreKeyTextOutputFormat.
-- NOTE(review): LOCATION names a path ending in ".txt", and the LOAD statement below
-- reads the same path from the local filesystem - confirm that this is intended.
- create table clickstream_table(time string, ip string, url string) stored as INPUTFORMAT 'com.jd.cloud.clickstore.ClickstreamInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION '/data/clickstream_20110216.txt';
create table clickstream_table(time string, ip string, url string) stored as INPUTFORMAT 'com.jd.cloud.clickstore.ClickstreamInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION '/data/clickstream_20110216.txt';
-- Step 2: load the local log file into the table, replacing any existing contents.
- LOAD DATA LOCAL INPATH '/data/clickstream_20110216.txt' OVERWRITE INTO TABLE clickstream_table;
LOAD DATA LOCAL INPATH '/data/clickstream_20110216.txt' OVERWRITE INTO TABLE clickstream_table;
-- Step 3: read every row back to check the result.
select * from clickstream_table;