1. 修改Nutch源代码,让Nutch基于Hypertable工作,可以参考Hbase的实现. 由于该实现缺失Nutch好多特性,而且不易升级,考虑作罢.
2. 将Nutch抓取结果以命令导出为text的dump文件,然后用MapReduce解析该文件,把相关信息写入到Hypertable.
3. 其实和第一种一样,只不过是直接使用人家已经改好的基于Hbase的实现,然后导出一份tsv文件导入到Hypertable. 不仅融合了第一种的缺点还增加了麻烦. 不考虑.
好,以下代码基于第二种思想实现.
package nutchdump; import java.io.IOException; import java.sql.Timestamp; import java.util.Iterator; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.Tool; import org.apache.thrift.TException; import org.apache.thrift.transport.TTransportException; import org.hypertable.thrift.ThriftClient; import org.hypertable.thriftgen.Cell; import org.hypertable.thriftgen.ClientException; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.util.Tool; /** * NutchDumpReader * *Reads the dump entries from nutch dump command output, get each line result to *write into hypertable database as special format *由于只保存抓取的网页内容,所以只关心Nutch导出的文件中,Content::这一块的相关信息 * * @author(lovejuan1314) */ public class NutchDumpReader extends Configured implements Tool{ // where to put the data in hdfs when we're done private static final String OUTPUT_PATH = "nutch_content_result"; // where to read the data from. 
private static final String INPUT_PATH = "/shared/nutch/segdump"; static class NutchReaderMapper extends MapReduceBase implements Mapper{ public NutchReaderMapper() { } public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { String dumpline = value.toString(); NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(dumpline); String version = nutchDumpRecord.getVersion(); if (version != null){ output.collect(new Text("version"), new Text(version)); } String base = nutchDumpRecord.getBase(); if (base != null){ output.collect(new Text("base"), new Text(base)); } String ContentType = nutchDumpRecord.getContentType(); if (ContentType != null){ output.collect(new Text("ContentType"), new Text(ContentType)); } String metadata = nutchDumpRecord.getMetadata(); if (metadata != null){ output.collect(new Text("metadata"), new Text(metadata)); } String url = nutchDumpRecord.getUrl(); if (url != null){ output.collect(new Text("url"), new Text(url)); } String content = nutchDumpRecord.getContent(); if (content != null){ output.collect(new Text("content"), new Text(content)); } } } static class NutchReaderReducer extends MapReduceBase implements Reducer { public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { String valKey = key.toString(); while(values.hasNext()){ Text val = values.next(); if (val.toString() != null){ //write into hypertable writeIntoTable(valKey,val.toString()); // output output.collect(key, NullWritable.get()); } } } } /** * * @param colName * @param colValue */ private static void writeIntoTable(String colName,String colValue){ try { ThriftClient client = ThriftClient.create("192.168.0.40", 38080); // mutator examples long mutator = client.open_mutator("webDb", 0, 0); Timestamp ts = new Timestamp(System.currentTimeMillis()); try { Cell cell = new Cell(); String sysDt = ts.toString(); //设置行关键字 我使用了系统时间+反转URL的格式 cell.row_key = sysDt+" "+"com.mytest.www"; 
//列名 cell.column_family = colName; //列值 cell.value = colValue.getBytes(); client.set_cell(mutator, cell); } finally { client.close_mutator(mutator, true); } } catch (TTransportException e) { e.printStackTrace(); } catch (TException e) { e.printStackTrace(); }catch (ClientException ex){ ex.printStackTrace(); } } /** Driver for the actual MapReduce process */ private void runJob() throws IOException{ JobConf conf = new JobConf(getConf(),NutchDumpReader.class); FileInputFormat.addInputPath(conf, new Path(INPUT_PATH)); FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH)); conf.setMapperClass(NutchReaderMapper.class); conf.setReducerClass(NutchReaderReducer.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(NullWritable.class); conf.setMapOutputValueClass(Text.class); JobClient.runJob(conf); } public int run(String[] arg0) throws Exception { runJob(); return 0; } public static void main(String [] args) throws Exception { int ret = ToolRunner.run(new NutchDumpReader(), args); System.exit(ret); } }
package nutchdump; public class NutchDumpRecord { // the actual line from dump file private String record; // the fileds on the line private String version; private String url; private String base; private String ContentType; private String metadata; private String content; //public NutchDumpFileRecord public NutchDumpRecord(final String record){ if (record == null){ this.record = ""; }else{ this.record = record; } this.parse(); } protected void parse(){ int versionIdx = this.record.indexOf("Version:"); int urlIdx = this.record.indexOf("url:"); int baseIdx = this.record.indexOf("base:"); int contentTypeIdx = this.record.indexOf("contentType:"); int metadataIdx = this.record.indexOf("metadata"); int contentIdx = this.record.indexOf("Content:"); if (versionIdx != -1){ this.version = this.record.substring(versionIdx).trim(); } if (urlIdx != -1){ this.url = this.record.substring(urlIdx).trim(); } if (baseIdx != -1){ this.base = this.record.substring(baseIdx).trim(); } if (contentTypeIdx != -1){ this.ContentType = this.record.substring(contentTypeIdx).trim(); } if (metadataIdx != -1){ this.metadata = this.record.substring(metadataIdx).trim(); } if (contentIdx != -1){ this.content = this.record.substring(contentIdx).trim(); } } // getters /** Return the record */ public String getRecord(){ return this.record; } public String getVersion(){ return this.version; } public String getUrl(){ return this.url; } public String getBase(){ return this.base; } public String getContentType(){ return this.ContentType; } public String getMetadata(){ return this.metadata; } public String getContent(){ return this.content; } }
//这个类是Hypertable源码中提供的. /* * To change this template, choose Tools | Templates * and open the template in the editor. */ /** * Copyright (C) 2008 Luke Lu (Zvents, Inc.) * * This file is distributed under the Apache Software License * (http://www.apache.org/licenses/) */ package nutchdump; import org.hypertable.thriftgen.*; import org.apache.thrift.TException; import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TFramedTransport; import org.apache.thrift.transport.TTransportException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.protocol.TProtocol; public class ThriftClient extends HqlService.Client { public ThriftClient(TProtocol protocol) { super(protocol); } // Java only allow super as the first statement of constructor. It doesn't // support multiple inheritance to use the base-from-member idiom either. So, // we're resorting to a static factory method here. public static ThriftClient create(String host, int port, int timeout_ms, boolean do_open) throws TTransportException, TException { TFramedTransport transport = new TFramedTransport( new TSocket(host, port, timeout_ms)); ThriftClient client = new ThriftClient(new TBinaryProtocol(transport)); client.transport = transport; if (do_open) client.open(); return client; } // Java doesn't support default argument values, which makes things // unnecessarily verbose here public static ThriftClient create(String host, int port) throws TTransportException, TException { return create(host, port, 30000, true); } public void open() throws TTransportException, TException { transport.open(); do_close = true; } public void close() { if (do_close) { transport.close(); do_close = false; } } private TFramedTransport transport; private boolean do_close = false; }
代码完成后直接打成jar包,在hadoop环境下运行就可以了.
Ps:仅供参考,如果大家有什么更好的方法,欢迎讨论. 另外代码里也没有严格控制数据的一致性,若要在产品上运行还得进一步修改.