Nutch 研究<三> 将Nutch爬取结果放入Hypertable

阅读更多
想把Nutch抓取的web page结果放入到Hypertable中去,目前思路主要有三个:

1. 修改Nutch源代码,让Nutch基于Hypertable工作,可以参考Hbase的实现. 由于该实现缺失Nutch好多特性,而且不易升级,考虑作罢.

2. 将Nutch抓取结果以命令导出为text的dump文件,然后用MapReduce解析该文件,把相关信息写入到Hypertable.

3. 其实和第一种一样,只不过是直接使用人家已经改好的基于Hbase的实现,然后导出一份tsv文件导入到Hypertable. 不仅融合了第一种的缺点还增加了麻烦. 不考虑.

好,以下代码基于第二种思想实现.

package nutchdump;

import java.io.IOException;
import java.sql.Timestamp;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransportException;
import org.hypertable.thrift.ThriftClient;
import org.hypertable.thriftgen.Cell;
import org.hypertable.thriftgen.ClientException;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;


/**
 * NutchDumpReader
 *
 * MapReduce job that reads the text output of the Nutch dump command line by
 * line and writes every recognised field into a Hypertable table.
 * Only the crawled page content is stored, so only the fields of the
 * "Content::" section of the Nutch dump file are of interest.
 *
 * @author(lovejuan1314)
 */

public class NutchDumpReader extends Configured implements Tool{
	
	  // where to put the data in hdfs when we're done
	  private static final String OUTPUT_PATH = "nutch_content_result";

	  // where to read the data from.
	  private static final String INPUT_PATH = "/shared/nutch/segdump";

	  // Hadoop Text values are UTF-8, so cell values are encoded the same way
	  // instead of depending on the platform default charset.
	  private static final java.nio.charset.Charset UTF_8 =
			  java.nio.charset.Charset.forName("UTF-8");
  
	  /**
	   * Parses one dump line and emits a (field name, field text) pair for
	   * every field that NutchDumpRecord found on that line.
	   */
	  static class NutchReaderMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
		  
		public NutchReaderMapper() { }  
	
		/**
		 * @param key      input key supplied by the input format (not used)
		 * @param value    one line of the dump file
		 * @param output   receives one pair per recognised field
		 * @param reporter progress reporter (not used)
		 * @throws IOException if the collector fails
		 */
		public void map(LongWritable key, Text value,
				OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
			NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(value.toString());
			
			// Emission order is the same as in the original implementation.
			emit(output, "version", nutchDumpRecord.getVersion());
			emit(output, "base", nutchDumpRecord.getBase());
			emit(output, "ContentType", nutchDumpRecord.getContentType());
			emit(output, "metadata", nutchDumpRecord.getMetadata());
			emit(output, "url", nutchDumpRecord.getUrl());
			emit(output, "content", nutchDumpRecord.getContent());
		}
		
		/** Collects the pair only when the field was present on the line. */
		private static void emit(OutputCollector<Text, Text> output,
				String fieldName, String fieldValue) throws IOException {
			if (fieldValue != null){
				output.collect(new Text(fieldName), new Text(fieldValue));
			}
		}
		  
	  }
	  
	  /**
	   * Writes every value of a field into Hypertable, using the field name
	   * as column family, and emits the field name once per written value.
	   */
	  static class NutchReaderReducer extends MapReduceBase
      implements Reducer<Text, Text, Text, NullWritable> {
		  
		/**
		 * @param key      field name emitted by the mapper
		 * @param values   all field texts collected for that name
		 * @param output   receives the key once per value
		 * @param reporter progress reporter (not used)
		 * @throws IOException if the collector fails
		 */
		public void reduce(Text key, Iterator<Text> values,
				OutputCollector<Text, NullWritable> output, Reporter reporter)
				throws IOException {
			String valKey = key.toString();
			
			while(values.hasNext()){
				Text val = values.next();
				//write into hypertable
				writeIntoTable(valKey,val.toString());
				// output
				output.collect(key, NullWritable.get());
			}
			
		}
		  
	  }
	  
	  /**
	   * Stores a single cell in the "webDb" table.
	   *
	   * A connection is opened for the call and is always closed again before
	   * returning. Failures are reported on stderr and otherwise ignored, as
	   * in the original best-effort implementation.
	   *
	   * @param colName  column family to write to
	   * @param colValue text stored as the cell value (UTF-8 encoded)
	   */
	  
	  private static void writeIntoTable(String colName,String colValue){
		  
		  ThriftClient client = null;
		  try {
			  
			client = ThriftClient.create("192.168.0.40", 38080);
			// mutator examples
		    long mutator = client.open_mutator("webDb", 0, 0);
		      
		    Timestamp ts = new Timestamp(System.currentTimeMillis());
		      
		      try {
		        Cell cell = new Cell();
		        String sysDt = ts.toString();
		        // Row key: system time followed by a reversed URL. The URL part
		        // is still a fixed literal here, so every cell gets its own row.
		        cell.row_key = sysDt+" "+"com.mytest.www";
		        // column name
		        cell.column_family = colName;
		        // column value
		        cell.value = colValue.getBytes(UTF_8);
		        client.set_cell(mutator, cell);
		      }
		      finally {
		        client.close_mutator(mutator, true);
		      }
			
		} catch (TTransportException e) {
			e.printStackTrace();
		} catch (TException e) {
			e.printStackTrace();
		}catch (ClientException ex){
			ex.printStackTrace();
		} finally {
			// Release the connection; previously one socket leaked per cell.
			if (client != null){
				client.close();
			}
		}
		  
	  }
	  
	  /** Driver for the actual MapReduce process */
	  
	  private void runJob() throws IOException{
		  JobConf conf = new JobConf(getConf(),NutchDumpReader.class);
		  
		  FileInputFormat.addInputPath(conf, new Path(INPUT_PATH));
		  FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));
		  
		  conf.setMapperClass(NutchReaderMapper.class);
		  conf.setReducerClass(NutchReaderReducer.class);
		  
		  conf.setOutputKeyClass(Text.class);
		  conf.setOutputValueClass(NullWritable.class);
		  
		  // The mapper emits Text values although the job output is NullWritable.
		  conf.setMapOutputValueClass(Text.class);
		  
		  JobClient.runJob(conf);
	  }


	/**
	 * Tool entry point; the command line arguments are not used.
	 *
	 * @return 0 when the job has run without throwing
	 */
	public int run(String[] arg0) throws Exception {
		runJob();
		return 0;
	}
	
	 /** Runs the job through ToolRunner and exits with its return code. */
	 public static void main(String [] args) throws Exception {
		    int ret = ToolRunner.run(new NutchDumpReader(), args);
		    System.exit(ret);
		  }

}





package nutchdump;


public class NutchDumpRecord {
	
	// raw line taken from the dump file; an empty string replaces null
	private String record;
	
	// text found for each label on the line, null when the label is absent
	private String version;
	private String url;
	private String base;
	private String contentType;
	private String metadata;
	private String content;

	/**
	 * Wraps one line of the dump file and parses it immediately.
	 *
	 * @param record the line to wrap; null is treated as an empty line
	 */
	public NutchDumpRecord(final String record){
		this.record = (record == null) ? "" : record;
		this.parse();
	}
	
	/**
	 * Looks up every known label in the line. A field receives the trimmed
	 * text that starts at its label (the label itself included).
	 */
	protected void parse(){
		this.version = textFrom("Version:");
		this.url = textFrom("url:");
		this.base = textFrom("base:");
		this.contentType = textFrom("contentType:");
		this.metadata = textFrom("metadata");
		this.content = textFrom("Content:");
	}
	
	/**
	 * Returns the trimmed remainder of the line beginning at the first
	 * occurrence of the label, or null when the label does not occur.
	 */
	private String textFrom(final String label){
		final int start = this.record.indexOf(label);
		if (start == -1){
			return null;
		}
		return this.record.substring(start).trim();
	}
	
	// getters

	/** Return the record */
	public String getRecord(){
		return this.record;
	}
	
	/** Return the text starting at "Version:", or null */
	public String getVersion(){
		return this.version;
	}

	/** Return the text starting at "url:", or null */
	public String getUrl(){
		return this.url;
	}

	/** Return the text starting at "base:", or null */
	public String getBase(){
		return this.base;
	}

	/** Return the text starting at "contentType:", or null */
	public String getContentType(){
		return this.contentType;
	}

	/** Return the text starting at "metadata", or null */
	public String getMetadata(){
		return this.metadata;
	}

	/** Return the text starting at "Content:", or null */
	public String getContent(){
		return this.content;
	}
}





//这个类是Hypertable源码中提供的. 

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 * Copyright (C) 2008  Luke Lu (Zvents, Inc.)
 *
 * This file is distributed under the Apache Software License
 * (http://www.apache.org/licenses/)
 */

package nutchdump;

import org.hypertable.thriftgen.*;

import org.apache.thrift.TException;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TTransportException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;

public class ThriftClient extends HqlService.Client {
  // Transport used by this client; assigned by the static factory.
  private TFramedTransport transport;
  // Set by open() and cleared by close(), so close() acts only once.
  private boolean do_close = false;

  /** Creates a client that talks over the given protocol. */
  public ThriftClient(TProtocol protocol) {
    super(protocol);
  }

  // A constructor must call super first and there is no multiple inheritance
  // for the base-from-member idiom, so the transport is built in a static
  // factory method instead.
  /**
   * Builds a client on a framed socket transport with a binary protocol.
   *
   * @param host       host to connect to
   * @param port       port to connect to
   * @param timeout_ms timeout handed to the socket
   * @param do_open    when true the transport is opened before returning
   */
  public static ThriftClient
  create(String host, int port, int timeout_ms, boolean do_open)
      throws TTransportException, TException {
    TSocket socket = new TSocket(host, port, timeout_ms);
    TFramedTransport framed = new TFramedTransport(socket);
    TBinaryProtocol protocol = new TBinaryProtocol(framed);

    ThriftClient client = new ThriftClient(protocol);
    client.transport = framed;

    if (do_open) {
      client.open();
    }

    return client;
  }

  // Java has no default argument values, hence this overload.
  /** Same as the four-argument factory with a 30000 ms timeout and do_open set. */
  public static ThriftClient create(String host, int port)
      throws TTransportException, TException {
    return create(host, port, 30000, true);
  }

  /** Opens the transport and remembers that it has to be closed. */
  public void open() throws TTransportException, TException {
    transport.open();
    do_close = true;
  }

  /** Closes the transport if open() was called since the last close. */
  public void close() {
    if (!do_close) {
      return;
    }
    transport.close();
    do_close = false;
  }
}




代码完成后直接打成jar包,在hadoop环境下运行就可以了.


Ps:仅供参考,如果大家有什么更好的方法,欢迎讨论. 另外代码里也没有严格控制数据的一致性,若要在产品上运行还得进一步修改.

你可能感兴趣的:(Hadoop,Apache,HBase,Mapreduce,SQL)