I want to load the web pages crawled by Nutch into Hypertable. There are three main approaches:
1. Modify the Nutch source so that Nutch works on top of Hypertable, using the existing HBase port as a reference. Since that port is missing many Nutch features and would be hard to keep in sync with upgrades, I dropped the idea.
2. Export the Nutch crawl results to a text dump file with Nutch's command-line tools, then parse that file with a MapReduce job and write the relevant fields into Hypertable.
3. Essentially the same as option 1, except it reuses the already-finished HBase-based port directly and then exports a TSV file to import into Hypertable. That combines the drawbacks of option 1 with extra hassle, so it is out.
The code below implements the second approach.
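For reference, a dump file like the one this job reads can be produced with Nutch's segment reader (the segment path below is just an example; the -no* switches drop the parts we don't need):

bin/nutch readseg -dump crawl/segments/20090101000000 /shared/nutch/segdump -nofetch -nogenerate -noparse -noparsedata -noparsetext

In the dump, each fetched page gets a Content:: section that looks roughly like this, one field per line, which is exactly what NutchDumpRecord below keys on:

Content::
Version: -1
url: http://www.example.com/
base: http://www.example.com/
contentType: text/html
metadata: ...
Content:
<html>...</html>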
package nutchdump;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.Iterator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransportException;
import org.hypertable.thrift.ThriftClient;
import org.hypertable.thriftgen.Cell;
import org.hypertable.thriftgen.ClientException;
import org.apache.hadoop.util.ToolRunner;
/**
 * NutchDumpReader
 *
 * Reads the entries produced by the Nutch dump command and writes each
 * parsed line into a Hypertable database in a specific format.
 * Since we only store the fetched page content, we only care about the
 * Content:: section of the Nutch dump file.
 *
 * @author lovejuan1314
 */
public class NutchDumpReader extends Configured implements Tool {

    // where to put the data in HDFS when we're done
    private static final String OUTPUT_PATH = "nutch_content_result";

    // where to read the data from
    private static final String INPUT_PATH = "/shared/nutch/segdump";

    static class NutchReaderMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public NutchReaderMapper() { }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String dumpline = value.toString();
            NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(dumpline);

            // Emit each recognized field under its field name; the field
            // name later becomes the Hypertable column family.
            String version = nutchDumpRecord.getVersion();
            if (version != null) {
                output.collect(new Text("version"), new Text(version));
            }
            String base = nutchDumpRecord.getBase();
            if (base != null) {
                output.collect(new Text("base"), new Text(base));
            }
            String contentType = nutchDumpRecord.getContentType();
            if (contentType != null) {
                output.collect(new Text("ContentType"), new Text(contentType));
            }
            String metadata = nutchDumpRecord.getMetadata();
            if (metadata != null) {
                output.collect(new Text("metadata"), new Text(metadata));
            }
            String url = nutchDumpRecord.getUrl();
            if (url != null) {
                output.collect(new Text("url"), new Text(url));
            }
            String content = nutchDumpRecord.getContent();
            if (content != null) {
                output.collect(new Text("content"), new Text(content));
            }
        }
    }
    static class NutchReaderReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, NullWritable> {

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            String valKey = key.toString();
            while (values.hasNext()) {
                Text val = values.next();
                if (val.toString() != null) {
                    // write into Hypertable
                    writeIntoTable(valKey, val.toString());
                    // output
                    output.collect(key, NullWritable.get());
                }
            }
        }
    }
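    // Side note (a sketch, untested): writeIntoTable below opens a fresh
    // Thrift connection and mutator for every single cell, which is
    // expensive. With the old MapReduce API, the reducer could instead hold
    // one connection for its whole lifetime via the configure()/close()
    // hooks, roughly like this (PooledNutchReaderReducer is a hypothetical
    // name; host, port, and table are the same ones used below):
    static class PooledNutchReaderReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, NullWritable> {

        private ThriftClient client; // one connection per reduce task
        private long mutator;

        public void configure(JobConf job) {
            try {
                client = ThriftClient.create("192.168.0.40", 38080);
                mutator = client.open_mutator("webDb", 0, 0);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            while (values.hasNext()) {
                Cell cell = new Cell();
                cell.row_key = new Timestamp(System.currentTimeMillis())
                        + " " + "com.mytest.www";
                cell.column_family = key.toString();
                cell.value = values.next().toString().getBytes();
                try {
                    client.set_cell(mutator, cell);
                } catch (Exception e) {
                    throw new IOException(e.toString());
                }
                output.collect(key, NullWritable.get());
            }
        }

        public void close() throws IOException {
            try {
                client.close_mutator(mutator, true); // flush pending cells
            } catch (Exception e) {
                throw new IOException(e.toString());
            } finally {
                client.close();
            }
        }
    }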
    /**
     * Writes one cell into Hypertable.
     *
     * @param colName  column family name
     * @param colValue cell value
     */
    private static void writeIntoTable(String colName, String colValue) {
        try {
            // NOTE: opens a new connection and mutator per cell; see the
            // PooledNutchReaderReducer sketch above for a cheaper layout.
            ThriftClient client = ThriftClient.create("192.168.0.40", 38080);
            // mutator examples
            long mutator = client.open_mutator("webDb", 0, 0);
            Timestamp ts = new Timestamp(System.currentTimeMillis());
            try {
                Cell cell = new Cell();
                String sysDt = ts.toString();
                // Row key: I use the format "system time + reversed URL";
                // "com.mytest.www" stands in for the reversed URL here.
                cell.row_key = sysDt + " " + "com.mytest.www";
                // column family
                cell.column_family = colName;
                // cell value
                cell.value = colValue.getBytes();
                client.set_cell(mutator, cell);
            } finally {
                client.close_mutator(mutator, true);
                client.close(); // don't leak the Thrift connection
            }
        } catch (TTransportException e) {
            e.printStackTrace();
        } catch (TException e) {
            e.printStackTrace();
        } catch (ClientException ex) {
            ex.printStackTrace();
        }
    }
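    // Side note (a sketch, untested): the job assumes the webDb table
    // already exists, with one column family per key emitted by
    // NutchReaderMapper. It could be created once up front through the same
    // Thrift client, assuming the hql_query call of HqlService is available
    // in this Hypertable version (createTableOnce is a hypothetical helper):
    private static void createTableOnce() throws Exception {
        ThriftClient client = ThriftClient.create("192.168.0.40", 38080);
        try {
            client.hql_query("CREATE TABLE webDb (version, base, "
                    + "ContentType, metadata, url, content)");
        } finally {
            client.close();
        }
    }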
    /** Driver for the actual MapReduce process */
    private void runJob() throws IOException {
        JobConf conf = new JobConf(getConf(), NutchDumpReader.class);
        FileInputFormat.addInputPath(conf, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));
        conf.setMapperClass(NutchReaderMapper.class);
        conf.setReducerClass(NutchReaderReducer.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(NullWritable.class);
        conf.setMapOutputValueClass(Text.class);
        JobClient.runJob(conf);
    }

    public int run(String[] arg0) throws Exception {
        runJob();
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new NutchDumpReader(), args);
        System.exit(ret);
    }
}
package nutchdump;

public class NutchDumpRecord {

    // the actual line from the dump file
    private String record;

    // the fields on the line
    private String version;
    private String url;
    private String base;
    private String contentType;
    private String metadata;
    private String content;

    public NutchDumpRecord(final String record) {
        if (record == null) {
            this.record = "";
        } else {
            this.record = record;
        }
        this.parse();
    }

    protected void parse() {
        int versionIdx = this.record.indexOf("Version:");
        int urlIdx = this.record.indexOf("url:");
        int baseIdx = this.record.indexOf("base:");
        int contentTypeIdx = this.record.indexOf("contentType:");
        int metadataIdx = this.record.indexOf("metadata:");
        int contentIdx = this.record.indexOf("Content:");
        // In each case skip past the label itself, so the stored value
        // doesn't carry the "Version:", "url:", ... prefix.
        if (versionIdx != -1) {
            this.version = this.record.substring(versionIdx + "Version:".length()).trim();
        }
        if (urlIdx != -1) {
            this.url = this.record.substring(urlIdx + "url:".length()).trim();
        }
        if (baseIdx != -1) {
            this.base = this.record.substring(baseIdx + "base:".length()).trim();
        }
        if (contentTypeIdx != -1) {
            this.contentType = this.record.substring(contentTypeIdx + "contentType:".length()).trim();
        }
        if (metadataIdx != -1) {
            this.metadata = this.record.substring(metadataIdx + "metadata:".length()).trim();
        }
        if (contentIdx != -1) {
            this.content = this.record.substring(contentIdx + "Content:".length()).trim();
        }
    }

    // getters

    /** Return the raw record */
    public String getRecord() {
        return this.record;
    }

    public String getVersion() {
        return this.version;
    }

    public String getUrl() {
        return this.url;
    }

    public String getBase() {
        return this.base;
    }

    public String getContentType() {
        return this.contentType;
    }

    public String getMetadata() {
        return this.metadata;
    }

    public String getContent() {
        return this.content;
    }
}
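As a quick sanity check of the parser, a single dump line round-trips like this (the URL is made up):

NutchDumpRecord rec = new NutchDumpRecord("url: http://www.example.com/");
System.out.println(rec.getUrl()); // prints http://www.example.com/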
// This class ships with the Hypertable source code.
/**
* Copyright (C) 2008 Luke Lu (Zvents, Inc.)
*
* This file is distributed under the Apache Software License
* (http://www.apache.org/licenses/)
*/
package nutchdump;

import org.hypertable.thriftgen.*;

import org.apache.thrift.TException;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TTransportException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;

public class ThriftClient extends HqlService.Client {

    public ThriftClient(TProtocol protocol) { super(protocol); }

    // Java only allows super as the first statement of a constructor. It
    // doesn't support multiple inheritance to use the base-from-member idiom
    // either. So, we're resorting to a static factory method here.
    public static ThriftClient
    create(String host, int port, int timeout_ms, boolean do_open)
            throws TTransportException, TException {
        TFramedTransport transport = new TFramedTransport(
                new TSocket(host, port, timeout_ms));
        ThriftClient client = new ThriftClient(new TBinaryProtocol(transport));
        client.transport = transport;
        if (do_open)
            client.open();
        return client;
    }

    // Java doesn't support default argument values, which makes things
    // unnecessarily verbose here
    public static ThriftClient create(String host, int port)
            throws TTransportException, TException {
        return create(host, port, 30000, true);
    }

    public void open() throws TTransportException, TException {
        transport.open();
        do_close = true;
    }

    public void close() {
        if (do_close) {
            transport.close();
            do_close = false;
        }
    }

    private TFramedTransport transport;
    private boolean do_close = false;
}
Once the code is done, just package it into a jar and run it under Hadoop.
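For example, assuming the jar was packaged as nutchdump.jar:

hadoop jar nutchdump.jar nutchdump.NutchDumpReader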
PS: This is for reference only; if anyone has a better approach, I'd be happy to discuss it. Also, the code does not strictly enforce data consistency, so it would need further work before running in production.