// A MapRunnable design example (MapRunnable设计一例)

package org.apache.nutch.fetcher;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
 * Map-side runner that probes every input host for a WAP front end.
 *
 * <p>For each key read from the input, two URLs are requested over HTTP:
 * {@code wap.<host>} and {@code www.<host>/wap}. A URL is emitted, together
 * with its {@code Content-Type}, when the response is {@code 200 OK} and the
 * content type contains {@code text/vnd.wap.wml} or
 * {@code application/xhtml+xml}. The input value is read but not used.
 *
 * <p>Probes run on up to {@link #MAX_FETCH_THREADS} threads per map task.
 */
public class HostCheck extends Configured implements MapRunnable<Text, Text, Text, Text> {
	
	public static final Log LOG = LogFactory.getLog(HostCheck.class);

	/** Upper bound on concurrently running probe threads. */
	private static final int MAX_FETCH_THREADS = 100;

	/** Connect timeout and socket read timeout, in milliseconds. */
	private static final int TIMEOUT_MS = 10000;

	/** Pause between two scans for finished probe threads, in milliseconds. */
	private static final long REAP_INTERVAL_MS = 1000L;
	
	OutputCollector<Text, Text> output;
	static HttpClient httpClient;
	static{
		// The client is shared by all probe threads, so it needs the
		// multi-threaded connection manager; the default manager of
		// HttpClient 3.x is meant for single-threaded use.
		MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
		manager.getParams().setConnectionTimeout(TIMEOUT_MS);
		// Allow one pooled connection per probe thread.
		manager.getParams().setMaxTotalConnections(MAX_FETCH_THREADS);
		httpClient = new HttpClient(manager);
	}

	/**
	 * Reads every host from {@code input} and probes it on a worker thread.
	 *
	 * <p>At most {@link #MAX_FETCH_THREADS} workers run at once; when the
	 * limit is reached the reader waits for a free slot instead of skipping
	 * the host. The method returns only after every worker has finished, so
	 * no worker writes to {@code output} after this method has returned.
	 *
	 * @param input    source of (host, ignored value) records
	 * @param output   collector receiving (probed URL, content type) pairs
	 * @param reporter used to signal liveness while waiting for workers;
	 *                 may be {@code null}
	 * @throws IOException if reading the input fails or the calling thread is
	 *                     interrupted while waiting for workers
	 */
	@Override
	public void run(RecordReader<Text, Text> input,
			OutputCollector<Text, Text> output, Reporter reporter)
			throws IOException {
		this.output = output;
		Text host = new Text();
		Text tmp = new Text();
		HashSet<Thread> fetchset = new HashSet<Thread>();
		try {
			while(input.next(host, tmp)){
				waitForFreeSlot(fetchset, reporter);
				// host is reused by the reader, so hand the worker a String copy.
				Thread ft = new Thread(new FetchThread(host.toString(), output));
				fetchset.add(ft);
				ft.start();
			}
			// Drain: let the remaining workers finish before returning.
			for (Thread t : fetchset) {
				while (t.isAlive()) {
					t.join(REAP_INTERVAL_MS);
					reportProgress(reporter);
				}
			}
		} catch (InterruptedException e) {
			// Restore the flag so code further up the stack can see it.
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while waiting for host checks to finish", e);
		}
	}

	/**
	 * Blocks until fewer than {@link #MAX_FETCH_THREADS} workers are alive.
	 * Finished threads are removed from {@code fetchset} as a side effect.
	 */
	private static void waitForFreeSlot(HashSet<Thread> fetchset, Reporter reporter)
			throws InterruptedException {
		while (fetchset.size() >= MAX_FETCH_THREADS) {
			Iterator<Thread> itr = fetchset.iterator();
			while (itr.hasNext()) {
				if (!itr.next().isAlive()) {
					itr.remove();
				}
			}
			if (fetchset.size() >= MAX_FETCH_THREADS) {
				Thread.sleep(REAP_INTERVAL_MS);
				reportProgress(reporter);
			}
		}
	}

	/** Signals liveness to the framework, tolerating a missing reporter. */
	private static void reportProgress(Reporter reporter) {
		if (reporter != null) {
			reporter.progress();
		}
	}

	/**
	 * Requests {@code host} with an HTTP GET and returns the response's
	 * {@code Content-Type} header value.
	 *
	 * @param host URL to request; {@code http://} is prepended when the
	 *             string carries no scheme
	 * @return the {@code Content-Type} value when the status is 200 and the
	 *         header is present; {@code null} otherwise, including when
	 *         {@code host} is {@code null} or the request fails
	 */
	public static String checkHost(String host) {
		if (host == null) {
			return null;
		}
		// GetMethod needs an absolute URI here: the shared client has no
		// host configured, so a bare "wap.example.com" cannot be executed.
		String url = host.indexOf("://") >= 0 ? host : "http://" + host;

		String type = null;
		GetMethod getMethod = null;
		try {
			getMethod = new GetMethod(url);
			getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT,
					Integer.valueOf(TIMEOUT_MS));
			getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
					new DefaultHttpMethodRetryHandler());
			// NOTE(review): these headers presumably make servers answer as
			// they would to a WAP handset - confirm.
			getMethod.setRequestHeader("User-Agent", "Nokia-N73/1.0");
			getMethod.setRequestHeader("Accept", "text/vnd.wap.wml");

			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode == HttpStatus.SC_OK) {
				Header h = getMethod.getResponseHeader("Content-Type");
				if (h != null) {
					type = h.getValue();
					LOG.info(type + "\thost: " + host);
				}
			}
		} catch (Exception e) {
			// Unreachable hosts are an expected outcome of probing, so this
			// is best-effort: note the failure at debug level and return null.
			if (LOG.isDebugEnabled()) {
				LOG.debug("Check failed for " + host, e);
			}
		} finally {
			if (getMethod != null) {
				getMethod.releaseConnection();
			}
		}
		return type;
	}

	/**
	 * Worker that probes the two candidate WAP URLs of a single host and
	 * writes every match to the shared collector.
	 */
	public static class FetchThread implements Runnable{

		String url ;
		OutputCollector<Text, Text> output;

		/**
		 * Probes {@code wap.<host>} and {@code www.<host>/wap}. The probes
		 * are independent: a failure of the first does not skip the second.
		 */
		@Override
		public void run() {
			if (url == null) {
				return;
			}
			String host = url.trim();
			probe("wap." + host);
			probe("www." + host + "/wap");
		}

		/** Checks one candidate URL and emits it when it serves WAP content. */
		private void probe(String target) {
			try {
				String type = HostCheck.checkHost(target);
				if (type == null || !isWapContentType(type)) {
					return;
				}
				// The collector is shared by all workers of the task and is
				// not known to be thread-safe, so writes are serialized.
				synchronized (this.output) {
					this.output.collect(new Text(target), new Text(type));
				}
			} catch (Exception e) {
				// Top of the worker thread: log and carry on with the next probe.
				LOG.warn("Could not record result for " + target, e);
			}
		}

		/** Returns true when the content type names WML or XHTML. */
		private static boolean isWapContentType(String type) {
			return type.indexOf("text/vnd.wap.wml") >= 0
					|| type.indexOf("application/xhtml+xml") >= 0;
		}
		
		/**
		 * @param url    host name to probe, without scheme or prefix
		 * @param output collector shared with the other workers
		 */
		public FetchThread(String url,OutputCollector<Text, Text> output){
			this.url = url;
			this.output = output;
		}
	}

	/** No per-job configuration is needed; intentionally empty. */
	@Override
	public void configure(JobConf job) {
	}
	
	/**
	 * Configures and runs the host-check job.
	 *
	 * <p>The job reads a sequence file from {@code args[0]}, uses this class
	 * as its map runner and writes a {@code Text}/{@code Text} sequence file
	 * to {@code args[1]}. A failure to run the job is logged, not rethrown.
	 *
	 * @param args input path followed by output path
	 * @throws IllegalArgumentException if fewer than two arguments are given
	 */
	public void check(String[] args){
		if (args == null || args.length < 2) {
			throw new IllegalArgumentException("Usage: HostCheck <input path> <output path>");
		}
		Configuration conf = NutchConfiguration.create();
		JobConf job = new NutchJob(conf);
		job.setJobName("hostcheck ");

		FileInputFormat.addInputPath(job, new Path(args[0]));
		job.setInputFormat(SequenceFileInputFormat.class);

		job.setMapRunnerClass(HostCheck.class);

		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		job.setOutputFormat(SequenceFileOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		try {
			JobClient.runJob(job);
		} catch (IOException e) {
			LOG.error("hostcheck job failed", e);
		}
	}

	/** Command-line entry point; see {@link #check(String[])} for arguments. */
	public static void main(String[]args){
		new HostCheck().check(args);
	}

}

// You may also be interested in: (apache, hadoop, XHTML, mobile, WAP)