package org.apache.nutch.fetcher; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapRunnable; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.fetcher.Fetcher; import org.apache.nutch.fetcher.FetcherOutputFormat; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; public class HostCheck extends Configured implements MapRunnable<Text, Text, Text, Text> { public static final Log LOG = LogFactory.getLog(HostCheck.class); OutputCollector<Text, Text> output; static HttpClient httpClient; static{ httpClient = new HttpClient(); httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(10000); } @Override public void run(RecordReader<Text, Text> input, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { // TODO Auto-generated method stub this.output = output; Text host = new Text(); Text tmp = new Text(); HashSet<Thread> fetchset = new HashSet<Thread>(); while(input.next(host, tmp)){ if(fetchset.size()<100){ String h = host.toString(); Thread ft = new Thread(new FetchThread(h,output)); fetchset.add(ft); ft.start(); }else{ Iterator<Thread> itr = fetchset.iterator(); while(itr.hasNext()){ Thread t = itr.next(); if(!t.isAlive()){ itr.remove(); } } try { Thread.sleep(1000); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } public static String checkHost(String host) { GetMethod getMethod = new GetMethod(host); getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 10000); getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); getMethod.setRequestHeader("User-Agent", "Nokia-N73/1.0"); getMethod.setRequestHeader("Accept", "text/vnd.wap.wml"); String type = null; try { int statusCode = httpClient.executeMethod(getMethod); if (statusCode == HttpStatus.SC_OK) { Header h = getMethod.getResponseHeader("Content-Type"); type = h.getValue(); // if (type.toLowerCase().indexOf("text/html") >=0) { // byte[] responseBody = getMethod.getResponseBody(); // String result = new String(responseBody, "utf-8"); // if (result.toLowerCase().indexOf("dtd/xhtml-mobile") >= 0) { // type = "text/xhtml-mobile"; // } // } LOG.info(type + "\thost: " + host); } } catch (Exception e) { // e.printStackTrace(); } finally { getMethod.releaseConnection(); } return type; } public static class FetchThread implements Runnable{ String url ; OutputCollector<Text, Text> output; @Override public void run() { try { String type = HostCheck.checkHost("wap."+url.trim()); if(type==null)return; if(type.indexOf("text/vnd.wap.wml")>=0 || type.indexOf("application/xhtml+xml")>=0){ this.output.collect(new Text("wap."+url.trim()), new Text(type)); } type = HostCheck.checkHost("www."+url.trim()+"/wap"); if(type==null)return; if(type.indexOf("text/vnd.wap.wml")>=0 || type.indexOf("application/xhtml+xml")>=0){ this.output.collect(new Text("www."+url.trim()+"/wap"), new Text(type)); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } public FetchThread(String url,OutputCollector<Text, Text> output){ this.url = url; this.output = output; } } @Override public void configure(JobConf job) { // TODO Auto-generated method stub } public void check(String[] args){ Configuration conf = NutchConfiguration.create(); JobConf job = new NutchJob(conf); job.setJobName("hostcheck "); FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormat(SequenceFileInputFormat.class); job.setMapRunnerClass(HostCheck.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); try { JobClient.runJob(job); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (LOG.isInfoEnabled()) { // LOG.info("Fetcher: done"); } } public static void main(String[]args){ new HostCheck().check(args); } }