// HtmlParser 解析搜索页面 (parsing search result pages with HtmlParser)

package com.safetys.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.safetys.framework.exception.ApplicationAccessException;
/**
 * Aggregates search results from Baidu and Google.
 *
 * <p>Every result entry is a single string built as
 * {@code linkText + "zzc@cheng" + linkUrl}; callers split on that separator
 * to recover the two parts.
 *
 * <p>Instances hold mutable configuration (URL templates, keyword, encoding)
 * and perform blocking network I/O when one of the crawl methods is called.
 *
 * @author zhaozhi3758
 * date:2011-04-19
 */
public class Crawler {

	/** Separator placed between the link text and the link URL in each result entry. */
	private final static String splitStr="zzc@cheng";
	/** Charset used to URL-encode the keyword and to decode/parse fetched pages. */
	private String encoding="gbk";
	/** Search mode: "keyword" searches by keyword, "specifyUrl" searches a given URL. */
	public String searchMode;
	/**
	 * Baidu search URL template. It is expected to contain the placeholders
	 * ${keyword} and ${searchNum}, e.g.
	 * "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}".
	 */
	public String baiduUrl;
	/**
	 * Google search URL template. It is expected to contain the placeholders
	 * ${keyword} and ${searchNum}, e.g.
	 * "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=".
	 */
	public String googleUrl;
	/** Search keyword (raw, not yet URL-encoded). */
	public String keyword;
	/** Total number of results requested; each engine is asked for half of it. */
	public int searchNum = 0;
	/** URL to search when {@link #searchMode} is "specifyUrl". */
	public String specifyUrl;

	/**
	 * Fetches the Baidu result page and extracts the outbound links found inside
	 * every table that carries an {@code id} attribute.
	 *
	 * @return entries of the form {@code linkText + "zzc@cheng" + url}; empty when
	 *         the page cannot be fetched or parsed, never {@code null}
	 */
	public List<String> crawlerBaidu() {
		List<String> result = new ArrayList<String>();
		Parser myParser = new Parser();
		try {
			myParser.setURL(getBaiduUrl());
			// NOTE(review): this re-applies the encoding the parser already reports and
			// looks like a no-op; kept unchanged because the intended charset is unclear.
			myParser.setEncoding(myParser.getEncoding());
		} catch (ParserException e1) {
			e1.printStackTrace();
			// Without a usable page there is nothing to parse.
			return result;
		}
		try {
			NodeList nodeList = myParser.parse(new NodeClassFilter(TableTag.class));
			// Strictly less than size(): reading index size() can run past the backing array.
			for (int i = 0; i < nodeList.size(); i++) {
				if (nodeList.elementAt(i) instanceof TableTag) {
					TableTag tag = (TableTag) nodeList.elementAt(i);
					if (tag.getAttribute("id") != null) {
						result.addAll(getBaiduLink(tag.getChildrenHTML()));
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Extracts the external links from an HTML fragment taken from a Baidu result table.
	 * Links whose text is "百度快照", whose URL contains "baidu", or whose URL does not
	 * start with "http" are skipped.
	 *
	 * @param s HTML fragment to scan
	 * @return entries of the form {@code linkText + "zzc@cheng" + url}, possibly empty
	 */
	private List<String> getBaiduLink(String s) {
		List<String> result = new ArrayList<String>();
		Parser myParser = Parser.createParser(s, encoding);
		try {
			NodeList nodeList = myParser.parse(new NodeClassFilter(LinkTag.class));
			if (nodeList != null) {
				for (int l = 0; l < nodeList.size(); l++) {
					LinkTag link = (LinkTag) nodeList.elementAt(l);
					String urlLink = link.extractLink();
					String linkName = link.getLinkText();
					if (!"百度快照".equals(linkName) && urlLink.indexOf("baidu") == -1 && urlLink.indexOf("http") == 0) {
						System.out.println("baidu--->"+linkName + splitStr + urlLink);
						result.add(linkName + splitStr + urlLink);
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Fetches the Google result page and extracts every anchor whose URL starts with
	 * "http" and does not contain "google".
	 *
	 * @return entries of the form {@code linkText + "zzc@cheng" + url}; empty when
	 *         the page cannot be fetched or parsed, never {@code null}
	 */
	private List<String> crawlerGoogle() {
		List<String> result = new ArrayList<String>();
		String htmlstr = getUrlHtmlByHttpClient(getGoogleUrl());
		if (htmlstr == null) {
			// The fetch failed (already reported); createParser rejects a null document.
			return result;
		}
		try {
			Parser parser = Parser.createParser(htmlstr, encoding);
			// Select all A tags.
			NodeList nodes = parser.extractAllNodesThatMatch(new TagNameFilter("A"));
			if (nodes != null) {
				for (int i = 0; i < nodes.size(); i++) {
					if (!(nodes.elementAt(i) instanceof LinkTag)) {
						continue;
					}
					LinkTag tag = (LinkTag) nodes.elementAt(i);
					String link = tag.getLink();
					if (link.indexOf("google") == -1 && link.indexOf("http") == 0) {
						System.out.println("google--->"+tag.getLinkText() +splitStr+ link);
						result.add(tag.getLinkText() +splitStr+ link);
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Downloads a page with Commons HttpClient, decoding the body with {@link #encoding}.
	 * Connection and socket timeouts are both 5000 ms.
	 *
	 * @param url address to fetch
	 * @return the page text with line breaks normalised to '\n', or {@code null} when
	 *         the request fails, the status is not 200, or there is no response body
	 */
	private String getUrlHtmlByHttpClient(String url) {
		HttpClient httpClient = new HttpClient();
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
		GetMethod getMethod = new GetMethod(url);
		getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());
		BufferedReader br = null;
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: "
						+ getMethod.getStatusLine());
				// An error page is not a result page; do not hand it to the parser.
				return null;
			}
			InputStream bodyIs = getMethod.getResponseBodyAsStream();
			if (bodyIs == null) {
				return null;
			}
			// Change the encoding if non-ASCII text comes out garbled.
			br = new BufferedReader(new InputStreamReader(bodyIs, encoding));
			StringBuilder sb = new StringBuilder();
			String line;
			while ((line = br.readLine()) != null) {
				// Keep a separator: readLine() strips it, and gluing lines together can
				// fuse markup that was split across lines (e.g. "<a" + "href=...").
				sb.append(line).append('\n');
			}
			return sb.toString();
		} catch (HttpException e) {
			System.out.println("Please check your http address!");
			e.printStackTrace();
			return null;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the reader fails.
				}
			}
			getMethod.releaseConnection();
		}
	}

	/**
	 * Single entry point for crawling.
	 *
	 * <p>In "keyword" mode the Baidu and Google results are merged with duplicates
	 * removed, Baidu entries first, each engine's entries in page order. The
	 * "specifyUrl" mode is not implemented and yields an empty list, as does any
	 * other mode value.
	 *
	 * @return merged, de-duplicated result entries; never {@code null}
	 * @throws ApplicationAccessException when {@link #searchMode} is null or empty, or
	 *         when keyword mode is requested without keyword, baiduUrl and googleUrl
	 */
	public List<String> crawler() throws ApplicationAccessException{
		if(null == searchMode || searchMode.equals(""))
			throw new ApplicationAccessException("searchMode is null");
		List<String> list = new ArrayList<String>();
		if(searchMode.equals("specifyUrl")){
			// Searching a specified URL is not implemented yet.
		}
		else if(searchMode.equals("keyword")){
			// Report missing configuration the same way as a missing searchMode
			// instead of failing later with a bare NullPointerException.
			if(null == keyword)
				throw new ApplicationAccessException("keyword is null");
			if(null == baiduUrl)
				throw new ApplicationAccessException("baiduUrl is null");
			if(null == googleUrl)
				throw new ApplicationAccessException("googleUrl is null");
			Set<String> seen = new HashSet<String>();
			addUnique(crawlerBaidu(), seen, list);
			addUnique(crawlerGoogle(), seen, list);
		}
		return list;
	}

	/** Appends to {@code target} every entry of {@code source} not yet in {@code seen}, keeping order. */
	private static void addUnique(List<String> source, Set<String> seen, List<String> target) {
		for (String entry : source) {
			if (seen.add(entry)) {
				target.add(entry);
			}
		}
	}

	/**
	 * @return the Baidu URL with ${keyword} replaced by the URL-encoded keyword and
	 *         ${searchNum} replaced by half of {@link #searchNum}
	 */
	public String getBaiduUrl() {
		return baiduUrl.replace("${keyword}", getKeyword()).replace("${searchNum}", ""+(searchNum/2));
	}
	/** @param baiduUrl Baidu URL template containing ${keyword} and ${searchNum} */
	public void setBaiduUrl(String baiduUrl) {
		this.baiduUrl = baiduUrl;
	}
	/** @return charset name used for keyword encoding and page decoding */
	public String getEncoding() {
		return encoding;
	}
	/** @param encoding charset name used for keyword encoding and page decoding */
	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}
	/**
	 * @return the Google URL with ${keyword} replaced by the URL-encoded keyword and
	 *         ${searchNum} replaced by half of {@link #searchNum}
	 */
	public String getGoogleUrl() {
		return googleUrl.replace("${keyword}",getKeyword()).replace("${searchNum}",""+(searchNum/2));
	}
	/** @param googleUrl Google URL template containing ${keyword} and ${searchNum} */
	public void setGoogleUrl(String googleUrl) {
		this.googleUrl = googleUrl;
	}
	/**
	 * @return the keyword URL-encoded with {@link #encoding}; an empty string when the
	 *         keyword is unset or the encoding is not supported
	 */
	public String getKeyword() {
		if (keyword == null) {
			return "";
		}
		String key ="";
		try {
			key = URLEncoder.encode(keyword,encoding);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return key;
	}
	/** @param keyword raw (un-encoded) search keyword */
	public void setKeyword(String keyword) {
		this.keyword = keyword;
	}
	/** @return the search mode, "keyword" or "specifyUrl" */
	public String getSearchMode() {
		return searchMode;
	}
	/** @param searchMode "keyword" or "specifyUrl" */
	public void setSearchMode(String searchMode) {
		this.searchMode = searchMode;
	}
	/** @return total number of results requested across both engines */
	public int getSearchNum() {
		return searchNum;
	}
	/** @param searchNum total number of results requested across both engines */
	public void setSearchNum(int searchNum) {
		this.searchNum = searchNum;
	}
	/** @return URL used by the "specifyUrl" search mode */
	public String getSpecifyUrl() {
		return specifyUrl;
	}
	/** @param specifyUrl URL used by the "specifyUrl" search mode */
	public void setSpecifyUrl(String specifyUrl) {
		this.specifyUrl = specifyUrl;
	}

	/**
	 * Manual smoke test: prints the resolved Baidu and Google URLs for a sample
	 * keyword, then prints the merged crawl results. Requires network access.
	 */
	public static void main(String[] args) throws ApplicationAccessException {

		Crawler cl = new Crawler();
		cl.setEncoding("gbk");
		cl.setSearchNum(10);
		cl.setKeyword("面包");
		cl.setSearchMode("keyword");
		cl.setBaiduUrl("http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}");
		cl.setGoogleUrl("http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=");
		System.out.println("=====>"+cl.getBaiduUrl());
		System.out.println("=====>"+cl.getGoogleUrl());
		System.out.println(cl.crawler());

	}

}

// 你可能感兴趣的:(apache,F#,百度,Google,HP)