// 抓取html (fetching HTML)

package com.neusoft.mid.parser;



import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;

import com.neusoft.mid.parser.bean.ContentBean;






/**
 * Runnable crawler task.
 *
 * <p>{@link #run()} downloads the listing page at {@link #getUrl()}, extracts the date,
 * link and title of every entry, downloads each linked page, and hands the result to
 * {@link DoSql#insertData} wrapped in a {@link ContentBean}.
 *
 * <p>The task must be configured through {@link #setDoSql}, {@link #setType} and
 * {@link #setUrl} before it is run; {@link #setMaxDate} is optional.
 */
public class CatchContent implements Runnable {
    private static final Logger logger = Logger.getLogger(CatchContent.class);

    /**
     * {@code true} for the first (initial) import, where every entry is stored without a
     * date check; {@code false} for runs after initialisation, where entries older than
     * the configured maximum date are skipped.
     */
    final static boolean IS_INIT = false;

    /** Charset name handed to the HTML parser. */
    private static final String PAGE_CHARSET = "GB2312";

    /**
     * Charset used to decode the raw response bytes. GBK is a superset of GB2312, so it
     * decodes every GB2312 page and also tolerates GBK-only characters.
     * NOTE(review): assumes the crawled site serves GB2312/GBK pages, as implied by
     * {@link #PAGE_CHARSET} - confirm against the real site.
     */
    private static final String RESPONSE_CHARSET = "GBK";

    /** Separator written after every extracted field by {@link #readDateAndLink}. */
    private static final String FIELD_SEPARATOR = "`";

    /** Prefix prepended to every extracted link before it is downloaded. */
    private static final String CONTENT_BASE_URL = "http://www.szeb.edu.cn/";

    private static final String PROXY_HOST = "192.168.107.28";
    private static final int PROXY_PORT = 8080;

    /**
     * Index of the first body node that {@link #readContent} includes in the article text.
     * NOTE(review): presumably skips site-specific header nodes - confirm.
     */
    private static final int CONTENT_FIRST_NODE = 5;

    private DoSql doSql;
    private String type;
    private String url;
    private String maxDate;

    public DoSql getDoSql() {
        return doSql;
    }

    public void setDoSql(DoSql doSql) {
        this.doSql = doSql;
    }

    /**
     * Converts a {@code yyyy-MM-dd} string into a {@link Date}.
     *
     * @param str the text to parse; may be {@code null}
     * @return the parsed date, or {@code null} when {@code str} is {@code null} or cannot
     *         be parsed (a parse failure is logged)
     */
    public static Date StrToDate(String str) {
        if (str == null) {
            return null;
        }
        // SimpleDateFormat is not thread-safe, so every call uses its own instance.
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        try {
            return format.parse(str);
        } catch (ParseException e) {
            logger.error("Cannot parse date '" + str + "'", e);
            return null;
        }
    }

    /**
     * Tells whether an entry is older than the configured maximum date.
     *
     * @param createdate creation date of the entry, {@code yyyy-MM-dd}
     * @return {@code true} only when both dates parse and {@code createdate} is strictly
     *         earlier than the maximum date; {@code false} when no maximum date is set or
     *         when either date cannot be parsed
     */
    public boolean isLate(String createdate) {
        if (this.maxDate == null) {
            return false;
        }
        Date created = StrToDate(createdate);
        Date newest = StrToDate(this.maxDate);
        if (created == null || newest == null) {
            // StrToDate returns null on bad input; comparing would throw a NullPointerException.
            logger.warn("Cannot compare dates, createdate='" + createdate
                    + "', maxDate='" + this.maxDate + "'");
            return false;
        }
        if (created.getTime() < newest.getTime()) {
            logger.info("日期小于数据库中最大日期,不予录入");
            return true;
        }
        return false;
    }

    /**
     * Stores an entry unconditionally (used for the initial import).
     *
     * @param name       entry title
     * @param createdate entry date
     * @param content    entry text
     */
    public void initInsertDB(String name, String createdate, String content) {
        store(name, createdate, content);
    }

    /**
     * Stores an entry unless {@link #isLate} reports it as older than the maximum date.
     *
     * @param name       entry title
     * @param createdate entry date
     * @param content    entry text
     */
    public void insertDB(String name, String createdate, String content) {
        if (isLate(createdate)) {
            // isLate has already logged why the entry is rejected.
            return;
        }
        store(name, createdate, content);
    }

    /** Wraps the values in a ContentBean and passes it to doSql.insertData; failures are logged. */
    private void store(String name, String createdate, String content) {
        ContentBean cbean = new ContentBean();
        cbean.setName(name);
        cbean.setContent(content);
        cbean.setCreatedate(createdate);
        cbean.setType(type);
        try {
            doSql.insertData(cbean);
        } catch (Exception e) {
            logger.error("Failed to store entry '" + name + "'", e);
        }
    }

    /**
     * Extracts span texts, link targets and link titles from the body of an HTML page.
     *
     * <p>Every extracted value is followed by a backquote. A link without an
     * {@code href} or {@code title} attribute contributes the text {@code null}. The text
     * of paragraph tags is appended to the span string without a separator.
     * NOTE(review): the paragraph handling is kept from the original code; it looks like
     * debugging residue - confirm whether it is wanted.
     *
     * @param result HTML source of the page
     * @return {@code {spanTexts, hrefs, titles}}; {@link #save} treats the span texts as dates
     * @throws Exception if the HTML cannot be parsed
     */
    public static String[] readDateAndLink(String result) throws Exception {
        Parser pageParser = Parser.createParser(result, PAGE_CHARSET);
        HtmlPage htmlPage = new HtmlPage(pageParser);
        pageParser.visitAllNodesWith(htmlPage);

        // Second pass over the body only. createParser always treats its argument as HTML,
        // whereas new Parser(String) interprets text not starting with '<' as a URL or file.
        Parser parser = Parser.createParser(bodyToHtml(htmlPage.getBody()), PAGE_CHARSET);

        final StringBuilder spanBuffer = new StringBuilder();
        final StringBuilder titleBuffer = new StringBuilder();
        final StringBuilder urlBuffer = new StringBuilder();
        NodeVisitor visitor = new NodeVisitor() {
            public void visitTag(Tag tag) {
                if (tag instanceof Span) {
                    spanBuffer.append(childText(tag)).append(FIELD_SEPARATOR);
                } else if (tag instanceof LinkTag) {
                    urlBuffer.append(tag.getAttribute("href")).append(FIELD_SEPARATOR);
                    titleBuffer.append(tag.getAttribute("title")).append(FIELD_SEPARATOR);
                } else if (tag instanceof org.htmlparser.tags.ParagraphTag) {
                    String text = childText(tag);
                    spanBuffer.append(text);
                    logger.debug("paragraph text: " + text);
                }
            }
        };
        parser.visitAllNodesWith(visitor);

        return new String[] {spanBuffer.toString(), urlBuffer.toString(), titleBuffer.toString()};
    }

    /** Concatenates the trimmed HTML of all body nodes; a null list yields an empty string. */
    private static String bodyToHtml(NodeList body) {
        StringBuilder html = new StringBuilder();
        if (body != null) {
            Node[] nodes = body.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                html.append(nodes[i].toHtml().trim());
            }
        }
        return html.toString();
    }

    /** Returns the text of a tag's children, or an empty string when the tag has none. */
    private static String childText(Tag tag) {
        NodeList children = tag.getChildren();
        return children == null ? "" : children.asString();
    }

    /**
     * Creates an HttpClient that goes through the hard-coded proxy.
     *
     * <p>The user name and password literals are the placeholders "account" and
     * "password" and have to be replaced with real proxy credentials.
     *
     * @return a new, proxy-configured client
     */
    public static HttpClient getHttpClient() {
        HttpClient httpClient = new HttpClient();
        httpClient.getHostConfiguration().setProxy(PROXY_HOST, PROXY_PORT);
        NTCredentials defaultcreds = new NTCredentials("帐号", "密码", PROXY_HOST, "hold");
        httpClient.getState().setProxyCredentials(AuthScope.ANY, defaultcreds);
        return httpClient;
    }

    /**
     * Downloads and stores every entry described by the output of {@link #readDateAndLink}.
     *
     * <p>Only positions present in all three lists are processed; when there are more
     * links than titles or dates the surplus links are skipped with a warning instead of
     * failing with an ArrayIndexOutOfBoundsException. Entries with an empty link are skipped.
     *
     * @param result {@code {dates, hrefs, titles}}, each a backquote-separated list
     */
    public void save(String[] result) {
        if (result == null || result.length < 3) {
            logger.warn("save: expected {dates, urls, titles}, nothing stored");
            return;
        }
        String[] dateArr = result[0].split(FIELD_SEPARATOR);
        String[] urlArr = result[1].split(FIELD_SEPARATOR);
        String[] titleArr = result[2].split(FIELD_SEPARATOR);

        int count = Math.min(urlArr.length, Math.min(titleArr.length, dateArr.length));
        if (count < urlArr.length) {
            logger.warn("save: " + urlArr.length + " urls but only " + titleArr.length
                    + " titles and " + dateArr.length + " dates; storing the first " + count);
        }

        for (int i = 0; i < count; i++) {
            String entryUrl = urlArr[i];
            if (entryUrl.length() == 0) {
                // An empty link would only re-download the site root.
                continue;
            }
            String content = getContent(entryUrl);
            // The first import stores everything; later runs apply the date check.
            if (IS_INIT) {
                initInsertDB(titleArr[i], dateArr[i], content);
            } else {
                insertDB(titleArr[i], dateArr[i], content);
            }
        }
    }

    /**
     * Downloads a linked page and returns its plain text.
     *
     * @param _url link relative to the site root
     * @return the text produced by {@code readContent}, or {@code null} when the download
     *         fails, the server does not answer 200 or the response has no body
     */
    public static String getContent(String _url) {
        try {
            String html = fetchPage(CONTENT_BASE_URL + _url);
            return html == null ? null : readContent(html);
        } catch (IOException e) {
            // Network or protocol failure.
            logger.error("Failed to download '" + _url + "'", e);
        } catch (RuntimeException e) {
            logger.error("Failed to process '" + _url + "'", e);
        }
        return null;
    }

    /**
     * Performs a GET with the default retry handler and decodes the response body.
     *
     * @return the decoded body, or null when the status is not 200 or there is no body
     */
    private static String fetchPage(String pageUrl) throws IOException {
        HttpClient httpClient = getHttpClient();
        GetMethod getMethod = new GetMethod(pageUrl);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                logger.error("抓取失败: " + getMethod.getStatusLine() + " (" + pageUrl + ")");
                return null;
            }
            byte[] responseBody = getMethod.getResponseBody();
            if (responseBody == null) {
                return null;
            }
            // Decode explicitly instead of relying on the platform default charset.
            return new String(responseBody, RESPONSE_CHARSET);
        } finally {
            getMethod.releaseConnection();
        }
    }

    /**
     * Returns the plain text of a page's body nodes, starting at node
     * {@link #CONTENT_FIRST_NODE}. A parse error is logged and whatever was collected up
     * to that point is still used.
     *
     * @param string HTML source of the page
     * @return the concatenated plain text; empty when the page has too few body nodes
     */
    private static String readContent(String string) {
        Parser myParser = Parser.createParser(string, PAGE_CHARSET);
        HtmlPage htmlPage = new HtmlPage(myParser);
        try {
            myParser.visitAllNodesWith(htmlPage);
        } catch (ParserException e) {
            logger.error("Failed to parse page content", e);
        }

        StringBuilder text = new StringBuilder();
        NodeList body = htmlPage.getBody();
        // Guard defensively: the body list may be absent for a page without body content.
        if (body != null) {
            Node[] nodes = body.toNodeArray();
            for (int i = CONTENT_FIRST_NODE; i < nodes.length; i++) {
                text.append(nodes[i].toPlainTextString());
            }
        }
        return text.toString();
    }

    /**
     * Downloads a listing page, extracts its links, titles and dates and stores every entry
     * through {@link #save}. All failures are logged; nothing is thrown.
     *
     * @param url absolute URL of the listing page
     */
    public void getUrlAndTitle(String url) {
        try {
            logger.info("#############开始抓取网页源代码###################");
            String html = fetchPage(url);
            if (html == null) {
                // fetchPage has already logged the reason.
                return;
            }
            save(readDateAndLink(html));
        } catch (IOException e) {
            // Network or protocol failure.
            logger.error("Failed to download '" + url + "'", e);
        } catch (Exception e) {
            logger.error("Failed to process '" + url + "'", e);
        } finally {
            logger.info("#############抓取网页源代码结束###################");
        }
    }

    // Example listing query: main.jsp?start=0&PageCount=100000&totalnum=156&colid=504
    /** Crawls the configured listing URL. */
    public void run() {
        getUrlAndTitle(this.url);
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getMaxDate() {
        return maxDate;
    }

    public void setMaxDate(String maxDate) {
        this.maxDate = maxDate;
    }
}


		 




// 你可能感兴趣的:(apache,html,log4j,bean,网络协议) -- related topics: apache, html, log4j, bean, network protocols