httpClient及jsoup抓取解析网页数据

需要在网页上展示今日黄历信息,数据格式如下:
公历时间:2017年06月21日 星期三
农历时间:鸡年五月廿七
天干地支:丁酉年 丙午月 己卯日
宜:年破日,大事不宜 
忌:年破日,大事不宜 

 

主要包括公历/农历日期,以及宜忌信息等。但是手里并没有现成的数据可供使用,怎么办呢?革命前辈曾经说过,没有枪,没有炮,敌人给我们造!网络上有很多现成的在线万年历应用可供使用,虽然没有现成接口,但是我们可以伸出手来,自己去拿。也就是所谓的数据抓取。

 

这里介绍两个使用的工具,httpClient以及jsoup,简介如下:

HttpClient是Apache Jakarta Common下的子项目,用来提供高效的、最新的、功能丰富的支持HTTP协议的客户端编程工具包,并且它支持HTTP协议最新的版本和建议。HttpClient已经应用在很多的项目中,比如Apache Jakarta上很著名的另外两个开源项目Cactus和HTMLUnit都使用了HttpClient。

httpClient使用方法如下:
  1. 创建HttpClient对象。
  2. 创建请求方法的实例,并指定请求URL。
  3. 调用HttpClient对象的execute(HttpUriRequest request)发送请求,该方法返回一个HttpResponse。
  4. 调用HttpResponse相关方法获取响应内容。
  5. 释放连接。

jsoup是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

 

接下来我们直接上代码,这里我们抓取2345在线万年历的数据 http://tools.2345.com/rili.htm

 

 

 

在myeclipse中建立项目:

httpClient及jsoup抓取解析网页数据_第1张图片

其中所需要jar包的下载地址:

http://download.csdn.net/detail/m0_37739193/9876204

http://download.csdn.net/detail/m0_37739193/9876210

 

首先我们定义一个实体类Almanac来存储黄历数据:Almanac.java

package com.likx.picker.util;

public class Almanac {
    private String solar;        /* 阳历 e.g.2016年 4月11日 星期一 */
    private String lunar;        /* 阴历 e.g. 猴年 三月初五*/
    private String chineseAra;    /* 天干地支纪年法 e.g.丙申年 壬辰月 癸亥日*/
    private String should;        /* 宜e.g. 求子 祈福 开光 祭祀 安床*/
    private String avoid;        /* 忌 e.g. 玉堂(黄道)危日,忌出行*/

    public String getSolar() {
        return solar;
    }

    public void setSolar(String date) {
        this.solar = date;
    }

    public String getLunar() {
        return lunar;
    }

    public void setLunar(String lunar) {
        this.lunar = lunar;
    }

    public String getChineseAra() {
        return chineseAra;
    }

    public void setChineseAra(String chineseAra) {
        this.chineseAra = chineseAra;
    }

    public String getAvoid() {
        return avoid;
    }

    public void setAvoid(String avoid) {
        this.avoid = avoid;
    }

    public String getShould() {
        return should;
    }

    public void setShould(String should) {
        this.should = should;
    }

    public Almanac(String solar, String lunar, String chineseAra, String should,
            String avoid) {
        this.solar = solar;
        this.lunar = lunar;
        this.chineseAra = chineseAra;
        this.should = should;
        this.avoid = avoid;
    }
}


然后是抓取解析的主程序,写程序之前需要在官网下载需要的jar包:AlmanacUtil.java

 

package com.likx.picker.util;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helper that downloads the 2345.com online calendar page and extracts
 * today's almanac entry from it.
 *
 * <p>All members are static; the class cannot be instantiated.
 */
public class AlmanacUtil {

    /** Address of the calendar page that is scraped. */
    private static final String ALMANAC_URL = "http://tools.2345.com/rili.htm";

    /** Number of leading characters of the lunar element's HTML that are skipped. */
    private static final int LUNAR_HTML_PREFIX_LENGTH = 11;

    /**
     * Not instantiable: this is a static utility class (not a singleton).
     */
    private AlmanacUtil() {
    }

    /**
     * Downloads the calendar page and parses today's almanac entry out of it.
     *
     * @return the parsed almanac entry, never {@code null}
     * @throws IllegalStateException if the page cannot be downloaded or does
     *         not contain the elements this parser relies on
     */
    public static Almanac getAlmanac(){
        String html = pickData(ALMANAC_URL);
        if (html == null) {
            // pickData reports every failure as null; fail here with a clear
            // message instead of handing null to the HTML parser.
            throw new IllegalStateException("Unable to download almanac page: " + ALMANAC_URL);
        }
        return analyzeHTMLByString(html);
    }

    /**
     * Fetches the given URL with HttpClient and returns the response body.
     *
     * @param url address to fetch
     * @return the response body as a string, or {@code null} if the response
     *         has no entity or the request failed (the failure is printed to
     *         standard error)
     */
    private static String pickData(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // NOTE(review): no explicit charset is passed, so decoding
                    // relies on what the response declares - confirm the page
                    // sends one, otherwise Chinese text may come out garbled.
                    return EntityUtils.toString(entity);
                }
            } finally {
                response.close();
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            e.printStackTrace();
        } finally {
            // Release the client and its connections.
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Parses the page source with jsoup and assembles the almanac entry.
     *
     * @param html full HTML source of the calendar page, must not be {@code null}
     * @return the almanac entry built from the page plus the locally generated solar date
     * @throws IllegalStateException if an expected element is missing or shorter than expected
     */
    private static Almanac analyzeHTMLByString(String html){
        Document document = Jsoup.parse(html);

        // Solar date is generated locally rather than scraped.
        String solarDate = getSolarDate();

        // Lunar date: characters 1-2 of the first child's HTML, followed by
        // everything after the first 11 characters of the element's own HTML.
        // Presumably this strips a short bracketed-year tag - verify against the live page.
        Element lunarElement = requireElement(document, "info_nong");
        if (lunarElement.children().isEmpty()) {
            throw new IllegalStateException("Element 'info_nong' has no child element; page layout may have changed");
        }
        String yearHtml = lunarElement.child(0).html();
        String lunarHtml = lunarElement.html();
        if (yearHtml.length() < 3 || lunarHtml.length() < LUNAR_HTML_PREFIX_LENGTH) {
            throw new IllegalStateException("Element 'info_nong' is shorter than expected; page layout may have changed");
        }
        String lunarDate = yearHtml.substring(1, 3) + lunarHtml.substring(LUNAR_HTML_PREFIX_LENGTH);

        // Heavenly-stems / earthly-branches date.
        String chineseAra = requireElement(document, "info_chang").text();

        // Recommended activities and activities to avoid.
        String should = getSuggestion(document, "yi");
        String avoid = getSuggestion(document, "ji");

        return new Almanac(solarDate, lunarDate, chineseAra, should, avoid);
    }

    /**
     * Collects the text of every {@code <a>} tag inside the element with the given id.
     *
     * @param doc parsed page
     * @param id  element id ("yi" for recommended, "ji" for to-avoid)
     * @return the link texts, each followed by a single space
     * @throws IllegalStateException if no element with that id exists
     */
    private static String getSuggestion(Document doc,String id){
        Elements links = requireElement(doc, id).getElementsByTag("a");
        StringBuilder sb = new StringBuilder();
        for (Element link : links) {
            sb.append(link.text()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Looks up an element by id and fails with a descriptive message when it is absent.
     *
     * @param doc parsed page
     * @param id  element id to look up
     * @return the element, never {@code null}
     * @throws IllegalStateException if the page has no element with that id
     */
    private static Element requireElement(Document doc, String id) {
        Element element = doc.getElementById(id);
        if (element == null) {
            throw new IllegalStateException("Element '" + id + "' not found; page layout may have changed");
        }
        return element;
    }

    /**
     * Formats the current date as {@code yyyy年MM月dd日 EEEE}.
     *
     * @return today's date, with the weekday name rendered in Chinese
     */
    private static String getSolarDate() {
        // Pin the locale so EEEE yields the Chinese weekday name regardless of
        // the JVM's default locale. Fully qualified to avoid a new import.
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy年MM月dd日 EEEE", java.util.Locale.CHINA);
        return formatter.format(new Date());
    }

}


为了简单明了我把抓取解析抽象成了几个独立的方法,其中pickData()方法使用httpClient来抓取数据到一个字符串中(就是在网页上点击查看源代码看到的HTML源码),analyzeHTMLByString()方法来解析抓取到的字符串,getSuggestion方法把抓取方法类似的宜忌数据抽象到了一起,另外因为公历时间可以很容易的自己生成就没有在网页上爬取。
(以下再次列出 AlmanacUtil.java 的完整代码,内容与上文相同;测试类 AlmanacUtilTest.java 见后文。)

 

package com.likx.picker.util;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helper that downloads the 2345.com online calendar page and extracts
 * today's almanac entry from it.
 *
 * <p>All members are static; the class cannot be instantiated.
 */
public class AlmanacUtil {

    /** Address of the calendar page that is scraped. */
    private static final String ALMANAC_URL = "http://tools.2345.com/rili.htm";

    /** Number of leading characters of the lunar element's HTML that are skipped. */
    private static final int LUNAR_HTML_PREFIX_LENGTH = 11;

    /**
     * Not instantiable: this is a static utility class (not a singleton).
     */
    private AlmanacUtil() {
    }

    /**
     * Downloads the calendar page and parses today's almanac entry out of it.
     *
     * @return the parsed almanac entry, never {@code null}
     * @throws IllegalStateException if the page cannot be downloaded or does
     *         not contain the elements this parser relies on
     */
    public static Almanac getAlmanac(){
        String html = pickData(ALMANAC_URL);
        if (html == null) {
            // pickData reports every failure as null; fail here with a clear
            // message instead of handing null to the HTML parser.
            throw new IllegalStateException("Unable to download almanac page: " + ALMANAC_URL);
        }
        return analyzeHTMLByString(html);
    }

    /**
     * Fetches the given URL with HttpClient and returns the response body.
     *
     * @param url address to fetch
     * @return the response body as a string, or {@code null} if the response
     *         has no entity or the request failed (the failure is printed to
     *         standard error)
     */
    private static String pickData(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // NOTE(review): no explicit charset is passed, so decoding
                    // relies on what the response declares - confirm the page
                    // sends one, otherwise Chinese text may come out garbled.
                    return EntityUtils.toString(entity);
                }
            } finally {
                response.close();
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            e.printStackTrace();
        } finally {
            // Release the client and its connections.
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Parses the page source with jsoup and assembles the almanac entry.
     *
     * @param html full HTML source of the calendar page, must not be {@code null}
     * @return the almanac entry built from the page plus the locally generated solar date
     * @throws IllegalStateException if an expected element is missing or shorter than expected
     */
    private static Almanac analyzeHTMLByString(String html){
        Document document = Jsoup.parse(html);

        // Solar date is generated locally rather than scraped.
        String solarDate = getSolarDate();

        // Lunar date: characters 1-2 of the first child's HTML, followed by
        // everything after the first 11 characters of the element's own HTML.
        // Presumably this strips a short bracketed-year tag - verify against the live page.
        Element lunarElement = requireElement(document, "info_nong");
        if (lunarElement.children().isEmpty()) {
            throw new IllegalStateException("Element 'info_nong' has no child element; page layout may have changed");
        }
        String yearHtml = lunarElement.child(0).html();
        String lunarHtml = lunarElement.html();
        if (yearHtml.length() < 3 || lunarHtml.length() < LUNAR_HTML_PREFIX_LENGTH) {
            throw new IllegalStateException("Element 'info_nong' is shorter than expected; page layout may have changed");
        }
        String lunarDate = yearHtml.substring(1, 3) + lunarHtml.substring(LUNAR_HTML_PREFIX_LENGTH);

        // Heavenly-stems / earthly-branches date.
        String chineseAra = requireElement(document, "info_chang").text();

        // Recommended activities and activities to avoid.
        String should = getSuggestion(document, "yi");
        String avoid = getSuggestion(document, "ji");

        return new Almanac(solarDate, lunarDate, chineseAra, should, avoid);
    }

    /**
     * Collects the text of every {@code <a>} tag inside the element with the given id.
     *
     * @param doc parsed page
     * @param id  element id ("yi" for recommended, "ji" for to-avoid)
     * @return the link texts, each followed by a single space
     * @throws IllegalStateException if no element with that id exists
     */
    private static String getSuggestion(Document doc,String id){
        Elements links = requireElement(doc, id).getElementsByTag("a");
        StringBuilder sb = new StringBuilder();
        for (Element link : links) {
            sb.append(link.text()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Looks up an element by id and fails with a descriptive message when it is absent.
     *
     * @param doc parsed page
     * @param id  element id to look up
     * @return the element, never {@code null}
     * @throws IllegalStateException if the page has no element with that id
     */
    private static Element requireElement(Document doc, String id) {
        Element element = doc.getElementById(id);
        if (element == null) {
            throw new IllegalStateException("Element '" + id + "' not found; page layout may have changed");
        }
        return element;
    }

    /**
     * Formats the current date as {@code yyyy年MM月dd日 EEEE}.
     *
     * @return today's date, with the weekday name rendered in Chinese
     */
    private static String getSolarDate() {
        // Pin the locale so EEEE yields the Chinese weekday name regardless of
        // the JVM's default locale. Fully qualified to avoid a new import.
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy年MM月dd日 EEEE", java.util.Locale.CHINA);
        return formatter.format(new Date());
    }

}


为了简单明了我把抓取解析抽象成了几个独立的方法,其中pickData()方法使用httpClient来抓取数据到一个字符串中(就是在网页上点击查看源代码看到的HTML源码),analyzeHTMLByString()方法来解析抓取到的字符串,getSuggestion方法把抓取方法类似的宜忌数据抽象到了一起,另外因为公历时间可以很容易的自己生成就没有在网页上爬取。

 

然后下面是一个测试类简单测试下效果:AlmanacUtilTest.java

 

package com.likx.picker.util;

/**
 * Console smoke test: fetches today's almanac entry and prints each of its
 * five fields on a separate, labelled line.
 */
public class AlmanacUtilTest {

    /**
     * Entry point; command-line arguments are ignored.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final Almanac today = AlmanacUtil.getAlmanac();

        // Build the labelled lines first, then emit them in order.
        final String[] report = {
            "公历时间:" + today.getSolar(),
            "农历时间:" + today.getLunar(),
            "天干地支:" + today.getChineseAra(),
            "宜:" + today.getShould(),
            "忌:" + today.getAvoid(),
        };
        for (String line : report) {
            System.out.println(line);
        }
    }
}


运行结果如下:

 

httpClient及jsoup抓取解析网页数据_第2张图片

 

集成到实际项目中效果是这样的:

httpClient及jsoup抓取解析网页数据_第3张图片

 

当然本例只是一个非常浅显的小例子,网页上内容也很容易抓取,httpClient及jsoup工具更多强大的地方没有体现到,比如httpClient不仅可以发送get请求,而且可以发送post请求,提交表单,传送文件,还比如jsoup最强大的地方在于它支持仿jquery的选择器。本例仅仅使用了最简单的document.getElementById()匹配元素,实际上jsoup的选择器异常强大,可以说它就是java版的jquery,比如这样:

// Selector examples; `doc` is assumed to be an already-parsed jsoup Document.
// <a> elements that have an href attribute.
Elements links = doc.select("a[href]");
// <img> elements whose src attribute ends with ".png".
Elements pngs = doc.select("img[src$=.png]");
// First <div> element with class "masthead".
Element masthead = doc.select("div.masthead").first();
// <a> elements that are direct children of an <h3> with class "r".
Elements resultLinks = doc.select("h3.r > a");

备注:HttpUtil

import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.nio.charset.Charset;

import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

@Component
public class HttpUtil{
	
	private static PoolingHttpClientConnectionManager connMgr;  
    private static RequestConfig requestConfig;  
    private static final int MAX_TIMEOUT = 800000;  
    private static HttpRequestRetryHandler httpRequestRetryHandler = null;
    static{
    	ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();
        Registry registry = RegistryBuilder.create()
                .register("http", plainsf)
                .register("https", sslsf)
                .build();	
    	// 设置连接池  
        connMgr = new PoolingHttpClientConnectionManager(registry);  
        // 设置连接池大小  
        connMgr.setMaxTotal(200);  
        connMgr.setDefaultMaxPerRoute(connMgr.getMaxTotal());  
  
        requestConfig = RequestConfig.custom()
        		.setConnectionRequestTimeout(MAX_TIMEOUT)
        		.setConnectTimeout(MAX_TIMEOUT)
        		.setSocketTimeout(MAX_TIMEOUT).build(); 
        
      //请求重试处理
       httpRequestRetryHandler = new HttpRequestRetryHandler() {
            public boolean retryRequest(IOException exception,int executionCount, HttpContext context) {
                if (executionCount >= 5) {// 如果已经重试了5次,就放弃                    
                    return false;
                }
                if (exception instanceof NoHttpResponseException) {// 如果服务器丢掉了连接,那么就重试                    
                    return true;
                }
                if (exception instanceof SSLHandshakeException) {// 不要重试SSL握手异常                    
                    return false;
                }                
                if (exception instanceof InterruptedIOException) {// 超时                    
                    return false;
                }
                if (exception instanceof UnknownHostException) {// 目标服务器不可达                    
                    return false;
                }
                if (exception instanceof ConnectTimeoutException) {// 连接被拒绝                    
                    return false;
                }
                if (exception instanceof SSLException) {// ssl握手异常                    
                    return false;
                }
                
                HttpClientContext clientContext = HttpClientContext.adapt( context);
                HttpRequest request = clientContext.getRequest();
                // 如果请求是幂等的,就再次尝试
                if (!(request instanceof HttpEntityEnclosingRequest)) {                    
                    return true;
                }
                return false;
            }
        };  
    }
    
    public static String doPost(String apiUrl,String params){
    	/*System.out.println("--------------apiUrl-----------"+apiUrl);
    	System.out.println("--------------params-----------"+params);*/
    	try{
    		if(StringUtils.isNoneBlank(apiUrl)){
    			CloseableHttpClient httpClient = HttpClients.custom()
        				.setDefaultRequestConfig(requestConfig)
                        .setConnectionManager(connMgr)
                        .setRetryHandler(httpRequestRetryHandler)
                        .build();
            	HttpPost httpPost = new HttpPost(apiUrl); 
            	
            	
            	if(StringUtils.isNoneBlank(params)){
            		StringEntity s = new StringEntity(params,Charset.forName("utf-8"));
                	s.setContentEncoding("UTF-8");
                	s.setContentType("application/json;charset=UTF-8");
                	httpPost.setEntity(s);
            	}
                CloseableHttpResponse response = httpClient.execute(httpPost);  
                int status = response.getStatusLine().getStatusCode();  
               /* System.out.println("--------------status-----------"+status);*/
                if (status >= 200 && status < 300) {  
                    HttpEntity entity = response.getEntity();  
                    String returnResponseStr="";  
                    if(entity != null)  
                    {  
                    	returnResponseStr=EntityUtils.toString(entity,"utf-8");  
                    }  
                    return returnResponseStr;
                } else {  
                    httpPost.abort();  
                    return null;
                }
    		}else{
    			return null;
    		}
    	}catch(Exception e){
    		e.printStackTrace();
    		return null;
    	}
    	
    }
    
    public static String doGet(String apiUrl){
    	try{
    		if(StringUtils.isNoneBlank(apiUrl)){
    			CloseableHttpClient httpClient = HttpClients.custom()
        				.setDefaultRequestConfig(requestConfig)
                        .setConnectionManager(connMgr)
                        .setRetryHandler(httpRequestRetryHandler)
                        .build();
            	HttpGet httpGet = new HttpGet(apiUrl); 
                CloseableHttpResponse response = httpClient.execute(httpGet);  
                int status = response.getStatusLine().getStatusCode();  
                if (status >= 200 && status < 300) {  
                    HttpEntity entity = response.getEntity();  
                    String returnResponseStr="";  
                    if(entity != null)  
                    {  
                    	returnResponseStr=EntityUtils.toString(entity,"utf-8");  
                    }  
                    return returnResponseStr;
                } else {   
                	httpGet.abort();  
                    return null;
                }
    		}else{
    			return null;
    		}
    	}catch(Exception e){
    		e.printStackTrace();
    		return null;
    	}
    	
    }

//    public static void main(String[] args) throws UnsupportedEncodingException {
//		String jsons = HttpUtil.doPost("http://120.27.9.235:8090/sendMessage","");
//		System.out.println(jsons);
//    }
    
}


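maven:
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.4.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>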

 

参考:

http://www.cnblogs.com/lkxsnow/p/5380164.html

http://www.cnblogs.com/roverliang/p/5176456.html

你可能感兴趣的:(技术猎奇,httpclient,jsoup,抓取解析网页数据)