Java Crawler: A Simple HttpClient-Based Demo (Part 2)

Following on from Demo 1, "Two Ways to Crawl in Java (HTTP || Socket): A Simple Demo (Part 1)", Demo 2 is ready. Anyone who wants to learn crawling can study from this Baidu network drive: https://pan.baidu.com/s/1pJJrcqJ#list/path=%2F

The course there is free and quite good. The main thing to learn is really HttpClient itself; its documentation is all in English, which wears me out to read.
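The demo targets the HttpClient 4.x API (the org.apache.httpcomponents:httpclient artifact; RequestBuilder needs 4.3 or later). Before the full class, here is the core pattern it builds on as a minimal standalone sketch: build a client, execute a request, read the entity, close the response. The target URL is just a placeholder:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class MinimalGetDemo {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient client = HttpClients.createDefault();     // default client configuration
        HttpGet get = new HttpGet("http://www.example.com");          // placeholder URL
        try (CloseableHttpResponse response = client.execute(get)) { // CloseableHttpResponse is auto-closed
            // EntityUtils.toString drains the body and decodes it with the given charset
            System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
        } finally {
            client.close();
        }
    }
}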



package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
 *
 * Created by lewis on 2016/10/16.
 */
public class HttpClientCrawlerImpl implements ICrawl {

    public CloseableHttpClient httpClient = HttpClients.custom().build();           // a customized HttpClient shared by both crawl methods
    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {

        if (urlpojo == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();                // result holder
        CloseableHttpResponse response = null;                                  // the full HTTP response: status line, headers, entity
        BufferedReader br = null;                                               // reader for the response body

        try {
            HttpGet httpGet = new HttpGet(urlpojo.getUrl());
            response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();                                       // the response body
            InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8");    // decode the byte stream as UTF-8 characters
            br = new BufferedReader(isr);

            String line = null;
            StringBuilder context = new StringBuilder();

            while ((line = br.readLine()) != null) {
                context.append(line).append('\n');
            }

            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(context.toString());

            return crawlResultPojo;

        } catch (IOException e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        } finally {
            try {
                if (br != null)
                    br.close();                                                                 // close the reader
                if (response != null)
                    response.close();                                                           // release the connection
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return crawlResultPojo;
    }

    /**
     * Crawl via POST, sending the parameters carried by the UrlPojo.
     */
    public CrawlResultPojo crawl4Post(UrlPojo urlPojo) {
        if (urlPojo == null || urlPojo.getUrl() == null) {
            return null;
        }

        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        CloseableHttpResponse response = null;
        BufferedReader br = null;

        try {

            RequestBuilder rb = RequestBuilder.post().setUri(new URI(urlPojo.getUrl()));
            Map<String, Object> parasMap = urlPojo.getParasMap();
            if (parasMap != null) {
                // parameters added to a POST via RequestBuilder are sent as a form-encoded body
                for (Entry<String, Object> entry : parasMap.entrySet()) {
                    rb.addParameter(entry.getKey(), entry.getValue().toString());
                }
            }
            HttpUriRequest httpUriRequest = rb.build();
            response = httpClient.execute(httpUriRequest);
            HttpEntity entity = response.getEntity();
            InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8");
            br = new BufferedReader(isr);

            String line = null;
            StringBuilder stringBuilder = new StringBuilder();

            while ((line = br.readLine()) != null) {
                stringBuilder.append(line).append('\n');
            }

            crawlResultPojo.setPageContent(stringBuilder.toString());
            crawlResultPojo.setSuccess(true);

            return crawlResultPojo;
        } catch (URISyntaxException | IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null)
                    br.close();
                if (response != null)
                    response.close();                                           // release the connection
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        crawlResultPojo.setSuccess(false);
        return crawlResultPojo;
    }

    public static void main(String[] args) {

        HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();
        String url = "http://www.wangdaizhijia.com/front_select-plat";
        UrlPojo urlPojo = new UrlPojo(url);
        Map<String, Object> parasMap = new HashMap<>();

        parasMap.put("currPage", 30);
        parasMap.put("params", "");
        parasMap.put("sort", 0);
        urlPojo.setParasMap(parasMap);

        CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo);
        print(resultPojo);
        resultPojo = httpClientCrawlerImpl.crawl(urlPojo);
        print(resultPojo);
    }

    public static void print(Object s) {
        System.out.println(s);
    }

}
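The class above compiles against the UrlPojo and CrawlResultPojo classes from Part 1, which are not repeated here. For readers without Demo 1 at hand, here are minimal sketches with fields inferred from the getters and setters used above; the actual classes from the course may differ:

// UrlPojo.java -- minimal sketch, inferred from how the crawler uses it
package com.simple.pojos;

import java.util.Map;

public class UrlPojo {
    private String url;
    private Map<String, Object> parasMap;   // POST parameters, may be null for plain GETs

    public UrlPojo(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public Map<String, Object> getParasMap() {
        return parasMap;
    }

    public void setParasMap(Map<String, Object> parasMap) {
        this.parasMap = parasMap;
    }
}

// CrawlResultPojo.java -- minimal sketch; toString() lets print() show the result
package com.simple.pojos;

public class CrawlResultPojo {
    private boolean success;
    private String pageContent;

    public void setSuccess(boolean success) {
        this.success = success;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    @Override
    public String toString() {
        return "success=" + success + ", pageContent=" + pageContent;
    }
}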

