httpclient4.5 结合 讯代理 实现IP代理


遇见问题


        嘘嘘,不要让太多的人知道!!!! 

        代理IP是爬虫工作必要的开销,那么如何很好地利用各家服务商提供的免费代理IP呢?


使用方案

       httpclient4.5 结合 讯代理 实现IP代理_第1张图片


     讲解一下,就是在爬取之前先到各家服务商那里爬取最新的可利用代理IP,然后放在IP池里,然后再去爬取目标网站。如此一来,维护好这个IP池,就可以源源不断地接收新的可以使用的代理IP,剩下的工作就是从网上四处寻找代理网站了。


给个demo吧

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.util.ArrayList;
import java.util.List;

/**
 * 使用免费代理demo
 *
 * Created by zc on 2017/8/11. */
public class HttpClientUtilTest {


    /**
     * Demo entry point: fetches the proxy list, registers this machine in the provider's
     * whitelist, then requests the target page through one of the fetched proxies.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if any HTTP call fails
     * @throws IllegalStateException if the fetched proxy list is empty
     */
    public static void main(String[] args) throws Exception {
        // Step 1: scrape the proxy pool.
        List<ProxyModel> proxyModelList = spriderProxyIp();
        if (proxyModelList.isEmpty()) {
            throw new IllegalStateException("No proxy available: the fetched proxy list is empty");
        }

        // The demo uses the third entry; fall back to the last one when fewer than three
        // proxies were returned instead of failing with IndexOutOfBoundsException.
        ProxyModel chosen = proxyModelList.get(Math.min(2, proxyModelList.size() - 1));
        String ip = chosen.getIp();
        int port = chosen.getPort();
        System.out.println(ip + " " + port + " " + chosen.getAnony());

        // Step 2: add this machine to the provider's whitelist.
        whilteList();

        // Step 3: request the target address through the chosen proxy.
        reqWeb(ip, port);
    }

    /**
     * Requests the target address through the given HTTP proxy and prints the response body
     * to standard output.
     *
     * @param ip   proxy host (IP address)
     * @param port proxy port
     * @throws Exception if the request fails or the response body cannot be read
     */
    private static void reqWeb(String ip, int port) throws Exception {
        HttpHost proxy = new HttpHost(ip, port);
        String url = "http://write.blog.csdn.net/postedit/77099632";
        HttpGet request = new HttpGet(url);
        // try-with-resources closes the response first and then the client, so the
        // underlying connection is released even when reading the body throws.
        try (CloseableHttpClient client = HttpClients.custom().setProxy(proxy).build();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            System.out.println(EntityUtils.toString(entity));
        }
    }

    /**
     * Adds this machine's public IP to the proxy provider's whitelist and prints the HTTP
     * status line of the reply.
     *
     * <p>NOTE(review): the {@code spiderId} and {@code ip} values in the URL look like
     * placeholders; replace them with real values before running.
     *
     * @throws Exception if the request fails
     */
    private static void whilteList() throws Exception {
        // The "&" separator keeps "ip" a query parameter of its own; without it the IP is
        // appended to the spiderId value.
        String url = "http://www.xdaili.cn/ipagent/whilteList/addIp?spiderId=dce0442efaac42618205f177c2xxxx&ip=xx.xx.xx.xx";
        HttpGet request = new HttpGet(url);
        // Close both the response and the client so the connection is not leaked.
        try (CloseableHttpClient client = HttpClients.custom().build();
             CloseableHttpResponse response = client.execute(request)) {
            System.out.println(response.getStatusLine());
        }
    }

    /**
     * Fetches the first page of the provider's free proxy list and keeps only the entries
     * whose {@code anony} field equals {@code "高匿"}.
     *
     * @return the matching proxies in response order; empty (never {@code null}) when the
     *         response has no {@code rows} array or no entry matches
     * @throws Exception if the request fails or the response cannot be parsed
     */
    private static List<ProxyModel> spriderProxyIp() throws Exception {
        List<ProxyModel> proxyModelList = new ArrayList<>();
        String url = "http://www.xdaili.cn/ipagent//freeip/getFreeIps?page=1&rows=10";
        HttpGet request = new HttpGet(url);

        String resTxt;
        // Close both the response and the client so the connection is not leaked.
        try (CloseableHttpClient client = HttpClients.custom().build();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            // UTF-8 is only the fallback used when the response declares no charset; it keeps
            // the non-ASCII "anony" comparison below from depending on the decoder default.
            resTxt = EntityUtils.toString(entity, "UTF-8");
        }

        JSONObject jsonObject = JSON.parseObject(resTxt);
        JSONArray rows = jsonObject == null ? null : jsonObject.getJSONArray("rows");
        if (rows == null) {
            // Empty body or no "rows" key: nothing to collect.
            return proxyModelList;
        }

        for (Object row : rows) {
            JSONObject v = (JSONObject) row;
            // Constant-first equals tolerates entries without an "anony" field.
            if (!"高匿".equals(v.getString("anony"))) {
                continue;
            }
            ProxyModel model = new ProxyModel();
            model.setIp(v.getString("ip"));
            model.setPort(Integer.parseInt(v.getString("port")));
            model.setResponsetime(v.getString("responsetime"));
            model.setAnony(v.getString("anony"));
            proxyModelList.add(model);
        }
        return proxyModelList;
    }

    /**
     * Mutable holder for one proxy entry: host, port, and the {@code responsetime} and
     * {@code anony} values stored alongside them.
     */
    public static class ProxyModel {
        /** Proxy host. */
        private String ip;
        /** Proxy port. */
        private int port;
        /** Response time, stored as text exactly as supplied. */
        private String responsetime;
        /** Value of the {@code anony} attribute, stored as text exactly as supplied. */
        private String anony;

        // ---- getters ----

        /** @return the proxy host, or {@code null} if it was never set */
        public String getIp() {
            return this.ip;
        }

        /** @return the proxy port, or {@code 0} if it was never set */
        public int getPort() {
            return this.port;
        }

        /** @return the stored response-time text, or {@code null} if it was never set */
        public String getResponsetime() {
            return this.responsetime;
        }

        /** @return the stored {@code anony} text, or {@code null} if it was never set */
        public String getAnony() {
            return this.anony;
        }

        // ---- setters ----

        /** @param ip the proxy host to store */
        public void setIp(String ip) {
            this.ip = ip;
        }

        /** @param port the proxy port to store */
        public void setPort(int port) {
            this.port = port;
        }

        /** @param responsetime the response-time text to store */
        public void setResponsetime(String responsetime) {
            this.responsetime = responsetime;
        }

        /** @param anony the {@code anony} text to store */
        public void setAnony(String anony) {
            this.anony = anony;
        }
    }





你可能感兴趣的:(爬虫,技巧篇)