代理Ip的爬取-验证-PAC脚本服务

代码包含了3种方式的验证

  1. 通过已获取的文件IP
  2. 通过生成ip字符串
  3. 通过代理网站的爬取验证

验证成功的 IP 会保存到一个文件中,随后程序在 127.0.0.1:9999 上开启一个 Socket 服务器。
在浏览器中访问该地址,返回的内容如下:

var ips = ["210.101.131.229:8080","47.91.138.21:3128"];
// Proxy currently in use ("host:port"), or "DIRECT"; undefined until a trigger URL is seen.
var current;

/**
 * PAC entry point: decides how the browser should reach the given URL.
 *
 * Certain substrings in the URL act as switches that change the proxy used
 * for this and all following requests:
 *   "next"   - pick a random proxy from ips
 *   "fidder" - use the local proxy at 127.0.0.1:8888
 *   "direct" - stop using a proxy
 *
 * @param url  full URL being requested
 * @param host host name extracted from the URL
 * @returns "DIRECT" or "PROXY host:port"
 */
function FindProxyForURL(url, host) {

    if (url.indexOf("next") > 0) {
        var len = Math.floor(Math.random() * 2);
        current = ips[len];
    }

    if (url.indexOf("fidder") > 0) {
        current = "127.0.0.1:8888"; // Fiddler proxy
    }

    if (url.indexOf("direct") > 0) {
        current = "DIRECT";
    }

    // Hosts resolving into 127.0.0.0/255.255.255.0 (e.g. localhost) always connect directly.
    if (isInNet(dnsResolve(host), "127.0.0.0", "255.255.255.0")) {
        return "DIRECT";
    }

    // "DIRECT" is a complete PAC result by itself, and before any proxy has been
    // selected there is nothing valid to place after "PROXY".
    if (!current || current === "DIRECT") {
        return "DIRECT";
    }

    return "PROXY " + current;

}

/**
 * Requests http://localhost:9999 asynchronously and passes the response body on.
 *
 * @param callback object whose done(text) method is called with the response
 *                 text once the request has completed with HTTP status 200
 */
function httpGet(callback)
{
    // Use the native XMLHttpRequest when the window exposes one, otherwise the ActiveX object.
    var request = window.XMLHttpRequest
        ? new XMLHttpRequest()
        : new ActiveXObject("Microsoft.XMLHTTP");

    request.onreadystatechange = function ()
    {
        var finished = request.readyState == 4;
        if (!finished || request.status != 200)
        {
            return;
        }
        callback.done(request.responseText);
    };

    request.open("GET", "http://localhost:9999", true); // true = asynchronous
    request.send();
}

该内容是Pac脚本,关于Pac脚本的使用方式请参考其他教程

实现代码

验证 IP 是否可用的原理非常简单:通过代理向某个网站发起请求,再根据服务器返回的结果即可判断(当然还有其他方式)。

因为当时只是随手写的,就直接用了 HttpClient,之后也没有再修改。这里更推荐使用 OkHttp:它不需要自己编写线程池等并发相关的代码,用起来更加简单。

public class IPCheck {
    // 代理666的IP提取地址
    private static ServerSocket serverSocket = null;// 保证代理服务器只需要一个实例即可

    static final int workerNumber = 4;// 线程池保留数量,服务器为8核cpu,合适的数量应该小于8

    static final int maxPoolSize = 256;// 最大线程数量,即最大并发量

    static final int maxWorkerInQueue = 2500;// 最大工作队列数量

    static final int waitTime = 5;// 超时等待时间

    private static final ThreadPoolExecutor tpe = new ThreadPoolExecutor(workerNumber, maxPoolSize, waitTime,
            TimeUnit.SECONDS, new ArrayBlockingQueue(maxWorkerInQueue));

    // 使用上面或下面的线程池
    private static ExecutorService executor = Executors.newFixedThreadPool(100);
    private static List> proxyIps = new ArrayList<>();
    private static CloseableHttpClient client = null;
    private static String host = "http://www.qq.com/robots.txt";
    private static AtomicInteger atomicInteger = new AtomicInteger();

    private static CopyOnWriteArrayList ipList = new CopyOnWriteArrayList<>();

    static {

        client = HttpClients.createDefault();
    }

    public static void main(String[] args) throws Exception {

        testByFile();
//      testByCustom();
//      testByParse(1);
        executor.shutdown();

        while (true) {
            if (executor.isTerminated()) {
                System.out.println("所有的子线程都结束了!");
                break;
            }
            Thread.sleep(1000);
        }

        StringBuilder builder = new StringBuilder();
        builder.append("[");
        for (int i = 0; i < ipList.size(); i++) {
            String ipStr = ipList.get(i);
            builder.append("\"");
            builder.append(ipStr);
            builder.append("\"");

            if (i != ipList.size() - 1) {
                builder.append(",");
            }
        }
        builder.append("]");

        String js = StringUtils.readFile("src/templet.txt");

        final String response = js.replace("#ips#", builder.toString()).replace("#len#", ipList.size() + "");

        // 开启服务器
        System.out.println("=====runing at 127.0.0.1:9999 ======");
        ThreadPoolExecutor executor = ExecutorsUtils.tpe;
        serverSocket = new ServerSocket(9999);

        while (true) {

            final Socket browserSocket = serverSocket.accept();
            executor.execute(new Thread(new Runnable() {
                public void run() {
                    try {
                        System.out.println("=======处理请求========");
                        OutputStream chromeOutputStream = browserSocket.getOutputStream();
                        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(chromeOutputStream));
                        printWriter.write(response);
                        printWriter.close();

                    } catch (IOException e) {
                        e.printStackTrace();
                    }

                }
            }));

        }

    }

    private static void testByParse(int page) {
        System.out.println("begin parse");

        // String url = "http://www.xicidaili.com/nn/"+page;
        String url = "http://www.xicidaili.com/wt/" + page;
        try {
            Connection con = Jsoup.connect(url).timeout(3000);
            con.header("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            con.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");

            Document document = con.get();
            Elements table = document.select("#ip_list");
            Elements trList = table.select("tr");

            System.out.println("Size: " + trList.size());

            List ipBeans = new ArrayList<>();
            IPBean ipBean = null;

            trList.remove(0);
            for (int i = 0; i < trList.size(); i++) {

                Element tr = trList.get(i);

                String country = tr.select("td.country").get(0).html();// Cn
                String ip = tr.select("td").get(1).text();// ip
                String port = tr.select("td").get(2).text();// port
                String area = tr.select("td").get(3).select("a").text();// 江苏苏州
                String type = tr.select("td").get(4).text();// 高匿还是普通
                String protocol = tr.select("td").get(5).text();// 协议类型
                String speed = tr.select("td").get(6).select("div.bar").attr("title");// 速度
                String connectTimeout = tr.select("td").get(7).select("div.bar").attr("title");// 连接时间
                String survivalTimeout = tr.select("td").get(8).text();// 存活时间
                String checkTime = tr.select("td").get(9).text();// 验证时间

                ipBean = new IPBean(country, ip, port, area, type, protocol, speed, connectTimeout, survivalTimeout,
                        checkTime);

                ipBeans.add(ipBean);
            }

            for (IPBean ip : ipBeans) {
                final String ipStr = ip.getIp();
                final int port = Integer.parseInt(ip.getPort());

                executor.submit(new Runnable() {

                    @Override
                    public void run() {
                        sendRequest(ipStr, port);

                    }
                });

            }

        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    private static void testByCustom() {

        final int port = 80;
        for (int i = 5; i < 255; i++) {
            for (int j = 0; j < 255; j++) {
                final String ip = "221.216." + i + "." + j;

                tpe.execute(new Runnable() {

                    @Override
                    public void run() {
                        sendRequest(ip, port);

                    }
                });

            }
        }
    }

    private static void testByFile() {
        initProxyIp();

        for (Map map : proxyIps) {
            Entry entry = map.entrySet().iterator().next();
            final String ip = entry.getKey();
            final int port = entry.getValue();

            executor.submit(new Runnable() {

                @Override
                public void run() {
                    sendRequest(ip, port);

                }
            });

        }

    }

    private static void sendRequest(String ip, int port) {
        System.out.println("当前访问的代理是:" + ip + ":" + port + "  已发送的请求数是:" + atomicInteger.incrementAndGet());

        HttpGet get = new HttpGet(host);
        // 对单个请求设置代理
        HttpHost proxy = new HttpHost(ip, port);

        RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectionRequestTimeout(5000)
                .setSocketTimeout(5000).build();

        get.setConfig(config);
        get.setHeader("Referer", "http://www.qq.com/");
        get.setHeader("Host", "www.qq.com");
        get.setHeader("Accept-Encoding", "gzip, deflate, sdch");
        get.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
        get.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");

        try {
            HttpResponse response = client.execute(get);
            String content = EntityUtils.toString(response.getEntity());

            if (content.contains("http://www.qq.com/sitemap_index.xml")) {
                System.out.println(ip + ":" + port + "代理可用!!!");
                // 写入文件
                StringUtils.write(ip + ":" + port, "C:\\Users\\Administrator\\Desktop\\ip.txt", true);
                // 写入copyList
                ipList.add(ip + ":" + port);

            } else {
                System.out.println(ip + "不可用");
            }

        } catch (Exception e) {
            System.out.println(ip + " Connection timed out 不可用");
        }

    }

    private static void initProxyIp() {
        String content = StringUtils.readFile("src/ips.txt");
        String ips[] = content.split("\r\n");
        Map map;
        for (String line : ips) {
            String ip_port[] = line.split(":");
            String ip = ip_port[0];
            int port = Integer.parseInt(ip_port[1]);

            map = new HashMap<>();
            map.put(ip, port);

            proxyIps.add(map);
        }

    }

}

/**
 * One row of a scraped proxy listing. Every column is kept as the raw text taken from the page.
 */
class IPBean {
    String country;
    String ip;
    String port;
    String area;
    String type;
    String protocol;
    String speed;
    String connectTimeout;
    String survivalTimeout;
    String checkTime;

    /** Creates a bean from the ten column values, in listing order. */
    public IPBean(String country, String ip, String port, String area, String type, String protocol, String speed,
            String connectTimeout, String survivalTimeout, String checkTime) {
        this.country = country;
        this.ip = ip;
        this.port = port;
        this.area = area;
        this.type = type;
        this.protocol = protocol;
        this.speed = speed;
        this.connectTimeout = connectTimeout;
        this.survivalTimeout = survivalTimeout;
        this.checkTime = checkTime;
    }

    // ---- accessors ----

    public String getCountry() {
        return country;
    }

    public String getIp() {
        return ip;
    }

    public String getPort() {
        return port;
    }

    public String getArea() {
        return area;
    }

    public String getType() {
        return type;
    }

    public String getProtocol() {
        return protocol;
    }

    public String getSpeed() {
        return speed;
    }

    public String getConnectTimeout() {
        return connectTimeout;
    }

    public String getSurvivalTimeout() {
        return survivalTimeout;
    }

    public String getCheckTime() {
        return checkTime;
    }

    // ---- mutators ----

    public void setCountry(String country) {
        this.country = country;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public void setType(String type) {
        this.type = type;
    }

    public void setProtocol(String protocol) {
        this.protocol = protocol;
    }

    public void setSpeed(String speed) {
        this.speed = speed;
    }

    public void setConnectTimeout(String connectTimeout) {
        this.connectTimeout = connectTimeout;
    }

    public void setSurvivalTimeout(String survivalTimeout) {
        this.survivalTimeout = survivalTimeout;
    }

    public void setCheckTime(String checkTime) {
        this.checkTime = checkTime;
    }

    /** Renders all ten fields as {@code IPBean [name=value, ...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("IPBean [");
        text.append("country=").append(country);
        text.append(", ip=").append(ip);
        text.append(", port=").append(port);
        text.append(", area=").append(area);
        text.append(", type=").append(type);
        text.append(", protocol=").append(protocol);
        text.append(", speed=").append(speed);
        text.append(", connectTimeout=").append(connectTimeout);
        text.append(", survivalTimeout=").append(survivalTimeout);
        text.append(", checkTime=").append(checkTime);
        return text.append("]").toString();
    }

}

代码中使用到的模板代码如下

var ips = #ips#;
var current ;
function FindProxyForURL(url,host){
                
        if(url.indexOf("next")>0){
            var len = Math.floor( Math.random() * #len#);
            current = ips[len];
        }
        
        
        if(url.indexOf("fidder")>0){
            current = "127.0.0.1:8888"; //fidder 代理
        }
        
        if(url.indexOf("direct")>0){
            current = "DIRECT";
        }
        
        if(isInNet(dnsResolve(host), "127.0.0.0", "255.255.255.0")){//如果是本地主机,localhost 直接连接
            return "DIRECT";
        }
        
        
        return "PROXY "+current;
        
}

/**
 * Requests http://localhost:9999 asynchronously and passes the response body on.
 *
 * @param callback object whose done(text) method is called with the response
 *                 text once the request has completed with HTTP status 200
 */
function httpGet(callback)
{
    // Use the native XMLHttpRequest when the window exposes one, otherwise the ActiveX object.
    var request = window.XMLHttpRequest
        ? new XMLHttpRequest()
        : new ActiveXObject("Microsoft.XMLHTTP");

    request.onreadystatechange = function ()
    {
        var finished = request.readyState == 4;
        if (!finished || request.status != 200)
        {
            return;
        }
        callback.done(request.responseText);
    };

    request.open("GET", "http://localhost:9999", true); // true = asynchronous
    request.send();
}

你可能感兴趣的:(代理Ip的爬取-验证-PAC脚本服务)