java爬虫程序卡死的问题

我开发了一个爬虫程序使用的是httpclient4.3.5+jsoup1.7.2,发现在爬取数据的时候有的时候会卡住,发现每次都是执行到

response = httpClient.execute(httpGet);

这句话的时候。我百度之后,发现了一句话,说得很好:

我们知道Socket在读数据的时候是阻塞式的,如果没有读到数据程序会一直阻塞在那里。在同步请求的时候我们肯定是不能允许这样的情况发生的,这就需要我们在请求达到一定的时间后控制阻塞的中断,让程序得以继续运行。Socket为我们提供了一个setSoTimeout()方法来设置接收数据的超时时间,单位是毫秒。当设置的超时时间大于0,并且超过了这一时间Socket还没有接收到返回的数据的话,Socket就会抛出一个SocketTimeoutException。

于是我设置了sockettimeout,但是还是会卡住,有的人说将httpGet释放,response释放,对我的程序来说都没啥用,我发现卡住之后response会没有返回值。后来我在网上继续寻找答案,看到一篇博文,说是可以使用定时器来设置时间,如果httpClient.execute(httpGet)执行过长,将httpClient关闭。我最初写代码的时候将httpClient写成了单例!!!结果httpClient释放资源之后,再次请求,就报错,代码写了一段时间不看很容易忘记。

代码修改之后有了下面的代码:

/**
 * Crawls the page referenced by the {@code urlPojo} field through the given proxy IP.
 *
 * <p>A fresh {@link CloseableHttpClient} is created per call: a watchdog {@link Timer}
 * ({@code ReleaseIdleConnTask}) force-closes the client if {@code execute()} hangs for
 * more than 60 seconds, so a shared singleton client would break subsequent requests.
 *
 * @param proxy     proxy IP to route the request through; when {@code null} no request
 *                  is sent at all (original behavior), only a warning is printed
 * @param index     index of {@code proxy} in the {@code list} field, used to delete
 *                  the entry when the proxy turns out to be dead
 * @param proxyPath path of the proxy-list file passed to {@code deleteInvalidIP}
 * @return a {@code CrawlResultPojo}; {@code success} is {@code true} only when the
 *         server answered HTTP 200 and the body was read completely
 */
public CrawlResultPojo crawler(ProxyIP proxy, int index, String proxyPath) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        // Bail out early when no target URL has been configured.
        if (urlPojo == null || urlPojo.getUrl() == null || urlPojo.getUrl().equals("")) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }

        BufferedReader br = null;
        CloseableHttpResponse response = null;
        CloseableHttpClient httpClient = null;
        HttpGet httpGet = null;
        HttpPost httpPost = null;

        try {
            if (proxy == null) {
                // Original behavior: without a proxy nothing is fetched. (The
                // RequestConfig the old code built here was dead code — removed.)
                System.err.println("请使用代理IP进行爬去,或者调用crawl()!");
            } else {
                HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort());
                RequestConfig requestConfig = RequestConfig.custom()
                        .setProxy(httpHost)
                        .setConnectTimeout(50000)
                        .setSocketTimeout(5000)
                        .setConnectionRequestTimeout(5000)
                        .build();

                Map parasMap = urlPojo.getParasMap();
                if (urlPojo.getHttpMethodEnum() == HttpMethodEnum.GET) {
                    if (parasMap == null) {
                        httpGet = new HttpGet(urlPojo.getUrl());
                        // Mimic a real browser so the request looks legitimate.
                        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
                        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                        httpGet.setHeader("Connection", "keep-alive");
                        httpGet.setHeader("Cookie", "_ga=GA1.2.70069932.1538118002; cookie_consent_dismissed=1; _gid=GA1.2.1456316958.1539831293; PHPSESSID=v7g0jk1v5pkqjpjcdtiejjpj04; _gat=1");
                        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0");
                        httpGet.setHeader("Host", "www.devicespecifications.com");
                        httpGet.setHeader("Referer", "https://www.devicespecifications.com/en");
                    } else {
                        httpGet = new HttpGet(buildUriWithParams(parasMap));
                    }
                    httpGet.setConfig(requestConfig);
                } else {
                    if (parasMap == null) {
                        httpPost = new HttpPost(urlPojo.getUrl());
                        // Mimic a real browser so the request looks legitimate.
                        httpPost.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                        httpPost.setHeader("Accept-Encoding", "gzip, deflate");
                        httpPost.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                        httpPost.setHeader("Cache-Control", "no-cache");
                        httpPost.setHeader("Cookie", "screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D;");
                        httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
                    } else {
                        httpPost = new HttpPost(buildUriWithParams(parasMap));
                    }
                    httpPost.setConfig(requestConfig);
                }

                // GET and POST were handled by two ~40-line copies of the same
                // code; they only differ in which request object is executed
                // and released, so they are merged into this single path.
                try {
                    // Fresh client per request — the watchdog may close it.
                    httpClient = HttpClients.custom().build();
                    timer = new Timer();
                    timerTask = new ReleaseIdleConnTask(httpClient);
                    // Watchdog: force-close the client if execute() hangs > 60 s,
                    // which makes the blocked call fail instead of hanging forever.
                    timer.schedule(timerTask, 60 * 1000);

                    response = (httpGet != null)
                            ? httpClient.execute(httpGet)
                            : httpClient.execute(httpPost);
                    int httpCode = response.getStatusLine().getStatusCode();
                    if (httpCode == 200) {
                        System.out.println("开始正常爬取网页");
                        HttpEntity entity = response.getEntity();
                        InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8");
                        br = new BufferedReader(isr);
                        StringBuilder stringBuilder = new StringBuilder();
                        String line;
                        while ((line = br.readLine()) != null) {
                            stringBuilder.append(line).append("\n");
                        }
                        crawlResultPojo.setSuccess(true);
                        crawlResultPojo.setPageContent(stringBuilder.toString());
                    } else if (list != null) {
                        // Non-200 answer through this proxy: drop it from the pool.
                        deleteInvalidIP(proxyPath, list.get(index));
                        System.err.println("爬取的网页不正常!");
                    } else {
                        System.err.println("没有好用的代理IP了!");
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    // BUGFIX: the old code called list.get(index) here without the
                    // null check the non-200 branch has, NPE-ing once the proxy
                    // list was exhausted.
                    if (list != null) {
                        deleteInvalidIP(proxyPath, list.get(index));
                    }
                } finally {
                    timer.cancel();
                    if (httpGet != null) {
                        httpGet.releaseConnection();
                    }
                    if (httpPost != null) {
                        httpPost.releaseConnection();
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        } finally {
            // BUGFIX: every close is now null-guarded and done exactly once.
            // The old code called httpClient.close() unconditionally whenever
            // response was null, throwing an NPE on the proxy == null path,
            // and could close the client twice on the success path.
            closeQuietly(br);
            closeQuietly(response);
            closeQuietly(httpClient);
        }
        return crawlResultPojo;
    }

    /**
     * Builds the request URI for {@code urlPojo.getUrl()} with the entries of
     * {@code parasMap} as query parameters.
     *
     * @param parasMap non-null map of parameter name → value (raw, as supplied
     *                 by {@code urlPojo.getParasMap()})
     * @return the assembled {@code URI}
     * @throws Exception if the base URL is not a valid URI
     */
    private java.net.URI buildUriWithParams(Map parasMap) throws Exception {
        URIBuilder uriBuilder = new URIBuilder(urlPojo.getUrl());
        List params = new LinkedList<>();
        for (Object o : parasMap.entrySet()) {
            Map.Entry entry = (Map.Entry) o;
            params.add(new BasicNameValuePair(entry.getKey().toString(), entry.getValue().toString()));
        }
        // BUGFIX: setParameters replaces the whole parameter list, so the old
        // code's call on every loop iteration was redundant — once is enough.
        uriBuilder.setParameters(params);
        return uriBuilder.build();
    }

    /**
     * Closes a resource if non-null, logging (but not propagating) failures so
     * one failed close cannot prevent the others from running.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("流最终未关闭!");
        }
    }


 

你可能感兴趣的:(java)