I wrote a crawler with httpclient 4.3.5 + jsoup 1.7.2 and found that it sometimes hangs while crawling. Every time, it is stuck on this line:
response = httpClient.execute(httpGet);
After searching around, I found a passage that explains the problem well:
We know that Socket reads are blocking: if no data arrives, the program just sits there blocked. For a synchronous request we obviously cannot allow that, so once the request has been waiting for a certain amount of time we need to interrupt the blocking and let the program continue. Socket provides a setSoTimeout() method to set the receive timeout, in milliseconds. If the timeout is greater than 0 and the Socket still has not received any data when it elapses, the Socket throws a SocketTimeoutException.
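To make the quoted behavior concrete, here is a minimal sketch with a plain java.net.Socket. It is only an illustration, not part of the crawler; the class name SoTimeoutDemo, the host example.com and the 5-second timeout are arbitrary choices of mine.

import java.io.InputStream;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class SoTimeoutDemo {
    public static void main(String[] args) throws Exception {
        try (Socket socket = new Socket("example.com", 80)) {
            socket.setSoTimeout(5000);  // wait at most 5 seconds for incoming data
            socket.getOutputStream().write("GET / HTTP/1.0\r\nHost: example.com\r\n\r\n".getBytes("US-ASCII"));
            InputStream in = socket.getInputStream();
            int firstByte = in.read();  // blocks until data arrives or the timeout fires
            System.out.println("first byte of the response: " + firstByte);
        } catch (SocketTimeoutException e) {
            // no data arrived within 5 seconds; the read is interrupted instead of blocking forever
            System.err.println("read timed out: " + e.getMessage());
        }
    }
}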
So I set the socket timeout, but the crawler still hung. Some people suggested releasing the httpGet or the response, but that did nothing for my program; when it hangs, response is never even assigned. I kept searching online and found a blog post suggesting a timer: if httpClient.execute(httpGet) runs for too long, close the httpClient. When I first wrote the code I had made httpClient a singleton!!! So once the client's resources were released, the next request threw an error. Code you have not looked at for a while is easy to forget.
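The ReleaseIdleConnTask scheduled in the code below is not included in the listing. Here is a minimal sketch of what such a TimerTask might look like, assuming it does nothing more than close the CloseableHttpClient handed to it, which is what the blog post describes. Closing the client shuts down its connection manager, which should make a request stuck in execute() fail with an exception instead of blocking forever.

import java.io.IOException;
import java.util.TimerTask;

import org.apache.http.impl.client.CloseableHttpClient;

// Watchdog task: if execute() has not returned by the time the Timer fires,
// closing the client forces the blocked request to abort with an exception.
public class ReleaseIdleConnTask extends TimerTask {

    private final CloseableHttpClient httpClient;

    public ReleaseIdleConnTask(CloseableHttpClient httpClient) {
        this.httpClient = httpClient;
    }

    @Override
    public void run() {
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Because the watchdog closes the client, that client cannot be reused afterwards. This is why the singleton httpClient had to go, and the code below builds a fresh client for every request.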
After the changes, the code looks like this:
public CrawlResultPojo crawler(ProxyIP proxy, int index, String proxyPath) {
    CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
    if (urlPojo == null || urlPojo.getUrl() == null || urlPojo.getUrl().equals("")) {
        crawlResultPojo.setSuccess(false);
        crawlResultPojo.setPageContent(null);
        return crawlResultPojo;
    }
    // urlPojo, list, timer, timerTask, ReleaseIdleConnTask and deleteInvalidIP()
    // are members of the enclosing class and are not shown in this listing.
    BufferedReader br = null;
    CloseableHttpResponse response = null;
    RequestConfig requestConfig = null;
    URIBuilder uriBuilder = null;
    Map<String, Object> parasMap = null;   // assumes getParasMap() returns Map<String, Object>
    List<NameValuePair> params = null;
    BasicNameValuePair basicNameValuePair = null;
    HttpGet httpGet = null;
    HttpPost httpPost = null;
    HttpEntity entity = null;
    InputStreamReader isr = null;
    int httpCode = 0;
    CloseableHttpClient httpClient = null;
    try {
        params = new LinkedList<>();
        parasMap = urlPojo.getParasMap();
        uriBuilder = new URIBuilder(urlPojo.getUrl());
        if (proxy == null) {
            requestConfig = RequestConfig.custom().setConnectTimeout(2000).setSocketTimeout(5000)
                    .setConnectionRequestTimeout(2000).build();
            System.err.println("Please use a proxy IP to crawl, or call crawl() instead!");
        } else {
            HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort());
            requestConfig = RequestConfig.custom().setProxy(httpHost).setConnectTimeout(50000).setSocketTimeout(5000)
                    .setConnectionRequestTimeout(5000).build();
            if (urlPojo.getHttpMethodEnum() == HttpMethodEnum.GET) {
                if (parasMap == null) {
                    httpGet = new HttpGet(urlPojo.getUrl());
                    httpGet.setConfig(requestConfig);
                    // Browser-like headers, so the request looks the same as one sent by a real browser
                    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                    httpGet.setHeader("Accept-Encoding", "gzip, deflate");
                    httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                    httpGet.setHeader("Connection", "keep-alive");
                    httpGet.setHeader("Cookie", "_ga=GA1.2.70069932.1538118002; cookie_consent_dismissed=1; _gid=GA1.2.1456316958.1539831293; PHPSESSID=v7g0jk1v5pkqjpjcdtiejjpj04; _gat=1");
                    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0");
                    httpGet.setHeader("Host", "www.devicespecifications.com");
                    httpGet.setHeader("Referer", "https://www.devicespecifications.com/en");
                } else {
                    for (Entry<String, Object> entry : parasMap.entrySet()) {
                        String key = entry.getKey();
                        Object value = entry.getValue();
                        basicNameValuePair = new BasicNameValuePair(key, value.toString());
                        params.add(basicNameValuePair);
                    }
                    uriBuilder.setParameters(params);
                    httpGet = new HttpGet(uriBuilder.build());
                    httpGet.setConfig(requestConfig);
                }
            } else {
                if (parasMap == null) {
                    httpPost = new HttpPost(urlPojo.getUrl());
                    httpPost.setConfig(requestConfig);
                    // Browser-like headers
                    httpPost.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                    httpPost.setHeader("Accept-Encoding", "gzip, deflate");
                    httpPost.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                    httpPost.setHeader("Cache-Control", "no-cache");
                    httpPost.setHeader("Cookie", "screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D;");
                    httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
                } else {
                    for (Entry<String, Object> entry : parasMap.entrySet()) {
                        String key = entry.getKey();
                        Object value = entry.getValue();
                        basicNameValuePair = new BasicNameValuePair(key, value.toString());
                        params.add(basicNameValuePair);
                    }
                    uriBuilder.setParameters(params);
                    httpPost = new HttpPost(uriBuilder.build());
                    httpPost.setConfig(requestConfig);
                }
            }
            if (httpGet != null) {
                try {
                    httpClient = HttpClients.custom().build();
                    // Watchdog: if execute() is still blocked after 60 seconds,
                    // ReleaseIdleConnTask closes httpClient so the request aborts instead of hanging.
                    timer = new Timer();
                    timerTask = new ReleaseIdleConnTask(httpClient);
                    timer.schedule(timerTask, 60 * 1000);
                    response = httpClient.execute(httpGet);
                    httpCode = response.getStatusLine().getStatusCode();
                    if (httpCode == 200) {
                        System.out.println("Starting to crawl the page normally");
                        entity = response.getEntity();
                        isr = new InputStreamReader(entity.getContent(), "utf-8");
                        br = new BufferedReader(isr);
                        String line = null;
                        StringBuilder stringBuilder = new StringBuilder();
                        while ((line = br.readLine()) != null) {
                            stringBuilder.append(line + "\n");
                        }
                        crawlResultPojo.setSuccess(true);
                        crawlResultPojo.setPageContent(stringBuilder.toString());
                    } else {
                        if (list != null) {
                            deleteInvalidIP(proxyPath, list.get(index));
                            System.err.println("The crawled page is abnormal!");
                        } else {
                            System.err.println("No usable proxy IPs left!");
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    deleteInvalidIP(proxyPath, list.get(index));
                } finally {
                    timer.cancel();
                    httpGet.releaseConnection();
                }
            } else {
                try {
                    httpClient = HttpClients.custom().build();
                    timer = new Timer();
                    timerTask = new ReleaseIdleConnTask(httpClient);
                    timer.schedule(timerTask, 60 * 1000);
                    response = httpClient.execute(httpPost);
                    httpCode = response.getStatusLine().getStatusCode();
                    if (httpCode == 200) {
                        System.out.println("Starting to crawl the page normally");
                        entity = response.getEntity();
                        isr = new InputStreamReader(entity.getContent(), "utf-8");
                        br = new BufferedReader(isr);
                        String line = null;
                        StringBuilder stringBuilder = new StringBuilder();
                        while ((line = br.readLine()) != null) {
                            stringBuilder.append(line + "\n");
                        }
                        crawlResultPojo.setSuccess(true);
                        crawlResultPojo.setPageContent(stringBuilder.toString());
                    } else {
                        if (list != null) {
                            deleteInvalidIP(proxyPath, list.get(index));
                            System.err.println("The crawled page is abnormal!");
                        } else {
                            System.err.println("No usable proxy IPs left!");
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    deleteInvalidIP(proxyPath, list.get(index));
                } finally {
                    timer.cancel();
                    if (httpPost != null) {
                        httpPost.releaseConnection();
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        crawlResultPojo.setSuccess(false);
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("The reader was not closed!");
            }
        }
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("The response was not closed!");
            }
        }
        // Close the client if it was created; it may still be null if an exception occurred earlier.
        if (httpClient != null) {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return crawlResultPojo;
}