建议:事先定义一个线程池来托管线程,推荐线程数为 20;需要定义 pool、worker、task、queue 等参数(本文不展开讨论线程相关内容)。
一、请求模拟
- 定义一个默认的 CloseableHttpClient
// Build a default HTTP client and a GET request for the page to crawl.
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(url);
// All three timeouts are given in milliseconds.
RequestConfig config = RequestConfig.custom()
        .setConnectTimeout(10 * 1000)
        .setConnectionRequestTimeout(3 * 1000)
        .setSocketTimeout(10 * 1000)
        .build();
httpGet.setConfig(config);

// Stays null when the server does not answer with HTTP 200.
Document document = null;
// try-with-resources closes the response even if reading or parsing throws.
try (CloseableHttpResponse closeableHttpResponse = httpClient.execute(httpGet)) {
    if (closeableHttpResponse.getStatusLine().getStatusCode() == 200) {
        String html = EntityUtils.toString(closeableHttpResponse.getEntity(), CHARSET);
        document = Jsoup.parse(html);
    }
}
二、document内容判断
// Select every anchor element that carries an href attribute.
Elements links = document.select("a[href]");// all links found in the parsed page
// The "***" below is a placeholder for the per-link handling, which these notes leave out.
for (Element link : links) { ***}
thread1.start();// the work described in section 3 below is executed here
// NOTE(review): "destory" looks like a misspelling of "destroy"; the type of
// `threads` is not shown here, so confirm the intended shutdown call.
threads.destory();
三、(在上文的线程中异步执行)对链接中需要获取的内容(例如 img)进行抓取下载
// Prepare a connection object for the resource that is to be downloaded.
URL url = new URL(downloadFileUrl);
URLConnection urlConnection = url.openConnection();
// Both timeouts are in milliseconds: 7 s for reads, 10 s for connecting.
urlConnection.setReadTimeout(7_000);
urlConnection.setConnectTimeout(10_000);
- 设置请求头属性:此处使用 Mozilla 风格的 User-Agent 参数
// Send a browser-style (Mozilla/MSIE) User-Agent header with the request.
urlConnection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

// Read the whole response into memory before touching the disk, so a download
// that fails part-way never creates a file. try-with-resources closes the
// network stream even when a read throws.
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (InputStream inputStream = urlConnection.getInputStream()) {
    byte[] bytes = new byte[1024 * 4];
    int len;
    while ((len = inputStream.read(bytes)) != -1) {
        byteArrayOutputStream.write(bytes, 0, len);
    }
}
byte[] dataArrByte = byteArrayOutputStream.toByteArray();

// Create the target directory (and any missing parents) if it is absent.
File downloadFile = new File(dirPath);
if (!downloadFile.exists()) {
    downloadFile.mkdirs();
}

// Resolve the target path once and use it for the existence check, the write
// and the log message, so all three agree whether or not dirPath ends with a
// path separator.
String savedFileName = CrawlerUrl.fileName(downloadFileUrl);
File targetFile = new File(downloadFile, savedFileName);
if (!targetFile.exists()) {
    // try-with-resources closes the file even when the write throws.
    try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile)) {
        fileOutputStream.write(dataArrByte);
    }
    System.out.println("文件:[" + savedFileName + "]本地路径为" + targetFile.getPath());
} else {
    System.out.println("文件:[" + savedFileName + "]已存在!!!!!!!");
}