Java web crawlers are highly extensible and scalable, and crawling is an important part of modern search-engine development; the well-known crawler Nutch, for example, is written in Java. The examples in this section rely on the following Maven dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>
Logging is configured through a log4j.properties file on the classpath, which routes DEBUG-level output to the console:

log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast=DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
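With the slf4j-log4j12 binding from the pom on the classpath, application code logs through the SLF4J facade rather than calling log4j directly. A minimal sketch (the demo class name is our own):

package cn.itcast;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggingDemo {
    // SLF4J facade; the slf4j-log4j12 binding routes these calls to log4j
    private static final Logger logger = LoggerFactory.getLogger(LoggingDemo.class);

    public static void main(String[] args) {
        logger.debug("crawler starting"); // visible because cn.itcast is set to DEBUG
    }
}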
URLConnection is an abstract class that ships with the JDK; it represents a communication link between an application and a URL. In a crawler, we can use URLConnection to request a URL, obtain the response stream, and read the entity content from that stream.
package cn.itcast;

import org.junit.Test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Demonstrates sending GET and POST requests with the JDK's native URLConnection API.
 */
public class JDKAPITest {
    @Test
    public void testGet() throws Exception {
        // 1. Build the URL of the page to fetch
        URL url = new URL("http://www.itcast.cn/?username=zhangsan");
        // 2. Open a connection to the remote host
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // 3. Set the request method, headers, and timeouts
        urlConnection.setRequestMethod("GET"); // must be upper case; GET is the default
        urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");
        urlConnection.setConnectTimeout(30000); // connect timeout, in milliseconds
        urlConnection.setReadTimeout(30000);    // read timeout, in milliseconds
        // 4. Read the response body line by line
        InputStream in = urlConnection.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        StringBuilder html = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            html.append(line).append("\n");
        }
        System.out.println(html);
        reader.close(); // closing the reader also closes the underlying stream
    }

    @Test
    public void testPost() throws Exception {
        // 1. Build the URL of the page to fetch
        URL url = new URL("http://www.itcast.cn");
        // 2. Open a connection to the remote host
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // 3. Set the request method, headers, and timeouts
        urlConnection.setRequestMethod("POST");
        urlConnection.setDoOutput(true); // the JDK disables the output stream by default
        urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");
        urlConnection.setConnectTimeout(30000); // connect timeout, in milliseconds
        urlConnection.setReadTimeout(30000);    // read timeout, in milliseconds
        // Write the form parameters into the request body
        OutputStream out = urlConnection.getOutputStream();
        out.write("username=zhangsan&password=123".getBytes());
        out.flush();
        // 4. Read the response body line by line
        InputStream in = urlConnection.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        StringBuilder html = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            html.append(line).append("\n");
        }
        System.out.println(html);
        reader.close();
    }
}
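The two tests above never check the HTTP status code, and they close the streams by hand. A minimal sketch of the same GET with a status check and try-with-resources (same URL and timeouts as above):

package cn.itcast;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class JDKGetWithStatusCheck {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://www.itcast.cn/");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(30000);
        conn.setReadTimeout(30000);
        // Only read the body when the server answers 200 OK
        if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
            // try-with-resources closes the reader and the underlying stream automatically
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        }
        conn.disconnect();
    }
}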
HttpClient began as a subproject under Apache Jakarta Commons and is now part of the Apache HttpComponents project. It is an efficient, up-to-date, feature-rich client-side toolkit for the HTTP protocol that tracks current versions and recommendations of the standard, and it is easier to use and more flexible than the JDK's URLConnection. Its job is to send requests to a server and return the corresponding resources. In crawler practice, HttpClient is commonly used to fetch page content and jsoup to parse it.
The simplest case is a GET request:

package cn.itcast;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates a crawler built on HttpClient.
 */
public class HttpClientTest {
    @Test
    public void testGet() throws Exception {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Create the GET request and configure it
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.6181");
        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Read the response body on a 200 OK
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 5. Release resources: close the response before the client
        response.close();
        httpClient.close();
    }
}
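The pom above also pulls in jsoup, which this section uses for parsing the fetched HTML. A minimal sketch, assuming we want the page title and the link addresses (the inline HTML string stands in for a page fetched with testGet() above):

package cn.itcast;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupDemo {
    public static void main(String[] args) {
        // In a real crawler, html would come back from HttpClient
        String html = "<html><head><title>demo</title></head>"
                + "<body><a href='http://www.itcast.cn'>itcast</a></body></html>";
        Document doc = Jsoup.parse(html);
        System.out.println(doc.title()); // prints: demo
        // Select every anchor that carries an href attribute
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("href") + " -> " + link.text());
        }
    }
}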
A POST request carries its parameters in the request body as a UrlEncodedFormEntity:

package cn.itcast;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;

/**
 * Demonstrates a POST request with HttpClient.
 */
public class HttpClientTest {
    @Test
    public void testPost() throws Exception {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Create the HttpPost object and configure it
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");
        // Collect the form parameters for the request body
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("username", "java"));
        // URL-encode the parameters and place them in the request body
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(params, "UTF-8");
        httpPost.setEntity(entity);
        // Set the request header
        httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.6181");
        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpPost);
        // 4. Check the status code and read the body
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 5. Release resources
        response.close();
        httpClient.close();
    }
}
For repeated requests, HttpClient can draw its connections from a pool managed by PoolingHttpClientConnectionManager:

package cn.itcast;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates the HttpClient connection pool.
 */
public class HttpClientTest {
    @Test
    public void testPool() throws Exception {
        // 1. Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // 2. Configure the pool
        cm.setMaxTotal(200);          // maximum total connections
        cm.setDefaultMaxPerRoute(20); // maximum concurrent connections per host
        doGet(cm);
        doGet(cm);
    }

    private void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        // 3. Build an HttpClient backed by the pool; set a breakpoint here to
        //    observe that each call obtains a connection from the pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 4. Create the HttpGet object
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 5. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 6. Read the response body; EntityUtils.toString() consumes the entity,
        //    which allows the connection to go back to the pool
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 7. Release resources
        response.close();
        // Do NOT close the HttpClient here: with a connection pool, connections
        // must be returned to the pool after use, not destroyed
        // httpClient.close();
    }
}
Timeouts and a proxy are configured through a RequestConfig attached to the client:

package cn.itcast;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates request configuration (timeouts and a proxy) with HttpClient.
 */
public class HttpClientTest {
    @Test
    public void testConfig() throws Exception {
        // 0. Build the request configuration
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(10000)            // socket (read) timeout
                .setConnectTimeout(10000)           // connection-establishment timeout
                .setConnectionRequestTimeout(10000) // timeout for obtaining a connection from the pool
                .setProxy(new HttpHost("58.220.95.78", 9401)) // route through a proxy (sample address; substitute a live one)
                .build();
        // 1. Create the HttpClient object with that configuration
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
        // 2. Create the HttpGet object
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Read the response body
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 5. Release resources
        response.close();
        httpClient.close();
    }
}
Finally, the pieces above can be combined into a reusable utility class:

package cn.itcast.utils;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * HttpClient helper that wraps the connection pool, timeouts, and a
 * randomized User-Agent, so that fetching a page is a single call.
 */
public abstract class HttpUtils {
    private static PoolingHttpClientConnectionManager cm = null; // connection manager (the HttpClient pool)
    private static List<String> userAgentList = null;
    private static RequestConfig config = null;

    // The static block runs once, when the class is loaded
    static {
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);
        cm.setDefaultMaxPerRoute(20);
        config = RequestConfig.custom()
                .setSocketTimeout(10000)
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
                .build();
        // A pool of realistic User-Agent strings; one is picked at random per request
        userAgentList = new ArrayList<String>();
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:73.0) Gecko/20100101 Firefox/73.0");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0");
    }

    public static String getHtml(String url) {
        // 1. Build an HttpClient backed by the shared pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 2. Create the HttpGet object
        HttpGet httpGet = new HttpGet(url);
        // 3. Apply the request configuration and a random User-Agent header
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent", userAgentList.get(new Random().nextInt(userAgentList.size())));
        CloseableHttpResponse response = null;
        // 4. Execute the request
        try {
            response = httpClient.execute(httpGet);
            // 5. Read the response body
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                return html;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) { // guard: execute() may fail before response is assigned
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    public static void main(String[] args) {
        String html = HttpUtils.getHtml("http://www.itcast.cn");
        System.out.println(html);
    }
}
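The pom also declares commons-io, which a crawler typically uses to persist fetched pages. A minimal sketch building on HttpUtils (the output file name itcast.html is our own choice):

package cn.itcast.utils;

import org.apache.commons.io.FileUtils;
import java.io.File;
import java.nio.charset.StandardCharsets;

public class SavePageDemo {
    public static void main(String[] args) throws Exception {
        String html = HttpUtils.getHtml("http://www.itcast.cn");
        if (html != null) {
            // commons-io writes the whole string in one call, creating the file if needed
            FileUtils.writeStringToFile(new File("itcast.html"), html, StandardCharsets.UTF_8);
        }
    }
}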