1.添加相关依赖
2.编写工具类,根据url(需要爬取的页面)参数,使用HttpClient连接到网页获取网页源码
3.根据获取到的html格式的网页源码,使用 Jsoup获取所需的 Element 元素及各属性值
一、依赖(Maven 坐标):
org.apache.httpcomponents : httpclient : 4.5.6
org.jsoup : jsoup : 1.8.3
二、工具类:
package com.gourd.base.utils;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
/**
 * Utility class that fetches a web page's HTML source with HttpClient and
 * parses it with Jsoup.
 *
 * @author gourd
 */
@Slf4j
public class JsoupHttpClientUtils {

    /** Charset used to decode response bodies. */
    private static final String DEFAULT_CHARSET = "utf8";

    /** Connection-request / connect / socket-read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Utility class — not meant to be instantiated. */
    private JsoupHttpClientUtils() {
    }

    /**
     * Fetches the HTML source of the given URL.
     *
     * @param url the page URL to crawl; an {@code https://} URL gets a client
     *            that skips certificate validation (see {@link #getHttpsClient()})
     * @return the page source as a string, or {@code null} if the request
     *         failed or returned a non-200 status
     */
    public static String getHtmlByUrl(String url) {
        HttpGet httpGet = new HttpGet(url);
        // Pretend to be a browser; some sites reject requests without a User-Agent.
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // Optional proxy example:
        // HttpHost proxy = new HttpHost("118.114.77.47", 8080);
        RequestConfig config = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        httpGet.setConfig(config);
        // try-with-resources guarantees both the client and the response are
        // closed, replacing the error-prone manual close/finally dance.
        try (CloseableHttpClient httpClient = url.startsWith("https://")
                     ? getHttpsClient()
                     : HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                return EntityUtils.toString(entity, DEFAULT_CHARSET);
            }
            log.error("获取网页源码失败");
        } catch (IOException e) {
            log.error("获取网页源码异常:", e);
        }
        return null;
    }

    /**
     * Parses the HTML and returns every element with the given tag name.
     *
     * @param html    page source to parse
     * @param tagName tag to select, e.g. {@code "a"}
     * @return all matching DOM elements (possibly empty)
     */
    public static Elements getElements(String html, String tagName) {
        Document doc = Jsoup.parse(html);
        return doc.getElementsByTag(tagName);
    }

    /**
     * Builds an HTTPS-capable client that trusts ANY certificate chain and
     * skips hostname verification.
     *
     * <p>WARNING: this disables TLS validation entirely and is vulnerable to
     * man-in-the-middle attacks — acceptable only for crawling public pages,
     * never for sensitive traffic.
     *
     * @return a pooled client accepting any server certificate
     */
    private static CloseableHttpClient getHttpsClient() {
        RegistryBuilder<ConnectionSocketFactory> registryBuilder = RegistryBuilder.create();
        ConnectionSocketFactory plainSF = new PlainConnectionSocketFactory();
        registryBuilder.register("http", plainSF);
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            // Trust strategy that accepts every certificate unconditionally.
            TrustStrategy anyTrustStrategy = new TrustStrategy() {
                @Override
                public boolean isTrusted(java.security.cert.X509Certificate[] chain, String authType)
                        throws java.security.cert.CertificateException {
                    return true;
                }
            };
            SSLContext sslContext = SSLContexts.custom()
                    .useTLS()
                    .loadTrustMaterial(trustStore, anyTrustStrategy)
                    .build();
            LayeredConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(
                    sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            registryBuilder.register("https", sslSF);
        } catch (KeyStoreException | KeyManagementException | NoSuchAlgorithmException e) {
            // Unrecoverable configuration error — surface it, preserving the cause.
            throw new RuntimeException(e);
        }
        Registry<ConnectionSocketFactory> registry = registryBuilder.build();
        // Pooled connection manager bound to the http/https socket factories above.
        PoolingHttpClientConnectionManager connManager =
                new PoolingHttpClientConnectionManager(registry);
        return HttpClientBuilder.create().setConnectionManager(connManager).build();
    }
}
三、调用测试:
import com.gourd.base.utils.JsoupHttpClientUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.jsoup.select.Elements;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
 * Demo controller exercising {@link JsoupHttpClientUtils}.
 *
 * @author gourd
 * createAt: 2018/9/17
 */
@RestController
@Api(description = "爬虫控制器")
@RequestMapping("/spider")
public class SpiderController {

    /**
     * Crawls the given page and extracts all anchor ({@code <a>}) elements.
     *
     * @param url the page URL to crawl
     */
    @GetMapping(value = "/spider")
    @ApiOperation(value = "获取网页爬虫数据", notes = "获取网页爬虫数据")
    public void spider(@RequestParam String url) {
        String htmlByUrl = JsoupHttpClientUtils.getHtmlByUrl(url);
        // NOTE(review): elements are computed but never returned to the HTTP
        // caller — presumably a smoke test; consider returning them instead.
        Elements elements = JsoupHttpClientUtils.getElements(htmlByUrl, "a");
        System.out.println("success");
    }
}