HttpClient可以用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。
HttpClient的作用
jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
JSoup的作用
httpClient 属于专业的抓取网页的库,可以设置代理,抓取失败可以重试抓取
在我的实际使用中,单独使用jsoup也可以直接抓取网页,但jsoup的抓取能力比较弱:API和功能都比较简单,它的强项是解析HTML(可以看作对htmlparser功能的扩展)。测试过程中,用jsoup抓取页面经常报错(例如超时等)。
因此,我们可以用httpclient抓取网页,再用Jsoup.parse解析页面。
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.10</version>
</dependency>
@SuppressWarnings("unused")
public class HttpClientUtils {
// Default charset name used to decode response bodies.
private static String CHARSET = "utf-8";
// Status code the GET helpers compare the response status against.
private static Integer STATUS_CODE = 200;
// Shared client instance; assigned once by the static initializer below.
private static CloseableHttpClient httpClient;
// Finds the charset declared in an HTML <meta> tag; group(4) is the charset name.
// NOTE(review): everything after the first "([\\s\\S]*?)" was lost from the
// published text and has been reconstructed so that group(4) still yields the
// charset name, which is how doGetWithHeaders reads it. Confirm against the
// original source.
private static final Pattern pattern =
        Pattern.compile("([\\s\\S]*?)<meta([\\s\\S]*?)charset\\s*=\\s*([\"']?)([\\w-]+)");

static {
    try {
        // Host name verifier that accepts every host name.
        javax.net.ssl.HostnameVerifier hv = (hostname, session) -> true;
        // Install a trust manager (miTM) whose check methods are empty, i.e. no
        // certificate is ever rejected.
        // NOTE(review): this disables TLS certificate validation; acceptable for a
        // scraping demo only.
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
                .getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        // Also replaces the JVM-wide default socket factory of HttpsURLConnection.
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        // 5 second connect timeout and 5 second socket (read) timeout.
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build();
        httpClient = HttpClientBuilder.create().setDefaultRequestConfig(config).setSSLContext(sc).setSSLHostnameVerifier(hv).build();
    } catch (Exception e) {
        // Best effort: if setup fails, httpClient stays null and the failure is
        // only reported on stderr.
        e.printStackTrace();
    }
}
/**
 * Trust manager whose check methods never reject anything: neither client nor
 * server certificate chains are inspected.
 *
 * <p>NOTE(review): using it disables certificate validation for every
 * connection built on top of it.
 */
static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {

    /** Reports no accepted issuers (a fresh empty array on every call). */
    @Override
    public X509Certificate[] getAcceptedIssuers() {
        return new X509Certificate[] {};
    }

    /** Accepts any client certificate chain without looking at it. */
    @Override
    public void checkClientTrusted(X509Certificate[] chain, String authType) {
        // Intentionally empty: returning normally means "trusted".
    }

    /** Accepts any server certificate chain without looking at it. */
    @Override
    public void checkServerTrusted(X509Certificate[] chain, String authType) {
        // Intentionally empty: returning normally means "trusted".
    }
}
/**
 * Sends a GET request carrying a fixed set of browser-like request headers and
 * returns the body decoded with the charset declared in the page's
 * {@code <meta>} tag, falling back to the class default charset.
 *
 * <p>Failures are printed to stderr and reported as an empty string.
 *
 * @param url absolute URL to fetch
 * @return the decoded page content, or an empty string when the status code is
 *         not the expected one, the response has no body, or the request fails
 */
public static String doGetWithHeaders(String url) {
    String responseTex = "";
    CloseableHttpResponse response = null;
    try {
        URI uri = new URIBuilder(url).build();
        HttpGet httpGet = new HttpGet(uri);
        httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        // "br" is not advertised: HttpClient 4.5 only decodes gzip and deflate, so
        // a brotli-compressed reply would be handed back as raw compressed bytes.
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "en-US,en;q=0.9,zh;q=0.8,zh-CN;q=0.7");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");

        response = httpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                byte[] bytes = EntityUtils.toByteArray(entity);
                // The detected charset stays local; the shared CHARSET default must
                // not be overwritten by whatever the last page happened to declare.
                responseTex = new String(bytes, detectCharset(bytes));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        close(httpClient, response);
    }
    return responseTex;
}

/**
 * Looks for a charset declaration in the raw page bytes.
 *
 * @param bytes raw response body
 * @return the declared charset when one is found and supported by this JVM,
 *         otherwise the class default charset
 */
private static java.nio.charset.Charset detectCharset(byte[] bytes) {
    java.nio.charset.Charset fallback = java.nio.charset.Charset.forName(CHARSET);
    // ISO-8859-1 maps every byte to exactly one char, so the ASCII meta tag can be
    // searched without depending on the platform default charset. Locale.ROOT
    // keeps the lower-casing locale independent.
    String sniffed = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
            .toLowerCase(java.util.Locale.ROOT);
    Matcher matcher = pattern.matcher(sniffed);
    if (!matcher.find()) {
        return fallback;
    }
    String declared = matcher.group(4);
    if (declared == null || declared.trim().isEmpty()) {
        return fallback;
    }
    try {
        return java.nio.charset.Charset.forName(declared.trim());
    } catch (IllegalArgumentException e) {
        // Illegal or unsupported charset name: decode with the default rather than
        // discarding the whole body.
        return fallback;
    }
}
/**
 * Sends a GET request with the given query parameters appended to the URL.
 *
 * <p>Failures are printed to stderr and reported as an empty string.
 *
 * @param url    absolute URL to fetch
 * @param params query parameters to append; may be {@code null} or empty
 * @return the response body, or an empty string when the status code is not the
 *         expected one, the response has no body, or the request fails
 */
public static String doGet(String url, Map<String, String> params) {
    String responseTex = "";
    CloseableHttpResponse response = null;
    try {
        URIBuilder builder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                builder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        URI uri = builder.build();
        HttpGet httpGet = new HttpGet(uri);

        response = httpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // CHARSET is only the default; a charset announced by the response
                // itself takes precedence inside EntityUtils.
                responseTex = EntityUtils.toString(entity, CHARSET);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        close(httpClient, response);
    }
    return responseTex;
}
/**
 * Sends a GET request without any query parameters.
 *
 * @param url absolute URL to fetch
 * @return whatever the two-argument {@code doGet} returns for this URL with no
 *         parameters
 */
public static String doGet(final String url) {
    // Delegates to the parameterised overload with "no parameters".
    return doGet(url, null);
}
/**
 * Sends a GET request with both query parameters and custom request headers.
 *
 * <p>Failures are printed to stderr and reported as an empty string, matching
 * the other GET helpers of this class.
 *
 * @param url    absolute URL to fetch
 * @param params query parameters to append; may be {@code null} or empty
 * @param header request headers to set; may be {@code null} or empty
 * @return the response body, or an empty string when the status code is not the
 *         expected one, the response has no body, or the request fails
 */
public static String doGet(String url, Map<String, String> params, Map<String, String> header) {
    String responseTex = "";
    CloseableHttpResponse response = null;
    try {
        URIBuilder builder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                builder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        HttpGet httpGet = new HttpGet(builder.build());
        if (header != null) {
            for (Map.Entry<String, String> entry : header.entrySet()) {
                httpGet.setHeader(entry.getKey(), entry.getValue());
            }
        }

        response = httpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                responseTex = EntityUtils.toString(entity, CHARSET);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        close(httpClient, response);
    }
    return responseTex;
}
/**
 * Sends a POST request whose body is the given parameters encoded as an HTML
 * form ({@code application/x-www-form-urlencoded}).
 *
 * <p>The response body is returned regardless of the status code. Failures are
 * printed to stderr and reported as an empty string.
 *
 * @param url   absolute URL to post to
 * @param param form fields to send; may be {@code null} for an empty body
 * @return the response body, or an empty string when the response has no body
 *         or the request fails
 */
public static String doPost(String url, Map<String, String> param) {
    String result = "";
    CloseableHttpResponse response = null;
    try {
        HttpPost httpPost = new HttpPost(url);
        if (param != null) {
            List<BasicNameValuePair> paramList = new ArrayList<>();
            for (Map.Entry<String, String> entry : param.entrySet()) {
                paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            // Encode the form explicitly as UTF-8; the single-argument constructor
            // falls back to ISO-8859-1 and would mangle non-ASCII values.
            UrlEncodedFormEntity entity =
                    new UrlEncodedFormEntity(paramList, java.nio.charset.StandardCharsets.UTF_8);
            httpPost.setEntity(entity);
        }

        response = httpClient.execute(httpPost);
        HttpEntity responseEntity = response.getEntity();
        if (responseEntity != null) {
            result = EntityUtils.toString(responseEntity, CHARSET);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        close(httpClient, response);
    }
    return result;
}
/**
 * Sends a POST request without any form parameters.
 *
 * @param url absolute URL to post to
 * @return whatever the two-argument {@code doPost} returns for this URL with no
 *         parameters
 */
public static String doPost(final String url) {
    // Delegates to the parameterised overload with "no parameters".
    return doPost(url, null);
}
/**
 * Sends a POST request whose body is the given JSON text, sent with the
 * {@code application/json} content type.
 *
 * <p>Failures are printed to stderr and reported as an empty string.
 *
 * @param url  absolute URL to post to
 * @param json request body as JSON text
 * @return the response body, or an empty string when the request fails
 */
public static String doPostJson(String url, String json) {
    CloseableHttpResponse response = null;
    String body = "";
    try {
        // Build the request and attach the JSON payload.
        HttpPost request = new HttpPost(url);
        request.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));

        // Execute and read the reply, using utf-8 as the default charset.
        response = httpClient.execute(request);
        body = EntityUtils.toString(response.getEntity(), "utf-8");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        close(httpClient, response);
    }
    return body;
}
/**
 * Releases the resources of a finished request by closing the response.
 *
 * @param httpClient   the client that executed the request; currently unused
 *                     because the client is intentionally not closed here
 * @param httpResponse the response to close; may be {@code null}
 */
private static void close(CloseableHttpClient httpClient, CloseableHttpResponse httpResponse) {
    if (httpResponse == null) {
        return;
    }
    try {
        httpResponse.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The client itself is deliberately not closed for now, so only the response
    // is released here.
}
}
注意:传给该工具类的url必须带有协议前缀(http:// 或 https://),否则会报错。
/**
 * Demo: fetch a page with HttpClientUtils and walk it with jsoup's DOM-style
 * lookup methods (by tag, by id, by class).
 */
public class Test4 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the downloaded HTML with jsoup.
        Document document = Jsoup.parse(html);

        // getElementsByTag("a") returns every <a> element.
        Elements aTag = document.getElementsByTag("a");
        for (Element element : aTag) {
            // text() yields the element's text content.
            String text = element.text();
            // html() yields the element's inner HTML.
            String html1 = element.html();
            // attr(key) yields the value of the attribute named key.
            String href = element.attr("href");
        }

        // getElementById("xx") returns the element whose id is xx, or null when the
        // page has no such element (for example when the download failed and the
        // document is empty), so guard before dereferencing.
        Element id = document.getElementById("qq-group");
        if (id != null) {
            String text = id.text();
            // attributes() returns all attributes of the element.
            Attributes attributes = id.attributes();
            // Print each attribute name together with its value.
            for (Attribute attribute : attributes) {
                String key = attribute.getKey();
                String value = attribute.getValue();
                System.out.println("key="+key+"--->value="+value);
            }
        }

        // getElementsByClass("yy") returns every element carrying class yy.
        Elements aClass = document.getElementsByClass("article-content");
    }
}
/**
 * Demo: fetch a page with HttpClientUtils and query it with jsoup's CSS-like
 * selector syntax.
 */
public class Test5 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the downloaded HTML with jsoup.
        Document document = Jsoup.parse(html);

        // select("tagname") finds elements by tag name.
        Elements aTag = document.select("a");
        // select("#id") finds elements by id.
        Elements id = document.select("#qq-group");
        // select(".class") finds elements by class name.
        Elements class1 = document.select(".article-content");
        // select("[attribute]") finds elements that have the given attribute.
        Elements href = document.select("[href]");
        // select(":contains(text)") finds elements containing the given text; the
        // search is case-insensitive.
        Elements contains = document.select(":contains(JSoup安装)");
        for (Element element : contains) {
            // Print the link target of matches that actually carry an href.
            String href1 = element.attr("href");
            // attr() returns an empty string for a missing attribute; compare by
            // content, since != on strings only compares references.
            if (!href1.isEmpty()) {
                System.out.println(href1);
            }
        }
        // select(":matches(regex)") finds elements whose text matches the given
        // regular expression.
        Elements select = document.select(":matches(regex)");
    }
}