HttpClient是Apache Jakarta Common下的子项目,可以用来提供高效的、最新的、功能丰富的支持HTTP协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本。它的主要功能有:
jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。它的主要功能有:
- (1) 从一个URL,文件或字符串中解析HTML;
- (2) 使用DOM或CSS选择器来查找、取出数据;
- (3) 可操作HTML元素、属性、文本;
pom.xml文件依赖如下:
<dependency>
<groupId>org.apache.httpcomponentsgroupId>
<artifactId>httpclientartifactId>
<version>4.5.2version>
dependency>
<dependency>
<groupId>org.jsoupgroupId>
<artifactId>jsoupartifactId>
<version>1.8.3version>
dependency>
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import java.util.List;
/**
* HttpClient & Jsoup libruary test class
*
* Created by xuyh at 2017/11/6 15:28.
*/
public class HttpClientJsoupTest {
@Test
public void test() {
//通过httpClient获取网页响应,将返回的响应解析为纯文本
HttpGet httpGet = new HttpGet("http://sports.sina.com.cn/");
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(30000).setConnectTimeout(30000).build());
CloseableHttpClient httpClient = null;
CloseableHttpResponse response = null;
String responseStr = "";
try {
httpClient = HttpClientBuilder.create().build();
HttpClientContext context = HttpClientContext.create();
response = httpClient.execute(httpGet, context);
int state = response.getStatusLine().getStatusCode();
if (state != 200)
responseStr = "";
HttpEntity entity = response.getEntity();
if (entity != null)
responseStr = EntityUtils.toString(entity, "utf-8");
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (response != null)
response.close();
if (httpClient != null)
httpClient.close();
} catch (Exception ex) {
ex.printStackTrace();
}
}
if (responseStr == null)
return;
//将解析到的纯文本用Jsoup工具转换成Document文档并进行操作
Document document = Jsoup.parse(responseStr);
List elements = document.getElementsByAttributeValue("class", "phdnews_txt fr").first()
.getElementsByAttributeValue("class", "phdnews_hdline");
elements.forEach(element -> {
for (Element e : element.getElementsByTag("a")) {
System.out.println(e.attr("href"));
System.out.println(e.text());
}
});
}
}
HttpGet httpGet = new HttpGet("http://sports.sina.com.cn/");
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(30000).setConnectTimeout(30000).build());
CloseableHttpClient httpClient = null;
CloseableHttpResponse response = null;
String responseStr = "";
try {
httpClient = HttpClientBuilder.create().build();
HttpClientContext context = HttpClientContext.create();
response = httpClient.execute(httpGet, context);
int state = response.getStatusLine().getStatusCode();
if (state != 200)
responseStr = "";
HttpEntity entity = response.getEntity();
if (entity != null)
responseStr = EntityUtils.toString(entity, "utf-8");
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (response != null)
response.close();
if (httpClient != null)
httpClient.close();
} catch (Exception ex) {
ex.printStackTrace();
}
}
Document document = Jsoup.parse(responseStr);
List elements = document.getElementsByAttributeValue("class", "phdnews_txt fr").first()
.getElementsByAttributeValue("class", "phdnews_hdline");
elements.forEach(element -> {
for (Element e : element.getElementsByTag("a")) {
System.out.println(e.attr("href"));
System.out.println(e.text());
}
});
public Element getElementById(String id);//通过id获取元素
public Elements getElementsByClass(String className);//通过className获取元素
public Elements getElementsByAttributeValue(String key, String value);//通过属性值获取元素
public Elements getElementsByTag(String tagName);//通过标签名获取元素
public String attr(String attributeKey);//获取本元素的属性值
public String text();//获取本元素的内容
<div class="code">
<div>
<br>
这是第一个段落。
<br>
div>
div>
http://sports.sina.com.cn/sportsevents/3v3/2017-11-05/doc-ifynmzrs7218551.shtml
3X3黄金联赛冠军赛山西队夺冠!独享48万
http://video.sina.com.cn/sports/k/cba/1105final3x3/
视频
http://video.sina.com.cn/p/sports/k/v/doc/2017-11-05/181467390769.html
黄金mvp集锦
http://video.sina.com.cn/p/sports/k/v/doc/2017-11-05/170167390621.html
直捣黄龙1v2
http://video.sina.com.cn/p/sports/k/v/doc/2017-11-05/183267390917.html
5佳球:库里式虚晃
http://video.sina.com.cn/p/sports/k/v/doc/2017-11-05/150067390331.html
大嫂徐冬冬亮相
http://video.sina.com.cn/p/sports/k/v/doc/2017-11-05/145367390313.html
现场众多美女云集
http://video.sina.com.cn/p/sports/c/zj/v/doc/2017-11-05/150867390337.html
啦啦队热舞表演
http://sports.sina.com.cn/nba/
哈登56分周琦暴扣火箭胜
http://sports.sina.com.cn/basketball/nba/2017-11-06/doc-ifynmzrs7300047.shtml
詹皇26分骑士负
将HttpClient和Jsoup进行封装,形成一个工具类,内容如下:
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import javax.net.ssl.*;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*
* Http工具,包含:
* 普通http请求工具(使用httpClient进行http,https请求的发送)
*
* Created by xuyh at 2017/7/17 19:08.
*/
public class HttpUtils {
/**
* 请求超时时间,默认20000ms
*/
private int timeout = 20000;
/**
* cookie表
*/
private Map cookieMap = new HashMap<>();
/**
* 请求编码(处理返回结果),默认UTF-8
*/
private String charset = "UTF-8";
private static HttpUtils httpUtils;
private HttpUtils() {
}
/**
* 获取实例
*
* @return
*/
public static HttpUtils getInstance() {
if (httpUtils == null)
httpUtils = new HttpUtils();
return httpUtils;
}
/**
* 清空cookieMap
*/
public void invalidCookieMap() {
cookieMap.clear();
}
public int getTimeout() {
return timeout;
}
/**
* 设置请求超时时间
*
* @param timeout
*/
public void setTimeout(int timeout) {
this.timeout = timeout;
}
public String getCharset() {
return charset;
}
/**
* 设置请求字符编码集
*
* @param charset
*/
public void setCharset(String charset) {
this.charset = charset;
}
/**
* 将网页返回为解析后的文档格式
*
* @param html
* @return
* @throws Exception
*/
public static Document parseHtmlToDoc(String html) throws Exception {
return removeHtmlSpace(html);
}
private static Document removeHtmlSpace(String str) {
Document doc = Jsoup.parse(str);
String result = doc.html().replace(" ", "");
return Jsoup.parse(result);
}
/**
* 执行get请求,返回doc
*
* @param url
* @return
* @throws Exception
*/
public Document executeGetAsDocument(String url) throws Exception {
return parseHtmlToDoc(executeGet(url));
}
/**
* 执行get请求
*
* @param url
* @return
* @throws Exception
*/
public String executeGet(String url) throws Exception {
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpClient httpClient = null;
String str = "";
try {
httpClient = HttpClientBuilder.create().build();
HttpClientContext context = HttpClientContext.create();
CloseableHttpResponse response = httpClient.execute(httpGet, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
int state = response.getStatusLine().getStatusCode();
if (state == 404) {
str = "";
}
try {
HttpEntity entity = response.getEntity();
if (entity != null) {
str = EntityUtils.toString(entity, charset);
}
} finally {
response.close();
}
} catch (IOException e) {
throw e;
} finally {
try {
if (httpClient != null)
httpClient.close();
} catch (IOException e) {
throw e;
}
}
return str;
}
/**
* 用https执行get请求,返回doc
*
* @param url
* @return
* @throws Exception
*/
public Document executeGetWithSSLAsDocument(String url) throws Exception {
return parseHtmlToDoc(executeGetWithSSL(url));
}
/**
* 用https执行get请求
*
* @param url
* @return
* @throws Exception
*/
public String executeGetWithSSL(String url) throws Exception {
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpClient httpClient = null;
String str = "";
try {
httpClient = createSSLInsecureClient();
HttpClientContext context = HttpClientContext.create();
CloseableHttpResponse response = httpClient.execute(httpGet, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
int state = response.getStatusLine().getStatusCode();
if (state == 404) {
str = "";
}
try {
HttpEntity entity = response.getEntity();
if (entity != null) {
str = EntityUtils.toString(entity, charset);
}
} finally {
response.close();
}
} catch (IOException e) {
throw e;
} catch (GeneralSecurityException ex) {
throw ex;
} finally {
try {
if (httpClient != null)
httpClient.close();
} catch (IOException e) {
throw e;
}
}
return str;
}
/**
* 执行post请求,返回doc
*
* @param url
* @param params
* @return
* @throws Exception
*/
public Document executePostAsDocument(String url, Map params) throws Exception {
return parseHtmlToDoc(executePost(url, params));
}
/**
* 执行post请求
*
* @param url
* @param params
* @return
* @throws Exception
*/
public String executePost(String url, Map params) throws Exception {
String reStr = "";
HttpPost httpPost = new HttpPost(url);
httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
List paramsRe = new ArrayList<>();
for (String key : params.keySet()) {
paramsRe.add(new BasicNameValuePair(key, params.get(key)));
}
CloseableHttpClient httpclient = HttpClientBuilder.create().build();
CloseableHttpResponse response;
try {
httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
HttpClientContext context = HttpClientContext.create();
response = httpclient.execute(httpPost, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
HttpEntity entity = response.getEntity();
reStr = EntityUtils.toString(entity, charset);
} catch (IOException e) {
throw e;
} finally {
httpPost.releaseConnection();
}
return reStr;
}
/**
* 用https执行post请求,返回doc
*
* @param url
* @param params
* @return
* @throws Exception
*/
public Document executePostWithSSLAsDocument(String url, Map params) throws Exception {
return parseHtmlToDoc(executePostWithSSL(url, params));
}
/**
* 用https执行post请求
*
* @param url
* @param params
* @return
* @throws Exception
*/
public String executePostWithSSL(String url, Map params) throws Exception {
String re = "";
HttpPost post = new HttpPost(url);
List paramsRe = new ArrayList<>();
for (String key : params.keySet()) {
paramsRe.add(new BasicNameValuePair(key, params.get(key)));
}
post.setHeader("Cookie", convertCookieMapToString(cookieMap));
post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpResponse response;
try {
CloseableHttpClient httpClientRe = createSSLInsecureClient();
HttpClientContext contextRe = HttpClientContext.create();
post.setEntity(new UrlEncodedFormEntity(paramsRe));
response = httpClientRe.execute(post, contextRe);
HttpEntity entity = response.getEntity();
if (entity != null) {
re = EntityUtils.toString(entity, charset);
}
getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
} catch (Exception e) {
throw e;
}
return re;
}
/**
* 发送JSON格式body的POST请求
*
* @param url 地址
* @param jsonBody json body
* @return
* @throws Exception
*/
public String executePostWithJson(String url, String jsonBody) throws Exception {
String reStr = "";
HttpPost httpPost = new HttpPost(url);
httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
CloseableHttpClient httpclient = HttpClientBuilder.create().build();
CloseableHttpResponse response;
try {
httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
HttpClientContext context = HttpClientContext.create();
response = httpclient.execute(httpPost, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
HttpEntity entity = response.getEntity();
reStr = EntityUtils.toString(entity, charset);
} catch (IOException e) {
throw e;
} finally {
httpPost.releaseConnection();
}
return reStr;
}
/**
* 发送JSON格式body的SSL POST请求
*
* @param url 地址
* @param jsonBody json body
* @return
* @throws Exception
*/
public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
String re = "";
HttpPost post = new HttpPost(url);
post.setHeader("Cookie", convertCookieMapToString(cookieMap));
post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpResponse response;
try {
CloseableHttpClient httpClientRe = createSSLInsecureClient();
HttpClientContext contextRe = HttpClientContext.create();
post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
response = httpClientRe.execute(post, contextRe);
HttpEntity entity = response.getEntity();
if (entity != null) {
re = EntityUtils.toString(entity, charset);
}
getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
} catch (Exception e) {
throw e;
}
return re;
}
private void getCookiesFromCookieStore(CookieStore cookieStore, Map cookieMap) {
List cookies = cookieStore.getCookies();
for (Cookie cookie : cookies) {
cookieMap.put(cookie.getName(), cookie.getValue());
}
}
private String convertCookieMapToString(Map map) {
String cookie = "";
for (String key : map.keySet()) {
cookie += (key + "=" + map.get(key) + "; ");
}
if (map.size() > 0) {
cookie = cookie.substring(0, cookie.length() - 2);
}
return cookie;
}
/**
* 创建 SSL连接
*
* @return
* @throws GeneralSecurityException
*/
private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true).build();
SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext,
(s, sslContextL) -> true);
return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
} catch (GeneralSecurityException e) {
throw e;
}
}
}
上面的工具类不仅可以进行网页内容的获取,还能够进行http请求的发送。
https://github.com/johnsonmoon/HttpUtils.git