HttpClient是Apache中一个开源的项目,用来提供高效的,最新的,功能丰富的支持HTTP协议的客户端编程工具包,并且它支持HTTP协议最新版本和建议。
引入依赖
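<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>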
配置文件
log4j.rootLogger=DEBUG, m
log4j.appender.m=org.apache.log4j.ConsoleAppender
log4j.appender.m.layout=org.apache.log4j.PatternLayout
log4j.appender.m.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
例子:
package com.sihi.crawler.test;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient example: sends one GET request with a browser-like
 * User-Agent header and prints the response body to standard output.
 */
public class TestHttpClient {

    /**
     * Runs the example.
     *
     * @param args unused
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 2. "Type the address": create the request.
        // Simpler alternative target: new HttpGet("http://www.sikiedu.com")
        HttpGet httpGet = new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq");
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75");

        // 1. "Open the browser": create the HTTP client.
        // 3. Send the request.
        // try-with-resources closes the response first and then the client,
        // even when reading the body throws.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // 4. The server answered: read and print the body.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                System.out.println(EntityUtils.toString(entity,"UTF-8"));
            }
        }
    }
}
package com.sihi.crawler.test;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
/**
 * Demonstrates three ways of issuing an HTTP GET with Apache HttpClient:
 * a plain URL, a URL with a hand-written query string, and a URL assembled
 * with {@link URIBuilder}.
 */
public class TestHttpGet {

    /** Status code for which the response body is printed. */
    private static final int STATUS_OK = 200;

    /**
     * Runs one of the examples; switch the call below to try the others.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // GET without parameters:
        //httpGet();
        // GET with parameters written directly into the URL:
        //httpgetByParam1();
        // GET with parameters assembled through URIBuilder:
        httpgetByParam2();
    }

    /** GET request without any query parameters. */
    private static void httpGet() {
        executeAndPrint(new HttpGet("http://www.sikiedu.com"));
    }

    /** GET request whose query string is written directly into the URL. */
    private static void httpgetByParam1() {
        executeAndPrint(new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq"));
    }

    /**
     * GET request whose URL is assembled with {@link URIBuilder}; produces the
     * same target as {@link #httpgetByParam1()}.
     */
    private static void httpgetByParam2() {
        URI uri;
        try {
            // Alternative to the two setParameter calls:
            // .setParameters(new BasicNameValuePair("categoryId", "0"),
            //                new BasicNameValuePair("orderBy", "recommendedSeq"))
            uri = new URIBuilder()
                    .setScheme("http")
                    .setHost("www.sikiedu.com")
                    .setPath("/course/search")
                    .setParameter("categoryId", "0")
                    .setParameter("orderBy", "recommendedSeq")
                    .build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request; do not hand a
            // null URI to HttpGet.
            return;
        }
        executeAndPrint(new HttpGet(uri));
    }

    /**
     * Executes the request with a fresh default client and prints the body
     * (decoded as UTF-8) when the server answers with status 200.
     *
     * @param httpGet the request to execute
     */
    private static void executeAndPrint(HttpGet httpGet) {
        // Resources are closed in reverse order: response first, then client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (STATUS_OK == statusCode) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    System.out.println(EntityUtils.toString(entity,"UTF-8"));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
导入alibaba的一个json的解析依赖
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.68</version>
</dependency>
package com.sihi.crawler.test;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Sends an HTTP POST to an IP-lookup API, parses the JSON answer with
 * fastjson and prints the value of {@code result.ip}.
 */
public class TestHttpPost {

    /**
     * Runs the example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<NameValuePair> nameValuePairs = new ArrayList<>();
        nameValuePairs.add(new BasicNameValuePair("app","ip.local"));
        nameValuePairs.add(new BasicNameValuePair("format","json"));

        // Resulting target: http://api.k780.com/?app=ip.local&format=json
        URI uri;
        try {
            uri = new URIBuilder().setScheme("http").setHost("api.k780.com").setParameters(nameValuePairs).build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request.
            return;
        }

        HttpPost httpPost = new HttpPost(uri);
        // Resources are closed in reverse order: response first, then client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (200 != statusCode) {
                return;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return;
            }
            String json = EntityUtils.toString(entity,"UTF-8");
            Object ip = extractIp(json);
            if (ip != null) {
                System.out.println(ip.toString());
            } else {
                System.err.println("Response contains no result.ip: " + json);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Pulls {@code result.ip} out of the JSON document.
     *
     * <p>Sample payload:
     * {@code {"success":"1","result":{"ip":"113.101.45.58","proxy":"1","att":"...","operators":"..."}}}
     *
     * @param json the response body
     * @return the {@code ip} value, or {@code null} when the document is not a
     *         JSON object or lacks {@code result} / {@code ip}
     */
    private static Object extractIp(String json) {
        Object parsed = JSONObject.parse(json);
        if (!(parsed instanceof Map)) {
            return null;
        }
        Object result = ((Map<?, ?>) parsed).get("result");
        if (!(result instanceof Map)) {
            return null;
        }
        return ((Map<?, ?>) result).get("ip");
    }
}
爬虫如果使用同一个IP地址频繁访问同一台服务器,就会被封禁,所以需要设置互联网上的代理IP去访问,再获得响应。
https://www.xicidaili.com/nn 西刺代理
package com.sihi.crawler.test;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class TestHttpPool {
public static void main(String[] args) {
//配置连接池的参数
PoolingHttpClientConnectionManager poolManger = new PoolingHttpClientConnectionManager();
poolManger.setMaxTotal(100);//最大连接数
poolManger.setDefaultMaxPerRoute(20);//路由最大连接数
//配置连接池中连接的参数
RequestConfig config = RequestConfig.custom()
.setConnectTimeout(5000)//发送请求的超时时间
.setSocketTimeout(2000)//响应超时时间
.setConnectionRequestTimeout(500)//从连接池中获取的超时时间
.setProxy(new HttpHost("114.105.103.142",4216)) //设置代理IP和端口号
.build();
//拿到httpClient
CloseableHttpClient httpClient = HttpClients.custom()
.setConnectionManager(poolManger)
.setDefaultRequestConfig(config)
.build();
//2.创建httpGet请求
HttpGet httpGet = new HttpGet("http://www.sikiedu.com");
//3.执行请求
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpGet);
int statusCode = response.getStatusLine().getStatusCode();//获得状态码
if(200 == statusCode){
HttpEntity entity = response.getEntity();
System.out.println(EntityUtils.toString(entity,"UTF-8"));
}
} catch (IOException e) {
e.printStackTrace();
}finally {
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
if(response != null){
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
将上面的操作封装成一个工具类
HttpClientUtil
package com.sihi.crawler.test;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class HttpClientUtil {
private static PoolingHttpClientConnectionManager poolManger;
private static RequestConfig config;
public static CloseableHttpClient getHttpClient(){
if(poolManger == null){
//配置连接池的参数
poolManger = new PoolingHttpClientConnectionManager();
poolManger.setMaxTotal(100); //最大的连接数
poolManger.setDefaultMaxPerRoute(20); //路由最大连接数
}
if(config == null){
//配置连接池中连接的参数
config = RequestConfig.custom()
.setConnectTimeout(5000)//发送请求的超时时间
.setSocketTimeout(2000)//响应超时时间
.setConnectionRequestTimeout(500)//从连接池中获取的超时时间
//.setProxy(new HttpHost("114.105.103.142",4216)) //设置代理IP和端口号
.build();
}
//拿到httpClient
CloseableHttpClient httpClient = HttpClients.custom()
.setConnectionManager(poolManger)
.setDefaultRequestConfig(config)
.build();
return httpClient;
}
/**
* 执行get请求返回的结果
* @param url
* @return
*/
public static String doGet(String url){
String result = "";
//1.创建httpClient
CloseableHttpClient httpClient = getHttpClient();
//2.创建httpGet请求
HttpGet httpGet = new HttpGet(url);
//3.执行请求
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpGet);
int statusCode = response.getStatusLine().getStatusCode();//获得状态码
if(200 == statusCode){
HttpEntity entity = response.getEntity();
result = EntityUtils.toString(entity,"UTF-8");
}
} catch (IOException e) {
e.printStackTrace();
}finally {
if(response != null){
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result;
}
/**
* 执行post请求返回的结果
* @param url
* @return
*/
public static String doPost(String url){
String result = "";
//1.创建httpClient
CloseableHttpClient httpClient = getHttpClient();
//2.创建httpGet请求
HttpPost httpPost = new HttpPost(url);
//3.执行请求
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpPost);
int statusCode = response.getStatusLine().getStatusCode();//获得状态码
if(200 == statusCode){
HttpEntity entity = response.getEntity();
result = EntityUtils.toString(entity,"UTF-8");
}
} catch (IOException e) {
e.printStackTrace();
}finally {
if(response != null){
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result;
}
}
Java爬虫②Jsoup