注解:此处仅介绍一些类库及其常规用法;如果要详细了解HTTP协议,推荐阅读《HTTP权威指南》。
.NET 方向:主要是用到HttpWebRequest下载内容:
JAVA方向:
主要是用到HttpClient下载内容
示例代码:
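相关类库(httpclient-4.1.2 httpcore-4.1.4;注意:下面的示例代码用到了CloseableHttpClient、HttpClientContext、PoolingHttpClientConnectionManager等类,需要httpclient 4.3及以上版本)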
示例代码(Java):
package com.data.crawl.qa.baiduzhidao;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
/**
 * Small HTTP helper built on Apache HttpClient that issues GET and form POST
 * requests through one shared {@link PoolingHttpClientConnectionManager}.
 *
 * <p>{@link #Post(String, Map)} executes with the shared {@link #context};
 * {@link #downHtml(String)} executes without it. Both methods report I/O
 * failures by logging and returning {@code null} rather than throwing.
 *
 * <p>NOTE(review): the single {@code HttpClientContext} is shared by every
 * {@code Post} call on this instance; confirm callers do not invoke it from
 * several threads at once.
 *
 * @author wqj
 */
public class HttpClientPool {

    private static final Log log = LogFactory.getLog(HttpClientPool.class);

    /** Maximum number of connections the pool may hold in total. */
    private static final int MAX_TOTAL_CONNECTIONS = 10;

    /** Maximum number of connections the pool may hold per route. */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 10;

    /** Socket (read) timeout in milliseconds. */
    private static final int SOCKET_TIMEOUT_MILLIS = 120000;

    /** Connect timeout in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 60000;

    /** Charset used when neither the response header nor the detector yields one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Size of the buffer used while draining a response body. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Extracts the charset token from a {@code text/html} Content-Type value.
     * The token stops at whitespace, ';' or a quote so that trailing parameters
     * and surrounding quotes are not treated as part of the charset name.
     */
    private static final Pattern CONTENT_TYPE_CHARSET = Pattern.compile(
            "text/html;\\s*charset=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Connection pool shared by every client this instance creates. */
    private PoolingHttpClientConnectionManager connectionManager;

    /** Request context (carries the cookie spec registry) used by {@link #Post(String, Map)}. */
    protected HttpClientContext context = null;

    /**
     * Creates the connection pool and the shared request context, registering
     * the best-match and browser-compatibility cookie specs on the context.
     */
    public HttpClientPool() {
        connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);

        context = HttpClientContext.create();
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory()).build();
        context.setCookieSpecRegistry(registry);
    }

    /**
     * Builds a client backed by the shared connection pool.
     *
     * <p>The returned client is intentionally never closed by this class: only
     * the response is closed after each request, which hands the connection
     * back to the pool. NOTE(review): closing the client would presumably also
     * shut down the shared connection manager - confirm against the HttpClient
     * version in use before adding a close.
     *
     * @return a new client using the shared pool and this class's timeouts
     */
    private CloseableHttpClient getHttpClient() {
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(SOCKET_TIMEOUT_MILLIS)
                .setConnectTimeout(CONNECT_TIMEOUT_MILLIS)
                .setCookieSpec(CookieSpecs.BEST_MATCH)
                .build();
        return HttpClients.custom()
                .setDefaultRequestConfig(config)
                .setConnectionManager(connectionManager)
                .build();
    }

    /**
     * Sends a URL-encoded form POST using the shared cookie context and returns
     * the decoded response body. The HTTP status code is not inspected.
     *
     * @param uri    target URI
     * @param params form fields; {@code null} is treated as an empty form
     * @return the response body, or {@code null} if the response had no entity
     *         or an I/O error occurred (the error is logged)
     */
    public String Post(String uri, Map<String, String> params) {
        CloseableHttpClient httpclient = getHttpClient();
        HttpPost httpost = new HttpPost(uri);

        List<NameValuePair> formFields = new ArrayList<NameValuePair>();
        if (params != null) {
            for (Map.Entry<String, String> field : params.entrySet()) {
                formFields.add(new BasicNameValuePair(field.getKey(), field.getValue()));
            }
        }

        CloseableHttpResponse response = null;
        try {
            httpost.setEntity(new UrlEncodedFormEntity(formFields, "UTF-8"));
            response = httpclient.execute(httpost, context);
            return readBody(response.getEntity(), DEFAULT_CHARSET);
        } catch (IOException e) {
            // Covers ClientProtocolException and UnsupportedEncodingException too.
            log.error("POST " + uri + " failed", e);
            return null;
        } finally {
            // Always release the pooled connection, even on failure.
            closeQuietly(response);
        }
    }

    /**
     * Fetches a page with GET. The request is executed without the shared
     * context, so cookies held there are not sent; intended for the first
     * visit to a home page when simulating a login.
     *
     * @param uri the URI to fetch
     * @return the decoded HTML, or {@code null} if the status is not 200, the
     *         response had no entity, or an I/O error occurred (logged)
     */
    public String downHtml(String uri) {
        CloseableHttpClient httpclient = getHttpClient();
        HttpGet httpget = new HttpGet(uri);
        CloseableHttpResponse response = null;
        try {
            response = httpclient.execute(httpget);

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                log.info("request failed: " + response.getStatusLine());
                return null;
            }

            String headerCharset = charsetFromContentType(response.getHeaders("Content-Type"));
            return readBody(response.getEntity(), headerCharset);
        } catch (IOException e) {
            // Covers ClientProtocolException and UnsupportedEncodingException too.
            log.error("GET " + uri + " failed", e);
            return null;
        } finally {
            // Always release the pooled connection, including the non-200 path.
            closeQuietly(response);
        }
    }

    /**
     * Picks the charset declared by the first Content-Type header.
     *
     * @param contentTypeHeaders headers returned for "Content-Type"; may be
     *                           {@code null} or empty when the header is absent
     * @return the declared charset, or {@link #DEFAULT_CHARSET} if none is declared
     */
    private static String charsetFromContentType(Header[] contentTypeHeaders) {
        if (contentTypeHeaders == null || contentTypeHeaders.length == 0) {
            return DEFAULT_CHARSET;
        }
        String value = contentTypeHeaders[0].getValue();
        if (value == null) {
            return DEFAULT_CHARSET;
        }
        Matcher m = CONTENT_TYPE_CHARSET.matcher(value);
        return m.find() ? m.group(1) : DEFAULT_CHARSET;
    }

    /**
     * Drains the entity into memory and decodes it. The charset reported by
     * {@code Icu4jDetector.getEncode} wins when it is non-null; otherwise
     * {@code fallbackCharset} is used.
     *
     * @param entity          response entity, may be {@code null}
     * @param fallbackCharset charset name used when detection yields {@code null}
     * @return the decoded body, or {@code null} when {@code entity} is {@code null}
     * @throws IOException if reading fails or the chosen charset is unsupported
     */
    private static String readBody(HttpEntity entity, String fallbackCharset) throws IOException {
        if (entity == null) {
            return null;
        }
        InputStream in = entity.getContent();
        try {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer, 0, buffer.length)) != -1) {
                collected.write(buffer, 0, read);
            }
            byte[] data = collected.toByteArray();
            String detected = Icu4jDetector.getEncode(data);
            return new String(data, detected == null ? fallbackCharset : detected);
        } finally {
            in.close();
        }
    }

    /**
     * Closes the response if present, logging instead of propagating a failure
     * so that cleanup never hides the result or the original error.
     *
     * @param response response to close; {@code null} is ignored
     */
    private static void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            log.warn("failed to close HTTP response", e);
        }
    }
}