1. Basic Principles of Focused Crawlers
A focused crawler takes one or a few designated websites as its data sources and scrapes their pages. This differs from a general search-engine crawler, which crawls and analyzes the entire web and is a much harder problem: the crawl scheduling, the performance requirements, and the data storage are all on a different scale.
A focused crawler has only a single site, or a small number of sites, as its source, and extracts the useful data, images, and other content from each. This article introduces Java open-source libraries for issuing HTTP requests and parsing the resulting pages, mainly HttpClient and Jsoup, along with some crawling strategies.
2. Page Fetching
Fetching a page is, put simply, the process of imitating the HTTP request a browser would send and receiving the response. The HttpFox plugin for Firefox lets you observe the parameters a site expects and any special behavior it relies on. Inspecting a request to the Baidu homepage, for instance, shows the complete HTTP request, including its headers and cookies (screenshots omitted here).
The response body is the page's HTML, which can then be parsed with XPath or Jsoup to extract the content you need; for details see http://jsoup.org/cookbook/input/parse-document-from-string and the short parsing sketch below.
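A minimal Jsoup sketch (the HTML string and base URL are illustrative, not from a real crawl) that pulls the title and absolute link targets out of a fetched page:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ParseDemo {
    public static void main(String[] args) {
        // In a real crawler this string is the HTTP response body.
        String html = "<html><head><title>demo</title></head>"
                + "<body><a href='/a'>first</a><a href='/b'>second</a></body></html>";
        // The base URI lets attr("abs:href") resolve relative links.
        Document doc = Jsoup.parse(html, "http://www.example.com/");
        System.out.println(doc.title());
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.text() + " -> " + link.attr("abs:href"));
        }
    }
}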
3. Getting Your IP Banned While Crawling
Every site protects its own data to some degree, so crawling one site heavily for a long time can get your IP banned. The HTTP responses you receive then usually take one of these forms:
1. A 403 or 503 status code with no page content
2. A page containing a CAPTCHA
Several tactics help avoid this (a small throttling sketch follows the list):
1. Control the request rate. Sites generally monitor the access frequency of each IP, and an excessively high rate is treated as abusive.
2. Rotate cookies between requests. A cookie identifies a user, so varying it makes the traffic look like different users instead of one abusive one.
3. Go through proxy IPs, so the requests appear to come from different users on different machines.
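As a sketch of point 1 (the Throttle class name and the 2-5 second bounds are my own illustration, not part of the crawler code below), a randomized pause between requests keeps the access pattern from looking machine-generated:

import java.util.Random;
import java.util.concurrent.TimeUnit;

public class Throttle {
    private static final Random RAND = new Random();

    // Sleep 2-5 seconds before each request; the jitter avoids the fixed,
    // easily detected interval a naive crawler produces.
    public static void pause() {
        try {
            TimeUnit.MILLISECONDS.sleep(2000 + RAND.nextInt(3000));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}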
4. HttpClient Code
1. A plain HTTP GET request
import java.io.IOException;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.travel.utils.ConfigTool;
import com.travel.utils.TravelUtils;
public class NewFetchTask extends HttpTask{
static public final Log log = LogFactory.getLog(NewFetchTask.class);
static public int TIMEOUT_MS;
static{
TIMEOUT_MS = Integer.parseInt(ConfigTool.props.getProperty("timeout_ts", "20000"));
}
public String url;
public int trynt = 10;
public String charsetName = "UTF-8";
public String run() {
for(int i=0;i<trynt;i++){
HttpGet m = null;
CloseableHttpClient client = HttpClients.createDefault();
CloseableHttpResponse response = null;
RequestConfig config = null;
try{
if(proxy!=null && TravelUtils.isNotEmpty(proxy.getIp())){
log.info("use proxy "+proxy.getIp()+" port:"+proxy.getPort());
HttpHost httpProxy = new HttpHost(proxy.getIp(), proxy.getPort(), "http");
config = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_MS)
.setConnectTimeout(TIMEOUT_MS)
.setCircularRedirectsAllowed(true)
.setRedirectsEnabled(true)
.setConnectionRequestTimeout(TIMEOUT_MS)
.setProxy(httpProxy)
.build();
}else {
config = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_MS)
.setConnectTimeout(TIMEOUT_MS)
.setCircularRedirectsAllowed(true)
.setRedirectsEnabled(true)
.setConnectionRequestTimeout(TIMEOUT_MS)
.build();
log.info("use local http");
}
m = new HttpGet(url);
if(config!=null)m.setConfig(config);
if(params!=null){
//TODO
}
if(headers!=null){
for(Header header:headers){
m.addHeader(convert(header));
}
}
if(cookies!=null){
for(Cookie c:cookies){
// HttpClient 3.x API; with HttpClient 4.x a CookieStore on the client would be needed instead
// client.getState().addCookie(c);
}
}
response = client.execute(m);
for(org.apache.http.Header header:response.getAllHeaders()){
log.debug(header.getName()+"="+header.getValue());
}
code = response.getStatusLine().getStatusCode(); // assign to the 'code' field inherited from HttpTask so the catch block can read it
redirectUrl = m.getURI().toString();
if(code==404){
log.warn("page has removed code:"+code+" for url:"+url);
return null;
}
if(code == 403 || code == 503 ){//|| code==500){
log.warn("ip is forbidden code:"+code+" for url:"+url);
throw new Exception("Exception response code :"+code);
}
if(code!=200){
log.warn("code:"+code+" for url:"+url + " try after 10s");
sleep(10);
throw new Exception("Exception response code :"+code);
}
HttpEntity entity = response.getEntity();
String result = EntityUtils.toString(entity);
if(TravelUtils.isEmpty(result)){
throw new Exception("Exception http response is empty ");
}
EntityUtils.consume(entity);
return result;
}catch(Exception e){
log.info("http request try "+(i+1)+" times url:"+url);
log.warn("exception proxy:"+(proxy==null?"local":proxy.getIp()), e);
//if code is 403 503 ,then there is no need to retry
sleep(1);
if((code==403||code==503)&&i==2){
if(proxy!=null)proxy.setBad(true); // guard: proxy is null when fetching without one
break;
}
}finally{
if(m!=null){
m.abort(); // abort the request first, then release the connection
m.releaseConnection();
}
try {
if(response!=null)response.close();
} catch (IOException e) {
log.warn("exception",e);
}
try {
if(client!=null)client.close();
} catch (IOException e) {
log.warn("exception",e);
}
}
}
log.warn("url request exception:"+url);
if(proxy!=null)proxy.setBad(true);
return null;
}
}
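A usage sketch, assuming the HttpTask base class (referenced but not shown in this post) supplies the proxy, headers, cookies, and redirectUrl fields; the URL here is illustrative:

NewFetchTask task = new NewFetchTask();
task.url = "http://www.example.com/page/1";
String html = task.run(); // null once all retries are exhausted
if (html != null) {
    // hand the HTML to Jsoup as in the parsing sketch above
}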
2. Fetching asynchronous AJAX requests
import java.io.InputStream;
import java.util.Map;
import java.util.concurrent.Future;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.AsyncHttpClient.BoundRequestBuilder;
import com.ning.http.client.ProxyServer;
import com.ning.http.client.Response;
import com.travel.comment.fetcher.common.HttpCommonUtils;
import com.travel.utils.TravelUtils;
public class AsyncFetchTask extends HttpTask{
static public final Log log = LogFactory.getLog(AsyncFetchTask.class);
public String url;
public int trynt = 3;
public Map<String,String> params;
public String charsetName = "UTF-8";
public String run(){
for(int i=0;i<trynt;i++){
AsyncHttpClient client = new AsyncHttpClient();
try {
BoundRequestBuilder requestBuilder = client.preparePost(url);
if(params!=null)for(String k:params.keySet()){
requestBuilder.addFormParam(k, params.get(k));
}
if(headers!=null){
for(Header header:headers){
requestBuilder.addHeader(header.getName(), header.getValue());
}
}
if(proxy!=null && TravelUtils.isNotEmpty(proxy.getIp())){
log.info("use proxy "+proxy.getIp()+" port:"+proxy.getPort());
requestBuilder.setProxyServer(new ProxyServer(proxy.getIp(),
proxy.getPort()));
}else log.info("use local http");
if(cookies!=null){
for(Cookie c:cookies){
requestBuilder.addCookie(convert(c));
}
}
Future<Response> f = requestBuilder.execute();
Response r = f.get();
code = r.getStatusCode();
if(code==404){
log.warn("page has removed code:"+code+" for url:"+url);
return null;
}
if(code == 403 || code == 503 || code==500){
log.warn("ip is forbidden code:"+code+" for url:"+url);
throw new Exception("Exception response code :"+code);
}
if(code!=200){
log.warn("code:"+code+" for url:"+url);
sleep(10);
throw new Exception("Exception response code :"+code);
}
InputStream in = r.getResponseBodyAsStream();
String result = HttpCommonUtils.readFromInputStream(in,charsetName);
if(TravelUtils.isEmpty(result)){
throw new Exception("Exception http response is empty ");
}
return result;
} catch (Exception e) {
log.info("http request try "+(i+1)+" times url:"+url);
log.warn("exception",e);
//if code is 403 503 ,then there is no need to retry
if(code==403||code==503){
if(proxy!=null)proxy.setBad(true);
break;
}else sleep(1);
} finally{
client.close();
}
}
log.warn("url request exception:"+url);
if(proxy!=null)proxy.setBad(true);
return null;
}
// Adapt a commons-httpclient Cookie to the AsyncHttpClient cookie type;
// the expires/maxAge values below are arbitrary placeholders.
public com.ning.http.client.cookie.Cookie convert(Cookie c){
return new com.ning.http.client.cookie.Cookie(c.getName(),
c.getValue(),
"", c.getDomain(), c.getPath(), 100000,
10000, false, false);
}
}
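One design note on the sketch above: it builds a new AsyncHttpClient on every retry, and each instance owns its own thread pool and connection pool, which is expensive. If you adapt this code, sharing a single client across tasks and closing it once at shutdown is the cheaper pattern.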
3. HTTP POST with a JSON body
import java.io.IOException;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.travel.utils.ConfigTool;
import com.travel.utils.TravelUtils;
public class PostJsonTask extends HttpTask {
static public final Log log = LogFactory.getLog(PostJsonTask.class);
static public int TIMEOUT_MS;
static{
TIMEOUT_MS = Integer.parseInt(ConfigTool.props.getProperty("timeout_ts", "20000"));
}
public String url;
public int trynt = 10;
public String charsetName = "UTF-8";
public String run() {
for(int i=0;i<trynt;i++){
HttpPost m = null;
CloseableHttpClient client = HttpClients.createDefault();
CloseableHttpResponse response = null;
RequestConfig config = null;
try{
if(proxy!=null && TravelUtils.isNotEmpty(proxy.getIp())){
log.info("use proxy "+proxy.getIp()+" port:"+proxy.getPort());
HttpHost httpProxy = new HttpHost(proxy.getIp(), proxy.getPort(), "http");
config = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_MS)
.setConnectTimeout(TIMEOUT_MS)
.setCircularRedirectsAllowed(true)
.setConnectionRequestTimeout(TIMEOUT_MS)
.setRedirectsEnabled(true)
.setProxy(httpProxy)
.build();
}else {
// same timeouts as the proxy path, so non-proxy requests cannot hang forever
config = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_MS)
.setConnectTimeout(TIMEOUT_MS)
.setCircularRedirectsAllowed(true)
.setRedirectsEnabled(true)
.setConnectionRequestTimeout(TIMEOUT_MS)
.build();
log.info("use local http");
}
m = new HttpPost(url);
if(config!=null)m.setConfig(config);
if(params!=null){
//TODO
}
if(TravelUtils.isNotEmpty(jsonParams)){
StringEntity entity = new StringEntity(jsonParams);
entity.setContentEncoding("UTF-8");
entity.setContentType("application/json");
m.setEntity(entity);
}
if(headers!=null){
for(Header header:headers){
m.addHeader(convert(header));
}
}
if(cookies!=null){
for(Cookie c:cookies){
// HttpClient 3.x API; with HttpClient 4.x a CookieStore on the client would be needed instead
// client.getState().addCookie(c);
}
}
response = client.execute(m);
for(org.apache.http.Header header:response.getAllHeaders()){
log.debug(header.getName()+"="+header.getValue());
}
code = response.getStatusLine().getStatusCode(); // assign to the 'code' field inherited from HttpTask so the catch block can read it
redirectUrl = m.getURI().toString();
if(code==404){
log.warn("page has removed code:"+code+" for url:"+url);
return null;
}
if(code == 403 || code == 503){
log.warn("ip is forbidden code:"+code+" for url:"+url);
throw new Exception("Exception response code :"+code);
}
if(code!=200){
log.warn("code:"+code+" for url:"+url + " try after 10s");
sleep(10);
throw new Exception("Exception response code :"+code);
}
HttpEntity entity = response.getEntity();
String result = EntityUtils.toString(entity);
if(TravelUtils.isEmpty(result)){
throw new Exception("Exception http response is empty ");
}
EntityUtils.consume(entity);
return result;
}catch(Exception e){
log.info("http request try "+(i+1)+" times url:"+url);
log.warn("exception proxy:"+(proxy==null?"local":proxy.getIp()), e);
//if code is 403 503 ,then there is no need to retry
sleep(1);
if((code==403||code==503)&&i==2){
if(proxy!=null)proxy.setBad(true); // guard: proxy is null when fetching without one
break;
}
}finally{
if(m!=null){
m.abort(); // abort the request first, then release the connection
m.releaseConnection();
}
try {
if(response!=null)response.close();
} catch (IOException e) {
log.warn("exception",e);
}
try {
if(client!=null)client.close();
} catch (IOException e) {
log.warn("exception",e);
}
}
}
log.warn("url request exception:"+url);
if(proxy!=null)proxy.setBad(true);
return null;
}
}
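A usage sketch, assuming jsonParams is a String field inherited from HttpTask (it is referenced above but declared elsewhere); the endpoint and body are illustrative:

PostJsonTask task = new PostJsonTask();
task.url = "http://www.example.com/api/search";
task.jsonParams = "{\"city\":\"beijing\",\"page\":1}";
String body = task.run(); // null once all retries are exhausted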
The three classes above cover the different request styles needed to fetch different kinds of pages.
5. Controlling Crawl Frequency
HTTP concurrency control
import java.util.Calendar;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.stereotype.Component;
import com.travel.utils.ConfigTool;
import com.travel.utils.SourceName;
@Component
public class ConcurrencyController implements InitializingBean, DisposableBean {
static public final Log log = LogFactory
.getLog(ConcurrencyController.class);
private int sourceTurnOffTime;
Map<String, Semaphore> semaphoreMap;
Map<String, Long> sourceSwitch;
public boolean turnOff(String source,long untilTime){
sourceSwitch.put(source, untilTime);
return true;
}
public boolean turnOffWithFixedTime(String source){
Calendar calendar = Calendar.getInstance();
calendar.add(Calendar.MINUTE, sourceTurnOffTime); // add() rolls over hours/days correctly
sourceSwitch.put(source, calendar.getTimeInMillis());
log.warn("stop fetch:"+source+" until "+calendar.getTime());
return true;
}
public boolean turnOn(String source){
long currTs = Calendar.getInstance().getTimeInMillis();
sourceSwitch.put(source, currTs);
return true;
}
public boolean isSourceTurnOn(String source){
long currTs = Calendar.getInstance().getTimeInMillis();
Long ts = sourceSwitch.get(source); // null for a source that was never switched off
if(ts==null || currTs > ts)return true;
return false;
}
public void borrow(String sourceName) {
try {
semaphoreMap.get(sourceName).acquire();
log.info("source:" + sourceName + " borrow a token");
log.info(sourceName + " remain token:"
+ semaphoreMap.get(sourceName).availablePermits());
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public void release(String sourceName) {
try {
semaphoreMap.get(sourceName).release();
log.info("source:" + sourceName + " return a token");
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public void afterPropertiesSet() throws Exception {
// Assumed initialization: one semaphore per source. The property names
// "source_turn_off_time" and "source_concurrency" are illustrative.
sourceTurnOffTime = Integer.parseInt(ConfigTool.props.getProperty("source_turn_off_time", "10"));
int permits = Integer.parseInt(ConfigTool.props.getProperty("source_concurrency", "5"));
semaphoreMap = new ConcurrentHashMap<String, Semaphore>();
sourceSwitch = new ConcurrentHashMap<String, Long>();
for(String source:SourceName.COMMENT_SOURCES){
semaphoreMap.put(source, new Semaphore(permits));
sourceSwitch.put(source, 0L);
}
}
@Override
public void destroy() throws Exception {
}
}
A semaphore gates the requests here: before each request the crawler must take a "permit", and only a caller holding one may proceed; otherwise the call
semaphoreMap.get(sourceName).acquire();
blocks the crawler at that point.
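A usage sketch (the source name is illustrative): wrapping every fetch in borrow/release caps the number of in-flight requests per source at the semaphore's permit count.

controller.borrow("yelp");      // blocks until a permit is available
try {
    String html = task.run();   // the actual page fetch
} finally {
    controller.release("yelp"); // always return the permit
}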
The permit counts for these semaphores are set by hand. If you route requests through proxy servers, access can instead be gated as follows: only a thread that has obtained a proxy is allowed to continue.
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import com.travel.utils.ConfigTool;
import com.travel.utils.SourceName;
// Proxy, ProxyCheckService, ExecutorServiceFactory and FileUtils are
// project-internal classes assumed to be visible from this package.
@Component
public class ProxyPool implements InitializingBean{
static public final Log log = LogFactory.getLog(ProxyPool.class);
private int proxy_delay;
Map<String,Deque<Proxy>> proxyMap;
ScheduledExecutorService exe ;
public Proxy borrow(String source){
Proxy proxy = proxyMap.get(source).pollFirst();
if(proxy!=null){
log.info(source+" borrow:"+proxy.getIp());
}else{
log.info(source+" failed to borrow");
}
return proxy;
}
public Proxy borrowUntill(String source){
Proxy proxy = borrow(source);
while(proxy==null){
try {
TimeUnit.SECONDS.sleep(10);
log.info(source+" cann't borrow proxy.");
} catch (InterruptedException e) {
log.warn("InterruptedException",e);
}
proxy = borrow(source);
}
return proxy;
}
public boolean release(String source,Proxy proxy){
if(isOneShotProxy==1)return true;
if(proxy.isBad()){
releaseWithDelay(source, proxy);
}else{
while(true){
try{
proxyMap.get(source).addLast(proxy);
log.info("return proxy:"+source+" ip:"+proxy.getIp());
return true;
}catch(Exception e){
log.warn("exception",e);
}
}
}
return true;
}
public void releaseWithDelay(final String source,final Proxy proxy){
if(isAutoProxy==1)return;
log.info("release source with delay:"+source+" ip:"+proxy.getIp());
exe.schedule(new Runnable(){
public void run(){
while(true){
try{
proxy.setBad(false);
proxyMap.get(source).addLast(proxy);
log.info("return proxy after delay:"+source+" ip:"+proxy.getIp());
return;
}catch(Exception e){
log.warn("exception",e);
}
}
}
}, proxy_delay, TimeUnit.MINUTES);
}
private int isAutoProxy ;
private int isOneShotProxy;
@Autowired
private ProxyCheckService proxyCheckService;
@Autowired
private ExecutorServiceFactory threadPoolFactory;
@Override
public void afterPropertiesSet() throws Exception {
final String[] sources = ConfigTool.props.getProperty("source_list","yelp").split(",");
isOneShotProxy = Integer.parseInt(ConfigTool.props.getProperty("is_one_shot_proxy", "0"));
int proxyFlag = Integer.parseInt(ConfigTool.props.getProperty("proxy","1"));
int useLocal = Integer.parseInt(ConfigTool.props.getProperty("use_local","1"));
// int loadProxyFreq = Integer.parseInt(ConfigTool.props.getProperty("load_proxy_freq", "5"));
proxy_delay = Integer.parseInt(ConfigTool.props
.getProperty("proxy_delay", "1"));
exe = threadPoolFactory.newScheduledExecutorService("proxy-pool-thread", 10, Thread.NORM_PRIORITY);
List<String> proxys = FileUtils.readLinesFromFile("proxy_list");
proxyMap = new ConcurrentHashMap<String,Deque<Proxy>>();
for(String source:SourceName.COMMENT_SOURCES){
proxyMap.put(source, new LinkedBlockingDeque<Proxy>());
if(useLocal==1){
Proxy local = new Proxy("",0);
proxyMap.get(source).addLast(local);
}
}
exe.scheduleWithFixedDelay(new Runnable(){
public void run(){
for(String source:SourceName.COMMENT_SOURCES){
log.info("【proxy pool】source:"+source+"proxy number:"+proxyMap.get(source).size());
}
}
}, 10,10, TimeUnit.SECONDS);
if(proxyFlag==0)return;
isAutoProxy = Integer.parseInt(ConfigTool.props.getProperty("is_auto_proxy", "0"));
if(isAutoProxy!=1)for(String str:proxys){
log.info("load proxy:"+str);
String[] p = str.split("=");
for(String source:SourceName.COMMENT_SOURCES){
Proxy proxy = new Proxy(p[0],Integer.parseInt(p[1]));
proxyMap.get(source).addLast(proxy);
}
}
if(isAutoProxy==1){
exe.submit(new Runnable(){
public void run(){
log.info("load proxy with fixed delay");
while(true){
try{
//loadAutoProxy(sources);
for(String source:sources){
proxyCheckService.fetchProxy(proxyMap,source);
}
TimeUnit.SECONDS.sleep(5);
}catch(Exception e){
log.warn("exceptino",e);
}
}
}
});
}
}
public void loadAutoProxy(String[] sources){
log.info("start loadAutoProxy");
List<String> proxys = FileUtils.readLinesFromFile("proxy_list");
for(String str:proxys){
String[] fields = str.split("\t");
String[] p = fields[0].split(":");
String source = fields[1];
Proxy proxy = new Proxy(p[0],Integer.parseInt(p[1]));
Deque<Proxy> proxyDeque = proxyMap.get(source);
if(proxyDeque==null)continue;
proxyDeque.addLast(proxy);
log.info("load proxy:"+p[0]+"="+p[1]);
}
}
}
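Borrowing follows the same token pattern; a sketch (the proxy field is assumed to come from HttpTask, as in the fetch tasks above):

Proxy proxy = proxyPool.borrowUntill("yelp"); // blocks until a proxy is free
try {
    task.proxy = proxy;
    String html = task.run();
} finally {
    proxyPool.release("yelp", proxy); // bad proxies come back after a delay
}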
6. Concurrency Scheduling Strategy (to be continued)
7. Notes on Open-Source Crawlers: Nutch / Scrapy / Heritrix (to be continued)
8. Summary of Problems Encountered (to be continued)
One last note: our team is looking for a data-development engineer; if you are interested, send me a private message or reply in the comments.