这一系列文章主要分析 webmagic 框架,没有实战内容;如有实战问题可以讨论,也可以提供技术支持。
欢迎加群 313557283(刚创建),小白互相学习~
我们先来看看 Downloader 接口:
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Contract for the component that downloads web pages and stores them in a {@link Page} object.
 * <p>
 * A {@link #setThread(int)} method is part of the contract because the downloader is always the
 * bottleneck of a crawler: implementations usually rely on some pooling mechanism, and the pool
 * size is related to the number of threads.
 *
 * @author [email protected]
 * @since 0.1.0
 */
public interface Downloader {

    /**
     * Downloads the web page described by the request and stores it in a Page object.
     *
     * @param request request
     * @param task    task
     * @return page
     */
    Page download(Request request, Task task);

    /**
     * Tells the downloader how many threads the spider uses.
     *
     * @param threadNum number of threads
     */
    void setThread(int threadNum);
}
该接口提供了两个方法:一个用于设置线程数,另一个是下载方法。
我们再来看看框架默认使用的 Downloader 实现类 HttpClientDownloader:
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
/**
 * The http downloader based on HttpClient.
 * <p>
 * One {@link CloseableHttpClient} is created per site domain and cached; the cache is only
 * touched while holding this object's monitor.
 *
 * @author [email protected]
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Clients keyed by site domain. Guarded by {@code synchronized (this)}. */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    /** Builds the HttpClient instances from a site's settings. */
    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    /** Optional source of proxies; {@code null} means no proxy is used. */
    private ProxyProvider proxyProvider;

    /** When true, the response headers are copied onto the resulting page. */
    private boolean responseHeader = true;

    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Returns the client for the given site, creating and caching it on first use.
     *
     * @param site the site whose settings configure the client; when null the generator is asked
     *             for a client with a null site and nothing is cached
     * @return the client to execute requests with
     */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            // NOTE(review): confirm that HttpClientGenerator.getClient tolerates a null site.
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        // HashMap is not safe for a lock-free read while another thread may be inserting,
        // so the lookup and the insertion both happen under the same lock.
        synchronized (this) {
            CloseableHttpClient httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = httpClientGenerator.getClient(site);
                httpClients.put(domain, httpClient);
            }
            return httpClient;
        }
    }

    /**
     * Executes the request and wraps the response in a {@link Page}.
     *
     * @param request the request to execute
     * @param task    the task providing the site configuration; neither it nor its site may be null
     * @return the downloaded page, or the page produced by {@code Page.fail()} when an
     *         {@link IOException} occurred
     * @throws NullPointerException if the task or its site is null
     */
    @Override
    public Page download(Request request, Task task) {
        // When writing a custom downloader, the part to adapt is mainly everything from here
        // down to the handleResponse(...) call.
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        // The client is configured from the site's settings by HttpClientGenerator.
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        // Use a proxy only when a provider has been configured.
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        // Start from a failed page so that the error path has something to return.
        Page page = Page.fail();
        try {
            // Send the converted request with its matching client context.
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            // A charset set on the request takes precedence over the site's charset.
            String charset = request.getCharset() != null ? request.getCharset() : task.getSite().getCharset();
            page = handleResponse(request, charset, httpResponse, task);
            onSuccess(request);
            logger.info("downloading page success {}", request.getUrl());
            return page;
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return page;
        } finally {
            if (httpResponse != null) {
                //ensure the connection is released back to pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                // NOTE(review): the original author marked this call as "not implemented" --
                // confirm against the ProxyProvider implementation in use.
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * Converts an HTTP response into a {@link Page}.
     *
     * @param request      the request that produced the response
     * @param charset      charset used to decode the body, or null to detect it from the content
     * @param httpResponse the response to convert
     * @param task         the running task (not used by this implementation)
     * @return a page marked as successfully downloaded
     * @throws IOException if the body cannot be read or the charset is not supported
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        // getEntity() is null for responses without a body; treat that as an empty body instead of
        // failing with a NullPointerException that download() would not catch.
        boolean hasEntity = httpResponse.getEntity() != null;
        byte[] bytes = hasEntity ? IOUtils.toByteArray(httpResponse.getEntity().getContent()) : new byte[0];
        String contentType = "";
        if (hasEntity && httpResponse.getEntity().getContentType() != null) {
            contentType = httpResponse.getEntity().getContentType().getValue();
        }
        Page page = new Page();
        page.setBytes(bytes);
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        page.setDownloadSuccess(true);
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    /**
     * Detects the charset of the content, falling back to the JVM default charset (with a warning)
     * when detection yields nothing.
     */
    private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Charset.defaultCharset().name();
            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
        }
        return charset;
    }
}
另一个比较重要的类是 HttpClientGenerator,主要负责构造下载所用的 HttpClient 及其参数。
这里就不逐行解析了,有疑问的话老规矩:加群讨论。
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
/**
* @author [email protected]
* @since 0.4.0
*/
public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
Registry reg = RegistryBuilder.create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100);
}
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
SSLContext sc = SSLContext.getInstance("SSLv3");
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
}
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
});
}
//解决post/redirect/post 302跳转问题
httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
socketConfigBuilder.setSoTimeout(site.getTimeOut());
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
if (site.isDisableCookieManagement()) {
httpClientBuilder.disableCookieManagement();
return;
}
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) {
for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(domainEntry.getKey());
cookieStore.addCookie(cookie);
}
}
httpClientBuilder.setDefaultCookieStore(cookieStore);
}
}
目前框架还提供了另外两个 downloader,主要用于模拟浏览器。个人比较推荐 Selenium 搭配 ChromeDriver(Google 的浏览器驱动):Google 目前比较重视自动化测试,更新很快。
下面直接贴代码:
PhantomJSDownloader
package us.codecraft.webmagic.downloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
import java.io.*;
/**
 * this downloader is used to download pages which need to render the javascript
 * <p>
 * Each download starts an external phantomjs process that runs a crawl script and prints the
 * rendered page to its standard output.
 *
 * @author [email protected]
 * @version 0.5.3
 */
public class PhantomJSDownloader extends AbstractDownloader {

    private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

    /** Text printed by crawl.js when the page could not be loaded. */
    private static final String FAILURE_MARKER = "HTTP request failed";

    // NOTE(review): both settings are static although they are assigned by the constructors, so
    // the most recently constructed instance determines them for all instances.
    private static String crawlJsPath;
    private static String phantomJsCommand = "phantomjs"; // default

    private int retryNum;
    private int threadNum;

    public PhantomJSDownloader() {
        this.initPhantomjsCrawlPath();
    }

    /**
     * Constructor that supports a custom phantomjs command.
     *
     * example:
     * phantomjs.exe                         for Windows
     * phantomjs --ignore-ssl-errors=yes     ignore some errors when the crawled address is https
     * /usr/local/bin/phantomjs              absolute path of the command, avoids an IOException
     *                                       caused by the system environment variables
     *
     * @param phantomJsCommand phantomJsCommand
     */
    public PhantomJSDownloader(String phantomJsCommand) {
        this.initPhantomjsCrawlPath();
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
    }

    /**
     * Constructor that supports a custom crawl.js path, because when another project depends on
     * this jar the spawned phantomjs process cannot use the crawl.js packaged inside the jar.
     *
     * crawl.js start --
     *
     * var system = require('system');
     * var url = system.args[1];
     *
     * var page = require('webpage').create();
     * page.settings.loadImages = false;
     * page.settings.resourceTimeout = 5000;
     *
     * page.open(url, function (status) {
     * if (status != 'success') {
     * console.log("HTTP request failed!");
     * } else {
     * console.log(page.content);
     * }
     *
     * page.close();
     * phantom.exit();
     * });
     *
     * -- crawl.js end
     *
     * The js code above can be copied into your own project.
     *
     * example:
     * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
     *
     * @param phantomJsCommand phantomJsCommand
     * @param crawlJsPath crawlJsPath
     */
    public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
        PhantomJSDownloader.crawlJsPath = crawlJsPath;
    }

    /** Points crawlJsPath at a crawl.js file in the root of the classpath resources directory. */
    private void initPhantomjsCrawlPath() {
        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
    }

    /**
     * Renders the requested url with phantomjs, retrying up to {@link #getRetryNum()} more times
     * while the attempt fails.
     *
     * @param request the request whose url is rendered
     * @param task    the running task (not used by this implementation)
     * @return a page holding the rendered content with status code 200, or the page produced by
     *         {@code Page.fail()} (with the request attached) when every attempt failed
     */
    @Override
    public Page download(Request request, Task task) {
        if (logger.isInfoEnabled()) {
            logger.info("downloading page: " + request.getUrl());
        }
        String content = getPage(request);
        for (int attempt = 1; isFailed(content) && attempt <= getRetryNum(); attempt++) {
            content = getPage(request);
        }
        if (isFailed(content)) {
            //when failed
            Page failed = Page.fail();
            failed.setRequest(request);
            return failed;
        }
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(200);
        return page;
    }

    /** An attempt failed when the process could not be run or the script reported a failure. */
    private static boolean isFailed(String content) {
        return content == null || content.contains(FAILURE_MARKER);
    }

    @Override
    public void setThread(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * Runs phantomjs with the crawl script for the request's url and collects its standard output.
     *
     * @param request the request whose url is rendered
     * @return the process output with every line terminated by {@code "\n"}, or null when the
     *         process could not be started or its output could not be read
     */
    protected String getPage(Request request) {
        String url = request.getUrl();
        Process process = null;
        BufferedReader reader = null;
        try {
            process = new ProcessBuilder(buildCommand(url)).start();
            // NOTE(review): the output is decoded with the platform default charset, as before.
            reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
            return output.toString();
        } catch (IOException e) {
            logger.warn("failed to run phantomjs for " + url, e);
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done when closing the process output fails
                }
            }
            if (process != null) {
                // The output has been fully read (or reading failed); release the process.
                process.destroy();
            }
        }
    }

    /**
     * Assembles the argument vector: the whitespace-separated tokens of the phantomjs command
     * (which may carry options), then the script path and the url as one argument each, so that
     * whitespace inside the path or the url cannot be split into extra arguments.
     */
    private static String[] buildCommand(String url) {
        String[] launcher = phantomJsCommand.trim().split("\\s+");
        String[] command = new String[launcher.length + 2];
        System.arraycopy(launcher, 0, command, 0, launcher.length);
        command[launcher.length] = crawlJsPath.trim();
        command[launcher.length + 1] = url;
        return command;
    }

    public int getRetryNum() {
        return retryNum;
    }

    /**
     * Sets how many additional attempts are made after a failed download.
     *
     * @param retryNum number of retries
     * @return this
     */
    public PhantomJSDownloader setRetryNum(int retryNum) {
        this.retryNum = retryNum;
        return this;
    }
}
SeleniumDownloader
package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
/**
 * Uses Selenium to drive a browser that renders the page. Currently only chrome is supported.
 * A Selenium driver has to be downloaded separately.
 *
 * @author [email protected]
 * Date: 13-7-26
 * Time: 1:37 PM
 */
public class SeleniumDownloader implements Downloader, Closeable {

    /** Created lazily on the first download; volatile so the unlocked null check is reliable. */
    private volatile WebDriverPool webDriverPool;

    private Logger logger = Logger.getLogger(getClass());

    /** Milliseconds to wait after opening a page before its content is read. */
    private int sleepTime = 0;

    private int poolSize = 1;

    private static final String DRIVER_PHANTOMJS = "phantomjs";

    /**
     * Creates a downloader and registers the chrome driver location in the
     * {@code webdriver.chrome.driver} system property.
     *
     * @param chromeDriverPath chromeDriverPath
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver",
                chromeDriverPath);
    }

    /**
     * Constructor without any filed. Construct PhantomJS browser
     *
     * @author [email protected]
     */
    public SeleniumDownloader() {
        // System.setProperty("phantomjs.binary.path",
        // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
    }

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Opens the request's url in a pooled browser, waits for the configured sleep time, adds the
     * site's cookies and returns the outer HTML of the document as a page.
     *
     * @param request the request whose url is opened
     * @param task    the task providing the site (and its cookies)
     * @return the rendered page, or null when the thread was interrupted while waiting for a
     *         browser from the pool
     */
    @Override
    public Page download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            logger.warn("interrupted", e);
            return null;
        }
        boolean interrupted = false;
        try {
            logger.info("downloading page " + request.getUrl());
            webDriver.get(request.getUrl());
            try {
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                // Finish this page as before, but remember to restore the interrupt flag.
                interrupted = true;
                logger.warn("interrupted while waiting for page to load", e);
            }
            WebDriver.Options manage = webDriver.manage();
            Site site = task.getSite();
            if (site != null && site.getCookies() != null) {
                for (Map.Entry<String, String> cookieEntry : site.getCookies()
                        .entrySet()) {
                    Cookie cookie = new Cookie(cookieEntry.getKey(),
                            cookieEntry.getValue());
                    manage.addCookie(cookie);
                }
            }

            /*
             * TODO You can add mouse event or other processes
             *
             * @author: [email protected]
             */

            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } finally {
            // Always hand the browser back, even when rendering threw, so the pool is not drained.
            webDriverPool.returnToPool(webDriver);
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /** Creates the driver pool on first use; the second check keeps it to a single instance. */
    private void checkInit() {
        if (webDriverPool == null) {
            synchronized (this) {
                if (webDriverPool == null) {
                    webDriverPool = new WebDriverPool(poolSize);
                }
            }
        }
    }

    /**
     * Sets the size used for the driver pool when it is created on the first download.
     *
     * @param thread number of threads, used as the pool size
     */
    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    /**
     * Closes every driver in the pool; does nothing when no download has created the pool yet.
     */
    @Override
    public void close() throws IOException {
        WebDriverPool pool = webDriverPool;
        if (pool != null) {
            pool.closeAll();
        }
    }
}
如果想要更换下载器,可以自己写一个类来实现 Downloader 接口,
在保持接口形式不变的前提下,改用其他组件下载,或者配置动态代理,
因为框架自带的这部分实现还不是很完整。