这一系列文章主要分析 webmagic 框架,不包含实战内容;如有实战问题可以一起讨论,我也可以提供技术支持。
欢迎加群313557283(刚创建),小白互相学习~
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.HttpConstant;
import java.util.*;
/**
* Object contains setting for crawler.
*
* @author [email protected]
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
//下载器初始化需要site里面提供header
public class Site {
//域名
private String domain;
//浏览器表示
private String userAgent;
//cookies
private Map defaultCookies = new LinkedHashMap();
//cookies
private Map> cookies = new HashMap>();
//编码
private String charset;
//默认线程休眠时间5秒
private int sleepTime = 5000;
//重试次数
private int retryTimes = 0;
//循环重试次数
private int cycleRetryTimes = 0;
//重试休眠时间1秒
private int retrySleepTime = 1000;
//默认请求5秒未建立连接就关闭
private int timeOut = 5000;
//状态码集合
private static final Set DEFAULT_STATUS_CODE_SET = new HashSet();
//状态码集合
private Set acceptStatCode = DEFAULT_STATUS_CODE_SET;
//header,Map集合
private Map headers = new HashMap();
//默认使用gzip 压缩
private boolean useGzip = true;
//需不需要cookie 管理
private boolean disableCookieManagement = false;
static {
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
}
/**
* new a Site
*
* @return new site
*/
public static Site me() {
return new Site();
}
/**
* Add a cookie with domain {@link #getDomain()}
*
* @param name name
* @param value value
* @return this
*/
public Site addCookie(String name, String value) {
defaultCookies.put(name, value);
return this;
}
/**
* Add a cookie with specific domain.
*
* @param domain domain
* @param name name
* @param value value
* @return this
*/
public Site addCookie(String domain, String name, String value) {
if (!cookies.containsKey(domain)){
cookies.put(domain,new HashMap());
}
cookies.get(domain).put(name, value);
return this;
}
/**
* set user agent
*
* @param userAgent userAgent
* @return this
*/
public Site setUserAgent(String userAgent) {
this.userAgent = userAgent;
return this;
}
/**
* get cookies
*
* @return get cookies
*/
public Map getCookies() {
return defaultCookies;
}
/**
* get cookies of all domains
*
* @return get cookies
*/
public Map> getAllCookies() {
return cookies;
}
/**
* get user agent
*
* @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
* get domain
*
* @return get domain
*/
public String getDomain() {
return domain;
}
/**
* set the domain of site.
*
* @param domain domain
* @return this
*/
public Site setDomain(String domain) {
this.domain = domain;
return this;
}
/**
* Set charset of page manually.
* When charset is not set or set to null, it can be auto detected by Http header.
*
* @param charset charset
* @return this
*/
public Site setCharset(String charset) {
this.charset = charset;
return this;
}
/**
* get charset set manually
*
* @return charset
*/
public String getCharset() {
return charset;
}
public int getTimeOut() {
return timeOut;
}
/**
* set timeout for downloader in ms
*
* @param timeOut timeOut
* @return this
*/
public Site setTimeOut(int timeOut) {
this.timeOut = timeOut;
return this;
}
/**
* Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
* {200} by default.
* It is not necessarily to be set.
*
* @param acceptStatCode acceptStatCode
* @return this
*/
//可接受的状态码 比如 我只想要200 其他都放弃
public Site setAcceptStatCode(Set acceptStatCode) {
this.acceptStatCode = acceptStatCode;
return this;
}
/**
* get acceptStatCode
*
* @return acceptStatCode
*/
public Set getAcceptStatCode() {
return acceptStatCode;
}
/**
* Set the interval between the processing of two pages.
* Time unit is micro seconds.
*
* @param sleepTime sleepTime
* @return this
*/
public Site setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
/**
* Get the interval between the processing of two pages.
* Time unit is micro seconds.
*
* @return the interval between the processing of two pages,
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Get retry times immediately when download fail, 0 by default.
*
* @return retry times when download fail
*/
public int getRetryTimes() {
return retryTimes;
}
public Map getHeaders() {
return headers;
}
/**
* Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
*
* @param key key of http header, there are some keys constant in {@link HttpConstant.Header}
* @param value value of header
* @return this
*/
public Site addHeader(String key, String value) {
headers.put(key, value);
return this;
}
/**
* Set retry times when download fail, 0 by default.
*
* @param retryTimes retryTimes
* @return this
*/
public Site setRetryTimes(int retryTimes) {
this.retryTimes = retryTimes;
return this;
}
/**
* When cycleRetryTimes is more than 0, it will add back to scheduler and try download again.
*
* @return retry times when download fail
*/
public int getCycleRetryTimes() {
return cycleRetryTimes;
}
/**
* Set cycleRetryTimes times when download fail, 0 by default.
*
* @param cycleRetryTimes cycleRetryTimes
* @return this
*/
public Site setCycleRetryTimes(int cycleRetryTimes) {
this.cycleRetryTimes = cycleRetryTimes;
return this;
}
public boolean isUseGzip() {
return useGzip;
}
public int getRetrySleepTime() {
return retrySleepTime;
}
/**
* Set retry sleep times when download fail, 1000 by default.
*
* @param retrySleepTime retrySleepTime
* @return this
*/
public Site setRetrySleepTime(int retrySleepTime) {
this.retrySleepTime = retrySleepTime;
return this;
}
/**
* Whether use gzip.
* Default is true, you can set it to false to disable gzip.
*
* @param useGzip useGzip
* @return this
*/
public Site setUseGzip(boolean useGzip) {
this.useGzip = useGzip;
return this;
}
public boolean isDisableCookieManagement() {
return disableCookieManagement;
}
/**
* Downloader is supposed to store response cookie.
* Disable it to ignore all cookie fields and stay clean.
* Warning: Set cookie will still NOT work if disableCookieManagement is true.
* @param disableCookieManagement disableCookieManagement
* @return this
*/
//下载器提供了cookie 管理
public Site setDisableCookieManagement(boolean disableCookieManagement) {
this.disableCookieManagement = disableCookieManagement;
return this;
}
//返回任务信息
public Task toTask() {
return new Task() {
@Override
public String getUUID() {
String uuid = Site.this.getDomain();
if (uuid == null) {
uuid = UUID.randomUUID().toString();
}
return uuid;
}
@Override
public Site getSite() {
return Site.this;
}
};
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Site site = (Site) o;
if (cycleRetryTimes != site.cycleRetryTimes) return false;
if (retryTimes != site.retryTimes) return false;
if (sleepTime != site.sleepTime) return false;
if (timeOut != site.timeOut) return false;
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
}
@Override
public int hashCode() {
int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
result = 31 * result + timeOut;
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "Site{" +
"domain='" + domain + '\'' +
", userAgent='" + userAgent + '\'' +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
", sleepTime=" + sleepTime +
", retryTimes=" + retryTimes +
", cycleRetryTimes=" + cycleRetryTimes +
", timeOut=" + timeOut +
", acceptStatCode=" + acceptStatCode +
", headers=" + headers +
'}';
}
}