Learning Web Crawlers from Scratch (18) Refactoring: WebMagic Framework Analysis - Site

This series focuses on analyzing the WebMagic framework and contains no hands-on project content. If you have practical questions, feel free to raise them for discussion; I can also provide technical support.


Welcome to join group 313557283 (newly created), where beginners can learn from each other~


Site

package us.codecraft.webmagic;

import us.codecraft.webmagic.utils.HttpConstant;

import java.util.*;

/**
 * Object contains setting for crawler.
 * @author [email protected]
 * @see us.codecraft.webmagic.processor.PageProcessor
 * @since 0.1.0
 */
// The downloader reads its headers (and the rest of its configuration) from Site when it is initialized.
public class Site {

    // domain
    private String domain;

    // User-Agent string
    private String userAgent;

    // cookies for the default domain
    private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();

    // cookies grouped by domain
    private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();

    // page charset
    private String charset;

    // default sleep time between two pages: 5 seconds
    private int sleepTime = 5000;

    // retry times
    private int retryTimes = 0;

    // cycle retry times
    private int cycleRetryTimes = 0;

    // sleep 1 second before retrying by default
    private int retrySleepTime = 1000;

    // by default the request is closed if no connection is established within 5 seconds
    private int timeOut = 5000;

    // default set of accepted status codes
    private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();

    // accepted status codes
    private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;

    // headers, stored as a Map
    private Map<String, String> headers = new HashMap<String, String>();

    // use gzip compression by default
    private boolean useGzip = true;

    // whether cookie management should be disabled
    private boolean disableCookieManagement = false;

    static {
        DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
    }

    /**
     * new a Site
     *
     * @return new site
     */
    public static Site me() {
        return new Site();
    }

    /**
     * Add a cookie with domain {@link #getDomain()}
     *
     * @param name name
     * @param value value
     * @return this
     */
    public Site addCookie(String name, String value) {
        defaultCookies.put(name, value);
        return this;
    }

    /**
     * Add a cookie with specific domain.
     *
     * @param domain domain
     * @param name name
     * @param value value
     * @return this
     */
    public Site addCookie(String domain, String name, String value) {
        if (!cookies.containsKey(domain)) {
            cookies.put(domain, new HashMap<String, String>());
        }
        cookies.get(domain).put(name, value);
        return this;
    }

    /**
     * set user agent
     *
     * @param userAgent userAgent
     * @return this
     */
    public Site setUserAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    /**
     * get cookies
     *
     * @return get cookies
     */
    public Map<String, String> getCookies() {
        return defaultCookies;
    }

    /**
     * get cookies of all domains
     *
     * @return get cookies
     */
    public Map<String, Map<String, String>> getAllCookies() {
        return cookies;
    }

    /**
     * get user agent
     *
     * @return user agent
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * get domain
     *
     * @return get domain
     */
    public String getDomain() {
        return domain;
    }

    /**
     * set the domain of site.
     *
     * @param domain domain
     * @return this
     */
    public Site setDomain(String domain) {
        this.domain = domain;
        return this;
    }

    /**
     * Set charset of page manually.
     * When charset is not set or set to null, it can be auto detected by Http header.
     *
     * @param charset charset
     * @return this
     */
    public Site setCharset(String charset) {
        this.charset = charset;
        return this;
    }

    /**
     * get charset set manually
     *
     * @return charset
     */
    public String getCharset() {
        return charset;
    }

    public int getTimeOut() {
        return timeOut;
    }

    /**
     * set timeout for downloader in ms
     *
     * @param timeOut timeOut
     * @return this
     */
    public Site setTimeOut(int timeOut) {
        this.timeOut = timeOut;
        return this;
    }

    /**
     * Set acceptStatCode.
     * When status code of http response is in acceptStatCodes, it will be processed.
     * {200} by default.
     * It is not necessarily to be set.
     *
     * @param acceptStatCode acceptStatCode
     * @return this
     */
    // accepted status codes, e.g. keep only 200 and drop everything else
    public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
        this.acceptStatCode = acceptStatCode;
        return this;
    }

    /**
     * get acceptStatCode
     *
     * @return acceptStatCode
     */
    public Set<Integer> getAcceptStatCode() {
        return acceptStatCode;
    }

    /**
     * Set the interval between the processing of two pages.
     * Time unit is micro seconds.
     *
     * @param sleepTime sleepTime
     * @return this
     */
    // note: despite the Javadoc, the unit is actually milliseconds (the 5000 default means 5 seconds)
    public Site setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Get the interval between the processing of two pages.
     * Time unit is micro seconds.
     *
     * @return the interval between the processing of two pages
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * Get retry times immediately when download fail, 0 by default.
     *
     * @return retry times when download fail
     */
    public int getRetryTimes() {
        return retryTimes;
    }

    public Map<String, String> getHeaders() {
        return headers;
    }

    /**
     * Put an Http header for downloader.
     * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
     *
     * @param key key of http header, there are some keys constant in {@link HttpConstant.Header}
     * @param value value of header
     * @return this
     */
    public Site addHeader(String key, String value) {
        headers.put(key, value);
        return this;
    }

    /**
     * Set retry times when download fail, 0 by default.
     *
     * @param retryTimes retryTimes
     * @return this
     */
    public Site setRetryTimes(int retryTimes) {
        this.retryTimes = retryTimes;
        return this;
    }

    /**
     * When cycleRetryTimes is more than 0, it will add back to scheduler and try download again.
     *
     * @return retry times when download fail
     */
    public int getCycleRetryTimes() {
        return cycleRetryTimes;
    }

    /**
     * Set cycleRetryTimes times when download fail, 0 by default.
     *
     * @param cycleRetryTimes cycleRetryTimes
     * @return this
     */
    public Site setCycleRetryTimes(int cycleRetryTimes) {
        this.cycleRetryTimes = cycleRetryTimes;
        return this;
    }

    public boolean isUseGzip() {
        return useGzip;
    }

    public int getRetrySleepTime() {
        return retrySleepTime;
    }

    /**
     * Set retry sleep times when download fail, 1000 by default.
     *
     * @param retrySleepTime retrySleepTime
     * @return this
     */
    public Site setRetrySleepTime(int retrySleepTime) {
        this.retrySleepTime = retrySleepTime;
        return this;
    }

    /**
     * Whether use gzip.
     * Default is true, you can set it to false to disable gzip.
     *
     * @param useGzip useGzip
     * @return this
     */
    public Site setUseGzip(boolean useGzip) {
        this.useGzip = useGzip;
        return this;
    }

    public boolean isDisableCookieManagement() {
        return disableCookieManagement;
    }

    /**
     * Downloader is supposed to store response cookie.
     * Disable it to ignore all cookie fields and stay clean.
     * Warning: Set cookie will still NOT work if disableCookieManagement is true.
     *
     * @param disableCookieManagement disableCookieManagement
     * @return this
     */
    // the downloader provides cookie management; this switch turns it off
    public Site setDisableCookieManagement(boolean disableCookieManagement) {
        this.disableCookieManagement = disableCookieManagement;
        return this;
    }

    // wrap this Site as a Task; the domain is used as the task UUID
    public Task toTask() {
        return new Task() {
            @Override
            public String getUUID() {
                String uuid = Site.this.getDomain();
                if (uuid == null) {
                    uuid = UUID.randomUUID().toString();
                }
                return uuid;
            }

            @Override
            public Site getSite() {
                return Site.this;
            }
        };
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Site site = (Site) o;

        if (cycleRetryTimes != site.cycleRetryTimes) return false;
        if (retryTimes != site.retryTimes) return false;
        if (sleepTime != site.sleepTime) return false;
        if (timeOut != site.timeOut) return false;
        if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
            return false;
        if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
        if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
            return false;
        if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
        if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
        if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = domain != null ? domain.hashCode() : 0;
        result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
        result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
        result = 31 * result + (charset != null ? charset.hashCode() : 0);
        result = 31 * result + sleepTime;
        result = 31 * result + retryTimes;
        result = 31 * result + cycleRetryTimes;
        result = 31 * result + timeOut;
        result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
        result = 31 * result + (headers != null ? headers.hashCode() : 0);
        return result;
    }

    @Override
    public String toString() {
        return "Site{" +
                "domain='" + domain + '\'' +
                ", userAgent='" + userAgent + '\'' +
                ", cookies=" + defaultCookies +
                ", charset='" + charset + '\'' +
                ", sleepTime=" + sleepTime +
                ", retryTimes=" + retryTimes +
                ", cycleRetryTimes=" + cycleRetryTimes +
                ", timeOut=" + timeOut +
                ", acceptStatCode=" + acceptStatCode +
                ", headers=" + headers +
                '}';
    }
}

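acceptStatCode and the two boolean switches deserve a second look, because they decide what the downloader hands on to the rest of the framework. The sketch below assumes nothing beyond the listing above; the extra status codes 301 and 404 are arbitrary examples of codes you might choose to keep.

import java.util.HashSet;
import java.util.Set;

import us.codecraft.webmagic.Site;

public class AcceptStatusDemo {

    public static void main(String[] args) {
        // By default only 200 is accepted (DEFAULT_STATUS_CODE_SET in the listing above).
        // Here we also accept 301 and 404 so those responses are still treated as processable pages.
        Set<Integer> acceptCodes = new HashSet<Integer>();
        acceptCodes.add(200);
        acceptCodes.add(301);
        acceptCodes.add(404);

        Site site = Site.me()
                .setAcceptStatCode(acceptCodes)
                .setUseGzip(false)                    // ask the downloader not to use gzip compression
                .setDisableCookieManagement(true);    // ignore response cookies entirely

        System.out.println(site.getAcceptStatCode()); // the three codes above, in HashSet order
    }
}

Note that turning on disableCookieManagement also makes addCookie ineffective, as the Javadoc warning in the listing points out.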