从头学习爬虫(十九)重构篇----WebMagic框架分析之page

这个系列的文章主要分析 WebMagic 框架的源码,不包含实战内容;如有实战方面的问题,欢迎讨论,也可以提供技术支持。


欢迎加群313557283(刚创建),小白互相学习~


Page

package us.codecraft.webmagic;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Object storing extracted result and urls to fetch.
* Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
* * @author [email protected]
* @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.processor.PageProcessor * @since 0.1.0 */ public class Page { //请求 private Request request; //结果 private ResultItems resultItems = new ResultItems(); private Html html; private Json json; //html.tostring() 响应内容 private String rawText; //选择器 请仔细阅读 Selectable 这个接口 提供页面解析方法 private Selectable url; //请求头map private Map> headers; //响应码 200 private int statusCode = HttpConstant.StatusCode.CODE_200; //是否下载成功 private boolean downloadSuccess = true; //响应流 private byte[] bytes; //目标请求 private List targetRequests = new ArrayList(); //编码 private String charset; public Page() { } //下载失败 public static Page fail(){ Page page = new Page(); page.setDownloadSuccess(false); return page; } //跳过这个请求 ,不去处理 public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; } /** * store extract results * * @param key key * @param field field */ //把所要信息装进resultItems public void putField(String key, Object field) { resultItems.put(key, field); } /** * get html content of page * * @return html */ //获得html对象,没有将响应内容和请求地址构造一个html public Html getHtml() { if (html == null) { html = new Html(rawText, request.getUrl()); } return html; } /** * get json content of page * * @return json * @since 0.5.0 */ //响应内容转json ,如果要使用请详细阅读此类 public Json getJson() { if (json == null) { json = new Json(rawText); } return json; } /** * @param html html * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. 
*/ public void setHtml(Html html) { this.html = html; } public List getTargetRequests() { return targetRequests; } /** * add urls to fetch * * @param requests requests */ public void addTargetRequests(List requests) { //string转request //请求地址格式化 for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } /** * add urls to fetch * * @param requests requests * @param priority priority */ //重载,加优先级 public void addTargetRequests(List requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s).setPriority(priority)); } } /** * add url to fetch * * @param requestString requestString */ //单个request string public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } /** * add requests to fetch * * @param request request */ //单个request request public void addTargetRequest(Request request) { targetRequests.add(request); } /** * get url of current page * * @return url of current page */ public Selectable getUrl() { return url; } public void setUrl(Selectable url) { this.url = url; } /** * get request of current page * * @return request */ public Request getRequest() { return request; } public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); } public ResultItems getResultItems() { return resultItems; } public int getStatusCode() { return statusCode; } public void setStatusCode(int statusCode) { this.statusCode = statusCode; } public String getRawText() { return rawText; } public Page setRawText(String rawText) { this.rawText = 
rawText; return this; } public Map> getHeaders() { return headers; } public void setHeaders(Map> headers) { this.headers = headers; } public boolean isDownloadSuccess() { return downloadSuccess; } public void setDownloadSuccess(boolean downloadSuccess) { this.downloadSuccess = downloadSuccess; } public byte[] getBytes() { return bytes; } public void setBytes(byte[] bytes) { this.bytes = bytes; } public String getCharset() { return charset; } public void setCharset(String charset) { this.charset = charset; } @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + ", html=" + html + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + ", downloadSuccess=" + downloadSuccess + ", targetRequests=" + targetRequests + ", charset='" + charset + '\'' + ", bytes=" + Arrays.toString(bytes) + '}'; } }

你可能感兴趣的:(网络爬虫)