本系列文章主要分析 webmagic 框架,没有实战内容;如有实战问题可以讨论,也可以提供技术支持。
欢迎加群313557283(刚创建),小白互相学习~
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Object storing extracted result and urls to fetch.
* Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
*
* @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Page {
//请求
private Request request;
//结果
private ResultItems resultItems = new ResultItems();
private Html html;
private Json json;
//html.tostring() 响应内容
private String rawText;
//选择器 请仔细阅读 Selectable 这个接口 提供页面解析方法
private Selectable url;
//请求头map
private Map> headers;
//响应码 200
private int statusCode = HttpConstant.StatusCode.CODE_200;
//是否下载成功
private boolean downloadSuccess = true;
//响应流
private byte[] bytes;
//目标请求
private List targetRequests = new ArrayList();
//编码
private String charset;
public Page() {
}
//下载失败
public static Page fail(){
Page page = new Page();
page.setDownloadSuccess(false);
return page;
}
//跳过这个请求 ,不去处理
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
}
/**
* store extract results
*
* @param key key
* @param field field
*/
//把所要信息装进resultItems
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
* get html content of page
*
* @return html
*/
//获得html对象,没有将响应内容和请求地址构造一个html
public Html getHtml() {
if (html == null) {
html = new Html(rawText, request.getUrl());
}
return html;
}
/**
* get json content of page
*
* @return json
* @since 0.5.0
*/
//响应内容转json ,如果要使用请详细阅读此类
public Json getJson() {
if (json == null) {
json = new Json(rawText);
}
return json;
}
/**
* @param html html
* @deprecated since 0.4.0
* The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
public void setHtml(Html html) {
this.html = html;
}
public List getTargetRequests() {
return targetRequests;
}
/**
* add urls to fetch
*
* @param requests requests
*/
public void addTargetRequests(List requests) {
//string转request
//请求地址格式化
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
}
/**
* add urls to fetch
*
* @param requests requests
* @param priority priority
*/
//重载,加优先级
public void addTargetRequests(List requests, long priority) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s).setPriority(priority));
}
}
/**
* add url to fetch
*
* @param requestString requestString
*/
//单个request string
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString));
}
/**
* add requests to fetch
*
* @param request request
*/
//单个request request
public void addTargetRequest(Request request) {
targetRequests.add(request);
}
/**
* get url of current page
*
* @return url of current page
*/
public Selectable getUrl() {
return url;
}
public void setUrl(Selectable url) {
this.url = url;
}
/**
* get request of current page
*
* @return request
*/
public Request getRequest() {
return request;
}
public void setRequest(Request request) {
this.request = request;
this.resultItems.setRequest(request);
}
public ResultItems getResultItems() {
return resultItems;
}
public int getStatusCode() {
return statusCode;
}
public void setStatusCode(int statusCode) {
this.statusCode = statusCode;
}
public String getRawText() {
return rawText;
}
public Page setRawText(String rawText) {
this.rawText = rawText;
return this;
}
public Map> getHeaders() {
return headers;
}
public void setHeaders(Map> headers) {
this.headers = headers;
}
public boolean isDownloadSuccess() {
return downloadSuccess;
}
public void setDownloadSuccess(boolean downloadSuccess) {
this.downloadSuccess = downloadSuccess;
}
public byte[] getBytes() {
return bytes;
}
public void setBytes(byte[] bytes) {
this.bytes = bytes;
}
public String getCharset() {
return charset;
}
public void setCharset(String charset) {
this.charset = charset;
}
@Override
public String toString() {
return "Page{" +
"request=" + request +
", resultItems=" + resultItems +
", html=" + html +
", json=" + json +
", rawText='" + rawText + '\'' +
", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode +
", downloadSuccess=" + downloadSuccess +
", targetRequests=" + targetRequests +
", charset='" + charset + '\'' +
", bytes=" + Arrays.toString(bytes) +
'}';
}
}