WebMagic 如何设置 cookie, 以及发送 post 请求

步骤一:先用 HttpClient 发起请求,获取目标页面的 cookie 以及其他参数
步骤二:利用获取到的参数构造 request 请求
步骤三:WebMagic 利用获取到的 cookie 以及构造好的 request 发送 POST 请求

/**
 * Crawls the video page information.
 *
 * <p>The constructor downloads the given watch page through the given proxy and stores
 * what a follow-up "show more" POST needs: the cookie string assembled from the
 * {@code Set-Cookie} response headers, the AJAX URL, and the XSRF session token.
 */
class VideoSpider {
    /** Offset the original implementation used to cut the video id out of the watch URL. */
    private static final int LEGACY_VIDEO_ID_OFFSET = 32;

    public String cookie;
    public String showMoreURL;
    public String session_token;
    public String client_url;

    /**
     * Downloads {@code url} and populates {@link #cookie}, {@link #showMoreURL} and
     * {@link #session_token}.
     *
     * @param url      watch page URL; also stored as {@link #client_url}
     * @param proxyStr proxy in the form {@code scheme://host:port}
     * @throws IllegalArgumentException if {@code proxyStr} is malformed or no video id can be found
     * @throws IllegalStateException    if the page could not be downloaded
     */
    public VideoSpider(String url, String proxyStr) {
        this.client_url = url;
        HttpHost proxy = parseProxy(proxyStr);
        Site site = Site.me().setRetryTimes(3).setHttpProxy(proxy).setSleepTime(100).setTimeOut(10 * 1000).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

        GPHttpClientDownloader downloader = new GPHttpClientDownloader();
        Request request = new Request(this.client_url);

        this.setCookie(request, site, downloader);
        this.setParameters(request, site, downloader);
    }


    /**
     * Requests the page and stores a cookie string made of {@code "hl=en; "} followed by the
     * first {@code name=value} segment of every {@code Set-Cookie} header, each terminated by
     * {@code ';'}.
     *
     * @throws IllegalStateException if the downloader returned no response
     */
    public void setCookie(Request request, Site site, GPHttpClientDownloader downloader) {
        // downloadForResponse returns null for rejected status codes and on errors.
        CloseableHttpResponse httpResponse = requireDownloaded(
                downloader.downloadForResponse(request, site.toTask()),
                "No response for " + request.getUrl() + "; cannot read Set-Cookie headers");

        StringBuilder cookieBuilder = new StringBuilder("hl=en; ");
        for (Header header : httpResponse.getHeaders("Set-Cookie")) {
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            // Only the leading "name=value" matters; attributes such as Path/Expires are dropped.
            String[] segments = value.split(";");
            if (segments.length == 0 || segments[0].trim().isEmpty()) {
                continue;
            }
            cookieBuilder.append(segments[0]).append(';');
        }
        this.cookie = cookieBuilder.toString();
    }


    /**
     * Downloads the page and derives {@link #showMoreURL} and {@link #session_token} from the
     * {@code 'COMMENTS_TOKEN'} and {@code 'XSRF_TOKEN'} values embedded in its HTML.
     *
     * @throws IllegalStateException    if the downloader returned no page
     * @throws IllegalArgumentException if no video id can be taken from {@link #client_url}
     */
    public void setParameters(Request request, Site site, GPHttpClientDownloader downloader) {
        // download returns null for rejected status codes and on I/O errors.
        Html contentHtml = requireDownloaded(
                downloader.download(request, site.toTask()),
                "No page for " + request.getUrl() + "; cannot extract comment tokens").getHtml();

        // NOTE(review): if a pattern does not match, toString() presumably yields null and the
        // literal text "null" ends up in the URL / token - confirm and decide whether to fail fast.
        String commentsToken = contentHtml.regex("'COMMENTS_TOKEN': \"(.*?)\"").toString();
        this.showMoreURL = "https://www.youtube.com/watch_fragments_ajax?v=" + extractVideoId(this.client_url)
                + "&tr=time&distiller=1&ctoken=" + commentsToken + "&frags=comments&spf=load";
        this.session_token = contentHtml.regex("'XSRF_TOKEN': \"(.*?)\"").toString();
    }


    public String getCookie() {
        return cookie;
    }


    public String getShowMoreURL() {
        return showMoreURL;
    }


    public String getSession_token() {
        return session_token;
    }


    public String getClient_url() {
        return client_url;
    }


    /** Turns {@code scheme://host:port} into an HttpHost, rejecting malformed input with a clear message. */
    private static HttpHost parseProxy(String proxyStr) {
        if (proxyStr == null) {
            throw new IllegalArgumentException("Proxy must not be null");
        }
        // "scheme://host:port" splits into { scheme, "//host", port }.
        String[] parts = proxyStr.split(":");
        if (parts.length < 3 || !parts[1].startsWith("//") || parts[1].length() <= 2) {
            throw new IllegalArgumentException("Proxy must look like scheme://host:port but was: " + proxyStr);
        }
        return new HttpHost(parts[1].substring(2), Integer.parseInt(parts[2]), parts[0]);
    }


    /** Returns {@code value}, or throws IllegalStateException with {@code message} when it is null. */
    private static <T> T requireDownloaded(T value, String message) {
        if (value == null) {
            throw new IllegalStateException(message);
        }
        return value;
    }


    /**
     * Returns the value of the {@code v} query parameter of a watch URL. When the URL has no
     * such parameter, falls back to the fixed offset the original code relied on.
     */
    private static String extractVideoId(String watchUrl) {
        int queryStart = watchUrl.indexOf('?');
        if (queryStart >= 0) {
            String query = watchUrl.substring(queryStart + 1);
            int fragmentStart = query.indexOf('#');
            if (fragmentStart >= 0) {
                query = query.substring(0, fragmentStart);
            }
            for (String param : query.split("&")) {
                if (param.startsWith("v=") && param.length() > 2) {
                    return param.substring(2);
                }
            }
        }
        if (watchUrl.length() > LEGACY_VIDEO_ID_OFFSET) {
            return watchUrl.substring(LEGACY_VIDEO_ID_OFFSET);
        }
        throw new IllegalArgumentException("Cannot find a video id in URL: " + watchUrl);
    }
}


/**
 * Crawls the "show more" information.
 *
 * <p>Holds a Site configured with the given proxy and a {@code Cookie} header, and prints
 * the result of applying a regex to the {@code body} field of each JSON page it processes.
 */
class ShowMoreSpider implements PageProcessor
{
    private Site site;

    /**
     * @param proxyStr proxy in the form {@code scheme://host:port}
     * @param cookie   value sent as the {@code Cookie} request header
     */
    public  ShowMoreSpider(String proxyStr, String cookie)
    {
        // "scheme://host:port" splits into { scheme, "//host", port }.
        String[] proxyParts = proxyStr.split(":");
        String proxyHost = proxyParts[1].substring(2);
        int proxyPort = Integer.parseInt(proxyParts[2]);
        String proxyScheme = proxyParts[0];
        HttpHost proxyEndpoint = new HttpHost(proxyHost, proxyPort, proxyScheme);

        this.site = Site.me()
                .setRetryTimes(3)
                .setHttpProxy(proxyEndpoint)
                .addHeader("Cookie", cookie)
                .setSleepTime(100)
                .setTimeOut(10 * 1000)
                .setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");
    }

    /** Prints what the regex extracts from the {@code body} field of the page's JSON. */
    @Override
    public void process(Page page)
    {
        String extracted = page.getJson().jsonPath("body").regex("u2022 (.*?)<").toString();
        System.out.println(extracted);
    }

    /** Returns the Site built in the constructor. */
    @Override
    public Site getSite()
    {
        return this.site;
    }
}

/**
 * Demo entry point: reads the cookie and tokens of a video page with {@link VideoSpider},
 * then POSTs them to the "show more" URL through a {@link ShowMoreSpider}.
 */
public class VideoCommentSpider {
    /** Proxy used for every request, in the form {@code scheme://host:port}; placeholder value. */
    public static String proxyString = "http://XXX.XX.XXX.XXX:XXXX";

    public static void main(String[] args)
    {
        // Fetch the video page's cookie, show-more URL, client_url and session_token.
        String url = "https://www.youtube.com/watch?v=Xo94zT93fAY";
        VideoSpider vs = new VideoSpider(url, proxyString);

        // Fetch the comments.
        PageProcessor spider = new ShowMoreSpider(proxyString, vs.getCookie());

        NameValuePair[] formFields = {
            new BasicNameValuePair("client_url", vs.getClient_url()),
            new BasicNameValuePair("session_token", vs.getSession_token())
        };

        // The form fields travel as a request extra under the key "nameValuePair".
        Map<String, Object> extras = new HashMap<String, Object>();
        extras.put("nameValuePair", formFields);

        Request request = new Request(vs.getShowMoreURL());
        request.setExtras(extras);
        request.setMethod(HttpConstant.Method.POST);

        Spider.create(spider).thread(5).addRequest(request).run();
    }
}
/**
 * Downloader that executes WebMagic requests with an HTTP client and, in addition to the
 * usual {@link #download}, can return the raw response via {@link #downloadForResponse}
 * so callers can inspect its headers.
 */
@ThreadSafe
public class GPHttpClientDownloader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Clients keyed by site domain; only accessed while holding this object's monitor. */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    /**
     * Returns the client cached for the site's domain, creating and caching it on first use.
     * A null site gets a client from the generator that is not cached.
     */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        // Lookup and insert share one lock: HashMap is not safe to read while another thread puts.
        synchronized (this) {
            CloseableHttpClient httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = httpClientGenerator.getClient(site);
                httpClients.put(domain, httpClient);
            }
            return httpClient;
        }
    }

    /**
     * Executes the request and converts the response into a Page.
     *
     * @param request request to execute; its STATUS_CODE extra is set to the response status (0 if none)
     * @param task    supplies the Site settings; may be null, in which case only status 200 is accepted
     * @return the page, the result of {@code addToCycleRetry} after an I/O error when the site
     *         enables cycle retries, or null when the status is not accepted or the download failed
     */
    @Override
    public Page download(Request request, Task task) {
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = Sets.newHashSet(200);
        }
        logger.info("downloading page {}", request.getUrl());
        CloseableHttpResponse httpResponse = null;
        int statusCode = 0;
        try {
            HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
            httpResponse = getHttpClient(site).execute(httpUriRequest);
            statusCode = httpResponse.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                Page page = handleResponse(request, charset, httpResponse, task);
                onSuccess(request);
                return page;
            } else {
                logger.warn("code error " + statusCode + "\t" + request.getUrl());
                return null;
            }
        } catch (IOException e) {
            logger.warn("download page " + request.getUrl() + " error", e);
            // site is null when no task was supplied; cycle retry only applies with a site.
            if (site != null && site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            onError(request);
            return null;
        } finally {
            request.putExtra(Request.STATUS_CODE, statusCode);
            try {
                if (httpResponse != null) {
                    //ensure the connection is released back to pool
                    EntityUtils.consume(httpResponse.getEntity());
                }
            } catch (IOException e) {
                logger.warn("close response fail", e);
            }
        }
    }

    /**
     * Executes the request like {@link #download} but returns the HTTP response itself.
     *
     * <p>The response body is read (and stored in {@link #testContent}) and the entity is
     * consumed before this method returns, so callers should only rely on the status line
     * and headers of the returned response.
     *
     * @return the response, or null when the status is not accepted or any exception occurred
     */
    public CloseableHttpResponse downloadForResponse(Request request, Task task) {
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = Sets.newHashSet(200);
        }
        logger.info("downloading page {}", request.getUrl());
        CloseableHttpResponse httpResponse = null;
        int statusCode = 0;
        try {
            HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
            httpResponse = getHttpClient(site).execute(httpUriRequest);
            statusCode = httpResponse.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                // Called for its side effects (reads the body, updates testContent); the Page is not needed here.
                handleResponse(request, charset, httpResponse, task);
                onSuccess(request);
                return httpResponse;
            } else {
                logger.warn("code error " + statusCode + "\t" + request.getUrl());
                return null;
            }
        } catch (IOException e) {
            logger.warn("download page " + request.getUrl() + " error", e);

            onError(request);
            return null;
        } catch (Exception e) {
            // Best-effort: any other failure is logged and reported to the caller as "no response".
            logger.warn("download page " + request.getUrl() + " error", e);
            return null;
        } finally {
            request.putExtra(Request.STATUS_CODE, statusCode);
            try {
                if (httpResponse != null) {
                    //ensure the connection is released back to pool
                    EntityUtils.consume(httpResponse.getEntity());
                }
            } catch (IOException e) {
                logger.warn("close response fail", e);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /** Returns whether {@code statusCode} is contained in {@code acceptStatCode}. */
    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
        return acceptStatCode.contains(statusCode);
    }

    /**
     * Builds the HTTP request: method and URI from {@code request}, the given headers, and,
     * when a site is supplied, its timeout and proxy (pool first, then the fixed proxy).
     *
     * @param site    may be null; timeouts and proxy are then left at the client's defaults
     * @param headers may be null
     */
    protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
        if (headers != null) {
            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.BEST_MATCH);
        if (site != null) {
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut());
            if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                HttpHost host = site.getHttpProxyFromPool();
                requestConfigBuilder.setProxy(host);
                request.putExtra(Request.PROXY, host);
            } else if (site.getHttpProxy() != null) {
                HttpHost host = site.getHttpProxy();
                requestConfigBuilder.setProxy(host);
                request.putExtra(Request.PROXY, host);
            }
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        return requestBuilder.build();
    }

    /**
     * Picks the request builder matching the request's method (GET when none is set). For
     * POST, the {@code NameValuePair[]} stored under the request extra "nameValuePair" is
     * added as parameters.
     *
     * @throws IllegalArgumentException for a method other than GET, POST, HEAD, PUT, DELETE or TRACE
     */
    protected RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            RequestBuilder requestBuilder = RequestBuilder.post();
            NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
            if (nameValuePair != null && nameValuePair.length > 0) {
                requestBuilder.addParameters(nameValuePair);
            }
            return requestBuilder;
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return RequestBuilder.put();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Body of the most recently handled response.
     * NOTE(review): a shared mutable static written on every response with no synchronization;
     * with several download threads its value is whichever response was handled last.
     */
    public static String testContent;

    /** Reads the response body and wraps it, the request URL and the status code in a Page. */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        String content = getContent(charset, httpResponse);

        testContent = content;

        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        return page;
    }

    /** Public pass-through to {@link #getContent}. */
    public String getConentForTest(String charset, HttpResponse httpResponse) throws IOException {
        return this.getContent(charset, httpResponse);
    }

    /**
     * Decodes the response body with {@code charset}; when it is null the charset is detected
     * via {@link #getHtmlCharset}, falling back to the platform default.
     *
     * @return the decoded body, or an empty string when the response carries no entity
     */
    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
        if (httpResponse.getEntity() == null) {
            // Responses without a body (e.g. to HEAD) have no entity to read.
            return "";
        }
        if (charset == null) {
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
            if (htmlCharset != null) {
                return new String(contentBytes, htmlCharset);
            } else {
                logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
                return new String(contentBytes);
            }
        } else {
            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
        }
    }

    /**
     * Detects the charset of a response: first from the Content-Type header, then from
     * {@code <meta>} tags in the body decoded with the platform default charset.
     *
     * @return the detected charset name, or null when neither source provides one
     */
    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
        String charset = null;
        // charset
        // 1. encoding in http header Content-Type (the entity or its content type may be absent)
        if (httpResponse.getEntity() != null && httpResponse.getEntity().getContentType() != null) {
            String value = httpResponse.getEntity().getContentType().getValue();
            if (value != null) {
                charset = UrlUtils.getCharset(value);
            }
        }
        if (StringUtils.isNotBlank(charset)) {
            logger.debug("Auto get charset: {}", charset);
            return charset;
        }
        // use default charset to decode first time
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset.name());
        // 2. charset in meta
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements links = document.select("meta");
            for (Element link : links) {
                // 2.1 html4.01: <meta content="text/html; charset=...">
                String metaContent = link.attr("content");
                String metaCharset = link.attr("charset");
                int charsetIndex = metaContent.indexOf("charset");
                if (charsetIndex != -1) {
                    // A "charset" token without "=value" yields no second element; skip it instead of failing.
                    String[] keyValue = metaContent.substring(charsetIndex).split("=");
                    if (keyValue.length > 1 && StringUtils.isNotBlank(keyValue[1])) {
                        charset = keyValue[1].trim();
                        break;
                    }
                }
                // 2.2 html5: <meta charset="...">
                if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: {}", charset);
        // 3. todo use tools as cpdetector for content decode
        return charset;
    }
}

你可能感兴趣的:(搜索引擎,WebMagic)