抽取百度收录链接(二)—爬取链接获取真实链接

2019独角兽企业重金招聘Python工程师标准>>> hot3.png

接上篇《抽取百度收录链接(一)》

现在我们的大概思路有了,就剩实现问题了。这里我推荐一个 Java 爬取页面的好工具:[webmagic](https://github.com/code4craft/webmagic)
webmagic webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。
下面是利用这个工具爬取页面的代码:
package com.wbdb.action.baidu;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.http.HttpHost;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;

import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.MultiPagePipeline;

/**
 * @author www.xxku.net
*/ @TargetUrl(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8") public class Search implements MultiPageModel { @ExtractBy(value = "href=\"(http://www\\.baidu\\.com/link\\?url=.*?)\"", type = ExtractBy.Type.Regex) private List baiduUrl; @ExtractBy(value = "//p[@id='page']/strong/span[@class='pc']", type = ExtractBy.Type.XPath) private String pageKey; @ExtractBy(value = "(\\d+)", type = ExtractBy.Type.Regex) private String page; @ComboExtract(value = { @ExtractBy("//p[@id='page']/a"), @ExtractBy(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8", type = ExtractBy.Type.Regex) }, multi = true, notNull = false) private List otherPage; @Override public String getPageKey() { return pageKey; } @Override public Collection getOtherPages() { return otherPage; } @Override public String getPage() { if (page == null) { return "1"; } return page; } @Override public MultiPageModel combine(MultiPageModel multiPageModel) { Search s = new Search(); Search pagedModel1 = (Search) multiPageModel; this.baiduUrl.addAll(pagedModel1.baiduUrl); return s; } public List getBaiduUrl() { return baiduUrl; } public void setBaiduUrl(List baiduUrl) { this.baiduUrl = baiduUrl; } @Override public String toString() { return "Search [baiduUrl=" + baiduUrl + ", pageKey=" + pageKey + ", page=" + page + ", otherPage=" + otherPage + "]"; } public static void main(String[] args) throws IOException { OOSpider o = OOSpider.create( Site.me().addStartUrl("http://www.baidu.com/s?wd=site%3Awww.xxku.net&pn=0&ie=utf-8"), Search.class); o.addPipeline(new MultiPagePipeline()); o.addPipeline(new SearchPipeline()); o.run(); List baiduUrlList = SearchPipeline.getBaiduUrlList(); Search s = new Search(); ArrayList realUrl404 = new ArrayList(); // 获取404 URl for (int i = 0; i < baiduUrlList.size(); i++) { String url404 = s.getRealUrl(baiduUrlList.get(i)); if (url404 != null) { realUrl404.add(url404); } } // 组件xml //这里可以用dom4j来做比较容易 //我直接把链接打出来。通过在线的sitemap生成器生成了 } /** * 
获取真实链接 * * @param url * @return * @throws IOException * @throws ClientProtocolException */ private String getRealUrl(String url) throws IOException { CloseableHttpClient httpclient = HttpClients.createDefault(); HttpGet httpget = new HttpGet(url); HttpContext localContext = new BasicHttpContext(); CloseableHttpResponse response = httpclient.execute(httpget, localContext); try { int status = response.getStatusLine().getStatusCode(); if (status == 404) { HttpHost target = (HttpHost) localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST); HttpUriRequest req = (HttpUriRequest) localContext.getAttribute(ExecutionContext.HTTP_REQUEST); return target.toString() + req.getURI(); } } catch (Exception e) { e.printStackTrace(); } finally { response.close(); } return null; } }

转载于:https://my.oschina.net/u/265943/blog/292892

你可能感兴趣的:(抽取百度收录链接(二)—爬取链接获取真实链接)