抽取百度收录链接(二)—爬取链接获取真实链接

接上篇《抽取百度收录链接(一)》

现在我们的大概思路有了,剩下的就是如何实现的问题了。这里我推荐一个java爬取页面的好工具。[weblink url="https://github.com/code4craft/webmagic"]webmagic[/weblink]
webmagic webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。
下面是利用这个工具爬取页面的代码:
package com.wbdb.action.baidu;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.http.HttpHost;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;

import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.MultiPagePipeline;

/**
 * Crawls Baidu search results for "site:www.xxku.net" across every result page,
 * extracts the Baidu redirect links (http://www.baidu.com/link?url=...), then
 * resolves each redirect to its real target URL and collects the ones that
 * respond with HTTP 404.
 *
 * @author www.xxku.net<br>
 */
@TargetUrl(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8")
public class Search implements MultiPageModel {

	// Baidu redirect links harvested from one result page.
	@ExtractBy(value = "href=\"(http://www\\.baidu\\.com/link\\?url=.*?)\"", type = ExtractBy.Type.Regex)
	private List<String> baiduUrl;

	// Key shared by all pages of the same search, so webmagic can combine them.
	@ExtractBy(value = "//p[@id='page']/strong/span[@class='pc']", type = ExtractBy.Type.XPath)
	private String pageKey;

	// Current page number from the pager; absent (null) on the first page.
	@ExtractBy(value = "<span class=\"current\">(\\d+)</span>", type = ExtractBy.Type.Regex)
	private String page;

	// URLs of the sibling result pages, discovered from the pager links.
	@ComboExtract(value = {
			@ExtractBy("//p[@id='page']/a"),
			@ExtractBy(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8", type = ExtractBy.Type.Regex) }, multi = true, notNull = false)
	private List<String> otherPage;

	@Override
	public String getPageKey() {
		return pageKey;
	}

	@Override
	public Collection<String> getOtherPages() {
		return otherPage;
	}

	@Override
	public String getPage() {
		// The first result page carries no "current" marker, so treat null as page 1.
		if (page == null) {
			return "1";
		}
		return page;
	}

	/**
	 * Merges the links extracted from another page of the same search into this
	 * model.
	 *
	 * <p>BUG FIX: the original implementation created a fresh empty {@code Search}
	 * and returned it, silently discarding every merged link. We now merge into
	 * this instance and return it, guarding against null link lists.
	 */
	@Override
	public MultiPageModel combine(MultiPageModel multiPageModel) {
		Search other = (Search) multiPageModel;
		if (other.baiduUrl != null) {
			if (this.baiduUrl == null) {
				this.baiduUrl = new ArrayList<String>();
			}
			this.baiduUrl.addAll(other.baiduUrl);
		}
		return this;
	}

	public List<String> getBaiduUrl() {
		return baiduUrl;
	}

	public void setBaiduUrl(List<String> baiduUrl) {
		this.baiduUrl = baiduUrl;
	}

	@Override
	public String toString() {
		return "Search [baiduUrl=" + baiduUrl + ", pageKey=" + pageKey + ", page=" + page + ", otherPage=" + otherPage
				+ "]";
	}

	public static void main(String[] args) throws IOException {
		OOSpider o = OOSpider.create(
				Site.me().addStartUrl("http://www.baidu.com/s?wd=site%3Awww.xxku.net&pn=0&ie=utf-8"), Search.class);
		o.addPipeline(new MultiPagePipeline());
		o.addPipeline(new SearchPipeline());
		o.run();
		List<String> baiduUrlList = SearchPipeline.getBaiduUrlList();
		Search s = new Search();
		ArrayList<String> realUrl404 = new ArrayList<String>();
		// Resolve every redirect link and keep the ones whose target is a 404.
		for (int i = 0; i < baiduUrlList.size(); i++) {
			String url404 = s.getRealUrl(baiduUrlList.get(i));
			if (url404 != null) {
				realUrl404.add(url404);
			}
		}
		// Assemble the sitemap XML.
		// dom4j would make this easy; here the links are simply printed and fed
		// to an online sitemap generator instead.
	}

	/**
	 * Follows a Baidu redirect link and, if the final target answers with
	 * HTTP 404, returns the resolved real URL; otherwise returns {@code null}.
	 *
	 * <p>BUG FIX: the original leaked the {@link CloseableHttpClient} (only the
	 * response was closed); both are now managed by try-with-resources.
	 *
	 * @param url a http://www.baidu.com/link?url=... redirect link
	 * @return the real URL when it 404s, {@code null} otherwise
	 * @throws IOException if opening the initial connection fails
	 */
	private String getRealUrl(String url) throws IOException {
		HttpGet httpget = new HttpGet(url);
		HttpContext localContext = new BasicHttpContext();
		try (CloseableHttpClient httpclient = HttpClients.createDefault();
				CloseableHttpResponse response = httpclient.execute(httpget, localContext)) {
			int status = response.getStatusLine().getStatusCode();
			if (status == 404) {
				// After redirects, the execution context holds the final host and
				// request; together they form the resolved (real) URL.
				HttpHost target = (HttpHost) localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
				HttpUriRequest req = (HttpUriRequest) localContext.getAttribute(ExecutionContext.HTTP_REQUEST);
				return target.toString() + req.getURI();
			}
		} catch (Exception e) {
			// Best-effort resolution: log and treat unresolvable links as non-404.
			e.printStackTrace();
		}
		return null;
	}

}

你可能感兴趣的:(java,httpclient,百度蜘蛛)