用webmagic抓文章列表和详细页

WebMagic文档:http://webmagic.io


public class ForumPageprocess implements PageProcessor {
private Sitesite= Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
privatestatic String username="qq_14955245";
private String domian="http://blog.csdn.net/";//网站首页
private int size=0;//抓取到的文章数量

public Site getSite() {

return site ;

}
public void process(Pagepage) {
if(!page.getUrl().regex(""+domian+""+username+"/article/details/\\d+").match()) {
page.addTargetRequests(page.getHtml().xpath("//*[@id=\"article_toplist\"]").regex(""+username+"/article/details/\\d+").replace(""+username+"",domian+username+"").all());
page.addTargetRequests(page.getHtml().xpath("//*[@id=\"article_list\"]").regex(""+username+"/article/details/\\d+").replace(""+username+"",domian+username+"").all());
 
}

else {     page .putField("title:" , page .getHtml().xpath("//*[@id=\"article_details\"]/div[1]/h1/span/" ).replace("<[^>]*>" ,"" ));   page.putField( "原创:", page.getHtml().regex( "ico_type_(Original)",1).get());   page .putField("阅读次数:" , page .getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[2]/span[2]" ).regex("(\\d+)人阅读" , 1));   page .putField("评论次数:" , page .getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[2]/span[3]" ).regex("(\\d+)" , 1));   page.putField( "发布时间:", page.getHtml().css( "span.link_postdate", "text")); page .putField("标签:" , page .getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[1]/span" ).regex("]*>(.*?)" ,1).all(); page .putField("分类:" , page .getHtml().xpath("//*[@id=\"article_details\"]/div[3]/div[2]/label/span/text()" )); page .putField("文章内容:" , page .getHtml().xpath("//*[@id=\"article_content\"]" ).replace("<[^>]*>" ,"" )); System. out.println( "抓取第"+ size+ "文章");
 
}
 
  
public static void main(String[] args) {
		long startTime,endTime;
		System.out.println("爬虫开始请大家耐心等待");
		startTime =System.currentTimeMillis();
		System.out.println(startTime);
		 // 从用户博客首页开始抓,开启2个线程,启动爬虫
		Spider.create(new ForumPageprocess()).addPipeline(new ConsolePipeline()) .addUrl("http://blog.csdn.net/"+username+"").thread(2).run();
		endTime=System.currentTimeMillis();
		System.out.println("总共用时"+((endTime-startTime)/1000)+"秒");
		
	}
 
  
 
  
 
  
运行结果:

用webmagic抓文章列表和详细页_第1张图片



你可能感兴趣的:(爬虫框架)