使用webmagic爬取csdn用户个性签名

思路:

首先爬取一个用户的个人信息,然后根据该用户的好友关系去爬取好友信息,以此类推,爬取所有用户。

根据获取的用户id,访问blog主页获取个性签名。

package com.cuihs.mySpider;

import java.util.List;

import org.jsoup.select.Elements;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

/**
 * WebMagic page processor that walks CSDN user profiles and collects the
 * signature text shown on each user's blog home page.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>profile pages ({@code http://my.csdn.net/<id>}): the links found in the
 *       {@code mod_relations} block are queued, plus the blog home page of the
 *       profile owner; the profile page itself produces no result;</li>
 *   <li>blog pages ({@code http://blog.csdn.net...}): the signature text is
 *       extracted and stored as a result field.</li>
 * </ul>
 * Any other page is skipped so that no empty result reaches the pipeline.
 */
public class CsdnUserPageProcessor implements PageProcessor{

	/** Profile page URL pattern; dots are escaped so they only match a literal '.'. */
	private static final String PROFILE_URL_REGEX = "http://my\\.csdn\\.net/\\w+";

	/** Blog page URL pattern; dots are escaped so they only match a literal '.'. */
	private static final String BLOG_URL_REGEX = "http://blog\\.csdn\\.net";

	/** Prefix to which a user id is appended to build the blog home page URL. */
	private static final String BLOG_URL_PREFIX = "http://blog.csdn.net/";

	private final Site site = Site
			.me()
			.setRetryTimes(3)
			.setSleepTime(1000);

	/**
	 * Dispatches the downloaded page to the matching handler based on its URL.
	 *
	 * @param page the page handed over by the spider
	 */
	@Override
	public void process(Page page) {
		if(page.getUrl().regex(PROFILE_URL_REGEX).match()){
			processProfilePage(page);
		}else if(page.getUrl().regex(BLOG_URL_REGEX).match()){
			processBlogPage(page);
		}else{
			// Neither a profile nor a blog page: there is nothing to extract.
			page.setSkip(true);
		}
	}

	/**
	 * Queues the relation links and the owner's blog home page found on a profile page.
	 * The page is always skipped; if its layout is not the expected one, it simply
	 * contributes fewer (or no) follow-up requests instead of throwing.
	 */
	private void processProfilePage(Page page) {
		// Profile pages only feed the crawl frontier, they never yield a result.
		page.setSkip(true);

		Elements divs = page.getHtml().getDocument().getElementsByTag("div");
		if(divs.size() < 2){
			return;
		}
		// The original layout assumption: the second <div> of the document holds the
		// main sections, child #1 leading to the user link and child #2 to the relations.
		Elements mainElements = divs.get(1).children();
		if(mainElements.size() < 3){
			return;
		}

		Elements relationElements = mainElements.get(2).getElementsByTag("div");
		if(!relationElements.isEmpty()){
			String html = relationElements.get(0).html();
			List<String> relationLinks = new Html(html)
					.xpath("//div[@class='mod_relations']")
					.links()
					.all();
			page.addTargetRequests(relationLinks);
		}

		Elements skillElements = mainElements.get(1).getElementsByTag("div");
		if(skillElements.isEmpty()){
			return;
		}
		Elements anchors = skillElements.get(0).getElementsByTag("a");
		if(anchors.isEmpty()){
			return;
		}
		String userId = getLastSlantContent(anchors.get(0).attr("href"));
		// Without this guard a missing id would be concatenated as the text "null".
		if(userId != null && !userId.isEmpty()){
			page.addTargetRequest(new Request(BLOG_URL_PREFIX + userId));
		}
	}

	/**
	 * Extracts the signature from a blog page, trying the blog title block first and
	 * the {@code person-sign} block second. The page is skipped when neither has text.
	 */
	private void processBlogPage(Page page) {
		Object word = page.getHtml().xpath("//div[@id='blog_title']/h3/allText()");
		if(!hasText(word)){
			word = page.getHtml().xpath("//div[@class='person-sign']/span/allText()");
		}

		if(hasText(word)){
			// NOTE(review): the key looks like a typo for "座右铭" (motto); it is kept
			// unchanged because it is written to the pipeline output.
			page.putField("左右铭", word);
		}else{
			page.setSkip(true);
		}
	}

	/**
	 * Tells whether a selection result is a {@link PlainText} whose first value is a
	 * non-empty string.
	 */
	private static boolean hasText(Object selected) {
		if(!(selected instanceof PlainText)){
			return false;
		}
		List<String> values = ((PlainText) selected).all();
		if(values == null || values.isEmpty()){
			return false;
		}
		String first = values.get(0);
		return first != null && !first.isEmpty();
	}

	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Starts the crawl from one seed profile with 10 threads and writes the results
	 * through a {@link FilePipeline} rooted at {@code D:\webmagic\}.
	 */
	public static void main(String[] args){
		Spider.create(new CsdnUserPageProcessor())
			.addUrl("http://my.csdn.net/CHS007chs")
			.thread(10)
			.addPipeline(new FilePipeline("D:\\webmagic\\"))
			.run();
	}

	/**
	 * Returns the text that follows the last '/' of the given path.
	 *
	 * @param fullPath the path or URL to inspect; may be {@code null}
	 * @return the substring after the last '/' (empty when the path ends with '/'),
	 *         or {@code null} when {@code fullPath} is {@code null} or contains no '/'
	 */
	public static String getLastSlantContent(String fullPath){
		if(fullPath == null){
			return null;
		}
		int pos = fullPath.lastIndexOf("/");
		if(pos != -1){
			return fullPath.substring(pos + 1);
		}
		return null;
	}
}

 

最后欢迎大家访问我的个人网站:1024s

你可能感兴趣的:(java)