webmagic首次demo

package com.tvs.webmgic;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * First WebMagic demo: starting from the opinion section of bjnews.com.cn,
 * follows the pagination and article links and prints the author and title
 * of each article page it reaches.
 */
public class MyWebmagic implements PageProcessor {

    /** Entry point of the crawl: the first listing page of the opinion section. */
    private static final String START_URL = "http://www.bjnews.com.cn/opinion/";

    /**
     * Listing pages: the section root, optionally followed by {@code ?page=N}.
     *
     * <p>The pattern is anchored on both ends because WebMagic's regex selector
     * searches for the pattern inside the URL instead of requiring a full match;
     * without anchors every article URL under {@code /opinion/} would also be
     * treated as a listing page. The optional part is a non-capturing group so
     * that the whole match (not a possibly-empty sub-group) is what gets selected.
     */
    private static final String LIST_URL_REGEX =
            "^http://www\\.bjnews\\.com\\.cn/opinion/(?:\\?page=\\d*)?$";

    /**
     * Article pages: {@code /opinion/2017/MM/DD/NNNNNN.html}. Only articles
     * published under the 2017 path are recognised. The end is deliberately left
     * unanchored so a trailing query string or fragment does not prevent a match.
     */
    private static final String ARTICLE_URL_REGEX =
            "^http://www\\.bjnews\\.com\\.cn/opinion/2017/\\d{2}/\\d{2}/\\d{6}\\.html";

    /** Number of worker threads handed to the spider. */
    private static final int THREAD_COUNT = 5;

    // Crawl configuration for the target site (encoding, crawl interval, retry
    // count, ...). Here: retry times set to 3 and sleep time set to 1000
    // (milliseconds, per WebMagic's Site documentation).
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Returns the site configuration used for every request of this crawl.
     *
     * @return the shared {@link Site} settings
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: queues further links and, for article
     * pages, prints the extracted author and title to standard output.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Pagination links are queued from whatever page carries a div#page block.
        page.addTargetRequests(page.getHtml().css("div#page").links().all());

        // Listing page: queue every link found inside the news blocks.
        if (page.getUrl().regex(LIST_URL_REGEX).match()) {
            page.addTargetRequests(page.getHtml().css("div.news").links().all());
        }

        // Article page: extract and print author and title.
        // Either value is printed as "null" when the XPath selects nothing.
        if (page.getUrl().regex(ARTICLE_URL_REGEX).match()) {
            String author = page.getHtml().xpath("//*[@id=\"author_baidu\"]//text()").toString();
            System.out.println(author);
            String title = page.getHtml().xpath("//*[@id=\"main\"]/div[1]/h1//text()").toString();
            System.out.println(title);
        }
    }

    /**
     * Starts the crawl at {@link #START_URL} and blocks until the spider finishes.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new MyWebmagic()).addUrl(START_URL).thread(THREAD_COUNT).run();
    }
}


你可能感兴趣的:(13k)