使用webmagic爬取搜狗上公众账号发布的文章信息

/**
* 爬取文章类
*/
public class ArticleCrawler {

Static ApplicationContext context = SpringFactory.getApplicationContext();
Static ArticleDao articleDao=(ArticleDao)context.getBean("articleDao");
Static AccountCrawlerDao accountCrawlerDao = (AccountCrawlerDao)context.getBean("accountCrawlerDao");

/**
 * 爬取文章信息
 * @param pageUrl
 * @return
 * @throws ParseException
 */
public ArrayList<ArticleBo> getPublicArticles(String pageUrl){
    ArrayList<ArticleBo> boList = new ArrayList<>();
    Spider spider = Spider.create(new ArticleCrawlerProcessor())
            .thread(3).addPipeline(new ResultItemsCollectorPipeline());

    ResultItems resultItems = spider.get(pageUrl);
    spider.close();

    boList=resultItems.get("articleList");
    return boList;
}


/**
 * 爬取+入库流程
 */
public void run(String openId){
    // 得到公众号发布文章信息
    ArrayList<ArticleBo> articleList = new ArrayList<ArticleBo>();

    String Url = "http://weixin.sogou.com/gzhjs?openid="+openId;

    // 爬取文章
    articleList = getPublicArticles(Url);

    if(null!=articleList&&articleList.size()>0){
        // 文章入库
        for(ArticleBo bo:articleList){
            try {
                articleDao.addArticle(bo);
            } catch (Exception e){
                // 入库失败
            }
        }
    } 
}

public static void main(String args[]){
    ArticleCrawler crawler = new ArticleCrawler();

    // 得到需要爬取发布文章的公众账号
    List<AccountCrawlerBo> boList = accountCrawlerDao.getCrawlerList();
    if (null!=boList&&boList.size()>0) {
        for (AccountCrawlerBo bo:boList) {
            crawler.run(bo.getPublicOpenId());
        }
    }
}

}

/**
* 爬虫进程类
*/
/**
 * WebMagic {@link PageProcessor} that parses the Sogou "gzhjs" XML feed.
 * Each {@code <item>} element describes one published article; the parsed
 * list is exposed to the pipeline under the "articleList" field.
 */
public class ArticleCrawlerProcessor implements PageProcessor {

// Retry each request up to 3 times; sleep 60s between requests to avoid bans.
private Site site = Site.me().setRetryTimes(3).setSleepTime(60000);

@Override
public void process(Page page) {

    // One <item> node per article in the feed.
    List<String> resultList = page.getHtml().xpath("//item").all();

    List<ArticleBo> articleList = new ArrayList<>();
    if (resultList != null) {
        // Parse title, biz, mid, idx, sn, openid, abstract, create time, etc.
        for (String item : resultList) {

            Html html = new Html(item);

            String title = StringTool.getMiddleStr("<![CDATA[","]]>",html.xpath("title/text()").get());
            String url = html.xpath("url/text()").get();
            // A malformed item without a URL cannot yield biz/mid/idx/sn; skip it.
            if (url == null) {
                continue;
            }
            String biz = StringTool.getMiddleStr("__biz=","&mid=",url);
            String mid = StringTool.getMiddleStr("&mid=","&idx=",url);
            String sn = StringTool.getMiddleStr("&sn=","&3rd=",url);

            int idx;
            try {
                idx = Integer.parseInt(StringTool.getMiddleStr("&idx=", "&sn=", url));
            } catch (NumberFormatException e) {
                // Original code let one bad URL abort the entire page with an
                // uncaught NumberFormatException; skip the bad item instead.
                continue;
            }

            String publicOpenId = StringTool.getMiddleStr(" ","<",html.xpath("openid/text()").get());
            String articleAbstract = StringTool.getMiddleStr(" ","<", html.xpath("content168/text()").get());
            String articleAbstractPicture = StringTool.getMiddleStr(" ","<", html.xpath("imglink/text()").get());
            String articleCreatetime = StringTool.getMiddleStr(" ", "<", html.xpath("lastmodified/text()").get());
            // Convert the raw timestamp (presumably a Unix epoch value, per
            // the helper's name — TODO confirm) into a formatted time string.
            articleCreatetime = StringTool.getTimeFromUnix(articleCreatetime);

            ArticleBo bo = new ArticleBo();

            bo.setArticleTitle(title);
            bo.setArticleUrl(url);
            bo.setArticleBiz(biz);
            bo.setArticleMid(mid);
            bo.setArticleIdx(idx);
            bo.setArticleSn(sn);
            bo.setPublicOpenId(publicOpenId);
            bo.setArticleAbstract(articleAbstract);
            bo.setArticleAbstractPicture(articleAbstractPicture);
            bo.setArticleCreatetime(articleCreatetime);

            articleList.add(bo);
        }
    }

    page.putField("articleList", articleList);
}

@Override
public Site getSite() {
    return site;
}

}

版权声明:本文为博主原创文章,未经博主允许不得转载。

你可能感兴趣的:(webmagic)