Java crawlers: learning webMagic

The webMagic crawler

    • Introduction to webMagic
    • Adding webMagic to a project
    • Configuring webMagic
    • The PageProcessor class
    • The Pipeline class
    • Saving the crawled data
    • References

Introduction to webMagic

WebMagic is a simple, flexible Java crawler framework. It provides a simple and flexible API, so a crawler takes only a small amount of code; on top of WebMagic you can quickly build an efficient, easy-to-maintain crawler.

Adding webMagic to a project

  1. Add the webMagic dependencies to pom.xml (see the snippet below).
  2. Or copy the webMagic source straight into your project (this is what I did, which is why the code later in this post imports cn.dofuntech.spider.webmagic.* instead of the upstream packages).
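
If you take the pom.xml route, the usual coordinates are us.codecraft:webmagic-core and us.codecraft:webmagic-extension. A minimal sketch (0.7.3 is one published version; check Maven Central for the current one):

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>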

Configuring webMagic

  1. Create a config.json file under the project's src/main/resources (the # comments below are annotations only; strip them from real JSON, which does not allow comments):
{
  "site": {
    # site domain
    "domain": "139.159.3.18",
    # request headers, mainly to make requests look like a browser
    "headers": {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
      "authorization": "Your own authorization here."
    },
    # if the target site requires login, set the cookie here
    "cookie": {
      "JSESSIONID":"FBCC0D50EC568B1A7A6EF7FD94C50079"
    },
    "retryTimes": 3,
    "sleepTime": 500
  },
  "base_dir": "/Users/zz/"
}

Then create a Configuration class that reads config.json (a sketch follows).
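
The Configuration class is not shown in the original post. Here is a minimal sketch of what it could look like, assuming Jackson for the JSON parsing and assuming getZzPath() simply returns base_dir (both are guesses about the author's class, not its actual code):

import java.io.IOException;
import java.io.InputStream;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import cn.dofuntech.spider.webmagic.Site;

public class Configuration {

    private JsonNode root;

    public Configuration() {
        // config.json sits on the classpath under src/main/resources
        try (InputStream in = getClass().getResourceAsStream("/config.json")) {
            root = new ObjectMapper().readTree(in);
        } catch (IOException e) {
            throw new IllegalStateException("cannot read config.json", e);
        }
    }

    // builds a Site from the "site" section of config.json
    public Site getSite() {
        JsonNode s = root.get("site");
        Site site = Site.me()
                .setDomain(s.get("domain").asText())
                .setRetryTimes(s.get("retryTimes").asInt())
                .setSleepTime(s.get("sleepTime").asInt());
        s.get("headers").fields()
                .forEachRemaining(h -> site.addHeader(h.getKey(), h.getValue().asText()));
        s.get("cookie").fields()
                .forEachRemaining(c -> site.addCookie(c.getKey(), c.getValue().asText()));
        return site;
    }

    // assumption: the base directory used by file-backed schedulers/pipelines
    public String getZzPath() {
        return root.get("base_dir").asText();
    }
}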
  2. Or skip config.json entirely and configure everything in code via the Site class. For example:

Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(2000)
            .setTimeOut(60000)
            .setCharset("utf-8")
            .addCookie("域","名称", "内容")
            .addCookie("域","名称", "内容");

The PageProcessor class
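
A PageProcessor holds the crawl logic: webMagic calls process(Page) for every downloaded page, where you queue new links with addTargetRequests and extract fields with putField, while getSite() hands the framework its Site configuration. The processor below crawls symptom detail pages: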

import java.util.List;

import cn.dofuntech.spider.collector.site99.Configuration;
import cn.dofuntech.spider.webmagic.Page;
import cn.dofuntech.spider.webmagic.Site;
import cn.dofuntech.spider.webmagic.Spider;
import cn.dofuntech.spider.webmagic.pipeline.Pipeline;
import cn.dofuntech.spider.webmagic.processor.PageProcessor;
import cn.dofuntech.spider.webmagic.scheduler.BloomFilterDuplicateRemover;
import cn.dofuntech.spider.webmagic.scheduler.FileCacheQueueScheduler;
import cn.dofuntech.spider.webmagic.selector.Selectable;

import com.hs2e.common.collect.ListUtils;
import com.hs2e.common.lang.StringUtils;

/**
 * Crawls symptom detail pages.
 */
public class ZzPageProcessor2 implements PageProcessor {

    // crawler settings loaded from config.json (see the Configuration class above)
    private Site               site            = new Configuration().getSite();

    // only URLs matching this pattern are treated as pinyin-indexed symptom list pages
    public static final String URL_LIST_PINYIN = "https://jbk.99.com.cn/zz/py/[A-Z]-[0-9]\\.html";
    
    public void process(Page page) {

        // pinyin-indexed symptom list page
        if (page.getUrl().regex(URL_LIST_PINYIN).match()) {
            // queue every symptom detail URL found on the list page
            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"part-cont3\"]/dl/dt").links().all());
        }
        // symptom detail page
        else {
            Selectable selectable = page.getHtml().xpath("//div[@id='d-top2']//li/font");

            List<Selectable> nodes = selectable.nodes();
//            System.out.println("symptom: " + nodes.get(0).$("font", "text"));
//            System.out.println("body part: " + nodes.get(1).$("font", "text"));

            List<String> a2 = ListUtils.newArrayList();
            List<String> a3 = ListUtils.newArrayList();
            nodes.get(2).$("font > a").nodes().forEach(a -> {
                a2.add(a.$("a", "text").get());
            });
            nodes.get(3).$("font > a").nodes().forEach(a -> {
                a3.add(a.$("a", "text").get());
            });

//            System.out.println("department: " + StringUtils.join(a2, " "));
//            System.out.println("disease: " + StringUtils.join(a3, " "));

            page.putField("name", nodes.get(0).$("font", "text").toString());
            page.putField("bw", nodes.get(1).$("font", "text").toString());
            page.putField("dept", StringUtils.join(a2, ",").trim());
            page.putField("disease", StringUtils.join(a3, ",").trim());
        }

    }

    public Site getSite() {
        return site;
    }
    
    @SuppressWarnings("resource")
    public void start(Pipeline pipeline,String url)
    {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        Spider.create(new ZzPageProcessor2())
            .setScheduler(new FileCacheQueueScheduler(pipelinePath)
                .setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
            .addUrl(url)
            .addPipeline(pipeline)
            .thread(300)
            .run();
    }

    /**
     * Standalone entry point: crawls the symptom list starting from the Z-2 page.
     */
    public static void main(String[] args) {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        Spider.create(new ZzPageProcessor2())
            .setScheduler(new FileCacheQueueScheduler(pipelinePath)
                .setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
            .addPipeline(new ZzPipeline())
            .addUrl("https://jbk.99.com.cn/zz/py/Z-2.html")
            .thread(200)
            .run();
    }
}
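
A note on the scheduler: FileCacheQueueScheduler persists the URL queue to disk under pipelinePath, so an interrupted crawl can resume where it left off, and BloomFilterDuplicateRemover deduplicates URLs in constant memory sized for about crawlSize entries (accepting a small false-positive rate, i.e. a few pages may be wrongly skipped).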

The Pipeline class

webMagic ships with many built-in pipelines, which cover most needs: ResultItemsCollectorPipeline collects every crawled ResultItems into an in-memory list, FilePipeline writes the crawled data (or URLs) to files, and so on. You can also define your own pipeline.

  1. Example: ResultItemsCollectorPipeline
package cn.dofuntech.spider.collector.site99.download;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.dofuntech.spider.webmagic.ResultItems;
import cn.dofuntech.spider.webmagic.Task;
import cn.dofuntech.spider.webmagic.pipeline.ResultItemsCollectorPipeline;

/**
 * Pipeline for symptom detail data.
 *
 * Copyright (C) 2019 dofuntech. All Rights Reserved.
 *
 * @author
 * @version 1.0
 * filename: ZzPipeline.java
 */
public class ZzPipeline extends ResultItemsCollectorPipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    private List<ResultItems> collector = new ArrayList<ResultItems>();

    @Override
    public void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    @Override
    public List<ResultItems> getCollected() {
        return collector;
    }
}
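
Note that this version keeps every ResultItems in memory, so it only suits crawls whose results fit in the heap and are read back afterwards via getCollected(); for larger crawls, write to disk instead, as in the next example.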
  2. Example: FilePipeline
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;

import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.dofuntech.spider.webmagic.ResultItems;
import cn.dofuntech.spider.webmagic.Task;
import cn.dofuntech.spider.webmagic.pipeline.FilePipeline;

public class ZzPipeline extends FilePipeline {

    private Logger      logger   = LoggerFactory.getLogger(getClass());

    static final String URL      = "url";
    static final String RESPONSE = "response";

    /**
     * Creates a ZzPipeline with the default path "/data/webporter/".
     */
    public ZzPipeline() {
        setPath("/data/webporter/");
    }

    public ZzPipeline(String path) {
        setPath(path);
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        // one file per page, named by the MD5 of its URL
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        try {
            PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")), "UTF-8"));
            Map<String, Object> results = resultItems.getAll();

            printWriter.println(results.get(URL));
            printWriter.println(results.get(RESPONSE));
            printWriter.close();
        }
        catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}

Saving the crawled data
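
Finally, wire everything together: run the processor with the collector-based ZzPipeline from example 1, read the results back, and persist them (BasicZz and baseZzService are the author's own entity and persistence service):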

ZzPipeline zzPipeline = new ZzPipeline();
new ZzPageProcessor2().start(zzPipeline, url);
List<ResultItems> resultItems = zzPipeline.getCollected();
List<BasicZz> list = new ArrayList<>();

if (ListUtils.isNotEmpty(resultItems)) {
    for (ResultItems r : resultItems) {
        try {
            BasicZz z = new BasicZz();
            z.setName(r.get("name").toString());
            z.setDept(r.get("dept").toString());
            z.setDisease(r.get("disease").toString());
            z.setBw(r.get("bw").toString());
            list.add(z);
        } catch (Exception e) {
            // a missing field throws here; skip the incomplete record
            continue;
        }
    }
    baseZzService.saveBatch(list);
}

And that's it: a simple crawler, end to end.

References

  1. WebMagic documentation (Chinese): http://webmagic.io/docs/zh/
  2. Using pipelines: https://blog.csdn.net/qq_36783371/article/details/79943211
  3. Writing a custom pipeline: https://www.jianshu.com/p/52785e3cf41e
