Java利用springboot+WebMagic爬取招聘网信息并保存到mysql案例

Java利用springboot+WebMagic爬取招聘网信息并保存到mysql中案例

WebMagic 简介:

webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic主要由Downloader(下载器)、PageProcessor(页面解析器)、Scheduler(调度器)和Pipeline(管道)四部分组成。

该案例的工程目录结构:

Java利用springboot+WebMagic爬取招聘网信息并保存到mysql案例_第1张图片
其中MyTask实现PageProcessor接口爬取并解析内容,MyPipeLine实现Pipeline接口自定义管道将解析好的内容存入mysql数据库中

环境搭建:

pom.xml 中的核心依赖

		<!--SpringMVC-->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!--SpringData Jpa-->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!--MySQL连接包-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!--WebMagic核心包-->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!--WebMagic扩展-->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!--WebMagic对布隆过滤器的支持-->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>

获取并解析内容代码:

@Component
public class MyTask implements PageProcessor {
    // Seed URL: first page of 51job's Java search results.
    // FIX: the original article contained "°reefrom=99" — the query parameter
    // "&degreefrom=99" mangled by HTML-entity rendering ("&deg" -> "°").
    private String url = "https://search.51job.com/list/050000,000000,0100%252C2529,00,9,99,java,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
    @Autowired
    private MyPipeLine myPipeLine;

    /**
     * WebMagic callback: decides whether the fetched page is a list page or a
     * detail page and dispatches accordingly.
     */
    @Override
    public void process(Page page) {
        // On a list page every job row is a div.el node under div.dw_table;
        // a detail page has none of these nodes.
        List<Selectable> list = page.getHtml().css("div.dw_table div.el").nodes();
        if (list.isEmpty()) {
            // Detail page: extract the job record directly.
            this.saveJobInfo(page);
        } else {
            // List page: queue every detail-page link for crawling.
            for (Selectable selectable : list) {
                String jobInfoUrl = selectable.links().toString();
                if (StringUtils.isNotBlank(jobInfoUrl)) {
                    page.addTargetRequest(jobInfoUrl);
                }
            }
            // The "next page" button is the second li.bk node. On the last page
            // that node may be missing, so guard against IndexOutOfBoundsException
            // (the original code called .get(1) unconditionally).
            List<Selectable> pagingNodes = page.getHtml().css("div.p_in li.bk").nodes();
            if (pagingNodes.size() > 1) {
                String buttonUrl = pagingNodes.get(1).links().get();
                if (StringUtils.isNotBlank(buttonUrl)) {
                    page.addTargetRequest(buttonUrl);
                }
            }
        }
    }

    /**
     * Extracts one job posting from a detail page and publishes it under the
     * "jobInfo" key, where MyPipeLine picks it up and persists it.
     */
    private void saveJobInfo(Page page) {
        JobInfo jobInfo = new JobInfo();
        Html html = page.getHtml();
        // Publication time sits at the end of the p.msg text, suffixed with "发布".
        String s = html.css("div.cn p.msg", "text").regex(".*发布").toString();
        if (s != null) {
            // FIX: the original indexed the trimmed string with the UNTRIMMED
            // length (s.trim().substring(s.length()-8, ...)), which goes out of
            // bounds whenever trim() removes characters. Use the trimmed length.
            String msg = s.trim();
            if (msg.length() >= 8) {
                jobInfo.setTime(msg.substring(msg.length() - 8, msg.length() - 3));
            }
        }
        jobInfo.setUrl(page.getUrl().toString());
        jobInfo.setCompanyName(Jsoup.parse(html.css("div.cn p.cname a[title]").toString()).text());
        jobInfo.setCompanyInfo(Jsoup.parse(html.css("div.tmsg").toString()).text());
        String addr = Jsoup.parse(html.css("div.tBorderTop_box p.fp").nodes().get(1).toString()).text();
        jobInfo.setCompanyAddr(addr.substring(addr.indexOf(":") + 1));
        jobInfo.setJobAddr(addr.substring(addr.indexOf(":") + 1));
        jobInfo.setJobInfo(Jsoup.parse(html.css("div.job_msg").toString()).text());
        jobInfo.setJobName(Jsoup.parse(html.css("h1").toString()).text());
        String salary = html.css("div.cn strong", "text").toString();
        // NOTE(review): MathSalary.getSalary's return order is not visible from
        // this file. If it returns [min, max] (as in similar tutorials), the two
        // assignments below are swapped — confirm against MathSalary.
        jobInfo.setSalaryMax(MathSalary.getSalary(salary)[0]);
        jobInfo.setSalaryMin(MathSalary.getSalary(salary)[1]);
        page.putField("jobInfo", jobInfo);
    }

    // Crawler settings: 51job serves GBK-encoded pages; time out after 10s,
    // retry failed fetches up to 3 times with a 3s pause between attempts.
    private Site site = Site.me()
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetrySleepTime(3000)
            .setRetryTimes(3);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: first run 3s after startup, then every 100s, so
     * the data is refreshed periodically. The fresh (non-Spring) MyTask passed
     * to Spider.create is safe because process(Page) never touches the
     * autowired myPipeLine — the pipeline is supplied from this managed bean.
     */
    @Scheduled(initialDelay = 3000, fixedDelay = 100 * 1000)
    public void process() {
        Spider.create(new MyTask())
                .addUrl(url)
                // 10 crawler threads
                .thread(10)
                // Bloom-filter de-duplication (capacity 1,000,000 URLs)
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(1000000)))
                // Custom pipeline persists each extracted JobInfo into MySQL
                .addPipeline(myPipeLine)
                .run();
    }
}

自定义PipeLine如下:

@Component
public class MyPipeLine implements Pipeline {
    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Receives the extraction results of one crawled page and hands the
     * "jobInfo" entry, when present, to the service layer for persistence.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo extracted = resultItems.get("jobInfo");
        if (extracted == null) {
            // Nothing was extracted for this page (e.g. a list page) — skip.
            return;
        }
        jobInfoService.save(extracted);
    }
}

Service 的代码:

@Service
public class JobInfoService {
    @Autowired
    private JobInfoDao jobInfoDao;

    /**
     * Persists a crawled job record unless an identical one already exists.
     * A posting is considered a duplicate when a row with the same url and
     * publication time is already stored.
     */
    public void save(JobInfo jobInfo) {
        // Probe entity carrying only the two identifying fields for the
        // query-by-example lookup below.
        JobInfo probe = new JobInfo();
        probe.setUrl(jobInfo.getUrl());
        probe.setTime(jobInfo.getTime());
        // isEmpty() instead of size()==0; insert only when no match was found.
        if (findJobInfo(probe).isEmpty()) {
            jobInfoDao.saveAndFlush(jobInfo);
        }
    }

    /**
     * Query-by-example lookup: every non-null property of the given entity is
     * matched exactly against the table.
     */
    public List<JobInfo> findJobInfo(JobInfo jobInfo) {
        return jobInfoDao.findAll(Example.of(jobInfo));
    }
}

运行结果:

Java利用springboot+WebMagic爬取招聘网信息并保存到mysql案例_第2张图片

你可能感兴趣的:(Java后端)