基于jsoup的java爬虫-爬取豆瓣小组租房信息

主要框架为springboot+mybatis+jsoup

jsoup官方文档:https://jsoup.org/ (中文翻译版:https://www.open-open.com/jsoup/)

爬取的豆瓣网址为:https://www.douban.com/group/tianhezufang/discussion?start=0

1.首先对网页F12进行节点分析

2.分析得到id为content下的table.olt中tbody下的tr就是我们要的每一行租房信息(与下方代码中的选择器一致)

3.遍历每一个tr节点,得到详细信息的url,进入到该url里获取到帖子的详细信息

4.防止爬取过于频繁,可以在每次爬取详情信息时sleep一会

代码如下:

@Override
@Transactional
public Result doubanzufang(DouBanGroup douBanGroup) {
    try {
        // Build the listing-page URL from the group's template, e.g.
        // https://www.douban.com/group/xxx/discussion?start={pageStart}
        int pageStart = 0;
        String url = douBanGroup.getUrl().replace("{pageStart}", String.valueOf(pageStart));

        // Browser-like headers: Douban rejects bare requests, so send a real
        // Cookie/UA. NOTE(review): the Cookie is session-bound and will expire;
        // consider externalizing it to configuration.
        Map<String, String> headers = new HashMap<>();
        headers.put("Cookie", "bid=yeV9XIiyQ8w; douban-fav-remind=1; viewed=\"25971624\"; gr_user_id=dcf5e665-be30-4060-8ef6-b46949651977; _vwo_uuid_v2=D4FE434D653C82EE6B424B779E66ED8A6|5a140e4d6500abf642237676628ea9a7; ll=\"118281\"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1585016248%2C%22https%3A%2F%2Fblog.csdn.net%2Fweixin_42211601%2Farticle%2Fdetails%2F95076934%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1992864926.1576061677.1584599272.1585016249.15; __utmc=30149280; __utmz=30149280.1585016249.15.15.utmcsr=blog.csdn.net|utmccn=(referral)|utmcmd=referral|utmcct=/weixin_42211601/article/details/95076934; ct=y; ap_v=0,6.0; __utmt=1; _pk_id.100001.8cb4=d343c8a5a42b363f.1576061677.12.1585017177.1584599339.; __utmb=30149280.119.5.1585017177273");
        headers.put("Host", "www.douban.com");
        headers.put("Referer", "https://www.douban.com/group/longgangzufang/discussion?start=25");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");

        Document document = Jsoup.connect(url)
                .headers(headers)
                .timeout(10000)
                .ignoreContentType(true)
                .get();

        // Listing rows live in #content > table.olt > tbody > tr.
        Element content = document.getElementById("content");
        Elements olt = content.select("table.olt");
        Elements tbody = olt.select("tbody");

        Elements tr = tbody.select("tr");

        Elements title = tr.select(".title");
        Elements time = tr.select(".time");

        Elements a = title.select("a");

        List<Map<String, Object>> list = new ArrayList<>();
        // Visit each topic row and scrape its detail page.
        for (int i = 0; i < a.size(); i++) {
            Map<String, Object> map = new HashMap<>();
            String hrefTemp = a.get(i).attr("abs:href");
            String titleTemp = a.get(i).text();
            String updateTime = time.get(i).text();

            map.put("href", hrefTemp);
            map.put("title", titleTemp);
            map.put("updateTime", updateTime);
            // Throttle between detail requests to avoid anti-crawler bans.
            Thread.sleep(2000);
            Document documentDetail = Jsoup.connect(hrefTemp)
                    .headers(headers)
                    .timeout(10000)
                    .ignoreContentType(true)
                    .get();

            Element content1 = documentDetail.getElementById("content");
            Elements topicdoc = content1.getElementsByClass("topic-doc");
            // Creation time sits in the green span of the topic header; skip
            // this topic (instead of aborting the whole batch) if the page
            // layout is unexpected.
            Element createTimeSpan = topicdoc.select("h3").select("span.color-green").first();
            Element linkreport = topicdoc.first() == null
                    ? null
                    : topicdoc.first().getElementById("link-report");
            if (createTimeSpan == null || linkreport == null) continue;
            String createTime = createTimeSpan.text();
            Elements topicrichtext = linkreport.getElementsByClass("topic-richtext");

            // Topic body: join paragraphs with "||" as a flat-text separator.
            Elements p = topicrichtext.select("p");
            StringBuilder text = new StringBuilder();
            for (Element element : p) {
                text.append(element.text()).append("||");
            }
            // Collect absolute image URLs embedded in the topic body.
            Elements img = topicrichtext.select("img");
            List<String> imageList = new ArrayList<>();
            for (Element element : img) {
                imageList.add(element.attr("abs:src"));
            }
            // Topic id is the path segment between the last two slashes, e.g.
            // https://www.douban.com/group/topic/123456/ -> "123456".
            int lastSlash = hrefTemp.lastIndexOf('/');
            String id = hrefTemp.substring(hrefTemp.lastIndexOf('/', lastSlash - 1) + 1, lastSlash);

            // Skip topics already persisted (dedup on the Douban topic id).
            int countByTopicId = douBanTopicMapper.countByTopicId(id);
            if (countByTopicId > 0) continue;
            DouBanTopic douBanTopic = new DouBanTopic();
            douBanTopic.setId(IdUtil.getId())
                    .setDoubanTopicId(id)
                    .setGroupId(douBanGroup.getId())
                    .setTitle(titleTemp)
                    .setText(text.toString())
                    .setCreateTime(createTime)
                    .setUpdateTime(updateTime)
                    .setUrl(hrefTemp);

            douBanTopicMapper.insertSelective(douBanTopic);

            for (String image : imageList) {
                DouBanTopicImg douBanTopicImg = new DouBanTopicImg();
                douBanTopicImg.setId(IdUtil.getId())
                        .setTopicId(douBanTopic.getId())
                        .setImageUrl(image)
                        .setCreateTime(new Date());
                douBanTopicImgMapper.insertSelective(douBanTopicImg);
            }

            // Store the String, not the StringBuilder, so the payload
            // serializes predictably.
            map.put("text", text.toString());
            map.put("imageList", imageList);
            map.put("createTime", createTime);
            map.put("id", id);
            list.add(map);
        }

        return Result.success().setT(list);
    } catch (Exception e) {
        // NOTE(review): swallowing the exception here means @Transactional
        // never sees it, so earlier inserts in this batch are NOT rolled back
        // on failure — confirm this is intended.
        e.printStackTrace();
        return Result.error();
    }
}

5.我采用的是每20分钟定时爬取一次,建立一个小组表。每次从小组表获取链接开始爬取。

-- Watch-list of Douban groups: the scheduled crawler (every 20 minutes)
-- reads each row's `url` as its crawl entry point.
-- IF NOT EXISTS makes the migration idempotent on re-run.
CREATE TABLE IF NOT EXISTS `douban_group` (
  `id` varchar(30) NOT NULL COMMENT '主键id',
  `url` varchar(255) DEFAULT NULL COMMENT '链接',
  `name` varchar(128) NOT NULL COMMENT '小组名称',
  `douban_group_id` varchar(30) DEFAULT NULL COMMENT '豆瓣组id',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

 

你可能感兴趣的:(基于jsoup的java爬虫-爬取豆瓣小组租房信息)