使用 Jsoup 爬取豆瓣读书

package Spider;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Crawls Douban book tags and, for every tag, the books listed under it,
 * storing the results through {@code DataBaseHelp}.
 *
 * <p>The persistence layer ({@code DataBaseHelp}) is not shown here because it
 * differs per deployment; every place that touches it is marked with a comment.
 */
public class Spider {
    /**
     * Step applied to the {@code start} query parameter between page requests.
     * NOTE(review): the author's original note said a page shows 20 books, but
     * the step in code has always been 15 - confirm against the live site.
     */
    private static final int PAGE_SIZE = 15;

    /**
     * Pause between page requests. Author's note: 2 seconds was the smallest
     * interval that did not trigger HTTP 403 from Douban during experiments.
     */
    private static final long REQUEST_INTERVAL_MS = 2000L;

    /** How many times one page is requested before the tag is abandoned. */
    private static final int MAX_ATTEMPTS_PER_PAGE = 5;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36";

    /**
     * A cookie must be sent, otherwise Douban blocks the client after a few
     * requests. Any value works, but one shaped like a real {@code bid} cookie
     * is preferable (author's note).
     */
    private static final String COOKIE = "bid=\"Q5KWZL7y8g7\";";

    // Database layer (implementation not shown).
    DataBaseHelp db = new DataBaseHelp();

    /**
     * Fetches the tag list and crawls the books of every tag in turn.
     * Stops early if the current thread has been interrupted.
     */
    public void run() {
        for (String tag : getTagsStringList()) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            // getBookListbyTag() registers the tag in the database itself, so
            // no separate selectAndInsertTagId() call is needed here.
            getBookListbyTag(tag);
        }
    }

    /**
     * Crawls every result page of {@code tag}, inserts each rated book into the
     * database and returns the books that were stored.
     *
     * <p>Paging stops when a page contains no books, when a page still fails
     * after {@link #MAX_ATTEMPTS_PER_PAGE} attempts, or when the thread is
     * interrupted.
     *
     * @param tag the tag name as displayed by Douban (not URL-encoded)
     * @return the books inserted for this tag; never {@code null}
     */
    public List<Book> getBookListbyTag(String tag) {
        List<Book> bookList = new ArrayList<>();
        // Database: make sure the tag exists and obtain its id.
        int tagId = db.selectAndInsertTagId(tag);
        String encodedTag = encodePathSegment(tag);

        // Pages live at https://www.douban.com/tag/<tag>/book?start=<offset>.
        for (int num = 0;; num += PAGE_SIZE) {
            Document doc = fetchTagPage(encodedTag, num);
            if (doc == null) {
                // Gave up on this page (repeated failure or interruption).
                break;
            }
            Elements bookElements = doc.select("div.book-list > dl");
            // No books on this page: the tag is exhausted.
            if (bookElements.isEmpty()) {
                break;
            }
            System.out.println(bookElements.select("a.title").html());
            for (int i = 0; i < bookElements.size(); i++) {
                String bookName = bookElements.get(i).select("a.title").html();
                String bookDetail = bookElements.get(i).html();
                String bookRank = bookElements.get(i).select("span.rating_nums").html();
                // Books without a rating are skipped.
                if (!bookRank.isEmpty()) {
                    System.out.println(bookName + bookRank + "tag" + tag + "num" + num);
                    // Database: persist the book under the current tag.
                    db.insertBook(bookName, bookDetail, bookRank, tagId);
                    bookList.add(new Book(bookName, bookDetail, bookRank));
                }
            }
            // Throttle requests; stop if interrupted while waiting.
            if (!pause(REQUEST_INTERVAL_MS)) {
                break;
            }
        }
        return bookList;
    }

    /**
     * Downloads the tag index page and extracts the tag names from it.
     *
     * @return the tag names in page order; empty if the page could not be
     *         fetched or contains no tags
     */
    public List<String> getTagsStringList() {
        List<String> tags = new ArrayList<>();
        try {
            // Same cookie as for the book pages; the remaining headers mimic a browser.
            Document doc = Jsoup
                    .connect("https://book.douban.com/tag/")
                    .header("host", "book.douban.com")
                    .header("accept",
                            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                    .header("scheme", "https")
                    .header("version", "HTTP/1.1")
                    .header("path", "/tag/")
                    .header("accept-encoding", "gzip, deflate, sdch")
                    .header("accept-language", "zh-CN,zh;q=0.8")
                    .header("cookie", COOKIE)
                    .header("cache-control", "max-age=0")
                    .userAgent(USER_AGENT)
                    .get();

            Elements tagLinks = doc.select("div.article").select("a.tag");
            for (int i = 0; i < tagLinks.size(); i++) {
                // text() yields the decoded tag name; html() would keep
                // entities such as "&amp;" and corrupt the URL built from it.
                String name = tagLinks.get(i).text().trim();
                if (!name.isEmpty()) {
                    tags.add(name);
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch the tag list: " + e);
        }
        return tags;
    }

    /**
     * Requests one result page of a tag, retrying with a growing delay.
     *
     * @param encodedTag the tag, already encoded for use in a URL path
     * @param start      value of the {@code start} query parameter
     * @return the parsed page, or {@code null} if every attempt failed or the
     *         thread was interrupted while backing off
     */
    private Document fetchTagPage(String encodedTag, int start) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS_PER_PAGE; attempt++) {
            try {
                return Jsoup
                        .connect("https://www.douban.com/tag/" + encodedTag + "/book")
                        .data("start", Integer.toString(start))
                        .userAgent(USER_AGENT)
                        .header("cookie", COOKIE)
                        .get();
            } catch (IOException e) {
                System.err.println("Request failed (attempt " + attempt + "/"
                        + MAX_ATTEMPTS_PER_PAGE + ", start=" + start + "): " + e);
                // Back off before retrying so a rate-limited client does not
                // hammer the server in a tight loop.
                if (!pause(REQUEST_INTERVAL_MS * attempt)) {
                    return null;
                }
            }
        }
        return null;
    }

    /**
     * Percent-encodes {@code segment} for use inside a URL path.
     * {@link URLEncoder} produces form encoding, where a space becomes "+";
     * in a path "+" is a literal plus, so it is rewritten to "%20". A literal
     * "+" in the input is already emitted as "%2B" and is not affected.
     */
    private static String encodePathSegment(String segment) {
        try {
            return URLEncoder.encode(segment, "UTF-8").replace("+", "%20");
        } catch (IOException e) {
            // UnsupportedEncodingException: cannot happen, UTF-8 is mandatory on every JVM.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    /**
     * Sleeps for {@code millis} milliseconds.
     *
     * @return {@code true} if the full pause elapsed, {@code false} if the
     *         thread was interrupted (the interrupt flag is restored)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    public static void main(String[] args) {
        Spider s = new Spider();
        s.run();
    }
}

你可能感兴趣的:(爬虫,java)