A First Taste of Web Scraping (Downloading a Novel)

Goal:

Scrape the novel 《神墓》 from the Bixia Wenxue (笔下文学) novel site and save it to a local txt file

Materials:

jsoup

Walkthrough:

Project structure:

(screenshot: project structure)

pom:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>
Page elements:

(screenshots: inspecting the chapter page elements in the browser)
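From the inspection, the elements that matter are: the chapter title is the h1 inside the div with class bookname, the body text lives in the element with id content, and the link to the next chapter is the element with id A3. As a sketch, assuming dc is the parsed chapter page, the extraction with jsoup looks like this (the same selectors are used in the implementation below):

String title      = dc.select("div.bookname > h1").text();   // chapter title
String content    = dc.getElementById("content").text();     // chapter body text
String pager_next = dc.getElementById("A3").attr("href");    // relative link to the next chapter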

Implementation:

package com.lxl.txt.controller;

import com.lxl.txt.bean.RuYi;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class MySelf {

    private static RuYi getDetail(String url) throws IOException {

        // Base site address, used to resolve the relative next-chapter link
        String next_url = "https://www.bxwxorg.com";

        // Fetch and parse the chapter page (propagate IOException so a failed request
        // stops the crawl instead of causing a NullPointerException below)
        Document dc = Jsoup.connect(url).timeout(5000).get();
        //System.out.println(dc);

        // Chapter title: the h1 inside the div with class "bookname"
        String title = dc.select("div.bookname > h1").text();
        // Chapter body text
        String content = dc.getElementById("content").text();
        // Relative path of the next chapter (the link with id "A3"), resolved against the base address
        String pager_next = dc.getElementById("A3").attr("href");
        next_url = next_url + pager_next;

        RuYi ruyi = new RuYi();
        ruyi.setTitle(title);
        ruyi.setContent(content);
        ruyi.setNextUrl(next_url);
        ruyi.setPager_next(pager_next);

        return ruyi;
    }

    // Write the accumulated text out to a file
    private static void outPut(StringBuffer buffer){
        try {
            File writeName = new File("E:\\output.txt");     // absolute output path; created below if it does not exist
            writeName.createNewFile();                       // creates the file only if it does not already exist; FileWriter then overwrites the old content
            try (FileWriter writer = new FileWriter(writeName);
                 BufferedWriter out = new BufferedWriter(writer)
            ) {
                out.write(buffer + "\r\n");                  // \r\n is the Windows line break
                out.flush();                                 // flush the buffer to the file
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    public static void main(String[] args) throws IOException {

        // Starting URL (the first chapter)
        String url = "https://www.bxwxorg.com/read/36/677091.html";

        // Fetch the first chapter
        RuYi ruyi = getDetail(url);

        // Append title and content to the buffer
        StringBuffer buffer = new StringBuffer();
        buffer.append(ruyi.getTitle() + "\r\n");
        buffer.append(ruyi.getContent() + "\r\n");

        // Keep following the "next chapter" link; stop when it points back to the book index "/read/36/", i.e. the final chapter has been reached
        while(ruyi.getNextUrl() != null && ruyi.getContent() != null && !ruyi.getPager_next().equals("/read/36/")){
            // Fetch the next chapter
            ruyi = getDetail(ruyi.getNextUrl());

            // Append title and content to the buffer
            buffer.append(ruyi.getTitle() + "\r\n");
            buffer.append(ruyi.getContent() + "\r\n");
        }

        // Write everything to the txt file
        outPut(buffer);
    }

}
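The version above buffers the whole novel in a StringBuffer and writes the file once at the end. A possible variant, sketched below under the same assumptions (same getDetail, output path and stop condition, placed in the same class), appends each chapter to the file as soon as it is fetched and pauses between requests so the site is not hammered:

    // Hypothetical alternative to main(): stream chapters to disk instead of buffering them all,
    // and sleep between requests. Assumes it lives in the same class as getDetail().
    public static void crawlIncrementally(String firstChapterUrl) throws IOException, InterruptedException {
        try (BufferedWriter out = new BufferedWriter(new FileWriter("E:\\output.txt"))) {
            RuYi ruyi = getDetail(firstChapterUrl);
            while (true) {
                out.write(ruyi.getTitle() + "\r\n");
                out.write(ruyi.getContent() + "\r\n");
                if (ruyi.getPager_next().equals("/read/36/")) {
                    break;                  // the "next" link points back to the book index: last chapter written
                }
                Thread.sleep(1000);         // be polite: roughly one request per second
                ruyi = getDetail(ruyi.getNextUrl());
            }
        }
    }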

Entity class:

package com.lxl.txt.bean;

public class RuYi {

    private String title;       // chapter title
    private String content;     // chapter content
    private String nextUrl;     // absolute URL of the next chapter
    private String pager_next;  // relative path of the next chapter


    public String getPager_next() {
        return pager_next;
    }

    public void setPager_next(String pager_next) {
        this.pager_next = pager_next;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getNextUrl() {
        return nextUrl;
    }

    public void setNextUrl(String nextUrl) {
        this.nextUrl = nextUrl;
    }
}

Result:

(screenshot: the result)

That's all.
