Java多线程爬取全书网小说

先给大家贴上全书网网址:http://www.quanshuwang.com/

本程序采用的是WebMagic爬虫框架;WebMagic是一个简单灵活的Java爬虫框架,你可以用它快速开发出一个高效、易维护的爬虫。
程序需要使用到WebMagic框架的jar,大家可以自行百度。
废话不说,直接上代码

package com.baweihu.wler;

import java.util.UUID;

import com.alibaba.fastjson.JSONObject;
import com.baweihu.entity.Novel;
import com.baweihu.util.FileUtil;
import com.baweihu.util.ImageDownload;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
/**
 * Crawls novel metadata pages from quanshuwang.com using the WebMagic framework.
 *
 * <p>Each instance plays two roles: as a WebMagic {@link PageProcessor} it parses a
 * single book page, and as a {@link Runnable} it crawls a contiguous range of book
 * ids on its own thread.
 *
 * @author huluwa
 * @version 1.0.0
 * */
public class QuanShu implements Runnable,PageProcessor {
    // Site configuration: GBK charset (the target site is GBK-encoded),
    // no retries, no crawl delay, 8s connect/read timeout.
    private Site site = Site.me().setCharset("gbk").setRetryTimes(0).setSleepTime(0).setTimeOut(8000);
    // Book-id range assigned to this worker: [start, end).
    private int start;
    private int end;

    /**
     * Creates a worker responsible for book ids in {@code [start, end)}.
     *
     * @param start first book id to crawl (inclusive)
     * @param end   last book id bound (exclusive)
     */
    public QuanShu(int start, int end) {
        super();
        this.start = start;
        this.end = end;
    }

    /** No-arg constructor used when the instance acts only as a {@link PageProcessor}. */
    public QuanShu() {
        super();
    }

    public Site getSite() {
        return site;
    }

    /**
     * Crawls every book page in this worker's id range, launching one
     * short-lived spider per page.
     */
    public void run() {
        for (; start < end; start++) {
            Spider.create(new QuanShu())
                    .addUrl("http://www.quanshuwang.com/book_" + start + ".html")
                    .thread(2)
                    .run();
        }
    }

    /**
     * Splits the id range {@code [0, length)} into roughly equal chunks and
     * starts one crawler thread per chunk.
     *
     * @param length    total number of book ids to crawl
     * @param threadNum number of worker threads to start
     */
    public synchronized void handleList(int length, int threadNum) {
        if (length <= 0 || threadNum <= 0) {
            return; // nothing to crawl, or invalid thread count
        }
        // Ceiling division spreads the remainder across the threads. The old
        // formula (length / (threadNum - 1) on a non-zero remainder) produced
        // oversized chunks, so the last threads scanned ids well past 'length'.
        int chunk = (length + threadNum - 1) / threadNum;
        for (int i = 0; i < threadNum; i++) {
            int start = i * chunk;
            int end = Math.min((i + 1) * chunk, length); // clamp: never crawl past 'length'
            if (start >= end) {
                break; // all ids already assigned to earlier threads
            }
            new Thread(new QuanShu(start, end)).start();
        }
    }

    /**
     * Parses one book page: reads novel metadata out of the page's
     * {@code <meta>} tags, serializes it to JSON, and persists it together
     * with the cover image under a fresh UUID-named directory.
     *
     * @param page the fetched page supplied by WebMagic
     */
    public void process(Page page) {
        try {
            Html html = page.getHtml();
            // NOTE(review): the meta-tag indices below assume a fixed page layout
            // on quanshuwang.com — verify against a live page if parsing breaks.
            String title = html.xpath("/html/head/meta[5]/@content").get();
            if (null == title) {
                return; // no title means this book id has no page; skip it
            }
            String author = html.xpath("/html/head/meta[9]/@content").get();
            String classfy = html.xpath("/html/head/meta[8]/@content").get();
            String isFinish = html.xpath("/html/head/meta[11]/@content").get();
            String coverUrl = html.xpath("/html/head/meta[7]/@content").get();
            String chapterUrl = html.xpath("/html/head/meta[14]/@content").get();
            String introduce = html.xpath("/html/head/meta[6]/@content").get();
            String updateTime = html.xpath("/html/head/meta[12]/@content").get();
            String json = JSONObject.toJSONString(new Novel(title, author, classfy, isFinish, introduce, coverUrl, chapterUrl, updateTime));
            // Each novel gets its own directory keyed by a random UUID.
            String no = UUID.randomUUID().toString().replace("-", "");
            FileUtil.writeToFile("file/" + no + "/info.txt", json);
            ImageDownload.downloadPicture(coverUrl, "file/" + no + "/cover.jpg");
        } catch (Exception e) {
            // Best-effort crawl: log the failure and move on to the next page.
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        QuanShu qs = new QuanShu();
        // 95730 was the approximate maximum book id at the time of writing.
        qs.handleList(95730, 12);
    }
}

我的个人博客网站:王甲斌个人博客 大家没事的话可以多逛逛

你可能感兴趣的:(Java多线程爬取全书网小说)