java爬取当当网所有分类的图书信息(ISBN,作者,出版社,价格,所属分类等)

java爬取当当网所有分类的图书信息(ISBN,作者,出版社,价格,所属分类等)

顺手写的,没有建立新项目,放我自己的项目的一个文件夹里了,有兴趣的朋友可以拉下来试试

https://gitee.com/panlufei/demo/blob/master/src/main/java/com/plf/demo1/spider/DangDangBook.java

首先百度了一下,当当网大概有 900-1000 万本书,量比较大。在 i5(8 代)+ 16G 内存的机器上,大概 1 分钟能爬 60 多条,单台电脑估计要 5 天左右;当然你也可以用多台电脑一起跑,能节省不少时间。

跑下来的数据我用 SQL 格式保存到了 txt 文本文档里,出错的 url 也会收集到单独的文档中。
到时候去 MySQL 里执行这些 insert 语句就行了。
这是表结构:

-- Target table for the INSERT statements generated by the crawler.
-- The generated statements populate a `url` column (the book's detail page),
-- so the table has to define it; without it every INSERT fails.
CREATE TABLE `book` (
  `id` varbinary(32) NOT NULL,
  `isbn` varchar(20) NOT NULL,
  `book_name` varbinary(300) NOT NULL,
  `price` varchar(20) NOT NULL,
  `author` varchar(100) NOT NULL,
  `book_time` varchar(50) NOT NULL,
  `type` varchar(400) NOT NULL,
  `publisher` varchar(200) NOT NULL,
  `url` varchar(500) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

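jar引用:(maven)

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>3.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-collections4</artifactId>
            <version>4.1</version>
        </dependency>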

当当网先拿来所有的分类子url(可忽略)

打开http://category.dangdang.com/cp01.31.00.00.00.00.html
浏览器f12,console下面, 执行下面代码,拿到所有分类,当然你也可以直接跑我的代码,但是可能图书信息就不是最新的了

// Browser-console helper: walks every <a> under .list_product and appends `","` + href
// to one string, then drops the first two characters and adds a closing quote, so the
// console output is a comma-separated list of double-quoted hrefs ("a","b",...).
// NOTE(review): relies on `$` being jQuery on the page — confirm before running.
var url=''; $('.list_product').find('a').each(function(){url+='","'+$(this).attr('href')});console.log(url.substr(2)+'"');

**爬取数据的代码**

package com.plf.demo1.test;

import com.plf.demo1.utils.UuidUtils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;

/**
 * Crawls book detail pages reachable from dangdang.com category listings and
 * writes one SQL {@code INSERT} statement per book to a text file.
 *
 * <p>The category pages are split into four groups. Each group is crawled by
 * its own thread and written to its own result file; URLs that fail to load or
 * parse are appended, together with the exception message, to one error file
 * shared by all threads.
 *
 * <p>Created 2019/7/22.
 *
 * @author panlf
 */
public class Test01 {
    private static final String BASE_URL = "http://category.dangdang.com/";

    /** Number of listing pages requested for every category. */
    private static final int MAX_PAGES = 100;

    /** Timeout for loading a category listing page, in milliseconds. */
    private static final int LIST_TIMEOUT_MS = 30000;

    /** Timeout for loading a book detail page, in milliseconds. */
    private static final int DETAIL_TIMEOUT_MS = 50000;

    /** Column list of the generated statement; value order in {@link #parseBook} must match it. */
    private static final String INSERT_PREFIX =
            "insert into book (id,book_name,price,author,publisher,book_time,isbn,type,url) values";

    private static final String[] TYPE_URL1 = {
            "cp01.43.00.00.00.00.html", "cp01.41.00.00.00.00.html", "cp01.47.00.00.00.00.html", "cp01.03.00.00.00.00.html",
            "cp01.22.00.00.00.00.html", "cp01.21.00.00.00.00.html", "cp01.07.00.00.00.00.html", "cp01.05.00.00.00.00.html",
            "cp01.45.00.00.00.00.html", "cp01.56.00.00.00.00.html", "cp01.28.00.00.00.00.html", "cp01.01.00.00.00.00.html"};
    private static final String[] TYPE_URL2 = {
            "cp01.38.00.00.00.00.html", "cp01.24.00.00.00.00.html", "cp01.31.00.00.00.00.html", "cp01.06.00.00.00.00.html",
            "cp01.55.00.00.00.00.html", "cp01.54.00.00.00.00.html", "cp01.26.00.00.00.00.html", "cp01.18.00.00.00.00.html",
            "cp01.27.00.00.00.00.html", "cp01.10.00.00.00.00.html", "cp01.15.00.00.00.00.html", "cp01.32.00.00.00.00.html",
            "cp01.58.00.00.00.00.html", "cp01.25.00.00.00.00.html", "cp01.63.00.00.00.00.html", "cp01.50.00.00.00.00.html"};
    private static final String[] TYPE_URL3 = {
            "cp01.17.00.00.00.00.html", "cp01.52.00.00.00.00.html", "cp01.12.00.00.00.00.html", "cp01.62.00.00.00.00.html",
            "cp01.09.00.00.00.00.html", "cp01.20.00.00.00.00.html", "cp01.30.00.00.00.00.html", "cp01.34.00.00.00.00.html",
            "cp01.16.00.00.00.00.html", "cp01.14.00.00.00.00.html", "cp01.66.00.00.00.00.html", "cp01.19.00.00.00.00.html"};
    private static final String[] TYPE_URL4 = {
            "cp01.49.00.00.00.00.html", "cp01.11.00.00.00.00.html", "cp01.04.00.00.00.00.html", "cp01.59.00.00.00.00.html",
            "cp01.77.00.00.00.00.html", "cp01.76.00.00.00.00.html", "cp01.78.00.00.00.00.html", "cp01.75.00.00.00.00.html",
            "cp01.68.00.00.00.00.html", "cp01.69.00.00.00.00.html", "cp01.73.00.00.00.00.html", "cp01.74.00.00.00.00.html"};

    /**
     * Opens the four result files and the error file, starts one crawler
     * thread per category group and waits for all of them to finish.
     *
     * @param args unused
     * @throws Exception if an output file cannot be opened or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        // One result file per worker thread.
        BufferedWriter out1 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult01.txt");
        BufferedWriter out2 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult02.txt");
        BufferedWriter out3 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult03.txt");
        BufferedWriter out4 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult04.txt");

        // Error file shared by every worker.
        BufferedWriter outErr = getBufferedOut("C:\\Users\\user\\Desktop\\errorUrl.txt");

        List<Thread> workers = new ArrayList<>();
        workers.add(startThread(out1, outErr, TYPE_URL1, "第一条线程"));
        workers.add(startThread(out2, outErr, TYPE_URL2, "第二条线程"));
        workers.add(startThread(out3, outErr, TYPE_URL3, "第三条线程"));
        workers.add(startThread(out4, outErr, TYPE_URL4, "第四条线程"));

        try {
            for (Thread worker : workers) {
                worker.join();
            }
        } finally {
            // The error writer is shared, so it may only be closed once every
            // worker is done; closing it from a worker breaks the others.
            outErr.close();
        }
    }

    /**
     * Crawls every listing page of the given categories and writes one INSERT
     * statement per successfully parsed book.
     *
     * <p>A listing or detail page that cannot be loaded or parsed is recorded
     * in the error file and skipped; crawling then continues with the next page.
     *
     * @param out     result writer owned by the calling thread; closed before returning
     * @param outErr  error writer shared between threads; left open for the caller to close
     * @param typeUrl category page names, relative to {@link #BASE_URL}
     * @throws IOException if writing to the error file fails
     */
    private static void running(BufferedWriter out, BufferedWriter outErr, String[] typeUrl) throws IOException {
        try {
            for (String type : typeUrl) {
                for (int i = 1; i <= MAX_PAGES; i++) {
                    // The first page has no prefix; later pages are "pg<N>-<category>".
                    String page = i > 1 ? "pg" + i + "-" : "";
                    String url = BASE_URL + page + type;

                    Elements items;
                    try {
                        items = Jsoup.parse(new URL(url), LIST_TIMEOUT_MS).select("a[name=itemlist-picture]");
                    } catch (IOException e) {
                        // One unreachable listing page must not abort the remaining pages and categories.
                        e.printStackTrace();
                        logError(outErr, url, e);
                        continue;
                    }

                    for (Element item : items) {
                        // Resolve relative or protocol-relative links against the listing page;
                        // fall back to the raw attribute so the error log still shows what was found.
                        String detail = item.absUrl("href");
                        if (detail.isEmpty()) {
                            detail = item.attr("href");
                        }
                        try {
                            List<String> params = parseBook(detail);
                            if (params == null) {
                                continue;
                            }
                            out.write(toInsertSql(params) + "\r\n");
                        } catch (Exception e) {
                            e.printStackTrace();
                            logError(outErr, detail, e);
                        }
                    }
                    out.flush();
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Loads a book detail page and extracts the column values in the order
     * expected by {@link #INSERT_PREFIX}.
     *
     * @param detailUrl URL of the book detail page
     * @return the nine column values, or {@code null} when the page has no {@code .messbox_info} block
     * @throws IOException if the page cannot be loaded
     */
    private static List<String> parseBook(String detailUrl) throws IOException {
        Document page = Jsoup.parse(new URL(detailUrl), DETAIL_TIMEOUT_MS);
        List<String> params = new ArrayList<>();

        // id: random UUID without dashes (32 characters)
        params.add(UUID.randomUUID().toString().replace("-", ""));

        // book name
        params.add(page.select(".name_info").get(0).select("h1").get(0).attr("title"));

        // price: keep only the text after the last '>' of the element's inner HTML
        String price = page.select("#original-price").html();
        params.add(price.substring(price.lastIndexOf(">") + 1));

        Elements infoBox = page.select(".messbox_info");
        if (infoBox.isEmpty()) {
            return null;
        }
        Elements fields = infoBox.get(0).select(".t1");

        // author(s)
        params.add(joinLinkTexts(fields.get(0)));

        // publisher(s)
        params.add(joinLinkTexts(fields.get(1)));

        // publication time: skips a 5-character prefix and stops at the first '&'
        // (presumably the "出版时间:" label and a trailing &nbsp; — verify against the live page)
        String time = fields.get(2).html();
        params.add(time.substring(5, time.indexOf("&")));

        // ISBN: skips an 11-character prefix (presumably "国际标准书号ISBN:" — verify against the live page)
        String isbn = page.select("#detail_describe").get(0).child(0).child(4).html();
        params.add(isbn.substring(11));

        // category path
        params.add(joinLinkTexts(page.select("#detail-category-path").get(0).child(1)));

        // url
        params.add(detailUrl);
        return params;
    }

    /** Joins the inner HTML of every {@code <a>} below {@code parent} with commas. */
    private static String joinLinkTexts(Element parent) {
        return parent.select("a").stream().map(Element::html).collect(Collectors.joining(","));
    }

    /** Builds the complete INSERT statement, quoting and escaping every value. */
    private static String toInsertSql(List<String> params) {
        return INSERT_PREFIX + params.stream()
                .map(Test01::escapeSql)
                .collect(Collectors.joining("','", "('", "');"));
    }

    /** Escapes backslashes and single quotes so the value is safe inside a MySQL string literal. */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("'", "''");
    }

    /**
     * Appends a failed URL and the exception message to the shared error file.
     * The two lines are written under one lock so records from different
     * threads cannot interleave.
     */
    private static void logError(BufferedWriter outErr, String url, Exception e) throws IOException {
        synchronized (outErr) {
            outErr.write(url + "\r\n");
            outErr.write(e.getMessage() + "\r\n");
            outErr.flush();
        }
    }

    /**
     * Starts a named thread that crawls the given categories.
     *
     * @return the started thread, so the caller can wait for it
     */
    private static Thread startThread(BufferedWriter out, BufferedWriter outErr, String[] typeUrl, String threadName) {
        Thread worker = new Thread(() -> {
            try {
                running(out, outErr, typeUrl);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }, threadName);
        worker.start();
        return worker;
    }

    /**
     * Opens {@code result1} for appending, creating the file if it does not exist.
     * UTF-8 is used explicitly so the Chinese text does not depend on the platform charset.
     *
     * @param result1 path of the output file
     * @return a buffered writer positioned at the end of the file
     * @throws IOException if the file cannot be created or opened
     */
    private static BufferedWriter getBufferedOut(String result1) throws IOException {
        return Files.newBufferedWriter(Paths.get(result1), StandardCharsets.UTF_8,
                StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    }
}

你可能感兴趣的:(java)