顺手写的,没有建立新项目,放我自己的项目的一个文件夹里了,有兴趣的朋友可以拉下来试试
https://gitee.com/panlufei/demo/blob/master/src/main/java/com/plf/demo1/spider/DangDangBook.java
首先百度了一下,当当网大概有900-1000w本书,量比较大。我的电脑是 i5(8代)+16G 内存,大概1分钟能抓60多条,单台电脑估计要5天左右;当然你也可以用多台电脑跑,能节省不少时间。
跑下来的数据,我按sql insert语句的格式保存到了txt文本文档里,出错的url也会单独收集到一个文档里。
到时候去mysql执行这些insert语句就行了;
这是表结构:
-- Target table for the INSERT statements produced by the crawler.
-- Every column is filled by the crawler; `id` is a 32-character hex UUID.
CREATE TABLE `book` (
  `id` varbinary(32) NOT NULL,
  `isbn` varchar(20) NOT NULL,
  `book_name` varbinary(300) NOT NULL,
  `price` varchar(20) NOT NULL,
  `author` varchar(100) NOT NULL,
  `book_time` varchar(50) NOT NULL,
  `type` varchar(400) NOT NULL,
  `publisher` varchar(200) NOT NULL,
  -- The generated INSERT statements also write the product page URL,
  -- so the column has to exist or every statement is rejected.
  `url` varchar(255) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
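jar引用:(maven)
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>3.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-collections4</artifactId>
    <version>4.1</version>
</dependency>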
打开http://category.dangdang.com/cp01.31.00.00.00.00.html
浏览器f12,console下面, 执行下面代码,拿到所有分类,当然你也可以直接跑我的代码,但是可能图书信息就不是最新的了
// Gather the href of every link inside .list_product and print them as one
// line of comma-separated, double-quoted strings.
var url = '';
var links = $('.list_product').find('a');
for (var i = 0; i < links.length; i++) {
    url += '","' + $(links[i]).attr('href');
}
// Strip the leading `",` produced by the first concatenation and close the last quote.
console.log(url.substr(2) + '"');
**
**
package com.plf.demo1.test;
import com.plf.demo1.utils.UuidUtils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;
/**
 * Crawls book listings from category.dangdang.com and writes one SQL INSERT statement per book.
 *
 * <p>The categories are split into four groups. Each group is crawled by its own thread and
 * written to its own result file; URLs that fail are appended to a single shared error file.
 * Created 2019/7/22 15:25.
 *
 * @author panlf
 */
public class Test01 {
    /** Root of the category listing pages; the page prefix and category file name are appended. */
    private static final String BASE_URL = "http://category.dangdang.com/";
    /** Number of listing pages requested for every category. */
    private static final int MAX_PAGES = 100;
    /** Timeout in milliseconds for fetching a category listing page. */
    private static final int LIST_TIMEOUT_MILLIS = 30000;
    /** Timeout in milliseconds for fetching a book detail page. */
    private static final int DETAIL_TIMEOUT_MILLIS = 50000;
    /** Fixed head of every generated statement; the quoted value tuple is appended to it. */
    private static final String INSERT_PREFIX =
            "insert into book (id,book_name,price,author,publisher,book_time,isbn,type,url) values";
    /** Guards the shared error file so the lines of one error record stay together. */
    private static final Object ERR_LOCK = new Object();

    /** Category pages crawled by the first worker. */
    private static final String[] TYPE_URL1 = {
        "cp01.43.00.00.00.00.html", "cp01.41.00.00.00.00.html", "cp01.47.00.00.00.00.html", "cp01.03.00.00.00.00.html",
        "cp01.22.00.00.00.00.html", "cp01.21.00.00.00.00.html", "cp01.07.00.00.00.00.html", "cp01.05.00.00.00.00.html",
        "cp01.45.00.00.00.00.html", "cp01.56.00.00.00.00.html", "cp01.28.00.00.00.00.html", "cp01.01.00.00.00.00.html"};
    /** Category pages crawled by the second worker. */
    private static final String[] TYPE_URL2 = {
        "cp01.38.00.00.00.00.html", "cp01.24.00.00.00.00.html", "cp01.31.00.00.00.00.html", "cp01.06.00.00.00.00.html",
        "cp01.55.00.00.00.00.html", "cp01.54.00.00.00.00.html", "cp01.26.00.00.00.00.html", "cp01.18.00.00.00.00.html",
        "cp01.27.00.00.00.00.html", "cp01.10.00.00.00.00.html", "cp01.15.00.00.00.00.html", "cp01.32.00.00.00.00.html",
        "cp01.58.00.00.00.00.html", "cp01.25.00.00.00.00.html", "cp01.63.00.00.00.00.html", "cp01.50.00.00.00.00.html"};
    /** Category pages crawled by the third worker. */
    private static final String[] TYPE_URL3 = {
        "cp01.17.00.00.00.00.html", "cp01.52.00.00.00.00.html", "cp01.12.00.00.00.00.html", "cp01.62.00.00.00.00.html",
        "cp01.09.00.00.00.00.html", "cp01.20.00.00.00.00.html", "cp01.30.00.00.00.00.html", "cp01.34.00.00.00.00.html",
        "cp01.16.00.00.00.00.html", "cp01.14.00.00.00.00.html", "cp01.66.00.00.00.00.html", "cp01.19.00.00.00.00.html"};
    /** Category pages crawled by the fourth worker. */
    private static final String[] TYPE_URL4 = {
        "cp01.49.00.00.00.00.html", "cp01.11.00.00.00.00.html", "cp01.04.00.00.00.00.html", "cp01.59.00.00.00.00.html",
        "cp01.77.00.00.00.00.html", "cp01.76.00.00.00.00.html", "cp01.78.00.00.00.00.html", "cp01.75.00.00.00.00.html",
        "cp01.68.00.00.00.00.html", "cp01.69.00.00.00.00.html", "cp01.73.00.00.00.00.html", "cp01.74.00.00.00.00.html"};

    /**
     * Opens the four result files and the error file, starts one worker per category group and
     * waits for all of them to finish.
     *
     * @param args unused
     * @throws Exception if an output file cannot be opened or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Results go to four files, one per worker.
        BufferedWriter out1 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult01.txt");
        BufferedWriter out2 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult02.txt");
        BufferedWriter out3 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult03.txt");
        BufferedWriter out4 = getBufferedOut("C:\\Users\\user\\Desktop\\sqlresult04.txt");
        // The error file is shared by every worker, so it must stay open until the last one
        // has finished; closing it from inside a worker would break the workers still running.
        try (BufferedWriter outErr = getBufferedOut("C:\\Users\\user\\Desktop\\errorUrl.txt")) {
            Thread[] workers = {
                startThread(out1, outErr, TYPE_URL1, "第一条线程"),
                startThread(out2, outErr, TYPE_URL2, "第二条线程"),
                startThread(out3, outErr, TYPE_URL3, "第三条线程"),
                startThread(out4, outErr, TYPE_URL4, "第四条线程")};
            for (Thread worker : workers) {
                worker.join();
            }
        }
    }

    /**
     * Crawls every listing page of the given categories and writes one INSERT statement per book.
     * The result writer is flushed after each listing page and closed when the method returns.
     *
     * @param out     result writer owned by this worker; closed on exit
     * @param outErr  shared error writer; never closed here
     * @param typeUrl category page file names, appended to {@link #BASE_URL}
     * @throws IOException if writing to {@code out} fails
     */
    private static void running(BufferedWriter out, BufferedWriter outErr, String[] typeUrl) throws IOException {
        try {
            for (String type : typeUrl) {
                for (int i = 1; i <= MAX_PAGES; i++) {
                    // The first page has no prefix; later pages are addressed as "pg<N>-<category>".
                    String page = i > 1 ? "pg" + i + "-" : "";
                    String url = BASE_URL + page + type;
                    Elements links;
                    try {
                        links = Jsoup.parse(new URL(url), LIST_TIMEOUT_MILLIS).select("a[name=itemlist-picture]");
                    } catch (IOException e) {
                        // One unreachable listing page must not end the whole worker.
                        logError(outErr, url, e);
                        continue;
                    }
                    for (Element link : links) {
                        String detail = link.attr("href");
                        String sql;
                        try {
                            sql = buildInsert(detail);
                        } catch (IOException | RuntimeException e) {
                            // Fetch failures and pages with an unexpected layout are recorded and skipped.
                            logError(outErr, detail, e);
                            continue;
                        }
                        if (sql != null) {
                            out.write(sql + "\r\n");
                        }
                    }
                    out.flush();
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Fetches one book detail page and turns it into an INSERT statement.
     *
     * @param detail URL of the book detail page
     * @return the complete statement, or {@code null} when the page has no {@code .messbox_info}
     *         block (such items are skipped without being logged)
     * @throws IOException if the page cannot be fetched
     */
    private static String buildInsert(String detail) throws IOException {
        Document parse = Jsoup.parse(new URL(detail), DETAIL_TIMEOUT_MILLIS);
        List<String> params = new ArrayList<>();
        params.add(UUID.randomUUID().toString().replace("-", ""));
        // Book name.
        params.add(parse.select(".name_info").get(0).select("h1").get(0).attr("title"));
        // Price: keep only the text after the last tag inside #original-price.
        String price = parse.select("#original-price").html();
        params.add(price.substring(price.lastIndexOf(">") + 1));
        Elements infoBoxes = parse.select(".messbox_info");
        if (infoBoxes.isEmpty()) {
            return null;
        }
        Elements fields = infoBoxes.get(0).select(".t1");
        // Author.
        params.add(joinLinkTexts(fields.get(0)));
        // Publisher.
        params.add(joinLinkTexts(fields.get(1)));
        // Publication time: presumably a 5-character label followed by the date and an HTML
        // entity; the value between them is kept. TODO confirm against the live page.
        String time = fields.get(2).html();
        params.add(time.substring(5, time.indexOf("&")));
        // ISBN: presumably preceded by an 11-character label. TODO confirm against the live page.
        String isbn = parse.select("#detail_describe").get(0).child(0).child(4).html();
        params.add(isbn.substring(11));
        // Category path.
        params.add(joinLinkTexts(parse.select("#detail-category-path").get(0).child(1)));
        // Source URL.
        params.add(detail);
        return INSERT_PREFIX
                + params.stream().map(Test01::quote).collect(Collectors.joining(",", "(", ");"));
    }

    /** Joins the inner HTML of every link below {@code parent} with commas. */
    private static String joinLinkTexts(Element parent) {
        return parent.select("a").stream().map(Element::html).collect(Collectors.joining(","));
    }

    /**
     * Wraps a scraped value in single quotes, escaping backslashes and single quotes so that a
     * value such as a title containing an apostrophe cannot break the statement.
     */
    private static String quote(String value) {
        return "'" + value.replace("\\", "\\\\").replace("'", "''") + "'";
    }

    /**
     * Records a failed URL and its cause in the shared error file as two consecutive lines.
     * A failure to write the record is reported on stderr and does not stop the crawl.
     */
    private static void logError(BufferedWriter outErr, String url, Exception cause) {
        cause.printStackTrace();
        synchronized (ERR_LOCK) {
            try {
                outErr.write(url + "\r\n");
                // toString() is used because getMessage() may be null.
                outErr.write(cause + "\r\n");
                outErr.flush();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Starts a named worker thread that crawls the given categories.
     *
     * @return the started thread, so the caller can wait for it
     */
    private static Thread startThread(BufferedWriter out, BufferedWriter outErr, String[] typeUrl, String threadName) {
        Thread worker = new Thread(() -> {
            try {
                running(out, outErr, typeUrl);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }, threadName);
        worker.start();
        return worker;
    }

    /**
     * Opens {@code result1} for appending, creating the file if it does not exist yet.
     * The file is written as UTF-8 rather than in the platform default charset.
     *
     * @param result1 path of the file to open
     * @throws IOException if the file cannot be created or opened
     */
    private static BufferedWriter getBufferedOut(String result1) throws IOException {
        return Files.newBufferedWriter(Paths.get(result1), StandardCharsets.UTF_8,
                StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    }
}