用到的工具:
1、FastJson(JSON与Java对象进行转换)
2、Lombok(简化JavaBean的编写)
3、线程池(提高爬虫爬取效率,多线程执行任务)
4、批量提交执行SQL(减少与数据库的连接,可以批量插入数据)
5、Jsoup(一个基于Java开发的爬虫库)
建议使用Maven进行开发,以下是我用到的Pom文件:
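<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.47</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.54</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.4</version>
    </dependency>
</dependencies>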
实体类对象VO:QuestionVO
package drive;
import com.alibaba.fastjson.annotation.JSONField;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.io.Serializable;
/**
* Value object for one driving-test question as parsed from the crawled JSON.
* Getters, setters, toString etc. and both constructors are generated by the
* Lombok annotations on the class; each field's JSON key is given by its
* {@code @JSONField} annotation.
*
* @author 小书包
* @date 2018 /12/31 11:48
*/
@Data
@AllArgsConstructor
@NoArgsConstructor
public class QuestionVO implements Serializable {
/**
* Sample record, listed by Java property name (the JSON keys are the
* {@code @JSONField} names below, which differ for several fields):
* id : 2
* question : If a motor-vehicle driver's illegal driving causes a major traffic accident that constitutes a crime, what liability is pursued according to law?
* answerA : criminal liability
* answerB : civil liability
* answerC : economic liability
* answerD : direct liability
* correctAnswer : 1
* imageUrl :
* bestAnswer : Article 101 of the Road Traffic Safety Law: whoever violates road traffic safety laws and regulations and causes a major traffic accident that constitutes a crime shall be held criminally liable according to law, and the traffic administration department of the public security organ shall revoke the motor vehicle driving licence.
* bestAnswerId : 2600002
* type : 2
* sinaImg :
*/
// Question id as delivered in the JSON.
@JSONField(name = "id")
private int id;
// Question text.
@JSONField(name = "question")
private String question;
// Answer option A (JSON key "a").
@JSONField(name = "a")
private String answerA;
// Answer option B (JSON key "b").
@JSONField(name = "b")
private String answerB;
// Answer option C (JSON key "c").
@JSONField(name = "c")
private String answerC;
// Answer option D (JSON key "d").
@JSONField(name = "d")
private String answerD;
/**
* Correct answer, encoded as the digits of the correct options.
* Single choice: 1:A; 2:B; 3:C; 4:D;
* Multiple choice: 12:AB; 13:AC; 14:AD; 23:BC; 24:BD; 34:CD; 123:ABC; 124:ABD; 234:BCD; 1234:ABCD;
* True/false: 1: true; 2: false;
*/
@JSONField(name = "correctAnswer")
private int correctAnswer;
// Original image URL.
@JSONField(name = "imageurl")
private String imageUrl;
// Explanation of the answer.
@JSONField(name = "bestanswer")
private String bestAnswer;
// Identifier of the explanation (kept as text).
@JSONField(name = "bestanswerid")
private String bestAnswerId;
/**
* Question type:
* 1. true/false
* 2. single choice
* 3. multiple choice
*/
@JSONField(name = "Type")
private int type;
// Sina image file name. If empty, imageUrl is used as the image address; otherwise the
// address is built as "http://ww" + n + ".sinaimg.cn/mw600/" + sinaImg, where n is a random
// host number (see the QuestionDO(QuestionVO) constructor for the exact range).
@JSONField(name = "sinaimg")
private String sinaImg;
}
@Data注解:提供了setter、getter、toString等方法的编写
@AllArgsConstructor:生成包含所有字段作为参数的构造方法
@NoArgsConstructor:无参构造方法
@JSONField:json对象中对应的名称
实体类对象DO:QuestionDO
package drive;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.sql.Timestamp;
import java.util.Random;
/**
 * Database-facing form of a driving-test question: the columns written to the
 * {@code question} table, plus creation/update timestamps.
 *
 * <p>Getters, setters and the no-arg/all-args constructors are generated by Lombok.
 *
 * @author 小书包
 * @date 2018/12/31 13:40
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class QuestionDO {
    /** Question text. */
    private String question;
    /** Answer option A. */
    private String answerA;
    /** Answer option B. */
    private String answerB;
    /** Answer option C. */
    private String answerC;
    /** Answer option D. */
    private String answerD;
    /** Correct answer, encoded as the digits of the correct options (see {@link QuestionVO}). */
    private int correctAnswer;
    /** Final image address: either the Sina-hosted URL or the original image URL. */
    private String imageUrl;
    /** Explanation of the answer. */
    private String bestAnswer;
    /** Question type code (see {@link QuestionVO}). */
    private int type;
    /** Time this object was created from the crawled data. */
    private Timestamp createTime;
    /** Time of the last update; equal to {@link #createTime} on construction. */
    private Timestamp updateTime;

    /**
     * Converts a crawled {@link QuestionVO} into its database form.
     *
     * <p>When the VO carries a Sina image name, the image URL is built from one of the
     * hosts {@code ww1}..{@code ww4}.sinaimg.cn chosen at random; otherwise the VO's
     * original image URL is copied.
     *
     * @param questionVO the parsed question; must not be {@code null}
     * @throws NullPointerException if {@code questionVO} is {@code null}
     */
    public QuestionDO(QuestionVO questionVO) {
        this.question = questionVO.getQuestion();
        this.answerA = questionVO.getAnswerA();
        this.answerB = questionVO.getAnswerB();
        this.answerC = questionVO.getAnswerC();
        this.answerD = questionVO.getAnswerD();
        this.correctAnswer = questionVO.getCorrectAnswer();

        // A missing "sinaimg" key leaves the field null; treat that like an empty value
        // instead of building a URL that ends in ".../mw600/null".
        String sinaImg = questionVO.getSinaImg();
        if (sinaImg != null && !sinaImg.isEmpty()) {
            int host = new Random().nextInt(4) + 1;
            this.imageUrl = "http://ww" + host + ".sinaimg.cn/mw600/" + sinaImg;
        } else {
            this.imageUrl = questionVO.getImageUrl();
        }

        this.bestAnswer = questionVO.getBestAnswer();
        this.type = questionVO.getType();

        // Read the clock once so both timestamps are guaranteed to be equal.
        long now = System.currentTimeMillis();
        this.createTime = new Timestamp(now);
        this.updateTime = new Timestamp(now);
    }
}
里面有一个构造函数,用于实现VO到DO的转换
爬虫类对象
package drive;
import com.alibaba.fastjson.JSON;
import lombok.Cleanup;
import lombok.NoArgsConstructor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import sql.JDBCUtils;
import java.io.*;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Crawls the driving-test question bank from mnks.jxedt.com and loads it into MySQL.
 *
 * <p>Pipeline, as executed by {@link #main(String[])}: fetch every question with a
 * thread pool, dump the raw responses to a file, parse that file into
 * {@link QuestionVO}s, convert them to {@link QuestionDO}s and batch-insert them.
 *
 * <p>Each instance doubles as one crawl task ({@link #run()}); the crawl results are
 * accumulated in static state shared by all tasks.
 *
 * @author 小书包
 * @date 2018/12/31 12:11
 */
@NoArgsConstructor
public class ParseUrl implements Runnable {
    /** Question index counter shared by all tasks; each task increments it once. */
    private static final AtomicInteger id = new AtomicInteger(0);
    /** Number of question indexes to request (size of the subject-one question bank). */
    private static final int QUESTION_NUMBER = 13696;
    /**
     * Number of crawler threads: ten per CPU core. The pool is fixed-size because the
     * work queue is unbounded, so a larger maximum size would never be used, and a
     * maximum below the core size makes the ThreadPoolExecutor constructor throw.
     */
    private static final int POOL_SIZE = Runtime.getRuntime().availableProcessors() * 10;
    /** File the raw responses are written to and later parsed from. */
    private static final String JSON_FILE = "driver.json";
    /** Explicit charset so the Chinese text survives regardless of the platform default. */
    private static final java.nio.charset.Charset FILE_CHARSET = java.nio.charset.StandardCharsets.UTF_8;
    /** Questions parsed from the file; pre-sized to avoid repeated growth. */
    private static final ArrayList<QuestionVO> arrayList = new ArrayList<>(QUESTION_NUMBER);
    /** Objects to be inserted into the database. */
    private static final ArrayList<QuestionDO> questionDOArrayList = new ArrayList<>(QUESTION_NUMBER);
    /** Raw response text, one response per line; guarded by its own monitor. */
    private static final StringBuilder builder = new StringBuilder();
    /** Counted down once per finished task so the pool is shut down only after all tasks ran. */
    private CountDownLatch countDownLatch = new CountDownLatch(QUESTION_NUMBER);

    /**
     * Creates a crawl task that reports completion on the given latch.
     *
     * @param countDownLatch latch shared with the submitting instance
     */
    private ParseUrl(CountDownLatch countDownLatch) {
        this.countDownLatch = countDownLatch;
    }

    /**
     * Fetches one question: waits a random delay below 500 ms, requests the next
     * question index and appends the response body text as one line to the shared buffer.
     * Failures are printed and otherwise ignored; the latch is counted down in every case.
     */
    @Override
    public void run() {
        try {
            TimeUnit.MILLISECONDS.sleep(new Random().nextInt(500));
            String url = "http://mnks.jxedt.com/get_question?index=" + id.incrementAndGet();
            System.out.println("当前访问的地址为:" + url);
            String json = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
                    .timeout(3000)
                    .get()
                    .body()
                    .text();
            // StringBuilder is not thread-safe; the lock also keeps each response and its
            // line terminator together.
            synchronized (builder) {
                builder.append(json).append("\r\n");
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            countDownLatch.countDown();
        }
    }

    /**
     * Runs the whole pipeline: crawl, write file, parse file, convert, insert.
     *
     * @param args unused
     * @throws SQLException           if the database insert fails
     * @throws ClassNotFoundException if the JDBC driver class cannot be loaded
     * @throws IOException            if the JSON file cannot be written or read
     */
    public static void main(String[] args) throws SQLException, ClassNotFoundException, IOException {
        ParseUrl parseUrl = new ParseUrl();
        parseUrl.runThreadPool();
        parseUrl.writeJsonToFile();
        parseUrl.jsonTransFormToObject();
        parseUrl.objectTransFormToObject();
        parseUrl.insertToDB();
    }

    /**
     * Submits one crawl task per question to a thread pool, waits for all of them to
     * finish, shuts the pool down and prints the elapsed time.
     */
    private void runThreadPool() {
        long startTime = System.currentTimeMillis();
        // Created here rather than as a field so that the task instances do not each
        // allocate an executor of their own.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(POOL_SIZE, POOL_SIZE,
                0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());
        try {
            for (int i = 0; i < QUESTION_NUMBER; i++) {
                executor.submit(new ParseUrl(countDownLatch));
            }
            // Close the pool only after every task has counted down.
            countDownLatch.await();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            executor.shutdown();
        }
        System.out.println("跑线程的时间为:" + (System.currentTimeMillis() - startTime) + "ms");
    }

    /**
     * Writes the collected response text to {@link #JSON_FILE}.
     *
     * @throws IOException if the file cannot be written
     */
    private void writeJsonToFile() throws IOException {
        String content;
        // Tasks may still be appending if the wait in runThreadPool was interrupted.
        synchronized (builder) {
            content = builder.toString();
        }
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(JSON_FILE)), FILE_CHARSET))) {
            writer.write(content);
            writer.flush();
        }
    }

    /**
     * Parses {@link #JSON_FILE} line by line into {@link QuestionVO}s.
     * Blank lines and lines starting with {@code ERROR} are skipped.
     *
     * @throws IOException if the file cannot be read
     */
    private void jsonTransFormToObject() throws IOException {
        // Reads the same file writeJsonToFile produced (the original opened
        // "driver-copy.json", which nothing in this class creates).
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(JSON_FILE)), FILE_CHARSET))) {
            String json;
            while ((json = reader.readLine()) != null) {
                if (json.trim().isEmpty() || json.startsWith("ERROR")) {
                    continue;
                }
                QuestionVO question = parseQuestion(json);
                // Never store null: the VO-to-DO conversion dereferences every element.
                if (question != null) {
                    arrayList.add(question);
                }
            }
        }
    }

    /**
     * Parses one line of JSON, retrying with progressively repaired text when parsing fails.
     *
     * @param json one raw line from the JSON file
     * @return the parsed question
     * @throws RuntimeException whatever the JSON parser throws if the last attempt fails too
     */
    private static QuestionVO parseQuestion(String json) {
        try {
            return JSON.parseObject(json, QuestionVO.class);
        } catch (Exception e) {
            // Second attempt: collapse doubled quotes, then restore the empty-string
            // values ("" before a comma or before the closing brace) that this collapsed.
            String repaired = json.replaceAll("\"\"", "\"");
            repaired = repaired.replaceAll(" \",", "\"\",");
            repaired = repaired.replaceAll(": \" }", ": \"\" }");
            try {
                return JSON.parseObject(repaired, QuestionVO.class);
            } catch (Exception e1) {
                // Last attempt: turn backslashes into forward slashes.
                repaired = repaired.replaceAll("\\\\", "/");
                return JSON.parseObject(repaired, QuestionVO.class);
            }
        }
    }

    /**
     * Converts every parsed {@link QuestionVO} into a {@link QuestionDO}.
     */
    private void objectTransFormToObject() {
        System.out.println(arrayList.size());
        for (QuestionVO questionVO : arrayList) {
            questionDOArrayList.add(new QuestionDO(questionVO));
        }
    }

    /**
     * Inserts all converted questions in one JDBC batch inside a single transaction
     * and prints the elapsed time.
     *
     * @throws SQLException           if the insert fails; the transaction is rolled back first
     * @throws ClassNotFoundException if the JDBC driver class cannot be loaded
     */
    private void insertToDB() throws SQLException, ClassNotFoundException {
        long startTime = System.currentTimeMillis();
        String sql = "insert into question(question,answerA,answerB,answerC,answerD,correctAnswer,imageUrl,bestAnswer,type,createTime,updateTime ) values(?,?,?,?,?,?,?,?,?,?,?)";
        // try-with-resources closes the statement (previously leaked) and the connection.
        try (Connection connection = JDBCUtils.getConnection();
             PreparedStatement statement = connection.prepareStatement(sql)) {
            connection.setAutoCommit(false);
            try {
                for (QuestionDO questionDO : questionDOArrayList) {
                    int index = 1;
                    statement.setString(index++, questionDO.getQuestion());
                    statement.setString(index++, questionDO.getAnswerA());
                    statement.setString(index++, questionDO.getAnswerB());
                    statement.setString(index++, questionDO.getAnswerC());
                    statement.setString(index++, questionDO.getAnswerD());
                    statement.setInt(index++, questionDO.getCorrectAnswer());
                    statement.setString(index++, questionDO.getImageUrl());
                    statement.setString(index++, questionDO.getBestAnswer());
                    statement.setInt(index++, questionDO.getType());
                    statement.setTimestamp(index++, questionDO.getCreateTime());
                    statement.setTimestamp(index++, questionDO.getUpdateTime());
                    statement.addBatch();
                }
                statement.executeBatch();
                connection.commit();
            } catch (SQLException e) {
                // Undo the partial batch explicitly; keep the original failure as the cause.
                try {
                    connection.rollback();
                } catch (SQLException rollbackFailure) {
                    e.addSuppressed(rollbackFailure);
                }
                throw e;
            }
        }
        System.out.println("插入数据库的时间为:" + (System.currentTimeMillis() - startTime) + "ms");
    }
}
数据库连接工具类:JDBCUtils
package sql;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
/**
 * Helper for opening JDBC connections to the local MySQL {@code driver} database.
 *
 * @author 小书包
 * @date 2018/12/22 21:43
 */
public class JDBCUtils {
    /**
     * Loads the MySQL JDBC driver class and opens a new connection to
     * {@code localhost:3306/driver} with the credentials hard-coded below.
     * The caller is responsible for closing the returned connection.
     *
     * @return a newly opened connection
     * @throws SQLException           if the connection cannot be established
     * @throws ClassNotFoundException if the driver class is not on the classpath
     */
    public static Connection getConnection() throws SQLException, ClassNotFoundException {
        // Load the driver first so a missing driver surfaces as ClassNotFoundException.
        Class.forName("com.mysql.jdbc.Driver");

        final String jdbcUrl = "jdbc:mysql://localhost:3306/driver";
        final String user = "root";
        final String secret = "xxx";
        return DriverManager.getConnection(jdbcUrl, user, secret);
    }
}
爬取的结果共有12000多条数据