在京东首页搜索“手机”,可以看到大量手机商品信息。接下来用 Java 爬取前 100 页的商品信息,并录入到数据库。
使用到的技术:HttpClient,Jsoup,多线程,阻塞队列
1.创建数据库,创建手机信息表
-- Rebuilds the `phone` table that the crawler inserts into.
-- Any existing `phone` table (and its rows) is dropped first.
DROP TABLE IF EXISTS `phone`;
-- One row per scraped product. No primary key or unique index is declared,
-- so rows with the same `id` are not rejected.
CREATE TABLE `phone` (
-- Product id; the crawler fills it from the item's `data-sku` attribute.
`id` bigint(11) DEFAULT NULL,
-- Product title text.
`name` varchar(255) DEFAULT NULL,
-- Listed price, stored as a floating-point number.
`price` double DEFAULT NULL,
-- Name of the shop selling the product.
`shop` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2.创建maven工程,导入依赖
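<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.32</version>
    </dependency>
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.7.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid</artifactId>
        <version>1.0.9</version>
    </dependency>
</dependencies>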
3.创建jdbcConfig.properties文件,封装了DataSource数据源的JDBCUtils类和商品实体类Phone
3.1 在工程目录下新建jdbcConfig.properties配置文件
jdbc.driver=com.mysql.jdbc.Driver
jdbc.url=jdbc:mysql://localhost:3306/db1
jdbc.username=root
jdbc.password=123456
3.2 JDBCUtils.java
package cn.swun.utils;
import com.alibaba.druid.pool.DruidDataSource;
import javax.sql.DataSource;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
 * Owns the application's single Druid connection pool.
 *
 * <p>The pool is configured exactly once, when this class is loaded, from the
 * {@code jdbc.driver}, {@code jdbc.url}, {@code jdbc.username} and
 * {@code jdbc.password} keys of {@code jdbcConfig.properties}. If that file is
 * missing or unreadable the JVM is terminated with a non-zero exit status.
 */
public class JDBCUtils {
    /** Configuration file name; a relative path, so it is resolved against the JVM's working directory. */
    private static final String CONFIG_FILE = "jdbcConfig.properties";

    /** The one shared pool handed out by {@link #getDataSource()}. */
    private static final DruidDataSource dataSource = new DruidDataSource();

    static {
        Properties properties = new Properties();
        // try-with-resources closes the stream whether or not load() succeeds.
        try (InputStream is = new FileInputStream(CONFIG_FILE)) {
            properties.load(is);
        } catch (FileNotFoundException e) {
            System.out.println("配置文件不存在");
            // Non-zero status: this is a failure, not a normal shutdown.
            System.exit(1);
        } catch (IOException e) {
            System.out.println("配置文件有误");
            System.exit(1);
        }
        // Configure here rather than inside getDataSource(): that method is
        // called from several worker threads, and re-applying the settings to
        // a pool that may already be in use is redundant at best.
        // NOTE(review): some Druid versions reject these setters once the
        // pool has been initialised -- confirm for the version in use.
        dataSource.setDriverClassName(properties.getProperty("jdbc.driver"));
        dataSource.setUrl(properties.getProperty("jdbc.url"));
        dataSource.setUsername(properties.getProperty("jdbc.username"));
        dataSource.setPassword(properties.getProperty("jdbc.password"));
    }

    /**
     * Returns the shared, already configured connection pool.
     *
     * @return the same {@link DataSource} instance on every call
     */
    public static DataSource getDataSource() {
        return dataSource;
    }
}
3.3 Phone.java
package cn.swun.domain;
/**
 * Mutable data holder for one product: id, name, price and shop.
 *
 * <p>The id and price setters accept the raw text form and convert it to a
 * number, so they throw {@link NumberFormatException} for text that is not a
 * valid number.
 */
public class Phone {
    private Long id;
    private String name;
    private Double price;
    private String shop;

    /** @return the numeric product id, or {@code null} if none was set */
    public Long getId() {
        return this.id;
    }

    /**
     * Stores the product id.
     *
     * @param id decimal text of the id
     * @throws NumberFormatException if {@code id} is not a valid long
     */
    public void setId(String id) {
        this.id = Long.valueOf(id);
    }

    /** @return the product name, or {@code null} if none was set */
    public String getName() {
        return this.name;
    }

    /** @param name the product name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the price, or {@code null} if none was set */
    public Double getPrice() {
        return this.price;
    }

    /**
     * Stores the price.
     *
     * @param price decimal text of the price
     * @throws NumberFormatException if {@code price} is not a valid double
     */
    public void setPrice(String price) {
        this.price = Double.valueOf(price);
    }

    /** @return the shop name, or {@code null} if none was set */
    public String getShop() {
        return this.shop;
    }

    /** @param shop the shop name */
    public void setShop(String shop) {
        this.shop = shop;
    }

    /** Renders all four fields as {@code Phone{id=..., name='...', price=..., shop='...'}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Phone{");
        text.append("id=").append(id);
        text.append(", name='").append(name).append('\'');
        text.append(", price=").append(price);
        text.append(", shop='").append(shop).append('\'');
        return text.append('}').toString();
    }
}
4.京东爬虫核心类JdSplider.java
package cn.swun.splider;
import cn.swun.domain.Phone;
import cn.swun.utils.JDBCUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.SQLException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawls JD search result pages for phones and stores every product found in
 * the {@code phone} table.
 *
 * <p>The work is a three-stage pipeline connected by two bounded queues:
 * the calling thread downloads pages and queues each product {@code li};
 * parser threads turn each {@code li} into a {@link Phone}; writer threads
 * insert each {@link Phone} into the database. {@link #start()} returns only
 * after all three stages have finished.
 */
public class JdSplider {
    /** Number of parser threads and, separately, of database writer threads. */
    private static final int WORKER_COUNT = 10;

    /** Last result page that is crawled. */
    private static final int LAST_PAGE = 100;

    private static final String INSERT_SQL = "insert into phone(id,name,price,shop) values(?,?,?,?)";

    /** First result page. */
    private static final String INDEX_URL = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=5b2751339d874f89b1a53a0b7eb6a55c";

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36";

    /** One client for all requests, instead of a new, never-closed client per request. */
    private static final CloseableHttpClient HTTP_CLIENT = HttpClients.createDefault();

    /** End-of-input marker for the parser threads; recognised by identity, never parsed. */
    private static final Element STOP_ELEMENT = new Document("");

    /** End-of-input marker for the writer threads; recognised by identity, never stored. */
    private static final Phone STOP_PHONE = new Phone();

    /** Product {@code li} elements waiting to be parsed. */
    private final BlockingQueue<Element> queueLi = new ArrayBlockingQueue<>(100);

    /** Parsed phones waiting to be written to the database. */
    private final BlockingQueue<Phone> queuePhone = new ArrayBlockingQueue<>(100);

    /**
     * Runs the whole crawl and blocks until every product that was found has
     * been handed to the database.
     *
     * @throws IOException if a page cannot be downloaded or read, or if the
     *                     calling thread is interrupted while queueing items
     */
    public void start() throws IOException {
        ExecutorService pool = Executors.newCachedThreadPool();
        for (int i = 0; i < WORKER_COUNT; i++) {
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    savePhones();
                }
            });
        }
        for (int i = 0; i < WORKER_COUNT; i++) {
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    parseItems();
                }
            });
        }
        try {
            parseIndex(sendGet(INDEX_URL), 1);
        } finally {
            // Also runs when the crawl fails, so worker threads never outlive start().
            finishWorkers(pool);
        }
    }

    /** Writer loop: inserts queued phones until the stop marker arrives. */
    private void savePhones() {
        QueryRunner queryRunner = new QueryRunner(JDBCUtils.getDataSource());
        try {
            while (true) {
                Phone phone = queuePhone.take();
                if (phone == STOP_PHONE) {
                    return;
                }
                try {
                    queryRunner.update(INSERT_SQL, phone.getId(), phone.getName(), phone.getPrice(), phone.getShop());
                } catch (SQLException e) {
                    // Best effort: one failed insert must not stop the crawl.
                    e.printStackTrace();
                } catch (RuntimeException e) {
                    // Keep this writer alive so every stop marker is still consumed.
                    e.printStackTrace();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Parser loop: converts queued items to phones until the stop marker arrives. */
    private void parseItems() {
        try {
            while (true) {
                Element li = queueLi.take();
                if (li == STOP_ELEMENT) {
                    break;
                }
                Phone phone = parseLi(li);
                if (phone != null) {
                    // put() waits for room; offer() would drop the phone when the queue is full.
                    queuePhone.put(phone);
                }
            }
            // Each parser releases exactly one writer. Parser and writer counts
            // are equal, and the last marker is queued after every real phone,
            // so all writers stop and no phone is left behind.
            queuePhone.put(STOP_PHONE);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Sends one stop marker per parser, then waits until every worker has finished. */
    private void finishWorkers(ExecutorService pool) {
        pool.shutdown();
        try {
            for (int i = 0; i < WORKER_COUNT; i++) {
                queueLi.put(STOP_ELEMENT);
            }
            pool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Sends a GET request with a browser User-Agent header.
     *
     * @param url the address to fetch
     * @return the response; the caller must close it so its connection is released
     * @throws IOException if the request fails
     */
    public CloseableHttpResponse sendGet(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", USER_AGENT);
        return HTTP_CLIENT.execute(httpGet);
    }

    /**
     * Queues the product items of the given page, then downloads and queues
     * every following page up to page {@value #LAST_PAGE}.
     *
     * <p>Each response, including {@code indexRes}, is closed once it has been
     * read. Queueing waits while the item queue is full, so the parser threads
     * must be running (as they are when called from {@link #start()}).
     *
     * @param indexRes response holding the HTML of page {@code page}
     * @param page     1-based number of the page in {@code indexRes}
     * @throws IOException if a page cannot be downloaded or read, or if the
     *                     calling thread is interrupted while queueing items
     */
    public void parseIndex(CloseableHttpResponse indexRes, int page) throws IOException {
        CloseableHttpResponse response = indexRes;
        int current = page;
        // A loop instead of recursion: no 100-deep call stack holding every page.
        while (response != null) {
            System.out.println("---第" + current + "页抓取完毕---");
            try {
                queueItems(response);
            } finally {
                response.close();
            }
            current++;
            response = current <= LAST_PAGE ? sendGet(pageUrl(current)) : null;
        }
    }

    /** Reads the page HTML and queues every {@code li} whose class is exactly {@code gl-item}. */
    private void queueItems(CloseableHttpResponse response) throws IOException {
        String html = EntityUtils.toString(response.getEntity(), "UTF-8");
        Elements lis = Jsoup.parse(html).select("li[class=gl-item]");
        try {
            for (Element li : lis) {
                // put() waits for room; offer() would drop the item when the queue is full.
                queueLi.put(li);
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while queueing product items", e);
        }
    }

    /** Builds the URL of a result page; its {@code page} parameter is {@code 2 * page - 1}. */
    private static String pageUrl(int page) {
        int index = 2 * page - 1;
        return "https://search.jd.com/Search?keyword=手机&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=手机&cid2=653&cid3=655&page=" + index + "&click=0";
    }

    /**
     * Extracts one product from its {@code li} element.
     *
     * @param li the product element; may be {@code null}
     * @return the parsed phone, or {@code null} when the element is missing,
     *         has no name or price node, or its id or price is not numeric
     */
    public Phone parseLi(Element li) {
        if (li == null) {
            return null;
        }
        Elements names = li.select("div.p-name em");
        Elements prices = li.select("div.p-price i");
        if (names.isEmpty() || prices.isEmpty()) {
            return null;
        }
        Phone phone = new Phone();
        try {
            phone.setId(li.attr("data-sku"));
            phone.setPrice(prices.get(0).text());
        } catch (NumberFormatException e) {
            // Malformed item (non-numeric id or price): skipped on purpose.
            return null;
        }
        phone.setName(names.get(0).text());
        phone.setShop(li.select("div.p-shop a").attr("title"));
        return phone;
    }
}
5.程序入口类JdSpliderApp.java
package cn.swun.app;
import cn.swun.splider.JdSplider;
import java.io.IOException;
public class JdSpliderApp {
public static void main(String[] args) throws IOException, ClassNotFoundException {
long start = System.currentTimeMillis();
JdSplider jdSplider = new JdSplider();
jdSplider.start();
long end = System.currentTimeMillis();
System.out.println("100页抓取完毕并保存至数据库用时:" + ((double)(end-start))/1000.00 + "s");
}
}
项目目录结构如下
导出 jar 包后,打开 cmd 窗口执行 java -jar jd_splider.jar,程序运行结束后数据即保存到数据库。