第一步
我们先来分析一下我们本次需要的参数内容
入口如下
https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&pvid=0b09350ac3df4f24886bb7a35d3b69ff
位置分析 |
||
id="J_goodsList" |
所有商品都在这个容器中 |
|
data-sku="5025518" |
商品的编号 |
|
class="p-price" |
商品的价格 |
|
class="p-name p-name-type-2" |
商品名称 |
|
class="err-product" src |
图片位置所在的img |
接下来我们还需要获取一下总页数
入口如下
https://search.jd.com/Search?keyword=笔记本电脑&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=笔记本电脑&page=3&s=57&click=0
参数解析 |
||
keyword |
笔记本电脑 |
关键字 |
enc |
utf-8 |
编码格式 |
wq |
笔记本电脑 |
关键字 |
qrst |
1 |
不知道是个什么鬼,没有也行 |
rt |
1 |
|
stop |
1 |
|
vt |
2 |
猜测可能和分页步长有关(未确认) |
page |
3 |
page 的取值都是奇数(1、3、5……),具体原因尚未确认;推测是京东把每个展示页拆成两次加载,后半页由异步请求使用偶数页码 |
第二步
直接上代码
4.0.0
com.jianqiao.clawer
clawer-system
pom
1.0-SNAPSHOT
clawer-jd-product
clawer-system Maven Webapp
http://maven.apache.org
UTF-8
4.12
4.1.3.RELEASE
3.4.1
1.3.1
1.2.15
5.1.32
1.6.4
2.4.2
1.0.9
0.8.0.RELEASE
1.2
2.5
2.0
2.5
3.3.2
1.3.2
3.3
5.0.3
2.3.4
0.9.1
1.3.1
1.9
2.7.2
4.10.3
2.5.3
3.4.7
0.1
5.12.0
2.3.23
2.2.2
1.1.1
20160212
1.25
1.4.0.RELEASE
4.3.5
3.4.1
1.10.3
junit
junit
${junit.version}
test
org.springframework
spring-webmvc
${spring.version}
org.springframework
spring-jdbc
${spring.version}
org.springframework
spring-aspects
${spring.version}
org.springframework
spring-context-support
${spring.version}
com.github.abel533
mapper
${mapper.version}
org.mybatis
mybatis
${mybatis.version}
org.mybatis
mybatis-spring
${mybatis.spring.version}
com.github.pagehelper
pagehelper
${pagehelper.version}
com.github.jsqlparser
jsqlparser
${jsqlparser.version}
mysql
mysql-connector-java
${mysql.version}
org.slf4j
slf4j-log4j12
${slf4j.version}
com.fasterxml.jackson.core
jackson-databind
${jackson.version}
com.jolbox
bonecp-spring
${jolbox.version}
jstl
jstl
${jstl.version}
javax.servlet
servlet-api
${servlet-api.version}
provided
javax.servlet
jsp-api
${jsp-api.version}
provided
joda-time
joda-time
${joda-time.version}
org.apache.commons
commons-lang3
${commons-lang3.version}
org.apache.commons
commons-io
${commons-io.version}
commons-fileupload
commons-fileupload
${commons-fileupload.version}
com.alibaba
dubbo
${dubbo.version}
org.springframework
spring
org.jboss.netty
netty
org.apache.zookeeper
zookeeper
${zookeeper.version}
com.github.sgroschupf
zkclient
${zkclient.version}
commons-codec
commons-codec
${commons-codec.version}
org.quartz-scheduler
quartz
${quartz.version}
org.apache.activemq
activemq-all
${activemq.version}
org.springframework
spring-jms
${spring.version}
org.springframework.amqp
spring-rabbit
${spring-rabbit.version}
com.rabbitmq
amqp-client
${rabbitmq.version}
org.freemarker
freemarker
${freemarker.version}
redis.clients
jedis
${jedis.version}
org.apache.solr
solr-solrj
${solrj.version}
com.baidu
ueditor
${uediter.version}
org.json
json
${json.version}
com.alibaba.fastdfs
fastdfs_client
${fastdfs_client.version}
org.apache.httpcomponents
httpclient
${httpclient.version}
org.jsoup
jsoup
${jsoup.version}
${project.artifactId}
org.apache.maven.plugins
maven-resources-plugin
2.7
UTF-8
org.apache.maven.plugins
maven-compiler-plugin
3.2
1.7
UTF-8
org.apache.tomcat.maven
tomcat7-maven-plugin
2.2
src/main/java
**/*.properties
**/*.xml
**/*.cnf
false
src/main/resources
**/*.properties
**/*.xml
**/*.cnf
false
2.创建一个子模块 clawer-jd-product
clawer-system
com.jianqiao.clawer
1.0-SNAPSHOT
4.0.0
clawer-jd-product
war
clawer-jd-product Maven Webapp
junit
junit
org.apache.httpcomponents
httpclient
org.apache.commons
commons-lang3
org.apache.commons
commons-io
commons-fileupload
commons-fileupload
com.fasterxml.jackson.core
jackson-databind
org.springframework
spring-webmvc
org.springframework
spring-jdbc
org.springframework
spring-aspects
org.springframework
spring-context-support
com.github.abel533
mapper
org.mybatis
mybatis
org.mybatis
mybatis-spring
com.github.jsqlparser
jsqlparser
com.github.pagehelper
pagehelper
com.github.jsqlparser
jsqlparser
mysql
mysql-connector-java
com.fasterxml.jackson.core
jackson-databind
com.jolbox
bonecp-spring
jstl
jstl
javax.servlet
servlet-api
provided
javax.servlet
jsp-api
provided
org.slf4j
slf4j-log4j12
org.slf4j
slf4j-log4j12
org.jsoup
jsoup
clawer-jd-product
org.apache.maven.plugins
maven-resources-plugin
2.7
UTF-8
org.apache.maven.plugins
maven-compiler-plugin
3.2
1.7
UTF-8
org.apache.tomcat.maven
tomcat7-maven-plugin
8081
/
src/main/java
**/*.properties
**/*.xml
**/*.cnf
false
src/main/resources
**/*.properties
**/*.xml
**/*.cnf
false
package com.jianqiao.util;
import com.jianqiao.pojo.HttpResult;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Helper for sending HTTP requests through the injected Apache {@link CloseableHttpClient}.
 *
 * <p>The GET helpers return the response body only when the status code is 200 and
 * {@code null} otherwise; the POST/PUT/DELETE helpers return an {@link HttpResult} that
 * carries the status code and, when the response has an entity, its body.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 19:24 - 27 - 10 -2017
 */
@Component
public class HttpClientUtilImpl {

    /** Charset used for form bodies and for decoding response bodies. */
    private static final String CHARSET = "UTF-8";

    @Autowired
    private CloseableHttpClient httpClient;

    @Autowired
    private RequestConfig config;

    /**
     * GET request with query parameters.
     *
     * <p>Kept for existing callers. It now delegates to {@link #doGet(String, Map)} so that
     * parameter names and values are URL-encoded by {@link URIBuilder} instead of being
     * concatenated into the URL as raw text.
     *
     * @param url    request address
     * @param params query parameters; may be {@code null} or empty
     * @return the response body when the status is 200, otherwise {@code null}
     * @throws Exception if the URL is malformed or the request fails
     */
    public String doGet1(String url, Map<String, ?> params) throws Exception {
        return doGet(url, params);
    }

    /**
     * GET request with query parameters.
     *
     * @param url    request address
     * @param params query parameters; may be {@code null} or empty. A {@code null} value is
     *               sent as an empty string.
     * @return the response body when the status is 200, otherwise {@code null}
     * @throws Exception if the URL is malformed or the request fails
     */
    public String doGet(String url, Map<String, ?> params) throws Exception {
        URIBuilder uriBuilder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, ?> entry : params.entrySet()) {
                uriBuilder.setParameter(entry.getKey(), asText(entry.getValue()));
            }
        }
        URI uri = uriBuilder.build();

        HttpGet httpGet = new HttpGet(uri);
        // Apply the same request configuration as the POST/PUT/DELETE helpers.
        httpGet.setConfig(config);

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), CHARSET);
            }
            return null;
        } finally {
            if (response != null) {
                response.close();
            }
        }
    }

    /**
     * POST request whose parameters are sent as a URL-encoded form body.
     *
     * @param url    request address
     * @param params form parameters; may be {@code null}
     * @return status code and response body
     * @throws IOException             if the request fails
     * @throws ClientProtocolException on an HTTP protocol error
     */
    public HttpResult doPost(String url, Map<String, ?> params) throws IOException, ClientProtocolException {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(config);
        httpPost.setEntity(new UrlEncodedFormEntity(getNameValuePairs(params), CHARSET));
        return executePostOrPutOrDeleteMethod(httpPost);
    }

    /**
     * POST request with a JSON body.
     *
     * @param url  request address
     * @param json JSON document to send; when {@code null} no body is attached
     * @return status code and response body
     * @throws IOException if the request fails
     */
    public HttpResult doPostJson(String url, String json) throws IOException {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(this.config);
        if (json != null) {
            // Declare the entity as application/json so the server parses it accordingly.
            httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
        }
        return executePostOrPutOrDeleteMethod(httpPost);
    }

    /**
     * PUT request whose parameters are sent as a URL-encoded form body.
     *
     * @param url    request address
     * @param params form parameters; may be {@code null}
     * @return status code and response body
     * @throws IOException if the request fails
     */
    public HttpResult doPut(String url, Map<String, ?> params) throws IOException {
        HttpPut httpPut = new HttpPut(url);
        httpPut.setConfig(config);
        httpPut.setEntity(new UrlEncodedFormEntity(getNameValuePairs(params), CHARSET));
        return executePostOrPutOrDeleteMethod(httpPut);
    }

    /**
     * DELETE tunnelled through POST: the form carries {@code _method=DELETE} to name the
     * real method.
     *
     * <p>The caller's map is copied, so it is never modified and may be {@code null}.
     *
     * @param url   request address
     * @param param form parameters; may be {@code null}
     * @return status code and response body
     * @throws Exception if the request fails
     */
    public HttpResult doDelete(String url, Map<String, ?> param) throws Exception {
        Map<String, Object> form = new java.util.LinkedHashMap<String, Object>();
        if (param != null) {
            form.putAll(param);
        }
        form.put("_method", "DELETE");
        return this.doPost(url, form);
    }

    /**
     * GET request without parameters.
     *
     * @param url request address
     * @return the response body when the status is 200, otherwise {@code null}
     * @throws Exception if the URL is malformed or the request fails
     */
    public String doGet(String url) throws Exception {
        return doGet(url, null);
    }

    /**
     * POST request without parameters.
     *
     * @param url request address
     * @return status code and response body
     * @throws Exception if the request fails
     */
    public HttpResult doPost(String url) throws Exception {
        return doPost(url, null);
    }

    /**
     * PUT request without parameters.
     *
     * @param url request address
     * @return status code and response body
     * @throws IOException if the request fails
     */
    public HttpResult doPut(String url) throws IOException {
        return doPut(url, null);
    }

    /**
     * Real HTTP DELETE request.
     *
     * @param url request address
     * @return status code and response body
     * @throws Exception if the request fails
     */
    public HttpResult doDelete(String url) throws Exception {
        HttpDelete httpDelete = new HttpDelete(url);
        httpDelete.setConfig(config);
        return executePostOrPutOrDeleteMethod(httpDelete);
    }

    /**
     * Executes a POST, PUT or DELETE request and wraps the outcome.
     *
     * @param postOrPutOrDelete the prepared request
     * @return the status code plus the body, or a {@code null} body when the response has no entity
     * @throws IOException if the request fails
     */
    private HttpResult executePostOrPutOrDeleteMethod(HttpUriRequest postOrPutOrDelete) throws IOException {
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(postOrPutOrDelete);
            int status = response.getStatusLine().getStatusCode();
            String body = null;
            if (response.getEntity() != null) {
                body = EntityUtils.toString(response.getEntity(), CHARSET);
            }
            return new HttpResult(status, body);
        } finally {
            if (response != null) {
                response.close();
            }
        }
    }

    /**
     * Converts a parameter map into the name/value list expected by {@link UrlEncodedFormEntity}.
     *
     * @param params parameters; may be {@code null}
     * @return a list with one pair per map entry; empty when {@code params} is {@code null}
     */
    private List<NameValuePair> getNameValuePairs(Map<String, ?> params) {
        List<NameValuePair> pairs = new ArrayList<NameValuePair>();
        if (params != null) {
            for (Map.Entry<String, ?> entry : params.entrySet()) {
                pairs.add(new BasicNameValuePair(entry.getKey(), asText(entry.getValue())));
            }
        }
        return pairs;
    }

    /** Renders a parameter value as text, mapping {@code null} to the empty string. */
    private static String asText(Object value) {
        return value == null ? "" : value.toString();
    }
}
package com.jianqiao.util;
import org.apache.http.conn.HttpClientConnectionManager;
/**
 * Background thread that periodically asks the connection manager to close expired
 * connections. The thread starts itself from the constructor and runs until
 * {@link #shutdown()} is called or the thread is interrupted.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 19:53 - 27 - 10 -2017
 */
public class IdleConnectionEvictor extends Thread {

    /** Pause between two sweeps, in milliseconds. */
    private static final long SWEEP_INTERVAL_MILLIS = 5000L;

    /** Connection manager whose expired connections are closed. */
    private final HttpClientConnectionManager httpClientConnectionManager;

    /** Set to {@code true} to make {@link #run()} leave its loop. */
    private volatile boolean shutdown;

    /**
     * Creates the evictor and immediately starts the thread.
     *
     * @param httpClientConnectionManager the connection manager to sweep
     */
    public IdleConnectionEvictor(HttpClientConnectionManager httpClientConnectionManager) {
        this.httpClientConnectionManager = httpClientConnectionManager;
        this.start();
    }

    @Override
    public void run() {
        try {
            while (!shutdown) {
                synchronized (this) {
                    // Re-check under the monitor so a shutdown() issued just before the wait is not missed.
                    if (shutdown) {
                        break;
                    }
                    wait(SWEEP_INTERVAL_MILLIS);
                    if (shutdown) {
                        break;
                    }
                    httpClientConnectionManager.closeExpiredConnections();
                }
            }
        } catch (InterruptedException e) {
            // An interrupt is a request to stop: keep the flag set and let the thread end.
            Thread.currentThread().interrupt();
        }
    }

    /** Asks the thread to stop and wakes it up if it is currently waiting. */
    public void shutdown() {
        synchronized (this) {
            shutdown = true;
            notifyAll();
        }
    }
}
4.1 准备 vo
package com.jianqiao.vo;
/**
 * Value object holding the search request parameters; the keyword is the main one.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 6:41 - 27 - 11 -2017
 */
public class KeyWord {

    /** Value of the {@code keyword} parameter. */
    private String keyword;

    /** Value of the {@code enc} parameter. */
    private String enc;

    /** Value of the {@code wq} parameter. */
    private String wq;

    /** Value of the {@code page} parameter, kept as text. */
    private String page;

    // ---- getters ----

    public String getKeyword() {
        return this.keyword;
    }

    public String getEnc() {
        return this.enc;
    }

    public String getWq() {
        return this.wq;
    }

    public String getPage() {
        return this.page;
    }

    // ---- setters ----

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public void setEnc(String enc) {
        this.enc = enc;
    }

    public void setWq(String wc) {
        this.wq = wc;
    }

    public void setPage(String page) {
        this.page = page;
    }
}
package com.jianqiao.pojo;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
/**
 * Serializable bean describing one product: id, title, selling point, price, quantity,
 * image location, category id and a status flag that defaults to {@code true}.
 */
public class Product implements Serializable {

    private Long id;
    private String title;
    private String sellpoint;
    private String price;
    private Integer num;
    private String image;
    private Long cid;
    private Boolean status = true;

    // NOTE(review): the original carried a comment here saying "ignore this property when
    // mapping to the database table"; whatever it annotated is not visible in this file.

    // ---- getters ----

    public Long getId() {
        return this.id;
    }

    public String getTitle() {
        return this.title;
    }

    public String getSellpoint() {
        return this.sellpoint;
    }

    public String getPrice() {
        return this.price;
    }

    public Integer getNum() {
        return this.num;
    }

    public String getImage() {
        return this.image;
    }

    public Long getCid() {
        return this.cid;
    }

    public Boolean getStatus() {
        return this.status;
    }

    // ---- setters ----

    public void setId(Long id) {
        this.id = id;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public void setSellpoint(String sellpoint) {
        this.sellpoint = sellpoint;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public void setNum(Integer num) {
        this.num = num;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public void setCid(Long cid) {
        this.cid = cid;
    }

    public void setStatus(Boolean status) {
        this.status = status;
    }

    /** Lists every field in declaration order, in the form {@code Product [id=..., ...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Product [id=");
        text.append(id);
        text.append(", title=").append(title);
        text.append(", sellPoint=").append(sellpoint);
        text.append(", price=").append(price);
        text.append(", num=").append(num);
        text.append(", image=").append(image);
        text.append(", cid=").append(cid);
        text.append(", status=").append(status);
        text.append("]");
        return text.toString();
    }
}
package com.jianqiao.pojo;
/**
 * Outcome of an HTTP call: the status code together with the response body.
 */
public class HttpResult {

    /** HTTP status code. */
    private Integer code;

    /** Response body text. */
    private String body;

    /** Creates an empty result; both fields start as {@code null}. */
    public HttpResult() {
    }

    /**
     * Creates a result with both fields set.
     *
     * @param code HTTP status code
     * @param body response body
     */
    public HttpResult(Integer code, String body) {
        this.code = code;
        this.body = body;
    }

    public Integer getCode() {
        return this.code;
    }

    public String getBody() {
        return this.body;
    }

    public void setCode(Integer code) {
        this.code = code;
    }

    public void setBody(String body) {
        this.body = body;
    }
}
5.0 准备和数据库相关的内容
package com.jianqiao.mapper;
import com.github.abel533.mapper.Mapper;
import com.jianqiao.pojo.Product;
/**
 * Mapper for {@link Product}.
 *
 * <p>The type argument is required: with a raw {@code Mapper} the inherited methods are not
 * typed to {@link Product}.
 */
public interface ProductMapper extends Mapper<Product> {
}
6.0 服务层准备
package com.jianqiao.service;
import com.github.abel533.entity.Example;
import com.github.abel533.mapper.Mapper;
import com.github.pagehelper.PageHelper;
import org.springframework.beans.factory.annotation.Autowired;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.util.List;
public class BaseServiceImpl{
@Autowired
protected Mapper mapper;
Class clazz;
public BaseServiceImpl() {
Type type = this.getClass().getGenericSuperclass();
ParameterizedType ptype = (ParameterizedType)type;
this.clazz =(Class)ptype.getActualTypeArguments()[0];
}
public T queryById(Long id) {
return this.mapper.selectByPrimaryKey(id);
}
public List queryAll() {
//我们如果在缓存中查找不导数据,这个时候我们才需要去查询数据库
return this.mapper.select(null);
}
public List queryByWhere(T t) {
return this.mapper.select(t);
}
public Integer queryByWhereCount(T t) {
return this.mapper.selectCount(t);
}
public List queryByPage(Integer page, Integer rows) {
//第一个参数:当前页,第二参数:每页显示记录数
PageHelper.startPage(page, rows);
List list = this.mapper.select(null);
return list;
}
public T queryOne(T t) {
return this.mapper.selectOne(t);
}
public void save(T t) {
this.mapper.insert(t);
}
public void saveSelective(T t) {
this.mapper.insertSelective(t);
}
public void update(T t) {
this.mapper.updateByPrimaryKey(t);
}
public void updateSelective(T t) {
this.mapper.updateByPrimaryKeySelective(t);
}
public void deleteById(Long id) {
this.mapper.deleteByPrimaryKey(id);
}
public void deleteByIds(List
package com.jianqiao.service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jianqiao.mapper.ProductMapper;
import com.jianqiao.pojo.Product;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Service for {@link Product}; all operations are inherited from {@link BaseServiceImpl}.
 *
 * <p>The type argument must be present: the {@code BaseServiceImpl} constructor casts
 * {@code getGenericSuperclass()} to {@code ParameterizedType}, which throws a
 * {@code ClassCastException} when the superclass is declared raw.
 */
@Service
public class ProductServiceImpl extends BaseServiceImpl<Product> {
}
package com.jianqiao.service;
import com.jianqiao.constant.AppConstants;
import com.jianqiao.pojo.Product;
import com.jianqiao.util.HttpClientUtilImpl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Fetches JD search result pages and extracts the products they list.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 6:59 - 27 - 11 -2017
 */
@Service
public class ClawerService {

    private final Logger logger = LoggerFactory.getLogger(ClawerService.class);

    /** HTTP helper used to download the pages. */
    @Autowired
    private HttpClientUtilImpl httpClientUtil;

    /**
     * Reads the total number of result pages from the element with id {@code J_topPage}.
     *
     * @param url search URL to download
     * @return the second number found in the pager text, or 0 when the page could not be
     *         downloaded or no such number is present
     */
    public Integer getTotalPage(String url) {
        try {
            String html = httpClientUtil.doGet(url);
            if (html == null) {
                logger.warn("No content returned for {}", url);
                return 0;
            }
            Document document = Jsoup.parse(html);
            String pagerText = document.select("#J_topPage").text();
            // Assumes the pager text has the form "current/total" — TODO confirm against the live page.
            // Empty tokens (produced when the text starts with a non-digit) are skipped so the
            // second *number* is taken rather than the second array slot.
            int numbersSeen = 0;
            for (String token : pagerText.split("\\D+")) {
                if (token.isEmpty()) {
                    continue;
                }
                numbersSeen++;
                if (numbersSeen == 2) {
                    System.out.println("总页数: " + token);
                    return Integer.parseInt(token);
                }
            }
            logger.warn("No total page count found in pager text '{}'", pagerText);
        } catch (Exception e) {
            logger.error("Failed to read the total page count from " + url, e);
        }
        return 0;
    }

    /**
     * Downloads one result page and returns the products found on it.
     *
     * @param url    base search URL
     * @param params query parameters for this page
     * @return products keyed by their {@code data-sku} value; empty when the page could not be
     *         downloaded or parsed
     */
    public Map<String, Product> findProductByPage(final String url, final Map<String, ?> params) {
        Map<String, Product> products = new ConcurrentHashMap<String, Product>();
        try {
            String html = httpClientUtil.doGet1(url, params);
            if (html == null) {
                // doGet1 returns null for any non-200 response; nothing to parse.
                logger.warn("No content returned for {} with params {}", url, params);
                return products;
            }
            // Strip line breaks, tabs, backspaces, '~' and form feeds before parsing.
            html = html.replaceAll("\r\n|\r|\n|\t|\b|~|\f", "");
            getProductList(products, html);
        } catch (Exception e) {
            logger.error("Failed to fetch products from " + url + " with params " + params, e);
        }
        return products;
    }

    /**
     * Extracts every {@code .gl-item} product from the HTML and adds it to {@code maps}.
     * Items without a numeric {@code data-sku} are skipped so one bad entry does not discard
     * the rest of the page.
     *
     * @param maps      target map, keyed by {@code data-sku}
     * @param doGetHtml page HTML; ignored when {@code null}
     */
    private void getProductList(Map<String, Product> maps, String doGetHtml) {
        if (doGetHtml == null) {
            return;
        }
        Document rootDocument = Jsoup.parse(doGetHtml);
        Elements items = rootDocument.select("ul[class=gl-warp clearfix]").select(".gl-item");
        for (Element item : items) {
            String sku = item.attr("data-sku");
            Long id;
            try {
                id = Long.parseLong(sku);
            } catch (NumberFormatException e) {
                logger.warn("Skipping item with non-numeric data-sku '{}'", sku);
                continue;
            }
            Product product = new Product();
            product.setId(id);
            product.setTitle(item.select(".p-name").text());
            product.setImage(extractImageUrl(item));
            product.setPrice(item.select(".p-price strong").select("i").text());
            maps.put(sku, product);
        }
    }

    /**
     * Finds the image address of one product item.
     *
     * <p>Falls back to the lazy-loading attributes when {@code src} is missing.
     * NOTE(review): the attribute names {@code data-lazy-img} and {@code source-data-lazy-img}
     * are what JD's list page is believed to use for images that have not been loaded yet —
     * confirm against the current markup.
     *
     * @param item one {@code .gl-item} element
     * @return the image URL, with the scheme added to protocol-relative addresses, or
     *         {@code null} when no image address is present
     */
    private String extractImageUrl(Element item) {
        Elements images = item.select(".p-img").select("a img");
        String[] candidates = {"src", "data-lazy-img", "source-data-lazy-img"};
        for (String attribute : candidates) {
            String value = images.attr(attribute).trim();
            // "done" is presumably a marker left in the lazy attribute after loading, not a URL.
            if (value.isEmpty() || "done".equals(value)) {
                continue;
            }
            return value.startsWith("//") ? AppConstants.HTTPS + value : value;
        }
        return null;
    }
}
7.0 controller层准备
package com.jianqiao.controller;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jianqiao.constant.AppConstants;
import com.jianqiao.pojo.Product;
import com.jianqiao.service.ClawerService;
import com.jianqiao.service.ProductServiceImpl;
import com.jianqiao.vo.KeyWord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import org.springframework.stereotype.Controller;
import org.springframework.util.StringUtils;
import org.springframework.web.bind.annotation.RequestMapping;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Web entry point that crawls JD search results for a keyword and stores the products.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 6:39 - 27 - 11 -2017
 */
@Controller
public class JDClawerController {

    private static final Logger LOGGER = LoggerFactory.getLogger(JDClawerController.class);

    /** JSON conversion tool (currently unused). */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Search URL with placeholders for the request-specific values. */
    private static final String SEARCH_URL_TEMPLATE =
            "https://search.jd.com/Search?keyword={keyword}&enc={enc}&qrst=1&rt=1&stop=1&vt=2&wq={wq}&page={page}&s=57&click=0";

    @Autowired
    private ThreadPoolTaskExecutor threadPoolTaskExecutor;

    @Autowired
    private ClawerService clawerService;

    @Autowired
    private ProductServiceImpl productService;

    /**
     * Crawls every result page for the given keyword in parallel and saves the products.
     *
     * <p>All working state is local to the call, so concurrent or repeated requests do not
     * see (or re-save) each other's results.
     *
     * <p>NOTE(review): the method returns {@code void} and carries no response annotation;
     * confirm how Spring MVC is expected to render the response for this mapping.
     *
     * @param keyWord request parameters; {@code keyword} is required, {@code enc} defaults to
     *                {@link AppConstants#DEFAULT_CHARSET}, {@code wq} defaults to the keyword
     *                and {@code page} defaults to "1"
     */
    @RequestMapping("/jd/clawer")
    public void clawerJD(KeyWord keyWord) {
        if (keyWord == null || !StringUtils.hasText(keyWord.getKeyword())) {
            LOGGER.warn("Ignoring crawl request without a keyword");
            return;
        }
        final String keyword = keyWord.getKeyword();
        final String enc = StringUtils.hasText(keyWord.getEnc()) ? keyWord.getEnc() : AppConstants.DEFAULT_CHARSET;
        final String wq = StringUtils.hasText(keyWord.getWq()) ? keyWord.getWq() : keyword;
        final String firstPage = StringUtils.hasText(keyWord.getPage()) ? keyWord.getPage() : "1";

        // Fill in the placeholders; values are percent-encoded so spaces, '&' and similar
        // characters cannot break the URL.
        String operationUrl = SEARCH_URL_TEMPLATE
                .replace("{keyword}", encode(keyword))
                .replace("{enc}", encode(enc))
                .replace("{wq}", encode(wq))
                .replace("{page}", encode(firstPage));

        Integer reportedPages = clawerService.getTotalPage(operationUrl);
        final int totalPage = reportedPages == null ? 0 : Math.max(0, reportedPages);
        // JD search uses a step of 2 for the page parameter, so the upper bound is doubled.
        final int vtPage = totalPage * 2;

        final Map<String, Product> collected = new ConcurrentHashMap<String, Product>();
        // One count per page task, so await() returns once every page has been processed.
        final CountDownLatch countDownLatch = new CountDownLatch(totalPage);
        long startTime = System.currentTimeMillis();

        for (int i = 1; i < vtPage; i += 2) {
            System.out.println("第" + i + "页");
            final Map<String, Object> params = new ConcurrentHashMap<String, Object>();
            params.put("keyword", keyword);
            params.put("enc", enc);
            params.put("wq", wq);
            params.put("page", i + "");
            try {
                threadPoolTaskExecutor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            Map<String, Product> productByPage =
                                    clawerService.findProductByPage(AppConstants.BASE_URL, params);
                            collected.putAll(productByPage);
                        } finally {
                            countDownLatch.countDown();
                        }
                    }
                });
            } catch (RuntimeException e) {
                // The task never ran (e.g. it was rejected); count it down here so await() cannot hang.
                LOGGER.error("Could not schedule page " + i, e);
                countDownLatch.countDown();
            }
        }

        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            // Preserve the interrupt and stop: page tasks may still be writing results.
            Thread.currentThread().interrupt();
            LOGGER.warn("Interrupted while waiting for page tasks; nothing was saved");
            return;
        }
        long endTime = System.currentTimeMillis();

        for (Map.Entry<String, Product> entry : collected.entrySet()) {
            productService.saveSelective(entry.getValue());
        }
        System.out.println("消耗时间:" + (endTime - startTime));
        // Author's measurements: 19094 ms with three threads, 6337 ms with ten threads.
    }

    /**
     * Percent-encodes one query value as UTF-8.
     * NOTE(review): UTF-8 is chosen on the assumption that the HTTP helper decodes and
     * re-encodes query strings as UTF-8 — confirm against the HttpClient version in use.
     */
    private static String encode(String value) {
        try {
            return java.net.URLEncoder.encode(value, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform must support.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }
}
系统常量类
package com.jianqiao.constant;
/**
 * Application-wide constants.
 *
 * @Auther: Alone_XuXu
 * @Date: Created in 6:46 - 27 - 11 -2017
 */
public interface AppConstants {

    /** Default character encoding. */
    String DEFAULT_CHARSET = "utf-8";

    /**
     * Entry point of the site to crawl. The query string is added by the caller, e.g.
     * {@code ?keyword=...&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=...&page=3&s=57&click=0}.
     */
    String BASE_URL = "https://search.jd.com/Search";

    /** Scheme prefix placed in front of protocol-relative addresses. */
    String HTTPS = "https:";

    /**
     * Names of HTTP request headers.
     */
    interface Header {
        String ACCEPT = "Accept";
        String ACCEPT_ENCODING = "Accept-Encoding";
        String ACCEPT_LANGUAGE = "Accept-Language";
        // Fixed: the header name is "Cache-Control" (was misspelled "Cache-Controle").
        String CACHE_CONTROL = "Cache-Control";
        String COOKIE = "Cookie";
        String HOST = "Host";
        String PROXY_CONNECTION = "Proxy-Connection";
        String REFERER = "Referer";
        String USER_AGENT = "User-Agent";
    }
}
8.0 配置文件准备
8.1 web.xml
Archetype Created Web Application
contextConfigLocation
classpath*:spring/spring-*.xml
org.springframework.web.context.ContextLoaderListener
DispatcherServlet
org.springframework.web.servlet.DispatcherServlet
contextConfigLocation
classpath:spring/springmvc-*.xml
1
DispatcherServlet
/
CharacterEncodingFilter
org.springframework.web.filter.CharacterEncodingFilter
encoding
utf-8
forceRequestEncoding
true
forceResponseEncoding
true
CharacterEncodingFilter
/*
HiddenHttpMethodFilter
org.springframework.web.filter.HiddenHttpMethodFilter
HiddenHttpMethodFilter
/*
jdbc相关
jdbc.username=root
jdbc.password=1230
jdbc.url=jdbc:mysql://localhost:3306/clawerDB?rewriteBatchedStatements=true&useUnicode=true&characterEncoding=utf8
jdbc.driver=com.mysql.jdbc.Driver
httpclient.maxTotal = 200
httpclient.DefaultMaxPerRoute = 20
httpclient.connectTimeout =1000
httpclient.connectionRequestTimeout =500
httpclient.socketTimeout =10000
httpclient.staleConnectionCheckEnabled = true
8.4 spring相关
8.4.1 spring-beans.xml
作者注:
本文可以实现商品的基本搜索,并且保存到数据库
不足之处是,许多代码还需要做调整
最主要一点,jsoup解析的时候,解析图片的时候有时候会取不出来,希望看到这篇文章的人,也能帮我修复这个bug.谢谢