要爬的数据
数据库表结构
数据库建表语句
-- Turn off foreign-key checking for this session before the table is dropped
-- and recreated. Note that this script never turns it back on.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `spider`
-- ----------------------------
-- Any existing `spider` table (and its rows) is discarded first.
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
-- Surrogate primary key, generated by the database.
`id` int(10) NOT NULL AUTO_INCREMENT,
-- All crawled values below are nullable and stored as text, including the price.
`goods_id` varchar(20) DEFAULT NULL,
`data_url` varchar(300) DEFAULT NULL,
`pic_url` varchar(300) DEFAULT NULL,
`title` varchar(300) DEFAULT NULL,
`price` varchar(10) DEFAULT NULL,
`param` text,
-- CURRENT_TIME is a reserved word in MySQL, so this column name must stay backquoted
-- wherever it is referenced.
`current_time` datetime DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
项目的包结构
pom.xml 文件中的jar包依赖
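<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.4</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlcleaner</groupId>
        <artifactId>htmlcleaner</artifactId>
        <version>2.16</version>
    </dependency>
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20160212</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.6</version>
    </dependency>
</dependencies>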
编写实体类
import java.util.HashMap;
import java.util.Map;
/**
 * Page entity: holds everything known about one product page.
 *
 * <p>Besides the raw page source and its URL, it carries the values extracted
 * from that source (ID, name, picture URL, price) and a map of named
 * parameter/specification strings.
 */
public class Page {
    /** Product ID. */
    private String goodId;
    /** Product name. */
    private String goodName;
    /** URL of the product page. */
    private String dataUrl;
    /** URL of the product picture. */
    private String picUrl;
    /** Price, kept as text. */
    private String price;
    /** Product parameters/specifications, keyed by name; never null. */
    private final Map<String, String> param = new HashMap<String, String>();
    /** Raw source of the page. */
    private String content;

    public String getGoodId() {
        return goodId;
    }

    public void setGoodId(String goodId) {
        this.goodId = goodId;
    }

    public String getGoodName() {
        return goodName;
    }

    public void setGoodName(String goodName) {
        this.goodName = goodName;
    }

    public String getDataUrl() {
        return dataUrl;
    }

    public void setDataUrl(String dataUrl) {
        this.dataUrl = dataUrl;
    }

    /**
     * Returns the live parameter map (not a copy); entries are added through
     * {@link #setParam(String, String)}.
     *
     * @return the parameter map, possibly empty but never null
     */
    public Map<String, String> getParam() {
        return param;
    }

    /**
     * Stores one parameter entry, replacing any previous value for the key.
     * Despite the setter-style name this adds a single entry rather than
     * replacing the whole map.
     *
     * @param key   parameter name
     * @param value parameter value
     */
    public void setParam(String key, String value) {
        this.param.put(key, value);
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getPicUrl() {
        return picUrl;
    }

    public void setPicUrl(String picUrl) {
        this.picUrl = picUrl;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }
}
spider类
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.Downloadable;
import cn.crxy.maven.Spider.process.Processable;
import cn.crxy.maven.Spider.store.Storeable;
/**
 * Crawler facade: holds one implementation of each pipeline stage
 * (download, process, store) and forwards every call to it.
 *
 * <p>The stages are injected through the setters; calling a pipeline method
 * before its stage has been set fails with a NullPointerException.
 */
public class Spider {
    /** Stage that produces a {@code Page} from a URL. */
    private Downloadable downloadable;
    /** Stage that parses a downloaded {@code Page}. */
    private Processable processable;
    /** Stage that persists a parsed {@code Page}. */
    private Storeable storeable;

    // ---- stage wiring ----

    public Downloadable getDownloadable() {
        return this.downloadable;
    }

    public void setDownloadable(Downloadable downloadable) {
        this.downloadable = downloadable;
    }

    public Processable getProcessable() {
        return this.processable;
    }

    public void setProcessable(Processable processable) {
        this.processable = processable;
    }

    public Storeable getStoreable() {
        return this.storeable;
    }

    public void setStoreable(Storeable storeable) {
        this.storeable = storeable;
    }

    // ---- pipeline ----

    /** Downloads the page source for {@code url} via the configured downloader. */
    public Page download(String url) {
        final Page fetched = this.downloadable.download(url);
        return fetched;
    }

    /** Parses the page source via the configured processor. */
    public void process(Page page) {
        this.processable.process(page);
    }

    /** Hands the parsed page to the configured store. */
    public void store(Page page) {
        this.storeable.store(page);
    }
}
Downloadable接口类
import cn.crxy.maven.Spider.domain.Page;
/**
 * Download stage of the crawler: turns a URL into a {@code Page}.
 */
public interface Downloadable {
/**
 * Produces a page object for the given address.
 *
 * @param url address of the page to download
 * @return the page for that address; what is populated on it is up to the implementation
 */
Page download(String url);
}
DownloadImpl实现类
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.PageUtil;
/**
 * Downloader that fetches the page body through {@code PageUtil} and wraps it,
 * together with the requested URL, in a fresh {@code Page}.
 */
public class DownloadImpl implements Downloadable {
    /**
     * @param url address of the page to fetch
     * @return a new page carrying the URL and whatever content {@code PageUtil.getContent} returned
     */
    public Page download(String url) {
        final Page result = new Page();
        result.setContent(PageUtil.getContent(url));
        result.setDataUrl(url);
        return result;
    }
}
PageUtil页面工具类
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Fetches the content behind a URL.
 */
public class PageUtil {
    /**
     * Issues an HTTP GET for {@code url} and returns the response body as text.
     *
     * <p>Both the client and the response are closed before returning, so no
     * connection is left open. Failures are best-effort: an I/O or protocol
     * error is printed and {@code null} is returned.
     *
     * @param url address to request
     * @return the response body, or {@code null} if the request failed or the
     *         response carried no entity
     */
    public static String getContent(String url) {
        String content = null;
        // Build a client (think of it as a browser instance) and execute the GET;
        // try-with-resources releases both the response and the client.
        try (CloseableHttpClient client = HttpClients.custom().build();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // NOTE(review): no charset is passed, so decoding relies on the
                // response's Content-Type header and the library default otherwise —
                // confirm this is right for the target site.
                content = EntityUtils.toString(entity);
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so it is covered here too.
            e.printStackTrace();
        }
        return content;
    }
}
Processable.java
import cn.crxy.maven.Spider.domain.Page;
/**
 * Processing stage of the crawler: works on a {@code Page} in place.
 */
public interface Processable {
/**
 * Processes the given page; there is no return value, so any result is
 * expected to be written back onto the page itself.
 *
 * @param page the page to process
 */
void process(Page page);
}
ProcessImpl.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.HtmlUtil;
import cn.crxy.maven.Spider.utils.PageUtil;
public class ProcessImpl implements Processable {
public void process(Page page) {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode rootNode = htmlCleaner.clean(page.getContent());
try {
String goodName = HtmlUtil.getText(rootNode, "//*[@id='name']/h1");// 得到商品名称
page.setGoodName(goodName);
String picUrl = HtmlUtil.getAttributeByName(rootNode, "//*[@id='spec-n1']/img","src");// 获取商品图片url
page.setPicUrl("http:"+picUrl);
// 获取商品号
String url = page.getDataUrl();
Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html");
Matcher matcher = compile.matcher(url);
String goodid = null;
if (matcher.find()) {
goodid = matcher.group(1);
page.setGoodId(goodid);
}
// 获取商品价格
// 得到价格的json格式[{"id":"J_1593512","p":"17988.00","m":"17989.00"}]
String pricejson = PageUtil
.getContent("http://p.3.cn/prices/get?skuid=J_" + goodid);
JSONArray jsonArray = new JSONArray(pricejson);
JSONObject jsonObject = jsonArray.getJSONObject(0);
String price = jsonObject.getString("p");
page.setPrice(price);
// 获取规格参数
// *[@id="product-detail-2"]
// *[@id="product-detail-2"]/table/tbody/tr[1]/th
Object[] evaluateXPath = rootNode
.evaluateXPath("//*[@id='product-detail-2']/table/tbody/tr");
JSONArray jsonArray2 = new JSONArray();
if(evaluateXPath != null && evaluateXPath.length > 0){
for(Object object : evaluateXPath){
TagNode tagnode = (TagNode) object;
if(!"".equals(tagnode.getText().toString().trim())){//有数据
Object[] evaluateXPath2 = tagnode.evaluateXPath("/th");
JSONObject jsonObject2 = new JSONObject();
if(evaluateXPath2.length>0){
TagNode tagNode2 = (TagNode) evaluateXPath2[0];
jsonObject2.put("name", tagNode2.getText().toString());
jsonObject2.put("value", "");
}else {
Object[] evaluateXPath3 = tagnode.evaluateXPath("/td");
TagNode tagNode1 = (TagNode) evaluateXPath3[0];
TagNode tagNode2 = (TagNode) evaluateXPath3[1];
jsonObject2.put("name", tagNode1.getText().toString());
jsonObject2.put("value", tagNode2.getText().toString());
}
jsonArray2.put(jsonObject2);
}
}
}
page.setParam("spec",jsonArray2.toString());
} catch (XPatherException e) {
e.printStackTrace();
}
}
}
ProcessImpl.java代码中的几个注意点: 获取商品名称、图片URL的xpath路径
在京东商品页面获取商品价格的方式:得到如下的链接地址:
http://p.3.cn/prices/get?type=1&area=1_72_4137&pdtk=&pduid=1112434089&pdpin=&pdbp=0&skuid=J_1593512&callback=cnp
对链接进行处理后得到如下结果
商品参数规格的Xpath
Storeable.java
package cn.crxy.maven.Spider.store;
import cn.crxy.maven.Spider.domain.Page;
/**
 * Storage stage of the crawler: persists a {@code Page}.
 */
public interface Storeable {
/**
 * Stores the given page; where and how is up to the implementation.
 *
 * @param page the page to store
 */
void store(Page page);
}
StoreImpl.java
package cn.crxy.maven.Spider.store;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.MyDBUtils;
/**
 * Store that writes one row per page through {@code MyDBUtils}.
 */
public class StoreImpl implements Storeable {
    /**
     * Inserts the page's fields using {@code MyDBUtils.INSERT_LOG}. The arguments
     * are passed in the order of that statement's placeholders: product ID, page
     * URL, picture URL, product name, price, specification JSON, current time.
     *
     * @param page the parsed page to persist
     */
    public void store(Page page) {
        // Read the "spec" entry without assuming the map's generic type, so the
        // value can be converted to String explicitly.
        Map<?, ?> values = page.getParam();
        Object spec = values.get("spec");
        String param = spec == null ? null : spec.toString();

        // Timestamp of the insert, formatted the way a MySQL datetime literal is written.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String currtime = sdf.format(new Date());

        MyDBUtils.update(MyDBUtils.INSERT_LOG,
                page.getGoodId(),
                page.getDataUrl(),
                page.getPicUrl(),
                page.getGoodName(),
                page.getPrice(),
                param,
                currtime);
    }
}
MyDBUtils.java
package cn.crxy.maven.Spider.utils;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.dbutils.BasicRowProcessor;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;
/**
 * Small JDBC helper around Commons DbUtils: registers the MySQL driver once and
 * offers a single-column query and a generic update, each on its own
 * short-lived connection.
 *
 * <p>NOTE(review): the connection URL and credentials are hard-coded below;
 * consider moving them to configuration.
 */
public class MyDBUtils {
    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://localhost:3306/spider?"
            + "useUnicode=true&characterEncoding=utf-8";
    private static final String USER = "root";
    private static final String PASSWORD = "1234";
    private static final QueryRunner QUERY_RUNNER = new QueryRunner();

    /**
     * Insert statement for one crawled product. Table and column names follow the
     * `spider` table definition (goods_id, data_url, pic_url, title, price, param,
     * current_time); `current_time` is backquoted because it is a reserved word.
     */
    public static final String INSERT_LOG = "INSERT INTO spider(goods_id,"
            + "data_url,pic_url,title,price,param,`current_time`) "
            + "VALUES(?,?,?,?,?,?,?)";

    // Not instantiable.
    private MyDBUtils() {
    }

    // Register the driver as soon as this class is first used.
    static {
        try {
            Class.forName(DRIVER_CLASS);
        } catch (ClassNotFoundException e) {
            // Keep the cause so the missing driver is visible in the stack trace.
            throw new RuntimeException("JDBC driver not found: " + DRIVER_CLASS, e);
        }
    }

    /**
     * Runs a query and returns the first column of every row as text.
     *
     * @param sql the query to execute
     * @return first-column values in row order (a SQL NULL becomes a null element);
     *         empty if the query returned no rows or failed
     */
    public static List<String> executeQuerySql(String sql) {
        List<String> result = new ArrayList<String>();
        // The connection is closed on every path, including failures.
        try (Connection connection = getConnection()) {
            List<Object[]> rows = QUERY_RUNNER.query(connection, sql, new ArrayListHandler());
            for (Object[] row : rows) {
                Object first = row.length > 0 ? row[0] : null;
                result.add(first == null ? null : first.toString());
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Executes an INSERT, UPDATE or DELETE statement. Failures are printed and
     * otherwise ignored.
     *
     * @param sql    statement with {@code ?} placeholders
     * @param params values for the placeholders, in order
     */
    public static void update(String sql, Object... params) {
        // The connection is closed even when the statement throws.
        try (Connection connection = getConnection()) {
            QUERY_RUNNER.update(connection, sql, params);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Opens a new connection with the configured URL and credentials. */
    private static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(URL, USER, PASSWORD);
    }
}
HtmlUtils.java
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
/**
 * XPath convenience lookups on an HtmlCleaner node tree.
 */
public class HtmlUtils {
    /**
     * Returns the text of the first node matched by an XPath expression.
     *
     * @param tagNode node the expression is evaluated against
     * @param xpath   XPath expression
     * @return the matched node's text, or null if nothing matched or evaluation failed
     */
    public static String getText(TagNode tagNode, String xpath) {
        try {
            TagNode match = firstMatch(tagNode, xpath);
            if (match == null) {
                return null;
            }
            return match.getText().toString();
        } catch (XPatherException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the value of an attribute on the first node matched by an XPath expression.
     *
     * @param tagNode node the expression is evaluated against
     * @param xpath   XPath expression
     * @param attr    attribute name
     * @return the attribute value, or null if nothing matched or evaluation failed
     */
    public static String getAttributeByName(TagNode tagNode, String xpath, String attr) {
        try {
            TagNode match = firstMatch(tagNode, xpath);
            if (match == null) {
                return null;
            }
            return match.getAttributeByName(attr);
        } catch (XPatherException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Evaluates the expression and returns its first result as a TagNode, or null when there is none. */
    private static TagNode firstMatch(TagNode tagNode, String xpath) throws XPatherException {
        Object[] matches = tagNode.evaluateXPath(xpath);
        if (matches == null || matches.length == 0) {
            return null;
        }
        return (TagNode) matches[0];
    }
}
在src/test/java文件夹下面的包中新建test类
TestSpider.java
package cn.crxy.maven.Spider;
import org.junit.Test;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.DownloadImpl;
import cn.crxy.maven.Spider.process.ProcessImpl;
import cn.crxy.maven.Spider.store.StoreImpl;
/**
 * End-to-end run of the crawler against one product page:
 * download, then process, then store.
 */
public class TestSpider {
    @Test
    public void test1() throws Exception {
        final String itemUrl = "http://item.jd.com/1593512.html";

        // Wire a concrete implementation into each pipeline stage.
        Spider crawler = new Spider();
        crawler.setDownloadable(new DownloadImpl());
        crawler.setProcessable(new ProcessImpl());
        crawler.setStoreable(new StoreImpl());

        // Run the three stages in order on the same page object.
        Page fetched = crawler.download(itemUrl);
        crawler.process(fetched);
        crawler.store(fetched);
    }
}
运行test测试方法,在数据库中插入了数据