A simple Java crawler for JD.com product pages

HttpClient + HtmlCleaner + XPath + MySQL, written in Java.

The data to crawl:
[Figures 1–2: screenshots of the JD.com product page showing the fields to extract]

Database table structure:
[Figure 3: structure of the spider table]

Table creation DDL:
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `spider`
-- ----------------------------
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `goods_id` varchar(20) DEFAULT NULL,
  `data_url` varchar(300) DEFAULT NULL,
  `pic_url` varchar(300) DEFAULT NULL,
  `title` varchar(300) DEFAULT NULL,
  `price` varchar(10) DEFAULT NULL,
  `param` text,
  `current_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;

Project package layout:
[Figure 4: package structure of the project]

Dependencies in pom.xml:

		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.4</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.htmlcleaner</groupId>
			<artifactId>htmlcleaner</artifactId>
			<version>2.16</version>
		</dependency>
		<dependency>
			<groupId>org.json</groupId>
			<artifactId>json</artifactId>
			<version>20160212</version>
		</dependency>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.38</version>
		</dependency>
		<dependency>
			<groupId>commons-dbutils</groupId>
			<artifactId>commons-dbutils</artifactId>
			<version>1.6</version>
		</dependency>

The Page entity class
import java.util.HashMap;
import java.util.Map;

/**
 * Page entity class.
 * Holds the information extracted from a product page.
 */
public class Page {

	private String goodId;// product ID
	private String goodName;// product name
	private String dataUrl;// product page URL
	private String picUrl;// product image URL
	private String price;// price
	private Map<String, String> param = new HashMap<String, String>();// product specification parameters
	private String content;// raw HTML source of the page
	
	public String getGoodId() {
		return goodId;
	}
	public void setGoodId(String goodId) {
		this.goodId = goodId;
	}
	public String getGoodName() {
		return goodName;
	}
	public void setGoodName(String goodName) {
		this.goodName = goodName;
	}
	public String getDataUrl() {
		return dataUrl;
	}
	public void setDataUrl(String dataUrl) {
		this.dataUrl = dataUrl;
	}
	public Map<String, String> getParam() {
		return param;
	}
	public void setParam(String key,String value) {
		this.param.put(key, value);
	}
	public String getContent() {
		return content;
	}
	public void setContent(String content) {
		this.content = content;
	}
	public String getPicUrl() {
		return picUrl;
	}
	public void setPicUrl(String picUrl) {
		this.picUrl = picUrl;
	}
	public String getPrice() {
		return price;
	}
	public void setPrice(String price) {
		this.price = price;
	}
}

The Spider class
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.Downloadable;
import cn.crxy.maven.Spider.process.Processable;
import cn.crxy.maven.Spider.store.Storeable;

public class Spider {

	private Downloadable downloadable;
	private Processable processable;
	private Storeable storeable;
	
	//download the page source for the given URL
	public Page download(String url){
		return downloadable.download(url);
	}
	
	//parse the downloaded page source
	public void process(Page page){
		processable.process(page);
	}
	
	//store the parsed data in the database
	public void store(Page page){
		storeable.store(page);
	}

	public Downloadable getDownloadable() {
		return downloadable;
	}

	public void setDownloadable(Downloadable downloadable) {
		this.downloadable = downloadable;
	}

	public Processable getProcessable() {
		return processable;
	}

	public void setProcessable(Processable processable) {
		this.processable = processable;
	}

	public Storeable getStoreable() {
		return storeable;
	}

	public void setStoreable(Storeable storeable) {
		this.storeable = storeable;
	}
}

The Downloadable interface
import cn.crxy.maven.Spider.domain.Page;

public interface Downloadable {
	Page download(String url);
}

The DownloadImpl implementation
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.PageUtil;

public class DownloadImpl implements Downloadable {
	
	public Page download(String url) {
		Page page = new Page();
		String content=PageUtil.getContent(url);// fetch the page content for the given URL
		page.setContent(content);
		page.setDataUrl(url);
		return page;
	}
}

The PageUtil helper class
import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Fetches the page content for a given URL.
 */
public class PageUtil {

	public static String getContent(String url){
		HttpClientBuilder custom = HttpClients.custom();// create an HttpClient builder
		// build an HttpClient from the builder; think of it as obtaining a browser instance
		CloseableHttpClient build = custom.build();
		// wrap the URL in a GET request
		HttpGet httpGet = new HttpGet(url);
		String content = null;
		try {
			// execute the GET request with the client; the result is wrapped in the response
			CloseableHttpResponse response = build.execute(httpGet);
			// get the returned content entity
			HttpEntity entity = response.getEntity();
			// convert the entity (the page body) into a String
			content = EntityUtils.toString(entity);
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return content;
	} 
}
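JD may serve different markup to, or throttle, clients that do not look like a browser. The variant below is only a sketch, not part of the original project: it adds browser-like request headers and closes the response and client explicitly. The class name and header values are illustrative assumptions.

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class PageUtilWithHeaders {

	public static String getContent(String url) {
		String content = null;
		CloseableHttpClient client = HttpClients.custom().build();
		HttpGet httpGet = new HttpGet(url);
		// illustrative browser-like headers; adjust as needed
		httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
		httpGet.setHeader("Accept-Charset", "utf-8");
		try {
			CloseableHttpResponse response = client.execute(httpGet);
			try {
				// read the body as UTF-8 to avoid mojibake on Chinese pages
				content = EntityUtils.toString(response.getEntity(), "UTF-8");
			} finally {
				response.close();
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				client.close();
			} catch (Exception ignored) {
			}
		}
		return content;
	}
}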

Processable.java
import cn.crxy.maven.Spider.domain.Page;

public interface Processable {
	void process(Page page);
}

ProcessImpl.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.HtmlUtils;
import cn.crxy.maven.Spider.utils.PageUtil;

public class ProcessImpl implements Processable {

	public void process(Page page) {

		HtmlCleaner htmlCleaner = new HtmlCleaner();
		TagNode rootNode = htmlCleaner.clean(page.getContent());
		try {
			String goodName = HtmlUtils.getText(rootNode, "//*[@id='name']/h1");// extract the product name
			page.setGoodName(goodName);

			String picUrl = HtmlUtils.getAttributeByName(rootNode, "//*[@id='spec-n1']/img","src");// extract the product image URL
			page.setPicUrl("http:"+picUrl);

			// extract the product (SKU) id from the page URL
			String url = page.getDataUrl();
			Pattern compile = Pattern.compile("http://item\\.jd\\.com/([0-9]+)\\.html");
			Matcher matcher = compile.matcher(url);
			String goodid = null;
			if (matcher.find()) {
				goodid = matcher.group(1);
				page.setGoodId(goodid);
			}

			// fetch the product price
			// the price endpoint returns JSON such as [{"id":"J_1593512","p":"17988.00","m":"17989.00"}]
			String pricejson = PageUtil
					.getContent("http://p.3.cn/prices/get?skuid=J_" + goodid);
			JSONArray jsonArray = new JSONArray(pricejson);
			JSONObject jsonObject = jsonArray.getJSONObject(0);
			String price = jsonObject.getString("p");
			page.setPrice(price);

			// extract the specification parameters
			// *[@id="product-detail-2"]
			// *[@id="product-detail-2"]/table/tbody/tr[1]/th
			Object[] evaluateXPath = rootNode
					.evaluateXPath("//*[@id='product-detail-2']/table/tbody/tr");
			JSONArray jsonArray2 = new JSONArray();
			if(evaluateXPath != null && evaluateXPath.length > 0){
				for(Object object : evaluateXPath){
					TagNode tagnode = (TagNode) object;
					if(!"".equals(tagnode.getText().toString().trim())){//有数据
						
						Object[] evaluateXPath2 = tagnode.evaluateXPath("/th");
						JSONObject jsonObject2 = new JSONObject();
						if(evaluateXPath2.length>0){
							TagNode tagNode2 = (TagNode) evaluateXPath2[0];
							jsonObject2.put("name", tagNode2.getText().toString());
							jsonObject2.put("value", "");
						}else {
							
							Object[] evaluateXPath3 = tagnode.evaluateXPath("/td");
							TagNode tagNode1 = (TagNode) evaluateXPath3[0];
							TagNode tagNode2 = (TagNode) evaluateXPath3[1];
							jsonObject2.put("name", tagNode1.getText().toString());
							jsonObject2.put("value", tagNode2.getText().toString());
						}
						jsonArray2.put(jsonObject2);
					}
				}
			}
			page.setParam("spec",jsonArray2.toString());
		} catch (XPatherException e) {
			e.printStackTrace();
		}
	}
}

A few things to note about the ProcessImpl.java code:
The XPath expressions used to extract the product name and image URL:

[Figure 5: the product name and image elements and their XPaths]

How the product price is obtained on a JD.com product page:
[Figures 6–7: locating the price request for the product]

This yields the following URL:
http://p.3.cn/prices/get?type=1&area=1_72_4137&pdtk=&pduid=1112434089&pdpin=&pdbp=0&skuid=J_1593512&callback=cnp
Trimming the URL down to just the skuid parameter returns the price JSON, e.g. [{"id":"J_1593512","p":"17988.00","m":"17989.00"}].
  
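As an illustration, the sketch below fetches the trimmed price URL and reads the "p" field with org.json. It reuses the PageUtil helper shown earlier (assuming the project classes are on the classpath) and a hypothetical SKU id.

import org.json.JSONArray;
import org.json.JSONObject;

import cn.crxy.maven.Spider.utils.PageUtil;

public class PriceDemo {
	public static void main(String[] args) throws Exception {
		// hypothetical SKU id used only for illustration
		String skuId = "1593512";
		// trimmed price endpoint; returns e.g. [{"id":"J_1593512","p":"17988.00","m":"17989.00"}]
		String priceJson = PageUtil.getContent("http://p.3.cn/prices/get?skuid=J_" + skuId);
		JSONArray array = new JSONArray(priceJson);
		JSONObject first = array.getJSONObject(0);
		System.out.println("price = " + first.getString("p"));        // current price
		System.out.println("market price = " + first.getString("m")); // list price
	}
}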
The XPath for the product specification parameters:
[Figure 8: the specification table and its XPath]

Storeable.java
package cn.crxy.maven.Spider.store;

import cn.crxy.maven.Spider.domain.Page;

public interface Storeable {
	void store(Page page);
}

StoreImpl.java
package cn.crxy.maven.Spider.store;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.MyDBUtils;

public class StoreImpl implements Storeable {

	public void store(Page page) {
		String dataUrl = page.getDataUrl();
		String goodid = page.getGoodId();
		String goodname = page.getGoodName();
		String picUrl = page.getPicUrl();
		String price  = page.getPrice();
		
		Map<String, String> values = page.getParam();
		String param = values.get("spec");
		
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		String currtime = sdf.format(new Date());
		MyDBUtils.update(MyDBUtils.INSERT_LOG, goodid,dataUrl,picUrl,goodname,price,param,currtime);
	}

}

MyDBUtils.java
package cn.crxy.maven.Spider.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.dbutils.BasicRowProcessor;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;

public class MyDBUtils {
	private static String className = "com.mysql.jdbc.Driver";
	private static String url = "jdbc:mysql://localhost:3306/spider?"
			+ "useUnicode=true&characterEncoding=utf-8";
	private static String user = "root";
	private static String password = "1234";
	private static QueryRunner queryRunner = new QueryRunner();

	public static final String INSERT_LOG = "INSERT INTO spider(goods_id,"
			+ "data_url,pic_url,title,price,param,`current_time`) "
			+ "VALUES(?,?,?,?,?,?,?)";

	// prevent instantiation with new
	private MyDBUtils() {
	}

	static {// register the JDBC driver when the class is first loaded
		try {
			Class.forName(className);
		} catch (Exception e) {
			e.printStackTrace();
			throw new RuntimeException();
		}
	}
	
	// run a query and return the first column of every row as a String
	public static List<String> executeQuerySql(String sql) {
		List<String> result = new ArrayList<String>();
		try {
			List<Object[]> requestList = queryRunner.query(getConnection(), sql,
					new ArrayListHandler(new BasicRowProcessor() {
						@Override
						public <T> List<T> toBeanList(ResultSet rs,
								Class<T> type) throws SQLException {
							return super.toBeanList(rs, type);
						}
					}));
			for (Object[] objects : requestList) {
				result.add(objects[0].toString());
			}
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return result;
	}
	
	// executes insert, update, or delete SQL statements
	public static void update(String sql, Object... params) {
		try {
			Connection connection = getConnection();
			queryRunner.update(connection, sql, params);
			connection.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	// obtain a database connection
	private static Connection getConnection() throws SQLException {
		return DriverManager.getConnection(url, user, password);
	}
}
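A quick, optional sanity check of the helper outside the crawler. The SQL strings and values below are illustrative only and assume the spider table created earlier.

import cn.crxy.maven.Spider.utils.MyDBUtils;

public class DbSmokeTest {
	public static void main(String[] args) {
		// insert a dummy row through the same parameterized update helper
		MyDBUtils.update("INSERT INTO spider (goods_id, title) VALUES (?, ?)",
				"0000000", "test item");
		// executeQuerySql returns the first column of each row as a String
		for (Object title : MyDBUtils.executeQuerySql("SELECT title FROM spider")) {
			System.out.println(title);
		}
	}
}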
 
   

HtmlUtils.java
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class HtmlUtils {
	
	/**
	 * Returns the text content of the element matched by the given XPath.
	 * @param tagNode the root node to search from
	 * @param xpath the XPath expression
	 * @return the element text, or null if nothing matches
	 */
	public static String getText(TagNode tagNode,String xpath){
		String content = null;
		Object[] evaluateXPath;
		try {
			evaluateXPath = tagNode.evaluateXPath(xpath);
			if(evaluateXPath!=null && evaluateXPath.length>0){
				TagNode node = (TagNode)evaluateXPath[0];
				content = node.getText().toString();
			}
		} catch (XPatherException e) {
			e.printStackTrace();
		}
		return content;
	}
	
	/**
	 * Returns the value of the named attribute on the element matched by the given XPath.
	 * @param tagNode the root node to search from
	 * @param xpath the XPath expression
	 * @param attr the attribute name
	 * @return the attribute value, or null if nothing matches
	 */
	public static String getAttributeByName(TagNode tagNode,String xpath,String attr){
		String content = null;
		Object[] evaluateXPath;
		try {
			evaluateXPath = tagNode.evaluateXPath(xpath);
			if(evaluateXPath!=null && evaluateXPath.length>0){
				TagNode node = (TagNode)evaluateXPath[0];
				content = node.getAttributeByName(attr);
			}
		} catch (XPatherException e) {
			e.printStackTrace();
		}
		return content;
	}
	
	

}
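A small self-contained check of the two helpers against an inline HTML snippet (no network access needed); the markup and image URL are made up for illustration.

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import cn.crxy.maven.Spider.utils.HtmlUtils;

public class HtmlUtilsDemo {
	public static void main(String[] args) throws Exception {
		// illustrative markup mimicking the product name and image elements
		String html = "<div id='name'><h1>Sample product</h1></div>"
				+ "<div id='spec-n1'><img src='//img.example.com/p.jpg'/></div>";
		TagNode root = new HtmlCleaner().clean(html);
		// extract text content by XPath
		System.out.println(HtmlUtils.getText(root, "//*[@id='name']/h1"));
		// extract an attribute value by XPath
		System.out.println(HtmlUtils.getAttributeByName(root, "//*[@id='spec-n1']/img", "src"));
	}
}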



Create a test class in the corresponding package under src/test/java.
TestSpider.java
package cn.crxy.maven.Spider;

import org.junit.Test;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.DownloadImpl;
import cn.crxy.maven.Spider.process.ProcessImpl;
import cn.crxy.maven.Spider.store.StoreImpl;

public class TestSpider {

	@Test
	public void test1() throws Exception {
		Spider spider = new Spider();
		
		//inject the implementations for the three interfaces
		spider.setDownloadable(new DownloadImpl());
		spider.setProcessable(new ProcessImpl());
		spider.setStoreable(new StoreImpl());
		
		String url = "http://item.jd.com/1593512.html";
		Page page = spider.download(url);
		spider.process(page);
		spider.store(page);

	}
}

Run the test method and the crawled data is inserted into the database.
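To crawl more than one product, the same spider instance can simply be driven by a list of item URLs. The sketch below assumes it lives in the same package as TestSpider; the SKU ids are placeholders for illustration.

package cn.crxy.maven.Spider;

import java.util.Arrays;
import java.util.List;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.DownloadImpl;
import cn.crxy.maven.Spider.process.ProcessImpl;
import cn.crxy.maven.Spider.store.StoreImpl;

public class BatchSpiderDemo {
	public static void main(String[] args) {
		Spider spider = new Spider();
		spider.setDownloadable(new DownloadImpl());
		spider.setProcessable(new ProcessImpl());
		spider.setStoreable(new StoreImpl());

		// placeholder SKU ids; replace with real ones
		List<String> skuIds = Arrays.asList("1593512", "1593513");
		for (String skuId : skuIds) {
			Page page = spider.download("http://item.jd.com/" + skuId + ".html");
			spider.process(page);
			spider.store(page);
			try {
				Thread.sleep(1000); // be polite: pause between requests
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
			}
		}
	}
}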


