java jsoup 网络爬虫 学习例子(六)京东和当当商品比价
// ===== ProductList.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.util.List;

/**
 * A source of scraped product listings.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

    /**
     * Crawls the product listing.
     *
     * @return the products that were collected
     */
    public List<ProductInfo> getProductList();
}

// ===== JDProductList.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes product names and prices from JD search result pages.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList {

    /** Number of result pages requested per crawl. */
    private static final int PAGE_COUNT = 10;

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /** URL used for the first result page. */
    private final String jdUrl;

    /** Search keyword; used to build the URLs of pages 2 and up. */
    private final String productName;

    /**
     * @param jdUrl       URL of the first search result page
     * @param productName search keyword used to build the follow-up page URLs
     */
    public JDProductList(String jdUrl, String productName) {
        this.jdUrl = jdUrl;
        this.productName = productName;
    }

    /**
     * Fetches the result pages one after another and collects every product found.
     * A page that fails to load or parse is reported on stdout and skipped; the
     * remaining pages are still fetched.
     *
     * @return the collected products, possibly empty, never {@code null}
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
        String url = "";
        for (int i = 0; i < PAGE_COUNT; i++) {
            try {
                System.out.println("JD Product 第[" + (i + 1) + "]页");
                if (i == 0) {
                    url = jdUrl;
                } else {
                    url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC
                            + Constants.JDPAGE + (i + 1);
                }
                System.out.println(url);
                Document document = Jsoup.connect(url).timeout(5000).get();
                for (Element ul : document.select("ul[class=gl-warp clearfix]")) {
                    for (Element li : ul.select("li[data-sku]")) {
                        Element div = li.select("div[class=gl-i-wrap]").first();
                        if (div == null) {
                            // Item without the expected wrapper: skip it instead of
                            // letting a NullPointerException discard the rest of the page.
                            continue;
                        }
                        Elements title = div.select("div[class=p-name p-name-type-2]>a");
                        String itemName = title.attr("title"); // product name
                        Elements price = div.select(".p-price>strong");
                        String itemPrice = price.attr("data-price"); // product price

                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(itemName);
                        productInfo.setProductPrice(itemPrice);
                        jdProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failing page and carry on with the next one.
                System.out.println("Get JD product has error [" + url + "]");
                System.out.println(e.getMessage());
            }
        }
        return jdProductList;
    }

    public static void main(String[] args) {
        try {
            String productName = "书包";
            String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
            List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
            System.out.println(list.size());
            for (ProductInfo pi : list) {
                System.out.println(pi.getProductName() + "   " + pi.getProductPrice());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

// ===== DDProductList.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes product names and prices from DangDang search result pages.
 *
 * @author InJavaWeTrust
 */
public class DDProductList implements ProductList {

    /** Number of result pages requested per crawl. */
    private static final int PAGE_COUNT = 10;

    /** URL used for the first result page. */
    private final String ddUrl;

    /** Search keyword; used to build the URLs of pages 2 and up. */
    private final String productName;

    /**
     * @param ddUrl       URL of the first search result page
     * @param productName search keyword used to build the follow-up page URLs
     */
    public DDProductList(String ddUrl, String productName) {
        this.ddUrl = ddUrl;
        this.productName = productName;
    }

    /**
     * Fetches the result pages one after another and collects every product found.
     * A page that fails to load or parse is reported on stdout and skipped; the
     * remaining pages are still fetched.
     *
     * @return the collected products, possibly empty, never {@code null}
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> ddProductList = new ArrayList<ProductInfo>();
        String url = "";
        for (int i = 0; i < PAGE_COUNT; i++) {
            try {
                System.out.println("DD Product 第[" + (i + 1) + "]页");
                if (i == 0) {
                    url = ddUrl;
                } else {
                    url = Constants.DDURL + productName + Constants.ACT
                            + Constants.DDPAGE + (i + 1);
                }
                System.out.println(url);
                Document document = Jsoup.connect(url).timeout(5000).get();
                for (Element ul : document.select("ul[class=bigimg cloth_shoplist]")) {
                    for (Element li : ul.select("li")) {
                        Elements price = li.select("p[class=price]>span");
                        // Strip the currency sign so only the amount remains.
                        String itemPrice = price.html().replace("¥", "");
                        Elements title = li.select("p[class=name]>a");
                        String itemName = title.attr("title");

                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(itemName);
                        productInfo.setProductPrice(itemPrice);
                        ddProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failing page and carry on with the next one.
                System.out.println("Get DD product has error [" + url + "]");
                System.out.println(e.getMessage());
            }
        }
        return ddProductList;
    }

    public static void main(String[] args) {
        String productName = "学生铅笔";
        String ddUrl = Constants.DDURL + productName;
        List<ProductInfo> list = new DDProductList(ddUrl, productName).getProductList();
        System.out.println(list.size());
        for (ProductInfo pi : list) {
            System.out.println(pi.getProductName() + "   " + pi.getProductPrice());
        }
    }
}

// ===== ProductInfo.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.io.Serializable;
import java.util.Date;

/**
 * Plain data holder for one scraped product.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable {

    private static final long serialVersionUID = 8179244535272774089L;

    /** Product ID. */
    private String productid;

    /** Product name. */
    private String productName;

    /** Product price. */
    private String productPrice;

    /** Number of sales per month. */
    private String tradeNum;

    /** Product URL. */
    private String productUrl;

    /** Name of the shop selling the product. */
    private String shopName;

    /** Name of the e-commerce site. */
    private String ecName;

    /** Date the product was crawled and stored. */
    private Date date;

    public String getProductid() {
        return productid;
    }

    public void setProductid(String productid) {
        this.productid = productid;
    }

    public String getProductName() {
        return productName;
    }

    public void setProductName(String productName) {
        this.productName = productName;
    }

    public String getProductPrice() {
        return productPrice;
    }

    public void setProductPrice(String productPrice) {
        this.productPrice = productPrice;
    }

    public String getTradeNum() {
        return tradeNum;
    }

    public void setTradeNum(String tradeNum) {
        this.tradeNum = tradeNum;
    }

    public String getProductUrl() {
        return productUrl;
    }

    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    public String getShopName() {
        return shopName;
    }

    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    public String getEcName() {
        return ecName;
    }

    public void setEcName(String ecName) {
        this.ecName = ecName;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }
}

// ===== Constants.java =====
package com.iteye.injavawetrust.jdvsdd;

/**
 * URL fragments used to build the JD and DangDang search requests.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** JD search URL prefix; the keyword is appended. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** JD query parameter declaring the keyword encoding. */
    public static final String JDENC = "&enc=utf-8";

    /** JD paging parameter; the page number is appended. */
    public static final String JDPAGE = "&page=";

    /** DangDang search URL prefix; the keyword is appended. */
    public static final String DDURL = "http://search.dangdang.com/?key=";

    /** DangDang "act" query parameter. */
    public static final String ACT = "&act=input";

    /** DangDang paging parameter; the page number is appended. */
    public static final String DDPAGE = "&page_index=";
}

// ===== PriceCheckUtil.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;

/**
 * Singleton with the string helpers used by the price comparison.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckUtil {

    private static final PriceCheckUtil instance = new PriceCheckUtil();

    private PriceCheckUtil() {
    }

    public static PriceCheckUtil getInstance() {
        return instance;
    }

    /**
     * Re-interprets the UTF-8 bytes of the product name as GBK text.
     *
     * <p>NOTE(review): decoding UTF-8 bytes as GBK yields a different (garbled)
     * string rather than a URL-safe encoding of the keyword. The behaviour is kept
     * unchanged here because callers build request URLs from it; consider
     * percent-encoding the keyword instead and verify against the target site.
     *
     * @param productName product name
     * @return the re-decoded string, or {@code ""} if an encoding is unsupported
     */
    public String getGbk(String productName) {
        String retGbk = "";
        try {
            retGbk = new String(productName.getBytes("UTF-8"), "GBK");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retGbk;
    }

    /**
     * Finds the entry of {@code list} whose product name is most similar to
     * {@code productName}, as measured by {@link #sim(String, String)}.
     * When several entries share the highest similarity, the last one wins.
     *
     * @param productName name to compare against
     * @param list        candidates
     * @return the most similar candidate, or {@code null} when {@code list} is
     *         {@code null} or empty
     */
    public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
        if (list == null || list.isEmpty()) {
            // Nothing to compare against (e.g. the crawl returned no products).
            return null;
        }
        int bestIndex = 0;
        double bestScore = 0.0;
        // Every candidate is scored, including the last one.
        for (int i = 0; i < list.size(); i++) {
            double score = sim(productName, list.get(i).getProductName());
            // ">=" keeps the "last best match wins" tie-breaking.
            if (score >= bestScore) {
                bestScore = score;
                bestIndex = i;
            }
        }
        return list.get(bestIndex);
    }

    /**
     * Returns the smallest of three numbers.
     *
     * @param one   first value
     * @param two   second value
     * @param three third value
     * @return the minimum of the three
     */
    public int min(int one, int two, int three) {
        return Math.min(one, Math.min(two, three));
    }

    /**
     * Computes the Levenshtein distance (LD) between two strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the edit distance between {@code str1} and {@code str2}
     */
    public int ld(String str1, String str2) {
        int n = str1.length();
        int m = str2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) { // first column
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) { // first row
            d[0][j] = j;
        }

        for (int i = 1; i <= n; i++) {
            char ch1 = str1.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                char ch2 = str2.charAt(j - 1);
                // Substitution cost: 0 when the characters match, otherwise 1.
                int cost = (ch1 == ch2) ? 0 : 1;
                // Cheapest of: cell above + 1, cell to the left + 1, diagonal + cost.
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
            }
        }
        return d[n][m];
    }

    /**
     * Computes a similarity score as one minus the Levenshtein distance divided
     * by the length of the longer string.
     *
     * @param str1 first string
     * @param str2 second string
     * @return 1.0 for equal strings (including two empty strings), lower values
     *         for larger edit distances
     */
    public double sim(String str1, String str2) {
        int longest = Math.max(str1.length(), str2.length());
        if (longest == 0) {
            // Two empty strings are identical; avoids 0/0 producing NaN.
            return 1.0;
        }
        return 1 - (double) ld(str1, str2) / longest;
    }

    /**
     * Formats a millisecond count as HH:mm:ss.
     *
     * @param ms milliseconds
     * @return the value formatted with the pattern {@code HH:mm:ss} in GMT
     */
    public String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
        String ss = formatter.format(ms);
        return ss;
    }
}

// ===== PriceCheckMain.java =====
package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Command-line entry point: pairs every JD product with the most similar
 * DangDang product and prints both names and prices.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {

    /** Printed in place of a DangDang name/price when no DangDang product matched. */
    private static final String NO_MATCH = "N/A";

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * Builds the first-page search URLs for both sites and runs the comparison.
     *
     * @param productName search keyword
     * @return one map per JD product; see {@link #getProductFromUrls}
     */
    public List<Map<String, ProductInfo>> getProductList(String productName) {
        String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
        String ddUrl = Constants.DDURL + productName;
        return getProductFromUrls(jdUrl, ddUrl, productName);
    }

    /**
     * Crawls both sites and pairs each JD product with its most similar
     * DangDang product.
     *
     * @param jdUrl       JD first-page URL
     * @param ddUrl       DangDang first-page URL
     * @param productName search keyword
     * @return one map per JD product with key {@code "JD"} (the JD product) and
     *         key {@code "DD"} (the best DangDang match, or {@code null} when the
     *         DangDang crawl returned no products)
     */
    public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String ddUrl, String productName) {
        List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String, ProductInfo>>();
        List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
        List<ProductInfo> ddProductList = new DDProductList(ddUrl, productName).getProductList();
        for (ProductInfo jdProduct : jdProductList) {
            Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
            map.put("JD", jdProduct);
            map.put("DD", pcu.getSimilarity(jdProduct.getProductName(), ddProductList));
            retListMap.add(map);
        }
        return retListMap;
    }

    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner scanner = new Scanner(System.in);
        String productName = scanner.next();
        scanner.close();
        System.out.println("京东和当当[" + productName + "]商品比价开始。。。。。。");
        try {
            long starTime = System.currentTimeMillis();
            List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> map : list) {
                ProductInfo jd = map.get("JD");
                ProductInfo dd = map.get("DD"); // null when nothing matched
                String jdName = jd.getProductName();
                String jdPrice = jd.getProductPrice();
                String ddName = (dd == null) ? NO_MATCH : dd.getProductName();
                String ddPrice = (dd == null) ? NO_MATCH : dd.getProductPrice();
                System.out.println("[" + jdName + "] [" + ddName + "]");
                System.out.println("[" + jdPrice + "] [" + ddPrice + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("用时 [" + pcu.msToss(endTime - starTime) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }
}
运行结果:
输入商品名称:
铅笔
京东和当当[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
..............................
DD Product 第[1]页
http://search.dangdang.com/?key=铅笔
DD Product 第[2]页
http://search.dangdang.com/?key=铅笔&act=input&page_index=2
DD Product 第[3]页
...................................
[得力(deli) S908 木世界系列六角笔杆原木HB铅笔/素描绘图学生铅笔 50支/桶] [ 【开学必备文具】正品 得力文具S907/s908原木HB/2B铅笔绘图素描儿童学生铅笔50支装 ]
[18.00] [16.80]
-----------------------------------------------------------
[得力(deli) 7084 安全考试专用填涂答题卡2B木质铅笔/学生铅笔 12支/盒] [ 得力文具(deli) 0641 削笔器 削笔刀 卷笔刀 削笔机学习用品手摇转笔刀笔刨文具 ]
[10.00] [13.50]
-----------------------------------------------------------
[得力(deli)7083 安全石墨铅芯素描 绘图HB铅笔/学生铅笔 12支/盒] [ 六一儿童节礼物!三菱 绘图HB/2B铅笔12支装 ]
[9.00] [35.00]
-----------------------------------------------------------
[辉柏嘉(Faber-castell)114468 水溶性彩色铅笔 水溶彩铅 48色套装(赠毛笔+笔刨)] [ 德国Faber-castell辉柏嘉三角杆学生铅笔 儿童铅笔 HB 2H 2B 12支 ]
[109.00] [12.00]
................................................
[中华6725桶装彩色铅笔36色24色18色12色原木三角杆彩色铅笔 美术绘画涂鸦涂色彩铅 12色] [ 晨光彩色铅笔36色 24色绘图涂鸦 桶装绘画彩铅彩笔 儿童绘画 ]
[8.80] [12.00]
-----------------------------------------------------------
用时 [00:00:10]