java jsoup 网络爬虫 学习例子(七)京东和淘宝商品比价 htmlunit
由于淘宝的页面采用了独特的 Kissy JavaScript 组件,鼠标右键查看源代码时看到的并不是 jsoup 能直接解析的 DOM,jsoup 无法直接通过选择器处理标签,所以解析标签时需要结合 htmlunit。htmlunit 是一款开源的 Java 页面分析工具,读取页面后,可以有效地使用 htmlunit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。这个没有界面的浏览器可以有效地分析出 DOM 的标签,并且运行页面上的 JS,以便得到一些需要执行 JS 才能得到的值。
package com.iteye.injavawetrust.pricecheck;

import java.util.List;

/**
 * A source of scraped product listings for one e-commerce site.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

    /**
     * Crawls the product list.
     *
     * @return the products that were found; implementations in this package
     *         return an empty list rather than null when nothing was scraped
     */
    public List<ProductInfo> getProductList();

}

package com.iteye.injavawetrust.pricecheck;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the first ten JD search result pages for a keyword using jsoup.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList {

    private String jdUrl;

    private String productName;

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * @param jdUrl       URL of the first result page
     * @param productName raw (unencoded) search keyword, used to build the
     *                    URLs of pages 2 to 10
     */
    public JDProductList(String jdUrl, String productName) {
        this.jdUrl = jdUrl;
        this.productName = productName;
    }

    /**
     * Fetches ten result pages and collects name and price of every listed item.
     * A page that fails to load or parse is reported on stdout and skipped, so
     * the result may be partial.
     *
     * @return the scraped products, possibly empty, never null
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
        String url = "";
        for (int i = 0; i < 10; i++) {
            try {
                System.out.println("JD Product 第[" + (i + 1) + "]页");
                if (i == 0) {
                    url = jdUrl;
                } else {
                    // The URL declares enc=utf-8, so the keyword must be
                    // percent-encoded as UTF-8. The former getGbk() call turned
                    // it into mojibake and searched for the wrong keyword.
                    url = Constants.JDURL + pcu.getUrlCode(productName)
                            + Constants.JDENC + Constants.JDPAGE + (i + 1);
                }
                System.out.println(url);
                Document document = Jsoup.connect(url).timeout(5000).get();
                Elements uls = document.select("ul[class=gl-warp clearfix]");
                for (Element ul : uls) {
                    for (Element li : ul.select("li[data-sku]")) {
                        Element div = li.select("div[class=gl-i-wrap]").first();
                        if (div == null) {
                            // Skip malformed items instead of letting an NPE
                            // discard the remainder of the page.
                            continue;
                        }
                        Elements title = div.select("div[class=p-name p-name-type-2]>a");
                        String itemName = title.attr("title"); // product name
                        Elements price = div.select(".p-price>strong");
                        String itemPrice = price.attr("data-price"); // product price
                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(itemName);
                        productInfo.setProductPrice(itemPrice);
                        jdProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failed page and continue with the next one.
                System.out.println("Get JD product has error [" + url + "]");
                System.out.println(e.getMessage());
            }
        }
        return jdProductList;
    }

    public static void main(String[] args) {
        try {
            String productName = "书包";
            String jdUrl = Constants.JDURL + pcu.getUrlCode(productName) + Constants.JDENC;
            List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
            System.out.println(list.size());
            for (ProductInfo pi : list) {
                System.out.println(pi.getProductName() + " " + pi.getProductPrice());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

package com.iteye.injavawetrust.pricecheck;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the first ten Taobao search result pages for a keyword. The pages
 * are rendered with HtmlUnit (they need JavaScript) and the resulting XML is
 * parsed with jsoup.
 *
 * @author InJavaWeTrust
 */
public class TBProductList implements ProductList {

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    private String tbUrl;

    private String productName;

    /**
     * @param tbUrl       URL of the first result page
     * @param productName raw (unencoded) search keyword, used to build the
     *                    URLs of pages 2 to 10
     */
    public TBProductList(String tbUrl, String productName) {
        this.tbUrl = tbUrl;
        this.productName = productName;
    }

    /**
     * Fetches ten result pages and collects name and price of every listed item.
     * A page that fails to load or parse is reported on stdout and skipped, so
     * the result may be partial.
     *
     * @return the scraped products, possibly empty, never null
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> tbProductList = new ArrayList<ProductInfo>();
        String url = "";
        int page = 0; // value of the "s" paging parameter; advanced by 44 per page
        for (int i = 0; i < 10; i++) {
            try {
                System.out.println("TB Product 第[" + (i + 1) + "]页");
                if (i == 0) {
                    url = tbUrl;
                } else {
                    page += 44;
                    url = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + page;
                }
                System.out.println(url);
                Document doc = Jsoup.parse(pcu.getXmlByHtmlunit(url));
                Elements itemlist = doc.select("div[class=m-itemlist]");
                for (Element item : itemlist) {
                    Elements items = item.select("div[data-category=auctions]");
                    System.out.println(items.size());
                    for (Element auction : items) {
                        Elements price = auction.select("div[class=price g_price g_price-highlight]>strong");
                        String itemPrice = price.text();
                        Elements title = auction.select("div[class=row row-2 title]>a");
                        String itemName = title.text();
                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(itemName);
                        productInfo.setProductPrice(itemPrice);
                        tbProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failed page and continue with the next one.
                System.out.println("Get TB product has error");
                System.out.println(e.getMessage());
            }
        }
        return tbProductList;
    }

    public static void main(String[] args) {
        try {
            String productName = "铅笔";
            String tbUrl = Constants.TBURL + pcu.getUrlCode(productName);
            List<ProductInfo> list = new TBProductList(tbUrl, productName).getProductList();
            for (ProductInfo pi : list) {
                System.out.println("[" + pi.getProductName() + "] [" + pi.getProductPrice() + "]");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

package com.iteye.injavawetrust.pricecheck;

import java.io.Serializable;
import java.util.Date;

/**
 * Plain data holder for one scraped product.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable {

    private static final long serialVersionUID = 8179244535272774089L;

    /** Product ID. */
    private String productid;

    /** Product name. */
    private String productName;

    /** Product price, kept as the text scraped from the page. */
    private String productPrice;

    /** Number of sales per month. */
    private String tradeNum;

    /** Product URL. */
    private String productUrl;

    /** Name of the shop selling the product. */
    private String shopName;

    /** Name of the e-commerce site. */
    private String ecName;

    /** Date the product was crawled and stored. */
    private Date date;

    public String getProductid() {
        return productid;
    }

    public void setProductid(String productid) {
        this.productid = productid;
    }

    public String getProductName() {
        return productName;
    }

    public void setProductName(String productName) {
        this.productName = productName;
    }

    public String getProductPrice() {
        return productPrice;
    }

    public void setProductPrice(String productPrice) {
        this.productPrice = productPrice;
    }

    public String getTradeNum() {
        return tradeNum;
    }

    public void setTradeNum(String tradeNum) {
        this.tradeNum = tradeNum;
    }

    public String getProductUrl() {
        return productUrl;
    }

    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    public String getShopName() {
        return shopName;
    }

    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    public String getEcName() {
        return ecName;
    }

    public void setEcName(String ecName) {
        this.ecName = ecName;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }

}

package com.iteye.injavawetrust.pricecheck;

/**
 * URL fragments and settings shared by the crawlers. The fields are final so
 * that no caller can accidentally reassign them at runtime.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** JD search URL up to and including the keyword parameter name. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** JD query parameter declaring the keyword encoding. */
    public static final String JDENC = "&enc=utf-8";

    /** JD paging parameter. */
    public static final String JDPAGE = "&page=";

    /** Taobao search URL up to and including the keyword parameter name. */
    public static final String TBURL = "https://s.taobao.com/search?q=";

    /** Taobao paging parameter. */
    public static final String TBPAGE = "&s=";

    /** Timeout handed to HtmlUnit's {@code WebClientOptions.setTimeout}. */
    public static final int TIMEOUT = 50000;

}

package com.iteye.injavawetrust.pricecheck;

import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;

import org.apache.commons.logging.LogFactory;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Singleton with the helpers shared by the crawlers: keyword encoding, string
 * similarity (Levenshtein based), time formatting and HtmlUnit page rendering.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckUtil {

    private PriceCheckUtil() {
    }

    private static final PriceCheckUtil instance = new PriceCheckUtil();

    public static PriceCheckUtil getInstance() {
        return instance;
    }

    /**
     * Re-decodes the UTF-8 bytes of {@code productName} as GBK.
     *
     * @param productName product name
     * @return the re-decoded string, or "" if a charset is unsupported
     * @deprecated for non-ASCII input the result is garbled text, not a usable
     *             search keyword; use {@link #getUrlCode(String)} to build URLs.
     *             Behaviour is left unchanged for existing external callers.
     */
    @Deprecated
    public String getGbk(String productName) {
        String retGbk = "";
        try {
            retGbk = new String(productName.getBytes("UTF-8"), "GBK");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retGbk;
    }

    /**
     * Percent-encodes a search keyword as UTF-8 for use in a query string.
     *
     * @param productName product name
     * @return the encoded keyword, or "" if the charset is unsupported
     */
    public String getUrlCode(String productName) {
        String retUrlCode = "";
        try {
            retUrlCode = URLEncoder.encode(productName, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retUrlCode;
    }

    /**
     * Finds the entry of {@code list} whose product name is most similar to
     * {@code productName} according to {@link #sim(String, String)}.
     * <p>
     * Every entry is scored (the previous implementation never scored the last
     * one). Entries that are null or have no name are ignored. When several
     * entries share the best score the later one wins, as before.
     *
     * @param productName name to compare against
     * @param list        candidates
     * @return the best matching candidate, or null when {@code productName} is
     *         null or {@code list} is null, empty or holds no named entry
     */
    public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
        if (productName == null || list == null) {
            return null;
        }
        ProductInfo best = null;
        double bestScore = -1.0; // below any real score, so the first named entry is accepted
        for (ProductInfo candidate : list) {
            if (candidate == null || candidate.getProductName() == null) {
                continue;
            }
            double score = sim(productName, candidate.getProductName());
            if (score >= bestScore) {
                bestScore = score;
                best = candidate;
            }
        }
        return best;
    }

    /**
     * Returns the smallest of three numbers.
     *
     * @param one   first value
     * @param two   second value
     * @param three third value
     * @return the minimum
     */
    public int min(int one, int two, int three) {
        return Math.min(one, Math.min(two, three));
    }

    /**
     * Computes the Levenshtein distance (LD) between two strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the number of single-character insertions, deletions and
     *         substitutions needed to turn {@code str1} into {@code str2}
     */
    public int ld(String str1, String str2) {
        int n = str1.length();
        int m = str2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // d[i][j] = distance between the first i chars of str1 and the first j chars of str2
        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) { // first column
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) { // first row
            d[0][j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char ch1 = str1.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                char ch2 = str2.charAt(j - 1);
                int cost = (ch1 == ch2) ? 0 : 1; // substitution cost
                // minimum of: cell above + 1, cell to the left + 1, diagonal cell + cost
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
            }
        }
        return d[n][m];
    }

    /**
     * Computes a similarity derived from the Levenshtein distance: one minus
     * the distance divided by the length of the longer string.
     *
     * @param str1 first string
     * @param str2 second string
     * @return 1.0 for identical strings (including two empty strings, which
     *         used to yield NaN), decreasing as the strings differ more
     */
    public double sim(String str1, String str2) {
        int longest = Math.max(str1.length(), str2.length());
        if (longest == 0) {
            return 1.0; // both empty: identical, and avoids 0/0
        }
        return 1 - (double) ld(str1, str2) / longest;
    }

    /**
     * Formats a duration in milliseconds as HH:mm:ss. The value is formatted
     * as a GMT time of day, so the hour field wraps around after 24 hours.
     *
     * @param ms milliseconds
     * @return HH:mm:ss
     */
    public String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
        return formatter.format(ms);
    }

    /**
     * Switches commons-logging to the no-op logger to silence HtmlUnit.
     */
    public void offLog() {
        LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
                "org.apache.commons.logging.impl.NoOpLog");
    }

    /**
     * Loads a page with HtmlUnit, lets its JavaScript run and returns the
     * resulting DOM as XML. Used for the Taobao pages.
     *
     * @param url page URL
     * @return the rendered page as XML
     * @throws Exception if the URL is malformed or the page cannot be loaded
     */
    public String getXmlByHtmlunit(String url) throws Exception {
        offLog();
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // 1 enable JavaScript
            webClient.getOptions().setJavaScriptEnabled(true);
            // 2 disable CSS, which avoids the extra requests made for rendering
            webClient.getOptions().setCssEnabled(false);
            // 3 follow redirects
            webClient.getOptions().setRedirectEnabled(true);
            // 4 do not throw when a script on the page fails
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // 5 AJAX support
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            // 6 timeout
            webClient.getOptions().setTimeout(Constants.TIMEOUT);
            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.GET);
            HtmlPage page = webClient.getPage(webRequest);
            webClient.waitForBackgroundJavaScript(10000);
            return page.asXml();
        } finally {
            // Always release the client, also when loading or rendering fails.
            webClient.close();
        }
    }

}

package com.iteye.injavawetrust.pricecheck;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Command line entry point: scrapes JD and Taobao for a keyword and prints each
 * JD product next to the most similarly named Taobao product.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * Builds the first-page search URLs for both sites and runs the comparison.
     *
     * @param productName raw (unencoded) search keyword
     * @return one map per JD product, see
     *         {@link #getProductFromUrls(String, String, String)}
     */
    public List<Map<String, ProductInfo>> getProductList(String productName) {
        // Encode once so that characters such as '&', '#' or spaces in the
        // keyword cannot corrupt the query string.
        String keyword = pcu.getUrlCode(productName);
        String jdUrl = Constants.JDURL + keyword + Constants.JDENC;
        String tbUrl = Constants.TBURL + keyword;
        return getProductFromUrls(jdUrl, tbUrl, productName);
    }

    /**
     * Scrapes both sites and pairs every JD product with the Taobao product
     * whose name is most similar.
     *
     * @param jdUrl       URL of the first JD result page
     * @param tbUrl       URL of the first Taobao result page
     * @param productName raw (unencoded) search keyword
     * @return one map per JD product with key "JD" and, when a Taobao match
     *         exists, key "TB"
     */
    public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) {
        List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String, ProductInfo>>();
        List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
        List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList();
        for (ProductInfo jdProduct : jdProductList) {
            Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
            map.put("JD", jdProduct);
            ProductInfo tbProduct = pcu.getSimilarity(jdProduct.getProductName(), tbProductList);
            if (tbProduct != null) {
                map.put("TB", tbProduct);
            }
            retListMap.add(map);
        }
        return retListMap;
    }

    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner scanner = new Scanner(System.in);
        // Read the whole line so that multi-word product names are not truncated.
        String productName = scanner.nextLine().trim();
        scanner.close();
        System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");
        try {
            long startTime = System.currentTimeMillis();
            List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> map : list) {
                ProductInfo jd = map.get("JD");
                ProductInfo tb = map.get("TB"); // null when Taobao returned nothing
                String jdName = jd.getProductName();
                String jdPrice = jd.getProductPrice();
                String tbName = (tb == null) ? "N/A" : tb.getProductName();
                String tbPrice = (tb == null) ? "N/A" : tb.getProductPrice();
                System.out.println("[" + jdName + "] [" + tbName + "]");
                System.out.println("[" + jdPrice + "] [" + tbPrice + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("用时 [" + pcu.msToss(endTime - startTime) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }

}
运行结果:
输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
。。。。。。。。。。。。。。。。。。。
TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
RHINO USAGE WARNING: Missed Context.javaToJS() conversion:
Rhino runtime detected object com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot read property "domainLookupEnd" from undefined (https://g.alicdn.com/??kissy/k/1.4.15/import-style-min.js,tb/tracker/1.0.19/index.js,/tb/tsrp/1.61.0/config.js#9) of class com.gargoylesoftware.htmlunit.ScriptException where it expected String, Number, Boolean or Scriptable instance. Please check your code for missing Context.javaToJS() call.
46
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44
RHINO USAGE WARNING: Missed Context.javaToJS() conversion:
Rhino runtime detected object com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot read property "domainLookupEnd" from undefined (https://g.alicdn.com/??kissy/k/1.4.15/import-style-min.js,tb/tracker/1.0.19/index.js,/tb/tsrp/1.61.0/config.js#9) of class com.gargoylesoftware.htmlunit.ScriptException where it expected String, Number, Boolean or Scriptable instance. Please check your code for missing Context.javaToJS() call.
。。。。。。。。。。。。。。。。
-----------------------------------------------------------
[施德楼(Staedtler)100 蓝杆画图绘图铅笔|素描铅笔 HB单支装] [原装进口德国Staedtler施德楼 铅笔 100蓝杆绘图 顶级绘画素描 铅笔]
[4.80] [3.60]
-----------------------------------------------------------
[施德楼(Staedtler) 铅芯|自动铅笔芯|替芯 250-12根/支0.9mm HB] [原装进口德国Staedtler施德楼 铅笔 100蓝杆绘图 顶级绘画素描 铅笔]
[12.00] [3.60]
-----------------------------------------------------------
用时 [00:02:40]