java jsoup 网络爬虫 学习例子(七)京东和淘宝商品比价 htmlunit
package com.iteye.injavawetrust.pricecheck; import java.util.List; /** * * @author InJavaWeTrust * */ public interface ProductList { /** * 爬取商品列表 * @return */ public ListgetProductList(); } package com.iteye.injavawetrust.pricecheck; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * * @author InJavaWeTrust * */ public class JDProductList implements ProductList{ private String jdUrl; private String productName; private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); public JDProductList(String jdUrl, String productName){ this.jdUrl = jdUrl; this.productName = productName; } @Override public List getProductList() { List jdProductList = new ArrayList (); ProductInfo productInfo = null; String url = ""; for(int i = 0; i < 10; i++){ try { System.out.println("JD Product 第[" + (i + 1) + "]页"); if(i == 0) { url = jdUrl; }else{ url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + (i + 1); } System.out.println(url); Document document = Jsoup.connect(url).timeout(5000).get(); Elements uls = document.select("ul[class=gl-warp clearfix]"); Iterator ulIter = uls.iterator(); while(ulIter.hasNext()) { Element ul = ulIter.next(); Elements lis = ul.select("li[data-sku]"); Iterator liIter = lis.iterator(); while(liIter.hasNext()) { Element li = liIter.next(); Element div = li.select("div[class=gl-i-wrap]").first(); Elements title = div.select("div[class=p-name p-name-type-2]>a"); String productName = title.attr("title"); //得到商品名称 Elements price = div.select(".p-price>strong"); String productPrice =price.attr("data-price"); //得到商品价格 productInfo = new ProductInfo(); productInfo.setProductName(productName); productInfo.setProductPrice(productPrice); jdProductList.add(productInfo); } } } catch(Exception e) { System.out.println("Get JD product has error [" + url + "]"); System.out.println(e.getMessage()); } } return jdProductList; } public static void main(String[] args) { try { String productName = "书包"; String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC; List list = new JDProductList(jdUrl, productName).getProductList(); System.out.println(list.size()); for(ProductInfo pi : list){ System.out.println(pi.getProductName() + " " + pi.getProductPrice()); } } catch (Exception e) { e.printStackTrace(); } } } package com.iteye.injavawetrust.pricecheck; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * * @author InJavaWeTrust * */ public class TBProductList implements ProductList{ private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); private String tbUrl; private String productName; public TBProductList(String tbUrl, String productName) { this.tbUrl = tbUrl; this.productName = productName; } @Override public List getProductList() { List tbProductList = new ArrayList (); ProductInfo productInfo = null; String url = ""; int page = 0; for(int i = 0; i < 10; i++){ try { System.out.println("TB Product 第[" + (i + 1) + "]页"); if(i == 0){ url = tbUrl; }else{ page += 44; url = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + page; } System.out.println(url); Document doc = Jsoup.parse(pcu.getXmlByHtmlunit(url)); Elements itemlist = doc.select("div[class=m-itemlist]"); Iterator it = itemlist.iterator(); while(it.hasNext()){ Element item = it.next(); Elements items = item.select("div[data-category=auctions]"); System.out.println(items.size()); Iterator one = items.iterator(); while(one.hasNext()){ Element e = one.next(); Elements price = e.select("div[class=price g_price g_price-highlight]>strong"); String productPrice = price.text(); Elements title = e.select("div[class=row row-2 title]>a"); String productName = title.text(); productInfo = new ProductInfo(); productInfo.setProductName(productName); productInfo.setProductPrice(productPrice); tbProductList.add(productInfo); } } } catch(Exception e) { System.out.println("Get TB product has error"); System.out.println(e.getMessage()); } } return tbProductList; } public static void main(String[] args) { try{ String productName = "铅笔"; String tbUrl = Constants.TBURL + pcu.getUrlCode(productName); List list = new TBProductList(tbUrl, productName).getProductList(); for(ProductInfo pi : list){ System.out.println("[" + pi.getProductName() + "] [" + pi.getProductPrice() + "]"); } }catch(Exception e){ e.printStackTrace(); } } } package com.iteye.injavawetrust.pricecheck; import java.io.Serializable; import java.util.Date; /** * * @author InJavaWeTrust * */ public class ProductInfo implements Serializable{ private static final long serialVersionUID = 8179244535272774089L; /** * 商品ID */ private String productid; /** * 商品名称 */ private String productName; /** * 商品价格 */ private String productPrice; /** * 月销售笔数 */ private String tradeNum; /** * 商品URL */ private String productUrl; /** * 商品网店名称 */ private String shopName; /** * 电商名称 */ private String ecName; /** * 爬取入库日期 */ private Date date; public String getProductid() { return productid; } public void setProductid(String productid) { this.productid = productid; } public String getProductName() { return productName; } public void setProductName(String productName) { this.productName = productName; } public String getProductPrice() { return productPrice; } public void setProductPrice(String productPrice) { this.productPrice = productPrice; } public String getTradeNum() { return tradeNum; } public void setTradeNum(String tradeNum) { this.tradeNum = tradeNum; } public String getProductUrl() { return productUrl; } public void setProductUrl(String productUrl) { this.productUrl = productUrl; } public String getShopName() { return shopName; } public void setShopName(String shopName) { this.shopName = shopName; } public String getEcName() { return ecName; } public void setEcName(String ecName) { this.ecName = ecName; } public Date getDate() { return date; } public void setDate(Date date) { this.date = date; } } package com.iteye.injavawetrust.pricecheck; /** * * @author InJavaWeTrust * */ public class Constants { /** * JDURL */ public static String JDURL = "http://search.jd.com/Search?keyword="; /** * JD汉字编码格式 */ public static String JDENC = "&enc=utf-8"; /** * JD分页 */ public static String JDPAGE ="&page="; /** * TBURL */ public static String TBURL = "https://s.taobao.com/search?q="; /** * 淘宝分页 */ public static String TBPAGE = "&s="; /** * 超时时间 */ public static int TIMEOUT = 50000; } package com.iteye.injavawetrust.pricecheck; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.List; import java.util.TimeZone; import org.apache.commons.logging.LogFactory; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.HttpMethod; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; /** * * @author InJavaWeTrust * */ public class PriceCheckUtil { private PriceCheckUtil() { } private static final PriceCheckUtil instance = new PriceCheckUtil(); public static PriceCheckUtil getInstance() { return instance; } /** * 商品汉字转码 * @param productName 商品名称 * @return */ public String getGbk(String productName){ String retGbk = ""; try { retGbk = new String(productName.getBytes("UTF-8"), "GBK"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return retGbk; } /** * 对淘宝浏览器汉字进行转换 * @param productName 商品名称 * @return */ public String getUrlCode(String productName){ String retUrlCode = ""; try { retUrlCode = URLEncoder.encode(productName, "utf8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return retUrlCode; } /** * 从列表list中找到与productName相似度最高的ProductInfo * * @param productName * @param list * @return 相似度最高的productName */ public ProductInfo getSimilarity(String productName, List list) { ProductInfo productInfo = null; /** * 找到list中所有的productName与字符串productName的相似度,保存在lens数组中 */ double lens[] = new double[list.size()]; for (int i = 0; i < list.size() - 1; i++) { lens[i] = sim(productName, list.get(i).getProductName()); } /** * 遍历出最大的相似度maxLen */ double maxLen = 0.0; for (int i = 0; i < lens.length; i++) { if (maxLen < lens[i]) { maxLen = lens[i]; } } /** * 遍历出最大的相似度的索引maxLenIndex */ int maxLenIndex = 0; for (int i = 0; i < lens.length; i++) { if (maxLen == lens[i]) { maxLenIndex = i; } } productInfo = list.get(maxLenIndex); return productInfo; } /** * 求三个数中最小的一个 * @param one * @param two * @param three * @return */ public int min(int one, int two, int three) { int min = one; if(two < min) { min = two; } if(three < min) { min = three; } return min; } /** * 计算矢量距离 * Levenshtein Distance(LD) * @param str1 * @param str2 * @return */ public int ld(String str1, String str2) { int d[][]; //矩阵 int n = str1.length(); int m = str2.length(); int i; //遍历str1的 int j; //遍历str2的 char ch1; //str1的 char ch2; //str2的 int temp; //记录相同字符,在某个矩阵位置值的增量,不是0就是1 if(n == 0) { return m; } if(m == 0) { return n; } d = new int[n+1][m+1]; for(i=0; i<=n; i++) { //初始化第一列 d[i][0] = i; } for(j=0; j<=m; j++) { //初始化第一行 d[0][j] = j; } for(i=1; i<=n; i++) { //遍历str1 ch1 = str1.charAt(i-1); //去匹配str2 for(j=1; j<=m; j++) { ch2 = str2.charAt(j-1); if(ch1 == ch2) { temp = 0; } else { temp = 1; } //左边+1,上边+1, 左上角+temp取最小 d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp); } } return d[n][m]; } /** * 计算相似度 * @param str1 * @param str2 * @return */ public double sim(String str1, String str2) { int ld = ld(str1, str2); return 1 - (double) ld / Math.max(str1.length(), str2.length()); } /** * 毫秒转换成hhmmss * @param ms 毫秒 * @return hh:mm:ss */ public String msToss(long ms) { SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss"); formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00")); String ss = formatter.format(ms); return ss; } /** * 禁止htmlunit日志输出 */ public void offLog(){ LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); } /** * 获取淘宝数据 * @param url * @return * @throws Exception */ public String getXmlByHtmlunit(String url) throws Exception { offLog(); String ret = ""; WebClient webClient = new WebClient(BrowserVersion.CHROME); // 1 启动JS webClient.getOptions().setJavaScriptEnabled(true); // 2 禁用Css,可避免自动二次请求CSS进行渲染 webClient.getOptions().setCssEnabled(false); // 3 启动客户端重定向 webClient.getOptions().setRedirectEnabled(true); // 4 JS运行错误时,是否抛出异常 webClient.getOptions().setThrowExceptionOnScriptError(false); // 5AJAX support webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // 6 设置超时 webClient.getOptions().setTimeout(Constants.TIMEOUT); WebRequest webRequest = new WebRequest(new URL(url)); webRequest.setHttpMethod(HttpMethod.GET); HtmlPage page = webClient.getPage(webRequest); webClient.waitForBackgroundJavaScript(10000); ret = page.asXml(); webClient.close(); return ret; } } package com.iteye.injavawetrust.pricecheck; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; /** * * @author InJavaWeTrust * */ public class PriceCheckMain { private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); public List
运行结果:
输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
。。。。。。。。。。。。。。。。。。。
TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
RHINO USAGE WARNING: Missed Context.javaToJS() conversion:
Rhino runtime detected object com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot read property "domainLookupEnd" from undefined (https://g.alicdn.com/??kissy/k/1.4.15/import-style-min.js,tb/tracker/1.0.19/index.js,/tb/tsrp/1.61.0/config.js#9) of class com.gargoylesoftware.htmlunit.ScriptException where it expected String, Number, Boolean or Scriptable instance. Please check your code for missing Context.javaToJS() call.
46
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44
RHINO USAGE WARNING: Missed Context.javaToJS() conversion:
Rhino runtime detected object com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot read property "domainLookupEnd" from undefined (https://g.alicdn.com/??kissy/k/1.4.15/import-style-min.js,tb/tracker/1.0.19/index.js,/tb/tsrp/1.61.0/config.js#9) of class com.gargoylesoftware.htmlunit.ScriptException where it expected String, Number, Boolean or Scriptable instance. Please check your code for missing Context.javaToJS() call.
。。。。。。。。。。。。。。。。
-----------------------------------------------------------
[施德楼(Staedtler)100 蓝杆画图绘图铅笔|素描铅笔 HB单支装] [原装进口德国Staedtler施德楼 铅笔 100蓝杆绘图 顶级绘画素描 铅笔]
[4.80] [3.60]
-----------------------------------------------------------
[施德楼(Staedtler) 铅芯|自动铅笔芯|替芯 250-12根/支0.9mm HB] [原装进口德国Staedtler施德楼 铅笔 100蓝杆绘图 顶级绘画素描 铅笔]
[12.00] [3.60]
-----------------------------------------------------------
用时 [00:02:40]