java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS
由于淘宝的页面采用了独特的 KISSY JavaScript 组件，鼠标右键查看源代码时看到的并不是 jsoup 能直接解析的 DOM，jsoup 不能直接通过选择器处理标签。之前用 HtmlUnit + jsoup 组合处理，但是 HtmlUnit 在处理时间上不是很理想。通过查找资料，发现了一个比 HtmlUnit 好一点的工具 PhantomJS。PhantomJS 是一个基于 WebKit 的服务器端 JavaScript API，它全面支持 Web 而不需要浏览器支持，速度快，并且原生支持各种 Web 标准：DOM 处理、CSS 选择器、JSON、Canvas 和 SVG。PhantomJS 可以用于页面自动化、网络监测、网页截屏以及无界面测试等。
/* * filename getHtml.js * phantomjs.exe 2.0.0 * author InJavaWeTrust */ var system = require('system'); var address = ''; if (system.args.length != 2) { console.log('Try to pass two args when invoking this script!'); phantom.exit(); } else { address = system.args[1]; } var page = require('webpage').create(); var url = address; phantom.outputEncoding = 'GBK'; page.open(url, function (status) { if (status !== 'success') { console.log('Failed to get the page!'); } else { console.log(page.content); } phantom.exit(); }); package com.iteye.injavawetrust.phantomjs; import java.util.List; /** * * @author InJavaWeTrust * */ public interface ProductList { /** * 爬取商品列表 * @return */ public List<ProductInfo> getProductList(); } package com.iteye.injavawetrust.phantomjs; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * * @author InJavaWeTrust * */ public class TBProductList implements ProductList{ private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); private String tbUrl; private String productName; public TBProductList(String tbUrl, String productName) { this.tbUrl = tbUrl; this.productName = productName; } @Override public List<ProductInfo> getProductList() { List<ProductInfo> tbProductList = new ArrayList<ProductInfo>(); ProductInfo productInfo = null; String url = ""; int page = 0; for(int i = 0; i < 10; i++){ try { System.out.println("TB Product 第[" + (i + 1) + "]页"); if(i == 0){ url = tbUrl; }else{ page += 44; url = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + page; } System.out.println(url); Document doc = Jsoup.parse(pcu.getHtmlByPhantomjs(url)); Elements itemlist = doc.select("div[class=m-itemlist]"); Iterator<Element> it = itemlist.iterator(); while(it.hasNext()){ Element item = it.next(); Elements items = item.select("div[data-category=auctions]"); Iterator<Element> one = 
items.iterator(); while(one.hasNext()){ Element e = one.next(); Elements price = e.select("div[class=price g_price g_price-highlight]>strong"); String productPrice = price.text(); Elements title = e.select("div[class=row row-2 title]>a"); String productName = title.text(); productInfo = new ProductInfo(); productInfo.setProductName(productName); productInfo.setProductPrice(productPrice); tbProductList.add(productInfo); } } } catch(Exception e) { System.out.println("Get TB product has error"); System.out.println(e.getMessage()); } } return tbProductList; } public static void main(String[] args) { try{ String productName = "铅笔"; String tbUrl = Constants.TBURL + pcu.getUrlCode(productName); List<ProductInfo> list = new TBProductList(tbUrl, productName).getProductList(); for(ProductInfo pi : list){ System.out.println("[" + pi.getProductName() + "] [" + pi.getProductPrice() + "]"); } }catch(Exception e){ e.printStackTrace(); } } } package com.iteye.injavawetrust.phantomjs; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * * @author InJavaWeTrust * */ public class JDProductList implements ProductList{ private String jdUrl; private String productName; private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); public JDProductList(String jdUrl, String productName){ this.jdUrl = jdUrl; this.productName = productName; } @Override public List<ProductInfo> getProductList() { List<ProductInfo> jdProductList = new ArrayList<ProductInfo>(); ProductInfo productInfo = null; String url = ""; for(int i = 0; i < 10; i++){ try { System.out.println("JD Product 第[" + (i + 1) + "]页"); if(i == 0) { url = jdUrl; }else{ url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + (i + 1); } System.out.println(url); Document document = Jsoup.connect(url).timeout(5000).get(); Elements uls = 
document.select("ul[class=gl-warp clearfix]"); Iterator<Element> ulIter = uls.iterator(); while(ulIter.hasNext()) { Element ul = ulIter.next(); Elements lis = ul.select("li[data-sku]"); Iterator<Element> liIter = lis.iterator(); while(liIter.hasNext()) { Element li = liIter.next(); Element div = li.select("div[class=gl-i-wrap]").first(); Elements title = div.select("div[class=p-name p-name-type-2]>a"); String productName = title.attr("title"); //得到商品名称 Elements price = div.select(".p-price>strong"); String productPrice =price.attr("data-price"); //得到商品价格 productInfo = new ProductInfo(); productInfo.setProductName(productName); productInfo.setProductPrice(productPrice); jdProductList.add(productInfo); } } } catch(Exception e) { System.out.println("Get JD product has error [" + url + "]"); System.out.println(e.getMessage()); } } return jdProductList; } public static void main(String[] args) { try { String productName = "书包"; String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC; List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList(); System.out.println(list.size()); for(ProductInfo pi : list){ System.out.println(pi.getProductName() + " " + pi.getProductPrice()); } } catch (Exception e) { e.printStackTrace(); } } } package com.iteye.injavawetrust.phantomjs; /** * * @author InJavaWeTrust * */ public class Constants { /** * JDURL */ public static String JDURL = "http://search.jd.com/Search?keyword="; /** * JD汉字编码格式 */ public static String JDENC = "&enc=utf-8"; /** * JD分页 */ public static String JDPAGE ="&page="; /** * TBURL */ public static String TBURL = "https://s.taobao.com/search?q="; /** * 淘宝分页 */ public static String TBPAGE = "&s="; /** * 超时时间 */ public static int TIMEOUT = 50000; /** * 获取页面script */ public static String SCRIPT = "E:\\InJavaWeTrust\\js\\getHtml.js "; /** * phantomjs.exe path */ public static String PHANTOMJSPATH = "D:\\Program Files\\phantomjs\\bin\\phantomjs.exe "; } package 
com.iteye.injavawetrust.phantomjs; import java.io.Serializable; import java.util.Date; /** * * @author InJavaWeTrust * */ public class ProductInfo implements Serializable{ private static final long serialVersionUID = 8179244535272774089L; /** * 商品ID */ private String productid; /** * 商品名称 */ private String productName; /** * 商品价格 */ private String productPrice; /** * 月销售笔数 */ private String tradeNum; /** * 商品URL */ private String productUrl; /** * 商品网店名称 */ private String shopName; /** * 电商名称 */ private String ecName; /** * 爬取入库日期 */ private Date date; public String getProductid() { return productid; } public void setProductid(String productid) { this.productid = productid; } public String getProductName() { return productName; } public void setProductName(String productName) { this.productName = productName; } public String getProductPrice() { return productPrice; } public void setProductPrice(String productPrice) { this.productPrice = productPrice; } public String getTradeNum() { return tradeNum; } public void setTradeNum(String tradeNum) { this.tradeNum = tradeNum; } public String getProductUrl() { return productUrl; } public void setProductUrl(String productUrl) { this.productUrl = productUrl; } public String getShopName() { return shopName; } public void setShopName(String shopName) { this.shopName = shopName; } public String getEcName() { return ecName; } public void setEcName(String ecName) { this.ecName = ecName; } public Date getDate() { return date; } public void setDate(Date date) { this.date = date; } } package com.iteye.injavawetrust.phantomjs; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.List; import java.util.TimeZone; import org.apache.commons.logging.LogFactory; import com.gargoylesoftware.htmlunit.BrowserVersion; import 
com.gargoylesoftware.htmlunit.HttpMethod; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; /** * * @author InJavaWeTrust * */ public class PriceCheckUtil { private PriceCheckUtil() { } private static final PriceCheckUtil instance = new PriceCheckUtil(); public static PriceCheckUtil getInstance() { return instance; } /** * 商品汉字转码 * @param productName 商品名称 * @return */ public String getGbk(String productName){ String retGbk = ""; try { retGbk = new String(productName.getBytes("UTF-8"), "GBK"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return retGbk; } /** * 对淘宝浏览器汉字进行转换 * @param productName 商品名称 * @return */ public String getUrlCode(String productName){ String retUrlCode = ""; try { retUrlCode = URLEncoder.encode(productName, "utf8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return retUrlCode; } /** * 从列表list中找到与productName相似度最高的ProductInfo * * @param productName * @param list * @return 相似度最高的productName */ public ProductInfo getSimilarity(String productName, List<ProductInfo> list) { ProductInfo productInfo = null; /** * 找到list中所有的productName与字符串productName的相似度,保存在lens数组中 */ double lens[] = new double[list.size()]; for (int i = 0; i < list.size() - 1; i++) { lens[i] = sim(productName, list.get(i).getProductName()); } /** * 遍历出最大的相似度maxLen */ double maxLen = 0.0; for (int i = 0; i < lens.length; i++) { if (maxLen < lens[i]) { maxLen = lens[i]; } } /** * 遍历出最大的相似度的索引maxLenIndex */ int maxLenIndex = 0; for (int i = 0; i < lens.length; i++) { if (maxLen == lens[i]) { maxLenIndex = i; } } productInfo = list.get(maxLenIndex); return productInfo; } /** * 求三个数中最小的一个 * @param one * @param two * @param three * @return */ public int min(int one, int two, int three) { int min = one; if(two < min) { min = two; } if(three < min) { min = three; } return min; } 
/** * 计算矢量距离 * Levenshtein Distance(LD) * @param str1 * @param str2 * @return */ public int ld(String str1, String str2) { int d[][]; //矩阵 int n = str1.length(); int m = str2.length(); int i; //遍历str1的 int j; //遍历str2的 char ch1; //str1的 char ch2; //str2的 int temp; //记录相同字符,在某个矩阵位置值的增量,不是0就是1 if(n == 0) { return m; } if(m == 0) { return n; } d = new int[n+1][m+1]; for(i=0; i<=n; i++) { //初始化第一列 d[i][0] = i; } for(j=0; j<=m; j++) { //初始化第一行 d[0][j] = j; } for(i=1; i<=n; i++) { //遍历str1 ch1 = str1.charAt(i-1); //去匹配str2 for(j=1; j<=m; j++) { ch2 = str2.charAt(j-1); if(ch1 == ch2) { temp = 0; } else { temp = 1; } //左边+1,上边+1, 左上角+temp取最小 d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp); } } return d[n][m]; } /** * 计算相似度 * @param str1 * @param str2 * @return */ public double sim(String str1, String str2) { int ld = ld(str1, str2); return 1 - (double) ld / Math.max(str1.length(), str2.length()); } /** * 毫秒转换成hhmmss * @param ms 毫秒 * @return hh:mm:ss */ public String msToss(long ms) { SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss"); formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00")); String ss = formatter.format(ms); return ss; } /** * 禁止htmlunit日志输出 */ public void offLog(){ LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); } /** * 获取淘宝数据 * @param url * @return * @throws Exception */ public String getXmlByHtmlunit(String url) throws Exception { offLog(); String ret = ""; WebClient webClient = new WebClient(BrowserVersion.CHROME); // 1 启动JS webClient.getOptions().setJavaScriptEnabled(true); // 2 禁用Css,可避免自动二次请求CSS进行渲染 webClient.getOptions().setCssEnabled(false); // 3 启动客户端重定向 webClient.getOptions().setRedirectEnabled(true); // 4 JS运行错误时,是否抛出异常 webClient.getOptions().setThrowExceptionOnScriptError(false); // 5AJAX support webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // 6 设置超时 webClient.getOptions().setTimeout(Constants.TIMEOUT); WebRequest webRequest = new 
WebRequest(new URL(url)); webRequest.setHttpMethod(HttpMethod.GET); HtmlPage page = webClient.getPage(webRequest); webClient.waitForBackgroundJavaScript(10000); ret = page.asXml(); webClient.close(); return ret; } /** * 通过Phantomjs得到html页面 * @param url * @return */ public String getHtmlByPhantomjs(String url) { StringBuilder html = new StringBuilder(); try { Runtime rt = Runtime.getRuntime(); Process p = rt.exec(Constants.PHANTOMJSPATH + Constants.SCRIPT + url); InputStream is = p.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); String tmp = ""; while ((tmp = br.readLine()) != null) { html.append(tmp); } } catch (IOException e) { e.printStackTrace(); } return html.toString(); } } package com.iteye.injavawetrust.phantomjs; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; /** * * @author InJavaWeTrust * */ public class PriceCheckMain { private static PriceCheckUtil pcu = PriceCheckUtil.getInstance(); public List<Map<String, ProductInfo>> getProductList(String productName) { String jdUrl = Constants.JDURL + productName + Constants.JDENC; String tbUrl = Constants.TBURL + productName; return getProductFromUrls(jdUrl, tbUrl, productName); } public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) { List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String,ProductInfo>>(); List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList(); List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList(); for(int i = 0; i < jdProductList.size(); i++){ String jdProductName = jdProductList.get(i).getProductName(); Map<String, ProductInfo> map = new HashMap<String, ProductInfo>(); map.put("JD", jdProductList.get(i)); ProductInfo tbProduct = pcu.getSimilarity(jdProductName, tbProductList); map.put("TB", tbProduct); retListMap.add(map); } return retListMap; } 
// PriceCheckMain: interactive command-line entry point.

    /**
     * Reads a product name from stdin, runs the comparison and prints, for
     * every pair, the two names, the two prices and a separator line, followed
     * by the elapsed time. Any exception raised during the comparison is
     * reported as "error" plus its message.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner input = new Scanner(System.in);
        String productName = input.next();
        input.close();
        System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");
        try {
            long begin = System.currentTimeMillis();
            List<Map<String, ProductInfo>> pairs = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> pair : pairs) {
                // Read all four values first so nothing is printed for a pair
                // that turns out to be incomplete.
                String jdName = pair.get("JD").getProductName();
                String jdPrice = pair.get("JD").getProductPrice();
                String tbName = pair.get("TB").getProductName();
                String tbPrice = pair.get("TB").getProductPrice();
                System.out.println("[" + jdName + "] [" + tbName + "]");
                System.out.println("[" + jdPrice + "] [" + tbPrice + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long elapsed = System.currentTimeMillis() - begin;
            System.out.println("用时 [" + pcu.msToss(elapsed) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }

}
运行结果:
输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
。。。。。。。。。。。。
TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44
。。。。。。。。。。。。。。。。。
[马可9002铅笔 马克三角铅 笔易握正姿木杆 安全无毒2H HB 2B HB HB] [马可9001铅笔 三角形杆橡皮头 学生写字铅笔 HB 2B 满28元包邮]
[12.00] [8.96]
-----------------------------------------------------------
用时 [00:01:35]