java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

 

java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

 

      由于淘宝的页面采用了独特的 Kissy JavaScript 组件,通过鼠标右键查看源代码时看到的并不是 jsoup 能直接解析的 DOM,因此 jsoup 无法直接通过选择器处理标签。之前采用 htmlunit+jsoup 组合处理,但 htmlunit 的处理耗时不太理想。查找资料后发现了一个比 htmlunit 好一点的工具 PhantomJS。PhantomJS 是一个基于 WebKit 的服务器端 JavaScript API,它无需浏览器即可全面支持 Web,速度快,并原生支持各种 Web 标准:DOM 处理、CSS 选择器、JSON、Canvas 和 SVG。PhantomJS 可用于页面自动化、网络监测、网页截屏以及无界面测试等。

/*
 * filename getHtml.js
 * phantomjs.exe 2.0.0
 * author InJavaWeTrust
 */

// Entry point: phantomjs getHtml.js <url>
// Opens <url> in a headless page and prints the resulting page content to stdout.
var system = require('system');

// system.args[0] is the script name, so exactly one user-supplied argument (the URL) is expected.
if (system.args.length !== 2) {
	console.log('Usage: phantomjs getHtml.js <url>');
	phantom.exit(1);
} else {
	// Everything else lives in this branch so nothing runs after the exit request above.
	var address = system.args[1];
	var page = require('webpage').create();
	// stdout is emitted as GBK; whoever reads it has to decode it with the same charset.
	phantom.outputEncoding = 'GBK';
	page.open(address, function (status) {
		if (status !== 'success') {
			console.log('Failed to get the page!');
			// Non-zero status lets the calling process tell a failed load from a successful one.
			phantom.exit(1);
		} else {
			console.log(page.content);
			phantom.exit(0);
		}
	});
}


package com.iteye.injavawetrust.phantomjs;

import java.util.List;

/**
 * Contract implemented by each site-specific product crawler.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

	/**
	 * Crawls the product listing.
	 *
	 * @return the products collected by the implementation
	 */
	List<ProductInfo> getProductList();

}


package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects product names and prices from Taobao search result pages.
 * Pages are fetched through {@link PriceCheckUtil#getHtmlByPhantomjs(String)}
 * and then parsed with jsoup.
 *
 * @author InJavaWeTrust
 */
public class TBProductList implements ProductList{
	
	private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();
	
	/** URL used for the first result page. */
	private String tbUrl;
	
	/** Search keyword used to build the URLs of the following pages. */
	private String productName;
	
	/**
	 * @param tbUrl       URL of the first result page
	 * @param productName search keyword used for pages 2 to 10
	 */
	public TBProductList(String tbUrl, String productName) {
		this.tbUrl = tbUrl;
		this.productName = productName;
	}

	/**
	 * Walks ten result pages and gathers every listed item.
	 * A failure on one page is reported on stdout and the next page is tried.
	 *
	 * @return all items found; possibly empty, never {@code null}
	 */
	@Override
	public List<ProductInfo> getProductList() {
		List<ProductInfo> collected = new ArrayList<ProductInfo>();
		for (int pageIndex = 0; pageIndex < 10; pageIndex++) {
			try {
				System.out.println("TB Product 第[" + (pageIndex + 1) + "]页");
				String pageUrl;
				if (pageIndex > 0) {
					// The paging parameter is an item offset that grows by 44 with every page.
					int offset = pageIndex * 44;
					pageUrl = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + offset;
				} else {
					pageUrl = tbUrl;
				}
				System.out.println(pageUrl);
				Document doc = Jsoup.parse(pcu.getHtmlByPhantomjs(pageUrl));
				for (Element container : doc.select("div[class=m-itemlist]")) {
					for (Element auction : container.select("div[data-category=auctions]")) {
						String priceText = auction.select("div[class=price g_price g_price-highlight]>strong").text();
						String titleText = auction.select("div[class=row row-2 title]>a").text();
						ProductInfo info = new ProductInfo();
						info.setProductName(titleText);
						info.setProductPrice(priceText);
						collected.add(info);
					}
				}
			} catch(Exception e) {
				System.out.println("Get TB product has error");
				System.out.println(e.getMessage());
			}
		}
		return collected;
	}
	
	/**
	 * Manual smoke test: crawls a fixed keyword and prints every item.
	 */
	public static void main(String[] args) {
		try{
			String keyword = "铅笔";
			String firstPage = Constants.TBURL + pcu.getUrlCode(keyword);
			List<ProductInfo> found = new TBProductList(firstPage, keyword).getProductList();
			for(ProductInfo item : found){
				System.out.println("[" + item.getProductName() + "]  [" + item.getProductPrice() + "]");
			}
		}catch(Exception e){
			e.printStackTrace();
		}
	}

}


package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects product names and prices from JD search result pages using jsoup.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList{
	
	private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();
	
	/** URL used for the first result page. */
	private String jdUrl;
	
	/** Search keyword used to build the URLs of the following pages. */
	private String productName;
	
	/**
	 * @param jdUrl       URL of the first result page
	 * @param productName search keyword used for pages 2 to 10
	 */
	public JDProductList(String jdUrl, String productName){
		this.jdUrl = jdUrl;
		this.productName = productName;
	}

	/**
	 * Walks ten result pages and gathers every listed item.
	 * A failure on one page is reported on stdout and the next page is tried.
	 *
	 * @return all items found; possibly empty, never {@code null}
	 */
	@Override
	public List<ProductInfo> getProductList() {
		List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
		String url = "";
		for(int i = 0; i < 10; i++){
			try {
				System.out.println("JD Product 第[" + (i + 1) + "]页");
				if(i == 0) {
					url = jdUrl;
				}else{
					// The query declares enc=utf-8, so the keyword is percent-encoded as UTF-8.
					// Re-decoding its UTF-8 bytes as GBK (getGbk) garbles the keyword.
					url = Constants.JDURL + pcu.getUrlCode(productName) + Constants.JDENC + Constants.JDPAGE + (i + 1);
				}
				System.out.println(url);
				Document document = Jsoup.connect(url).timeout(5000).get();
				for (Element ul : document.select("ul[class=gl-warp clearfix]")) {
					for (Element li : ul.select("li[data-sku]")) {
						Element div = li.select("div[class=gl-i-wrap]").first();
						if (div == null) {
							// Skip entries without the expected wrapper instead of failing the whole page.
							continue;
						}
						Elements title = div.select("div[class=p-name p-name-type-2]>a");
						String itemName = title.attr("title"); // product name
						Elements price = div.select(".p-price>strong");
						String itemPrice = price.attr("data-price"); // product price
						ProductInfo productInfo = new ProductInfo();
						productInfo.setProductName(itemName);
						productInfo.setProductPrice(itemPrice);
						jdProductList.add(productInfo);
					}
				}
			} catch(Exception e) {
				System.out.println("Get JD product has error [" + url + "]");
				System.out.println(e.getMessage());
			}
		}
		return jdProductList;
	}
	
	/**
	 * Manual smoke test: crawls a fixed keyword and prints the item count and every item.
	 */
	public static void main(String[] args) {
		try {
			String productName = "书包";
			String jdUrl = Constants.JDURL + pcu.getUrlCode(productName) + Constants.JDENC;
			List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
			System.out.println(list.size());
			for(ProductInfo pi : list){
				System.out.println(pi.getProductName() + "  " + pi.getProductPrice());
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}


package com.iteye.injavawetrust.phantomjs;

/**
 * Shared configuration values for the JD/Taobao price comparison crawler.
 * All values are immutable; the class only holds constants and is not instantiable.
 *
 * @author InJavaWeTrust
 */
public class Constants {
	
	private Constants() {
		// constants holder, no instances
	}
	
	/**
	 * JD search URL prefix; the search keyword is appended to it.
	 */
	public static final String JDURL = "http://search.jd.com/Search?keyword=";
	/**
	 * JD query parameter declaring the keyword encoding.
	 */
	public static final String JDENC = "&enc=utf-8";
	/**
	 * JD paging parameter; the page number is appended to it.
	 */
	public static final String JDPAGE ="&page=";
	/**
	 * Taobao search URL prefix; the search keyword is appended to it.
	 */
	public static final String TBURL = "https://s.taobao.com/search?q=";
	/**
	 * Taobao paging parameter; the paging value is appended to it.
	 */
	public static final String TBPAGE = "&s=";
	/**
	 * Timeout value, passed to the HtmlUnit client options.
	 */
	public static final int TIMEOUT = 50000;
	/**
	 * Path of the page-fetching script. The trailing space is intentional:
	 * the value is concatenated into a command line.
	 */
	public static final String SCRIPT = "E:\\InJavaWeTrust\\js\\getHtml.js ";
	/**
	 * Path of phantomjs.exe. The trailing space is intentional:
	 * the value is concatenated into a command line.
	 */
	public static final String PHANTOMJSPATH = "D:\\Program Files\\phantomjs\\bin\\phantomjs.exe ";

}


package com.iteye.injavawetrust.phantomjs;

import java.io.Serializable;
import java.util.Date;

/**
 * Serializable value holder describing one crawled product.
 * Every property is a plain read/write field with no validation.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable{

	private static final long serialVersionUID = 8179244535272774089L;
	
	/** Product ID. */
	private String productid;
	
	public String getProductid() {
		return this.productid;
	}
	
	public void setProductid(String productid) {
		this.productid = productid;
	}
	
	/** Product name. */
	private String productName;
	
	public String getProductName() {
		return this.productName;
	}
	
	public void setProductName(String productName) {
		this.productName = productName;
	}
	
	/** Product price, kept as text. */
	private String productPrice;
	
	public String getProductPrice() {
		return this.productPrice;
	}
	
	public void setProductPrice(String productPrice) {
		this.productPrice = productPrice;
	}
	
	/** Number of sales per month. */
	private String tradeNum;
	
	public String getTradeNum() {
		return this.tradeNum;
	}
	
	public void setTradeNum(String tradeNum) {
		this.tradeNum = tradeNum;
	}
	
	/** Product URL. */
	private String productUrl;
	
	public String getProductUrl() {
		return this.productUrl;
	}
	
	public void setProductUrl(String productUrl) {
		this.productUrl = productUrl;
	}
	
	/** Name of the shop selling the product. */
	private String shopName;
	
	public String getShopName() {
		return this.shopName;
	}
	
	public void setShopName(String shopName) {
		this.shopName = shopName;
	}
	
	/** Name of the e-commerce site. */
	private String ecName;
	
	public String getEcName() {
		return this.ecName;
	}
	
	public void setEcName(String ecName) {
		this.ecName = ecName;
	}
	
	/** Date the record was crawled and stored. */
	private Date date;
	
	public Date getDate() {
		return this.date;
	}
	
	public void setDate(Date date) {
		this.date = date;
	}
	
}


package com.iteye.injavawetrust.phantomjs;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;

import org.apache.commons.logging.LogFactory;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * 
 * @author InJavaWeTrust
 *
 */
public class PriceCheckUtil {
	
	private PriceCheckUtil() {
		
	}
	
	private static final PriceCheckUtil instance = new PriceCheckUtil();
	
	public static PriceCheckUtil getInstance() {
		return instance;
	}
	
	
	/**
	 * 商品汉字转码
	 * @param productName 商品名称
	 * @return
	 */
	public String getGbk(String productName){
		String retGbk = "";
		try {
			retGbk = new String(productName.getBytes("UTF-8"), "GBK");
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return retGbk;
	}
	/**
	 * 对淘宝浏览器汉字进行转换
	 * @param productName 商品名称
	 * @return
	 */
	public String getUrlCode(String productName){
		String retUrlCode = "";
		try {
			retUrlCode = URLEncoder.encode(productName, "utf8");
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return retUrlCode;
	}
	
	/**
     * 从列表list中找到与productName相似度最高的ProductInfo
     *
     * @param productName
     * @param list
     * @return 相似度最高的productName
     */
	public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
		ProductInfo productInfo = null;
		/**
		 * 找到list中所有的productName与字符串productName的相似度,保存在lens数组中
		 */
		double lens[] = new double[list.size()];
		for (int i = 0; i < list.size() - 1; i++) {
			lens[i] = sim(productName, list.get(i).getProductName());
		}
		/**
		 * 遍历出最大的相似度maxLen
		 */
		double maxLen = 0.0;
		for (int i = 0; i < lens.length; i++) {
			if (maxLen < lens[i]) {
				maxLen = lens[i];
			}
		}
		/**
		 * 遍历出最大的相似度的索引maxLenIndex
		 */
		int maxLenIndex = 0;
		for (int i = 0; i < lens.length; i++) {
			if (maxLen == lens[i]) {
				maxLenIndex = i;
			}
		}
		productInfo = list.get(maxLenIndex);
		return productInfo;
	}
	
	/**
     * 求三个数中最小的一个
     * @param one
     * @param two
     * @param three
     * @return
     */
    public int min(int one, int two, int three) {
        int min = one;
        if(two < min) {
            min = two;
        }
        if(three < min) {
            min = three;
        }
        return min;
    }

    /**
     * 计算矢量距离
     * Levenshtein Distance(LD)
     * @param str1
     * @param str2
     * @return
     */
    public int ld(String str1, String str2) {
        int d[][];    //矩阵
        int n = str1.length();
        int m = str2.length();
        int i;    //遍历str1的
        int j;    //遍历str2的
        char ch1;    //str1的
        char ch2;    //str2的
        int temp;    //记录相同字符,在某个矩阵位置值的增量,不是0就是1
        if(n == 0) {
            return m;
        }
        if(m == 0) {
            return n;
        }
        d = new int[n+1][m+1];
        for(i=0; i<=n; i++) {    //初始化第一列
            d[i][0] = i;
        }
        for(j=0; j<=m; j++) {    //初始化第一行
            d[0][j] = j;
        }
        for(i=1; i<=n; i++) {    //遍历str1
            ch1 = str1.charAt(i-1);
            //去匹配str2
            for(j=1; j<=m; j++) {
                ch2 = str2.charAt(j-1);
                if(ch1 == ch2) {
                    temp = 0;
                } else {
                    temp = 1;
                }
                //左边+1,上边+1, 左上角+temp取最小
                d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp);
            }
        }
        return d[n][m];
    }

    /**
     * 计算相似度
     * @param str1
     * @param str2
     * @return
     */
	public double sim(String str1, String str2) {
		int ld = ld(str1, str2);
		return 1 - (double) ld / Math.max(str1.length(), str2.length());
	}
	
	/** 
     * 毫秒转换成hhmmss 
     * @param ms 毫秒 
     * @return hh:mm:ss 
     */  
    public String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");  
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));  
        String ss = formatter.format(ms);  
        return ss;  
    }
    
    /**
	 * 禁止htmlunit日志输出
	 */
	public void offLog(){
		LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
				"org.apache.commons.logging.impl.NoOpLog");
	}
	/**
	 * 获取淘宝数据
	 * @param url
	 * @return
	 * @throws Exception
	 */
	public String getXmlByHtmlunit(String url) throws Exception {
		offLog();
		String ret = "";
		WebClient webClient = new WebClient(BrowserVersion.CHROME);
		// 1 启动JS
		webClient.getOptions().setJavaScriptEnabled(true);
		// 2 禁用Css,可避免自动二次请求CSS进行渲染
		webClient.getOptions().setCssEnabled(false);
		// 3 启动客户端重定向
		webClient.getOptions().setRedirectEnabled(true);
		// 4 JS运行错误时,是否抛出异常
		webClient.getOptions().setThrowExceptionOnScriptError(false);
		// 5AJAX support
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
		// 6 设置超时
		webClient.getOptions().setTimeout(Constants.TIMEOUT);
		WebRequest webRequest = new WebRequest(new URL(url));
		webRequest.setHttpMethod(HttpMethod.GET);
		HtmlPage page = webClient.getPage(webRequest);
		webClient.waitForBackgroundJavaScript(10000);
		ret = page.asXml();
		webClient.close();
		return ret;
	}
	
	/**
	 * 通过Phantomjs得到html页面
	 * @param url
	 * @return
	 */
	public String getHtmlByPhantomjs(String url) {
		StringBuilder html = new StringBuilder();
		try {
			Runtime rt = Runtime.getRuntime();
			Process p = rt.exec(Constants.PHANTOMJSPATH + Constants.SCRIPT + url);
			InputStream is = p.getInputStream();
			BufferedReader br = new BufferedReader(new InputStreamReader(is));
			String tmp = "";
			while ((tmp = br.readLine()) != null) {
				html.append(tmp);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return html.toString();
	}

}


package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Command-line entry point that crawls JD and Taobao for a keyword and pairs
 * every JD item with the most similarly named Taobao item.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {
	
	private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();
	
	/**
	 * Builds the first-page search URLs for both sites and runs the comparison.
	 * The keyword is percent-encoded before it is put into the URLs.
	 *
	 * @param productName search keyword
	 * @return one map per JD item, see {@link #getProductFromUrls(String, String, String)}
	 */
	public List<Map<String, ProductInfo>> getProductList(String productName) {
		String keyword = pcu.getUrlCode(productName);
		
		String jdUrl = Constants.JDURL + keyword + Constants.JDENC;
		
		String tbUrl = Constants.TBURL + keyword;
		
		return getProductFromUrls(jdUrl, tbUrl, productName);
	}
	
	/**
	 * Crawls both sites and matches every JD item with its closest Taobao item.
	 *
	 * @param jdUrl       first JD result page
	 * @param tbUrl       first Taobao result page
	 * @param productName search keyword used for the following pages
	 * @return one map per JD item with key "JD" (the JD item) and key "TB"
	 *         (the best Taobao match, or {@code null} when no Taobao item was found)
	 */
	public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) {
		List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String,ProductInfo>>();
		List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
		List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList();
		for (ProductInfo jdProduct : jdProductList) {
			Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
			map.put("JD", jdProduct);
			// With no Taobao candidates there is nothing to match; keep the JD item anyway.
			ProductInfo tbProduct = tbProductList.isEmpty()
					? null
					: pcu.getSimilarity(jdProduct.getProductName(), tbProductList);
			map.put("TB", tbProduct);
			retListMap.add(map);
		}
		
		return retListMap;
	}
	
	/**
	 * Reads a keyword from stdin, runs the comparison, and prints each pair
	 * followed by the elapsed time.
	 */
	public static void main(String[] args) {
		System.out.println("输入商品名称:");
		Scanner scanner = new Scanner(System.in);
		String productName = scanner.next();
		scanner.close();
		System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");
		try{
			long starTime = System.currentTimeMillis();
			List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
			for(Map<String, ProductInfo> map : list){
				ProductInfo jd = map.get("JD");
				ProductInfo tb = map.get("TB");
				String jdName = jd.getProductName();
				String jdPrice = jd.getProductPrice();
				// A missing Taobao match is printed as empty brackets instead of aborting the loop.
				String tbName = tb == null ? "" : tb.getProductName();
				String tbPrice = tb == null ? "" : tb.getProductPrice();
				
				System.out.println("[" + jdName + "]  [" + tbName + "]");
				System.out.println("[" + jdPrice + "]  [" + tbPrice + "]");
				System.out.println("-----------------------------------------------------------");
			}
			long endTime = System.currentTimeMillis();
			System.out.println("用时 [" + pcu.msToss(endTime - starTime) + "]");
		}catch(Exception e){
			System.out.println("error");
			System.out.println(e.getMessage());
			// Keep the stack trace as well, matching the main methods of the crawler classes.
			e.printStackTrace();
		}
	}

}

 

运行结果:

 

输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2

。。。。。。。。。。。。

TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44

。。。。。。。。。。。。。。。。。

 

[马可9002铅笔 马克三角铅 笔易握正姿木杆 安全无毒2H HB 2B HB HB]  [马可9001铅笔 三角形杆橡皮头 学生写字铅笔 HB 2B 满28元包邮]
[12.00]  [8.96]
-----------------------------------------------------------
用时 [00:01:35]

你可能感兴趣的:(java,JSoup,phantomjs,网络爬虫,InJavaWeTrust)