java jsoup 网络爬虫 学习例子(六)京东和当当商品比价

 

java jsoup 网络爬虫 学习例子(六)京东和当当商品比价

 

 

 

 

package com.iteye.injavawetrust.jdvsdd;

import java.util.List;

/**
 * Contract for a crawler that produces a list of products.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

	/**
	 * Crawls the product listing.
	 *
	 * @return the products collected by the crawl
	 */
	List<ProductInfo> getProductList();

}

package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls JD search-result pages for a keyword and collects the name and
 * price of every listed product.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList{
	
	/** Number of result pages requested by one crawl. */
	private static final int PAGE_COUNT = 10;
	
	/** Timeout, in milliseconds, handed to jsoup for each page request. */
	private static final int TIMEOUT_MS = 5000;
	
	/** URL of the first result page, as supplied by the caller. */
	private String jdUrl;
	
	/** Search keyword; used to build the URLs of page 2 and later. */
	private String productName;
	
	private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();
	
	/**
	 * @param jdUrl       URL of the first result page
	 * @param productName search keyword used to build the following page URLs
	 */
	public JDProductList(String jdUrl, String productName){
		this.jdUrl = jdUrl;
		this.productName = productName;
	}

	/**
	 * Fetches {@value #PAGE_COUNT} result pages and extracts one
	 * {@link ProductInfo} (name and price) per product entry.
	 * <p>
	 * A page that fails to load or parse is reported on standard output and
	 * skipped; the remaining pages are still processed.
	 *
	 * @return the products found, in page order; never {@code null}
	 */
	@Override
	public List<ProductInfo> getProductList() {
		List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
		String url = "";
		for (int page = 1; page <= PAGE_COUNT; page++) {
			try {
				System.out.println("JD Product 第[" + page + "]页");
				if (page == 1) {
					url = jdUrl;
				} else {
					url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + page;
				}
				System.out.println(url);
				Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
				for (Element ul : document.select("ul[class=gl-warp clearfix]")) {
					for (Element li : ul.select("li[data-sku]")) {
						Element div = li.select("div[class=gl-i-wrap]").first();
						if (div == null) {
							// first() yields null when nothing matched. Skip this entry only;
							// dereferencing it would throw and discard the rest of the page.
							continue;
						}
						// Product name is carried in the link's title attribute.
						String name = div.select("div[class=p-name p-name-type-2]>a").attr("title");
						// Product price is carried in the data-price attribute.
						String price = div.select(".p-price>strong").attr("data-price");
						ProductInfo productInfo = new ProductInfo();
						productInfo.setProductName(name);
						productInfo.setProductPrice(price);
						jdProductList.add(productInfo);
					}
				}
			} catch(Exception e) {
				// Best effort: report the failing page and carry on with the next one.
				System.out.println("Get JD product has error [" + url + "]");
				System.out.println(e.getMessage());
			}
		}
		return jdProductList;
	}
	
	/**
	 * Manual smoke test: crawls JD for a fixed keyword and prints the result
	 * count followed by every product's name and price.
	 */
	public static void main(String[] args) {
		try {
			String productName = "书包";
			String jdUrl = Constants.JDURL + pcu.getGbk(productName)  + Constants.JDENC;
			List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
			System.out.println(list.size());
			for(ProductInfo pi : list){
				System.out.println(pi.getProductName() + "  " + pi.getProductPrice());
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}


package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls Dangdang search-result pages for a keyword and collects the name
 * and price of every listed product.
 *
 * @author InJavaWeTrust
 */
public class DDProductList implements ProductList{
	
	/** URL of the first result page, as supplied by the caller. */
	private String ddUrl;
	
	/** Search keyword; used to build the URLs of page 2 and later. */
	private String productName;
	
	/**
	 * @param ddUrl       URL of the first result page
	 * @param productName search keyword used to build the following page URLs
	 */
	public DDProductList(String ddUrl, String productName) {
		this.ddUrl = ddUrl;
		this.productName = productName;
	}

	/**
	 * Requests ten result pages and turns every {@code li} of the product
	 * list into a {@link ProductInfo} carrying name and price.
	 * <p>
	 * A page that fails is reported on standard output and skipped.
	 *
	 * @return the products found, in page order
	 */
	@Override
	public List<ProductInfo> getProductList() {
		List<ProductInfo> found = new ArrayList<ProductInfo>();
		String url = "";
		for (int page = 1; page <= 10; page++) {
			try {
				System.out.println("DD Product 第[" + page + "]页");
				// The first page uses the caller's URL; later pages are rebuilt from the keyword.
				url = (page == 1)
						? ddUrl
						: (Constants.DDURL + productName + Constants.ACT + Constants.DDPAGE + page);
				System.out.println(url);
				Document doc = Jsoup.connect(url).timeout(5000).get();
				for (Element shelf : doc.select("ul[class=bigimg cloth_shoplist]")) {
					for (Element item : shelf.select("li")) {
						ProductInfo info = new ProductInfo();
						info.setProductName(item.select("p[class=name]>a").attr("title"));
						// Strip the yen entity that precedes the amount in the markup.
						info.setProductPrice(item.select("p[class=price]>span").html().replace("&yen;", ""));
						found.add(info);
					}
				}
			} catch (Exception e) {
				System.out.println("Get DD product has error [" + url + "]");
				System.out.println(e.getMessage());
			}
		}
		return found;
	}
	
	/**
	 * Manual smoke test: crawls Dangdang for a fixed keyword and prints the
	 * result count followed by every product's name and price.
	 */
	public static void main(String[] args) {
		String keyword = "学生铅笔";
		DDProductList crawler = new DDProductList(Constants.DDURL + keyword, keyword);
		List<ProductInfo> products = crawler.getProductList();
		System.out.println(products.size());
		for (ProductInfo product : products) {
			System.out.println(product.getProductName() + "  " + product.getProductPrice());
		}
	}

}


package com.iteye.injavawetrust.jdvsdd;

import java.io.Serializable;
import java.util.Date;

/**
 * Serializable value holder describing one crawled product.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable{

	private static final long serialVersionUID = 8179244535272774089L;

	/** Product ID. */
	private String productid;

	/** Product name. */
	private String productName;

	/** Product price, kept as text. */
	private String productPrice;

	/** Number of sales per month. */
	private String tradeNum;

	/** Product URL. */
	private String productUrl;

	/** Name of the online shop selling the product. */
	private String shopName;

	/** Name of the e-commerce platform. */
	private String ecName;

	/** Date the record was crawled and stored. */
	private Date date;

	/** @return the product ID */
	public String getProductid() {
		return this.productid;
	}

	/** @param productid the product ID */
	public void setProductid(String productid) {
		this.productid = productid;
	}

	/** @return the product name */
	public String getProductName() {
		return this.productName;
	}

	/** @param productName the product name */
	public void setProductName(String productName) {
		this.productName = productName;
	}

	/** @return the product price */
	public String getProductPrice() {
		return this.productPrice;
	}

	/** @param productPrice the product price */
	public void setProductPrice(String productPrice) {
		this.productPrice = productPrice;
	}

	/** @return the monthly sales count */
	public String getTradeNum() {
		return this.tradeNum;
	}

	/** @param tradeNum the monthly sales count */
	public void setTradeNum(String tradeNum) {
		this.tradeNum = tradeNum;
	}

	/** @return the product URL */
	public String getProductUrl() {
		return this.productUrl;
	}

	/** @param productUrl the product URL */
	public void setProductUrl(String productUrl) {
		this.productUrl = productUrl;
	}

	/** @return the shop name */
	public String getShopName() {
		return this.shopName;
	}

	/** @param shopName the shop name */
	public void setShopName(String shopName) {
		this.shopName = shopName;
	}

	/** @return the e-commerce platform name */
	public String getEcName() {
		return this.ecName;
	}

	/** @param ecName the e-commerce platform name */
	public void setEcName(String ecName) {
		this.ecName = ecName;
	}

	/** @return the crawl date (the stored instance, not a copy) */
	public Date getDate() {
		return this.date;
	}

	/** @param date the crawl date (stored as given, not copied) */
	public void setDate(Date date) {
		this.date = date;
	}

}


package com.iteye.injavawetrust.jdvsdd;

/**
 * URL fragments used to build JD and Dangdang search requests.
 * <p>
 * All fields are {@code final}: they are shared by every crawler and must
 * not be reassigned at runtime.
 *
 * @author InJavaWeTrust
 */
public class Constants {
	
	/**
	 * JD search URL up to and including the {@code keyword=} parameter name.
	 */
	public static final String JDURL = "http://search.jd.com/Search?keyword=";
	/**
	 * JD query parameter declaring the keyword's character encoding.
	 */
	public static final String JDENC = "&enc=utf-8";
	/**
	 * JD paging parameter; the page number is appended to it.
	 */
	public static final String JDPAGE ="&page=";
	/**
	 * Dangdang search URL up to and including the {@code key=} parameter name.
	 */
	public static final String DDURL = "http://search.dangdang.com/?key=";
	/**
	 * Dangdang {@code act} query parameter.
	 */
	public static final String ACT = "&act=input";
	/**
	 * Dangdang paging parameter; the page number is appended to it.
	 */
	public static final String DDPAGE = "&page_index=";

}


package com.iteye.injavawetrust.jdvsdd;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;

/**
 * Singleton helper for the price comparison: keyword encoding, product-name
 * similarity based on Levenshtein distance, and elapsed-time formatting.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckUtil {
	
	private PriceCheckUtil() {
		
	}
	
	private static final PriceCheckUtil instance = new PriceCheckUtil();
	
	/**
	 * @return the shared instance
	 */
	public static PriceCheckUtil getInstance() {
		return instance;
	}
	
	/**
	 * Encodes a product keyword so it can be placed in a URL query string.
	 * <p>
	 * The keyword is percent-encoded as UTF-8. The previous implementation
	 * reinterpreted the keyword's UTF-8 bytes as GBK text, which produces
	 * unreadable characters, can lose bytes that do not form a valid GBK
	 * sequence, and only reaches the server intact when the HTTP layer happens
	 * to write the URL in GBK.
	 *
	 * @param productName product keyword; must not be {@code null}
	 * @return the keyword, percent-encoded as UTF-8
	 */
	public String getGbk(String productName){
		try {
			return URLEncoder.encode(productName, "UTF-8");
		} catch (UnsupportedEncodingException e) {
			// Cannot happen: every Java platform is required to support UTF-8.
			throw new IllegalStateException("UTF-8 is not supported", e);
		}
	}
	
	/**
	 * Finds the entry of {@code list} whose product name is most similar to
	 * {@code productName}, as measured by {@link #sim(String, String)}.
	 * <p>
	 * Every entry is scored, including the last one. When several entries
	 * share the highest score, the one latest in the list is returned. A
	 * {@code null} name on either side is compared as the empty string.
	 *
	 * @param productName name to match against
	 * @param list        candidates
	 * @return the most similar candidate, or {@code null} when {@code list}
	 *         is {@code null} or empty
	 */
	public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
		if (list == null || list.isEmpty()) {
			return null;
		}
		String target = productName == null ? "" : productName;
		ProductInfo best = null;
		// Scores are never negative, so the first candidate always replaces this.
		double bestScore = -1.0;
		for (ProductInfo candidate : list) {
			String candidateName = candidate.getProductName();
			double score = sim(target, candidateName == null ? "" : candidateName);
			// ">=" keeps the later entry on a tie.
			if (score >= bestScore) {
				bestScore = score;
				best = candidate;
			}
		}
		return best;
	}
	
	/**
	 * Returns the smallest of three numbers.
	 *
	 * @param one   first value
	 * @param two   second value
	 * @param three third value
	 * @return the minimum of the three
	 */
	public int min(int one, int two, int three) {
		return Math.min(one, Math.min(two, three));
	}

	/**
	 * Computes the Levenshtein distance (LD) between two strings: the number
	 * of single-character insertions, deletions and substitutions needed to
	 * turn one into the other.
	 *
	 * @param str1 first string; must not be {@code null}
	 * @param str2 second string; must not be {@code null}
	 * @return the edit distance
	 */
	public int ld(String str1, String str2) {
		int n = str1.length();
		int m = str2.length();
		if (n == 0) {
			return m;
		}
		if (m == 0) {
			return n;
		}
		// d[i][j] = distance between the first i chars of str1 and the first j chars of str2
		int[][] d = new int[n + 1][m + 1];
		for (int i = 0; i <= n; i++) {
			d[i][0] = i;    // first column: delete i characters
		}
		for (int j = 0; j <= m; j++) {
			d[0][j] = j;    // first row: insert j characters
		}
		for (int i = 1; i <= n; i++) {
			char ch1 = str1.charAt(i - 1);
			for (int j = 1; j <= m; j++) {
				char ch2 = str2.charAt(j - 1);
				// substitution costs nothing when the characters already match
				int cost = (ch1 == ch2) ? 0 : 1;
				// cell above + 1, cell to the left + 1, diagonal + cost: keep the cheapest
				d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
			}
		}
		return d[n][m];
	}

	/**
	 * Computes the similarity of two strings as
	 * {@code 1 - ld / max(length)}.
	 *
	 * @param str1 first string; must not be {@code null}
	 * @param str2 second string; must not be {@code null}
	 * @return a value from 0 (nothing in common) to 1 (identical); two empty
	 *         strings are identical and yield 1
	 */
	public double sim(String str1, String str2) {
		int longest = Math.max(str1.length(), str2.length());
		if (longest == 0) {
			// Both strings are empty; avoid 0/0, which would yield NaN.
			return 1.0;
		}
		return 1 - (double) ld(str1, str2) / longest;
	}
	
	/**
	 * Formats a duration in milliseconds as {@code HH:mm:ss}.
	 * <p>
	 * The value is rendered as a GMT time of day, so the hour field wraps
	 * around at 24 hours.
	 *
	 * @param ms duration in milliseconds
	 * @return the formatted duration
	 */
	public String msToss(long ms) {
		SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
		formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
		return formatter.format(ms);
	}

}


package com.iteye.injavawetrust.jdvsdd;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Entry point of the JD versus Dangdang price comparison: crawls both sites
 * for a keyword and pairs every JD product with the most similarly named
 * Dangdang product.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {
	
	/** Placeholder printed when a JD product has no Dangdang counterpart. */
	private static final String NO_MATCH = "N/A";
	
	private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();
	
	/**
	 * Builds the first-page search URLs for both sites and runs the
	 * comparison.
	 *
	 * @param productName search keyword
	 * @return one map per JD product, see
	 *         {@link #getProductFromUrls(String, String, String)}
	 */
	public List<Map<String, ProductInfo>> getProductList(String productName) {
		
		String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
		
		String ddUrl = Constants.DDURL + productName;
		
		return getProductFromUrls(jdUrl, ddUrl, productName);
	}
	
	/**
	 * Crawls both sites and pairs each JD product with the Dangdang product
	 * whose name is most similar.
	 *
	 * @param jdUrl       URL of the first JD result page
	 * @param ddUrl       URL of the first Dangdang result page
	 * @param productName search keyword
	 * @return one map per JD product with the keys {@code "JD"} and
	 *         {@code "DD"}; the {@code "DD"} value is {@code null} when no
	 *         Dangdang product is available to compare with
	 */
	public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String ddUrl, String productName) {
		List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String,ProductInfo>>();
		List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
		List<ProductInfo> ddProductList = new DDProductList(ddUrl, productName).getProductList();
		for (ProductInfo jdProduct : jdProductList) {
			Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
			map.put("JD", jdProduct);
			// With nothing crawled from Dangdang there is no candidate to pick;
			// keep the JD entries instead of failing the whole comparison.
			ProductInfo ddProduct = ddProductList.isEmpty()
					? null
					: pcu.getSimilarity(jdProduct.getProductName(), ddProductList);
			map.put("DD", ddProduct);
			retListMap.add(map);
		}
		
		return retListMap;
	}
	
	
	/**
	 * Reads a keyword from standard input, runs the comparison and prints
	 * each JD/Dangdang pair (names, then prices) followed by the elapsed
	 * time.
	 */
	public static void main(String[] args) {
		System.out.println("输入商品名称:");
		Scanner scanner = new Scanner(System.in);
		String productName = scanner.next();
		scanner.close();
		System.out.println("京东和当当[" + productName + "]商品比价开始。。。。。。");
		try{
			long starTime = System.currentTimeMillis();
			List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
			for(Map<String, ProductInfo> map : list) {
				
				ProductInfo jd = map.get("JD");
				ProductInfo dd = map.get("DD");
				
				String jdName = jd.getProductName();
				String jdPrice = jd.getProductPrice();
				// A JD product may have no Dangdang counterpart.
				String ddName = (dd == null) ? NO_MATCH : dd.getProductName();
				String ddPrice = (dd == null) ? NO_MATCH : dd.getProductPrice();
				
				System.out.println("[" + jdName + "]  [" + ddName + "]");
				System.out.println("[" + jdPrice + "]  [" + ddPrice + "]");
				System.out.println("-----------------------------------------------------------");
			}
			long endTime = System.currentTimeMillis();
			System.out.println("用时 [" + pcu.msToss(endTime - starTime) + "]");
		}catch(Exception e){
			System.out.println("error");
			System.out.println(e.getMessage());
		}
	}

}

 

运行结果:

 

 输入商品名称:
铅笔
京东和当当[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2

 

..............................

 

DD Product 第[1]页
http://search.dangdang.com/?key=铅笔
DD Product 第[2]页
http://search.dangdang.com/?key=铅笔&act=input&page_index=2
DD Product 第[3]页

 

...................................

 

[得力(deli) S908 木世界系列六角笔杆原木HB铅笔/素描绘图学生铅笔 50支/桶]  [ 【开学必备文具】正品 得力文具S907/s908原木HB/2B铅笔绘图素描儿童学生铅笔50支装 ]
[18.00]  [16.80]
-----------------------------------------------------------
[得力(deli) 7084 安全考试专用填涂答题卡2B木质铅笔/学生铅笔 12支/盒]  [ 得力文具(deli) 0641 削笔器 削笔刀 卷笔刀 削笔机学习用品手摇转笔刀笔刨文具 ]
[10.00]  [13.50]
-----------------------------------------------------------
[得力(deli)7083 安全石墨铅芯素描 绘图HB铅笔/学生铅笔 12支/盒]  [ 六一儿童节礼物!三菱 绘图HB/2B铅笔12支装 ]
[9.00]  [35.00]
-----------------------------------------------------------
[辉柏嘉(Faber-castell)114468 水溶性彩色铅笔 水溶彩铅 48色套装(赠毛笔+笔刨)]  [ 德国Faber-castell辉柏嘉三角杆学生铅笔 儿童铅笔 HB 2H 2B 12支 ]
[109.00]  [12.00]

 

................................................

 

[中华6725桶装彩色铅笔36色24色18色12色原木三角杆彩色铅笔 美术绘画涂鸦涂色彩铅 12色]  [ 晨光彩色铅笔36色 24色绘图涂鸦 桶装绘画彩铅彩笔 儿童绘画 ]
[8.80]  [12.00]
-----------------------------------------------------------
用时 [00:00:10]

 

 

 

 

 

你可能感兴趣的:(java,JSoup,网络爬虫)