亚马逊商品信息爬虫应用
1、实体
/**
 * Persistent entity describing one Amazon product; annotated for the
 * {@code amazon_product} table, one {@code @Column} per field.
 *
 * NOTE(review): the listing stops before the accessors and the closing brace of
 * this class; they have to be added before this compiles.
 * NOTE(review): prices are stored as {@code double}; confirm that rounding error
 * is acceptable for this data.
 */
@Table(name = "amazon_product")
public class AmazonProduct implements Serializable {
/** Serialization version; carries {@code @NotDBColumn}, presumably so it is not mapped to a column. */
@NotDBColumn
private static final long serialVersionUID = 2241990806968782041L;
@Id(insertable = false)
@Column(name="id")
private BigInteger id;
@Column(name="asin")
private String asin; // ASIN
@Column(name="sku")
private String sku;
@Column(name="site_id")
private BigInteger siteId; // site id
@Column(name="sell_shop_num")
private int sellShopNum; // number of sellers also offering this listing
// NOTE(review): column and field are spelled "shpping" (not "shipping"); keep in sync with the schema.
@Column(name="shpping_type")
private String shppingType; // shipping method
@Column(name="cate_id")
private BigInteger cateId; // category id
@Column(name="product_name")
private String productName; // product name
@Column(name="img_url")
private String imgUrl; // product image
@Column(name="market_price")
private double marketPrice; // market price
@Column(name="sales_price")
private double salesPrice; // sales price
get/set方法自己添加
2、公共的dao接口
// Common DAO contract shared by the entity-specific DAO interfaces.
// NOTE(review): this declaration is truncated in the listing - the type parameter
// that the methods below refer to as T, and the opening brace, are missing.
public interface BaseDao
/**
 * Saves an entity.
 *
 * @author zhaishanhui
 * @param t the entity to save
 */
void save(T t);
/**
 * Deletes an object by its id.
 *
 * @author zhaishanhui
 * @param id id of the object to delete
 */
void delete(BigInteger id);
/**
 * Updates an object.
 *
 * @author zhaishanhui
 * @param t the object to update
 */
void update(T t);
/**
 * Queries all objects.
 *
 * @author zhaishanhui
 * @param params query parameters
 * @return the query result
 */
// NOTE(review): only the return-type keyword survived here; the element type,
// method name and parameter list were lost and must be restored from the real source.
List
}
亚马逊商品接口子类
/**
 * DAO interface for Amazon products; extends the common {@code BaseDao} contract
 * and declares no additional methods in this listing.
 *
 * @author zhaishanhui
 * @version $Id: AmazonProductDao.java, v 0.1 2014-09-15 15:53:38 zhaishanhui Exp $
 */
// NOTE(review): the generic type argument of BaseDao and the opening brace are
// missing from this line in the listing.
public interface AmazonProductDao extends BaseDao
// void save(AmazonProduct amazonProduct);
}
3、接口父类的实现类
// Generic base implementation of the DAO contract; every operation goes through
// DBHelper.daoHelper.session.
// NOTE(review): the class header is truncated in the listing - the type parameter,
// the implements clause and the opening brace are missing.
public class BaseDaoImpl
/**
 * Saves the given entity through the shared session.
 *
 * Any exception is caught and only printed with printStackTrace(), so callers
 * cannot tell whether the save succeeded.
 */
@Override
public void save(T t) {
try{
DBHelper.daoHelper.session.save(t);
}catch(Exception e){
e.printStackTrace();
}
}
/**
 * Not implemented: the body is empty, so calling it does nothing.
 */
@Override
public void delete(BigInteger id) {
}
/**
 * Not implemented: the body is empty, so calling it does nothing.
 */
@Override
public void update(T t) {
}
/**
 * Runs a conditional query for the entity class returned by getCls().
 *
 * Returns null (not an empty list) when the query throws; the exception is
 * only printed with printStackTrace().
 */
// NOTE(review): the signature and the return statement below are truncated in the
// listing; the body uses two names, condtion and params, that were presumably parameters.
@SuppressWarnings("unchecked")
@Override
public List
try{
Query query= DBHelper.daoHelper.session.createQuery(getCls(), condtion);
query.addParam(params);
return (List
}catch(Exception e){
e.printStackTrace();
}
return null;
}
// Reads the generic superclass of the concrete DAO class via reflection.
// The cast to ParameterizedType fails with ClassCastException if the runtime
// class does not extend a parameterized superclass.
// NOTE(review): the method name and the return expression are truncated; the call
// site above names this method getCls().
@SuppressWarnings("unchecked")
private Class
ParameterizedType type= (ParameterizedType) this.getClass().getGenericSuperclass();
return (Class
}
}
商品接口的实现类,继承基础类
// Amazon product DAO implementation; inherits all behaviour from BaseDaoImpl and
// adds nothing of its own in this listing.
// NOTE(review): the generic type argument, any implements clause and the opening
// brace are missing from this line in the listing.
public class AmazonProductDaoImpl extends BaseDaoImpl
}
4、亚马逊商品信息service接口
/**
 * Service-layer contract for Amazon product records.
 */
public interface AmazonProductService {
/**
 * Saves one Amazon product.
 *
 * @param amazonProduct the product to save
 */
void saveAmazonProduct(AmazonProduct amazonProduct);
}
service接口的实现类
/**
 * Default AmazonProductService implementation; delegates persistence to an
 * AmazonProductDao held in a field.
 *
 * NOTE(review): in this listing the class is cut off after the if-block below;
 * the closing braces of the method and of the class are not present at this position.
 */
public class AmazonProductServiceImpl implements AmazonProductService {
// DAO used for persistence; nothing in this span assigns it.
private AmazonProductDao amazonProductDao;
/**
 * Stamps the product with the current time and saves it through the DAO.
 * A null argument is ignored silently.
 *
 * @param amazonProduct the product to save; may be null
 */
@Override
public void saveAmazonProduct(AmazonProduct amazonProduct) {
if(null!=amazonProduct){
//amazonProduct.setId(GenerateIdUtil.getUUID());
// NOTE(review): setCreatetime has no matching field in the AmazonProduct listing - confirm it exists.
amazonProduct.setCreatetime(new Date());
amazonProductDao.save(amazonProduct);
}
5、线程类的基础类
// Base class for the crawler workers: builds the HtmlUnit client, converts page
// fragments to jsoup documents and computes result page counts.
public class BaseCrawler {
// The literal attribute name "class"; not referenced by any method shown in this class.
public final String CLASS = "class";
// Page count cached by the getToalPageFor* methods: they overwrite it when called
// with pageNo == 1 and return it unchanged for any other page number.
public int totalPage=0;
/**
 * Creates a new HtmlUnit client emulating Firefox 24, configured for crawling:
 * insecure SSL allowed, redirects followed, CSS disabled, timeout value 60000,
 * and no exception thrown for failing HTTP status codes or script errors.
 * AJAX calls are handled by a NicelyResynchronizingAjaxController.
 *
 * @return a newly created, configured client; the caller is responsible for closing it
 */
public WebClient buildWebClient(){
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());

    WebClientOptions options = webClient.getOptions();
    // Network behaviour.
    options.setTimeout(60000);
    options.setUseInsecureSSL(true);
    options.setRedirectEnabled(true);
    // Rendering: style sheets are not needed for scraping.
    options.setCssEnabled(false);
    // Keep crawling when a page answers with an error status or has broken scripts.
    options.setThrowExceptionOnFailingStatusCode(false);
    options.setThrowExceptionOnScriptError(false);

    return webClient;
}
/**
 * Converts the first element of the lookup result into a jsoup Document by
 * serialising it with asXml() and re-parsing that markup.
 *
 * @param htmlPage the loaded HtmlUnit page
 * @param xpath    the XPath expression (its use is on the truncated line below)
 * @return the parsed document, or null when the lookup result is empty
 */
@SuppressWarnings("unchecked")
public Document baseParse(HtmlPage htmlPage, String xpath) {
// NOTE(review): this statement is truncated in the listing; it presumably declared
// the variable list read on the next line, from an XPath lookup on htmlPage - confirm.
List
DomElement domElement = list.size() == 0 ? null : list.get(0);
if (null == domElement) {
return null;
}
HtmlElement htmlElement = (HtmlElement) domElement;
String xmlContent = htmlElement.asXml();
Document document = Jsoup.parse(xmlContent);
return document;
}
// Multi-element counterpart of baseParse: each element of lists is serialised
// with asXml(), parsed by jsoup and appended to documents, which is returned.
// NOTE(review): the method signature and the two declarations that introduced
// lists and documents are truncated in the listing (only the keyword List is left
// on each line); the method name itself is not recoverable from here.
@SuppressWarnings("unchecked")
public List
List
List
if(null!=lists&&lists.size()>0){
for(DomElement dom:lists){
HtmlElement htmlElement = (HtmlElement) dom;
String xmlContent = htmlElement.asXml();
Document document = Jsoup.parse(xmlContent);
documents.add(document);
}
}
return documents;
}
/**
 * Derives the number of result pages from the result count shown on the first page.
 *
 * The count is read from the text of the elements matched by the CSS selector;
 * "," and "." are stripped before it is parsed as an int. The page count is the
 * count divided by len, rounded up, and is stored in the totalPage field.
 * For any page other than the first, the previously stored value is returned.
 *
 * @param pageNo   current page number; the count is only recomputed when it is 1
 * @param css      CSS selector locating the result-count element
 * @param htmlPage the loaded page; null yields 0
 * @param len      number of results per page
 * @return 0 when htmlPage is null, otherwise the current value of totalPage
 * @throws NumberFormatException if the selected text is not an integer after stripping separators
 * @throws ArithmeticException   if len is 0 and a count was parsed
 */
public int getToalPageForEbay(int pageNo,String css,HtmlPage htmlPage,int len){
    if (htmlPage == null) {
        return 0;
    }
    // Only the first page is inspected; later pages reuse the cached value.
    if (pageNo != 1) {
        return totalPage;
    }

    String rawHtml = htmlPage.getWebResponse().getContentAsString();
    Elements matches = Jsoup.parse(rawHtml).select(css);
    if (matches == null || matches.size() == 0) {
        return totalPage;
    }

    String countText = matches.text();
    if (StringUtils.isEmpty(countText)) {
        return totalPage;
    }

    // Drop thousands separators before parsing.
    String digits = countText.replace(",", "").replace(".", "");
    int total = Integer.parseInt(digits);

    // Round up: a partially filled last page still counts as a page.
    int pages = total / len;
    if (total % len != 0) {
        pages = pages + 1;
    }
    totalPage = pages;
    return totalPage;
}
/**
 * Computes the total number of pages of an Amazon product search.
 *
 * Reads the text of the element with the given id on the first page, parses a
 * result count from it and stores the page count (count / len, rounded up) in
 * the totalPage field. For any page other than the first, the previously stored
 * value is returned unchanged.
 *
 * @author zhaishanhui
 * @param pageNo   current page number; the count is only recomputed when it is 1
 * @param id       id of the element holding the result-count text
 * @param htmlPage the loaded page; null yields 0
 * @param len      number of results per page
 * @return 0 when htmlPage is null, otherwise the current value of totalPage
 */
public int getToalPageForAmazon(int pageNo,String id,HtmlPage htmlPage,int len){
if(htmlPage==null){
return 0;
}
if(pageNo==1){
String html=htmlPage.getWebResponse().getContentAsString();
Document document=Jsoup.parse(html);
Element resultDiv= document.getElementById(id);
if(null!=resultDiv){
String result=resultDiv.text();
if(!StringUtils.isEmpty(result)){
// Locate the total record count between "of" and "results".
// NOTE(review): when "of" is absent indexOf returns -1, so begin is 1 and the
// begin>0 test below still passes.
int begin=result.indexOf("of")+2;
int end=result.indexOf("results");
// Validate the substring positions before cutting.
// NOTE(review): the rest of this condition is truncated in the listing; the lost
// part presumably compared begin with end and extracted the substring - as shown,
// the full text would reach Integer.parseInt.
if(begin>0&&end>0&&begin
result=result.replaceAll(",", "").trim();
int total=Integer.parseInt(result);
// Compute the total page count, rounding up.
totalPage=(total%len)==0?total/len:(total/len)+1;
}
}
}
}
return totalPage;
}
}
/**
 * Setter for the amazonProductDao field.
 *
 * NOTE(review): the enclosing class at this position declares no amazonProductDao
 * field; the setter appears to belong to AmazonProductServiceImpl and to have been
 * displaced when the listing was pasted.
 *
 * @param amazonProductDao the DAO to use
 */
public void setAmazonProductDao(AmazonProductDao amazonProductDao) {
this.amazonProductDao = amazonProductDao;
}
}
线程类
/**
 * Runnable worker that crawls Amazon product pages for entries of a shared URL
 * list, using the begin and end values it was constructed with.
 */
public class AmazonPrductThread extends BaseCrawler implements Runnable{
// Site root; product URLs are built as baseUrl + "/dp/" + ASIN.
private final String baseUrl="http://www.amazon.com";
private Logger logger = Logger.getLogger(AmazonPrductThread.class);
// Client created by getHtmlPage for each fetched URL.
private WebClient client;
// Hides the field of the same name declared in BaseCrawler.
private final String CLASS="class";
// Condition string passed to categoryService.findByCondtion together with three parameters.
private String condtion=" site_id=? and third_cate_id=? and platform_id=?";
// Pattern for digits with an optional two-digit fraction; not used in the code shown.
private String regex2Double="\\d+(\\.\\d{2})?";
private AmazonDetailService amazonDetailService;
private AmazonLostbuyProductService amazonLostbuyProductService;
private AmazonProductService amazonProductService;
private AmazonNewProductService amazonNewProductService;
private CategoryService categoryService; // category service
// NOTE(review): truncated declaration; presumably the amazonUrlNews list field
// assigned in the constructor - element type and name were lost.
private List
// Start index of this worker's slice of the list.
private int begin;
// End value of this worker's slice of the list.
private int end;
// Stores the collaborating services, the URL list and the begin/end values in
// the fields of the same name.
// NOTE(review): the parameter list is truncated in the listing after the List
// parameter; the body shows that amazonUrlNews, begin and end were also parameters.
public AmazonPrductThread(AmazonDetailService amazonDetailService,AmazonLostbuyProductService amazonLostbuyProductService,
AmazonProductService amazonProductService,AmazonNewProductService amazonNewProductService,
CategoryService categoryService,List
this.amazonDetailService=amazonDetailService;
this.amazonLostbuyProductService=amazonLostbuyProductService;
this.amazonProductService=amazonProductService;
this.amazonNewProductService=amazonNewProductService;
this.categoryService=categoryService;
this.amazonUrlNews=amazonUrlNews;
this.begin=begin;
this.end=end;
}
/**
 * Iterates over entries of amazonUrlNews starting at index begin; for entries
 * passing the "US" check it builds the product URL from the entry's ASIN and
 * hands it, together with the entry's SKU, to getHtml. Does nothing when the
 * list is null or empty.
 *
 * @see java.lang.Runnable#run()
 */
@Override
public void run() {
if(null!=amazonUrlNews&&amazonUrlNews.size()>0){
// NOTE(review): the loop header is truncated in the listing; the upper bound
// (presumably the end field) and the statement declaring stieName were lost.
for(int i=begin;i
if(stieName.indexOf("US")>=0){
String url=baseUrl+"/dp/"+amazonUrlNews.get(i).getAsin();
getHtml(url,amazonUrlNews.get(i).getSku());
}
}
}
}
/**
 * Builds a fresh WebClient, stores it in the client field and loads the given
 * URL with it.
 *
 * NOTE(review): a client already held by the field is overwritten here without
 * being closed.
 *
 * @param url address of the page to load
 * @return the loaded page
 * @throws Exception if the page cannot be fetched
 */
public HtmlPage getHtmlPage(String url)throws Exception{
    this.client = buildWebClient();
    return this.client.getPage(url);
}
/**
 * Fetches one product page, fills an AmazonProduct from it and saves it through
 * amazonProductService.
 *
 * Returns nothing; every exception is caught, printed and logged with its
 * message only.
 * NOTE(review): client.closeAllWindows() is only reached on the success path, so
 * the client stays open when any earlier step throws.
 *
 * @author zhaishanhui
 * @param url product page URL; the text after the last "dp/" is stored as the ASIN
 * @param sku SKU stored on the saved product
 */
public void getHtml(String url,String sku){
try{
HtmlPage htmlPage=getHtmlPage(url);
String topHref=parseTop(htmlPage);
AmazonProduct amazonProduct= new AmazonProduct();
if(StringUtils.isNoneBlank(topHref)){
// The category id is whatever follows the last '=' in the link.
String cateId=topHref.substring(topHref.lastIndexOf("=")+1);
amazonProduct.setCateId(new BigInteger(cateId));
Category category= categoryService.findByCondtion(condtion, new Object[]{1,cateId,1});
if(null!=category){
// Set the site id.
// NOTE(review): the value stored is the id of the Category found - confirm this is intended.
amazonProduct.setSiteId(BigInteger.valueOf(category.getId()));
}
}
BigInteger id=GenerateIdUtil.getUUID();
amazonProduct.setId(id);
amazonProduct.setSku(sku);
amazonProduct.setAsin(url.substring(url.lastIndexOf("dp/")+3));
amazonProduct= parseProductUrl(htmlPage,amazonProduct);
amazonProduct=parseProductPrice(htmlPage,amazonProduct);
amazonProduct= parseShopNum(htmlPage,amazonProduct);
// NOTE(review): truncated statement; a List declaration was lost here.
List
// Sales price. NOTE(review): the original comment said "market price"; the labels
// on these two locals were swapped relative to the entity's field comments.
// Neither local is used afterwards in the code shown.
double salesPrice=amazonProduct.getSalesPrice();
// Market price (the original comment said "actual sales price").
double marketPrice=amazonProduct.getMarketPrice();
amazonProductService.saveAmazonProduct(amazonProduct);
client.closeAllWindows();
}catch(Exception e){
e.printStackTrace();
logger.error(e.getMessage());
}
}
6、job类的实现
基础job类
// Base class for crawl jobs: splits a list into chunks of ConstantTools.MAX_SIZE
// entries and calls the createThread hook once per chunk.
// NOTE(review): the class header is truncated in the listing; the type parameter
// and the opening brace are missing.
public abstract class BaseJob
private Logger logger = Logger.getLogger(BaseJob.class);
/**
 * Works out how many workers are needed for the list and calls createThread
 * once per chunk with the chunk's start and end values. Does nothing for a null
 * or empty list.
 *
 * @author zhaishanhui
 * @param t the list to distribute
 */
// NOTE(review): the parameter list is truncated in the listing; the body uses a
// single list parameter named t.
public void countThread(List
if(null!=t&&t.size()>0){
int len=t.size();
// Number of workers to create: len / MAX_SIZE, rounded up.
int count=len%ConstantTools.MAX_SIZE==0?len/ConstantTools.MAX_SIZE:(len/ConstantTools.MAX_SIZE)+1;
// NOTE(review): the loop header is truncated; it presumably runs i from 0 up to count.
for(int i=0;i
int start=i*ConstantTools.MAX_SIZE;
// End value of the range handled by this worker: len for the last chunk,
// otherwise start + MAX_SIZE.
int end=i==count-1?i*ConstantTools.MAX_SIZE+(len-i*ConstantTools.MAX_SIZE):i*ConstantTools.MAX_SIZE+ConstantTools.MAX_SIZE;
createThread(t,start,end);
logger.info("from:"+start+" to:"+end);
}
}
}
// Template-method hook: subclasses are required to implement it.
// NOTE(review): the declaration is truncated in the listing; the call site above
// passes (list, start, end). No body is visible for this method.
protected void createThread(List
}
/**
 * Scheduled job that crawls Amazon products.
 *
 * execute() passes the URL list to the inherited countThread, which calls back
 * into createThread for each chunk; each call starts one AmazonPrductThread.
 *
 * @author zhaishanhui
 * @version Id: AmazonPrductJob.java, v 0.1 2014-09-25 15:14:56 zhaishanhui
 */
// NOTE(review): the generic type argument of BaseJob and the opening brace are
// missing from this line in the listing.
public class AmazonPrductJob extends BaseJob
private Logger logger = Logger.getLogger(AmazonPrductJob.class);
private AmazonDetailService amazonDetailService; // product detail service
private AmazonLostbuyProductService amazonLostbuyProductService; // lost-buy product service
private AmazonProductService amazonProductService; // Amazon product service
private AmazonNewProductService amazonNewProductService; // follow-sell product service
private CategoryService categoryService; // category service
private AmazonUrlNewService amazonUrlNewService; // URL service
// Job entry point: logs a start message and distributes the URL list to workers.
public void execute(){
/*String key="amazonProductPath";
String path=ParseXml.getPath(key);
List
if(urls!=null&&urls.size()>0){
for(String url:urls){
getHtml(url);
}
}*/
//B00I4XMEYA
logger.info("开始插入数据");
//long begin=System.currentTimeMillis();
// Query all URLs.
// NOTE(review): truncated statement; it presumably declared amazonUrlNews from a
// call on amazonUrlNewService - the right-hand side was lost.
List
this.countThread(amazonUrlNews);
/* if(null!=amazonUrlNews&&amazonUrlNews.size()>0){
int len=amazonUrlNews.size();
int count=len%MAX_SIZE==0?len/MAX_SIZE:(len/MAX_SIZE)+1;
for(int i=0;i
int end=i==count-1?i*MAX_SIZE+(len-i*MAX_SIZE):i*MAX_SIZE+MAX_SIZE;
creatAmazonProductThread(amazonUrlNews,start,end);
}
}*/
}
/* public void creatAmazonProductThread(List
AmazonPrductThread amazonPrductThread= new AmazonPrductThread(amazonDetailService,amazonLostbuyProductService,amazonProductService,
amazonNewProductService,categoryService,amazonUrlNews,begin,end);
new Thread(amazonPrductThread).start();
}*/
// Creates and starts the worker thread for one chunk. This is the hook the parent
// leaves to subclasses, because each subclass builds a different worker.
// NOTE(review): the parameter list is truncated in the listing; the body uses
// amazonUrlNews, begin and end. Each call starts a new unmanaged Thread.
@Override
protected void createThread(List
AmazonPrductThread amazonPrductThread= new AmazonPrductThread(amazonDetailService,amazonLostbuyProductService,amazonProductService,
amazonNewProductService,categoryService,amazonUrlNews,begin,end);
new Thread(amazonPrductThread).start();
}
// Setters for the collaborating services.
public void setAmazonDetailService(AmazonDetailService amazonDetailService) {
this.amazonDetailService = amazonDetailService;
}
public void setAmazonLostbuyProductService(AmazonLostbuyProductService amazonLostbuyProductService) {
this.amazonLostbuyProductService = amazonLostbuyProductService;
}
public void setAmazonProductService(AmazonProductService amazonProductService) {
this.amazonProductService = amazonProductService;
}
public void setAmazonNewProductService(AmazonNewProductService amazonNewProductService) {
this.amazonNewProductService = amazonNewProductService;
}
public void setCategoryService(CategoryService categoryService) {
this.categoryService = categoryService;
}
public void setAmazonUrlNewService(AmazonUrlNewService amazonUrlNewService) {
this.amazonUrlNewService = amazonUrlNewService;
}
}
代码不是很完整,这里只是给大家提供一个思路。所用的技术:HtmlUnit + Quartz + jsoup + 58同城的框架。