Java 多线程爬取亚马逊商品信息

亚马逊商品信息爬虫应用

1、实体

// Persistent entity for one crawled Amazon product; the annotations bind it to the
// "amazon_product" table and each field to the column named in its @Column.
// NOTE(review): other listings in this article call setCreatetime(...) and getUrls() on this
// class; the matching members are not part of this excerpt.
// NOTE(review): prices are held as double; consider BigDecimal if exact amounts matter.
@Table(name = "amazon_product")
public class AmazonProduct implements Serializable {

    /** Serialization version marker; excluded from column mapping via @NotDBColumn. */
    @NotDBColumn
    private static final long serialVersionUID = 2241990806968782041L;

    // Primary key (marked insertable = false).
    @Id(insertable = false)
    @Column(name="id")
    private BigInteger id;

    @Column(name="asin")
    private String  asin;                                   // ASIN

    // Seller-side SKU of the product.
    @Column(name="sku")
    private String sku;
   
    @Column(name="site_id")
    private BigInteger  siteId;                                 // site id

    @Column(name="sell_shop_num")
    private int sellShopNum;                                // number of sellers offering the same listing

    // NOTE(review): "shpping" looks like a misspelling of "shipping"; it is part of the
    // column name and the field name, so it is left untouched here.
    @Column(name="shpping_type")
    private String shppingType;                                // shipping method
  
    @Column(name="cate_id")
    private BigInteger cateId;                                 // category id

    @Column(name="product_name")
    private String  productName;                            // product name

    @Column(name="img_url")
    private String  imgUrl;                                 // product image URL

    @Column(name="market_price")
    private double marketPrice;                            // market (list) price

    @Column(name="sales_price")
    private double salesPrice;                             // sales price

 

get/set 方法请自行添加(上面的实体类省略了这些方法以及类结尾的大括号)

2、公共的dao接口

/**
 * Generic data-access contract shared by every DAO.
 *
 * <p>The type parameter had been lost from the listing, leaving {@code T} undeclared; it is
 * restored here so that the interface compiles and {@link #find} is type-safe.
 *
 * @param <T> entity type handled by the DAO
 * @author zhaishanhui
 */
public interface BaseDao<T> {

    /**
     * Saves one entity.
     *
     * @param t entity to persist
     */
    void save(T t);

    /**
     * Deletes the entity with the given id.
     *
     * @param id primary key of the entity to remove
     */
    void delete(BigInteger id);

    /**
     * Updates one entity.
     *
     * @param t entity carrying the new state
     */
    void update(T t);

    /**
     * Queries entities that match a condition.
     *
     * @param condtion condition text handed to the underlying query
     * @param params   values bound to the condition's placeholders
     * @return the matching entities
     */
    List<T> find(String condtion, Object[] params);
}

亚马逊商品接口子类

/**
 * DAO contract for Amazon products; declares nothing beyond what it inherits from BaseDao.
 *
 * NOTE(review): BaseDao is extended without a type argument here. The commented-out
 * declaration below suggests the entity type is AmazonProduct - confirm whether this was
 * originally {@code BaseDao<AmazonProduct>}.
 *
 * @author zhaishanhui
 * @version $Id: AmazonProductDao.java, v 0.1 2014-09-15 15:53:38 zhaishanhui Exp $
 */
public interface AmazonProductDao extends BaseDao{
 
   // void save(AmazonProduct amazonProduct);
}

3、接口父类的实现类

public class BaseDaoImpl implements BaseDao {

    /**
     * 
     */
    @Override
    public void save(T t) {
        try{
            DBHelper.daoHelper.session.save(t);
         }catch(Exception e){
             e.printStackTrace();
         }
    }

    /**
     * 
     */
    @Override
    public void delete(BigInteger id) {
    }

    /**
     * 
     */
    @Override
    public void update(T t) {
    }

    /**
     * 
     */
    @SuppressWarnings("unchecked")
    @Override
    public List find(String condtion,Object[] params) {
       try{
           Query query= DBHelper.daoHelper.session.createQuery(getCls(), condtion);
           query.addParam(params);
            return (List) query.list(); 
       }catch(Exception e){
           e.printStackTrace();
       }
       return null;
    }
   
    @SuppressWarnings("unchecked")
    private Class getCls(){
        ParameterizedType type= (ParameterizedType) this.getClass().getGenericSuperclass();
       return (Class) type.getActualTypeArguments()[0];
    }
 
}

商品接口的实现类,继承基础类

// Concrete DAO for Amazon products; inherits every operation from BaseDaoImpl.
// NOTE(review): BaseDaoImpl.getCls() casts getGenericSuperclass() to ParameterizedType, which
// fails when the superclass is named without a type argument as written here. Presumably this
// was "extends BaseDaoImpl<AmazonProduct>" before the type argument was lost - confirm.
public class AmazonProductDaoImpl extends BaseDaoImpl implements AmazonProductDao {

}

4、亚马逊商品信息 service 接口

/**
 * Service contract for storing crawled Amazon products.
 */
public interface AmazonProductService {
 
    /**
     * Stores one crawled product.
     *
     * @param amazonProduct product to store
     */
    void saveAmazonProduct(AmazonProduct amazonProduct);
}

service 接口的实现类

// Service implementation that stamps a product with the current time and hands it to the DAO.
// NOTE(review): this listing stops before the method and class are closed. The closing braces
// and the setAmazonProductDao setter appear later in the article, after the BaseCrawler
// listing.
public class AmazonProductServiceImpl implements AmazonProductService {

    // DAO used to persist products.
    private AmazonProductDao amazonProductDao;
    /**
     * Sets the creation time on the product and saves it through the DAO.
     * A null product is ignored.
     *
     * @param amazonProduct product to store; may be null
     */
    @Override
    public void saveAmazonProduct(AmazonProduct amazonProduct) {
       
        if(null!=amazonProduct){
            // Id assignment is disabled here; the crawler thread sets the id before calling.
            //amazonProduct.setId(GenerateIdUtil.getUUID());
            amazonProduct.setCreatetime(new Date());
            amazonProductDao.save(amazonProduct);
        }

 

5、线程类的基础类

// Base class for crawler threads: builds the web client and offers page-parsing helpers.
public class BaseCrawler {

    // Literal "class"; not referenced by the methods visible in this listing.
    public final String CLASS = "class";
   
    // Page count cached by getToalPageForEbay / getToalPageForAmazon: both recompute it only
    // when called with pageNo == 1 and otherwise return the stored value.
    public int totalPage=0;
   
    public WebClient buildWebClient(){
        WebClient client = new WebClient(BrowserVersion.FIREFOX_24);
        WebClientOptions clientOptions = client.getOptions();
        // 设置webClient的相关参数
        clientOptions.setUseInsecureSSL(true);
        clientOptions.setRedirectEnabled(true);
        clientOptions.setCssEnabled(false);
        client.setAjaxController(new NicelyResynchronizingAjaxController());
        clientOptions.setTimeout(60000);
        clientOptions.setThrowExceptionOnFailingStatusCode(false);
        clientOptions.setThrowExceptionOnScriptError(false);
        return client;
    }
   
    @SuppressWarnings("unchecked")
    public Document baseParse(HtmlPage htmlPage, String xpath) {
        List list = (List) htmlPage.getByXPath(xpath);
        DomElement domElement = list.size() == 0 ? null : list.get(0);
        if (null == domElement) {
            return null;
        }
        HtmlElement htmlElement = (HtmlElement) domElement;
        String xmlContent = htmlElement.asXml();
        Document document = Jsoup.parse(xmlContent);
        return document;

    }
   
    @SuppressWarnings("unchecked")
    public List baseParse2Documents(HtmlPage htmlPage, String xpath) {
        List documents=new ArrayList();
        List lists = (List) htmlPage.getByXPath(xpath);
        if(null!=lists&&lists.size()>0){
           for(DomElement dom:lists){
               HtmlElement htmlElement = (HtmlElement) dom;
               String xmlContent = htmlElement.asXml();
               Document document = Jsoup.parse(xmlContent);
               documents.add(document);
           }
        }
      
        return documents;

    }
   
    public int getToalPageForEbay(int pageNo,String css,HtmlPage htmlPage,int len){
        if(htmlPage==null){
            return 0;
        }
        if(pageNo==1){
            String html=htmlPage.getWebResponse().getContentAsString();
            Document document=Jsoup.parse(html);
            Elements resultDiv= document.select(css);
            if(null!=resultDiv&&resultDiv.size()>0){
                String result=resultDiv.text();
                if(!StringUtils.isEmpty(result)){
                    result=result.replace(",", "").replace(".", "");
                    int total=Integer.parseInt(result);
                    totalPage=(total%len)==0?total/len:(total/len)+1;
                }
            }
        }
        return totalPage;
    }
   
    /**
     * 计算亚马逊搜索商品的总页数
     *
     *@author zhaishanhui
     * @param pageNo
     * @param id
     * @param htmlPage
     * @param len
     * @return
     */
    public int getToalPageForAmazon(int pageNo,String id,HtmlPage htmlPage,int len){
       
        if(htmlPage==null){
            return 0;
        }
        if(pageNo==1){
            String html=htmlPage.getWebResponse().getContentAsString();
            Document document=Jsoup.parse(html);
            Element resultDiv= document.getElementById(id);
            if(null!=resultDiv){
                String result=resultDiv.text();
                if(!StringUtils.isEmpty(result)){
                    //截取取出总总记录数
                    int begin=result.indexOf("of")+2;
                    int end=result.indexOf("results");
                    //判断截串位置,以免报null
                    if(begin>0&&end>0&&begin                         result=result.substring(begin,end);
                        result=result.replaceAll(",", "").trim();
                        int total=Integer.parseInt(result);
                        //计算总页数
                        totalPage=(total%len)==0?total/len:(total/len)+1;
                    }
                 
                }
            }
        }
        return totalPage;
    }
    
        
    }
    // Setter for the DAO used when saving products.
    // NOTE(review): amazonProductDao is a field of AmazonProductServiceImpl, so this setter and
    // the braces around it look like the tail of that class, displaced below the BaseCrawler
    // listing - confirm against the original source.
    public void setAmazonProductDao(AmazonProductDao amazonProductDao) {
        this.amazonProductDao = amazonProductDao;
    }

}

线程类

// Worker that crawls a slice of the URL list: for each entry in [begin, end) it fetches the
// product page and stores the extracted product.
// NOTE(review): the listing of this class is incomplete; helpers called from getHtml
// (parseTop, parseProductUrl, parseProductPrice, parseShopNum) and the closing brace are not
// shown.
public class AmazonPrductThread extends BaseCrawler implements Runnable{

   
    // Site root; run() appends "/dp/" + ASIN to build the product URL.
    private final String baseUrl="http://www.amazon.com";

    private  Logger logger = Logger.getLogger(AmazonPrductThread.class); 
   
    // Web client of the page currently being fetched; assigned in getHtmlPage.
    private WebClient    client;

    // Same name and value as BaseCrawler.CLASS, which this field hides.
    private final String CLASS="class";
   
    // Condition passed to categoryService.findByCondtion when resolving the category.
    private String condtion=" site_id=? and third_cate_id=? and platform_id=?";
   
    // Pattern for a number with an optional two-digit fraction; not referenced in the code
    // shown here.
    private String regex2Double="\\d+(\\.\\d{2})?";

    // The three services below are stored by the constructor but not used in the code shown.
    private AmazonDetailService amazonDetailService;
   

    private AmazonLostbuyProductService amazonLostbuyProductService;

    // Used by getHtml to persist the extracted product.
    private AmazonProductService amazonProductService;
   
    private AmazonNewProductService amazonNewProductService;
   
    private CategoryService categoryService; // category service
   
    // Entries to crawl; run() reads getStieName(), getAsin() and getSku() from each element.
    // NOTE(review): declared as a raw List here although run() calls element methods directly,
    // so an element type argument has presumably been lost - confirm.
    private List amazonUrlNews;
   
    // First index (inclusive) of the slice handled by this worker.
    private int begin;
   
    // End index of the slice handled by this worker.
    private int end;
   
   
    public AmazonPrductThread(AmazonDetailService amazonDetailService,AmazonLostbuyProductService amazonLostbuyProductService,
                              AmazonProductService amazonProductService,AmazonNewProductService amazonNewProductService,
                              CategoryService categoryService,List amazonUrlNews,int begin,int end){
        this.amazonDetailService=amazonDetailService;
        this.amazonLostbuyProductService=amazonLostbuyProductService;
        this.amazonProductService=amazonProductService;
        this.amazonNewProductService=amazonNewProductService;
        this.categoryService=categoryService;
        this.amazonUrlNews=amazonUrlNews;
        this.begin=begin;
        this.end=end;
       
    }
    /**
     * @see java.lang.Runnable#run()
     */
    @Override
    public void run() {
        if(null!=amazonUrlNews&&amazonUrlNews.size()>0){
            for(int i=begin;i                 String stieName=amazonUrlNews.get(i).getStieName();
                if(stieName.indexOf("US")>=0){
                    String url=baseUrl+"/dp/"+amazonUrlNews.get(i).getAsin();
                    getHtml(url,amazonUrlNews.get(i).getSku());
                }
               
            }
        }
    }

 
    public HtmlPage getHtmlPage(String url)throws Exception{
        client=this.buildWebClient();
        HtmlPage htmlPage = client.getPage(url);
        return htmlPage;
    }
   
    /**

     * 通过httpclient得到html内容

     *@author zhaishanhui

     * @param pid

     * @return String

     * @throws Exception 

     */
    public void getHtml(String url,String sku){
        try{
            HtmlPage htmlPage=getHtmlPage(url);
            String topHref=parseTop(htmlPage);
            AmazonProduct amazonProduct= new AmazonProduct();
            if(StringUtils.isNoneBlank(topHref)){
                 String cateId=topHref.substring(topHref.lastIndexOf("=")+1);
                 amazonProduct.setCateId(new BigInteger(cateId));
                 Category category= categoryService.findByCondtion(condtion, new Object[]{1,cateId,1});
                 if(null!=category){
                     //设置站点id
                     amazonProduct.setSiteId(BigInteger.valueOf(category.getId()));
                 }
            }
            BigInteger id=GenerateIdUtil.getUUID();
            amazonProduct.setId(id);
            amazonProduct.setSku(sku);
            amazonProduct.setAsin(url.substring(url.lastIndexOf("dp/")+3));
            amazonProduct= parseProductUrl(htmlPage,amazonProduct);
            amazonProduct=parseProductPrice(htmlPage,amazonProduct);
            amazonProduct= parseShopNum(htmlPage,amazonProduct);
            List urls=amazonProduct.getUrls();
            //市场售价
            double salesPrice=amazonProduct.getSalesPrice();
            //实际销售价格
            double marketPrice=amazonProduct.getMarketPrice();
            amazonProductService.saveAmazonProduct(amazonProduct);
            
            
           
            client.closeAllWindows();
        }catch(Exception e){
            e.printStackTrace();
            logger.error(e.getMessage());
        }
    }

  
6、job类的实现

基础job类

public abstract class BaseJob {

    private  Logger logger = Logger.getLogger(BaseJob.class);
    /**
     * 计算创建线程的个数
     *
     *@author zhaishanhui
     * @param t
     */
    public void countThread(List t){
        if(null!=t&&t.size()>0){
            int len=t.size();
            //创建线程的个数
            int count=len%ConstantTools.MAX_SIZE==0?len/ConstantTools.MAX_SIZE:(len/ConstantTools.MAX_SIZE)+1;
            for(int i=0;i                 //每个线程负责插入数据的开始位置
                int start=i*ConstantTools.MAX_SIZE;
                //每个线程负责插入数据的结束位置
                int end=i==count-1?i*ConstantTools.MAX_SIZE+(len-i*ConstantTools.MAX_SIZE):i*ConstantTools.MAX_SIZE+ConstantTools.MAX_SIZE;
                createThread(t,start,end); 
                logger.info("from:"+start+" to:"+end);
              }
           }
    }
   
    //模块模块,带钩子的方法,要求子类去实现
    protected void createThread(List t,int begin,int end){};
}

/**亚马逊商品抓取定时任务
 *
 * @author zhaishanhui
 * @version Id: AmazonPrductJob.java, v 0.1 2014年9月25日 下午3:14:56 zhaishanhui 
 */
public class AmazonPrductJob extends BaseJob{

  private  Logger logger = Logger.getLogger(AmazonPrductJob.class); 
   

    private AmazonDetailService amazonDetailService; //商品详情接口
   

    private AmazonLostbuyProductService amazonLostbuyProductService; //流失商品接口

    private AmazonProductService amazonProductService; //亚马逊商品接口
   
    private AmazonNewProductService amazonNewProductService; //跟卖商品接口
   
    private CategoryService categoryService; //类别接口
   
    private AmazonUrlNewService amazonUrlNewService; //url接口类
   
   
    public void execute(){
        /*String key="amazonProductPath";
        String path=ParseXml.getPath(key);
       List urls=ParseXml.getCommonUrl(path);
       if(urls!=null&&urls.size()>0){
           for(String url:urls){
               getHtml(url);  
           }
       }*/
         //B00I4XMEYA 
        logger.info("开始插入数据");
        //long begin=System.currentTimeMillis();
        //查询所有的url地址
        List amazonUrlNews=amazonUrlNewService.findALL(null, null);
        this.countThread(amazonUrlNews);
     /*   if(null!=amazonUrlNews&&amazonUrlNews.size()>0){
            int len=amazonUrlNews.size();
            int count=len%MAX_SIZE==0?len/MAX_SIZE:(len/MAX_SIZE)+1;
            for(int i=0;i                 int start=i*MAX_SIZE;
                int end=i==count-1?i*MAX_SIZE+(len-i*MAX_SIZE):i*MAX_SIZE+MAX_SIZE;
                creatAmazonProductThread(amazonUrlNews,start,end); 
              }
           }*/
    }
   
   /* public void creatAmazonProductThread(List amazonUrlNews,int begin,int end){
        AmazonPrductThread amazonPrductThread= new AmazonPrductThread(amazonDetailService,amazonLostbuyProductService,amazonProductService,
            amazonNewProductService,categoryService,amazonUrlNews,begin,end);
        new Thread(amazonPrductThread).start();
    }*/

    //创建线程,由于子类各自的实现不同,父类交给子类来实现,带钩子的方法
    @Override
    protected void createThread(List amazonUrlNews, int begin, int end) {
        AmazonPrductThread amazonPrductThread= new AmazonPrductThread(amazonDetailService,amazonLostbuyProductService,amazonProductService,
            amazonNewProductService,categoryService,amazonUrlNews,begin,end);
        new Thread(amazonPrductThread).start();
    }

    public void setAmazonDetailService(AmazonDetailService amazonDetailService) {
        this.amazonDetailService = amazonDetailService;
    }

    public void setAmazonLostbuyProductService(AmazonLostbuyProductService amazonLostbuyProductService) {
        this.amazonLostbuyProductService = amazonLostbuyProductService;
    }

    public void setAmazonProductService(AmazonProductService amazonProductService) {
        this.amazonProductService = amazonProductService;
    }

    public void setAmazonNewProductService(AmazonNewProductService amazonNewProductService) {
        this.amazonNewProductService = amazonNewProductService;
    }

    public void setCategoryService(CategoryService categoryService) {
        this.categoryService = categoryService;
    }

    public void setAmazonUrlNewService(AmazonUrlNewService amazonUrlNewService) {
        this.amazonUrlNewService = amazonUrlNewService;
    }
}

代码不是很完整,这里只给大家提供一下思路。所用的技术:HtmlUnit + Quartz + Jsoup + 58同城的框架。

 

 


 

你可能感兴趣的:(java)