使用webmagic 爬取中关村评论

和之前爬取天气网站一样,现在用webmagic爬取中关村在线华为手机的评论。(http://detail.zol.com.cn/405/404275/review.shtml)
之前的天气网站数据是静态的,解析时很容易就能获取;这次的评论数据不一样,是通过js动态加载的。

f12打开开发人员工具,点击第二页时我们可以在network中看到一条xhr请求
使用webmagic 爬取中关村评论_第1张图片

可以发现请求的规律:一款手机对应一个proId,page为页码(URL中的%5E是字符"^"的URL编码,用来分隔各个参数),因此我们可以构造请求,模拟浏览器发送来获得每一页数据。

完整代码如下:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;


public class HuaweiRepoPageProcessor implements PageProcessor {

    // Part 1: crawl configuration for the target site (e.g. encoding, interval between requests, retry count).
    // Here: retry a failed download up to 3 times and use a sleep time of 1000 between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /** Matches one escape: a backslash, the letter u, then exactly four hexadecimal digits. */
    private static final Pattern UNICODE_ESCAPE = Pattern.compile("\\\\u([0-9a-fA-F]{4})");

    /**
     * Decodes backslash-u escapes (backslash, 'u', four hex digits) into the
     * characters they stand for, e.g. the six-character text for code point 4e2d
     * becomes the single CJK character.
     *
     * <p>Anything that is not a complete, well-formed escape (too few digits,
     * non-hex digits, a sign character) is copied through unchanged, so a single
     * malformed escape no longer prevents the rest of the text from being decoded
     * and a truncated trailing escape no longer throws.
     *
     * @param asciicode text that may contain escapes; may be {@code null}
     * @return the decoded text, or the empty string when {@code asciicode} is {@code null}
     */
    private static String ascii2native(String asciicode)
    {
        if (asciicode == null)
        {
            return "";
        }
        Matcher escape = UNICODE_ESCAPE.matcher(asciicode);
        StringBuilder decoded = new StringBuilder(asciicode.length());
        int copiedUpTo = 0;
        while (escape.find())
        {
            // Copy the literal text between the previous escape and this one.
            decoded.append(asciicode, copiedUpTo, escape.start());
            // The regex guarantees four hex digits, so parseInt cannot fail here.
            decoded.append((char) Integer.parseInt(escape.group(1), 16));
            copiedUpTo = escape.end();
        }
        decoded.append(asciicode, copiedUpTo, asciicode.length());
        return decoded.toString();
    }
    /**
     * Builds the URL of the Ajax request that returns one page of the review list.
     *
     * @param arg   product id placed after {@code proId=}
     * @param pagei page number placed after {@code page=}
     * @return the request URL as a string
     */
    public static String getAjax(int arg,int pagei){
        StringBuilder request = new StringBuilder("http://detail.zol.com.cn/xhr3_Review_GetListAndPage_isFilter=0%5EproId=");
        request.append(arg);
        request.append("%5Epage=");
        request.append(pagei);
        request.append(".html");
        return request.toString();
    }
    /**
     * Derives the leading path segment used in review links for a product id:
     * a slash followed by {@code arg / 1000 + 1} (integer division),
     * e.g. 404275 gives "/405".
     *
     * @param arg product id
     * @return the path segment, including the leading slash
     */
    public static String getType(int arg){
        int bucket = arg / 1000 + 1;
        return "/".concat(Integer.toString(bucket));
    }
    /** A run of one or more digits; compiled once because the pattern never changes. */
    private static final Pattern MOBILE_ARG_NUMBER = Pattern.compile("[\\d]+");

    /**
     * Extracts the product id from a review URL: the second run of digits in the
     * string, e.g. 404275 in {@code http://detail.zol.com.cn/405/404275/review.shtml}.
     *
     * @param url URL to scan
     * @return the second number found in {@code url}
     * @throws IllegalArgumentException if {@code url} is {@code null}, contains fewer
     *         than two numbers, or the second number does not fit in an {@code int}
     *         (the latter surfaces as {@link NumberFormatException})
     */
    public static int getMobileArg(String url){
        if (url == null) {
            throw new IllegalArgumentException("url must not be null");
        }
        Matcher numbers = MOBILE_ARG_NUMBER.matcher(url);
        // Skip the first number and stop on the second; fail loudly if either is missing
        // instead of letting Matcher.group() throw an unexplained IllegalStateException.
        if (!numbers.find() || !numbers.find()) {
            throw new IllegalArgumentException("expected at least two numbers in url: " + url);
        }
        return Integer.parseInt(numbers.group());
    }
    /**
     * Downloads one page of the review-list Ajax response and extracts the links
     * to the individual review detail pages.
     *
     * <p>A link is every span of the response that starts at an occurrence of
     * {@code type} and ends with the next following {@code "tagNav"}. Backslashes
     * inside the span are removed and the host prefix is prepended.
     *
     * <p>I/O problems (including a malformed {@code nextUrl}) are reported on
     * standard error and yield whatever was collected so far, normally an empty
     * list; callers in this class use the empty list as their stop condition.
     *
     * @param nextUrl URL of the Ajax request, see {@code getAjax}
     * @param type    text that marks the start of a link, see {@code getType}
     * @return absolute detail-page URLs in order of appearance; never {@code null}
     */
    public static List<String> getNext(String nextUrl,String type){
        final String endMarker = "tagNav";
        List<String> res = new ArrayList<>();
        // try-with-resources closes the reader and the underlying stream on every path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(nextUrl).openStream()))) {
            // Collect the whole body; an empty response simply produces no links
            // instead of a NullPointerException on a null first line.
            StringBuilder body = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
            String str = body.toString();

            int from = 0;
            while (true) {
                int start = str.indexOf(type, from);
                if (start == -1) {
                    break;
                }
                // Look for the end marker only after the link start, so a stray
                // earlier "tagNav" cannot produce a reversed substring range.
                int end = str.indexOf(endMarker, start);
                if (end == -1) {
                    break;
                }
                end += endMarker.length();
                res.add("http://detail.zol.com.cn" + str.substring(start, end).replace("\\", ""));
                from = end;
            }
        } catch (IOException e) {
            // Best effort: MalformedURLException is an IOException as well.
            e.printStackTrace();
        }
        return res;
    }
    /** A run of one or more digits; compiled once because the pattern never changes. */
    private static final Pattern REPLY_URL_NUMBER = Pattern.compile("[\\d]+");

    /**
     * Builds the URL of the Ajax request that returns one page of replies to a review.
     *
     * <p>The ids are taken from {@code url}: of all digit runs longer than five
     * characters, the first is used as {@code proId} and the second as
     * {@code reviewId}; any further long numbers are ignored. An id that cannot be
     * found is left empty in the result.
     * NOTE(review): a product or review id of five digits or fewer is skipped by
     * this length filter - confirm that such ids cannot occur.
     *
     * @param url   URL of the review detail page
     * @param pagei reply page number placed after {@code page=}
     * @return the request URL as a string
     */
    public static String getReply(String url,int pagei){
        Matcher numbers = REPLY_URL_NUMBER.matcher(url);
        String[] ids = {"", ""};
        int found = 0;
        // Stop after two ids so a URL with more long numbers cannot overrun the array.
        while (found < ids.length && numbers.find()) {
            String digits = numbers.group();
            if (digits.length() > 5) {
                ids[found++] = digits;
            }
        }
        return "http://detail.zol.com.cn/xhr3_Review_GetReplyPart_reviewId="+ids[1]+"%5EsubcateId=57%5EproId="+ids[0]+"%5EisReviewDetail=1%5EsubPageType=Review%5Epage="+pagei+".html";
    }
    /**
     * Downloads an Ajax response and parses it as HTML.
     *
     * <p>The body is passed through {@code ascii2native}, stripped of all
     * backslashes and then handed to jsoup.
     *
     * <p>On an I/O problem (including a malformed {@code docUrl}) the error is
     * reported on standard error and the result of parsing an empty string is
     * returned, so callers never receive {@code null}.
     *
     * @param docUrl URL of the Ajax request, see {@code getReply}
     * @return the parsed document; never {@code null}
     */
    public static Document getReplyDoc(String docUrl){
        // try-with-resources closes the reader and the underlying stream on every path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(docUrl).openStream()))) {
            // Collect the whole body; an empty response yields an empty document
            // instead of a NullPointerException on a null first line.
            StringBuilder body = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
            String decoded = ascii2native(body.toString());
            return Jsoup.parse(decoded.replace("\\", ""));
        } catch (IOException e) {
            // Best effort: MalformedURLException is an IOException as well.
            e.printStackTrace();
            return Jsoup.parse("");
        }
    }

    /**
     * Writes one review to standard output: one labelled line per field, then one
     * tab-separated line per reply entry, then a separator line. A {@code null}
     * model, time or place is printed as an empty value.
     *
     * @param review the review to print
     */
    public static void print(HuaweiMobile review){
        String xinghao = review.getXinghao();
        System.out.println("型号:" + (xinghao != null ? xinghao : ""));
        String time = review.getTime();
        System.out.println("时间:" + (time != null ? time : ""));
        String place = review.getPlace();
        System.out.println("地点:" + (place != null ? place : ""));

        System.out.println("标题:" + review.getTitle());
        System.out.println("优点:" + review.getYoudian());
        System.out.println("缺点:" + review.getQuedian());
        System.out.println("总结:" + review.getZongjie());
        System.out.println("内容:" + review.getContent());

        for (Entry<?, ?> reply : review.getReply().entrySet()) {
            System.out.println(reply.getKey() + "\t" + reply.getValue());
        }
        System.out.println("****************************");
    }
    /*
     * 解析页面
     */
    public void process(Page page) {
        // 部分二:定义如何抽取页面信息,并保存下来
        //System.out.println(my++);
        String url = page.getUrl().toString();
        //System.out.println(page.);
        //System.out.println(url);
        if(url.endsWith("tagNav")){

            HuaweiMobile review = new HuaweiMobile();
            Html html = page.getHtml();

            String xinghao = Jsoup.parse(html.xpath("/html/body/div[3]/div[3]/a[4]").toString()).getElementsByTag("a").get(0).text();
            review.setXinghao(xinghao);
            String time = Jsoup.parse(html.xpath("//*[@id=\"J_CommentContent\"]/div[2]/h3/span").toString()).getElementsByTag("span").get(0).text();
            review.setTime(time);
            //System.out.println(xinghao);
            //System.out.println(time);

            String content = html.toString();
            Document docList = Jsoup.parse(content);

            Elements pro = docList.getElementsByClass("product-parameter");

            if(pro.size()>0){
                Elements LiTag = pro.get(0).getElementsByTag("li");

                for(int i = 0;i
                    String text = LiTag.get(i).text();
                    //System.out.println(LiTag.get(i).text());
                    String span = LiTag.get(i).getElementsByTag("span").get(0).text();
                    if(span.matches(".*型号.*")){
                        review.setXinghao(LiTag.get(i).text().replaceAll("产品型号:", ""));

                    }else if(span.matches(".*时间.*")){
                        review.setPlace(LiTag.get(i).text().replaceAll("[\\d-()时间地点:]", ""));
                        review.setTime(LiTag.get(i).text().replaceAll("[^\\d-]", ""));
                    }
                }

            }
            String comcontent = null;
            Elements comtit = docList.getElementsByClass("comments-content");
            if(comtit.size()>0){
                if(comtit.get(0).getElementsByTag("h3").size()>0){
                    String tit = comtit.get(0).getElementsByTag("h3").get(0).text();
                    comcontent = tit.replaceAll("[\\d-]", "");
                    review.setTitle(comcontent);
                }   
            }


            Elements com = docList.getElementsByClass("comments-words");
            for(int i = 0;i
                Elements strongs = com.get(i).getElementsByTag("strong");
                String strong = strongs.get(0).html();

                //System.out.println(strongs.get(0).html());
                Elements ps = com.get(i).getElementsByTag("p");
                Elements spans = ps.get(0).getElementsByTag("span");
                //System.out.println(spans.get(0).html());
                if(strong.matches(".*优点.*")){
                    review.setYoudian(spans.get(0).html());
                    comcontent += "\n优点:\n"+spans.get(0).html();
                }else if(strong.matches(".*缺点.*")){
                    review.setQuedian(spans.get(0).html());
                    comcontent += "\n缺点:\n"+spans.get(0).html();
                }else if(strong.matches(".*总结.*")){
                    review.setZongjie(spans.get(0).html());
                    comcontent += "\n总结:\n"+spans.get(0).html();
                }
            }
            review.setContent(comcontent);
            //System.out.println(getReply(url, 1));
            int pagei = 1;
            while(true){
                Document doc = getReplyDoc(getReply(url, pagei));
                Elements ereply = doc.getElementsByClass("reply-item");
                //System.out.println(ereply.size());
                if(ereply.size()==0)break;
                for(int i = 0 ;i
                    review.addReply(ereply.get(i).getElementsByTag("em").get(0).text(), ereply.get(i).getElementsByTag("p").get(0).text());
                }
                pagei++;
            }

            print(review);

        }else if(url.matches("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml")){
            int mobileArg = getMobileArg(url);
            int pagei = 1;
            while(true){
                List s = getNext(getAjax(mobileArg, pagei),getType(mobileArg));

                if(s.size()==0)break;
                page.addTargetRequests(s);  
                pagei++;
            }
        }




        // 部分三:从页面发现后续的url地址来抓取
        page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/cell_phone/index[\\d]+.shtml").all());
        page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml").all());
        //page.addTargetRequests(page.getHtml().links().regex("http://detail\\.zol\\.com\\.cn/index\\.php?c=AjaxVer3_Review&a=GetListAndPage&isFilter=0&proId=386269&page=[\\d]+").all());
    }
    /**
     * Crawls all reviews of a single product: collects the review detail links
     * from every page of the review-list Ajax response, seeds a spider with them
     * and runs it with 5 threads. Paging stops at the first page that yields no
     * links.
     *
     * @param mobileArg product id
     */
    public static void huaweiSpider(int mobileArg){
        Spider spider = Spider.create(new HuaweiRepoPageProcessor());
        // The path prefix depends only on the product id, so compute it once.
        String type = getType(mobileArg);
        int pagei = 1;
        while (true) {
            // getNext may be declared with a raw List; it only ever adds strings.
            @SuppressWarnings("unchecked")
            List<String> s = getNext(getAjax(mobileArg, pagei), type);
            if (s.isEmpty()) {
                break;
            }
            for (String a : s) {
                spider.addUrl(a);
            }
            pagei++;
        }

        spider.thread(5).run();
    }

    /**
     * Returns the crawl configuration held by this processor.
     *
     * @return the {@code site} field
     */
    public Site getSite() {
        return this.site;
    }

    /**
     * Entry point: starts a 5-thread crawl from the phone list page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Alternative: crawl a single product by id, e.g. huaweiSpider(395493).
        String seed = "http://detail.zol.com.cn/cell_phone_index/subcate57_613_list_1.html";
        Spider spider = Spider.create(new HuaweiRepoPageProcessor());
        spider = spider.addUrl(seed);
        spider = spider.thread(5);
        spider.run();
    }
}

你可能感兴趣的:(web爬虫)