和之前爬取天气网站一样,现在用webmagic爬取中关村在线华为手机的评论。(http://detail.zol.com.cn/405/404275/review.shtml)
之前的天气网站的数据是静态的,解析时很容易就能获取;这次的评论数据不一样,是通过 JS 动态加载的。
按 F12 打开开发者工具,点击评论的第二页时,可以在 Network 面板中看到一条 XHR 请求。
可以发现请求的规律:一款手机对应一个 proId,page 为页码(URL 中写作 %5Epage,%5E 即"^"的 URL 编码),因此我们可以自己构造请求,模拟浏览器发送,从而获得每一页的数据。
完整代码如下:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
public class HuaweiRepoPageProcessor implements PageProcessor {
// 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
/*
* ascii码转汉字
*/
private static String ascii2native(String asciicode)
{
String[] asciis = asciicode.split ("\\\\u");
String nativeValue = asciis[0];
try
{
for ( int i = 1; i < asciis.length; i++ )
{
String code = asciis[i];
nativeValue += (char) Integer.parseInt (code.substring (0, 4), 16);
if (code.length () > 4)
{
nativeValue += code.substring (4, code.length ());
}
}
}
catch (NumberFormatException e)
{
return asciicode;
}
return nativeValue;
}
/*
* 获取Ajax请求
*/
public static String getAjax(int arg,int pagei){
return "http://detail.zol.com.cn/xhr3_Review_GetListAndPage_isFilter=0%5EproId="+arg+"%5Epage="+pagei+".html";
}
public static String getType(int arg){
return "/"+(arg/1000+1);
}
public static int getMobileArg(String url){
Pattern p = Pattern.compile("[\\d]+");
Matcher m = p.matcher(url);
m.find();
m.group();
m.find();
return Integer.parseInt(m.group());
}
/*
* 获取下一页详情页链接
*/
public static List getNext(String nextUrl,String type){
List res = new ArrayList();
URL url;
try {
url = new URL(nextUrl);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.connect();
InputStream urlStream = connection.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(urlStream));
//str就是页面代码,用split函数和正则表达式分割str
String str=reader.readLine();
while(true){
int l = str.indexOf(type);
if(l==-1)break;
int r = str.indexOf("tagNav");
String nexturl = str.substring(l, r+6).replace("\\", "");
//System.out.println(nexturl);
res.add("http://detail.zol.com.cn"+nexturl);
str = str.substring(r+6);
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return res;
}
/*
* 获取回复的Ajax请求
*/
public static String getReply(String url,int pagei){
Pattern p = Pattern.compile("[\\d]+");
Matcher m = p.matcher(url);
List num = new ArrayList();
int numi = 0;
while(m.find()){
//System.out.println(m.group());
num.add(m.group());
}
String nu [] = {"",""};
int i=0;
for(String s : num){
if(s.length()>5){
nu[i++] = s;
}
}
String res = "http://detail.zol.com.cn/xhr3_Review_GetReplyPart_reviewId="+nu[1]+"%5EsubcateId=57%5EproId="+nu[0]+"%5EisReviewDetail=1%5EsubPageType=Review%5Epage="+pagei+".html";
//System.out.println(res);
//System.out.println("****************");
return res;
}
/*
* 根据Ajax请求得到Document
*/
public static Document getReplyDoc(String docUrl){
URL url;
Document docList = null;
try {
url = new URL(docUrl);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.connect();
InputStream urlStream = connection.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(urlStream));
//str就是页面代码
String str=reader.readLine();
str = ascii2native(str);
//System.out.println(str);
String s = str.replace("\\", "");
//System.out.println(s);
docList = Jsoup.parse(s);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return docList;
}
public static void print(HuaweiMobile review){
System.out.println("型号:"+(review.getXinghao()==null?"":review.getXinghao()));
System.out.println("时间:"+(review.getTime()==null?"":review.getTime()));
System.out.println("地点:"+(review.getPlace()==null?"":review.getPlace()));
System.out.println("标题:"+review.getTitle());
System.out.println("优点:"+review.getYoudian());
System.out.println("缺点:"+review.getQuedian());
System.out.println("总结:"+review.getZongjie());
System.out.println("内容:"+review.getContent());
for(Entry map : review.getReply().entrySet()){
System.out.println(map.getKey()+"\t"+map.getValue());
}
System.out.println("****************************");
}
/*
* 解析页面
*/
public void process(Page page) {
// 部分二:定义如何抽取页面信息,并保存下来
//System.out.println(my++);
String url = page.getUrl().toString();
//System.out.println(page.);
//System.out.println(url);
if(url.endsWith("tagNav")){
HuaweiMobile review = new HuaweiMobile();
Html html = page.getHtml();
String xinghao = Jsoup.parse(html.xpath("/html/body/div[3]/div[3]/a[4]").toString()).getElementsByTag("a").get(0).text();
review.setXinghao(xinghao);
String time = Jsoup.parse(html.xpath("//*[@id=\"J_CommentContent\"]/div[2]/h3/span").toString()).getElementsByTag("span").get(0).text();
review.setTime(time);
//System.out.println(xinghao);
//System.out.println(time);
String content = html.toString();
Document docList = Jsoup.parse(content);
Elements pro = docList.getElementsByClass("product-parameter");
if(pro.size()>0){
Elements LiTag = pro.get(0).getElementsByTag("li");
for(int i = 0;i
String text = LiTag.get(i).text();
//System.out.println(LiTag.get(i).text());
String span = LiTag.get(i).getElementsByTag("span").get(0).text();
if(span.matches(".*型号.*")){
review.setXinghao(LiTag.get(i).text().replaceAll("产品型号:", ""));
}else if(span.matches(".*时间.*")){
review.setPlace(LiTag.get(i).text().replaceAll("[\\d-()时间地点:]", ""));
review.setTime(LiTag.get(i).text().replaceAll("[^\\d-]", ""));
}
}
}
String comcontent = null;
Elements comtit = docList.getElementsByClass("comments-content");
if(comtit.size()>0){
if(comtit.get(0).getElementsByTag("h3").size()>0){
String tit = comtit.get(0).getElementsByTag("h3").get(0).text();
comcontent = tit.replaceAll("[\\d-]", "");
review.setTitle(comcontent);
}
}
Elements com = docList.getElementsByClass("comments-words");
for(int i = 0;i
Elements strongs = com.get(i).getElementsByTag("strong");
String strong = strongs.get(0).html();
//System.out.println(strongs.get(0).html());
Elements ps = com.get(i).getElementsByTag("p");
Elements spans = ps.get(0).getElementsByTag("span");
//System.out.println(spans.get(0).html());
if(strong.matches(".*优点.*")){
review.setYoudian(spans.get(0).html());
comcontent += "\n优点:\n"+spans.get(0).html();
}else if(strong.matches(".*缺点.*")){
review.setQuedian(spans.get(0).html());
comcontent += "\n缺点:\n"+spans.get(0).html();
}else if(strong.matches(".*总结.*")){
review.setZongjie(spans.get(0).html());
comcontent += "\n总结:\n"+spans.get(0).html();
}
}
review.setContent(comcontent);
//System.out.println(getReply(url, 1));
int pagei = 1;
while(true){
Document doc = getReplyDoc(getReply(url, pagei));
Elements ereply = doc.getElementsByClass("reply-item");
//System.out.println(ereply.size());
if(ereply.size()==0)break;
for(int i = 0 ;i
review.addReply(ereply.get(i).getElementsByTag("em").get(0).text(), ereply.get(i).getElementsByTag("p").get(0).text());
}
pagei++;
}
print(review);
}else if(url.matches("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml")){
int mobileArg = getMobileArg(url);
int pagei = 1;
while(true){
List s = getNext(getAjax(mobileArg, pagei),getType(mobileArg));
if(s.size()==0)break;
page.addTargetRequests(s);
pagei++;
}
}
// 部分三:从页面发现后续的url地址来抓取
page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/cell_phone/index[\\d]+.shtml").all());
page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml").all());
//page.addTargetRequests(page.getHtml().links().regex("http://detail\\.zol\\.com\\.cn/index\\.php?c=AjaxVer3_Review&a=GetListAndPage&isFilter=0&proId=386269&page=[\\d]+").all());
}
/*
* 获取链接,启动爬虫
*/
public static void huaweiSpider(int mobileArg){
Spider spider = Spider.create(new HuaweiRepoPageProcessor());
int pagei = 1;
while(true){
List s = getNext(getAjax(mobileArg, pagei),getType(mobileArg));
if(s.size()==0)break;
for(String a:s){
//System.out.println(a);
spider.addUrl(a);
}
pagei++;
}
//Spider.create(new HuaweiRepoPageProcessor()).addUrls(ss).thread(5).run();
spider.thread(5).run();
}
public Site getSite() {
return site;
}
public static void main(String[] args) {
//huaweiSpider(395493);
Spider.create(new HuaweiRepoPageProcessor()).addUrl("http://detail.zol.com.cn/cell_phone_index/subcate57_613_list_1.html").thread(5).run();
}
}