/*使用javascript编写的爬虫源码,用于爬取京东商品信息和评价。
代码粘贴到神箭手云爬虫平台(http://www.shenjianshou.cn/)上就可以直接跑了,
不需要安装编译环境。要爬取其他网站,可以更改源码即可。
代码执行具体步骤点这里
更多源码下载点这里
*/
// User-configurable inputs.
// NOTE(review): the trailing `//@input(...)` annotations look like platform
// directives (name, label, description) — presumably parsed by the crawler
// platform to expose these values as inputs, so they are kept verbatim and on
// the same line as the value. Confirm before reformatting them.

// Search keyword; the JD search results for it are crawled.
var keyword = "d3.js";//@input(keyword,查询关键字,爬取该关键字搜索出来的京东商品)
// Upper bound on the number of comments to crawl.
var comment_count = 100;//@input(comment_count,爬取的评论数,最多爬取多少条评论)
// Number of comment pages to request; the division assumes 10 comments per page.
var page_count = comment_count / 10;
keyword = keyword.trim();

// Entry URL: page 1 of the search results, with spaces in the keyword sent as "+".
var scanUrls = [];
scanUrls.push("http://search.jd.com/Search?keyword=" + keyword.replace(/ /g, "+") +
    "&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=1&s=1&click=0");

// Regex source matching the search result pages for this keyword at any page
// number. Every regex metacharacter in the keyword is escaped first so that
// keywords such as "c++" or "a(b)" are matched literally; spaces are then
// mapped to an escaped "+" to mirror the scan URL above.
var helperUrlRegexes = [];
helperUrlRegexes.push("http://search\\.jd\\.com/Search\\?keyword=" +
    keyword.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&").replace(/ /g, "\\+") +
    "&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=\\d+&s=1&click=0");
// Crawler configuration: which hosts may be visited, where crawling starts,
// which URLs are product (content) pages versus search result (helper) pages,
// and which fields are extracted from each product page.
var configs = {
    domains: ["search.jd.com", "item.jd.com", "club.jd.com"],
    scanUrls: scanUrls,
    // Product detail pages. The dot before "html" is escaped so it only
    // matches a literal "." rather than any character.
    contentUrlRegexes: ["http://item\\.jd\\.com/\\d+\\.html"],
    helperUrlRegexes: helperUrlRegexes,
    fields: [
        {
            // First extracted field: the product title.
            name: "title",
            selector: "//div[@id='name']/h1",
            required: true
        },
        {
            // Second extracted field: the product id.
            name: "productid",
            selector: "//div[contains(@class,'fl')]/span[2]",
            required: true
        },
        {
            // One entry per <span> under the element with id "comment-pages";
            // each entry carries a page number and the comments for that page.
            name: "comments",
            selector: "//div[@id='comment-pages']/span",
            repeated: true,
            children: [
                {
                    // Text of the span, used as the {page} placeholder below.
                    name: "page",
                    selector: "//text()"
                },
                {
                    // Comments read from a separate JSON URL built from the
                    // product id and the page number of this entry.
                    name: "comments",
                    sourceType: SourceType.AttachedUrl,
                    attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.comments",
                    repeated: true,
                    children: [
                        {
                            name: "com_content",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.content"
                        },
                        {
                            name: "com_nickname",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.nickname"
                        }
                    ]
                }
            ]
        }
    ]
};
/**
 * Hook invoked with a downloaded page before extraction.
 *
 * For product pages (item.jd.com/<id>.html) it requests the first comment
 * page to read the total comment count, then injects one <span> per comment
 * page (capped by page_count) inside a <div id="comment-pages"> so that the
 * "comments" field selector (//div[@id='comment-pages']/span) has one node per
 * page to iterate over. Other pages are returned untouched.
 *
 * @param {Object} page - Downloaded page; `page.url` is read and `page.raw` is rewritten.
 * @param {Object} site - Provides `requestUrl(url)`, used to fetch the comment summary.
 * @returns {Object} The same page object, possibly with modified `raw`.
 */
configs.afterDownloadPage = function(page, site) {
    var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url);
    if (!matches) return page;

    var commentUrl = "http://club.jd.com/productpage/p-" + matches[1] + "-s-0-t-3-p-0.html";
    var result = site.requestUrl(commentUrl);
    // A malformed response still throws here, as before, so the failure stays visible.
    var data = JSON.parse(result);
    // Without a summary there is no comment count to size the page list from.
    if (!data || !data.productCommentSummary) return page;

    var commentCount = data.productCommentSummary.commentCount;
    // Same 10-comments-per-page assumption as page_count.
    var pages = commentCount / 10;
    if (pages > page_count) pages = page_count;

    // Wrapper id and <span> children must match the "comments" field selector.
    var pageHtml = "<div id=\"comment-pages\">";
    for (var i = 0; i < pages; i++) {
        pageHtml += "<span>" + i + "</span>";
    }
    pageHtml += "</div>";

    // NOTE(review): the insertion anchor was lost from the original text;
    // "</body>" is assumed here — confirm. When the anchor is absent the
    // markup is appended at the end instead of being spliced at index -1.
    var index = page.raw.indexOf("</body>");
    if (index === -1) index = page.raw.length;
    page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index);
    return page;
};
/**
 * Hook invoked for each search result (helper) page.
 *
 * While the page does not contain the "no results" message, the next result
 * page is queued by advancing the `page` query parameter by 2.
 *
 * @param {string} url - URL of the current search result page.
 * @param {string} content - Downloaded content of that page.
 * @param {Object} site - Provides `addUrl(url)` for queueing the next page.
 * @returns {boolean} Always true.
 */
configs.onProcessHelperUrl = function(url, content, site) {
    // indexOf returns -1 when the phrase is absent; negating the index (as the
    // previous code did) was only true when the phrase sat at position 0.
    // NOTE(review): the phrase uses an ASCII comma here; verify it matches the
    // punctuation the site actually emits.
    if (content.indexOf("抱歉,没有找到") === -1) {
        var pageMatch = /&page=(\d+)/.exec(url);
        // No numeric page parameter: nothing sensible to advance, so queue nothing.
        if (pageMatch) {
            var currentPage = parseInt(pageMatch[1], 10);
            if (currentPage === 0) {
                currentPage = 1;
            }
            var nextPage = currentPage + 2;
            // Replace the token exactly as it appears in the URL so that the
            // page=0 case is rewritten too.
            var nextUrl = url.replace("&page=" + pageMatch[1], "&page=" + nextPage);
            site.addUrl(nextUrl);
        }
    }
    return true;
};
/**
 * Hook invoked with the extracted data of a page.
 *
 * `data.comments` is extracted as a list of per-page entries, each holding its
 * own `comments` list; this flattens them into a single list of comments.
 *
 * @param {Object} page - The page the data was extracted from (unused).
 * @param {Object} data - Extracted fields; `data.comments` is replaced in place.
 * @returns {Object} The same data object.
 */
configs.afterExtractPage = function(page, data) {
    if (data.comments === null || data.comments === undefined) return data;
    var comments = [];
    for (var i = 0; i < data.comments.length; i++) {
        var p = data.comments[i];
        // An entry may carry no comment list (e.g. nothing was extracted for
        // that page); skip it instead of failing the whole record.
        if (!p || !p.comments) continue;
        for (var j = 0; j < p.comments.length; j++) {
            comments.push(p.comments[j]);
        }
    }
    data.comments = comments;
    return data;
};
// Create the crawler from `configs` and start it.
var crawler = new Crawler(configs);
crawler.start();