// 京东商品信息和评价采集爬虫源码

/*使用javascript编写的爬虫源码,用于爬取京东商品信息和评价。

代码粘贴到神箭手云爬虫平台(http://www.shenjianshou.cn/)上就可以直接跑了,

不需要安装编译环境。要爬取其他网站,可以更改源码即可。

代码执行具体步骤点这里

更多源码下载点这里

*/


// Search keyword: the products JD returns for this query are the ones crawled.
// The trailing @input(...) annotations are kept verbatim; presumably the
// hosting platform parses them to expose the variable as a user setting.
var keyword = "d3.js";//@input(keyword,查询关键字,爬取该关键字搜索出来的京东商品)

// Upper bound on the number of reviews collected.
var comment_count = 100;//@input(comment_count,爬取的评论数,最多爬取多少条评论)

// Number of review pages to request (assumes 10 reviews per page).
var page_count = comment_count / 10;

keyword = keyword.trim();

// Seed URL: first search-result page, with spaces in the keyword sent as "+".
var scanUrls = [];
scanUrls.push(
    "http://search.jd.com/Search?keyword=" +
    keyword.replace(/ /g, "+") +
    "&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=1&s=1&click=0"
);

// Pattern (as a regex source string) matching every search-result page for
// this keyword. All regex metacharacters in the keyword are escaped first so
// the keyword is matched literally; spaces then become an escaped "+" to
// mirror the encoding used in the seed URL.
var helperUrlRegexes = [];
helperUrlRegexes.push(
    "http://search\\.jd\\.com/Search\\?keyword=" +
    keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/ /g, "\\+") +
    "&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=\\d+&s=1&click=0"
);

// Crawler configuration object handed to `new Crawler(configs)`.
// NOTE(review): the meaning of each key is defined by the hosting crawler
// platform, not by this file; the comments below describe only what the
// values contain.
var configs = {
    // Hosts involved in the crawl: search results, product pages, review API.
    domains: ["search.jd.com", "item.jd.com", "club.jd.com"],
    // Seed URLs (the first search-result page for the keyword).
    scanUrls: scanUrls,
    // Product detail pages. The dot before "html" is escaped so it matches a
    // literal "." rather than any character.
    contentUrlRegexes: ["http://item\\.jd\\.com/\\d+\\.html"],
    // Search-result (listing) pages for the keyword.
    helperUrlRegexes: helperUrlRegexes,
    fields: [
        {
            // First extracted field: product title.
            name: "title",
            selector: "//div[@id='name']/h1",
            required: true
        },
        {
            // Second extracted field: product id.
            name: "productid",
            selector: "//div[contains(@class,'fl')]/span[2]",
            required: true
        },
        {
            // Third extracted field: one entry per <span> under the
            // #comment-pages element, each carrying a review page number.
            name: "comments",
            selector: "//div[@id='comment-pages']/span",
            repeated: true,
            children: [
                {
                    // Text of the span, used as {page} in attachedUrl below.
                    name: "page",
                    selector: "//text()"
                },
                {
                    // Reviews for that page, read from the JSON at attachedUrl.
                    name: "comments",
                    sourceType: SourceType.AttachedUrl,
                    attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.comments",
                    repeated: true,
                    children: [
                        {
                            // Review text.
                            name: "com_content",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.content"
                        },
                        {
                            // Reviewer nickname.
                            name: "com_nickname",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.nickname"
                        }
                    ]
                }
            ]
        }
    ]
};

/**
 * Post-download hook: for product pages, injects one marker element per
 * review page into the raw HTML so the "comments" field selector
 * (//div[@id='comment-pages']/span) has something to iterate over.
 *
 * @param {Object} page - Downloaded page; `page.url` is read and `page.raw`
 *     (the HTML text) is rewritten in place.
 * @param {Object} site - Object whose `requestUrl(url)` returns the response
 *     body for `url` as a string.
 * @returns {Object} The same `page` object (unchanged for non-product URLs or
 *     when the review summary is missing).
 */
configs.afterDownloadPage = function(page, site) {
    var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url);
    if (!matches) return page;

    // Page 0 of the review API is fetched only to learn the total review count.
    var commentUrl = "http://club.jd.com/productpage/p-" + matches[1] + "-s-0-t-3-p-0.html";
    var result = site.requestUrl(commentUrl);
    var data = JSON.parse(result);
    var summary = data && data.productCommentSummary;
    if (!summary) return page;

    // 10 reviews per page, capped at the configured page_count. A fractional
    // value still yields the final partial page because the loop uses `<`.
    var pages = summary.commentCount / 10;
    if (pages > page_count) pages = page_count;

    var pageHtml = "<div id=\"comment-pages\">";
    for (var i = 0; i < pages; i++) {
        pageHtml += "<span>" + i + "</span>";
    }
    pageHtml += "</div>";

    // Insert just before the closing body tag; if the document has none,
    // append at the end instead of splicing at a negative index.
    var index = page.raw.indexOf("</body>");
    if (index === -1) index = page.raw.length;
    page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index);
    return page;
};

/**
 * Listing-page hook: unless the search page reports "no results", queues the
 * next search-result page.
 *
 * @param {string} url - URL of the search-result page just processed.
 * @param {string} content - Body of that page.
 * @param {Object} site - Object whose `addUrl(url)` queues a URL for crawling.
 * @returns {boolean} Always true.
 *     NOTE(review): the meaning of the return value is defined by the
 *     hosting platform and is not visible in this file.
 */
configs.onProcessHelperUrl = function(url, content, site) {
    // Stop paginating once the "sorry, nothing found" banner appears anywhere
    // in the page. Both the ASCII and the full-width comma are accepted.
    if (/抱歉[,，]没有找到/.test(content)) return true;

    var pageMatch = /&page=(\d+)/.exec(url);
    if (!pageMatch) return true;

    var currentPage = parseInt(pageMatch[1], 10);
    if (currentPage === 0) {
        currentPage = 1;
    }
    // Page numbers advance in steps of 2.
    // NOTE(review): presumably JD numbers result pages 1, 3, 5, ... — confirm.
    var page = currentPage + 2;
    // Replace whatever page number the URL actually carries, so a "page=0"
    // URL is rewritten too.
    var nextUrl = url.replace(/&page=\d+/, "&page=" + page);
    site.addUrl(nextUrl);
    return true;
};

/**
 * Post-extraction hook: flattens the per-page review lists into one list.
 *
 * Before: data.comments = [{ page, comments: [review, ...] }, ...]
 * After:  data.comments = [review, ...]
 *
 * @param {Object} page - The page the data was extracted from (unused here).
 * @param {Object} data - Extracted record; `data.comments` is replaced in place.
 * @returns {Object} The same `data` object.
 */
configs.afterExtractPage = function(page, data) {
    if (data.comments === null || data.comments === undefined) return data;

    var comments = [];
    for (var i = 0; i < data.comments.length; i++) {
        var p = data.comments[i];
        // A page entry may have no nested review list (e.g. nothing was
        // extracted for it); skip it rather than failing the whole record.
        if (!p || !p.comments) continue;
        for (var j = 0; j < p.comments.length; j++) {
            comments.push(p.comments[j]);
        }
    }
    data.comments = comments;
    return data;
};

// Build the crawler from the `configs` object and start the crawl.
// NOTE(review): `Crawler` is not defined in this file — presumably a global
// supplied by the hosting crawler platform; confirm.
var crawler = new Crawler(configs);

crawler.start();

// 你可能感兴趣的:(京东商品信息和评价采集爬虫源码)