var http = require('http'); // used by GetData to download each list page
var request = require('request'); // only referenced from commented-out code in the part of the file visible here
var cheerio = require('cheerio'); // HTML parser used by filterHtml
var fs = require('fs'); // used for file operations
// Alternative base address (site root), currently disabled:
//var url = 'http://www.budejie.com/';
// Base address of the list pages; GetData appends the page number to it.
var url="http://www.budejie.com/text/"
// Entries collected across all pages (filterHtml pushes into this array).
let arcList=[];
// Number of entries collected before the current page, used by GetData to detect the end.
let zoomNum=0;
/**
 * Download list pages one after another, starting at `url + num`, and collect
 * their entries until a page contributes nothing new.
 *
 * Each response body is passed to `filterHtml`, which appends the entries it
 * finds to `arcList`. When the list has the same length as before the page was
 * fetched (`zoomNum`), crawling stops; otherwise the next page is requested.
 * Progress and failures are reported on the console only.
 *
 * @param {string} url - Base address; the page number is appended to it.
 * @param {number} num - Page number to request.
 * @param {Array<Object>} arcList - Entries collected so far.
 * @param {number} zoomNum - Length of `arcList` before this page was fetched.
 * @returns {void}
 */
function GetData(url, num, arcList, zoomNum) {
  const pageUrl = url + num;

  // Without an 'error' listener a network failure would surface as an
  // unhandled 'error' event and terminate the process.
  const reportFailure = (err) => {
    console.error(`第${num}页请求失败: ${err.message}`);
  };

  http.get(pageUrl, (res) => {
    // Keep the raw buffers and decode once at the end: appending each chunk
    // to a string decodes it on its own, which corrupts multi-byte UTF-8
    // characters that straddle a chunk boundary.
    const chunks = [];
    res.on('data', (chunk) => {
      chunks.push(chunk);
    });
    res.on('error', reportFailure);
    res.on('end', () => {
      const html = Buffer.concat(chunks).toString('utf8');
      const collected = filterHtml(url, html, arcList);
      console.log(`第${num}页已完成#上次数据${zoomNum}现在数据${collected.length}`);

      // No growth means this page had no entries: treat it as the last page.
      if (zoomNum === collected.length) {
        console.log("爬取结束: 爬去总数据如下");
        return;
      }

      // Number() keeps a numeric-string page working, as `num++` did.
      const nextPage = Number(num) + 1;
      GetData(url, nextPage, collected, collected.length);
    });
  }).on('error', reportFailure);
}
// Start crawling at page 1 with the module-level base address, result list and counter.
GetData(url,1,arcList,zoomNum)
/**
 * Resolve a link found on a page against the address the page was loaded from.
 *
 * Root-relative (`/x.html`), relative (`x.html`) and absolute hrefs are all
 * resolved the way a browser would resolve them.
 *
 * @param {string} base - Address of the page that contained the link.
 * @param {string} href - Value of the link's `href` attribute.
 * @returns {string} The resolved address.
 */
function resolveLink(base, href) {
  try {
    return new URL(href, base).href;
  } catch (err) {
    // `base` is not an absolute URL (or the runtime has no global URL):
    // fall back to plain concatenation rather than failing the whole page.
    return base + href;
  }
}

/**
 * Extract the list entries from one page of HTML and append them to `arcList`.
 *
 * Every `li` below `.j-r-list ul` is inspected. Items whose
 * `.j-list-user .u-txt a` link has no text are skipped. For the others an
 * object `{ title, url, entry }` is pushed, where `title` is that link's text,
 * `url` is its `href` resolved against `URL` (or `null` when the link has no
 * `href`), and `entry` is the text of `.j-r-list-c .j-r-list-c-desc a`.
 *
 * @param {string} URL - Address the page was loaded from; base for the links.
 * @param {string} html - Markup of the page.
 * @param {Array<Object>} arcList - Array that receives the entries (mutated).
 * @returns {Array<Object>} The same `arcList` instance.
 */
function filterHtml(URL, html, arcList) {
  const $ = cheerio.load(html);
  const posts = $(".j-r-list").find("ul").find("li");

  posts.each((index, element) => {
    const post = $(element);
    const userLink = post.find(".j-list-user .u-txt a");
    const title = userLink.text();
    if (title === "") {
      return;
    }

    const href = userLink.attr("href");
    arcList.push({
      title,
      // A link without href used to yield "<base>undefined".
      url: href == null ? null : resolveLink(URL, href),
      entry: post.find(".j-r-list-c .j-r-list-c-desc a").text(),
    });
  });

  return arcList;
}