使用node.js 抓取网页页面
安装node环境
首先使用 npm 安装 request、iconv-lite、cheerio、async;
注:
request:进行网络请求(文档:https://github.com/request/request)。注意包名是 request 而不是 requests,代码中用到的 request.get(options, callback) 接口由 request 包提供;
iconv-lite:进行转码。
cheerio:在服务器端解析 HTML 的模块,是专为服务器定制的快速、灵活、精简的 jQuery 核心实现,适合各种 Web 爬虫程序。
async:处理异步问题
npm install request --save
npm install iconv-lite --save
npm install cheerio --save
npm install async --save
新建index.js:代码如下
// Third-party and built-in modules used by the scraper.
// NOTE: the HTTP client must be the `request` package (not `requests`):
// requestData() calls request.get(options, callback), which only `request` provides.
var request = require('request'); // HTTP client
var iconv = require('iconv-lite'); // character-set conversion
var cheerio = require('cheerio'); // fast, flexible, lean implementation of core jQuery for the server
var fs = require("fs"); // built-in Node.js file-system module
var async = require("async"); // helpers for asynchronous control flow
// Relative paths of the pages to scrape, in crawl order. Each path has the form
// 'categories/<id>.html' and is appended to the site root by requestData().
var hrefArr = [
  '2017627968', '2017627967', '2017627966', '2017627965', '2017627964',
  '2017627963', '2017627962', '2017627961', '2017627960',
  '2017627977', '2017627976', '2017627975', '2017627974', '2017627973',
  '2017627972', '2017627971', '2017627970', '2017627969',
  '2017627978', '2017627983', '2017627986', '2017627985', '2017627984',
  '2017627982', '2017627981', '2017627980', '2017627979',
  '2017627994', '2017627993', '2017627992', '2017627991', '2017627990',
  '2017627989', '2017627988', '2017627987', '2017627950',
  '20176281003', '20176281002', '20176281001', '20176281000', '2017628999',
  '2017628998', '2017628997', '2017628996', '2017628995',
  '20176281012', '20176281011', '20176281010', '20176281009', '20176281008',
  '20176281007', '20176281006', '20176281005', '20176281004',
  '20176281022', '20176281021', '20176281020', '20176281019', '20176281018',
  '20176281017', '20176281016', '20176281015', '20176281014',
  '20176281031', '20176281030', '20176281029', '20176281028', '20176281027',
  '20176281026', '20176281025', '20176281024', '20176281023',
  '2017627959', '2017627958', '2017627957', '2017627956', '2017627955',
  '2017627954', '2017627952', '2017627951', '2017627949',
].map(function (pageId) {
  return 'categories/' + pageId + '.html';
});

// Accumulator shared with requestData(): one record is pushed per scraped page.
var viewInfo = [];
/**
 * Returns the value part of a "label:value" string.
 *
 * Everything after the FIRST colon is kept, so values that themselves contain
 * colons (for example a time such as 10:20:30) are not truncated.
 *
 * @param {string} text - Trimmed text of one table cell.
 * @returns {string|undefined} The value, or undefined when there is no colon.
 */
function parseFieldValue(text) {
  var colonAt = text.indexOf(':');
  return colonAt === -1 ? undefined : text.slice(colonAt + 1);
}

/**
 * Downloads one page, parses its "label:value" cells into a record and
 * appends that record to the shared `viewInfo` array.
 *
 * @param {string} item - Page path relative to the site root.
 * @returns {Promise<Array<Object>>} Resolves with `viewInfo` (all records
 *   collected so far); rejects when the request fails or the page cannot be
 *   decoded/parsed.
 */
var requestData = function (item) {
  return new Promise(function (resolve, reject) {
    var href = "http://www.yinghexinxi.cn/" + item;
    // encoding: null makes `body` a raw Buffer so it can be decoded manually.
    request.get({ url: href, encoding: null }, function (err, response, body) {
      if (err) {
        // Without this the promise would never settle on a network failure.
        reject(err);
        return;
      }
      try {
        var html = iconv.decode(body, 'gb2312'); // the scraped pages are gb2312-encoded
        var $ = cheerio.load(html);
        var data = [];
        $("[height='22']").each(function (index, element) {
          data.push(parseFieldValue($(element).text().trim()));
        });
        // Cells are mapped to fields purely by their position on the page.
        viewInfo.push({
          id: href.replace(/\D/g, ""), // only the digits of the URL, i.e. the page id
          name: data[0],
          tel: data[1],
          qq: data[2],
          date: data[3],
          area: data[4],
          email: data[5],
          location: data[6],
          ip: data[7],
          phone: data[8],
        });
        resolve(viewInfo);
      } catch (parseErr) {
        // An exception thrown inside this callback would otherwise escape the promise.
        reject(parseErr);
      }
    });
  });
};
/**
 * Entry point: scrapes every page listed in hrefArr one after another and
 * then writes the collected records to export.json.
 *
 * The file is written once, after the loop. Starting a new fs.writeFile on
 * every iteration without waiting for the previous one lets the writes to the
 * same file overlap.
 */
(async function () {
  var result = [];
  for (var i = 0; i < hrefArr.length; i++) {
    try {
      result = await requestData(hrefArr[i]);
      console.log(result);
    } catch (err) {
      // A single failing page is reported and skipped so the rest are still scraped.
      console.error(err);
    }
  }
  // Write the collected data to export.json.
  fs.writeFile('export.json', JSON.stringify(result), function (err) {
    if (err) {
      return console.error(err);
    }
    console.log("数据写入成功!");
  });
})().catch(function (err) {
  // Last-resort handler so an unexpected failure is never a floating rejection.
  console.error(err);
  process.exitCode = 1;
});
使用 node index.js 启动项目