使用node.js来抓取网页页面

使用node.js 抓取网页页面
安装node环境
  首先使用npm 安装requests、iconv-lite、cheerio、async;

注:
  request.js:进行网络请求(https://github.com/request/request);
  iconv-lite.js:进行转码。
  cheerio:cheerio是nodejs的抓取页面模块,为服务器特别定制的,快速、灵活、精简的jQuery核心实现。适合各种Web爬虫程序。
  async:处理异步问题

npm install requests --save
npm install iconv-lite --save
npm install cheerio --save

npm install async --save

新建index.js:代码如下

var request = require('requests');
var iconv = require('iconv-lite'); //转码
var cheerio = require('cheerio'); //快速、灵活、实施的jQuery核心实现
var fs = require("fs"); // node自带的模块
var async = require("async"); // 解决异步问题

// Relative paths of the pages to scrape. Each entry is passed to requestData,
// which prepends the site's base URL before issuing the request.
var hrefArr = ['categories/2017627968.html',
  'categories/2017627967.html',
  'categories/2017627966.html',
  'categories/2017627965.html',
  'categories/2017627964.html',
  'categories/2017627963.html',
  'categories/2017627962.html',
  'categories/2017627961.html',
  'categories/2017627960.html',
  'categories/2017627977.html',
  'categories/2017627976.html',
  'categories/2017627975.html',
  'categories/2017627974.html',
  'categories/2017627973.html',
  'categories/2017627972.html',
  'categories/2017627971.html',
  'categories/2017627970.html',
  'categories/2017627969.html',
  'categories/2017627978.html',
  'categories/2017627983.html',
  'categories/2017627986.html',
  'categories/2017627985.html',
  'categories/2017627984.html',
  'categories/2017627982.html',
  'categories/2017627981.html',
  'categories/2017627980.html',
  'categories/2017627979.html',
  'categories/2017627994.html',
  'categories/2017627993.html',
  'categories/2017627992.html',
  'categories/2017627991.html',
  'categories/2017627990.html',
  'categories/2017627989.html',
  'categories/2017627988.html',
  'categories/2017627987.html',
  'categories/2017627950.html',
  'categories/20176281003.html',
  'categories/20176281002.html',
  'categories/20176281001.html',
  'categories/20176281000.html',
  'categories/2017628999.html',
  'categories/2017628998.html',
  'categories/2017628997.html',
  'categories/2017628996.html',
  'categories/2017628995.html',
  'categories/20176281012.html',
  'categories/20176281011.html',
  'categories/20176281010.html',
  'categories/20176281009.html',
  'categories/20176281008.html',
  'categories/20176281007.html',
  'categories/20176281006.html',
  'categories/20176281005.html',
  'categories/20176281004.html',
  'categories/20176281022.html',
  'categories/20176281021.html',
  'categories/20176281020.html',
  'categories/20176281019.html',
  'categories/20176281018.html',
  'categories/20176281017.html',
  'categories/20176281016.html',
  'categories/20176281015.html',
  'categories/20176281014.html',
  'categories/20176281031.html',
  'categories/20176281030.html',
  'categories/20176281029.html',
  'categories/20176281028.html',
  'categories/20176281027.html',
  'categories/20176281026.html',
  'categories/20176281025.html',
  'categories/20176281024.html',
  'categories/20176281023.html',
  'categories/2017627959.html',
  'categories/2017627958.html',
  'categories/2017627957.html',
  'categories/2017627956.html',
  'categories/2017627955.html',
  'categories/2017627954.html',
  'categories/2017627952.html',
  'categories/2017627951.html',
  'categories/2017627949.html' ]
var viewInfo = [];
var requestData = function (item) {
  return new Promise(function (resolve, reject) {
    var href = "http://www.yinghexinxi.cn/"+item;
    request.get({url:href,encoding:null},function(err,response,body){
      var buf =  iconv.decode(body, 'gb2312'); // 爬取的网页是‘gb2312’格式
      var $ = cheerio.load(buf);
      var data = [];

      $("[height='22']").each(function(index, element){
        var info = $(element).text().trim();
        var splitInfo = info.split(":");
        data.push(splitInfo[1]);
      })
      var obj = {
        id: href.replace(/[^0-9]/ig,""),
        name: data[0],
        tel: data[1],
        qq: data[2],
        date: data[3],
        area: data[4],
        email: data[5],
        location: data[6],
        ip: data[7],
        phone: data[8],
      }
      viewInfo.push(obj);
      resolve(viewInfo);
    });
  })
};
/**
 * Entry point: scrape every page listed in `hrefArr` one at a time and save
 * the accumulated records to export.json after each successful page.
 *
 * Each write is awaited before the next request starts, so writes to the same
 * file never overlap. If `requestData` rejects for a page, the error is logged
 * and the loop moves on to the next page.
 */
(async function () {
  for (var i = 0; i < hrefArr.length; i++) {
    let result;
    try {
      result = await requestData(hrefArr[i]);
    } catch (err) {
      console.error(err);
      continue;
    }
    console.log(result);

    try {
      // Write the collected data to export.json and wait for it to finish.
      await new Promise(function (resolve, reject) {
        fs.writeFile('export.json', JSON.stringify(result), function (err) {
          if (err) {
            reject(err);
          } else {
            resolve();
          }
        });
      });
      console.log("数据写入成功!");
    } catch (err) {
      console.error(err);
    }
  }
})();

使用 node index.js 启动项目

你可能感兴趣的:(使用node.js来抓取网页页面)