app.js文件:
const fs = require('fs');
const request = require('superagent');
const cheerio = require('cheerio');
const mapLimit = require('async/mapLimit');
// Base URL of the crawled site; getNews() appends `/news.html` and each article link to it.
const url = 'http://www.hiynn.com/hy-zh';
// Running count of articles processed so far: stored in each news record, then incremented and
// printed as crawl progress by getNews().
let counter = 0
// const fetch = require('whatwg-fetch');
// Image used when an article body has no <img> tag with a usable src attribute.
const DEFAULT_NEWS_IMAGE = './images/news161222/3.png';
// Matches every <img ...> tag (g = return all matches, i = case-insensitive).
const IMG_TAG_REGEX = /<img\b[^>]*>/gi;
// Captures the value of a src attribute, quoted or not.
const IMG_SRC_REGEX = /src=['"]?([^'"]*)['"]?/i;
// Captures the inner markup of the first <h3> element.
const TITLE_REGEX = /<h3\b[^>]*>([\s\S]*?)<\/h3>/i;
// Values the randomly assigned `type` field may take.
const NEWS_TYPES = [1, 2];

/**
 * Build the creation date string from an article link.
 *
 * Characters 5..10 of the link are split into three two-character groups and
 * prefixed with "20" (e.g. 'news/161222.html' -> '2016-12-22').
 * NOTE(review): assumes every link embeds a YYMMDD stamp at that offset - confirm against the site.
 *
 * @param {string} link - Relative article link taken from the news index page.
 * @returns {string} Date string in the form `20YY-MM-DD`.
 */
function parseCreateTime(link) {
  const stamp = link.slice(5, 11);
  return `20${stamp.slice(0, 2)}-${stamp.slice(2, 4)}-${stamp.slice(4, 6)}`;
}

/**
 * Extract the article title, i.e. the trimmed content of the first <h3>.
 *
 * @param {string} html - Markup of the article container.
 * @returns {string} The title, or an empty string when there is no <h3>.
 */
function extractTitle(html) {
  const found = html.match(TITLE_REGEX);
  return found ? found[1].trim() : '';
}

/**
 * Pick the image path stored with a news record.
 *
 * Keeps the src of the last <img> tag that has one (the same image the
 * original loop ended up with), and falls back to a default picture when no
 * tag carries a src, so a src-less <img> can no longer crash the crawl.
 *
 * @param {string} html - Markup of the article container.
 * @returns {string} Image path.
 */
function extractImageSrc(html) {
  const tags = html.match(IMG_TAG_REGEX) || [];
  let src = DEFAULT_NEWS_IMAGE;
  for (const tag of tags) {
    const found = tag.match(IMG_SRC_REGEX);
    if (found) {
      src = found[1];
    }
  }
  return src;
}

/**
 * Assemble one news record from an article page.
 *
 * @param {string} link - Relative article link.
 * @param {?string} tabHtml - Inner HTML of the page's `#tab1` element.
 * @param {number} index - Sequence number stored in the record's `counter` field.
 * @returns {Object} The news record.
 * @throws {Error} When the page has no `#tab1` content.
 */
function buildNewsItem(link, tabHtml, index) {
  if (tabHtml == null) {
    throw new Error(`No #tab1 content found for ${link}`);
  }
  return {
    counter: index,
    author: '小明',
    content: '',
    // Key spelling ("creat_time") is kept because consumers of links.js read it.
    creat_time: parseCreateTime(link),
    link,
    content_html: tabHtml,
    deleted_flag: 0,
    important: 2,
    title: extractTitle(tabHtml),
    img: extractImageSrc(tabHtml),
    // Index with Math.floor so the value is always one of NEWS_TYPES (Math.ceil could yield 0).
    type: NEWS_TYPES[Math.floor(Math.random() * NEWS_TYPES.length)],
  };
}

/**
 * Crawl the news index page, fetch every linked article (at most 10 requests
 * in flight) and write all news records to ./links.js as a JSON array.
 *
 * @returns {Promise<Object[]>} Resolves with the news records once the file
 *   has been written; rejects on any request, parse or write error.
 */
function getNews() {
  return new Promise((resolve, reject) => {
    request.get(`${url}/news.html`).end((err, res) => {
      if (err) {
        // Throwing inside this callback would bypass the promise and crash the process.
        reject(err);
        return;
      }
      const $ = cheerio.load(res.text);
      const links = [];
      $('h3 > a').each((index, item) => {
        const href = $(item).attr('href');
        // Skip anchors without an href and in-page placeholders such as href="#".
        if (!href || href.includes('#')) {
          return;
        }
        links.push(href);
      });
      resolve(links);
    });
  }).then((links) => new Promise((resolve, reject) => {
    /**
     * mapLimit(collection, limit, iteratee, callback)
     *  - collection: the items to iterate over
     *  - limit: maximum number of concurrent requests
     *  - iteratee(item, cb): handles one item and must call cb(err, result)
     *  - callback(err, results): runs once every item is done or one has failed
     */
    mapLimit(links, 10, (link, cb) => {
      request.get(`${url}/${link}`).end((err, res) => {
        if (err) {
          cb(err);
          return;
        }
        let item;
        try {
          const $ = cheerio.load(res.text, {
            xmlMode: true,
            decodeEntities: false,
            normalizeWhitespace: true,
            withDomLvl1: false,
          });
          item = buildNewsItem(link, $('#tab1').html(), counter);
        } catch (parseErr) {
          cb(parseErr);
          return;
        }
        counter++;
        console.log(`${counter}/${links.length}`);
        // Hold the concurrency slot for a second to throttle the request rate.
        setTimeout(() => {
          // The callback must be invoked, otherwise mapLimit never continues.
          cb(null, item);
        }, 1000);
      });
    }, (err, news) => {
      if (err) {
        reject(err);
        return;
      }
      // Persist the collected records.
      fs.writeFile('./links.js', JSON.stringify(news), 'utf8', (writeErr) => {
        if (writeErr) {
          reject(writeErr);
          return;
        }
        console.log('写入成功');
        resolve(news);
      });
    });
  }));
}
// Start the crawl; report a failure and exit non-zero instead of leaving the rejection unhandled.
getNews().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
package.json
{
"name": "Crawler",
"version": "1.0.0",
"description": "",
"main": "app.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "node app.js"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"async": "^2.6.0",
"cheerio": "^1.0.0-rc.2",
"superagent": "^3.8.1",
"whatwg-fetch": "^2.0.3"
}
}
最后生成一个links.js文件,文件内容是一个数组,包括所有新闻。