已知某个网址 http://www.***.com，抓取该站点新闻列表中的所有新闻。

app.js文件:

const fs = require('fs');
const request = require('superagent');
const cheerio = require('cheerio');
const mapLimit = require('async/mapLimit');
// Base URL of the site section being crawled; page paths are appended to it to build request URLs.
const url = 'http://www.hiynn.com/hy-zh';
// Number of article pages processed so far; stamped on each record and shown in the progress log.
let counter = 0

// const fetch = require('whatwg-fetch');

// Matches every <img ...> or <img ... /> tag in an HTML string (g: all matches, i: case-insensitive).
const IMG_TAG_REG = /<img.*?(?:>|\/>)/gi;
// Captures the value of a src attribute, with or without surrounding quotes.
const IMG_SRC_REG = /src=['"]?([^'"]*)['"]?/i;
// Captures the inner text of the first <h3>...</h3> pair.
const TITLE_REG = /<h3[^>]*>(.+?)<\/h3>/;
// Image path stored on a record when the article body contains no usable <img src>.
const DEFAULT_IMG = './images/news161222/3.png';
// Values the randomly assigned `type` field of a record may take.
const NEWS_TYPES = [1, 2];
// Maximum number of article pages requested concurrently.
const MAX_CONCURRENCY = 10;
// Pause (ms) before an iteration reports completion, which throttles the request rate.
const THROTTLE_MS = 1000;

/**
 * Downloads a page and resolves with its body text.
 *
 * @param {string} pageUrl - Absolute URL to request.
 * @returns {Promise<string>} Resolves with the response text; rejects with the request error.
 */
function fetchHtml(pageUrl) {
    return new Promise((resolve, reject) => {
        request.get(pageUrl).end((err, res) => {
            if (err) {
                // Reject instead of throwing: a throw inside this callback would
                // escape the promise chain and crash the process.
                reject(err);
                return;
            }
            resolve(res.text);
        });
    });
}

/**
 * Collects the href of every `h3 > a` anchor on the news list page.
 *
 * @param {string} html - Markup of the list page.
 * @returns {string[]} Article links, excluding missing hrefs and hrefs containing "#".
 */
function extractLinks(html) {
    const $ = cheerio.load(html);
    const links = [];

    $('h3 > a').each((index, item) => {
        const href = $(item).attr('href');

        // Check for a missing href first; calling a string method on undefined would throw.
        if (!href || href.includes('#')) {
            return;
        }
        links.push(href);
    });

    return links;
}

/**
 * Derives the creation date from the six digits at positions 5-10 of the link.
 * NOTE(review): assumes those characters are YYMMDD - confirm against real link formats.
 *
 * @param {string} link - Article link taken from the list page.
 * @returns {string} Date formatted as 20YY-MM-DD.
 */
function parseCreateTime(link) {
    const digits = link.slice(5, 11);
    return `20${digits.slice(0, 2)}-${digits.slice(2, 4)}-${digits.slice(4, 6)}`;
}

/**
 * Picks the image path for a record: the src of the last <img> tag that has one,
 * or DEFAULT_IMG when the markup has no such tag.
 *
 * @param {string} html - Article body markup.
 * @returns {string} Image source path.
 */
function pickImage(html) {
    const tags = html.match(IMG_TAG_REG) || [];

    return tags.reduce((src, tag) => {
        const found = tag.match(IMG_SRC_REG);
        // Tags without a src attribute keep the previously selected value.
        return found ? found[1] : src;
    }, DEFAULT_IMG);
}

/**
 * Builds one news record from an article page.
 *
 * @param {string} link - Article link taken from the list page.
 * @param {string} html - Markup of the article page.
 * @param {number} index - Value stored in the record's `counter` field.
 * @returns {Object} The news record.
 * @throws {Error} When `#tab1` is missing or contains no <h3> title.
 */
function buildRecord(link, html, index) {
    const $ = cheerio.load(html, {
        xmlMode: true,
        decodeEntities: false,
        normalizeWhitespace: true,
        withDomLvl1: false
    });
    const contentHtml = $('#tab1').html();
    const titleMatch = contentHtml && contentHtml.match(TITLE_REG);

    if (!titleMatch) {
        throw new Error(`No <h3> title found inside #tab1 for link "${link}"`);
    }

    return {
        counter: index,
        author: '小明',
        content: '',
        creat_time: parseCreateTime(link),
        link: link,
        content_html: contentHtml,
        deleted_flag: 0,
        important: 2,
        title: titleMatch[1].trim(),
        img: pickImage(contentHtml),
        // Index with floor so the result is always an element of NEWS_TYPES.
        type: NEWS_TYPES[Math.floor(Math.random() * NEWS_TYPES.length)]
    };
}

/**
 * Crawls the news list page, fetches every linked article (at most
 * MAX_CONCURRENCY at a time) and writes all records as JSON to ./links.js.
 *
 * @returns {Promise<Object[]>} Resolves with the written records once the file
 *     has been saved; rejects on any request, parsing or write error.
 */
function getNews() {
    return fetchHtml(`${url}/news.html`).then(html => {
        const links = extractLinks(html);

        return new Promise((resolve, reject) => {
            /**
             * mapLimit(collection, limit, iteratee, callback)
             *   collection - items to iterate over
             *   limit      - maximum number of concurrent iterations
             *   iteratee   - (item, cb); cb must be called or iteration never completes
             *   callback   - (err, results), invoked after all iterations finish or one fails
             */
            mapLimit(links, MAX_CONCURRENCY, (link, cb) => {
                fetchHtml(`${url}/${link}`)
                    .then(articleHtml => {
                        const record = buildRecord(link, articleHtml, counter);

                        counter++;
                        console.log(`${counter}/${links.length}`);

                        // Delay completion to limit how many requests are sent per second.
                        setTimeout(() => cb(null, record), THROTTLE_MS);
                    })
                    .catch(cb);
            }, (err, news) => {
                if (err) {
                    reject(err);
                    return;
                }

                fs.writeFile('./links.js', JSON.stringify(news), 'utf8', writeErr => {
                    if (writeErr) {
                        reject(writeErr);
                        return;
                    }
                    console.log('写入成功');
                    resolve(news);
                });
            });
        });
    });
}

getNews().catch(err => {
    console.error(err);
    process.exitCode = 1;
});

package.json

{
  "name": "Crawler",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node app.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "async": "^2.6.0",
    "cheerio": "^1.0.0-rc.2",
    "superagent": "^3.8.1",
    "whatwg-fetch": "^2.0.3"
  }
}

运行结束后会在当前目录生成一个 links.js 文件，文件内容是一个 JSON 数组，包含抓取到的所有新闻。

你可能感兴趣的:(javascript)