nodejs爬虫(单线程版,后续补充多线程)

写在前面

本爬虫基于 Node.js,使用 request + cheerio + fs 实现,抓取结果会输出到本地文件。目前是单线程串行抓取(逐页请求),所以效率比较低,大约 1 页/秒。
从下面的代码可以看到,爬取的是 51CTO 的查询页面,抓到的数据用于后续的数据监控和分析。
cheerio介绍:https://www.npmjs.com/package/cheerio
request介绍:https://www.npmjs.com/package/request
fs介绍:http://nodejs.cn/api/fs.html

创建 request 工具模块 getHtml.js

const req = require('request');
var cheerio = require('cheerio');

/**
 * Fetch the source of a page with a GET request.
 *
 * The request carries a desktop-browser user agent and a 51CTO referer, and
 * asks `request` to decode the body as UTF-8.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the response body.
 *     Rejects with the transport error reported by `request`, or with an
 *     Error when the server answers with a non-2xx status code.
 */
function getHtml(url){
    return new Promise((resolve,reject)=>{
        const options = {
            url,
            headers : {
                "user-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
                "referer":"https://ost.51cto.com/"
            },
            encoding:"utf-8"
        };

        req.get(options,(err,res,body)=>{
            if(err){
                reject(err);
                return;
            }

            // `request` does not treat HTTP error statuses as errors, so an
            // error page would otherwise be handed to the parser as if it
            // were a real result page.
            const status = res ? res.statusCode : undefined;
            if(typeof status === "number" && (status < 200 || status >= 300)){
                reject(new Error(`Request to ${url} failed with HTTP status ${status}`));
                return;
            }

            resolve(body);
        });
    });
}

/**
 * Look up the total number of result pages on the post list.
 *
 * Downloads the first list page and reads the link text of the last `li`
 * directly under `ul.el-pager`, which is taken to be the highest page number.
 *
 * @returns {Promise<string>} The trimmed text of that link (a string, not a
 *     number; empty when no such link is found).
 */
async function getPageNumber(){
    const listHtml = await getHtml("https://ost.51cto.com/postlist/");
    const $ = cheerio.load(listHtml);
    const lastPagerItem = $('ul.el-pager').children('li').last();
    return lastPagerItem.find('a').text().trim();
}

// Expose the page fetcher and the page-count helper to other modules.
Object.assign(exports, { getHtml, getPageNumber });

创建主文件(入口)index.js

var fs = require("fs");

var cheerio = require('cheerio');

var getHtml = require('./getHtml');

/**
 * Turn the href of a list entry into an absolute URL on the 51CTO site.
 *
 * @param {string|undefined} href - Raw `href` attribute; may be missing.
 * @returns {string} The absolute URL, or "" when there is no usable href.
 */
function resolvePostUrl(href){
    const siteOrigin = "https://ost.51cto.com";
    const trimmed = typeof href === "string" ? href.trim() : "";
    if(trimmed === ""){
        return "";
    }

    try{
        // Resolving against the origin keeps root-relative links working and
        // also handles absolute and protocol-relative hrefs correctly.
        return new URL(trimmed, siteOrigin).href;
    }catch(err){
        // Unparseable href: keep the raw value so the entry is not lost.
        return trimmed;
    }
}

/**
 * Extract the target fields of every entry on a list page.
 *
 * Each `li` directly under `ul.infinite-list` yields one record built from
 * its `h3` text, its `div.content` text and the `href` of its first link.
 *
 * @param {string} htmlStr - HTML source of a list page.
 * @returns {Array<{title: string, content: string, url: string}>} One record
 *     per list item, in document order. `url` is "" when the item has no link.
 */
function getPages(htmlStr){
    const $ = cheerio.load(htmlStr);
    const rst = [];

    $('ul.infinite-list').children('li').each((_,item)=>{
        const node = $(item);

        rst.push({
            title : node.find('h3').text().trim(),
            content : node.find('div.content').text().trim(),
            // attr() returns undefined when the item has no link; the helper
            // copes with that instead of throwing on `.trim()`.
            url : resolvePostUrl(node.find('a').attr('href'))
        });
    });

    return rst;
}

/**
 * Serialize the given data as JSON and write it to `./file.json`.
 *
 * The write is asynchronous; the outcome is only reported on the console
 * (a confirmation message on success, the error object on failure).
 *
 * @param {*} file - Data to serialize with `JSON.stringify`.
 */
function outputJsonFile(file) {
    const payload = JSON.stringify(file);

    const onWritten = (writeErr) => {
        if (writeErr) {
            console.log(writeErr);
            return;
        }
        console.log('数组输入完毕');
    };

    fs.writeFile('./file.json', payload, onWritten);
}

/**
 * Main routine: crawl every list page in order and write the results out.
 *
 * Pages are requested one at a time. The entries of each page are appended
 * as one array, so the output is an array of per-page arrays. A page that
 * fails to load is reported and skipped, so the pages collected so far are
 * still written.
 *
 * @returns {Promise<void>} Resolves once the output has been handed to
 *     `outputJsonFile`. Rejects when the page count cannot be determined.
 */
async function run(){
    const rawPageNumber = await getHtml.getPageNumber();
    // The page count is scraped text; parse it explicitly instead of relying
    // on implicit coercion in the loop condition, and fail loudly when it is
    // not a usable number rather than silently writing an empty result.
    const pageNumber = Number.parseInt(rawPageNumber, 10);
    if(!Number.isInteger(pageNumber) || pageNumber < 1){
        throw new Error(`Could not determine the page count from "${rawPageNumber}"`);
    }

    const pages = [];
    for(let i = 1 ; i <= pageNumber ; i++){
        const url = `https://ost.51cto.com/postlist/p${i}`;
        try{
            const htmlStr = await getHtml.getHtml(url);
            pages.push(getPages(htmlStr));
            console.log(`装载了第${i}页`);
        }catch(err){
            // One bad page must not throw away everything collected so far.
            console.error(`Failed to load page ${i} (${url}):`, err);
        }
    }
    outputJsonFile(pages);
}

// Start the crawl. Report a failure and exit non-zero instead of leaving the
// returned promise's rejection unhandled.
run().catch((err)=>{
    console.error(err);
    process.exitCode = 1;
});

你可能感兴趣的:(爬虫,javascript,前端,node.js)