使用 Node.js 与 Promise 实现深度爬虫

代码来源:慕课网

const http = require('http');
const cheerio = require('cheerio');
const Promise = require('bluebird')
// Address of a single course page (course id 348); nothing in the visible
// code reads this binding.
let url = 'http://www.imooc.com/learn/348'
// Prefix to which a course id is appended to form a course page address.
let baseUrl = 'http://www.imooc.com/learn/'
// Ids of the courses whose pages are downloaded.
let videoIds = [348,259,197,134,75]

/**
 * Extracts the course title, learner count and chapter/video outline from
 * the HTML of a course page.
 *
 * @param {string} html - Raw HTML of one course page.
 * @returns {{title: string, number: number, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *   Parsed course data. `number` is NaN when the learner-count element is
 *   missing or does not start with digits; a video `id` is undefined when its
 *   link does not contain `video/`.
 */
function filterChapters(html) {
    let $ = cheerio.load(html)
    let chapters = $('.chapter')
    let title = $('#page_header .path span').text()
    let number = parseInt($($('.info_num_ i')[0]).text().trim(),10)
    let courseData = {
        title:title,
        number:number,
        videos:[]
    };

    chapters.each(function () {
        let chapter = $(this)
        let chapterData = {
            chapterTitle:chapter.find('h3').text(),
            videos:[]
        }
        chapter.find('ul').children('li').each(function () {
            let video = $(this).find('.J-media-item')
            let href = video.attr('href')
            // attr() yields undefined when the list item holds no
            // .J-media-item link; skip it rather than throw on split().
            if (typeof href !== 'string') {
                return
            }
            chapterData.videos.push({
                title: video.text().trim(),
                id: href.split('video/')[1]
            })
        })
        courseData.videos.push(chapterData)
    })
    return courseData;
}

/**
 * Prints one summary line per course (learner count and title), followed by
 * every course's outline: course title, chapter titles and the videos of
 * each chapter.
 *
 * @param {Array<{title: string, number: number, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>}>} coursesData
 *   Parsed courses, in the order they should be printed.
 * @returns {void}
 */
function printCourseInfo(coursesData) {
    coursesData.forEach(function (courseData) {
        console.log(courseData.number + '人学过' + courseData.title
        +'\n')
    })
    // coursesData is an array of courses; the chapters live on each course's
    // own `videos` property.
    coursesData.forEach(function (courseData) {
        console.log('###'+courseData.title+'\n')
        courseData.videos.forEach(function (item) {
            let chapterTitle = item.chapterTitle
            console.log(chapterTitle + '\n')
            item.videos.forEach(function (video) {
                console.log('   【'+video.id+'】' + video.title + '\n')
            })
        })
    })
}

/**
 * Downloads a page with http.get and resolves with its body decoded as UTF-8.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the complete response body;
 *   rejects with the underlying error when the request or the response
 *   stream emits 'error'.
 */
function getPageAsync(url) {
    return new Promise(function (resolve,reject) {
        console.log('正在爬去')
        http.get(url,function (res) {
            // Keep the raw chunks and decode once at the end: converting each
            // chunk to a string on its own corrupts a multi-byte character
            // whose bytes are split across two chunks.
            let chunks = []
            res.on('data',function (data) {
                chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(data))
            })
            res.on('end',function () {
                resolve(Buffer.concat(chunks).toString('utf8'))
            })
            // A failure while the body is streaming must settle the promise too.
            res.on('error',reject)
        }).on('error',function (e) {
            reject(e)
            console.log('获取课程数据出错')
        })
    })
}

// Issue one page request per course id right away and keep the resulting
// promises in the same order as the ids.
let fetchCourseArray = videoIds.map(function (courseId) {
    return getPageAsync(baseUrl + courseId)
})

// Wait for every course page, parse each one, then print the courses ordered
// from the most learners to the fewest.
Promise
    .all(fetchCourseArray)
    .then(function (pages) {
        let coursesData = pages.map(function (html) {
            return filterChapters(html)
        })
        // A comparator has to return a negative, zero or positive number;
        // subtracting gives a descending order by learner count, whereas a
        // boolean result can never be negative.
        coursesData.sort(function (a,b) {
            return b.number - a.number
        })
        printCourseInfo(coursesData)
    })
    .catch(function (err) {
        // Report a failed download or parse instead of leaving the rejection
        // unhandled.
        console.error('Failed to crawl course data:', err)
    })

你可能感兴趣的:(node)