利用 Node.js 和 Nightmare 爬取携程网的酒店评论

/**
 * 作者 陈长裕 2018年8月14日
 * 爬取携程酒店的评论,保存到数据库
 * @type {request}
 */
// Server-side HTML parsing for the scraped comment markup.
const cheerio = require('cheerio');
// ORM used for the MySQL connection.
const Sequelize = require('sequelize');
// Electron-based browser automation framework.
const Nightmare = require('nightmare');

// Column data types used when defining the model.
const {DataTypes} = Sequelize;

// Options for the single browser instance shared by the whole script:
// a visible 1920x1080 window with 10000 ms wait/goto timeouts.
const browserOptions = {
    show: true,
    waitTimeout: 10000,
    gotoTimeout: 10000,
    width: 1920,
    height: 1080,
};
const nightmare = Nightmare(browserOptions);

// Prerequisite: a local MySQL database named `rujia` must already exist.
// Connection-pool settings handed to Sequelize unchanged.
const connectionPool = {
    min: 0,
    max: 5,
    idle: 10000,
    acquire: 30000,
};

// Connection to the local `rujia` database (user `root`, password `root`).
const sequelize = new Sequelize('rujia', 'root', 'root', {
    dialect: 'mysql',
    host: 'localhost',
    pool: connectionPool,
    operatorsAliases: false,
});
// Columns of the `comments` model.
const commentColumns = {
    // Auto-incrementing primary key of the row itself.
    cid: {type: DataTypes.INTEGER, primaryKey: true, autoIncrement: true},
    // The crawler fills this from the comment block's `data-cid` attribute.
    id: DataTypes.INTEGER,
    score: DataTypes.STRING,
    content: DataTypes.STRING(3000),
    imageCount: DataTypes.TINYINT(1),
    isMobile: DataTypes.BOOLEAN,
    hotelName: DataTypes.STRING,
    hotelId: DataTypes.STRING,
    pageUrl: DataTypes.STRING,
    baseRoomName: DataTypes.STRING,
    date: DataTypes.STRING,
    type: DataTypes.STRING,
    userLevel: DataTypes.STRING,
};

// Use snake_case names for the two timestamp columns.
const commentModelOptions = {
    createdAt: 'created_at',
    updatedAt: 'updated_at',
};

// Model through which scraped comments are persisted.
const CommentTable = sequelize.define('comments', commentColumns, commentModelOptions);

// 下面这段如果不注释会自动生成表,可以放在初始化构造函数的前面await 执行
// sequelize.sync()

// Pause between requests; the delay is meant to keep the crawler's IP from being banned.
// Returns a promise that resolves (with no value) after roughly `ms` milliseconds.
const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});

/**
 * Crawls hotel comments starting from a Ctrip search-result page and stores
 * every comment through CommentTable.
 *
 * All browser work goes through the module-level `nightmare` instance. The
 * methods call each other in a cycle:
 * getMainPage() -> nextPage() (once per comment page) -> nextHotel() -> getMainPage() ...
 */
class Main {
    /**
     * @param {string} mainPageUrl URL of the search-result page to crawl.
     */
    constructor(mainPageUrl) {
        this.mainPageUrl = mainPageUrl;
        // Position of the hotel currently being crawled within the result list.
        this.currentHotelIndex = 0;
        // nextHotel() ends the crawl once currentHotelIndex becomes greater than this.
        this.hotelCount = 24;
        this.currentUrl = 0;
        this.pageUrl = "";
        this.hotelId = '';
        this.hotelName = "";
    }

    /**
     * Opens the search-result page, enters the hotel at `currentHotelIndex`,
     * records its URL, name and id, switches to the comment tab and starts
     * paging through the comments.
     *
     * @returns {Promise<void>}
     * @throws {Error} If no hotel id can be extracted from the detail-page URL.
     */
    async getMainPage() {
        await nightmare.goto(this.mainPageUrl).inject('js', 'jquery.min.js').wait('.hotel_new_list');
        // Runs inside the page: remove the list entries that precede the current
        // hotel so that the first remaining link is the one to click. `target`
        // is stripped from the links, presumably so the click navigates in the
        // same window instead of opening a new one.
        await nightmare.evaluate((currentHotelIndex) => {
            let $hotel = $('.hotel_new_list .hotel_item_name a');
            $hotel.removeAttr('target');
            $('.hotel_new_list').each(function (index, item) {
                if (index < currentHotelIndex) {
                    $(item).remove();
                }
            });
        }, this.currentHotelIndex);
        await nightmare.click('.hotel_new_list .hotel_item_name a').wait('.hotel_tabs ');
        const hotelInfo = await nightmare.evaluate(() => {
            return {
                currentUrl: location.href,
                hotelName: $('#J_htl_info .name .cn_n').html()
            };
        });
        this.pageUrl = hotelInfo.currentUrl;
        this.hotelName = hotelInfo.hotelName;
        // The hotel id is the numeric file name of the detail page (".../<id>.html").
        const idMatch = this.pageUrl.match(/\/(\d{0,})\.html/);
        if (idMatch === null) {
            throw new Error(`Cannot extract hotel id from URL: ${this.pageUrl}`);
        }
        this.hotelId = idMatch[1];
        await nightmare.click('#commentTab a').wait('.comment_detail_list');
        await this.nextPage();
    }

    /**
     * Advances to the next hotel, or ends the browser session once
     * `currentHotelIndex` has passed `hotelCount`.
     *
     * @returns {Promise<void>}
     */
    async nextHotel() {
        this.currentHotelIndex++;
        if (this.currentHotelIndex > this.hotelCount) {
            await nightmare.end();
            console.log('全部完成。。。');
        } else {
            await this.getMainPage();
        }
    }

    /**
     * Saves the comments of the currently displayed comment page, then either
     * clicks through to the next comment page or moves on to the next hotel.
     *
     * @returns {Promise<void>}
     */
    async nextPage() {
        // Throttle requests; the pause is meant to keep the IP from being banned.
        await sleep(2000);
        const pageData = await nightmare.evaluate(function () {
            let currentPage = +$('.c_page_list .current').text();
            let allPage = +$('.c_page_box .c_page_list a').last().text();
            return {currentPage, allPage};
        });
        this.currentPage = pageData.currentPage;
        this.allPage = pageData.allPage;
        const commentListData = await nightmare.evaluate(() => {
            return $('#commentList').html();
        });
        const isContinue = await this.saveData(commentListData);
        // Debug aid: replace the condition with `this.currentPage < 2` to crawl
        // only the first two comment pages of each hotel.
        if ((this.currentPage < this.allPage) && isContinue) {
            await nightmare.click('.c_down').wait('.comment_detail_list');
            await this.nextPage();
        } else {
            await this.nextHotel();
        }
    }

    /**
     * Parses the HTML of the comment list and stores every scored comment.
     *
     * Rows are written one after another and the returned promise resolves
     * only after every write has finished. A failed write is logged and does
     * not stop the remaining rows.
     *
     * @param {string} commentListData Inner HTML of the `#commentList` element.
     * @returns {Promise<boolean>} `false` if at least one `.comment_block` has no
     *     `.score` element (the caller then stops paging this hotel), else `true`.
     */
    async saveData(commentListData) {
        const $ = cheerio.load(commentListData, {decodeEntities: false});
        let isContinue = true;
        // A plain loop is required here: an async callback handed to `.each`
        // would not be awaited, so this method would return before any row
        // had actually been written.
        for (const item of $('.comment_block').toArray()) {
            const $item = $(item);
            if ($item.find('.score').length === 0) {
                isContinue = false;
                continue;
            }
            const saveObj = {
                id: $item.attr('data-cid'),
                score: $item.find('.score .n').text().trim(),
                content: $item.find('.J_commentDetail').text().trim(),
                imageCount: $item.find('.comment_pic .pic').length,
                isMobile: $item.find('.comment_bar .phone').length > 0,
                hotelName: this.hotelName,
                hotelId: this.hotelId,
                pageUrl: this.pageUrl,
                baseRoomName: $item.find('.J_baseroom_link').attr('data-baseroomname'),
                // Strip the "发表于" prefix so only the date text remains.
                date: ($item.find('.comment_bar .time').html() || "").replace('发表于', ''),
                type: $item.find('.type').text().trim(),
                // The CSS class of the <p> following the user name is stored as the user level.
                userLevel: $item.find('.name').next('p').attr('class'),
            };
            try {
                await CommentTable.create(saveObj);
            } catch (e) {
                console.error(e);
            }
        }

        return isContinue;
    }
}

// Pass the URL of the search-result page to the constructor.
const main = new Main(`http://hotels.ctrip.com/hotel/qingdao7/h110#ctm_ref=hod_hp_sb_lst`);

// Start the crawl. A failure anywhere in the crawl is reported here instead of
// surfacing as an unhandled promise rejection; the browser is then shut down so
// a failed run does not leave the Electron window open.
main.getMainPage()
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
        return nightmare.end();
    })
    .catch((err) => {
        // Shutting the browser down failed as well; nothing left to clean up.
        console.error(err);
    });

已抽取一万条数据,下载地址:https://download.csdn.net/download/sinat_23076629/10603591

你可能感兴趣的:(爬虫开发)