记录使用node写一个简易爬虫

准备工作-使用到的模块
//全局安装自动重启工具nodemon
cnpm install -g  nodemon
cnpm i --save koa koa-router mysql cheerio superagent-charset superagent
----app.js----
// Application entry point: builds the Koa app, mounts the router and
// listens on port 3000.
const Koa = require('koa');
const index = require('./routes/index');

const app = new Koa();

// `app.use` registers one middleware per call, so the route dispatcher and
// the allowed-methods responder are mounted separately; handing both to a
// single call would silently drop the second one.
app.use(index.routes());
app.use(index.allowedMethods());

app.listen(3000);

//路由信息
----/routes/index----
const router = require('koa-router')();
// `escape` from the mysql driver turns a value into a safely quoted SQL literal.
const { escape: sqlEscape } = require('mysql');
const mysql = require('../db/mysql');
const superagent = require('../caiji/superagent');

/**
 * GET /caiji/:page — scrapes one page of the cnodejs.org topic list and
 * stores every topic (title, userName, time) in the `nodeData` table.
 *
 * Always answers with a JSON body: `code: 1` plus the number of inserted
 * rows on success, `code: 0` plus a reason otherwise.
 */
router.get('/caiji/:page', async (ctx) => {
  const { page } = ctx.params;

  // Only plain digits are allowed to reach the upstream URL.
  if (!/^\d+$/.test(page)) {
    ctx.body = {
      code: 0,
      message: '采集失败:页码无效'
    };
    return;
  }

  try {
    const topics = await superagent.get(`https://cnodejs.org/?tab=all&page=${page}`);

    // Without this branch the request would end with no body at all.
    if (!(topics.length > 0)) {
      ctx.body = {
        code: 0,
        message: '采集失败:未采集到数据'
      };
      return;
    }

    let count = 0;
    for (const { title, userName, time } of topics) {
      // Scraped text is untrusted and may contain quotes, so every value is
      // escaped before it is placed into the statement.
      const sql = 'insert into nodeData(title,userName,time) ' +
        `values(${sqlEscape(title)},${sqlEscape(userName)},${sqlEscape(time)})`;
      await mysql.query(sql);
      count += 1;
    }

    ctx.body = {
      code: 1,
      message: `该页采集完成,共采集【${count}】条`
    };
  } catch (error) {
    ctx.body = {
      code: 0,
      message: `采集失败:${error}`
    };
  }
});

// app.js calls `.routes()` / `.allowedMethods()` on whatever this module exports.
module.exports = router;
//"数据库配置"
----/db/config----
module.exports = {
   DATABASE:'test',
        USERNAME:'root',
        PASSWORD:'zhy123456',
        PORT:'3306',
        HOST:'localhost'
}
----/db/mysql----
const mysql = require('mysql');
const config = require('./config');
// Connection pool used by the query helper in this module, configured from ./config.
const pool = mysql.createPool({
    host: config.HOST,
    // The config keeps PORT as a string; convert it and fall back to MySQL's
    // default port when it is missing or not numeric.
    port: Number(config.PORT) || 3306,
    user: config.USERNAME,
    password: config.PASSWORD,
    database: config.DATABASE
});

class Mysql{
    constructor(){

    }
    query(sql){
        console.log(sql)
        return new Promise((resolve,resject)=>{
            pool.query(sql,(err,res,fields)=>{
                if (err) {
                    throw err;
                }
                resolve(res)
            })
        })
    }
}
module.exports = new Mysql()
//采集模块
----/caiji/superagent----
const cheerio = require('cheerio'),
    superagent = require('superagent'),
    charset = require('superagent-charset');
charset(superagent);
module.exports = {
    get(url) {
        return new Promise((resolve, reject) => {
            superagent.get(url)
                .charset('utf-8')
                .end((err, res) => {
                    if (err) {
                        resolve([])
                    }
                    if (res) {
                        let $ = cheerio.load(res.text, {
                            decodeEntities: false
                        })
                        let arr = [];
                        for (let i in $('.cell')) {
                            let title = $('.cell').eq(i).find('.topic_title').eq(0).html(),
                                userName = $('.cell').eq(i).find('.user_avatar').eq(0).find('img').eq(0).attr('title'),
                                time = $('.cell').eq(i).find('.last_active_time').eq(0).html(),
                                views = $('.cell').eq(i).find('.count_of_visits').eq(0).text();
                            if (title) {
                                arr.push({
                                    title: title.trim(),
                                    userName,
                                    time,
                                    views: views.trim()
                                })
                            }
                        }
                        resolve(arr)
                    } else {
                        resolve([])
                    }
                })
        })
    }
}
//启动
nodemon app.js
//浏览器地址
[http://127.0.0.1:3000/caiji/1](http://127.0.0.1:3000/caiji/1)
第一页采集完成
![image.png](https://upload-images.jianshu.io/upload_images/5814981-6d1c2143f10bc9ca.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

你可能感兴趣的:(记录使用node写一个简易爬虫)