node爬虫抓取拉勾网数据

初始化:

1.安装了node
2.新建一个文件夹
3.在该文件夹中初始化node应用

npm init 

安装依赖:

  1. 使用express框架
  2. 使用superagent库:

superagent 是一个轻量级、渐进式的请求库,内部依赖 nodejs 原生的请求 api,适用于 nodejs 环境

  3. 使用cheerio库:

cheerio 是 nodejs 的页面解析模块,是为服务器端特别定制的、快速、灵活、精简的 jQuery 核心实现,适合各种 Web 爬虫程序,可以理解为 node.js 版的 jQuery。

npm i express cheerio superagent -D

代码展示:

1.首先引入模块

const express = require("express")
const app = express()
const fs = require("fs")
const superagent = require("superagent")
const cheerio = require("cheerio")

2.声明要爬的网站URL

// Base URL of the listing to crawl; the page number is appended directly after the trailing slash.
const lagouURL = "https://www.lagou.com/guangzhou-zhaopin/webqianduan/";
// Suffix appended after the page number (closing slash plus query string).
// NOTE(review): `sid` looks like a session id copied from a browser visit and may expire — confirm.
const code = "/?filterOption=3&sid=b87c46399fd24f618b97b395f945ab1b";

3.请求数据

 // Request one listing page and append whatever getHotData extracts from it to ListData.
 superagent.get(url).end((error, response) => {
     // On failure only log; the page is skipped and nothing is appended.
     if (error) {
         console.log("获取失败")
         return
     }

     const pageItems = getHotData(response, i)
     ListData = ListData.concat(pageItems)
 })

4.分析数据

/**
 * Extract the job postings from one fetched listing page.
 *
 * @param {object} res - Response object whose `text` property holds the page HTML.
 * @param {number} i - Page number; used only to derive each record's `id`.
 * @returns {object[]} One record per `.con_list_item` element found under `#s_position_list ul`.
 */
let getHotData = (res, i) => {
    const $ = cheerio.load(res.text)
    const records = []

    $("#s_position_list ul>.con_list_item").each((index, element) => {
        const $item = $(element).children()
        // Small readers so that every field below is a single selector lookup.
        const textOf = (selector) => $item.find(selector).text()
        const attrOf = (selector, name) => $item.find(selector).attr(name)

        records.push({
            // Running id across pages; the formula assumes 15 postings per page.
            id: (i - 1) * 15 + index + 1,
            position: textOf(".list_item_top .position .p_top a h3"),
            region: textOf(".list_item_top .position .p_top a .add"),
            title: textOf(".list_item_top .position .p_top a .format-time"),
            link: attrOf(".list_item_top .position .p_top a", "href"),
            money: textOf(".list_item_top .position .p_bot .li_b_l"),
            education: textOf(".list_item_top .position .p_bot .li_b_l span"),
            company_name: textOf(".list_item_top .company .company_name a"),
            com_link: attrOf(".list_item_top .com_logo a", "href"),
            com_logo: attrOf(".list_item_top .com_logo a img", "src"),
            industry: textOf(".list_item_top .company .industry"),
            skill: textOf(".list_item_bot .li_b_l span"),
            guarantee: textOf(".list_item_bot .li_b_r"),
        })
    })

    return records
}

5.保存数据

// Persist the collected records as JSON next to this script.
// fs.writeFileSync accepts no callback: it returns on success and throws on
// failure, so the error has to be caught with try/catch to be logged.
try {
    fs.writeFileSync(`${__dirname}/data.json`, JSON.stringify(ListData), 'utf-8')
} catch (err) {
    console.log(err)
}

6.完整代码

const express = require("express")
const app = express()
const fs = require("fs")
const superagent = require("superagent")
const cheerio = require("cheerio")
// Base URL of the listing to crawl; the page number is appended directly after the trailing slash.
const lagouURL = "https://www.lagou.com/guangzhou-zhaopin/webqianduan/";
// Suffix appended after the page number (closing slash plus query string).
// NOTE(review): `sid` looks like a session id copied from a browser visit and may expire — confirm.
const code = "/?filterOption=3&sid=b87c46399fd24f618b97b395f945ab1b";



// GET / starts a crawl: one listing page is requested per second (pages 1..29).
// On the 30th tick the timer stops, the collected records are written to
// data.json and returned to the client.
// Note: `key` is the Express response object.
app.get("/", (req, key) => {
    let i = 0
    let ListData = []

    let timer = setInterval(() => {
        i++
        // Snapshot the page number for this tick. The request callback below runs
        // later, and reading the shared counter there would yield the wrong page
        // (and wrong ids) whenever a response takes longer than one tick.
        const page = i

        if (page >= 30) {
            // Stop the timer first so a failing write cannot leave it running.
            clearInterval(timer)

            if (ListData.length > 0) {
                // writeFileSync takes no callback; failures are thrown synchronously.
                try {
                    fs.writeFileSync(`${__dirname}/data.json`, JSON.stringify(ListData), 'utf-8')
                } catch (err) {
                    console.log(err)
                }
            }

            // Answer the request so the client does not hang forever.
            key.send(ListData)
            return
        }

        const url = lagouURL + page + code
        console.log(url, page)

        superagent.get(url).end((err, res) => {
            if (err) {
                // Keep crawling the remaining pages, but record why this one failed.
                console.log("获取失败", err)
                return
            }

            ListData = ListData.concat(getHotData(res, page))
        })
    }, 1000)
})

/**
 * Extract the job postings from one fetched listing page.
 *
 * @param {object} res - Response object whose `text` property holds the page HTML.
 * @param {number} i - Page number; used only to derive each record's `id`.
 * @returns {object[]} One record per `.con_list_item` element found under `#s_position_list ul`.
 */
let getHotData = (res, i) => {
    const $ = cheerio.load(res.text)
    const records = []

    $("#s_position_list ul>.con_list_item").each((index, element) => {
        const $item = $(element).children()
        // Small readers so that every field below is a single selector lookup.
        const textOf = (selector) => $item.find(selector).text()
        const attrOf = (selector, name) => $item.find(selector).attr(name)

        records.push({
            // Running id across pages; the formula assumes 15 postings per page.
            id: (i - 1) * 15 + index + 1,
            position: textOf(".list_item_top .position .p_top a h3"),
            region: textOf(".list_item_top .position .p_top a .add"),
            title: textOf(".list_item_top .position .p_top a .format-time"),
            link: attrOf(".list_item_top .position .p_top a", "href"),
            money: textOf(".list_item_top .position .p_bot .li_b_l"),
            education: textOf(".list_item_top .position .p_bot .li_b_l span"),
            company_name: textOf(".list_item_top .company .company_name a"),
            com_link: attrOf(".list_item_top .com_logo a", "href"),
            com_logo: attrOf(".list_item_top .com_logo a img", "src"),
            industry: textOf(".list_item_top .company .industry"),
            skill: textOf(".list_item_bot .li_b_l span"),
            guarantee: textOf(".list_item_bot .li_b_r"),
        })
    })

    return records
}

// Start the HTTP server on port 3000 and log a message once it is listening.
// The crawl itself only begins when GET / is requested.
app.listen(3000, () => console.log("启动成功"))

你可能感兴趣的:(node)