使用node.js+puppeteer爬取数据,实现无视接口是否加密都可以爬取数据

使用nodejs实现爬虫,无视接口是否加密

  1. 准备工作:
    安装 Node.js 开发环境,并安装 puppeteer 包:在终端中输入 npm install puppeteer 即可安装。这个包比较大,最好切换到淘宝镜像再安装,这里不做过多赘述。另外我们还需要将爬取到的数据存入数据库,所以还要安装 mysql 包,同样在终端输入 npm install mysql 即可。
  2. puppeteer介绍:
    Puppeteer 是 Chrome 开发团队在 2017 年发布的一个 Node.js 包,用来模拟 Chrome 浏览器的运行。凡是能在浏览器中进行的操作(点击、输入、跳转等),基本都可以通过 Puppeteer 的 API 来模拟,因此它是一个很强大的工具。爬取数据时经常会发现很多网站的接口数据是加密的,这使得直接调用接口爬取的难度很大;而使用 Puppeteer 就不必再关心接口数据是否加密,只要数据最终展示在页面上,就可以爬取。
  3. 具体代码:
let puppeteer = require('puppeteer');
let mysql = require('mysql');

// Database connection settings. The right-hand values below are placeholders
// written by the author (host IP, user name, password, port, database) and
// must be replaced with real values before the script can run.
let connection = mysql.createConnection({
    host : ip,
    user: 用户名,
    password: 密码,
    port : 端口,
    // NOTE(review): the placeholder below reads "table name", but the mysql
    // driver's `database` option presumably expects the database (schema)
    // name; the table itself is named in shopAddSql — confirm.
    database: 表名
});
// Open the database connection.
connection.connect();
// Parameterized INSERT for one shop row. The five `?` placeholders are filled
// from the value array that insertShop passes to connection.query.
let shopAddSql = 'INSERT INTO huawei_shop_place (province,city,area_town,shop_name,shop_location) VALUES (?,?,?,?,?)';
/**
 * Insert one shop row using the parameterized statement in `shopAddSql`
 * on the shared `connection`.
 *
 * The returned promise always settles: the previous version never called
 * resolve or reject, so any caller that awaited it would hang forever.
 * Failures are logged and reported as `null` instead of a rejection, which
 * keeps the crawl best-effort and means callers that do not await the result
 * cannot trigger an unhandled rejection.
 *
 * @param {Array} shopMessage - Values for the five `?` placeholders, in the
 *   column order of the statement: province, city, area_town, shop_name,
 *   shop_location.
 * @returns {Promise<object|null>} The driver's result object on success,
 *   or `null` when the insert failed (the error message is logged).
 */
function insertShop(shopMessage){
    return new Promise((resolve)=>{
        connection.query(shopAddSql,shopMessage,(err,results)=>{
            if(err){
                console.log('[INSERT ERROR] - ',err.message);
                resolve(null);
                return;
            }
            console.log('成功');
            resolve(results);
        });
    });
}

// URL prefix of the store-query request the page sends after a city or area is
// picked; a successful response to it is used as the "selection applied" signal.
const STORE_QUERY_URL = 'https://consumer.huawei.com/cmkt/services/service/store/query/v2';

// The province loop visits dropdown indexes 1..31, as the original hard-coded bound did.
const PROVINCE_INDEX_LIMIT = 32;

/**
 * Resolve after `ms` milliseconds.
 * Replaces `page.waitFor(ms)`, which is deprecated and missing from current
 * Puppeteer releases.
 */
function sleep(ms){
    return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Click the select box of one dropdown and return its option handles.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} level - 'province', 'city' or 'area'; used to build the
 *   `.<level> .select` and `.<level> .select .select-<level> li` selectors.
 * @returns {Promise<Array>} Element handles of the `li` options.
 * @throws {Error} When the select box is not present on the page.
 */
async function openDropdown(page, level){
    const trigger = await page.$(`.${level} .select`);
    if(!trigger){
        throw new Error(`Dropdown ".${level} .select" was not found on the page`);
    }
    await trigger.click();
    return page.$$(`.${level} .select .select-${level} li`);
}

/**
 * Click a dropdown option and wait for the store-query response it triggers.
 * The response listener is registered BEFORE the click so that a fast
 * response cannot slip through between the click and the wait.
 */
async function selectAndAwaitStores(page, option){
    await Promise.all([
        page.waitForResponse((res) => res.request().url().startsWith(STORE_QUERY_URL) && res.ok()),
        option.click()
    ]);
}

/**
 * Read the shop list currently shown on the page and queue one insert per shop.
 * Shop names and addresses are paired by their position in the list.
 */
async function saveVisibleShops(page){
    const readTexts = (elements) => elements.map((el) => el.innerText);
    const shopNameList = await page.$$eval('.point-list #pointList li .point-title p', readTexts);
    const shopPositionList = await page.$$eval('.point-list #pointList li .left-box .pa-wrap span a', readTexts);
    if(shopNameList.length === 0){
        return;
    }

    // The selected province/city/area are the same for every shop in this
    // list, so they are read once instead of once per shop.
    const province = await page.$eval('.province .select span', (el) => el.innerText);
    const city = await page.$eval('.city .select span', (el) => el.innerText);
    const area = await page.$eval('.area .select span', (el) => el.innerText);

    shopNameList.forEach((shopName, index) => {
        const shopPosition = shopPositionList[index];
        // Deliberately not awaited: the crawl does not depend on the insert
        // result. The catch only guards against an unhandled rejection.
        insertShop([province, city, area, shopName, shopPosition])
            .catch((err) => console.log('[INSERT ERROR] - ', err.message));
        console.log(province);
        console.log(city);
        console.log(area);
        console.log(shopName);
        console.log(shopPosition);
    });
}

/**
 * Walk every area (district/town) option of the currently selected city and
 * save the shops listed for each one.
 */
async function crawlAreas(page){
    let areaOptions = await openDropdown(page, 'area');
    await sleep(200);
    // Index 0 is skipped, as in the original loop (presumably a placeholder entry).
    for(let k = 1; k < areaOptions.length; k++){
        await sleep(200);
        if(k > 1){
            // After the first pick the list is re-opened and re-queried
            // (presumably the dropdown closes once an option is chosen).
            areaOptions = await openDropdown(page, 'area');
            if(k >= areaOptions.length){
                break;
            }
        }
        await selectAndAwaitStores(page, areaOptions[k]);
        // Pause before scraping the shop list, as the original did after each
        // store-query response.
        await sleep(1000);
        await saveVisibleShops(page);
    }
}

/**
 * Walk every city option of the currently selected province.
 */
async function crawlCities(page){
    await sleep(200);
    let cityOptions = await openDropdown(page, 'city');
    await sleep(200);
    console.log(cityOptions.length);
    // Index 0 is skipped, as in the original loop (presumably a placeholder entry).
    for(let j = 1; j < cityOptions.length; j++){
        await sleep(200);
        if(j > 1){
            // Same re-open step as for areas: the first pick reuses the list
            // opened above, later picks need it opened again.
            cityOptions = await openDropdown(page, 'city');
            if(j >= cityOptions.length){
                break;
            }
        }
        await selectAndAwaitStores(page, cityOptions[j]);
        await sleep(100);
        await crawlAreas(page);
    }
}

/**
 * Crawl the Huawei "where to buy" page: iterate province -> city -> area
 * through the page's dropdowns and hand every listed shop to `insertShop`.
 *
 * Compared with the original implementation:
 *  - all variables are declared (no implicit globals),
 *  - every click is awaited and response listeners are armed before clicking,
 *  - the four copies of the scrape logic are merged into helpers,
 *  - the browser is closed when the crawl finishes or fails.
 *
 * @returns {Promise<void>} Resolves when all provinces have been visited.
 */
async function addShopMessage(){
    // headless:false opens a visible browser window (useful while debugging);
    // set it to true to run without a window.
    const options = {
        defaultViewport:{
            width:1400,
            height:800
        },
        headless:false
    };
    const browser = await puppeteer.launch(options);
    try{
        const page = await browser.newPage();
        await page.goto('https://consumer.huawei.com/cn/support/where-to-buy/');

        for(let i = 1; i < PROVINCE_INDEX_LIMIT; i++){
            await sleep(200);
            const provinceOptions = await openDropdown(page, 'province');
            // Stop cleanly if the page lists fewer provinces than the limit
            // instead of clicking an undefined handle.
            if(i >= provinceOptions.length){
                break;
            }
            await provinceOptions[i].click();
            await crawlCities(page);
        }
    }finally{
        // Always release the browser process, even when the crawl throws.
        await browser.close();
    }
}
// Run the crawl. A failure is logged and reflected in the exit status.
// Afterwards the MySQL connection is ended so its open socket no longer keeps
// the Node process alive; queries already queued on the connection are sent
// before the driver disconnects.
addShopMessage()
    .catch((error) => {
        console.log(error);
        process.exitCode = 1;
    })
    .finally(() => {
        // The callback keeps a failed shutdown from surfacing as an unhandled
        // 'error' event on the connection.
        connection.end((endError) => {
            if (endError) {
                console.log('[CONNECTION END ERROR] - ', endError.message);
            }
        });
    });
  4. 启动说明:
    由于涉及自动打开浏览器等操作,在有些电脑上直接用编辑器自带的终端启动,可能会报错或者打开后一直没有反应,这通常是权限不够导致的,所以建议直接在以管理员身份打开的 cmd 窗口中启动项目。代码还没有做优化,毕竟只是把数据拿过来用一下,有兴趣的朋友可以帮忙优化或者提一些优化建议。本代码仅适用于学习,请不要用于商业用途,谢谢合作。

你可能感兴趣的:(puppeteer)