一. 安装插件
- 安装 puppeteer插件
npm install puppeteer -S
- 引入 puppeteer 这个插件
const puppeteer = require('puppeteer');
- 编写一个异步的async自执行函数
二. 爬取数据(爬取豆瓣的电影数据)
- 使用 puppeteer 插件启动一个浏览器,并开启一个新页面
// Douban "explore" listing to scrape (hash route: type=movie, tag=经典, sort=rank, page_limit=20).
const url = `https://movie.douban.com/explore#!type=movie&tag=%E7%BB%8F%E5%85%B8&sort=rank&page_limit=20&page_start=0`;
// Options handed to puppeteer.launch(); '--no-sandbox' is passed through to the browser as a command-line flag.
const launchOptions = {
  args: ['--no-sandbox'],
  dumpio: false
};
const brower = await puppeteer.launch(launchOptions);
// Open a new page in the browser that was just started.
const page = await brower.newPage();
- 跳转到需要爬取的页面
// Navigate to the Douban listing page.
// 'networkidle2' is used as the "page has finished loading" signal.
const navigationOptions = { waitUntil: 'networkidle2' };
await page.goto(url, navigationOptions);
- 由于我们要爬取两页的数据,所以要等待页面的“加载更多”按钮出现,然后模拟点击
// Wait until the "load more" button (.more) exists in the DOM.
await page.waitForSelector('.more');
// Two pages in total: the first page is already rendered, so click "load more" once.
for (let i = 0; i < 1; i++) {
  // Pause before each click.
  await sleep(3000);
  // Remember how many anchors are listed now, so we can tell when the next batch arrives.
  const countBefore = await page.$$eval('.list-wp a', (anchors) => anchors.length);
  // Click "load more".
  await page.click('.more');
  // Without this wait the scrape runs before the new items are rendered and
  // only the first page (20 items) is collected.
  await page.waitForFunction(
    (selector, count) => document.querySelectorAll(selector).length > count,
    {},
    '.list-wp a',
    countBefore
  );
}
- 这时我们可以执行爬取我们需要的数据了,我们可以去审查页面的 DOM 结构,来循环遍历这些数据。
// Runs inside the page: collect one record per movie anchor under .list-wp.
// Returns an array of { doubanId, title, rate, poster }.
const result = await page.evaluate(() => {
  // Use the jQuery instance that the page itself exposes.
  const $ = window.$;
  const links = [];
  $('.list-wp a').each((index, item) => {
    const it = $(item);
    const doubanId = it.find('div').data('id');
    // Some anchors under .list-wp carry no movie data (presumably the
    // "load more" link); without this guard they end up in the output as a
    // bogus trailing `{ rate: 0 }` record.
    if (doubanId == null) {
      return;
    }
    const image = it.find('img');
    links.push({
      doubanId,
      title: image.attr('alt'),
      rate: Number(it.find('strong').text()),
      poster: image.attr('src')
    });
  });
  return links;
});
- 关闭浏览器,在console里面打印我们需要的数据
// Close the browser. close() returns a promise, so await it instead of
// leaving it floating; this ensures shutdown has finished before moving on.
await brower.close();
// Print the scraped records.
console.log(result);
结果
[ { doubanId: 1292052,
title: '肖申克的救赎',
rate: 9.6,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg' },
{ doubanId: 1291546,
title: '霸王别姬',
rate: 9.5,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p1910813120.jpg' },
{ doubanId: 1295644,
title: '这个杀手不太冷',
rate: 9.4,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p511118051.jpg' },
{ doubanId: 1292720,
title: '阿甘正传',
rate: 9.4,
poster: 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p510876377.jpg' },
{ doubanId: 1292063,
title: '美丽人生',
rate: 9.5,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p510861873.jpg' },
{ doubanId: 1295124,
title: '辛德勒的名单',
rate: 9.4,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p492406163.jpg' },
{ doubanId: 1291561,
title: '千与千寻',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p1606727862.jpg' },
{ doubanId: 1292722,
title: '泰坦尼克号',
rate: 9.3,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p457760035.jpg' },
{ doubanId: 1296141,
title: '控方证人',
rate: 9.6,
poster: 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p1505392928.jpg' },
{ doubanId: 3541415,
title: '盗梦空间',
rate: 9.3,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p513344864.jpg' },
{ doubanId: 25662329,
title: '疯狂动物城',
rate: 9.2,
poster: 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2315672647.jpg' },
{ doubanId: 2131459,
title: '机器人总动员',
rate: 9.3,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p1461851991.jpg' },
{ doubanId: 3011091,
title: '忠犬八公的故事',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p524964016.jpg' },
{ doubanId: 3793023,
title: '三傻大闹宝莱坞',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p579729551.jpg' },
{ doubanId: 1889243,
title: '星际穿越',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2206088801.jpg' },
{ doubanId: 1292213,
title: '大话西游之大圣娶亲',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2455050536.jpg' },
{ doubanId: 1291549,
title: '放牛班的春天',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p1910824951.jpg' },
{ doubanId: 1292001,
title: '海上钢琴师',
rate: 9.2,
poster: 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p511146807.jpg' },
{ doubanId: 1292064,
title: '楚门的世界',
rate: 9.1,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p479682972.jpg' },
{ doubanId: 1291841,
title: '教父',
rate: 9.2,
poster: 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2190556185.jpg' },
{ rate: 0 } ]
完整代码
const puppeteer = require('puppeteer');
/**
 * Pause for the given number of milliseconds.
 * @param {number} time - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
const sleep = (time) =>
  new Promise((done) => {
    setTimeout(done, time);
  });
// Douban "explore" listing to scrape (hash route: type=movie, tag=经典, sort=rank, page_limit=20).
const url = `https://movie.douban.com/explore#!type=movie&tag=%E7%BB%8F%E5%85%B8&sort=rank&page_limit=20&page_start=0`;

/**
 * Entry point: start a browser, open the Douban listing, load one extra page
 * through the "load more" button, scrape every movie card and print the result.
 *
 * The browser is always closed, even when a step throws, and any failure is
 * reported on stderr with a non-zero exit code instead of an unhandled rejection.
 */
(async () => {
  console.log('Start visit');
  // Start a browser.
  const brower = await puppeteer.launch({
    args: ['--no-sandbox'],
    dumpio: false
  });

  let result;
  try {
    // Open a new page.
    const page = await brower.newPage();
    // Navigate to the Douban listing; 'networkidle2' is used as the "finished loading" signal.
    await page.goto(url, {
      waitUntil: 'networkidle2'
    });
    await sleep(3000);
    // Wait until the "load more" button exists.
    await page.waitForSelector('.more');
    // Two pages in total: the first page is already rendered, so click "load more" once.
    for (let i = 0; i < 1; i++) {
      await sleep(3000);
      // Remember how many anchors are listed now, so we can tell when the next batch arrives.
      const countBefore = await page.$$eval('.list-wp a', (anchors) => anchors.length);
      // Click "load more".
      await page.click('.more');
      // Without this wait the scrape runs before the new items are rendered
      // and only the first page (20 items) is collected.
      await page.waitForFunction(
        (selector, count) => document.querySelectorAll(selector).length > count,
        {},
        '.list-wp a',
        countBefore
      );
    }
    // Runs inside the page: build one { doubanId, title, rate, poster } record per movie anchor.
    result = await page.evaluate(() => {
      // Use the jQuery instance that the page itself exposes.
      const $ = window.$;
      const links = [];
      $('.list-wp a').each((index, item) => {
        const it = $(item);
        const doubanId = it.find('div').data('id');
        // Some anchors under .list-wp carry no movie data (presumably the
        // "load more" link); without this guard they end up in the output
        // as a bogus trailing `{ rate: 0 }` record.
        if (doubanId == null) {
          return;
        }
        const image = it.find('img');
        links.push({
          doubanId,
          title: image.attr('alt'),
          rate: Number(it.find('strong').text()),
          poster: image.attr('src')
        });
      });
      return links;
    });
  } finally {
    // Close the browser on success and on failure; await it so shutdown completes.
    await brower.close();
  }
  console.log(result);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
总结
- 使用了 async 和 await 这种写法,需要较新版本的 Node(7.6 及以上才原生支持)
- 爬取之后我们也可以将数据存入 mongodb
- 最好开启一个子线程来执行这些操作。
会在下一篇文章写开启子线程来爬取数据