使用 puppeteer 爬取古诗文网 https://www.gushiwen.org/shiwen/ 内容及音频,通过 mysqljs 保存到数据库
核心代码如下:
'use strict';
const puppeteer = require('puppeteer');
var request = require('request');
var fs = require('fs');
const gushiwen = require('./sql');
/**
 * Entry point: launches a browser and walks the listing pages
 * default_0AA1.aspx .. default_0AA998.aspx one after another, handing each
 * one to click() for scraping.
 *
 * Each listing page gets its own tab, which is closed as soon as click()
 * settles so tabs do not pile up over the ~1000 iterations; the browser
 * itself is closed when the loop ends or fails.
 */
(async () => {
  const browser = await puppeteer.launch({
    // headless: false
  });
  try {
    for (let i = 1; i < 999; i++) {
      const url = 'https://www.gushiwen.org/shiwen/default_0AA' + i + '.aspx';
      // Use a local binding: `this` inside a top-level arrow function is the
      // module's exports object, not a crawler instance.
      const page = await browser.newPage();
      try {
        // Awaiting keeps the requests sequential, which slows the crawl down
        // and avoids putting load on the remote server.
        await click(page, url);
      } finally {
        await page.close();
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Surface crawl failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Scrape one listing page and store every poem found on it.
 *
 * Navigates `page` to `url`, clicks every speaker icon, then for each
 * `.left .sons` entry reads title, source line (dynasty/author), content id,
 * content HTML, tags, like count and audio URL. When an audio URL is present
 * the file is passed to downloadFile(); the record is then handed to
 * gushiwen.insertGushiwen().
 *
 * @param {object} page - puppeteer page used for navigation and queries
 * @param {string} url  - address of the listing page to scrape
 */
async function click(page, url) {
  await page.goto(url);

  // Click every speaker icon first (presumably this makes the page create
  // the <audio> elements that are read further down -- confirm on the site).
  const speakerIcons = await page.$$('img[id*="speakerimg"]');
  for (const icon of speakerIcons) {
    await icon.click();
  }

  const poems = await page.$$('.left .sons');
  for (const poem of poems) {
    // Title
    const title = await poem.$eval('.sons .cont a', (node) => node.innerText);
    console.debug(title);

    // Dynasty and author share one text node, separated by a colon.
    const source = await poem.$eval('.source', (node) => node.innerText);
    const sourceParts = source.split(":");
    const dynasty = sourceParts[0];
    const author = sourceParts[1];

    // Content id: the element id minus its first 7 characters.
    const rawId = await poem.$eval('.contson', (node) => node.id);
    const id = rawId.substring(7);

    // Content (kept as HTML)
    const contson = await poem.$eval('.contson', (node) => node.innerHTML);

    // Tags are optional: a failed lookup is logged and stored as null.
    const rawTag = await poem.$eval('.tag', (node) => node.innerText).catch(function (err) {
      console.error(err);
    });
    const tag = rawTag === undefined
      ? null
      : rawTag.replace(/[\r\n]/g,"").replace(/,/g,",");

    // Like count
    const rawScores = await poem.$eval('.good', (node) => node.innerText);
    const scores = rawScores.trim();

    // Audio is optional as well: a failed lookup is logged and skipped.
    const audiosrc = await poem.$eval('audio', (node) => node.src).catch(function (err) {
      console.error(err);
    });

    let filename = '';
    if (audiosrc !== undefined) {
      // The 6th '/'-separated segment of the URL is used as the file name.
      filename = './mp3/' + audiosrc.split('/')[5];
      await downloadFile(audiosrc, filename, function () {
        console.debug(filename+'下载完毕');
      });
    }

    gushiwen.insertGushiwen(id, title, author, contson, dynasty, filename, scores, tag);
  }
}
/**
 * Download a remote file to disk unless the target file already exists.
 *
 * The target is opened with the 'wx' flag, so an existing file is never
 * overwritten: in that case a message is logged, `callback` is NOT invoked
 * and the returned promise resolves. The descriptor obtained from that open
 * is handed to the write stream, which closes it when the stream ends, so no
 * descriptor is left open.
 *
 * @param {string} url        - address of the remote file
 * @param {string} filename   - local path to write to
 * @param {Function} [callback] - invoked once the file has been fully written
 * @returns {Promise<void>} resolves when the download finished or was
 *   skipped because the file exists; rejects on open, network or write errors
 */
function downloadFile(url, filename, callback) {
  return new Promise((resolve, reject) => {
    fs.open(filename, 'wx', (err, fd) => {
      if (err) {
        if (err.code === 'EEXIST') {
          console.error(filename + ' already exists');
          resolve();
          return;
        }
        // Report through the promise: throwing inside this callback could
        // not be caught by the caller.
        reject(err);
        return;
      }

      // Reuse the descriptor opened above instead of opening the file twice.
      const stream = fs.createWriteStream(filename, { fd });
      let failed = false;
      const fail = (error) => {
        if (failed) {
          return;
        }
        failed = true;
        stream.destroy();
        reject(error);
      };

      stream.on('error', fail);
      stream.on('close', () => {
        if (failed) {
          return;
        }
        if (typeof callback === 'function') {
          callback();
        }
        resolve();
      });

      try {
        // Stream errors are asynchronous, so they are handled via 'error'
        // events; the try/catch only covers synchronous failures.
        request(url).on('error', fail).pipe(stream);
      } catch (error) {
        fail(error);
      }
    });
  });
}
使用 mysqljs 数据库插入操作:
var mysql = require('mysql');
// Shared MySQL connection used by the exported insert helper.
// Each setting can be overridden through an environment variable so that
// credentials do not have to live in the source; the previous hard-coded
// values remain the defaults.
const connection = mysql.createConnection({
  host     : process.env.MYSQL_HOST || 'localhost',
  user     : process.env.MYSQL_USER || 'root',
  password : process.env.MYSQL_PASSWORD || 'root',
  database : process.env.MYSQL_DATABASE || 'nichuiniu'
});
module.exports = {
insertGushiwen: function(num, title,author,content,dynasty,audiourl,scores,tag){
console.log('insert Gushiwen into tables')
let sql = {num: num, title: title,author:author,content: content,
dynasty: dynasty,audiourl:audiourl,scores: scores, tag: tag};
connection.query('INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?', sql, function(err, results, fields){
if (err) throw err;
console.log('The affect row is: ' + results.insertId);
}
);
}
}
// 单独测试使用的方法
// connection.connect();
// var post = {num: 2, title: 'title',author:'author',content: 'content',
// dynasty: 'dynasty',audiourl:'audiourl',scores: 1, tag: 'title'};
// connection.query('INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?', post, function (error, results, fields) {
// if (error) throw error;
// console.log('The solution is: ' + results.insertId);
// });
// connection.end();
GitHub地址:https://github.com/libp/gushiwenpuppeteer