一直想找个机会促使自己学习一下 Node.js。之前也提到过自己心中的一个想法,于是准备学写爬虫。以下的代码是自己参考别人的代码写的,只能算是学习练习吧!
爬取的歌单网址:http://www.luoo.net/music/
用到的node.js的原生模块:fs、path
用到的第三方包:async(异步流程控制)、request(发起HTTP请求)、colors(在控制台输出带颜色的文字)、cheerio(服务端操作DOM)
涉及到的ES6知识:类、模板字符串、promise、数组实例的keys()方法、数组空位
"use strict"
const fs = require("fs");
const path = require("path");
const async = require("async");
const request = require("request");
const colors = require("colors");
const cheerio = require("cheerio");
// Crawl configuration: the playlist base URL plus the volume numbers to
// fetch (1..853 inclusive).
const opts = {
  baseUrl: "http://www.luoo.net/music/",
  // Array.from with a length object is the idiomatic way to build a numeric
  // range; the original [...Array(854).keys()].slice(1) was harder to read.
  range: Array.from({ length: 853 }, (_, i) => i + 1),
};
/**
 * Crawler for http://www.luoo.net/music/ playlists: scrapes each volume's
 * track list and downloads the mp3 files into per-volume folders.
 */
class Crawler {
  constructor() {
  }

  /**
   * Ensure the directory `p` (relative to this script) exists, creating it
   * when missing.
   * @param {string} p - directory path relative to __dirname
   */
  checkImgPath(p) {
    const dir = path.join(__dirname, p);
    try {
      // fs.F_OK is deprecated; the constant lives on fs.constants.
      fs.accessSync(dir, fs.constants.F_OK);
    } catch (e) {
      fs.mkdirSync(dir);
    }
  }

  /**
   * Fetch one playlist page and scrape its title and track names.
   * @param {string} url - playlist page URL
   * @param {number} n - volume number, used to build the folder name
   * @returns {Promise<{title: string, songs: string[], dir: string}>}
   */
  getSongList(url, n) {
    const self = this;
    return new Promise(function (resolve, reject) {
      request(url, function (err, res, body) {
        // Reject on any failure so the caller can recover; the original
        // never settled the promise here, which stalled the whole queue.
        if (err) {
          return reject(err);
        }
        if (res.statusCode !== 200) {
          return reject(new Error(`Unexpected status ${res.statusCode} for ${url}`));
        }
        const $ = cheerio.load(body);
        const title = $(".vol-title").text();
        const dir = `/luowang/vol.${n} ${title}`;
        // cheerio's map callback receives (index, element). .get() converts
        // the cheerio collection into a plain array; iterating the cheerio
        // object directly with async.eachOfSeries would also visit its
        // internal own properties (options, _root, prevObject).
        const songs = $(".track-wrapper")
          .map(function (i, el) {
            return $(el).find(".trackname").text() + "-" + $(el).find(".artist").text();
          })
          .get();
        self.checkImgPath(dir);
        resolve({ title, songs, dir });
      });
    });
  }

  /**
   * Download a single mp3 into `dir`.
   * @param {number} radio - volume number (the radio id in the CDN URL)
   * @param {string} title - output file name (without extension)
   * @param {number} num - 1-based track number within the volume
   * @param {string} dir - target directory relative to __dirname
   * @param {Function} callback2 - async.js continuation; always invoked with
   *   null so one failed track does not abort the whole volume
   */
  downloadSong(radio, title, num, dir, callback2) {
    // From vol.3 onward the CDN zero-pads single-digit track numbers.
    num = radio > 2 && num < 10 ? "0" + num : num;
    const uri = `http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio${radio}/${num}.mp3`;
    let done = false;
    // Guard so callback2 fires exactly once even if both streams emit errors.
    const finish = () => {
      if (!done) {
        done = true;
        callback2(null);
      }
    };
    request(uri)
      // Errors on the HTTP source stream do NOT propagate through pipe(),
      // so they must be handled here as well or a network error stalls the
      // download queue forever.
      .on("error", (err) => {
        console.log(colors.red(`${title} failed: ${err.message}`));
        finish();
      })
      .pipe(fs.createWriteStream(path.join(__dirname, dir, title + ".mp3")))
      .on("error", (err) => {
        console.log(colors.red(`${title} failed: ${err.message}`));
        finish();
      })
      .on("close", () => {
        console.log(title, " is downloaded!");
        finish();
      });
  }

  /**
   * Crawl every volume in opts.range sequentially, downloading each
   * volume's tracks one at a time.
   */
  start() {
    this.checkImgPath("luowang");
    async.eachOfSeries(opts.range, (n, idx, callback) => {
      this.getSongList(opts.baseUrl + n, n)
        .then((songInfo) => {
          console.log(colors.green(`\nvol.${n} ${songInfo.title}'s downloading is started!`));
          async.eachOfSeries(songInfo.songs, (s, i, callback2) => {
            this.downloadSong(n, s, i + 1, songInfo.dir, callback2);
          }, () => {
            console.log(colors.green(`vol.${n} ${songInfo.title} is downloaded!`));
            callback(null);
          });
        })
        // A failed volume is logged and skipped instead of hanging the queue.
        .catch((err) => {
          console.log(colors.red(`vol.${n} failed: ${err.message}`));
          callback(null);
        });
    }, () => {
      console.log(colors.magenta("All is downloaded!!!"));
    });
  }
}
// Entry point: build a crawler and start downloading immediately.
new Crawler().start();
运行代码20分钟,得到共38个文件夹,442个mp3文件并且已经命名好,共计1G的音乐,从此再也不用担心断网后没歌听了。
(也是不敢再运行下去了,目前网站共有866个歌单——不过上面代码中的 range 只覆盖到第853期——全部爬取下来也就是八百多个文件夹,约23G的mp3文件)
最后是一个广告贴,最近新开了一个分享技术的公众号,欢迎大家关注