现在博客园首页文章质量参差不齐,比如我这篇就要水了。于是弄了个小爬虫定时去爬首页的文章,超过1000点击的就自动发送邮件。
https://github.com/kklldog/cnblogs_notice
接口
博客园的首页列表其实是有 ajax 接口的,阅读量就在这个接口里面,使用 cheerio 就可以抽取出来。
https://www.cnblogs.com/mvc/AggSite/PostList.aspx
使用request发送请求
var request = require('request');
/**
 * Issue a request for `url` through the shared `req` helper, using the
 * default method and a `timeout` option of 30000.
 *
 * @param {string} url - Address to request.
 * @param {Function} callback - Forwarded to `req` as the success handler.
 * @param {Function} [errCallback] - Forwarded to `req` as the failure handler.
 * @param {number} [trytimes] - Forwarded to `req` as the retry budget.
 */
function get(url, callback, errCallback, trytimes) {
    var option = { url: url, timeout: 30000 };
    req(option, callback, errCallback, trytimes);
}
/**
 * Issue a POST request through the shared `req` helper, using a `timeout`
 * option of 30000.
 *
 * @param {string} url - Address to request.
 * @param {*} body - Value placed in the request options as `body`.
 * @param {*} isJson - Value placed in the request options as `json`.
 * @param {Function} callback - Forwarded to `req` as the success handler.
 * @param {Function} [errCallback] - Forwarded to `req` as the failure handler.
 * @param {number} [trytimes] - Forwarded to `req` as the retry budget.
 */
function post(url, body, isJson, callback, errCallback, trytimes) {
    var option = {
        url: url,
        timeout: 30000,
        body: body,
        method: 'POST',
        json: isJson
    };
    req(option, callback, errCallback, trytimes);
}
/**
 * Run `request(option, ...)` and retry on failure.
 *
 * Every failed attempt is logged with console.error. While the retry budget
 * is above zero the same request is issued again with the budget reduced by
 * one; once it is exhausted, `errCallback` (if supplied) receives the last
 * error.
 *
 * @param {Object} option - Options handed to `request`; `option.url` is used in the log line.
 * @param {Function} callback - Called with the response object on success.
 * @param {Function} [errCallback] - Called with the error after the final failed attempt.
 * @param {number} [trytimes] - Retries left after this attempt; 5 when undefined.
 */
function req(option, callback, errCallback, trytimes) {
    var remaining = trytimes === undefined ? 5 : trytimes;

    request(option, function (err, res) {
        if (!err) {
            callback(res);
            return;
        }

        console.error('request ' + option.url + ' error .');
        console.error(err);

        if (remaining > 0) {
            req(option, callback, errCallback, remaining - 1);
            return;
        }
        if (errCallback) {
            errCallback(err);
        }
    });
}
// Public API of the request helper module.
Object.assign(exports, { get: get, post: post, req: req });
使用cheerio抽取数据
// Walk every post entry in the loaded markup, log its link / view counter /
// title, and hand posts whose counter exceeds 1000 to trySendMail.
var $ = cheerio.load(body);
$('div.post_item_body').each((index, postBody) => {
    const post = $(postBody);
    const name = post.find('a.titlelnk').text();
    post.find('span.article_view a').each((i, e) => {
        const anchor = $(e);
        const link = anchor.attr('href');
        const text = anchor.text();
        // The counter is whatever sits between the first '(' and the first ')'.
        // When the closing parenthesis is missing or precedes the opening one,
        // there is no counter and the empty string is used.
        const sIndex = text.indexOf('(');
        const eIndex = text.indexOf(')');
        const viewCount = eIndex > sIndex ? text.slice(sIndex + 1, eIndex) : '';
        // Explicit radix so the counter is always read as decimal; a
        // non-numeric counter yields NaN, which fails the comparison below.
        const intViewCount = parseInt(viewCount, 10);
        console.log(link + ' ' + viewCount + ' ' + name);
        if (intViewCount > 1000) {
            trySendMail(link, name, mailAddress);
        }
    });
});
使用mongodb储存数据
var Db = require('mongodb').Db;
var Server = require('mongodb').Server;
var MongoClient = require('mongodb').MongoClient;
var db;
/**
 * Connect to the local "notice" MongoDB database and keep the resulting
 * handle in the module-level `db` variable.
 *
 * A connection error is only logged; `db` is left untouched in that case.
 */
function init() {
    var onConnected = (err, database) => {
        if (!err) {
            console.log('connect to db success');
            db = database;
            return;
        }
        console.error(err);
    };

    MongoClient.connect("mongodb://localhost:27017/notice", onConnected);
}
/**
 * Insert `data` into the collection named `collName`.
 *
 * On success a confirmation is logged and `callback`, when given, receives
 * the driver result. On failure the error is logged and `callback` is not
 * invoked.
 *
 * @param {string} collName - Target collection name.
 * @param {*} data - Value handed to the collection's `insert`.
 * @param {Function} [callback] - Receives the insert result on success.
 */
function insert(collName, data, callback) {
    db.collection(collName).insert(data, (err, r) => {
        if (err) {
            console.error(err);
            return;
        }
        console.log('save to ' + collName + ' success !');
        if (callback) {
            callback(r);
        }
    });
}
/**
 * Fetch one page of documents from the collection named `collName`.
 *
 * The cursor is sorted with `{ videoId: 1 }`, then `skip` and `limit` are
 * applied. On error the error is logged and `callback` receives an empty
 * array.
 * NOTE(review): the `videoId` sort key may be carried over from another
 * project — confirm the stored documents actually have that field.
 *
 * @param {string} collName - Collection to query.
 * @param {Object} filter - Query passed to `find`.
 * @param {number} skip - Value passed to the cursor's `skip`.
 * @param {number} limit - Value passed to the cursor's `limit`.
 * @param {Function} callback - Receives the resulting array.
 */
function queryPage(collName, filter, skip, limit, callback) {
    var cursor = db.collection(collName)
        .find(filter)
        .sort({ videoId: 1 })
        .skip(skip)
        .limit(limit);

    cursor.toArray((err, r) => {
        if (err) {
            console.error(err);
            callback([]);
            return;
        }
        callback(r);
    });
}
/**
 * Remove the documents matching `filter` from the collection named
 * `collName`.
 *
 * On success a confirmation is logged and `callback`, when given, receives
 * the driver result. On failure the error is logged and `callback` is not
 * invoked.
 *
 * @param {string} collName - Target collection name.
 * @param {Object} filter - Query handed to the collection's `remove`.
 * @param {Function} [callback] - Receives the remove result on success.
 */
function remove(collName, filter, callback) {
    db.collection(collName).remove(filter, (err, r) => {
        if (err) {
            console.error(err);
            return;
        }
        console.log('remove to ' + collName + ' success !');
        if (callback) {
            callback(r);
        }
    });
}
/**
 * Load every document matching `filter` from the collection named
 * `collName`.
 *
 * On error the error is logged and `callback` receives an empty array.
 *
 * @param {string} collName - Collection to query.
 * @param {Object} filter - Query passed to the collection's `find`.
 * @param {Function} callback - Receives the resulting array.
 */
function find(collName, filter, callback) {
    var cursor = db.collection(collName).find(filter);

    cursor.toArray((err, r) => {
        if (err) {
            console.error(err);
            callback([]);
            return;
        }
        callback(r);
    });
}
/**
 * Apply `updateObj` as a `$set` to documents matching `filter` in the
 * collection named `collName`.
 *
 * On success a confirmation is logged and `callback`, when given, receives
 * the driver result. On failure the error is logged and `errCallback`, when
 * given, receives it.
 *
 * @param {string} collName - Target collection name.
 * @param {Object} filter - Query selecting what to update.
 * @param {Object} updateObj - Fields to write; wrapped in `{ $set: ... }`.
 * @param {Function} [callback] - Receives the update result on success.
 * @param {Function} [errCallback] - Receives the error on failure.
 */
var update = function (collName, filter, updateObj, callback, errCallback) {
    var coll = db.collection(collName);
    coll.update(filter, { $set: updateObj }, (err, r) => {
        if (err) {
            console.error(err);
            // errCallback is optional, like callback: without this guard a
            // failed write would throw a TypeError from inside the driver
            // callback whenever the caller passed no error handler.
            if (errCallback) {
                errCallback(err);
            }
            return;
        }
        console.log('update to ' + collName + ' success !');
        if (callback) {
            callback(r);
        }
    });
}
// Public API of the MongoDB helper module.
Object.assign(exports, {
    insert: insert,
    queryPage: queryPage,
    remove: remove,
    find: find,
    update: update,
    init: init
});
使用node-schedule来执行定时任务
var schedule = require('node-schedule');
var cnblogs =require('./cnblogs');
/**
 * Scheduled job body: runs `cnblogs.filter` with the fixed arguments 1 and 10.
 */
function filter() {
    cnblogs.filter(1, 10);
}
/**
 * Register `filter` with node-schedule using the rule
 * `{ hour: 10, minute: 1 }` and log that the schedule is set up.
 */
var initSchedule = function () {
    // Plain decimal 1: the former `01` is a legacy octal literal, which is a
    // SyntaxError in strict mode and in ES modules.
    schedule.scheduleJob({ hour: 10, minute: 1 }, filter);
    console.log('schedule inited .');
}
订阅
回复邮件地址就可以自动订阅推送 :)
我的博客即将搬运同步至腾讯云+社区,邀请大家一同入驻:https://cloud.tencent.com/developer/support-plan