使用 node 抓取某个页面的所有新闻链接,以及这些文章链接里面的具体内容
var eventproxy=require('eventproxy');
var request=require('superagent');
var superagent=require('superagent-charset')(request);
var cheerio=require('cheerio');
var url=require('url');
var fs=require('fs');
var cnodeUrl='http://gd.qq.com/l/gdfabu/gdfbzwjj/more.htm';
var sql=require('mssql');
/**
 * Crawl the news list page at `cnodeUrl`, fetch every linked article, extract
 * the article body and insert one row per article into the `press` table.
 *
 * Flow:
 *   1. Download the list page and collect { href, title } for each link
 *      matched by `.list01 li a`.
 *   2. Fetch all article pages concurrently; each response is reported to
 *      eventproxy via the `topic_html` event.
 *   3. Once every article has reported, pull the inner HTML of
 *      `#Cnt-Main-Article-QQ` out of each page and insert the rows with
 *      parameterized queries.
 */
superagent.get(cnodeUrl).charset()
  .end(function (err, res) {
    if (err) {
      return console.error(err);
    }

    const topicUrls = [];
    // cheerio exposes a jQuery-like API over the downloaded markup.
    const $ = cheerio.load(res.text, { decodeEntities: false });
    $('.list01 li a').each(function (idx, element) {
      const $element = $(element);
      const rawHref = $element.attr('href');
      if (!rawHref) {
        // An anchor without an href cannot be resolved to an article URL.
        return;
      }
      topicUrls.push({
        href: url.resolve(cnodeUrl, rawHref),
        title: $element.text()
      });
    });

    const ep = new eventproxy();

    // Runs once `topic_html` has been emitted topicUrls.length times;
    // `topicPairs` holds the emitted [topicUrl, html] pairs.
    ep.after('topic_html', topicUrls.length, function (topicPairs) {
      const topics = topicPairs.map(function (topicPair) {
        const topicUrl = topicPair[0];
        const topicHtml = topicPair[1];
        const $article = cheerio.load(topicHtml, { decodeEntities: false });
        // Pages without the expected container yield an empty content string.
        const content = $article('#Cnt-Main-Article-QQ').html() || '';
        return {
          title: topicUrl.title || '无',
          url: topicUrl.href,
          comment: content
        };
      });

      // Optional alternative output: write a title/url summary to a text file.
      // const fileStr = topics.map(function (t) {
      //   return 'title:' + t.title + ';url:' + t.url + '\n';
      // }).join('');
      // fs.writeFile('node4.txt', fileStr, function (writeErr) {
      //   if (writeErr) return console.error(writeErr);
      // });

      // Insert into the database.
      // NOTE(review): credentials are hard-coded here; move them to
      // configuration or environment variables before real use.
      const conUrl = 'mssql://sa:123456@localhost:1433/text';
      sql.connect(conUrl, function (connErr) {
        if (connErr) {
          console.log(connErr);
          return;
        }

        // Count outstanding inserts so the connection can be closed when the
        // last one finishes; otherwise the open pool keeps the process alive.
        let pending = topics.length;
        if (pending === 0) {
          sql.close();
          return;
        }

        topics.forEach(function (topic) {
          // Scraped text may contain quotes, so the values are bound as
          // parameters rather than concatenated into the SQL string. A fresh
          // Request is used per row because parameter names must be unique
          // within one request.
          new sql.Request()
            .input('title', sql.NVarChar, topic.title)
            .input('content', sql.NVarChar(sql.MAX), topic.comment)
            .input('origin', sql.NVarChar, topic.url)
            .query(
              'INSERT INTO press (Title, Content, Origin) VALUES (@title, @content, @origin)',
              function (queryErr) {
                if (queryErr) {
                  console.log(queryErr);
                } else {
                  console.log("success");
                }
                pending -= 1;
                if (pending === 0) {
                  sql.close();
                }
              }
            );
        });
      });
    });

    topicUrls.forEach(function (topicUrl) {
      superagent.get(topicUrl.href).charset()
        .end(function (fetchErr, topicRes) {
          if (fetchErr) {
            console.error('Failed to fetch ' + topicUrl.href, fetchErr);
          }
          // Always emit, even on failure, so the `after` counter still reaches
          // topicUrls.length; a failed page contributes empty HTML.
          const html = (!fetchErr && topicRes && topicRes.text) || '';
          ep.emit('topic_html', [topicUrl, html]);
        });
    });
  });
eventproxy 等前四个模块需要自己 install。这个抓取方法不是最好的,可以采用 async 去代替 eventproxy:eventproxy 是一次性并发抓取全部链接,容易被别人发现,而 async 可以控制同时抓取的数量。
写入数据库的部分请自己模拟一个;写入 txt 的效果如下图: