这学期web编程课,第一个实验项目是新闻网站的爬虫。第一次接触,还很生疏,中间也有很多困难。
具体要求
1、选取3-5个代表性的新闻网站(比如新浪新闻、网易新闻等,或者某个垂直领域权威性的网站比如经济领域的雪球财经、东方财富等,或者体育领域的腾讯体育、虎扑体育等等)建立爬虫,针对不同网站的新闻页面进行分析,爬取出编码、标题、作者、时间、关键词、摘要、内容、来源等结构化信息,存储在数据库中。
2、建立网站提供对爬取内容的分项全文搜索,给出所查关键词的时间热度分析。
技术要求:
1、必须采用Node.JS实现网络爬虫
2、必须采用Node.JS实现查询网站后端,HTML+JS实现前端(尽量不要使用任何前后端框架)
前期准备
安装node.js和几个模块库
安装IDE:vscode
过程
1.爬虫
新闻网站选取了中国青年网的新闻版块http://news.youth.cn/
var source_name = "中国青年网";
var domain = 'http://www.news.youth.cn/';
var myEncoding = "GBK";
var seedURL = 'http://www.news.youth.cn/';
观察网页源代码得到
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#publishdate').text()";
var author_format = "$('#author').text()";
var content_format = "$('page_text').text()";
var desc_format = " $('Meta[name=\"Description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source').text()";
var url_reg = /\/(\d{6})\/t(\d{8})_(\d{8}).htm/;
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
引用模块库
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');
老师提供的防止网站屏蔽爬虫的代码
//防止网站屏蔽我们的爬虫
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
后面借鉴了老师的代码
seedget();
function seedget() {
request(seedURL, function(err, res, body) { //读取种子页面
// try {
//用iconv转换编码
var html = myIconv.decode(body, myEncoding);
//console.log(html);
//准备用cheerio解析html
var $ = myCheerio.load(html, { decodeEntities: true });
// } catch (e) { console.log('读种子页面并转码出错:' + e) };
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) { //遍历种子页面里所有的a链接
var myURL = "";
try {
//得到具体新闻url
var href = "";
href = $(e).attr("href");
if (href == undefined) return;
if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; //http://开头的
else if (href.startsWith('//')) myURL = 'http:' + href; ////开头的
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; //其他
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return; //检验是否符合新闻url的正则表达式
//console.log(myURL);
var fetch_url_Sql = 'select url from fetches where url=?';
var fetch_url_Sql_Params = [myURL];
mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
if (vals.length > 0) {
console.log('URL duplicate!')
} else newsGet(myURL); //读取新闻页面
});
});
});
};
function newsGet(myURL) { //读取新闻页面
request(myURL, function(err, res, body) { //读取新闻页面
//try {
var html_news = myIconv.decode(body, myEncoding); //用iconv转换编码
//console.log(html_news);
//准备用cheerio解析html_news
var $ = myCheerio.load(html_news, { decodeEntities: true });
myhtml = html_news;
//} catch (e) { console.log('读新闻页面并转码出错:' + e);};
console.log("转码读取成功:" + myURL);
//动态执行format字符串,构建json对象准备写入文件或数据库
var fetch = {};
fetch.title = "";
fetch.content = "";
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
//fetch.html = myhtml;
fetch.url = myURL;
fetch.source_name = source_name;
fetch.source_encoding = myEncoding; //编码
fetch.crawltime = new Date();
if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format); //没有关键词就用sourcename
else fetch.keywords = eval(keywords_format);
if (title_format == "") fetch.title = ""
else fetch.title = eval(title_format); //标题
if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期
console.log('date: ' + fetch.publish_date);
fetch.publish_date = regExp.exec(fetch.publish_date)[0];
fetch.publish_date = fetch.publish_date.replace('年', '-')
fetch.publish_date = fetch.publish_date.replace('月', '-')
fetch.publish_date = fetch.publish_date.replace('日', '')
fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
if (author_format == "") fetch.author = source_name; //eval(author_format); //作者
else fetch.author = eval(author_format);
if (content_format == "") fetch.content = "";
else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定
if (source_format == "") fetch.source = fetch.source_name;
else fetch.source = eval(source_format).replace("\r\n", ""); //来源
if (desc_format == "") fetch.desc = fetch.title;
else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要
// var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
// "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
// ////存储json
// fs.writeFileSync(filename, JSON.stringify(fetch));
var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
];
//执行sql,数据库中fetch表里的url属性是unique的,不会把重复的url内容写入数据库
mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
if (qerr) {
console.log(qerr);
}
}); //mysql写入
});
}
引入MySQL
var mysql = require("mysql");
var pool = mysql.createPool({
host: '127.0.0.1',
user: 'root',
password: 'root',
database: 'crawl'
});
var query = function(sql, sqlparam, callback) {
pool.getConnection(function(err, conn) {
if (err) {
callback(err, null, null);
} else {
conn.query(sql, sqlparam, function(qerr, vals, fields) {
conn.release(); //释放连接
callback(qerr, vals, fields); //事件驱动回调
});
}
});
};
var query_noparam = function(sql, callback) {
pool.getConnection(function(err, conn) {
if (err) {
callback(err, null, null);
} else {
conn.query(sql, function(qerr, vals, fields) {
conn.release(); //释放连接
callback(qerr, vals, fields); //事件驱动回调
});
}
});
};
exports.query = query;
exports.query_noparam = query_noparam;
建立mysql数据库
CREATE TABLE `fetches` (
`id_fetches` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(200) DEFAULT NULL,
`source_name` varchar(200) DEFAULT NULL,
`source_encoding` varchar(45) DEFAULT NULL,
`title` varchar(200) DEFAULT NULL,
`keywords` varchar(200) DEFAULT NULL,
`author` varchar(200) DEFAULT NULL,
`publish_date` date DEFAULT NULL,
`crawltime` datetime DEFAULT NULL,
`content` longtext,
`createtime` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id_fetches`),
UNIQUE KEY `id_fetches_UNIQUE` (`id_fetches`),
UNIQUE KEY `url_UNIQUE` (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2.建立网站
都借鉴了老师的代码
前段搭建
后端搭建
var express = require('express');
var mysql = require('./mysql.js')
var app = express();
//app.use(express.static('public'));
app.get('/7.03.html', function(req, res) {
res.sendFile(__dirname + "/" + "7.03.html");
})
app.get('/7.04.html', function(req, res) {
res.sendFile(__dirname + "/" + "7.04.html");
})
app.get('/process_get', function(req, res) {
res.writeHead(200, { 'Content-Type': 'text/html;charset=utf-8' }); //设置res编码为utf-8
//sql字符串和参数
var fetchSql = "select url,source_name,title,author,publish_date from fetches where title like '%" +
req.query.title + "%'";
mysql.query(fetchSql, function(err, result, fields) {
console.log(result);
res.end(JSON.stringify(result));
});
})
var server = app.listen(8080, function() {
console.log("访问地址为 http://127.0.0.1:8080/7.03.html")
})
第一次爬虫项目,觉得很难,很有挑战性,希望下一次能有所进步。