满分100分,作为平时成绩,占最终总评成绩的40%。
一、爬虫部分 30分
1、完成最少一个目标网站(网站主题不限,不允许直接使用示例中的中国新闻网)的分析和爬虫设计。
2、爬取不少于100条数据(每条数据包括最少3个字段,标题、内容和时间),并存储在数据库中。
3、需提交源代码,完成多个网站的爬虫酌情加分。
二、搜索网站部分 30分
1、完成对数据库中爬取数据内容或标题的搜索功能,搜索结果以表格形式展示在前端页面中。
2、完成对搜索内容的时间热度分析,比如搜索“新冠”,可以展示爬取数据内容中每一天包含“新冠”的条数,具体展示形式不限,可以用文字或表格展示,也可以用图表展示。
3、需提交源代码,网站页面设计简洁美观酌情加分。
三、报告 40分
1、详细描述对目标网站的分析过程。
2、详细描述爬虫整体结构,使用的工具包,以及数据库设计。
3、详细描述搜索网站前后端的设计。
4、图文展示实现的效果。
先下载好node.js和vscode并安装一些所需的插件。
对示例代码crawler2.js进行具体分析,弄明白爬虫的原理。
// Module dependencies: file system, HTTP requests, HTML parsing,
// character-encoding conversion, date formatting, and the project's MySQL wrapper.
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils'); // adds Date.prototype.toFormat
var mysql = require('./mysql.js');
// Crawl target: chinanews.com, decoded as UTF-8; the seed page is the site root.
var source_name = "中国新闻网";
var domain = 'http://www.chinanews.com/';
var myEncoding = "utf-8";
var seedURL = 'http://www.chinanews.com/';
// 确定目标网站为中国新闻网
// Selector strings, evaluated later with eval() against the loaded cheerio "$".
var seedURL_format = "$('a')"; // every anchor on the seed page
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
// Article URLs look like /YYYY/MM-DD/NNNNNNN.shtml
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;
// Matches dates such as 2020-01-02, 2020/1/2, 20.1.2 or 2020年1月2日
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
其中最后两行代码为正则表达式,它描述了一种字符串匹配的模式,可以用来检查一个串是否含有某种子串、将匹配的子串替换或者从某个串中取出符合某个条件的子串等。
// Browser-like User-Agent so the site is less likely to block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
// Asynchronously fetch one URL with the request module.
// encoding:null keeps the body as a raw Buffer so iconv-lite can decode it later.
function request(url, callback) {
    myRequest({
        url: url,
        encoding: null,
        headers: headers,
        timeout: 10000
    }, callback);
}
seedget();

// Fetch the seed page, collect candidate article links, and crawl each link
// that matches url_reg and is not already stored in the database.
function seedget() {
    request(seedURL, function(err, res, body) { // 读取种子页面
        if (err) { // guard: without this, myIconv.decode(undefined) would throw
            console.log('读取种子页面出错:' + err);
            return;
        }
        // Decode the raw bytes with iconv, then parse with cheerio.
        var html = myIconv.decode(body, myEncoding);
        var $ = myCheerio.load(html, { decodeEntities: true });
        var seedurl_news;
        try {
            seedurl_news = eval(seedURL_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return;
        }
        seedurl_news.each(function(i, e) { // 遍历种子页面里所有的a链接
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (href == undefined) return;
                // Normalize the href into an absolute URL.
                if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; // already absolute
                else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative ("//...")
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // relative path
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
            if (!url_reg.test(myURL)) return; // keep only article-shaped URLs
            // Skip URLs that were already crawled (stored in the fetches table).
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (qerr) { // guard: on a query error vals is undefined
                    console.log(qerr);
                } else if (vals.length > 0) {
                    console.log('URL duplicate!');
                } else newsGet(myURL); // 读取新闻页面
            });
        });
    });
}
// Fetch one article page, extract each field by evaluating the *_format
// selector strings against cheerio's "$", and insert the record into fetches.
// Fixes: a stray "}" in the original left the braces unbalanced, and
// regExp.exec() was indexed without a null check.
function newsGet(myURL) {
    request(myURL, function(err, res, body) { // 读取新闻页面
        if (err) {
            console.log('读新闻页面出错:' + err);
            return;
        }
        var html_news = myIconv.decode(body, myEncoding); // 用iconv转换编码
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        console.log("转码读取成功:" + myURL);

        // Build the record to be written to the database.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD"); // fallback: today
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // 编码
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name; // 没有关键词就用sourcename
        else fetch.keywords = eval(keywords_format);
        if (title_format == "") fetch.title = "";
        else fetch.title = eval(title_format); // 标题
        if (date_format != "") fetch.publish_date = eval(date_format); // 刊登日期
        console.log('date: ' + fetch.publish_date);

        // Normalize the date to YYYY-MM-DD; skip the page when no date matches
        // (the original indexed exec()'s result unconditionally and could throw).
        var dateMatch = regExp.exec(fetch.publish_date);
        if (dateMatch == null) return;
        fetch.publish_date = dateMatch[0]
            .replace('年', '-')
            .replace('月', '-')
            .replace('日', '');
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

        if (author_format == "") fetch.author = source_name; // 作者
        else fetch.author = eval(author_format);
        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); // 内容, strip trailing author line
        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); // 来源
        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", ""); // 摘要

        // The fetches table's url column is unique, so duplicates are rejected.
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}
可以看出,代码对写入数据库的时间变量的格式有所规定,以"-"的形式来取代网页中可能存在的”年“”月“”日“,使代码整体更具有规范性和可读性。
先按F12查看网页源代码,确定网页的标题、来源、keywords等class属性,再利用jQuery选择器,对class和id等分别进行选择。
// Module dependencies (same toolchain as the first crawler).
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils'); // adds Date.prototype.toFormat
var mysql = require('./mysql.js');
// Crawl target: NetEase News (news.163.com), decoded as UTF-8.
var source_name = "网易新闻";
var domain = 'https://news.163.com/';
var myEncoding = "utf-8";
var seedURL = 'https://news.163.com/';
// Selector strings, evaluated later with eval() against the loaded cheerio "$".
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('.post_info').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('.post_info').text()";
// Article URLs contain the word "article".
var url_reg = /article/;
// Matches dates like 01-02-2020 / 1.2.2020 (separator -, / or ., year last)
var regExp = /(\d{1,2}(\-|\/|\.)\d{1,2}\2\d{4})/
// Browser-like User-Agent so the site does not block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
// Asynchronous fetch of one URL via the request module; the body is
// delivered as a raw Buffer (encoding:null) for later iconv decoding.
function request(url, callback) {
    var options = {
        url: url,
        headers: headers,
        encoding: null,
        timeout: 10000
    };
    myRequest(options, callback);
}
seedget();

// Fetch the NetEase seed page, collect candidate article links, and crawl
// each link that matches url_reg and is not already in the database.
function seedget() {
    request(seedURL, function(err, res, body) { // 读取种子页面
        if (err) { // guard: without this, myIconv.decode(undefined) would throw
            console.log('读取种子页面出错:' + err);
            return;
        }
        // Decode the raw bytes with iconv, then parse with cheerio.
        var html = myIconv.decode(body, myEncoding);
        var $ = myCheerio.load(html, { decodeEntities: true });
        var seedurl_news;
        try {
            seedurl_news = eval(seedURL_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return;
        }
        seedurl_news.each(function(i, e) { // 遍历种子页面里所有的a链接
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (href == undefined) return;
                // Normalize the href into an absolute URL.
                if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; // already absolute
                else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative ("//...")
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // relative path
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
            if (!url_reg.test(myURL)) return; // keep only article-shaped URLs
            // Skip URLs that were already crawled (stored in the fetches table).
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (qerr) { // guard: on a query error vals is undefined
                    console.log(qerr);
                } else if (vals.length > 0) {
                    console.log('URL duplicate!');
                } else newsGet(myURL); // 读取新闻页面
            });
        });
    });
}
// Fetch one NetEase article page, extract fields via the *_format selector
// strings, and insert the record into fetches.
// Fixes: regExp.exec() was indexed without a null check, and the original
// referenced fetch.author which is never assigned in this crawler.
function newsGet(myURL) {
    request(myURL, function(err, res, body) { // 读取新闻页面
        if (err) {
            console.log('读新闻页面出错:' + err);
            return;
        }
        var html_news = myIconv.decode(body, myEncoding); // 用iconv转换编码
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        console.log("转码读取成功:" + myURL);

        // Build the record to be written to the database.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD"); // fallback: today
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // 编码
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name; // 没有关键词就用sourcename
        else fetch.keywords = eval(keywords_format);
        if (title_format == "") fetch.title = "";
        else fetch.title = eval(title_format); // 标题
        if (date_format != "") fetch.publish_date = eval(date_format); // 刊登日期
        if (fetch.publish_date == undefined || fetch.publish_date == "") return;
        console.log('date: ' + fetch.publish_date);

        // Normalize the date; skip the page when no date pattern matches
        // (the original indexed exec()'s result unconditionally and could throw).
        var dateMatch = regExp.exec(fetch.publish_date);
        if (dateMatch == null) return;
        fetch.publish_date = dateMatch[0]
            .replace('年', '-')
            .replace('月', '-')
            .replace('日', '');
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

        // This crawler defines no author_format, so no author is stripped
        // from the content (the original concatenated undefined here).
        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format); // 内容
        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); // 来源
        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", ""); // 摘要

        // 执行sql; the fetches table's url column is unique, so duplicate
        // URLs are rejected by the database.
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}
有了先前对示例和网页的详细分析,自己写后端代码时轻松了很多。
var express = require('express');
var router = express.Router();
var mysql = require('../mysql.js');

/* GET home page. */
router.get('/', function(req, res, next) {
    res.render('index', { title: 'Express' });
});

// GET /process_get?title=... : return all crawled records whose title
// contains the query string, as a JSON array.
router.get('/process_get', function(request, response) {
    // Parameterized query — the original concatenated request.query.title
    // straight into the SQL string, which is an SQL-injection hole.
    var fetchSql = "select url,source_name,source_encoding,title,keywords," +
        "publish_date,crawltime,content from fetches where title like ?";
    var fetchSql_Params = ['%' + request.query.title + '%'];
    mysql.query(fetchSql, fetchSql_Params, function(err, result, fields) {
        if (err) { // report query failures instead of serializing undefined
            console.log(err);
            response.writeHead(500, { "Content-Type": "application/json" });
            response.end(JSON.stringify([]));
            return;
        }
        response.writeHead(200, {
            "Content-Type": "application/json"
        });
        response.write(JSON.stringify(result));
        response.end();
    });
});

module.exports = router;
<!-- Search page: a title query box plus a results table filled via AJAX.
     Reconstructed: the pasted copy had lost the "</" of every closing tag
     and the <tr>/<td> markup inside the table-building strings. -->
<html>

<head>
    <script src="https://cdn.bootcss.com/jquery/3.4.1/jquery.js"></script>
</head>

<body>
    <form>
        <br> 标题:<input type="text" name="title_text">
        <input class="form-submit" type="button" value="查询">
    </form>
    <div class="cardLayout" style="margin: 10px 0px">
        <table width="100%" id="record2"></table>
    </div>
    <script>
        $(document).ready(function() {
            $("input:button").click(function() {
                $.get('/process_get?title=' + $("input:text").val(), function(data) {
                    $("#record2").empty();
                    // Header row listing the returned columns.
                    $("#record2").append('<tr class="cardLayout"><td>url</td><td>source_name</td>' +
                        '<td>source_encoding</td><td>title</td><td>keywords</td><td>publish_date</td><td>crawltime</td><td>content</td></tr>');
                    // One row per record, one cell per field value.
                    for (let list of data) {
                        let table = '<tr class="cardLayout">';
                        Object.values(list).forEach(element => {
                            table += ('<td>' + element + '</td>');
                        });
                        $("#record2").append(table + '</tr>');
                    }
                });
            });
        });
    </script>
</body>

</html>
利用HTML的知识制作前端网页。
历时一个月,总算大体上弄明白了该如何用爬虫爬取网页信息。当自己的爬虫成功运行的那一刻,心里还是非常激动的。而对于解析数据部分的代码,尚停留在只能看懂的阶段,还并未能完全掌握如何自己设计这样的解析数据的函数,未来仍需进一步努力。通过这次爬虫初尝试,也是学会了很多,掌握了部分JavaScript的语法、正则表达式的规则以及jQuery选择器的使用等等,也算小有所得。