第一步:创建一个nodejs项目
mkdir <项目目录名>
cd <项目目录名>
npm init -y
npm install -D eslint
npx eslint --init # 注:会依次让你选择一些配置:使用流行的风格指南(popular style guide)-> Airbnb -> 不使用 React -> 配置文件格式选 YAML,其余保持默认
然后安装项目中所要用到的模块,例如:npm install request request-promise bluebird cheerio node-schedule xxd-log(即下面 index.js 中 require 的第三方模块)
创建 .gitignore文件,里面加入一些不需要传git的文件,git时会忽略掉
将自动生成的 ESLint 配置文件 .eslintrc.yml 的内容替换成如下:
# ESLint configuration (.eslintrc.yml).
# Nested keys must be indented under their parent; list items that carry an
# options object keep the object's keys aligned under the first key.
parserOptions:
  # ES2017 syntax, which is what enables async/await.
  ecmaVersion: 8
env:
  es6: true
  node: true
  mocha: true
globals:
  Service: true
extends: 'eslint:recommended'
rules:
  # --- Layout basics ---
  indent:
    - warn
    - 2
    - SwitchCase: 1
      VariableDeclarator:
        var: 2
        let: 2
        const: 3
  linebreak-style:
    - error
    - unix
  quotes:
    - warn
    - single
  semi:
    - error
    - always
  comma-dangle:
    - warn
    - always-multiline
  # --- Possible errors and best practices ---
  no-dupe-keys: error
  no-dupe-args: error
  use-isnan: error
  valid-typeof: error
  curly: error
  default-case: error
  eqeqeq:
    - error
    - allow-null
  guard-for-in: warn
  no-else-return: warn
  no-fallthrough: error
  no-floating-decimal: warn
  no-multi-str: error
  no-octal: error
  no-octal-escape: error
  no-redeclare: error
  no-with: error
  no-void: error
  radix: error
  strict: error
  no-delete-var: error
  # --- Stylistic rules ---
  array-bracket-spacing:
    - error
    - never
  block-spacing: error
  brace-style:
    - error
    - 1tbs
    - allowSingleLine: true
  comma-spacing: error
  comma-style:
    - error
    - last
  computed-property-spacing: error
  camelcase: warn
  key-spacing:
    - error
    - beforeColon: false
      afterColon: true
  keyword-spacing: error
  max-params:
    - warn
    - 6
  new-cap:
    - error
    - newIsCap: true
      capIsNew: false
      properties: true
  no-array-constructor: error
  no-spaced-func: error
  no-whitespace-before-property: error
  no-trailing-spaces:
    - error
    - skipBlankLines: true
  operator-linebreak: off
  space-before-blocks:
    - error
    - always
  space-before-function-paren:
    - error
    - anonymous: never
      named: never
      asyncArrow: always
  space-in-parens:
    - error
    - never
  space-infix-ops: error
  space-unary-ops: error
  spaced-comment:
    - warn
    - always
  # --- ES6+ rules ---
  arrow-spacing: error
  semi-spacing: error
  constructor-super: error
  generator-star-spacing: warn
  yield-star-spacing: warn
  no-const-assign: error
  no-dupe-class-members: error
  no-this-before-super: error
  no-var: error
  no-unused-vars:
    - warn
    - vars: local
      args: none
  no-use-before-define:
    - error
    - functions: false
      classes: false
      variables: false
  prefer-arrow-callback: warn
  prefer-const: off
  prefer-rest-params: warn
  prefer-spread: warn
  prefer-template: warn
  template-curly-spacing:
    - warn
    - never
  object-curly-spacing:
    - warn
    - always
  no-multi-spaces:
    - warn
    - ignoreEOLComments: true
  valid-jsdoc: off
  no-global-assign: error
  no-unsafe-negation: error
  require-yield: off
  no-warning-comments:
    - warn
    - location: start
      terms:
        - todo
        - fixme
        - xxx
        - hack
        - review
index.js爬虫代码
'use strict';
const rp = require('request-promise');
const log = require('xxd-log');
const bluebird = require('bluebird');
// 网页解析库
const cheerio = require('cheerio');
// 加密库
const crypto = require('crypto');
// 读写文件库
const fs = require('fs');
// 载入通行证json文件
const ticket = require('./ticket.json');
// 日历库,定时执行任务
const schedule = require('node-schedule');
// HTTP client used for downloading pages: keeps cookies in its own jar,
// accepts gzip-compressed responses and sends a desktop-browser user agent.
const pageRequestHeaders = {
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
};
const request = rp.defaults({
  jar: rp.jar(),
  gzip: true,
  headers: pageRequestHeaders,
});
// Pre-configured POST client for the news import endpoint. The base URL and
// the ticket header value both come from ticket.json; adjust these options to
// whatever the target server requires.
const importRequestHeaders = {
  'x-xxd-ticket': ticket.ticket,
};
const post = rp.defaults({
  baseUrl: ticket.baseUrl,
  method: 'post',
  uri: '/news/import',
  headers: importRequestHeaders,
  json: true,
});
/**
 * Compute the SHA-1 digest of the given input.
 *
 * @param {string|Buffer} str - Data to hash.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha1(str) {
  const hasher = crypto.createHash('sha1');
  hasher.update(str);
  return hasher.digest('hex');
}
/**
 * Return the HTML for `url`, served from the on-disk cache when available.
 *
 * The cache file lives in the `tmpRecord` directory next to this script and is
 * named after the SHA-1 of the URL. On a cache miss the page is downloaded and
 * stored before being returned.
 *
 * @param {string} url - Page address to load.
 * @returns {Promise<?string>} The page HTML, or `null` when downloading or
 *   writing the cache file failed (the error message is logged, truncated to
 *   200 characters).
 */
async function getTmpRecord(url) {
  const cacheKey = sha1(url);
  const cachePath = `${__dirname}/tmpRecord/${cacheKey}.txt`;
  // Cache hit: no network access needed.
  if (fs.existsSync(cachePath)) {
    return fs.readFileSync(cachePath, 'utf-8');
  }
  let body;
  try {
    body = await request(url);
    fs.writeFileSync(cachePath, body, 'utf-8');
  } catch (err) {
    log.error(err.message.slice(0, 200));
    return null;
  }
  return body;
}
/**
 * Pause for a while without blocking the event loop.
 *
 * @param {number} [milliseconds=1000] - Delay before the promise settles.
 * @returns {Promise<undefined>} Resolves (with no value) after the delay;
 *   `await` it inside an async function.
 */
function sleep(milliseconds = 1000) {
  return new Promise((resolve) => setTimeout(resolve, milliseconds));
}
/**
 * Turn a downloaded article page into the record that gets posted later.
 *
 * Images, inline styles and class attributes are stripped from the article
 * body, links are replaced by their inner HTML, and the "推荐阅读:" paragraph
 * plus everything after it is dropped.
 *
 * @param {string} contentHtml - Raw HTML of the article page.
 * @param {string} contentUrl - Address the HTML was loaded from.
 * @param {string} section - Display name of the section listing the article.
 * @returns {?Object} `{ section, url, title, time, content }`, or `null` when
 *   the page has no `#article-content` element.
 */
function parseArticle(contentHtml, contentUrl, section) {
  const dollar = cheerio.load(`${contentHtml}`, { decodeEntities: false });
  // Title and time are read before the DOM is modified below.
  const data = {
    section,
    url: contentUrl,
    title: dollar('h1').text(),
    time: dollar('.conter_main_one_nav').children('p').text(),
  };
  log.trace('正在抓取--->', data.title);
  dollar('#article-content img').remove();
  dollar('#article-content a').replaceWith(function() { return dollar(this).html(); });
  dollar('#article-content [style]').removeAttr('style');
  dollar('#article-content [class]').removeAttr('class');
  dollar('p:contains(推荐阅读:)').nextAll().remove();
  dollar('p:contains(推荐阅读:)').remove();
  // html() yields null when the selector matches nothing; report that to the
  // caller instead of throwing on .trim().
  const rawContent = dollar('#article-content').html();
  if (rawContent == null) {
    return null;
  }
  data.content = rawContent.trim().replace(/出国留学网/g, '智课网');
  return data;
}

/**
 * Delete every file in `dir` whose name is not contained in `liveNames`.
 *
 * @param {string} dir - Directory to clean.
 * @param {Set<string>} liveNames - File names that must be kept.
 */
function removeStaleRecords(dir, liveNames) {
  fs.readdirSync(dir)
    .filter((name) => !liveNames.has(name))
    .forEach((name) => {
      fs.unlinkSync(`${dir}/${name}`);
    });
}

/**
 * Crawl the configured liuxue86.com sections, parse every listed article,
 * prune cache and success records for articles that are no longer listed, and
 * post each article that has no success record yet to the import endpoint.
 *
 * @returns {Promise<void>} Resolves after every pending article has been
 *   attempted. Failures while crawling a list page or posting an article are
 *   logged and skipped; filesystem errors while pruning propagate to the caller.
 */
async function main() {
  const pagesPerSection = 10;
  const sections = [
    {
      sectionName: '申请条件',
      section: 'tiaojian',
    },
    {
      sectionName: '国家优势',
      section: 'yuanxiaozhuanye/guojia',
    },
    {
      sectionName: '教育体系',
      section: 'yuanxiaozhuanye/jiaoyutixi',
    },
    {
      sectionName: '专业咨询',
      section: 'yuanxiaozhuanye/zhuanyezixun',
    },
    {
      sectionName: '热门专业',
      section: 'yuanxiaozhuanye/remenzhuanye',
    },
  ];
  const dataList = [];
  // Iterate over the section list itself so adding a section needs no second edit.
  for (const { sectionName, section } of sections) {
    for (let page = 0; page < pagesPerSection; page += 1) {
      try {
        // The first list page is the section root; later ones are 2.html, 3.html, ...
        const url = `https://www.liuxue86.com/${section}/${(page === 0) ? '' : `${page + 1}.html`}`;
        log.trace('正在抓取-->', url);
        const html = await request(url);
        const $ = cheerio.load(html, { decodeEntities: false });
        // Process the articles of one list page with at most 4 in flight.
        await bluebird.map($('.news-title').get(), async (element) => {
          const contentUrl = $('a', element).attr('href');
          if (!contentUrl) {
            // Entry without a link: nothing to download.
            return;
          }
          const contentHtml = await getTmpRecord(contentUrl);
          if (contentHtml == null) {
            // Download failed (already logged); skip this article.
            return;
          }
          const data = parseArticle(contentHtml, contentUrl, sectionName);
          if (data === null) {
            log.error(`no #article-content element found in ${contentUrl}`);
            return;
          }
          dataList.push(data);
        }, { concurrency: 4 });
      } catch (err) {
        log.error(err.stack);
      }
    }
  }
  log.trace('全部抓取完毕');
  // Record file names are "<sha1 of url>.txt"; a Set gives O(1) membership checks.
  const dataListUrlSet = new Set(dataList.map((data) => `${sha1(data.url)}.txt`));
  // Drop cached pages and success markers of articles that were not seen in this crawl.
  removeStaleRecords(`${__dirname}/tmpRecord`, dataListUrlSet);
  removeStaleRecords(`${__dirname}/successRecord`, dataListUrlSet);
  // Only articles without a success marker still need to be sent.
  const sendList = dataList.filter((x) => !fs.existsSync(`${__dirname}/successRecord/${sha1(x.url)}.txt`));
  // Send strictly one at a time and wait for completion, so main() only
  // resolves once the whole run has finished.
  for (const item of sendList) {
    try {
      // Throttle: wait half a second before each POST.
      await sleep(500);
      const res = await post({
        formData: {
          title: item.title,
          content: item.content,
          source: `liuxue86-院校专业-${item.section}`,
        },
      });
      // A non-zero code is an error unless the server merely reports a duplicate.
      if (res.code !== 0 && res.msg !== '该资讯已存在') {
        throw new Error(res.msg);
      }
      // Only the marker file's existence matters. Write an empty string:
      // Node.js 14+ rejects null as file data with a TypeError.
      fs.writeFileSync(`${__dirname}/successRecord/${sha1(item.url)}.txt`, '');
      log.trace(item.title, '添加成功');
    } catch (err) {
      log.error(err.message);
    }
  }
}
// On a fresh deployment the two record directories next to this script do not
// exist yet, so create whichever one is missing.
for (const dirName of ['tmpRecord', 'successRecord']) {
  const dirPath = `${__dirname}/${dirName}`;
  if (!fs.existsSync(dirPath)) {
    fs.mkdirSync(dirPath);
  }
}
// Schedule the crawl. The six cron fields are: second, minute, hour,
// day of month, month, day of week - so this fires every day at 04:00:00.
// See the node-schedule documentation for the full syntax.
const runCrawl = () => {
  main().catch((err) => {
    log.fatal(err.stack);
  });
};
schedule.scheduleJob('0 0 4 * * *', runCrawl);
// main().catch((err) => {
// log.fatal(err.stack);
// });