用node写的爬虫,抓取王者荣耀英雄资料库,只是为了学习,侵删!
主要包括(基本上相关的都抓取):
目前还没有搞懂如何抓取动态加载的内容;我觉得在抓取之前,需要先弄清楚网页的 js 逻辑,emmmm....
这里我已经下载所需要的json文件到项目里面,如何下载:node下载文件
数据库的配置:
// Connection settings exported for mysql.createConnection().
const mysqlSettings = {
    host: 'localhost',
    user: 'root',
    password: 'root',
    database: 'glory_of_kings'
};
module.exports = mysqlSettings;
也可以直接在代码里写配置,没那么麻烦;这里单独放到一个配置文件中,只是为了学习新的方式。
// mysql的包 很多安利easymysql的,暂时还没改
let mysql = require('mysql');
// 文件操作
let fs = require('fs');
// mysql配置
let mysqlConfig = require('./mysql.config');
// 爬虫所需要的库
let http = require('http');
let cheerio = require('cheerio');
let iconv = require('iconv-lite');
// Create the single shared connection; it has to be closed once all data work is done.
let connection = mysql.createConnection(mysqlConfig);
// Connect to MySQL; a failed connection is logged and re-thrown.
connection.connect((connectError) => {
    if (!connectError) {
        return;
    }
    console.log('数据库连接失败');
    throw connectError;
});
// Directory that holds the downloaded JSON files.
const BASE_PATH = '../assets/jsons/';
// Encoding used when reading those files.
const FILE_TYPE = 'utf-8';
// Base URL of a hero's detail page.
const HERO_DETAIL_PATH = 'http://pvp.qq.com/web201605/herodetail/';
// Base URL of hero images.
const HERO_IMG_PATH = 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/';
// Base URL of the large hero skin images.
const HERO_BIGSKIN_PATH = 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/';
// Base URL of equipment images.
const EQUIP_IMG_PATH = 'http://game.gtimg.cn/images/yxzj/img201606/itemimg/';
// Base URL of summoner spell images.
const SUMMONER_PATH = 'http://game.gtimg.cn/images/yxzj/img201606/summoner/';
// Base URL of inscription images.
const INSCRIPTION_PATH = 'http://game.gtimg.cn/images/yxzj/img201606/mingwen/';
// Insert statement for summoner spells.
// `rank` is a reserved word since MySQL 8.0.2, so the column name is backtick-quoted;
// the quoted form is also valid on older servers.
const SUMMONER_SQL = 'insert into summoner(summoner_id,name,`rank`,cd,description,img_url,big_img_url) values(?,?,?,?,?,?,?)';
// Insert statement for inscriptions.
const INSCRIPTION_INSERT = 'insert into inscription(inscription_id,type,grade,name,description,img_url) values(?,?,?,?,?,?)';
// Insert statement for heroes.
const HERO_INSERT = 'insert into hero(hero_id,name,pay_type,new_type,hero_type,hero_type2,skin_name,img_url,live,attack,skill,difficulty) values (?,?,?,?,?,?,?,?,?,?,?,?)';
// Insert statement for hero skins.
const HERO_SKIN_INSERT = 'insert into skin(skin_id,hero_id,skin_name,small_img_url,big_img_url) values (?,?,?,?,?)';
// Insert statement for equipment.
const EQUIP_INSERT = 'insert into equip(equip_id,name,type,sale_price,total_price,des1,des2,img_url) values (?,?,?,?,?,?,?,?)';
// Insert statement for hero stories.
const STORY_INSERT = 'insert into story(hero_id,story) values (?,?)';
// Insert statement for hero skills.
const SKILL_INSERT = 'insert into skill(skill_id,hero_id,name,cool,waste,description,tips,img_url) values (?,?,?,?,?,?,?,?)';
// Insert statement for the recommended inscriptions of a hero.
const HERO_INSCRIPTION_INSERT = 'insert into hero_inscription(hero_id,inscription_ids,tips) values (?,?,?)';
// Insert statement for the recommended equipment of a hero.
const HERO_EQUIP_INSERT = 'insert into hero_equip(hero_id,equip_ids1,tips1,equip_ids2,tips2) values (?,?,?,?,?)';
// Insert statement for hero relationships.
// Every item on the page lists two heroes, so one statement inserts two rows.
const LINKS_INSERT = 'insert into links(hero_id,hero_id1,type,tips) values (?,?,?,?),(?,?,?,?)';
// Insert statement for the skill / summoner spell recommendation.
const SKILL_SUMMONER_INSERT = 'insert into skill_summoner(ename,skill_id1,skill_id2,summoner_id1,summoner_id2) values (?,?,?,?,?)';
/**
 * Import summoner spells: read summoner.json and insert one row per entry
 * through SUMMONER_SQL.
 */
function updateSummoner() {
    fs.readFile(`${BASE_PATH}summoner.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log(`文件读取失败`);
            return;
        }
        const spells = JSON.parse(content);
        if (spells.length === 0) {
            console.log(`暂无数据`);
            return;
        }
        spells.forEach((spell) => {
            const spellId = spell.summoner_id;
            const description = spell.summoner_description;
            const values = [
                spellId,
                spell.summoner_name,
                // The number is parsed after dropping the first three characters of summoner_rank.
                parseInt(spell.summoner_rank.substring(3)),
                // The cooldown is the leading number of the description ...
                parseInt(description),
                // ... and the stored description is the part between the first and second ':'.
                description.split(':')[1],
                `${SUMMONER_PATH}${spellId}.jpg`,
                `${SUMMONER_PATH}${spellId}-big.jpg`
            ];
            connection.query(SUMMONER_SQL, values, (insertError) => {
                if (insertError) {
                    console.log(insertError);
                    throw insertError;
                }
                console.log(`技能${spell.summoner_name}插入成功!`);
            });
        });
    });
}
/**
 * Import inscriptions: read ming.json and insert one row per entry
 * through INSCRIPTION_INSERT.
 */
function updateInscription() {
    // Turns one JSON entry into the positional values of the insert statement.
    const toValues = (entry) => [
        entry.ming_id,
        entry.ming_type,
        entry.ming_grade,
        entry.ming_name,
        entry.ming_des,
        `${INSCRIPTION_PATH}${entry.ming_id}.png`
    ];

    const onFileRead = (readError, content) => {
        if (readError) {
            console.log(`文件读取失败`);
            return;
        }
        const inscriptions = JSON.parse(content);
        if (inscriptions.length === 0) {
            console.log(`暂无数据`);
            return;
        }
        inscriptions.forEach((entry) => {
            connection.query(INSCRIPTION_INSERT, toValues(entry), (insertError) => {
                if (insertError) {
                    console.log(insertError);
                    throw insertError;
                }
                console.log(`铭文${entry.ming_name}插入成功!`);
            });
        });
    };

    fs.readFile(`${BASE_PATH}ming.json`, FILE_TYPE, onFileRead);
}
/**
 * Import equipment: read item.json and insert one row per item
 * through EQUIP_INSERT.
 */
function updateEquipments() {
    fs.readFile(`${BASE_PATH}item.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log(`文件读取失败`);
            return;
        }
        const items = JSON.parse(content);
        if (items.length === 0) {
            console.log(`暂无装备数据`);
            return;
        }
        items.forEach((item) => {
            const {
                item_id: itemId,
                item_name: itemName,
                item_type: itemType,
                price,
                total_price: totalPrice,
                des1,
                des2
            } = item;
            // Value order follows the column list of EQUIP_INSERT.
            const values = [
                itemId,
                itemName,
                itemType,
                price,
                totalPrice,
                des1,
                des2,
                `${EQUIP_IMG_PATH}${itemId}.jpg`
            ];
            connection.query(EQUIP_INSERT, values, (insertError) => {
                if (insertError) {
                    console.log(insertError);
                    throw insertError;
                }
                console.log(`装备:${itemName} 插入成功!`);
            });
        });
    });
}
/**
 * Import heroes: for every entry of heros.json, download the hero's detail
 * page, read the four `.ibar` elements and insert one row through HERO_INSERT.
 */
function updateHero() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log(`文件读取失败`);
            return;
        }
        const heroes = JSON.parse(content);
        if (heroes.length === 0) {
            console.log(`暂无数据`);
            return;
        }
        heroes.forEach((hero) => {
            const pageUrl = `${HERO_DETAIL_PATH}${hero.ename}.shtml`;
            const request = http.get(pageUrl, (response) => {
                const received = [];
                response.on('data', (chunk) => {
                    received.push(chunk);
                });
                response.on('end', () => {
                    // Decode the page as GBK so the text does not come out garbled.
                    const page = iconv.decode(Buffer.concat(received), 'gbk');
                    const $ = cheerio.load(page, {decodeEntities: false});
                    const bars = $('.ibar');
                    // Parses the number that follows the first six characters of a bar's
                    // inline style (presumably a "width:NN%" declaration; confirm against the page).
                    const readBar = (position) => parseInt(bars[position].attribs.style.substring(6));
                    const values = [
                        hero.ename,
                        hero.cname,
                        hero.pay_type,
                        hero.new_type,
                        hero.hero_type,
                        hero.hero_type2,
                        hero.skin_name,
                        `${HERO_IMG_PATH}${hero.ename}/${hero.ename}.jpg`,
                        // Bars 0-3 fill the live, attack, skill and difficulty columns of HERO_INSERT.
                        readBar(0),
                        readBar(1),
                        readBar(2),
                        readBar(3)
                    ];
                    connection.query(HERO_INSERT, values, (insertError) => {
                        if (insertError) {
                            console.log(insertError);
                            throw insertError;
                        }
                        console.log(`英雄${hero.ename}插入成功!`);
                    });
                });
            });
            request.on('error', () => {
                console.log(`获取页面数据出错`);
            });
        });
    });
}
/**
 * Import hero skins: read heros.json, split every hero's `skin_name` on '|'
 * and insert one row per skin through HERO_SKIN_INSERT.
 *
 * The skin id is the hero's ename followed by the 1-based position of the
 * skin, and the same position selects the small and big image file names.
 * A failed insert is logged and re-thrown.
 */
function updateSkin() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, function (err, data) {
        if (err) {
            console.log(`文件读取失败`);
            return;
        }
        // Parse once instead of once for the emptiness check and once for the loop.
        const heroes = JSON.parse(data);
        if (heroes.length === 0) {
            console.log(`暂无数据`);
            return;
        }
        heroes.forEach(function (hero) {
            hero.skin_name.split('|').forEach(function (skinName, position) {
                const skinNumber = position + 1;
                const params = [
                    `${hero.ename}${skinNumber}`,
                    hero.ename,
                    skinName,
                    `${HERO_IMG_PATH}${hero.ename}/${hero.ename}-smallskin-${skinNumber}.jpg`,
                    `${HERO_BIGSKIN_PATH}${hero.ename}/${hero.ename}-bigskin-${skinNumber}.jpg`
                ];
                connection.query(HERO_SKIN_INSERT, params, function (error) {
                    if (error) {
                        console.log(error);
                        throw error;
                    }
                    // Report success only after MySQL has confirmed the insert; logging
                    // right after queueing the query would claim success for failed rows too.
                    console.log(`皮肤${skinName}插入成功!`);
                });
            });
        });
    });
}
/**
 * Import hero stories: for every entry of heros.json, download the hero's
 * detail page and store the HTML of `.pop-story .pop-bd p` through STORY_INSERT.
 */
function updateStory() {
    // Inserts the story found on an already parsed page.
    const storeStory = (hero, $) => {
        const values = [
            hero.ename,
            $('.pop-story .pop-bd p').html()
        ];
        connection.query(STORY_INSERT, values, (insertError) => {
            if (insertError) {
                console.log(insertError);
                throw insertError;
            }
            console.log(`${hero.ename}的故事插入成功`);
        });
    };

    // Downloads one hero page, decodes it and hands it to storeStory.
    const crawlHero = (hero) => {
        const request = http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, (response) => {
            const parts = [];
            response.on('data', (part) => {
                parts.push(part);
            });
            response.on('end', () => {
                // Decode the page as GBK so the text does not come out garbled.
                const markup = iconv.decode(Buffer.concat(parts), 'gbk');
                storeStory(hero, cheerio.load(markup, {decodeEntities: false}));
            });
        });
        request.on('error', () => {
            console.log(`获取页面数据出错`);
        });
    };

    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log(`文件读取失败`);
            return;
        }
        const heroes = JSON.parse(content);
        if (heroes.length === 0) {
            console.log(`暂无英雄故事`);
            return;
        }
        heroes.forEach((hero) => {
            crawlHero(hero);
        });
    });
}
/**
 * Import hero skills: for every entry of heros.json, download the hero's
 * detail page and insert one row per named `.skill-show .show-list` element
 * through SKILL_INSERT.
 *
 * The skill id is the hero's ename followed by the element's index on the
 * page; elements without a skill name are skipped. A failed insert is logged
 * and re-thrown.
 */
function updateSkill() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, function (err, data) {
        if (err) {
            console.log(`文件读取失败`);
            return;
        }
        // Parse once instead of once for the emptiness check and once for the loop.
        const heroes = JSON.parse(data);
        if (heroes.length === 0) {
            console.log(`暂无数据`);
            return;
        }
        heroes.forEach(function (hero) {
            http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, function (res) {
                const chunks = [];
                res.on('data', function (chunk) {
                    chunks.push(chunk);
                });
                res.on('end', function () {
                    // Decode the page as GBK so the text does not come out garbled.
                    const html = iconv.decode(Buffer.concat(chunks), 'gbk');
                    const $ = cheerio.load(html, {decodeEntities: false});
                    $('.skill-show .show-list').each(function (ind, el) {
                        const skill = $(el);
                        const name = skill.find('.skill-name b').html();
                        if (!name) {
                            // Returning undefined continues with the next element.
                            return;
                        }
                        const spans = skill.find('.skill-name span');
                        const params = [
                            `${hero.ename}${ind}`,
                            hero.ename,
                            name,
                            // The numbers follow a 4- and a 3-character label respectively;
                            // an explicit radix keeps the parse decimal.
                            parseInt($(spans[0]).html().substring(4), 10),
                            parseInt($(spans[1]).html().substring(3), 10),
                            skill.find('.skill-desc').html(),
                            skill.find('.skill-tips').html(),
                            `${HERO_IMG_PATH}${hero.ename}/${hero.ename}${ind}0.png`
                        ];
                        connection.query(SKILL_INSERT, params, function (error) {
                            if (error) {
                                console.log(error);
                                throw error;
                            }
                            // Report success only after MySQL has confirmed the insert.
                            console.log(`技能${name}插入成功!`);
                        });
                    });
                });
            }).on('error', function (error) {
                // Include the error itself so a failed download can be diagnosed.
                console.log(`获取页面数据出错`, error);
            });
        });
    });
}
/**
 * Import the recommended inscriptions of every hero: download the hero's
 * detail page and insert one row through HERO_INSCRIPTION_INSERT.
 */
function updateHeroInscription() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log("文件读取失败");
            return;
        }
        const heroes = JSON.parse(content);
        if (heroes.length === 0) {
            console.log('暂无数据');
            return;
        }
        heroes.forEach((hero, position) => {
            const request = http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, (response) => {
                const received = [];
                response.on('data', (chunk) => {
                    received.push(chunk);
                });
                response.on('end', () => {
                    // Decode the page as GBK so the text does not come out garbled.
                    const page = iconv.decode(Buffer.concat(received), 'gbk');
                    const $ = cheerio.load(page, {decodeEntities: false});
                    // The '|'-separated ids of the data-ming attribute become a comma-separated list.
                    const inscriptionIds = $('.sugg-info ul').attr('data-ming').split('|').join(',');
                    // substring coerces the string '5' to the number 5, so the first five
                    // characters of the tip text are dropped (presumably a label prefix).
                    const tips = $('.sugg-tips').text().substring('5');
                    const values = [hero.ename, inscriptionIds, tips];
                    connection.query(HERO_INSCRIPTION_INSERT, values, (insertError) => {
                        if (insertError) {
                            console.log(insertError);
                            throw insertError;
                        }
                        console.log(`第${position}条数据插入成功!`);
                    });
                });
            });
            request.on('error', () => {
                console.log("获取页面数据出错");
            });
        });
    });
}
/**
 * Import the recommended equipment of every hero: download the hero's detail
 * page, read the first two equipment builds and insert one row through
 * HERO_EQUIP_INSERT.
 */
function updateHeroEquip() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log("文件读取失败");
            return;
        }
        const heroes = JSON.parse(content);
        if (heroes.length === 0) {
            console.log('暂无数据');
            return;
        }
        heroes.forEach((hero, position) => {
            const request = http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, (response) => {
                const received = [];
                response.on('data', (chunk) => {
                    received.push(chunk);
                });
                response.on('end', () => {
                    // Decode the page as GBK so the text does not come out garbled.
                    const page = iconv.decode(Buffer.concat(received), 'gbk');
                    const $ = cheerio.load(page, {decodeEntities: false});
                    // Returns [comma-separated item ids, tip text] for one build slot.
                    // substring coerces the string '5' to the number 5, so the first five
                    // characters of the tip text are dropped (presumably a label prefix).
                    const readBuild = (slot) => [
                        $($('.equip-info ul')[slot]).attr('data-item').split('|').join(','),
                        $($('.equip-tips')[slot]).text().substring('5')
                    ];
                    const values = [hero.ename, ...readBuild(0), ...readBuild(1)];
                    connection.query(HERO_EQUIP_INSERT, values, (insertError) => {
                        if (insertError) {
                            console.log(insertError);
                            throw insertError;
                        }
                        console.log(`第${position}条数据插入成功!`);
                    });
                });
            });
            request.on('error', () => {
                console.log("获取页面数据出错");
            });
        });
    });
}
/**
 * Import hero relationships: for every entry of heros.json, download the
 * hero's detail page and, for each `.hero-info` element, insert two rows
 * (one per related hero) through LINKS_INSERT.
 *
 * The index of the `.hero-info` element is stored as the relationship type.
 * The related hero id is the first three characters of the link's href.
 * A failed insert is logged and re-thrown.
 */
function updateLinks() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, function (err, data) {
        if (err) {
            console.log("文件读取失败");
            return;
        }
        // Parse once instead of once for the emptiness check and once for the loop.
        const heroes = JSON.parse(data);
        if (heroes.length === 0) {
            console.log('暂无数据');
            return;
        }
        heroes.forEach(function (hero, index) {
            http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, function (res) {
                const chunks = [];
                res.on('data', function (chunk) {
                    chunks.push(chunk);
                });
                res.on('end', function () {
                    // Decode the page as GBK so the text does not come out garbled.
                    const html = iconv.decode(Buffer.concat(chunks), 'gbk');
                    const $ = cheerio.load(html, {decodeEntities: false});
                    const relations = $('.hero-info');
                    // Number of inserts still unconfirmed for this hero; the success message
                    // is printed once the last one completes, not when the queries are queued.
                    let remaining = relations.length;
                    relations.each(function (ind, el) {
                        const links = $(el).find('ul a');
                        const tips = $(el).find('p');
                        const params = [
                            hero.ename,
                            $(links[0]).attr('href').substring(0, 3),
                            ind,
                            $(tips[0]).html(),
                            hero.ename,
                            $(links[1]).attr('href').substring(0, 3),
                            ind,
                            $(tips[1]).html()
                        ];
                        connection.query(LINKS_INSERT, params, function (error) {
                            if (error) {
                                console.log(error);
                                throw error;
                            }
                            remaining -= 1;
                            if (remaining === 0) {
                                console.log(`第${index}条数据插入成功!`);
                            }
                        });
                    });
                });
            }).on('error', function (error) {
                // Include the error itself so a failed download can be diagnosed.
                console.log("获取页面数据出错", error);
            });
        });
    });
}
/**
 * Import the skill and summoner spell recommendation of every hero: download
 * the hero's detail page and insert one row through SKILL_SUMMONER_INSERT.
 */
function updateSkillSummoner() {
    fs.readFile(`${BASE_PATH}heros.json`, FILE_TYPE, (readError, content) => {
        if (readError) {
            console.log("文件读取失败");
            return;
        }
        const heroes = JSON.parse(content);
        if (heroes.length === 0) {
            console.log('暂无数据');
            return;
        }
        heroes.forEach((hero, position) => {
            const request = http.get(`${HERO_DETAIL_PATH}${hero.ename}.shtml`, (response) => {
                const received = [];
                response.on('data', (chunk) => {
                    received.push(chunk);
                });
                response.on('end', () => {
                    // Decode the page as GBK so the text does not come out garbled.
                    const page = iconv.decode(Buffer.concat(received), 'gbk');
                    const $ = cheerio.load(page, {decodeEntities: false});
                    const skillIcons = $('.sugg-skill img');
                    // Characters 50-53 of the icon URL are stored as the skill id
                    // (this depends on the exact URL layout; verify if the site changes).
                    const skillIdAt = (slot) => $(skillIcons[slot]).attr('src').substring(50, 54);
                    const firstSkillId = skillIdAt(0);
                    const secondSkillId = skillIdAt(1);
                    // The sixth `.sugg-info2 p` element carries both summoner spell ids in
                    // data-skill: the first five characters and everything from index 6 on.
                    const summonerPair = $($('.sugg-info2 p')[5]).attr('data-skill');
                    const values = [
                        hero.ename,
                        firstSkillId,
                        secondSkillId,
                        summonerPair.substring(0, 5),
                        summonerPair.substring(6)
                    ];
                    connection.query(SKILL_SUMMONER_INSERT, values, (insertError) => {
                        if (insertError) {
                            console.log(insertError);
                            throw insertError;
                        }
                        console.log(`第${position}条数据插入成功!`);
                    });
                });
            });
            request.on('error', () => {
                console.log("获取页面数据出错");
            });
        });
    });
}
/**
 * Close the shared database connection.
 *
 * The connection must not be ended while this module is being loaded: every
 * update* function queues its queries later (after file reads and HTTP
 * downloads), and the mysql driver rejects queries that are enqueued after
 * end() has been called. Call this once all imports have finished.
 *
 * A failure to close is logged and re-thrown.
 */
function closeConnection() {
    connection.end(function (e) {
        if (e) {
            console.log(`关闭数据库失败`);
            throw e;
        }
    });
}
exports.closeConnection = closeConnection;
// Public API: one import function per kind of data.
Object.assign(exports, {
    updateSummoner,
    updateInscription,
    updateHero,
    updateSkin,
    updateEquipments,
    updateStory,
    updateSkill,
    updateHeroInscription,
    updateHeroEquip,
    updateLinks,
    updateSkillSummoner
});
包含所有英雄的 herolist.json 文件有点问题:用它解析 json 对象时老是报错,于是我手动另存成了 heros.json。两个文件的内容是一样的,但是 heros.json 可以正常解析,herolist.json 不可以,很尴尬。如果你解决了这个问题,可以和我说一下,谢谢!
周免英雄这里也有点意思:它们是动态插入页面的,不方便直接获取。费了老大劲才发现,在抓取到的所有英雄的 json 文件中,pay_type=10 的英雄就是周免英雄,pay_type=11 的英雄是新手推荐,这部分逻辑是在 header.js 里面处理的,emmmm........(传送:周免英雄,代码在第 339-347 行)
// Excerpt quoted from the site's header.js: collects into freeHeroData every
// entry of `data` whose pay_type contains 10.
// NOTE(review): `data` is defined outside this excerpt — presumably the parsed
// hero list; confirm against header.js.
var freeHeroData = [],
freeHeroHtml = ""; // not used within this excerpt
for (var i = 0; i < data.length; i++) {
// pay_type is converted to a string and split on ',', so it may hold a single
// value or a comma-separated list. (payarr is declared twice in the same
// statement; the second declarator assigns the split result.)
var payarr = [],
payarr = ('' + data[i].pay_type).split(',');
// If pay_type is 10, add the entry to freeHeroData.
// `payarr == 10` relies on loose equality turning the array into a string, so
// it only matches a single-element list; the two index checks cover the first
// and second entries of a list.
if (payarr == 10 || payarr[0] == 10 || payarr[1] == 10) {
freeHeroData.push(data[i]);
}
}
// console.log(freeHeroData);
OK!