使用nodejs实现爬虫,无视接口是否加密
- 准备工作:
安装 Node.js 所需要的开发环境,并安装 puppeteer 包:在终端中输入 npm install puppeteer 即可安装。不过这个包比较大,最好先切换到淘宝镜像再进行安装,这里不做过多赘述。另外,我们还需要将爬取到的数据存入数据库,所以还需要安装 mysql 包,同样直接在终端输入 npm install mysql 即可。
- puppeteer介绍:
Puppeteer 是 Chrome 开发团队在 2017 年发布的一个 Node.js 包,用来模拟 Chrome 浏览器的运行。可以说,只要是浏览器中的事件,都可以用 puppeteer 的 API 来模拟,所以它是一个很强大的工具。我们在爬取数据的时候经常会发现,很多网站的接口数据其实是进行了加密的,这就导致爬取数据的难度很大;但是如果使用 puppeteer,我们就可以不再关注接口数据是否加密,只要数据展示到了页面上,我们都是可以爬取的。
- 具体代码:
// Third-party dependencies: Puppeteer drives the browser, mysql stores the scraped rows.
let puppeteer = require('puppeteer');
let mysql = require('mysql');
// Connection settings. The values below are tutorial placeholders, not real
// identifiers: replace each one with your own value before running the script.
let connection = mysql.createConnection({
host : ip,
user: 用户名,
password: 密码,
port : 端口,
// NOTE(review): the placeholder literally reads "table name", but the mysql
// `database` option presumably expects the database (schema) name; the table
// itself is named in the INSERT statement below — confirm when filling this in.
database: 表名
});
// Open the connection once at startup; it is shared by every insert.
connection.connect();
// Parameterized INSERT: the five `?` placeholders are filled from the array passed to query().
let shopAddSql = 'INSERT INTO huawei_shop_place (province,city,area_town,shop_name,shop_location) VALUES (?,?,?,?,?)';
/**
 * Inserts one shop row using the shared `connection` and `shopAddSql`.
 *
 * The returned promise always settles: the original version never called
 * `resolve`/`reject`, so any caller awaiting it would hang forever.
 * It deliberately never rejects, because existing callers invoke it
 * fire-and-forget and a rejection would surface as an unhandled rejection.
 *
 * @param {Array} shopMessage - Values for the five placeholders, in order:
 *   [province, city, area_town, shop_name, shop_location].
 * @returns {Promise<*>} Resolves with the driver's result object on success,
 *   or with `null` when the insert failed (the error message is logged).
 */
function insertShop(shopMessage) {
  return new Promise((resolve) => {
    connection.query(shopAddSql, shopMessage, (err, results) => {
      if (err) {
        console.log('[INSERT ERROR] - ', err.message);
        resolve(null);
        return;
      }
      console.log('成功');
      resolve(results);
    });
  });
}
/** URL prefix of the request the page issues whenever the store list is refreshed. */
const STORE_QUERY_URL = 'https://consumer.huawei.com/cmkt/services/service/store/query/v2';

/** Exclusive upper bound on the province option index (index 0 is never visited). */
const MAX_PROVINCE_INDEX = 32;

/**
 * Pauses for `ms` milliseconds.
 * Plain timer used instead of `page.waitFor`, which is deprecated in newer Puppeteer versions.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Opens the dropdown for `level` ('province' | 'city' | 'area') and returns its option handles.
 * The click is awaited so the options are queried only after the dropdown has been toggled.
 */
async function openDropdown(page, level) {
  const toggle = await page.$(`.${level} .select`);
  await toggle.click();
  return page.$$(`.${level} .select .select-${level} li`);
}

/**
 * Clicks `option` and waits for a successful store-query response.
 * The response listener is registered before the click so a fast response cannot be missed.
 */
async function clickAndWaitForStores(page, option) {
  await Promise.all([
    page.waitForResponse((res) => res.request().url().startsWith(STORE_QUERY_URL) && res.ok()),
    option.click(),
  ]);
}

/**
 * Reads the shop names/addresses currently listed on the page and queues one insert per shop.
 * The selected province/city/area labels are read once per list (they do not change per shop).
 */
async function saveVisibleShops(page) {
  const readTexts = (elements) => elements.map((element) => element.innerText);
  const shopNames = await page.$$eval('.point-list #pointList li .point-title p', readTexts);
  const shopPositions = await page.$$eval('.point-list #pointList li .left-box .pa-wrap span a', readTexts);
  if (shopNames.length === 0) {
    return;
  }

  const province = await page.$eval('.province .select span', (el) => el.innerText);
  const city = await page.$eval('.city .select span', (el) => el.innerText);
  const area = await page.$eval('.area .select span', (el) => el.innerText);

  for (let index = 0; index < shopNames.length; index++) {
    const shopName = shopNames[index];
    const shopPosition = shopPositions[index];
    // Intentionally not awaited: inserts are handed to insertShop in order and
    // the scrape does not depend on their result.
    insertShop([province, city, area, shopName, shopPosition]);
    console.log(province);
    console.log(city);
    console.log(area);
    console.log(shopName);
    console.log(shopPosition);
  }
}

/**
 * Scrapes the Huawei "where to buy" page: walks every province -> city -> area
 * combination through the page's dropdowns and stores each listed shop via `insertShop`.
 *
 * Option index 0 of every dropdown is skipped, as in the original loops.
 * The browser is always closed, even when scraping fails part-way.
 *
 * @returns {Promise<void>} Resolves when all combinations have been visited;
 *   rejects with the underlying Puppeteer error if navigation or a page interaction fails.
 */
async function addShopMessage() {
  const options = {
    defaultViewport: {
      width: 1400,
      height: 800,
    },
    headless: false,
  };
  const browser = await puppeteer.launch(options);
  try {
    const page = await browser.newPage();
    await page.goto('https://consumer.huawei.com/cn/support/where-to-buy/');

    for (let provinceIndex = 1; provinceIndex < MAX_PROVINCE_INDEX; provinceIndex++) {
      await sleep(200);
      const provinceOptions = await openDropdown(page, 'province');
      // Stop instead of calling click() on undefined when the list is shorter than the bound.
      if (provinceIndex >= provinceOptions.length) {
        break;
      }
      await provinceOptions[provinceIndex].click();

      await sleep(200);
      let cityOptions = await openDropdown(page, 'city');
      await sleep(200);
      console.log(cityOptions.length);

      for (let cityIndex = 1; cityIndex < cityOptions.length; cityIndex++) {
        await sleep(200);
        // The dropdown is already open for the first city; later cities must
        // reopen it and re-query the handles.
        if (cityIndex > 1) {
          cityOptions = await openDropdown(page, 'city');
        }
        await clickAndWaitForStores(page, cityOptions[cityIndex]);

        await sleep(100);
        let areaOptions = await openDropdown(page, 'area');
        await sleep(200);

        for (let areaIndex = 1; areaIndex < areaOptions.length; areaIndex++) {
          await sleep(200);
          if (areaIndex > 1) {
            areaOptions = await openDropdown(page, 'area');
          }
          await clickAndWaitForStores(page, areaOptions[areaIndex]);
          // Give the page time to render the refreshed store list before reading it.
          await sleep(1000);
          await saveVisibleShops(page);
        }
      }
    }
  } finally {
    await browser.close();
  }
}
// Run the scraper, log any failure, and then close the MySQL connection so the
// Node process can exit. connection.end() lets already-queued inserts finish
// before the connection is shut down.
addShopMessage()
  .catch((error) => console.log(error))
  .finally(() => connection.end());
- 启动说明:
由于涉及到自动打开浏览器等操作,有些电脑直接在编辑器自带的终端里启动时,可能会报错或者打开后一直没反应,这是因为权限不够。所以这里建议直接使用电脑的 cmd 管理员窗口启动项目,这样就不会出现这类问题了。关于代码,目前还没有做优化,毕竟只是把数据拿过来用一下;有兴趣的朋友可以将我的代码优化一下,或者给点优化建议。本代码仅适用于学习,请不要用于商业用途。谢谢合作。