1.抓取网页结构
2.人工判断需要解析的信息与配置对应正则
3.正则抓出验证码ID
4.使用获取到的ID请求验证码图片并保存到本地(由于对方接口返回的图片不带正确的后缀,所以不能直接把图片url传给百度)
5.上传百度AI文字或图片识别接口,根据返回内容判断图片识别是否正确
6.发送对应内容表单并处理返回值
7.返回正确则保存对应内容,返回错误则重复之前的流程直到成功(百度AI识别的完全正确率只有30%左右,要注意接口调用量)
代码如下
let co = require('co'); //异步控制器
let {
sendHttpRequest } = require('../tools/httpTool');//封装的http处理
var fs = require('fs');//文件操作
const xlsx = require('node-xlsx');//xlsx解析
const {
sqlModel } = require('./riskInquiryApi-sqlModel')//sql相关操作
var path = require("path");
/**
 * Recursively deletes everything inside a directory.
 *
 * Sub-directories are always removed together with their contents; the
 * top-level directory itself is only removed when `delFolder` is truthy.
 * When the path does not exist a message is logged and nothing else happens.
 *
 * @param {string} url - Path of the directory to clear.
 * @param {boolean} delFolder - Whether to remove the directory itself afterwards.
 */
let delFolderContents = function (url, delFolder) {
    if (!fs.existsSync(url)) {
        console.log("给定的路径不存在,请给出正确的路径");
        return;
    }
    for (const entry of fs.readdirSync(url)) {
        const entryPath = path.join(url, entry);
        // lstat judges a symlink by the link itself, so a link that points at a
        // directory is unlinked instead of having its target's contents wiped.
        if (fs.lstatSync(entryPath).isDirectory()) {
            delFolderContents(entryPath, true);
        } else {
            fs.unlinkSync(entryPath);
        }
    }
    if (delFolder) {
        fs.rmdirSync(url);
    }
};
// OCR client constructor exported by the Baidu AIP SDK.
const AipOcrClient = require("baidu-aip-sdk").ocr;
// Flag read by checkSafe to decide whether a screening run is already in progress.
let checking = false;
// APPID / API key / secret key for the Baidu service (placeholders here).
const APP_ID = "APP_ID APP_ID APP_ID APP_ID APP_ID ";
const API_KEY = "API_KEY API_KEY API_KEY ";
const SECRET_KEY = "SECRET_KEY SECRET_KEY SECRET_KEY SECRET_KEY ";
// One shared client instance is used for every OCR call.
const client = new AipOcrClient(APP_ID, API_KEY, SECRET_KEY);
// Date string "YYYY-M-D" (no zero padding), computed once when the module loads.
let nowDate = (() => {
    const today = new Date();
    return `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
})();
// Index into partnerList of the partner currently being screened.
let checkIndex = 0;
// Delay in milliseconds between two screening attempts.
let sleepTime = 1000;
/**
 * Collects the contents of every `value="..."` occurrence in a string.
 *
 * @param {string} str - Text to scan (typically HTML markup).
 * @returns {string[]} Captured values in order of appearance; empty when none match.
 */
function getExecStrs(str) {
    const valuePattern = /value=\"(.+?)\"/g;
    const values = [];
    let match;
    // exec() on a global regex resumes after the previous match and yields
    // null once the input is exhausted.
    while ((match = valuePattern.exec(str)) !== null) {
        values.push(match[1]);
    }
    return values;
}
// Number of rows returned by the most recent partnersearch call.
let total = 0;
// NOTE(review): not referenced anywhere in the visible code — confirm before removing.
let nowIndex = 0;
// Partner rows cached by partnersearch and iterated by checkSafeRun.
let partnerList = [];
/**
 * Request handler that loads the partner list and caches it for screening.
 *
 * A shallow copy of the request body is passed to `sqlModel.partnersearch`
 * as the filter object. The returned rows are stored in the module-level
 * `partnerList` (and their count in `total`) and echoed back in the response.
 * A rejection from the query propagates to the caller.
 *
 * @param {object} ctx - Request context; reads `ctx.request.body`, writes `ctx.body`.
 * @param {Function} next - Unused.
 * @returns {Promise<void>}
 */
let partnersearch = async (ctx, next) => {
    const filters = { ...ctx.request.body };
    const res = await sqlModel.partnersearch(filters);
    console.log("已获取到名单:" + res.length)
    total = res.length;
    partnerList = res;
    ctx.body = {
        code: 0,
        data: res,
        success: true
    };
}
/**
 * Request handler that starts a background screening run.
 *
 * While a run is in progress it only reports the current progress
 * (`code: -1`). Otherwise it answers immediately (`code: 0`), clears the
 * `./webData` folder and launches `checkSafeRun` without waiting for it.
 *
 * @param {object} ctx - Request context; `ctx.body` is set to the response.
 * @param {Function} next - Unused.
 */
let checkSafe = (ctx, next) => {
    if (checking) {
        ctx.body = {
            code: -1,
            msg: "排查运行中,当前进度:" + checkIndex + "/" + partnerList.length,
            success: true
        };
        return;
    }
    ctx.body = {
        code: 0,
        msg: "开始确认名单",
        success: true
    };
    // Remove images left over from an earlier run so a captcha file is never
    // written on top of stale content.
    delFolderContents("./webData", false);
    // Mark the run as active so later requests get the progress answer instead
    // of starting a second, overlapping run; always release the flag when the
    // run settles, and log a failure instead of leaving the rejection unhandled.
    checking = true;
    Promise.resolve()
        .then(() => checkSafeRun())
        .catch((err) => {
            console.error(err);
        })
        .finally(() => {
            checking = false;
        });
}
/**
 * Screens every partner in `partnerList` that has no verdict yet (issafe is
 * neither 1 nor -1) or whose `checkTime` is more than 30 days before `nowDate`.
 *
 * For each such partner one attempt is made at a time: fetch the page, read
 * the captcha id, download the captcha image, OCR it, verify the code and
 * finally send the check request. `checkIndex` only advances after a
 * successful check, so a failed attempt is repeated for the same partner
 * after waiting `sleepTime` milliseconds.
 *
 * @returns {Promise<false|undefined>} `false` when there is nothing to screen
 *   or the image folder cannot be created, otherwise `undefined` once every
 *   partner has been handled.
 */
let checkSafeRun = async () => {
    const IMAGE_DIR = "webData";
    const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36";

    // sendHttpRequest is driven through co so that any yieldable it returns is supported.
    const request = (...args) => co(function* () {
        return yield sendHttpRequest(...args);
    });

    // Appends a "save failed" line to errorlog.txt; a logging failure is only reported.
    async function logSaveError(err) {
        try {
            await fs.promises.appendFile('errorlog.txt', '\r\n' + "保存失败:" + err);
        } catch (logErr) {
            console.error(logErr);
        }
    }

    /**
     * Runs one complete attempt for a single partner.
     *
     * @param {{codeIndex: number, id: *, pName: string, pCardNum: string}} options
     * @returns {Promise<boolean>} true when the partner state was recorded.
     */
    async function checkOne(options) {
        try {
            const body_request = {
                hostname: "aaaa.aaa.aaa",
                path: "/aaa/",
                port: 80,
                method: "get",
                headers: {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Host": "aaaaa.aaa.aa",
                    "Referer": "bbbb.bbb.bb",
                    "Upgrade-Insecure-Requests": "1",
                    "User-Agent": USER_AGENT
                }
            };
            const page = (await request(body_request, "")).data;

            // The captcha id sits between the marker and the following "&random=".
            const marker = "captchaXgl.do?captchaId=";
            const start = page.indexOf(marker);
            const end = start === -1 ? -1 : page.indexOf("&random=", start);
            const captCode = end === -1 ? "" : page.slice(start + marker.length, end);
            if (!captCode) {
                // No captcha on the page: back off (1 minute first, then doubling).
                sleepTime = sleepTime != 1000 ? sleepTime * 2 : 1000 * 60;
                return false;
            }
            sleepTime = 1000;

            const imgdata_request = {
                hostname: "aaa.aa.aa",
                path: "aaaa.aa.aa/captchaXgl.do?captchaId=" + captCode + "&random=" + Math.random().toFixed(16),
                port: 80,
                method: "get",
                headers: {
                    "Accept": " image/webp,image/apng,image/*,*/*;q=0.8",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Host": "aaaa.aaa.aa",
                    "Referer": "bbbb.bbb.bb",
                    "User-Agent": USER_AGENT
                }
            };
            const imgdata = await request(imgdata_request, "", {
                setEncoding: "binary", notpackage: true });

            // The image is saved first and read back from disk before it is sent to OCR.
            const imagePath = IMAGE_DIR + "/" + options.codeIndex + ".jpg";
            try {
                await fs.promises.writeFile(imagePath, imgdata, 'binary');
            } catch (err) {
                console.log("写入失败")
                await logSaveError(err);
                return false;
            }
            const image = (await fs.promises.readFile(imagePath)).toString("base64");

            // General text recognition on the base64 image, English only.
            const bdAPIoptions = { language_type: "ENG" };
            const result = await client.generalBasic(image, bdAPIoptions);
            const words_result = (result?.words_result?.[0]?.words || "").replace(/ /ig, "");
            if (!/^[0-9a-zA-Z]{4}$/.test(words_result)) {
                console.log("验证码无效");
                console.log("==================");
                return false;
            }

            const Words_result_request = {
                hostname: "aa.aa.aa",
                path: "aaaa.aaa.aa/checkyzm?captchaId=" + captCode + "&pCode=" + words_result,
                port: 80,
                method: "get",
                headers: {
                    "Accept": "application/json, text/javascript, */*; q=0.01",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                    "Host": "aaaa.aaaa.aa",
                    "Referer": "bbbbbb.bbb.bb",
                    "User-Agent": USER_AGENT,
                    "X-Requested-With": "XMLHttpRequest",
                }
            };
            const Words_resultBack = await request(Words_result_request, "");
            const captchaAccepted = Words_resultBack.data * 1 == 1;
            console.log(Words_resultBack, captchaAccepted ? "验证码通过" : "验证码未通过");
            if (!captchaAccepted) {
                return false;
            }

            const fields = {
                /**** datas *****/
            };
            const postdata = Object.keys(fields)
                .map((key) => key + "=" + encodeURI(fields[key]))
                .join("&");
            const check_request = {
                hostname: "AAAAA.AAAAA.AA",
                path: "AAAAA.AAAAA.AA/AAA.do?" + postdata,
                port: 80,
                method: "get",
                headers: {
                    "Accept": "application/json, text/javascript, */*; q=0.01",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                    "Host": "AAAAA.AAAAA.AA",
                    "Referer": "BBBBB.BBBBBB.BB",
                    "User-Agent": USER_AGENT,
                    "X-Requested-With": "XMLHttpRequest",
                }
            };
            const checkBack = await request(check_request, postdata, {
                showcontent: true });

            // A missing payload must not be recorded as "safe"; fail the attempt
            // so the partner is retried.
            const records = checkBack?.data;
            if (records == null) {
                throw new TypeError("check response has no data");
            }
            // Any entry in the first record's result list marks the partner as "-1".
            const issafe = records[0]?.result.length > 0 ? "-1" : "1";
            checkIndex++;
            await sqlModel.updatePartnerState([issafe, nowDate, options.id]);
            console.log("checkIndex++", checkIndex)
            return true;
        } catch (e) {
            console.log(e);
            return false;
        }
    }

    if (partnerList.length == 0) {
        console.log("请先获取合伙人列表");
        return false
    }
    // Without the image folder every attempt would fail and be retried forever.
    try {
        await fs.promises.mkdir(IMAGE_DIR, { recursive: true });
    } catch (e) {
        console.log(e);
        return false
    }

    const MAX_AGE_MS = 1000 * 60 * 60 * 24 * 30;
    // checkIndex is advanced by checkOne on success or by the else branch below.
    for (checkIndex = 0; checkIndex < partnerList.length;) {
        const partner = partnerList[checkIndex];
        const hasNoVerdict = partner.issafe * 1 != 1 && partner.issafe * 1 != -1;
        const isStale = (new Date(partner.checkTime).getTime() + MAX_AGE_MS) < new Date(nowDate).getTime();
        if (hasNoVerdict || isStale) {
            console.log("当前进度:" + checkIndex + "/" + partnerList.length, new Date().getTime())
            await checkOne({
                codeIndex: checkIndex,
                id: partner.id,
                pName: partner.partner_name || "",
                pCardNum: partner.partner_id_or_org_id || "",
            });
            await sleepPromise(sleepTime);
        } else {
            checkIndex++
        }
    }
}
/**
 * Pause helper used between requests so that frequent access does not
 * trigger a second verification step.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
function sleepPromise(ms) {
    return new Promise(function (done) {
        setTimeout(done, ms);
    });
}
/**
 * Request handler that imports partners from an uploaded xlsx file.
 *
 * The first sheet of the workbook at `ctx.request.files.file.path` is parsed;
 * its first row is skipped and every following row is passed, one after the
 * other, to `sqlModel.addPartner`. Any failure — including a missing upload
 * or an unreadable workbook — is reported as a `code: -1` response instead of
 * escaping the handler.
 *
 * @param {object} ctx - Request context; `ctx.body` is set to the response.
 * @param {Function} next - Unused.
 * @returns {Promise<void>}
 */
let cacheExcl = async (ctx, next) => {
    try {
        const request = ctx.request;
        const sheets = xlsx.parse(request.files.file.path);
        const rows = sheets[0].data;
        // Row 0 is skipped; the remaining rows are inserted sequentially.
        for (let i = 1; i < rows.length; i++) {
            await sqlModel.addPartner(rows[i]);
        }
        ctx.body = {
            code: 0,
            data: request.body,
            status: true,
            success: true
        };
    } catch (e) {
        // JSON.stringify on an Error yields "{}", so name and message are
        // serialised explicitly; `err` stays a JSON string.
        const detail = e instanceof Error ? { name: e.name, message: e.message } : e;
        ctx.body = {
            code: -1,
            err: JSON.stringify(detail),
            status: false,
            success: false
        };
    }
}
// Request handlers exposed by this module.
module.exports = {
partnersearch, checkSafe, cacheExcl }