号码 编号 姓名 身份证后四位
这里用于显示验证码图片和识别结果/运行状态
关键词:跨域,JS,JavaScript,爬虫,验证码,图像识别
可能有小伙伴知道,构建一个网页,编写JS爬虫识别验证码很难实现,这是因为,要使用JS识别验证码,需要先把验证码的图片绘制到Canvas中,再使用训练好的识别库去遍历像素及进行操作,但因为你的网页和验证码的域不同,无论用哪种方法,浏览器都会报各种各样的跨域警告:
浏览器的JS似乎不擅长用于图像识别,那为何还要这样做?
使用浏览器的JS来构建爬虫看起来确实是个糟糕的主意,但在当前场景下却是最佳解决方案。
于是我百度了一下,找到了一段可以向已经加载完毕的网页添加JS代码的JS代码,非常简单,直接粘贴到开发者工具的控制台中执行即可:
/**
 * Appends a <script> element for `url` to document.body and optionally
 * notifies the caller once the script has finished loading.
 *
 * @param {string} url - Address of the script to load.
 * @param {Function} [callback] - Called with no arguments after the script
 *   has loaded. Anything that is not a function (undefined, null, ...) is
 *   ignored, so no handler is installed that would throw on load.
 */
function loadJs(url,callback){
    var script = document.createElement('script');
    script.type = "text/javascript";
    if(typeof callback === "function"){
        if(script.readyState){
            // Engines that expose readyState on script elements report
            // completion through onreadystatechange instead of onload.
            script.onreadystatechange = function(){
                if(script.readyState == "loaded" || script.readyState == "complete"){
                    // Detach first so the callback runs only once.
                    script.onreadystatechange = null;
                    callback();
                }
            };
        }else{
            script.onload = function(){
                callback();
            };
        }
    }
    // Handlers are attached before src is set and the node is inserted.
    script.src = url;
    document.body.appendChild(script);
}
首先我尝试加载本地代码:
// First attempt: load a script from the local disk. The backslashes are
// doubled so that the "\t" in the path is not read as a tab escape and the
// other separators are not silently dropped from the string.
loadJs('C:\\Users\\Antonio\\test.js');
浏览器报错:不能加载本地代码,可以理解,安全限制,也就是说这个代码必须从远程服务器加载,只要是http或者https协议的就行,我想了一下,别人的代码都可以用cdn(内容分发网络)加载,为啥我的不行,于是我去腾讯云开了一个对象存储服务,把我的JS放到里面,直接加载该链接即可:
// Load the crawler script over https from remote storage. Part of the host
// name is written as asterisks here, so this exact URL is a placeholder.
loadJs('https://tools-*********.cos.ap-guangzhou.myqcloud.com/main.js');
之所以要用网页部署爬虫,是因为网页的界面比较容易设计,在目标网页植入JS之后,我们需要对目标网页进行更改来显示我们的操作界面,然后再进行操作:
// Wipe the host page so the crawler UI can be built on an empty document.
document.body.innerHTML='';
// Remove every <style> element. The collection is live, so walk it from the
// end; each node is detached from whatever parent it has, which also covers
// nodes that are not direct children of <head>.
var styles = document.getElementsByTagName('style');
for(var i = styles.length - 1; i >= 0; i--) {
    styles[i].remove();
}
// Remove every <link> element (external stylesheets and the like).
var links = document.getElementsByTagName('link');
for(var i = links.length - 1; i >= 0; i--) {
    links[i].remove();
}
// Load the libraries used further down: Tesseract (OCR) and jQuery ($.ajax).
// NOTE(review): the package specifier in the original URL was mangled into
// "[email protected]" by e-mail obfuscation. The worker calls used in start()
// (createWorker / load / loadLanguage / initialize) look like the 2.x API, so
// the 2.x line is requested here - confirm the exact version that was used.
loadJs('https://unpkg.com/tesseract.js@2/dist/tesseract.min.js');
loadJs('https://cdn.bootcss.com/jquery/3.4.1/jquery.min.js');
/**
 * Returns the text between the first and the last source line of `fn`,
 * followed by a newline. Used to keep multi-line text inside a block comment
 * in a function literal and read it back from the function's source.
 *
 * @param {Function} fn - Function whose source text is inspected.
 * @returns {string} The inner source lines joined with "\n", plus a trailing "\n".
 */
function heredoc(fn) {
    const sourceLines = fn.toString().split('\n');
    // Drop the opening line (function header) and the closing line.
    sourceLines.shift();
    sourceLines.pop();
    return `${sourceLines.join('\n')}\n`;
}
// Add the stylesheet for the crawler UI.
// The CSS is kept inside the block comment of the function literal and is
// recovered as a string by heredoc(), which drops the first and last source
// line of the function. The comment must therefore open on the heredoc( line
// and close on its own line; its contents are runtime text, not commentary.
var style = document.createElement('style');
style.type = 'text/css';
style.innerHTML= heredoc(function(){/*
body h1 {
font-family: Cambria, 'Hoefler Text', 'Liberation Serif', Times, 'Times New Roman', 'serif';
color: aliceblue;
}
body h2 {
font-family: Cambria, 'Hoefler Text', 'Liberation Serif', Times, 'Times New Roman', 'serif';
color: aliceblue;
}
body h7 {
font-family: Cambria, 'Hoefler Text', 'Liberation Serif', Times, 'Times New Roman', 'serif';
}
body h4 {
font-family: Cambria, 'Hoefler Text', 'Liberation Serif', Times, 'Times New Roman', 'serif';
}
*/});
document.head.appendChild(style);
// Build the page: set the title and replace the body with the crawler UI.
// The body text is read back from the block comment by heredoc(), so the
// comment contents are runtime text.
// NOTE(review): only plain text is present in the comment below. The functions
// in this file look up elements by id ('sourcetext', 'splittime', 'retrytime',
// 'logtext', 'status', 'resulttext') and by name ('types'), none of which
// appear here - presumably the HTML tags were lost from this listing; confirm
// against the complete file.
document.title = 'coded by AntoniotheFuture';
document.body.innerHTML= heredoc(function(){/*
Demo
爬虫
号码
编号
姓名
身份证后四位
这里用于显示验证码图片和识别结果/运行状态
*/});
为了避免麻烦,我已经隐去了部分信息
// Constants and shared state used by the functions below.
// Address the captcha image is requested from (the host looks like a placeholder).
var captchaURL = 'http://somesite/captchacn.svl';
// Address the query form is posted to (the host looks like a placeholder).
var queryURL = 'http://somesite/do.do';
// Fallback assigned to splittime when the entered value is rejected.
var defaultst = 500;
// Fallback assigned to retrytime when the entered value is rejected.
var defaultrt = 3;
// The next five values are not referenced by the code visible in this listing.
var OKColor = 'lightgreen';
var ErrColor = 'lightcoral';
var ShowFunctionInfoHint = "FunctionInfo 显示函数信息";
var NotShowFunctionInfoHint = "NoFunctionInfo 不显示函数信息";
var ShowFunctionInfo = true;
var source = []; // input records; getAttrs() fills it with the lines of the 'sourcetext' element
var logmsg = ''; // log text; syncmsg() writes it into the 'logtext' element
var result = ''; // result text; shown in the 'resulttext' element
var splittime = 500; // interval setting read from the 'splittime' field (unit not shown here - presumably milliseconds, TODO confirm)
var retrytime = 3;// upper bound on attempts per record: start() retries while tryc < retrytime
var status = '';// status text; syncmsg() writes it into the 'status' element
// Values of the checked inputs named "types"; rebuilt by getAttrs().
var types = [];
// Not referenced by the code visible in this listing.
var running = false;
/**
 * Reads the run parameters from the page into the shared globals:
 *  - source: the non-empty lines of the 'sourcetext' element,
 *  - splittime / retrytime: the field values, replaced by defaultst /
 *    defaultrt when they are not integers or are negative,
 *  - types: the values of all checked inputs named "types".
 */
function getAttrs(){
    var sourcetext = document.getElementById('sourcetext').innerText;
    // Split on line breaks only. The previous pattern /[(\r\n)\r\n]+/ was a
    // character class, so it also cut records at every "(" and ")".
    // Filtering (instead of splicing while iterating) removes every empty
    // entry, including the one left over when the input is blank.
    source = sourcetext.split(/[\r\n]+/).filter(function(line){
        return line !== '';
    });

    splittime = document.getElementById('splittime').value;
    if(!isInteger(splittime) || splittime < 0){
        splittime = defaultst;
    }
    retrytime = document.getElementById('retrytime').value;
    if(!isInteger(retrytime) || retrytime < 0){
        retrytime = defaultrt;
    }

    // Empty the shared array in place so existing references stay valid.
    types.length = 0;
    var typesoption = document.getElementsByName("types");
    for (var i = 0; i < typesoption.length; ++i) {
        if(typesoption[i].checked) {
            types.push(typesoption[i].value);
        }
    }
}
/**
 * Validates the parameters collected by getAttrs().
 * Empty splittime / retrytime values are replaced by their defaults.
 *
 * @returns {(true|string)} true when the run can start, otherwise the
 *   message to show to the user.
 */
function checkAttrs(){
    if(splittime === ''){splittime = defaultst}
    if(retrytime === ''){retrytime = defaultrt}
    // An array is always truthy, so "no data" has to be detected by looking
    // for at least one non-empty record.
    if(!source || !source.some(function(item){ return Boolean(item); })){
        return '请输入要查询的数据'
    }
    if(types.length < 2){return '至少需要两个查询方式'}
    return true;
}
/**
 * Copies the shared message strings onto the page: logmsg into 'logtext',
 * status into 'status' and result into 'resulttext'.
 */
function syncmsg(){
    const put = (elementId, html) => {
        document.getElementById(elementId).innerHTML = html;
    };
    put('logtext', logmsg);
    put('status', status);
    put('resulttext', result);
}
/**
 * Tells whether `obj` has no fractional part after numeric coercion.
 * Values that coerce to 0 (such as '' or null) therefore count as integers.
 *
 * @param {*} obj - Value to check.
 * @returns {boolean} true when obj % 1 is exactly 0.
 */
function isInteger(obj) {
    const fraction = obj % 1;
    return fraction === 0;
}
/**
 * Empties the input box (the 'sourcetext' element).
 */
function ClearInput(){
    const inputBox = document.getElementById('sourcetext');
    inputBox.innerHTML = '';
}
/**
 * Removes every child node of the 'status' element, last child first.
 */
function chearstatus(){
    const statusBox = document.getElementById('status');
    while (statusBox.lastChild) {
        statusBox.removeChild(statusBox.lastChild);
    }
}
/**
 * Manual check of the captcha pipeline: clears the status area, downloads one
 * captcha image, shows it in the 'status' element and alerts the text that
 * Tesseract recognises in it.
 */
function test(){
    chearstatus();
    var img = new Image();
    // Attach the handler before starting the download.
    img.onload = function(){
        document.getElementById('status').appendChild(img);
        Tesseract.recognize(img, 'eng')
            .then(function(result){
                // start() reads the text from result.data.text; accept that
                // shape here as well and fall back to result.text so a
                // library build that resolves with a flat result still works.
                var text = result && result.data ? result.data.text : result.text;
                alert(text);
            })
            .catch(function(err){
                // Do not leave a rejected recognition unhandled.
                console.error('识别失败', err);
            });
    };
    // The random query value keeps the browser from serving a cached image.
    img.src = captchaURL + '?v=' + Math.random();
}
/**
 * Removes every child node of the 'logtext' element, last child first.
 */
function clearlog(){
    const logBox = document.getElementById('logtext');
    while (logBox.lastChild) {
        logBox.removeChild(logBox.lastChild);
    }
}
/**
 * Appends a node to the log area (the 'logtext' element).
 *
 * @param {Node} content - Node to append.
 */
function addlog(content){
    const logBox = document.getElementById('logtext');
    logBox.appendChild(content);
}
/**
 * Builds the three-line status summary and writes it into the 'status' element.
 *
 * @param {number} total - Number of records in the run.
 * @param {number} success - Number of records processed so far.
 * @param {number} trytime - Number of captcha attempts made so far.
 * @param {number} hit - Number of attempts the server accepted.
 * @param {number} totaltime - Elapsed time in seconds.
 */
function showstatus(total,success,trytime,hit,totaltime){
    // Guard the divisions: before the first attempt (or with an empty run)
    // the denominators are 0 and the summary would otherwise read "NaN%".
    var percent = function(part, whole){
        return whole > 0 ? Number(part/whole*100).toFixed() : '0';
    };
    var average = total > 0 ? Number(totaltime/total).toFixed() : '0';
    // The text is assigned to innerHTML, so the lines are separated by <br>
    // (the separators had been lost from the string literals, leaving them
    // unterminated).
    var t = '进度:' + success + '/' + total + ' ' + percent(success, total) + '%' + '<br>' +
        '识别成功率' + percent(hit, trytime) + '%' + '<br>' +
        '总用时:' + totaltime + '秒;平均用时:' + average;
    document.getElementById('status').innerHTML = t;
}
/**
 * Checks whether a recognised captcha is plausible: exactly four characters,
 * each an ASCII digit or an ASCII letter.
 *
 * @param {string} cap - Recognised text with whitespace already removed.
 * @returns {boolean} true when the text has the expected shape.
 */
function checkcap(cap){
    const isAsciiAlnum = (code) =>
        (code >= 48 && code <= 57) ||   // 0-9
        (code >= 65 && code <= 90) ||   // A-Z
        (code >= 97 && code <= 122);    // a-z

    if(cap.length != 4){
        return false;
    }
    for (let pos = 0; pos < cap.length; pos += 1) {
        if (!isAsciiAlnum(cap.charCodeAt(pos))) {
            return false;
        }
    }
    return true;
}
/**
 * Promise wrapper around $.ajax: POSTs `querydata` to `url` and resolves with
 * the parsed JSON response. Note that this declaration shadows the browser's
 * built-in fetch within this script.
 *
 * @param {string} url - Address to post to.
 * @param {*} querydata - Request body, handed to $.ajax unprocessed
 *   (processData and contentType are both disabled).
 * @returns {Promise<*>} Resolves with the response data; rejects with
 *   Error('返回错误') when the request fails or exceeds the 5000 ms timeout.
 */
function fetch(url,querydata) {
    const settings = {
        type : "post",
        data: querydata,
        dataType : "json",
        processData: false,
        contentType: false,
        timeout: 5000,
    };
    return new Promise((resolve, reject) => {
        settings.success = (data) => {
            resolve(data);
        };
        settings.error = () => {
            reject(new Error('返回错误'));
        };
        $.ajax(url, settings);
    });
}
为了避免麻烦,我已经隐去了部分信息
基本流程如下:
/**
 * Runs the whole batch. For every record in `source` it repeatedly downloads
 * a captcha, recognises it with Tesseract and posts the record together with
 * the recognised text to queryURL, until the server accepts the captcha or
 * `retrytime` attempts have been made. One line per record (or per returned
 * row) is appended to `result` and shown in the 'resulttext' element; the
 * 'status' element is refreshed before each record and once at the end.
 *
 * Differences from the earlier version:
 *  - stops after reporting a validation error instead of running anyway;
 *  - the recognition worker is initialised once and terminated at the end;
 *  - a failed image download, recognition or request counts as a failed
 *    attempt instead of leaving the run waiting forever;
 *  - the captcha form field is replaced on each retry rather than appended;
 *  - input columns are mapped onto `types` only, never onto the captcha field.
 *
 * @returns {Promise<void>} Settles when the batch has finished.
 */
async function start(){
    // Collect and validate the parameters.
    getAttrs();
    const checkresult = checkAttrs();
    if(checkresult !== true){
        alert(checkresult);
        return;
    }
    clearlog();

    const total = source.length;
    let success = 0;   // records processed so far (whether or not they hit)
    let trytime = 0;   // captcha attempts across all records
    let hit = 0;       // attempts accepted by the server
    const cans = document.createElement("canvas");
    cans.style.backgroundColor = "#808080";
    const ctrx = cans.getContext('2d');
    const t1 = (new Date()).valueOf()/1000;
    const elapsed = function(){
        return (new Date()).valueOf()/1000 - t1;
    };

    // Downloads a fresh captcha image. Rejects when the download fails so the
    // caller can treat it as a failed attempt.
    const loadCaptcha = function(){
        return new Promise(function(resolve, reject){
            const img = new Image();
            img.onload = function(){ resolve(img); };
            img.onerror = function(){ reject(new Error('验证码图片加载失败')); };
            // Millisecond timestamp so two attempts never share a cached URL.
            img.src = captchaURL + '?v=' + Date.now();
        });
    };

    // NOTE(review): written against the createWorker / load / loadLanguage /
    // initialize / recognize / terminate worker API - confirm that it matches
    // the Tesseract build that is loaded. `await` is harmless if createWorker
    // returns the worker directly.
    const worker = await Tesseract.createWorker();
    result = '';
    try {
        // Initialise the worker once for the whole run.
        await worker.load();
        await worker.loadLanguage('eng');
        await worker.initialize('eng');

        for(let i = 0; i < total; i++){
            showstatus(total,success,trytime,hit,elapsed());

            // Map the tab-separated columns of the record onto the selected
            // query types, in order. Extra columns are ignored.
            const fields = source[i].split('\t');
            const querydata = {};
            const fdata = new FormData();
            for(let col = 0; col < types.length; col++){
                const hasValue = col < fields.length;
                querydata[types[col]] = hasValue ? fields[col] : '';
                if(hasValue){
                    fdata.append(types[col], fields[col]);
                }
            }
            querydata['captcha'] = '';

            let ishit = false;
            let tryc = 0;
            do {
                // TODO: add a delay between attempts
                trytime++;
                tryc++;
                try {
                    const img = await loadCaptcha();
                    cans.width = img.width;
                    cans.height = img.height;
                    ctrx.drawImage(img,0,0);

                    // Keep a copy of the captcha in the log area.
                    const logcans = document.createElement("canvas");
                    logcans.width = img.width;
                    logcans.height = img.height;
                    logcans.getContext('2d').drawImage(img,0,0);
                    addlog(logcans);

                    // Recognise the same image up to five times until the
                    // text has the expected shape.
                    let nospace = '';
                    let cc = 0;
                    do {
                        const { data } = await worker.recognize(cans);
                        // Strip whitespace
                        nospace = data.text.replace(/\s+/g,"");
                        cc++;
                    }
                    while(!checkcap(nospace) && cc < 5);

                    querydata['captcha'] = nospace.substring(0,4);
                    // set() replaces the value left by a previous attempt;
                    // append() would send every earlier guess as well.
                    fdata.set('captcha',querydata['captcha']);
                    console.log(JSON.stringify(querydata));

                    const ele = document.createElement('a');
                    // Recognised text is arbitrary; insert it as text, not markup.
                    ele.textContent = querydata['captcha'];
                    addlog(ele);
                    addlog(document.createElement('br'));

                    const queryresult = await fetch(queryURL + "?checkcaptch&time="+new Date(),fdata);
                    if (queryresult.result === "succeed") {
                        console.log('识别成功');
                        hit++;
                        ishit = true;
                        if(queryresult.dataGrid.total >= 1){
                            for(let r = 0; r < queryresult.dataGrid.total; r++){
                                result += source[i] + ',' + JSON.stringify(queryresult.dataGrid.rows[r]) + '<br>';
                            }
                        }else{
                            result += source[i] + ',无信息<br>';
                        }
                    } else{
                        console.log('识别失败');
                    }
                } catch (err) {
                    // Count the attempt as failed and let the retry loop decide.
                    console.log('识别失败', err);
                }
            }
            while(!ishit && tryc < retrytime);

            if(!ishit){
                result += source[i] + ',查询失败<br>';
            }
            success++;
            document.getElementById('resulttext').innerHTML = result;
        }
        showstatus(total,success,trytime,hit,elapsed());
    } finally {
        // Release the recognition worker even when the run is interrupted.
        await worker.terminate();
    }
    alert('执行完毕');
}
完整的代码已在上面列出,如需完整文件可到 GitHub 下载:
github