云打码平台
- 注册:普通用户和开发者用户
- 登录:
- 登录开发者用户
- 创建一个软件:我的软件 -> 创建软件
- 下载示例代码:开发者中心 -> 下载最新的DLL -> pythonHttp示例代码下载
1、模拟人人网登录
代码示例
import contextlib
import http.client
import json
import mimetypes
import time
import urllib

import requests

######################################################################


class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call is a POST to ``apiurl`` whose form fields carry the
    account credentials, the developer software id/key and a ``method``
    name. Methods return the negative ``ret`` code reported by the server
    on failure, or -9001 when the decoded response is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account credentials and the developer software id/key."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _fields(self, method, **extra):
        """Build the form fields shared by every API call, plus *extra*."""
        fields = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        fields.update(extra)
        return fields

    @staticmethod
    def _value_or_error(response, key):
        """Return ``response[key]``, the negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST *fields* (and optional *files*) and return the decoded JSON.

        *files* maps a form field name to the path of a file to upload.
        """
        return json.loads(self.post_url(self.apiurl, fields, files))

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._fields('balance'))
        return self._value_or_error(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._fields('login'))
        return self._value_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the captcha image and return its id, or a negative code."""
        data = self._fields(
            'upload', codetype=str(codetype), timeout=str(timeout)
        )
        response = self.request(data, {'file': filename})
        return self._value_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for *cid*, or '' if none is available."""
        response = self.request(self._fields('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll once per second for its text.

        Returns ``(cid, text)`` on success, ``(-3003, '')`` when no text
        arrived after *timeout* polls, or ``(error_code, '')`` when the
        upload itself failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for _ in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''
        return cid, ''

    def report(self, cid):
        """Send a ``report`` call with flag '0' for *cid*; return ``ret``."""
        response = self.request(self._fields('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST *fields* and the files named in *files*; return the body text.

        The files are opened here and closed again once the request has
        completed; the caller's mapping is left untouched.
        """
        with contextlib.ExitStack() as stack:
            handles = {
                key: stack.enter_context(open(path, 'rb'))
                for key, path in (files or {}).items()
            }
            res = requests.post(url, files=handles, data=fields)
            return res.text


######################################################################

# NOTE(review): account credentials and the software key below are
# hard-coded in this sample; consider loading them from configuration.

# User name (regular user account)
username = 'bobo328410948'

# Password
password = 'bobo328410948'

# Software ID, required for the developer revenue share.
# Found in the developer console under "My software".
appid = 6003

# Software key, required for the developer revenue share.
# Found in the developer console under "My software".
appkey = '1f4b564483ae5c907a1d34f8e2f2776c'

# Image file
filename = 'getimage.jpg'

# Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are priced
# differently and a wrong type lowers the recognition rate.
# All types are listed at http://www.yundama.com/price.html
codetype = 1004

# Timeout in seconds
timeout = 10

# Sanity check
if username == 'username':
    print('请设置好相关参数再测试')
else:
    # Initialise the client
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the balance
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Start recognition: image path, captcha type id, timeout (seconds)
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))

######################################################################
解析验证码
def getCodeDate(userName, pwd, codePath, codeType, timeout=2):
    """Recognise a captcha image through Yundama and return its text.

    Args:
        userName: Yundama user name (regular user account).
        pwd: Password of that account.
        codePath: Path of the captcha image file to upload.
        codeType: Captcha type id (e.g. 1004 for 4 alphanumeric characters).
        timeout: Timeout in seconds passed to ``YDMHttp.decode``;
            defaults to 2, the value that used to be hard-coded.

    Returns:
        The text returned by ``YDMHttp.decode``, or ``None`` when
        *userName* is still the literal placeholder ``'username'``.
    """
    # Software ID and key, required for the developer revenue share.
    # Found in the developer console under "My software".
    appid = 6003
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'

    # Sanity check: refuse to run with the placeholder user name.
    if userName == 'username':
        print('请设置好相关参数再测试')
        return None

    yundama = YDMHttp(userName, pwd, appid, appkey)

    # The login and balance calls are still made in the same order as
    # before; their return values are not used here.
    yundama.login()
    yundama.balance()

    # Start recognition: image path, captcha type id, timeout (seconds).
    _cid, result = yundama.decode(codePath, codeType, timeout)
    return result
利用抓包工具获取请求的url和参数,这里发送的是post请求
模拟人人网登录
import requests
import urllib
from lxml import etree

# Create the session object. Every request of the login flow goes through
# it so that cookies set by the login page and the captcha image are sent
# along with the login POST.
session = requests.Session()

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://www.renren.com/'
page_text = session.get(url=url, headers=headers).text

# Download the captcha image referenced by the login page.
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
with open('code.jpg', 'wb') as fp:
    fp.write(session.get(url=code_img_url, headers=headers).content)

# Recognise the text shown in the captcha image.
code_data = getCodeDate('bobo328410948', 'bobo328410948', './code.jpg', 2004)

# Simulate the login.
# NOTE(review): the form values below were captured with a packet-capture
# tool; the "email" value looks redacted and must be replaced with a real
# account before running.
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201914927558'
data = {
    "email":"[email protected]",
    "icode":code_data,
    "origURL":"http://www.renren.com/home",
    "domain":"renren.com",
    "key_id":"1",
    "captcha_type":"web_login",
    "password":"4f0350f09aeffeef86307747218b214b0960bdf35e30811c0d611fe39db96ec1",
    "rkey":"9e75e8dc3457b14c55a74627fa64fb43",
    "f":"http%3A%2F%2Fwww.renren.com%2F289676607",
}
# Cookies produced by this request are stored in the session automatically.
session.post(url=login_url, data=data, headers=headers)

url = ""  # URL of a page that is only accessible after logging in
page_text = session.get(url=url, headers=headers).text

with open('renren.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
二 利用线程池爬取数据
import requests import re from lxml import etree from multiprocessing.dummy import Pool import random
# Instantiate a thread pool with 5 workers.
pool = Pool(5)
url = 'https://www.pearvideo.com/category_1'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

# Collect the video file URL from each detail page. Entries whose link or
# srcUrl cannot be found are skipped instead of aborting the whole run.
video_url_list = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    detail_page = requests.get(url=detail_url, headers=headers).text
    matches = re.findall('srcUrl="(.*?)",vdoUrl', detail_page, re.S)
    if matches:
        video_url_list.append(matches[0])

try:
    # Download all videos on the pool, then write them to disk on the pool.
    video_data_list = pool.map(getVideoData, video_url_list)

    pool.map(saveVideo, video_data_list)
finally:
    # Release the worker threads once the work is done (or has failed).
    pool.close()
    pool.join()
由于我们要获取的视频链接不在标签里,而是在js代码中,因此只能通过正则表达式来获取
通过线程池的 map 方法调用下面两个函数,来下载和保存列表里的视频
def getVideoData(url):
    """Fetch *url* with the module-level ``headers`` and return the raw bytes."""
    return requests.get(url=url, headers=headers).content


def saveVideo(data):
    """Write *data* to a new ``<random 0..5000>.mp4`` file in the working dir.

    The file is opened in exclusive-create mode, so a name that is already
    taken (by an earlier video or by another worker thread) is never
    overwritten; a new random name is drawn instead.

    Raises:
        FileExistsError: if no unused name was found after 1000 draws.
    """
    for _ in range(1000):
        fileName = str(random.randint(0, 5000)) + '.mp4'
        try:
            fp = open(fileName, 'xb')
        except FileExistsError:
            # Name collision: draw another number rather than overwrite.
            continue
        with fp:
            fp.write(data)
        return
    raise FileExistsError('no unused <number>.mp4 file name could be found')