def parse_url(self, url):
    """Request ``url`` with this instance's headers and return the response body.

    The body is returned as ``response.content`` (undecoded).
    """
    page = requests.get(url, headers=self.headers)
    return page.content
# Log in to v2ex through a requests session: load the sign-in form, read the
# form field names and the once token from it, solve the captcha, then post
# the credentials. Prints the captcha-recognition time, the final HTTP status
# code and the total elapsed time.
start = time.time()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.v2ex.com/signin',
}
url = r'https://www.v2ex.com/signin'
session = requests.session()
resp_content = session.get(url, headers=headers).content
resp_html = html.fromstring(resp_content)

# The form's input names are read from the page instead of being hard-coded;
# three of them (user, password, captcha) are needed for the POST below.
names = resp_html.xpath('//*[@id="Main"]/div[2]/div[2]/form/table[1]//input[1]//@name')
if len(names) < 3:
    raise RuntimeError(
        "sign-in form: expected at least 3 input names, found %d" % len(names))

# The once token is extracted from the first style attribute found in the
# form table rows.
once_styles = resp_html.xpath(r'//*[@id="Main"]/div[2]/div[2]/form/table[1]//tr//@style')
if not once_styles:
    raise RuntimeError("sign-in form: no style attribute carrying the once token")
once_url = once_styles[0]
once_match = re.match(".*once=([0-9]*)(.*)", once_url)
if once_match is None or not once_match.group(1):
    raise RuntimeError("sign-in form: could not extract the once token")
once_value = once_match.group(1)

# Fetch the captcha through the same session so it is requested with the
# cookies obtained from the sign-in page.
verifyUrl = "https://www.v2ex.com/_captcha?once=" + once_value
resp = session.get(verifyUrl, headers=headers)
verifyBytes = resp.content

verify_start = time.time()
dama = indetifyCode(verifyBytes)
verify_end = time.time()
print("识别验证码耗时:", verify_end - verify_start)
# The recognised code is decoded from bytes to str before it goes into the form.
encode_verify = str(dama, encoding="utf-8")

data = {
    names[0]: 'xxx',
    names[1]: 'xxx',
    names[2]: encode_verify,
    'once': once_value,
    'next': r'/',
}
login_url = 'https://www.v2ex.com/signin'
# Send the same browser-like headers (user-agent, referer) as the earlier
# requests; the original POST went out with the library defaults only.
final_resp = session.post(url=login_url, data=data, headers=headers)
print(final_resp.status_code)
end = time.time()
print("总耗时: ", end - start)
在v2ex中,账号、密码输入框的name属性是动态变化的,所以只能先请求登录页得到html,再通过xpath匹配出这些字段名,用来填充post请求所需的数据。另外,验证码是通过once参数获取并且和cookie绑定的,所以请求验证码时一定要带上cookie参数。
2. selenium
# Log in to v2ex by driving Chrome through selenium: fill in the user name and
# password, download the captcha with the browser's cookies, have it
# recognised, type the result into the form and click the login button.
browser = webdriver.Chrome("D:/ChromeDownload/chromedriver_win32/chromedriver.exe")
url = r'https://www.v2ex.com/signin'
browser.get(url=url)

# The generic find_element("xpath", ...) form is used because the
# find_element_by_* helpers were removed in newer Selenium releases.
username = browser.find_element("xpath", r'//input[@placeholder="用户名或电子邮箱地址"]')
username.send_keys('xxx')
pwd = browser.find_element("xpath", r'//tbody[1]/tr[2]/td[2]/input')
pwd.send_keys('xxx')

# Captcha: the once token is read from the style attribute of the captcha element.
style = browser.find_element("xpath", r'//tbody[1]/tr[3]/td[2]/div[1]').get_attribute("style")
# Raw string: the pattern contains \) which is an invalid escape in a normal literal.
once_match = re.match(r'.*once=([0-9]*)"\);(.*)', style or "")
if once_match is None or not once_match.group(1):
    raise RuntimeError("could not extract the once token from the captcha style")
once = once_match.group(1)
verifyUrl = "https://www.v2ex.com/_captcha?once=" + once

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.v2ex.com/signin',
}
# Reuse the browser's cookies so the captcha request belongs to the same
# session as the page that is open in the browser.
cookies = {i["name"]: i["value"] for i in browser.get_cookies()}
resp = requests.get(verifyUrl, headers=headers, cookies=cookies)
verifyBytes = resp.content
print(resp.status_code)
dama = indetifyCode(verifyBytes)

verify = browser.find_element("xpath", r'//input[@placeholder="请输入上图中的验证码"]')
# verify.send_keys(dama) cannot be used directly: send_keys expects str but
# the recognised code is bytes, which fails with
# "TypeError: sequence item 0: expected str instance, int found".
verify.send_keys(str(dama, encoding="utf-8"))
browser.find_element("xpath", r'//input[@value="登录"]').click()
以上是完整的代码,注释中记录了部分自己遇到的坑。验证码识别用的是yundama平台,其中的api需要改一部分:官网给的打码示例需要传入图片路径,调用的是YDM_DecodeByPath,
而我们这里可以直接通过请求获取到验证码图片的二进制流,无需存储到本地,所以直接把其中调用的api改成YDM_DecodeByBytes
。
通过以上代码测试,发现直接用selenium+PhantomJS确实会比session耗时多很多:测试过程中(除去验证码识别阶段),session登录整体流程耗时2秒左右,而selenium则需要接近7秒。