云打码地址:
爬取的目标网站,是一个古诗文网站
https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx
云打码是一款打码平台,
云打码使用流程:
login 里面的 email 和 pwd 是登录的账号和密码,这里可以填写自己的用户账号来进行爬取。注意:每爬取一次就会扣除一定的题分,扣除多少都有一定的规定:http://www.yundama.com/price.html
这是题分价格表的链接,可以去了解一下!对了,还有一点:在登录的时候一定要创建会话(Session),否则代码写得再 No Problem 也是白搭,一定要携带 cookie 去访问。
# -*- coding: utf-8 -*-
#@time :2020/5/18 13:39
#Author :Song
#@file 古诗文网验证码识别.py
#@Software: PyCharm
#下载验证码图片到本地
import requests
from fake_useragent import UserAgent
from lxml import etree
from webspider.day18.verification_code import get_code
def indexHTML(s, timeout=10):
    """Fetch the gushiwen login page and return its HTML text.

    The request goes through the caller's session so that any cookies set
    by the login page are kept for the later captcha and login requests.

    Args:
        s: Session object used to issue the GET request (main() passes a
            ``requests.Session``).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request could block forever on a stalled
            connection.

    Returns:
        The response body decoded as text.
    """
    url = "https://so.gushiwen.org/user/login.aspx?from="
    r = s.get(
        url=url,
        headers={"User-Agent": UserAgent().chrome},
        timeout=timeout,
    )
    return r.text
def download_image(html, s, timeout=10):
    """Download the login captcha, recognise it, and extract the form tokens.

    The captcha image is saved to ``yzm.png`` in the current working
    directory and then handed to ``get_code`` for recognition. The
    recognised code is also printed.

    Args:
        html: HTML text of the login page.
        s: Session object used to fetch the captcha image; it should be
            the same session that fetched the login page so the captcha
            matches the session cookie.
        timeout: Seconds to wait for the captcha image request.

    Returns:
        A ``(code, viewstate, viewstategenerator)`` tuple: the recognised
        captcha text and the values of the two hidden inputs read from
        the ``aspnetForm`` form.

    Raises:
        IndexError: If the captcha image or either hidden input cannot be
            found in ``html``.
    """
    tree = etree.HTML(html)

    # The src attribute is appended directly to the host, so it is
    # assumed to be a root-relative path (starting with "/").
    image_src = tree.xpath('//*[@id="imgCode"]/@src')[0]
    image_url = "https://so.gushiwen.org" + image_src

    r = s.get(
        url=image_url,
        headers={"User-Agent": UserAgent().random},
        timeout=timeout,
    )
    with open("yzm.png", "wb") as fp:
        fp.write(r.content)

    # NOTE(review): 1004 is presumably the captcha-type code expected by
    # the recognition service behind get_code -- confirm against that helper.
    code = get_code("yzm.png", 1004)
    print(code)

    # Hidden form fields that must be echoed back in the login POST.
    viewstate = tree.xpath('//*[@id="aspnetForm"]/div[1]/input/@value')[0]
    viewstategenerator = tree.xpath(
        '//*[@id="aspnetForm"]/div[2]/input/@value'
    )[0]
    return code, viewstate, viewstategenerator
def login(code, viewstate, viewstategenerator, s, timeout=10):
    """Submit the login form and save the response page to ``gs.html``.

    Args:
        code: Recognised captcha text.
        viewstate: Value for the ``__VIEWSTATE`` form field.
        viewstategenerator: Value for the ``__VIEWSTATEGENERATOR`` form
            field.
        s: Session object used to send the POST; it should be the session
            that fetched the login page and captcha so cookies are sent.
        timeout: Seconds to wait for the login request.

    Returns:
        None. The response body is written to ``gs.html`` (UTF-8) in the
        current working directory.
    """
    post_url = "https://so.gushiwen.org/user/login.aspx?from="
    formdata = {
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstategenerator,
        "from": "",
        # Placeholders: replace with the real account e-mail and password.
        "email": "**********",
        "pwd": "**********",
        "wasd": "",
        "code": code,
        "denglu": "登录",
    }
    r = s.post(
        url=post_url,
        headers={"User-Agent": UserAgent().chrome},
        data=formdata,
        timeout=timeout,
    )
    with open("gs.html", "w", encoding="utf8") as fp:
        fp.write(r.text)
def main():
    """Run the whole flow: fetch login page, solve captcha, log in.

    All three steps share one session so the cookies issued with the
    login page accompany the captcha download and the login POST.
    """
    # The context manager closes the session (and its pooled connections)
    # even if one of the steps raises.
    with requests.Session() as s:
        # Login page as seen before signing in; needed to locate the captcha.
        html = indexHTML(s)
        # Download and recognise the captcha, and collect the hidden fields.
        code, viewstate, viewstategenerator = download_image(html, s)
        # Submit the credentials together with the captcha.
        login(code, viewstate, viewstategenerator, s)
# Run the login flow only when this file is executed as a script.
if __name__ == "__main__":
    main()