My first blog post: I'm a beginner who has just started learning, so please go easy on me.
A while ago I was using a crawler to collect movie download links from a magnet-link download site (target_url). After a period of continuous requests from the same IP address, the site starts returning errors: response.status = 403 Forbidden, showing a Baidu Yunjiasu (Baidu cloud acceleration) robot-detection page. After I fill in the captcha manually in a browser, the browser can access the site normally again, but the crawler still cannot; my guess is that the requests need to carry the headers/cookies that passed the verification. I've searched all over and experimented on my own for several days, but the problem is still there.
If anyone has run into a similar problem, please point me in the right direction. Thanks!
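For reference, this is roughly how the block shows up on my side (a minimal sketch only; the marker string I check for in the 403 page is an assumption, and 【】 stands for the real site, as everywhere else in this post):

import requests

# Hypothetical placeholder for the real site (redacted in this post).
target_url = "http://【】/"

resp = requests.get(target_url, timeout=10)
print(resp.status_code)  # becomes 403 once the robot detection kicks in

# The 403 body is the Baidu Yunjiasu captcha interstitial instead of the real
# page; checking for a marker such as "captcha.su.baidu.com" in the HTML is
# one (assumed) way to tell the two apart.
if resp.status_code == 403 and "captcha.su.baidu.com" in resp.text:
    print("Blocked by the robot-detection / captcha page")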
Below are my clumsy beginner attempts with requests.get.
Special note
When making the requests in the code below, you need to add the specific headers copied from the browser: Referer is the target URL to be crawled, and the 【】 inside the composed final verification link is also that target URL.
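To make that concrete, here is a small illustrative helper (not part of my original script; build_headers is just a name I made up) that fills those headers in from the target URL. It simply mirrors the header dict used in the script below:

def build_headers(target_url, cookie_str):
    """Build the request headers described in the note above.

    Referer is the page being crawled; Host stays on the Baidu captcha
    domain because that is where the session/image requests go.
    """
    return {
        "Cookie": cookie_str,  # the cookie string copied from the browser
        "Host": "captcha.su.baidu.com",
        "Referer": target_url,  # the 【】 placeholder in this post
        "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/69.0.3497.100 Safari/537.36"),
    }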
import json
import os
import re

import requests

# Headers copied from the browser; Referer (and the 【】 placeholders below)
# should be the target site being crawled.
header = {
    'Cookie': 'BAIDUID=961EB1A4B665388CE6EA47C30C2E0725:FG=1; BIDUPSID=961EB1A4B665388CE6EA47C30C2E0725; PSTM=1561956798; BDUSS=5mek5LNmlkQXRrUWp-REdadTJCS1lNdXptcFlqSVFyMnVmfnBVR1VDUUxSVUZkSVFBQUFBJCQAAAAAAAAAAAEAAADdZhgDemhvdWNobGN5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAu4GV0LuBldan; MCITY=-131%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1423_21117_29522_29518_28518_29099_29568_28835_29220_26350',
    'Host': 'captcha.su.baidu.com',
    'Referer': '【】',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

# Step 1: request the blocked target page and pull the "s" value and the
# Event ID out of the Baidu Yunjiasu interstitial HTML.
url_tg = r"【】"
rq = requests.get(url=url_tg)
page = rq.content.decode("utf8")
s = re.findall(r'name="s" value="(.*)"', page)[0]
id = re.findall(r"Event ID: (.*)", page)[0]
print("s:" + s)
print("id_s:" + id)

# Step 2: request a captcha session from captcha.su.baidu.com; the response
# is a JSONP callback, so strip the callback(...) wrapper before parsing.
url_pub = r"https://captcha.su.baidu.com/session_cb/?" \
          r"pub=377e4907e1a3b419708dbd00df9e8f79&callback=callback"
pub = re.findall(r"pub=([a-z0-9]*)", url_pub)[0]
print("pub:" + pub)
rq = requests.get(url=url_pub, headers=header)
dt = json.loads(rq.text.lstrip('callback(').rstrip(')'))
print("session:" + dt["sessionstr"])

# Step 3: download the captcha image for this session so it can be read and
# typed in by hand.
url_img = "https://captcha.su.baidu.com/image/?session=%s&pub=377e4907e1a3b419708dbd00df9e8f79" % dt['sessionstr']
rq = requests.get(url=url_img, headers=header)
with open("./temppage.jpg", "wb") as f:
    f.write(rq.content)
pic_path = os.getcwd() + r"\temppage.jpg"
print(pic_path)
# os.system(pic_path)

# Step 4: compose the verification URL; the captcha typed in manually gets
# appended to manual_captcha_challenge_field in the second script.
url_comeback = 'http://【】/cdn-cgi/l/chk_captcha?s=%s&id=%s&captcha_challenge_field=%s&manual_captcha_challenge_field=' % (s, id, dt["sessionstr"])
with open("./templic.txt", "w") as f:
    f.write(url_comeback)
print(url_comeback)
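Since os.system(pic_path) is commented out, I open the saved image by hand to read the captcha. A small optional sketch with Pillow (assuming the Pillow package is installed; this is not part of my original script) would do the same thing:

from PIL import Image  # assumes the Pillow package is installed

# Show the captcha image saved by the script above so it can be read and
# typed into the prompt of the second script.
Image.open("./temppage.jpg").show()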
import requests

# Read back the verification URL written by the first script and append the
# captcha read from temppage.jpg to manual_captcha_challenge_field.
url_finaltest = open("./templic.txt", "r").readline().strip()
test_code = input("Captcha: ")
url_finaltest = url_finaltest + test_code

# Headers copied from the browser, as in the first script.
header = {
    'Host': 'captcha.su.baidu.com',
    'Referer': '【】',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

# Cookies copied from the browser session that already passed the check.
cookiestr = "__cfduid=d065c414b8038a7e37a08c19ec38dc5ba1561956739; UM_distinctid=16bd43d3ae5ef-0fddc885546556-43450521-1fa400-16bd43d3ae6622; pgv_pvi=1668221952; _ga=GA1.2.1107443465.1562633913; Hm_lvt_6121c137910457c550f533490d449957=1562635173,1562906808,1564105223; _gid=GA1.2.1468546746.1564386430; __atuvc=24%7C28%2C10%7C29%2C5%7C30%2C2%7C31; CNZZDATA1261857817=90924701-1562633067-%7C1564472807; CNZZDATA1277219225=1241614942-1562630884-%7C1564472424; Hm_lvt_af921870f8ea1d356f9d403ce4edfc92=1564103796,1564386429,1564454948,1564476300; Hm_lvt_f75b813e9c1ef4fb27eaa613c9f307b2=1564103797,1564386429,1564454948,1564476300"
cookie = {}
for item in cookiestr.split(";"):
    k, v = item.strip().split("=", 1)
    cookie[k] = v
print(cookie)

# Submit the captcha, then retry the target page with whatever cookies the
# verification response sets.
url_target = r"【】"
ft = requests.get(url=url_finaltest, headers=header, cookies=cookie)
rq1 = requests.get(url=url_target, cookies=ft.cookies)
print(rq1.content.decode("utf8"))
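One variant I have also been trying (no luck so far, so treat it only as a sketch): use a requests.Session so whatever cookies the verification request sets are carried over to the target request automatically. The 【】 placeholders and header values are the same assumptions as above.

import requests

session = requests.Session()
session.headers.update({
    'Referer': '【】',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'),
})

# Submit the captcha-verification URL first; any cookies the server sets are
# kept on the session object.
url_finaltest = open("./templic.txt", "r").readline().strip() + input("Captcha: ")
session.get(url_finaltest)

# Retry the target page on the same session, so those cookies are sent along.
rq1 = session.get(r"【】")
print(rq1.status_code)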