Anti-crawling for login walls (skip the login with saved cookies (recommended), or actually log in)
import requests
# response = requests.get('https://www.zhihu.com/signin?next=%2F')
# print(response.text)  # without a login cookie, the request only returns the login page
Automatic login with requests
Step 1: Log in manually to the page you want to automate.
Step 2: Grab the cookie information from the logged-in site.
Step 3: Add the cookie value to the request headers. In the browser: right-click → Inspect → Network → All → refresh → pick the site's request → Headers → find the cookie.
headers = {
'cookie': 'd_c0="AHDa8IoxPBKPToCDuS1p4bI2uLk5vOJu2bs=|1606057164"; _zap=b7f7057b-231f-4417-bd82-1e5ee7349c05; YD00517437729195%3AWM_TID=4UejNw0uqCtBQEBAFEKAbfKTANRm9zoU; _xsrf=cce6eeb9-4cfb-41f6-a108-375edba9db0d; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1678526066,1678678145,1680318427; captcha_session_v2=2|1:0|10:1680318426|18:captcha_session_v2|88:TEFWc3h4ZHZoNEYwak40OEsxcmI0U3MxcWRSK1pPaEZ4ZzgwUGR6WWoySkdSQWtDYzhsK1BWajlncit6dldpdg==|68c10bd49ea053ffe6bf7aba6da2dfbc2aabb3cb0fbefcb158b207341c79b1a1; SESSIONID=RYT22ewqJszcVZnxfsea8rVfdOodMeG0KJNQXuNWOcl; JOID=VloTCk6T1IyB2FO7AZ5TFr4aC8oVx4nXv-swyH_YofLtggjTfUpAqOjfUbAC1q1qL0D67TLSVEUv56bpl4dd9wQ=; osd=WlwRB0qf0o6M3F-9A5NXGrgYBs4ZwYvau-c2ynLcrfTvjwzfe0hNrOTZU70G2qtoIkT26zDfUEkp5avtm4Ff-gA=; __snaker__id=VVQkMjOwsMhtAFEP; gdxidpyhxdE=fxNNeDaUgZLKiJvLhQPjd2ElJ49vMZVArJ%2B1CteiN9TOgpg3ZYb%5Cwwh7zqdXXAV6Q27yfVAXdWENpYsqwqhgRmNpscnIVeI6g%2FVxX4UCYs5mseuz4j14djXGGqa8g54ThaRn6zOpAtslBUbzum57HcsE1De47G%2Bt6z5Z3yoG%5CTyyWKWX%3A1680319328421; YD00517437729195%3AWM_NI=Pjs0MpaDZva3xd134m6ic8adTVfMjmhTmfafR7EbralpMRgJPX%2Fp8CEof6YoI8CVW79paGYg1YINmHCYli%2Fqq8mscA57JEUkBjYJaoHiv3uf4P4dSv2hwY6L4Uk4shvsbWo%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eed5d45da18df8b6e74f85968fa7c55f969b9aacd56ff3aaacd6cd4af6bc8ed4e82af0fea7c3b92a94b6bdd1d07ca39c9697e439bcbcffd4b767f6b7add9e470988aaaaee967ac8ca387e46e92e98b83f16bb0b083b3e6429488e191bb4987b1aad7dc658be88db7d63dfbb0afa8dc6df38787aff0448187a48aae7bb3e8a394e24bf7e986a6d86182b697abe4658cb7bad4f86ef890fc85d27ef2b8b9bab548f88ba5d5f264b6adaeb8ee37e2a3; captcha_ticket_v2=2|1:0|10:1680318689|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfb2puTXRUdEtSRHNLcjFvUkYwVFFxak9vbUpuREZhdGN2MHBGY2NDNmEyYzdoTUY3bVlQQlJaOXpuU0NQMl90aTRKaFRQeVlFUmtXZ01JMUxJME9DMWs2NEFpYi1GeXZ3Qi10OS42ZlVtMWkwRXJLaVpXTWlWdXFxaDZramloODl4TlpJY1VISnpwckZwb1hEZTBGelQ0Z0J1ZGY1MEJnYjJzRnpLUVlLXzZmVXA1THFCMUFJRTVpaENFR3gtQXFDczZLb3FNWmx1LmNfbVhWU3ZXc29SSlA2aFgubklFY3Atd0hsdVlySVFFV29oeVdVcmtYbjI4akRKYWFrZ1RNYy1FUExLbU5zcFZsN2xnSUNuS0pWVkRZS1BJay5lYTRIcjZ3NFVFU29LLWU3WXZjTHUyRmNfd2Y3cTdJNzJNTW01aDZSYWRoTTVmWkRKeXNsc2RYc19rQ01Ea1REcFNObUN1aXdwRDdGdnoubDlDSGF6WmouT1lzV21XZk5aUGwxUk9UVHp2Li15bjBPTmRRSVdRYTZPRlBvQmo4eHBfUGJvVnJELUI1bVp3VUo2SmdSZWNDdm5UaUlBUVlUdkFpSGJMMTlkTk12aFk1N05MY2FyNVYtVERwT2xOcjB6TUJhZjRkc3RwYXFrN0JCUEhGUENfVHNDMjRnVkdxMyJ9|ca3de6f93e03e618ea5222e012285d37ca10f775b183e2718a49a958d8e56aac; z_c0=2|1:0|10:1680318710|4:z_c0|92:Mi4xVkI2SUVnQUFBQUFBY05yd2lqRThFaVlBQUFCZ0FsVk45dTRVWlFEbmR4ZXlVVnh3dzdwUldKM2JsWXdwRXpBcWpR|3992be9cf2ca57d44573a0228dc6c51302285c86681dc8fb3013d4d6acd2464b; q_c1=1236f84b745649768e8d92502e7ca140|1680318710000|1680318710000; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1680318713; tst=r; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1680318718|1680318425',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
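As a side note, requests can also receive cookies through its cookies= parameter instead of a raw header string; a minimal sketch, reusing the headers dict above:
# Split the raw cookie header into name/value pairs and pass them separately
cookie_dict = dict(pair.split('=', 1) for pair in headers['cookie'].split('; '))
response = requests.get('https://www.zhihu.com/',
                        headers={'user-agent': headers['user-agent']},
                        cookies=cookie_dict)
print(response.status_code)  # 200 if the cookies are still valid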
Automatic login with selenium
from selenium.webdriver import Chrome
# 1. Create a browser and open the page that needs automatic login
b = Chrome()
b.get('https://www.taobao.com')
# 2. Leave enough time to finish the login manually (the window b points to must show the logged-in state)
input('Press Enter once you have finished logging in: ')
# 3. Grab the cookies after a successful login and save them to a local file
result = b.get_cookies()
print(result)
with open('files/taobao.txt', 'w', encoding='utf-8') as f:
    f.write(str(result))
# To reuse cookies, run the harvesting code above first and save the result. Cookies are typically rotated every week or two, after which they must be captured again.
# 1. Create a browser and open the page that needs automatic login
b = Chrome()
b.get('https://www.taobao.com')
# 2. Read the locally saved cookies
with open('files/taobao.txt', encoding='utf-8') as f:
    result = eval(f.read())  # the file holds the str() of a list of cookie dicts
# 3. Add the cookies to the browser session
for i in result:
    b.add_cookie(i)
# 4. Reopen the page - it should now be logged in
b.get('https://www.taobao.com')
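A side note on the file format: str()/eval() works, but eval() executes whatever is in the file. The json module does the same round trip safely, since b.get_cookies() returns a list of dicts (the file name taobao.json here is an assumption; pick any path):
import json
# Save: the cookie list is plain JSON-serializable data
with open('files/taobao.json', 'w', encoding='utf-8') as f:
    json.dump(b.get_cookies(), f)
# Load: json.load replaces the eval() call
with open('files/taobao.json', encoding='utf-8') as f:
    for cookie in json.load(f):
        b.add_cookie(cookie)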
If scraping a site repeatedly gets your IP banned, route requests through a proxy IP.
Proxy IPs can simply be purchased online.
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Configure the proxy IP (requests assumes an HTTP proxy when no scheme is given)
proxies = {'https': 'http://58.52.81.8:4526'}

# Send the request through the proxy
res = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers, proxies=proxies)
print(res.text)
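Purchased proxies die or get banned too, so in practice requests are often retried through a small pool; a minimal sketch with hypothetical addresses:
import random

# Hypothetical pool - real addresses come from the proxy vendor
PROXY_POOL = ['58.52.81.8:4526', '58.52.81.9:4526']

def get_with_proxy(url, headers, retries=3):
    """Retry the request through randomly chosen proxies."""
    for _ in range(retries):
        proxy = random.choice(PROXY_POOL)
        try:
            return requests.get(url, headers=headers,
                                proxies={'http': 'http://' + proxy,
                                         'https': 'http://' + proxy},
                                timeout=10)
        except requests.RequestException:
            continue  # dead or banned proxy - try another one
    raise RuntimeError('all proxies failed')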
from selenium.webdriver import Chrome,ChromeOptions
options = ChromeOptions()
# Set a proxy for the whole browser
options.add_argument('--proxy-server=http://58.52.81.8:4526')
b = Chrome(options=options)
b.get('https://movie.douban.com/top250?start=0&filter=')
input()  # block so the browser window stays open
XPath is a method for parsing web page (HTML) or XML data: it locates tags (elements) by paths.
"""
Python数据:{'name':'小明','age':18,'is_ad':True,'car_no':None}
Json数据:(必须用双引号,true小写,空为null) 效率高
python 中有json模块可导入:import json
{"name":"小明","age":18,"is_ad":true,"car_no":null}
xml数据:(标签名和属性值可以任意设置) 安全性相对要高
小明
18
是
"""
Getting tags
Absolute path: starts with '/', then spells the path out level by level from the root node; a path that starts with '/' but whose next node is not the root returns an empty list.
Relative path: starts with './' (which can be omitted) or '../', where '.' is the current node and '..' is its parent.
Full path: any path that starts with '//'.
Getting tag content: append '/text()' to the tag's path.
Getting a tag attribute: append '/@attribute_name' to the tag's path.
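The code below reads a local file data.html that this section never shows. A minimal, hypothetical version consistent with every path used in the examples (div > a links, span > p tags with id/class attributes, a nested span) can be generated like this:
# Write a hypothetical data.html - its structure is assumed from the XPath examples below
sample = '''<html>
<body>
    <div>
        <a href="https://example.com/1">link 1</a>
        <a href="https://example.com/2">link 2</a>
    </div>
    <span>
        <p id="p1" class="c1">first p</p>
        <p>second p</p>
        <p>third p</p>
        <span class="c1" title="t1">nested span</span>
    </span>
</body>
</html>'''
with open('data.html', 'w', encoding='utf-8') as f:
    f.write(sample)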
# Parsing with XPath requires the third-party library lxml (pip install lxml)
from lxml import etree
html = open('data.html',encoding='utf-8').read()
root = etree.HTML(html)
# root1 = etree.XML()  - whether to use etree.HTML or etree.XML depends on the data
# node.xpath(path) -- returns a list of the node objects (tags) matched by the path
# relative paths give different results depending on which node object .xpath is called on
result = root.xpath('/html/body/div/a')  # absolute path
# Get tag content
result1 = root.xpath('/html/body/div/a/text()')
# Get tag attributes
result2 = root.xpath('/html/body/div/a/@href')  # absolute path
An absolute path works the same no matter which node object xpath is called on:
div = root.xpath('/html/body/div')[0]
r = div.xpath('/html/body/div/a/text()')
print(r == result1) # True
Relative paths and full paths
# 2) Relative paths
result3 = root.xpath('./body/div/a/text()')
result4 = div.xpath('./a/text()')
print(result4 == result3)  # True
# 3) Full paths start with //
res = root.xpath('//a/text()')  # content of every a tag in the document
print(res)
res1 = root.xpath('//div/a/text()')  # content of every a tag inside a div
print(res1)
res2 = div.xpath('//a/text()')  # // searches the whole document even when called on div, so this equals res
print(res2)
Position predicates
"""
[N] - the Nth matching tag (counting from 1)
[last()] - the last matching tag
[position()>N], [position()>=N], [position()<N], [position()<=N] - tags whose position satisfies the comparison
"""
result = root.xpath('//span/p[2]/text()')         # the 2nd p inside span
result = root.xpath('//span/p[last()-1]/text()')  # the second-to-last p inside span
Attribute predicates
"""
[@attribute_name="attribute_value"]
"""
result = root.xpath('//span/p[@id="p1"]/text()')  # the p inside span whose id attribute is "p1"
Wildcards: in XPath, * stands for any tag or any attribute.
result = root.xpath('//span/*/text()')  # content of every tag directly inside span
print(result)
result = root.xpath('//span/*[@class="c1"]/text()')  # content of every tag inside span whose class is c1
print(result)
result = root.xpath('//span/span/@*')  # every attribute value of span tags nested in a span
print(result)
result = root.xpath('//*[@class="c1"]/text()')
print(result)  # content of every tag whose class is c1
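Putting the pieces together: a sketch that fetches the Douban Top250 page with requests (UA header from earlier) and extracts movie titles with XPath. The path //span[@class="title"] is an assumption about the page's markup and may not match the live site:
import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
res = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
root = etree.HTML(res.text)
# the class name "title" is assumed - adjust after inspecting the real page
titles = root.xpath('//span[@class="title"]/text()')
print(titles)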