HttpConnectionPool:
原因:
1.短时间内发起了高频的请求导致ip被禁
2.http连接池中的连接资源被耗尽
解决:
1.使用代理
2.headers中加入Connection:"close"
代理服务器
可以接收请求,然后将其转发。
匿名度
类型
免费代理:
import requests

# Fetch Baidu's "what is my ip" result page through a forward proxy, so the
# target site sees the proxy's address instead of ours (helps avoid an IP ban).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd=ip'
response = requests.get(url=url, headers=headers,
                        proxies={'http': '119.120.6.141:9999'})
page_text = response.text
# Save the page so we can open it in a browser and verify the reported IP.
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Proxy pool: a list of proxy mappings; a random one is used per request so
# traffic is spread over several exit IPs.
# Fix: the original called random.choice without importing random (NameError).
import random

proxy_list = [
    {'http': '119.120.6.141:9999'},
    {'http': '36.249.52.16:9999'},
    {'https': '123.149.141.204:9999'},
]
url = 'https://www.baidu.com/s?wd=ip'
# Rotate the exit IP on each request to reduce the chance of a ban.
page_text = requests.get(url=url, headers=headers,
                         proxies=random.choice(proxy_list)).text
"""
获取快代理的免费代理ip
https://www.kuaidaili.com/free/
会封你的IP
"""
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
proxy_list_http = []
proxy_list_https = []
url = 'https://www.kuaidaili.com/free/inha/%d/'
for page in range(1, 20):
new_url = format(url % page)
page_text = requests.get(url=new_url, headers=headers).text
tree = etree.HTML(page_text)
tr_list = tree.xpath('//*[@id="list"]/table/tbody/tr')
for tr in tr_list:
ip = tr.xpath('./td[1]/text()')[0]
port = tr.xpath('./td[2]/text()')[0]
t_type = tr.xpath('./td[4]/text()')[0]
ips = ip+':'+port
dic = {
t_type: ips
}
if t_type.upper() == 'http'.upper():
proxy_list_http.append(dic)
else:
proxy_list_https.append(dic)
print(len(proxy_list_http), len(proxy_list_https))
# print(proxy_list_http)
# print(proxy_list_https)
http://http.zhiliandaili.cn
会给你生成一个url,你去访问这个页面,里面就是代理。
之后用requests去获取,就可以用作代理池了。
"""
获取快代理的免费代理ip
https://www.kuaidaili.com/free/
会封你的IP
"""
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
proxy_list_http = []
proxy_list_https = []
url = 'https://www.kuaidaili.com/free/inha/%d/'
for page in range(1, 200): # 会封你的IP,由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败
new_url = format(url % page)
page_text = requests.get(url=new_url, headers=headers, proxies={
'https':'购买的代理'}).text
tree = etree.HTML(page_text)
tr_list = tree.xpath('//*[@id="list"]/table/tbody/tr')
for tr in tr_list:
ip = tr.xpath('./td[1]/text()')[0]
port = tr.xpath('./td[2]/text()')[0]
t_type = tr.xpath('./td[4]/text()')[0]
ips = ip+':'+port
dic = {
t_type: ips
}
if t_type.upper() == 'http'.upper():
proxy_list_http.append(dic)
else:
proxy_list_https.append(dic)
print(len(proxy_list_http), len(proxy_list_https))
# print(proxy_list_http)
# print(proxy_list_https)
# 检测
for ip in proxy_list_http:
response = requests.get('https://sogou.com', headers=headers, proxies=ip)
if response.status_code == '200':
print('检测到此代理可用:', ip)
"""
对雪球网中的新闻数据进行获取
https://www.xueqiu.com
"""
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=146971&size=15'
page_text = requests.get(url=url, headers=headers).json()
print(page_text) # 遇到错误,请刷新页面或者重新登录帐号后再试 有Cookie
"""
对雪球网中的新闻数据进行获取
https://www.xueqiu.com
"""
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'Cookie':'acw_tc=2760820516087020614293643e9e23b0db0e4017f6fd180075dcf7cc4c549d; xq_a_token=ad26f3f7a7733dcd164fe15801383e62b6033003; xqat=ad26f3f7a7733dcd164fe15801383e62b6033003; xq_r_token=15b43888685621c645835bfe2d97242dc20b9005; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTYxMTI4MzA4NCwiY3RtIjoxNjA4NzAyMDMwNDkzLCJjaWQiOiJkOWQwbjRBWnVwIn0.LqWFRqQkTyRAw-jS8sozPTtHeq321XUxbYwYEgF9el-ee98NtjMmxfULBprqRxIII9ohWja1XLJFNP4VHwM3RHMRusMPeyxl_T8M14Y4loJdf6uOEs_-hQYBwl2juxr3CX4KKcJW3qztzKS4HwEkG3T93UW0UzAcLVXwFxNy8AMFnEp5zHS1jl0Vx1UjVmpuT_wea76F9y4_F_0VutQPbaSVIqJsxZFtfBNpjkhAZ6I4x_yc_xyZvWWuizRzNPkb8IfzfqFFqMMKxIJQAyFqE06gQHmBD-vjWHuxM1GRJxDP5YOVhPzWJ1q_cQEXegEAUU22Kq602kjWw9BEc5IufQ; u=191608702061437; device_id=24700f9f1986800ab4fcc880530dd0ed; Hm_lvt_1db88642e346389874251b5a1eded6e3=1608702065; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1608702065'
}
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=146971&size=15'
page_text = requests.get(url=url, headers=headers).json()
print(page_text) # 带上Cookie 可以拿到
"""
对雪球网中的新闻数据进行获取
https://www.xueqiu.com
"""
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}
# 创建session对象
session = requests.Session()
# 在第一次访问这个网站时它会给你注入session
session.get('https://www.xueqiu.com',headers=headers) # 先对首页访问,加载cookie
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=146971&size=15'
page_text = session.get(url=url, headers=headers).json()
print(page_text) # 自动加Cookie 可以拿到
验证码识别
有些网站必须登录之后才能访问其他页面,往往会遇到不同形式的验证码。所以要处理验证码的问题。
打码平台
超级鹰
http://www.chaojiying.com/
注册:用户中心身份
登录:
创建一个软件:911057
下载示例代码:开发文档—>python—>点击下载。下载的文件解压到你的项目根目录下,其他文件使用的话直接导入即可。
初次微信关注超级鹰公众号赠送1000题分
打码兔
云打码
"""
识别古诗文网中的验证码
https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx
"""
import requests
from lxml import etree
from chaojiying_Python.chaojiying import Chaojiying_Client
def transform_imag_data(path, t_type):
chaojiying = Chaojiying_Client('用户名', '密码', '911057') # 用户中心>>软件ID 生成一个替换 96001
im = open(path, 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
return chaojiying.PostPic(im, t_type)['pic_str']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}
# 2.考虑session
# 可能会产生session的请求都用session发起
session = requests.Session()
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
page_text = session.get(url, headers=headers).text
tree = etree.HTML(page_text)
img_url = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = session.get(img_url, headers=headers).content
with open('code.jpg', 'wb') as fp:
fp.write(img_data)
# 获取动态变化的请求值
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
code_text = transform_imag_data('code.jpg', '1004')
print('验证码:', code_text) # 验证码内容
# 模拟登录
data = {
'__VIEWSTATE': __VIEWSTATE,
'__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
'from': 'http://so.gushiwen.cn/user/collect.aspx',
'email': '用户邮箱',
'pwd': '密码',
'code': code_text,
'denglu': '登录',
}
# 考虑1:动态变化的请求参数
# 通常会隐藏在当前对应的前端页面的代码中
login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
page_index = session.post(url=login_url, headers=headers, data=data).text
with open('index.html', 'w', encoding='utf-8') as fp:
fp.write(page_index)