常用代理有:
1、购买的动态IP隧道:比如阿布云动态隧道。这种方式不会把具体的代理IP返回给我们,而是由隧道服务器代为转发请求,并直接返回目标站点的响应;
2、私密代理IP:通过接口可以拿到具体的代理IP(有时效限制),我们用拿到的代理IP构造代理池,再通过池中的代理发起请求;
3、自己通过抓取免费代理IP,构造自己的IP代理池,有兴趣请移步:https://blog.csdn.net/Owen_goodman/article/details/100074822
下面分三种使用场景介绍代理的用法:
1、requests脚本:get/post请求
2、scrapy:get/post 请求,在中间件添加代理
3、自动化脚本:selenium+webdriver+代理
一、在requests库中代理的使用
# Example script: send GET/POST requests with `requests` through the Abuyun
# dynamic proxy tunnel.
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}

# Abuyun tunnel server.
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"

# Tunnel credentials.
proxyUser = "****"  # your Abuyun license
proxyPass = "****"  # your Abuyun secret key

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Use the same tunnel URL for both http and https targets.
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

time.sleep(0.1)

# requests rejects URLs without a scheme (MissingSchema), so spell it out.
url = 'http://baidu.com'

# GET request through the tunnel.
response = requests.get(url=url, proxies=proxies, headers=headers)

# POST requests are sent the same way.
data = {}
response = requests.post(url=url, proxies=proxies, headers=headers, data=data)
class dandd():
    """Demo: fetch proxies from the Qingting API and send requests through one."""

    def spider(self):
        """Fetch the proxy list, pick one proxy at random and request a page.

        The API response is printed and stored in ``./ip.txt``; ``self.count``
        is set to the number of proxies found.

        Raises:
            RuntimeError: if the API response contains no non-blank line.
        """
        # Python 3 example; other languages work the same way, they only
        # need to send the HTTP GET request below.
        import random

        import requests

        # API address produced by the "generate" button on the provider page.
        targetUrl = "***"  # your API address
        resp = requests.get(targetUrl)
        print(resp.status_code)
        print(resp.text)

        # Qingting proxy.
        # NOTE(review): the original opened the file with "w+" and read it
        # straight back; "w+" truncates on open, so the list was always empty.
        # Writing the API response first is the presumed intent - confirm.
        with open("./ip.txt", "w+") as f:
            f.write(resp.text)
            f.seek(0)
            iplist = f.readlines()

        # Skip blank lines instead of assuming every other line is blank, so
        # no proxy is dropped whatever line endings the API uses.
        ipList = [line.strip() for line in iplist if line.strip()]
        self.count = len(ipList)
        if not ipList:
            raise RuntimeError("proxy API returned no usable proxy lines")

        ip = random.choice(ipList)
        proxies = {
            "http": 'http://' + ip
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }
        # requests rejects URLs without a scheme (MissingSchema).
        url = 'http://baidu.com'
        res = requests.get(url=url, proxies=proxies, headers=headers)
        res = requests.post(url=url, proxies=proxies, headers=headers)
if __name__ == '__main__':
    # Run the demo spider once when this file is executed as a script.
    demo = dandd()
    demo.spider()
二、在scrapy框架中代理的使用
from requests import Request
def start_requests(self, *args):
    """Yield the initial request, routed through a single proxy.

    NOTE(review): ``ip`` is not defined in this snippet. It is expected to be
    a "host:port" string obtained as in the requests example (e.g. a line
    read from ./ip.txt) - confirm where it comes from in the real spider.
    """
    # Scrapy's Request (not requests.Request) is the class that accepts
    # callback / dont_filter / meta.
    from scrapy import Request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    # Scrapy refuses request URLs that have no scheme.
    url = 'http://baidu.com'
    # meta['proxy'] takes one proxy URL string, not a requests-style dict.
    proxy = 'http://' + ip.replace("\n", "")
    # POST requests are configured the same way.
    request = Request(
        url,
        callback=self.parse,
        dont_filter=True,
        headers=headers,
        meta={'proxy': proxy},
    )
    yield request
# 这里是阿布云代理
# 代理服务器
import base64
import random
import re
import time

import requests
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
# Abuyun tunnel endpoint.
proxyServer = "http://http-dyn.abuyun.com:9020"

# Tunnel credentials.
proxyUser = "****"
proxyPass = "****"

# Value for the Proxy-Authorization header. HTTP Basic credentials use the
# standard Base64 alphabet; the URL-safe variant replaces "+" and "/" with
# "-" and "_" and would corrupt credentials whose encoding contains them.
# (Python 2 equivalent: "Basic " + base64.b64encode(proxyUser + ":" + proxyPass))
proxyAuth = "Basic " + base64.b64encode(
    (proxyUser + ":" + proxyPass).encode("ascii")
).decode("ascii")
class ProxyMiddleware(object):
    """Scrapy middleware that routes requests through the Abuyun tunnel.

    The original snippet defined ``process_request`` twice to show two
    variants (proxy every spider / proxy only named spiders). In Python the
    second definition silently replaces the first, so both variants are
    merged into one method selected through ``proxy_spiders``.

    When adding this class to a generated middlewares file, comment out the
    template's own ``process_request``.
    """

    # Names of the spiders that should use the proxy.
    # Set to None to proxy the requests of every spider.
    proxy_spiders = ("name1", "name2", "name3")

    def process_request(self, request, spider):
        """Attach the tunnel address and credentials to ``request``.

        Requests of spiders not listed in ``proxy_spiders`` are left
        untouched. Always returns None.
        """
        if self.proxy_spiders is not None and spider.name not in self.proxy_spiders:
            return None
        request.meta["proxy"] = proxyServer
        request.headers["Proxy-Authorization"] = proxyAuth
        return None
# 这里是蜻蜓代理
# 下面示例是写在中间件里,然后在settings里开启这个中间件
class qingTingMiddleware(object):
    """Scrapy downloader middleware that rotates proxies from the Qingting API.

    Proxies are fetched in batches; a new batch is requested once the current
    one has been handed out completely. Enable the class in the project
    settings to use it.

    The original snippet defined ``process_request`` twice to show two
    variants (proxy every spider / proxy only named spiders). In Python the
    second definition silently replaces the first, so both variants are
    merged into one method selected through ``proxy_spiders``.
    """

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    num = 1  # not used inside this class; kept for backward compatibility
    count = 0  # number of proxies in the current batch
    index = 0  # position of the next proxy to hand out
    now = 0  # time.time() of the last API call (0 = never called)

    # Names of the spiders that should use the proxy.
    # Set to None to proxy the requests of every spider.
    proxy_spiders = ("name1", "name2", "name3")
    # Minimum number of seconds between two calls to the proxy API.
    min_refresh_interval = 6
    # "host:port" entries of the current batch.
    _pool = ()

    def getIP(self):
        """Return the next proxy as an ``http://host:port`` URL.

        A new batch is fetched on the first call and whenever every proxy of
        the current batch has been used.

        Raises:
            RuntimeError: if the API response contains no ``host:port`` entry.
        """
        if self.index >= self.count:
            self._refresh_pool()
        ip = self._pool[self.index]
        self.index += 1
        return 'http://' + ip

    def _refresh_pool(self):
        """Fetch a new batch from the API and reset the rotation state."""
        previous = self.now
        self.now = time.time()
        elapsed = int(self.now - previous)
        if elapsed < self.min_refresh_interval:
            # Wait out the rest of the interval before calling the API again.
            # Note that time.sleep blocks the calling thread.
            time.sleep(self.min_refresh_interval - elapsed)
            self.now = time.time()
        print("重新调用IP")
        getAllIp = "your api"  # placeholder: replace with the proxy API URL
        res = requests.get(url=getAllIp)
        res.encoding = "utf-8"
        # Keep a copy of the raw response on disk, as the original did.
        with open("./ip.txt", "w") as f:
            f.write(res.text)
        pool = re.findall(r'(\d+\.\d+\.\d+\.\d+:\d+)', res.text)
        if not pool:
            raise RuntimeError("proxy API returned no usable host:port entries")
        self._pool = pool
        self.count = len(pool)
        self.index = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Assign the next proxy to ``request`` and return None.

        Called for each request that goes through the downloader middleware.
        Requests of spiders not listed in ``proxy_spiders`` are left
        untouched. Returning None tells Scrapy to continue processing the
        request (the other options are returning a Response or Request
        object, or raising IgnoreRequest).
        """
        if self.proxy_spiders is not None and spider.name not in self.proxy_spiders:
            return None
        request.meta['proxy'] = self.getIP()
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged.

        Called with the response returned from the downloader. Must either
        return a Response object, return a Request object, or raise
        IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Return None so the exception keeps propagating.

        Called when a download handler or a process_request() (from another
        downloader middleware) raises an exception. Must either return None
        (continue processing this exception), a Response object or a Request
        object (both stop the process_exception() chain).

        A retry added on 2020-05-26 (returning ``request`` on TimeoutError)
        was removed again: during an outage it kept retrying for more than
        ten minutes, which wasted time.
        """
        return None

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
三、自动化工具selenium
def parse(self, response):
    """Load a page in Chrome through a randomly chosen proxy and parse its HTML.

    NOTE(review): ``Options`` and ``webdriver`` (selenium) and ``etree``
    (lxml) are not imported in this snippet; the surrounding module has to
    provide them.
    """
    import random

    url = "http://.aspx"  # placeholder target URL
    chrome_options = Options()
    # Sample "host:port" proxies - replace them with live ones. The original
    # left this list commented out, so `proxies` was undefined below.
    proxies = random.choice([
        "116.239.105.250:40049",
        "117.26.88.235:23525",
        "60.182.178.192:30221",
        "123.163.184.232:43565",
        "113.120.62.57:43358",
        "1.199.187.37:41380",
        "117.87.139.65:49842",
        "113.128.26.228:31984",
        "125.117.146.134:48840",
        "113.120.63.82:42216",
    ])
    # Route the browser traffic through the chosen proxy.
    chrome_options.add_argument('--proxy-server=%s' % proxies)
    # Add '--headless' as well to run without a visible browser window.
    chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this flag to work around a bug
    chrome_options.add_argument('--no-sandbox')  # disable the Chrome sandbox
    chrome_options.add_argument("--test-type")
    chrome_options.add_argument(
        'user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        page = driver.page_source
    finally:
        # Always close the browser, otherwise every call leaks a Chrome process.
        driver.quit()
    res = etree.HTML(page)  # parse the HTML string into an lxml element tree