Table of Contents
1. Background
Proxy IP pools
2. UA (User-Agent) pool
2.1 Preparing the resources
2.2 Generating the headers
2.3 Making the request
3. Proxy IP pool
3.1 Scraping proxy IPs
3.2 Testing proxy IP availability
3.3 Storing the data
4. Using the proxy IP pool
Done!
When scraping, your IP may get banned! How can we solve this problem?
Given the topic of this article, I'd like to first show you how a UA (User-Agent) pool is implemented!
First, prepare a list holding multiple UA strings:
# User-Agent pool
agent = [
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
]
Now let's build the request headers:
Note:
Since this code was written as part of building my proxy IP pool (I use a Utils class to generate the headers), I'll paste my source code directly.
The getHeaders function builds the request headers.
class Utils(object):
    def getHeaders(self):
        """
        Build the request headers
        """
        headers = {
            'User-Agent': random.sample(agent, 1)[0]
        }
        return headers
    ...
How it works:
Each request randomly picks one UA string from the list!
utils = Utils()
request = requests.get(url, headers=utils.getHeaders(), params = prms)
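As a side note, random.choice does the same job as random.sample(agent, 1)[0] in a single call; a tiny equivalent sketch (get_random_ua is just an illustrative name):

import random

def get_random_ua():
    # Equivalent to random.sample(agent, 1)[0]: pick one UA string at random
    return random.choice(agent)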
Now you understand the basic idea of a UA pool! Requests through a proxy IP pool work in much the same way. UA strings can be copied straight from a browser or other sources, but how do we get proxy IPs? Just search for "免费代理IP" (free proxy IP) on Baidu, pick one of those sites, and scrape the IPs from it!
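For context, this is the shape of the proxies argument that requests expects; the address below is only a placeholder, not a real proxy:

import requests

# Placeholder proxy address for illustration
proxies = {
    'http': 'http://1.2.3.4:8080',
    'https': 'http://1.2.3.4:8080',
}
resp = requests.get('https://example.com', proxies=proxies, timeout=5)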
Note:
Here I'm using Kuaidaili (快代理).
Note:
The fields we need to scrape are the IP, PORT, type, and response time!
Since this article is dedicated to building the proxy pool, I won't walk through how to scrape the data (the proxy IPs) here; I'll just paste my code:
class Spider(object):
    """
    Data-scraping class
    """
    def __init__(self, url):
        self.url = url

    def send_request(self, prms = {}, url = '', page = 1):
        """
        Send the request
        :prms: query parameters
        :url: request URL
        """
        if not url:
            url = self.url
        url = url + str(page)
        print(url)
        utils = Utils()
        request = requests.get(url, headers=utils.getHeaders(), params=prms)
        return request.text

    def parse_request(self, text):
        """
        Parse the HTML text
        :text: the page source
        """
        soup = BeautifulSoup(text, 'lxml')
        ip_list = soup.find('tbody').find_all('tr')
        data_list = []
        for data in ip_list:
            # The data-title values ("IP", "PORT", "类型", "响应速度") match the
            # column titles in Kuaidaili's table, so keep them as-is
            data_list.append({
                'ip': data.find('td', attrs={"data-title": "IP"}).get_text(),
                'port': data.find('td', attrs={"data-title": "PORT"}).get_text(),
                'request_type': data.find('td', attrs={"data-title": "类型"}).get_text(),
                'time': re.search(r"(\d+\.?\d*)", data.find('td', attrs={"data-title": "响应速度"}).get_text()).group(1)
            })
        return data_list
With the scraped IPs, simply use each one to request a website; if the request succeeds => the proxy IP is usable!
class Test(object):
    """
    Validation class
    """
    def check_ip(self, req_type, ip, port):
        """
        Test a proxy IP
        :req_type: protocol type
        :ip: proxy IP
        :port: port number
        """
        utils = Utils()
        proxies = {
            req_type.lower(): req_type.lower() + '://' + ip + ':' + port
        }
        request = requests.get("https://www.baidu.com", headers=utils.getHeaders(), proxies=proxies, timeout=1)
        return request.status_code == 200
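One caveat: when a proxy is dead or too slow, requests raises an exception (for example a timeout or connection error) instead of returning a response, so a call to check_ip can abort the whole run. A minimal safer wrapper, as a sketch of my own (safe_check_ip is not part of the original code):

import requests

def safe_check_ip(test, req_type, ip, port):
    # Return True only when the proxy actually answered with HTTP 200;
    # treat any network error as "not usable"
    try:
        return test.check_ip(req_type, ip, port)
    except requests.exceptions.RequestException:
        return False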
Store the scraped IPs; here I'm using a CSV file!
class Save(object):
    def __init__(self, path):
        self.path = path

    def csv_save(self, ip_list):
        utils = Utils()
        test = Test()
        if test.check_ip(ip_list['request_type'], ip_list['ip'], ip_list['port']):
            with open('ip.csv', mode='a', newline='') as f:
                writer = csv.DictWriter(f, csv_headers)
                writer.writerow(ip_list)
        else:
            print("This IP is not usable!")
At this point the proxy IP pool is complete! For how to use it, see the next section!
Here is my complete code:
import requests
import random
import re
import os
import csv
import time
from bs4 import BeautifulSoup

# Kuaidaili free proxy list
url = 'https://www.kuaidaili.com/free/inha/'

# User-Agent pool
agent = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
]

# Request query parameters (empty here)
params = {}

# CSV header
csv_headers = ['ip', 'port', 'request_type', 'time']

# Utility class
class Utils(object):
    def getHeaders(self):
        """
        Build the request headers
        """
        headers = {
            'User-Agent': random.sample(agent, 1)[0]
        }
        return headers

    def get_path(self):
        """
        Get the current working directory
        """
        return os.getcwd()

# Data-scraping class
class Spider(object):
    """
    Data-scraping class
    """
    def __init__(self, url):
        self.url = url

    def send_request(self, prms = {}, url = '', page = 1):
        """
        Send the request
        :prms: query parameters
        :url: request URL
        """
        if not url:
            url = self.url
        url = url + str(page)
        print(url)
        utils = Utils()
        request = requests.get(url, headers=utils.getHeaders(), params=prms)
        return request.text

    def parse_request(self, text):
        """
        Parse the HTML text
        :text: the page source
        """
        soup = BeautifulSoup(text, 'lxml')
        ip_list = soup.find('tbody').find_all('tr')
        data_list = []
        for data in ip_list:
            # The data-title values match the column titles on Kuaidaili's page
            data_list.append({
                'ip': data.find('td', attrs={"data-title": "IP"}).get_text(),
                'port': data.find('td', attrs={"data-title": "PORT"}).get_text(),
                'request_type': data.find('td', attrs={"data-title": "类型"}).get_text(),
                'time': re.search(r"(\d+\.?\d*)", data.find('td', attrs={"data-title": "响应速度"}).get_text()).group(1)
            })
        return data_list

# Test class
class Test(object):
    """
    Validation class
    """
    def check_ip(self, req_type, ip, port):
        """
        Test a proxy IP
        :req_type: protocol type
        :ip: proxy IP
        :port: port number
        """
        utils = Utils()
        proxies = {
            req_type.lower(): req_type.lower() + '://' + ip + ':' + port
        }
        request = requests.get("https://www.baidu.com", headers=utils.getHeaders(), proxies=proxies, timeout=1)
        return request.status_code == 200

# Data storage class
class Save(object):
    def __init__(self, path):
        self.path = path

    def csv_save(self, ip_list):
        utils = Utils()
        test = Test()
        if test.check_ip(ip_list['request_type'], ip_list['ip'], ip_list['port']):
            with open('ip.csv', mode='a', newline='') as f:
                writer = csv.DictWriter(f, csv_headers)
                writer.writerow(ip_list)
        else:
            print("This IP is not usable!")

# Entry point (save the usable proxy IPs)
if __name__ == '__main__':
    spider = Spider(url)
    utils = Utils()
    save = Save(utils.get_path())
    # Write the CSV header
    with open('ip.csv', mode='a', newline='') as f:
        writer = csv.DictWriter(f, csv_headers)
        writer.writeheader()
    # The second argument (0) is the url parameter; it is falsy, so send_request
    # falls back to self.url, and page defaults to 1
    page_list = spider.parse_request(spider.send_request(params, 0))
    for ip in page_list:
        save.csv_save(ip)
        time.sleep(1)
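One thing to watch: the entry code opens ip.csv in append mode and calls writeheader() on every run, so repeated runs leave duplicate header rows in the file. A small guard I would add (my sketch, reusing the csv_headers list from the code above) writes the header only when the file does not yet exist:

import csv
import os

# Write the CSV header only once, when ip.csv is first created
if not os.path.exists('ip.csv'):
    with open('ip.csv', mode='w', newline='') as f:
        csv.DictWriter(f, csv_headers).writeheader()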
Here I mainly use the get_proxies function to load the proxy IPs from the CSV file, and the get_proxies_agent function to build the proxies dict. The two functions look like this:
def get_proxies(self):
    """
    Read the csv file to load the proxy IPs
    """
    with open('ip.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            proxies_agent.append(row)  # proxies_agent is a global list
    print("Proxy IPs loaded successfully!")

def get_proxies_agent(self):
    """
    Build the proxies dict for one request, e.g.
    {
        'http': 'http://ip:port'
    }
    """
    tmp = random.sample(proxies_agent, 1)[0]
    proxies = {
        tmp[2].lower(): tmp[2].lower() + '://' + str(tmp[0]) + ':' + str(tmp[1])
    }
    print('Proxy IP used for this request: ' + proxies[tmp[2].lower()])
    return proxies
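One thing to note: ip.csv starts with a header row, and csv.reader yields that row too, so the header can end up inside proxies_agent and later be picked as a "proxy". A sketch of a variant that skips it via DictReader (get_proxies_skip_header is just an illustrative name, assuming the same column names as csv_headers above):

import csv

def get_proxies_skip_header(path='ip.csv'):
    # DictReader consumes the header row automatically
    rows = []
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            rows.append([row['ip'], row['port'], row['request_type'], row['time']])
    return rows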
Using a proxy IP in a request:
utils = Utils()
request = requests.get(url, headers=utils.get_headers(), proxies = utils.get_proxies_agent(), params = prms)
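Free proxies die all the time, so a single request through one of them can easily fail. A minimal retry sketch, assuming the get_headers and get_proxies_agent helpers from this section live on the Utils class (the retry loop itself is my addition, not the original code):

import requests

def fetch_with_proxy(url, prms=None, max_retries=3):
    # Try the request with a random proxy; on failure, pick a new proxy and retry
    utils = Utils()
    for attempt in range(max_retries):
        try:
            return requests.get(url,
                                headers=utils.get_headers(),
                                proxies=utils.get_proxies_agent(),
                                params=prms,
                                timeout=5)
        except requests.exceptions.RequestException as e:
            print('Proxy failed ({}), retrying...'.format(e))
    return None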