Since I often need to rotate IPs when writing crawlers, I put together a small IP-harvesting toy a while back. It uses Redis to store the IP addresses, validates them with a scoring mechanism, and uses asynchronous IO (asyncio + aiohttp) to fire batches of requests through the IPs at a test URL, checking whether a request carried by each IP succeeds.
The project consists of four .py files.
Demo
The IPs are stored in a list, each entry immediately followed by its score.
The IPs are crawled from http://www.xicidaili.com/wt, a somewhat unstable free proxy site in China.
main.py offers three commands. Entering 'a' starts crawling IPs, runs them through the availability check in check.py (the number of check rounds is configurable), scores them, and stores them in Redis; 'd' clears all IP data from the database; 'c' shows the current scores of the IPs.
crawl.py mainly processes the crawled pages and extracts the IPs we want.
pool.py holds the Redis connection settings, some database operations defined on top of the Python Redis driver, and the scoring logic.
check.py uses a public test site, http://httpbin.org/get, to test whether the current IPs work; it also serves as an example of using asynchronous IO.
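To make the check concrete, here is a minimal synchronous sketch of the same idea using requests instead of aiohttp (the proxy address is a made-up example):

import requests

def test_proxy(proxy='1.2.3.4:8080'):
    # Send a GET through the proxy; httpbin echoes the IP it saw in the "origin" field
    try:
        resp = requests.get('http://httpbin.org/get',
                            proxies={'http': 'http://' + proxy},
                            timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False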
The scoring convention: an IP that responds successfully within the time limit is set to 100 points, while a freshly crawled IP enters the pool with a low initial score (10 in pool.py). For IPs already in the pool, every check in which the IP fails to respond deducts 1 point, and once the score drops to the minimum the IP is removed.
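A minimal sketch of these scoring transitions as Redis sorted-set operations (written with the redis-py 3.x call signatures; the key name 'proxies' and the scores mirror pool.py below, and the proxy value is made up):

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
proxy = '1.2.3.4:8080'
r.zadd('proxies', {proxy: 10})       # a freshly crawled IP enters at the initial score
r.zadd('proxies', {proxy: 100})      # a successful check sets it straight to 100
r.zincrby('proxies', -1, proxy)      # each failed check deducts one point
if (r.zscore('proxies', proxy) or 0) <= 0:
    r.zrem('proxies', proxy)         # dropped once the score hits the minimum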
main.py
# -*- coding=utf-8 -*-
from redissss import check, crawl, pool
import redis
# For now only a single http pool is maintained. Crawling http or https can be
# selected, but the crawled IPs are not yet stored separately in redis or
# checked separately for availability.
if __name__ == '__main__':
    print('begin')
    b = pool.Poll()
    print('IPs in the pool before start: %s' % b.get_ip_len())
    print('IP list: %s' % b.get_ip_all_list())
    while True:
        comd = input('a: start crawling, d: clear everything, c: show scores: ')
        if comd == 'a':
            a = crawl.Getter()
            a.run(type='http', page=1)
            c = check.Check()
            c.check(count=5)
            print('Usable IPs after checking: %s' % b.get_ip_len())
            print('List: %s' % b.get_ip_all_list())
        elif comd == 'd':
            bb = b.r.zrangebyscore('proxies', 0, 10, withscores=True)
            print(bb)
            b.r.flushall()
            print('IPs left in the pool after clearing: %s' % b.get_ip_len())
            print('IP list: %s' % b.get_ip_all_list())
        elif comd == 'c':
            print('IP scores')
            c = b.view_socre()
            print(c)
check.py
# -*- coding=utf-8 -*-
import aiohttp
import asyncio
from redissss import pool

url_text = 'http://httpbin.org/get'

class Check(object):
    def __init__(self):
        self.redis = pool.Poll()

    async def text(self, proxy):
        # Turn SSL verification off
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # Data coming out of redis is bytes, so convert it to a string
        proxy = proxy.decode('utf-8')
        proxy_a = 'http://' + proxy
        # Create a session
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(url_text, proxy=proxy_a, timeout=5) as resp:
                    if resp.status in [200]:
                        print('usable IP --->> %s' % proxy_a)
                        self.redis.max(proxy)
                    else:
                        self.redis.reduce_score(proxy)
                        print('request failed: %s, status %s' % (proxy, resp.status))
            except Exception as e:
                self.redis.reduce_score(proxy)
                print('request failed: %s, %s' % (proxy, e))

    def check(self, count=None):
        # Default to 60 rounds; every round re-checks the whole pool in batches.
        batch_size = 300 if count is None else 200
        if count is None:
            count = 60
        for _ in range(count):
            try:
                proxies = self.redis.get_ip_all_list()
                # Roughly: start (or reuse) an event loop that drives the coroutines
                loop = asyncio.get_event_loop()
                for i in range(0, len(proxies), batch_size):
                    text_proxies = proxies[i:i + batch_size]
                    tasks = [self.text(proxy) for proxy in text_proxies]
                    # Run until every coroutine in this batch has completed
                    loop.run_until_complete(asyncio.wait(tasks))
            except Exception:
                print('error')
            count = count - 1
            print('check count --->%s' % count)
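On Python 3.7+ the same batch check could also be written with asyncio.gather and asyncio.run instead of reusing run_until_complete; a sketch under that assumption (not the original code):

import asyncio

async def check_batch(checker, proxies, batch_size=200):
    # Check the pool in batches; each batch of coroutines runs concurrently
    for i in range(0, len(proxies), batch_size):
        tasks = [checker.text(p) for p in proxies[i:i + batch_size]]
        await asyncio.gather(*tasks, return_exceptions=True)

# Example:
# checker = Check()
# asyncio.run(check_batch(checker, checker.redis.get_ip_all_list()))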
crawl.py
import requests
from bs4 import BeautifulSoup
from redissss import pool
from urllib.parse import urlencode

class Crawl(object):
    def geturl(self, type=None, page=11):
        # By default build both the http and https page lists and return both;
        # pass type to get only the list for that protocol.
        list1 = []
        list2 = []
        http_url = 'http://www.xicidaili.com/wt/'
        https_url = 'http://www.xicidaili.com/wn/'
        if type is None:
            for i in range(1, page + 1):
                list1.append(http_url + str(i))
                list2.append(https_url + str(i))
            return list1, list2
        elif type == 'http':
            for i in range(1, page + 1):
                list1.append(http_url + str(i))
            return list1
        elif type == 'https':
            for i in range(1, page + 1):
                list2.append(https_url + str(i))
            return list2

    def getinfo(self, url):
        # Request one page and yield the IPs on it
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        rsp = requests.get(url=url, headers=head)
        text = rsp.content.decode('utf-8')
        soup = BeautifulSoup(text, 'lxml')
        tags = soup.find_all(class_="odd")
        for i in tags:
            # The extracted strings may contain lots of whitespace and blank lines;
            # .stripped_strings removes the extra whitespace.
            parts = list(i.stripped_strings)
            ip = parts[0]
            port = parts[1]
            yield ':'.join([ip, port])

    def get_proxies(self, type=None, page=11):
        # Combined crawl
        proxies = []
        print('crawl type: %s' % type)
        n = 0
        if type is None:
            aaa = self.geturl(page=page)
            for i in aaa[0]:
                for proxy in self.getinfo(url=i):
                    print('got http IP:', proxy)
                    proxies.append(proxy)
            for ii in aaa[1]:
                for proxy in self.getinfo(url=ii):
                    print('got https IP:', proxy)
                    proxies.append(proxy)
            return proxies
        elif type == 'http':
            aaa = self.geturl(type=type, page=page)
            for i in aaa:
                n = n + 1
                print('crawling http page -->%s' % n)
                for proxy in self.getinfo(url=i):
                    print('got http IP:', proxy)
                    proxies.append(proxy)
            return proxies
        elif type == 'https':
            aaa = self.geturl(type=type, page=page)
            for ii in aaa:
                n = n + 1
                print('crawling https page -->%s' % n)
                for proxy in self.getinfo(url=ii):
                    print('got https IP:', proxy)
                    proxies.append(proxy)
            return proxies

class Getter():
    def __init__(self):
        self.redis = pool.Poll()
        self.crawl = Crawl()

    def run(self, type=None, page=11):
        print('start fetching')
        proxies = self.crawl.get_proxies(type=type, page=page)
        for proxy in proxies:
            self.redis.add_ip(proxy)
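For reference, a tiny illustration of the .stripped_strings extraction that getinfo() relies on (the HTML snippet is a made-up row in the xicidaili table layout):

from bs4 import BeautifulSoup

html = '<table><tr class="odd"><td> 1.2.3.4 </td><td> 8080 </td><td>HTTP</td></tr></table>'
row = BeautifulSoup(html, 'lxml').find(class_='odd')
cells = list(row.stripped_strings)   # whitespace stripped: ['1.2.3.4', '8080', 'HTTP']
print(':'.join(cells[:2]))           # -> 1.2.3.4:8080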
pool.py
# -*- coding=utf-8 -*-
import redis
import random

init_score = 10
max_score = 100
min_score = 0
my_host = '127.0.0.1'
my_port = 6379
sequence = 'proxies'

class PoolNoError(ValueError):
    pass

class Poll(object):
    # Note: zadd/zincrby below use the redis-py 2.x call signatures;
    # redis-py >= 3.0 expects zadd(name, {member: score}) and zincrby(name, amount, member).
    def __init__(self, host=my_host, port=my_port):
        self.r = redis.StrictRedis(host=host, port=port)

    def add_ip(self, proxy, score=init_score):
        # Only add the proxy if it is not already in the sorted set
        if self.r.zscore(sequence, proxy) is None:
            return self.r.zadd(sequence, score, proxy)

    def reduce_score(self, proxy):
        # Deduct one point; once the score is no longer above the minimum, drop the proxy
        score = self.r.zscore(sequence, proxy)
        if score is not None and score > min_score:
            self.r.zincrby(sequence, proxy, amount=-1)
        else:
            self.r.zrem(sequence, proxy)

    def max(self, proxy):
        # A successful check sets the proxy straight to the maximum score
        self.r.zadd(sequence, max_score, proxy)

    def get_ip_len(self):
        return self.r.zcard(sequence)

    def get_ip_all_list(self):
        return self.r.zrangebyscore(sequence, min_score, max_score)

    def choice(self):
        # Prefer a random proxy with the maximum score, otherwise fall back to the top 100
        result = self.r.zrangebyscore(sequence, max_score, max_score)
        if len(result):
            return random.choice(result)
        result = self.r.zrevrange(sequence, 0, 100)
        if len(result):
            return random.choice(result)
        raise PoolNoError

    def exists(self, proxy):
        return self.r.zscore(sequence, proxy) is not None

    def view_socre(self):
        return self.r.zrevrange(sequence, start=0, end=10000, withscores=True)
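Once the pool is populated, a crawler can pull a high-scoring proxy via choice() and pass it to a normal requests call; a usage sketch (the decode is needed because redis returns bytes, and the proxy shown in the comment is just an example):

import requests
from redissss import pool

p = pool.Poll()
proxy = p.choice().decode('utf-8')           # e.g. '1.2.3.4:8080'
resp = requests.get('http://httpbin.org/get',
                    proxies={'http': 'http://' + proxy},
                    timeout=5)
print(resp.json().get('origin'))             # httpbin reports the IP it saw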