I've been learning web scraping recently, and frequent requests kept getting my IP banned. To get around that, I built a small IP pool that supplies proxy IPs to my crawlers.
Environment: Windows 10, PyCharm, Requests
The tools object used in the code is a little utility class I threw together myself; its source is attached at the end.
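The script assumes an ip_pool table already exists in the spider database. The original post never shows the schema, but judging from the insert/delete statements and the duplicate-IP handling in save(), a one-off setup like the following should work (the id and ip column names come from the code; the unique index is my assumption, since save() relies on duplicate inserts raising an error):

# One-off table setup -- the layout below is inferred from the code, not taken from the original post
import MySQLdb

conn = MySQLdb.connect(user='root', password='123456', db='spider',
                       host='localhost', port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS ip_pool (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip VARCHAR(255) NOT NULL UNIQUE  -- UNIQUE makes duplicate proxies fail to insert
    )
''')
conn.commit()
conn.close()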
import re
import requests
import threading
import time
from fake_useragent import UserAgent
from tools import mytools
from lxml import etree
# Create the utility object
tools = mytools.Tools()
# Connect to the database
connection = tools.set_sqlconnect('spider')
conn = connection[0]
cursor = connection[1]
# Scrape IPs from the xicidaili pages
def xici():
    '''
    1. Scrape high-anonymity IPs from the xicidaili listing pages in bulk
    2. Call test() to check whether each IP is usable
    3. Store the usable IPs in the database
    :return:
    '''
    with open('xicilog.txt', 'r') as r:  # record where we stopped -- resume from there on the next run
log = r.readlines()
page_index = int(log[0])
stop_flag = 0
with requests.session() as s:
while 1:
try:
                if stop_flag == 10:  # scrape 10 pages per run
break
                headers = {'User-Agent': ua.random}
                s.headers.update(headers)  # switch to a fresh random User-Agent for every page
                url = 'https://www.xicidaili.com/nn/' + str(page_index)
                resp = s.get(url).text  # use the session so the updated headers are actually sent
html = etree.HTML(resp)
trs = html.xpath('//table[@id="ip_list"]//tr')[1:]
                for i in trs:  # every <tr> holds one proxy
                    # text nodes at indexes 2, 4 and 12 hold the ip, port and protocol
tr = i.xpath('.//text()')
ip = tr[2]
port = tr[4]
type = tr[12].lower()
realip = {type: type + '://' + ip + ':' + port}
print(realip)
                    # check whether the proxy actually works
                    result = test(realip)
                    if result:
                        print('xici proxy works')
                        if save(realip):
                            print('xici proxy saved')
                        else:
                            print('xici proxy not saved')
                    else:
                        print('xici proxy dead')
page_index += 1
stop_flag += 1
with open('xicilog.txt', 'w') as w:
w.write(str(page_index))
except Exception as e:
                print('xici while-loop error', e)
continue
# Helper -- look up our current public IP
def self_ip():
    '''
    The public IP here is dynamic, so look up what it currently is before testing proxies.
    '''
url = 'http://txt.go.sohu.com/ip/soip'
resp = requests.get(url).text
result = re.findall(r'user_ip="(.*?)";sohu', resp)
return result[0]
def test(ip):
    '''
    1. Returns True if the proxy works, False if it does not
    2. The result is returned to the caller (xici / kuai)
    :param ip: a proxy dict scraped by xici()/kuai() to be tested
    :return:
    '''
url = 'http://www.httpbin.org/ip'
my_ip = self_ip()
    try:
        # A proxy that takes more than a few seconds to answer is almost certainly dead, so skip it;
        # some proxies reportedly need about 3 seconds, so the timeout is 4 to avoid killing live ones.
        resp = requests.get(url, proxies=ip, timeout=4).text
except:
return False
if my_ip in resp:
return False
return True
# Scrape IPs from the kuaidaili free-proxy pages
def kuai():
with open('kuailog.txt', 'r') as r:
log = r.readlines()
page_index = int(log[0])
stop_flag = 0
while 1:
try:
if stop_flag == 20:
break
url = 'https://www.kuaidaili.com/free/inha/'+ str(page_index) +'/'
resp = requests.get(url).text
html = etree.HTML(resp)
trs = html.xpath('//table[@class="table table-bordered table-striped"]/tbody//tr')
            for i in trs:  # every <tr> on the current page holds one proxy
                # text nodes at indexes 1, 3 and 7 hold the ip, port and protocol
tr = i.xpath('.//text()')
ip = tr[1]
port = tr[3]
type = tr[7].lower()
realip = {type: type + '://' + ip + ':' + port}
print(realip)
                if test(realip):
                    print('kuaidaili proxy works')
                    if save(realip):
                        print('kuaidaili proxy saved')
                    else:
                        print('kuaidaili proxy not saved')
                else:
                    print('kuaidaili proxy dead')
stop_flag += 1
page_index += 1
with open('kuailog.txt', 'w') as w:
w.write(str(page_index))
except Exception as e:
            print('kuaidaili while-loop error', e)
def save(ip):
    '''
    Store a working proxy in the database
    :param ip:
    :return:
    '''
try:
sql = 'insert into ip_pool(ip) values(%s)'
rows = cursor.execute(sql, [str(ip)])
except:
        print('duplicate ip, skipping save')
return False
if rows:
conn.commit()
return True
return False
def get(n):
    '''
    :param n: how many IPs to take out of the database
    :return:
    '''
result = []
try:
for i in range(n):
try:
                sql = 'select * from ip_pool'  # grab a row from the pool
                cursor.execute(sql)
                row = cursor.fetchone()  # fetchone() advances the cursor, so read id and ip from the same row
                id = row[0]
                ip = row[1]
                print(ip)
                # delete each ip as it is handed out: proxies die quickly, so the same one is never handed out twice
                sql = 'delete from ip_pool where id=%s'
                cursor.execute(sql, [id])
conn.commit()
result.append(eval(ip))
except Exception as e:
                return print('get inner-loop error', e)
        print('took', n, 'IPs out of the pool')
return result
except Exception as e:
        return print('get error', e)
def rush_db():
    '''
    1. Periodically purge dead IPs from the database
    2. Runs every ten minutes
    :return:
    '''
    sql = 'select * from ip_pool'  # fetch every ip currently in the pool
    cursor.execute(sql)
    all = cursor.fetchall()  # a tuple of (id, ip) rows
    for one in all:  # check each stored ip in turn
try:
id = one[0]
ip = one[1]
            if test(eval(ip)):  # keep the ip if it still passes the test
                print('still alive, keeping it')
            else:  # otherwise delete the dead ip by its id
sql = 'delete from ip_pool where id=%s'
rows = cursor.execute(sql, [id])
if rows:
conn.commit()
                    print('removed one dead ip')
        except Exception as e:
            print('rush_db loop error', e)
    print('cleanup finished')
if __name__ == '__main__':
    '''
    Collect new IPs every five minutes
    Purge the pool every ten minutes
    '''
    ua = UserAgent()  # random User-Agent generator
def run():
while True:
            kuai()  # start collecting ips
time.sleep(5 * 60)
def run1():
while True:
            xici()  # start collecting ips
time.sleep(5 * 60)
def run2():
while True:
time.sleep(10 * 60)
rush_db()
    t_kuai = threading.Thread(target=run)    # scrape ips from kuaidaili
    t_xici = threading.Thread(target=run1)   # scrape ips from xici
    t_clean = threading.Thread(target=run2)  # purge dead ips
    t_kuai.start()
    t_xici.start()
    t_clean.start()
    t_clean.join()  # block the main thread so the program doesn't exit while the workers sleep
    print('finished')
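To consume the pool from another crawler, the idea is simply to call get() for a handful of proxies and hand them to requests one at a time. A minimal sketch, assuming the script above is saved as ip_pool.py somewhere importable and using httpbin purely as an example target:

# Consumer sketch -- ip_pool.py is an assumed filename, not part of the original post
import requests
import ip_pool

proxies_list = ip_pool.get(3)           # take three proxies out of the pool
for proxies in proxies_list or []:      # each item looks like {'https': 'https://1.2.3.4:8080'}; get() returns None on error
    try:
        resp = requests.get('http://www.httpbin.org/ip', proxies=proxies, timeout=4)
        print(resp.text)
        break                           # stop at the first proxy that works
    except requests.RequestException:
        continue                        # dead proxy, try the next one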
This is a class I threw together just to cut down on duplicated code: scraping scripts keep repeating the same snippets, so pulling them out into methods makes them much more convenient to reuse.
import MySQLdb
import requests
import re
class Tools:
def __init__(self):
pass
def string_to_dict(self, string):
dict = {}
for i in string.split('\n'):
i = i.replace(' ', '')
i = i.split(':', maxsplit=1)
dict[i[0]] = i[1].strip()
return dict
def imp(self):
return print('from lxml import etree\nimport requests\nfrom fake_useragent import UserAgent\nimport time')
def set_sqlconnect(self, db, charset='utf8'):
conn = MySQLdb.connect(
user = 'root',
password = '123456',
db = db,
host = 'localhost',
port = 3306,
charset = charset
)
cursor = conn.cursor()
return [conn, cursor]
def list_to_string(self, list):
'''
        Loop over a list and join its items into one long string
:return:
'''
string = ''
for i in list:
i = i.strip()
i = i.replace(' ', '')
string += i
return string
def xpath(self, str):
'''
        returns (unfinished stub)
:param str:
:return:
'''
def self_ip(self):
url = 'http://txt.go.sohu.com/ip/soip'
resp = requests.get(url).text
result = re.findall(r'user_ip="(.*?)";sohu', resp)
return result[0]
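As a quick example of the utility class: string_to_dict turns a block of request headers copied straight out of the browser's devtools into a dict that requests can use (my reading of the method; the header values below are made up):

tools = Tools()
raw = '''Host: www.example.com
User-Agent: Mozilla/5.0
Accept: text/html'''
headers = tools.string_to_dict(raw)
print(headers)  # {'Host': 'www.example.com', 'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html'}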
I've tested this and it does have some practical value. One improvement worth adding: in the pool's get method, test each IP again before returning it and only return the ones that are still alive, so every IP you take out is guaranteed to work.
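A rough sketch of that improvement, reusing test(), conn and cursor from the pool script above (get_alive is a name I made up; it is not in the original code):

def get_alive(n):
    '''
    Like get(), but re-tests every IP before returning it,
    so dead proxies are dropped instead of handed to the caller.
    '''
    result = []
    while len(result) < n:
        cursor.execute('select * from ip_pool')
        row = cursor.fetchone()
        if row is None:              # the pool is empty, return whatever we have
            break
        id, ip = row[0], row[1]
        cursor.execute('delete from ip_pool where id=%s', [id])  # remove it whether it is alive or not
        conn.commit()
        proxy = eval(ip)
        if test(proxy):              # only keep proxies that still pass the test
            result.append(proxy)
    return result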