Building a simple IP pool

I've been learning web scraping recently, and frequent requests often get my IP banned. To get around this, I built a small IP pool that supplies proxy IPs to my crawlers.

Implementation ideas

  1. Automatically scrape IPs and store them in a database
  2. Every five minutes, scrape a batch of new IPs to top up the pool
  3. Every ten minutes, check which IPs in the database are still alive and which have died, and remove the dead ones
  4. Use multiple threads, each one going to a different site to scrape and test IPs

Tools and environment

Windows 10, PyCharm, Requests (plus lxml, fake_useragent and MySQLdb, which the code imports)
The tools object in the code is a quick-and-dirty utility class I wrote myself; its code is attached at the end. The database table the pool relies on is sketched below.
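
The code also assumes a MySQL table named ip_pool to hold the proxies. The original post never shows its definition, but from the SQL used below it needs an auto-increment id column and an ip column with a unique constraint (save() relies on the insert failing for duplicates). A minimal one-off setup sketch, reusing the connection settings from the Tools class; the column size is my own guess:

import MySQLdb

conn = MySQLdb.connect(user='root', password='123456', db='spider',
                       host='localhost', port=3306, charset='utf8')
cursor = conn.cursor()
# ip stores the string form of the proxy dict, e.g. "{'http': 'http://1.2.3.4:80'}",
# and must be UNIQUE so that save() can detect duplicates via the failed insert
cursor.execute('''
    CREATE TABLE IF NOT EXISTS ip_pool (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip VARCHAR(100) NOT NULL UNIQUE
    )
''')
conn.commit()
conn.close()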

Complete IP pool code

import re
import requests
import threading
import time
from fake_useragent import UserAgent
from tools import mytools
from lxml import etree

# create the utility helper object
tools = mytools.Tools()
# connect to the database
connection = tools.set_sqlconnect('spider')
conn = connection[0]
cursor = connection[1]

# Scrape IPs from the Xici (xicidaili) free-proxy pages
def xici():
    '''
    1. Scrape high-anonymity IPs in bulk from the Xici listing pages
    2. Call test() to check whether each IP works
    3. Save the working IPs to the database
    :return:
    '''
    with open('xicilog.txt', 'r') as r:  # remembers the last page scraped -- resume from there
        log = r.readlines()
    page_index = int(log[0])
    stop_flag = 0
    with requests.Session() as s:
        while 1:
            try:
                if stop_flag == 10:  # scrape 10 pages per run
                    break
                headers = {'User-Agent': ua.random}
                s.headers.update(headers)  # rotate the User-Agent for every page
                url = 'https://www.xicidaili.com/nn/' + str(page_index)
                resp = s.get(url).text  # use the session so the rotated headers are actually sent
                html = etree.HTML(resp)
                trs = html.xpath('//table[@id="ip_list"]//tr')[1:]
                for i in trs:  # every tr tag (one proxy per tr)
                    # indexes 2 / 4 / 12 hold the IP, port and protocol
                    tr = i.xpath('.//text()')
                    ip = tr[2]
                    port = tr[4]
                    type = tr[12].lower()
                    realip = {type: type + '://' + ip + ':' + port}
                    print(realip)
                    # check whether the proxy actually works
                    result = test(realip)
                    if result:
                        print('Xici: proxy works')
                        if save(realip):
                            print('Xici: saved')
                        else:
                            print('Xici: save failed')
                    else:
                        print('Xici: proxy dead')
                page_index += 1
                stop_flag += 1
                with open('xicilog.txt', 'w') as w:
                    w.write(str(page_index))
            except Exception as e:
                print('xici loop error:', e)
                continue

# Helper -- look up our own public IP
def self_ip():
    '''
    The public IP is dynamic, so first find out what our current IP actually is
    '''
    url = 'http://txt.go.sohu.com/ip/soip'
    resp = requests.get(url).text
    result = re.findall(r'user_ip="(.*?)";sohu', resp)
    return result[0]

def test(ip):
    '''
    1. True means the IP works, False means it does not
    2. The result is returned to the caller (xici / kuai)
    :param ip: proxy dict scraped by xici()/kuai()
    :return:
    '''
    url = 'http://www.httpbin.org/ip'
    my_ip = self_ip()
    try:
        # a proxy that does not answer within the timeout is almost certainly dead, skip it
        # some proxies reportedly need a full 3 seconds, so use 4 to avoid flagging them by mistake
        resp = requests.get(url, proxies=ip, timeout=4).text
    except Exception:
        return False
    if my_ip in resp:  # httpbin still sees our own public IP, so the proxy did not take effect
        return False
    return True

# Scrape IPs from the Kuaidaili free-proxy pages
def kuai():
    with open('kuailog.txt', 'r') as r:
        log = r.readlines()
    page_index = int(log[0])
    stop_flag = 0
    while 1:
        try:
            if stop_flag == 20:
                break
            url = 'https://www.kuaidaili.com/free/inha/'+ str(page_index) +'/'
            resp = requests.get(url).text
            html = etree.HTML(resp)
            trs = html.xpath('//table[@class="table table-bordered table-striped"]/tbody//tr')
            for i in trs:  # tr tags of every proxy on the current page
                # indexes: ip at 1, port at 3, protocol at 7
                tr = i.xpath('.//text()')
                ip = tr[1]
                port = tr[3]
                type = tr[7].lower()
                realip = {type: type + '://' + ip + ':' + port}
                print(realip)
                if test(realip):
                    print('Kuaidaili: proxy works')
                    if save(realip):
                        print('Kuaidaili: saved')
                    else:
                        print('Kuaidaili: save failed')
                else:
                    print('Kuaidaili: proxy dead')
            stop_flag += 1
            page_index += 1
            with open('kuailog.txt', 'w') as w:
                w.write(str(page_index))
        except Exception as e:
            print('kuai loop error:', e)

def save(ip):
    '''
    Save a working IP into the database
    :param ip:
    :return:
    '''
    try:
        sql = 'insert into ip_pool(ip) values(%s)'
        rows = cursor.execute(sql, [str(ip)])
    except Exception:  # the unique constraint on ip rejects duplicates
        print('duplicate IP, not saved')
        return False
    if rows:
        conn.commit()
        return True
    return False

def get(n):
    '''
    :param n: how many IPs to take out of the database
    :return: a list of proxy dicts
    '''
    result = []
    try:
        for i in range(n):
            try:
                sql = 'select * from ip_pool'  # grab the first row still in the pool
                cursor.execute(sql)
                row = cursor.fetchone()  # one row: (id, ip)
                id = row[0]
                ip = row[1]
                print(ip)
                # delete the IP as soon as it is handed out: it will get banned before long anyway,
                # so every call hands back proxies that were fresh when stored
                sql = 'delete from ip_pool where id=%s'
                cursor.execute(sql, [id])
                conn.commit()
                result.append(eval(ip))
            except Exception as e:
                print('get inner loop error:', e)
                return result
        print('took', n, 'IPs from the pool')
        return result
    except Exception as e:
        print('get error:', e)
        return result

def rush_db():
    '''
    1. Periodically remove dead IPs from the database
    2. Runs every ten minutes
    :return:
    '''
    sql = 'select * from ip_pool'  # fetch every IP currently in the pool
    cursor.execute(sql)
    all = cursor.fetchall()  # a tuple of (id, ip) rows
    for one in all:  # re-test every stored IP
        try:
            id = one[0]
            ip = one[1]
            if test(eval(ip)):  # the IP still passes the test, keep it
                print('still alive, kept')
            else:  # the IP failed the test, delete it by id
                sql = 'delete from ip_pool where id=%s'
                rows = cursor.execute(sql, [id])
                if rows:
                    conn.commit()
                    print('removed one dead IP')
        except Exception as e:
            print('rush_db loop error:', e)
    print('cleanup finished')

if __name__ == '__main__':
    '''
    collect IPs every five minutes
    clean the pool every ten minutes
    '''
    ua = UserAgent()  # random User-Agent generator

    def run():
        while True:
            kuai()  # collect IPs from Kuaidaili
            time.sleep(5 * 60)

    def run1():
        while True:
            xici()  # collect IPs from Xici
            time.sleep(5 * 60)

    def run2():
        while True:
            time.sleep(10 * 60)
            rush_db()

    run = threading.Thread(target=run)  # Kuaidaili scraper thread
    run1 = threading.Thread(target=run1)  # Xici scraper thread
    run2 = threading.Thread(target=run2)  # cleanup thread for dead IPs

    run.start()
    run1.start()
    run2.start()

    run2.join()  # block the main thread so the program does not exit while the worker threads sleep

    print('finished')
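
For reference, this is roughly how a crawler would consume the pool. The module name ip_pool and the target URL are placeholders; the rest reuses get() exactly as defined above:

# hypothetical consumer -- assumes the pool code above was saved as ip_pool.py
import requests
from ip_pool import get

proxies = get(1)  # e.g. [{'http': 'http://1.2.3.4:8080'}]
if proxies:
    resp = requests.get('http://www.httpbin.org/ip', proxies=proxies[0], timeout=5)
    print(resp.text)
else:
    print('pool is empty, falling back to a direct request')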

Utility class code

This is just a class I threw together to cut down on duplicated code: the same snippets keep showing up in crawler scripts, so pulling them out into methods makes them much more convenient to call when needed.

import MySQLdb
import requests
import re

class Tools:
    def __init__(self):
        pass

    def string_to_dict(self, string):
        '''
        Turn a "Key: value" block (one pair per line) into a dict;
        all spaces are stripped first, so values containing spaces get squashed
        '''
        dict = {}
        for i in string.split('\n'):
            i = i.replace(' ', '')
            i = i.split(':', maxsplit=1)
            dict[i[0]] = i[1].strip()
        return dict


    def imp(self):
        return print('from lxml import etree\nimport requests\nfrom fake_useragent import UserAgent\nimport time')

    def set_sqlconnect(self, db, charset='utf8'):
        conn = MySQLdb.connect(
            user = 'root',
            password = '123456',
            db = db,
            host = 'localhost',
            port = 3306,
            charset = charset
        )
        cursor = conn.cursor()
        return [conn, cursor]

    def list_to_string(self, list):
        '''
        Join the elements of a list into one long string, stripping whitespace
        :return:
        '''
        string = ''
        for i in list:
            i = i.strip()
            i = i.replace(' ', '')
            string += i
        return string

    def xpath(self, str):
        '''
        Placeholder, not implemented yet
        :param str:
        :return:
        '''

    def self_ip(self):
        url = 'http://txt.go.sohu.com/ip/soip'
        resp = requests.get(url).text
        result = re.findall(r'user_ip="(.*?)";sohu', resp)
        return result[0]
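
As a quick illustration, string_to_dict is presumably meant for turning a header block copied straight out of the browser's dev tools into a dict that requests accepts. A usage sketch with made-up values (note how the output matches what the space-stripping code actually produces):

tools = Tools()
raw = '''User-Agent: Mozilla/5.0
Referer: https://example.com'''
headers = tools.string_to_dict(raw)
print(headers)  # {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://example.com'}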

I've tested this myself and it is genuinely somewhat useful. One thing worth adding to make it more practical: in the pool's get() method, test each retrieved IP again and only return it if it is still alive, so that every IP you take from the pool is guaranteed to work; a sketch of that idea follows.
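
A minimal sketch of that improvement, written as a hypothetical get_alive() helper that reuses test(), conn and cursor from the pool code above; dead proxies are dropped on the spot instead of being handed back:

def get_alive(n):
    '''
    Like get(), but re-test each IP before handing it out,
    so only proxies that are still alive are returned.
    '''
    result = []
    while len(result) < n:
        cursor.execute('select * from ip_pool')
        row = cursor.fetchone()
        if row is None:  # the pool is empty, return whatever we have collected
            break
        id, ip = row[0], row[1]
        cursor.execute('delete from ip_pool where id=%s', [id])  # remove it either way
        conn.commit()
        proxy = eval(ip)
        if test(proxy):  # keep only proxies that still pass the liveness check
            result.append(proxy)
    return result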
