Python 多线程检测代理IP可用性

测试样本

# ip.txt
110.52.235.87:9999@HTTP#[高匿]湖南省岳阳市 联通  
111.43.70.58:51547@HTTP#[未知]黑龙江省 移动(全省通用) 
183.196.97.125:41397@HTTP#[未知]河北省廊坊市 移动  
110.52.235.210:9999@HTTP#[高匿]湖南省岳阳市 联通  
183.166.167.163:8080@HTTP#[未知]安徽省黄山市 电信  
111.177.171.242:9999@HTTP#[未知]湖北省随州市 电信  
123.127.93.188:44399@HTTP#
....
....

Python 脚本

运行前需要先安装第三方库 requests:
pip install requests

# coding=utf-8
#

import functools
import os
import re
import threading
from datetime import datetime
from time import sleep

import requests
from requests import RequestException


# Directory containing this script; the input and output files live next to it.
BASE_DIR = os.path.dirname(__file__)

# Input file: raw text that is scanned for ``ip:port`` entries.
SCR_IP_TXT = os.path.join(BASE_DIR, 'ip.txt')

# Output file: one usable proxy per line, prefixed with today's date.
_TODAY = datetime.now().strftime('%Y-%m-%d')
IP_TXT = os.path.join(BASE_DIR, _TODAY + '-ip.txt')

# Thread count used when splitting the IP list across checker threads.
MAX_TEST_THREADS = 100


class TestThread(threading.Thread):
    """Worker thread that filters a list of proxies down to the usable ones.

    Once the thread has finished, ``li`` contains only those entries of the
    list given to the constructor for which ``test_ip_available`` returned
    True.
    """

    def __init__(self, _ip_li):
        """Remember the proxies this thread is responsible for.

        Args:
            _ip_li: list of ``host:port`` strings to check.
        """
        self.li = _ip_li
        super(TestThread, self).__init__()

    def run(self):
        """Check every proxy in ``self.li`` and keep only the working ones."""
        available = []
        try:
            for ip in self.li:
                try:
                    usable = self.test_ip_available(ip)
                except Exception as exc:
                    # Thread boundary: an unexpected error for one proxy must
                    # not kill the thread and skip the rest of its share.
                    print(u'检测IP: %s 时出现异常: %r\n' % (ip, exc))
                    usable = False
                if usable:
                    available.append(ip)
        finally:
            # Always publish only verified proxies. If the thread dies early,
            # ``li`` must not be left holding the untested input list, because
            # the caller treats everything in ``li`` as usable.
            self.li = available

    def test_ip_available(self, ip):
        """Return True if a test page can be fetched through the proxy ``ip``.

        The proxy is used for HTTPS traffic to request a so.com search page
        with a 5 second timeout; the proxy counts as usable only when the
        response body contains the expected marker text.

        Args:
            ip: proxy address as a ``host:port`` string.

        Returns:
            True when the request succeeds and the marker is present,
            False when requests raises or the marker is missing.
        """
        _proxy = {'https': ip}
        print(u'正在检测IP: %s 有效性\n' % ip)
        try:
            r = requests.get('https://www.so.com/s?ie=utf-8&fr=none&src=360sou_newhome&q=123',
                             proxies=_proxy,
                             timeout=5)
        except RequestException:
            return False
        # Explicit check instead of ``assert``: assertions are removed under
        # ``python -O``, which would make every responding proxy look valid.
        if u'_360搜索' not in r.text:
            return False
        print(u'找到可用代理IP: %s\n' % ip)
        return True


def time_wrapper(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function is called with whatever arguments the caller
    passes, and its return value is handed back unchanged. The elapsed
    wall-clock time is printed as whole minutes and seconds once the call
    returns; nothing is printed if ``func`` raises.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same name and docstring as ``func``.
    """
    @functools.wraps(func)
    def _wrapper(*args, **kwargs):
        start_time = datetime.now()
        result = func(*args, **kwargs)
        elapsed = (datetime.now() - start_time).total_seconds()
        # Truncate to whole seconds before splitting into minutes/seconds.
        minutes, seconds = divmod(int(elapsed), 60)
        print(u'本次执行共消耗: %d分%d秒\n' % (minutes, seconds))
        return result
    return _wrapper


@time_wrapper
def parse_ip():
    """Check every proxy found in ``SCR_IP_TXT`` and save the usable ones.

    Steps:
      1. Extract all ``a.b.c.d:port`` strings from the input file.
      2. Spread them over at most ``MAX_TEST_THREADS`` ``TestThread``
         workers and wait for all of them to finish.
      3. Write the de-duplicated usable proxies to ``IP_TXT``, one per
         line, in sorted order.
    """
    # Read the local IP file and pull out every ip:port pair.
    with open(SCR_IP_TXT) as f:
        _ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', f.read())
    print(u'检索到: %d 个IP地址\n' % len(_ip_list))

    # Never start more threads than there are addresses; with an empty
    # input no thread is started at all.
    thread_count = min(MAX_TEST_THREADS, len(_ip_list))

    # Strided slices hand every thread a share that differs by at most one
    # entry and avoid any chunk-size arithmetic.
    threads = []
    for i in range(thread_count):
        _thread = TestThread(_ip_list[i::thread_count])
        threads.append(_thread)
        _thread.start()

    # Wait for exactly the threads started here. join() returns as soon as
    # each worker is done and is unaffected by unrelated threads.
    for th in threads:
        th.join()

    # Collect the usable proxies from all workers, dropping duplicates.
    valid_ips = set()
    for th in threads:
        valid_ips.update(th.li)

    print(u'\n总共找到 %d 个可用IP\n' % len(valid_ips))
    with open(IP_TXT, 'w') as f:
        # Sorted so the output file is stable between runs.
        f.write('\n'.join(sorted(valid_ips)))


# Run the proxy check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    parse_ip()

结果

检索到: 1899 个IP地址

......
......
......

正在检测IP: 124.81.245.148:8080 有效性

正在检测IP: 111.177.160.17:9999 有效性


总共找到 60 个可用IP

本次执行共消耗: 2分13秒

你可能感兴趣的:(脚本)