Scraping Free Proxy IPs with a Python Crawler

I often need to rotate IPs when writing crawlers, so a while ago I built a small toy that scrapes proxy IPs. It uses Redis to store and fetch the addresses, applies a scoring mechanism to track their quality, and uses asynchronous I/O (asyncio + aiohttp) to send batched requests through the proxies to a test URL, checking whether a request through each IP can complete successfully.

The project consists of four .py files (directory screenshot omitted).

Result demo (screenshot omitted): the stored proxies are listed with each IP followed by its score.

The IPs are scraped from http://www.xicidaili.com/wt, a free (and not very stable) proxy-list site in China.

main.py offers three commands: 'a' crawls the IP data, runs it through the availability check in check.py (with a configurable number of check passes), scores the results and stores them in Redis; 'd' clears all IP data from the database; 'c' shows the current IP scores.
crawl.py does the data processing on the crawled pages to pull out the IPs we want.
pool.py holds the Redis connection settings, some custom database helpers built on the Python Redis driver, and the scoring operations.
check.py tests whether an IP currently works by requesting the public echo service http://httpbin.org/get through it; it is also the asyncio usage example.
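
Under the hood the "pool" is simply a Redis sorted set under the key 'proxies', so the stored data can be inspected directly. A minimal sketch with redis-py (assuming a local Redis on the default port; the proxy address is made up, and the zadd argument order follows the same redis-py 2.x style as the code below):

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# add a proxy with the initial score of 10 (redis-py 2.x argument order)
r.zadd('proxies', 10, '1.2.3.4:8080')
# read one score back
print(r.zscore('proxies', '1.2.3.4:8080'))              # 10.0
# list every proxy with its score, highest first
print(r.zrevrange('proxies', 0, -1, withscores=True))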

The scoring convention, as implemented in pool.py: a newly crawled IP enters the pool with a score of 10; an IP that completes the test request within the timeout is set straight to 100; every failed check subtracts 1 point, and once the score has dropped to 0 the IP is removed from the pool.
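
As a concrete example of those rules, here is a minimal sketch of one proxy's life in the pool, using the Poll wrapper from pool.py further down (the address is made up, and a local Redis with the same redis-py version as pool.py is assumed):

from redissss import pool

p = pool.Poll()
p.add_ip('1.2.3.4:8080')           # crawled -> enters the pool at score 10
p.max('1.2.3.4:8080')              # passed a check -> score set to 100
p.reduce_score('1.2.3.4:8080')     # failed a check -> 100 - 1 = 99
print(p.r.zscore('proxies', '1.2.3.4:8080'))   # 99.0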

main.py

# -*- coding=utf-8 -*-
from redissss import check, crawl, pool
# For now only a single proxy pool is maintained. Crawling http or https pages
# can already be selected, but the scraped IPs are not yet stored in Redis
# separately by type, nor checked separately.
if __name__ == '__main__':
    print('begin')
    b = pool.Poll()
    print('IPs in the pool before start: %s' % b.get_ip_len())
    print('they are: %s' % b.get_ip_all_list())

    while True:
        comd = input('a: start crawling, d: clear everything, c: show scores: ')
        if comd == 'a':
            a = crawl.Getter()
            a.run(type='http',page=1)
            c = check.Check()
            c.check(count=5)
            print('usable IPs after the check: %s' % b.get_ip_len())
            print('list: %s' % b.get_ip_all_list())

        elif comd == 'd':
            bb = b.r.zrangebyscore('proxies',0,10,withscores=True)
            print(bb)
            b.r.flushall()
            print('IPs left in the pool after clearing: %s' % b.get_ip_len())
            print('they are: %s' % b.get_ip_all_list())

        elif comd == 'c':
            print('IP scores:')
            c = b.view_socre()
            print(c)

check.py

# -*- coding=utf-8 -*-
import aiohttp
import asyncio

from redissss import pool

url_text = 'http://httpbin.org/get'

class Check(object):

    def __init__(self):
        self.redis = pool.Poll()

    async def text(self, proxy):
        # turn SSL verification off
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # values come back from Redis as bytes, so decode to str first
        proxy = proxy.decode('utf-8')
        proxy_a = 'http://' + proxy
        # open a client session
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(url_text, proxy=proxy_a, timeout=5) as resp:
                    if resp.status in [200]:
                        print('usable IP --->> %s' % proxy_a)
                        self.redis.max(proxy)
                    else:
                        self.redis.reduce_score(proxy)
                        print('request failed %s, status %s' % (proxy, resp.status))

            except Exception as e:
                self.redis.reduce_score(proxy)
                print('request failed %s: %s' % (proxy, e))


    def check(self, count=None):
        # default to 60 check passes; each pass re-tests every proxy in the pool
        if count is None:
            count = 60
        for _ in range(count):
            try:
                proxies = self.redis.get_ip_all_list()
                # one event loop drives the batched coroutine tasks
                loop = asyncio.get_event_loop()
                # test in batches of 200 to cap the number of concurrent requests
                for i in range(0, len(proxies), 200):
                    text_proxies = proxies[i:i + 200]
                    task = [self.text(proxy) for proxy in text_proxies]
                    # run until every task in this batch has finished
                    loop.run_until_complete(asyncio.wait(task))
            except Exception as e:
                print('check pass failed: %s' % e)
            count = count - 1
            print('check count --->%s' % count)
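
A single proxy can also be tested on its own, outside the batched loop; a small usage sketch (the address is made up, and text() takes bytes because that is what Redis hands back). The proxy's score in Redis is updated as a side effect:

import asyncio
from redissss import check

c = check.Check()
loop = asyncio.get_event_loop()
# run one test coroutine to completion
loop.run_until_complete(c.text(b'1.2.3.4:8080'))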


crawl.py

import requests
from bs4 import BeautifulSoup
from redissss import pool

class Crawl(object):

    def geturl(self, type=None, page=11):
        # By default build the http and https page URL lists and return both;
        # pass type to get only the pages of the requested kind.
        list1 = []
        list2 = []
        http_url = 'http://www.xicidaili.com/wt/'
        https_url = 'http://www.xicidaili.com/wn/'
        if type is None:
            for i in range(1, page+1):
                http_url_s = http_url + str(i)
                http_urls_ss = https_url + str(i)
                list1.append(http_url_s)
                list2.append(http_urls_ss)
            return list1, list2
        elif type == 'http':
            for i in range(1, page+1):
                http_url_s = http_url + str(i)
                list1.append(http_url_s)
            return list1
        elif type == 'https':
            for i in range(1, page+1):
                http_urls_ss = https_url + str(i)
                list2.append(http_urls_ss)
            return list2


    def getinfo(self, url):
        # request one listing page and yield the proxies on it
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        rsp = requests.get(url=url, headers=head)
        text = rsp.content.decode('utf-8')
        soup = BeautifulSoup(text, 'lxml')
        tags = soup.find_all(class_="odd")
        for i in tags:
            # the row text is full of extra spaces and blank lines;
            # .stripped_strings strips that surplus whitespace away
            cells = list(i.stripped_strings)
            ip = cells[0]
            port = cells[1]
            yield ':'.join([ip, port])

    def get_proxies(self, type=None, page=11):
        # combined crawl entry point
        proxies = []
        print('current crawl type: %s' % type)
        n = 0

        if type is None:
            aaa = self.geturl(page=page)
            for i in aaa[0]:
                http = self.getinfo(url=i)
                for proxy in http:
                    print('got http IP:', proxy)
                    proxies.append(proxy)
            for ii in aaa[1]:
                https = self.getinfo(url=ii)
                for proxy in https:
                    print('got https IP:', proxy)
                    proxies.append(proxy)
            return proxies

        elif type == 'http':
            aaa = self.geturl(type=type, page=page)
            for i in aaa:
                n = n + 1
                print('crawling http page -->%s' % n)
                http = self.getinfo(url=i)
                for proxy in http:
                    print('got http IP:', proxy)
                    proxies.append(proxy)
            return proxies
        elif type == 'https':
            aaa = self.geturl(type=type, page=page)
            for ii in aaa:
                n = n + 1
                print('crawling https page -->%s' % n)
                https = self.getinfo(url=ii)
                for proxy in https:
                    print('got https IP:', proxy)
                    proxies.append(proxy)
            return proxies


class Getter():
    def __init__(self):
        self.redis = pool.Poll()
        self.crawl = Crawl()

    def run(self, type=None, page=11):
        print('start fetching')
        proxies = self.crawl.get_proxies(type=type, page=page)
        for proxy in proxies:
            self.redis.add_ip(proxy)
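
A side note on the parsing step in getinfo above: stripped_strings is what turns a table row into the bare ip and port. A standalone sketch with a made-up row in the same shape as the listing page:

from bs4 import BeautifulSoup

html = '<table><tr class="odd"><td> 1.2.3.4 </td><td> 8080 </td><td>HTTP</td></tr></table>'
row = BeautifulSoup(html, 'lxml').find(class_='odd')
cells = list(row.stripped_strings)     # whitespace-only strings are dropped
print(cells)                           # ['1.2.3.4', '8080', 'HTTP']
print(':'.join(cells[:2]))             # '1.2.3.4:8080'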



pool.py

# -*- coding=utf-8 -*-
import redis
import random

init_score = 10
max_score = 100
min_score = 0
my_host = '127.0.0.1'
my_port = '6379'
sequence = 'proxies'

class PoolNoError(ValueError):
    pass

class Poll(object):
    def __init__(self,host = my_host,port = my_port):
        self.r = redis.StrictRedis(host=host, port=port)

    def add_ip(self, proxy, score=init_score):
        # only add proxies that are not already in the sorted set
        if not self.r.zscore(sequence, proxy):
            # redis-py 2.x argument order: zadd(name, score, value);
            # redis-py 3.x expects zadd(name, {value: score})
            return self.r.zadd(sequence, score, proxy)

    def reduce_score(self, proxy):
        score = self.r.zscore(sequence, proxy)
        if score is not None and score > min_score:
            # redis-py 2.x signature: zincrby(name, value, amount);
            # redis-py 3.x swaps this to zincrby(name, amount, value)
            self.r.zincrby(sequence, proxy, amount=-1)
        else:
            self.r.zrem(sequence, proxy)

    def max(self, proxy):
        # a proxy that passed the check jumps straight to the top score
        self.r.zadd(sequence, max_score, proxy)

    def get_ip_len(self):
        return self.r.zcard(sequence)

    def get_ip_all_list(self):
        return self.r.zrangebyscore(sequence,min_score,max_score)


    def choice(self):
        # prefer a random proxy that currently holds the top score
        result = self.r.zrangebyscore(sequence, max_score, max_score)
        if len(result):
            return random.choice(result)
        else:
            # otherwise fall back to one of the 100 highest-ranked proxies
            result = self.r.zrevrange(sequence, 0, 100)
            if len(result):
                return random.choice(result)
            else:
                raise PoolNoError

    def exists(self, proxy):
        return self.r.zscore(sequence, proxy) is not None

    def view_socre(self):
        result = self.r.zrevrange(sequence,start=0,end=10000,withscores=True)
        return result
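
To actually consume the pool from a crawler, choice() hands back one proxy, preferring those at the top score. A minimal usage sketch with requests (local Redis assumed; the test URL is the same httpbin endpoint used in check.py):

import requests
from redissss import pool

p = pool.Poll()
ip = p.choice().decode('utf-8')        # the sorted-set queries return bytes
proxies = {'http': 'http://' + ip}
rsp = requests.get('http://httpbin.org/get', proxies=proxies, timeout=5)
print(rsp.status_code, ip)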

