How to Build Your Own IP Proxy Pool

While learning web scraping, you have almost certainly had your IP banned. When that happens we turn to a proxy to finish the crawl, only to have that proxy banned as well, switch to another one, and get banned again a little later. There are two common ways to deal with this:
Method one: throttle the crawler so its request rate is closer to that of a human visitor (a minimal sketch follows below)
Method two: use a proxy pool
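For method one, here is a minimal throttling sketch; the delay value and the way the URL list is supplied are my own placeholders, not something from the original post:

import time
import requests


def crawl_slowly(urls, delay=3):
    """Fetch each URL with a fixed pause in between, keeping the rate closer to a human visitor's."""
    for url in urls:
        resp = requests.get(url, timeout=10)
        print(url, resp.status_code)
        time.sleep(delay)  # pause a few seconds before the next request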
This post only covers the proxy pool. We can scrape the free proxies from Xici (西刺), store them in a database, and then simply pull an IP from the database each time we crawl.
Below is the code that scrapes Xici and stores the results.

import requests
import pymongo
import threading
from requests.exceptions import HTTPError
from datetime import datetime
from lxml.html import fromstring


class DownLoad(object):
    def __init__(self, proxy=None, headers=None):
        self.proxy = proxy
        self.headers = headers
        self.client = pymongo.MongoClient(
            'mongodb_url'  # replace with your MongoDB connection string
        )
        self.db = self.client['scrapy_items']

    def __call__(self, url):
        tree = self.downloader(url)
        if tree is None:
            print('HTTP ERROR!')
        else:
            ip_info = self.get_ips(tree)
            for ip in ip_info:
                if ip is None:
                    print('invalid ip and port')
                else:
                    try:
                        self.db['IP'].insert_one(ip)
                    except Exception as e:
                        print(e)

    def close(self):
        self.client.close()

    def downloader(self, url):
        try:
            html = requests.get(url, headers=self.headers)
        except HTTPError as err:
            print(err)
        except Exception as e:
            print(e)
        else:
            try:
                tree = fromstring(html.text)
                return tree
            except Exception as e:
                print(e)
        return None

    def get_ips(self, tree):
        table = tree.xpath('//table[@id="ip_list"]//tr[@class]')
        for tr in table:
            ip_info = {}
            try:
                ip_info['ip'] = tr.xpath('.//td[2]/text()')[0]
                ip_info['port'] = tr.xpath('.//td[3]/text()')[0]
                ip_info['status'] = tr.xpath('.//td[5]/text()')[0]
                ip_info['type'] = tr.xpath('.//td[6]/text()')[0]
                ip_info['speed'] = float(tr.xpath('.//td[7]/div/@title')[0].split('秒')[0])
                ip_info['connect_time'] = float(tr.xpath('.//td[8]/div/@title')[0].split('秒')[0])
            except Exception as e:
                print(e)
                yield None
                continue  # skip rows that could not be parsed completely
            if self.verification_ip(ip_info['ip'], ip_info['port'], ip_info['type']):
                ip_info['verification_time'] = datetime.now()
                yield ip_info
            else:
                print(ip_info['ip'], end='')
                yield None

    def verification_ip(self, ip, port, type):
        if type == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip, port),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip, port),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers, proxies=proxy_dict,
                                timeout=5)
        except HTTPError as err:
            print(err)
            return False
        except Exception as e:
            print(e)
            return False
        else:
            if 200 <= html.status_code < 300:
                return True
            else:
                return False


def runspider(downloader, base_url, start_url, end_url):
    """Crawl list pages numbered start_url to end_url - 1 using the given downloader."""
    for i in range(start_url, end_url):
        url = base_url + str(i)
        downloader(url)

The code above scrapes the IPs, checks whether each one actually works, and stores the good ones in the database (please forgive any rough edges in the structure, I am still a beginner). The check is there to keep dead IPs out of the database; these are free proxies, after all, and their reliability is poor. If this still isn't enough IPs for you, you can scrape other proxy sites as well.
How you drive it is up to you, so I won't paste the rest of the code here (just don't crawl too fast or you will get banned; I crawled too fast and got banned before I even finished).
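The original also imports threading for a multi-threaded driver that is not shown; for reference, here is one minimal single-threaded way the pieces above might be wired together. The Xici list-page URL pattern and the headers are my own assumptions, not something given in the post:

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    downloader = DownLoad(headers=headers)
    # Assumed list-page pattern: base URL plus the page number appended by runspider
    runspider(downloader, 'https://www.xicidaili.com/nn/', 1, 4)  # pages 1 to 3
    downloader.close()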
Next, we pull a random IP from the database.

class GetIP(object):

    def __init__(self):
        self.client = pymongo.MongoClient(
            'mongodb_url'  # replace with your MongoDB connection string
        )
        self.db = self.client['scrapy_items']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }

    def judge_ip(self, ip):
        if ip['type'] == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip['ip'], ip['port']),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip['ip'], ip['port']),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers, proxies=proxy_dict,
                                timeout=5)
        except HTTPError as err:
            print(ip['ip'], err)
            return False
        except Exception as e:
            print(ip['ip'], e)
            return False
        else:
            if 200 <= html.status_code < 300:
                return True
            else:
                return False

    def get_random_ip(self):
        ip_info = self.db['IP'].aggregate([
            {'$sample': {'size': 1}},
        ])
        for ip in ip_info:
            if self.judge_ip(ip):
                return '%s://%s:%s' % (ip['type'].lower(), ip['ip'], ip['port'])  # lowercase scheme for use in a proxies dict
            else:
                self.delete_ip(ip)
                return self.get_random_ip()

    def delete_ip(self, ip):
        self.db['IP'].delete_many({'ip': ip['ip']})  # remove() is deprecated in recent pymongo

    def close(self):
        self.client.close()

With that, we have built our own IP proxy pool. We can test it with the snippet below.

    get_ip = GetIP()
    for i in range(5):
        ip_port = get_ip.get_random_ip()
        print(ip_port)
    get_ip.close()

That's it, the IP proxy pool is done. When crawling you may also want a User-Agent pool; there is a very convenient third-party library for that, fake_useragent, which you can find with a quick search.
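A quick sketch of how fake_useragent is typically used to build request headers:

from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}  # a different, randomly chosen User-Agent on each access
print(headers)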
When crawling, it is best to combine proxies with throttling; after all, nobody wants their server to be hammered at will.
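Putting the two ideas together, here is a hedged sketch that fetches a page through a proxy from the pool and pauses between requests; https://httpbin.org/ip is used only as a harmless test target and is not part of the original post:

import time
import requests

get_ip = GetIP()
proxy_url = get_ip.get_random_ip()          # e.g. 'http://1.2.3.4:8080'
if proxy_url:
    scheme = proxy_url.split('://')[0]      # 'http' or 'https'
    resp = requests.get('https://httpbin.org/ip',
                        headers=get_ip.headers,
                        proxies={scheme: proxy_url},
                        timeout=10)
    print(resp.status_code, resp.text)
    time.sleep(2)                           # throttle before firing the next request
get_ip.close()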
