《Project share: crawling usable proxy IPs with Python》

The script below already includes a way to work around the source site's anti-crawling measure of rate-limiting by IP. See if you can spot it yourself; the next article will cover it in detail.
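As a hint, the relevant piece is the RandomUserAgent method in the script: rotating the User-Agent header so consecutive requests don't present an identical browser fingerprint. A minimal standalone sketch of that idea (the strings and URL here are placeholders, not part of the project):

from random import choice
from urllib import request

# Placeholder pool; the real script below ships a much larger one.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
]

# Every request picks a fresh User-Agent at random.
req = request.Request('http://example.com', headers={'User-Agent': choice(USER_AGENTS)})
html = request.urlopen(req).read()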

# coding=utf-8
# Author = ChristopherLam
# Deadline = 2017-04-19
# qq = 770304694
# csdn = http://blog.csdn.net/christopher_l1n

from urllib import request, error
from bs4 import BeautifulSoup
from random import choice
import time
import threading
import os
import re


class FreeProxyIPSpider:
    def __init__(self):
        self.base_url = 'http://www.kuaidaili.com/free/inha/'
        self.file_path = os.getcwd() + '/FreeProxyIP'        # raw scraped proxies
        self.useable_file = os.getcwd() + '/UseableProxyIP'  # verified proxies
        # Pool of User-Agent strings; one is chosen at random for every request.
        self.headers = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36']
        # Used to check that a response fetched through a proxy really came from Baidu.
        self.regex = re.compile(r'baidu.com')

    def struct_url(self, page):
        """构造代理ip集成网站的url"""
        baseurl = self.base_url
        url = baseurl + str(page) + '/'
        return url

    def global_spider(self, url=None, proxy_ip=None):
        """通用爬虫,若有url,即爬取代理集成网站的url
        若有代理ip,即访问代理ip"""
        if proxy_ip is None:
            try:
                req = request.Request(url, headers=self.RandomUserAgent())
                res = request.urlopen(req).read()
                soup = BeautifulSoup(res, 'html.parser')
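                # kuaidaili marks each table cell with a data-title attribute.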
                IPs = soup.find_all(name='td', attrs={'data-title': 'IP'})
                Ports = soup.find_all(name='td', attrs={'data-title': 'PORT'})

            except error.URLError as e:
                print(e)
                IPs = None
                Ports = None
            return IPs, Ports
        else:
            # Route both http and https traffic through the candidate proxy;
            # without the https entry, the test request below would bypass it.
            opener_support = request.ProxyHandler({'http': proxy_ip, 'https': proxy_ip})
            opener = request.build_opener(opener_support)
            request.install_opener(opener)
            try:
                rsp = request.urlopen('https://www.baidu.com', timeout=3)
            except Exception:
                print('%s connect failed' % proxy_ip)
                return
            try:
                string = rsp.read()
            except Exception:
                print('%s connect failed' % proxy_ip)
                return
            if self.regex.search(str(string)):
                self.WriteUsableProxyIP(proxy_ip)

    def WriteUsableProxyIP(self, proxy_ip):
        """可用代理ip写入文件"""
        with open(self.useable_file, 'a', encoding='utf-8') as file:
            file.write(proxy_ip)
        file.close()

    def RandomUserAgent(self):
        """随机header"""
        header = {'User-Agent': choice(self.headers)}
        return header

    def PackagedSpider(self, page_num):
        """封装获取代理ip爬虫"""
        url = self.struct_url(page_num)
        IPs, Ports = self.global_spider(url)
        if IPs is not None:  # IPs和Ports要么同时有值,要么同时为None
            with open(self.file_path, 'a', encoding='utf-8') as file:
                for x in range(0, len(IPs)):
                    ip_td = IPs[x]
                    port_td = Ports[x]
                    for y in range(0, len(ip_td)):
                        file.write(ip_td.contents[y] + ':' + port_td.contents[y] + '\n')
                        print(ip_td.contents[y])
            file.close()

    def __readFreeProxyIPfile(self):
        pass  # placeholder, not implemented yet

    def VerifyProxyIP(self):
        """
        验证代理ip是否可用,若可用写入UseableProxyIP文件"""
        with open(self.file_path, 'r', encoding='utf-8') as file:
            while 1:
                lines = file.readlines(10000)
                if not lines:
                    break
                else:
                    for i in lines:
                        self.global_spider(proxy_ip=i)


def main():
    print('Author = ChristopherLam\nDeadline = 2017-04-19\nqq = 770304694\ncsdn = http://blog.csdn.net/christopher_l1n')
    spi = FreeProxyIPSpider()

    # The block below was meant to crawl the listing pages with multiple threads,
    # but the source site's anti-crawl strategy rate-limits by IP, so it cannot
    # work as-is; stick with a single thread, or, once you have usable proxies,
    # crawl through them with multiple threads (implement that yourself).
    # Custom multithreaded crawling through proxies is available for a fee,
    # contact qq: 770304694. Though I may be too lazy to do it...
    # Note: Thread must receive the callable and its args separately, not the
    # result of calling it, as the original version did.
    # for i in range(1, 3):
    #     spi_threading = threading.Thread(target=spi.PackagedSpider, args=(i,))
    #     spi_threading.setDaemon(True)
    #     spi_threading.start()

    # Single-threaded crawl of the first two listing pages, then verification.
    for i in range(1, 3):
        spi.PackagedSpider(i)

    spi.VerifyProxyIP()

if __name__ == '__main__':
    main()
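
After a run, UseableProxyIP should contain one verified ip:port per line. A minimal sketch of how you might consume that file (the file name matches the script above; the rest is an assumption, not part of the project):

from urllib import request

# Load the verified proxies written by FreeProxyIPSpider (one ip:port per line).
with open('UseableProxyIP', 'r', encoding='utf-8') as f:
    proxies = [line.strip() for line in f if line.strip()]

# Route a request through the first verified proxy; swap in a random choice
# to rotate proxies between requests.
handler = request.ProxyHandler({'http': proxies[0]})
opener = request.build_opener(handler)
print(opener.open('http://www.baidu.com', timeout=3).getcode())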