写个爬虫——爬取代理IP

写爬虫用上代理IP是很常见的事情,那么我们就写一个包来方便自己以后写爬虫。

还是用的老IP网站——http://www.xicidaili.com/

获取的是高匿的代理页面,可以上西刺代理网站上看源代码,发现写的不是很规范。我用的是 xpath 的方法抓取网页内容,也可以用 re 正则的方法来实现;个人认为 xpath 比较简单,速度也不慢。

下面是核心实现部分的代码:

def GetIP(nub, HTTPUrl=0, HTTPSUrl=0):
    '''
    Collect working proxy IPs by scraping xicidaili.com page by page.

    nub      -- stop once more than this many verified proxies are collected
    HTTPUrl  -- URL used to validate HTTP proxies (0 disables HTTP checks)
    HTTPSUrl -- URL used to validate HTTPS proxies (0 disables HTTPS checks)

    Returns the module-level usableIP list of "ip:port" strings.
    '''
    print("正在获取代理IP...")
    page = 1
    while len(usableIP) <= nub:
        url = "http://www.xicidaili.com/nt/" + str(page)
        # Plain GET with a timeout; the old per-page Session was never reused
        # or closed, so it only leaked connections.
        html = requests.get(url, headers=header(), timeout=10).text
        selector = etree.HTML(html)
        # Column layout of the listing table: td[2]=IP, td[3]=port, td[6]=type.
        ipList = selector.xpath('//td[2]/text()')
        portList = selector.xpath('//td[3]/text()')
        typeList = selector.xpath('//td[6]/text()')

        # Checking proxies serially is slow, so spawn one thread per candidate
        # and join the whole batch before deciding whether more pages are needed.
        threads = []
        # zip() stops at the shortest list, so a short page no longer raises
        # IndexError the way the old hard-coded range(0, 99) did.
        for ip, port, ipType in zip(ipList, portList, typeList):
            t = threading.Thread(target=detectionIP,
                                 args=(ip, port, ipType, HTTPSUrl, HTTPUrl))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        page += 1
    print("获取代理IP成功")

    return usableIP

在上面的代码中用到了 detectionIP,它是用来验证代理 IP 有效性的方法。这是我自己写的验证方法,不是很通用,当时是为了下载听书小说而写的。

验证的方法也是很简单的,就是用代理IP访问一下目标网站(最好不要用百度,因为基本上测不出来),推荐用自己要爬取的网站,这样比较有效。访问目标网站返回的code=200就算是访问成功了,但是我发现有的IP不是很健康,代理IP后不管访问什么网站都是跳转到了有道,所以在下面又做了一个小验证。

下面是验证IP的代码:

def detectionIP(ip, port, type, httpsurl, httpurl):
    '''
    Check whether a single proxy works and, if so, record it in usableIP.

    ip, port -- proxy address, both strings
    type     -- "HTTP" or "HTTPS" (anything else is ignored)
    httpsurl / httpurl -- validation URLs; 0 disables that protocol's check

    Best-effort: any request or parse failure simply skips the proxy.
    '''
    try:
        if type == 'HTTP' and httpurl != 0:
            IPAgency = {
                "http": "http://" + ip + ":" + port
            }
            r = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                # Some "working" proxies hijack every request and redirect it
                # elsewhere, so also verify the page content is the real site.
                # Reuse the response already fetched instead of issuing a
                # second, identical request as the old code did.
                try:
                    r.encoding = "GBK"
                    s = etree.HTML(r.text)
                    # xpath() returns a list; [0] raises IndexError on a miss.
                    text = s.xpath('//a[@href="//www.wotingpingshu.com"]/text()')[0]
                    if text == "我听评书网":
                        usableIP.append(ip + ":" + port)
                except Exception:
                    # Content check failed: treat the proxy as hijacking traffic.
                    pass
        elif type == 'HTTPS' and httpsurl != 0:
            IPAgency = {
                "https": "http://" + ip + ":" + port
            }
            r = requests.get(httpsurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                usableIP.append(ip + ":" + port)
    except Exception:
        # Exception (not BaseException, which would swallow KeyboardInterrupt):
        # a dead or slow proxy is expected here, so drop it silently.
        pass

 

代码的主要部分就是上面这些,全部代码在下面:

import requests
import random
import threading
import re
from lxml import etree

# Pool of desktop-browser User-Agent strings; header() picks one at random
# per request so the scraper's traffic looks less uniform.
headersList = [
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
                "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            ]
# Verified proxies collected by detectionIP(), each stored as an "ip:port"
# string. Shared mutable module state, appended to from worker threads.
usableIP = []


def header():
    """Build request headers carrying a randomly chosen User-Agent."""
    return {"User-Agent": random.choice(headersList)}


def detectionIP(ip, port, type, httpsurl, httpurl):
    '''
    Check whether a single proxy works and, if so, record it in usableIP.

    ip, port -- proxy address, both strings
    type     -- "HTTP" or "HTTPS" (anything else is ignored)
    httpsurl / httpurl -- validation URLs; 0 disables that protocol's check

    Best-effort: any request or parse failure simply skips the proxy.
    '''
    try:
        if type == 'HTTP' and httpurl != 0:
            IPAgency = {
                "http": "http://" + ip + ":" + port
            }
            r = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                # Some "working" proxies hijack every request and redirect it
                # elsewhere, so also verify the page content is the real site.
                # Reuse the response already fetched instead of issuing a
                # second, identical request as the old code did.
                try:
                    r.encoding = "GBK"
                    s = etree.HTML(r.text)
                    # xpath() returns a list; [0] raises IndexError on a miss.
                    text = s.xpath('//a[@href="//www.wotingpingshu.com"]/text()')[0]
                    if text == "我听评书网":
                        usableIP.append(ip + ":" + port)
                except Exception:
                    # Content check failed: treat the proxy as hijacking traffic.
                    pass
        elif type == 'HTTPS' and httpsurl != 0:
            IPAgency = {
                "https": "http://" + ip + ":" + port
            }
            r = requests.get(httpsurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                usableIP.append(ip + ":" + port)
    except Exception:
        # Exception (not BaseException, which would swallow KeyboardInterrupt):
        # a dead or slow proxy is expected here, so drop it silently.
        pass


def GetIP(nub, HTTPUrl=0, HTTPSUrl=0):
    '''
    Collect working proxy IPs by scraping xicidaili.com page by page.

    nub      -- stop once more than this many verified proxies are collected
    HTTPUrl  -- URL used to validate HTTP proxies (0 disables HTTP checks)
    HTTPSUrl -- URL used to validate HTTPS proxies (0 disables HTTPS checks)

    Returns the module-level usableIP list of "ip:port" strings.
    '''
    print("正在获取代理IP...")
    page = 1
    while len(usableIP) <= nub:
        url = "http://www.xicidaili.com/nt/" + str(page)
        # Plain GET with a timeout; the old per-page Session was never reused
        # or closed, so it only leaked connections.
        html = requests.get(url, headers=header(), timeout=10).text
        selector = etree.HTML(html)
        # Column layout of the listing table: td[2]=IP, td[3]=port, td[6]=type.
        ipList = selector.xpath('//td[2]/text()')
        portList = selector.xpath('//td[3]/text()')
        typeList = selector.xpath('//td[6]/text()')

        # Checking proxies serially is slow, so spawn one thread per candidate
        # and join the whole batch before deciding whether more pages are needed.
        threads = []
        # zip() stops at the shortest list, so a short page no longer raises
        # IndexError the way the old hard-coded range(0, 99) did.
        for ip, port, ipType in zip(ipList, portList, typeList):
            t = threading.Thread(target=detectionIP,
                                 args=(ip, port, ipType, HTTPSUrl, HTTPUrl))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        page += 1
    print("获取代理IP成功")

    return usableIP


if __name__ == "__main__":
    # 测试用的是百度,但是百度屏蔽了几乎所有的免费IP代理
    IP = GetIP(3, 'http://www.wotingpingshu.com/',)
    # print("http:", ip['http'])
    pattern = re.compile(r'^(.*)\@(.*)$')
    f = open("IP.txt", "w")
    for i in IP:
        print(i)
        # print(pattern.match(i).group(1))
        f.write(i+'\n')
    f.close()

 

你可能感兴趣的:(python)