python3 crawler (6): using proxies in a crawler

There are plenty of free proxies online, and since they're free, scraping some and putting them to use is a pretty good deal. Free also means no service guarantees: whether a proxy actually works is nobody's problem but yours, so you have to filter them yourself. I spent a couple of days on this and organized the code below:

Xici proxy:

#Xici: http://www.xicidaili.com/
#
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading

#endpoints for testing whether a proxy works
test_http = 'http://httpbin.org/get'
test_https = 'https://httpbin.org/get'

header ={
    'Accept':'*/*',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language':'zh-CN',
    'Accept-Encoding':'gzip, deflate',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)'
}

def pandas_to_xlsx(filename, info):  # save to xlsx
    pd_look = pd.DataFrame(info)
    pd_look.to_excel(filename, sheet_name='西刺-https')

def getAllHttpsProxy(url):
    try:
        for i in range(1, 2):  # only page 1 here; widen the range to fetch more pages
            time.sleep(2)
            cururl = url + str(i)
            print('Fetching proxy list from', cururl)
            webcontent = requests.get(cururl, headers=header)
            if webcontent.status_code != 200:
                print('Bad response, status code:', webcontent.status_code)
                continue
            soup = BeautifulSoup(webcontent.text, 'lxml')
            ip_list = soup.select('#ip_list')
            if len(ip_list) == 0:
                print('Unexpected page content:', webcontent.text)
                continue
            a = ip_list[0]
            b = a.select('tr')
            for item in b:
                if item.text.find('国家') < 0:  # skip the header row (its first column title is '国家')
                    td = item.select('td')
                    info = {}
                    info['ip'] = td[1].text
                    info['port'] = td[2].text
                    info['location'] = td[3].text
                    info['anonymity'] = td[4].text
                    info['type'] = td[5].text
                    info['alive_time'] = td[8].text
                    info['verify_time'] = td[9].text
                    allProxies.append(info)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        pandas_to_xlsx('all_proxies.xlsx', allProxies)
    return allProxies


def TestOneProxy(ip, port, n):
    proxy = ip + ':' + port
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    try:
        response = requests.get(test_http, proxies=proxies, timeout=3)
        if response.status_code == 200:
            print(n, '--proxy OK, ip:', ip, 'port:', port)
            return True
        else:
            print(n, '--proxy failed, ip:', ip, 'port:', port)
            return False
    except BaseException as e:
        print(n, '--Error', e.args)
        return False

#worker thread function
num = 0
def threadFun(n):
    global num
    while True:

        #claim a task
        lock.acquire()
        if num >= len(allProxies):
            lock.release()  # I forgot this line at first and spent a whole day debugging it
            break
        curTestProxy = allProxies[num]
        num = num + 1
        lock.release()
        #do the work
        if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
            canUseProxies.append(curTestProxy)

    print(n, '--done')

if __name__ == '__main__':
    allProxies = []
    canUseProxies = []

    #fetch all listed proxies in a single thread
    url = 'https://www.xicidaili.com/wn/'
    getAllHttpsProxy(url)

    # test availability with multiple threads
    lock = threading.Lock()
    res = []
    for i in range(15):  # create 15 worker threads
        t = threading.Thread(target=threadFun, args=("thread-%s" % i,))
        t.start()
        res.append(t)
    for r in res:  # wait for every thread to finish
        r.join()  # join() blocks until the thread exits, like wait() in C

    if len(canUseProxies) > 0:
        pandas_to_xlsx('usable_proxies.xlsx', canUseProxies)


print('end')
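
By the way, the manual lock-and-counter pattern in threadFun works, but it is exactly where the forgotten lock.release() cost me a day of debugging. A minimal alternative sketch using concurrent.futures.ThreadPoolExecutor, which manages the task queue and synchronization by itself; testAllProxies is a hypothetical helper, and allProxies and TestOneProxy are the ones defined above:

from concurrent.futures import ThreadPoolExecutor

def testAllProxies(allProxies, workers=15):
    # The pool hands tasks out internally, so no explicit Lock is needed.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map() yields one True/False result per proxy, in input order
        results = pool.map(lambda p: TestOneProxy(p['ip'], p['port'], 'pool'), allProxies)
        # keep only the proxies whose test returned True
        return [p for p, ok in zip(allProxies, results) if ok]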

Kuaidaili:

#Kuaidaili: https://www.kuaidaili.com/free/
#
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import time
from time import sleep

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#endpoints for testing whether a proxy works
test_http = 'http://httpbin.org/get'
test_https = 'https://httpbin.org/get'

header ={
    'Accept':'*/*',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language':'zh-CN',
    'Accept-Encoding':'gzip, deflate',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)'
}

def pandas_to_xlsx(filename, info):  # save to xlsx
    pd_look = pd.DataFrame(info)
    pd_look.to_excel(filename, sheet_name='快代理')

def TestOneProxy(ip, port, n):
    proxy = ip + ':' + port
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    try:
        response = requests.get(test_http, proxies=proxies, timeout=3)
        if response.status_code == 200:
            print(n, '--proxy OK, ip:', ip, 'port:', port)
            return True
        else:
            print(n, '--proxy failed, ip:', ip, 'port:', port)
            return False
    except BaseException as e:
        print(n, '--Error', e.args)
        return False

def getHttpsProxy(url):
    for i in range(1, 300):
        sleep(1)
        curUrl = url + str(i) + '/'
        try:
            print('Fetching proxy list from', curUrl)
            webcontent = requests.get(curUrl, headers=header, verify=False)
            if webcontent.status_code != 200:
                print('Bad response, status code:', webcontent.status_code)
                continue
            soup = BeautifulSoup(webcontent.text, 'lxml')
            table = soup.select('#list')  # the free-proxy table has id "list"
            if len(table) == 0:
                print('Unexpected page content:', webcontent.text)
                continue
            a = table[0].select('tbody')[0]
            b = a.select('tr')
            for item in b:
                td = item.select('td')
                info = {}
                info['ip'] = td[0].text
                info['port'] = td[1].text
                info['anonymity'] = td[2].text
                info['type'] = td[3].text
                info['location'] = td[4].text
                info['response_time'] = td[5].text
                info['last_verified'] = td[6].text
                allProxies.append(info)
        except requests.exceptions.ConnectionError as e:
            print('--Error', e.args)
            pandas_to_xlsx('all_proxies.xlsx', allProxies)
    return allProxies


#worker thread function
num = 0
def threadFun(n):
    global num
    while True:

        #claim a task
        lock.acquire()
        if num >= len(allProxies):
            lock.release()  # I forgot this line at first and spent a whole day debugging it
            break
        curTestProxy = allProxies[num]
        num = num + 1
        lock.release()
        #do the work
        if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
            canUseProxies.append(curTestProxy)

    print(n, '--done')

if __name__ == '__main__':
    allProxies = []
    canUseProxies = []

    #fetch all listed proxies in a single thread
    url = 'http://www.kuaidaili.com/free/inha/'
    getHttpsProxy(url)

    # test availability with multiple threads
    lock = threading.Lock()
    res = []
    for i in range(15):  # create 15 worker threads
        t = threading.Thread(target=threadFun, args=("thread-%s" % i,))
        t.start()
        res.append(t)
    for r in res:  # wait for every thread to finish
        r.join()  # join() blocks until the thread exits, like wait() in C

    if len(canUseProxies) > 0:
        pandas_to_xlsx('usable_proxies.xlsx', canUseProxies)


print('end')
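
Once usable_proxies.xlsx has been written, putting one of the validated proxies to work takes only a few lines with requests. A minimal sketch, assuming the file produced by pandas_to_xlsx above (reading xlsx with pandas needs openpyxl installed, and httpbin.org/ip is just a convenient echo endpoint):

import pandas as pd
import requests

# Load the proxies validated by the scripts above.
df = pd.read_excel('usable_proxies.xlsx')

# Pick one row at random and route a request through that proxy.
row = df.sample(1).iloc[0]
proxy = '%s:%s' % (row['ip'], row['port'])
proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
print(resp.text)  # httpbin echoes the origin IP, which should now be the proxy's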

Note:

When scraping the proxy lists themselves, it's best not to use multiple threads: high-frequency access gets you flagged by the server. Xici has a blacklist mechanism, with bans lasting roughly 12 hours, and Kuaidaili returns 503 errors under high-frequency access. Also, these list pages cannot be fetched through a proxy; the servers apparently restrict this, presumably by blocking access from proxy servers.
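
To stay under those limits, a small throttle-and-retry helper can wrap the page fetches. A minimal sketch; politeGet is a hypothetical helper, and the delay and retry values are illustrative guesses, not limits the sites document:

import time
import requests

def politeGet(url, retries=3, delay=5, **kwargs):
    # Back off with a growing pause whenever the server answers 503 (overloaded).
    resp = None
    for attempt in range(retries):
        resp = requests.get(url, **kwargs)
        if resp.status_code != 503:
            return resp
        wait = delay * (attempt + 1)
        print('Got 503, backing off for', wait, 'seconds')
        time.sleep(wait)
    return resp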
