多线程+代理ip池 爬虫

# coding=utf-8

import tushare as ts
import pandas as pd
import requests
import json
import re
import time
from retrying import retry
from concurrent.futures import ThreadPoolExecutor
import random


def get_pro():
    list = ['122.114.31.177:808', '61.135.217.7:80', '113.121.243.109:808', '171.39.40.5:8123', '121.31.199.30:8123',
            '111.155.116.240:8123', '125.121.121.171:808', '115.213.178.192:808']

    return list






start = time.clock()  # 计时-开始

urlnum = range(8)
listdo = urlnum


while True:
    listye = []
    listno = []
    event = []
    @retry(stop_max_attempt_number=8)  # 设置最大重试次数
    def crawl(n):

        pro_list = get_pro()

        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

        proxies_l = {'http': pro_list[random.randint(0, len(pro_list))],

                     }
        print(proxies_l['http'])

        try:
            req = requests.get('http://httpbin.org/ip', headers=header, proxies=proxies_l)
            print('finish')
            listye.append(n)
            listdo.remove(n)
            print listdo

            return  req.text

        except:
            print('no proxies')
            listno.append(n)

    # 多线程
    def multithreading():

        number = listdo

        with ThreadPoolExecutor(max_workers=10) as executor:
            for result in executor.map(crawl, number, chunksize=10):
                event.append(result)

        return event


    event = multithreading()
    print 'listye'
    print listye
    print 'listno'
    print listno
    print 'listdo'
    print listdo




    if len(listdo) == 0:
        break

end = time.clock()  # 计时-结束
print ("爬取完成 用时:")
print (end - start)

你可能感兴趣的:(爬虫)