04.Cookie池设计

1. 用txt文件作为池子管理cookie:多线程获取cookie,供requests程序取用
cookie_server 和 cookie.txt 放在cookie_pool目录下
1.1永动机cookie_server

import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import sys
sys.path.append("D:\Work\IPS")
from redis_cli import IPS_
from threading import Lock
ips = IPS_()
import random


class Cookie_():
    """Harvest anti-crawl cookies (acw_tc / acw_sc__v2) with Selenium through
    rotating proxies and append them to ./cookie.txt.

    Each line written is 'cookie+ip--timestamp': the target site binds the
    cookie to the proxy ip, and the timestamp lets delete_cookie() expire
    stale entries.
    """

    def __init__(self):
        # Serializes access to the shared Redis-backed proxy pool.
        self.lock1 = Lock()
        # Target URLs; one is picked at random per browser session.
        self.IpUrls  = ['https://xm.esfxiaoqu.zhuge.com/1007323/',
                        'https://xm.esfxiaoqu.zhuge.com/1001471/',
                        'https://xm.esfxiaoqu.zhuge.com/1007892/',
                        'https://xm.esfxiaoqu.zhuge.com/1003688/',
                        'https://xm.esfxiaoqu.zhuge.com/1007870/',
                        'https://xm.esfxiaoqu.zhuge.com/1003892/',
                        'https://xm.esfxiaoqu.zhuge.com/1003894/',
                        'https://xm.esfxiaoqu.zhuge.com/1003896/',
                        'https://xm.esfxiaoqu.zhuge.com/1004252/',
                        'https://xm.esfxiaoqu.zhuge.com/1004266/',
                    ]
        self.queue_ip = Queue()
        self.threadPoll = ThreadPoolExecutor(max_workers=8)

    def get_ip(self):
        """Return one proxy (ip:port string) from the pool, thread-safely."""
        with self.lock1:
            return ips.one()

    def thread_PullIP(self):
        """Submit 20 cookie-harvesting jobs onto the 8-worker pool; the main
        thread continues without waiting."""
        for _ in range(20):
            # BUG FIX: the original did submit(self.pull_cookie()), which
            # CALLED pull_cookie in the main thread (serially, blocking) and
            # submitted its return value. Pass the callable itself.
            self.threadPoll.submit(self.pull_cookie)

    def pull_cookie(self):
        """Drive a Chrome session through a proxy until valid cookies are
        captured, then append 'cookie+ip--timestamp' to ./cookie.txt.

        Gives up and returns ('', ip) after more than 15 failed attempts.
        """
        ip = self.get_ip()
        n = 0
        while True:
            # BUG FIX: webdriver.Chrome() itself can raise; guard so the
            # except handlers never call quit() on an unbound name.
            driver = None
            try:
                url = random.choice(self.IpUrls)
                options = webdriver.ChromeOptions()
                # Hide the "controlled by automated software" banner.
                options.add_experimental_option('excludeSwitches', ['enable-automation'])
                # options.add_argument('--headless')
                options.add_argument("--disable-blink-features=AutomationControlled")

                # Skip images and notification pop-ups to speed up loads.
                prefs = {
                        'profile.managed_default_content_settings.images': 2,
                         'profile.default_content_setting_values': {
                             'notifications': 2
                         }
                         }
                options.add_experimental_option('prefs', prefs)

                # Route through the pooled proxy and spoof a desktop UA.
                options.add_argument(('--proxy-server=http://' + ip))
                options.add_argument(
                    'User-Agent={}'.format(
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36'))

                # Local chromedriver path.
                driver = webdriver.Chrome(options=options,executable_path='D:\zhoukai_workspace\WebDriver\chromedriver.exe')
                # Mask navigator.webdriver before any page script runs.
                driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                    "source": """
                                                                Object.defineProperty(navigator, 'webdriver', {
                                                                  get: () => undefined
                                                                })
                                                              """
                })
                driver.implicitly_wait(5)
                driver.set_page_load_timeout(20)
                driver.set_script_timeout(20)
                driver.get(url)
                time.sleep(1)
                if n > 15:
                    # Retry budget exhausted; hand back an empty cookie.
                    driver.quit()
                    return '', ip
                elif 'arg1=' in driver.page_source:
                    # Anti-bot challenge page: rotate proxy and retry.
                    n += 1
                    ip = self.get_ip()
                    driver.quit()
                else:
                    try:
                        acw_tc = driver.get_cookie(name='acw_tc')['value']
                        acw_sc__v2 = driver.get_cookie(name='acw_sc__v2')['value']
                        coo = 'acw_tc={0}; acw_sc__v2={1}'.format(acw_tc, acw_sc__v2)
                        time1 = time.time()
                        cookie = '{}+{}--{}'.format(coo,ip,time1)
                        print(cookie)
                        driver.quit()
                        # Persist as cookie+ip--timestamp (the site binds the
                        # cookie to the ip, so both must travel together).
                        with open('./cookie.txt', 'a') as f:
                            f.write(cookie)
                            f.write('\n')
                    except (TypeError, KeyError):
                        # get_cookie() returned None (cookie absent) — was a
                        # bare except in the original; retry with a new proxy.
                        n += 1
                        ip = self.get_ip()
                        driver.quit()
            except TimeoutException:
                if driver is not None:
                    driver.quit()
                n += 1
                ip = self.get_ip()
                print('关闭drive界面')
            except Exception as ex:
                if driver is not None:
                    driver.quit()
                print(ex)
                n += 1
                ip = self.get_ip()
                print('关闭drive界面')

    def delete_cookie(self):
        """Endlessly rewrite ./cookie.txt, dropping entries whose trailing
        '--<timestamp>' field is older than 300 seconds."""
        while True:
            with open('./cookie.txt', 'r') as f:
                datas = [line.strip('\n') for line in f.readlines()]

            with open('./cookie.txt', 'w') as f:
                for data in datas:
                    try:
                        local_time = float(data.split('--')[-1])
                        if int(float(time.time()) - local_time) > 300:
                            print('{} --- 过期'.format(data))
                            continue
                        f.write(data)
                        f.write('\n')
                    except ValueError:
                        # Malformed line (no parsable timestamp): drop it.
                        pass
            # BUG FIX: the original spun without pausing, rewriting the file
            # continuously; re-scan every few seconds instead.
            time.sleep(5)

    def run(self):
        """Entry point: launch the harvesting jobs. delete_cookie() is run
        separately (or uncommented here) to prune expired entries."""
        self.thread_PullIP()

        # self.delete_cookie()

if __name__ == '__main__':
    # Start the never-ending cookie harvester.
    cookie_server = Cookie_()
    cookie_server.run()

1.2 cookie.txt文件会源源不断地添加cookie,同时也会删除一些失效的cookie
1.3需要使用cookie的程序的调用方式

import requests,sys,openpyxl
requests.packages.urllib3.disable_warnings()
sys.path.append("D:\Work\IPS")
from redis_cli import IPS_
ips = IPS_()
import time
from threading import Lock
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from del_cookie import delete_cook

class Test(object):
    """Scrape listing pages with requests, drawing 'cookie+ip' pairs from the
    cookie pool file that cookie_server keeps topped up, and write the
    extracted rows to an excel workbook.
    """

    def __init__(self):
        self.lastime = time.time()
        self.operexcel = OperateExcel()
        self.lock2 = Lock()  # serializes excel writes across worker threads
        self.lock3 = Lock()  # serializes cookie selection

    def get_cookie(self):
        """Block until ./Cookie_pool/cookie.txt has entries, then return one
        random 'cookie+ip' string (the trailing '--timestamp' is stripped)."""
        while 1:
            with open('./Cookie_pool/cookie.txt', 'r') as f:
                # Renamed from 'list' — the original shadowed the builtin.
                pool = [cookie.split('--')[0] for cookie in f.readlines()]
            if pool:
                with self.lock3:
                    return random.choice(pool)
            print('没有cookie了 在等待')
            time.sleep(4.5)

    def base_parse(self,url,coo,ip):
        """GET *url* through proxy *ip* with cookie *coo*, rotating the pair
        on every failure signature. Returns (html, coo, ip); html is '' once
        the retry budget is exhausted."""
        n = 0
        while True:
            proxy = {
                'http': 'http://' + ip,
                'https': 'https://' + ip
            }
            head = {
                'Cookie': coo,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36',
            }
            try:
                resp = requests.get(url, headers=head,proxies=proxy,verify=False,timeout=5)
                time.sleep(0.3)
                if n > 20:
                    # Retry budget exhausted: dump the last response for
                    # offline inspection, then give up on this url.
                    with open('./book.txt','a') as f:
                        f.write(url + '\n' + resp.text + '\n')
                    return '',coo,ip
                elif '滑动验证页面' in resp.text:
                    delete_cook(coo)
                    print('滑动验证页面')
                    n += 1
                    # BUG FIX: the original called self.get_ip(), a method
                    # this class does not define (AttributeError at runtime).
                    # Rotate to a fresh cookie+ip pair like the sibling
                    # failure branches instead.
                    cookie = self.get_cookie()
                    ip = cookie.split('+')[-1]
                    coo = cookie.split('+')[0]
                elif '访问失败' in resp.text:
                    delete_cook(coo)
                    n += 1
                    cookie = self.get_cookie()
                    ip = cookie.split('+')[-1]
                    coo = cookie.split('+')[0]
                elif '访问成功' in resp.text:
                    return resp.text,coo,ip
                else:
                    # Unknown page: discard this cookie, try another pair
                    # (deliberately does not consume the retry budget).
                    delete_cook(coo)
                    cookie = self.get_cookie()
                    ip = cookie.split('+')[-1]
                    coo = cookie.split('+')[0]
            except requests.exceptions.RequestException:
                # Network/proxy failure: drop the cookie and rotate.
                delete_cook(coo)
                time.sleep(1)
                n += 1
                cookie = self.get_cookie()
                ip = cookie.split('+')[-1]
                coo = cookie.split('+')[0]
                if n > 50:
                    return '',coo,ip

    @staticmethod
    def writeToExcel(file_path, all_list):
        """Write *all_list* (a list of row lists) to *file_path* as an xlsx
        sheet titled 'Data', with a fixed header row. Returns 1 on success.

        BUG FIX: declared @staticmethod — the original had neither the
        decorator nor a self parameter, so calling it on an instance raised
        TypeError. Test.writeToExcel(path, rows) keeps working unchanged.
        """
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = 'Data'
        tit_list = ['Name', 'Price','','','', '', '', '','oneyearinfo','','latitude','longtitude','houserentinfo']
        # openpyxl cells are 1-indexed; data starts at row 2 under the header.
        for col, tit in enumerate(tit_list, start=1):
            ws.cell(1, col).value = tit
        for r, row in enumerate(all_list, start=2):
            for c, value in enumerate(row, start=1):
                ws.cell(r, c).value = value
        wb.save(file_path)  # nothing is written to disk until save()
        print("成功写入文件: " + file_path + " !")
        return 1

    def get_list(self,file_path,threadpool):
        """Fetch the listing index, fan the per-page scraping out over
        *threadpool*, and append each completed page's rows to the excel
        file at *file_path*."""
        # Grab an initial cookie+ip pair.
        cookie = self.get_cookie()
        ip = cookie.split('+')[-1]
        coo = cookie.split('+')[0]

        print(ip,coo)
        # Initialize the excel workbook/sheet.
        wbook, wsheet = self.operexcel.init_excel(file_path)
        row_num = 2  # next free row (row 1 holds the headers)

        base_url = 'https://www.666666.222.com/'
        print(base_url)
        response,coo,ip = self.base_parse(base_url,coo,ip)

        # ... intermediate parsing omitted in the original; go straight to
        # the threaded per-page crawl ...

        future_list = []
        for p in range(1, 100):
            page_url = 'https://www.666666.222.com/page={}'.format(p)
            future_list.append(threadpool.submit(self.handle_true, page_url))
        for future in as_completed(future_list):
            data_list_fin = future.result()  # rows scraped by the worker
            if data_list_fin:
                # Excel writes are not thread-safe; guard with lock2.
                with self.lock2:
                    row_max_num = self.operexcel.write_excel(wbook, wsheet, file_path, row_num,data_list_fin)
                row_num = row_max_num + 1
        print('下载完成了')

    def handle_true(self,page_url):
        """Worker: scrape one listing page and return its extracted rows."""
        data_list = []
        cookie = self.get_cookie()
        ip = cookie.split('+')[-1]
        coo = cookie.split('+')[0]

        # ... intermediate scraping omitted in the original ...

        return data_list

if __name__ == '__main__':
    scraper = Test()
    # 4 requests worker threads, fed by the 8 cookie-harvesting threads
    # (tune the ratio to the workload).
    workers = ThreadPoolExecutor(max_workers=4)

    file_path = './' + '{}.xlsx'.format('给文件取的名字')
    # Create / truncate the output file before writing.
    open(file_path, 'w').close()
    scraper.get_list(file_path, workers)

你可能感兴趣的:(Python3,&,Crawler,爬虫)