1. Managing the pool in a txt file: multi-threaded cookie harvesting, served to requests
cookie_server and cookie.txt both live in the cookie_pool directory.
1.1 The perpetual-motion cookie_server
import random
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Lock

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

sys.path.append(r"D:\Work\IPS")  # raw string keeps the backslashes literal
from redis_cli import IPS_  # the author's Redis proxy-pool client

ips = IPS_()
class Cookie_():
    def __init__(self):
        self.lock1 = Lock()
        self.IpUrls = [
            'https://xm.esfxiaoqu.zhuge.com/1007323/',
            'https://xm.esfxiaoqu.zhuge.com/1001471/',
            'https://xm.esfxiaoqu.zhuge.com/1007892/',
            'https://xm.esfxiaoqu.zhuge.com/1003688/',
            'https://xm.esfxiaoqu.zhuge.com/1007870/',
            'https://xm.esfxiaoqu.zhuge.com/1003892/',
            'https://xm.esfxiaoqu.zhuge.com/1003894/',
            'https://xm.esfxiaoqu.zhuge.com/1003896/',
            'https://xm.esfxiaoqu.zhuge.com/1004252/',
            'https://xm.esfxiaoqu.zhuge.com/1004266/',
        ]
        self.queue_ip = Queue()
        self.threadPoll = ThreadPoolExecutor(max_workers=8)
    def get_ip(self):
        # Pop one proxy from the pool; the lock serialises concurrent callers.
        self.lock1.acquire()
        ip = ips.one()
        self.lock1.release()
        return ip

    def thread_PullIP(self):
        # Submit 20 jobs to a pool of at most 8 worker threads; the main
        # thread continues immediately. Pass the callable itself to submit(),
        # not the result of calling it.
        for i in range(20):
            self.threadPoll.submit(self.pull_cookie)
    def pull_cookie(self):
        ip = self.get_ip()
        n = 0
        while True:
            driver = None
            try:
                url = random.choice(self.IpUrls)
                options = webdriver.ChromeOptions()
                options.add_experimental_option('excludeSwitches', ['enable-automation'])
                # options.add_argument('--headless')
                options.add_argument("--disable-blink-features=AutomationControlled")
                prefs = {
                    # do not load images
                    'profile.managed_default_content_settings.images': 2,
                    # block notification pop-ups
                    'profile.default_content_setting_values': {
                        'notifications': 2
                    }
                }
                options.add_experimental_option('prefs', prefs)
                # attach the proxy and a desktop User-Agent
                options.add_argument('--proxy-server=http://' + ip)
                options.add_argument(
                    'User-Agent={}'.format(
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36'))
                # path to the local chromedriver binary
                driver = webdriver.Chrome(options=options,
                                          executable_path=r'D:\zhoukai_workspace\WebDriver\chromedriver.exe')
                # hide navigator.webdriver before any page script runs
                driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                    "source": """
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined
                        })
                    """
                })
                driver.implicitly_wait(5)
                driver.set_page_load_timeout(20)
                driver.set_script_timeout(20)
                driver.get(url)
                time.sleep(1)
                if n > 15:
                    driver.quit()
                    return '', ip
                elif 'arg1=' in driver.page_source:
                    # still on the anti-bot challenge page: rotate the proxy and retry
                    n += 1
                    ip = self.get_ip()
                    driver.quit()
                else:
                    try:
                        acw_tc = driver.get_cookie(name='acw_tc')['value']
                        acw_sc__v2 = driver.get_cookie(name='acw_sc__v2')['value']
                        coo = 'acw_tc={0}; acw_sc__v2={1}'.format(acw_tc, acw_sc__v2)
                        time1 = time.time()
                        cookie = '{}+{}--{}'.format(coo, ip, time1)
                        print(cookie)
                        driver.quit()
                        # Stored as "cookie+ip--timestamp": this site binds the
                        # cookie to the requesting IP, so both are kept together.
                        with open('./cookie.txt', 'a') as f:
                            f.write(cookie)
                            f.write('\n')
                    except Exception:
                        n += 1
                        ip = self.get_ip()
                        driver.quit()
            except TimeoutException:
                if driver:
                    driver.quit()
                n += 1
                ip = self.get_ip()
                print('closed the driver window')
            except Exception as ex:
                if driver:
                    driver.quit()
                print(ex)
                n += 1
                ip = self.get_ip()
                print('closed the driver window')
    def delete_cookie(self):
        # Continuously rewrite cookie.txt, dropping entries older than 300 s.
        while True:
            datas = []
            with open('./cookie.txt', 'r') as f:
                for line in f.readlines():
                    datas.append(line.strip('\n'))  # strip the trailing newline
            with open('./cookie.txt', 'w') as f:
                for data in datas:
                    try:
                        local_time = float(data.split('--')[-1])
                        if int(time.time() - local_time) > 300:
                            print('{} --- expired'.format(data))
                            continue
                        f.write(data)
                        f.write('\n')
                    except Exception:
                        pass
    def run(self):
        self.thread_PullIP()
        # self.delete_cookie()


if __name__ == '__main__':
    Cookie_().run()
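redis_cli.IPS_ is the author's Redis-backed proxy-pool client and its source is not shown. A minimal sketch of what it could look like, assuming proxies are stored as "ip:port" strings in a Redis list (the class body, host, port, and key name here are placeholders, not the real configuration):

import redis

class IPS_:
    # Hypothetical proxy-pool client; only one() is used by cookie_server.
    def __init__(self, host='127.0.0.1', port=6379, key='proxy_pool'):
        self.key = key
        self.cli = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def one(self):
        # Rotate the list: pop a proxy from the tail and push it back to the
        # head, so concurrent callers cycle through the whole pool.
        return self.cli.rpoplpush(self.key, self.key)

Usage matches the code above: ips = IPS_() once at import time, then ips.one() whenever a worker needs a fresh proxy.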
1.2 cookie.txt is continuously appended with fresh cookies while expired ones are removed
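Each line follows the "cookie+ip--timestamp" layout that pull_cookie appends; the values below are fabricated for illustration:

acw_tc=0a47311716500000001234; acw_sc__v2=623f9c6eabcdef0123456789+116.208.12.34:8888--1650000000.123

Consumers first drop the timestamp with split('--')[0], then separate cookie from proxy with split('+').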
1.3 How a program that consumes the cookies calls the pool
import random
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import openpyxl
import requests

requests.packages.urllib3.disable_warnings()
sys.path.append(r"D:\Work\IPS")  # raw string for the Windows path
from redis_cli import IPS_
from del_cookie import delete_cook  # removes a spent cookie from the pool file

ips = IPS_()


class Test(object):
    def __init__(self):
        self.lastime = time.time()
        self.operexcel = OperateExcel()  # the author's Excel helper (source not shown)
        self.lock2 = Lock()
        self.lock3 = Lock()
    def get_cookie(self):
        # Block until at least one cookie is available in the pool file.
        while True:
            with open('./Cookie_pool/cookie.txt', 'r') as f:
                # keep only the "cookie+ip" part; "--timestamp" is dropped
                cookies = [cookie.split('--')[0] for cookie in f.readlines()]
            if cookies:
                self.lock3.acquire()
                cookie = random.choice(cookies)
                self.lock3.release()
                return cookie
            else:
                print('no cookies left, waiting')
                time.sleep(4.5)
    def base_parse(self, url, coo, ip):
        n = 0
        while True:
            proxy = {
                'http': 'http://' + ip,
                # a plain HTTP proxy also tunnels https via CONNECT,
                # so the scheme stays http://
                'https': 'http://' + ip
            }
            head = {
                'Cookie': coo,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36',
            }
            try:
                resp = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
                time.sleep(0.3)
                if n > 20:
                    # more than 20 attempts: log the page and give up on this url
                    with open('./book.txt', 'a') as f:
                        f.write(url + '\n' + resp.text + '\n')
                    return '', coo, ip
                elif '滑动验证页面' in resp.text:  # slider-captcha page
                    delete_cook(coo)
                    print('hit the slider-captcha page')
                    n += 1
                    cookie = self.get_cookie()
                    ip = cookie.split('+')[-1]
                    coo = cookie.split('+')[0]
                elif '访问失败' in resp.text:  # "access failed"
                    delete_cook(coo)
                    n += 1
                    cookie = self.get_cookie()
                    ip = cookie.split('+')[-1]
                    coo = cookie.split('+')[0]
                else:
                    if '访问成功' in resp.text:  # "access succeeded"
                        return resp.text, coo, ip
                    else:
                        delete_cook(coo)
                        cookie = self.get_cookie()
                        ip = cookie.split('+')[-1]
                        coo = cookie.split('+')[0]
            except requests.exceptions.RequestException:
                delete_cook(coo)
                time.sleep(1)
                n += 1
                cookie = self.get_cookie()
                ip = cookie.split('+')[-1]
                coo = cookie.split('+')[0]
                if n > 50:
                    return '', coo, ip
    def writeToExcel(self, file_path, all_list):
        # standalone writer, e.g. all_list = [['A', 'B', 'C'], [1, 2, 4], [4, 6, 7]]
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = 'Data'
        tit_list = ['Name', 'Price', '', '', '', '', '', '',
                    'oneyearinfo', '', 'latitude', 'longitude', 'houserentinfo']
        for tit in range(len(tit_list)):
            ws.cell(1, tit + 1).value = tit_list[tit]
        for r in range(len(all_list)):
            for c in range(len(all_list[r])):
                # openpyxl rows and columns are 1-indexed, hence the +2 / +1
                ws.cell(r + 2, c + 1).value = all_list[r][c]
        wb.save(file_path)  # remember to save after writing
        print('successfully wrote file: ' + file_path + ' !')
        return 1
    def get_list(self, file_path, threadpool):
        # first set up a function that resets arg2, then fetch a cookie+ip pair
        cookie = self.get_cookie()
        ip = cookie.split('+')[-1]
        coo = cookie.split('+')[0]
        print(ip, coo)
        # initialise the Excel workbook
        wbook, wsheet = self.operexcel.init_excel(file_path)
        row_num = 2  # next row to write
        base_url = 'https://www.666666.222.com/'
        print(base_url)
        response, coo, ip = self.base_parse(base_url, coo, ip)
        """
        intermediate steps omitted
        ...
        skip straight to multi-threaded crawling by page number
        """
        future_list = []
        for p in range(1, 100):
            page_url = 'https://www.666666.222.com/page={}'.format(p)
            future = threadpool.submit(self.handle_true, page_url)
            future_list.append(future)
        for future in as_completed(future_list):
            data_list_fin = future.result()  # result of one worker thread
            if data_list_fin:
                self.lock2.acquire()
                row_max_num = self.operexcel.write_excel(wbook, wsheet, file_path, row_num, data_list_fin)
                self.lock2.release()
                row_num = row_max_num + 1
        print('download finished')
    def handle_true(self, page_url):
        data_list = []
        cookie = self.get_cookie()
        ip = cookie.split('+')[-1]
        coo = cookie.split('+')[0]
        # parsing details omitted
        return data_list
if __name__ == '__main__':
    test = Test()
    # 4 requests crawler threads fed by 8 cookie-harvesting threads
    # (tune both numbers to the workload)
    threadpool = ThreadPoolExecutor(max_workers=4)
    outfile = './{}.xlsx'.format('your_file_name')
    fp = open(outfile, 'w')  # create/clear the output file
    fp.close()
    file_path = outfile
    test.get_list(file_path, threadpool)
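del_cookie.delete_cook is imported at the top of the consumer but its source is not shown. A minimal sketch, assuming it simply rewrites cookie.txt without any entry whose cookie part matches the spent cookie (the path and the matching rule are inferred from how the pool file is read above, not confirmed by the source):

def delete_cook(coo):
    # Hypothetical helper: drop every pool entry whose cookie part equals coo.
    # A production version would also need a lock, since cookie_server
    # appends to the same file concurrently.
    with open('./Cookie_pool/cookie.txt', 'r') as f:
        lines = [line.rstrip('\n') for line in f]
    with open('./Cookie_pool/cookie.txt', 'w') as f:
        for line in lines:
            if line.split('+')[0] == coo:
                continue  # spent or invalid cookie: do not write it back
            f.write(line + '\n')

Because get_cookie returns "cookie+ip" with the timestamp already stripped, matching on line.split('+')[0] lines up with the coo value the consumer passes in.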