一、Building a proxy pool and multithreaded scraping
import requests
from random import choice
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
import csv
import threading


class NoProxiesError(Exception):
    """Raised when the proxy API does not return usable proxy servers."""
    pass


class NetThread(Thread):
    """Worker thread that downloads one page and remembers its category type."""
    def __init__(self, url, type, queue):
        super().__init__()
        self.url = url
        self.type = type
        self.queue = queue

    def run(self):
        new_get_net_data(self.url, self.queue)
def update_proxies_pool():
    """Fetch proxy addresses and ports from the Mogu (mogumiao) proxy API and build an IP proxy pool."""
    proxies_pool = []
    resp = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs'
                        '?appKey=4338998cd0824d9d9d75f8905bd687ba&count=5&'
                        'expiryDate=0&format=1&newLine=2')
    if resp.status_code == 200:
        result = resp.json()
        if result['code'] == '0':
            for item in result['msg']:
                ip, port = item['ip'], item['port']
                # requests expects a scheme in the proxy address
                proxies_pool.append({'http': f'http://{ip}:{port}'})
            return proxies_pool
    raise NoProxiesError('Failed to fetch proxy server info, please retry!!!')
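

# For reference, the parsing above assumes the API returns JSON shaped roughly
# like this (field names come from the code above; the values are made-up examples):
# {
#     "code": "0",
#     "msg": [
#         {"ip": "113.120.62.86", "port": "28803"},
#         {"ip": "182.34.19.1", "port": "26942"}
#     ]
# }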
# ==== old version ====
def get_proxies():
    return choice(update_proxies_pool())


def new_get_proxies():
    """Retry until the proxy pool can be built, then return one proxy picked at random."""
    while True:
        try:
            proxies = update_proxies_pool()
            return choice(proxies)
        except Exception:
            # covers NoProxiesError as well as network errors while calling the API
            print('Failed to fetch proxy IPs, retrying...')
            continue
# Shared proxy used by all request functions below
proxy = new_get_proxies()
def new_get_net_data(url, queue):
    """Request one page through the shared proxy and hand the HTML to the parser (plan 2)."""
    global proxy
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # Send the request through the proxy
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')
# ========== Plan 1: save all categories and all pages into a single file ==========
# proxy = get_proxies()
# Request the data of a single page
def get_net_data(url, queue):
    global proxy
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # Send the request through the proxy; on a proxy failure, fetch a new proxy and retry
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')
    except NoProxiesError:
        print('Proxy error!')
        proxy = get_proxies()
        get_net_data(url, queue)
# Parse the page data
def analysis_data(data, queue: Queue):
    # current_thread() is the NetThread that called this function, so its custom
    # `type` attribute is available (this only works with plan 2's NetThread)
    currentThread = threading.current_thread()
    doc = BeautifulSoup(data, 'lxml')
    ul = doc.select('.seeWell.cf>li')
    for li in ul:
        li_doc = BeautifulSoup(str(li), 'lxml')
        image_url = li_doc.img.attrs['src']
        name = li_doc.img.attrs['alt']
        au_name = li_doc.select('span.l>a:nth-child(2)')[0].get_text()
        # Plan 1:
        # queue.put([name, au_name, image_url])
        # Plan 2: tag each record with the category of the thread that produced it
        queue.put({currentThread.type: [name, au_name, image_url]})
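

# For reference, a plan-2 queue item is assumed to look like this
# (hypothetical values): {3: ['BookName', 'AuthorName', 'http://.../cover.jpg']}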
# Create a thread for every page to fetch its data
def get_all_data():
    queue = Queue()
    t_list = []
    for type in range(1, 5):
        for page in range(1, 3):
            url = f'http://www.quanshuwang.com/list/{type}_{page}.html'
            # ===== Plan 1 =====
            # t = Thread(target=get_net_data, args=(url, queue))
            # ===== Plan 2 =====
            t = NetThread(url, type, queue)
            t.start()
            t_list.append(t)
    wait_t = Thread(target=new_write_data, args=(t_list, queue))
    wait_t.start()
# Wait for all worker threads to finish, then save the data (plan 1)
def write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    # All producers are done, so the sentinel is guaranteed to be the last item
    queue.put('end')
    all_data = []
    while True:
        data = queue.get()
        if data == 'end':
            break
        else:
            all_data.append(data)
    with open('files/all_novels.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'author', 'cover'])
        writer.writerows(all_data)
    print('Done!')
# Wait for all worker threads to finish, then save each category to its own file (plan 2)
def new_write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    queue.put('end')
    all_data = {
        1: [],
        2: [],
        3: [],
        4: []
    }
    while True:
        data = queue.get()
        if data == 'end':
            break
        key = list(data.keys())[0]
        all_data[key].append(data[key])
    for ty in all_data:
        with open(f'files/type_{ty}.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['name', 'author', 'cover'])
            writer.writerows(all_data[ty])
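

# The fan-out/fan-in shape used above (worker threads put results on a shared
# Queue; a collector joins the workers, then drains the queue until a sentinel)
# is shown below in a minimal, self-contained sketch. Names and values here are
# illustrative, not from the original scraper.
def _sentinel_demo():
    demo_queue = Queue()

    def producer(n):
        demo_queue.put(n * n)  # stand-in for one page's scraped records

    workers = [Thread(target=producer, args=(i,)) for i in range(4)]
    for t in workers:
        t.start()
    for t in workers:
        t.join()              # every producer has finished...
    demo_queue.put('end')     # ...so the sentinel is guaranteed to be last
    results = []
    while True:
        item = demo_queue.get()
        if item == 'end':
            break
        results.append(item)
    return results            # the squares 0, 1, 4, 9 in arrival order
# _sentinel_demo() can be called manually to see the pattern in isolation.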
if __name__ == '__main__':
    get_all_data()
# q = Queue()
# q.put(100)
# q.put(200)
# q.put('end')
#
# while True:
# data = q.get()
# print(data)
# if data == 'end':
# break
二、Using Selenium
import time

# 1. Basic usage
# from selenium import webdriver
#
# # Create the browser
# browser = webdriver.Chrome()
# # Open the given page
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()
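# webdriver.Chrome() assumes chromedriver is on the PATH; in Selenium 3 the
# driver location can also be passed explicitly (the path is a placeholder):
# browser = webdriver.Chrome(executable_path='/path/to/chromedriver')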
# 2. Configuring the browser
# from selenium import webdriver
# # Create the options object
# options = webdriver.ChromeOptions()
# # 1) Hide the "controlled by automated test software" banner
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# # 2) Disable image loading
# options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# # Create the browser object with the options
# browser = webdriver.Chrome(options=options)
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()
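# Another commonly used option is headless mode, which runs Chrome without
# opening a window; a sketch building on the same options object:
# options.add_argument('--headless')
# browser = webdriver.Chrome(options=options)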
# 3. Basic operations
# from selenium import webdriver
# from selenium.webdriver.common import keys
#
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
#
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com')
# # Find an element
# search_input = browser.find_element_by_id('kw')
# print(search_input)
# # Interact with the element
# search_input.send_keys('帅哥')
# search_input.send_keys(keys.Keys.ENTER)
# # Explicit wait
# wait = WebDriverWait(browser, 10)
# wait.until(EC.presence_of_element_located((By.ID, 'head')))
# # Read page information
# # print(browser.current_url)
# # print(browser.page_source)
# print(browser.get_cookies())
# time.sleep(20)
# browser.close()
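# Besides explicit waits, Selenium also supports an implicit wait that applies
# to every element lookup on this browser object (a usage sketch):
# browser.implicitly_wait(10)  # retry element lookups for up to 10 seconds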
# 4. Simple interactions
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# input = browser.find_element_by_id('key')
# button = browser.find_element_by_css_selector('#search > div > div.form > button')
# # Type into the input box
# input.send_keys('美食')
# # Click the button
# button.click()
# time.sleep(10)
# browser.close()
# 5. Action chains
# from selenium import webdriver
# from selenium.webdriver import ActionChains
#
# browser = webdriver.Chrome()
# url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
# browser.get(url)
# browser.switch_to.frame('iframeResult')
# source = browser.find_element_by_css_selector('#draggable')
# target = browser.find_element_by_css_selector('#droppable')
# # Create the action chain object
# actions = ActionChains(browser)
# # actions.drag_and_drop(source, target)
# actions.drag_and_drop_by_offset(source, 0, 200)
# actions.perform()  # run the queued actions
#
# time.sleep(25)
# browser.close()
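# The same drag can also be composed from primitive steps instead of the
# drag_and_drop helpers (a sketch reusing source/target from above):
# actions = ActionChains(browser)
# actions.click_and_hold(source).move_to_element(target).release().perform()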
# 6. Executing JavaScript
# from selenium import webdriver
#
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# body = browser.find_element_by_css_selector('body')
# print(body.size)
# time.sleep(1)
# browser.execute_script('window.scrollBy(0, 4474)')
# # browser.execute_script('alert("bottom")')
# time.sleep(2)
# print(body.size)
# # time.sleep(10)
# browser.close()
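# execute_script can also return a value to Python by using `return` in the
# JavaScript snippet (a usage sketch):
# page_height = browser.execute_script('return document.body.scrollHeight')
# print(page_height)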
# 7. Back and forward
# import time
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com/')
# browser.get('https://www.taobao.com/')
# browser.get('https://www.jd.com/')
# browser.back()
# time.sleep(1)
# browser.forward()
# browser.close()
# 8. Tabs
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
# browser.window_handles - all tabs currently open in this browser
print(browser.window_handles)
# Switch to the new tab
browser.switch_to.window(browser.window_handles[1])
browser.get('https://taobao.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])
time.sleep(10)
browser.close()
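# Note: browser.close() only closes the current tab; browser.quit() closes
# every tab and ends the driver session (a usage note):
# browser.quit()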