代理和selenium的使用——python

一、设置属性

import requests
from random import choice
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
import csv
import threading


class NoProxiesError(Exception):
    """Raised when proxy server information could not be obtained."""


class NetThread(Thread):
    """Thread that runs ``new_get_net_data`` for a single URL.

    The ``type`` value is stored on the thread object itself so that code
    executing inside the thread can read it back through
    ``threading.current_thread().type``.
    """

    def __init__(self, url, type, queue):
        Thread.__init__(self)
        # Keep everything the worker needs as plain attributes.
        self.url, self.type, self.queue = url, type, queue

    def run(self):
        """Thread body: fetch ``self.url`` and pass along the shared queue."""
        new_get_net_data(self.url, self.queue)


def update_proxies_pool():
    """Fetch proxy addresses from the Mogu proxy API and build a proxy pool.

    Returns:
        A non-empty list of ``requests``-style proxy mappings, one per proxy,
        each of the form ``{'http': '<ip>:<port>'}``.

    Raises:
        NoProxiesError: If the API does not answer with HTTP 200, the body is
            not a JSON object whose ``code`` is ``'0'``, or no proxy is listed.
        requests.RequestException: If the HTTP request fails or times out.
    """
    # NOTE(review): the appKey is hard-coded in the URL; consider moving it
    # to configuration instead of source control.
    resp = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs'
                        '?appKey=4338998cd0824d9d9d75f8905bd687ba&count=5&'
                        'expiryDate=0&format=1&newLine=2',
                        # Without a timeout the call can block forever.
                        timeout=10)
    if resp.status_code == 200:
        try:
            result = resp.json()
        except ValueError:
            # Body was not valid JSON; fall through to the error below.
            result = None
        if isinstance(result, dict) and result.get('code') == '0':
            proxies_pool = [
                {'http': f"{item['ip']}:{item['port']}"}
                for item in result.get('msg', [])
            ]
            # An empty pool is useless to callers (``choice([])`` fails),
            # so report it as a fetch failure instead of returning it.
            if proxies_pool:
                return proxies_pool
    raise NoProxiesError('获取代理服务器信息失败,请重试!!!')


# ====旧的===
def get_proxies():
    """Return one randomly chosen entry from a freshly fetched proxy pool."""
    pool = update_proxies_pool()
    return choice(pool)


def new_get_proxies():
    """Return a randomly chosen proxy mapping, retrying until one is available.

    Every failed attempt to build the pool prints a message and is retried;
    the function only returns once ``update_proxies_pool`` succeeds.

    Returns:
        One proxy mapping picked at random from the fetched pool.
    """
    while True:
        try:
            return choice(update_proxies_pool())
        except Exception:
            # Intentionally broad so any fetch/parse failure triggers a retry,
            # but not a bare ``except``: KeyboardInterrupt and SystemExit must
            # still be able to break out of this otherwise endless loop.
            print('iP获取异常')


# Proxy mapping shared by the request helpers in this file (they access it
# through the module-level name ``proxy``). It is fetched once, when this
# module-level statement executes.
proxy = new_get_proxies()


def new_get_net_data(url, queue):
    """Download ``url`` through the shared proxy and parse the response.

    Args:
        url: Address of the page to request.
        queue: Queue handed on to ``analysis_data`` to receive parsed rows.

    Any ``requests`` failure (including a timeout) is reported with a printed
    message and otherwise ignored; nothing is raised to the caller.
    """
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    try:
        # The timeout keeps a dead proxy from blocking this thread (and any
        # thread joining it) forever; a timeout is a RequestException.
        response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        # Force the body to be decoded as GBK.
        response.encoding = 'gbk'
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('请求失败!')



# ==================方案一:把所有分类的所有页的数据保存到一个文件中================
# proxy = get_proxies()
# 请求每一个页面的数据
def get_net_data(url, queue, max_retries=3):
    """Download ``url`` through the shared proxy, rotating the proxy on failure.

    Args:
        url: Address of the page to request.
        queue: Queue handed on to ``analysis_data`` to receive parsed rows.
        max_retries: How many times a new proxy is tried after a proxy error
            before giving up.

    Failures are reported with printed messages; nothing is raised to the
    caller. On a proxy error the module-level ``proxy`` is replaced.
    """
    global proxy
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    for _ in range(max_retries + 1):
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.exceptions.ProxyError:
            # Must be caught before RequestException (it is a subclass).
            # A failing proxy surfaces here, not as NoProxiesError, so this
            # is where the proxy has to be rotated.
            print('代理错误!')
            try:
                proxy = get_proxies()
            except (NoProxiesError, requests.RequestException):
                # No replacement proxy could be obtained; give up on this URL.
                return
            continue
        except requests.RequestException:
            print('请求失败!')
            return
        # Force the body to be decoded as GBK.
        response.encoding = 'gbk'
        analysis_data(response.text, queue)
        return
    # Every attempt ended in a proxy error.
    print('请求失败!')


# 数据解析
def analysis_data(data, queue: Queue):
    """Parse one listing page and queue a record for every book found.

    Args:
        data: HTML text of the page.
        queue: Queue that receives one ``{type: [name, author, image_url]}``
            dict per book, where ``type`` is read from the current thread's
            ``type`` attribute.

    Entries lacking an ``<img>`` with ``src``/``alt`` or lacking the author
    link are skipped rather than aborting the whole page.
    """
    # The calling thread is expected to carry a ``type`` attribute
    # (``NetThread`` sets one); it becomes the key of every queued record.
    current_thread = threading.current_thread()
    doc = BeautifulSoup(data, 'lxml')
    for li in doc.select('.seeWell.cf>li'):
        # ``li`` is already a parsed element, so query it directly instead of
        # serialising it and parsing it a second time.
        img = li.img
        author_links = li.select('span.l>a:nth-child(2)')
        if img is None or not author_links:
            continue
        image_url = img.get('src')
        name = img.get('alt')
        if image_url is None or name is None:
            continue
        au_name = author_links[0].get_text()
        queue.put({current_thread.type: [name, au_name, image_url]})


# 创建线程对象获取每个页面的数据
def get_all_data():
    """Start one ``NetThread`` per listing page plus a thread that saves the results.

    Pages 1-2 of categories 1-4 are requested; each worker is started as soon
    as it is created. A final thread running ``new_write_data`` receives the
    list of workers and the shared queue.
    """
    queue = Queue()
    workers = []
    for category in range(1, 5):
        for page_no in range(1, 3):
            worker = NetThread(
                f'http://www.quanshuwang.com/list/{category}_{page_no}.html',
                category,
                queue,
            )
            worker.start()
            workers.append(worker)

    # The writer waits on the workers itself, so this function returns
    # without blocking.
    Thread(target=new_write_data, args=(workers, queue)).start()


# 等待所有线程结束然后保存数据
def write_data(t_list, queue: Queue):
    """Wait for all worker threads, then save every queued row to one CSV file.

    Args:
        t_list: Threads to join before reading the queue.
        queue: Queue holding the rows to write; each item is written as one
            CSV row after the header.

    The output file is ``files/所有的小说数据.csv``; the ``files`` directory is
    created if it does not exist.
    """
    import os

    for t in t_list:
        t.join()
    # All producers are done, so a sentinel marks the end of the data.
    queue.put('end')

    # Drain the queue up to (and excluding) the sentinel.
    all_data = list(iter(queue.get, 'end'))

    # ``open`` would raise FileNotFoundError if the directory were missing.
    os.makedirs('files', exist_ok=True)
    with open('files/所有的小说数据.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['名字', '作者', '封面'])
        writer.writerows(all_data)
    print('完成!')


def new_write_data(t_list, queue: Queue):
    """Wait for all worker threads, then write one CSV file per category.

    Args:
        t_list: Threads to join before reading the queue.
        queue: Queue holding ``{category: row}`` dicts.

    A file ``files/类型<category>.csv`` is written for categories 1-4 (even
    when empty) and for any other category that appears in the queue. The
    ``files`` directory is created if it does not exist.
    """
    import os

    for t in t_list:
        t.join()
    # All producers are done, so a sentinel marks the end of the data.
    queue.put('end')

    # Categories 1-4 always get a file, with or without rows.
    all_data = {1: [], 2: [], 3: [], 4: []}

    for data in iter(queue.get, 'end'):
        for key, row in data.items():
            # setdefault keeps an unexpected category from raising KeyError.
            all_data.setdefault(key, []).append(row)

    # ``open`` would raise FileNotFoundError if the directory were missing.
    os.makedirs('files', exist_ok=True)
    for ty, rows in all_data.items():
        with open(f'files/类型{ty}.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['名称', '作者', '图片'])
            writer.writerows(rows)


# Script entry point: start the crawl only when this file is run directly.
if __name__ == '__main__':
    get_all_data()

# html = """
# (sample <li> markup not preserved; only its text content remains)
# 攻略极品 攻略极品 作者:萨琳娜 斗极品?
# 不!
# 我们的口号是:走极品的路,让极品...更多 马上阅读
# """
# s = BeautifulSoup(html, 'lxml')
# print(s.img)
# print(s.select('span.l>a:nth-child(2)'))
# q = Queue()
# q.put(100)
# q.put(200)
# q.put('end')
#
# while True:
#     data = q.get()
#     print(data)
#     if data == 'end':
#         break

    二、selenium的使用

    import time
    
    # 1. 基本使用
    # from selenium import webdriver
    #
    # # 创建浏览器
    # browser = webdriver.Chrome()
    # # 打开指定页面
    # browser.get('https://www.baidu.com')
    # time.sleep(5)
    # browser.close()
    
    # 2.配置浏览器
    # from selenium import webdriver
    # # 创建配置对象
    # options = webdriver.ChromeOptions()
    # # 1)不具备自动化测试工具的身份
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # # 2)取消图片加载
    # options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
    # # 创建浏览器对象
    # browser = webdriver.Chrome(options=options)
    # browser.get('https://www.baidu.com')
    # time.sleep(5)
    # browser.close()
    
    # 3.基本操作
    # from selenium import webdriver
    # from selenium.webdriver.common import keys
    #
    # from selenium.webdriver.support.ui import WebDriverWait
    # from selenium.webdriver.common.by import By
    # from selenium.webdriver.support import expected_conditions as EC
    #
    # browser = webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # # 获取标签
    # search_input = browser.find_element_by_id('kw')
    # print(search_input)
    # # 操作标签
    # search_input.send_keys('帅哥')
    # search_input.send_keys(keys.Keys.ENTER)
    # # 等待操作
    # wait = WebDriverWait(browser, 10)
    # wait.until(EC.presence_of_element_located((By.ID, 'head')))
    # # 获取相关信息
    # # print(browser.current_url)
    # # print(browser.page_source)
    # print(browser.get_cookies())
    # time.sleep(20)
    # browser.close()
    
    # 4. 简单的交互动作
    # from selenium import webdriver
    # browser = webdriver.Chrome()
    # browser.get('https://www.jd.com')
    # input = browser.find_element_by_id('key')
    # button = browser.find_element_by_css_selector('#search > div > div.form > button')
    # # 输入框输入内容
    # input.send_keys('美食')
    # # 点击事件
    # button.click()
    # time.sleep(10)
    # browser.close()
    
    # 5.动作链
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    #
    # browser = webdriver.Chrome()
    # url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    # browser.get(url)
    # browser.switch_to.frame('iframeResult')
    # source = browser.find_element_by_css_selector('#draggable')
    # target = browser.find_element_by_css_selector('#droppable')
    # # 创建动作链对象
    # actions = ActionChains(browser)
    # # actions.drag_and_drop(source, target)
    # actions.drag_and_drop_by_offset(source, 0, 200)
    # actions.perform()    # 开始执行动作链的动作
    #
    #
    # time.sleep(25)
    # browser.close()
    
    # 6.执行javascript代码
    # from selenium import webdriver
    #
    # browser = webdriver.Chrome()
    # browser.get('https://www.jd.com')
    # body = browser.find_element_by_css_selector('body')
    # print(body.size)
    # time.sleep(1)
    # browser.execute_script('window.scrollBy(0, 4474)')
    # # browser.execute_script('alert("底部")')
    # time.sleep(2)
    # print(body.size)
    # # time.sleep(10)
    # browser.close()
    
    # 7.前进和后退
    # import time
    # from selenium import webdriver
    # browser = webdriver.Chrome()
    # browser.get('https://www.baidu.com/')
    # browser.get('https://www.taobao.com/')
    # browser.get('https://www.jd.com/')
    # browser.back()
    # time.sleep(1)
    # browser.forward()
    # browser.close()
    
    # 8.选项卡
    from selenium import webdriver
    import time

    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        # Open a second, empty tab from JavaScript.
        browser.execute_script('window.open()')
        # browser.window_handles lists the handles of all tabs currently open.
        print(browser.window_handles)
        # Switch to the new tab and load a page in it.
        browser.switch_to.window(browser.window_handles[1])
        browser.get('https://taobao.com')
        time.sleep(1)
        # Switch back to the first tab.
        browser.switch_to.window(browser.window_handles[0])

        time.sleep(10)
    finally:
        # close() would only close the current tab and leave the second tab
        # and the driver process running; quit() shuts everything down, and
        # the finally clause runs it even if a step above fails.
        browser.quit()
    

    你可能感兴趣的:(selenium,python)