# selenium学习笔记 (Selenium study notes)

# 基础用法
from time import sleep

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

# 1. Zhipin (BOSS直聘) example: avoiding automation detection
# driver = webdriver.Chrome()
# driver.get('https://www.zhipin.com/')
# driver.find_element(By.XPATH, '//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('python')
# driver.find_element(By.XPATH, '//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
# Visiting the site directly as above reports an error, so the options are
# customised first and the driver is created from them.
option = ChromeOptions()
# Exclude the 'enable-automation' switch from the switches Chrome is started with.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
# option.add_experimental_option('useAutomationExtension',False)
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',{'source':'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'})
driver.get('https://www.zhipin.com/')
# find_element(By.XPATH, ...) is used instead of find_element_by_xpath, which
# recent Selenium releases no longer provide.
driver.find_element(By.XPATH, '//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('python')
# driver.find_element(By.XPATH, '//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
sleep(2)
# Note: with the newest Chrome releases this anti-detection trick stops working.

# 2. Baidu example: type a query, click search, switch tab, inspect the driver state
driver.maximize_window()  # alternative: driver.fullscreen_window()
driver.get('http://www.baidu.com')
# find_element(By.XPATH, ...) is used instead of find_element_by_xpath, which
# recent Selenium releases no longer provide.
driver.find_element(By.XPATH, '//*[@id="kw"]').send_keys('cj')
driver.find_element(By.XPATH, '//*[@id="su"]').click()
driver.find_element(By.XPATH, '//*[@id="s_tab"]/div/a[3]').click()  # the 'cj' search term is not carried over
print(driver.current_url)  # e.g. https://image.baidu.com/
print(driver.get_cookies())  # a list of dicts: [{'':'','':''},{'':'','':'','':''}]
print(driver.page_source.find('input'))  # index of the first 'input' in the HTML, e.g. 13613
# print(driver.page_source)
sleep(2)

# 3. Bilibili example: read attributes, type incrementally, clear, screenshot
driver.get('https://search.bilibili.com/')
# find_element(By.XPATH, ...) is used instead of find_element_by_xpath, which
# recent Selenium releases no longer provide.
node = driver.find_element(By.XPATH, '//*[@id="search-keyword"]')
print(node.get_attribute('maxlength'))  # e.g. 100
# Successive send_keys calls append to what is already in the input box.
node.send_keys('c')
node.send_keys('j')
print(node.get_attribute('value'))  # e.g. cj
node.clear()
node.send_keys('gjl')
driver.find_element(By.XPATH, '//*[@id="server-search-app"]/div/div/div[2]/a').click()
driver.save_screenshot('1.png')
sleep(2)

# 4. Shut down
driver.close()  # close the current browser window
driver.quit()  # end the WebDriver session

# 五、源码
# pycharm中,ctrl+b跳到定义
# from selenium.webdriver.chrome.webdriver import WebDriver
# from selenium.webdriver.common.by import By
# wb = WebDriver(executable_path='chromedriver')
# wb.execute('get', {'url': 'http://www.baidu.com'})
# element1 = wb.execute('findElement', {
#     'using': By.XPATH,
#     'value': '//input[@id="kw"]'
# })['value']
# element1._execute('sendKeysToElement', {'text': "cj", 'value': ""})
# element2 = wb.execute('findElement', {
#     'using': By.XPATH,
#     'value': '//input[@id="su"]'
# })['value']
# element2._execute('clickElement')

# 驱动下载路径:http://npm.taobao.org/mirrors/chromedriver
# 京东翻页按钮示例:常规等待,显式等待,隐式等待
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class JdSpider(object):
    """Search JD.com for a keyword and move on to the next result page.

    The class is a demo of three waiting strategies (fixed sleep, implicit
    wait, explicit wait); the fixed sleep is the one that is active, the
    other two are left as commented-out alternatives.
    """

    def __init__(self):
        """Store the start URL and launch a Chrome browser."""
        self.url = 'https://www.jd.com/'
        self.browser = webdriver.Chrome()
        # For the implicit-wait variant, enable the next line.
        # self.browser.implicitly_wait(10)
        # For the explicit-wait variant, enable the next line.
        # self.wait = WebDriverWait(self.browser, 10)

    def get_page(self):
        """Run the search and click "next page" unless it is disabled.

        Prints 'over' when the page source contains the disabled next-page
        marker; otherwise waits three seconds and clicks the next-page button.
        """
        self.browser.get(self.url)
        # find_element(By..., ...) is used instead of the find_element_by_*
        # helpers, which recent Selenium releases no longer provide.
        self.browser.find_element(By.XPATH, '//*[@id="key"]').send_keys('爬虫书籍')
        self.browser.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button').click()
        # The browser lives in self.browser; reading self.driver here raised
        # AttributeError because no such attribute is ever assigned.
        if 'pn-next disabled' in self.browser.page_source:
            # The next-page button carries its disabled class: nothing to click.
            print('over')
            return
        # Fixed sleep: the page loads slowly, so wait a preset time before
        # looking the button up, otherwise the lookup fails.
        time.sleep(3)
        self.browser.find_element(By.CLASS_NAME, 'pn-next').click()
        # Implicit wait: with implicitly_wait set, the lookup itself waits up
        # to the limit and raises if the time runs out.
        # self.browser.find_element(By.CLASS_NAME, 'pn-next').click()
        # Explicit wait: wait up to the limit for this specific element and
        # raise if it does not appear in time.
        # self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'pn-next'))).click()

    def main(self):
        """Run the demo and always shut the browser down afterwards."""
        try:
            self.get_page()
            time.sleep(3)
        finally:
            # quit() runs even when get_page() raises, so no browser process
            # is left behind; it also closes the window close() used to close.
            self.browser.quit()


# Run the JD demo: constructing JdSpider launches Chrome, main() performs the
# search and then shuts the browser down.
spider = JdSpider()
spider.main()


# CSDN example: headless mode + screenshot
from selenium import webdriver
from selenium.webdriver import ChromeOptions
option = ChromeOptions()
option.add_argument('--headless')  # run Chrome without a visible window
browser = webdriver.Chrome(options=option)
try:
    url = 'https://blog.csdn.net/'
    browser.get(url)
    browser.save_screenshot('CSDN.png')
finally:
    # Shut the headless browser down; without this its process would be left
    # running once the name `browser` is reused for another driver.
    browser.quit()
# JD example: scroll down, save rows to CSV, iterate over result elements
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv


def _parse_goods(text):
    """Split one result card's text into (price, name, comment, shop).

    Returns None when the card has too few lines to hold all four fields,
    so the caller can skip it instead of failing with IndexError.
    """
    goods_data = text.split('\n')
    # Some cards show two prices; the extra price line shifts the other
    # fields down by one.
    offset = 1 if len(goods_data) > 1 and '¥' in goods_data[1] else 0
    if len(goods_data) < offset + 4:
        return None
    return (goods_data[0], goods_data[offset + 1], goods_data[offset + 2], goods_data[offset + 3])


url = 'https://www.jd.com/'
browser = webdriver.Chrome()
try:
    browser.get(url)
    # find_element(By..., ...) is used instead of the find_element_by_*
    # helpers, which recent Selenium releases no longer provide.
    browser.find_element(By.XPATH, '//*[@id="key"]').send_keys('口罩')
    browser.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button/i').click()

    # The with-block closes the file even if scraping fails part-way. The
    # encoding is explicit so the output does not depend on the platform
    # default; utf-8-sig writes a BOM in front of the data.
    with open('Jd_goods.csv', 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(['商品价格', '商品名称', '评价人数', '店铺名称'])

        while True:
            time.sleep(1)
            # JavaScript executed in the page: scroll to the bottom.
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            li_list = browser.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
            for li in li_list:
                goods = _parse_goods(li.text)
                if goods is None:
                    continue  # card without the expected fields
                goods_price, goods_name, goods_comment, shop = goods
                print('price:'+goods_price, 'name:'+goods_name, 'comment:'+goods_comment, 'shop:'+shop, sep='\n', end='\n\n')
                writer.writerow([goods_price, goods_name, goods_comment, shop])
            # Stop once the next-page button carries its disabled class.
            if 'pn-next disabled' in browser.page_source:
                break
            browser.find_element(By.CLASS_NAME, 'pn-next').click()
finally:
    # Always shut the browser down, including when scraping raised.
    browser.quit()
# Douban example: entering a sub-page. When the page source contains an
# iframe, switch into it first, because its content is a separate document.
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import time

url = 'https://www.douban.com/'
# NOTE(review): credentials should not be kept in source. Set DOUBAN_USERNAME
# and DOUBAN_PASSWORD to override the demo values, which remain only as
# defaults so the script behaves as before when the variables are unset.
username = os.environ.get('DOUBAN_USERNAME', '18306658370')
password = os.environ.get('DOUBAN_PASSWORD', 'db@1234')

browser = webdriver.Chrome()
try:
    browser.get(url)

    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
    # which recent Selenium releases no longer provide.
    frame_node = browser.find_element(By.XPATH, '//*[@id="anony-reg-new"]/div/div[1]/iframe')
    browser.switch_to.frame(frame_node)

    browser.find_element(By.XPATH, '/html/body/div[1]/div[1]/ul[1]/li[2]').click()
    browser.find_element(By.XPATH, '//*[@id="username"]').send_keys(username)
    browser.find_element(By.XPATH, '//*[@id="password"]').send_keys(password)
    browser.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[1]/div[5]/a').click()

    time.sleep(3)
finally:
    # Always shut the browser down, including when a lookup above raised.
    browser.quit()
# NetEase Cloud Music example: download a song by id through the outer-url endpoint
import requests
api_url = 'http://music.163.com/song/media/outer/url?id=152428'
headers = {
    'user-agent': 'Mozilla/5.0'
}
# A timeout keeps a stalled connection from hanging the script forever.
resp = requests.get(api_url, headers=headers, timeout=10)
# Raise on 4xx/5xx instead of saving an error page under an .mp3 name.
resp.raise_for_status()
with open('朋友.mp3', 'wb') as f:
    f.write(resp.content)

# 朋友-152428
# 光年之外-486194122

 

# 你可能感兴趣的:(2021年研究生学习笔记)