# 基础用法
from selenium import webdriver
from time import sleep
from selenium.webdriver import ChromeOptions
# 1. Boss Zhipin example: avoiding automation detection
# driver = webdriver.Chrome()
# driver.get('https://www.zhipin.com/')
# driver.find_element_by_xpath('//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('python')
# driver.find_element_by_xpath('//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
# Visiting the site directly as above fails, so the options are adjusted
# before the driver is created.
# NOTE(review): the find_element_by_* helpers used below were removed in
# Selenium 4.3; switch to find_element(By.XPATH, ...) when upgrading.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
# option.add_experimental_option('useAutomationExtension',False)
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',{'source':'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'})
try:
    driver.get('https://www.zhipin.com/')
    driver.find_element_by_xpath('//*[@id="wrap"]/div[3]/div/div/div[1]/form/div[2]/p/input').send_keys('python')
    # driver.find_element_by_xpath('//*[@id="wrap"]/div[3]/div/div/div[1]/form/button').click()
    sleep(2)
    # Note: with the latest Chrome releases this anti-detection trick no longer works.

    # 2. Baidu example
    driver.maximize_window()  # driver.fullscreen_window()
    driver.get('http://www.baidu.com')
    driver.find_element_by_xpath('//*[@id="kw"]').send_keys('cj')
    driver.find_element_by_xpath('//*[@id="su"]').click()
    driver.find_element_by_xpath('//*[@id="s_tab"]/div/a[3]').click()  # the search term 'cj' is not carried over
    print(driver.current_url)  # https://image.baidu.com/
    print(driver.get_cookies())  # [{'':'','':''},{'':'','':'','':''}]
    print(driver.page_source.find('input'))  # 13613
    # print(driver.page_source)
    sleep(2)

    # 3. Bilibili example
    driver.get('https://search.bilibili.com/')
    node = driver.find_element_by_xpath('//*[@id="search-keyword"]')
    print(node.get_attribute('maxlength'))  # 100
    node.send_keys('c')
    node.send_keys('j')
    print(node.get_attribute('value'))  # cj
    node.clear()
    node.send_keys('gjl')
    driver.find_element_by_xpath('//*[@id="server-search-app"]/div/div/div[2]/a').click()
    driver.save_screenshot('1.png')
    sleep(2)

    # 4. Shut down: close the current window ...
    driver.close()
finally:
    # ... and always end the session, even if a step above raised, so no
    # Chrome/chromedriver process is left running.
    driver.quit()
# 五、源码
# pycharm中,ctrl+b跳到定义
# from selenium.webdriver.chrome.webdriver import WebDriver
# from selenium.webdriver.common.by import By
# wb = WebDriver(executable_path='chromedriver')
# wb.execute('get', {'url': 'http://www.baidu.com'})
# element1 = wb.execute('findElement', {
# 'using': By.XPATH,
# 'value': '//input[@id="kw"]'
# })['value']
# element1._execute('sendKeysToElement', {'text': "cj", 'value': ""})
# element2 = wb.execute('findElement', {
# 'using': By.XPATH,
# 'value': '//input[@id="su"]'
# })['value']
# element2._execute('clickElement')
# 驱动下载路径:http://npm.taobao.org/mirrors/chromedriver
# 京东翻页按钮示例:常规等待,显式等待,隐式等待
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class JdSpider(object):
    """Search JD.com for a keyword and click through to the next result page.

    The class is a demo of three waiting strategies: a fixed sleep (active),
    an implicit wait and an explicit wait (both left commented out).

    NOTE(review): the find_element_by_* helpers were removed in Selenium 4.3;
    switch to find_element(By.XPATH, ...) when upgrading.
    """

    def __init__(self):
        """Store the JD home page URL and start a Chrome session."""
        self.url = 'https://www.jd.com/'
        self.browser = webdriver.Chrome()
        # For the implicit-wait variant, enable the next line.
        # self.browser.implicitly_wait(10)
        # For the explicit-wait variant, enable the next line.
        # self.wait = WebDriverWait(self.browser, 10)

    def get_page(self):
        """Run the search, then either report the last page or go to the next one.

        Prints 'over' when the page source contains the disabled next-page
        marker; otherwise waits and clicks the next-page button.
        """
        self.browser.get(self.url)
        self.browser.find_element_by_xpath('//*[@id="key"]').send_keys('爬虫书籍')
        self.browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()
        # The driver is stored as self.browser; there is no self.driver
        # attribute, so reading it would raise AttributeError.
        if 'pn-next disabled' in self.browser.page_source:
            # The next-page button carries its disabled class: nothing left.
            print('over')
        else:
            # Fixed wait: the page loads slowly, so pause for a preset time,
            # otherwise the lookup below fails.
            time.sleep(3)
            self.browser.find_element_by_class_name('pn-next').click()
            # Implicit wait: set a maximum wait; continue as soon as the page
            # has loaded within that time, otherwise an exception is raised.
            # self.browser.find_element_by_class_name('pn-next').click()
            # Explicit wait: set a maximum wait; continue as soon as the given
            # element is present within that time, otherwise an exception is raised.
            # self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'pn-next'))).click()

    def main(self):
        """Run the demo once and always shut the browser down afterwards."""
        try:
            self.get_page()
            time.sleep(3)
            self.browser.close()
        finally:
            # Runs even when get_page() raises, so the Chrome process is not leaked.
            self.browser.quit()
# Entry point of the JD pagination demo: constructing the spider starts
# Chrome, and main() runs the search and then shuts the browser down.
spider = JdSpider()
spider.main()
# CSDN示例:无头模式+截图
from selenium import webdriver
from selenium.webdriver import ChromeOptions
# Headless mode: Chrome runs without a visible window.
option = ChromeOptions()
option.add_argument('--headless')
browser = webdriver.Chrome(options=option)
url = 'https://blog.csdn.net/'
try:
    browser.get(url)
    browser.save_screenshot('CSDN.png')
finally:
    # A headless browser has no window to close by hand, so end the session
    # explicitly; otherwise the Chrome process keeps running in the background.
    browser.quit()
# 京东示例:模拟下拉,保存csv,遍历元素及访问文件
from selenium import webdriver
import time
import csv
def _parse_goods(lines):
    """Pick (price, name, comment count, shop) out of one product card's text lines.

    Returns None when the card has too few lines to hold all four fields.
    """
    # Some cards show a second price on the second line; when they do, the
    # remaining fields are shifted down by one.
    offset = 1 if len(lines) > 1 and '¥' in lines[1] else 0
    if len(lines) < 4 + offset:
        return None
    return lines[0], lines[1 + offset], lines[2 + offset], lines[3 + offset]


url = 'https://www.jd.com/'
browser = webdriver.Chrome()
try:
    browser.get(url)
    browser.find_element_by_xpath('//*[@id="key"]').send_keys('口罩')
    browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button/i').click()
    # utf-8-sig keeps the Chinese text writable regardless of the OS locale,
    # and the BOM lets spreadsheet tools detect the encoding. The `with`
    # block closes the file even if scraping fails midway.
    with open('Jd_goods.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['商品价格', '商品名称', '评价人数', '店铺名称'])
        while True:
            time.sleep(1)
            # JavaScript: scroll to the bottom of the page.
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            for li in browser.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li'):
                goods = _parse_goods(li.text.split('\n'))
                if goods is None:
                    # Card without the expected fields: skip it instead of
                    # failing with IndexError.
                    continue
                goods_price, goods_name, goods_comment, shop = goods
                print(f'price:{goods_price}', f'name:{goods_name}',
                      f'comment:{goods_comment}', f'shop:{shop}',
                      sep='\n', end='\n\n')
                writer.writerow(goods)
            # The disabled next-page marker means this was the last page.
            if 'pn-next disabled' in browser.page_source:
                break
            browser.find_element_by_class_name('pn-next').click()
    browser.close()
finally:
    # Always end the session so no Chrome process is left behind.
    browser.quit()
# 豆瓣示例:进入子页面(查看源码如果有iframe的话要先转入iframe中,因为是另一个页面了)
from selenium import webdriver
import time
import os

url = 'https://www.douban.com/'
# NOTE(review): the account below is committed in plain text. The environment
# variables take precedence; the literals are kept only so the demo behaves
# as before. They should be removed and the password rotated.
username = os.environ.get('DOUBAN_USERNAME', '18306658370')
password = os.environ.get('DOUBAN_PASSWORD', 'db@1234')
browser = webdriver.Chrome()
try:
    browser.get(url)
    # The login form lives inside an iframe, which is a separate document, so
    # the driver has to switch into it before the form elements can be found.
    frame_node = browser.find_element_by_xpath('//*[@id="anony-reg-new"]/div/div[1]/iframe')
    browser.switch_to.frame(frame_node)
    browser.find_element_by_xpath('/html/body/div[1]/div[1]/ul[1]/li[2]').click()
    browser.find_element_by_xpath('//*[@id="username"]').send_keys(username)
    browser.find_element_by_xpath('//*[@id="password"]').send_keys(password)
    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div[5]/a').click()
    time.sleep(3)
    browser.close()
finally:
    # Always end the session, even if an element lookup above failed.
    browser.quit()
import requests
# Download one song from the NetEase Cloud Music outer-URL endpoint.
api_url = 'http://music.163.com/song/media/outer/url?id=152428'
headers = {
    'user-agent': 'Mozilla/5.0'
}
# Without a timeout requests can block forever on a stalled connection.
resp = requests.get(api_url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as an .mp3 file.
resp.raise_for_status()
with open('朋友.mp3', 'wb') as f:
    f.write(resp.content)
# Song title - id (value for the `id` query parameter):
# 朋友 - 152428
# 光年之外 - 486194122