代码演示:
import time
from selenium import webdriver
# Demo: search Baidu for "python" in a headless browser and save a screenshot.
# Load the driver.
# If the driver executable is not on the PATH, pass its location explicitly:
# driver = webdriver.PhantomJS(r'C:\Users\王佳欣的windows\Desktop\phantomjs.exe')
# NOTE(review): PhantomJS support was dropped from newer Selenium releases —
# confirm the installed Selenium version still provides webdriver.PhantomJS.
driver = webdriver.PhantomJS()
# Load the page.
driver.get('https://www.baidu.com')
# Type the query into the element whose id is "kw".
driver.find_element_by_id('kw').send_keys('python')
# Find the element whose id is "su" and click it.
driver.find_element_by_id('su').click()
time.sleep(2)
# print(driver.page_source) # get the page source
print(driver.current_url) # show the URL currently loaded
driver.save_screenshot('baidu.jpg')
import time
from selenium import webdriver
# Demo: open a page in Chrome, maximize the window, then close and quit.
# Load the driver.
driver=webdriver.Chrome()
# Load the page.
driver.get('https://baidu.com/') # opens the site directly
driver.maximize_window() # maximize the window
time.sleep(3)
driver.close() # closes the current window
time.sleep(1)
# Driver shutdown provided by the Python bindings.
driver.quit()# closes the browser
find_element_by_id:根据id来查找某个元素
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate an element by its id and type into it.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_element_by_id: look up an element by its id.
drvier.find_element_by_id('kw').send_keys('王佳欣') # first way
# drvier.find_element(By.ID, 'kw').send_keys('李蕊') # second way
find_element_by_class_name:根据类名查找元素
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate an element by its class name and type into it.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_element_by_class_name: look up an element by its class name.
# Both calls below are active, so the text is sent twice.
drvier.find_element_by_class_name('s_ipt').send_keys('李蕊') # first way
drvier.find_element(By.CLASS_NAME, 's_ipt').send_keys('李蕊') # second spelling
find_element_by_name:根据name属性的值来查找元素
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate an element by the value of its name attribute.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_element_by_name: look up an element by the value of its name attribute.
# drvier.find_element_by_name('wd').send_keys('李蕊') # first way
drvier.find_element(By.NAME, 'wd').send_keys('李蕊') # second way
find_element_by_tag_name:根据标签名来查找元素
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate an element by its tag name and print the returned object.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_element_by_tag_name: look up an element by its tag name.
head = drvier.find_element_by_tag_name('head')
print(head)
find_element_by_xpath:根据xpath语法来获取元素
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate an element with an XPath expression and type into it.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_element_by_xpath: look up an element using XPath syntax.
drvier.find_element_by_xpath('//input[@id="kw"]').send_keys('景甜')
find_elements_by_css_selector:根据css选择器来定位元素(返回所有匹配元素组成的列表)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: locate elements with a CSS selector and type into the first match.
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# find_elements_by_css_selector returns a list of matching elements, and a
# list has no send_keys(); pick the first match before typing.
matches = drvier.find_elements_by_css_selector('.s_ipt')
if matches:
    matches[0].send_keys('李蕊')
send_keys():设置内容
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: set the content of an input with send_keys().
# Load the driver.
drvier = webdriver.Chrome()
# Load the page.
drvier.get('https://www.baidu.com/')
time.sleep(2)
# send_keys() sets (types) the content.
input_Tag = drvier.find_element_by_id('kw')
input_Tag.send_keys('李蕊')
clear():清空内容
from selenium import webdriver
import time
# Demo: type into an input, wait, then empty it again with clear().
drvier = webdriver.Chrome()
drvier.get('https://www.baidu.com/')
# drvier.find_element_by_id('su') # find the Baidu search button
# drvier.find_element_by_id('wrapper').send_keys('王佳欣') # raises, because "wrapper" is not an input tag
# send_keys() sets the content.
input_Tag = drvier.find_element_by_id('kw')
input_Tag.send_keys('李蕊')
time.sleep(3)
# clear() empties the content.
input_Tag.clear()
click():点击
from selenium import webdriver
import time
# Demo: type into the input, wait, then click the element with id "su".
drvier = webdriver.Chrome()
drvier.get('https://www.baidu.com/')
# drvier.find_element_by_id('su') # find the Baidu search button
# drvier.find_element_by_id('wrapper').send_keys('王佳欣') # raises, because "wrapper" is not an input tag
# send_keys() sets the content.
input_Tag = drvier.find_element_by_id('kw')
input_Tag.send_keys('李蕊')
time.sleep(3)
# clear() empties the content.
# input_Tag.clear()
# Note: input_tag (lower-case "t") is a different name from input_Tag above.
input_tag = drvier.find_element_by_id('su')
input_tag.click()
注意:如果在页面中无法获取某个元素,要先检查该元素是否位于iframe标签内。
iframe是一个HTML标签,作用是在当前文档中嵌入另一个文档。
如果要找的元素嵌套在iframe标签里,就必须先切换到这个iframe,再去定位元素。
引用select类:
from selenium.webdriver.support.ui import Select
# Usage pattern for Select (fragment: `drvier` is not created in this fragment itself).
# Wrap the located element in a Select helper.
select_tag = Select(drvier.find_element_by_class_name('nojs'))
# Choose an option by its value.
select_tag.select_by_value('CA')
# Choose an option by its index.
select_tag.select_by_index(1)
from selenium import webdriver
from selenium.webdriver.support.ui import Select
# Demo: switch into an iframe and drive a <select> element inside it.
drvier = webdriver.Chrome()
# Load the site.
drvier.get('https://www.17sucai.com/pins/demo-show?id=5926')
# Switch into the iframe. switch_to_frame() is deprecated, so use the
# switch_to.frame() spelling instead.
drvier.switch_to.frame(drvier.find_element_by_id('iframe'))
select_tag = Select(drvier.find_element_by_class_name('nojs'))
# A <select> option can be chosen in two ways:
# 1. by value
select_tag.select_by_value('CA')
# 2. by index
select_tag.select_by_index(1)
代码演示:
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select
# Demo: switch into an iframe, open a custom drop-down and click one entry.
driver = webdriver.Chrome()
# Load the site.
driver.get('https://www.17sucai.com/pins/demo-show?id=5926')
# Switch into the iframe. switch_to_frame() is deprecated, so use the
# switch_to.frame() spelling instead.
driver.switch_to.frame(driver.find_element_by_id('iframe'))
# Find the drop-down container and click it to open the list.
divTag = driver.find_element_by_id('dk_container_country-nofake')
divTag.click()
time.sleep(1)
# Click the link inside the fifth list item of the opened drop-down.
driver.find_element_by_xpath('//*[@id="dk_container_country-nofake"]/div/ul/li[5]/a').click()
注意:
1、检查元素标签是否被iframe嵌套
2、这个标签是否需要点击
3、开始操作表单元素
代码实现:
from selenium import webdriver
import os
import time
# Demo: log in to douban.com through the login form embedded in an iframe.
# Start the driver.
driver = webdriver.Chrome()
# Open the page.
driver.get('https://www.douban.com/')
time.sleep(2)
# The login form sits inside an iframe, so switch into it first. Without the
# switch the lookups below fail with:
#   Message: no such element: Unable to locate element:
#   {"method":"css selector","selector":".account-tab-account"}
login_frame = driver.find_element_by_xpath('//*[@id="anony-reg-new"]/div/div[1]/iframe')
driver.switch_to.frame(login_frame)
# Switch the login mode to password login.
content = driver.find_element_by_class_name('account-tab-account')
content.click()
time.sleep(1)
# Credentials are read from the environment rather than hard-coded, so real
# account details never end up in the source file.
account = os.environ.get('DOUBAN_USERNAME', '')
password = os.environ.get('DOUBAN_PASSWORD', '')
# Enter the account name.
driver.find_element_by_id('username').send_keys(account)
time.sleep(0.5)
# Enter the password.
driver.find_element_by_id('password').send_keys(password)
time.sleep(0.5)
# Click the login button.
driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div[5]/a').click()
# Action-chain usage pattern (fragment: ActionChains, driver, inputTag and
# submitTag are not defined in this fragment itself).
actions = ActionChains(driver)
# Queue: move to the input, then send it the text 'python'.
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag,'python')
# Queue: move to the submit element, right-click, then click it.
actions.move_to_element(submitTag)
actions.context_click()
actions.click(submitTag)
# Submit the queued actions.
actions.perform()
click_and_hold(element):点击但不松开鼠标。
context_click(element):右键点击。
double_click(element):双击。
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
# Demo: type into the Baidu input and click the search button via an action chain.
driver = webdriver.Chrome()
driver.get('https://www.baidu.com/')
# Locate the input box.
input_Tag = driver.find_element_by_id('kw')
# Locate the Baidu button.
button_Tag = driver.find_element_by_id('su')
# Create the action-chain object.
actions = ActionChains(driver)
# Queue typing the text into the input box.
actions.send_keys_to_element(input_Tag, '李蕊')
# NOTE(review): this sleep runs while the chain is still being built, i.e.
# before perform() is called — confirm that is the intended place to pause.
time.sleep(2)
# Queue moving the mouse to the button and clicking it.
actions.move_to_element(button_Tag)
actions.click()
# Submit the queued action chain.
actions.perform()
# button_Tag.click()
# The button click could also be done outside the action chain; it works, but is not recommended.
# Cookie API overview (fragment: `driver` and `name` are not defined here).
# Fetch the cookies held by the driver.
cookies = driver.get_cookies()
# Fetch a single cookie, passing `name` (presumably the cookie's name — confirm).
value = driver.get_cookie(name)
# Delete the cookie identified by 'key'.
driver.delete_cookie('key')
https://xui.ptlogin2.qq.com/cgi-bin/xlogin?proxy_url=https%3A//qzs.qq.com/qzone/v6/portal/proxy.html&daid=5&&hide_title_bar=1&low_login=0&qlogin_auto_login=1&no_verifyimg=1&link_target=blank&appid=549000912&style=22&target=self&s_url=https%3A%2F%2Fqzs.qzone.qq.com%2Fqzone%2Fv5%2Floginsucc.html%3Fpara%3Dizone&pt_qr_app=手机QQ空间&pt_qr_link=http%3A//z.qzone.com/download.html&self_regurl=https%3A//qzs.qq.com/qzone/v6/reg/index.html&pt_qr_help_link=http%3A//z.qzone.com/download.html&pt_no_auth=0
代码演示:
# What cookies are used for here:
# 1. getting past anti-scraping checks
# 2. simulating a logged-in session
import json
import requests
from selenium import webdriver
import time
driver = webdriver.Chrome()
# Getting Baidu's cookies:
# cookies = driver.get_cookies()
# for cookie in cookies:
#     print(cookie)
# Simulate logging in to QQ Zone.
driver.get('https://xui.ptlogin2.qq.com/cgi-bin/xlogin?proxy_url=https%3A//qzs.qq.com/qzone/v6/portal/proxy.html&daid=5&&hide_title_bar=1&low_login=0&qlogin_auto_login=1&no_verifyimg=1&link_target=blank&appid=549000912&style=22&target=self&s_url=https%3A%2F%2Fqzs.qzone.qq.com%2Fqzone%2Fv5%2Floginsucc.html%3Fpara%3Dizone&pt_qr_app=%E6%89%8B%E6%9C%BAQQ%E7%A9%BA%E9%97%B4&pt_qr_link=http%3A//z.qzone.com/download.html&self_regurl=https%3A//qzs.qq.com/qzone/v6/reg/index.html&pt_qr_help_link=http%3A//z.qzone.com/download.html&pt_no_auth=0')
button = driver.find_element_by_class_name('face')
button.click()
time.sleep(3)
listcookies = driver.get_cookies()
# json.loads() turns a JSON string into Python data;
# json.dumps() turns Python data into a JSON string.
# print(type(listcookies))
# print('-'*80)
jsonCookie = json.dumps(listcookies)
# print(type(jsonCookie))
# To save the raw (unparsed) cookies:
# with open('qqzone.json', 'w') as file_obj:
#     file_obj.write(jsonCookie)
# Flatten the cookie dicts into "name=value" pairs for a Cookie header.
cookie = [item['name'] + '=' + item['value'] for item in listcookies]
cookie_str = '; '.join(cookie)
headers = {
    'cookie': cookie_str,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_3_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Send the request. The timeout keeps the script from hanging forever when
# the server never answers.
url = 'https://user.qzone.qq.com/1550023517'
html = requests.get(url, headers=headers, timeout=10)
time.sleep(1)
with open('qqzone.html', 'w', encoding='utf-8') as file:
    file.write(html.text)
# Implicit-wait call pattern: configure the driver's implicit wait, passing 10.
driver.implicitly_wait(10)
代码演示:
from selenium import webdriver
import time
# Demo: use an implicit wait instead of a fixed sleep before locating an element.
driver = webdriver.Chrome()
driver.get('https://www.baidu.com/')
# time.sleep(2) # always waits the full n seconds
# Implicit wait.
driver.implicitly_wait(10) # if the element is found right away there is no need to wait the full n seconds
driver.find_element_by_id('kw').send_keys('加油')
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Demo: explicit wait — block (timeout argument 10) until an element with the
# given id is present, and keep the value returned by until().
driver = webdriver.Chrome()
driver.get("https://www.baidu.com/")
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "myDynamicElement"))
)
# Explicit wait (waits for a specific condition, in contrast to the implicit wait).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc')
driver.implicitly_wait(5)
# Click the element that closes the default warning dialog.
driver.find_element_by_id('gb_closeDefaultWarningWindowDialog_id').click()
# Explicit waits.
# Wait until the departure input's value contains '北京'.
WebDriverWait(driver,100).until(
EC.text_to_be_present_in_element_value((By.ID,'fromStationText'),'北京'))
# Wait until the destination input's value contains '长沙'.
WebDriverWait(driver, 100).until(EC.text_to_be_present_in_element_value((By.ID, 'toStationText'), '长沙'))
presence_of_element_located:某个元素已经加载完毕了。
presence_of_all_elements_located:网页中所有满足条件的元素都加载完毕了。
element_to_be_clickable:某个元素可以点击了。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Demo: open a second window with JavaScript and switch the driver to it.
driver = webdriver.Chrome()
driver.get('https://www.baidu.com/')
# driver.get('https://www.douban.com/')
driver.execute_script('window.open("https://www.douban.com/")')
# driver.close() # douban was only opened; the driver is still operating on the Baidu page and has done nothing on douban
# driver.quit() # closes everything
# driver.find_element_by_id('kw').send_keys('王佳欣')
print(driver.current_url)
# Switch windows.
# driver.switch_to_window(driver.window_handles[1])
driver.switch_to.window(driver.window_handles[1])
time.sleep(2)
print(driver.current_url)
目标:
1、练习selenium所学知识点
2、如何通过selenium来爬取ajax数据
思路:
1、登陆
总结:
1、面向对象来实现,先搭建好整体的框架
2、驱动不要写在类里面,因为垃圾回收机制,类执行完毕就关闭了
3、如何证明登陆成功,显示等待,加载登陆成功的URL证明登陆成功
2、余票查询
弹框,这弹框比较特殊,不能点击确定按钮,需要定位gb_closeDefaultWarningWindowDialog_id按钮
设置出发地、目的地、出发日期
1、定位
2、找到站点代号
3、通过execute_script()去实现代码的设置
execute_script('arguments[0].value="%s"' % from_station_code, from_station_input)
4、点击确定查询按钮(做了设置)
search_btn = driver.find_element_by_id('query_ticket')
driver.execute_script('arguments[0].click();', search_btn)
3、解析车次列表数据
一、分析页面,得出结论。
这些数据都放在tbody里面的tr标签中;进一步观察发现,每个车次对应的tr标签都不含datatran属性
由于是ajax加载的数据,我们做出显示等待,如果页面中的tr标签加载出来,就证明车次列表数据加载出来
WebDriverWait(driver,1000).until( EC.presence_of_element_located((By.XPATH,'//tbody[@id="queryLeftTable"]/tr'))
)
获取tr标签(车次列表数据)
train_trs = driver.find_elements_by_xpath('//tbody[@id="queryLeftTable"]/tr[not(@datatran)]')
for train_tr in train_trs:
# print(train_tr.text)# 返回selenium对象
二、解析数据
替换和拼接的方式把车次信息的数据放在一个列表中
infos = train_tr.text.replace('\n', ' ').split(' ')
在init方法中初始化了一个用户车次以及席位
先获得车次,做出一个判断,判断这个车次是否在数据里面
进行了想要购买车次以及席位的数据解析
number = infos[0] # 车次
if number in self.trains:
seat_types = self.trains[number]
for seat_type in seat_types:
if seat_type == 'O':
count = infos[9] # 二等座
if count.isdigit() or count == '有':
is_searched = True
elif seat_type == 'M':
count = infos[8] # 一等座
if count.isdigit() or count == '有':
is_searched = True
# 点击预定按钮
if is_searched:
self.select_number = number
order_btn = train_tr.find_element_by_xpath('.//a[@class="btn72"]')
order_btn.click()
break
4、确认乘客和席位
首先确认页面是乘客席位的页面
显示等待
WebDriverWait(driver, 100).until(
EC.url_contains(self.confirm_passengers_url)
)
确认乘客信息
不要点击 框框,直接找人名,人名可能是多个elements
确认席位
席位是select标签, 常规的select标签操作
点击提交按钮
5、核对信息
点击确认按钮
这个按钮有时无法一次点击成功,我们的解决办法是用一个循环连续不断地点击,直到订单提交成功(点击或查找按钮抛出异常)为止
代码实现:
import csv

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
# Module-level Chrome WebDriver; the TrainSpider methods below use this global
# `driver` rather than creating their own.
driver = webdriver.Chrome()
class TrainSpider(object):
    """Drive a browser session through the 12306 ticket-booking flow.

    Steps: open the login page and wait for the personal-centre URL, query the
    remaining tickets, click "book" on the first wanted train that still has a
    wanted seat, then confirm passengers and seat type and submit the order.

    All methods operate on the module-level ``driver`` WebDriver instance.
    """

    # Page URLs used to navigate and to detect which step the browser is on.
    login_url = 'https://kyfw.12306.cn/otn/resources/login.html'  # login page
    personal_url = 'https://kyfw.12306.cn/otn/view/index.html'  # personal centre
    left_ticket_url = 'https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc'  # remaining tickets
    confirm_passengers_url = 'https://kyfw.12306.cn/otn/confirmPassenger/initDc'  # passenger confirmation

    # Position of each seat code's ticket count in a result row's split text:
    # 'M' is first class, 'O' is second class.
    seat_columns = {'M': 8, 'O': 9}

    def __init__(self, from_station, to_station, train_data, trains, passengers):
        """Store the booking request and load the station-code table.

        Args:
            from_station: Departure station name (a ``name`` in stations.csv).
            to_station: Destination station name (a ``name`` in stations.csv).
            train_data: Departure date string written into the date input.
            trains: Mapping of train number to wanted seat codes,
                e.g. ``{'G485': ['M', 'O']}``.
            passengers: Names of the passengers to select on the
                confirmation page.
        """
        self.from_station = from_station  # departure station
        self.to_station = to_station  # destination station
        self.train_data = train_data  # departure date
        self.trains = trains  # wanted trains and seat codes
        self.passengers = passengers  # passenger names to select
        # Train number picked by search_left_ticket(); None until one matches.
        self.select_number = None
        self.station_codes = {}
        self.init_station_code()

    def init_station_code(self):
        """Fill ``station_codes`` (name -> code) from ``stations.csv``."""
        with open('stations.csv', 'r', encoding='utf-8') as file_obj:
            for line in csv.DictReader(file_obj):
                self.station_codes[line['name']] = line['code']

    def login(self):
        """Open the login page and wait until the personal-centre URL loads.

        The method itself enters no credentials; it only waits for the URL to
        contain ``personal_url``.
        """
        driver.get(self.login_url)
        # Explicit wait: reaching the personal-centre URL proves the login.
        WebDriverWait(driver, 100).until(
            EC.url_contains(self.personal_url)
        )
        print('登陆成功!')

    def _set_input_value(self, element_id, value):
        """Set the value of the input with *element_id* through JavaScript."""
        target = driver.find_element(By.ID, element_id)
        # The value is passed as a script argument instead of being formatted
        # into the script text, so quotes in it cannot break the JavaScript.
        driver.execute_script('arguments[0].value = arguments[1];', target, value)

    def _seat_available(self, infos, seat_type):
        """Return True if the split row text shows tickets for *seat_type*."""
        column = self.seat_columns.get(seat_type)
        if column is None or column >= len(infos):
            return False
        count = infos[column]
        return count.isdigit() or count == '有'

    def search_left_ticket(self):
        """Query remaining tickets and click "book" on the first match.

        Returns:
            True if a wanted train with an available wanted seat was found and
            its booking link clicked (``select_number`` is then set),
            otherwise False.

        Raises:
            KeyError: If a station name is missing from ``station_codes``.
        """
        driver.get(self.left_ticket_url)
        driver.implicitly_wait(5)
        # Dismiss the warning dialog through its close (X) button.
        driver.find_element(By.ID, 'gb_closeDefaultWarningWindowDialog_id').click()
        # The station inputs are hidden, so their values are set via script.
        self._set_input_value('fromStation', self.station_codes[self.from_station])
        self._set_input_value('toStation', self.station_codes[self.to_station])
        self._set_input_value('train_date', self.train_data)
        driver.implicitly_wait(3)
        # Trigger the query button through script as well.
        search_btn = driver.find_element(By.ID, 'query_ticket')
        driver.execute_script('arguments[0].click();', search_btn)
        # The rows are loaded asynchronously; wait for the first one.
        WebDriverWait(driver, 1000).until(
            EC.presence_of_element_located((By.XPATH, '//tbody[@id="queryLeftTable"]/tr'))
        )
        # Keep only the rows without a datatran attribute.
        train_trs = driver.find_elements(
            By.XPATH, '//tbody[@id="queryLeftTable"]/tr[not(@datatran)]'
        )
        for train_tr in train_trs:
            infos = train_tr.text.replace('\n', ' ').split(' ')
            number = infos[0]  # train number
            if number not in self.trains:
                continue
            wanted_seats = self.trains[number]
            if not any(self._seat_available(infos, seat) for seat in wanted_seats):
                continue
            # Remember the train and click its booking link.
            self.select_number = number
            train_tr.find_element(By.XPATH, './/a[@class="btn72"]').click()
            return True
        return False

    def confirm_passengers(self):
        """Select passengers and seat type, then submit and confirm the order.

        Expects ``select_number`` to have been set by ``search_left_ticket``.
        """
        WebDriverWait(driver, 100).until(
            EC.url_contains(self.confirm_passengers_url)
        )
        # Click the name labels directly rather than the checkboxes.
        passenger_labels = driver.find_elements(
            By.XPATH, '//ul[@id="normal_passenger_id"]/li/label'
        )
        for passenger_label in passenger_labels:
            if passenger_label.text in self.passengers:
                passenger_label.click()
        # Pick the first wanted seat code that exists in the select element.
        seat_select = Select(driver.find_element(By.ID, 'seatType_1'))
        for seat_type in self.trains[self.select_number]:
            try:
                seat_select.select_by_value(seat_type)
            except NoSuchElementException:
                continue
            else:
                break
        # Submit the order and wait for the confirmation window.
        driver.find_element(By.ID, 'submitOrder_id').click()
        WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'dhtmlx_window_active'))
        )
        # Keep clicking the confirm button until clicking or re-locating it
        # fails. Only WebDriver errors end the loop, so Ctrl-C still works.
        btn = driver.find_element(By.ID, 'qr_submit_id')
        while True:
            try:
                btn.click()
                btn = driver.find_element(By.ID, 'qr_submit_id')
            except WebDriverException:
                break

    def run(self):
        """Run the whole flow: login, ticket search, passenger confirmation."""
        # 1. Login.
        self.login()
        # 2. Remaining-ticket query; stop when nothing bookable was found.
        if not self.search_left_ticket():
            print('没有找到符合条件的车次或席位')
            return
        # 3. Confirm passengers and seat.
        self.confirm_passengers()
def main():
    """Build a TrainSpider for the sample trip and run the booking flow."""
    trains = {'G485': ['M', 'O']}
    # TrainSpider.__init__ requires the passenger names as its fifth argument;
    # leaving it out raises TypeError. Placeholder name: replace it with the
    # real passenger names shown on the account's passenger list.
    passengers = ['张三']
    spider = TrainSpider('北京', '长沙', '2021-03-25', trains, passengers)
    spider.run()


if __name__ == '__main__':
    main()