pip3 install selenium
(Note: the command above assumes the default python3 environment on macOS.) Besides Selenium, Python offers a number of other browser-emulation libraries, such as Splash, PyV8 and Ghost.
1 ) Simulate Chrome visiting the Baidu homepage and searching for the keyword "python"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Initialize a browser (Chrome here; requires the ChromeDriver executable)
driver = webdriver.Chrome()
#driver = webdriver.PhantomJS() # headless browser
try:
    # Request the page
    driver.get("https://www.baidu.com")
    # Find the node with id "kw" (the search input box)
    input = driver.find_element_by_id("kw")
    # Simulate typing a string
    input.send_keys("python")
    # Simulate pressing the Enter key
    input.send_keys(Keys.ENTER)
    # Explicit wait, at most 10 seconds
    wait = WebDriverWait(driver, 10)
    # Wait condition: a node with id "content_left" must appear within 10 seconds, otherwise an exception is raised
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    # Print response information
    print(driver.current_url)
    print(driver.get_cookies())
    print(driver.page_source)
finally:
    # Close the browser
    #driver.close()
    pass
2 ) Declaring a browser object
from selenium import webdriver
driver = webdriver.Chrome()    # Chrome; requires the ChromeDriver executable
driver = webdriver.Firefox()   # Firefox; requires the GeckoDriver executable
driver = webdriver.Edge()
driver = webdriver.Safari()
driver = webdriver.PhantomJS() # headless browser
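Note: the PhantomJS project is no longer maintained and newer Selenium releases have dropped support for it; headless Chrome (also used in the crawling task later in this section) is the usual replacement. A minimal sketch:
from selenium import webdriver

# Run Chrome without a visible window; still requires ChromeDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)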
3 ) Visiting a page
from selenium import webdriver
driver = webdriver.Chrome()
#driver = webdriver.PhantomJS()
driver.get("http://www.taobao.com")
print(driver.page_source)
#driver.close()
4 ) Finding nodes
Methods for getting a single node:
find_element_by_id()
find_element_by_name()
find_element_by_xpath()
find_element_by_link_text()
find_element_by_partial_link_text()
find_element_by_tag_name()
find_element_by_class_name()
find_element_by_css_selector()
from selenium import webdriver
from selenium.webdriver.common.by import By
# Create the browser object
driver = webdriver.Chrome()
#driver = webdriver.PhantomJS()
driver.get("http://www.taobao.com")
# Each of the following gets the node whose id attribute is "q"
input = driver.find_element_by_id("q")
print(input)
input = driver.find_element_by_css_selector("#q")
print(input)
input = driver.find_element_by_xpath("//*[@id='q']")
print(input)
# Equivalent to the above
input = driver.find_element(By.ID,"q")
print(input)
#driver.close()
Methods for getting multiple nodes (a short sketch follows the list):
find_elements_by_id()
find_elements_by_name()
find_elements_by_xpath()
find_elements_by_link_text()
find_elements_by_partial_link_text()
find_elements_by_tag_name()
find_elements_by_class_name()
find_elements_by_css_selector()
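The find_elements_* variants return a list rather than a single node. A minimal sketch, here collecting all li nodes on the Taobao homepage (any other selector works the same way):
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://www.taobao.com")
# find_elements_* returns a list of matching nodes (empty if nothing matches)
items = driver.find_elements_by_tag_name("li")
print(len(items))
for item in items:
    print(item)
#driver.close()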
5 ) Node interaction
from selenium import webdriver
import time
# Create the browser object
driver = webdriver.Chrome()
#driver = webdriver.PhantomJS()
driver.get("http://www.taobao.com")
# Get the node whose id attribute is "q" (the search box)
input = driver.find_element_by_id("q")
# Simulate typing "iphone"
input.send_keys('iphone')
time.sleep(3)
# Clear the input box
input.clear()
# Simulate typing "iPad"
input.send_keys('iPad')
# Get the search button node
button = driver.find_element_by_class_name("btn-search")
# Trigger a click
button.click()
#driver.close()
6 ) Action chains
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
driver.get(url)
# Switch into the frame
driver.switch_to.frame('iframeResult')
# Get the two div nodes
source = driver.find_element_by_css_selector("#draggable")
target = driver.find_element_by_css_selector("#droppable")
# Create an action chain
actions = ActionChains(driver)
# Queue a drag-and-drop from source to target
actions.drag_and_drop(source, target)
time.sleep(3)
# Execute all queued actions (triggered in order)
actions.perform()
#driver.close()
7 ) Executing JavaScript
from selenium import webdriver
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
driver.get("https://www.zhihu.com/explore")
# Run JavaScript to scroll the page to the bottom
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# Run JavaScript to pop up an alert box
driver.execute_script('window.alert("Hello Selenium!")')
#driver.close()
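execute_script() can also hand a value back to Python when the JavaScript uses a return statement, e.g.:
title = driver.execute_script('return document.title')
print(title)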
8 ) Getting node information
from selenium import webdriver
# from selenium.webdriver import ActionChains
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
driver.get("https://www.zhihu.com/explore")
# Get the node with id "zh-top-link-logo" (the logo)
logo = driver.find_element_by_id("zh-top-link-logo")
print(logo)                        # the node object itself
print(logo.get_attribute('class')) # the node's class attribute
# Get the node with id "zu-top-add-question" (the "ask a question" button)
input = driver.find_element_by_id("zu-top-add-question")
print(input.text)     # the node's text content
print(input.id)       # WebDriver's internal element id (not the HTML id attribute)
print(input.location) # the node's position within the page
print(input.tag_name) # the node's tag name
print(input.size)     # the node's size
#driver.close()
9 ) Switching frames
Pages can embed child frames (iframes), and Selenium cannot see inside a frame until you switch context with switch_to.frame(); the action-chains example above uses this.
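A minimal sketch reusing the runoob demo page from the action-chains example (the frame name 'iframeResult' comes from that page):
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# Enter the child frame by its name/id; lookups now run inside the frame
driver.switch_to.frame('iframeResult')
print(driver.find_element_by_css_selector('#draggable').text)
# Go back up one level (switch_to.default_content() jumps to the top-level page)
driver.switch_to.parent_frame()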
10 ) Waits
A browser needs time to load a page, and Selenium is no exception: to get the complete page content you often have to wait for it. Selenium supports two kinds of waits: implicit waits and explicit waits (the recommended kind).
Implicit:
from selenium import webdriver
# Create the browser object
driver = webdriver.Chrome()
# Use an implicit wait: every element lookup will wait up to 2 seconds before failing
driver.implicitly_wait(2)
# Load the target URL
driver.get("https://www.zhihu.com/explore")
# Get the node
input = driver.find_element_by_id("Popover1-toggle")
print(input) # the node object
#driver.close()
Explicit:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
driver.get("https://www.zhihu.com/explore")
# Explicit wait, at most 10 seconds
wait = WebDriverWait(driver, 10)
# Wait condition: a node with id "Popover1-toggle" must appear within 10 seconds, otherwise an exception is raised
input = wait.until(EC.presence_of_element_located((By.ID, 'Popover1-toggle')))
print(input) # the node object
#driver.close()
11 ) Forward and back
from selenium import webdriver
import time
# Create the browser object
driver = webdriver.Chrome()
# Load several URLs in sequence
driver.get("https://www.baidu.com")
driver.get("https://www.taobao.com")
driver.get("https://www.jd.com")
time.sleep(2)
driver.back()    # go back
time.sleep(2)
driver.forward() # go forward
#driver.close()
12 ) Cookies
from selenium import webdriver
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
driver.get("https://www.zhihu.com/explore")
print(driver.get_cookies())
driver.add_cookie({'name': 'Joh', 'domain': 'www.zhihu.com', 'value': 'zhangsan'})
print(driver.get_cookies())
driver.delete_all_cookies()
print(driver.get_cookies())
#driver.close()
13 ) Tab management
from selenium import webdriver
import time
# Create the browser object
driver = webdriver.Chrome()
# Load the target URL
driver.get("https://www.baidu.com")
# Open a new tab via JavaScript
driver.execute_script('window.open()')
print(driver.window_handles)
# Switch to the second tab and load a URL there
driver.switch_to.window(driver.window_handles[1])
driver.get("https://www.taobao.com")
time.sleep(2)
# Switch back to the first tab and load another URL
driver.switch_to.window(driver.window_handles[0])
driver.get("https://www.jd.com")
#driver.close()
14 ) Exception handling
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# Create the browser object
driver = webdriver.Chrome()
try:
    # Load the target URL
    driver.get("https://www.baidu.com")
except TimeoutException:
    print('Time Out')
try:
    # Look up a node that does not exist
    driver.find_element_by_id("demo")
except NoSuchElementException:
    print('No Element')
finally:
    #driver.close()
    pass
1 ) Task: crawl product data from Taobao by keyword
Note: the code below will probably not run as-is, because Taobao now detects and blocks Selenium-driven browsers; workarounds can be found online.
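One workaround commonly suggested online is to hide Chrome's automation fingerprints (for example the navigator.webdriver flag). A hedged sketch of that approach, with no guarantee against Taobao's current defenses:
from selenium import webdriver

options = webdriver.ChromeOptions()
# Drop the "enable-automation" switch and the automation extension
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
# Overwrite navigator.webdriver before any page script runs (Chrome DevTools Protocol)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
})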
'''Crawl product data from Taobao by keyword'''
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
from urllib.parse import quote

KEYWORD = "mac"
MAX_PAGE = 10

# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS()
# Create a Chrome browser object running in headless mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
# Explicit wait, at most 10 seconds
wait = WebDriverWait(browser, 10)

def index_page(page):
    '''Crawl one index page. :param page: the page number'''
    print('Crawling page', page)
    try:
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        browser.get(url)
        if page > 1:
            # For pages after the first, type the page number into the pager box and submit
            input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
            input.clear()
            input.send_keys(page)
            submit.click()
        # Wait conditions: the current page number is highlighted and the product list is present
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
        get_products()
    except TimeoutException:
        # Retry the same page on timeout
        index_page(page)

def get_products():
    '''Extract product data from the current page'''
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_data(product)

def save_data(result):
    '''Save one record (left as a stub; see the sketch below)'''
    pass

def main():
    '''Walk through every page'''
    for i in range(1, MAX_PAGE + 1):
        index_page(i)
    browser.close()

# Entry point
if __name__ == '__main__':
    main()
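save_data() is left as a stub above; a minimal sketch that writes each product dict to MongoDB (assuming a local mongod on the default port, and reusing the 'taobao'/'products' names from the Scrapy version below) might look like:
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['taobao']

def save_data(result):
    '''Insert one product record into the products collection'''
    db['products'].insert_one(result)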
1 ) Task: re-implement the Taobao keyword crawl above as a Scrapy project, with Selenium doing the page rendering through a downloader middleware
2 ) Creating the project
Note: you must again deal with Taobao's blocking of Selenium (see the note and sketch in the previous task; solutions are easy to find online) and add the relevant configuration to the code.
scrapy startproject scrapyselenium
cd scrapyselenium
scrapy genspider taobao www.taobao.com
Then, in settings.py, disable robots.txt compliance:
ROBOTSTXT_OBEY = False
3 ) Defining the Item class
# Item class for the product fields (image, price, deal count, title, shop, ship-from location)
from scrapy import Item, Field

class ProductItem(Item):
    collection = 'products'
    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
4 ) Parsing the page
At the end of settings.py, define the search keywords and the maximum number of pages to crawl:
KEYWORDS = ['mac']
MAX_PAGE = 100
Then edit spiders/taobao.py:
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
from urllib.parse import quote
from scrapyselenium.items import ProductItem

class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        # One request per (keyword, page); the page number travels in request.meta
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)

    def parse(self, response):
        pass
5 ) Hooking in Selenium
Selenium is wired in by defining a custom DownloaderMiddleware.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger

class SeleniumMiddleware():
    def __init__(self, timeout=None, service_args=[]):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Render the page with PhantomJS
        :param request: the Request object
        :param spider: the Spider object
        :return: HtmlResponse
        """
        self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # For pages after the first, type the page number into the pager box and submit
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)
                submit.click()
            # Wait until the current page number is highlighted and the product list is present
            self.wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            # Hand the rendered page back to Scrapy as a normal response
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
Register the custom middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapyselenium.middlewares.SeleniumMiddleware': 543,
}
6 ) Parsing page information
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
from urllib.parse import quote
from scrapyselenium.items import ProductItem

class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)

    def parse(self, response):
        # Each product card lives under the #mainsrp-itemlist container
        products = response.xpath(
            '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
        for product in products:
            item = ProductItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
7 ) Storing the results
Add the remaining settings to settings.py:
ITEM_PIPELINES = {
    'scrapyselenium.pipelines.MongoPipeline': 300,
}
KEYWORDS = ['mac']
MAX_PAGE = 100
SELENIUM_TIMEOUT = 20
PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'
Then implement the pipeline in pipelines.py:
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # item.collection was defined on ProductItem ('products')
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()