网络爬虫requests+selenium总结

github项目:https://github.com/lei940324/toy/blob/master/笔记/网络爬虫.md

网络爬虫

selenium

常用命令总结

导入所需要的模块

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

构建带有参数的谷歌浏览器

opt = Options()
opt.add_argument('--no-sandbox')          #解决DevToolsActivePort文件不存在的报错
opt._arguments = ['disable-infobars']     #去掉谷歌浏览器正在被自动测试控制字样
opt.add_argument('window-size=1920x3000') #指定浏览器分辨率
opt.add_argument('--disable-gpu')         #谷歌文档提到需要加上这个属性来规避bug
opt.add_argument('--hide-scrollbars')     #隐藏滚动条, 应对一些特殊页面
# 不加载图片, 提升速度
opt.add_argument('blink-settings=imagesEnabled=false') 
# 加载用户信息的谷歌浏览器
opt.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data") 
opt.add_argument('--headless')            #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
# 添加UA
opt.add_argument('user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
opt.add_argument('--no-sandbox')          #以最高权限运行
opt.add_argument("--disable-javascript")  #禁用JavaScript
# 设置开发者模式启动,该模式下webdriver属性为正常值
opt.add_experimental_option('excludeSwitches', ['enable-automation']) 
# 禁用浏览器弹窗
prefs = { 'profile.default_content_setting_values' :  { 'notifications' : 2 }}  
opt.add_experimental_option('prefs',prefs)
opt.add_argument('--disable-plugins')     #禁止插件
# 禁用弹出拦截
opt.add_argument('--disable-popup-blocking')     
# 调用带参数的谷歌浏览器  
driver = webdriver.Chrome(chrome_options=opt)  

设置窗口大小

driver.maximize_window()           #窗口最大化
driver.set_window_size(480, 800)   #设置窗口大小

调节最大等待时间

wait = WebDriverWait(driver, 15)    

网络爬虫requests+selenium总结_第1张图片

网络爬虫requests+selenium总结_第2张图片

加载指定网址

driver.get(url)

位置定位及交互

#查找元素方法
find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
#实例
wait.until(EC.presence_of_element_located((By.XPATH,'')))  #以Xpath定位第一个满足元素,等待最大响应时间 
driver.find_element_by_xpath('').click()                   #以Xpath定位第一个满足元素,并进行点击操作
driver.find_elements_by_id("")							   #以属性ID定位所有满足的元素
driver.find_element_by_id('jgwab').text               	   #获取控件id为’jgwab’所显示的内容
driver.find_element_by_id('su').get_attribute('type')      #获取控件id为’su’的属性,例如button,radio等
driver.find_element_by_id('kw').size                       #获得控件的尺寸
# 以属性ID定位第一个满足元素,等待最大响应时间
input = wait.until(EC.presence_of_element_located((By.ID, "")))
input.send_keys('')      #输入文本信息
input.clear()			 #清空

如果以上方法都定位不到控件,说明很可能在iframe里,下面为切换到iframe的方法

driver.switch_to.frame(1)               #转入网页内iframe(内嵌的网页元素)
driver.switch_to.parent_frame()         #切回上一层frame
driver.switch_to_default_content()      #返回到主页面

切换网页窗口

driver.execute_script('window.open()')  # 开启一个选项卡
windows=driver.window_handles           # 获得当前浏览器所有窗口
driver.switch_to.window(windows[0])     # 切换到最左侧窗口
driver.switch_to.window(windows[-1])    # 切换到最新打开窗口(注:也就是最右侧窗口)

滚动条操作

# 将滚动条从 0px的地方移动到5000px的地方
driver.execute_script("window.scrollBy(0,5000)")  
# 滚动到页面底部
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
# 滚动到页面顶部
driver.execute_script("window.scrollTo(document.body.scrollHeight,0)")
# 将滚动条向上移动500px单位
driver.execute_script("window.scrollTo(document.body.scrollHeight,500)")

浏览器操作

driver.back()         #返回界面
driver.forward()	  #前进界面
driver.refresh()      #刷新界面

获取网页信息

driver.current_url            # 获取当前网址
driver.page_source            # 获取源代码
driver.title                  # 获取当前页面标题内容
driver.delete_all_cookies()   # 删除所有cookie
driver.add_cookie({'name':5}) # 添加cookie
  • 获取cookies
driver.get_cookies()  
cookie_list = []
for dict in cookies:
    cookie = dict['name'] + '=' + dict['value']
    cookie_list.append(cookie)
cookie = ';'.join(cookie_list)
  • 获取属性
logo = browser.find_element_by_id ('zh-top-link-logo') 
print(logo.get_attribute('class'))
  • 获取文本值
logo.text
  • 获取 id 、位置、 标签名和大小
logo.id
logo.location
logo.tag_name
logo.size

窗口截图

driver.get_screenshot_as_file("scr.jpg")

提交表单

submit()方法用于提交表单,这里特别用于没提交按钮的情况

例如搜索框输入关键字之后的“回车”操作,那么就可以通过submit()来提交搜索框的内容。

driver.find_element_by_id('query').submit()

实现某个节点的拖曳操作,将某个节点从一处拖曳到另外一处

from selenium.webdriver import ActionChains
source = driver.find_element_by_xpath('')
target = driver.find_element_by_xpath('')
actions = ActionChains(driver)
actions.drag_and_drop(source, target)
actions. perform()

取消选择已经选择的元素

select = Select(driver.find_element_by_id('id'))
select.deselect_all()

对话框操作

alert = driver.switch_to_alert()    #访问对话框
alert.accept()                      #点击alert对话框的确定按钮
alert.dismiss()                     #点击alert对话框的取消按钮

层级定位

driver.find_element_by_id('ShippingMethod').find_element_by_xpath("//option[@value='7.45']").click()

元素是否可见

返回元素的结果是否可见,返回结果为True 或False

result=driver.find_element_by_id("kw").is_displayed()

模拟鼠标操作

from selenium.webdriver.common.action_chains import ActionChains
# 鼠标右击,perform() 是执行方法的语句
ActionChains(driver).context_click(driver.find_element_by_xpath("//*[text()='最新热剧']")).perform()
# 鼠标双击
ActionChains(driver).double_click(driver.find_element_by_id('kw')).perform()
# 停留在文本搜索框上
ActionChains(driver).move_to_element(driver.find_element_by_id('kw')).perform()
# 点击并且停留在搜索按钮上
ActionChains(driver).click_and_hold(driver.find_element_by_id('su')).perform()

模拟键盘操作

from selenium.webdriver.common.keys import Keys
driver.find_element_by_id('kw').send_keys(Keys.BACK_SPACE)   #Backspace键
driver.find_element_by_id('kw').send_keys(Keys.SPACE)        #Space键
driver.find_element_by_id('kw').send_keys(Keys.DELETE)       #Delete键
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'a')  # CTRL + A
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'x')  # CTRL + X
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'v')  # CTRL + V
driver.find_element_by_id('kw').send_keys(Keys.ENTER)        #ENTER键
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'c')  # CTRL + C
driver.find_element_by_id('kw').send_keys(Keys.TAB)          # TAB键

关闭浏览器

driver.close()  #关闭当前窗口
driver.quit()   #退出浏览器

官方文档与驱动下载

  • 更多内容可以参考:selenium中文官方文档

  • 若驱动与Chrome版本不兼容,请更新chromedriver版本:chromedriver驱动下载

selenium模板

selenium_chrome.py

# -*- coding: utf-8 -*-
"""
Created on Mon Mar  2 20:19:17 2020

@author: denglei
"""

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


def chrome(show=True,picture=False):
    opt = Options()
    opt.add_argument('--no-sandbox')
    opt._arguments = ['disable-infobars']
    opt.add_argument('window-size=1920x3000') #指定浏览器分辨率
    opt.add_argument('--disable-gpu') 
    opt.add_argument('--hide-scrollbars')     #隐藏滚动条, 应对一些特殊页面
    if not picture :opt.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
    ##加载用户信息的谷歌浏览器
    opt.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data") 
    if not show : opt.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    # 以最高权限运行
    opt.add_argument('--no-sandbox')
    # 禁用浏览器弹窗
# =============================================================================
#     prefs = {  
#     'profile.default_content_setting_values' :  {  
#         'notifications' : 2  
#          }  
#     }  
#     opt.add_experimental_option('prefs',prefs)
# =============================================================================

    # 禁用弹出拦截
    #opt.add_argument('--disable-popup-blocking')
    # 禁用JavaScript
    #opt.add_argument("--disable-javascript") 
    #修改UA
    #opt.add_argument('user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
    driver = webdriver.Chrome(chrome_options=opt)  # 调用带参数的谷歌浏览器  
    driver.maximize_window() #窗口最大化
    wait = WebDriverWait(driver, 15)
    return driver,wait

if __name__ == '__main__':    
    url = 'https://daohang.qq.com/?fr=hmpage'
    driver,wait = chrome()
    driver.get(url)

编写selenium模板

# -*- coding: utf-8 -*-
"""
Created on Sat May 25 15:38:06 2019

@author: denglei
"""

import time, re
import requests
import numpy as np
import pandas as pd
from selenium.webdriver.common.by import By
from selenium_chrome import chrome
from selenium.webdriver.support import expected_conditions as EC


class Spyder():
    # file_name为保存文件名
    def __init__(self, file_name='数据结果.xlsx'):
        self.file_name = file_name
        self.driver, self.wait = chrome()
        self.run()

    def run(self):
        self.defined_var()     # 第一步:定义待爬取数据
        self.generate_urls()   # 第二步:生成待爬取网址
        self.main()            # 第三步:爬取主要过程
        self.driver.close()    # 第四步:关闭谷歌浏览器
        self.clean_data()      # 第五步:清洗数据
        self.save_data()       # 第六步:保存数据
        
    def defined_var(self):  # 定义待爬取数据
        self.value = [] # 原始数据
        self.val = []   # 清洗后数据
        
    def generate_urls(self):
        pass
    
    def main(self):  
        pass
        
    def clean_data(self):   # 清洗数据
 		pass
    
    def save_data(self):
        df = pd.DataFrame(self.val, columns=['爬取结果'])
        df.to_excel(self.file_name, index = False)
# =============================================================================
# with open(self.file_name,'w',encoding='utf-8') as f:
#         for i in val:
#             f.write('{}\n'.format(i))
# =============================================================================

if __name__ == '__main__':    
    example = Spyder()

selenium实例

模拟登陆微博获取cookie

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def loginWeibo(username, password,url='https://passport.weibo.cn/signin/login',show=False,find_cookie=True):
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错
    chrome_options._arguments = ['disable-infobars']#去掉谷歌浏览器正在被自动测试控制字样
    chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
    chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
    chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
    chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
    ##加载用户信息的谷歌浏览器
    chrome_options.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data") 
    if not show : chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    driver = webdriver.Chrome(chrome_options=chrome_options)  # 调用带参数的谷歌浏览器  
    driver.maximize_window() #窗口最大化
    driver.get(url)
    print('正在加载界面....')
    WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "loginAction")))
    time.sleep(1)
    driver.find_element_by_id("loginName").send_keys(username)
    driver.find_element_by_id("loginPassword").send_keys(password)
    driver.find_element_by_id("loginAction").click()
    print('已登录,正在获取cookie....')

    if find_cookie:    
        WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, "viewport")))
        time.sleep(3)
        cookies = driver.get_cookies()
        cookie_list = []
        for dict in cookies:
            cookie = dict['name'] + '=' + dict['value']
            cookie_list.append(cookie)
        cookie = ';'.join(cookie_list)
        print('获取cookie成功\ncookie:%s'%cookie)
    else : cookie=False
    driver.close()
    return cookie

if __name__ == '__main__':    
    cookie = loginWeibo('账号', '密码',show=True)

requests库

基本语法

  • 构建网址
from urllib.parse import urlencode 
import requests

base_url = 'https://www.baidu.com?'
params={'type':'uid',
        'value':'2830678474',
        'containerid':'1076032830678474',
        'page':2}
url=base_url + urlencode(params)

requests模板

单线程

  • GetUrl.py
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  2 22:51:32 2020
实现单线程抓取

@author: denglei
"""

import time
import requests
import logging


class GetUrl():
    def __init__(self, url, num=5, headers={}, level=2):
    ''' 各参数说明:
    url     - 待爬取网站
    num     - 网站响应失败后,最大爬取次数
    headers - 表头信息
    level   - 信息显示等级,0级最低,5级最大
    err_url - 抓取失败的网站汇总
    data    - 爬取数据        '''
        self.url = url
        self.num = num
        self.headers = headers
        self.level = level
        self.err_url = ''
        self.data = ''
        self.run()
        
    def run(self):
        self.Info_level()
        self.data = self.judge()
    
    def Info_level(self):
        levels = [logging.NOTSET , logging.DEBUG, logging.INFO,
                  logging.WARNING, logging.ERROR, logging.CRITICAL]
        logging.basicConfig(level = levels[self.level])

    def response(self):
        try:
            req = requests.get(self.url, headers=self.headers,timeout=30)
            req.raise_for_status()
            req.encoding = 'utf-8'
            logging.debug('Python 返回URUL:%s  数据成功' %self.url)
            return req.text
        except:
            logging.debug('Python 返回URL:%s  数据失败' %self.url)
            return ''
        
    def judge(self):
        n = 1;
        while n <= self.num:
            logging.debug(f'开始第{n}次加载.......')
            data = self.response()
            if data : break             
            n += 1
            time.sleep(2)
        if data == '':
            logging.debug('***************************************')
            logging.debug('爬取次数已达到最大限制,已跳过URL:%s'%self.url)
            logging.debug('***************************************')
            self.err_url = self.url
        return data
    
if __name__ == '__main__':
    cookie = ''
    header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2714.400",
              'Cookie':cookie}  
    url = 'https://www.baidu.com/'  
    
    data = GetUrl(url, headers=header).data

多线程

# -*- coding: utf-8 -*-
"""
Created on Tue Mar  3 01:48:20 2020

@author: Administrator
"""

import numpy as np
import pandas as pd
import threading ,re, os ,time
from queue import Queue
from lxml import etree
import logging
from GetUrl import GetUrl


class ThreadGetData(threading.Thread):
   
	def __init__(self, urlQueue, headers={}):
    # urlQueue - 待爬取网站队列
    # headers  - 表头信息
        super(ThreadGetData, self).__init__()
        self.urlQueue = urlQueue
        self.headers = headers     
    
    def run(self):
        while True:
            try : url = self.urlQueue.get(block=False)   # 队列为空 产生异常
            except : break
            output = GetUrl(url, headers=self.headers)
            if output.err_url == True : err_url.append(output.err_url)
            
            logging.debug('正在解析url:%s'%url)
            selector = etree.HTML(output.data)
            text = selector.xpath('//div[@id="articlelistnew"]/div[contains(@class,"articleh")]')
            for t in text:
                result['value'].append(''.join(map(lambda x: x.strip() , t.xpath('./span[1]//text()'))))
            time.sleep(1)
            
            
def main():   
    # 1.构建爬取队列
    urlQueue = Queue()
    for i in range(1,4):
        url='http://guba.eastmoney.com/list,zssh000001,f_{}.html'.format(i)
        urlQueue.put(url)     
    
    # 2.构建表头信息
    cookie = ''
    header = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/\
537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 \
Core/1.63.6788.400 QQBrowser/10.3.2714.400"
             "Cookie" : cookie}
    
    # 3.创建线程
    threadParse = []
    for i in range(4):
        Cthread = ThreadGetData(urlQueue, header)
        Cthread.start()
        threadParse.append(Cthread)
    logging.info('采集线程开始运行--------')
    
    # 4.结束线程
    for single in threadParse:
        single.join()
    logging.info("所有采集线程退出循环")
    
    # 5.保存数据
    df = pd.DataFrame(result)
    df.to_excel('爬取结果.xlsx', index = False)
    
        
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, 
                        format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info('程序开始运行...')
    start=time.perf_counter()   # 开始计时
    # 定义待爬取数据
    result = {}
    result['value'] = []
    err_url = []
   
    main()
    
    end=time.perf_counter()     # 结束计时
    logging.info('程序共运行%.2f秒'%(end-start))

requests实例

模拟登陆微博获取cookie

  • Requests_loginWeibo.py
#-*- encoding:utf-8 -*-

import base64
import requests
import re
import rsa
import binascii

def Get_cookies(username, password):
    '''登陆新浪微博,获取登陆后的Cookie,返回到变量cookie中'''

    url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&client=ssologin.js(v1.4.4)%'+username
    html = requests.get(url).content.decode('utf-8')

    servertime = re.findall('"servertime":(.*?),',html,re.S)[0]
    nonce = re.findall('"nonce":"(.*?)"',html,re.S)[0]
    pubkey = re.findall('"pubkey":"(.*?)"',html,re.S)[0]
    rsakv = re.findall('"rsakv":"(.*?)"',html,re.S)[0]

    username = base64.b64encode(username.encode()) #加密用户名
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537) #创建公钥
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password) #拼接明文js加密文件中得到
    passwd = rsa.encrypt(message.encode('utf-8'), key) #加密
    passwd = binascii.b2a_hex(passwd) #将加密信息转换为16进制。

    login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.4)'
    data = {'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': username,
        'service': 'miniblog',
        'servertime': servertime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'sp': passwd,
        'encoding': 'UTF-8',
        'prelt': '115',
        'rsakv' : rsakv,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
        }
    html = requests.post(login_url,data=data).content.decode('gbk')
    urlnew = re.findall('location.replace\(\'(.*?)\'',html,re.S)[0]

    #发送get请求并保存cookies
    cookies = requests.get(urlnew).cookies.items()
    
    cookie = ''
    for name, value in cookies:
        cookie += '{0}={1};'.format(name, value)
   
    return cookie

if __name__ == '__main__':    
    cookie = Get_cookies('账号', '密码')
    header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
             'Cookie':cookie}
    url = 'https://s.weibo.com/weibo?q=%E4%B8%AD%E7%BE%8E%E8%B4%B8%E6%98%93%E6%88%98&wvr=6&b=1&reason=&retcode=&Refer=SWeibo_box'
    data = requests.get(url, headers=header).text

你可能感兴趣的:(python网络爬虫篇)