github项目:https://github.com/lei940324/toy/blob/master/笔记/网络爬虫.md
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
opt = Options()
opt.add_argument('--no-sandbox') #解决DevToolsActivePort文件不存在的报错
opt._arguments = ['disable-infobars'] #去掉谷歌浏览器正在被自动测试控制字样
opt.add_argument('window-size=1920x3000') #指定浏览器分辨率
opt.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
opt.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
# 不加载图片, 提升速度
opt.add_argument('blink-settings=imagesEnabled=false')
# 加载用户信息的谷歌浏览器
opt.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data")
opt.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
# 添加UA
opt.add_argument('user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
opt.add_argument('--no-sandbox') #以最高权限运行
opt.add_argument("--disable-javascript") #禁用JavaScript
# 设置开发者模式启动,该模式下webdriver属性为正常值
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# 禁用浏览器弹窗
prefs = { 'profile.default_content_setting_values' : { 'notifications' : 2 }}
opt.add_experimental_option('prefs',prefs)
opt.add_argument('--disable-plugins') #禁止插件
# 禁用弹出拦截
opt.add_argument('--disable-popup-blocking')
# 调用带参数的谷歌浏览器
driver = webdriver.Chrome(chrome_options=opt)
driver.maximize_window() #窗口最大化
driver.set_window_size(480, 800) #设置窗口大小
wait = WebDriverWait(driver, 15)
driver.get(url)
#查找元素方法
find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
#实例
wait.until(EC.presence_of_element_located((By.XPATH,''))) #以Xpath定位第一个满足元素,等待最大响应时间
driver.find_element_by_xpath('').click() #以Xpath定位第一个满足元素,并进行点击操作
driver.find_elements_by_id("") #以属性ID定位所有满足的元素
driver.find_element_by_id('jgwab').text #获取控件id为’jgwab’所显示的内容
driver.find_element_by_id('su').get_attribute('type') #获取控件id为’su’的属性,例如button,radio等
driver.find_element_by_id('kw').size #获得控件的尺寸
# 以属性ID定位第一个满足元素,等待最大响应时间
input = wait.until(EC.presence_of_element_located((By.ID, "")))
input.send_keys('') #输入文本信息
input.clear() #清空
如果以上方法都定位不到控件,说明很可能在iframe里,下面为切换到iframe的方法
driver.switch_to.frame(1) #转入网页内iframe(内嵌的网页元素)
driver.switch_to.parent_frame() #切回上一层frame
driver.switch_to_default_content() #返回到主页面
driver.execute_script('window.open()') # 开启一个选项卡
windows=driver.window_handles # 获得当前浏览器所有窗口
driver.switch_to.window(windows[0]) # 切换到最左侧窗口
driver.switch_to.window(windows[-1]) # 切换到最新打开窗口(注:也就是最右侧窗口)
# 将滚动条从 0px的地方移动到5000px的地方
driver.execute_script("window.scrollBy(0,5000)")
# 滚动到页面底部
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
# 滚动到页面顶部
driver.execute_script("window.scrollTo(document.body.scrollHeight,0)")
# 将滚动条向上移动500px单位
driver.execute_script("window.scrollTo(document.body.scrollHeight,500)")
driver.back() #返回界面
driver.forward() #前进界面
driver.refresh() #刷新界面
driver.current_url # 获取当前网址
driver.page_source # 获取源代码
driver.title # 获取当前页面标题内容
driver.delete_all_cookies() # 删除所有cookie
driver.add_cookie({'name':5}) # 添加cookie
driver.get_cookies()
cookie_list = []
for dict in cookies:
cookie = dict['name'] + '=' + dict['value']
cookie_list.append(cookie)
cookie = ';'.join(cookie_list)
logo = browser.find_element_by_id ('zh-top-link-logo')
print(logo.get_attribute('class'))
logo.text
logo.id
logo.location
logo.tag_name
logo.size
driver.get_screenshot_as_file("scr.jpg")
submit()方法用于提交表单,这里特别用于没提交按钮的情况
例如搜索框输入关键字之后的“回车”操作,那么就可以通过submit()来提交搜索框的内容。
driver.find_element_by_id('query').submit()
from selenium.webdriver import ActionChains
source = driver.find_element_by_xpath('')
target = driver.find_element_by_xpath('')
actions = ActionChains(driver)
actions.drag_and_drop(source, target)
actions. perform()
select = Select(driver.find_element_by_id('id'))
select.deselect_all()
alert = driver.switch_to_alert() #访问对话框
alert.accept() #点击alert对话框的确定按钮
alert.dismiss() #点击alert对话框的取消按钮
driver.find_element_by_id('ShippingMethod').find_element_by_xpath("//option[@value='7.45']").click()
返回元素的结果是否可见,返回结果为True 或False
result=driver.find_element_by_id("kw").is_displayed()
from selenium.webdriver.common.action_chains import ActionChains
# 鼠标右击,perform() 是执行方法的语句
ActionChains(driver).context_click(driver.find_element_by_xpath("//*[text()='最新热剧']")).perform()
# 鼠标双击
ActionChains(driver).double_click(driver.find_element_by_id('kw')).perform()
# 停留在文本搜索框上
ActionChains(driver).move_to_element(driver.find_element_by_id('kw')).perform()
# 点击并且停留在搜索按钮上
ActionChains(driver).click_and_hold(driver.find_element_by_id('su')).perform()
from selenium.webdriver.common.keys import Keys
driver.find_element_by_id('kw').send_keys(Keys.BACK_SPACE) #Backspace键
driver.find_element_by_id('kw').send_keys(Keys.SPACE) #Space键
driver.find_element_by_id('kw').send_keys(Keys.DELETE) #Delete键
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'a') # CTRL + A
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'x') # CTRL + X
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'v') # CTRL + V
driver.find_element_by_id('kw').send_keys(Keys.ENTER) #ENTER键
driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'c') # CTRL + C
driver.find_element_by_id('kw').send_keys(Keys.TAB) # TAB键
driver.close() #关闭当前窗口
driver.quit() #退出浏览器
更多内容可以参考:selenium中文官方文档
若驱动与Chrome版本不兼容,请更新chromedriver版本:chromedriver驱动下载
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 2 20:19:17 2020
@author: denglei
"""
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
def chrome(show=True,picture=False):
opt = Options()
opt.add_argument('--no-sandbox')
opt._arguments = ['disable-infobars']
opt.add_argument('window-size=1920x3000') #指定浏览器分辨率
opt.add_argument('--disable-gpu')
opt.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
if not picture :opt.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
##加载用户信息的谷歌浏览器
opt.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data")
if not show : opt.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
# 以最高权限运行
opt.add_argument('--no-sandbox')
# 禁用浏览器弹窗
# =============================================================================
# prefs = {
# 'profile.default_content_setting_values' : {
# 'notifications' : 2
# }
# }
# opt.add_experimental_option('prefs',prefs)
# =============================================================================
# 禁用弹出拦截
#opt.add_argument('--disable-popup-blocking')
# 禁用JavaScript
#opt.add_argument("--disable-javascript")
#修改UA
#opt.add_argument('user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
driver = webdriver.Chrome(chrome_options=opt) # 调用带参数的谷歌浏览器
driver.maximize_window() #窗口最大化
wait = WebDriverWait(driver, 15)
return driver,wait
if __name__ == '__main__':
url = 'https://daohang.qq.com/?fr=hmpage'
driver,wait = chrome()
driver.get(url)
# -*- coding: utf-8 -*-
"""
Created on Sat May 25 15:38:06 2019
@author: denglei
"""
import time, re
import requests
import numpy as np
import pandas as pd
from selenium.webdriver.common.by import By
from selenium_chrome import chrome
from selenium.webdriver.support import expected_conditions as EC
class Spyder():
# file_name为保存文件名
def __init__(self, file_name='数据结果.xlsx'):
self.file_name = file_name
self.driver, self.wait = chrome()
self.run()
def run(self):
self.defined_var() # 第一步:定义待爬取数据
self.generate_urls() # 第二步:生成待爬取网址
self.main() # 第三步:爬取主要过程
self.driver.close() # 第四步:关闭谷歌浏览器
self.clean_data() # 第五步:清洗数据
self.save_data() # 第六步:保存数据
def defined_var(self): # 定义待爬取数据
self.value = [] # 原始数据
self.val = [] # 清洗后数据
def generate_urls(self):
pass
def main(self):
pass
def clean_data(self): # 清洗数据
pass
def save_data(self):
df = pd.DataFrame(self.val, columns=['爬取结果'])
df.to_excel(self.file_name, index = False)
# =============================================================================
# with open(self.file_name,'w',encoding='utf-8') as f:
# for i in val:
# f.write('{}\n'.format(i))
# =============================================================================
if __name__ == '__main__':
example = Spyder()
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def loginWeibo(username, password,url='https://passport.weibo.cn/signin/login',show=False,find_cookie=True):
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错
chrome_options._arguments = ['disable-infobars']#去掉谷歌浏览器正在被自动测试控制字样
chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
##加载用户信息的谷歌浏览器
chrome_options.add_argument("--user-data-dir="+r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data")
if not show : chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
driver = webdriver.Chrome(chrome_options=chrome_options) # 调用带参数的谷歌浏览器
driver.maximize_window() #窗口最大化
driver.get(url)
print('正在加载界面....')
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "loginAction")))
time.sleep(1)
driver.find_element_by_id("loginName").send_keys(username)
driver.find_element_by_id("loginPassword").send_keys(password)
driver.find_element_by_id("loginAction").click()
print('已登录,正在获取cookie....')
if find_cookie:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, "viewport")))
time.sleep(3)
cookies = driver.get_cookies()
cookie_list = []
for dict in cookies:
cookie = dict['name'] + '=' + dict['value']
cookie_list.append(cookie)
cookie = ';'.join(cookie_list)
print('获取cookie成功\ncookie:%s'%cookie)
else : cookie=False
driver.close()
return cookie
if __name__ == '__main__':
cookie = loginWeibo('账号', '密码',show=True)
from urllib.parse import urlencode
import requests
base_url = 'https://www.baidu.com?'
params={'type':'uid',
'value':'2830678474',
'containerid':'1076032830678474',
'page':2}
url=base_url + urlencode(params)
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 2 22:51:32 2020
实现单线程抓取
@author: denglei
"""
import time
import requests
import logging
class GetUrl():
def __init__(self, url, num=5, headers={}, level=2):
''' 各参数说明:
url - 待爬取网站
num - 网站响应失败后,最大爬取次数
headers - 表头信息
level - 信息显示等级,0级最低,5级最大
err_url - 抓取失败的网站汇总
data - 爬取数据 '''
self.url = url
self.num = num
self.headers = headers
self.level = level
self.err_url = ''
self.data = ''
self.run()
def run(self):
self.Info_level()
self.data = self.judge()
def Info_level(self):
levels = [logging.NOTSET , logging.DEBUG, logging.INFO,
logging.WARNING, logging.ERROR, logging.CRITICAL]
logging.basicConfig(level = levels[self.level])
def response(self):
try:
req = requests.get(self.url, headers=self.headers,timeout=30)
req.raise_for_status()
req.encoding = 'utf-8'
logging.debug('Python 返回URUL:%s 数据成功' %self.url)
return req.text
except:
logging.debug('Python 返回URL:%s 数据失败' %self.url)
return ''
def judge(self):
n = 1;
while n <= self.num:
logging.debug(f'开始第{n}次加载.......')
data = self.response()
if data : break
n += 1
time.sleep(2)
if data == '':
logging.debug('***************************************')
logging.debug('爬取次数已达到最大限制,已跳过URL:%s'%self.url)
logging.debug('***************************************')
self.err_url = self.url
return data
if __name__ == '__main__':
cookie = ''
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2714.400",
'Cookie':cookie}
url = 'https://www.baidu.com/'
data = GetUrl(url, headers=header).data
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 3 01:48:20 2020
@author: Administrator
"""
import numpy as np
import pandas as pd
import threading ,re, os ,time
from queue import Queue
from lxml import etree
import logging
from GetUrl import GetUrl
class ThreadGetData(threading.Thread):
def __init__(self, urlQueue, headers={}):
# urlQueue - 待爬取网站队列
# headers - 表头信息
super(ThreadGetData, self).__init__()
self.urlQueue = urlQueue
self.headers = headers
def run(self):
while True:
try : url = self.urlQueue.get(block=False) # 队列为空 产生异常
except : break
output = GetUrl(url, headers=self.headers)
if output.err_url == True : err_url.append(output.err_url)
logging.debug('正在解析url:%s'%url)
selector = etree.HTML(output.data)
text = selector.xpath('//div[@id="articlelistnew"]/div[contains(@class,"articleh")]')
for t in text:
result['value'].append(''.join(map(lambda x: x.strip() , t.xpath('./span[1]//text()'))))
time.sleep(1)
def main():
# 1.构建爬取队列
urlQueue = Queue()
for i in range(1,4):
url='http://guba.eastmoney.com/list,zssh000001,f_{}.html'.format(i)
urlQueue.put(url)
# 2.构建表头信息
cookie = ''
header = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/\
537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 \
Core/1.63.6788.400 QQBrowser/10.3.2714.400"
"Cookie" : cookie}
# 3.创建线程
threadParse = []
for i in range(4):
Cthread = ThreadGetData(urlQueue, header)
Cthread.start()
threadParse.append(Cthread)
logging.info('采集线程开始运行--------')
# 4.结束线程
for single in threadParse:
single.join()
logging.info("所有采集线程退出循环")
# 5.保存数据
df = pd.DataFrame(result)
df.to_excel('爬取结果.xlsx', index = False)
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logging.info('程序开始运行...')
start=time.perf_counter() # 开始计时
# 定义待爬取数据
result = {}
result['value'] = []
err_url = []
main()
end=time.perf_counter() # 结束计时
logging.info('程序共运行%.2f秒'%(end-start))
#-*- encoding:utf-8 -*-
import base64
import requests
import re
import rsa
import binascii
def Get_cookies(username, password):
'''登陆新浪微博,获取登陆后的Cookie,返回到变量cookie中'''
url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&client=ssologin.js(v1.4.4)%'+username
html = requests.get(url).content.decode('utf-8')
servertime = re.findall('"servertime":(.*?),',html,re.S)[0]
nonce = re.findall('"nonce":"(.*?)"',html,re.S)[0]
pubkey = re.findall('"pubkey":"(.*?)"',html,re.S)[0]
rsakv = re.findall('"rsakv":"(.*?)"',html,re.S)[0]
username = base64.b64encode(username.encode()) #加密用户名
rsaPublickey = int(pubkey, 16)
key = rsa.PublicKey(rsaPublickey, 65537) #创建公钥
message = str(servertime) + '\t' + str(nonce) + '\n' + str(password) #拼接明文js加密文件中得到
passwd = rsa.encrypt(message.encode('utf-8'), key) #加密
passwd = binascii.b2a_hex(passwd) #将加密信息转换为16进制。
login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.4)'
data = {'entry': 'weibo',
'gateway': '1',
'from': '',
'savestate': '7',
'userticket': '1',
'ssosimplelogin': '1',
'vsnf': '1',
'vsnval': '',
'su': username,
'service': 'miniblog',
'servertime': servertime,
'nonce': nonce,
'pwencode': 'rsa2',
'sp': passwd,
'encoding': 'UTF-8',
'prelt': '115',
'rsakv' : rsakv,
'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
'returntype': 'META'
}
html = requests.post(login_url,data=data).content.decode('gbk')
urlnew = re.findall('location.replace\(\'(.*?)\'',html,re.S)[0]
#发送get请求并保存cookies
cookies = requests.get(urlnew).cookies.items()
cookie = ''
for name, value in cookies:
cookie += '{0}={1};'.format(name, value)
return cookie
if __name__ == '__main__':
cookie = Get_cookies('账号', '密码')
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
'Cookie':cookie}
url = 'https://s.weibo.com/weibo?q=%E4%B8%AD%E7%BE%8E%E8%B4%B8%E6%98%93%E6%88%98&wvr=6&b=1&reason=&retcode=&Refer=SWeibo_box'
data = requests.get(url, headers=header).text