多臂老虎机(multi-armed bandit,MAB)
如果拉杆的收益分布已知,直接使用贪心策略一直拉最优拉杆即可,但当分布未知时,我们一方面需要足够多的交互来估计拉杆的期望收益,另一方面又要充分利用当前的估计结果尽量最大化收益,这个简单的问题很好地反映了强化学习中的 “探索-利用困境”
多臂老虎机问题相比完整 RL 问题的显著简化是不存在状态转移。不妨设系统一直处于一个固定状态 $s$,操作第 $i$ 根拉杆记为动作 $a_i$,则该设定下任何 $(s,a_i)$ 的真实价值 $Q(s,a_i)$ 都仅和此处的即时 reward $r(s,a_i)$ 相关,所有反馈都是纠正性反馈,可以保证(交互次数足够多时)经验期望就是对真实价值的良好估计。因此我们不用特别考虑价值估计方法导致的误差,特别适合研究如何平衡利用(开发)和探索(试探)
为期望 reward,即 $Q(a)=\mathbb{E}_{r\sim\mathcal{R}(\cdot|a)}[r]$,最优动作
具有最大的价值 $Q^*=\max_{a\in\mathcal{A}}Q(a)$。定义后悔
为拉动当前拉杆的动作 $a$ 与最优拉杆的期望奖励差(价值差)$R(a)=Q^*-Q(a)$,则一次完整的 $T$ 步决策的累计后悔
为 $\sigma_R = \sum_{t=1}^{T} R(a_t)$。MAB 的优化目标"最大化累计奖励"等价于"最小化累计后悔"
from typing import Tuple
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from abc import ABCMeta
import abc
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
class BernoulliBandit:
    """K-armed Bernoulli bandit.

    Pulling an arm yields reward 1 with probability p and reward 0 with
    probability 1 - p, where p is drawn once per arm from the uniform
    distribution on [0, 1).

    Attributes:
        K: number of arms.
        values: array of length K holding each arm's success probability,
            which is also its expected reward.
        bestAction: index of the arm with the highest success probability.
    """

    def __init__(self, K):
        self.K = K
        # One success probability per arm, sampled uniformly from [0, 1).
        self.values = np.random.uniform(size=K)
        self.bestAction = np.argmax(self.values)

    def step(self, k):
        """Pull arm ``k`` and return the reward as a plain ``int`` (0 or 1).

        The comparison yields a numpy boolean; it is converted to ``int`` so
        callers can do ordinary arithmetic on the reward (numpy booleans
        reject unary minus and bool-bool subtraction).
        """
        return int(np.random.rand() < self.values[k])
class GaussBandit:
    """K-armed Gaussian bandit.

    Each arm's expected reward is sampled from the standard normal
    distribution; the immediate reward of a pull is that expectation plus
    unit-variance Gaussian noise.

    Attributes:
        K: number of arms.
        values: array of length K with the expected reward of each arm.
        bestAction: index of the arm with the highest expected reward.
    """

    def __init__(self, K=10):
        self.K = K
        self.values = np.random.randn(K)
        self.bestAction = np.argmax(self.values)

    def step(self, k):
        """Pull arm ``k`` and return one noisy reward as a scalar ``float``.

        A scalar (rather than a length-1 array) is returned so that callers
        can store the reward, or quantities derived from it, into single
        array elements without relying on implicit array-to-scalar conversion.
        """
        return float(np.random.normal(loc=self.values[k], scale=1))

    def showDistribution(self):
        """Draw a violin plot of the immediate-reward distribution of every arm.

        1000 rewards are sampled per arm; arms are labelled 'no1', 'no2', ...
        """
        plt.figure(figsize=(8, 5))
        samples = pd.DataFrame(columns=['Arm', 'Reward'])
        # Iterate over the actual number of arms instead of a hard-coded 10.
        for i in range(self.K):
            samples.loc[i] = ['no' + str(i + 1), np.random.normal(loc=self.values[i], scale=1, size=1000)]
        # One row per (arm, sample) pair, with numeric rewards, as seaborn expects.
        samples = samples.explode('Reward')
        samples['Reward'] = samples['Reward'].astype('float')
        sns.violinplot(data=samples, x='Arm', y='Reward')
# Create a random 10-armed Gaussian bandit whose per-arm reward distributions can be inspected.
bandit = GaussBandit(K=10)
在多臂老虎机乃至完整 RL 问题中,平衡探索和利用的常用思路是在开始时做比较多的探索,在对每根拉杆都有比较准确的估计后,再进行利用。目前已有一些比较经典的算法来解决这个问题,例如 $\epsilon$-贪婪算法、上置信界算法和汤普森采样算法等,我们接下来将分别介绍这几种算法
class Solver(metaclass=ABCMeta):
    """Base framework shared by the multi-armed bandit algorithms.

    Subclasses implement ``run_one_step`` (the action-selection policy);
    ``rollout`` drives the interaction loop, maintains sample-average value
    estimates and records the evaluation curves.

    Attributes:
        bandit: the bandit being played; must expose ``K``, ``values``,
            ``bestAction`` and ``step(k)``.
        counts: number of pulls of each arm so far.
        initValues: private float copy of the initial value estimates.
        qValues: current value estimate of each arm.
    """

    def __init__(self, bandit, initValues):
        self.bandit = bandit
        self.counts = np.zeros(self.bandit.K)
        # Keep private float copies: qValues is updated in place during a
        # rollout, and the caller's array is typically shared by many solvers,
        # so aliasing it would leak one run's estimates into the next.
        self.initValues = np.array(initValues, dtype=float)
        self.qValues = self.initValues.copy()

    @abc.abstractmethod
    def run_one_step(self) -> Tuple[int, float]:
        """Select an arm, pull it, and return ``(arm index, immediate reward)``.

        Implemented by each concrete policy.
        """
        raise NotImplementedError

    def rollout(self, num_steps):
        """Interact with the bandit for ``num_steps`` steps.

        Pull counts and value estimates are reset to their initial state first.

        Returns:
            A tuple of three arrays of length ``num_steps``:
            the running average reward, the running fraction of steps on which
            the best arm was chosen, and the cumulative regret.
        """
        G, B, R = 0, 0, 0  # total reward, best-arm pulls, cumulative regret
        returnCurve = np.zeros(num_steps)
        proportionCurve = np.zeros(num_steps)
        regretCurve = np.zeros(num_steps)

        self.counts = np.zeros(self.bandit.K)
        # Copy again so the stored initial estimates survive repeated rollouts.
        self.qValues = self.initValues.copy()

        for i in range(num_steps):
            k, r = self.run_one_step()
            self.counts[k] += 1
            # Incremental sample-average update of the chosen arm's estimate.
            self.qValues[k] += 1. / (self.counts[k]) * (r - self.qValues[k])

            B += (k == self.bandit.bestAction)
            G += r
            R += self.bandit.values[self.bandit.bestAction] - self.bandit.values[k]

            returnCurve[i] = G / (i + 1)
            proportionCurve[i] = B / (i + 1)
            regretCurve[i] = R

        return returnCurve, proportionCurve, regretCurve
def plot(banditParas, sloverParas):
    """Plot the average-return, best-action-ratio and cumulative-regret curves.

    Args:
        banditParas: ``(bandit class, number of arms, number of bandits to
            average over, interaction steps per bandit)``.
        sloverParas: ``(solver class, initial value estimates, settings)``,
            where each setting is a tuple ``(legend label, *extra solver args)``.
    """
    banditClass, banditArms, banditNum, banditSteps = banditParas
    sloverClass, initValues, sloverSettings = sloverParas

    # Allow matplotlib to render Chinese text and the minus sign.
    plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

    fig = plt.figure(figsize=(9, 9))
    axes = [fig.add_subplot(3, 1, position, label=tag)
            for position, tag in enumerate(('a1', 'a2', 'a3'), start=1)]

    # One set of curves per solver setting.
    for paraLabel, *extraArgs in sloverSettings:
        # Rows: average return, best-action ratio, cumulative regret.
        meanCurves = np.zeros((3, banditSteps))
        for run in range(banditNum):
            bandit = banditClass(banditArms)
            solver = sloverClass(bandit, initValues, *extraArgs)
            curves = np.vstack(solver.rollout(banditSteps))
            # Incremental mean over the bandits simulated so far.
            meanCurves += 1 / (run + 1) * (curves - meanCurves)
        for axis, curve in zip(axes, meanCurves):
            axis.plot(curve, '-', linewidth=2, label=paraLabel)

    # Only the first subplot shows the legend.
    axes[0].legend(fontsize=10)
def plotRegret(banditParas, sloverParas):
    """Plot only the cumulative-regret curve for each solver setting.

    Args:
        banditParas: ``(bandit class, number of arms, number of bandits to
            average over, interaction steps per bandit)``.
        sloverParas: ``(solver class, initial value estimates, settings)``,
            where each setting is a tuple ``(legend label, *extra solver args)``.
    """
    banditClass, banditArms, banditNum, banditSteps = banditParas
    sloverClass, initValues, sloverSettings = sloverParas

    # Allow matplotlib to render Chinese text and the minus sign.
    plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

    axis = plt.figure(figsize=(10, 5)).add_subplot(1, 1, 1, label='a1')

    for paraLabel, *extraArgs in sloverSettings:
        meanRegret = np.zeros(banditSteps)
        for run in range(banditNum):
            bandit = banditClass(banditArms)
            solver = sloverClass(bandit, initValues, *extraArgs)
            regretCurve = solver.rollout(banditSteps)[2]
            # Incremental mean over the bandits simulated so far.
            meanRegret += 1.0 / (run + 1) * (regretCurve - meanRegret)
        axis.plot(meanRegret, '-', linewidth=2, label=paraLabel)

    axis.legend(fontsize=10)
如下选择动作,以较大概率进行贪心利用,同时以 $\epsilon$ 小概率随机探索
$$a_t \leftarrow \begin{cases}\arg\max_a Q(a) & \text{with probability } 1-\epsilon \\ \text{a random action} & \text{with probability } \epsilon\end{cases}$$
class EpsilonGreedy(Solver):
    """Epsilon-greedy policy built on ``Solver``.

    With probability ``epsilon`` a uniformly random arm is pulled; otherwise
    an arm with the highest current value estimate is pulled, with ties
    broken uniformly at random.

    Positional args: ``(bandit, initValues, epsilon)``.
    """

    def __init__(self, *args):
        bandit, initValues, epsilon = args
        super().__init__(bandit, initValues)
        self.epsilon = epsilon

    def run_one_step(self):
        """Choose an arm, pull it, and return ``(arm index, reward)``."""
        if np.random.binomial(1, self.epsilon) == 1:
            # Explore: pick any arm uniformly at random.
            k = np.random.randint(self.bandit.K)
        else:
            # Exploit: pick among the arms tied for the best estimate. This
            # must be the alternative branch, otherwise the exploratory
            # choice above is always overwritten.
            k = np.random.choice(np.flatnonzero(self.qValues == np.max(self.qValues)))
        r = self.bandit.step(k)
        return k, r
if __name__ == '__main__':
    # Compare several fixed exploration rates on Gaussian bandits.
    K, NUM, STEPS = 10, 100, 4000  # arms, bandits averaged over, steps per bandit

    sloverSettings = [
        ('0.001-greedy', 0.001),
        ('0.1-greedy', 0.1),
        ('0.15-greedy', 0.15),
        ('0.25-greedy', 0.25),
        ('0.50-greedy', 0.50),
    ]
    banditParas = (GaussBandit, K, NUM, STEPS)
    sloverParas = (EpsilonGreedy, np.ones(K), sloverSettings)

    # Run the comparison for every setting and plot all three curves.
    plot(banditParas, sloverParas)
注:这里理论上应该设 $\epsilon=\frac{1}{t}$,以保证 $\epsilon\le 1$ 是一个合法的概率,但是测试发现这样的探索还是有点不足,这里简单地增大分子就能在早期进行更多的纯随机试探
class DecayingEpsilonGreedy(Solver):
    """Epsilon-greedy policy whose exploration probability decays over time.

    At step t a uniformly random arm is pulled with probability
    ``100 / t`` (values above 1 simply mean "always explore"); otherwise an
    arm with the highest current value estimate is pulled, with ties broken
    uniformly at random.

    Positional args: ``(bandit, initValues)``.
    """

    def __init__(self, *args):
        bandit, initValues = args
        super().__init__(bandit, initValues)
        self.total_count = 0  # number of steps taken so far

    def run_one_step(self):
        """Choose an arm, pull it, and return ``(arm index, reward)``."""
        self.total_count += 1
        # The numerator is larger than 1 to force more pure exploration in
        # the early steps.
        if np.random.random() < 100 / self.total_count:
            k = np.random.randint(0, self.bandit.K)
        else:
            # Exploit: must be the alternative branch, otherwise the
            # exploratory choice above is always overwritten.
            k = np.random.choice(np.flatnonzero(self.qValues == np.max(self.qValues)))
        r = self.bandit.step(k)
        return k, r
if __name__ == '__main__':
    # Regret of the decaying epsilon-greedy policy on Bernoulli bandits.
    K, NUM, STEPS = 10, 10, 5000  # arms, bandits averaged over, steps per bandit

    banditParas = (BernoulliBandit, K, NUM, STEPS)
    sloverParas = (
        DecayingEpsilonGreedy,
        np.ones(K),
        [('DecayingEpsilonGreedy',)],
    )
    plotRegret(banditParas, sloverParas)
# For the detailed derivation and explanation, see "Reinforcement Learning Notes (2) - Multi-armed Bandits", Section 5.
class UCB(Solver):
    """Upper-confidence-bound policy built on ``Solver``.

    Positional args: ``(bandit, initValues, coef)``, where ``coef`` scales the
    exploration bonus added to each value estimate.
    """

    def __init__(self, *args):
        bandit, initValues, coef = args
        super().__init__(bandit, initValues)
        self.total_count = 0  # number of steps taken so far
        self.coef = coef

    def run_one_step(self):
        """Pull the arm with the largest upper confidence bound.

        Returns ``(arm index, reward)``.
        """
        self.total_count += 1
        # Exploration bonus shrinks as an arm is pulled more often; the +1
        # keeps the denominator non-zero for arms not pulled yet.
        bonus = np.sqrt(np.log(self.total_count) / (2 * (self.counts + 1)))
        upperBounds = self.qValues + self.coef * bonus
        k = np.argmax(upperBounds)
        r = self.bandit.step(k)
        return k, r
if __name__ == '__main__':
    # Regret of the UCB policy (coefficient 1) on Bernoulli bandits.
    K, NUM, STEPS = 10, 10, 5000  # arms, bandits averaged over, steps per bandit

    banditParas = (BernoulliBandit, K, NUM, STEPS)
    sloverParas = (
        UCB,
        np.ones(K),
        [('UCB', 1)],
    )
    plotRegret(banditParas, sloverParas)
汤普森采样(Thompson sampling)
是适用于 MAB 问题的一个经典算法,其核心思想就是利用交互数据直接估计出各个拉杆的奖励分布 $\mathcal{R}(r|a)$,然后根据它来选择动作。具体实现时
class ThompsonSampling(Solver):
    """Thompson-sampling policy built on ``Solver``.

    Keeps a Beta distribution per arm and, at each step, pulls the arm whose
    sampled value is largest.

    Positional args: ``(bandit, initValues)``; ``initValues`` is only passed on
    to the base class and is not used by the arm selection itself.
    """

    def __init__(self, *args):
        bandit, initValues = args
        super().__init__(bandit, initValues)
        arms = self.bandit.K
        self._a = np.ones(arms)  # first Beta parameter per arm (grows with reward-1 outcomes)
        self._b = np.ones(arms)  # second Beta parameter per arm (grows with reward-0 outcomes)

    def run_one_step(self):
        """Sample from every arm's Beta distribution and pull the best sample.

        Returns ``(arm index, reward)``.
        """
        draws = np.random.beta(self._a, self._b)
        k = np.argmax(draws)
        r = self.bandit.step(k)
        # Update the Beta parameters of the pulled arm with the observed reward.
        self._a[k] += r
        self._b[k] += (1 - r)
        return k, r
if __name__ == '__main__':
    # Regret of Thompson sampling on Bernoulli bandits.
    K, NUM, STEPS = 10, 10, 5000  # arms, bandits averaged over, steps per bandit

    banditParas = (BernoulliBandit, K, NUM, STEPS)
    sloverParas = (
        ThompsonSampling,
        np.ones(K),
        [('ThompsonSampling',)],
    )
    plotRegret(banditParas, sloverParas)
