This post covers a Python implementation of the multi-armed bandit algorithms from Chapter 2 of Reinforcement Learning: An Introduction (2nd edition). A fairly official implementation of the whole book is available on GitHub: https://github.com/ShangtongZhang/reinforcement-learning-an-introduction. When I first read that code I found it somewhat hard to follow: although it looks tidy, a closer look at the logic shows that the pieces are too tightly coupled. Of course, I am only talking about the Chapter 2 code here; I have not read the other chapters. The Chapter 2 code provided at that link is reproduced below:
#######################################################################
# Copyright (C) #
# 2016 Shangtong Zhang([email protected]) #
# 2016 Tian Jun([email protected]) #
# 2016 Artem Oboturov([email protected]) #
# 2016 Kenta Shimada([email protected]) #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
class Bandit:
# @kArm: # of arms
# @epsilon: probability for exploration in epsilon-greedy algorithm
# @initial: initial estimation for each action
# @stepSize: constant step size for updating estimations
# @sampleAverages: if True, use sample averages to update estimations instead of constant step size
# @UCB: if not None, use UCB algorithm to select action
# @gradient: if True, use gradient based bandit algorithm
# @gradientBaseline: if True, use average reward as baseline for gradient based bandit algorithm
def __init__(self, kArm=10, epsilon=0., initial=0., stepSize=0.1, sampleAverages=False, UCBParam=None,
gradient=False, gradientBaseline=False, trueReward=0.):
self.k = kArm
self.stepSize = stepSize
self.sampleAverages = sampleAverages
self.indices = np.arange(self.k)
self.time = 0
self.UCBParam = UCBParam
self.gradient = gradient
self.gradientBaseline = gradientBaseline
self.averageReward = 0
self.trueReward = trueReward
# real reward for each action
self.qTrue = []
# estimation for each action
self.qEst = np.zeros(self.k)
# # of chosen times for each action
self.actionCount = []
self.epsilon = epsilon
# initialize real rewards with N(0,1) distribution and estimations with desired initial value
for i in range(0, self.k):
self.qTrue.append(np.random.randn() + trueReward)
self.qEst[i] = initial
self.actionCount.append(0)
self.bestAction = np.argmax(self.qTrue)
# get an action for this bandit, explore or exploit?
def getAction(self):
# explore
if self.epsilon > 0:
if np.random.binomial(1, self.epsilon) == 1:
return np.random.choice(self.indices)
# exploit
if self.UCBParam is not None:
UCBEst = self.qEst + \
self.UCBParam * np.sqrt(np.log(self.time + 1) / (np.asarray(self.actionCount) + 1))
return np.argmax(UCBEst)
if self.gradient:
expEst = np.exp(self.qEst)
self.actionProb = expEst / np.sum(expEst)
return np.random.choice(self.indices, p=self.actionProb)
return np.argmax(self.qEst)
# take an action, update estimation for this action
def takeAction(self, action):
# generate the reward under N(real reward, 1)
reward = np.random.randn() + self.qTrue[action]
self.time += 1
self.averageReward = (self.time - 1.0) / self.time * self.averageReward + reward / self.time
self.actionCount[action] += 1
if self.sampleAverages:
# update estimation using sample averages
self.qEst[action] += 1.0 / self.actionCount[action] * (reward - self.qEst[action])
elif self.gradient:
oneHot = np.zeros(self.k)
oneHot[action] = 1
if self.gradientBaseline:
baseline = self.averageReward
else:
baseline = 0
self.qEst = self.qEst + self.stepSize * (reward - baseline) * (oneHot - self.actionProb)
else:
# update estimation with constant step size
self.qEst[action] += self.stepSize * (reward - self.qEst[action])
return reward
figureIndex = 0
# for figure 2.1
def figure2_1():
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
sns.violinplot(data=np.random.randn(200,10) + np.random.randn(10))
plt.xlabel("Action")
plt.ylabel("Reward distribution")
def banditSimulation(nBandits, time, bandits):
bestActionCounts = [np.zeros(time, dtype='float') for _ in range(0, len(bandits))]
averageRewards = [np.zeros(time, dtype='float') for _ in range(0, len(bandits))]
for banditInd, bandit in enumerate(bandits):
for i in range(0, nBandits):
for t in range(0, time):
action = bandit[i].getAction()
reward = bandit[i].takeAction(action)
averageRewards[banditInd][t] += reward
if action == bandit[i].bestAction:
bestActionCounts[banditInd][t] += 1
bestActionCounts[banditInd] /= nBandits
averageRewards[banditInd] /= nBandits
return bestActionCounts, averageRewards
# for figure 2.2
def epsilonGreedy(nBandits, time):
epsilons = [0, 0.1, 0.01]
bandits = []
for epsInd, eps in enumerate(epsilons):
bandits.append([Bandit(epsilon=eps, sampleAverages=True) for _ in range(0, nBandits)])
bestActionCounts, averageRewards = banditSimulation(nBandits, time, bandits)
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
for eps, counts in zip(epsilons, bestActionCounts):
plt.plot(counts, label='epsilon = '+str(eps))
plt.xlabel('Steps')
plt.ylabel('% optimal action')
plt.legend()
plt.figure(figureIndex)
figureIndex += 1
for eps, rewards in zip(epsilons, averageRewards):
plt.plot(rewards, label='epsilon = '+str(eps))
plt.xlabel('Steps')
plt.ylabel('average reward')
plt.legend()
# for figure 2.3
def optimisticInitialValues(nBandits, time):
bandits = [[], []]
bandits[0] = [Bandit(epsilon=0, initial=5, stepSize=0.1) for _ in range(0, nBandits)]
bandits[1] = [Bandit(epsilon=0.1, initial=0, stepSize=0.1) for _ in range(0, nBandits)]
bestActionCounts, _ = banditSimulation(nBandits, time, bandits)
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
plt.plot(bestActionCounts[0], label='epsilon = 0, q = 5')
plt.plot(bestActionCounts[1], label='epsilon = 0.1, q = 0')
plt.xlabel('Steps')
plt.ylabel('% optimal action')
plt.legend()
# for figure 2.4
def ucb(nBandits, time):
bandits = [[], []]
bandits[0] = [Bandit(epsilon=0, stepSize=0.1, UCBParam=2) for _ in range(0, nBandits)]
bandits[1] = [Bandit(epsilon=0.1, stepSize=0.1) for _ in range(0, nBandits)]
_, averageRewards = banditSimulation(nBandits, time, bandits)
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
plt.plot(averageRewards[0], label='UCB c = 2')
plt.plot(averageRewards[1], label='epsilon greedy epsilon = 0.1')
plt.xlabel('Steps')
plt.ylabel('Average reward')
plt.legend()
# for figure 2.5
def gradientBandit(nBandits, time):
bandits =[[], [], [], []]
bandits[0] = [Bandit(gradient=True, stepSize=0.1, gradientBaseline=True, trueReward=4) for _ in range(0, nBandits)]
bandits[1] = [Bandit(gradient=True, stepSize=0.1, gradientBaseline=False, trueReward=4) for _ in range(0, nBandits)]
bandits[2] = [Bandit(gradient=True, stepSize=0.4, gradientBaseline=True, trueReward=4) for _ in range(0, nBandits)]
bandits[3] = [Bandit(gradient=True, stepSize=0.4, gradientBaseline=False, trueReward=4) for _ in range(0, nBandits)]
bestActionCounts, _ = banditSimulation(nBandits, time, bandits)
labels = ['alpha = 0.1, with baseline',
'alpha = 0.1, without baseline',
'alpha = 0.4, with baseline',
'alpha = 0.4, without baseline']
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
for i in range(0, len(bandits)):
plt.plot(bestActionCounts[i], label=labels[i])
plt.xlabel('Steps')
plt.ylabel('% Optimal action')
plt.legend()
# Figure 2.6
def figure2_6(nBandits, time):
labels = ['epsilon-greedy', 'gradient bandit',
'UCB', 'optimistic initialization']
generators = [lambda epsilon: Bandit(epsilon=epsilon, sampleAverages=True),
lambda alpha: Bandit(gradient=True, stepSize=alpha, gradientBaseline=True),
lambda coef: Bandit(epsilon=0, stepSize=0.1, UCBParam=coef),
lambda initial: Bandit(epsilon=0, initial=initial, stepSize=0.1)]
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]
bandits = [[generator(pow(2, param)) for _ in range(0, nBandits)] for generator, parameter in zip(generators, parameters) for param in parameter]
_, averageRewards = banditSimulation(nBandits, time, bandits)
rewards = np.sum(averageRewards, axis=1)/time
global figureIndex
plt.figure(figureIndex)
figureIndex += 1
i = 0
for label, parameter in zip(labels, parameters):
l = len(parameter)
plt.plot(parameter, rewards[i:i+l], label=label)
i += l
plt.xlabel('Parameter(2^x)')
plt.ylabel('Average reward')
plt.legend()
figure2_1()
epsilonGreedy(2000, 1000)
optimisticInitialValues(2000, 1000)
ucb(2000, 1000)
gradientBandit(2000, 1000)
# This will take quite a long time
figure2_6(2000, 1000)
plt.show()
In my personal view, reinforcement learning is about an agent completing a given task in an unknown environment, adjusting its own action policy according to the reward signals fed back by that environment. Reinforcement learning is therefore a process in which the environment and the agent interact, and the agent tunes its action policy based on the information from that interaction. In simulation code for reinforcement learning, the environment object and the agent object should accordingly be kept separate, with a procedurally written function bridging the two. The code above mixes everything together, which makes it hard to understand at first reading.
Below, following the ideas of the code above, I rewrite it in an object-oriented style:
① Create a class for the environment: Bandit
② Create a class for the agent: Agent
③ The interaction process (see the simulation function): the agent chooses and performs an action (chooseAction), the environment produces a reward signal for that action (yieldReward), and the agent adjusts its actions according to the feedback (updatePolicy). Repeating this loop continually optimizes the agent's policy against the reward signal; a minimal self-contained sketch of this loop is given right below.
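To make the interaction cycle in ③ concrete, here is a minimal, self-contained sketch of that loop. The ToyBandit and ToyAgent names are illustrative stand-ins of my own (they are not the final Bandit and Agent classes below); the agent is a plain epsilon-greedy learner with sample-average estimates:

import numpy as np

class ToyBandit:  # hypothetical 3-armed environment, for illustration only
    def __init__(self):
        self.qTrue = np.random.randn(3)  # true value of each arm
    def yieldReward(self, action):
        return np.random.randn() + self.qTrue[action]  # noisy reward around the true value

class ToyAgent:  # epsilon-greedy agent with sample-average value estimates
    def __init__(self, k=3, epsilon=0.1):
        self.epsilon = epsilon
        self.qEst = np.zeros(k)   # estimated value of each arm
        self.count = np.zeros(k)  # how often each arm has been pulled
        self.action = None
    def chooseAction(self):
        if np.random.rand() < self.epsilon:
            self.action = np.random.randint(len(self.qEst))  # explore
        else:
            self.action = int(np.argmax(self.qEst))          # exploit
        return self.action
    def updatePolicy(self, reward):
        a = self.action
        self.count[a] += 1
        self.qEst[a] += (reward - self.qEst[a]) / self.count[a]  # sample-average update

env, player = ToyBandit(), ToyAgent()
for t in range(1000):
    action = player.chooseAction()    # the agent chooses and performs an action
    reward = env.yieldReward(action)  # the environment produces a reward signal
    player.updatePolicy(reward)       # the agent adjusts its policy from the feedback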
The agent should have the following member functions and variables:
① Variables related to the environment, such as the actions it can take (e.g., the number of arms of the bandit)
② A function for choosing an action, a function for updating the policy, functions for saving and loading the policy, and finally a reset function that returns the agent to its original state
The environment should have the following member functions and variables:
① The parameters the problem requires, and the true value of each action
② The best action (in a simulated environment this can be used to judge how good the designed algorithm is), an environment-change function (the values can vary within a certain range), and a reward-generation function (feedback for the agent's actions); these are summarized in the skeleton sketch right after this list
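Mapped onto code, the two lists above amount to the bare skeleton below (method bodies omitted; the full implementations follow in the listing):

class Bandit:  # the environment
    def change(self): ...                # vary the action values within a certain range
    def yieldReward(self, action): ...   # reward signal fed back for the chosen action
    def getBestAction(self): ...         # ground-truth best action, used only to evaluate the agent

class Agent:  # the agent
    def chooseAction(self): ...          # select an action
    def updatePolicy(self, reward): ...  # adjust the policy using the reward
    def savePolicy(self): ...            # save the learned policy
    def loadPolicy(self): ...            # load a saved policy
    def reset(self): ...                 # return the agent to its original state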
The adjusted code is shown below:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#######################################################################
# Copyright (C) #
# 2018 Dianye Huang ([email protected])
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
# This program was rewritten with reference to the reinforcement learning course code on GitHub, with clearer logic.
# The environment and the agent are implemented as two separate classes that interact with each other, so that the
# environment's information is isolated from the agent's. The agent becomes a single independent entity whose learned
# parameters can be reset, allowing training to be repeated in order to check the stability of the algorithm.
# Stationary problem: the value of each action is fixed, but the reward fed back to the agent is corrupted by noise,
# so the values have to be estimated from repeated actions.
# The bandit problem is a single-step decision problem that does not depend on the current state, so the previous
# state need not be considered: an action is applied to the environment and a reward is obtained directly.
# Non-associative task: successive actions are largely unrelated to one another.
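# (Note added in this write-up, not in the original comments:) for reference, the sample-average estimate used below
# can be maintained incrementally as Q_{n+1} = Q_n + (1/n) * (R_n - Q_n), which is exactly what the 'SampleAverages'
# branch of Agent.updatePolicy implements.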
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Environment: receives an action and feeds back the corresponding reward
class Bandit:
    def __init__(self, kArm=10, trueReward=0):  # set up the initial variables
self.kArm = kArm
        self.trueReward = trueReward  # mean level of the true rewards for the actions
        self.actionCount = 0  # number of actions taken
        self.time = 0  # number of time steps elapsed
        # from the single input reward level, generate a true value for each arm (used to validate the algorithms)
self.qTrue = []
for i in range(0, self.kArm):
self.qTrue.append(np.random.randn()+self.trueReward)
def change(self):
        # changing the environment means re-drawing the action values
self.qTrue = []
for i in range(0, self.kArm):
self.qTrue.append(np.random.randn()+self.trueReward)
    def yieldReward(self, action):  # feed back a noisy reward for the chosen action
self.time += 1
return np.random.randn() + self.qTrue[action]
    def getBestAction(self):  # the optimal action, seen from a god's-eye view
return int(np.argmax(self.qTrue))
    def getkArms(self):  # expose the environment configuration (number of arms) for the agent and for performance evaluation
return self.kArm
def getTime(self):
return self.time
def showqTrue(self, figureIndex):
plt.figure(figureIndex)
print('qTrue:', self.qTrue)
sns.violinplot(data=self.qTrue + np.random.randn(200, 10))
plt.xlabel("Action")
plt.ylabel("Reward distribution")
# Agent: its capabilities are ① choosing an action, ② obtaining the reward from the environment,
# ③ adjusting its action policy, and ④ resetting its learned parameters
# It maximizes the return by estimating action values
class Agent:
def __init__(self, actionNum=10, method='SampleAverages',paraList=None):
self.time = 0
self.actionNum = actionNum
self.currenAction = None
self.method = method
if self.method == 'SampleAverages':
# para[0]->epsilon
self.epsilon = paraList[0]
            self.qEst = np.zeros(actionNum)  # estimated value of each action (Est -> estimate)
self.actionCount = np.zeros(actionNum)
elif self.method == 'Incremental':
# para[0]->epsilon; para[1]->step size
self.epsilon = paraList[0]
self.stepSize = paraList[1]
            self.qEst = np.zeros(actionNum)  # estimated value of each action (Est -> estimate)
        elif self.method == 'OptimisticInitial':  # param 1: step size, param 2: optimistic initial value
# para[0]->epsilon; para[1]->step size; para[2]-> optimistic values
self.epsilon = paraList[0]
self.stepSize = paraList[1]
self.optimisticValue = paraList[2]
            self.qEst = np.zeros(actionNum) + self.optimisticValue  # estimated value of each action (Est -> estimate)
        elif self.method == 'UCB':  # Upper Confidence Bound method: action value + exploration bonus
# para[0]->step size; para[1]->c
self.stepSize = paraList[0] # params: stepSize, optimistic value and c(control the degree of exploration)
self.c = paraList[1]
self.qEst = np.zeros(self.actionNum)
self.actionCount = np.zeros(actionNum)
elif self.method == 'Gradient':
self.sum = 0
self.Ht = np.zeros(self.actionNum)
self.alpha = paraList[0]
self.baseLine = paraList[1]
self.averageReward = 0
self.actionProb = np.zeros(self.actionNum)
def chooseAction(self):
        self.time += 1  # one action is taken per time step
        if self.method == 'SampleAverages' or self.method == 'Incremental' or self.method == 'OptimisticInitial':
            # epsilon-greedy policies: select an action according to the estimated action values
            # explore with probability epsilon
if self.epsilon > 0:
if np.random.binomial(1, self.epsilon) == 1:
                    self.currenAction = np.random.choice(self.actionNum)  # return a random action
return self.currenAction
# exploit -- greedy policy 1-epsilon probability for exploitation
            self.currenAction = int(np.argmax(self.qEst))  # greedy policy, Eq. (2.2) in the book
        elif self.method == 'UCB':
            explrProb = self.c*np.sqrt(np.log(self.time)/(self.actionCount+1))  # the +1 avoids division by zero
self.currenAction = int(np.argmax(self.qEst+explrProb))
elif self.method == 'Gradient':
            # update the probability of selecting each action
            expEst = np.exp(self.Ht)
            self.actionProb = expEst / np.sum(expEst)  # soft-max distribution, Eq. (2.9) in the book
self.currenAction = np.random.choice(self.actionNum, p=self.actionProb)
return self.currenAction
def updatePolicy(self, reward):
        # update the estimated action values (qEst)
        if self.method == 'SampleAverages':
            self.actionCount[self.currenAction] += 1  # count how often each action has been taken
            self.qEst[self.currenAction] += 1.0/self.actionCount[self.currenAction]*(reward - self.qEst[self.currenAction])  # incremental form, no need to accumulate rewards; a short derivation gives Eq. (2.1), p. 21 of the book
elif self.method == 'Incremental' or self.method == 'OptimisticInitial':
self.qEst[self.currenAction] += self.stepSize*(reward-self.qEst[self.currenAction]) # exponential recency-weighted average
elif self.method == 'UCB': # optimistic initial values + incremental + ucb
self.actionCount[self.currenAction] += 1
self.qEst[self.currenAction] += self.stepSize * (reward - self.qEst[self.currenAction])
elif self.method == 'Gradient':
oneHot = np.zeros(self.actionNum)
oneHot[self.currenAction] = 1
if self.baseLine:
                self.averageReward += (reward - self.averageReward)/ float(self.time)  # running mean of the rewards, used as the baseline
self.Ht += self.alpha * (reward - self.averageReward) * (oneHot - self.actionProb)
else:
self.Ht += self.alpha * reward * (oneHot - self.actionProb)
def reset(self):
        self.time = 0  # the time counter restarts after the policy is reset
        # reset the learned parameters
if self.method == 'SampleAverages':
self.qEst = np.zeros(self.actionNum)
            self.actionCount = np.zeros(self.actionNum)  # numpy arrays support these element-wise operations, plain Python lists do not
elif self.method == 'Incremental':
self.qEst = np.zeros(self.actionNum)
        elif self.method == 'OptimisticInitial':  # param 1: step size, param 2: optimistic initial value
            self.qEst = np.zeros(self.actionNum) + self.optimisticValue  # estimated value of each action (Est -> estimate)
elif self.method == 'UCB':
self.qEst = np.zeros(self.actionNum)
self.actionCount = np.zeros(self.actionNum)
elif self.method == 'Gradient':
self.averageReward = 0
self.Ht = np.zeros(self.actionNum)
self.actionProb = np.zeros(self.actionNum)
def savePolicy(self):
pass
def loadPolicy(self):
pass
# Run the simulation and record the data: this function wires the agent and the environment together and logs their interaction
def simulation(env, player, nBandits, time):
    # variables that record the agent's performance
    bestCount = np.zeros(time)  # how often the optimal action was chosen
    averageReward = np.zeros(time)  # accumulated reward, averaged later
    # run nBandits independent rounds, each with `time` time steps
    for i in range(0, nBandits):
        env.change()  # re-draw the action values of the environment within a certain range
for t in range(time):
            # the agent interacts with the environment and adjusts its action policy
            action = player.chooseAction()  # the player chooses an action
            reward = env.yieldReward(action)  # the environment returns a reward for that action
            player.updatePolicy(reward)  # the player adjusts its policy (updates the action values) from the reward
            # record the metrics that reflect how well the policy adjustment is working
            averageReward[t] += reward
            if action == env.getBestAction():  # add 1 whenever the chosen action equals the environment's best action
                bestCount[t] += 1
        # reset the agent's policy so that the stability of the algorithm can be assessed
        player.reset()  # after training for `time` steps, reset the policy and train again to estimate the average performance
    # compute the average performance
averageReward /= nBandits
bestCount /= nBandits
return averageReward, bestCount
# Plotting helper
# Two kinds of plots are drawn: ① the average reward and ② the fraction of optimal actions
class Plot:
def __init__(self):
pass
def plotting(self, data, figureIndex, labelStr, xStr, yStr):
        plt.figure(figureIndex)  # select the figure by index
        plt.plot(data, label=labelStr)  # plot the data with its legend label
        plt.xlabel(xStr)  # x-axis label
        plt.ylabel(yStr)  # y-axis label
        plt.legend()  # show the legend
####################################################################################################
if __name__ == '__main__':
    # instantiate the plotting helper
    plot = Plot()
    # instantiate the objects that interact
    env = Bandit(10, 4)  # instantiate the environment
    env.showqTrue(figureIndex=1)  # show the distribution of the true rewards
    # --------------- agent 1 simulation -------------
    # instantiate a specific agent
    player = Agent(env.kArm, method='Gradient', paraList=[0.1, False])
    # run the simulation and return the data to be recorded
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    # plot the results
plot.plotting(avgReward, 2, 'Gradient (alpha=0.1, without baseline)', 'Steps', 'Average Reward')
plot.plotting(bestCount, 3, 'Gradient (alpha=0.1, without baseline)', 'Steps', 'Optimal Action')
    # ---------------- agent 2 simulation -------------
player = Agent(env.kArm, method='Gradient', paraList=[0.1, True])
avgReward, bestCount = simulation(env, player, 2000, 1000)
plot.plotting(avgReward, 2, 'Gradient (alpha=0.1, with baseline)', 'Steps', 'Average Reward')
plot.plotting(bestCount, 3, 'Gradient (alpha=0.1, with baseline)', 'Steps', 'Optimal Action')
    # ---------------- agent 3 simulation -------------
player = Agent(env.kArm, method='Gradient', paraList=[0.4, False])
avgReward, bestCount = simulation(env, player, 2000, 1000)
plot.plotting(avgReward, 2, 'Gradient (alpha=0.4, without baseline)', 'Steps', 'Average Reward')
plot.plotting(bestCount, 3, 'Gradient (alpha=0.4, without baseline)', 'Steps', 'Optimal Action')
    # ---------------- agent 4 simulation -------------
player = Agent(env.kArm, method='Gradient', paraList=[0.4, True])
avgReward, bestCount = simulation(env, player, 2000, 1000)
plot.plotting(avgReward, 2, 'Gradient (alpha=0.4, with baseline)', 'Steps', 'Average Reward')
plot.plotting(bestCount, 3, 'Gradient (alpha=0.4, with baseline)', 'Steps', 'Optimal Action')
    # show all figures
plt.show()
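As a usage example (not part of the original post), the same classes can also reproduce the epsilon-greedy comparison of Figure 2.2 in the book. A sketch that could be inserted just before the final plt.show() in the __main__ block above, assuming the definitions are unchanged (the figure indices 4 and 5 are arbitrary):

    # compare epsilon = 0, 0.01 and 0.1 with sample-average estimates, as in Figure 2.2
    env2 = Bandit(10, 0)  # fresh environment with true rewards centered at 0
    for eps in [0.0, 0.01, 0.1]:
        greedyPlayer = Agent(env2.kArm, method='SampleAverages', paraList=[eps])
        avgReward, bestCount = simulation(env2, greedyPlayer, 2000, 1000)
        plot.plotting(avgReward, 4, 'epsilon = ' + str(eps), 'Steps', 'Average Reward')
        plot.plotting(bestCount, 5, 'epsilon = ' + str(eps), 'Steps', 'Optimal Action')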