运行过程就是下面这样:
下面是代码:
"""
Created on Fri Mar 5 08:48:06 2021
@author: DELL
"""
import cv2 as cv
import numpy as np
import pandas as pd
import random as rd
# Number of non-terminal states; learn() passes it to Make_Q_table as the row count.
nums_states = 8
# Number of actions; learn() passes it to Make_Q_table as the column count.
nums_action = 4
# Presumably the discount factor "gamma" of the Q-learning update (the name is
# misspelled but kept, since other code may reference it) -- TODO confirm.
gammma = 0.9
# Exploration threshold ("epsilon", misspelled but kept): choose_action picks a
# random legal action whenever a uniform draw in [0, 1) falls below this value.
epslion = 0.4
# Number of training episodes run by learn().
epochs = 50
# Learning rate (step size) used by the Q-value update in learn().
lr = 0.1
def Make_Q_table(nums_states,nums_action):
    '''
    Build the zero-initialised Q-table.

    Parameters
    ----------
    nums_states : int
        Number of non-terminal states. Rows are labelled 1..nums_states.
        (The grid has nine states in total; the terminal one gets no row.)
    nums_action : int
        Number of actions. Must be 4, one per column: 'up', 'down', 'left',
        'right'. Which of them are legal depends on the current state
        (e.g. no 'left' next to the left wall); that is decided by the
        caller, not here.

    Returns
    -------
    q_table : DataFrame
        Table of shape (nums_states, nums_action) holding every Q-value,
        all initialised to 0.

    Raises
    ------
    ValueError
        If nums_action does not match the number of action columns.
    '''
    columns = ['up','down','left','right']
    if nums_action != len(columns):
        raise ValueError(
            'nums_action must be %d (one per column %s), got %r'
            % (len(columns), columns, nums_action)
        )
    # Derive the row labels from the argument instead of hard-coding 1..8,
    # so the table really has the requested number of states.
    index = range(1, nums_states + 1)
    q_table = pd.DataFrame(
        np.zeros(shape=(nums_states, nums_action)),
        index=index,
        columns=columns,
    )
    return q_table
def Environment(s,step_nums):
    '''
    Draw the current situation on a 600x600 grid in an OpenCV window.

    Parameters
    ----------
    s : int or str
        Current state: a position id from 1 to 8, or the string 'end'
        once the treasure cell has been reached.
    step_nums : int
        Number of steps taken so far; only shown in the final message.

    Returns
    -------
    None. The picture is shown in the window named 'treasure'.
    '''
    # (column, row) of every non-terminal state, both counted from 1.
    cell_of_state = dict(zip(
        range(1, 9),
        [(1,1), (2,1), (3,1), (1,2), (2,2), (3,2), (1,3), (2,3)],
    ))
    finished = s == 'end'

    canvas = np.ones(shape=(600,600))
    if not finished:
        col, row = cell_of_state[s]
        # Mark the agent's cell in grey.
        canvas[(row-1)*200:row*200, (col-1)*200:col*200] = 0.5
    # Treasure cell (bottom-right): black while searching, 0.1 when reached.
    canvas[400:600,400:600] = 0.1 if finished else 0

    # Grid lines every 200 pixels in both directions.
    for k in range(4):
        offset = 200*k
        cv.line(canvas,(0,offset),(600,offset),(0,0,0),1)
        cv.line(canvas,(offset,0),(offset,600),(0,0,0),1)

    if finished:
        cv.putText(canvas,'getting treasure and taking %d steps'%step_nums,(100,300),cv.FONT_HERSHEY_SIMPLEX, 0.7,(0,0,0), 1, cv.LINE_AA)
    cv.imshow('treasure',canvas)
    if finished:
        # Keep the final frame visible for two seconds, then close the window.
        cv.waitKey(2000)
        cv.destroyAllWindows()
    else:
        cv.waitKey(100)
# Legal moves per state, as (action, next_state, reward) tuples.
# The order inside each tuple matters: it is the order used for the random
# pick and for breaking ties between equal Q-values.
_TRANSITIONS = {
    1: (('down', 4, 0), ('right', 2, 0)),
    2: (('down', 5, 0), ('left', 1, 0), ('right', 3, 0)),
    3: (('down', 6, 0), ('left', 2, 0)),
    4: (('up', 1, 0), ('down', 7, 0), ('right', 5, 0)),
    5: (('up', 2, 0), ('down', 8, 0), ('left', 4, 0), ('right', 6, 0)),
    6: (('up', 3, 0), ('down', 'end', 1), ('left', 5, 0)),
    7: (('up', 4, 0), ('right', 8, 0)),
    8: (('up', 5, 0), ('left', 7, 0), ('right', 'end', 1)),
}


def choose_action(s,q_table):
    '''
    Pick the agent's next move from state s (epsilon-greedy).

    A random legal move is taken when a uniform draw falls below the
    module-level `epslion`, or when every Q-value in the row of s is still
    zero. Otherwise the legal move with the largest Q-value is taken;
    ties go to the move listed first for that state.

    Parameters
    ----------
    s : int
        Position id (1 to 8).
    q_table : DataFrame
        Q-table whose row at position s-1 holds the values for state s and
        whose columns are the action names.

    Returns
    -------
    tuple
        (action, next_state, reward): the action name ('up', 'down',
        'left' or 'right'), the resulting state (an int, or 'end' when the
        treasure is reached) and the reward (1 on reaching 'end', else 0).

    Raises
    ------
    ValueError
        If s is not one of the known states.
    '''
    moves = _TRANSITIONS.get(s)
    if moves is None:
        # Fail loudly here rather than returning None and breaking the
        # caller's tuple unpacking.
        raise ValueError('unknown state: %r' % (s,))

    q_row = q_table.iloc[s - 1].to_dict()
    draw = np.random.rand()
    if draw < epslion or all(v == 0 for v in q_row.values()):
        # Explore: uniform choice among the legal moves.
        return moves[rd.randint(0, len(moves) - 1)]
    # Exploit: best legal move according to the current Q-values.
    return max(moves, key=lambda move: q_row[move[0]])
def learn():
    '''
    Train the agent with tabular Q-learning and return the learned table.

    Runs `epochs` episodes. Every episode starts in state 1 and keeps
    asking choose_action for a move until the state 'end' is reached,
    redrawing the grid with Environment after each step.

    Update rule for a step (s, a) -> (s_, r):

        Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gammma * max Q(s_, .))

    For the terminal transition (s_ == 'end') there is no next state to
    bootstrap from, so the target is just r.

    Returns
    -------
    q_table : DataFrame
        The Q-table after the last episode.
    '''
    q_table = Make_Q_table(nums_states,nums_action)
    for _ in range(epochs):
        s = 1
        step_nums = 0
        Environment(s,step_nums)
        while True:
            a, s_, r = choose_action(s,q_table)
            done = s_ == 'end'
            if done:
                target = r
            else:
                # Discount the bootstrapped value; without gammma every
                # path to the treasure would look equally good regardless
                # of its length.
                target = r + gammma * q_table.iloc[s_ - 1].max()
            q_table.loc[s,a] = (1-lr) * q_table.loc[s,a] + lr * target
            step_nums += 1
            s = s_
            # Also drawn for 'end', which shows the final step count.
            Environment(s,step_nums)
            if done:
                break
    return q_table
# Script entry point: train only when the file is executed directly.
if __name__ == '__main__':
    # The result is bound at module level so the learned table stays
    # available after the run.
    q_table = learn()