The reward signal is propagated back through the actions that were chosen, which determines whether the probability of selecting each action should be increased or decreased.
Because Policy Gradients updates per episode rather than per step, the parameters and rewards are only processed after an episode has finished.
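For context, `all_act` used in the snippet below is the raw (pre-softmax) output of the policy network. A minimal sketch of the placeholders and that output, assuming TensorFlow 1.x and a single hidden layer; `self.n_features`, the hidden-layer width, and the tensor names are illustrative:

import numpy as np
import tensorflow as tf

# placeholders: one row per step of the episode
self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name='observations')
self.tf_acts = tf.placeholder(tf.int32, [None, ], name='actions_num')
self.tf_vt = tf.placeholder(tf.float32, [None, ], name='actions_value')  # discounted returns

# small fully connected policy network; all_act are the raw action scores (logits)
layer = tf.layers.dense(self.tf_obs, 10, activation=tf.nn.tanh, name='fc1')
all_act = tf.layers.dense(layer, self.n_actions, activation=None, name='fc2')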
# convert the network output (logits) into action probabilities
self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')

# negative log-probability of the action actually taken at each step
neg_log_prob = tf.reduce_sum(
    -tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)

# weight each step by its return: reward guided loss
loss = tf.reduce_mean(neg_log_prob * self.tf_vt)

self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
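TensorFlow's optimizers can only minimize, so maximizing the return-weighted log-probability is done by minimizing its negative. The same per-step negative log-probability can also be computed with the built-in cross-entropy op; a sketch assuming `all_act` holds the logits from the network above:

# equivalent to the explicit -log(prob) * one_hot selection
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=all_act, labels=self.tf_acts)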
def choose_action(self, observation):
    # run the policy network to get the probability of every action
    prob_weights = self.sess.run(self.all_act_prob,
                                 feed_dict={self.tf_obs: observation[np.newaxis, :]})
    # sample an action w.r.t. the action probabilities
    action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())
    return action
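A minimal usage sketch for action selection, assuming a classic Gym-style environment `env` and an agent instance `RL` (both names are illustrative):

observation = env.reset()
action = RL.choose_action(observation)               # sampled, so exploration is built in
observation_, reward, done, info = env.step(action)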
def store_transition(self, s, a, r):
    # buffer one step (observation, action, reward) until the episode ends
    self.ep_obs.append(s)
    self.ep_as.append(a)
    self.ep_rs.append(r)
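To complete the episode-update picture from the opening note: once an episode ends, the buffered rewards are turned into discounted, normalized returns and fed as `tf_vt` in a single training step. A sketch of such a `learn` method, assuming a discount factor `self.gamma` (a hyperparameter not shown in the snippets above):

def learn(self):
    # compute the discounted return for each step of the episode
    discounted_ep_rs = np.zeros_like(self.ep_rs, dtype=np.float32)
    running_add = 0.0
    for t in reversed(range(len(self.ep_rs))):
        running_add = running_add * self.gamma + self.ep_rs[t]
        discounted_ep_rs[t] = running_add
    # normalize to reduce the variance of the gradient estimate
    discounted_ep_rs -= np.mean(discounted_ep_rs)
    discounted_ep_rs /= np.std(discounted_ep_rs)

    # one gradient step over the whole episode, then clear the episode buffers
    self.sess.run(self.train_op, feed_dict={
        self.tf_obs: np.vstack(self.ep_obs),
        self.tf_acts: np.array(self.ep_as),
        self.tf_vt: discounted_ep_rs,
    })
    self.ep_obs, self.ep_as, self.ep_rs = [], [], []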