"Multi-Agent Learning: Reinforcement Learning Methods" - Code Implementations

Table of Contents

  • Nash-QLearning
    • Agent
    • Creating a Matrix Game Environment
    • Policy
    • Training
  • WoLF-PHC (Policy Hill-Climbing Algorithm)
    • Agent
    • Creating a Matrix Game Environment
    • Training
  • Minimax-QLearning

Nash-QLearning

Paper: Nash Q-learning for general-sum stochastic games
Link: http://www.jmlr.org/papers/volume4/hu03a/hu03a.pdf
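
The NashQLearner below implements the update rule from the paper: each agent maintains Q-tables for itself and its opponent and bootstraps on the Nash value of the stage game. In this single-state setting (every state is "nonstate"), the update in compute_q reduces to

    Q_i(s, a^1, a^2) \leftarrow Q_i(s, a^1, a^2) + \alpha \left[ r_i + \gamma \, \mathrm{NashQ}_i(s) - Q_i(s, a^1, a^2) \right]

with the Nash value

    \mathrm{NashQ}_i(s) = \sum_{a^1, a^2} \pi^1(a^1)\, \pi^2(a^2)\, Q_i(s, a^1, a^2),

where (\pi^1, \pi^2) is a Nash equilibrium of the bimatrix game (Q_1(s,\cdot,\cdot), Q_2(s,\cdot,\cdot)), computed with nashpy in compute_pi.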

Agent

import numpy as np
import nashpy

class NashQLearner():
    def __init__(self,
                 alpha=0.1,
                 policy=None,
                 gamma=0.99,
                 ini_state="nonstate",
                 actions=None):

        self.alpha = alpha
        self.gamma = gamma
        self.policy = policy
        self.actions = actions
        self.state = ini_state

        # q values (my and opponent)
        self.q, self.q_o = {}, {}
        self.q[ini_state] = {}
        self.q_o[ini_state] = {}

        # nash q value
        self.nashq = {}
        self.nashq[ini_state] = 0

        # pi (my and opponent)
        self.pi, self.pi_o = {}, {}
        self.pi[ini_state] = np.repeat(1.0/len(self.actions), len(self.actions))
        self.pi_o[ini_state] = np.repeat(1.0/len(self.actions), len(self.actions))

        self.previous_action = None
        self.reward_history = []
        self.pi_history = []

    def act(self, training=True):
        if training:
            action_id = self.policy.select_action(self.pi[self.state])
            action = self.actions[action_id]
            self.previous_action = action
        else:
            action_id = self.policy.select_greedy_action(self.pi[self.state])
            action = self.actions[action_id]

        return action

    def observe(self, state="nonstate", reward=None, reward_o=None, opponent_action=None, is_learn=True):
        """
            observe next state and learn 
        """
        if is_learn:
            self.check_new_state(state) # if the state is new state, extend q table
            self.learn(state, reward, reward_o, opponent_action)

    def learn(self, state, reward, reward_o, opponent_action):
        self.reward_history.append(reward)
        self.q[state][(self.previous_action, opponent_action)] = self.compute_q(state, reward, opponent_action, self.q)
        self.q_o[state][(self.previous_action, opponent_action)] = self.compute_q(state, reward_o, opponent_action, self.q_o)

        self.pi[state], self.pi_o[state] = self.compute_pi(state)
        self.nashq[state] = self.compute_nashq(state)

        self.pi_history.append(self.pi[state][0])

    def compute_q(self, state, reward, opponent_action, q):
        if (self.previous_action, opponent_action) not in q[state].keys():
            q[state][(self.previous_action, opponent_action)] = 0.0
        q_old = q[state][(self.previous_action, opponent_action)]
        updated_q = q_old + (self.alpha * (reward + self.gamma*self.nashq[state] - q_old))

        return updated_q

    def compute_nashq(self, state):
        """
            compute nash q value 
        """
        nashq = 0
        for action1 in self.actions:
            for action2 in self.actions:
                nashq += self.pi[state][action1]*self.pi_o[state][action2] * \
                    self.q[state][(action1, action2)]

        return nashq

    def compute_pi(self, state):
        """
            compute pi (nash)
        """
        q_1, q_2 = [], []
        for action1 in self.actions:
            row_q_1, row_q_2 = [], []
            for action2 in self.actions:
                joint_action = (action1, action2)
                row_q_1.append(self.q[state][joint_action])
                row_q_2.append(self.q_o[state][joint_action])
            q_1.append(row_q_1)
            q_2.append(row_q_2)

        game = nashpy.Game(q_1, q_2)
        equilibria = game.support_enumeration()
        pi = []
        for eq in equilibria:
            pi.append(eq)

        return pi[0][0], pi[0][1]

    def check_new_state(self, state):
        """
            if the state is new, extend the q tables and policies
        """
        if state not in self.q.keys():
            self.q[state] = {}
            self.q_o[state] = {}
        if state not in self.pi.keys():
            self.pi[state] = np.repeat(
                1.0/len(self.actions), len(self.actions))
            self.pi_o[state] = np.repeat(
                1.0/len(self.actions), len(self.actions))
            self.nashq[state] = 0
        for action1 in self.actions:
            for action2 in self.actions:
                if (action1, action2) not in self.q[state].keys():
                    self.q[state][(action1, action2)] = np.random.random()
                    self.q_o[state][(action1, action2)] = np.random.random()
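
compute_pi relies on nashpy's support enumeration to find an equilibrium of the bimatrix game defined by the two Q-tables. As a quick standalone check (assuming nashpy is installed), this is what that call returns for the matching-pennies payoffs used below:

import numpy as np
import nashpy

# matching pennies: player 1 wins when the actions match, player 2 when they differ
A = np.array([[1, -1], [-1, 1]])   # player 1's payoff matrix
B = np.array([[-1, 1], [1, -1]])   # player 2's payoff matrix

game = nashpy.Game(A, B)
for eq in game.support_enumeration():
    print(eq)  # (array([0.5, 0.5]), array([0.5, 0.5])): the unique mixed equilibrium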

Creating a Matrix Game Environment

class MatrixGame():
    def __init__(self):
        self.reward_matrix = self._create_reward_table()

    def step(self, action1, action2):
        r1 = self.reward_matrix[0][action1][action2]
        r2 = self.reward_matrix[1][action1][action2]

        return None, r1, r2

    def _create_reward_table(self):
        reward_matrix = [
                            [[1, -1], [-1, 1]],
                            [[-1, 1], [1, -1]]
                        ]

        return reward_matrix
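
reward_matrix[0] holds player 1's payoffs and reward_matrix[1] holds player 2's, both indexed by (action1, action2); together they encode matching pennies. A quick sanity check:

game = MatrixGame()
print(game.step(0, 0))  # (None, 1, -1): actions match, player 1 wins
print(game.step(0, 1))  # (None, -1, 1): actions differ, player 2 wins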

Policy

class EpsGreedyQPolicy():
    def __init__(self, epsilon=.1, decay_rate=1):
        self.epsilon = epsilon
        self.decay_rate = decay_rate

    def select_action(self, q_values):
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]

        if np.random.uniform() < self.epsilon:
            action = np.random.randint(0, nb_actions)
        else:
            action = np.argmax(q_values)

        return action

    def select_greedy_action(self, q_values):
        assert q_values.ndim == 1
        action = np.argmax(q_values)

        return action
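
The policy returns the index of the largest entry with probability 1 - epsilon and a uniformly random index otherwise; in this implementation it is applied to the agent's current mixed strategy pi rather than to raw Q-values. A short usage sketch:

policy = EpsGreedyQPolicy(epsilon=0.1)
pi = np.array([0.7, 0.3])
print(policy.select_action(pi))         # usually 0; a random action with probability 0.1
print(policy.select_greedy_action(pi))  # always 0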

Training

import matplotlib.pyplot as plt

if __name__ == '__main__':
    nb_episode = 1000

    agent1 = NashQLearner(alpha=0.1, policy=EpsGreedyQPolicy(), actions=np.arange(2))
    agent2 = NashQLearner(alpha=0.1, policy=EpsGreedyQPolicy(), actions=np.arange(2))

    game = MatrixGame()
    for episode in range(nb_episode):
        action1 = agent1.act()
        action2 = agent2.act()

        _, r1, r2 = game.step(action1, action2)

        agent1.observe(reward=r1, reward_o=r2, opponent_action=agent2.previous_action)
        agent2.observe(reward=r2, reward_o=r1, opponent_action=agent1.previous_action)

    plt.plot(np.arange(len(agent1.pi_history)), agent1.pi_history, label="agent1's pi(0)")
    plt.plot(np.arange(len(agent2.pi_history)), agent2.pi_history, label="agent2's pi(0)")
    plt.xlabel("episode")
    plt.ylabel("pi(0)")
    plt.legend()
    plt.savefig("Nash-Q.jpg")
    plt.show()

WoLF-PHC (Policy Hill-Climbing Algorithm)

Paper: Rational and Convergent Learning in Stochastic Games
Link: http://www.cs.cmu.edu/~mmv/papers/01ijcai-mike.pdf
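
WoLF-PHC hill-climbs on the policy: the probability of the greedy action is increased by a step delta and the other actions are decreased to compensate, and delta switches between a small value when "winning" and a larger one when "losing" (Win or Learn Fast). The stateless form implemented below is

    \pi(a) \leftarrow \pi(a) +
    \begin{cases}
        \delta & \text{if } a = \arg\max_{a'} Q(a') \\
        -\dfrac{\delta}{|A| - 1} & \text{otherwise}
    \end{cases}
    \qquad
    \delta =
    \begin{cases}
        \delta_{\text{win}} & \text{if } \sum_a \pi(a) Q(a) > \sum_a \bar{\pi}(a) Q(a) \\
        \delta_{\text{lose}} & \text{otherwise}
    \end{cases}

where \bar{\pi} is the running average policy and \delta_{\text{lose}} > \delta_{\text{win}} (high_delta and low_delta in the code).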

Agent

import numpy as np

class WoLFAgent():

    def __init__(self, alpha=0.1, actions=None, high_delta=0.004, low_delta=0.002):
        self.alpha = alpha
        self.actions = actions
        self.last_action_id = None
        self.q_values = self._init_q_values()
        self.pi = [(1.0/len(actions)) for idx in range(len(actions))]
        self.pi_average = [(1.0/len(actions)) for idx in range(len(actions))]
        self.high_delta = high_delta  # step size when losing (learn fast)
        self.low_delta = low_delta    # step size when winning (learn slowly)

        self.pi_history = [self.pi[0]]
        self.reward_history = []
        self.counter = 0

    def _init_q_values(self):
        return np.repeat(0.0, len(self.actions))

    def act(self, q_values=None):
        action_id = np.random.choice(np.arange(len(self.pi)), p=self.pi)
        self.last_action_id = action_id
        action = self.actions[action_id]
        return action

    def observe(self, reward):
        self.reward_history.append(reward)
        self.q_values[self.last_action_id] = ((1.0 - self.alpha) * self.q_values[self.last_action_id]) + (self.alpha * reward)
        self._update_pi_average()
        self._update_pi()

    def _update_pi_average(self):
        self.counter += 1
        for aidx, _ in enumerate(self.pi):
            self.pi_average[aidx] = self.pi_average[aidx] + (1/self.counter)*(self.pi[aidx]-self.pi_average[aidx])
            if self.pi_average[aidx] > 1: self.pi_average[aidx] = 1
            if self.pi_average[aidx] < 0: self.pi_average[aidx] = 0

    def _update_pi(self):
        delta = self.decide_delta()
        max_action_id = np.argmax(self.q_values)
        for aidx, _ in enumerate(self.pi):
            if aidx == max_action_id:
                update_amount = delta
            else:
                update_amount = ((-delta)/(len(self.actions)-1))
            self.pi[aidx] = self.pi[aidx] + update_amount
            if self.pi[aidx] > 1: self.pi[aidx] = 1
            if self.pi[aidx] < 0: self.pi[aidx] = 0
        self.pi_history.append(self.pi[0])

    def decide_delta(self):
        """
            decide whether the agent is winning or losing
        """
        expected_value = 0
        expected_value_average = 0
        for aidx, _ in enumerate(self.pi):
            expected_value += self.pi[aidx]*self.q_values[aidx]
            expected_value_average += self.pi_average[aidx]*self.q_values[aidx]

        if expected_value > expected_value_average:  # winning: learn slowly
            return self.low_delta
        else:   # losing: learn fast
            return self.high_delta
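
A small numeric illustration of decide_delta (the values are made up for the example): when the current policy does better against the learned Q-values than the average policy, the agent is "winning" and uses the smaller step.

agent = WoLFAgent(actions=np.arange(2), high_delta=0.004, low_delta=0.002)
agent.q_values = np.array([1.0, -1.0])
agent.pi = [0.6, 0.4]          # expected value: 0.6*1 + 0.4*(-1) = 0.2
agent.pi_average = [0.5, 0.5]  # expected value: 0.0
print(agent.decide_delta())    # 0.002 (winning, so learn slowly)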

Creating a Matrix Game Environment

class MatrixGame():
    def __init__(self):
        self.reward_matrix = self._create_reward_table()

    def step(self, action1, action2):
        r1, r2 = self.reward_matrix[action1][action2]

        return None, r1, r2

    def _create_reward_table(self):
        reward_matrix = [
                            [[1, -1], [-1, 1]],
                            [[-1, 1], [1, -1]]
                        ]

        return reward_matrix
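
Unlike the Nash-Q environment above, this table is indexed as reward_matrix[action1][action2] = (r1, r2); the payoffs are again matching pennies:

game = MatrixGame()
print(game.step(0, 0))  # (None, 1, -1)
print(game.step(1, 0))  # (None, -1, 1)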

Training

import matplotlib.pyplot as plt

if __name__ == '__main__':
    nb_episode = 1000

    actions = np.arange(2)
    agent1 = WoLFAgent(alpha=0.1, actions=actions, high_delta=0.0004, low_delta=0.0002) 
    agent2 = WoLFAgent(alpha=0.1, actions=actions, high_delta=0.0004, low_delta=0.0002)

    game = MatrixGame()
    for episode in range(nb_episode):
        action1 = agent1.act()
        action2 = agent2.act()

        _, r1, r2 = game.step(action1, action2)

        agent1.observe(reward=r1)
        agent2.observe(reward=r2)

    print(agent1.pi)
    print(agent2.pi)
    plt.plot(np.arange(len(agent1.pi_history)),agent1.pi_history, label="agent1's pi(0)")
    plt.plot(np.arange(len(agent2.pi_history)),agent2.pi_history, label="agent2's pi(0)")

    plt.ylim(0, 1)
    plt.xlabel("episode")
    plt.ylabel("pi(0)")
    plt.legend()
    plt.savefig("WoLF-PHC.jpg")
    plt.show()

Minimax-QLearning

Paper: Markov games as a framework for multi-agent reinforcement learning
Link: https://www2.cs.duke.edu/courses/spring07/cps296.3/littman94markov.pdf
TODO
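
Until the TODO is filled in, here is a minimal sketch (my own addition, not from the original post) of the core of minimax-Q: the state value is the maximin value of the stage game defined by the agent's Q-table, which can be obtained with a linear program. The helper below uses scipy.optimize.linprog; the function name and setup are assumptions for illustration.

import numpy as np
from scipy.optimize import linprog

def minimax_value(q):
    """q[a, o]: the agent's Q-values for its action a against opponent action o.
    Returns (value, pi) solving max_pi min_o sum_a pi[a] * q[a, o]."""
    n_actions, n_opponent = q.shape
    # variables x = [pi_0, ..., pi_{n-1}, v]; linprog minimizes, so minimize -v
    c = np.zeros(n_actions + 1)
    c[-1] = -1.0
    # for every opponent action o: v - sum_a q[a, o] * pi[a] <= 0
    A_ub = np.hstack([-q.T, np.ones((n_opponent, 1))])
    b_ub = np.zeros(n_opponent)
    # the policy must sum to one
    A_eq = np.append(np.ones(n_actions), 0.0).reshape(1, -1)
    b_eq = np.array([1.0])
    bounds = [(0.0, 1.0)] * n_actions + [(None, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[-1], res.x[:-1]

# matching pennies from player 1's point of view: value 0, pi = (0.5, 0.5)
v, pi = minimax_value(np.array([[1.0, -1.0], [-1.0, 1.0]]))
print(v, pi)

The full update in the paper is Q(s, a, o) <- (1 - alpha) Q(s, a, o) + alpha (r + gamma V(s')), with V(s') given by this maximin computation.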
