import cv2
import sys
import wrapped_flappy_bird as game
from BrainDQN_Nature import BrainDQN
import numpy as np
# preprocess raw image to 80*80 gray image
def preprocess(observation):
observation = cv2.cvtColor(cv2.resize(observation, (80, 80)), cv2.COLOR_BGR2GRAY)
ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY)
return np.reshape(observation,(80,80,1))
def playFlappyBird():
# Step 1: init BrainDQN
actions = 2
brain = BrainDQN(actions)
# Step 2: init Flappy Bird Game
flappyBird = game.GameState()
# Step 3: play game
# Step 3.1: obtain init state
action0 = np.array([1,0]) # do nothing
observation0, reward0, terminal = flappyBird.frame_step(action0)
observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
ret, observation0 = cv2.threshold(observation0,1,255,cv2.THRESH_BINARY)
brain.setInitState(observation0)
# Step 3.2: run the game
while 1!= 0:
action = brain.getAction()
nextObservation,reward,terminal = flappyBird.frame_step(action)
nextObservation = preprocess(nextObservation)
brain.setPerception(nextObservation,action,reward,terminal)
def main():
playFlappyBird()
if __name__ == '__main__':
main()
import numpy as np
import sys
import random
import pygame
import flappy_bird_utils
import pygame.surfarray as surfarray
from pygame.locals import *
from itertools import cycle
FPS = 30
SCREENWIDTH = 288
SCREENHEIGHT = 512
pygame.init()
FPSCLOCK = pygame.time.Clock()
SCREEN = pygame.display.set_mode((SCREENWIDTH, SCREENHEIGHT))
pygame.display.set_caption('Flappy Bird')
IMAGES, SOUNDS, HITMASKS = flappy_bird_utils.load()
PIPEGAPSIZE = 100 # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79
PLAYER_WIDTH = IMAGES['player'][0].get_width()
PLAYER_HEIGHT = IMAGES['player'][0].get_height()
PIPE_WIDTH = IMAGES['pipe'][0].get_width()
PIPE_HEIGHT = IMAGES['pipe'][0].get_height()
BACKGROUND_WIDTH = IMAGES['background'].get_width()
PLAYER_INDEX_GEN = cycle([0, 1, 2, 1])
class GameState:
def __init__(self):
self.score = self.playerIndex = self.loopIter = 0
self.playerx = int(SCREENWIDTH * 0.2)
self.playery = int((SCREENHEIGHT - PLAYER_HEIGHT) / 2)
self.basex = 0
self.baseShift = IMAGES['base'].get_width() - BACKGROUND_WIDTH
newPipe1 = getRandomPipe()
newPipe2 = getRandomPipe()
self.upperPipes = [
{'x': SCREENWIDTH, 'y': newPipe1[0]['y']},
{'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[0]['y']},
]
self.lowerPipes = [
{'x': SCREENWIDTH, 'y': newPipe1[1]['y']},
{'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[1]['y']},
]
# player velocity, max velocity, downward accleration, accleration on flap
self.pipeVelX = -4
self.playerVelY = 0 # player's velocity along Y, default same as playerFlapped
self.playerMaxVelY = 10 # max vel along Y, max descend speed
self.playerMinVelY = -8 # min vel along Y, max ascend speed
self.playerAccY = 1 # players downward accleration
self.playerFlapAcc = -7 # players speed on flapping
self.playerFlapped = False # True when player flaps
def frame_step(self, input_actions):
pygame.event.pump()
reward = 0.1
terminal = False
if sum(input_actions) != 1:
raise ValueError('Multiple input actions!')
# input_actions[0] == 1: do nothing
# input_actions[1] == 1: flap the bird
if input_actions[1] == 1:
if self.playery > -2 * PLAYER_HEIGHT:
self.playerVelY = self.playerFlapAcc
self.playerFlapped = True
#SOUNDS['wing'].play()
# check for score
playerMidPos = self.playerx + PLAYER_WIDTH / 2
for pipe in self.upperPipes:
pipeMidPos = pipe['x'] + PIPE_WIDTH / 2
if pipeMidPos <= playerMidPos < pipeMidPos + 4:
self.score += 1
#SOUNDS['point'].play()
reward = 1
# playerIndex basex change
if (self.loopIter + 1) % 3 == 0:
self.playerIndex = next(PLAYER_INDEX_GEN)
self.loopIter = (self.loopIter + 1) % 30
self.basex = -((-self.basex + 100) % self.baseShift)
# player's movement
if self.playerVelY < self.playerMaxVelY and not self.playerFlapped:
self.playerVelY += self.playerAccY
if self.playerFlapped:
self.playerFlapped = False
self.playery += min(self.playerVelY, BASEY - self.playery - PLAYER_HEIGHT)
if self.playery < 0:
self.playery = 0
# move pipes to left
for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
uPipe['x'] += self.pipeVelX
lPipe['x'] += self.pipeVelX
# add new pipe when first pipe is about to touch left of screen
if 0 < self.upperPipes[0]['x'] < 5:
newPipe = getRandomPipe()
self.upperPipes.append(newPipe[0])
self.lowerPipes.append(newPipe[1])
# remove first pipe if its out of the screen
if self.upperPipes[0]['x'] < -PIPE_WIDTH:
self.upperPipes.pop(0)
self.lowerPipes.pop(0)
# check if crash here
isCrash= checkCrash({'x': self.playerx, 'y': self.playery,
'index': self.playerIndex},
self.upperPipes, self.lowerPipes)
if isCrash:
#SOUNDS['hit'].play()
#SOUNDS['die'].play()
terminal = True
self.__init__()
reward = -1
# draw sprites
SCREEN.blit(IMAGES['background'], (0,0))
for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
SCREEN.blit(IMAGES['pipe'][0], (uPipe['x'], uPipe['y']))
SCREEN.blit(IMAGES['pipe'][1], (lPipe['x'], lPipe['y']))
SCREEN.blit(IMAGES['base'], (self.basex, BASEY))
# print score so player overlaps the score
# showScore(self.score)
SCREEN.blit(IMAGES['player'][self.playerIndex],
(self.playerx, self.playery))
image_data = pygame.surfarray.array3d(pygame.display.get_surface())
pygame.display.update()
FPSCLOCK.tick(FPS)
#print self.upperPipes[0]['y'] + PIPE_HEIGHT - int(BASEY * 0.2)
return image_data, reward, terminal
def getRandomPipe():
"""returns a randomly generated pipe"""
# y of gap between upper and lower pipe
gapYs = [20, 30, 40, 50, 60, 70, 80, 90]
index = random.randint(0, len(gapYs)-1)
gapY = gapYs[index]
gapY += int(BASEY * 0.2)
pipeX = SCREENWIDTH + 10
return [
{'x': pipeX, 'y': gapY - PIPE_HEIGHT}, # upper pipe
{'x': pipeX, 'y': gapY + PIPEGAPSIZE}, # lower pipe
]
def showScore(score):
"""displays score in center of screen"""
scoreDigits = [int(x) for x in list(str(score))]
totalWidth = 0 # total width of all numbers to be printed
for digit in scoreDigits:
totalWidth += IMAGES['numbers'][digit].get_width()
Xoffset = (SCREENWIDTH - totalWidth) / 2
for digit in scoreDigits:
SCREEN.blit(IMAGES['numbers'][digit], (Xoffset, SCREENHEIGHT * 0.1))
Xoffset += IMAGES['numbers'][digit].get_width()
def checkCrash(player, upperPipes, lowerPipes):
"""returns True if player collders with base or pipes."""
pi = player['index']
player['w'] = IMAGES['player'][0].get_width()
player['h'] = IMAGES['player'][0].get_height()
# if player crashes into ground
if player['y'] + player['h'] >= BASEY - 1:
return True
else:
playerRect = pygame.Rect(player['x'], player['y'],
player['w'], player['h'])
for uPipe, lPipe in zip(upperPipes, lowerPipes):
# upper and lower pipe rects
uPipeRect = pygame.Rect(uPipe['x'], uPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
lPipeRect = pygame.Rect(lPipe['x'], lPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
# player and upper/lower pipe hitmasks
pHitMask = HITMASKS['player'][pi]
uHitmask = HITMASKS['pipe'][0]
lHitmask = HITMASKS['pipe'][1]
# if bird collided with upipe or lpipe
uCollide = pixelCollision(playerRect, uPipeRect, pHitMask, uHitmask)
lCollide = pixelCollision(playerRect, lPipeRect, pHitMask, lHitmask)
if uCollide or lCollide:
return True
return False
def pixelCollision(rect1, rect2, hitmask1, hitmask2):
"""Checks if two objects collide and not just their rects"""
rect = rect1.clip(rect2)
if rect.width == 0 or rect.height == 0:
return False
x1, y1 = rect.x - rect1.x, rect.y - rect1.y
x2, y2 = rect.x - rect2.x, rect.y - rect2.y
for x in range(rect.width):
for y in range(rect.height):
if hitmask1[x1+x][y1+y] and hitmask2[x2+x][y2+y]:
return True
return False
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import random
from collections import deque
# Hyper Parameters:
FRAME_PER_ACTION = 1
GAMMA = 0.99 # decay rate of past observations
OBSERVE = 100. # timesteps to observe before training
EXPLORE = 200000. # frames over which to anneal epsilon
FINAL_EPSILON = 0#0.001 # final value of epsilon
INITIAL_EPSILON = 0#0.01 # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH_SIZE = 32 # size of minibatch
UPDATE_TIME = 100
try:
tf.mul
except:
# For new version of tensorflow
# tf.mul has been removed in new version of tensorflow
# Using tf.multiply to replace tf.mul
tf.mul = tf.multiply
class BrainDQN:
def __init__(self,actions):
# init replay memory
self.replayMemory = deque()
# init some parameters
self.timeStep = 0
self.epsilon = INITIAL_EPSILON
self.actions = actions
# init Q network
self.stateInput,self.QValue,self.W_conv1,self.b_conv1,self.W_conv2,self.b_conv2,self.W_conv3,self.b_conv3,self.W_fc1,self.b_fc1,self.W_fc2,self.b_fc2 = self.createQNetwork()
# init Target Q Network
self.stateInputT,self.QValueT,self.W_conv1T,self.b_conv1T,self.W_conv2T,self.b_conv2T,self.W_conv3T,self.b_conv3T,self.W_fc1T,self.b_fc1T,self.W_fc2T,self.b_fc2T = self.createQNetwork()
self.copyTargetQNetworkOperation = [self.W_conv1T.assign(self.W_conv1),self.b_conv1T.assign(self.b_conv1),self.W_conv2T.assign(self.W_conv2),self.b_conv2T.assign(self.b_conv2),self.W_conv3T.assign(self.W_conv3),self.b_conv3T.assign(self.b_conv3),self.W_fc1T.assign(self.W_fc1),self.b_fc1T.assign(self.b_fc1),self.W_fc2T.assign(self.W_fc2),self.b_fc2T.assign(self.b_fc2)]
self.createTrainingMethod()
# saving and loading networks
self.saver = tf.train.Saver()
self.session = tf.InteractiveSession()
self.session.run(tf.initialize_all_variables())
checkpoint = tf.train.get_checkpoint_state("saved_networks")
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.session, checkpoint.model_checkpoint_path)
print ("Successfully loaded:", checkpoint.model_checkpoint_path)
else:
print ("Could not find old network weights")
def createQNetwork(self):
# network weights
W_conv1 = self.weight_variable([8,8,4,32])
b_conv1 = self.bias_variable([32])
W_conv2 = self.weight_variable([4,4,32,64])
b_conv2 = self.bias_variable([64])
W_conv3 = self.weight_variable([3,3,64,64])
b_conv3 = self.bias_variable([64])
W_fc1 = self.weight_variable([1600,512])
b_fc1 = self.bias_variable([512])
W_fc2 = self.weight_variable([512,self.actions])
b_fc2 = self.bias_variable([self.actions])
# input layer
stateInput = tf.placeholder("float",[None,80,80,4])
# hidden layers
h_conv1 = tf.nn.relu(self.conv2d(stateInput,W_conv1,4) + b_conv1)
h_pool1 = self.max_pool_2x2(h_conv1)
h_conv2 = tf.nn.relu(self.conv2d(h_pool1,W_conv2,2) + b_conv2)
h_conv3 = tf.nn.relu(self.conv2d(h_conv2,W_conv3,1) + b_conv3)
h_conv3_flat = tf.reshape(h_conv3,[-1,1600])
h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat,W_fc1) + b_fc1)
# Q Value layer
QValue = tf.matmul(h_fc1,W_fc2) + b_fc2
return stateInput,QValue,W_conv1,b_conv1,W_conv2,b_conv2,W_conv3,b_conv3,W_fc1,b_fc1,W_fc2,b_fc2
def copyTargetQNetwork(self):
self.session.run(self.copyTargetQNetworkOperation)
def createTrainingMethod(self):
self.actionInput = tf.placeholder("float",[None,self.actions])
self.yInput = tf.placeholder("float", [None])
Q_Action = tf.reduce_sum(tf.mul(self.QValue, self.actionInput), reduction_indices = 1)
self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost)
def trainQNetwork(self):
# Step 1: obtain random minibatch from replay memory
minibatch = random.sample(self.replayMemory,BATCH_SIZE)
state_batch = [data[0] for data in minibatch]
action_batch = [data[1] for data in minibatch]
reward_batch = [data[2] for data in minibatch]
nextState_batch = [data[3] for data in minibatch]
# Step 2: calculate y
y_batch = []
QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT:nextState_batch})
for i in range(0,BATCH_SIZE):
terminal = minibatch[i][4]
if terminal:
y_batch.append(reward_batch[i])
else:
y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i]))
self.trainStep.run(feed_dict={
self.yInput : y_batch,
self.actionInput : action_batch,
self.stateInput : state_batch
})
# save network every 100000 iteration
if self.timeStep % 10000 == 0:
self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step = self.timeStep)
if self.timeStep % UPDATE_TIME == 0:
self.copyTargetQNetwork()
def setPerception(self,nextObservation,action,reward,terminal):
#newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2)
newState = np.append(self.currentState[:,:,1:],nextObservation,axis = 2)
self.replayMemory.append((self.currentState,action,reward,newState,terminal))
if len(self.replayMemory) > REPLAY_MEMORY:
self.replayMemory.popleft()
if self.timeStep > OBSERVE:
# Train the network
self.trainQNetwork()
# print info
state = ""
if self.timeStep <= OBSERVE:
state = "observe"
elif self.timeStep > OBSERVE and self.timeStep <= OBSERVE + EXPLORE:
state = "explore"
else:
state = "train"
print ("TIMESTEP", self.timeStep, "/ STATE", state, \
"/ EPSILON", self.epsilon)
self.currentState = newState
self.timeStep += 1
def getAction(self):
QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]})[0]
action = np.zeros(self.actions)
action_index = 0
if self.timeStep % FRAME_PER_ACTION == 0:
if random.random() <= self.epsilon:
action_index = random.randrange(self.actions)
action[action_index] = 1
else:
action_index = np.argmax(QValue)
action[action_index] = 1
else:
action[0] = 1 # do nothing
# change episilon
if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORE
return action
def setInitState(self,observation):
self.currentState = np.stack((observation, observation, observation, observation), axis = 2)
def weight_variable(self,shape):
initial = tf.random.truncated_normal(shape, stddev = 0.01)
return tf.Variable(initial)
def bias_variable(self,shape):
initial = tf.constant(0.01, shape = shape)
return tf.Variable(initial)
def conv2d(self,x, W, stride):
return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")
def max_pool_2x2(self,x):
return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")
十年磨剑,与君共勉!
更多代码:gitee主页:https://gitee.com/GZHzzz
博客主页:CSDN:https://blog.csdn.net/gzhzzaa
基于pytorch的经典模型:基于pytorch的典型智能体模型
强化学习经典论文:强化学习经典论文
while True:
Go life