RL: q_learning_frozenlake.py
2020-04-16
魏鹏飞
Keywords: num_episodes, num_iter, np.random.uniform(0,1) < eps, state_new, reward, done, q_learning_table[state, action] + learning_rate * (reward + discount*np.max(q_learning_table[state_new,:]) - q_learning_table[state, action])
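The last keyword is the core of the script: the tabular Q-learning update. In standard notation it is

$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$

where $\alpha$ is learning_rate (0.01 below), $\gamma$ is discount (0.8 below), and np.max(q_learning_table[state_new, :]) supplies the $\max_{a'} Q(s',a')$ term.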
q_learning_frozenlake.py
"""
Model-free Control for OpenAI FrozenLake env (https://gym.openai.com/envs/FrozenLake-v0/)
Bolei Zhou for IERG6130 course example
"""
import gym, sys, numpy as np
from gym.envs.registration import register
no_slippery = True
render_last = False # whether to visualize the last episode in testing
# -- hyperparameters --
num_epis_train = 10000
num_iter = 100
learning_rate = 0.01
discount = 0.8
eps = 0.3
if no_slippery == True:
    # the simplified frozen lake without slippery (so the transition is deterministic)
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=1000,
        reward_threshold=0.78,  # optimum = .8196
    )
    env = gym.make('FrozenLakeNotSlippery-v0')
else:
    # the standard slippery frozen lake
    env = gym.make('FrozenLake-v0')
q_learning_table = np.zeros([env.observation_space.n,env.action_space.n])
# -- training the agent ----
for epis in range(num_epis_train):
    state = env.reset()
    for iter in range(num_iter):
        # epsilon-greedy exploration: random action with probability eps
        if np.random.uniform(0, 1) < eps:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(q_learning_table[state, :])
        state_new, reward, done, _ = env.step(action)
        # tabular Q-learning update towards the TD target
        q_learning_table[state, action] = q_learning_table[state, action] + learning_rate * (
            reward + discount * np.max(q_learning_table[state_new, :]) - q_learning_table[state, action])
        state = state_new
        if done:
            break
print(np.argmax(q_learning_table,axis=1))
print(np.around(q_learning_table,6))
if no_slippery == True:
    print('---Frozenlake without slippery move-----')
else:
    print('---Standard frozenlake------------------')
# -- testing the learned greedy policy (no exploration) --
num_episode = 500
rewards = 0
for epi in range(num_episode):
    s = env.reset()
    for _ in range(100):
        # always act greedily w.r.t. the learned Q-table
        action = np.argmax(q_learning_table[s, :])
        state_new, reward_episode, done_episode, _ = env.step(action)
        if epi == num_episode - 1 and render_last:
            env.render()
        s = state_new
        if done_episode:
            # reaching the goal gives reward 1; falling into a hole gives 0
            if reward_episode == 1:
                rewards += 1
            break
print('---Success rate=%.3f'%(rewards*1.0 / num_episode))
print('-------------------------------')
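A quick way to read the greedy policy that np.argmax(q_learning_table, axis=1) prints is to reshape it onto the 4x4 map and draw arrows. This is an optional sketch, not part of the original script, and it assumes gym's FrozenLake action encoding (0 = Left, 1 = Down, 2 = Right, 3 = Up):

# optional helper (not in the original script): show the greedy policy on the 4x4 grid
arrows = ['<', 'v', '>', '^']   # assumes FrozenLake actions 0=Left, 1=Down, 2=Right, 3=Up
policy = np.argmax(q_learning_table, axis=1).reshape(4, 4)
for row in policy:
    print(' '.join(arrows[a] for a in row))

For the run below, the first row of the printed policy, [1 0 0 0], comes out as 'v < < <'.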
# Results:
python q_learning_frozenlake.py
[1 0 0 0 1 0 1 0 2 2 1 0 0 2 2 0]
[[2.60196e-01 3.27680e-01 2.01879e-01 2.60107e-01]
[2.60212e-01 0.00000e+00 1.84340e-02 6.54950e-02]
[9.35790e-02 2.18570e-02 5.00000e-06 2.93400e-03]
[1.01200e-03 0.00000e+00 0.00000e+00 3.00000e-06]
[3.24317e-01 4.09600e-01 0.00000e+00 2.58527e-01]
[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
[0.00000e+00 5.88797e-01 0.00000e+00 6.04500e-03]
[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
[4.02664e-01 0.00000e+00 5.12000e-01 3.19554e-01]
[3.90830e-01 6.03926e-01 6.40000e-01 0.00000e+00]
[4.91385e-01 8.00000e-01 0.00000e+00 3.96258e-01]
[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
[0.00000e+00 2.07120e-01 7.98114e-01 1.86873e-01]
[5.90903e-01 7.73337e-01 1.00000e+00 6.18221e-01]
[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]]
---Frozenlake without slippery move-----
---Success rate=1.000
-------------------------------
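A quick sanity check on the Q-table above: with the non-slippery map the transitions are deterministic and the only reward is 1 at the goal, so the best Q-value k steps before the goal should converge to discount**(k-1). The entries 1.0, 0.8, 0.64, 0.512, 0.4096 and 0.32768 in the table are precisely these powers of discount = 0.8, and the start state's best value, 0.32768 = 0.8**5, corresponds to the 6-step shortest path. A tiny check, assuming the deterministic setup above:

# powers of the discount factor; these match the largest Q-values along the greedy path
for k in range(1, 7):
    print(k, 0.8 ** (k - 1))   # 1.0, 0.8, 0.64, 0.512, 0.4096, 0.32768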