gym 0.26.1 CartPole-v1 NoisyNet DQN

NoisyNet simply replaces the weights and biases of an ordinary Linear layer with mu + sigma * epsilon. It is a very simple change, but it noticeably improves DQN's performance. Compared with the earlier vanilla DQN, only two things are different: Linear is replaced by NoisyLinear, and in take_action the agent's behavior policy switches from ε-greedy to a plain argmax. See the code below for details.

The implementation follows Wang Shusen's Deep Reinforcement Learning. Quoting the book: a noisy DQN is itself stochastic, which encourages exploration and plays the same role as an ε-greedy policy; using a_t = argmax_a Q(s, a, ε; μ, σ) directly as the behavior policy works better than ε-greedy.

import gym
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import random
import collections
from tqdm import tqdm
import matplotlib.pyplot as plt
from d2l import torch as d2l
import rl_utils
import math
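Spelled out, the NoisyLinear layer implemented below computes y = (mu_w + sigma_w * eps_w) x + (mu_b + sigma_b * eps_b): mu_w and mu_b are the usual learnable weight and bias, sigma_w and sigma_b are learnable per-parameter noise scales, and eps_w, eps_b are resampled from a standard normal on every forward pass (the products are elementwise). Because sigma is learned, the network can adjust how much noise it injects into each parameter.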
class ReplayBuffer:
    ''' Experience replay buffer '''
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # queue, first in first out

    def add(self, state, action, reward, next_state, done):  # add a transition to the buffer
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):  # sample batch_size transitions from the buffer
        transition = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transition)
        return np.array(state), action, reward, np.array(next_state), done

    def size(self):  # current number of transitions in the buffer
        return len(self.buffer)


class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias)
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
            self.register_buffer('epsilon_bias', torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(3 / self.in_features)
        self.weight.data.uniform_(-std, std)
        self.bias.data.uniform_(-std, std)

    def forward(self, x, is_training=True):
        if not is_training:
            # evaluation: plain linear layer, no noise
            return F.linear(x, self.weight, self.bias)
        # training: resample the noise and add it to the weights and bias
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias.data
        return F.linear(x, self.weight + self.sigma_weight * self.epsilon_weight.data, bias)


class Q(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = NoisyLinear(state_dim, hidden_dim)
        self.fc2 = NoisyLinear(hidden_dim, action_dim)

    def forward(self, x, is_training=True):
        x = F.relu(self.fc1(x, is_training))  # ReLU activation after the hidden layer
        return self.fc2(x, is_training)


class DQN:
    ''' DQN algorithm '''
    def __init__(self, state_dim, hidden_dim, action_dim, lr, gamma, target_update, device):
        self.action_dim = action_dim
        self.q = Q(state_dim, hidden_dim, action_dim).to(device)  # Q network
        self.target_q = Q(state_dim, hidden_dim, action_dim).to(device)  # target network
        self.target_q.load_state_dict(self.q.state_dict())  # copy the parameters
        self.optimizer = torch.optim.Adam(self.q.parameters(), lr=lr)
        self.gamma = gamma
        self.target_update = target_update  # how often to sync the target network
        self.count = 0  # counts the number of updates
        self.device = device

    def take_action(self, state):  # no ε-greedy here: the layer noise already provides exploration
        state = torch.tensor(np.array([state]), dtype=torch.float).to(self.device)
        action = self.q(state).argmax().item()
        return action

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).reshape(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).reshape(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).reshape(-1, 1).to(self.device)

        q_values = self.q(states).gather(1, actions)  # Q values of the taken actions
        # maximum Q value of the next state, from the target network
        max_next_q_values = self.target_q(next_states).max(1)[0].reshape(-1, 1)
        q_targets = rewards + self.gamma * max_next_q_values * (1 - dones)  # TD target
        loss = F.mse_loss(q_values, q_targets)  # mean squared error

        self.optimizer.zero_grad()  # clear gradients, since they accumulate by default
        loss.mean().backward()  # backpropagation
        self.optimizer.step()  # gradient step

        if self.count % self.target_update == 0:
            self.target_q.load_state_dict(self.q.state_dict())
        self.count += 1
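As a quick sanity check (not part of the original script), you can see the noise at work: with is_training=True every call resamples epsilon, so the same input gives different outputs, while is_training=False falls back to the plain linear layer. The names layer and x below are just throwaway variables for this check.

layer = NoisyLinear(4, 2)
x = torch.randn(1, 4)
print(layer(x))                     # differs from the next line: epsilon is resampled each call
print(layer(x))
print(layer(x, is_training=False))  # identical across calls: noise disabled
print(layer(x, is_training=False))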
lr = 2e-3
num_episodes = 500
hidden_dim = 128
gamma = 0.98
target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 64
device = d2l.try_gpu()
print(device)

env_name = 'CartPole-v1'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, target_update, device)
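Before the training loop, a note on what update() computes: each call fits Q(s, a) to the one-step TD target y = r + gamma * (1 - done) * max_a' target_q(s', a') by minimizing the mean squared error between the two, and the target network is copied from the Q network every target_update updates.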
return_list = []
for i in range(10):
    with tqdm(total=int(num_episodes / 10), desc=f'Iteration {i}') as pbar:
        for i_episode in range(int(num_episodes / 10)):
            episode_return = 0
            state = env.reset()[0]
            done, truncated = False, False
            while not done and not truncated:
                action = agent.take_action(state)
                next_state, reward, done, truncated, info = env.step(action)
                replay_buffer.add(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward
                # only start training the Q network once the buffer holds enough data
                if replay_buffer.size() > minimal_size:
                    b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                    transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns,
                                       'rewards': b_r, 'dones': b_d}
                    agent.update(transition_dict)
            return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0:
                pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                  'return': '%.3f' % np.mean(return_list[-10:])})
            pbar.update(1)
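Once training finishes, you may want to check the learned greedy policy with the noise switched off. Below is a minimal sketch, not part of the original script: the helper evaluate_greedy reuses env and agent from above and queries the Q network with is_training=False so that no noise is injected.

def evaluate_greedy(env, agent, n_episodes=5):
    # run a few episodes greedily, without noise, and return the mean episode return
    returns = []
    for _ in range(n_episodes):
        state = env.reset()[0]
        done, truncated, total = False, False, 0.0
        while not done and not truncated:
            s = torch.tensor(np.array([state]), dtype=torch.float).to(agent.device)
            action = agent.q(s, is_training=False).argmax().item()  # greedy action, noise off
            state, reward, done, truncated, _ = env.step(action)
            total += reward
        returns.append(total)
    return np.mean(returns)

# print(evaluate_greedy(env, agent))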
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title(f'Noisy DQN on {env_name}')
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title(f'Noisy DQN on {env_name}')
plt.show()

This time I ran it as a Jupyter file inside PyCharm; the two plots above show the results. Compared with the plain DQN from before (covered in detail in an earlier post), the performance is clearly better.