Selected from Medium. Author: Raymond Yuan. Compiled by 机器之心. Contributors: 李诗萌, 张倩. Original article: https://medium.com/tensorflow/deep-reinforcement-learning-playing-cartpole-through-asynchronous-advantage-actor-critic-a3c-7eab2eea5296
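The snippets below reference several names that are set up earlier in the full article and are not shown in this excerpt: the standard imports, eager execution, and a module-level `args` object produced by argparse (used as `args.lr`, `args.max_eps`, `args.update_freq`, `args.gamma`, `args.save_dir`, and `args.algorithm`). A sketch of that setup, with the flag defaults as assumptions:

```python
import os
import threading
import multiprocessing
import argparse
from queue import Queue

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# The code in this article runs TensorFlow 1.x in eager mode
tf.enable_eager_execution()

parser = argparse.ArgumentParser(description='Run the A3C algorithm on CartPole.')
parser.add_argument('--algorithm', default='a3c', type=str,
                    help="Choose between 'a3c' and 'random'.")
parser.add_argument('--train', dest='train', action='store_true',
                    help='Train the model.')
parser.add_argument('--lr', default=0.001, type=float,
                    help='Learning rate for the shared optimizer.')
parser.add_argument('--update-freq', default=20, type=int,
                    help='How often to push local gradients to the global model.')
parser.add_argument('--max-eps', default=1000, type=int,
                    help='Global maximum number of episodes to run.')
parser.add_argument('--gamma', default=0.99, type=float,
                    help='Discount factor for rewards.')
parser.add_argument('--save-dir', default='/tmp/', type=str,
                    help='Directory in which to save the model.')
args = parser.parse_args()
```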
```python
class RandomAgent:
    """Random Agent that will play the specified game

    Arguments:
        env_name: Name of the environment to be played
        max_eps: Maximum number of episodes to run agent for.
    """
    def __init__(self, env_name, max_eps):
        self.env = gym.make(env_name)
        self.max_episodes = max_eps
        self.global_moving_average_reward = 0
        self.res_queue = Queue()

    def run(self):
        reward_avg = 0
        for episode in range(self.max_episodes):
            done = False
            self.env.reset()
            reward_sum = 0.0
            steps = 0
            while not done:
                # Sample randomly from the action space and step
                _, reward, done, _ = self.env.step(self.env.action_space.sample())
                steps += 1
                reward_sum += reward
            # Record statistics
            self.global_moving_average_reward = record(episode,
                                                       reward_sum,
                                                       0,
                                                       self.global_moving_average_reward,
                                                       self.res_queue, 0, steps)
            reward_avg += reward_sum
        final_avg = reward_avg / float(self.max_episodes)
        print("Average score across {} episodes: {}".format(
            self.max_episodes, final_avg))
        return final_avg
```
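The `record` helper called above is defined elsewhere in the article and is not part of this excerpt. A minimal sketch consistent with how it is called here (update an exponential moving average of the episode reward, log it, and push it onto the result queue):

```python
def record(episode, episode_reward, worker_idx, global_ep_reward,
           result_queue, total_loss, num_steps):
    """Log per-episode statistics and update the global moving average reward."""
    if global_ep_reward == 0:
        global_ep_reward = episode_reward
    else:
        # Exponential moving average of the episode reward
        global_ep_reward = global_ep_reward * 0.99 + episode_reward * 0.01
    print("Episode: {} | Moving Average Reward: {:.1f} | "
          "Episode Reward: {:.1f} | Steps: {} | Worker: {}".format(
              episode, global_ep_reward, episode_reward, num_steps, worker_idx))
    result_queue.put(global_ep_reward)
    return global_ep_reward
```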
```python
class MasterAgent():
    def __init__(self):
        self.game_name = 'CartPole-v0'
        save_dir = args.save_dir
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        env = gym.make(self.game_name)
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.opt = tf.train.AdamOptimizer(args.lr, use_locking=True)
        print(self.state_size, self.action_size)

        self.global_model = ActorCriticModel(self.state_size, self.action_size)  # global network
        self.global_model(tf.convert_to_tensor(np.random.random((1, self.state_size)),
                                               dtype=tf.float32))
```
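`ActorCriticModel`, instantiated above, is defined earlier in the full article and not included in this excerpt. Judging from how it is called (a single forward pass returning `(logits, values)`), a minimal two-headed `tf.keras` sketch would look like the following; the hidden-layer sizes are assumptions:

```python
class ActorCriticModel(tf.keras.Model):
    """Two-headed network: policy logits (actor) and state value (critic)."""
    def __init__(self, state_size, action_size):
        super(ActorCriticModel, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        # Separate hidden layers for each head (one possible layout)
        self.dense1 = tf.keras.layers.Dense(100, activation='relu')
        self.policy_logits = tf.keras.layers.Dense(action_size)
        self.dense2 = tf.keras.layers.Dense(100, activation='relu')
        self.values = tf.keras.layers.Dense(1)

    def call(self, inputs):
        # Actor head: unnormalized action preferences
        logits = self.policy_logits(self.dense1(inputs))
        # Critic head: scalar estimate of the state value V(s)
        values = self.values(self.dense2(inputs))
        return logits, values
```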
```python
    def train(self):
        if args.algorithm == 'random':
            random_agent = RandomAgent(self.game_name, args.max_eps)
            random_agent.run()
            return

        res_queue = Queue()

        workers = [Worker(self.state_size,
                          self.action_size,
                          self.global_model,
                          self.opt, res_queue,
                          i, game_name=self.game_name,
                          save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]

        for i, worker in enumerate(workers):
            print("Starting worker {}".format(i))
            worker.start()

        moving_average_rewards = []  # record episode reward to plot
        while True:
            reward = res_queue.get()
            if reward is not None:
                moving_average_rewards.append(reward)
            else:
                break
        [w.join() for w in workers]

        plt.plot(moving_average_rewards)
        plt.ylabel('Moving average ep reward')
        plt.xlabel('Step')
        plt.savefig(os.path.join(self.save_dir,
                                 '{} Moving Average.png'.format(self.game_name)))
        plt.show()
```
```python
class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []

    def store(self, state, action, reward):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def clear(self):
        self.states = []
        self.actions = []
        self.rewards = []
```
```python
class Worker(threading.Thread):
    # Set up global variables across different threads
    global_episode = 0
    # Moving average reward
    global_moving_average_reward = 0
    best_score = 0
    save_lock = threading.Lock()

    def __init__(self, state_size, action_size, global_model, opt, result_queue,
                 idx, game_name='CartPole-v0', save_dir='/tmp'):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.game_name = game_name
        self.env = gym.make(self.game_name).unwrapped
        self.save_dir = save_dir
        self.ep_loss = 0.0
```
```python
    def run(self):
        total_step = 1
        mem = Memory()
        while Worker.global_episode < args.max_eps:
            current_state = self.env.reset()
            mem.clear()
            ep_reward = 0.
            ep_steps = 0
            self.ep_loss = 0

            time_count = 0
            done = False
            while not done:
                logits, _ = self.local_model(
                    tf.convert_to_tensor(current_state[None, :],
                                         dtype=tf.float32))
                probs = tf.nn.softmax(logits)

                action = np.random.choice(self.action_size, p=probs.numpy()[0])
                new_state, reward, done, _ = self.env.step(action)
                if done:
                    reward = -1
                ep_reward += reward
                mem.store(current_state, action, reward)

                if time_count == args.update_freq or done:
                    # Calculate gradient wrt to local model. We do so by tracking the
                    # variables involved in computing the loss by using tf.GradientTape
                    with tf.GradientTape() as tape:
                        total_loss = self.compute_loss(done,
                                                       new_state,
                                                       mem,
                                                       args.gamma)
                    self.ep_loss += total_loss
                    # Calculate local gradients
                    grads = tape.gradient(total_loss, self.local_model.trainable_weights)
                    # Push local gradients to global model
                    self.opt.apply_gradients(zip(grads,
                                                 self.global_model.trainable_weights))
                    # Update local model with new weights
                    self.local_model.set_weights(self.global_model.get_weights())

                    mem.clear()
                    time_count = 0

                    if done:  # done and print information
                        Worker.global_moving_average_reward = \
                            record(Worker.global_episode, ep_reward, self.worker_idx,
                                   Worker.global_moving_average_reward, self.result_queue,
                                   self.ep_loss, ep_steps)
                        # We must use a lock to save our model and to print to prevent data races.
                        if ep_reward > Worker.best_score:
                            with Worker.save_lock:
                                print("Saving best model to {}, "
                                      "episode score: {}".format(self.save_dir, ep_reward))
                                self.global_model.save_weights(
                                    os.path.join(self.save_dir,
                                                 'model_{}.h5'.format(self.game_name))
                                )
                                Worker.best_score = ep_reward
                        Worker.global_episode += 1
                ep_steps += 1

                time_count += 1
                current_state = new_state
                total_step += 1
        self.result_queue.put(None)
```
How do we compute the loss?
A worker agent computes the loss to obtain the gradients with respect to all of its network parameters. This is where the final A in A3C, the advantage, comes into play. The gradients are then applied to the global network. The losses are computed as follows:

Value loss: L = Σ (R − V(s))²
Policy loss: L = −log(π(s)) · A(s)

where R is the discounted reward, V is the value function (of the input state), π is the policy function (of the input state), and A is the advantage function. We use the discounted reward as an estimate of the Q-value, because A3C does not let us determine the Q-value directly.
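To make the R and A(s) terms concrete, here is a small worked example (toy numbers, not from the article) that mirrors the backward accumulation used in `compute_loss` below:

```python
import numpy as np

gamma = 0.99
rewards = [1.0, 1.0, -1.0]           # rewards from a made-up 3-step rollout
values = np.array([1.2, 0.9, 0.5])   # critic's V(s) at the visited states (made up)
reward_sum = 0.0                      # the episode ended, so no bootstrapped value

# Walk the buffer backwards, accumulating the discounted return R
discounted_rewards = []
for reward in rewards[::-1]:
    reward_sum = reward + gamma * reward_sum
    discounted_rewards.append(reward_sum)
discounted_rewards.reverse()

advantage = np.array(discounted_rewards) - values
print(np.round(discounted_rewards, 4))  # [ 1.0099  0.01   -1.    ]
print(np.round(advantage, 4))           # [-0.1901 -0.89   -1.5   ]
```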
```python
    def compute_loss(self,
                     done,
                     new_state,
                     memory,
                     gamma=0.99):
        if done:
            reward_sum = 0.  # terminal
        else:
            # Bootstrap from the critic's value estimate of the last state
            reward_sum = self.local_model(
                tf.convert_to_tensor(new_state[None, :],
                                     dtype=tf.float32))[-1].numpy()[0][0]

        # Get discounted rewards
        discounted_rewards = []
        for reward in memory.rewards[::-1]:  # reverse buffer r
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()

        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states),
                                 dtype=tf.float32))
        # Get our advantages; squeeze the critic output to shape [T] so the
        # losses below combine element-wise
        advantage = tf.convert_to_tensor(np.array(discounted_rewards),
                                         dtype=tf.float32) - tf.squeeze(values, axis=1)
        # Value loss
        value_loss = advantage ** 2

        # Calculate our policy loss
        actions_one_hot = tf.one_hot(memory.actions, self.action_size, dtype=tf.float32)

        policy = tf.nn.softmax(logits)
        # Entropy H = -sum(p * log p); subtracting an entropy bonus from the
        # loss encourages exploration
        entropy = -tf.reduce_sum(policy * tf.log(policy + 1e-20), axis=1)

        policy_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=actions_one_hot,
                                                                 logits=logits)
        # Stop gradients through the advantage so the policy term only trains
        # the actor head
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
        return total_loss
```
```python
    def play(self):
        env = gym.make(self.game_name).unwrapped
        state = env.reset()
        model = self.global_model
        model_path = os.path.join(self.save_dir, 'model_{}.h5'.format(self.game_name))
        print('Loading model from: {}'.format(model_path))
        model.load_weights(model_path)
        done = False
        step_counter = 0
        reward_sum = 0

        try:
            while not done:
                env.render(mode='rgb_array')
                policy, value = model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
                policy = tf.nn.softmax(policy)
                action = np.argmax(policy)
                state, reward, done, _ = env.step(action)
                reward_sum += reward
                print("{}. Reward: {}, action: {}".format(step_counter, reward_sum, action))
                step_counter += 1
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            env.close()
```
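In the full article, a small entry point ties training and playing together via the `--train` flag. Roughly (a sketch; the exact `__main__` block is not shown in this excerpt):

```python
if __name__ == '__main__':
    print(args)
    master = MasterAgent()
    if args.train:
        master.train()
    else:
        master.play()
```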