TensorFlow2048/agent.py at main · Armadillan/TensorFlow2048 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Module for creation and training of agents

WARNING: This will run for days. Literally.
My pretty decent laptop takes about 80 hours to finish this.
"""

import os
import psutil
import pickle

# import numpy as np
# import matplotlib.pyplot as plt

import tensorflow as tf

# from tf_agents.environments import py_environment
# from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
# from tf_agents.environments import utils
# from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
# from tf_agents.trajectories import time_step as ts

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver, dynamic_episode_driver
# from tf_agents.environments import tf_py_environment
# from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
# from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from env import PyEnv2048#, PyEnv2048FlatObservations
from env import PyEnv2048NoBadActions

"""HYPERPARAMETERS"""
NAME = "VPS test run" # Name of agent, used for directory and file names

FC_LAYER_PARAMS = (10, 10) # Number and size of hidden dense layers
MAX_DURATION = 500 # Maximum duration of an episode

LEARNING_RATE = 1e-5 # Learning rate for optimizer

DISCOUNT_FACTOR = 0.97 # Discount factor for future rewards (gamma)

ACTIVATION_FN = tf.keras.activations.relu # Activation function
# Optimizer
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
LOSS_FN = common.element_wise_squared_loss # Loss function

BUFFER_MAX_LEN = 500 # Max length of replay buffer
# Size of experience batch passed to the agent each training iteration
BUFFER_BATCH_SIZE = 64
N_STEP_UPDATE = 2 # Number of consecutive transitions
# to pass to the agent at a time during training

# Number of experience steps to collect each training iteration,
# a higher value replaces the whole buffer faster.
COLLECTION_STEPS = 1
NUM_EVAL_EPISODES = 10 # Number of episodes for evaluation
NUM_TRAINING_ITERATIONS = 2000000 # Number of iterations to train for

# Initial epsilon (chance for collection policy to pick random move)
INITIAL_EPSILON = 0.99
END_EPSILON = 0.01 # End epsilon
EPSILON_DECAY_STEPS = 750000 # How many steps the epsilon should decay over

# Whether to map bad moves to the next good move
USE_BAD_MOVE_MAPPING = True
# Punishment for moves that don't change the state of the game
# (used if USE_BAD_MOVE_MAPPING is set to false)
PUNISHMENT_FOR_BAD_ACTIONS = 16
REWARD_MULTIPLIER = 1 # Multiplier for positive rewards

LOG_INTERVAL = 2000 # How often to print progress to console
EVAL_INTERVAL = 10000 # How often to evaluate the agent's performence

SAVE_DIR = os.path.join("..", "TensorFlow2048_DATA") # Where to save checkpoints, policies and stats

# Creates environments for training and evaluation
# Uses wrapper to limit number of moves
# Both environments must be of same type, but can have different
# parameters.
if USE_BAD_MOVE_MAPPING:
    train_py_env = PyEnv2048NoBadActions(REWARD_MULTIPLIER)
    eval_py_env = PyEnv2048NoBadActions(1)
else:
    train_py_env = PyEnv2048(PUNISHMENT_FOR_BAD_ACTIONS, REWARD_MULTIPLIER)
    eval_py_env = PyEnv2048(0, 1)

train_py_env = wrappers.TimeLimit(train_py_env, duration=MAX_DURATION)
eval_py_env = wrappers.TimeLimit(eval_py_env, duration=MAX_DURATION)

# Converts Py environments to TF environments
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Creates a tensor to count the number of training iterations
train_step_counter = tf.Variable(0)

# Initializes the neural network
q_net = q_network.QNetwork(
        train_env.observation_spec(), # Passes observation spec,
        train_env.action_spec(), # and action spec of environment.
        fc_layer_params=FC_LAYER_PARAMS,
        activation_fn=ACTIVATION_FN)

# Creates a function to handle epsilon decay
epsilon = tf.compat.v1.train.polynomial_decay(
    learning_rate=INITIAL_EPSILON,
    global_step=train_step_counter,
    decay_steps=EPSILON_DECAY_STEPS,
    end_learning_rate=END_EPSILON
    )

# Initializes an agent implementing the DDQN algorithm
agent = dqn_agent.DdqnAgent(
    time_step_spec=train_env.time_step_spec(), # Passes TimeStep spec,
    action_spec=train_env.action_spec(), # and action spec of environment.
    n_step_update=N_STEP_UPDATE,
    q_network=q_net,
    optimizer=OPTIMIZER,
    epsilon_greedy=epsilon,
    td_errors_loss_fn=LOSS_FN,
    gamma=DISCOUNT_FACTOR,
    train_step_counter=train_step_counter
    )

# Initializes replay buffer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec, # Passes agent's data spec
    batch_size=train_env.batch_size,
    max_length=BUFFER_MAX_LEN,
    )

# Puts the buffers "add_batch" function in a list to pass as an observer
# to the training driver later
replay_observer = [replay_buffer.add_batch]

# Creates a dataset from the buffer
dataset = replay_buffer.as_dataset(
    sample_batch_size=BUFFER_BATCH_SIZE,
    num_steps=N_STEP_UPDATE + 1,
    num_parallel_calls=3).prefetch(3)

# And an interator from that dataset
dataset_iterator = iter(dataset)

# Initializes the agent
agent.initialize()

# Wraps agent.train in a graph for optimization, can be skipped
agent.train = common.function(agent.train)

# Sets agent's training steps counter to 0
agent.train_step_counter.assign(0)

# Initializes collection driver
collect_driver = dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=agent.collect_policy,
    observers=replay_observer, # Passes the replay buffer observer
    num_steps=COLLECTION_STEPS
    )

# Initializes driver employing random policy
random_policy_driver = dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=random_tf_policy.RandomTFPolicy(
        train_env.time_step_spec(), train_env.action_spec()
        ),
    observers=replay_observer,
    num_steps=COLLECTION_STEPS
    )

# Initializes metrics
num_episodes_metric = tf_metrics.NumberOfEpisodes()
num_steps_metric = tf_metrics.EnvironmentSteps()
avg_return_metric = tf_metrics.AverageReturnMetric()
avg_episode_len_metric = tf_metrics.AverageEpisodeLengthMetric()

# Puts them in a list to pass as observers to eval driver
eval_metrics = [
            # num_episodes_metric, # Number of episodes completed
            # num_steps_metric, # Number of steps completed
            avg_return_metric, # Average return (cumulative episode reward)
            avg_episode_len_metric # Average episode length
]

# Initalizes policy saver, to periodically save the agent's policy
policy_saver = PolicySaver(agent.policy)

# Initializes evaluation driver
eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    env=eval_env,
    policy=agent.policy,
    observers = eval_metrics,
    num_episodes = NUM_EVAL_EPISODES
    )

"""
Function similar to running the eval driver with the average return
metric, sometimes useful for debugging purposes.
"""
# def compute_avg_return(environment, policy, num_episodes=10):

#   total_return = 0.0
#   for _ in range(num_episodes):


#     time_step = environment.reset()
#     episode_return = 0.0
#     num_steps = 0

#     while not time_step.is_last():
#       num_steps += 1
#       action_step = policy.action(time_step)
#       time_step = environment.step(action_step.action)
#       episode_return += time_step.reward
#     total_return += episode_return

#   avg_return = total_return / num_episodes
#   return avg_return.numpy()[0]

# Resets both environments
train_env.reset()
eval_env.reset()

# Runs the collection driver once to get a time step
final_time_step, _ = collect_driver.run()

# Initial buffer fill using random policy
for i in range(max(int(BUFFER_MAX_LEN/COLLECTION_STEPS), 1)):
    # Can alternatively be run with the collection policy like so:
    # final_time_step, _ = collect_driver.run(final_time_step)
    final_time_step, _ = random_policy_driver.run(final_time_step)

# Runs eval driver once to start it, and get inital performence
eval_driver.run()

# Gets average episode length and average return from metrics
avg_episode_len = avg_episode_len_metric.result().numpy()
avg_return = avg_return_metric.result().numpy()

# Puts them in lists, and initalizes list for losses
returns = [avg_return]
episode_lengths = [avg_episode_len]
losses = []

# Resets all metrics
for metric in eval_metrics:
    metric.reset()

# print(f"Average episode length: {avg_episode_len}")
# print(f"Average return: {avg_return}")

# Creates checkpointer, which periodically creates a backup of these objects,
# and restores them from the latest backup if one is available
checkpointer = common.Checkpointer(
    ckpt_dir=os.path.join(SAVE_DIR, NAME + " data", "checkpoints"),
    max_to_keep=20,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=agent.train_step_counter,
    network=q_net
    )

# Main training loop
for _ in range(NUM_TRAINING_ITERATIONS):

    # Runs collect driver
    final_time_step, _ = collect_driver.run(final_time_step)

    # Gets experiance from buffer
    experience, unused_info = next(dataset_iterator)

    # Train the agent and get the loss
    train_loss = agent.train(experience).loss

    # Save the loss to the losses list
    losses.append(train_loss.numpy())

    # Gets the number of training steps completed
    step = agent.train_step_counter.numpy()

    # Prints progress to console
    if step % LOG_INTERVAL == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    # Evaluates agent performence
    if step % EVAL_INTERVAL == 0:

        # Runs evaluation driver
        eval_driver.run()

        # Gets average episode length and average return from metrics
        avg_episode_len = avg_episode_len_metric.result().numpy()
        avg_return = avg_return_metric.result().numpy()

        # Prints eval results to console
        print(f'Average Return: {avg_return}, '
              + f'Average episode length: {avg_episode_len}')

        # Appends returns and episode lengths to their repsecitve lists
        returns.append(avg_return)
        episode_lengths.append(avg_episode_len)

        # Saves agent policy
        policy_saver.save(
            os.path.join(
            SAVE_DIR, NAME + " data", "policy saves",
            NAME + " policy @ " + str(step)
            )
        )

        # Runs checkpointer to make a backup of the agent, network etc.
        checkpointer.save(step)

        # Saves the lists of statistics as a pickled dictionary
        with open(
                os.path.join(SAVE_DIR, NAME + " data", NAME + " stats.pkl"),
                "wb",
                ) as file:
            pickle.dump(
                {"Returns": returns,
                 "Lengths": episode_lengths,
                 "Losses": losses},
                file
                )

        # Resets all metrics
        for metric in eval_metrics:
            metric.reset()