Curiosity-driven exploration: https://pathak22.github.io/noreward-rl/ . Best read with the source code open on your machine.
train.py wraps the various launch commands:
def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash',
mode='tmux', visualise=False, envWrap=False, designHead=None,
unsup=None, noReward=False, noLifeReward=False, psPort=12222,
delay=0, savio=False, pretrain=None):
# for launching the TF workers and for launching tensorboard
py_cmd = 'python' if savio else sys.executable
base_cmd = [
'CUDA_VISIBLE_DEVICES=',
py_cmd, 'worker.py',  # worker.py is the entry point of each child process
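The rest of create_commands appends per-worker arguments and wraps each command for tmux. A minimal sketch of the command-assembly idea (the dashed flag names below are illustrative, loosely mirroring worker.py's argparse options, not the repo's exact flags):

import shlex

def build_worker_cmd(base_cmd, task_index, env_id, logdir):
    cmd = base_cmd + ['--job-name', 'worker',
                      '--task', str(task_index),
                      '--env-id', env_id,
                      '--log-dir', logdir]
    # quote each token so paths or env names with special characters survive the shell
    return ' '.join(shlex.quote(str(tok)) for tok in cmd)

print(build_worker_cmd(['CUDA_VISIBLE_DEVICES=', 'python', 'worker.py'],
                       task_index=0, env_id='doom', logdir='/tmp/model'))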
worker.py wraps the TensorFlow side of training:
if args.job_name == "worker":
server = tf.train.Server(cluster, job_name="worker", task_index=args.task,
config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
if args.delay > 0:
print('Startup delay in worker: {}s'.format(args.delay))
time.sleep(args.delay)
print('.. wait over !')
run(args, server)
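Each worker joins a parameter-server/worker cluster before calling run. An illustrative sketch of that cluster layout (addresses and ports are placeholders, not the repo's exact scheme):

import tensorflow as tf

def make_cluster(num_workers, ps_port=12222, worker_base_port=12300):
    return tf.train.ClusterSpec({
        'ps': ['127.0.0.1:{}'.format(ps_port)],
        'worker': ['127.0.0.1:{}'.format(worker_base_port + i) for i in range(num_workers)],
    })

cluster = make_cluster(num_workers=2)
# each worker process then starts an in-process gRPC server against this spec:
# tf.train.Server(cluster, job_name="worker", task_index=0)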
The run function:
def run(args, server):
env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, envWrap=args.envWrap, designHead=args.designHead,
noLifeReward=args.noLifeReward)
trainer = A3C(env, args.task, args.visualise, args.unsup, args.envWrap, args.designHead, args.noReward)
...
sv = tf.train.Supervisor(is_chief=(args.task == 0),  # the Supervisor is TF's outer training harness
logdir=logdir,
saver=saver,
summary_op=None,
init_op=init_op,
init_fn=init_fn,
summary_writer=summary_writer,
ready_op=tf.report_uninitialized_variables(variables_to_save),
global_step=trainer.global_step,
save_model_secs=30,
save_summaries_secs=30)  # handy built-in handling of checkpointing, summaries, init, etc.
num_global_steps = constants['MAX_GLOBAL_STEPS']
logger.info(
"Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. " +
"One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
with sv.managed_session(server.target, config=config) as sess, sess.as_default():
# Workaround for FailedPreconditionError
# see: https://github.com/openai/universe-starter-agent/issues/44 and 31
sess.run(trainer.sync)
trainer.start(sess, summary_writer)  # entry point: starts the rollout thread
global_step = sess.run(trainer.global_step)
logger.info("Starting training at gobal_step=%d", global_step)
while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
trainer.process(sess)
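Each trainer.process(sess) call roughly amounts to the following (a conceptual sketch; field and placeholder names such as batch.si and local_network.x are assumptions, and the LSTM state and predictor feeds are omitted):

def process_once(sess, trainer, gamma=0.99):
    sess.run(trainer.sync)                              # copy global weights into the local network
    rollout = trainer.runner.queue.get(timeout=600.0)   # one rollout produced by env_runner
    batch = process_rollout(rollout, gamma)             # discounted returns + advantages
    feed = {
        trainer.local_network.x: batch.si,              # observations
        trainer.ac: batch.a,                            # one-hot actions taken
        trainer.adv: batch.adv,                         # advantages (see the GAE sketch further down)
        trainer.r: batch.r,                             # discounted returns for the value head
    }
    sess.run(trainer.train_op, feed_dict=feed)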
a3c.py:
Defines the networks inside each worker thread:
with tf.device(worker_device):
with tf.variable_scope("local"):
self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)
def start(self, sess, summary_writer):
self.runner.start_runner(sess, summary_writer)
self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)
class RunnerThread(threading.Thread):
"""
One of the key distinctions between a normal environment and a universe environment
is that a universe environment is _real time_. This means that there should be a thread
that would constantly interact with the environment and tell it what to do. This thread is here.
"""
def __init__(self, env, policy, num_local_steps, visualise, predictor, envWrap,
noReward):
.......
def start_runner(self, sess, summary_writer):
self.sess = sess
self.summary_writer = summary_writer
self.start()
def run(self):
with self.sess.as_default():
self._run()
def _run(self):
rollout_provider = env_runner(self.env, self.policy, self.num_local_steps,
self.summary_writer, self.visualise, self.predictor,
self.envWrap, self.noReward)
while True:
self.queue.put(next(rollout_provider), timeout=600.0)
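The pattern in miniature: a daemon thread keeps a bounded queue of rollouts topped up while the training loop consumes them (a generic illustration, not repo code):

import queue
import threading

class MiniRunner(threading.Thread):
    """A stripped-down stand-in for RunnerThread."""
    def __init__(self, make_rollout, maxsize=5):
        super(MiniRunner, self).__init__()
        self.queue = queue.Queue(maxsize)   # bounded: blocks the producer if the trainer falls behind
        self.make_rollout = make_rollout
        self.daemon = True                  # don't keep the process alive at exit

    def run(self):
        while True:
            self.queue.put(self.make_rollout(), timeout=600.0)

runner = MiniRunner(lambda: "rollout")
runner.start()
print(runner.queue.get())                   # trainer side: blocks until a rollout is ready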
Launch the actual game environment and interact with it:
def env_runner(env, policy, num_local_steps, summary_writer, render, predictor,
envWrap, noReward):
for _ in range(num_local_steps):
# run policy
fetched = policy.act(last_state, *last_features)
action, value_, features = fetched[0], fetched[1], fetched[2:]
# run environment: get action_index from sampled one-hot 'action'
stepAct = action.argmax()
state, reward, terminal, info = env.step(stepAct)
curr_tuple = [last_state, action, reward, value_, terminal, last_features]
if predictor is not None:
bonus = predictor.pred_bonus(last_state, state, action)
curr_tuple += [bonus, state]
life_bonus += bonus
ep_bonus += bonus
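pred_bonus is where curiosity pays out: per the ICM paper, the intrinsic reward is the scaled forward-model error in feature space, r_i = (eta/2) * ||f(phi(s_t), a_t) - phi(s_{t+1})||^2. A numpy sketch of that formula (the eta value and the feature encoder are assumptions here):

import numpy as np

def curiosity_bonus(phi_next_pred, phi_next, eta=0.01):
    # mean squared error between predicted and actual next-state features
    return eta * 0.5 * np.mean(np.square(phi_next_pred - phi_next))

print(curiosity_bonus(np.zeros(4), np.ones(4)))   # a poorly predicted transition gives a larger bonus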
The network definitions live in model.py:
class LSTMPolicy(object):
class StateActionPredictor(object):
def __init__(self, ob_space, ac_space, designHead='universe'):
# input: s1,s2: : [None, h, w, ch] (usually ch=1 or 4)
# asample: 1-hot encoding of sampled action from policy: [None, ac_space]
input_shape = [None] + list(ob_space)
self.s1 = phi1 = tf.placeholder(tf.float32, input_shape)
self.s2 = phi2 = tf.placeholder(tf.float32, input_shape)
self.asample = asample = tf.placeholder(tf.float32, [None, ac_space])
# inverse model: g(phi1,phi2) -> a_inv: [None, ac_space]
g = tf.concat(1,[phi1, phi2])
g = tf.nn.relu(linear(g, size, "g1", normalized_columns_initializer(0.01)))
aindex = tf.argmax(asample, axis=1) # aindex: [batch_size,]
logits = linear(g, ac_space, "glast", normalized_columns_initializer(0.01))
self.invloss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, aindex), name="invloss")
self.ainvprobs = tf.nn.softmax(logits, dim=-1)
# forward model: f(phi1,asample) -> phi2
# Note: no backprop to asample of policy: it is treated as fixed for predictor training
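For completeness, a sketch of the forward-model half that the comment describes, continuing the snippet's style (layer sizes and loss scaling are my assumptions, not the repo's exact code; phi1/phi2 are assumed to already be the encoded features, the conv head being elided above):

phisize = phi1.get_shape()[-1].value                    # feature length coming out of the head
f = tf.concat(1, [phi1, asample])                       # condition on current features + one-hot action
f = tf.nn.relu(linear(f, size, "f1", normalized_columns_initializer(0.01)))
f = linear(f, phisize, "flast", normalized_columns_initializer(0.01))
self.forwardloss = 0.5 * tf.reduce_mean(tf.square(f - phi2), name="forwardloss")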
Variable optimization for the global network vs. the per-worker local networks:
Part of the A3C class: initialization, the loss, and the training op.
with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
with tf.variable_scope("global"):
self.network = LSTMPolicy(env.observation_space.shape, numaction, designHead)
self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.ap_network = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.ap_network = StateActionPredictor(env.observation_space.shape, numaction, designHead)
with tf.device(worker_device):
with tf.variable_scope("local"):
self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)
# Computing a3c loss: https://arxiv.org/abs/1506.02438
self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
self.adv = tf.placeholder(tf.float32, [None], name="adv")
self.r = tf.placeholder(tf.float32, [None], name="r")
log_prob_tf = tf.nn.log_softmax(pi.logits)
prob_tf = tf.nn.softmax(pi.logits)
# 1) the "policy gradients" loss: its derivative is precisely the policy gradient
# notice that self.ac is a placeholder that is provided externally.
# adv will contain the advantages, as calculated in process_rollout
pi_loss = - tf.reduce_mean(tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) # Eq (19)
# 2) loss of value function: l2_loss = (x-y)^2/2
vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r)) # Eq (28)
# 3) entropy to ensure randomness
entropy = - tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
# final a3c loss: lr of critic is half of actor
self.loss = pi_loss + 0.5 * vf_loss - entropy * constants['ENTROPY_BETA']
# compute gradients
grads = tf.gradients(self.loss * 20.0, pi.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.
# computing predictor loss
if self.unsup:
if 'state' in unsupType:
self.predloss = constants['PREDICTION_LR_SCALE'] * predictor.forwardloss
else:
self.predloss = constants['PREDICTION_LR_SCALE'] * (predictor.invloss * (1-constants['FORWARD_LOSS_WT']) +
predictor.forwardloss * constants['FORWARD_LOSS_WT'])
predgrads = tf.gradients(self.predloss * 20.0, predictor.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.
# do not backprop to policy
if constants['POLICY_NO_BACKPROP_STEPS'] > 0:
grads = [tf.scalar_mul(tf.to_float(tf.greater(self.global_step, constants['POLICY_NO_BACKPROP_STEPS'])), grads_i)
for grads_i in grads]
self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)
# clip gradients
grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
grads_and_vars = list(zip(grads, self.network.var_list))
if self.unsup:
predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP'])
pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list))
grads_and_vars = grads_and_vars + pred_grads_and_vars
opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
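For reference, this is roughly what fills the adv and r placeholders: process_rollout turns each rollout into discounted returns for the value head and GAE advantages for the policy term. A numpy sketch (the gamma/lambda defaults are illustrative):

import numpy as np

def discount(x, gamma):
    out = np.zeros(len(x), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def returns_and_advantages(rewards, values, bootstrap_value, gamma=0.99, lam=1.0):
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(list(values) + [bootstrap_value], dtype=np.float32)
    batch_r = discount(np.append(rewards, bootstrap_value), gamma)[:-1]   # value-head targets
    deltas = rewards + gamma * values[1:] - values[:-1]                   # TD residuals
    batch_adv = discount(deltas, gamma * lam)                             # GAE, https://arxiv.org/abs/1506.02438
    return batch_r, batch_adv

r, adv = returns_and_advantages([0.0, 0.0, 1.0], [0.1, 0.2, 0.5], bootstrap_value=0.0)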
Where is curiosity actually reflected, given the separate branches for extrinsic and intrinsic reward? The noReward option is exposed on the command line; the corresponding code is in env_runner:
if noReward:
reward = 0.
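The net effect: with noReward the game score is stripped and, when a predictor is attached, the curiosity bonus is the only learning signal. A tiny illustrative helper (not repo code):

def training_reward(extrinsic, bonus, no_reward=False, use_curiosity=True):
    """What effectively drives the A3C update for one step."""
    ext = 0.0 if no_reward else extrinsic
    intr = bonus if use_curiosity else 0.0
    return ext + intr

print(training_reward(extrinsic=1.0, bonus=0.05, no_reward=True))   # -> 0.05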
Curiosity itself is the forward and inverse prediction of the interaction between actions and the environment.
(See the inverse/forward model code in StateActionPredictor above: the inverse model g(phi1, phi2) recovers the action taken, while the forward model f(phi1, asample) predicts the next features, and its prediction error is what becomes the exploration bonus.)
I haven't fully worked everything out; a few open questions remain:
Where is the 4-frame state stacking configured?
How exactly does the A3C optimization proceed?
How is the policy trained? Could a network be trained to know the outcome of each action and then compose the relevant action outputs on its own?
Next up: a small A3C codebase, and A2C.
Comments and corrections are welcome!