Curiosity-driven exploration: https://pathak22.github.io/noreward-rl/ . Best read with the source code open on your machine.
train.py wraps the various launch commands:
def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash',
mode='tmux', visualise=False, envWrap=False, designHead=None,
unsup=None, noReward=False, noLifeReward=False, psPort=12222,
delay=0, savio=False, pretrain=None):
# for launching the TF workers and for launching tensorboard
py_cmd = 'python' if savio else sys.executable
base_cmd = [
'CUDA_VISIBLE_DEVICES=',
py_cmd, 'worker.py',  # worker.py is the entry point of each child process
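The rest of create_commands appends per-worker arguments and wraps each command for tmux. A minimal sketch of the command-assembly idea (the dashed flag names below are illustrative, loosely mirroring worker.py's argparse options, not the repo's exact flags):

import shlex

def build_worker_cmd(base_cmd, task_index, env_id, logdir):
    cmd = base_cmd + ['--job-name', 'worker',
                      '--task', str(task_index),
                      '--env-id', env_id,
                      '--log-dir', logdir]
    # quote each token so paths or env names with special characters survive the shell
    return ' '.join(shlex.quote(str(tok)) for tok in cmd)

print(build_worker_cmd(['CUDA_VISIBLE_DEVICES=', 'python', 'worker.py'],
                       task_index=0, env_id='doom', logdir='/tmp/model'))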
worker.py wraps the TensorFlow side of training:
if args.job_name == "worker":
server = tf.train.Server(cluster, job_name="worker", task_index=args.task,
config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
if args.delay > 0:
print('Startup delay in worker: {}s'.format(args.delay))
time.sleep(args.delay)
print('.. wait over !')
run(args, server)
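Each worker joins a parameter-server/worker cluster before calling run. An illustrative sketch of that cluster layout (addresses and ports are placeholders, not the repo's exact scheme):

import tensorflow as tf

def make_cluster(num_workers, ps_port=12222, worker_base_port=12300):
    return tf.train.ClusterSpec({
        'ps': ['127.0.0.1:{}'.format(ps_port)],
        'worker': ['127.0.0.1:{}'.format(worker_base_port + i) for i in range(num_workers)],
    })

cluster = make_cluster(num_workers=2)
# each worker process then starts an in-process gRPC server against this spec:
# tf.train.Server(cluster, job_name="worker", task_index=0)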
The run function:
def run(args, server):
env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, envWrap=args.envWrap, designHead=args.designHead,
noLifeReward=args.noLifeReward)
trainer = A3C(env, args.task, args.visualise, args.unsup, args.envWrap, args.designHead, args.noReward)
...
sv = tf.train.Supervisor(is_chief=(args.task == 0),  # the Supervisor is TF's outer training harness
logdir=logdir,
saver=saver,
summary_op=None,
init_op=init_op,
init_fn=init_fn,
summary_writer=summary_writer,
ready_op=tf.report_uninitialized_variables(variables_to_save),
global_step=trainer.global_step,
save_model_secs=30,
save_summaries_secs=30)  # handy built-in handling of checkpointing, summaries, init, etc.
num_global_steps = constants['MAX_GLOBAL_STEPS']
logger.info(
"Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. " +
"One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
with sv.managed_session(server.target, config=config) as sess, sess.as_default():
# Workaround for FailedPreconditionError
# see: https://github.com/openai/universe-starter-agent/issues/44 and 31
sess.run(trainer.sync)
trainer.start(sess, summary_writer)  # entry point: starts the rollout thread
global_step = sess.run(trainer.global_step)
logger.info("Starting training at gobal_step=%d", global_step)
while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
trainer.process(sess)
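Each trainer.process(sess) call roughly amounts to the following (a conceptual sketch; field and placeholder names such as batch.si and local_network.x are assumptions, and the LSTM state and predictor feeds are omitted):

def process_once(sess, trainer, gamma=0.99):
    sess.run(trainer.sync)                              # copy global weights into the local network
    rollout = trainer.runner.queue.get(timeout=600.0)   # one rollout produced by env_runner
    batch = process_rollout(rollout, gamma)             # discounted returns + advantages
    feed = {
        trainer.local_network.x: batch.si,              # observations
        trainer.ac: batch.a,                            # one-hot actions taken
        trainer.adv: batch.adv,                         # advantages (see the GAE sketch further down)
        trainer.r: batch.r,                             # discounted returns for the value head
    }
    sess.run(trainer.train_op, feed_dict=feed)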
a3c.py:
Defines the networks inside each worker thread:
with tf.device(worker_device):
with tf.variable_scope("local"):
self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)
def start(self, sess, summary_writer):
self.runner.start_runner(sess, summary_writer)
self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)
class RunnerThread(threading.Thread):
"""
One of the key distinctions between a normal environment and a universe environment
is that a universe environment is _real time_. This means that there should be a thread
that would constantly interact with the environment and tell it what to do. This thread is here.
"""
def __init__(self, env, policy, num_local_steps, visualise, predictor, envWrap,
noReward):
.......
def start_runner(self, sess, summary_writer):
self.sess = sess
self.summary_writer = summary_writer
self.start()
def run(self):
with self.sess.as_default():
self._run()
def _run(self):
rollout_provider = env_runner(self.env, self.policy, self.num_local_steps,
self.summary_writer, self.visualise, self.predictor,
self.envWrap, self.noReward)
while True:
self.queue.put(next(rollout_provider), timeout=600.0)
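The pattern in miniature: a daemon thread keeps a bounded queue of rollouts topped up while the training loop consumes them (a generic illustration, not repo code):

import queue
import threading

class MiniRunner(threading.Thread):
    """A stripped-down stand-in for RunnerThread."""
    def __init__(self, make_rollout, maxsize=5):
        super(MiniRunner, self).__init__()
        self.queue = queue.Queue(maxsize)   # bounded: blocks the producer if the trainer falls behind
        self.make_rollout = make_rollout
        self.daemon = True                  # don't keep the process alive at exit

    def run(self):
        while True:
            self.queue.put(self.make_rollout(), timeout=600.0)

runner = MiniRunner(lambda: "rollout")
runner.start()
print(runner.queue.get())                   # trainer side: blocks until a rollout is ready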
Launch the actual game environment and interact with it:
def env_runner(env, policy, num_local_steps, summary_writer, render, predictor,
envWrap, noReward):
for _ in range(num_local_steps):
# run policy
fetched = policy.act(last_state, *last_features)
action, value_, features = fetched[0], fetched[1], fetched[2:]
# run environment: get action_index from sampled one-hot 'action'
stepAct = action.argmax()
state, reward, terminal, info = env.step(stepAct)
curr_tuple = [last_state, action, reward, value_, terminal, last_features]
if predictor is not None:
bonus = predictor.pred_bonus(last_state, state, action)
curr_tuple += [bonus, state]
life_bonus += bonus
ep_bonus += bonus
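pred_bonus is where curiosity pays out: per the ICM paper, the intrinsic reward is the scaled forward-model error in feature space, r_i = (eta/2) * ||f(phi(s_t), a_t) - phi(s_{t+1})||^2. A numpy sketch of that formula (the eta value and the feature encoder are assumptions here):

import numpy as np

def curiosity_bonus(phi_next_pred, phi_next, eta=0.01):
    # mean squared error between predicted and actual next-state features
    return eta * 0.5 * np.mean(np.square(phi_next_pred - phi_next))

print(curiosity_bonus(np.zeros(4), np.ones(4)))   # a poorly predicted transition gives a larger bonus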
The network definitions live in model.py:
class LSTMPolicy(object):
class StateActionPredictor(object):
def __init__(self, ob_space, ac_space, designHead='universe'):
# input: s1,s2: : [None, h, w, ch] (usually ch=1 or 4)
# asample: 1-hot encoding of sampled action from policy: [None, ac_space]
input_shape = [None] + list(ob_space)
self.s1 = phi1 = tf.placeholder(tf.float32, input_shape)
self.s2 = phi2 = tf.placeholder(tf.float32, input_shape)
self.asample = asample = tf.placeholder(tf.float32, [None, ac_space])
# inverse model: g(phi1,phi2) -> a_inv: [None, ac_space]
g = tf.concat(1,[phi1, phi2])
g = tf.nn.relu(linear(g, size, "g1", normalized_columns_initializer(0.01)))
aindex = tf.argmax(asample, axis=1) # aindex: [batch_size,]
logits = linear(g, ac_space, "glast", normalized_columns_initializer(0.01))
self.invloss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, aindex), name="invloss")
self.ainvprobs = tf.nn.softmax(logits, dim=-1)
# forward model: f(phi1,asample) -> phi2
# Note: no backprop to asample of policy: it is treated as fixed for predictor training
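For completeness, a sketch of the forward-model half that the comment describes, continuing the snippet's style (layer sizes and loss scaling are my assumptions, not the repo's exact code; phi1/phi2 are assumed to already be the encoded features, the conv head being elided above):

phisize = phi1.get_shape()[-1].value                    # feature length coming out of the head
f = tf.concat(1, [phi1, asample])                       # condition on current features + one-hot action
f = tf.nn.relu(linear(f, size, "f1", normalized_columns_initializer(0.01)))
f = linear(f, phisize, "flast", normalized_columns_initializer(0.01))
self.forwardloss = 0.5 * tf.reduce_mean(tf.square(f - phi2), name="forwardloss")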
Variable optimization for the global network vs. the per-worker local networks:
Part of the A3C class: initialization, the loss, and the training op.
with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
with tf.variable_scope("global"):
self.network = LSTMPolicy(env.observation_space.shape, numaction, designHead)
self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.ap_network = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.ap_network = StateActionPredictor(env.observation_space.shape, numaction, designHead)
with tf.device(worker_device):
with tf.variable_scope("local"):
self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
with tf.variable_scope("predictor"):
if 'state' in unsupType:
self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)
# Computing a3c loss: https://arxiv.org/abs/1506.02438
self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
self.adv = tf.placeholder(tf.float32, [None], name="adv")
self.r = tf.placeholder(tf.float32, [None], name="r")
log_prob_tf = tf.nn.log_softmax(pi.logits)
prob_tf = tf.nn.softmax(pi.logits)
# 1) the "policy gradients" loss: its derivative is precisely the policy gradient
# notice that self.ac is a placeholder that is provided externally.
# adv will contain the advantages, as calculated in process_rollout
pi_loss = - tf.reduce_mean(tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) # Eq (19)
# 2) loss of value function: l2_loss = (x-y)^2/2
vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r)) # Eq (28)
# 3) entropy to ensure randomness
entropy = - tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
# final a3c loss: lr of critic is half of actor
self.loss = pi_loss + 0.5 * vf_loss - entropy * constants['ENTROPY_BETA']
# compute gradients
grads = tf.gradients(self.loss * 20.0, pi.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.
# computing predictor loss
if self.unsup:
if 'state' in unsupType:
self.predloss = constants['PREDICTION_LR_SCALE'] * predictor.forwardloss
else:
self.predloss = constants['PREDICTION_LR_SCALE'] * (predictor.invloss * (1-constants['FORWARD_LOSS_WT']) +
predictor.forwardloss * constants['FORWARD_LOSS_WT'])
predgrads = tf.gradients(self.predloss * 20.0, predictor.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.
# do not backprop to policy
if constants['POLICY_NO_BACKPROP_STEPS'] > 0:
grads = [tf.scalar_mul(tf.to_float(tf.greater(self.global_step, constants['POLICY_NO_BACKPROP_STEPS'])), grads_i)
for grads_i in grads]
self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)
# clip gradients
grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
grads_and_vars = list(zip(grads, self.network.var_list))
if self.unsup:
predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP'])
pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list))
grads_and_vars = grads_and_vars + pred_grads_and_vars
opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
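For reference, this is roughly what fills the adv and r placeholders: process_rollout turns each rollout into discounted returns for the value head and GAE advantages for the policy term. A numpy sketch (the gamma/lambda defaults are illustrative):

import numpy as np

def discount(x, gamma):
    out = np.zeros(len(x), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def returns_and_advantages(rewards, values, bootstrap_value, gamma=0.99, lam=1.0):
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(list(values) + [bootstrap_value], dtype=np.float32)
    batch_r = discount(np.append(rewards, bootstrap_value), gamma)[:-1]   # value-head targets
    deltas = rewards + gamma * values[1:] - values[:-1]                   # TD residuals
    batch_adv = discount(deltas, gamma * lam)                             # GAE, https://arxiv.org/abs/1506.02438
    return batch_r, batch_adv

r, adv = returns_and_advantages([0.0, 0.0, 1.0], [0.1, 0.2, 0.5], bootstrap_value=0.0)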
Where is curiosity actually reflected, given the separate branches for extrinsic and intrinsic reward? The noReward option is exposed on the command line; the corresponding code is in env_runner:
if noReward:
reward = 0.
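The net effect: with noReward the game score is stripped and, when a predictor is attached, the curiosity bonus is the only learning signal. A tiny illustrative helper (not repo code):

def training_reward(extrinsic, bonus, no_reward=False, use_curiosity=True):
    """What effectively drives the A3C update for one step."""
    ext = 0.0 if no_reward else extrinsic
    intr = bonus if use_curiosity else 0.0
    return ext + intr

print(training_reward(extrinsic=1.0, bonus=0.05, no_reward=True))   # -> 0.05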
Curiosity itself is the forward and inverse prediction of the interaction between actions and the environment.
(See the inverse/forward model code in StateActionPredictor above: the inverse model g(phi1, phi2) recovers the action taken, while the forward model f(phi1, asample) predicts the next features, and its prediction error is what becomes the exploration bonus.)
I haven't fully worked everything out; a few open questions remain:
Where is the 4-frame state stacking configured?
How exactly does the A3C optimization proceed?
How is the policy trained? Could a network be trained to know the outcome of each action and then compose the relevant action outputs on its own?
Next up: a small A3C codebase, and A2C.
Comments and corrections are welcome!