Curiosity-driven Exploration 好奇心代码阅读

2018 年 3 月 15 日 CreateAMind

好奇心项目主页:https://pathak22.github.io/noreward-rl/ (建议在电脑上结合源代码一起阅读)


train.py  封装了各种启动命令

def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash',
mode='tmux', visualise=False, envWrap=False, designHead=None,
unsup=None, noReward=False, noLifeReward=False, psPort=12222,
delay=0, savio=False, pretrain=None):
   # for launching the TF workers and for launching tensorboard
   py_cmd = 'python' if savio else sys.executable
base_cmd = [
'CUDA_VISIBLE_DEVICES=',
py_cmd, 'worker.py',  # 子进程入口


worker.py  是 TensorFlow 相关训练的封装
if args.job_name == "worker":
   server = tf.train.Server(cluster, job_name="worker", task_index=args.task,
config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
if args.delay > 0:
       print('Startup delay in worker: {}s'.format(args.delay))
time.sleep(args.delay)
print('.. wait over !')
run(args, server)


run 函数

def run(args, server):
   env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, envWrap=args.envWrap, designHead=args.designHead,
noLifeReward=args.noLifeReward)
trainer = A3C(env, args.task, args.visualise, args.unsup, args.envWrap, args.designHead, args.noReward)

……(此处省略部分代码)

sv = tf.train.Supervisor(is_chief=(args.task == 0),  # TensorFlow 外层框架
logdir=logdir,
saver=saver,
summary_op=None,
init_op=init_op,
init_fn=init_fn,
summary_writer=summary_writer,
ready_op=tf.report_uninitialized_variables(variables_to_save),
global_step=trainer.global_step,
save_model_secs=30,
save_summaries_secs=30)  # 可以方便地配置模型自动保存等功能

num_global_steps = constants['MAX_GLOBAL_STEPS']

logger.info(
"Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. " +
   "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
with sv.managed_session(server.target, config=config) as sess, sess.as_default():
   # Workaround for FailedPreconditionError
   # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
   sess.run(trainer.sync)

trainer.start(sess, summary_writer)  # 入口
global_step = sess.run(trainer.global_step)
logger.info("Starting training at gobal_step=%d", global_step)
while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
       trainer.process(sess)

A3c.py:


定义线程中的各个网络

with tf.device(worker_device):
   with tf.variable_scope("local"):
       self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
           with tf.variable_scope("predictor"):
               if 'state' in unsupType:
                   self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
                   self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)


def start(self, sess, summary_writer):
   self.runner.start_runner(sess, summary_writer)
self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)
class RunnerThread(threading.Thread):
   """
   One of the key distinctions between a normal environment and a universe environment
   is that a universe environment is _real time_.  This means that there should be a thread
   that would constantly interact with the environment and tell it what to do.  This thread is here.
   """
   def __init__(self, env, policy, num_local_steps, visualise, predictor, envWrap,
noReward):
       .......

   def start_runner(self, sess, summary_writer):
       self.sess = sess
       self.summary_writer = summary_writer
       self.start()

def run(self):
       with self.sess.as_default():
           self._run()

def _run(self):
       rollout_provider = env_runner(self.env, self.policy, self.num_local_steps,
self.summary_writer, self.visualise, self.predictor,
self.envWrap, self.noReward)
while True:
           self.queue.put(next(rollout_provider), timeout=600.0)
启动具体的游戏环境进行交互
def env_runner(env, policy, num_local_steps, summary_writer, render, predictor,
envWrap, noReward):
for _ in range(num_local_steps):
   # run policy
   fetched = policy.act(last_state, *last_features)
action, value_, features = fetched[0], fetched[1], fetched[2:]

# run environment: get action_index from sampled one-hot 'action'
   stepAct = action.argmax()
state, reward, terminal, info = env.step(stepAct)
curr_tuple = [last_state, action, reward, value_, terminal, last_features]
if predictor is not None:
   bonus = predictor.pred_bonus(last_state, state, action)
curr_tuple += [bonus, state]
life_bonus += bonus
ep_bonus += bonus


网络定义文件 model.py 

class LSTMPolicy(object):
   

class StateActionPredictor(object):
   def __init__(self, ob_space, ac_space, designHead='universe'):
       # input: s1,s2: : [None, h, w, ch] (usually ch=1 or 4)
       # asample: 1-hot encoding of sampled action from policy: [None, ac_space]
       input_shape = [None] + list(ob_space)
self.s1 = phi1 = tf.placeholder(tf.float32, input_shape)
self.s2 = phi2 = tf.placeholder(tf.float32, input_shape)
self.asample = asample = tf.placeholder(tf.float32, [None, ac_space])


# inverse model: g(phi1,phi2) -> a_inv: [None, ac_space]
g = tf.concat(1,[phi1, phi2])
g = tf.nn.relu(linear(g, size, "g1", normalized_columns_initializer(0.01)))
aindex = tf.argmax(asample, axis=1) # aindex: [batch_size,]
logits = linear(g, ac_space, "glast", normalized_columns_initializer(0.01))
self.invloss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, aindex), name="invloss")
self.ainvprobs = tf.nn.softmax(logits, dim=-1)

# forward model: f(phi1,asample) -> phi2
# Note: no backprop to asample of policy: it is treated as fixed for predictor training


全局网络跟子网络的变量优化:

A3C 类的部分初始化,以及 loss 计算、训练等

with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
   with tf.variable_scope("global"):
       self.network = LSTMPolicy(env.observation_space.shape, numaction, designHead)
self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
if self.unsup:
           with tf.variable_scope("predictor"):
               if 'state' in unsupType:
                   self.ap_network = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
                   self.ap_network = StateActionPredictor(env.observation_space.shape, numaction, designHead)

with tf.device(worker_device):
   with tf.variable_scope("local"):
       self.local_network = pi = LSTMPolicy(env.observation_space.shape, numaction, designHead)
pi.global_step = self.global_step
if self.unsup:
           with tf.variable_scope("predictor"):
               if 'state' in unsupType:
                   self.local_ap_network = predictor = StatePredictor(env.observation_space.shape, numaction, designHead, unsupType)
else:
                   self.local_ap_network = predictor = StateActionPredictor(env.observation_space.shape, numaction, designHead)

# Computing a3c loss: https://arxiv.org/abs/1506.02438
   self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
self.adv = tf.placeholder(tf.float32, [None], name="adv")
self.r = tf.placeholder(tf.float32, [None], name="r")
log_prob_tf = tf.nn.log_softmax(pi.logits)
prob_tf = tf.nn.softmax(pi.logits)
# 1) the "policy gradients" loss:  its derivative is precisely the policy gradient
   # notice that self.ac is a placeholder that is provided externally.
   # adv will contain the advantages, as calculated in process_rollout
   pi_loss = - tf.reduce_mean(tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) # Eq (19)
   # 2) loss of value function: l2_loss = (x-y)^2/2
   vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r)) # Eq (28)
   # 3) entropy to ensure randomness
   entropy = - tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
# final a3c loss: lr of critic is half of actor
   self.loss = pi_loss + 0.5 * vf_loss - entropy * constants['ENTROPY_BETA']

# compute gradients
   grads = tf.gradients(self.loss * 20.0, pi.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.

   # computing predictor loss
   if self.unsup:
       if 'state' in unsupType:
           self.predloss = constants['PREDICTION_LR_SCALE'] * predictor.forwardloss
else:
           self.predloss = constants['PREDICTION_LR_SCALE'] * (predictor.invloss * (1-constants['FORWARD_LOSS_WT']) +
                                                           predictor.forwardloss * constants['FORWARD_LOSS_WT'])
predgrads = tf.gradients(self.predloss * 20.0, predictor.var_list) # batchsize=20. Factored out to make hyperparams not depend on it.

       # do not backprop to policy
       if constants['POLICY_NO_BACKPROP_STEPS'] > 0:
           grads = [tf.scalar_mul(tf.to_float(tf.greater(self.global_step, constants['POLICY_NO_BACKPROP_STEPS'])), grads_i)
for grads_i in grads]


self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise,
predictor, envWrap, noReward)


# clip gradients
   grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
grads_and_vars = list(zip(grads, self.network.var_list))
if self.unsup:
       predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP'])
pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list))
grads_and_vars = grads_and_vars + pred_grads_and_vars
opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)


内部 reward 与外部 reward 的不同分支。好奇心体现在哪里?命令行参数中有对应选项,相关代码如下:

env_runner:

if noReward:
   reward = 0.


好奇心模块:对动作及环境交互的逆向(inverse)与前向(forward)预测。

# inverse model: g(phi1,phi2) -> a_inv: [None, ac_space]
g = tf.concat(1,[phi1, phi2])
g = tf.nn.relu(linear(g, size, "g1", normalized_columns_initializer(0.01)))
aindex = tf.argmax(asample, axis=1) # aindex: [batch_size,]
logits = linear(g, ac_space, "glast", normalized_columns_initializer(0.01))
self.invloss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, aindex), name="invloss")
self.ainvprobs = tf.nn.softmax(logits, dim=-1)

# forward model: f(phi1,asample) -> phi2
# Note: no backprop to asample of policy: it is treated as fixed for predictor training



没有完全理清楚,还有部分问题:

state 的 4 步图像是在哪里设置的?

A3C 的优化?

policy 是如何训练的?能否训练社交网络知道每个动作的结果,然后自己组织相关动作的输出?

small a3c code, a2c.





欢迎大家批评指教!

登录查看更多
4

相关内容

【Google】利用AUTOML实现加速感知神经网络设计
专知会员服务
30+阅读 · 2020年3月5日
深度强化学习策略梯度教程,53页ppt
专知会员服务
182+阅读 · 2020年2月1日
强化学习最新教程,17页pdf
专知会员服务
177+阅读 · 2019年10月11日
机器学习入门的经验与建议
专知会员服务
94+阅读 · 2019年10月10日
机器学习相关资源(框架、库、软件)大列表
专知会员服务
40+阅读 · 2019年10月9日
MIT新书《强化学习与最优控制》
专知会员服务
277+阅读 · 2019年10月9日
通过Docker安装谷歌足球游戏环境
CreateAMind
11+阅读 · 2019年7月7日
谷歌足球游戏环境使用介绍
CreateAMind
33+阅读 · 2019年6月27日
已删除
将门创投
7+阅读 · 2019年3月28日
动物脑的好奇心和强化学习的好奇心
CreateAMind
10+阅读 · 2019年1月26日
RL 真经
CreateAMind
5+阅读 · 2018年12月28日
已删除
生物探索
3+阅读 · 2018年2月10日
强化学习 cartpole_a3c
CreateAMind
9+阅读 · 2017年7月21日
Arxiv
7+阅读 · 2018年12月26日
Large-Scale Study of Curiosity-Driven Learning
Arxiv
8+阅读 · 2018年8月13日
Arxiv
6+阅读 · 2018年2月7日
VIP会员
相关VIP内容
【Google】利用AUTOML实现加速感知神经网络设计
专知会员服务
30+阅读 · 2020年3月5日
深度强化学习策略梯度教程,53页ppt
专知会员服务
182+阅读 · 2020年2月1日
强化学习最新教程,17页pdf
专知会员服务
177+阅读 · 2019年10月11日
机器学习入门的经验与建议
专知会员服务
94+阅读 · 2019年10月10日
机器学习相关资源(框架、库、软件)大列表
专知会员服务
40+阅读 · 2019年10月9日
MIT新书《强化学习与最优控制》
专知会员服务
277+阅读 · 2019年10月9日
相关资讯
通过Docker安装谷歌足球游戏环境
CreateAMind
11+阅读 · 2019年7月7日
谷歌足球游戏环境使用介绍
CreateAMind
33+阅读 · 2019年6月27日
已删除
将门创投
7+阅读 · 2019年3月28日
动物脑的好奇心和强化学习的好奇心
CreateAMind
10+阅读 · 2019年1月26日
RL 真经
CreateAMind
5+阅读 · 2018年12月28日
已删除
生物探索
3+阅读 · 2018年2月10日
强化学习 cartpole_a3c
CreateAMind
9+阅读 · 2017年7月21日
Top
微信扫码咨询专知VIP会员