Hello. I am trying to reproduce working TensorFlow code with SameDiff.
So, the TensorFlow code:
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None, ), dtype=tf.int32, name='act')
ret_ph = tf.placeholder(shape=(None, ), dtype=tf.float32, name='ret')
x = tf.layers.dense(obs_ph, units=32, activation=tf.tanh)
p_logits = tf.layers.dense(x, units=2, activation=None)
actions_mask = tf.one_hot(act_ph, depth=act_dim)
p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1)
p_loss = -tf.reduce_mean(p_log * ret_ph)
p_opt = tf.train.AdamOptimizer(lr).minimize(p_loss)
My SameDiff variant:
SameDiff sd = SameDiff.create();
var obsPh = sd.placeHolder("obs", DataType.FLOAT, 2);
var actPh = sd.placeHolder("act", DataType.INT32, 1);
var retPh = sd.placeHolder("ret", DataType.FLOAT, 1);
SDVariable w0 = sd.var("w0", new XavierInitScheme('c', 2, 8), DataType.FLOAT, 2, 8);
SDVariable b0 = sd.var("b0", 1, 8);
SDVariable out0 = sd.nn().tanh(obsPh.mmul(w0).add(b0));
SDVariable w1 = sd.var("w1", new XavierInitScheme('c', 8, 2), DataType.FLOAT, 8, 2);
SDVariable b1 = sd.zero("b1", 1, 2);
SDVariable pLogits = sd.nn().tanh(out0.mmul(w1).add(b1));
SDVariable actSoftmax = sd.nn().softmax(pLogits);
SDVariable actionMasks = sd.oneHot(actPh, 2);
SDVariable pLog = sd.sum(actionMasks.mmul(sd.nn.logSoftmax(pLogits)), 1);
SDVariable pLoss = sd.mean(pLog.mmul(retPh)).mul(-1);
sd.setLossVariables(pLoss);
Is there anything similar to “p_opt = tf.train.AdamOptimizer(lr).minimize(p_loss)” in SameDiff (given that this is for RL4J and there are no labels)?