Hi,
i try to create a simplified rl4j example based on the existing Gym and Malmo examples. Given is a sine wave and the AI should say if we are on top of the wave, on bottom or somewhere else(noop).
The SineRider is the “Game”, State is the value of the sine function(Just one double)
The problem is it never calls the step function in SineRider to get a reward. What do i wrong?
I also updated to beta7(still some deprecated stuff)
Code(Kotlin):
package aiexample
import org.deeplearning4j.gym.StepReply
import org.deeplearning4j.rl4j.learning.sync.qlearning.QLearning.QLConfiguration
import org.deeplearning4j.rl4j.learning.sync.qlearning.discrete.QLearningDiscreteDense
import org.deeplearning4j.rl4j.mdp.MDP
import org.deeplearning4j.rl4j.network.dqn.DQNFactoryStdDense
import org.deeplearning4j.rl4j.space.DiscreteSpace
import org.deeplearning4j.rl4j.space.Encodable
import org.deeplearning4j.rl4j.space.ObservationSpace
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.learning.config.RmsProp
import kotlin.math.sin
object Example {
var ql = QLConfiguration.builder()
.seed(123) //Random seed (for reproducability)
.maxEpochStep(200) // Max step By epoch
.maxStep(15000) // Max step
.expRepMaxSize(150000) // Max size of experience replay
.batchSize(128) // size of batches
.targetDqnUpdateFreq(500) // target update (hard)
.updateStart(10) // num step noop warmup
.rewardFactor(0.01) // reward scaling
.gamma(0.99) // gamma
.errorClamp(1.0) // /td-error clipping
.minEpsilon(0.1f) // min epsilon
.epsilonNbStep(1000) // num step for eps greedy anneal
.doubleDQN(true) // double DQN
.build()
// The neural network used by the agent. Note that there is no need to specify the number of inputs/outputs.
// These will be read from the gym environment at the start of training.
var net = DQNFactoryStdDense.Configuration.builder()
.l2(0.0)
.updater(RmsProp(0.000025))
.numHiddenNodes(300)
.numLayer(2)
.build()
@JvmStatic
fun main(args: Array<String>) {
simpleSine()
}
private fun simpleSine() {
val mdp = Env.create()
val dql = QLearningDiscreteDense(mdp, net, ql)
dql.train()
mdp.close()
//return dql.getPolicy(); //return the trained agent.
}
}
class Action(val name:String) {
companion object {
val noop = Action("noop")
val top = Action("top")
val bottom = Action("bottom")
}
}
class State(private val inputs: DoubleArray): Encodable {
override fun toArray(): DoubleArray {
return inputs
}
override fun getData(): INDArray {
return Nd4j.create(inputs)
}
override fun dup(): Encodable {
TODO("Not yet implemented")
}
override fun isSkipped(): Boolean {
TODO("Not yet implemented")
}
}
class SineObservationSpace: ObservationSpace<State> {
override fun getLow(): INDArray {
return Nd4j.create(doubleArrayOf(-1.0))
}
override fun getHigh(): INDArray {
return Nd4j.create(doubleArrayOf(1.0))
}
override fun getName(): String {
return "Discrete"
}
override fun getShape(): IntArray {
return intArrayOf(1)
}
}
class SineRider{
companion object {
val actions = mapOf(
0 to Action.noop,
1 to Action.top,
2 to Action.bottom)
}
var i = 0.0
fun step(action:Int): Double{
val act = actions[action]
if(act == Action.top){
return if(i > 0.9) 1.0 else -1.0
}
if(act == Action.bottom){
return if(i < -0.9) 1.0 else -1.0
}
if(act == Action.noop){
return if(i < 0.9 && i > -0.9) 1.0 else -1.0
}
return 0.0
}
fun reset(){
}
fun next(){
i += 0.1
}
fun state(): State {
val sine = sin(i)
next()
return State(arrayOf(sine).toDoubleArray())
}
}
class Env(private val sineRider: SineRider) : MDP<State, Int, DiscreteSpace> {
private val actionSpace = DiscreteSpace(3)
private var done = false
override fun getObservationSpace(): ObservationSpace<State> {
return SineObservationSpace()
}
override fun getActionSpace(): DiscreteSpace {
return actionSpace
}
override fun step(action: Int): StepReply<State> {
val reward = sineRider.step(action)
val state = sineRider.state()
return StepReply(state, reward, true, null)
}
override fun isDone(): Boolean {
return true
}
override fun reset(): State? {
done = false
sineRider.reset()
return sineRider.state()
}
override fun close() {
}
override fun newInstance(): Env {
return create()
}
companion object {
fun create() : Env {
val sinRider = SineRider()
return Env(sinRider)
}
}
}