# DEMO #2 OF Q LEARNING.
#
# In this demo, the state used isn't really the full state of the world.
# We see if something reasonable gets learned anyway...
#
# The functions "simulate" and "hplot" used below are not defined in this
# file; presumably they come from Qlearn.r - confirm there.

source("Qlearn.r")


# PARAMETERS OF THE SIMULATION AND OF THE WORLD.

n.steps = 1000000
gamma = 0.95
alpha = 0.015
epsilon = 0.1

n.states = 10    # Positions arranged in a circle
n.actions = 3    # 1 = move to lower, 2 = stay in place, 3 = move to higher


# FUNCTION TO GENERATE INITIAL STATE.  There is a hidden part of the
# state, in the global variable "marks", which is reset here to all zeros.
# The value returned is the visible state, a position drawn uniformly
# from 1..n.states.

init2 = function ()
{
  marks <<- rep(0,n.states)

  sample(n.states,1)
}


# FUNCTION TO GENERATE REWARDS AND TRANSITION TO NEXT STATE.  In the hidden
# part of the state, some marks are set to 1, producing future rewards.
#
# Arguments:  s = current position (1..n.states)
#             a = action (1 = move to lower, 2 = stay, 3 = move to higher)
#
# Value:      list with elements s (the new position) and r (the reward)
#
# With probability 0.05 the action is ignored and the new position is drawn
# uniformly at random; otherwise the position changes by a-2, wrapping
# around the circle.  The reward is the mark at the new position, less a
# penalty of 10 if the new position is 1.  Afterwards, every mark that is
# currently 0 becomes 1 with probability 0.3 (marks already set stay set),
# and the mark at the new position is cleared.
#
# The order of the random number calls (runif, then possibly sample, then
# runif again) is significant for reproducing results with a fixed seed.

world2 = function (s, a)
{
  if (runif(1)<0.05)
  { s = sample(n.states,1)
  }
  else
  { s = s + (a-2)
    if (s<1) s = n.states
    if (s>n.states) s = 1
  }

  # Reward is computed from the marks as they stood BEFORE this step's update.

  r = marks[s] - 10*as.numeric(s==1)

  marks <<- as.numeric (marks>0 | (runif(n.states)<0.3))
  marks[s] <<- 0

  list (s=s, r=r)
}


# DO SIMULATION AND DISPLAY RESULTS.

set.seed(1)

result2 = simulate (init2, world2, gamma, alpha, epsilon, n.steps)

# Plots of the history go to a PostScript file:  the first 100 steps, every
# 1000th step over the whole run, and the final steps of the run.
# NOTE(review): the last range, (n.steps-100):n.steps, selects 101 rows
# whereas the first plot uses 100 - confirm whether that is intended.

postscript("Qplots2.ps",horizontal=FALSE,width=6.5,height=4,pointsize=9)
par(mfcol=c(3,3),mar=c(4.1,4.1,1,1))

hplot(result2$history[1:100,])
hplot(result2$history[seq(1000,n.steps,by=1000),])
hplot(result2$history[(n.steps-100):n.steps,])

dev.off()

# Print the Q matrix along with the best action for each row.  The best
# action is the index of the largest entry in the row, found as the last
# element of order(x), so that it doesn't depend on the number of actions
# being 3 (when there are ties for largest, the highest index is chosen).

cat("Q matrix and best actions\n")
action = apply(result2$Q,1,function(x)order(x)[length(x)])
print (cbind (round(result2$Q,2), action))

cat("Average reward:",mean(result2$history[,"r"]),"\n")