# DEMO #2 OF Q LEARNING.
#
# In this demo, the state used isn't really the full state of the world.
# We see if something reasonable gets learned anyway, with the help of
# one bit of "memory" that can be stored in the state...

source("Qlearn.r")

n.steps = 1000000
gamma = 0.95
alpha = 0.015
epsilon = 0.1

n.states = 10*2   # Positions arranged in a circle, plus one bit of memory
n.actions = 3*2   # 1 = move to lower, 2 = stay in place, 3 = move to higher
                  # 4, 5, 6 similar but also toggle bit


# FUNCTION TO GENERATE INITIAL STATE.  There is a hidden part of the
# state, in the global variable "marks".

init2m = function ()
{
  marks <<- rep(0,n.states)
  sample(n.states,1)
}


# FUNCTION TO GENERATE REWARDS AND TRANSITION TO NEXT STATE.  In the hidden
# part of the state, some marks are set to 1, producing future rewards.
# The action taken can toggle the bit of memory stored in the state.

world2m = function (s, a)
{
  # Split the state into the position on the circle and the memory bit.
  bit = as.numeric(s>n.states/2)
  s = s - bit*n.states/2

  if (runif(1)<0.05)          # with probability 0.05, teleport at random
  { s = sample(n.states/2,1)
  }
  else                        # otherwise apply the action, toggling the
  { if (a>3)                  # memory bit first if the action is 4, 5, or 6
    { bit = 1-bit
      a = a-3
    }
    s = s + (a-2)
    if (s<1) s = n.states/2   # wrap around the circle
    if (s>n.states/2) s = 1
  }

  # Reward is the mark (if any) at the new position, with a penalty of 10
  # for being at position 1.
  r = marks[s] - 10*as.numeric(s==1)

  # New marks appear at random; the mark at the current position is consumed.
  marks <<- as.numeric (marks>0 | (runif(n.states)<0.3))
  marks[s] <<- 0

  s = s + bit*n.states/2      # re-attach the memory bit to the state

  list (s=s, r=r)
}


# PLOT INFORMATION FROM HISTORY.  Displays memory and toggling actions
# in colour (blue = bit clear / no toggle, red = bit set / toggle).

hplotm = function (history)
{
  bits = as.numeric(history[,"s"]>n.states/2)
  places = history[,"s"] - bits*n.states/2
  times = history[,"t"]
  actions = history[,"a"]
  act.toggle = as.numeric(actions>3)
  act.move = actions - 3*act.toggle

  plot (times,places,pch=20,xlab="time step",ylab="state",
        col=c("blue","red")[bits+1])
  plot (times,act.move,pch=20,xlab="time step",ylab="action",
        col=c("blue","red")[act.toggle+1])
  plot (history[,"t"],history[,"r"],pch=20,
        xlab="time step",ylab="reward / smoothed reward")
  lines (history[,"t"],history[,"rs"], col="gray")
}


# DO SIMULATION AND DISPLAY RESULTS.

set.seed(1)

result2m = simulate (init2m, world2m, gamma, alpha, epsilon, n.steps)

postscript("Qplots2m.ps",horiz=F,width=6.5,height=4,pointsize=9)
par(mfcol=c(3,3),mar=c(4.1,4.1,1,1))
hplotm(result2m$history[1:200,])
hplotm(result2m$history[seq(500,n.steps,by=500),])
hplotm(result2m$history[(n.steps-200):n.steps,])
dev.off()

cat("Q matrix and best actions\n")
action = apply(result2m$Q,1,function(x)order(x)[6])  # index of largest Q value
act.toggle = as.numeric(action>3)
act.move = action - 3*act.toggle
print (cbind (round(result2m$Q,2), act.move, act.toggle))

cat("Average reward:",mean(result2m$history[,"r"]),"\n")
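

# NOTE ON "Qlearn.r": the sourced file is not shown here.  The sketch below
# is only a guess at the interface this demo assumes: a tabular Q-learner
# with epsilon-greedy exploration, returning $Q and a $history matrix with
# columns "t", "s", "a", "r", and "rs" (smoothed reward).  The function name,
# the exact Q-learning update, and the smoothing factor of 0.01 are all
# assumptions, not the actual contents of Qlearn.r.

simulate.sketch = function (init, world, gamma, alpha, epsilon, n.steps)
{
  Q = matrix (0, n.states, n.actions)
  history = matrix (NA, n.steps, 5,
                    dimnames=list(NULL,c("t","s","a","r","rs")))

  s = init()
  rs = 0                      # exponentially smoothed reward (assumed form)

  for (t in 1:n.steps)
  {
    # Epsilon-greedy choice of action.
    a = if (runif(1)<epsilon) sample(n.actions,1) else which.max(Q[s,])

    # Get the reward and next state from the world.
    w = world(s,a)

    # Standard Q-learning update.
    Q[s,a] = Q[s,a] + alpha * (w$r + gamma*max(Q[w$s,]) - Q[s,a])

    rs = 0.99*rs + 0.01*w$r
    history[t,] = c(t, s, a, w$r, rs)

    s = w$s
  }

  list (Q=Q, history=history)
}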