# DEMO #2 OF Q LEARNING.
#
# In this demo, the state used isn't really the full state of the world.
# We see if something reasonable gets learned anyway...

source("Qlearn.r")

# Length of the simulation run, in time steps.
n.steps <- 1e6

# Learning parameters handed to simulate() further down.  Presumably the
# discount factor, the learning rate, and the exploration probability, in
# that order - see Qlearn.r to confirm.
gamma   <- 0.95
alpha   <- 0.015
epsilon <- 0.1

# Size of the visible part of the problem.
n.states  <- 10   # Positions arranged in a circle
n.actions <- 3    # 1 = move to lower, 2 = stay in place, 3 = move to higher


# FUNCTION TO GENERATE INITIAL STATE.  There is a hidden part of the
# state, in the global variable "marks".

init2 <- function ()
{
  # Reset the hidden part of the state: one mark per position, all cleared.
  # This is written to the global variable "marks", which world2 reads and
  # updates afterwards.
  marks <<- numeric(n.states)

  # The visible initial state is a position drawn uniformly from 1..n.states.
  sample.int(n.states, 1)
}


# FUNCTION TO GENERATE REWARDS AND TRANSITION TO NEXT STATE.  In the hidden
# part of the state, some marks are set to 1, producing future rewards.

world2 <- function (s, a)
{
  # Arguments:  s = current position (1..n.states)
  #             a = action (1 = lower, 2 = stay, 3 = higher)
  # Value:      list with elements s (the new position) and r (the reward).
  # Side effect: the hidden global "marks" is updated.

  # With probability 0.05 the action is ignored and the new position is
  # drawn uniformly at random.  Otherwise the position shifts by a-2, with
  # wrap-around at both ends of the circle.

  jump <- runif(1) < 0.05

  if (jump) {
    new.s <- sample(n.states, 1)
  } else {
    new.s <- s + (a - 2)
    if (new.s < 1) {
      new.s <- n.states
    } else if (new.s > n.states) {
      new.s <- 1
    }
  }

  # Reward is the mark found at the new position, less a penalty of 10
  # whenever the new position is position 1.

  reward <- marks[new.s] - 10 * (new.s == 1)

  # Marks that are already set stay set; each position independently gets
  # marked with probability 0.3.  The mark at the position just entered is
  # then cleared.

  appeared <- runif(n.states) < 0.3
  new.marks <- as.numeric(marks > 0 | appeared)
  new.marks[new.s] <- 0
  marks <<- new.marks

  list (s = new.s, r = reward)
}


# DO SIMULATION AND DISPLAY RESULTS.

# Fix the random seed so that the run can be reproduced.

set.seed(1)

# Run the learner.  simulate and hplot come from Qlearn.r.  The result is
# used below through its "history" component (indexed by time step, with a
# column named "r") and its "Q" component (presumably one row per state and
# one column per action - confirm against Qlearn.r).

result2 <- simulate (init2, world2, gamma, alpha, epsilon, n.steps)

# Write a 3x3 grid of plots to Qplots2.ps, filled column by column: the
# first 100 steps, every 1000th step of the whole run, and the last 100
# steps.

postscript("Qplots2.ps",horizontal=FALSE,width=6.5,height=4,pointsize=9)
par(mfcol=c(3,3),mar=c(4.1,4.1,1,1))
hplot(result2$history[1:100,])
hplot(result2$history[seq(1000,n.steps,by=1000),])
hplot(result2$history[(n.steps-99):n.steps,])
dev.off()

cat("Q matrix and best actions\n")

# For each row of Q, take the column holding the largest value.  Using the
# last element of order(x) means ties go to the highest-numbered action,
# and the number of actions is not hard-coded.

action <- apply(result2$Q,1,function(x)order(x)[length(x)])
print (cbind (round(result2$Q,2), action))

cat("Average reward:",mean(result2$history[,"r"]),"\n")