// jam/test/test-rl1.js
//
// 135 lines
// 3.6 KiB
// JavaScript

// Maze of Torment World
// Temporal Difference Learning (TD)
// Grid dimensions and the flat index of the start cell (row 0, column 0).
var height = 7;
var width = 7;
var start = 0;

// Cell codes used in the maze layout below.
var f = 0;   // free place
var s = 1;   // start
var d = 2;   // destination
var w = -1;  // wall

// Maze layout: one inner array per row, top row first.
var maze = [
  [s, f, w, d, w, f, f],
  [f, f, w, f, w, f, f],
  [f, f, w, f, f, f, f],
  [f, f, w, w, w, f, f],
  [f, f, f, f, f, f, f],
  [f, f, f, f, w, w, w],
  [f, w, f, f, f, f, f]
];

// Row-major flattening of the maze: the rows are appended one after another,
// so the cell code of row y, column x sits at index y * width + x.
var states = maze.reduce(function (flat, row) {
  return flat.concat(row);
}, []);
// Trace of the current episode. After reset() it has the same shape as the
// maze: 1 marks the start cell, 'w' marks walls, 0 marks every other cell.
var way = [];

/**
 * Begins a fresh episode trace and zeroes the step counter.
 *
 * @param {boolean} [pr] when truthy, the previous trace is printed first,
 *                       one maze row per line
 */
function reset(pr) {
  if (pr) {
    print(way.join('\n'));
  }
  way = maze.map(function (row) {
    return row.map(function (cell) {
      if (cell == s) { return 1; }
      if (cell == w) { return 'w'; }
      return 0;
    });
  });
  env.steps = 0;
}
// Action names: the values env.allowedActions hands out and env.nextState understands.
var actions = ['left', 'right', 'up', 'down'];

// Environment object passed to the learner.
var env = {};
env.steps = 0;      // moves made since the last reset()
env.iteration = 0;  // episodes ended so far (by a wall hit or by reaching the destination)

// required by learner
env.getNumStates = function () { return height * width; };
env.getMaxNumActions = function () { return actions.length; };

/**
 * Advances the agent by one move and returns the index of the state it ends up in.
 *
 * - On a free or start cell the action moves one cell; the target cell is marked
 *   in `way` and env.steps is incremented. A move that would leave the grid, or
 *   an unknown action name, leaves the agent where it is (the step still counts)
 *   instead of wrapping into the neighbouring row or indexing outside `way`.
 * - On a wall cell the episode is restarted via reset(false) and the agent is
 *   sent back to `start`.
 * - On the destination cell the result is printed, reset(true) dumps the trace,
 *   and the agent is sent back to `start`.
 *
 * @param {number} state  flat cell index (y * width + x)
 * @param {string} action one of 'left', 'right', 'up', 'down'
 * @returns {number|undefined} index of the next state; undefined when the cell
 *          code of `state` is none of f, s, w, d
 */
env.nextState = function (state, action) {
  var x = env.stox(state);
  var y = env.stoy(state);
  var nx = x;
  var ny = y;
  var nextstate;
  switch (states[state]) {
    case f:
    case s:
      // free place to move around
      switch (action) {
        case 'left':  nx = x - 1; break;
        case 'right': nx = x + 1; break;
        case 'up':    ny = y - 1; break;
        case 'down':  ny = y + 1; break;
      }
      // A move across the border is treated as bumping into it: stay put.
      if (nx < 0 || nx >= width || ny < 0 || ny >= height) {
        nx = x;
        ny = y;
      }
      nextstate = ny * width + nx;
      way[ny][nx] = 1;
      env.steps++;
      break;
    case w:
      // The agent stepped onto a wall on its previous move: restart at the start cell.
      nextstate = start;
      reset(false);
      env.iteration++;
      break;
    case d:
      // agent wins! teleport to start
      print('[' + env.iteration + '] Found destination !!!!!!! steps=' + env.steps);
      reset(true);
      nextstate = start;
      env.iteration++;
      break;
  }
  return nextstate;
};

/**
 * Reward associated with the cell `state`; only `state` is inspected.
 *
 * - destination: 1.0 - steps/100, so shorter paths earn more. The value is
 *   clamped at 0 so that a long episode never makes reaching the destination
 *   look worse than an ordinary move or a wall hit.
 * - wall: -1
 * - anything else: 0
 *
 * @param {number} state       flat cell index
 * @param {*}      [action]    unused
 * @param {*}      [nextstate] unused
 * @returns {number}
 */
env.reward = function (state, action, nextstate) {
  switch (states[state]) {
    case d: return Math.max(0, 1.0 - (env.steps / 100));
    case w: return -1;
    default: return 0;
  }
};

/**
 * Lists the moves that keep the agent inside the grid. Walls are not
 * filtered out here; entering one is penalised by env.reward.
 *
 * @param {number} state flat cell index
 * @returns {string[]} subset of 'left', 'up', 'right', 'down', in that order
 */
env.allowedActions = function (state) {
  var x = env.stox(state), y = env.stoy(state);
  // Local list deliberately not called `actions`, to avoid shadowing the global.
  var allowed = [];
  if (x > 0) allowed.push('left');
  if (y > 0) allowed.push('up');
  if (x < width - 1) allowed.push('right');
  if (y < height - 1) allowed.push('down');
  return allowed;
};

// utils: convert a flat state index to its column (x) and row (y)
env.stox = function (state) { return state % width; };
env.stoy = function (state) { return Math.floor(state / width); };
// Build the initial trace and zero the step counter before learning starts.
reset();

// Create the temporal-difference (TD) agent for the maze environment.
var model = ml.learn({
  algorithm: ml.ML.RL,
  kind: ml.ML.TDAgent,
  actions: actions,
  // Agent specification
  alpha: 0.1,                   // learning rate of the value function
  beta: 0.2,                    // learning rate used for smooth policy updates
  epsilon: 0.2,                 // initial epsilon of the epsilon-greedy policy, in [0, 1)
  gamma: 0.5,                   // discount factor, in [0, 1)
  lambda: 0,                    // eligibility trace decay, in [0, 1); 0 disables eligibility traces
  planN: 5,                     // planning steps per iteration; 0 disables planning
  replacing_traces: true,
  smooth_policy_update: false,
  update: 'qlearn',             // either 'qlearn' or 'sarsa'
  environment: env
});

// Show the freshly created model and the size of its JSON form.
print(model);
print(String(toJSON(model).length) + ' Bytes');
// The agent begins in the start cell (index 0, the upper left corner).
var state = start;

// Learning loop: one agent/environment interaction per timer tick (1 ms interval).
var timer = setInterval(function learnStep() {
  // Ask the agent which action to take in the current state (an integer index).
  var chosen = ml.action(model, state);
  // Execute the action in the environment to get the state that is entered.
  var entered = env.nextState(state, chosen);
  // Feed back the reward of the entered state, reduced by a constant 0.01 on every tick.
  var gain = env.reward(entered) - 0.01;
  ml.update(model, gain);
  state = entered;
}, 1);