Tue 27 Aug 00:14:56 CEST 2024
This commit is contained in:
parent
2bb30de385
commit
00ee12d73c
171
test/test-rl3.js
Normal file
171
test/test-rl3.js
Normal file
|
@ -0,0 +1,171 @@
|
|||
// Maze of Torment World
|
||||
// Deep-Q Learning (DQN)
|
||||
|
||||
// Cell codes used in the maze: 0: free place, 1: start, 2: destination, -1: wall
var f=0,s=1,d=2,w=-1;

// Maze layout, addressed as maze[y][x]; 'up' moves towards row 0.
var maze = [
  [s,f,w,d,w,f,f],
  [f,f,w,f,w,f,f],
  [f,f,w,f,f,f,f],
  [f,f,w,w,w,f,f],
  [f,f,f,f,f,f,f],
  [f,f,f,f,w,w,w],
  [f,w,f,f,f,f,f]
];

// World size is taken from the maze literal itself, so editing the maze can
// never leave stale dimensions behind. `start` (linear state index) and
// `dest` ({x,y}) are assigned when the maze is scanned.
var height=maze.length,width=maze[0].length,start,dest;
|
||||
|
||||
// world states
|
||||
// Linear copy of the maze: the cell at (x,y) is stored at index x+y*width.
var states = [];

// Walk the maze once, row by row, copying every cell into `states` and
// recording the start cell (as linear index) and the destination (as {x,y}).
(function () {
  var x, y, cell;
  for (y = 0; y < maze.length; y++) {
    for (x = 0; x < maze[y].length; x++) {
      cell = maze[y][x];
      states.push(cell);
      if (cell == s) {
        start = x + y * width;
      }
      if (cell == d) {
        dest = { x: x, y: y };
      }
    }
  }
})();
|
||||
|
||||
// Trace of the current episode: one entry per maze cell, 1 for the start
// cell and every visited cell, 'w' for walls, 0 otherwise.
var way = [];

/**
 * Begin a new episode: optionally print the trace of the finished one,
 * rebuild a fresh trace from the maze and clear the per-episode counters.
 *
 * @param pr  when truthy, the current trace is printed (one row per line)
 *            before it is discarded
 */
function reset (pr) {
  var fresh = [], line, x, y, cell;
  if (pr) {
    print(way.join('\n'));
  }
  for (y = 0; y < maze.length; y++) {
    line = [];
    for (x = 0; x < maze[y].length; x++) {
      cell = maze[y][x];
      if (cell == s) {
        line.push(1);
      } else if (cell == w) {
        line.push('w');
      } else {
        line.push(0);
      }
    }
    fresh.push(line);
  }
  way = fresh;
  // per-episode counters start over; the episode number advances
  env.steps = 0;
  env.good = 0;
  env.error = 0;
  env.iteration++;
}
|
||||
// Moves the agent can choose from; env.nextState dispatches on these names
// and env.getMaxNumActions reports how many there are.
var actions = [
  'left',
  'right',
  'up',
  'down'
];
|
||||
|
||||
// Agent sensor states (perception)
|
||||
// Distances {N,S,W,E} to boundaries and walls, plus the distance to the destination
|
||||
// Initial all-zero reading; its length is what env.getNumStates reports, and
// the learning loop overwrites it with env.perception() output on every step.
var sensors = [0, 0, 0, 0, 0];
|
||||
|
||||
// Environment object handed to the learner. It carries the episode
// counters and the two size queries the learner asks for; the maze
// mechanics (nextState, reward, perception, index helpers) are attached
// to it separately.
var env = {
  steps: 0,       // moves accepted in the current episode
  iteration: 0,   // episode number, advanced by reset()
  error: 0,       // rejected moves counted by the learning loop
  good: 0,        // accepted moves counted by the learning loop
  last: 0,

  // required by learner: size of the input (sensor) vector
  getNumStates: function () {
    return sensors.length;
  },

  // required by learner: number of selectable actions
  getMaxNumActions: function () {
    return actions.length;
  }
};
|
||||
|
||||
// internals
|
||||
/**
 * Try to move one cell from `state` in the direction named by `action`.
 *
 * @param state   linear index of the current cell
 * @param action  one of 'left', 'right', 'up', 'down'
 * @returns the linear index of the cell moved to, or -1 when the target
 *          lies outside the world or is a wall
 *
 * Side effects (accepted moves only): the target cell is marked in `way`
 * and env.steps is incremented.
 */
env.nextState = function(state,action) {
  var col = env.stox(state);
  var row = env.stoy(state);
  var nx, ny, target, outside;

  if (action === 'left') {
    nx = col - 1; ny = row;
  } else if (action === 'right') {
    nx = col + 1; ny = row;
  } else if (action === 'up') {
    nx = col; ny = row - 1;
  } else if (action === 'down') {
    nx = col; ny = row + 1;
  }

  target = env.xytos(nx, ny);
  outside = nx < 0 || ny < 0 || nx >= width || ny >= height;
  if (outside || states[target] == w) {
    // blocked: the caller keeps its current state
    return -1;
  }

  // free place to move around: record the visit and count the step
  way[ny][nx] = 1;
  env.steps++;
  return target;
};
|
||||
/**
 * Reward of being in `state`, taking `action`, and ending up in `nextstate`.
 *
 * @param state      linear index of the cell the agent is leaving
 * @param action     the action taken (not used in the computation)
 * @param nextstate  linear index returned by env.nextState, -1 if rejected
 * @returns -100 for a rejected move (wall or outside the world),
 *          -10 for stepping straight back onto the previously left cell,
 *          100-env.steps/10 when the destination is reached,
 *          otherwise +dist/10 when the move got closer to the destination
 *          and -dist/10 when it did not (dist = new distance to destination)
 *
 * Side effect: after an accepted move env.laststate holds the cell that
 * was just left, which is what the next call compares against.
 */
env.reward = function (state,action,nextstate) {
  // Euclidean distance from a linear state index to the destination cell
  function distanceToDest(st) {
    var dx = dest.x-env.stox(st),
        dy = dest.y-env.stoy(st);
    return Math.sqrt(dx*dx+dy*dy);
  }
  var reward, dist1, dist2;

  // A rejected move is judged first and leaves env.laststate alone, so
  // that repeated wall hits are all punished in full and never mistaken
  // for ping-pong.
  if (nextstate==-1) return -100; // wall hit or outside world

  dist1 = distanceToDest(nextstate);
  dist2 = distanceToDest(state);

  if (nextstate==env.laststate) reward = -10; // avoid ping-pong
  else if (dist1 < 1) reward = 100-env.steps/10; // destination found
  else reward = (dist1-dist2)<0?dist1/10:-dist1/10; // on the way

  // remember where we came from: returning there next is ping-pong
  env.laststate=state;
  return reward;
};
|
||||
|
||||
// Update sensors
|
||||
/**
 * Compute what the agent perceives in world state `state`.
 *
 * @param state  linear index of the agent's cell
 * @returns a new array [N, S, W, E, dist]: the number of free cells between
 *          the agent and the nearest wall or world border in each of the
 *          four directions, followed by the Euclidean distance to the
 *          destination
 *
 * All four directions are scanned the same way, up to and including the
 * border row/column, so a free border cell counts and a reading is
 * never negative.
 */
env.perception = function (state) {
  var x = env.stox(state),
      y = env.stoy(state),
      dx = dest.x-x,
      dy = dest.y-y,
      dist = Math.sqrt(dx*dx+dy*dy);

  // Count free cells starting next to (x,y) and stepping by (sx,sy)
  // until a wall or the edge of the world is met.
  function freeCells(sx,sy) {
    var n = 0, cx = x+sx, cy = y+sy;
    while (cx>=0 && cy>=0 && cx<width && cy<height &&
           states[env.xytos(cx,cy)]!=w) {
      n++;
      cx += sx;
      cy += sy;
    }
    return n;
  }

  // N S W E, then distance to destination
  return [freeCells(0,-1), freeCells(0,1), freeCells(-1,0), freeCells(1,0), dist];
};
|
||||
// utils
|
||||
/**
 * Column (x coordinate) of a linear state index.
 * @param s  linear state index
 * @returns s modulo the world width
 */
env.stox = function (s) {
  return s % width;
};
|
||||
/**
 * Row (y coordinate) of a linear state index.
 * @param s  linear state index
 * @returns s divided by the world width, rounded down
 */
env.stoy = function (s) {
  return Math.floor(s / width);
};
|
||||
/**
 * Linear state index of the cell at column x, row y.
 * @param x  column
 * @param y  row
 * @returns x + y * world width
 */
env.xytos = function (x,y) {
  return x + y * width;
};
|
||||
|
||||
// Build the initial trace and clear the episode counters before the agent
// exists (this also advances env.iteration to 1).
reset();

// create the DQN agent
var model = ml.learn({
  algorithm : ml.ML.RL,
  kind      : ml.ML.DQNAgent,
  actions   : actions,

  // specs
  update                       : 'qlearn', // qlearn | sarsa
  gamma                        : 0.9,      // discount factor, [0, 1)
  epsilon                      : 0.2,      // initial epsilon for epsilon-greedy policy, [0, 1)
  alpha                        : 0.005,    // value function learning rate
  experience_add_every         : 5,        // number of time steps before we add another experience to replay memory
  experience_size              : 10000,    // size of experience
  learning_steps_per_iteration : 5,
  tderror_clamp                : 1.0,      // for robustness
  num_hidden_units             : 100,      // number of neurons in hidden layer

  // the learner queries this object for getNumStates / getMaxNumActions
  environment : env
});

// Show the freshly created model and the size of its JSON form
print(model);
print(toJSON(model).length+' Bytes');
|
||||
|
||||
// Current world state of the agent; it begins on the start cell
// (upper left corner of the maze).
var state = start;

// Learning loop. On every run the agent perceives, picks an action, tries
// the move and is told the reward. When the destination is found the
// episode is reported and the agent jumps back to the start.
// NOTE(review): later(1, fn) presumably re-runs fn for as long as it
// returns true - confirm against the platform documentation.
later(1,function(task){
  // perceive the surroundings and let the agent choose
  sensors = env.perception(state);
  var action = ml.action(model,sensors); // input is the sensor vector

  // execute the action in the environment and get the reward
  var ns = env.nextState(state,action);
  var reward = env.reward(state,action,ns);

  var arrived = states[ns]==d;
  if (arrived) {
    // destination found: report, then restart the episode at the start cell
    print('iteration='+env.iteration,', reward='+reward,' action: steps='+env.good,'error='+env.error+' tderror='+
          model.tderror);
    ns = start;
    reset(true);
  }

  // bookkeeping: rejected versus accepted moves
  if (ns==-1) {
    env.error++;
  } else {
    env.good++;
  }

  // feed the reward back to the learner
  ml.update(model,reward);

  // a rejected move leaves the agent where it is
  if (ns!=-1) {
    state = ns;
  }

  // only the destination reward can exceed 10: store the model and stop
  if (reward > 10) {
    save('/tmp/rl.json',model);
    print('continue with test-rl4.js ...');
    kill(task);
  }
  return true;
});
|
||||
|
Loading…
Reference in New Issue
Block a user