I have attempted to implement Q-learning into a simple game I have written. The game is based around the player having to "jump" to avoid oncoming boxes.
I have designed the system with two actions, `jump` and `do_nothing`, and the states are the distances from the next block (divided and floored to ensure that there are not a large number of states).
My issue seems to be that my implementation of the algorithm isn't considering "future reward", and so it ends up jumping at the wrong times.
Here is my implementation of the Q-learning algorithm;
/**
 * Looks up the action-value row for a state.
 *
 * The first time a state is requested, a row holding 0 for every entry of
 * this.actions is created and stored in this.Q under that state.
 *
 * @param state Key identifying the state.
 * @returns {Object} The stored row (action name -> Q value); later calls
 *                   return the same object.
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var row = {};
    this.actions.forEach(function (name) {
        row[name] = 0;
    });
    table[state] = row;

    return row;
};
/**
 * Computes the state value: the distance from the player to the nearest
 * block at or ahead of the player, scaled by this.resolution and floored.
 *
 * @returns {number} The bucketed distance, never below 0. When no block is
 *                   ahead of the player the result is 0.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var blocks = this.blocks;
    var nearest = -1; // -1 marks "no block found yet"

    for (var i = 0; i < blocks.length; i++) {
        var gap = blocks[i].x - this.playerX;

        // Blocks behind the player are ignored.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    return Math.max(0, Math.floor(nearest * this.resolution));
};
/**
 * Picks the action with the larger Q value for the given state.
 *
 * When the two values are not strictly ordered (a tie), the result is
 * this.actions[1] if the player cannot jump, otherwise a uniformly random
 * entry of this.actions.
 *
 * @param distance State key passed to getQ.
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    // Tie: only gamble on a random action when jumping is possible.
    if (this.canJump()) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }

    return idleAction;
};
/**
 * Chooses the next action with an epsilon-greedy rule.
 *
 * With probability this.epsilon a uniformly random entry of this.actions is
 * returned; otherwise the highest-valued action for the current block
 * distance is used.
 *
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (this.canJump()) {
        var explore = Math.random() < this.epsilon;

        if (explore) {
            return this.actions[Math.floor(Math.random() * this.actions.length)];
        }

        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    // We can't jump while in mid-air, so doing nothing is the only option.
    return this.actions[1];
};
JumpGameAIClass.prototype.think = function think() {
var reward = this.liveReward;
if (this.score !== this.lastScore) {
this.lastScore = this.score;
reward = this.scoreReward;
} else if (!this.playerAlive) {
reward = this.deathReward;
}
this.drawDistance();
var distance = this.getBlockDistance(),
maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
previousQ = this.getQ(this.lastDistance)[this.lastAction];
this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
this.lastAction = this.getActionEpsilonGreedy();
this.lastDistance = distance;
switch (this.lastAction) {
case this.actions[0]:
this.jump();
break;
}
};
And here are some of the properties used by it:
epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0
I am having to use lastAction/lastDistance to calculate Q, as I cannot use the current data (would be acting on the action performed in the frame before).
The `think` method is called once every frame after all rendering and game stuff is done (physics, controls, death, etc).
/**
 * Q-learning player for the jump game.
 *
 * The state is the bucketed distance to the nearest block ahead of the
 * player; the two actions are 'jump' and 'do_nothing'.
 *
 * @param canvas Passed straight through to the Game.JumpGame constructor.
 */
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    // Each entry is [name, initial value, writable]. Tuning constants are
    // read-only; only the learned table and the "last frame" memory change.
    var fields = [
        ['epsilon', 0.05, false],         // probability of a random action
        ['alpha', 1, false],              // weight of each Q update
        ['gamma', 1, false],              // weight of the next state's value
        ['resolution', 0.1, false],       // distance multiplier before flooring
        ['actions', [ 'jump', 'do_nothing' ], false],
        ['Q', { }, true],
        ['liveReward', 0, false],
        ['scoreReward', 100, false],
        ['deathReward', -1000, false],
        ['lastAction', 'do_nothing', true],
        ['lastDistance', 0, true],
        ['lastScore', 0, true]
    ];

    for (var index = 0; index < fields.length; index++) {
        Object.defineProperty(this, fields[index][0], {
            value: fields[index][1],
            writable: fields[index][2]
        });
    }
};

// Everything not overridden below comes from the base game's prototype.
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

/**
 * Looks up the action-value row for a state, creating a row of zeros (one
 * per action) the first time the state is requested.
 *
 * @param state Key identifying the state.
 * @returns {Object} The stored row (action name -> Q value).
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var row = {};
    this.actions.forEach(function (name) {
        row[name] = 0;
    });
    table[state] = row;

    return row;
};

/**
 * Distance from the player to the nearest block at or ahead of the player,
 * scaled by this.resolution and floored.
 *
 * @returns {number} The bucketed distance, never below 0; 0 when no block
 *                   is ahead of the player.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var blocks = this.blocks;
    var nearest = -1; // -1 marks "no block found yet"

    for (var i = 0; i < blocks.length; i++) {
        var gap = blocks[i].x - this.playerX;

        // Blocks behind the player are ignored.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    return Math.max(0, Math.floor(nearest * this.resolution));
};

/**
 * Picks the action with the larger Q value for the given state. On a tie
 * the result is 'do_nothing' when the player cannot jump, otherwise a
 * uniformly random action.
 *
 * @param distance State key passed to getQ.
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    if (this.canJump()) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }

    return idleAction;
};

/**
 * Epsilon-greedy action choice: a random action with probability
 * this.epsilon, otherwise the highest-valued action for the current
 * distance. Always 'do_nothing' while the player cannot jump.
 *
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (this.canJump()) {
        var explore = Math.random() < this.epsilon;

        if (explore) {
            return this.actions[Math.floor(Math.random() * this.actions.length)];
        }

        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    return this.actions[1];
};

/** Restarts the game as soon as the player dies. */
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

/**
 * Runs one learning step and then acts: the reward observed this frame is
 * credited to the previous frame's (lastDistance, lastAction) pair, then a
 * new action is chosen and remembered for the next call.
 *
 * Note: the value of the current state is always bootstrapped into the
 * update, including on a frame where this.playerAlive is false.
 */
JumpGameAIClass.prototype.think = function think() {
    // A score change takes precedence over the alive/dead check.
    var reward;
    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (this.playerAlive) {
        reward = this.liveReward;
    } else {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();
    var bestNext = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    var previousRow = this.getQ(this.lastDistance);
    var previousQ = previousRow[this.lastAction];

    // Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s', a') - Q(s, a))
    previousRow[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * bestNext) - previousQ);

    var chosen = this.getActionEpsilonGreedy();
    this.lastAction = chosen;
    this.lastDistance = distance;

    if (chosen === this.actions[0]) {
        this.jump();
    }
};

/**
 * Draws the current and the previous bucketed distance as two lines of
 * text, stacked around one quarter of the canvas height.
 */
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    var ctx = this.context;

    ctx.save();
    ctx.textAlign = 'center';

    // Same anchor point for both lines; the baseline puts one above it and
    // one below it.
    ctx.textBaseline = 'bottom';
    ctx.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.textBaseline = 'top';
    ctx.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.restore();
};

/** Runs the base game's frame handler first, then lets the agent think. */
JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);
    this.think();
};

Game.JumpGameAI = JumpGameAIClass;
/* Page: light grey backdrop with centred inline content. */
body {
    text-align: center;
    background-color: #EEEEEE;
}

/* Game canvas: white surface with a thin light-grey border. */
canvas#game {
    border: 1px solid #DDDDDD;
    background-color: #FFFFFF;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<!-- Canvas with id "game"; the nested heading is fallback content shown when canvas is unsupported. -->
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<!-- requestAnimationFrame.js from the cagosta/requestAnimationFrame repository, loaded before the game scripts. -->
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<!-- The four scripts below are files of that gist, loaded in this order: game.js, canvasgame.js, jump.js, bootstrap.js. -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
You basically have a simplified version of:
Source: Flappy Bird RL
I used values :
epsilon: {
value: 0.01
},
alpha: {
value: 0.7
},
gamma: {
value: 0.9
},
resolution: {
value: 0.1
},
liveReward: {
value: 10
},
scoreReward: {
value: -100
},
deathReward: {
value: 1000
},
It had no trouble of getting beyond 100 in first 20 attempts.
Q-learning can be described with temporal logic

$$Q(s, a) = r(s, a) + \gamma \max_{a'} Q(s', a')$$

Where
- $r(s, a) = r$ = immediate reward
- $\gamma$ = relative value of delayed vs. immediate rewards (0 to 1)
- $s'$ = the new state after action $a$
- $a$ = action in state $s$
- $a'$ = action in state $s'$
You should execute it as
Select an action a and execute it
Your implementation of the algorithm is fine, just need to adjust some of the parameters.
If you assign some reward for living (10 in my example) and set epsilon to 0, you get a winning AI.
Example:
/**
 * Q-learning player for the jump game.
 *
 * The state is the bucketed distance to the nearest block ahead of the
 * player; the two actions are 'jump' and 'do_nothing'.
 *
 * @param canvas Passed straight through to the Game.JumpGame constructor.
 */
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    // Each entry is [name, initial value, writable]. Tuning constants are
    // read-only; only the learned table and the "last frame" memory change.
    var fields = [
        ['epsilon', 0, false],            // probability of a random action
        ['alpha', 1, false],              // weight of each Q update
        ['gamma', 1, false],              // weight of the next state's value
        ['resolution', 0.1, false],       // distance multiplier before flooring
        ['actions', [ 'jump', 'do_nothing' ], false],
        ['Q', { }, true],
        ['liveReward', 0, false],
        ['scoreReward', 100, false],
        ['deathReward', -1000, false],
        ['lastAction', 'do_nothing', true],
        ['lastDistance', 0, true],
        ['lastScore', 0, true]
    ];

    for (var index = 0; index < fields.length; index++) {
        Object.defineProperty(this, fields[index][0], {
            value: fields[index][1],
            writable: fields[index][2]
        });
    }
};

// Everything not overridden below comes from the base game's prototype.
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

/**
 * Looks up the action-value row for a state, creating a row of zeros (one
 * per action) the first time the state is requested.
 *
 * @param state Key identifying the state.
 * @returns {Object} The stored row (action name -> Q value).
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var row = {};
    this.actions.forEach(function (name) {
        row[name] = 0;
    });
    table[state] = row;

    return row;
};

/**
 * Distance from the player to the nearest block at or ahead of the player,
 * scaled by this.resolution and floored.
 *
 * @returns {number} The bucketed distance, never below 0; 0 when no block
 *                   is ahead of the player.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var blocks = this.blocks;
    var nearest = -1; // -1 marks "no block found yet"

    for (var i = 0; i < blocks.length; i++) {
        var gap = blocks[i].x - this.playerX;

        // Blocks behind the player are ignored.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    return Math.max(0, Math.floor(nearest * this.resolution));
};

/**
 * Picks the best action available in the given state.
 *
 * While the player cannot jump the answer is always 'do_nothing',
 * whatever the Q values say. Otherwise the action with the larger Q value
 * wins, and a tie is broken uniformly at random.
 *
 * @param distance State key passed to getQ.
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    if (!this.canJump()) {
        return idleAction;
    }

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    return this.actions[Math.floor(Math.random() * this.actions.length)];
};

/**
 * Epsilon-greedy action choice: a random action with probability
 * this.epsilon, otherwise the best action for the current distance.
 * Always 'do_nothing' while the player cannot jump.
 *
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (this.canJump()) {
        var explore = Math.random() < this.epsilon;

        if (explore) {
            return this.actions[Math.floor(Math.random() * this.actions.length)];
        }

        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    return this.actions[1];
};

/** Restarts the game as soon as the player dies. */
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

/**
 * Runs one learning step and then acts: the reward observed this frame is
 * credited to the previous frame's (lastDistance, lastAction) pair, then a
 * new action is chosen and remembered for the next call.
 */
JumpGameAIClass.prototype.think = function think() {
    // A score change takes precedence over the alive/dead check.
    var reward;
    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (this.playerAlive) {
        reward = this.liveReward;
    } else {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();

    // A dead player has no future, so nothing is bootstrapped from the
    // current state in that case.
    var bestNext = 0;
    if (this.playerAlive) {
        bestNext = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    }

    var previousRow = this.getQ(this.lastDistance);
    var previousQ = previousRow[this.lastAction];

    // Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s', a') - Q(s, a))
    previousRow[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * bestNext) - previousQ);

    var chosen = this.getActionEpsilonGreedy();
    this.lastAction = chosen;
    this.lastDistance = distance;

    if (chosen === this.actions[0]) {
        this.jump();
    }
};

/**
 * Draws the current and the previous bucketed distance as two lines of
 * text, stacked around one quarter of the canvas height.
 */
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    var ctx = this.context;

    ctx.save();
    ctx.textAlign = 'center';

    // Same anchor point for both lines; the baseline puts one above it and
    // one below it.
    ctx.textBaseline = 'bottom';
    ctx.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.textBaseline = 'top';
    ctx.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.restore();
};

/** Runs the base game's frame handler first, then lets the agent think. */
JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);
    this.think();
};

Game.JumpGameAI = JumpGameAIClass;
/* Page: light grey backdrop with centred inline content. */
body {
    text-align: center;
    background-color: #EEEEEE;
}

/* Game canvas: white surface with a thin light-grey border. */
canvas#game {
    border: 1px solid #DDDDDD;
    background-color: #FFFFFF;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<!-- Canvas with id "game"; the nested heading is fallback content shown when canvas is unsupported. -->
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<!-- requestAnimationFrame.js from the cagosta/requestAnimationFrame repository, loaded before the game scripts. -->
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<!-- The four scripts below are files of that gist, loaded in this order: game.js, canvasgame.js, jump.js, bootstrap.js. -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
Update:
Had a bit more of a think about this and while my example appears to be working it is not correct.
What is happening is that, because the outcome of a jump is not known until a number of iterations in the future, assigning an immediate reward for living will cause whatever random decisions are first made at each state to be repeated until the eventual outcome of the decision propagates back through the states.
With the physics of the game, the player's jump distance is less than the block spacing. This means a jump that clears a block will land further away from the next block than its take-off point was from the last block, and therefore the same jump can be made again. So provided a "good" jump is made before the first block, the system will immediately converge to a successful pattern. If the physics of the game were different, or a "bad" jump is made, this AI may not correct itself.
The issue is the system actually has two parts to its state, blockDistance and playerY. Without including playerY state in the decisions the outcome of a jump cannot be correctly propagated back to its origin.
You can work around this in this simple game by biasing the decisions to take no action. As the decision states based just on distance are complete provided you don't jump, the outcome of not jumping, i.e. to die, will correctly propagate back through the decisions to not jump at each distance. It's still a little bit funky, as once you jump the propagation of the reward will not be correct, but you can now see it learning all the same.
Example:
/**
 * Q-learning player for the jump game.
 *
 * The state is the bucketed distance to the nearest block ahead of the
 * player; the two actions are 'jump' and 'do_nothing'. This variant
 * prefers 'do_nothing' whenever jumping is not strictly better.
 *
 * @param canvas Passed straight through to the Game.JumpGame constructor.
 */
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    // Each entry is [name, initial value, writable]. Tuning constants are
    // read-only; only the learned table and the "last frame" memory change.
    var fields = [
        ['epsilon', 0, false],            // probability of a random action
        ['alpha', 1, false],              // weight of each Q update
        ['gamma', 1, false],              // weight of the next state's value
        ['resolution', 0.1, false],       // distance multiplier before flooring
        ['actions', [ 'jump', 'do_nothing' ], false],
        ['Q', { }, true],
        ['liveReward', 10, false],
        ['scoreReward', 100, false],
        ['deathReward', -1000, false],
        ['lastAction', 'do_nothing', true],
        ['lastDistance', 0, true],
        ['lastScore', 0, true]
    ];

    for (var index = 0; index < fields.length; index++) {
        Object.defineProperty(this, fields[index][0], {
            value: fields[index][1],
            writable: fields[index][2]
        });
    }
};

// Everything not overridden below comes from the base game's prototype.
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

/**
 * Looks up the action-value row for a state, creating a row of zeros (one
 * per action) the first time the state is requested.
 *
 * @param state Key identifying the state.
 * @returns {Object} The stored row (action name -> Q value).
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var row = {};
    this.actions.forEach(function (name) {
        row[name] = 0;
    });
    table[state] = row;

    return row;
};

/**
 * Distance from the player to the nearest block at or ahead of the player,
 * scaled by this.resolution and floored.
 *
 * @returns {number} The bucketed distance, never below 0; 0 when no block
 *                   is ahead of the player.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var blocks = this.blocks;
    var nearest = -1; // -1 marks "no block found yet"

    for (var i = 0; i < blocks.length; i++) {
        var gap = blocks[i].x - this.playerX;

        // Blocks behind the player are ignored.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    return Math.max(0, Math.floor(nearest * this.resolution));
};

/**
 * Picks the best action available in the given state, biased towards
 * doing nothing: 'jump' is returned only when the player can jump and the
 * 'do_nothing' value is not greater than or equal to the 'jump' value.
 *
 * @param distance State key passed to getQ.
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var values = this.getQ(distance);
    var jumpValue = values[this.actions[0]];
    var idleValue = values[this.actions[1]];

    var shouldJump = this.canJump() && !(idleValue >= jumpValue);

    return shouldJump ? this.actions[0] : this.actions[1];
};

/**
 * Epsilon-greedy action choice: a random action with probability
 * this.epsilon, otherwise the best action for the current distance.
 * Always 'do_nothing' while the player cannot jump.
 *
 * @returns {string} One of the entries of this.actions.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (this.canJump()) {
        var explore = Math.random() < this.epsilon;

        if (explore) {
            return this.actions[Math.floor(Math.random() * this.actions.length)];
        }

        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    return this.actions[1];
};

/** Restarts the game as soon as the player dies. */
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

/**
 * Runs one learning step and then acts: the reward observed this frame is
 * credited to the previous frame's (lastDistance, lastAction) pair, then a
 * new action is chosen and remembered for the next call.
 */
JumpGameAIClass.prototype.think = function think() {
    // A score change takes precedence over the alive/dead check.
    var reward;
    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (this.playerAlive) {
        reward = this.liveReward;
    } else {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();

    // A dead player has no future, so nothing is bootstrapped from the
    // current state in that case.
    var bestNext = 0;
    if (this.playerAlive) {
        bestNext = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    }

    var previousRow = this.getQ(this.lastDistance);
    var previousQ = previousRow[this.lastAction];

    // Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s', a') - Q(s, a))
    previousRow[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * bestNext) - previousQ);

    var chosen = this.getActionEpsilonGreedy();
    this.lastAction = chosen;
    this.lastDistance = distance;

    if (chosen === this.actions[0]) {
        this.jump();
    }
};

/**
 * Draws the current and the previous bucketed distance as two lines of
 * text, stacked around one quarter of the canvas height.
 */
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    var ctx = this.context;

    ctx.save();
    ctx.textAlign = 'center';

    // Same anchor point for both lines; the baseline puts one above it and
    // one below it.
    ctx.textBaseline = 'bottom';
    ctx.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.textBaseline = 'top';
    ctx.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    ctx.restore();
};

/** Runs the base game's frame handler first, then lets the agent think. */
JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);
    this.think();
};

Game.JumpGameAI = JumpGameAIClass;
/* Page: light grey backdrop with centred inline content. */
body {
    text-align: center;
    background-color: #EEEEEE;
}

/* Game canvas: white surface with a thin light-grey border. */
canvas#game {
    border: 1px solid #DDDDDD;
    background-color: #FFFFFF;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<!-- Canvas with id "game"; the nested heading is fallback content shown when canvas is unsupported. -->
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<!-- requestAnimationFrame.js from the cagosta/requestAnimationFrame repository, loaded before the game scripts. -->
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<!-- The four scripts below are files of that gist, loaded in this order: game.js, canvasgame.js, jump.js, bootstrap.js. -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With