Reinforcement learning
This project is easier than I expected. It only uses a very simplified version of the Bellman equation, and using a dictionary as the Q-table greatly reduces the complexity.
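A minimal sketch of that idea (my own illustration, not code from agent.py): the Q-table is a nested dictionary mapping each state to the value of each action, and the simplified update just nudges Q[state][action] toward the observed reward.
# illustration only: names and numbers here are made up
Q = {}                                   # Q-table: state -> {action: value}
state = ('forward', 'green')             # a hypothetical state key
if state not in Q:
    Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
alpha, action, reward = 0.3, 'forward', 2.0
Q[state][action] += alpha * reward       # simplified update: reward only, no discounted future term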
Smartcab
cd Desktop/Udacity/MLND/course_material/projects/smartcab/
install pygame:
pip install pygame
python smartcab/agent.py
warning: don’t use conda install! I wasted 2 hours figuring out what the problem was.
With the GUI open, a single trial takes about 120 steps and about 4.5 minutes.
code structure
agent.py{
class LearningAgent(env.Agent):{
__init__(env,learning,epsilon,alpha)
reset()
build_state()
get_maxQ(state)
createQ(state)
choose_action(state)
learn(state,action,reward)
update()
}
run()
}
simulator.py{
class Simulator(){
__init__(env,size,update_delay,
display,log_metrics,optimized){
/ lines 90-110 write the CSV header
}
run(tolerance,n_test){
/ line 133 "total_trials > 20" controls the number of training trials
/ lines 229-245 write the trial data to the CSV file (see the pandas sketch after this outline)
}
render_text(trial,testing){}
render(trial,testing){}
pause(){}
}
}
environment.py{
class TrafficLight(){}
class Environment(){
__init__(verbose,num_dummies,grid_size){}
create_agent(agent_class,*args,**kwargs){}
set_primary_agent(agent,enforce_deadline){}
reset(testing){}
step(){}
sense(agent){}
get_deadline(agent){}
act(agent,action){}
compute_dist(a,b){}
}
class Agent(){}
class DummyAgent(Agent){}
}
}
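The trial-by-trial metrics that run() writes out (see the CSV notes in the simulator.py outline above) can be inspected afterwards. A minimal sketch, assuming the log lands at logs/sim_improved-learning.csv (the path and columns depend on the Simulator settings, so check the header-writing code at lines 90-110 first):
import pandas as pd

data = pd.read_csv("logs/sim_improved-learning.csv")  # assumed path
print(data.columns)   # see what the simulator actually logged
print(data.tail())    # the last rows (typically the testing trials)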
key implementation:
with the GUI off, this runs super fast.
revise run() by:
env = Environment(verbose=False, num_dummies=100, grid_size=(8, 6))
agent = env.create_agent(LearningAgent, learning=True, epsilon=1, alpha=0.3)
env.set_primary_agent(agent, enforce_deadline=True)
sim = Simulator(env, size=None, update_delay=0.01, display=False, log_metrics=True, optimized=True)
sim.run(tolerance=0.05, n_test=10)
implement the agent functions by:
def reset(self, destination=None, testing=False):
    self.planner.route_to(destination)
    # self.epsilon -= 0.05   # linear decay, for question 6
    self.epsilon *= 0.95     # exponential decay, for question 7
    if testing:
        self.epsilon, self.alpha = 0, 0
    return None
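A quick check on how long training lasts under each decay schedule, given epsilon starting at 1, one reset() call per training trial, and sim.run()'s tolerance of 0.05 (training stops and the 10 testing trials begin once epsilon drops below the tolerance):
import math
# linear decay: epsilon = 1 - 0.05*n falls below 0.05 after n = 20 trials,
# which lines up with the "total_trials > 20" check noted in simulator.py
# exponential decay: 0.95**n < 0.05  =>  n > log(0.05)/log(0.95)
print(int(math.ceil(math.log(0.05) / math.log(0.95))))  # 59 training trials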
def build_state(self):
    waypoint = self.planner.next_waypoint()
    inputs = self.env.sense(self)
    deadline = self.env.get_deadline(self)
    state = (waypoint, tuple(inputs[item] for item in inputs))
    return state
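For reference, the state is just a small hashable tuple. Assuming Environment.sense() reports the light and nearby traffic (keys like 'light', 'oncoming', 'left', 'right'), a state might look like ('forward', ('green', None, None, None)), though the exact order of the inner tuple follows the inputs dict's iteration order.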
def get_maxQ(self, state):
    maxQ = float('-inf')
    for value in self.Q[state].values():
        maxQ = max(maxQ, value)
    return maxQ
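As long as createQ() has already populated this state (so the inner dict is non-empty), an equivalent one-liner is return max(self.Q[state].values()).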
def createQ(self, state):
    if state not in self.Q:
        # give idling a slight head start so an untrained agent prefers to wait
        self.Q[state] = {None: 0.01, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
    return
def choose_action(self, state):
    self.state = state
    self.next_waypoint = self.planner.next_waypoint()
    waypoint = self.next_waypoint
    import random  # normally imported at module level
    actions = [None, 'left', 'right', 'forward']
    if not self.learning:
        action = random.choice(actions)
    else:
        highest = self.get_maxQ(state)
        action_dict = self.Q[state]
        coin = random.random()
        if coin <= self.epsilon:
            # explore: try the planner's waypoint first if it is still unexplored
            if action_dict[waypoint] == 0.0:
                action = waypoint
            else:
                action = random.choice(actions)
        else:
            # exploit: return the first action whose Q-value equals the maximum
            for key, value in action_dict.items():
                if value == highest:
                    return key
    return action
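One caveat with the greedy branch: when several actions tie for the maximum Q-value, the first matching key in dict iteration order wins, which is essentially arbitrary. A small alternative sketch (my own, not from the project) that breaks ties at random:
import random

def argmax_random_tie(action_dict):
    # pick uniformly among the actions that share the highest Q-value
    highest = max(action_dict.values())
    best = [a for a, q in action_dict.items() if q == highest]
    return random.choice(best)

# argmax_random_tie({None: 0.01, 'left': 0.0, 'right': 2.0, 'forward': 2.0})
# returns 'right' or 'forward' with equal probability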
def learn(self, state, action, reward):
    if self.learning:
        self.Q[state][action] += self.alpha * reward
    return
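For comparison, the textbook Q-learning update is Q(s,a) <- (1 - alpha)*Q(s,a) + alpha*(R + gamma*max_a' Q(s',a')). This project does not use discounted future rewards, which is the "very simplified version of the Bellman equation" mentioned at the top, and the code above simplifies it further to a purely additive update.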