Reinforcement learning
This project is easier than I expected. It only uses a very simplified version of the Bellman equation, and using a dictionary as the Q-table greatly reduces the complexity.
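A minimal sketch of that idea (my own illustration, not code from agent.py): the Q-table is a nested dictionary mapping each state to the value of each action, and the simplified update just nudges Q[state][action] toward the observed reward.
# illustration only: names and numbers here are made up
Q = {}                                   # Q-table: state -> {action: value}
state = ('forward', 'green')             # a hypothetical state key
if state not in Q:
    Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
alpha, action, reward = 0.3, 'forward', 2.0
Q[state][action] += alpha * reward       # simplified update: reward only, no discounted future term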
Smartcab
cd Desktop/Udacity/MLND/course_material/projects/smartcab/
install pygame:
pip install pygame
python smartcab/agent.py
warning: don’t use conda install! I wasted 2 hours figuring out what the problem was.
With the GUI open, a single trial takes about 120 steps and about 4.5 minutes.
code structure
agent.py{
class LearningAgent(env.Agent):{
__init__(env,learning,epsilon,alpha)
reset()
build_state()
get_maxQ(state)
createQ(state)
choose_action(state)
learn(state,action,reward)
update()
}
run()
}
simulator.py{
class Simulator(){
__init__(env,size,update_delay,
display,log_metrics,optimized){
/ lines 90-110 write the CSV header
}
run(tolerance,n_test){
/ line 133 "total_trials > 20" controls the number of training trials
/ lines 229-245 write the trial data to the CSV file (see the pandas sketch after this outline)
}
render_text(trial,testing){}
render(trial,testing){}
pause(){}
}
}
environment.py{
class TrafficLight(){}
class Environment(){
__init__(verbose,num_dummies,grid_size){}
create_agent(agent_class,*args,**kwargs){}
set_primary_agent(agent,enforce_deadline){}
reset(testing){}
step(){}
sense(agent){}
get_deadline(agent){}
act(agent,action){}
compute_dist(a,b){}
}
class Agent(){}
class DummyAgent(Agent){}
}
}
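The trial-by-trial metrics that run() writes out (see the CSV notes in the simulator.py outline above) can be inspected afterwards. A minimal sketch, assuming the log lands at logs/sim_improved-learning.csv (the path and columns depend on the Simulator settings, so check the header-writing code at lines 90-110 first):
import pandas as pd

data = pd.read_csv("logs/sim_improved-learning.csv")  # assumed path
print(data.columns)   # see what the simulator actually logged
print(data.tail())    # the last rows (typically the testing trials)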
key implementation:
with the GUI off, this runs super fast.
revise run() by:
env = Environment(verbose=False, num_dummies=100, grid_size=(8, 6))
agent = env.create_agent(LearningAgent, learning=True, epsilon=1, alpha=0.3)
env.set_primary_agent(agent, enforce_deadline=True)
sim = Simulator(env, size=None, update_delay=0.01, display=False, log_metrics=True, optimized=True)
sim.run(tolerance=0.05, n_test=10)
implement the agent functions by:
def reset(self, destination=None, testing=False):
    self.planner.route_to(destination)
    # self.epsilon -= 0.05   # linear decay, for question 6
    self.epsilon *= 0.95     # exponential decay, for question 7
    if testing:
        self.epsilon, self.alpha = 0, 0
    return None
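A quick check on how long training lasts under each decay schedule, given epsilon starting at 1, one reset() call per training trial, and sim.run()'s tolerance of 0.05 (training stops and the 10 testing trials begin once epsilon drops below the tolerance):
import math
# linear decay: epsilon = 1 - 0.05*n falls below 0.05 after n = 20 trials,
# which lines up with the "total_trials > 20" check noted in simulator.py
# exponential decay: 0.95**n < 0.05  =>  n > log(0.05)/log(0.95)
print(int(math.ceil(math.log(0.05) / math.log(0.95))))  # 59 training trials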
def build_state(self):
    waypoint = self.planner.next_waypoint()
    inputs = self.env.sense(self)
    deadline = self.env.get_deadline(self)
    state = (waypoint, tuple(inputs[item] for item in inputs))
    return state
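For reference, the state is just a small hashable tuple. Assuming Environment.sense() reports the light and nearby traffic (keys like 'light', 'oncoming', 'left', 'right'), a state might look like ('forward', ('green', None, None, None)), though the exact order of the inner tuple follows the inputs dict's iteration order.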
def get_maxQ(self, state):
    maxQ = float('-inf')
    for value in self.Q[state].values():
        maxQ = max(maxQ, value)
    return maxQ
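As long as createQ() has already populated this state (so the inner dict is non-empty), an equivalent one-liner is return max(self.Q[state].values()).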
def createQ(self, state):
    if state not in self.Q:
        # give idling a slight head start so an untrained agent prefers to wait
        self.Q[state] = {None: 0.01, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
    return
def choose_action(self, state):
    self.state = state
    self.next_waypoint = self.planner.next_waypoint()
    waypoint = self.next_waypoint
    import random  # normally imported at module level
    actions = [None, 'left', 'right', 'forward']
    if not self.learning:
        action = random.choice(actions)
    else:
        highest = self.get_maxQ(state)
        action_dict = self.Q[state]
        coin = random.random()
        if coin <= self.epsilon:
            # explore: try the planner's waypoint first if it is still unexplored
            if action_dict[waypoint] == 0.0:
                action = waypoint
            else:
                action = random.choice(actions)
        else:
            # exploit: return the first action whose Q-value equals the maximum
            for key, value in action_dict.items():
                if value == highest:
                    return key
    return action
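One caveat with the greedy branch: when several actions tie for the maximum Q-value, the first matching key in dict iteration order wins, which is essentially arbitrary. A small alternative sketch (my own, not from the project) that breaks ties at random:
import random

def argmax_random_tie(action_dict):
    # pick uniformly among the actions that share the highest Q-value
    highest = max(action_dict.values())
    best = [a for a, q in action_dict.items() if q == highest]
    return random.choice(best)

# argmax_random_tie({None: 0.01, 'left': 0.0, 'right': 2.0, 'forward': 2.0})
# returns 'right' or 'forward' with equal probability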
def learn(self, state, action, reward):
    if self.learning:
        self.Q[state][action] += self.alpha * reward
    return
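For comparison, the textbook Q-learning update is Q(s,a) <- (1 - alpha)*Q(s,a) + alpha*(R + gamma*max_a' Q(s',a')). This project does not use discounted future rewards, which is the "very simplified version of the Bellman equation" mentioned at the top, and the code above simplifies it further to a purely additive update.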