data(Maze)
Maze
# use value iteration
maze_solved <- solve_MDP(Maze, method = "value_iteration")
maze_solved
policy(maze_solved)
# plot the value function U
plot_value_function(maze_solved)
# Maze solutions can be visualized
gridworld_plot_policy(maze_solved)
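# the raw solver output is stored in the solution element (a quick peek,
# using the same access pattern as for the learned model further below)
maze_solved$solution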
# use modified policy iteration
maze_solved <- solve_MDP(Maze, method = "policy_iteration")
policy(maze_solved)
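# the policies from the two dynamic programming methods can be compared side
# by side (a sketch; assumes policy() returns a data frame with an action column)
data.frame(
  value_iteration  = policy(solve_MDP(Maze, method = "value_iteration"))$action,
  policy_iteration = policy(maze_solved)$action
)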
# finite horizon
maze_solved <- solve_MDP(Maze, method = "value_iteration", horizon = 3)
policy(maze_solved)
gridworld_plot_policy(maze_solved, epoch = 1)
gridworld_plot_policy(maze_solved, epoch = 2)
gridworld_plot_policy(maze_solved, epoch = 3)
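# with a longer horizon, the policy in the early epochs approaches the
# infinite-horizon policy (a sketch reusing the calls above)
maze_h10 <- solve_MDP(Maze, method = "value_iteration", horizon = 10)
gridworld_plot_policy(maze_h10, epoch = 1)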
# create a random policy where action n is very likely and approximate
# its value function. The discount factor is changed to .9 for this.
Maze_discounted <- Maze
Maze_discounted$discount <- .9
pi <- random_MDP_policy(Maze_discounted,
  prob = c(n = .7, e = .1, s = .1, w = .1))
pi
# compare the utility function of the random policy with the utility function
# of the optimal policy found by the solver.
maze_solved <- solve_MDP(Maze)
MDP_policy_evaluation(pi, Maze, k_backup = 100)
MDP_policy_evaluation(policy(maze_solved), Maze, k_backup = 100)
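# storing the evaluated utilities makes the per-state difference between the
# random and the optimal policy easy to inspect (a sketch, assuming a numeric
# utility vector is returned per state)
U_random  <- MDP_policy_evaluation(pi, Maze, k_backup = 100)
U_optimal <- MDP_policy_evaluation(policy(maze_solved), Maze, k_backup = 100)
U_optimal - U_random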
# note that the solver already calculates the utility function and returns it
# with the policy
policy(maze_solved)
# learn a policy using Q-learning
maze_learned <- solve_MDP(Maze, method = "q_learning", N = 100)
maze_learned
maze_learned$solution
policy(maze_learned)
plot_value_function(maze_learned)
gridworld_plot_policy(maze_learned)
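# the learned policy can be evaluated with the same machinery as above to see
# how close 100 Q-learning episodes get to the utilities of the optimal policy
# (a sketch reusing functions shown earlier)
MDP_policy_evaluation(policy(maze_learned), Maze, k_backup = 100)
MDP_policy_evaluation(policy(maze_solved), Maze, k_backup = 100)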