# load and inspect the example gridworld MDP
data(Maze)
Maze
# create several policies:
# 1. optimal policy using value iteration
maze_solved <- solve_MDP(Maze, method = "value_iteration")
maze_solved
pi_opt <- policy(maze_solved)
pi_opt
gridworld_plot_policy(add_policy(Maze, pi_opt), main = "Optimal Policy")
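# value iteration's result could be cross-checked with policy iteration;
# this sketch assumes solve_MDP() also accepts method = "policy_iteration"
maze_pi <- solve_MDP(Maze, method = "policy_iteration")
policy(maze_pi)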
# 2. a manual policy (go up everywhere; in some squares, go right)
acts <- rep("up", times = length(Maze$states))
names(acts) <- Maze$states
acts[c("s(1,1)", "s(1,2)", "s(1,3)")] <- "right"
pi_manual <- manual_MDP_policy(Maze, acts)
pi_manual
gridworld_plot_policy(add_policy(Maze, pi_manual), main = "Manual Policy")
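# actions can also be assigned programmatically; this sketch relies on
# the "s(row,col)" state naming used above to send the whole first row right
acts2 <- rep("up", times = length(Maze$states))
names(acts2) <- Maze$states
acts2[grep("^s\\(1,", Maze$states)] <- "right"
manual_MDP_policy(Maze, acts2)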
# 3. a random policy
set.seed(1234)
pi_random <- random_MDP_policy(Maze)
pi_random
gridworld_plot_policy(add_policy(Maze, pi_random), main = "Random Policy")
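# tabulating the chosen actions shows the mix produced by the random
# policy (this assumes the policy data.frame has an $action column)
table(pi_random$action)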
# 4. an improved policy obtained from one policy evaluation step
# followed by a policy improvement step (acting greedily on the Q-values)
u <- MDP_policy_evaluation(pi_random, Maze)
q <- q_values_MDP(Maze, U = u)
pi_greedy <- greedy_MDP_policy(q)
pi_greedy
gridworld_plot_policy(add_policy(Maze, pi_greedy), main = "Greedy Policy")
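# repeating evaluation and improvement until the policy no longer changes
# yields plain policy iteration. A minimal sketch, assuming the policy
# objects are data.frames with an $action column (as printed above):
pi_cur <- pi_random
repeat {
  u_cur <- MDP_policy_evaluation(pi_cur, Maze, k_backups = 100)
  pi_new <- greedy_MDP_policy(q_values_MDP(Maze, U = u_cur))
  if (identical(as.character(pi_new$action), as.character(pi_cur$action))) break
  pi_cur <- pi_new
}
gridworld_plot_policy(add_policy(Maze, pi_cur), main = "Policy Iteration")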
# compare the approximate value functions for the policies (we restrict
# the number of backups for the random policy since it may not converge)
rbind(
random = MDP_policy_evaluation(pi_random, Maze, k_backups = 100),
manual = MDP_policy_evaluation(pi_manual, Maze),
greedy = MDP_policy_evaluation(pi_greedy, Maze),
optimal = MDP_policy_evaluation(pi_opt, Maze)
)
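# a one-number summary per policy: the mean value over all states
# (a rough sketch; weighting by a start-state distribution would be finer)
mean(MDP_policy_evaluation(pi_greedy, Maze))
mean(MDP_policy_evaluation(pi_opt, Maze))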
# For many functions, we first add the policy to the problem description
# to create a "solved" MDP
maze_random <- add_policy(Maze, pi_random)
maze_random
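# the attached policy can be retrieved from the solved MDP with policy()
policy(maze_random)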
# plotting
plot_value_function(maze_random)
gridworld_plot_policy(maze_random)
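# the same plots for the optimal solution make a side-by-side comparison easy
plot_value_function(maze_solved)
gridworld_plot_policy(maze_solved)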
# compute the regret of the random policy relative to the optimal
# solution used as benchmark
regret(maze_random, benchmark = maze_solved)
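# the improved greedy policy should show a much smaller regret
regret(add_policy(Maze, pi_greedy), benchmark = maze_solved)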
# calculate the greedy action for state 1 (epsilon = 0 is pure greedy;
# prob = TRUE returns action probabilities instead of a single action)
q <- q_values_MDP(maze_random)
q
greedy_MDP_action(1, q, epsilon = 0, prob = FALSE)
greedy_MDP_action(1, q, epsilon = 0, prob = TRUE)
greedy_MDP_action(1, q, epsilon = .1, prob = TRUE)
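# sampling epsilon-greedy actions repeatedly illustrates the exploration
# mix: mostly the greedy action, a random action with probability epsilon
table(replicate(100, as.character(greedy_MDP_action(1, q, epsilon = .1))))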