# enable parallel simulation
# doParallel::registerDoParallel()
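# A minimal sketch, assuming the doParallel package is installed
# (left commented out so the example also runs sequentially):
# library(doParallel)
# registerDoParallel(cores = 2)  # register two parallel workers
# stopImplicitCluster()          # stop the workers when done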
data(Maze)
# solve the MDP with no discounting (discount factor of 1)
sol <- solve_MDP(Maze, discount = 1)
sol
# U in the policy is an estimate of the utility of being in a state when following the optimal policy.
policy(sol)
gridworld_matrix(sol, what = "action")
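# The state utilities can also be extracted as a named vector
# (a sketch; assumes policy() returns a data.frame with columns state and U):
u <- policy(sol)$U
names(u) <- policy(sol)$state
round(u, 2)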
## Example 1: simulate 100 trajectories following the policy;
# by default, only summary statistics (not the trajectories) are returned
sim <- simulate_MDP(sol, n = 100, horizon = 10, verbose = TRUE)
sim
# Note that all simulations start at s_1 and that the simulated avg. reward
# is therefore an estimate of the U value for the start state s_1.
policy(sol)[1,]
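# The Monte Carlo estimate can be compared with U directly
# (a sketch; assumes simulate_MDP() returns the average as $avg_reward):
sim$avg_reward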
# Calculate proportion of actions taken in the simulation
round_stochastic(sim$action_cnt / sum(sim$action_cnt), 2)
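# The same can be done for the states visited during simulation
# (a sketch; assumes the result list also contains $state_cnt):
round_stochastic(sim$state_cnt / sum(sim$state_cnt), 2)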
# reward distribution
hist(sim$reward)
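# summary statistics for the reward per trajectory
summary(sim$reward)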
## Example 2: simulate trajectories starting from a uniform distribution
# over all states and return the complete trajectories
sim <- simulate_MDP(sol, n = 100, start = "uniform", horizon = 10,
return_trajectories = TRUE)
head(sim$trajectories)
# how often was each state visited?
table(sim$trajectories$s)
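# the same visit counts as proportions (base R, no further assumptions)
round(prop.table(table(sim$trajectories$s)), 2)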