# display available solver options which can be passed on to pomdp-solve as parameters.
solve_POMDP_parameter()
################################################################
# Example 1: Solving the simple infinite-horizon Tiger problem
data("Tiger")
Tiger
# look at the model as a list
unclass(Tiger)
# inspect an individual field of the model (e.g., the transition probabilities and the reward)
Tiger$transition_prob
Tiger$reward
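# Other commonly used fields of the model list (a sketch; the exact set of
# fields can differ slightly between versions of the pomdp package):
Tiger$states
Tiger$actions
Tiger$observations
Tiger$start            # initial belief (uniform for the Tiger problem)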
sol <- solve_POMDP(model = Tiger)
sol
# look at the solution
sol$solution
# policy: value function (alpha vectors), optimal action, and observation-dependent transitions
policy(sol)
# plot the policy graph of the infinite-horizon POMDP
plot_policy_graph(sol)
# value function
plot_value_function(sol, ylim = c(0,20))
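# The value function above is the upper envelope of the alpha vectors. As a
# sketch, we can evaluate it by hand at a single belief point, assuming
# sol$solution$alpha[[1]] stores the alpha vectors as a matrix with one row
# per vector and one column per state:
alpha <- sol$solution$alpha[[1]]
b <- c(0.5, 0.5)       # uniform belief over tiger-left/tiger-right
max(alpha %*% b)       # value of the optimal policy at this belief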
################################################################
# Example 2: Solve a problem specified as a POMDP file
# using a grid of size 20
sol <- solve_POMDP("http://www.pomdp.org/examples/cheese.95.POMDP",
method = "grid", parameter = list(fg_points = 20))
sol
policy(sol)
plot_policy_graph(sol)
################################################################
# Example 3: Solving a finite-horizon POMDP using the incremental
# pruning method (without discounting)
sol <- solve_POMDP(model = Tiger,
  horizon = 3, discount = 1, method = "incprune")
sol
# look at the policy tree
policy(sol)
plot_policy_graph(sol)
# Note: only open a door in epoch 3 if you received the same observation twice.
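# Why the same observation twice? A sketch of the belief updates, assuming
# update_belief() is available in your version of the pomdp package and that
# listening is 85% accurate (as in the classic Tiger specification):
b <- c(0.5, 0.5)       # uniform initial belief
b <- update_belief(Tiger, belief = b, action = "listen", observation = "tiger-left")
b                      # roughly 0.85 / 0.15
b <- update_belief(Tiger, belief = b, action = "listen", observation = "tiger-left")
b                      # about 0.97 / 0.03, confident enough to open the right door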
# Expected reward starting from the model's initial belief (uniform):
# listen twice and then open the door or listen 3 times
reward(sol)
# Expected reward for listening twice (-2) and then opening the right door (-1 + -1 + 10 = 8)
reward(sol, belief = c(1,0))
# Expected reward for just opening the right door (10)
reward(sol, belief = c(1,0), epoch = 3)
# Expected reward for just opening the right door (0.95 * 10 + 0.05 * -100 = 4.5)
reward(sol, belief = c(.95,.05), epoch = 3)
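# The hand calculations in the comments above can be checked with plain R arithmetic:
-1 + -1 + 10               # listen twice, then open the correct door
0.95 * 10 + 0.05 * -100    # open the door right away with a 95/5 belief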
################################################################
# Example 4: Using terminal values (state-dependent utilities after the final epoch)
#
# Specify a terminal value of 1000 if the tiger is behind the right door after 3 (horizon) epochs
sol <- solve_POMDP(model = Tiger,
  horizon = 3, discount = 1, method = "incprune",
  terminal_values = c(0, 1000))
sol
policy(sol)
# Note: The optimal strategy is to never open the left door. If we think the
# tiger is behind the right door, then we just wait for the final payout. If
# we think the tiger might be behind the left door, then we open the right
# door, are likely to get a small reward, and the tiger has a 50% chance of
# moving behind the right door. The second epoch is used to gather more
# information for the more important final action.
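# A quick check of this strategy: if we are certain the tiger is behind the
# right door, the policy just listens for 3 epochs and collects the terminal
# value of 1000, so the expected reward should be close to 1000 minus the
# listening costs (exact values depend on the solver output).
reward(sol, belief = c(0, 1))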
################################################################
# Example 5: Model time-dependent transition probabilities
# The tiger behaves normally for 3 epochs (it moves randomly to one
# of the two doors when a door is opened). After 3 epochs it gets
# scared and, when a door is opened, it always moves behind the other door.
# specify the horizon for each of the two different episodes
Tiger_time_dependent <- Tiger
Tiger_time_dependent$name <- "Scared Tiger Problem"
Tiger_time_dependent$horizon <- c(normal_tiger = 3, scared_tiger = 3)
Tiger_time_dependent$transition_prob <- list(
  normal_tiger = list(
    "listen" = "identity",
    "open-left" = "uniform",
    "open-right" = "uniform"),
  scared_tiger = list(
    "listen" = "identity",
    "open-left" = rbind(c(0, 1), c(0, 1)),
    "open-right" = rbind(c(1, 0), c(1, 0))
  )
)
# solve the time-dependent model (a higher value for verbose shows more messages)
sol <- solve_POMDP(model = Tiger_time_dependent, discount = 1,
  method = "incprune", verbose = 1)
sol
policy(sol)
# Note that the default method to estimate the belief for policy graph nodes
# follows a trajectory and uses only the first belief reached for each node.
# Random sampling can find a better estimate of the central belief of the
# belief segment (compare nodes 4-1 to 6-3 in the plots below).
plot_policy_graph(sol)
plot_policy_graph(sol, method = "random_sample")
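# Total expected reward of the time-dependent solution for the uniform initial
# belief; this can be compared with the two-stage approach in Example 6 below.
reward(sol)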
################################################################
# Example 6: Alternative method to solve time-dependent POMDPs
# 1) create the scared tiger model
Tiger_scared <- Tiger
Tiger_scared$transition_prob <- list(
  "listen" = "identity",
  "open-left" = rbind(c(0, 1), c(0, 1)),
  "open-right" = rbind(c(1, 0), c(1, 0))
)
# 2) Solve in reverse order. Scared tiger without terminal values first.
sol_scared <- solve_POMDP(model = Tiger_scared,
  horizon = 3, discount = 1, method = "incprune")
sol_scared
policy(sol_scared)
# 3) Solve the regular tiger with the value function of the scared tiger as terminal values
sol <- solve_POMDP(model = Tiger,
  horizon = 3, discount = 1, method = "incprune",
  terminal_values = sol_scared$solution$alpha[[1]])
sol
policy(sol)
# Note: it is optimal to mostly listen until the tiger gets into the scared mood.
# Opening a door in the first epoch is only optimal if we are extremely sure.
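# The expected reward of this two-stage solution should agree (up to solver
# precision) with the time-dependent formulation in Example 5, since both
# describe the same 6-epoch problem.
reward(sol)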
################################################################
# Example 7: PBVI with a custom grid
# Create a search grid by sampling from the belief space in
# 10 regular intervals
custom_grid <- sample_belief_space(Tiger, n = 10, method = "regular")
head(custom_grid)
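# Each row of the grid is a belief point on the probability simplex, so the
# rows should sum to 1 (assuming sample_belief_space() returns one belief per row).
rowSums(custom_grid)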
# Visualize the search grid
plot_belief_space(sol, sample = custom_grid)
# Solve the POMDP using the grid for approximation
sol <- solve_POMDP(Tiger, method = "grid", parameter = list(grid = custom_grid))
policy(sol)
plot_policy_graph(sol)
# Note that plot_policy_graph() automatically removes nodes that are unreachable
# from the initial node. This behavior can be switched off.
plot_policy_graph(sol, remove_unreachable_nodes = FALSE)
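# Optionally, the quality of the approximate policy can be assessed by
# simulation. This is only a sketch: simulate_POMDP() is part of the pomdp
# package, but its arguments and the format of its return value differ
# between package versions, so check ?simulate_POMDP first.
sim <- simulate_POMDP(sol, n = 100, horizon = 100)
sim
# The simulated average reward can be compared with the value computed from
# the alpha vectors:
reward(sol)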