Usage
Here we give an example of how to solve the canonical tiger problem. First, we define the environment:
class Tiger:
    """Tiger POMDP used as the running example

    States and the two door-opening actions share the ``L``/``R`` encoding;
    ``H`` is the extra listening action. Opening the door whose index equals
    the current state earns ``OPEN_CORRECT_REWARD``, the other door earns
    ``OPEN_INCORRECT_REWARD``.
    """

    # Door / state indices, followed by the listening action.
    L = 0
    R = 1
    H = 2

    # Reward for listening and for opening the right / wrong door.
    H_REWARD = -1
    OPEN_CORRECT_REWARD = 10
    OPEN_INCORRECT_REWARD = -100

    @staticmethod
    def actions() -> List[int]:
        """Lists every action: open left, open right, listen"""
        return [Tiger.L, Tiger.R, Tiger.H]

    @staticmethod
    def sample_observation(s: int) -> int:
        """Noisy reading of ``s``: equals ``s`` with probability 0.85"""
        heard_wrong = random.uniform(0, 1) >= 0.85
        if heard_wrong:
            return int(not s)
        return s

    @staticmethod
    def sim(s: int, a: int) -> Tuple[int, int, float, bool]:
        """One simulated step of the tiger problem

        Returns ``(next state, observation, reward, terminal)``. Listening
        keeps the state and is never terminal; opening a door draws a random
        observation and a random next state, and always ends the episode.
        """
        if a != Tiger.H:
            doors = [Tiger.L, Tiger.R]
            # Draw order matters for seeded runs: observation first, then state.
            observation = random.choice(doors)
            if s == a:
                reward = Tiger.OPEN_CORRECT_REWARD
            else:
                reward = Tiger.OPEN_INCORRECT_REWARD
            next_state = random.choice(doors)
            return next_state, observation, reward, True

        return s, Tiger.sample_observation(s), Tiger.H_REWARD, False

    @staticmethod
    def state_evaluation(s) -> Tuple[float, ActionStats]:
        """Hand-written 'state-based model' for the Tiger

        Returns a fixed value estimate of 4.0 together with per-action
        statistics: prior 0.4 for listening and for the door matching ``s``,
        prior 0.2 for the other door.
        """

        def stats(prior):
            return {"qval": 0, "prior": prior, "n": 1}

        action_stats = {
            Tiger.H: stats(0.4),
            s: stats(0.4),
            int(not s): stats(0.2),
        }
        return 4.0, action_stats
Then we define some beliefs, each a function that samples a state:
def uniform_tiger_belief():
    """Belief that samples the 'left' or 'right' state uniformly at random"""
    candidate_states = [Tiger.L, Tiger.R]
    return random.choice(candidate_states)
def tiger_left_belief():
    """Degenerate belief: every sample is the 'left' state"""
    left_state = Tiger.L
    return left_state
def tiger_right_belief():
    """Degenerate belief: every sample is the 'right' state"""
    right_state = Tiger.R
    return right_state
The library then plans an action for each of these beliefs:
from online_pomdp_planning.mcts import create_POUCT

# Simulation budget per planning call and the UCB exploration constant.
n_sims = 2**15
ucb_constant = 100

planner = create_POUCT(
    Tiger.actions(),
    Tiger.sim,
    n_sims,
    ucb_constant=ucb_constant,
)

# Plan once per belief: listen when uniform, open 'left' for the left belief.
for belief, expected_action in (
    (uniform_tiger_belief, Tiger.H),
    (tiger_left_belief, Tiger.L),
):
    action, info = planner(belief)
    assert action == expected_action
    # The planner reports how many simulations it ran.
    assert info["iteration"] == n_sims