Usage

Here we give an example of how to solve the canonical tiger problem. First, we define the environment:

class Tiger:
    """The Tiger POMDP environment

    States and door-opening actions share one encoding: ``L`` (0) and ``R``
    (1). ``H`` (2) is the extra action that leaves the state untouched and
    produces a noisy observation of it. Opening door ``a`` in state ``s`` is
    rewarded as correct exactly when ``a == s`` (see ``sim``).
    """

    # States, and the two door-opening actions
    L = 0
    R = 1
    # The observation-gathering action
    H = 2

    # Reward received for every ``H`` action
    H_REWARD = -1
    # Rewards for opening door ``a`` when ``a == s`` / ``a != s``
    OPEN_CORRECT_REWARD = 10
    OPEN_INCORRECT_REWARD = -100

    # Probability that the observation produced by ``H`` equals the state
    HEAR_CORRECT_PROB = 0.85

    @staticmethod
    def actions() -> List[int]:
        """returns actions in the tiger problem, in the order ``[L, R, H]``"""
        return [Tiger.L, Tiger.R, Tiger.H]

    @staticmethod
    def sample_observation(s: int) -> int:
        """Samples a noisy observation of state ``s``

        Returns ``s`` with probability ``Tiger.HEAR_CORRECT_PROB`` (85% unless
        overridden) and the other state otherwise.
        """
        if random.random() < Tiger.HEAR_CORRECT_PROB:
            return s
        return int(not s)

    @staticmethod
    def sim(s: int, a: int) -> Tuple[int, int, float, bool]:
        """Simulates the tiger dynamics

        Takes state ``s`` and action ``a`` and returns the tuple
        ``(next state, observation, reward, terminal)``.
        """
        if a == Tiger.H:
            # State is unchanged and the episode continues
            o = Tiger.sample_observation(s)
            return s, o, Tiger.H_REWARD, False

        # Any other action opens a door: the observation is drawn uniformly
        # (it does not depend on the state) and the step is terminal
        o = random.choice([Tiger.L, Tiger.R])
        r = Tiger.OPEN_CORRECT_REWARD if s == a else Tiger.OPEN_INCORRECT_REWARD

        # The returned state is re-drawn uniformly after a door is opened
        next_s = random.choice([Tiger.L, Tiger.R])

        return next_s, o, r, True

    @staticmethod
    def state_evaluation(s: int) -> Tuple[float, ActionStats]:
        """A 'state-based model' for the Tiger

        Hard-coded evaluation and prior for this problem: returns the value
        ``4.0`` together with per-action statistics (``qval``, ``prior`` and
        ``n``). ``H`` and the door equal to ``s`` each get prior 0.4, the
        other door gets 0.2.
        """
        good_door: int = s
        bad_door = int(not s)
        return 4.0, {
            Tiger.H: {"qval": 0, "prior": 0.4, "n": 1},
            good_door: {"qval": 0, "prior": 0.4, "n": 1},
            bad_door: {"qval": 0, "prior": 0.2, "n": 1},
        }

Then we define some beliefs, each a function that returns a sampled state:

def uniform_tiger_belief():
    """Sample a state from the uniform belief

    ``Tiger.L`` and ``Tiger.R`` are drawn with equal probability.
    """
    states = [Tiger.L, Tiger.R]
    return random.choice(states)
def tiger_left_belief():
    """Sample a state from the belief that is certain of 'left'

    Every call yields ``Tiger.L``.
    """
    return Tiger.L
def tiger_right_belief():
    """Sample a state from the belief that is certain of 'right'

    Every call yields ``Tiger.R``.
    """
    return Tiger.R

The library then plans an action for each of these beliefs:

from online_pomdp_planning.mcts import create_POUCT

# Number of simulations per planning call; the planner reports it back as
# ``info["iteration"]``, which the assertions below check
n_sims = 2 * 16384
# Handed to the planner as its ``ucb_constant`` keyword argument
ucb_constant = 100

planner = create_POUCT(Tiger.actions(), Tiger.sim, n_sims, ucb_constant=ucb_constant)

# action for uniform belief: with both states equally likely, expect ``H``
action, info = planner(uniform_tiger_belief)
assert action == Tiger.H
assert info["iteration"] == n_sims

# action for left belief: the state is known, expect the matching door
action, info = planner(tiger_left_belief)
assert action == Tiger.L
assert info["iteration"] == n_sims

# action for right belief: the symmetric case of the left belief
action, info = planner(tiger_right_belief)
assert action == Tiger.R
assert info["iteration"] == n_sims