# Source code for btb.selection.ucb1

import random

import numpy as np

from btb.selection.selector import Selector


class UCB1(Selector):
    """UCB1 selector

    Uses Upper Confidence Bound 1 algorithm (UCB1) for bandit selection.

    See also::

       Auer, Peter et al. "Finite-time Analysis of the Multiarmed Bandit Problem."
       Machine Learning 47 (2002): 235-256.
    """

    def _shuffle(self, iterable):
        """Yield the items of ``iterable`` in a random order.

        The items are first copied into a new list, so the object passed in
        is never modified.

        Args:
            iterable: Any iterable; iterating a dict yields its keys.

        Yields:
            Each item of ``iterable`` exactly once, in shuffled order.
        """
        items = list(iterable)
        random.shuffle(items)
        for item in items:
            yield item

    def bandit(self, choice_rewards):
        """
        Multi-armed bandit method which chooses the arm for which the upper
        confidence bound (UCB) of expected reward is greatest.

        If there are multiple arms with the same UCB1 index, then one is chosen
        at random.

        An explanation is here:
        https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf

        Args:
            choice_rewards: Mapping from each choice (arm) to the sequence of
                rewards observed for it. NaN rewards are ignored when
                averaging but still count as pulls.

        Returns:
            The key of ``choice_rewards`` with the highest UCB1 score.

        Raises:
            ValueError: If ``choice_rewards`` contains no choices.
        """
        if not len(choice_rewards):
            raise ValueError('choice_rewards must contain at least one choice')

        # count the larger of 1 and the total number of arm pulls, so that
        # the logarithm below is never taken of zero
        total_pulls = max(1, sum(len(r) for r in choice_rewards.values()))

        def ucb1(choice):
            """Return the UCB1 score (mean reward + exploration bonus)."""
            rewards = choice_rewards[choice]

            # an arm that was never pulled is scored as if it had been pulled
            # once with a reward of 0; this also avoids a division by zero
            choice_pulls = max(len(rewards), 1)
            average_reward = np.nanmean(rewards) if len(rewards) else 0

            # nanmean yields NaN when every reward is NaN. A NaN score never
            # compares as greater or smaller, which would make the result of
            # max() depend on iteration order, so fall back to the same
            # neutral value used for an arm without rewards.
            if np.isnan(average_reward):
                average_reward = 0

            error = np.sqrt(2.0 * np.log(total_pulls) / choice_pulls)
            return average_reward + error

        # max() keeps the first of several equal maxima, so iterating the
        # choices in random order breaks ties at random
        return max(self._shuffle(choice_rewards), key=ucb1)