Source code for btb.selection.ucb1

import random

import numpy as np

from btb.selection.selector import Selector


[docs]class UCB1(Selector): """UCB1 selector Uses Upper Confidence Bound 1 algorithm (UCB1) for bandit selection. See also:: Auer, Peter et al. "Finite-time Analysis of the Multiarmed Bandit Problem." Machine Learning 47 (2002): 235-256. """ def _shuffle(self, iterable): iterable = list(iterable) inds = list(range(len(iterable))) random.shuffle(inds) for i in inds: yield iterable[i]
[docs] def bandit(self, choice_rewards): """ Multi-armed bandit method which chooses the arm for which the upper confidence bound (UCB) of expected reward is greatest. If there are multiple arms with the same UCB1 index, then one is chosen at random. An explanation is here: https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf """ # count the larger of 1 and the total number of arm pulls total_pulls = max(1, sum(len(r) for r in choice_rewards.values())) def ucb1(choice): rewards = choice_rewards[choice] choice_pulls = max(len(rewards), 1) average_reward = np.nanmean(rewards) if len(rewards) else 0 error = np.sqrt(2.0 * np.log(total_pulls) / choice_pulls) return average_reward + error return max(self._shuffle(choice_rewards), key=ucb1)