# Copyright (c) Yuta Saito, Yusuke Narita, and ZOZO Technologies, Inc. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Context-Free Bandit Algorithms."""
import os
# import pkg_resources
import yaml
from dataclasses import dataclass
from typing import Optional
import numpy as np
from .base import BaseContextFreePolicy
# configurations to replicate the Bernoulli Thompson Sampling policy used in ZOZOTOWN production
prior_bts_file = os.path.join(os.path.dirname(__file__), "conf", "prior_bts.yaml")
# prior_bts_file = pkg_resources.resource_stream(__name__, "data/emperors.csv")
with open(prior_bts_file, "rb") as f:
production_prior_for_bts = yaml.safe_load(f)
[docs]@dataclass
class EpsilonGreedy(BaseContextFreePolicy):
"""Epsilon Greedy policy.
Parameters
----------
n_actions: int
Number of actions.
len_list: int, default=1
Length of a list of actions recommended in each impression.
When Open Bandit Dataset is used, 3 should be set.
batch_size: int, default=1
Number of samples used in a batch parameter update.
random_state: int, default=None
Controls the random seed in sampling actions.
epsilon: float, default=1.
Exploration hyperparameter that must take value in the range of [0., 1.].
policy_name: str, default=f'egreedy_{epsilon}'.
Name of bandit policy.
"""
epsilon: float = 1.0
def __post_init__(self) -> None:
"""Initialize Class."""
assert (
0 <= self.epsilon <= 1
), f"epsilon must be between 0 and 1, but {self.epsilon} is given"
self.policy_name = f"egreedy_{self.epsilon}"
super().__post_init__()
[docs] def select_action(self) -> np.ndarray:
"""Select a list of actions.
Returns
----------
selected_actions: array-like, shape (len_list, )
List of selected actions.
"""
if (self.random_.rand() > self.epsilon) and (self.action_counts.min() > 0):
predicted_rewards = self.reward_counts / self.action_counts
return predicted_rewards.argsort()[::-1][: self.len_list]
else:
return self.random_.choice(
self.n_actions, size=self.len_list, replace=False
)
[docs] def update_params(self, action: int, reward: float) -> None:
"""Update policy parameters.
Parameters
----------
action: int
Selected action by the policy.
reward: float
Observed reward for the chosen action and position.
"""
self.n_trial += 1
self.action_counts_temp[action] += 1
n, old_reward = self.action_counts_temp[action], self.reward_counts_temp[action]
self.reward_counts_temp[action] = (old_reward * (n - 1) / n) + (reward / n)
if self.n_trial % self.batch_size == 0:
self.action_counts = np.copy(self.action_counts_temp)
self.reward_counts = np.copy(self.reward_counts_temp)
[docs]@dataclass
class Random(EpsilonGreedy):
"""Random policy
Parameters
----------
n_actions: int
Number of actions.
len_list: int, default=1
Length of a list of actions recommended in each impression.
When Open Bandit Dataset is used, 3 should be set.
batch_size: int, default=1
Number of samples used in a batch parameter update.
random_state: int, default=None
Controls the random seed in sampling actions.
epsilon: float, default=1.
Exploration hyperparameter that must take value in the range of [0., 1.].
policy_name: str, default='random'.
Name of bandit policy.
"""
policy_name: str = "random"
[docs] def compute_batch_action_dist(
self,
n_rounds: int = 1,
n_sim: int = 100000,
) -> np.ndarray:
"""Compute the distribution over actions by Monte Carlo simulation.
Parameters
----------
n_rounds: int, default=1
Number of rounds in the distribution over actions.
(the size of the first axis of `action_dist`)
Returns
----------
action_dist: array-like, shape (n_rounds, n_actions, len_list)
Probability estimates of each arm being the best one for each sample, action, and position.
"""
action_dist = np.ones((n_rounds, self.n_actions, self.len_list)) * (
1 / self.n_actions
)
return action_dist
[docs]@dataclass
class BernoulliTS(BaseContextFreePolicy):
"""Bernoulli Thompson Sampling Policy
Parameters
----------
n_actions: int
Number of actions.
len_list: int, default=1
Length of a list of actions recommended in each impression.
When Open Bandit Dataset is used, 3 should be set.
batch_size: int, default=1
Number of samples used in a batch parameter update.
random_state: int, default=None
Controls the random seed in sampling actions.
alpha: array-like, shape (n_actions, ), default=None
Prior parameter vector for Beta distributions.
beta: array-like, shape (n_actions, ), default=None
Prior parameter vector for Beta distributions.
is_zozotown_prior: bool, default=False
Whether to use hyperparameters for the beta distribution used
at the start of the data collection period in ZOZOTOWN.
campaign: str, default=None
One of the three possible campaigns considered in ZOZOTOWN, "all", "men", and "women".
policy_name: str, default='bts'
Name of bandit policy.
"""
alpha: Optional[np.ndarray] = None
beta: Optional[np.ndarray] = None
is_zozotown_prior: bool = False
campaign: Optional[str] = None
policy_name: str = "bts"
def __post_init__(self) -> None:
"""Initialize class."""
super().__post_init__()
if self.is_zozotown_prior:
assert (
self.campaign is not None
), "`campaign` must be specified when `is_zozotown_prior` is True."
self.alpha = production_prior_for_bts[self.campaign]["alpha"]
self.beta = production_prior_for_bts[self.campaign]["beta"]
else:
self.alpha = np.ones(self.n_actions) if self.alpha is None else self.alpha
self.beta = np.ones(self.n_actions) if self.beta is None else self.beta
[docs] def select_action(self) -> np.ndarray:
"""Select a list of actions.
Returns
----------
selected_actions: array-like, shape (len_list, )
List of selected actions.
"""
predicted_rewards = self.random_.beta(
a=self.reward_counts + self.alpha,
b=(self.action_counts - self.reward_counts) + self.beta,
)
return predicted_rewards.argsort()[::-1][: self.len_list]
[docs] def update_params(self, action: int, reward: float) -> None:
"""Update policy parameters.
Parameters
----------
action: int
Selected action by the policy.
reward: float
Observed reward for the chosen action and position.
"""
self.n_trial += 1
self.action_counts_temp[action] += 1
self.reward_counts_temp[action] += reward
if self.n_trial % self.batch_size == 0:
self.action_counts = np.copy(self.action_counts_temp)
self.reward_counts = np.copy(self.reward_counts_temp)
[docs] def compute_batch_action_dist(
self,
n_rounds: int = 1,
n_sim: int = 100000,
) -> np.ndarray:
"""Compute the distribution over actions by Monte Carlo simulation.
Parameters
----------
n_rounds: int, default=1
Number of rounds in the distribution over actions.
(the size of the first axis of `action_dist`)
n_sim: int, default=100000
Number of simulations in the Monte Carlo simulation to compute the distribution over actions.
Returns
----------
action_dist: array-like, shape (n_rounds, n_actions, len_list)
Probability estimates of each arm being the best one for each sample, action, and position.
"""
action_count = np.zeros((self.n_actions, self.len_list))
for _ in np.arange(n_sim):
selected_actions = self.select_action()
for pos in np.arange(self.len_list):
action_count[selected_actions[pos], pos] += 1
action_dist = np.tile(
action_count / n_sim,
(n_rounds, 1, 1),
)
return action_dist