# Copyright (c) Yuta Saito, Yusuke Narita, and ZOZO Technologies, Inc. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Contextual Logistic Bandit Algorithms."""
from dataclasses import dataclass
from typing import Optional
import numpy as np
from sklearn.utils import check_random_state
from scipy.optimize import minimize
from .base import BaseContextualPolicy
from ..utils import sigmoid


@dataclass
class LogisticEpsilonGreedy(BaseContextualPolicy):
"""Logistic Epsilon Greedy.
Parameters
-----------
dim: int
Number of dimensions of context vectors.
n_actions: int
Number of actions.
    len_list: int, default=1
        Length of the list of actions recommended in each impression.
        When the Open Bandit Dataset is used, this should be set to 3.
batch_size: int, default=1
Number of samples used in a batch parameter update.
alpha_: float, default=1.
Prior parameter for the online logistic regression.
lambda_: float, default=1.
Regularization hyperparameter for the online logistic regression.
random_state: int, default=None
Controls the random seed in sampling actions.
    epsilon: float, default=0.
        Exploration hyperparameter that must take a value in the range of [0., 1.].
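
    Examples
    --------
    A minimal usage sketch on synthetic data; the context, reward, and
    hyperparameter values below are illustrative only:

    >>> import numpy as np
    >>> policy = LogisticEpsilonGreedy(dim=5, n_actions=3, epsilon=0.1, random_state=12345)
    >>> context = np.ones((1, 5))
    >>> action = policy.select_action(context)[0]
    >>> policy.update_params(action=int(action), reward=1.0, context=context)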
"""
epsilon: float = 0.0

    def __post_init__(self) -> None:
"""Initialize class."""
assert (
0 <= self.epsilon <= 1
), f"epsilon must be between 0 and 1, but {self.epsilon} is given"
self.policy_name = f"logistic_egreedy_{self.epsilon}"
super().__post_init__()
self.model_list = [
MiniBatchLogisticRegression(
lambda_=self.lambda_list[i], alpha=self.alpha_list[i], dim=self.dim
)
for i in np.arange(self.n_actions)
]
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]

    def select_action(self, context: np.ndarray) -> np.ndarray:
"""Select action for new data.
Parameters
----------
context: array-like, shape (1, dim_context)
Observed context vector.
Returns
----------
selected_actions: array-like, shape (len_list, )
List of selected actions.
"""
if self.random_.rand() > self.epsilon:
theta = np.array(
[model.predict_proba(context) for model in self.model_list]
).flatten()
return theta.argsort()[::-1][: self.len_list]
else:
return self.random_.choice(
self.n_actions, size=self.len_list, replace=False
)

    def update_params(self, action: int, reward: float, context: np.ndarray) -> None:
"""Update policy parameters.
Parameters
----------
action: int
Selected action by the policy.
reward: float
Observed reward for the chosen action and position.
context: array-like, shape (1, dim_context)
Observed context vector.
"""
self.n_trial += 1
self.action_counts[action] += 1
self.reward_lists[action].append(reward)
self.context_lists[action].append(context)
if self.n_trial % self.batch_size == 0:
            for action_, model in enumerate(self.model_list):
                if len(self.reward_lists[action_]) > 0:
                    model.fit(
                        X=np.concatenate(self.context_lists[action_], axis=0),
                        y=np.array(self.reward_lists[action_]),
                    )
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]


@dataclass
class LogisticUCB(BaseContextualPolicy):
"""Logistic Upper Confidence Bound.
Parameters
------------
dim: int
Number of dimensions of context vectors.
n_actions: int
Number of actions.
    len_list: int, default=1
        Length of the list of actions recommended in each impression.
        When the Open Bandit Dataset is used, this should be set to 3.
batch_size: int, default=1
Number of samples used in a batch parameter update.
alpha_: float, default=1.
Prior parameter for the online logistic regression.
lambda_: float, default=1.
Regularization hyperparameter for the online logistic regression.
random_state: int, default=None
Controls the random seed in sampling actions.
    epsilon: float, default=0.
        Exploration hyperparameter that controls the width of the upper
        confidence bound and must take a value in the range of [0., 1.].

    References
    ----------
Lihong Li, Wei Chu, John Langford, and Robert E Schapire.
"A Contextual-bandit Approach to Personalized News Article Recommendation," 2010.
"""
epsilon: float = 0.0

    def __post_init__(self) -> None:
"""Initialize class."""
assert (
0 <= self.epsilon <= 1
), f"epsilon must be between 0 and 1, but {self.epsilon} is given"
self.policy_name = f"logistic_ucb_{self.epsilon}"
super().__post_init__()
self.model_list = [
MiniBatchLogisticRegression(
lambda_=self.lambda_list[i], alpha=self.alpha_list[i], dim=self.dim
)
for i in np.arange(self.n_actions)
]
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]

    def select_action(self, context: np.ndarray) -> np.ndarray:
"""Select action for new data.
Parameters
------------
context: array-like, shape (1, dim_context)
Observed context vector.
Returns
----------
selected_actions: array-like, shape (len_list, )
List of selected actions.
"""
theta = np.array(
[model.predict_proba(context) for model in self.model_list]
).flatten()
std = np.array(
[
np.sqrt(np.sum((model._q ** (-1)) * (context ** 2)))
for model in self.model_list
]
).flatten()
ucb_score = theta + self.epsilon * std
return ucb_score.argsort()[::-1][: self.len_list]

    def update_params(self, action: int, reward: float, context: np.ndarray) -> None:
"""Update policy parameters.
Parameters
------------
action: int
Selected action by the policy.
reward: float
Observed reward for the chosen action and position.
context: array-like, shape (1, dim_context)
Observed context vector.
"""
self.n_trial += 1
self.action_counts[action] += 1
self.reward_lists[action].append(reward)
self.context_lists[action].append(context)
if self.n_trial % self.batch_size == 0:
            for action_, model in enumerate(self.model_list):
                if len(self.reward_lists[action_]) > 0:
                    model.fit(
                        X=np.concatenate(self.context_lists[action_], axis=0),
                        y=np.array(self.reward_lists[action_]),
                    )
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]


@dataclass
class LogisticTS(BaseContextualPolicy):
"""Logistic Thompson Sampling.
Parameters
----------
dim: int
Number of dimensions of context vectors.
n_actions: int
Number of actions.
    len_list: int, default=1
        Length of the list of actions recommended in each impression.
        When the Open Bandit Dataset is used, this should be set to 3.
batch_size: int, default=1
Number of samples used in a batch parameter update.
alpha_: float, default=1.
Prior parameter for the online logistic regression.
lambda_: float, default=1.
Regularization hyperparameter for the online logistic regression.
random_state: int, default=None
Controls the random seed in sampling actions.

    References
----------
Olivier Chapelle and Lihong Li.
"An empirical evaluation of thompson sampling," 2011.
"""
policy_name: str = "logistic_ts"

    def __post_init__(self) -> None:
"""Initialize class."""
super().__post_init__()
self.model_list = [
MiniBatchLogisticRegression(
lambda_=self.lambda_list[i],
alpha=self.alpha_list[i],
dim=self.dim,
random_state=self.random_state,
)
for i in np.arange(self.n_actions)
]
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]

    def select_action(self, context: np.ndarray) -> np.ndarray:
"""Select action for new data.
Parameters
----------
context: array-like, shape (1, dim_context)
Observed context vector.
Returns
----------
selected_actions: array-like, shape (len_list, )
List of selected actions.
"""
theta = np.array(
[model.predict_proba_with_sampling(context) for model in self.model_list]
).flatten()
return theta.argsort()[::-1][: self.len_list]

    def update_params(self, action: int, reward: float, context: np.ndarray) -> None:
"""Update policy parameters.
Parameters
----------
action: int
Selected action by the policy.
reward: float
Observed reward for the chosen action and position.
context: array-like, shape (1, dim_context)
Observed context vector.
"""
self.n_trial += 1
self.action_counts[action] += 1
self.reward_lists[action].append(reward)
self.context_lists[action].append(context)
if self.n_trial % self.batch_size == 0:
            for action_, model in enumerate(self.model_list):
                if len(self.reward_lists[action_]) > 0:
                    model.fit(
                        X=np.concatenate(self.context_lists[action_], axis=0),
                        y=np.array(self.reward_lists[action_]),
                    )
self.reward_lists = [[] for _ in np.arange(self.n_actions)]
self.context_lists = [[] for _ in np.arange(self.n_actions)]


@dataclass
class MiniBatchLogisticRegression:
"""MiniBatch Online Logistic Regression Model."""
lambda_: float
alpha: float
dim: int
random_state: Optional[int] = None

    def __post_init__(self) -> None:
        """Initialize class."""
self._m = np.zeros(self.dim)
self._q = np.ones(self.dim) * self.lambda_
self.random_ = check_random_state(self.random_state)

    def loss(self, w: np.ndarray, *args) -> float:
"""Calculate loss function."""
X, y = args
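        # penalized logistic loss: a Gaussian prior centered at the current
        # mean `_m` with diagonal precision `_q`, plus the logistic data term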
return (
0.5 * (self._q * (w - self._m)).dot(w - self._m)
+ np.log(1 + np.exp(-y * w.dot(X.T))).sum()
)

    def grad(self, w: np.ndarray, *args) -> np.ndarray:
"""Calculate gradient."""
X, y = args
        return self._q * (w - self._m) - (
            (y * X.T) / (1.0 + np.exp(y * w.dot(X.T)))
        ).T.sum(axis=0)

    def sample(self) -> np.ndarray:
        """Sample a coefficient vector from the approximate posterior distribution."""
return self.random_.normal(self._m, self.sd(), size=self.dim)

    def fit(self, X: np.ndarray, y: np.ndarray):
"""Update coefficient vector by the mini-batch data."""
self._m = minimize(
self.loss,
self._m,
args=(X, y),
jac=self.grad,
method="L-BFGS-B",
options={"maxiter": 20, "disp": False},
).x
        # update the diagonal posterior precision via the Laplace approximation;
        # P is the predicted probability sigmoid(X.dot(_m))
        P = (1 + np.exp(-X.dot(self._m))) ** (-1)
        self._q = self._q + (P * (1 - P)).dot(X ** 2)

    def sd(self) -> np.ndarray:
"""Standard deviation for the coefficient vector."""
return self.alpha * (self._q) ** (-1.0)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict the expected probability using the posterior mean of the coefficient vector."""
return sigmoid(X.dot(self._m))

    def predict_proba_with_sampling(self, X: np.ndarray) -> np.ndarray:
        """Predict the expected probability using a coefficient vector sampled from the approximate posterior."""
return sigmoid(X.dot(self.sample()))