# Copyright (c) Yuta Saito, Yusuke Narita, and ZOZO Technologies, Inc. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Class for Multi-Class Classification to Bandit Reduction."""
from dataclasses import dataclass
from typing import Optional, Union
import numpy as np
from scipy.stats import rankdata
from sklearn.base import ClassifierMixin, is_classifier, clone
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state, check_X_y
from .base import BaseSyntheticBanditDataset
from ..types import BanditFeedback
@dataclass
class MultiClassToBanditReduction(BaseSyntheticBanditDataset):
"""Class for handling multi-class classification data as logged bandit feedback data.
Note
-----
A machine learning classifier such as logistic regression is used to construct behavior and evaluation policies as follows.
1. Split the original data into training (:math:`\\mathcal{D}_{\\mathrm{tr}}`) and evaluation (:math:`\\mathcal{D}_{\\mathrm{ev}}`) sets.
2. Train classifiers on :math:`\\mathcal{D}_{\\mathrm{tr}}` and obtain base deterministic policies :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}`.
3. Construct behavior (:math:`\\pi_{b}`) and evaluation (:math:`\\pi_{e}`) policies based on :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}` as
.. math::

    \\pi_b (a | x) := \\alpha_b \\cdot \\pi_{\\mathrm{det},b} (a|x) + (1.0 - \\alpha_b) \\cdot \\pi_{u} (a|x)

.. math::

    \\pi_e (a | x) := \\alpha_e \\cdot \\pi_{\\mathrm{det},e} (a|x) + (1.0 - \\alpha_e) \\cdot \\pi_{u} (a|x)

where :math:`\\pi_{u}` is a uniform random policy and :math:`\\alpha_b` and :math:`\\alpha_e` are set by the user (see the numeric sketch after this list).
4. Measure the accuracy of the evaluation policy on :math:`\\mathcal{D}_{\\mathrm{ev}}` with its fully observed rewards
and use it as the evaluation policy's ground truth policy value.
5. Using :math:`\\mathcal{D}_{\\mathrm{ev}}`, an estimator :math:`\\hat{V}` estimates the policy value of the evaluation policy, i.e.,
.. math::

    V(\\pi_e) \\approx \\hat{V} (\\pi_e; \\mathcal{D}_{\\mathrm{ev}})
6. Evaluate the estimation performance of :math:`\\hat{V}` by comparing its estimate with the ground-truth policy value.
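A minimal numeric sketch of the mixing in step 3, using hypothetical values (three actions, a base classifier that deterministically predicts action 1, and :math:`\\alpha_b = 0.8`):

.. code-block:: python

    >>> import numpy as np
    >>> n_actions, alpha_b = 3, 0.8
    >>> pi_det_b = np.array([0.0, 1.0, 0.0])  # deterministic base policy (one-hot prediction)
    >>> pi_u = np.ones(n_actions) / n_actions  # uniform random policy
    >>> pi_b = alpha_b * pi_det_b + (1.0 - alpha_b) * pi_u
    >>> pi_b
    array([0.06666667, 0.86666667, 0.06666667])
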
Parameters
-----------
X: array-like, shape (n_rounds, n_features)
Training vector of the original multi-class classification data,
where n_rounds is the number of samples and n_features is the number of features.
y: array-like, shape (n_rounds,)
Target vector (relative to X) of the original multi-class classification data.
base_classifier_b: ClassifierMixin
Machine learning classifier used to construct a behavior policy.
alpha_b: float, default=0.8
Ratio of the base deterministic policy when constructing the **behavior** policy; the remaining :math:`1 - \\alpha_b` is assigned to a uniform random policy.
Must be in the [0, 1) interval to make the behavior policy a stochastic one.
dataset_name: str, default=None
Name of the dataset.
Examples
----------
.. code-block:: python
# evaluate the estimation performance of IPW using the `digits` data in sklearn
>>> import numpy as np
>>> from sklearn.datasets import load_digits
>>> from sklearn.linear_model import LogisticRegression
# import open bandit pipeline (obp)
>>> from obp.dataset import MultiClassToBanditReduction
>>> from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW
# load raw digits data
>>> X, y = load_digits(return_X_y=True)
# convert the raw classification data into the logged bandit dataset
>>> dataset = MultiClassToBanditReduction(
X=X,
y=y,
base_classifier_b=LogisticRegression(random_state=12345),
alpha_b=0.8,
dataset_name="digits",
)
# split the original data into the training and evaluation sets
>>> dataset.split_train_eval(eval_size=0.7, random_state=12345)
# obtain logged bandit feedback generated by behavior policy
>>> bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=12345)
>>> bandit_feedback
{
'n_actions': 10,
'n_rounds': 1258,
'context': array([[ 0., 0., 0., ..., 16., 1., 0.],
[ 0., 0., 7., ..., 16., 3., 0.],
[ 0., 0., 12., ..., 8., 0., 0.],
...,
[ 0., 1., 13., ..., 8., 11., 1.],
[ 0., 0., 15., ..., 0., 0., 0.],
[ 0., 0., 4., ..., 15., 3., 0.]]),
'action': array([6, 8, 5, ..., 2, 5, 9]),
'reward': array([1., 1., 1., ..., 1., 1., 1.]),
'position': array([0, 0, 0, ..., 0, 0, 0]),
'pscore': array([0.82, 0.82, 0.82, ..., 0.82, 0.82, 0.82])
}
# obtain action choice probabilities by an evaluation policy and its ground-truth policy value
>>> action_dist = dataset.obtain_action_dist_by_eval_policy(
base_classifier_e=LogisticRegression(C=100, random_state=12345),
alpha_e=0.9,
)
>>> ground_truth = dataset.calc_ground_truth_policy_value(action_dist=action_dist)
>>> ground_truth
0.865643879173291
# off-policy evaluation using IPW
>>> ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback, ope_estimators=[IPW()])
>>> estimated_policy_value = ope.estimate_policy_values(action_dist=action_dist)
>>> estimated_policy_value
{'ipw': 0.8662705029276045}
# evaluate the estimation performance (accuracy) of IPW by relative estimation error (relative-ee)
>>> relative_estimation_errors = ope.evaluate_performance_of_estimators(
ground_truth_policy_value=ground_truth,
action_dist=action_dist,
)
>>> relative_estimation_errors
{'ipw': 0.000723881690137968}
References
------------
Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li.
"Doubly Robust Policy Evaluation and Optimization.", 2014.
"""
X: np.ndarray
y: np.ndarray
base_classifier_b: ClassifierMixin
alpha_b: float = 0.8
dataset_name: Optional[str] = None
def __post_init__(self) -> None:
"""Initialize Class."""
assert is_classifier(
self.base_classifier_b
), "base_classifier_b must be a classifier"
assert (0.0 <= self.alpha_b < 1.0) and isinstance(
self.alpha_b, float
), f"alpha_b must be a float in the [0,1) interval, but {self.alpha_b} is given"
self.X, y = check_X_y(X=self.X, y=self.y, ensure_2d=True, multi_output=False)
self.y = (rankdata(y, "dense") - 1).astype(int) # re-index action
# fully observed labels
self.y_full = np.zeros((self.n_rounds, self.n_actions))
self.y_full[np.arange(self.n_rounds), self.y] = 1  # index with the re-indexed labels
@property
def len_list(self) -> int:
"""Length of recommendation lists."""
return 1
@property
def n_actions(self) -> int:
"""Number of actions (number of classes)."""
return np.unique(self.y).shape[0]
@property
def n_rounds(self) -> int:
"""Number of samples in the original multi-class classification data."""
return self.y.shape[0]
def split_train_eval(
self, eval_size: Union[int, float] = 0.25, random_state: Optional[int] = None,
) -> None:
"""Split the original data into the training (used for policy learning) and evaluation (used for OPE) sets.
Parameters
----------
eval_size: float or int, default=0.25
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the evaluation split.
If int, represents the absolute number of test samples.
random_state: int, default=None
Controls the random seed in train-evaluation split.
"""
(
self.X_tr,
self.X_ev,
self.y_tr,
self.y_ev,
_,
self.y_full_ev,
) = train_test_split(
self.X, self.y, self.y_full, test_size=eval_size, random_state=random_state
)
self.n_rounds_ev = self.X_ev.shape[0]
def obtain_batch_bandit_feedback(
self, random_state: Optional[int] = None,
) -> BanditFeedback:
"""Obtain batch logged bandit feedback, an evaluation policy, and its ground-truth policy value.
Note
-------
Please call `self.split_train_eval()` before calling this method.
Parameters
-----------
random_state: int, default=None
Controls the random seed in sampling actions.
Returns
---------
bandit_feedback: BanditFeedback
bandit_feedback is logged bandit feedback data generated from a multi-class classification dataset.
"""
random_ = check_random_state(random_state)
# train a base ML classifier
base_clf_b = clone(self.base_classifier_b)
base_clf_b.fit(X=self.X_tr, y=self.y_tr)
preds = base_clf_b.predict(self.X_ev).astype(int)
# construct a behavior policy
pi_b = np.zeros((self.n_rounds_ev, self.n_actions))
pi_b[:, :] = (1.0 - self.alpha_b) / self.n_actions
pi_b[np.arange(self.n_rounds_ev), preds] = (
self.alpha_b + (1.0 - self.alpha_b) / self.n_actions
)
# sample action and factual reward based on the behavior policy
action = np.zeros(self.n_rounds_ev, dtype=int)
for i, p in enumerate(pi_b):
action[i] = random_.choice(
np.arange(self.n_actions, dtype=int), p=p, replace=False
)
reward = self.y_full_ev[np.arange(self.n_rounds_ev), action]
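# propensity score (pscore) is the behavior policy's probability of the logged action, i.e., pi_b(a_i|x_i)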
return dict(
n_actions=self.n_actions,
n_rounds=self.n_rounds_ev,
context=self.X_ev,
action=action,
reward=reward,
position=np.zeros(self.n_rounds_ev, dtype=int),
pscore=pi_b[np.arange(self.n_rounds_ev), action],
)
def obtain_action_dist_by_eval_policy(
self, base_classifier_e: Optional[ClassifierMixin] = None, alpha_e: float = 1.0
) -> np.ndarray:
"""Obtain action choice probabilities by an evaluation policy.
Parameters
-----------
base_classifier_e: ClassifierMixin, default=None
Machine learning classifier used to construct an evaluation policy.
If None, the classifier used for the behavior policy (`base_classifier_b`) is reused.
alpha_e: float, default=1.0
Ratio of the base deterministic policy when constructing the **evaluation** policy; the remaining :math:`1 - \\alpha_e` is assigned to a uniform random policy.
Must be in the [0, 1] interval (the evaluation policy can be deterministic).
Returns
---------
action_dist_by_eval_policy: array-like, shape (n_rounds_ev, n_actions, 1)
Action choice probabilities of the evaluation policy,
where n_rounds_ev is the number of samples in the evaluation set given the current train-eval split and
n_actions is the number of actions.
Axis 2 represents the length of a recommendation list; it is always 1 in the current implementation.
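As a minimal sketch (assuming the digits setup from the class-level example, after `split_train_eval` has been called), the returned array has one row per evaluation-set sample and each row sums to one:

.. code-block:: python

    >>> action_dist = dataset.obtain_action_dist_by_eval_policy(alpha_e=0.9)
    >>> action_dist.shape == (dataset.n_rounds_ev, dataset.n_actions, 1)
    True
    >>> np.allclose(action_dist.sum(axis=1), 1.0)
    True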
"""
assert (0.0 <= alpha_e <= 1.0) and isinstance(
alpha_e, float
), f"alpha_e must be a float in the [0,1] interval, but {alpha_e} is given"
# train a base ML classifier
if base_classifier_e is None:
base_clf_e = clone(self.base_classifier_b)
else:
assert is_classifier(
base_classifier_e
), "base_classifier_e must be a classifier"
base_clf_e = clone(base_classifier_e)
base_clf_e.fit(X=self.X_tr, y=self.y_tr)
preds = base_clf_e.predict(self.X_ev).astype(int)
# construct an evaluation policy
pi_e = np.zeros((self.n_rounds_ev, self.n_actions))
pi_e[:, :] = (1.0 - alpha_e) / self.n_actions
pi_e[np.arange(self.n_rounds_ev), preds] = (
alpha_e + (1.0 - alpha_e) / self.n_actions
)
return np.expand_dims(pi_e, 2)
def calc_ground_truth_policy_value(self, action_dist: np.ndarray) -> float:
"""Calculate the ground-truth policy value of a given action distribution.
Parameters
----------
action_dist: array-like, shape (n_rounds_ev, n_actions, 1)
Action distribution or action choice probabilities of a policy whose ground-truth policy value is to be calculated here,
where n_rounds_ev is the number of samples in the evaluation set given the current train-eval split and
n_actions is the number of actions.
Axis 2 of action_dist represents the length of a recommendation list; it is always 1 in the current implementation.
Returns
---------
ground_truth_policy_value: float
Policy value of the given action distribution (typically that of an evaluation policy).
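Note
-------
The value computed here is the average probability that the given policy assigns to the true label over the evaluation set, i.e.,

.. math::

    V(\\pi) = \\frac{1}{|\\mathcal{D}_{\\mathrm{ev}}|} \\sum_{i \\in \\mathcal{D}_{\\mathrm{ev}}} \\pi(y_i | x_i),

which reduces to the classification accuracy of the underlying classifier when the policy is deterministic.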
"""
assert isinstance(
action_dist, np.ndarray
) and action_dist.ndim == 3, "action_dist must be a 3-D np.ndarray"
assert (
action_dist.shape[0] == self.n_rounds_ev
), "the size of axis 0 of action_dist must be the same as the number of samples in the evaluation set"
return action_dist[np.arange(self.n_rounds_ev), self.y_ev].mean()