add read me
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,187 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import (
|
||||
MaxAbsScaler,
|
||||
MinMaxScaler,
|
||||
PowerTransformer,
|
||||
QuantileTransformer,
|
||||
RobustScaler,
|
||||
StandardScaler,
|
||||
maxabs_scale,
|
||||
minmax_scale,
|
||||
power_transform,
|
||||
quantile_transform,
|
||||
robust_scale,
|
||||
scale,
|
||||
)
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
from sklearn.utils.fixes import (
|
||||
BSR_CONTAINERS,
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DIA_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
|
||||
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
|
||||
"""Get non NaN samples in column of X"""
|
||||
return X[:, [col]][~np.isnan(X[:, col])]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"est, func, support_sparse, strictly_positive, omit_kwargs",
|
||||
[
|
||||
(MaxAbsScaler(), maxabs_scale, True, False, []),
|
||||
(MinMaxScaler(), minmax_scale, False, False, ["clip"]),
|
||||
(StandardScaler(), scale, False, False, []),
|
||||
(StandardScaler(with_mean=False), scale, True, False, []),
|
||||
(PowerTransformer("yeo-johnson"), power_transform, False, False, []),
|
||||
(PowerTransformer("box-cox"), power_transform, False, True, []),
|
||||
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
|
||||
(RobustScaler(), robust_scale, False, False, []),
|
||||
(RobustScaler(with_centering=False), robust_scale, True, False, []),
|
||||
],
|
||||
)
|
||||
def test_missing_value_handling(
|
||||
est, func, support_sparse, strictly_positive, omit_kwargs
|
||||
):
|
||||
# check that the preprocessing method let pass nan
|
||||
rng = np.random.RandomState(42)
|
||||
X = iris.data.copy()
|
||||
n_missing = 50
|
||||
X[
|
||||
rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
|
||||
] = np.nan
|
||||
if strictly_positive:
|
||||
X += np.nanmin(X) + 0.1
|
||||
X_train, X_test = train_test_split(X, random_state=1)
|
||||
# sanity check
|
||||
assert not np.all(np.isnan(X_train), axis=0).any()
|
||||
assert np.any(np.isnan(X_train), axis=0).all()
|
||||
assert np.any(np.isnan(X_test), axis=0).all()
|
||||
X_test[:, 0] = np.nan # make sure this boundary case is tested
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt = est.fit(X_train).transform(X_test)
|
||||
# ensure no warnings are raised
|
||||
# missing values should still be missing, and only them
|
||||
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
|
||||
|
||||
# check that the function leads to the same results as the class
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt_class = est.transform(X_train)
|
||||
kwargs = est.get_params()
|
||||
# remove the parameters which should be omitted because they
|
||||
# are not defined in the counterpart function of the preprocessing class
|
||||
for kwarg in omit_kwargs:
|
||||
_ = kwargs.pop(kwarg)
|
||||
Xt_func = func(X_train, **kwargs)
|
||||
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
|
||||
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
|
||||
|
||||
# check that the inverse transform keep NaN
|
||||
Xt_inv = est.inverse_transform(Xt)
|
||||
assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
|
||||
# FIXME: we can introduce equal_nan=True in recent version of numpy.
|
||||
# For the moment which just check that non-NaN values are almost equal.
|
||||
assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
|
||||
|
||||
for i in range(X.shape[1]):
|
||||
# train only on non-NaN
|
||||
est.fit(_get_valid_samples_by_column(X_train, i))
|
||||
# check transforming with NaN works even when training without NaN
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt_col = est.transform(X_test[:, [i]])
|
||||
assert_allclose(Xt_col, Xt[:, [i]])
|
||||
# check non-NaN is handled as before - the 1st column is all nan
|
||||
if not np.isnan(X_test[:, i]).all():
|
||||
Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
|
||||
assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])
|
||||
|
||||
if support_sparse:
|
||||
est_dense = clone(est)
|
||||
est_sparse = clone(est)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt_dense = est_dense.fit(X_train).transform(X_test)
|
||||
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
|
||||
|
||||
for sparse_container in (
|
||||
BSR_CONTAINERS
|
||||
+ COO_CONTAINERS
|
||||
+ CSC_CONTAINERS
|
||||
+ CSR_CONTAINERS
|
||||
+ DIA_CONTAINERS
|
||||
+ DOK_CONTAINERS
|
||||
+ LIL_CONTAINERS
|
||||
):
|
||||
# check that the dense and sparse inputs lead to the same results
|
||||
# precompute the matrix to avoid catching side warnings
|
||||
X_train_sp = sparse_container(X_train)
|
||||
X_test_sp = sparse_container(X_test)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", PendingDeprecationWarning)
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
|
||||
|
||||
assert_allclose(Xt_sp.toarray(), Xt_dense)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", PendingDeprecationWarning)
|
||||
warnings.simplefilter("error", RuntimeWarning)
|
||||
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
|
||||
|
||||
assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"est, func",
|
||||
[
|
||||
(MaxAbsScaler(), maxabs_scale),
|
||||
(MinMaxScaler(), minmax_scale),
|
||||
(StandardScaler(), scale),
|
||||
(StandardScaler(with_mean=False), scale),
|
||||
(PowerTransformer("yeo-johnson"), power_transform),
|
||||
(
|
||||
PowerTransformer("box-cox"),
|
||||
power_transform,
|
||||
),
|
||||
(QuantileTransformer(n_quantiles=3), quantile_transform),
|
||||
(RobustScaler(), robust_scale),
|
||||
(RobustScaler(with_centering=False), robust_scale),
|
||||
],
|
||||
)
|
||||
def test_missing_value_pandas_na_support(est, func):
|
||||
# Test pandas IntegerArray with pd.NA
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
X = np.array(
|
||||
[
|
||||
[1, 2, 3, np.nan, np.nan, 4, 5, 1],
|
||||
[np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
|
||||
[1, 2, 3, 4, 5, 6, 7, 8],
|
||||
]
|
||||
).T
|
||||
|
||||
# Creates dataframe with IntegerArrays with pd.NA
|
||||
X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
|
||||
X_df["c"] = X_df["c"].astype("int")
|
||||
|
||||
X_trans = est.fit_transform(X)
|
||||
X_df_trans = est.fit_transform(X_df)
|
||||
|
||||
assert_allclose(X_trans, X_df_trans)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,665 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy, quantile_method, expected, sample_weight",
|
||||
[
|
||||
(
|
||||
"uniform",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"uniform",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
|
||||
[1, 1, 2, 1],
|
||||
),
|
||||
(
|
||||
"uniform",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
|
||||
[1, 1, 1, 1],
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
|
||||
[1, 1, 2, 1],
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
|
||||
[1, 1, 1, 1],
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
|
||||
[0, 1, 1, 1],
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
|
||||
[1, 0, 3, 1],
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
|
||||
[1, 1, 1, 1],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_fit_transform(strategy, quantile_method, expected, sample_weight):
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method
|
||||
)
|
||||
with ignore_warnings(category=UserWarning):
|
||||
# Ignore the warning on removed small bins.
|
||||
est.fit(X, sample_weight=sample_weight)
|
||||
assert_array_equal(est.transform(X), expected)
|
||||
|
||||
|
||||
def test_valid_n_bins():
|
||||
KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X)
|
||||
KBinsDiscretizer(
|
||||
n_bins=np.array([2])[0], quantile_method="averaged_inverted_cdf"
|
||||
).fit_transform(X)
|
||||
assert KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit(
|
||||
X
|
||||
).n_bins_.dtype == np.dtype(int)
|
||||
|
||||
|
||||
def test_invalid_n_bins_array():
|
||||
# Bad shape
|
||||
n_bins = np.full((2, 4), 2.0)
|
||||
est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
|
||||
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
est.fit_transform(X)
|
||||
|
||||
# Incorrect number of features
|
||||
n_bins = [1, 2, 2]
|
||||
est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
|
||||
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
est.fit_transform(X)
|
||||
|
||||
# Bad bin values
|
||||
n_bins = [1, 2, 2, 1]
|
||||
est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
|
||||
err_msg = (
|
||||
"KBinsDiscretizer received an invalid number of bins "
|
||||
"at indices 0, 3. Number of bins must be at least 2, "
|
||||
"and must be an int."
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
est.fit_transform(X)
|
||||
|
||||
# Float bin values
|
||||
n_bins = [2.1, 2, 2.1, 2]
|
||||
est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
|
||||
err_msg = (
|
||||
"KBinsDiscretizer received an invalid number of bins "
|
||||
"at indices 0, 2. Number of bins must be at least 2, "
|
||||
"and must be an int."
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
est.fit_transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy, quantile_method, expected, sample_weight",
|
||||
[
|
||||
(
|
||||
"uniform",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"linear",
|
||||
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
|
||||
[1, 1, 1, 1],
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
|
||||
[0, 1, 3, 1],
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]],
|
||||
[1, 1, 3, 1],
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
[[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
|
||||
[1, 0, 3, 1],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight):
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=[2, 3, 3, 3],
|
||||
encode="ordinal",
|
||||
strategy=strategy,
|
||||
quantile_method=quantile_method,
|
||||
).fit(X, sample_weight=sample_weight)
|
||||
assert_array_equal(est.transform(X), expected)
|
||||
|
||||
# test the shape of bin_edges_
|
||||
n_features = np.array(X).shape[1]
|
||||
assert est.bin_edges_.shape == (n_features,)
|
||||
for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
|
||||
assert bin_edges.shape == (n_bins + 1,)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
|
||||
def test_kbinsdiscretizer_effect_sample_weight():
|
||||
"""Check the impact of `sample_weight` one computed quantiles."""
|
||||
X = np.array([[-2], [-1], [1], [3], [500], [1000]])
|
||||
# add a large number of bins such that each sample with a non-null weight
|
||||
# will be used as bin edge
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=10,
|
||||
encode="ordinal",
|
||||
strategy="quantile",
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0])
|
||||
assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3])
|
||||
assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
|
||||
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
|
||||
"""Make sure that `sample_weight` is not changed in place."""
|
||||
|
||||
if strategy == "quantile":
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=3,
|
||||
encode="ordinal",
|
||||
strategy=strategy,
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
else:
|
||||
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
|
||||
sample_weight = np.array([1, 3, 1, 2], dtype=np.float64)
|
||||
sample_weight_copy = np.copy(sample_weight)
|
||||
est.fit(X, sample_weight=sample_weight)
|
||||
assert_allclose(sample_weight, sample_weight_copy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
|
||||
def test_same_min_max(strategy):
|
||||
warnings.simplefilter("always")
|
||||
X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
|
||||
if strategy == "quantile":
|
||||
est = KBinsDiscretizer(
|
||||
strategy=strategy,
|
||||
n_bins=3,
|
||||
encode="ordinal",
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
else:
|
||||
est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
|
||||
warning_message = "Feature 0 is constant and will be replaced with 0."
|
||||
with pytest.warns(UserWarning, match=warning_message):
|
||||
est.fit(X)
|
||||
assert est.n_bins_[0] == 1
|
||||
# replace the feature with zeros
|
||||
Xt = est.transform(X)
|
||||
assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_transform_1d_behavior():
|
||||
X = np.arange(4)
|
||||
est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
|
||||
with pytest.raises(ValueError):
|
||||
est.fit(X)
|
||||
|
||||
est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
|
||||
est.fit(X.reshape(-1, 1))
|
||||
with pytest.raises(ValueError):
|
||||
est.transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("i", range(1, 9))
|
||||
def test_numeric_stability(i):
|
||||
X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
|
||||
Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
|
||||
|
||||
# Test up to discretizing nano units
|
||||
X = X_init / 10**i
|
||||
Xt = KBinsDiscretizer(
|
||||
n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf"
|
||||
).fit_transform(X)
|
||||
assert_array_equal(Xt_expected, Xt)
|
||||
|
||||
|
||||
def test_encode_options():
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=[2, 3, 3, 3], encode="ordinal", quantile_method="averaged_inverted_cdf"
|
||||
).fit(X)
|
||||
Xt_1 = est.transform(X)
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=[2, 3, 3, 3],
|
||||
encode="onehot-dense",
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
).fit(X)
|
||||
Xt_2 = est.transform(X)
|
||||
assert not sp.issparse(Xt_2)
|
||||
assert_array_equal(
|
||||
OneHotEncoder(
|
||||
categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False
|
||||
).fit_transform(Xt_1),
|
||||
Xt_2,
|
||||
)
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=[2, 3, 3, 3], encode="onehot", quantile_method="averaged_inverted_cdf"
|
||||
).fit(X)
|
||||
Xt_3 = est.transform(X)
|
||||
assert sp.issparse(Xt_3)
|
||||
assert_array_equal(
|
||||
OneHotEncoder(
|
||||
categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True
|
||||
)
|
||||
.fit_transform(Xt_1)
|
||||
.toarray(),
|
||||
Xt_3.toarray(),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins",
|
||||
[
|
||||
("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
|
||||
("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
|
||||
(
|
||||
"quantile",
|
||||
"averaged_inverted_cdf",
|
||||
[0, 0, 0, 1, 1, 1],
|
||||
[0, 0, 1, 1, 2, 2],
|
||||
[0, 1, 2, 3, 4, 4],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_nonuniform_strategies(
|
||||
strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins
|
||||
):
|
||||
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
|
||||
|
||||
# with 2 bins
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=2, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
|
||||
)
|
||||
Xt = est.fit_transform(X)
|
||||
assert_array_equal(expected_2bins, Xt.ravel())
|
||||
|
||||
# with 3 bins
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=3, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
|
||||
)
|
||||
Xt = est.fit_transform(X)
|
||||
assert_array_equal(expected_3bins, Xt.ravel())
|
||||
|
||||
# with 5 bins
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=5, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
|
||||
)
|
||||
Xt = est.fit_transform(X)
|
||||
assert_array_equal(expected_5bins, Xt.ravel())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy, expected_inv,quantile_method",
|
||||
[
|
||||
(
|
||||
"uniform",
|
||||
[
|
||||
[-1.5, 2.0, -3.5, -0.5],
|
||||
[-0.5, 3.0, -2.5, -0.5],
|
||||
[0.5, 4.0, -1.5, 0.5],
|
||||
[0.5, 4.0, -1.5, 1.5],
|
||||
],
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
),
|
||||
(
|
||||
"kmeans",
|
||||
[
|
||||
[-1.375, 2.125, -3.375, -0.5625],
|
||||
[-1.375, 2.125, -3.375, -0.5625],
|
||||
[-0.125, 3.375, -2.125, 0.5625],
|
||||
[0.75, 4.25, -1.25, 1.625],
|
||||
],
|
||||
"warn", # default, will not warn when strategy != "quantile"
|
||||
),
|
||||
(
|
||||
"quantile",
|
||||
[
|
||||
[-1.5, 2.0, -3.5, -0.75],
|
||||
[-0.5, 3.0, -2.5, 0.0],
|
||||
[0.5, 4.0, -1.5, 1.25],
|
||||
[0.5, 4.0, -1.5, 1.25],
|
||||
],
|
||||
"averaged_inverted_cdf",
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
|
||||
def test_inverse_transform(strategy, encode, expected_inv, quantile_method):
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode
|
||||
)
|
||||
Xt = kbd.fit_transform(X)
|
||||
Xinv = kbd.inverse_transform(Xt)
|
||||
assert_array_almost_equal(expected_inv, Xinv)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
|
||||
def test_transform_outside_fit_range(strategy):
|
||||
X = np.array([0, 1, 2, 3])[:, None]
|
||||
|
||||
if strategy == "quantile":
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=4,
|
||||
strategy=strategy,
|
||||
encode="ordinal",
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
else:
|
||||
kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal")
|
||||
kbd.fit(X)
|
||||
|
||||
X2 = np.array([-2, 5])[:, None]
|
||||
X2t = kbd.transform(X2)
|
||||
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
|
||||
assert_array_equal(X2t.min(axis=0), [0])
|
||||
|
||||
|
||||
def test_overwrite():
|
||||
X = np.array([0, 1, 2, 3])[:, None]
|
||||
X_before = X.copy()
|
||||
|
||||
est = KBinsDiscretizer(
|
||||
n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal"
|
||||
)
|
||||
Xt = est.fit_transform(X)
|
||||
assert_array_equal(X, X_before)
|
||||
|
||||
Xt_before = Xt.copy()
|
||||
Xinv = est.inverse_transform(Xt)
|
||||
assert_array_equal(Xt, Xt_before)
|
||||
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy, expected_bin_edges, quantile_method",
|
||||
[
|
||||
("quantile", [0, 1.5, 3], "averaged_inverted_cdf"),
|
||||
("kmeans", [0, 1.5, 3], "warn"),
|
||||
],
|
||||
)
|
||||
def test_redundant_bins(strategy, expected_bin_edges, quantile_method):
|
||||
X = [[0], [0], [0], [0], [3], [3]]
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None
|
||||
)
|
||||
warning_message = "Consider decreasing the number of bins."
|
||||
with pytest.warns(UserWarning, match=warning_message):
|
||||
kbd.fit(X)
|
||||
|
||||
assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
|
||||
|
||||
|
||||
def test_percentile_numeric_stability():
|
||||
X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
|
||||
bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
|
||||
Xt = np.array([0, 0, 4]).reshape(-1, 1)
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=10,
|
||||
encode="ordinal",
|
||||
strategy="quantile",
|
||||
quantile_method="linear",
|
||||
)
|
||||
## TODO: change to averaged inverted cdf, but that means we only get bin
|
||||
## edges of 0.05 and 0.95 and nothing in between
|
||||
|
||||
warning_message = "Consider decreasing the number of bins."
|
||||
with pytest.warns(UserWarning, match=warning_message):
|
||||
kbd.fit(X)
|
||||
|
||||
assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
|
||||
assert_array_almost_equal(kbd.transform(X), Xt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
|
||||
@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64])
|
||||
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
|
||||
def test_consistent_dtype(in_dtype, out_dtype, encode):
|
||||
X_input = np.array(X, dtype=in_dtype)
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=3,
|
||||
encode=encode,
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
dtype=out_dtype,
|
||||
)
|
||||
kbd.fit(X_input)
|
||||
|
||||
# test output dtype
|
||||
if out_dtype is not None:
|
||||
expected_dtype = out_dtype
|
||||
elif out_dtype is None and X_input.dtype == np.float16:
|
||||
# wrong numeric input dtype are cast in np.float64
|
||||
expected_dtype = np.float64
|
||||
else:
|
||||
expected_dtype = X_input.dtype
|
||||
Xt = kbd.transform(X_input)
|
||||
assert Xt.dtype == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
|
||||
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
|
||||
def test_32_equal_64(input_dtype, encode):
|
||||
# TODO this check is redundant with common checks and can be removed
|
||||
# once #16290 is merged
|
||||
X_input = np.array(X, dtype=input_dtype)
|
||||
|
||||
# 32 bit output
|
||||
kbd_32 = KBinsDiscretizer(
|
||||
n_bins=3,
|
||||
encode=encode,
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
dtype=np.float32,
|
||||
)
|
||||
kbd_32.fit(X_input)
|
||||
Xt_32 = kbd_32.transform(X_input)
|
||||
|
||||
# 64 bit output
|
||||
kbd_64 = KBinsDiscretizer(
|
||||
n_bins=3,
|
||||
encode=encode,
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
dtype=np.float64,
|
||||
)
|
||||
kbd_64.fit(X_input)
|
||||
Xt_64 = kbd_64.transform(X_input)
|
||||
|
||||
assert_allclose_dense_sparse(Xt_32, Xt_64)
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_default():
|
||||
# Since the size of X is small (< 2e5), subsampling will not take place.
|
||||
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
|
||||
kbd_default = KBinsDiscretizer(
|
||||
n_bins=10,
|
||||
encode="ordinal",
|
||||
strategy="quantile",
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
kbd_default.fit(X)
|
||||
|
||||
kbd_without_subsampling = clone(kbd_default)
|
||||
kbd_without_subsampling.set_params(subsample=None)
|
||||
kbd_without_subsampling.fit(X)
|
||||
|
||||
for bin_kbd_default, bin_kbd_with_subsampling in zip(
|
||||
kbd_default.bin_edges_[0], kbd_without_subsampling.bin_edges_[0]
|
||||
):
|
||||
np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling)
|
||||
assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"encode, expected_names",
|
||||
[
|
||||
(
|
||||
"onehot",
|
||||
[
|
||||
f"feat{col_id}_{float(bin_id)}"
|
||||
for col_id in range(3)
|
||||
for bin_id in range(4)
|
||||
],
|
||||
),
|
||||
(
|
||||
"onehot-dense",
|
||||
[
|
||||
f"feat{col_id}_{float(bin_id)}"
|
||||
for col_id in range(3)
|
||||
for bin_id in range(4)
|
||||
],
|
||||
),
|
||||
("ordinal", [f"feat{col_id}" for col_id in range(3)]),
|
||||
],
|
||||
)
|
||||
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
|
||||
"""Check get_feature_names_out for different settings.
|
||||
Non-regression test for #22731
|
||||
"""
|
||||
X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
|
||||
|
||||
kbd = KBinsDiscretizer(
|
||||
n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf"
|
||||
).fit(X)
|
||||
Xt = kbd.transform(X)
|
||||
|
||||
input_features = [f"feat{i}" for i in range(3)]
|
||||
output_names = kbd.get_feature_names_out(input_features)
|
||||
assert Xt.shape[1] == output_names.shape[0]
|
||||
|
||||
assert_array_equal(output_names, expected_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
|
||||
def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
|
||||
# Check that the bin edges are almost the same when subsampling is used.
|
||||
X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1
|
||||
|
||||
if strategy == "quantile":
|
||||
kbd_subsampling = KBinsDiscretizer(
|
||||
strategy=strategy,
|
||||
subsample=50000,
|
||||
random_state=global_random_seed,
|
||||
quantile_method="averaged_inverted_cdf",
|
||||
)
|
||||
else:
|
||||
kbd_subsampling = KBinsDiscretizer(
|
||||
strategy=strategy, subsample=50000, random_state=global_random_seed
|
||||
)
|
||||
kbd_subsampling.fit(X)
|
||||
|
||||
kbd_no_subsampling = clone(kbd_subsampling)
|
||||
kbd_no_subsampling.set_params(subsample=None)
|
||||
kbd_no_subsampling.fit(X)
|
||||
|
||||
# We use a large tolerance because we can't expect the bin edges to be exactly the
|
||||
# same when subsampling is used.
|
||||
assert_allclose(
|
||||
kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2
|
||||
)
|
||||
|
||||
|
||||
def test_quantile_method_future_warnings():
|
||||
X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
|
||||
with pytest.warns(
|
||||
FutureWarning,
|
||||
match="The current default behavior, quantile_method='linear', will be "
|
||||
"changed to quantile_method='averaged_inverted_cdf' in "
|
||||
"scikit-learn version 1.9 to naturally support sample weight "
|
||||
"equivalence properties by default. Pass "
|
||||
"quantile_method='averaged_inverted_cdf' explicitly to silence this "
|
||||
"warning.",
|
||||
):
|
||||
KBinsDiscretizer(strategy="quantile").fit(X)
|
||||
|
||||
|
||||
def test_invalid_quantile_method_with_sample_weight():
|
||||
X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
|
||||
expected_msg = (
|
||||
"When fitting with strategy='quantile' and sample weights, "
|
||||
"quantile_method should either be set to 'averaged_inverted_cdf' or "
|
||||
"'inverted_cdf', got quantile_method='linear' instead."
|
||||
)
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=expected_msg,
|
||||
):
|
||||
KBinsDiscretizer(strategy="quantile", quantile_method="linear").fit(
|
||||
X,
|
||||
sample_weight=[1, 1, 2, 2],
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,579 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import FunctionTransformer, StandardScaler
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
|
||||
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
|
||||
def _func(X, *args, **kwargs):
|
||||
args_store.append(X)
|
||||
args_store.extend(args)
|
||||
kwargs_store.update(kwargs)
|
||||
return func(X)
|
||||
|
||||
return _func
|
||||
|
||||
|
||||
def test_delegate_to_func():
|
||||
# (args|kwargs)_store will hold the positional and keyword arguments
|
||||
# passed to the function inside the FunctionTransformer.
|
||||
args_store = []
|
||||
kwargs_store = {}
|
||||
X = np.arange(10).reshape((5, 2))
|
||||
assert_array_equal(
|
||||
FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
|
||||
X,
|
||||
"transform should have returned X unchanged",
|
||||
)
|
||||
|
||||
# The function should only have received X.
|
||||
assert args_store == [X], (
|
||||
"Incorrect positional arguments passed to func: {args}".format(args=args_store)
|
||||
)
|
||||
|
||||
assert not kwargs_store, (
|
||||
"Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
|
||||
)
|
||||
|
||||
# reset the argument stores.
|
||||
args_store[:] = []
|
||||
kwargs_store.clear()
|
||||
transformed = FunctionTransformer(
|
||||
_make_func(args_store, kwargs_store),
|
||||
).transform(X)
|
||||
|
||||
assert_array_equal(
|
||||
transformed, X, err_msg="transform should have returned X unchanged"
|
||||
)
|
||||
|
||||
# The function should have received X
|
||||
assert args_store == [X], (
|
||||
"Incorrect positional arguments passed to func: {args}".format(args=args_store)
|
||||
)
|
||||
|
||||
assert not kwargs_store, (
|
||||
"Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
|
||||
)
|
||||
|
||||
|
||||
def test_np_log():
    X = np.arange(10).reshape((5, 2))

    # The classic numpy.log1p usage example must keep working.
    expected = np.log1p(X)
    assert_array_equal(FunctionTransformer(np.log1p).transform(X), expected)
|
||||
|
||||
|
||||
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # The stored keyword argument must be forwarded to np.around.
    assert_array_equal(rounder.transform(X), np.around(X, decimals=3))
|
||||
|
||||
|
||||
def test_kw_arg_update():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Mutating the stored kw_args dict in place must affect later transforms.
    rounder.kw_args["decimals"] = 1

    assert_array_equal(rounder.transform(X), np.around(X, decimals=1))
|
||||
|
||||
|
||||
def test_kw_arg_reset():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Replacing the whole kw_args attribute must also take effect.
    rounder.kw_args = dict(decimals=1)

    assert_array_equal(rounder.transform(X), np.around(X, decimals=1))
|
||||
|
||||
|
||||
def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # inverse_func together with inv_kw_args drives inverse_transform.
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=dict(decimals=3),
    )
    round_tripped = transformer.inverse_transform(transformer.transform(X))
    assert_array_equal(round_tripped, np.around(np.sqrt(X), decimals=3))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_check_inverse(sparse_container):
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    if sparse_container is not None:
        X = sparse_container(X)

    accept_sparse = sparse_container is not None

    # sqrt and around are not inverses of each other: fitting must warn.
    trans = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        accept_sparse=accept_sparse,
        check_inverse=True,
        validate=True,
    )
    warning_message = (
        "The provided functions are not strictly inverse of each other. "
        "If you are sure you want to proceed regardless, set "
        "'check_inverse=False'."
    )
    with pytest.warns(UserWarning, match=warning_message):
        trans.fit(X)

    # expm1 and log1p are exact inverses: no warning may be raised.
    trans = FunctionTransformer(
        func=np.expm1,
        inverse_func=np.log1p,
        accept_sparse=accept_sparse,
        check_inverse=True,
        validate=True,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        Xt = trans.fit_transform(X)

    assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
|
||||
|
||||
|
||||
def test_check_inverse_func_or_inverse_not_provided():
    # The inverse check must be skipped entirely whenever either func or
    # inverse_func is missing, i.e. no UserWarning may be emitted.
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    only_func = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_func.fit(X)

    only_inverse = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_inverse.fit(X)
|
||||
|
||||
|
||||
def test_function_transformer_frame():
    # A DataFrame input must come back as a DataFrame (identity transform).
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(np.random.randn(100, 10))
    result = FunctionTransformer().fit_transform(frame)
    assert hasattr(result, "loc")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"

    data = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype)

    def encode(X):
        # Element-wise lookup keeps the mixed (str/int) dtype in play.
        return np.array([mapping[X[i]] for i in range(X.size)], dtype=object)

    def decode(X):
        return _convert_container(
            [inverse_mapping[x] for x in X],
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=encode, inverse_func=decode, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
|
||||
|
||||
|
||||
def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True():
    """Check support for dataframes with only numerical values."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    transformer = FunctionTransformer(
        func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True
    )

    # All-numeric frames are supported, so fitting must not raise.
    df_out = transformer.fit_transform(df)
    assert_allclose_dense_sparse(df_out, df + 2)
|
||||
|
||||
|
||||
def test_function_transformer_with_dataframe_and_check_inverse_True():
    """Check error is raised when check_inverse=True.

    Non-regression test for gh-25261.
    """
    pd = pytest.importorskip("pandas")
    transformer = FunctionTransformer(
        func=lambda x: x, inverse_func=lambda x: x, check_inverse=True
    )

    # A frame holding both numeric and string columns cannot be inverse-checked.
    df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(df_mixed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy input, default behavior: names are generated
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: column names are reused
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input with a callable feature_names_out
            np.random.rand(100, 3),
            lambda est, names: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input with a callable feature_names_out
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda est, names: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, callable feature_names_out, default input_features
            np.random.rand(100, 3),
            lambda est, names: tuple(names) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, callable feature_names_out, default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda est, names: tuple(names) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input with explicit input_features
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input with explicit input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, callable feature_names_out plus input_features
            np.random.rand(100, 3),
            lambda est, names: tuple(names) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, callable feature_names_out plus input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda est, names: tuple(names) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
@pytest.mark.parametrize("validate", [True, False])
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected, validate
):
    # A dict input stands for a pandas DataFrame.
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=validate
    )
    transformer.fit(X)
    names = transformer.get_feature_names_out(input_features)
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
|
||||
|
||||
|
||||
def test_function_transformer_get_feature_names_out_without_validation():
    # get_feature_names_out must honor explicit input features even when
    # validate=False (no feature_names_in_ recorded at fit time).
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    names = transformer.get_feature_names_out(("a", "b"))
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_is_None():
    # With the default feature_names_out=None the method is not exposed.
    transformer = FunctionTransformer()
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=msg):
        transformer.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_uses_estimator():
    # The callable receives the fitted transformer, so it can read kw_args.
    def add_n_random_features(X, n):
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)

    def feature_names_out(transformer, input_features):
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]

    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(frame)

    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
|
||||
|
||||
|
||||
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""

    def add_constant_feature(X):
        # Append a constant all-ones column.
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((X, ones), axis=1)

    def inverse_add_constant(X):
        # Drop the appended constant column.
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    trans = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )
    X_trans = trans.fit_transform(X)
    assert trans.n_features_in_ == X.shape[1]

    # inverse_transform sees one extra column but must not refit n_features_in_.
    trans.inverse_transform(X_trans)
    assert trans.n_features_in_ == X.shape[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "feature_names_out, expected",
    [
        ("one-to-one", ["pet", "color"]),
        [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]],
    ],
)
@pytest.mark.parametrize("in_pipeline", [True, False])
def test_get_feature_names_out_dataframe_with_string_data(
    feature_names_out, expected, in_pipeline
):
    """Check that get_feature_names_out works with DataFrames with string data."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]})

    def func(X):
        if feature_names_out == "one-to-one":
            return X
        # Rename the columns so the output matches feature_names_out.
        renamed = feature_names_out(None, X.columns)
        return X.rename(columns=dict(zip(X.columns, renamed)))

    transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
    if in_pipeline:
        transformer = make_pipeline(transformer)

    X_trans = transformer.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)

    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
|
||||
|
||||
|
||||
def test_set_output_func():
    """Check behavior of set_output with different settings."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    ft = FunctionTransformer(np.log, feature_names_out="one-to-one")

    # With feature_names_out defined, set_output must stay silent.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft.set_output(transform="pandas")

    X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])

    ft = FunctionTransformer(lambda x: 2 * x)
    ft.set_output(transform="pandas")

    # A func that already returns a DataFrame must not trigger a warning.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])

    # A func returning a plain ndarray must warn for dataframe outputs.
    ft_np = FunctionTransformer(lambda x: np.asarray(x))

    for transform in ("pandas", "polars"):
        ft_np.set_output(transform=transform)
        msg = (
            f"When `set_output` is configured to be '{transform}'.*{transform} "
            "DataFrame.*"
        )
        with pytest.warns(UserWarning, match=msg):
            ft_np.fit_transform(X)

    # The default output configuration never warns.
    ft_np.set_output(transform="default")
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft_np.fit_transform(X)
|
||||
|
||||
|
||||
def test_consistence_column_name_between_steps():
    """Check consistency between the feature names produced by
    `FunctionTransformer` and the feature names expected by the next
    pipeline step.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27695
    """
    pd = pytest.importorskip("pandas")

    def with_suffix(_, names):
        return [name + "__log" for name in names]

    pipeline = make_pipeline(
        FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler()
    )

    frame = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"])
    X_trans = pipeline.fit_transform(frame)
    assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"]
    # StandardScaler will convert to a numpy array
    assert isinstance(X_trans, np.ndarray)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"])
def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output):
    """Check that we overwrite the column names when we should."""
    lib = pytest.importorskip(dataframe_lib)
    # BUG FIX: the guard previously compared against "numpy", which is never a
    # value of `transform_output` (parametrized over "default"/"pandas"/
    # "polars"). The guard was therefore always true, and
    # `pytest.importorskip("default")` silently skipped the default-output
    # case. Only skip when a real dataframe library output is requested.
    if transform_output != "default":
        pytest.importorskip(transform_output)

    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    def with_suffix(_, names):
        return [name + "__log" for name in names]

    transformer = FunctionTransformer(feature_names_out=with_suffix).set_output(
        transform=transform_output
    )
    X_trans = transformer.fit_transform(df)
    # The data itself is passed through unchanged.
    assert_array_equal(np.asarray(X_trans), np.asarray(df))

    # Column names must be overwritten with the suffixed names everywhere.
    feature_names = transformer.get_feature_names_out()
    assert list(X_trans.columns) == with_suffix(None, df.columns)
    assert feature_names.tolist() == with_suffix(None, df.columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_overwrite_column_names_numerical(feature_names_out):
    """Check the same as `test_function_transformer_overwrite_column_names`
    but for the specific case of pandas where column names can be numerical."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]})

    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    X_trans = transformer.fit_transform(df)
    # Values pass through unchanged; only the names may be rewritten.
    assert_array_equal(np.asarray(X_trans), np.asarray(df))

    feature_names = transformer.get_feature_names_out()
    assert list(X_trans.columns) == list(feature_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_error_column_inconsistent(
    dataframe_lib, feature_names_out
):
    """Check that we raise an error when `func` returns a dataframe with new
    column names that become inconsistent with `get_feature_names_out`."""
    lib = pytest.importorskip(dataframe_lib)

    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    def rename_first_column(df):
        # pandas and polars spell the rename call differently.
        if dataframe_lib == "pandas":
            return df.rename(columns={"a": "c"})
        return df.rename({"a": "c"})

    transformer = FunctionTransformer(
        func=rename_first_column, feature_names_out=feature_names_out
    )
    err_msg = "The output generated by `func` have different column names"
    with pytest.raises(ValueError, match=err_msg):
        transformer.fit_transform(df).columns
|
||||
@@ -0,0 +1,748 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.preprocessing._label import (
|
||||
LabelBinarizer,
|
||||
LabelEncoder,
|
||||
MultiLabelBinarizer,
|
||||
_inverse_binarize_multiclass,
|
||||
_inverse_binarize_thresholding,
|
||||
label_binarize,
|
||||
)
|
||||
from sklearn.utils._array_api import (
|
||||
_convert_to_numpy,
|
||||
_get_namespace_device_dtype_ids,
|
||||
get_namespace,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._testing import (
|
||||
_array_api_for_tests,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
from sklearn.utils.validation import _to_object_array
|
||||
|
||||
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
    """Return ``a`` densified: sparse inputs go through ``toarray()``,
    everything else is returned unchanged."""
    return a.toarray() if hasattr(a, "toarray") else a
|
||||
|
||||
|
||||
def test_label_binarizer():
    # One-class input collapses to the negative label.
    # Dense output:
    labels = ["pos", "pos", "pos", "pos"]
    binarizer = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(expected, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Sparse output:
    binarizer = LabelBinarizer(sparse_output=True)
    encoded = binarizer.fit_transform(labels)
    assert issparse(encoded)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(expected, encoded.toarray())
    assert_array_equal(binarizer.inverse_transform(encoded.toarray()), labels)

    # Two distinct classes binarize to a single 0/1 column.
    binarizer = LabelBinarizer(sparse_output=False)
    labels = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(expected, encoded)

    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(to_invert), labels)

    # More than two classes produce a one-hot indicator matrix, with the
    # classes sorted lexicographically.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(expected, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
    binarizer = LabelBinarizer()

    # Fitted classes one-hot encode as usual.
    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    encoded = binarizer.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, encoded)

    # Labels never seen at fit time map to all-zero rows.
    expected = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    encoded = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, encoded)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
    # Binary problem with custom labels, including pos_label=0.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(expected, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Multiclass problem: every cell carries either neg_label or pos_label.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(expected, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("unique_first", [True, False])
def test_label_binarizer_pandas_nullable(dtype, unique_first):
    """Checks that LabelBinarizer works with pandas nullable dtypes.

    Non-regression test for gh-25637.
    """
    pd = pytest.importorskip("pandas")

    y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
    if unique_first:
        # pandas .unique() returns a pandas array, whose interface differs
        # from a Series (notably it has no "iloc"); exercise that path too.
        y_true = y_true.unique()
    binarizer = LabelBinarizer().fit(y_true)
    y_out = binarizer.transform([1, 0])

    assert_array_equal(y_out, [[1], [0]])
|
||||
|
||||
|
||||
def test_label_binarizer_errors():
    # Invalid arguments and inputs must all raise ValueError.
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    # Legacy sequence-of-sequences multilabel representation is rejected.
    multi_label = [(2, 3), (0,), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)

    # Unfitted binarizer cannot transform or inverse-transform.
    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])

    input_labels = [0, 1, 0, 1]

    # neg_label must be strictly below pos_label.
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Sparse output requires neg_label == 0 and non-zero pos_label.
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # fit_transform on a sequence of sequences is equally rejected.
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # A 'binary' output type demands a single-column y.
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Multioutput targets are unsupported everywhere.
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_label_binarizer_sparse_errors(csr_container):
    # An unknown output_type is rejected for sparse y.
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # The number of classes must match the width of y.
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    # Exercise transform, fit_transform and inverse_transform.
    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(encoder.inverse_transform([1, 0, 2, 0, 2]), values)

    # fit_transform is equivalent to fit followed by transform.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(values)
    assert_array_equal(encoded, [1, 0, 2, 0, 2])

    # Labels absent from fit must be refused at transform time.
    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
    # Negative integers sort before the non-negative ones in classes_.
    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(
        encoder.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0]
    )
    assert_array_equal(
        encoder.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1]
    )
    # A label that never appeared during fit is refused.
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    # A scalar string is not a valid 1d label array.
    encoder = LabelEncoder()
    encoder.fit(np.array(["apple", "orange"], dtype=dtype))
    msg = "should be a 1d array"
    with pytest.raises(ValueError, match=msg):
        encoder.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
    # An unfitted encoder refuses transform and inverse_transform.
    encoder = LabelEncoder()
    with pytest.raises(ValueError):
        encoder.transform([])
    with pytest.raises(ValueError):
        encoder.inverse_transform([])

    # inverse_transform refuses codes that were never assigned.
    encoder = LabelEncoder()
    encoder.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        encoder.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        encoder.inverse_transform([-2, -3, -4])

    # A scalar (0-d) input to inverse_transform is refused as well.
    msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=msg):
        encoder.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    encoder = LabelEncoder()
    encoder.fit(values)
    # Transforming an empty input yields an empty output...
    assert_array_equal(np.array([]), encoder.transform([]))
    # ...and so does inverse-transforming one.
    assert_array_equal(np.array([]), encoder.inverse_transform([]))
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
    # Each factory yields a fresh iterable-of-iterables of the same labels;
    # factories are needed because iterators are consumed on first use.
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    inverse = inputs[0]()
    for sparse_output in [True, False]:
        for make_input in inputs:
            # fit_transform path
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit_transform(make_input())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse

            # separate fit then transform path
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit(make_input()).transform(make_input())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_output_multilabel_binarizer_errors(csr_container):
    labels = iter([iter((2, 3)), iter((1,)), {1, 2}])
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(labels)
    # A sparse matrix with values other than 0/1 cannot be inverted.
    with pytest.raises(ValueError):
        mlb.inverse_transform(
            csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
        )
|
||||
|
||||
|
||||
def test_multilabel_binarizer():
    """Dense MultiLabelBinarizer round-trip for several iterable input types."""
    # test input as iterable of iterables
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for inp in inputs:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        got = mlb.fit_transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse

        # With fit
        mlb = MultiLabelBinarizer()
        got = mlb.fit(inp()).transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
|
||||
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels must binarize to an all-zero row."""
    labels = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(labels), expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_unknown_class():
    """Classes unseen at fit time must warn and be ignored at transform time.

    Checked both with classes inferred from `fit` and with an explicit
    `classes` constructor argument.
    """
    mlb = MultiLabelBinarizer()
    y = [[1, 2]]
    Y = np.array([[1, 0], [0, 1]])
    warning_message = "unknown class.* will be ignored"
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    # Fitted classes are [1, 2]; the unknown labels 4 and 0 are dropped.
    # (Previously `matrix` and this expected `Y` were computed but never
    # compared, leaving the result unchecked.)
    assert_array_equal(matrix, Y)

    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_given_classes():
    """Explicit `classes` controls column order, supports extra classes,
    must not consume input iterables during fit, and rejects duplicates."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    assert_array_equal(
        mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
    )
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    err_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_multiple_calls():
    """Refitting after mutating the `classes` parameter uses the new order."""
    labels = [(2, 3), (1,), (1, 2)]
    expected_first = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    expected_second = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # First fit uses the class order given at construction time.
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit_transform(labels), expected_first)

    # Mutate the constructor parameter in place; the second fit_transform
    # must pick up the new class order.
    binarizer.classes = [1, 2, 3]
    assert_array_equal(binarizer.fit_transform(labels), expected_second)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_same_length_sequence():
    # Ensure sequences of the same length are not interpreted as a 2-d array
    inp = [[1], [0], [2]]
    indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

    # fit().transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_integer_labels():
    """Binarize string and tuple labels; unhashable labels raise TypeError."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    inputs = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    for inp, classes in inputs:
        # fit_transform()
        mlb = MultiLabelBinarizer()
        inp = np.array(inp, dtype=object)
        assert_array_equal(mlb.fit_transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)

        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)

    # dicts are unhashable labels and must be rejected
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample collapse to a single indicator."""
    sample = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(sample), expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_inverse_validation():
    """inverse_transform validates that input is binary and correctly shaped."""
    inp = [(1, 1, 1, 0)]
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(inp)
    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))
    # The following binary cases are fine, however
    mlb.inverse_transform(np.array([[0, 0]]))
    mlb.inverse_transform(np.array([[1, 1]]))
    mlb.inverse_transform(np.array([[1, 0]]))

    # Wrong shape
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 1, 1]]))
|
||||
|
||||
|
||||
def test_label_binarize_with_class_order():
    """Column order of the output follows the order given in `classes`."""
    out = label_binarize([1, 6], classes=[1, 2, 4, 6])
    expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
    assert_array_equal(out, expected)

    # Modified class order
    out = label_binarize([1, 6], classes=[1, 6, 4, 2])
    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
    assert_array_equal(out, expected)

    out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
    assert_array_equal(out, expected)
|
||||
|
||||
|
||||
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Shared checker: binarize `y` both via `label_binarize` and
    `LabelBinarizer`, compare against `expected`, and verify the inverse
    transforms round-trip. Sparse output with a nonzero neg_label (or zero
    pos_label) is invalid and must raise."""
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            # A sparse matrix cannot represent a nonzero "negative" value
            # implicitly, so this combination must be rejected.
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue

        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            # threshold is the midpoint between the two output labels
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
|
||||
|
||||
|
||||
def test_label_binarize_binary():
    """Binary targets produce a single-column output for custom pos/neg labels."""
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 2
    neg_label = -1
    # keep only the positive-class column, as a column vector
    expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Binary case where sparse_output = True will not result in a ValueError
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 3
    neg_label = 0
    expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)
|
||||
|
||||
|
||||
def test_label_binarize_multiclass():
    """Multiclass binarization; sparse output with negative neg_label raises."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arr_type",
    [np.array]
    + COO_CONTAINERS
    + CSC_CONTAINERS
    + CSR_CONTAINERS
    + DOK_CONTAINERS
    + LIL_CONTAINERS,
)
def test_label_binarize_multilabel(arr_type):
    """Multilabel indicator input works for dense and all sparse containers."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y = arr_type(y_ind)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
def test_invalid_input_label_binarize():
    """label_binarize rejects equal pos/neg labels, continuous targets, and
    labels that do not match the multilabel indicator shape."""
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inverse_binarize_multiclass(csr_container):
    """_inverse_binarize_multiclass picks the argmax class per row (ties and
    all-negative rows fall back to the first class)."""
    got = _inverse_binarize_multiclass(
        csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
    )
    assert_array_equal(got, np.array([1, 1, 0]))
|
||||
|
||||
|
||||
def test_nan_label_encoder():
    """Check that label encoder encodes nans in transform.

    Non-regression test for #22628.
    """
    encoder = LabelEncoder()
    encoder.fit(["a", "a", "b", np.nan])

    # nan sorts after the string classes, so it gets the last code (2).
    encoded = encoder.transform([np.nan])
    assert_array_equal(encoded, [2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
)
def test_label_encoders_do_not_have_set_output(encoder):
    """Check that label encoders do not define set_output and work with y as a kwarg.

    Non-regression test for #26854.
    """
    assert not hasattr(encoder, "set_output")
    # y may be passed positionally or by keyword with identical results
    y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
    y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
    assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_namespace, device, dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "y",
    [
        np.array([2, 1, 3, 1, 3]),
        np.array([1, 1, 4, 5, -1, 0]),
        np.array([3, 5, 9, 5, 9, 3]),
    ],
)
def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype):
    """LabelEncoder under array-API dispatch: outputs stay in the input
    namespace and values match the plain-numpy results."""
    xp = _array_api_for_tests(array_namespace, device)
    xp_y = xp.asarray(y, device=device)
    with config_context(array_api_dispatch=True):
        # fit / transform / inverse_transform path
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_label = xp_label.fit(xp_y)
        xp_transformed = xp_label.transform(xp_y)
        xp_inv_transformed = xp_label.inverse_transform(xp_transformed)
        np_label = np_label.fit(y)
        np_transformed = np_label.transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)

        # fit_transform path
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_transformed = xp_label.fit_transform(xp_y)
        np_transformed = np_label.fit_transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,714 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.model_selection import (
|
||||
KFold,
|
||||
ShuffleSplit,
|
||||
StratifiedKFold,
|
||||
cross_val_score,
|
||||
train_test_split,
|
||||
)
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import (
|
||||
KBinsDiscretizer,
|
||||
LabelBinarizer,
|
||||
LabelEncoder,
|
||||
TargetEncoder,
|
||||
)
|
||||
|
||||
|
||||
def _encode_target(X_ordinal, y_numeric, n_categories, smooth):
|
||||
"""Simple Python implementation of target encoding."""
|
||||
cur_encodings = np.zeros(n_categories, dtype=np.float64)
|
||||
y_mean = np.mean(y_numeric)
|
||||
|
||||
if smooth == "auto":
|
||||
y_variance = np.var(y_numeric)
|
||||
for c in range(n_categories):
|
||||
y_subset = y_numeric[X_ordinal == c]
|
||||
n_i = y_subset.shape[0]
|
||||
|
||||
if n_i == 0:
|
||||
cur_encodings[c] = y_mean
|
||||
continue
|
||||
|
||||
y_subset_variance = np.var(y_subset)
|
||||
m = y_subset_variance / y_variance
|
||||
lambda_ = n_i / (n_i + m)
|
||||
|
||||
cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean
|
||||
return cur_encodings
|
||||
else: # float
|
||||
for c in range(n_categories):
|
||||
y_subset = y_numeric[X_ordinal == c]
|
||||
current_sum = np.sum(y_subset) + y_mean * smooth
|
||||
current_cnt = y_subset.shape[0] + smooth
|
||||
cur_encodings[c] = current_sum / current_cnt
|
||||
return cur_encodings
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "categories, unknown_value",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], 4),
        ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0),
        ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"),
        ("auto", 3),
    ],
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
    """Check encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    """

    n_categories = 3
    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_samples = X_train_int_array.shape[0]

    if categories == "auto":
        X_train = X_train_int_array
        X_test = X_test_int_array
    else:
        # map ordinal codes onto the custom category values
        X_train = categories[0][X_train_int_array]
        X_test = categories[0][X_test_int_array]

    X_test = np.concatenate((X_test, [[unknown_value]]))

    data_rng = np.random.RandomState(global_random_seed)
    n_splits = 3
    if target_type == "binary":
        y_numeric = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_numeric]
    else:
        assert target_type == "continuous"
        y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_numeric

    # shuffle so category blocks are not contiguous across CV folds
    shuffled_idx = data_rng.permutation(n_samples)
    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_numeric = y_numeric[shuffled_idx]

    # Define our CV splitting strategy
    if target_type == "binary":
        cv = StratifiedKFold(
            n_splits=n_splits, random_state=global_random_seed, shuffle=True
        )
    else:
        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)

    # Compute the expected values using our reference Python implementation of
    # target encoding:
    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
        X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
            X_train_int_array[test_idx, 0]
        ]

    # Check that we can obtain the same encodings by calling `fit_transform` on
    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
        smooth=smooth,
        categories=categories,
        cv=n_splits,
        random_state=global_random_seed,
    )

    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == target_type
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(target_encoder.encodings_) == 1
    if target_type == "binary":
        assert_array_equal(target_encoder.classes_, target_names)
    else:
        assert target_encoder.classes_ is None

    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_numeric)
    expected_encodings = _encode_target(
        X_train_int_array[:, 0], y_numeric, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)

    # Transform on test data, the last value is unknown so it is encoded as the target
    # mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
    ).reshape(-1, 1)

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "categories, unknown_values",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], "auto"),
        ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]),
    ],
)
@pytest.mark.parametrize(
    "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])]
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
def test_encoding_multiclass(
    global_random_seed, categories, unknown_values, target_labels, smooth
):
    """Check encoding for multiclass targets."""
    rng = np.random.RandomState(global_random_seed)

    n_samples = 80
    n_features = 2
    feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples))
    feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples))
    feat_1 = categories[0][feat_1_int]
    feat_2 = categories[0][feat_2_int]
    X_train = np.column_stack((feat_1, feat_2))
    X_train_int = np.column_stack((feat_1_int, feat_2_int))
    categories_ = [[0, 1], [0, 1, 2]]

    n_classes = 3
    y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples))
    y_train = target_labels[y_train_int]
    # one-vs-rest indicator columns, one per class
    y_train_enc = LabelBinarizer().fit_transform(y_train)

    n_splits = 3
    cv = StratifiedKFold(
        n_splits=n_splits, random_state=global_random_seed, shuffle=True
    )

    # Manually compute encodings for cv splits to validate `fit_transform`
    expected_X_fit_transform = np.empty(
        (X_train_int.shape[0], X_train_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            for train_idx, test_idx in cv.split(X_train, y_train):
                y_class = y_train_enc[:, c_idx]
                X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx]
                current_encoding = _encode_target(X_, y_, len(cats), smooth)
                # f_idx: 0, 0, 0, 1, 1, 1
                # c_idx: 0, 1, 2, 0, 1, 2
                # exp_idx: 0, 1, 2, 3, 4, 5
                exp_idx = c_idx + (f_idx * n_classes)
                expected_X_fit_transform[test_idx, exp_idx] = current_encoding[
                    X_train_int[test_idx, f_idx]
                ]

    target_encoder = TargetEncoder(
        smooth=smooth,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == "multiclass"
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    # Manually compute encoding to validate `transform`
    expected_encodings = []
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            y_class = y_train_enc[:, c_idx]
            current_encoding = _encode_target(
                X_train_int[:, f_idx], y_class, len(cats), smooth
            )
            expected_encodings.append(current_encoding)

    assert len(target_encoder.encodings_) == n_features * n_classes
    for i in range(n_features * n_classes):
        assert_allclose(target_encoder.encodings_[i], expected_encodings[i])
    assert_array_equal(target_encoder.classes_, target_labels)

    # Include unknown values at the end
    X_test_int = np.array([[0, 1], [1, 2], [4, 5]])
    if unknown_values == "auto":
        X_test = X_test_int
    else:
        X_test = np.empty_like(X_test_int[:-1, :], dtype=object)
        for column_idx in range(X_test_int.shape[1]):
            X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]]
        # Add unknown values at end
        X_test = np.vstack((X_test, unknown_values))

    y_mean = np.mean(y_train_enc, axis=0)
    expected_X_test_transform = np.empty(
        (X_test_int.shape[0], X_test_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    n_rows = X_test_int.shape[0]
    f_idx = [0, 0, 0, 1, 1, 1]
    # Last row are unknowns, dealt with later
    for row_idx in range(n_rows - 1):
        for i, enc in enumerate(expected_encodings):
            expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]]

    # Unknowns encoded as target mean for each class
    # `y_mean` contains target mean for each class, thus cycle through mean of
    # each class, `n_features` times
    mean_idx = [0, 1, 2, 0, 1, 2]
    for i in range(n_classes * n_features):
        expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]]

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, categories",
    [
        (
            np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T,  # 3 is unknown
            [[0, 1, 2]],
        ),
        (
            np.array(
                [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object
            ).T,  # snake is unknown
            [["dog", "cat", "cow"]],
        ),
    ],
)
@pytest.mark.parametrize("smooth", [4.0, "auto"])
def test_custom_categories(X, categories, smooth):
    """Custom categories with unknown categories that are not in training data."""
    rng = np.random.RandomState(0)
    y = rng.uniform(low=-10, high=20, size=X.shape[0])
    enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y)

    # The last element is unknown and encoded as the mean
    y_mean = y.mean()
    X_trans = enc.transform(X[-1:])
    assert X_trans[0, 0] == pytest.approx(y_mean)

    assert len(enc.encodings_) == 1
    # custom category that is not in training data
    assert enc.encodings_[0][-1] == pytest.approx(y_mean)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, msg",
    [
        ([1, 2, 0, 1], "Found input variables with inconsistent"),
        (
            np.array([[1, 2, 0], [1, 2, 3]]).T,
            "Target type was inferred to be 'multiclass-multioutput'",
        ),
    ],
)
def test_errors(y, msg):
    """Check that invalid input raises an informative error."""
    X = np.array([[1, 0, 1]]).T

    enc = TargetEncoder()
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X, y)
|
||||
|
||||
|
||||
def test_use_regression_target():
    """Check inferred and specified `target_type` on regression target."""
    X = np.array([[0, 1, 0, 1, 0, 1]]).T
    y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0])

    # Without an explicit target_type, this small integer-valued float target
    # is inferred as multiclass, which also triggers the StratifiedKFold
    # "least populated class" warning.
    enc = TargetEncoder(cv=2)
    with pytest.warns(
        UserWarning,
        match=re.escape(
            "The least populated class in y has only 1 members, which is less than"
            " n_splits=2."
        ),
    ):
        enc.fit_transform(X, y)
    assert enc.target_type_ == "multiclass"

    # Explicitly requesting a continuous target overrides the inference.
    enc = TargetEncoder(cv=2, target_type="continuous")
    enc.fit_transform(X, y)
    assert enc.target_type_ == "continuous"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, feature_names",
    [
        ([1, 2] * 10, ["A", "B"]),
        ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]),
        (
            ["y1", "y2", "y3"] * 6 + ["y1", "y2"],
            ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"],
        ),
    ],
)
def test_feature_names_out_set_output(y, feature_names):
    """Check TargetEncoder works with set_output."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10})

    enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_default.set_output(transform="default")
    enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_pandas.set_output(transform="pandas")

    X_default = enc_default.fit_transform(X_df, y)
    X_pandas = enc_pandas.fit_transform(X_df, y)

    # Same values regardless of output container; for multiclass targets the
    # output columns are feature-by-class combinations.
    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(enc_pandas.get_feature_names_out(), feature_names)
    assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("to_pandas", [True, False])
@pytest.mark.parametrize("smooth", [1.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"])
def test_multiple_features_quick(to_pandas, smooth, target_type):
    """Check target encoder with multiple features."""
    X_ordinal = np.array(
        [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64
    )
    if target_type == "binary-str":
        y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    elif target_type == "binary-ints":
        y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    else:
        y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32)
        y_integer = y_train
        cv = KFold(2, random_state=0, shuffle=True)
    y_mean = np.mean(y_integer)
    categories = [[0, 1, 2], [0, 1]]

    X_test = np.array(
        [
            [0, 1],
            [3, 0],  # 3 is unknown
            [1, 10],  # 10 is unknown
        ],
        dtype=np.int64,
    )

    if to_pandas:
        pd = pytest.importorskip("pandas")
        # convert second feature to an object
        X_train = pd.DataFrame(
            {
                "feat0": X_ordinal[:, 0],
                "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]],
            }
        )
        # "snake" is unknown
        X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]})
    else:
        X_train = X_ordinal

    # manually compute encoding for fit_transform
    expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64)
    for f_idx, cats in enumerate(categories):
        for train_idx, test_idx in cv.split(X_ordinal, y_integer):
            X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx]
            current_encoding = _encode_target(X_, y_, len(cats), smooth)
            expected_X_fit_transform[test_idx, f_idx] = current_encoding[
                X_ordinal[test_idx, f_idx]
            ]

    # manually compute encoding for transform
    expected_encodings = []
    for f_idx, cats in enumerate(categories):
        current_encoding = _encode_target(
            X_ordinal[:, f_idx], y_integer, len(cats), smooth
        )
        expected_encodings.append(current_encoding)

    # rows 2 and 3 contain an unknown category, encoded as the target mean
    expected_X_test_transform = np.array(
        [
            [expected_encodings[0][0], expected_encodings[1][1]],
            [y_mean, expected_encodings[1][0]],
            [expected_encodings[0][1], y_mean],
        ],
        dtype=np.float64,
    )

    enc = TargetEncoder(smooth=smooth, cv=2, random_state=0)
    X_fit_transform = enc.fit_transform(X_train, y_train)
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    assert len(enc.encodings_) == 2
    for i in range(2):
        assert_allclose(enc.encodings_[i], expected_encodings[i])

    X_test_transform = enc.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, y_mean",
    [
        (np.array([3.4] * 20), 3.4),
        (np.array([0] * 20), 0),
        (np.array(["a"] * 20, dtype=object), 0),
    ],
    ids=["continuous", "binary", "binary-string"],
)
@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0])
def test_constant_target_and_feature(y, y_mean, smooth):
    """Check edge case where feature and target is constant."""
    X = np.array([[1] * 20]).T
    n_samples = X.shape[0]

    enc = TargetEncoder(cv=2, smooth=smooth, random_state=0)
    X_trans = enc.fit_transform(X, y)
    # With a constant target, every encoding collapses to the target mean.
    assert_allclose(X_trans, np.repeat([[y_mean]], n_samples, axis=0))
    assert enc.encodings_[0][0] == pytest.approx(y_mean)
    assert enc.target_mean_ == pytest.approx(y_mean)

    # Both the known category (1) and an unknown one (0) encode to the mean.
    X_test = np.array([[1], [0]])
    X_test_trans = enc.transform(X_test)
    assert_allclose(X_test_trans, np.repeat([[y_mean]], 2, axis=0))
|
||||
|
||||
|
||||
def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not(
    global_random_seed,
):
    """fit_transform must not leak y into the encoding of a useless feature.

    A categorical feature is drawn independently of y, then the rows are
    sorted by y to maximise the chance of a cross-validation-fold leak. With
    the inner-CV shuffling enabled the encoded feature should stay useless for
    predicting y; without shuffling, target information leaks through the
    per-fold priors and a downstream model can overfit on it.
    """
    cardinality = 30  # not too large, otherwise we need a very large n_samples
    n_samples = 3000
    rng = np.random.RandomState(global_random_seed)
    y_train = rng.normal(size=n_samples)
    X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1)

    # Sort by y_train to attempt to cause a leak
    order = y_train.argsort()
    y_train = y_train[order]
    X_train = X_train[order]

    X_encoded_shuffled = TargetEncoder(
        shuffle=True, random_state=global_random_seed
    ).fit_transform(X_train, y_train)

    X_encoded_unshuffled = TargetEncoder(shuffle=False).fit_transform(
        X_train, y_train
    )

    # Check that no information about y_train has leaked into X_train:
    regressor = RandomForestRegressor(
        n_estimators=10, min_samples_leaf=20, random_state=global_random_seed
    )
    cv = ShuffleSplit(n_splits=50, random_state=global_random_seed)

    # It's impossible to learn a good predictive model on the training set
    # when using the original representation X_train or the target encoded
    # representation with shuffled inner CV. For the latter, no information
    # about y_train has inadvertently leaked into the prior used to generate
    # the shuffled encoding:
    assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1
    assert cross_val_score(regressor, X_encoded_shuffled, y_train, cv=cv).mean() < 0.1

    # Without the inner CV shuffling, a lot of information about y_train goes
    # into the per-fold y_train.mean() priors: shrinkage is no longer
    # effective in this case and would no longer be able to prevent
    # downstream over-fitting.
    assert cross_val_score(regressor, X_encoded_unshuffled, y_train, cv=cv).mean() > 0.5
|
||||
|
||||
|
||||
def test_smooth_zero():
    """Edge case: zero smoothing with a CV fold that misses a category.

    With shuffle=False and cv=2, the first fold holds only category 0 and the
    second only category 1, so each category is unseen in exactly one fold
    and must fall back to that fold's target mean.
    """
    X = np.repeat([0, 1], 5).reshape(-1, 1)
    y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0])

    encoder = TargetEncoder(smooth=0.0, shuffle=False, cv=2)
    encoded = encoder.fit_transform(X, y)

    # Category 0 is absent from the second half, so it is encoded as the
    # mean of the second half.
    assert_allclose(encoded[0], np.mean(y[5:]))

    # Category 1 is absent from the first half, so it is encoded as the
    # mean of the first half.
    assert_allclose(encoded[-1], np.mean(y[:5]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    """The encoding must not depend on the integer values of the labels.

    This is quite a trivial property, but it is helpful to understand the
    following test: relabeling the categories with a random permutation must
    leave the encoded train and test data unchanged.
    """
    rng = np.random.RandomState(global_random_seed)

    # Random y and informative categorical X to make the test non-trivial
    # when using smoothing.
    y = rng.normal(size=1000)
    n_categories = 30
    X = KBinsDiscretizer(
        n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal"
    ).fit_transform(y.reshape(-1, 1))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    # Relabel the categories through a random permutation; the encoding
    # should be invariant to this.
    relabeling = rng.permutation(n_categories)
    X_train_relabeled = relabeling[X_train.astype(np.int32)]
    X_test_relabeled = relabeling[X_test.astype(np.int32)]

    encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed)
    encoded_train = encoder.fit_transform(X_train, y_train)
    encoded_test = encoder.transform(X_test)

    encoded_train_relabeled = encoder.fit_transform(X_train_relabeled, y_train)
    encoded_test_relabeled = encoder.transform(X_test_relabeled)

    assert_allclose(encoded_train, encoded_train_relabeled)
    assert_allclose(encoded_test, encoded_test_relabeled)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    """Statistical end-to-end check of target encoding + linear regression.

    Three features are built: an informative ordinal feature (noisy binning of
    y with permuted labels), a shuffled copy of it (uninformative), and a
    near-unique-identifier feature. A linear model on the raw ordinals must
    underfit; with the internal-CV encoding (fit_transform) it must recover
    slope ~1 on the informative feature only; without internal CV
    (fit then transform) it must catastrophically overfit on the
    high-cardinality feature.

    NOTE(review): all random draws share a single `rng` stream, so statement
    order here is load-bearing — do not reorder.
    """
    # In this test, we use the Ridge class with the "lsqr" solver and a little
    # bit of regularization to implement a linear regression model that
    # converges quickly for large `n_samples` and robustly in case of
    # correlated features. Since we will fit this model on a mean centered
    # target, we do not need to fit an intercept and this will help simplify
    # the analysis with respect to the expected coefficients.
    linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

    # Construct a random target variable. We need a large number of samples for
    # this test to be stable across all values of the random seed.
    n_samples = 50_000
    rng = np.random.RandomState(global_random_seed)
    y = rng.randn(n_samples)

    # Generate a single informative ordinal feature with medium cardinality.
    # Inject some irreducible noise to make it harder for a multivariate model
    # to identify the informative feature from other pure noise features.
    noise = 0.8 * rng.randn(n_samples)
    n_categories = 100
    X_informative = KBinsDiscretizer(
        n_bins=n_categories,
        encode="ordinal",
        strategy="uniform",
        random_state=rng,
    ).fit_transform((y + noise).reshape(-1, 1))

    # Let's permute the labels to hide the fact that this feature is
    # informative to naive linear regression model trained on the raw ordinal
    # values. As highlighted in the previous test, the target encoding should be
    # invariant to such a permutation.
    permutated_labels = rng.permutation(n_categories)
    X_informative = permutated_labels[X_informative.astype(np.int32)]

    # Generate a shuffled copy of the informative feature to destroy the
    # relationship with the target.
    X_shuffled = rng.permutation(X_informative)

    # Also include a very high cardinality categorical feature that is by
    # itself independent of the target variable: target encoding such a feature
    # without internal cross-validation should cause catastrophic overfitting
    # for the downstream regressor, even with shrinkage. This kind of features
    # typically represents near unique identifiers of samples. In general they
    # should be removed from a machine learning datasets but here we want to
    # study the ability of the default behavior of TargetEncoder to mitigate
    # them automatically.
    X_near_unique_categories = rng.choice(
        int(0.9 * n_samples), size=n_samples, replace=True
    ).reshape(-1, 1)

    # Assemble the dataset and do a train-test split:
    X = np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Let's first check that a linear regression model trained on the raw
    # features underfits because of the meaning-less ordinal encoding of the
    # labels.
    raw_model = linear_regression.fit(X_train, y_train)
    assert raw_model.score(X_train, y_train) < 0.1
    assert raw_model.score(X_test, y_test) < 0.1

    # Now do the same with target encoding using the internal CV mechanism
    # implemented when using fit_transform.
    model_with_cv = make_pipeline(
        TargetEncoder(smooth=smooth, random_state=rng), linear_regression
    ).fit(X_train, y_train)

    # This model should be able to fit the data well and also generalise to the
    # test data (assuming that the binning is fine-grained enough). The R2
    # scores are not perfect because of the noise injected during the
    # generation of the unique informative feature.
    coef = model_with_cv[-1].coef_
    assert model_with_cv.score(X_train, y_train) > 0.5, coef
    assert model_with_cv.score(X_test, y_test) > 0.5, coef

    # The target encoder recovers the linear relationship with slope 1 between
    # the target encoded unique informative predictor and the target. Since the
    # target encoding of the 2 other features is not informative thanks to the
    # use of internal cross-validation, the multivariate linear regressor
    # assigns a coef of 1 to the first feature and 0 to the other 2.
    assert coef[0] == pytest.approx(1, abs=1e-2)
    assert (np.abs(coef[1:]) < 0.2).all()

    # Let's now disable the internal cross-validation by calling fit and then
    # transform separately on the training set:
    target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit(
        X_train, y_train
    )
    X_enc_no_cv_train = target_encoder.transform(X_train)
    X_enc_no_cv_test = target_encoder.transform(X_test)
    model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train)

    # The linear regression model should always overfit because it assigns
    # too much weight to the extremely high cardinality feature relatively to
    # the informative feature. Note that this is the case even when using
    # the empirical Bayes smoothing which is not enough to prevent such
    # overfitting alone.
    coef = model_no_cv.coef_
    assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef
    assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef

    # The model overfits because it assigns too much weight to the high
    # cardinality yet non-informative feature instead of the lower
    # cardinality yet informative feature:
    assert abs(coef[0]) < abs(coef[2])
|
||||
|
||||
|
||||
def test_pandas_copy_on_write():
    """Fitting must accept a read-only target array.

    When pandas copy-on-write is enabled, the numpy array underlying
    ``df["y"]`` is read-only, and the target-encoder Cython code must still
    handle it. Non-regression test for gh-27879.
    """
    pd = pytest.importorskip("pandas", minversion="2.0")
    with pd.option_context("mode.copy_on_write", True):
        frame = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]})
        encoder = TargetEncoder(target_type="continuous")
        encoder.fit(frame[["x"]], frame["y"])
|
||||
Reference in New Issue
Block a user