add read me

This commit is contained in:
2026-01-09 10:28:44 +11:00
commit edaf914b73
13417 changed files with 2952119 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
import warnings
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
MaxAbsScaler,
MinMaxScaler,
PowerTransformer,
QuantileTransformer,
RobustScaler,
StandardScaler,
maxabs_scale,
minmax_scale,
power_transform,
quantile_transform,
robust_scale,
scale,
)
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import (
BSR_CONTAINERS,
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DIA_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
# Shared iris dataset; the missing-value tests below copy it before mutating.
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that preprocessing estimators propagate NaN untouched.

    For each estimator/function pair this verifies: NaN pass through fit/
    transform/inverse_transform without RuntimeWarnings; the function API
    matches the class API; per-column fitting on non-NaN data still handles
    NaN at transform time; and (when supported) sparse inputs match dense.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        # Box-Cox only accepts strictly positive data; shift accordingly.
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))
    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for kwarg in omit_kwargs:
        _ = kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [i]])
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])
    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        for sparse_container in (
            BSR_CONTAINERS
            + COO_CONTAINERS
            + CSC_CONTAINERS
            + CSR_CONTAINERS
            + DIA_CONTAINERS
            + DOK_CONTAINERS
            + LIL_CONTAINERS
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_container(X_train)
            X_test_sp = sparse_container(X_test)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (
            PowerTransformer("box-cox"),
            power_transform,
        ),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip("pandas")
    # Columns "a" and "b" contain NaN; column "c" has no missing values.
    X = np.array(
        [
            [1, 2, 3, np.nan, np.nan, 4, 5, 1],
            [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
        ]
    ).T
    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    # Mix nullable and plain integer dtypes ("c" has no NaN, so plain int works).
    X_df["c"] = X_df["c"].astype("int")
    X_trans = est.fit_transform(X)
    X_df_trans = est.fit_transform(X_df)
    # The nullable-integer dataframe must give the same result as the ndarray.
    assert_allclose(X_trans, X_df_trans)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,665 @@
import warnings
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn import clone
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
# Shared 4-sample, 4-feature toy dataset used by most KBinsDiscretizer tests.
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            None,
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [0, 1, 1, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
    ],
)
def test_fit_transform(strategy, quantile_method, expected, sample_weight):
    """Check ordinal codes per strategy, with and without sample weights."""
    est = KBinsDiscretizer(
        n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method
    )
    with ignore_warnings(category=UserWarning):
        # Ignore the warning on removed small bins.
        est.fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)
def test_valid_n_bins():
    """Both Python ints and numpy integer scalars are valid ``n_bins``."""
    # Plain Python int.
    KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X)
    # Numpy integer scalar.
    KBinsDiscretizer(
        n_bins=np.array([2])[0], quantile_method="averaged_inverted_cdf"
    ).fit_transform(X)
    # The fitted per-feature bin counts are stored with an integer dtype.
    fitted = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit(X)
    assert fitted.n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins_array():
    """Invalid ``n_bins`` arrays raise ValueError with a precise message."""
    shape_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    # Bad shape: a 2D array of bin counts.
    est = KBinsDiscretizer(
        n_bins=np.full((2, 4), 2.0), quantile_method="averaged_inverted_cdf"
    )
    with pytest.raises(ValueError, match=shape_msg):
        est.fit_transform(X)
    # Wrong number of entries (3 bin counts for 4 features).
    est = KBinsDiscretizer(n_bins=[1, 2, 2], quantile_method="averaged_inverted_cdf")
    with pytest.raises(ValueError, match=shape_msg):
        est.fit_transform(X)
    # Entries that are too small or not integral are reported by index.
    for n_bins, bad_indices in (([1, 2, 2, 1], "0, 3"), ([2.1, 2, 2.1, 2], "0, 2")):
        est = KBinsDiscretizer(
            n_bins=n_bins, quantile_method="averaged_inverted_cdf"
        )
        err_msg = (
            "KBinsDiscretizer received an invalid number of bins "
            f"at indices {bad_indices}. Number of bins must be at least 2, "
            "and must be an int."
        )
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "linear",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
            [0, 1, 3, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
    ],
)
def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight):
    """Check that per-feature ``n_bins`` arrays are honored by every strategy."""
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3],
        encode="ordinal",
        strategy=strategy,
        quantile_method=quantile_method,
    ).fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)
    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    # Each feature's edge array has one more entry than its bin count.
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1,)
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
def test_kbinsdiscretizer_effect_sample_weight():
    """Zero-weighted samples must not influence the computed quantiles."""
    X = np.array([[-2], [-1], [1], [3], [500], [1000]])
    # Request many bins so that every positively-weighted sample can become
    # a bin edge.
    est = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    # The two largest values carry zero weight and should be ignored.
    zero_tail_weights = [1, 1, 1, 1, 0, 0]
    est.fit(X, sample_weight=zero_tail_weights)
    assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3])
    assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]])
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
    """Fitting must leave the user-provided ``sample_weight`` untouched."""
    params = {"n_bins": 3, "encode": "ordinal", "strategy": strategy}
    if strategy == "quantile":
        # Explicit quantile_method to avoid the deprecation FutureWarning.
        params["quantile_method"] = "averaged_inverted_cdf"
    est = KBinsDiscretizer(**params)
    sample_weight = np.array([1, 3, 1, 2], dtype=np.float64)
    original_weights = sample_weight.copy()
    est.fit(X, sample_weight=sample_weight)
    assert_allclose(sample_weight, original_weights)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """A constant feature collapses to a single bin and transforms to zeros.

    NOTE(fix): the original test called ``warnings.simplefilter("always")``
    at the top, mutating the process-wide warning-filter state without ever
    restoring it, which leaks into every test that runs afterwards.
    ``pytest.warns`` installs its own "always" filter internally, so the
    call was unnecessary and has been removed.
    """
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    if strategy == "quantile":
        est = KBinsDiscretizer(
            strategy=strategy,
            n_bins=3,
            encode="ordinal",
            quantile_method="averaged_inverted_cdf",
        )
    else:
        est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
    warning_message = "Feature 0 is constant and will be replaced with 0."
    with pytest.warns(UserWarning, match=warning_message):
        est.fit(X)
    # Only one bin remains for the constant first feature.
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
    """1D input is rejected by both ``fit`` and ``transform``."""
    X = np.arange(4)
    est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
    # Fitting on 1D data is invalid.
    with pytest.raises(ValueError):
        est.fit(X)
    est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
    column = X.reshape(-1, 1)
    est.fit(column)
    # Transforming 1D data is invalid even after a valid 2D fit.
    with pytest.raises(ValueError):
        est.transform(X)
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Discretization stays correct for very small magnitudes (down to nano)."""
    X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
    # Shrink the data by successive powers of ten.
    scaled = X_init / 10**i
    kbd = KBinsDiscretizer(
        n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf"
    )
    assert_array_equal(Xt_expected, kbd.fit_transform(scaled))
def test_encode_options():
    """One-hot encodings match OneHotEncoder applied to the ordinal codes."""
    n_bins = [2, 3, 3, 3]
    categories = [np.arange(i) for i in n_bins]
    Xt_ordinal = (
        KBinsDiscretizer(
            n_bins=n_bins, encode="ordinal", quantile_method="averaged_inverted_cdf"
        )
        .fit(X)
        .transform(X)
    )
    # Dense one-hot output.
    Xt_dense = (
        KBinsDiscretizer(
            n_bins=n_bins,
            encode="onehot-dense",
            quantile_method="averaged_inverted_cdf",
        )
        .fit(X)
        .transform(X)
    )
    assert not sp.issparse(Xt_dense)
    assert_array_equal(
        OneHotEncoder(categories=categories, sparse_output=False).fit_transform(
            Xt_ordinal
        ),
        Xt_dense,
    )
    # Sparse one-hot output.
    Xt_sparse = (
        KBinsDiscretizer(
            n_bins=n_bins, encode="onehot", quantile_method="averaged_inverted_cdf"
        )
        .fit(X)
        .transform(X)
    )
    assert sp.issparse(Xt_sparse)
    assert_array_equal(
        OneHotEncoder(categories=categories, sparse_output=True)
        .fit_transform(Xt_ordinal)
        .toarray(),
        Xt_sparse.toarray(),
    )
@pytest.mark.parametrize(
    "strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        (
            "quantile",
            "averaged_inverted_cdf",
            [0, 0, 0, 1, 1, 1],
            [0, 0, 1, 1, 2, 2],
            [0, 1, 2, 3, 4, 4],
        ),
    ],
)
def test_nonuniform_strategies(
    strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins
):
    """Check the codes each strategy assigns on non-uniformly spaced data."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
    expectations = {2: expected_2bins, 3: expected_3bins, 5: expected_5bins}
    for n_bins, expected in expectations.items():
        est = KBinsDiscretizer(
            n_bins=n_bins,
            strategy=strategy,
            quantile_method=quantile_method,
            encode="ordinal",
        )
        assert_array_equal(expected, est.fit_transform(X).ravel())
@pytest.mark.parametrize(
    "strategy, expected_inv,quantile_method",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
            "averaged_inverted_cdf",
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv, quantile_method):
    """Inverse transform maps codes back to bin centers, for every encoding."""
    kbd = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode
    )
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Values outside the fitted range are clipped to the first/last bin."""
    X = np.array([0, 1, 2, 3])[:, None]
    params = {"n_bins": 4, "strategy": strategy, "encode": "ordinal"}
    if strategy == "quantile":
        # Explicit quantile_method to avoid the deprecation FutureWarning.
        params["quantile_method"] = "averaged_inverted_cdf"
    kbd = KBinsDiscretizer(**params)
    kbd.fit(X)
    out_of_range = np.array([-2, 5])[:, None]
    codes = kbd.transform(out_of_range)
    # -2 lands in the lowest bin, 5 in the highest.
    assert_array_equal(codes.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(codes.min(axis=0), [0])
def test_overwrite():
    """Neither ``fit_transform`` nor ``inverse_transform`` mutates its input."""
    X = np.array([0, 1, 2, 3])[:, None]
    X_snapshot = X.copy()
    est = KBinsDiscretizer(
        n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_snapshot)
    Xt_snapshot = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_snapshot)
    # Inverse transform maps codes back to bin centers.
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
    "strategy, expected_bin_edges, quantile_method",
    [
        ("quantile", [0, 1.5, 3], "averaged_inverted_cdf"),
        ("kmeans", [0, 1.5, 3], "warn"),
    ],
)
def test_redundant_bins(strategy, expected_bin_edges, quantile_method):
    """Duplicate bin edges are collapsed, with a warning suggesting fewer bins."""
    # Heavily tied data: 3 requested bins cannot all be distinct.
    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None
    )
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        kbd.fit(X)
    assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
    """Quantile bin edges stay numerically stable for tied, small values."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)
    kbd = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="linear",
    )
    ## TODO: change to averaged inverted cdf, but that means we only get bin
    ## edges of 0.05 and 0.95 and nothing in between
    # Only 3 samples for 10 requested bins: redundant edges are removed, which
    # triggers the warning below.
    warning_message = "Consider decreasing the number of bins."
    with pytest.warns(UserWarning, match=warning_message):
        kbd.fit(X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Output dtype follows ``dtype`` when given, otherwise the input dtype."""
    X_input = np.array(X, dtype=in_dtype)
    kbd = KBinsDiscretizer(
        n_bins=3,
        encode=encode,
        quantile_method="averaged_inverted_cdf",
        dtype=out_dtype,
    )
    kbd.fit(X_input)
    if out_dtype is not None:
        expected_dtype = out_dtype
    elif X_input.dtype == np.float16:
        # Unsupported float16 input is upcast to float64.
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype
    assert kbd.transform(X_input).dtype == expected_dtype
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """float32 and float64 outputs agree for the same input.

    TODO this check is redundant with common checks and can be removed
    once #16290 is merged.
    """
    X_input = np.array(X, dtype=input_dtype)

    def discretize(out_dtype):
        # Fit/transform with the requested output dtype.
        kbd = KBinsDiscretizer(
            n_bins=3,
            encode=encode,
            quantile_method="averaged_inverted_cdf",
            dtype=out_dtype,
        )
        kbd.fit(X_input)
        return kbd.transform(X_input)

    assert_allclose_dense_sparse(discretize(np.float32), discretize(np.float64))
def test_kbinsdiscretizer_subsample_default():
    """Default subsampling has no effect on data smaller than the threshold."""
    # Since the size of X is small (< 2e5), subsampling will not take place.
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    kbd_default = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    kbd_default.fit(X)
    kbd_without_subsampling = clone(kbd_default)
    kbd_without_subsampling.set_params(subsample=None)
    kbd_without_subsampling.fit(X)
    edges_default = kbd_default.bin_edges_[0]
    edges_no_subsampling = kbd_without_subsampling.bin_edges_[0]
    for edge_default, edge_no_subsampling in zip(edges_default, edges_no_subsampling):
        np.testing.assert_allclose(edge_default, edge_no_subsampling)
    # Same number of per-feature edge arrays in both fitted models.
    assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Check get_feature_names_out for different settings.

    Non-regression test for #22731
    """
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    kbd = KBinsDiscretizer(
        n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt = kbd.transform(X)
    input_features = [f"feat{i}" for i in range(3)]
    output_names = kbd.get_feature_names_out(input_features)
    # One output name per transformed column.
    assert Xt.shape[1] == output_names.shape[0]
    assert_array_equal(output_names, expected_names)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
    """Bin edges computed on a subsample stay close to the full-data edges."""
    X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1
    params = {
        "strategy": strategy,
        "subsample": 50000,
        "random_state": global_random_seed,
    }
    if strategy == "quantile":
        # Explicit quantile_method to avoid the deprecation FutureWarning.
        params["quantile_method"] = "averaged_inverted_cdf"
    kbd_subsampling = KBinsDiscretizer(**params)
    kbd_subsampling.fit(X)
    kbd_no_subsampling = clone(kbd_subsampling)
    kbd_no_subsampling.set_params(subsample=None)
    kbd_no_subsampling.fit(X)
    # We use a large tolerance because we can't expect the bin edges to be
    # exactly the same when subsampling is used.
    assert_allclose(
        kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2
    )
def test_quantile_method_future_warnings():
    """The default quantile_method='linear' emits a deprecation FutureWarning."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_warning = (
        "The current default behavior, quantile_method='linear', will be "
        "changed to quantile_method='averaged_inverted_cdf' in "
        "scikit-learn version 1.9 to naturally support sample weight "
        "equivalence properties by default. Pass "
        "quantile_method='averaged_inverted_cdf' explicitly to silence this "
        "warning."
    )
    with pytest.warns(FutureWarning, match=expected_warning):
        KBinsDiscretizer(strategy="quantile").fit(X)
def test_invalid_quantile_method_with_sample_weight():
    """Linear interpolation is incompatible with sample weights and must raise."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_msg = (
        "When fitting with strategy='quantile' and sample weights, "
        "quantile_method should either be set to 'averaged_inverted_cdf' or "
        "'inverted_cdf', got quantile_method='linear' instead."
    )
    kbd = KBinsDiscretizer(strategy="quantile", quantile_method="linear")
    with pytest.raises(ValueError, match=expected_msg):
        kbd.fit(X, sample_weight=[1, 1, 2, 2])

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,579 @@
import warnings
import numpy as np
import pytest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.utils._testing import (
_convert_container,
assert_allclose_dense_sparse,
assert_array_equal,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
    """FunctionTransformer forwards X (and nothing else) to the wrapped func.

    ``args_store``/``kwargs_store`` record the positional and keyword
    arguments the wrapped function receives.
    """
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))
    # Consistency fix: pass the message via err_msg= as done in the second
    # call below, instead of relying on the positional err_msg parameter.
    assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X,
        err_msg="transform should have returned X unchanged",
    )
    # The function should only have received X.
    assert args_store == [X], (
        "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    )
    assert not kwargs_store, (
        "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
    )
    # reset the argument stores.
    args_store[:] = []
    kwargs_store.clear()
    transformed = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    ).transform(X)
    assert_array_equal(
        transformed, X, err_msg="transform should have returned X unchanged"
    )
    # The function should have received X
    assert args_store == [X], (
        "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    )
    assert not kwargs_store, (
        "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
    )
def test_np_log():
    """A plain numpy ufunc can be used directly as the forward function."""
    X = np.arange(10).reshape((5, 2))
    transformer = FunctionTransformer(np.log1p)
    assert_array_equal(transformer.transform(X), np.log1p(X))
def test_kw_arg():
    """``kw_args`` are forwarded to the wrapped function on transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    assert_array_equal(transformer.transform(X), np.around(X, decimals=3))
def test_kw_arg_update():
    """Mutating ``kw_args`` in place takes effect on the next transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    transformer.kw_args["decimals"] = 1
    assert_array_equal(transformer.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
    """Reassigning ``kw_args`` entirely takes effect on the next transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    transformer.kw_args = {"decimals": 1}
    assert_array_equal(transformer.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
    """``inverse_func`` and ``inv_kw_args`` drive ``inverse_transform``."""
    X = np.array([1, 4, 9, 16]).reshape((2, 2))
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=dict(decimals=3),
    )
    # Round-tripping applies sqrt then around(decimals=3).
    roundtrip = transformer.inverse_transform(transformer.transform(X))
    assert_array_equal(roundtrip, np.around(np.sqrt(X), decimals=3))
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_check_inverse(sparse_container):
    """``check_inverse=True`` warns iff func/inverse_func are not inverses."""
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    if sparse_container is not None:
        X = sparse_container(X)
    # sqrt and around are NOT inverses of each other: expect a warning.
    trans = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )
    with pytest.warns(UserWarning, match=warning_message):
        trans.fit(X)
    # expm1 and log1p ARE inverses: fitting must not warn.
    trans = FunctionTransformer(
        func=np.expm1,
        inverse_func=np.log1p,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        Xt = trans.fit_transform(X)
    assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
def test_check_inverse_func_or_inverse_not_provided():
    """No inverse check runs when either func or inverse_func is missing."""
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    # With only one side of the pair provided, fitting must stay silent
    # even though the check could never pass.
    for func, inverse_func in ((np.expm1, None), (None, np.expm1)):
        trans = FunctionTransformer(
            func=func, inverse_func=inverse_func, check_inverse=True, validate=True
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.fit(X)
def test_function_transformer_frame():
    """The identity FunctionTransformer preserves pandas DataFrames."""
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformed = FunctionTransformer().fit_transform(X_df)
    # The result is still a DataFrame (has .loc), not a raw ndarray.
    assert hasattr(transformed, "loc")
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    # Forward mapping words/ints -> codes; the inverse mapping reverses it.
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"
    data = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype)

    def func(X):
        return np.array([mapping[X[i]] for i in range(X.size)], dtype=object)

    def inverse_func(X):
        return _convert_container(
            [inverse_mapping[x] for x in X],
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )
    # Mixed (object) dtype input cannot be checked for invertibility.
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True():
    """Check support for dataframes with only numerical values."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    transformer = FunctionTransformer(
        func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True
    )
    # The functions are proper inverses, so fitting must not raise.
    shifted = transformer.fit_transform(df)
    assert_allclose_dense_sparse(shifted, df + 2)
def test_function_transformer_with_dataframe_and_check_inverse_True():
    """check_inverse=True rejects dataframes with non-numerical columns.

    Non-regression test for gh-25261.
    """
    pd = pytest.importorskip("pandas")
    transformer = FunctionTransformer(
        func=lambda x: x, inverse_func=lambda x: x, check_inverse=True
    )
    df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(df_mixed)
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy inputs, default behavior: generate names
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: use input feature names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input, feature_names_out=callable
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, feature_names_out=callable default input_features
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, feature_names_out=callable default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input, input_features=list of names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input, input_features=list of names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable, input_features=list
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, feature_names_out=callable, input_features=list
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
@pytest.mark.parametrize("validate", [True, False])
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected, validate
):
    """Check get_feature_names_out across input types, callables and names."""
    # Dict inputs stand in for pandas DataFrames (skipped if pandas missing).
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)
    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=validate
    )
    transformer.fit(X)
    names = transformer.get_feature_names_out(input_features)
    # Output is always an object-dtype ndarray of names.
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def test_function_transformer_get_feature_names_out_without_validation():
    """`get_feature_names_out` honors `input_features` when validate=False."""
    ft = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    data = np.random.rand(100, 2)
    ft.fit_transform(data)
    feature_names = ft.get_feature_names_out(("a", "b"))
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, ("a", "b"))
def test_function_transformer_feature_names_out_is_None():
    """`get_feature_names_out` is unavailable when `feature_names_out` is None."""
    ft = FunctionTransformer()
    ft.fit_transform(np.random.rand(100, 2))
    expected_msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=expected_msg):
        ft.get_feature_names_out()
def test_function_transformer_feature_names_out_uses_estimator():
    """The `feature_names_out` callable receives the transformer instance,
    so it can read configuration such as `kw_args`.
    """
    def add_n_random_features(X, n):
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)
    def feature_names_out(transformer, input_features):
        # read the fitted transformer's own kw_args to name the new columns
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]
    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)
    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""

    def append_ones_column(data):
        ones = np.ones((data.shape[0], 1))
        return np.concatenate((data, ones), axis=1)

    def drop_last_column(data):
        return data[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    transformer = FunctionTransformer(
        func=append_ones_column,
        inverse_func=drop_last_column,
        validate=True,
    )
    X_forward = transformer.fit_transform(X)
    assert transformer.n_features_in_ == X.shape[1]
    # inverse_transform must leave the fitted state untouched
    transformer.inverse_transform(X_forward)
    assert transformer.n_features_in_ == X.shape[1]
@pytest.mark.parametrize(
    "feature_names_out, expected",
    [
        ("one-to-one", ["pet", "color"]),
        [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]],
    ],
)
@pytest.mark.parametrize("in_pipeline", [True, False])
def test_get_feature_names_out_dataframe_with_string_data(
    feature_names_out, expected, in_pipeline
):
    """Check that get_feature_names_out works with DataFrames with string data."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]})
    def func(X):
        # mirror `feature_names_out` so the returned frame's columns stay
        # consistent with what get_feature_names_out will report
        if feature_names_out == "one-to-one":
            return X
        else:
            name = feature_names_out(None, X.columns)
            return X.rename(columns=dict(zip(X.columns, name)))
    transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
    if in_pipeline:
        transformer = make_pipeline(transformer)
    X_trans = transformer.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def test_set_output_func():
    """Check behavior of set_output with different settings."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
    ft = FunctionTransformer(np.log, feature_names_out="one-to-one")
    # no warning is raised when feature_names_out is defined
    # (simplefilter("error") turns any UserWarning into a test failure)
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft.set_output(transform="pandas")
        X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])
    ft = FunctionTransformer(lambda x: 2 * x)
    ft.set_output(transform="pandas")
    # no warning is raised when func returns a pandas dataframe
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])
    # Warning is raised when func returns a ndarray
    ft_np = FunctionTransformer(lambda x: np.asarray(x))
    for transform in ("pandas", "polars"):
        ft_np.set_output(transform=transform)
        msg = (
            f"When `set_output` is configured to be '{transform}'.*{transform} "
            "DataFrame.*"
        )
        with pytest.warns(UserWarning, match=msg):
            ft_np.fit_transform(X)
    # default transform does not warn
    ft_np.set_output(transform="default")
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft_np.fit_transform(X)
def test_consistence_column_name_between_steps():
    """Check that we have a consistence between the feature names out of
    `FunctionTransformer` and the feature names in of the next step in the pipeline.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27695
    """
    pd = pytest.importorskip("pandas")

    def with_suffix(_, names):
        return [name + "__log" for name in names]

    frame = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"])
    pipe = make_pipeline(
        FunctionTransformer(np.log1p, feature_names_out=with_suffix),
        StandardScaler(),
    )
    transformed = pipe.fit_transform(frame)
    assert pipe.get_feature_names_out().tolist() == ["a__log", "b__log"]
    # StandardScaler emits a plain ndarray, so the pipeline output is one too
    assert isinstance(transformed, np.ndarray)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"])
def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output):
"""Check that we overwrite the column names when we should."""
lib = pytest.importorskip(dataframe_lib)
if transform_output != "numpy":
pytest.importorskip(transform_output)
df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
def with_suffix(_, names):
return [name + "__log" for name in names]
transformer = FunctionTransformer(feature_names_out=with_suffix).set_output(
transform=transform_output
)
X_trans = transformer.fit_transform(df)
assert_array_equal(np.asarray(X_trans), np.asarray(df))
feature_names = transformer.get_feature_names_out()
assert list(X_trans.columns) == with_suffix(None, df.columns)
assert feature_names.tolist() == with_suffix(None, df.columns)
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_overwrite_column_names_numerical(feature_names_out):
    """Check the same as `test_function_transformer_overwrite_column_names`
    but for the specific case of pandas where column names can be numerical."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]})
    ft = FunctionTransformer(feature_names_out=feature_names_out)
    transformed = ft.fit_transform(frame)
    # values are untouched; only the column labels may change
    assert_array_equal(np.asarray(transformed), np.asarray(frame))
    assert list(transformed.columns) == list(ft.get_feature_names_out())
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize(
"feature_names_out",
["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_error_column_inconsistent(
dataframe_lib, feature_names_out
):
"""Check that we raise an error when `func` returns a dataframe with new
column names that become inconsistent with `get_feature_names_out`."""
lib = pytest.importorskip(dataframe_lib)
df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
def func(df):
if dataframe_lib == "pandas":
return df.rename(columns={"a": "c"})
else:
return df.rename({"a": "c"})
transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
err_msg = "The output generated by `func` have different column names"
with pytest.raises(ValueError, match=err_msg):
transformer.fit_transform(df).columns

View File

@@ -0,0 +1,748 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn import config_context, datasets
from sklearn.preprocessing._label import (
LabelBinarizer,
LabelEncoder,
MultiLabelBinarizer,
_inverse_binarize_multiclass,
_inverse_binarize_thresholding,
label_binarize,
)
from sklearn.utils._array_api import (
_convert_to_numpy,
_get_namespace_device_dtype_ids,
get_namespace,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
_array_api_for_tests,
assert_array_equal,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _to_object_array
iris = datasets.load_iris()
def toarray(a):
    """Densify *a* if it exposes `toarray`; otherwise return it unchanged."""
    return a.toarray() if hasattr(a, "toarray") else a
def test_label_binarizer():
    """Dense and sparse binarization for one-, two- and multi-class inputs."""
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)
    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)
    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)
    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
    """Labels unseen during fit map to all-zero rows at transform time."""
    lb = LabelBinarizer()
    fitted = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted)
    transformed = lb.transform(["a", "b", "c", "d", "e", "f"])
    unseen_expected = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    assert_array_equal(unseen_expected, transformed)
def test_label_binarizer_set_label_encoding():
    """Custom neg_label/pos_label values are used in the indicator matrix."""
    lb = LabelBinarizer(neg_label=-2, pos_label=0)
    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
    lb = LabelBinarizer(neg_label=-2, pos_label=2)
    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("unique_first", [True, False])
def test_label_binarizer_pandas_nullable(dtype, unique_first):
"""Checks that LabelBinarizer works with pandas nullable dtypes.
Non-regression test for gh-25637.
"""
pd = pytest.importorskip("pandas")
y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
if unique_first:
# Calling unique creates a pandas array which has a different interface
# compared to a pandas Series. Specifically, pandas arrays do not have "iloc".
y_true = y_true.unique()
lb = LabelBinarizer().fit(y_true)
y_out = lb.transform([1, 0])
assert_array_equal(y_out, [[1], [0]])
def test_label_binarizer_errors():
    """Invalid inputs and parameter combinations raise ValueError."""
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)
    multi_label = [(2, 3), (0,), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)
    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])
    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)
    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )
    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_label_binarizer_sparse_errors(csr_container):
# Fail on y_type
err_msg = "foo format is not supported"
with pytest.raises(ValueError, match=err_msg):
_inverse_binarize_thresholding(
y=csr_container([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2],
threshold=0,
)
# Fail on the number of classes
err_msg = "The number of class is not equal to the number of dimension of y."
with pytest.raises(ValueError, match=err_msg):
_inverse_binarize_thresholding(
y=csr_container([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2, 3],
threshold=0,
)
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Round-trip encoding for int, object and str labels."""
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])
    # labels absent from fit must be rejected
    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
def test_label_encoder_negative_ints():
    """LabelEncoder treats negative integers like any other label value."""
    encoder = LabelEncoder().fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    codes = encoder.transform([0, 1, 4, 4, 5, -1, -1])
    assert_array_equal(codes, [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(encoder.inverse_transform(codes), [0, 1, 4, 4, 5, -1, -1])
    # 6 was never seen during fit
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
le = LabelEncoder()
le.fit(np.array(["apple", "orange"], dtype=dtype))
msg = "should be a 1d array"
with pytest.raises(ValueError, match=msg):
le.transform("apple")
def test_label_encoder_errors():
    """Unfitted use, unseen labels and scalar input all raise ValueError."""
    # Check that invalid arguments yield ValueError
    le = LabelEncoder()
    with pytest.raises(ValueError):
        le.transform([])
    with pytest.raises(ValueError):
        le.inverse_transform([])
    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2, -3, -4])
    # Fail on inverse_transform("")
    msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform("")
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """transform and inverse_transform both accept an empty input."""
    encoder = LabelEncoder().fit(values)
    assert_array_equal(np.array([]), encoder.transform([]))
    assert_array_equal(np.array([]), encoder.inverse_transform([]))
def test_sparse_output_multilabel_binarizer():
    """Dense and sparse output agree across several iterable input types."""
    # test input as iterable of iterables
    # (inputs are factories so one-shot iterators can be re-created per use)
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for sparse_output in [True, False]:
        for inp in inputs:
            # With fit_transform
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit_transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
            # With fit
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit(inp()).transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_output_multilabel_binarizer_errors(csr_container):
inp = iter([iter((2, 3)), iter((1,)), {1, 2}])
mlb = MultiLabelBinarizer(sparse_output=False)
mlb.fit(inp)
with pytest.raises(ValueError):
mlb.inverse_transform(
csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
)
def test_multilabel_binarizer():
    """fit_transform and fit().transform() agree for several iterable types."""
    # test input as iterable of iterables
    # (inputs are factories so one-shot iterators can be re-created per use)
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for inp in inputs:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        got = mlb.fit_transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
        # With fit
        mlb = MultiLabelBinarizer()
        got = mlb.fit(inp()).transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels becomes an all-zero indicator row."""
    labels = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    assert_array_equal(MultiLabelBinarizer().fit_transform(labels), expected)
def test_multilabel_binarizer_unknown_class():
    """Classes absent from fit are warned about and dropped at transform time."""
    mlb = MultiLabelBinarizer()
    y = [[1, 2]]
    Y = np.array([[1, 0], [0, 1]])
    warning_message = "unknown class.* will be ignored"
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    # Previously this first result was never checked (`Y` was overwritten
    # before any assertion); the unknown classes 4 and 0 must be ignored.
    assert_array_equal(matrix, Y)
    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
    """An explicit `classes` list fixes the column order of the output."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])
    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])
    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    assert_array_equal(
        mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
    )
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])
    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    # ensure a ValueError is thrown if given duplicate classes
    err_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
    """Refitting after changing `classes` picks up the new class order."""
    samples = [(2, 3), (1,), (1, 2)]
    first_expected = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    second_expected = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    # first call
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(samples), first_expected)
    # second call change class
    mlb.classes = [1, 2, 3]
    assert_array_equal(mlb.fit_transform(samples), second_expected)
def test_multilabel_binarizer_same_length_sequence():
    """Same-length label sequences must not be mistaken for a 2-d array."""
    samples = [[1], [0], [2]]
    indicator = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    # one-shot fit_transform
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), indicator)
    assert_array_equal(binarizer.inverse_transform(indicator), samples)
    # separate fit then transform
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit(samples).transform(samples), indicator)
    assert_array_equal(binarizer.inverse_transform(indicator), samples)
def test_multilabel_binarizer_non_integer_labels():
    """String and tuple labels binarize and round-trip like integers do."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    inputs = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    for inp, classes in inputs:
        # fit_transform()
        mlb = MultiLabelBinarizer()
        inp = np.array(inp, dtype=object)
        assert_array_equal(mlb.fit_transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)
        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)
    # unhashable labels (dicts) cannot be binarized
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
    """Duplicate labels within one sample are counted once."""
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    assert_array_equal(MultiLabelBinarizer().fit_transform(samples), expected)
def test_multilabel_binarizer_inverse_validation():
    """inverse_transform rejects non-binary values and wrong-width input."""
    inp = [(1, 1, 1, 0)]
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(inp)
    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))
    # The following binary cases are fine, however
    mlb.inverse_transform(np.array([[0, 0]]))
    mlb.inverse_transform(np.array([[1, 1]]))
    mlb.inverse_transform(np.array([[1, 0]]))
    # Wrong shape
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
    """Column order of the output follows the order given in `classes`."""
    assert_array_equal(
        label_binarize([1, 6], classes=[1, 2, 4, 6]),
        np.array([[1, 0, 0, 0], [0, 0, 0, 1]]),
    )
    # a shuffled class list shuffles the columns accordingly
    assert_array_equal(
        label_binarize([1, 6], classes=[1, 6, 4, 2]),
        np.array([[1, 0, 0, 0], [0, 1, 0, 0]]),
    )
    assert_array_equal(
        label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]),
        np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]),
    )
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Assert that label_binarize and LabelBinarizer both produce `expected`.

    Checks dense and sparse output, the invalid sparse pos/neg-label
    combination, and the round-trip through the matching inverse helper.
    """
    for sparse_output in [True, False]:
        # sparse output only supports zero neg_label and nonzero pos_label
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue
        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        assert_array_equal(toarray(inversed), toarray(y))
        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
    """Binary targets binarize to a single indicator column."""
    y = [0, 1, 0]
    classes = [0, 1]
    # custom pos/neg labels (second column of the two-class encoding)
    expected = np.array([[-1], [2], [-1]])
    check_binarized_results(y, classes, 2, -1, expected)
    # Binary case where sparse_output = True will not result in a ValueError
    expected = np.array([[0], [3], [0]])
    check_binarized_results(y, classes, 3, 0, expected)
def test_label_binarize_multiclass():
    """Multiclass targets binarize to a scaled identity indicator."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    check_binarized_results(y, classes, 2, 0, 2 * np.eye(3))
    # a nonzero neg_label is incompatible with sparse output
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=2, sparse_output=True
        )
@pytest.mark.parametrize(
    "arr_type",
    [np.array]
    + COO_CONTAINERS
    + CSC_CONTAINERS
    + CSR_CONTAINERS
    + DOK_CONTAINERS
    + LIL_CONTAINERS,
)
def test_label_binarize_multilabel(arr_type):
    """Multilabel indicators binarize identically for all array containers."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y = arr_type(y_ind)
    check_binarized_results(y, classes, pos_label, neg_label, expected)
    # a nonzero neg_label is incompatible with sparse output
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
def test_invalid_input_label_binarize():
    """label_binarize rejects inconsistent labels and parameters."""
    # pos_label must not equal the neg side of the encoding
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inverse_binarize_multiclass(csr_container):
got = _inverse_binarize_multiclass(
csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
)
assert_array_equal(got, np.array([1, 1, 0]))
def test_nan_label_encoder():
    """Check that label encoder encodes nans in transform.
    Non-regression test for #22628.
    """
    encoder = LabelEncoder().fit(["a", "a", "b", np.nan])
    # nan sorts after the string labels, hence code 2
    assert_array_equal(encoder.transform([np.nan]), [2])
@pytest.mark.parametrize(
    "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
)
def test_label_encoders_do_not_have_set_output(encoder):
    """Check that label encoders do not define set_output and work with y as a kwarg.
    Non-regression test for #26854.
    """
    assert not hasattr(encoder, "set_output")
    labels = ["a", "b", "c"]
    # keyword and positional calls must produce the same encoding
    assert_array_equal(encoder.fit_transform(y=labels), encoder.fit_transform(labels))
@pytest.mark.parametrize(
    "array_namespace, device, dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "y",
    [
        np.array([2, 1, 3, 1, 3]),
        np.array([1, 1, 4, 5, -1, 0]),
        np.array([3, 5, 9, 5, 9, 3]),
    ],
)
def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype):
    """LabelEncoder keeps outputs in the input's array namespace and matches
    the NumPy reference results under array_api_dispatch.
    """
    xp = _array_api_for_tests(array_namespace, device)
    xp_y = xp.asarray(y, device=device)
    with config_context(array_api_dispatch=True):
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        # fit / transform / inverse_transform path
        xp_label = xp_label.fit(xp_y)
        xp_transformed = xp_label.transform(xp_y)
        xp_inv_transformed = xp_label.inverse_transform(xp_transformed)
        np_label = np_label.fit(y)
        np_transformed = np_label.transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)
        # fit_transform path
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_transformed = xp_label.fit_transform(xp_y)
        np_transformed = np_label.fit_transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,714 @@
import re
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
KFold,
ShuffleSplit,
StratifiedKFold,
cross_val_score,
train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
KBinsDiscretizer,
LabelBinarizer,
LabelEncoder,
TargetEncoder,
)
def _encode_target(X_ordinal, y_numeric, n_categories, smooth):
"""Simple Python implementation of target encoding."""
cur_encodings = np.zeros(n_categories, dtype=np.float64)
y_mean = np.mean(y_numeric)
if smooth == "auto":
y_variance = np.var(y_numeric)
for c in range(n_categories):
y_subset = y_numeric[X_ordinal == c]
n_i = y_subset.shape[0]
if n_i == 0:
cur_encodings[c] = y_mean
continue
y_subset_variance = np.var(y_subset)
m = y_subset_variance / y_variance
lambda_ = n_i / (n_i + m)
cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean
return cur_encodings
else: # float
for c in range(n_categories):
y_subset = y_numeric[X_ordinal == c]
current_sum = np.sum(y_subset) + y_mean * smooth
current_cnt = y_subset.shape[0] + smooth
cur_encodings[c] = current_sum / current_cnt
return cur_encodings
@pytest.mark.parametrize(
    "categories, unknown_value",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], 4),
        ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0),
        ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"),
        ("auto", 3),
    ],
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
    """Check encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    """
    n_categories = 3
    # 20/30/40 samples for ordinal categories 0/1/2 respectively.
    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_samples = X_train_int_array.shape[0]

    if categories == "auto":
        X_train = X_train_int_array
        X_test = X_test_int_array
    else:
        # Map ordinal codes to the explicitly provided category values.
        X_train = categories[0][X_train_int_array]
        X_test = categories[0][X_test_int_array]

    # Append a value unseen during training to exercise unknown handling.
    X_test = np.concatenate((X_test, [[unknown_value]]))

    data_rng = np.random.RandomState(global_random_seed)
    n_splits = 3
    if target_type == "binary":
        y_numeric = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_numeric]
    else:
        assert target_type == "continuous"
        y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_numeric

    # Shuffle samples so the CV folds mix all three categories.
    shuffled_idx = data_rng.permutation(n_samples)
    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_numeric = y_numeric[shuffled_idx]

    # Define our CV splitting strategy
    if target_type == "binary":
        cv = StratifiedKFold(
            n_splits=n_splits, random_state=global_random_seed, shuffle=True
        )
    else:
        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)

    # Compute the expected values using our reference Python implementation of
    # target encoding:
    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
        X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
            X_train_int_array[test_idx, 0]
        ]

    # Check that we can obtain the same encodings by calling `fit_transform` on
    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
        smooth=smooth,
        categories=categories,
        cv=n_splits,
        random_state=global_random_seed,
    )

    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == target_type
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(target_encoder.encodings_) == 1
    if target_type == "binary":
        assert_array_equal(target_encoder.classes_, target_names)
    else:
        assert target_encoder.classes_ is None

    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_numeric)
    expected_encodings = _encode_target(
        X_train_int_array[:, 0], y_numeric, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)

    # Transform on test data, the last value is unknown so it is encoded as the target
    # mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
    ).reshape(-1, 1)

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "categories, unknown_values",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], "auto"),
        ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]),
    ],
)
@pytest.mark.parametrize(
    "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])]
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
def test_encoding_multiclass(
    global_random_seed, categories, unknown_values, target_labels, smooth
):
    """Check encoding for multiclass targets.

    For multiclass, each feature expands into `n_classes` output columns
    (one-vs-rest per class); the expected values are computed with the naive
    reference implementation in `_encode_target` on the binarized target.
    """
    rng = np.random.RandomState(global_random_seed)

    n_samples = 80
    n_features = 2
    # Two ordinal features with cardinalities 2 and 3 respectively.
    feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples))
    feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples))
    feat_1 = categories[0][feat_1_int]
    feat_2 = categories[0][feat_2_int]
    X_train = np.column_stack((feat_1, feat_2))
    X_train_int = np.column_stack((feat_1_int, feat_2_int))
    categories_ = [[0, 1], [0, 1, 2]]

    n_classes = 3
    y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples))
    y_train = target_labels[y_train_int]
    # One-hot (one-vs-rest) view of the target used by the reference encoding.
    y_train_enc = LabelBinarizer().fit_transform(y_train)

    n_splits = 3
    cv = StratifiedKFold(
        n_splits=n_splits, random_state=global_random_seed, shuffle=True
    )

    # Manually compute encodings for cv splits to validate `fit_transform`
    expected_X_fit_transform = np.empty(
        (X_train_int.shape[0], X_train_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            for train_idx, test_idx in cv.split(X_train, y_train):
                y_class = y_train_enc[:, c_idx]
                X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx]
                current_encoding = _encode_target(X_, y_, len(cats), smooth)
                # Output columns are grouped per feature, classes fastest:
                # f_idx:   0, 0, 0, 1, 1, 1
                # c_idx:   0, 1, 2, 0, 1, 2
                # exp_idx: 0, 1, 2, 3, 4, 5
                exp_idx = c_idx + (f_idx * n_classes)
                expected_X_fit_transform[test_idx, exp_idx] = current_encoding[
                    X_train_int[test_idx, f_idx]
                ]

    target_encoder = TargetEncoder(
        smooth=smooth,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == "multiclass"
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    # Manually compute encoding to validate `transform`
    expected_encodings = []
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            y_class = y_train_enc[:, c_idx]
            current_encoding = _encode_target(
                X_train_int[:, f_idx], y_class, len(cats), smooth
            )
            expected_encodings.append(current_encoding)

    assert len(target_encoder.encodings_) == n_features * n_classes
    for i in range(n_features * n_classes):
        assert_allclose(target_encoder.encodings_[i], expected_encodings[i])
    assert_array_equal(target_encoder.classes_, target_labels)

    # Include unknown values at the end
    X_test_int = np.array([[0, 1], [1, 2], [4, 5]])
    if unknown_values == "auto":
        X_test = X_test_int
    else:
        X_test = np.empty_like(X_test_int[:-1, :], dtype=object)
        for column_idx in range(X_test_int.shape[1]):
            X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]]
        # Add unknown values at end
        X_test = np.vstack((X_test, unknown_values))

    y_mean = np.mean(y_train_enc, axis=0)
    expected_X_test_transform = np.empty(
        (X_test_int.shape[0], X_test_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    n_rows = X_test_int.shape[0]
    # Feature index owning each of the 6 output columns (see exp_idx above).
    f_idx = [0, 0, 0, 1, 1, 1]
    # Last row are unknowns, dealt with later
    for row_idx in range(n_rows - 1):
        for i, enc in enumerate(expected_encodings):
            expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]]

    # Unknowns encoded as target mean for each class
    # `y_mean` contains target mean for each class, thus cycle through mean of
    # each class, `n_features` times
    mean_idx = [0, 1, 2, 0, 1, 2]
    for i in range(n_classes * n_features):
        expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]]

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "X, categories",
    [
        (
            np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T,  # 3 is unknown
            [[0, 1, 2]],
        ),
        (
            np.array(
                [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object
            ).T,  # snake is unknown
            [["dog", "cat", "cow"]],
        ),
    ],
)
@pytest.mark.parametrize("smooth", [4.0, "auto"])
def test_custom_categories(X, categories, smooth):
    """Custom categories with unknown categories that are not in training data."""
    random_state = np.random.RandomState(0)
    y = random_state.uniform(low=-10, high=20, size=X.shape[0])
    y_mean = np.mean(y)

    encoder = TargetEncoder(categories=categories, smooth=smooth, random_state=0)
    encoder.fit(X, y)

    # The last training value is not in `categories`, so it is treated as
    # unknown and encoded with the global target mean.
    last_row_encoded = encoder.transform(X[-1:])
    assert last_row_encoded[0, 0] == pytest.approx(y_mean)

    # One input feature yields one encoding vector; the custom category that
    # never occurs in the training data also maps to the target mean.
    assert len(encoder.encodings_) == 1
    assert encoder.encodings_[0][-1] == pytest.approx(y_mean)
@pytest.mark.parametrize(
    "y, msg",
    [
        ([1, 2, 0, 1], "Found input variables with inconsistent"),
        (
            np.array([[1, 2, 0], [1, 2, 3]]).T,
            "Target type was inferred to be 'multiclass-multioutput'",
        ),
    ],
)
def test_errors(y, msg):
    """Check that invalid inputs raise ValueError with an informative message."""
    X_single_feature = np.array([[1, 0, 1]]).T
    encoder = TargetEncoder()

    # Mismatched lengths and unsupported target types must both be rejected.
    with pytest.raises(ValueError, match=msg):
        encoder.fit_transform(X_single_feature, y)
def test_use_regression_target():
    """Check inferred and specified `target_type` on regression target."""
    X = np.array([[0, 1, 0, 1, 0, 1]]).T
    y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0])

    # With no explicit target_type, this float target with few unique values
    # is inferred as multiclass, which triggers a stratification warning.
    expected_warning = re.escape(
        "The least populated class in y has only 1 members, which is less than"
        " n_splits=2."
    )
    inferred_encoder = TargetEncoder(cv=2)
    with pytest.warns(UserWarning, match=expected_warning):
        inferred_encoder.fit_transform(X, y)
    assert inferred_encoder.target_type_ == "multiclass"

    # Explicitly specifying a continuous target avoids the mis-inference.
    explicit_encoder = TargetEncoder(cv=2, target_type="continuous")
    explicit_encoder.fit_transform(X, y)
    assert explicit_encoder.target_type_ == "continuous"
@pytest.mark.parametrize(
    "y, feature_names",
    [
        ([1, 2] * 10, ["A", "B"]),
        ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]),
        (
            ["y1", "y2", "y3"] * 6 + ["y1", "y2"],
            ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"],
        ),
    ],
)
def test_feature_names_out_set_output(y, feature_names):
    """Check TargetEncoder works with set_output."""
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10})

    # Fit two identically-configured encoders, one per output container.
    encoders = {}
    for output in ("default", "pandas"):
        encoder = TargetEncoder(cv=2, smooth=3.0, random_state=0)
        encoder.set_output(transform=output)
        encoders[output] = encoder

    X_default = encoders["default"].fit_transform(X_df, y)
    X_pandas = encoders["pandas"].fit_transform(X_df, y)

    # Values must be identical regardless of the output container, and the
    # DataFrame columns must match get_feature_names_out().
    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(encoders["pandas"].get_feature_names_out(), feature_names)
    assert_array_equal(encoders["pandas"].get_feature_names_out(), X_pandas.columns)
@pytest.mark.parametrize("to_pandas", [True, False])
@pytest.mark.parametrize("smooth", [1.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"])
def test_multiple_features_quick(to_pandas, smooth, target_type):
    """Check target encoder with multiple features.

    Two features of cardinalities 3 and 2 are encoded, optionally via a pandas
    DataFrame with mixed dtypes, and compared against `_encode_target`.
    """
    X_ordinal = np.array(
        [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64
    )
    if target_type == "binary-str":
        y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    elif target_type == "binary-ints":
        y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    else:
        y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32)
        y_integer = y_train
        cv = KFold(2, random_state=0, shuffle=True)
    y_mean = np.mean(y_integer)
    categories = [[0, 1, 2], [0, 1]]

    X_test = np.array(
        [
            [0, 1],
            [3, 0],  # 3 is unknown
            [1, 10],  # 10 is unknown
        ],
        dtype=np.int64,
    )

    if to_pandas:
        pd = pytest.importorskip("pandas")
        # convert second feature to an object
        X_train = pd.DataFrame(
            {
                "feat0": X_ordinal[:, 0],
                "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]],
            }
        )
        # "snake" is unknown
        X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]})
    else:
        X_train = X_ordinal

    # manually compute encoding for fit_transform
    expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64)
    for f_idx, cats in enumerate(categories):
        for train_idx, test_idx in cv.split(X_ordinal, y_integer):
            X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx]
            current_encoding = _encode_target(X_, y_, len(cats), smooth)
            expected_X_fit_transform[test_idx, f_idx] = current_encoding[
                X_ordinal[test_idx, f_idx]
            ]

    # manually compute encoding for transform
    expected_encodings = []
    for f_idx, cats in enumerate(categories):
        current_encoding = _encode_target(
            X_ordinal[:, f_idx], y_integer, len(cats), smooth
        )
        expected_encodings.append(current_encoding)

    # Row 2 and 3 of X_test contain one unknown value each, which must be
    # encoded with the global target mean.
    expected_X_test_transform = np.array(
        [
            [expected_encodings[0][0], expected_encodings[1][1]],
            [y_mean, expected_encodings[1][0]],
            [expected_encodings[0][1], y_mean],
        ],
        dtype=np.float64,
    )

    enc = TargetEncoder(smooth=smooth, cv=2, random_state=0)
    X_fit_transform = enc.fit_transform(X_train, y_train)
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    assert len(enc.encodings_) == 2
    for i in range(2):
        assert_allclose(enc.encodings_[i], expected_encodings[i])

    X_test_transform = enc.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "y, y_mean",
    [
        (np.array([3.4] * 20), 3.4),
        (np.array([0] * 20), 0),
        (np.array(["a"] * 20, dtype=object), 0),
    ],
    ids=["continuous", "binary", "binary-string"],
)
@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0])
def test_constant_target_and_feature(y, y_mean, smooth):
    """Check edge case where feature and target is constant."""
    n_samples = 20
    X = np.ones((n_samples, 1), dtype=np.int64)
    encoder = TargetEncoder(cv=2, smooth=smooth, random_state=0)

    # Every training sample must be encoded as the (constant) target mean.
    X_trans = encoder.fit_transform(X, y)
    assert_allclose(X_trans, np.full((n_samples, 1), y_mean, dtype=np.float64))
    assert encoder.encodings_[0][0] == pytest.approx(y_mean)
    assert encoder.target_mean_ == pytest.approx(y_mean)

    # Both the known category (1) and an unknown one (0) map to the mean.
    X_test_trans = encoder.transform(np.array([[1], [0]]))
    assert_allclose(X_test_trans, np.full((2, 1), y_mean, dtype=np.float64))
def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not(
    global_random_seed,
):
    """Check that the internal CV shuffling prevents target leakage.

    When the categorical feature is independent of the target, the
    cross-fitted encoding produced by `fit_transform` should carry no
    information about `y`, even if the data arrives sorted by `y`.
    """
    cardinality = 30  # not too large, otherwise we need a very large n_samples
    n_samples = 3000
    rng = np.random.RandomState(global_random_seed)
    y_train = rng.normal(size=n_samples)
    X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1)

    # Sort by y_train to attempt to cause a leak
    y_sorted_indices = y_train.argsort()
    y_train = y_train[y_sorted_indices]
    X_train = X_train[y_sorted_indices]

    target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed)
    X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train)

    target_encoder = TargetEncoder(shuffle=False)
    X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train)

    # Check that no information about y_train has leaked into X_train:
    regressor = RandomForestRegressor(
        n_estimators=10, min_samples_leaf=20, random_state=global_random_seed
    )

    # It's impossible to learn a good predictive model on the training set when
    # using the original representation X_train or the target encoded
    # representation with shuffled inner CV. For the latter, no information
    # about y_train has inadvertently leaked into the prior used to generate
    # `X_encoded_train_shuffled`:
    cv = ShuffleSplit(n_splits=50, random_state=global_random_seed)
    assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1
    assert (
        cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean()
        < 0.1
    )

    # Without the inner CV shuffling, a lot of information about y_train goes
    # into the per-fold y_train.mean() priors: shrinkage is no longer effective
    # in this case and would no longer be able to prevent downstream
    # over-fitting.
    assert (
        cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean()
        > 0.5
    )
def test_smooth_zero():
    """Check edge case with zero smoothing and cv does not contain category."""
    X = np.array([[0] * 5 + [1] * 5]).T
    y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0])

    encoder = TargetEncoder(smooth=0.0, shuffle=False, cv=2)
    X_trans = encoder.fit_transform(X, y)

    first_fold_mean = np.mean(y[:5])
    second_fold_mean = np.mean(y[5:])

    # With unshuffled 2-fold CV, category 0 is absent from the second fold, so
    # the first sample is encoded with the mean of the second half of y.
    assert_allclose(X_trans[0], second_fold_mean)
    # Symmetrically, category 1 is absent from the first fold, so the last
    # sample is encoded with the mean of the first half of y.
    assert_allclose(X_trans[-1], first_fold_mean)
@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    """Encodings must not depend on the integer values used as category codes.

    This is a fairly trivial property on its own, but it is helpful to
    understand the following test.
    """
    rng = np.random.RandomState(global_random_seed)

    # Random y and an informative categorical X make the test non-trivial
    # when smoothing is used.
    y = rng.normal(size=1000)
    n_categories = 30
    X = KBinsDiscretizer(
        n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal"
    ).fit_transform(y.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    # Relabel the categories through a random permutation of the codes.
    permuted_codes = rng.permutation(n_categories)
    X_train_relabeled = permuted_codes[X_train.astype(np.int32)]
    X_test_relabeled = permuted_codes[X_test.astype(np.int32)]

    encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed)
    encoded_train = encoder.fit_transform(X_train, y_train)
    encoded_test = encoder.transform(X_test)
    encoded_train_relabeled = encoder.fit_transform(X_train_relabeled, y_train)
    encoded_test_relabeled = encoder.transform(X_test_relabeled)

    # The encoded representations must be identical under relabeling.
    assert_allclose(encoded_train, encoded_train_relabeled)
    assert_allclose(encoded_test, encoded_test_relabeled)
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    """Check statistical properties of target encoding for linear regression.

    Fits a linear model on target encoded features whose relationship to the
    target varies (informative, shuffled, near-unique), with and without the
    internal cross-fitting, and checks the expected fit/overfit behavior.
    """
    # Check some expected statistical properties when fitting a linear
    # regression model on target encoded features depending on their relation
    # with that target.

    # In this test, we use the Ridge class with the "lsqr" solver and a little
    # bit of regularization to implement a linear regression model that
    # converges quickly for large `n_samples` and robustly in case of
    # correlated features. Since we will fit this model on a mean centered
    # target, we do not need to fit an intercept and this will help simplify
    # the analysis with respect to the expected coefficients.
    linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

    # Construct a random target variable. We need a large number of samples for
    # this test to be stable across all values of the random seed.
    n_samples = 50_000
    rng = np.random.RandomState(global_random_seed)
    y = rng.randn(n_samples)

    # Generate a single informative ordinal feature with medium cardinality.
    # Inject some irreducible noise to make it harder for a multivariate model
    # to identify the informative feature from other pure noise features.
    noise = 0.8 * rng.randn(n_samples)
    n_categories = 100
    X_informative = KBinsDiscretizer(
        n_bins=n_categories,
        encode="ordinal",
        strategy="uniform",
        random_state=rng,
    ).fit_transform((y + noise).reshape(-1, 1))

    # Let's permute the labels to hide the fact that this feature is
    # informative to naive linear regression model trained on the raw ordinal
    # values. As highlighted in the previous test, the target encoding should be
    # invariant to such a permutation.
    permutated_labels = rng.permutation(n_categories)
    X_informative = permutated_labels[X_informative.astype(np.int32)]

    # Generate a shuffled copy of the informative feature to destroy the
    # relationship with the target.
    X_shuffled = rng.permutation(X_informative)

    # Also include a very high cardinality categorical feature that is by
    # itself independent of the target variable: target encoding such a feature
    # without internal cross-validation should cause catastrophic overfitting
    # for the downstream regressor, even with shrinkage. This kind of features
    # typically represents near unique identifiers of samples. In general they
    # should be removed from a machine learning datasets but here we want to
    # study the ability of the default behavior of TargetEncoder to mitigate
    # them automatically.
    X_near_unique_categories = rng.choice(
        int(0.9 * n_samples), size=n_samples, replace=True
    ).reshape(-1, 1)

    # Assemble the dataset and do a train-test split:
    X = np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Let's first check that a linear regression model trained on the raw
    # features underfits because of the meaning-less ordinal encoding of the
    # labels.
    raw_model = linear_regression.fit(X_train, y_train)
    assert raw_model.score(X_train, y_train) < 0.1
    assert raw_model.score(X_test, y_test) < 0.1

    # Now do the same with target encoding using the internal CV mechanism
    # implemented when using fit_transform.
    model_with_cv = make_pipeline(
        TargetEncoder(smooth=smooth, random_state=rng), linear_regression
    ).fit(X_train, y_train)

    # This model should be able to fit the data well and also generalise to the
    # test data (assuming that the binning is fine-grained enough). The R2
    # scores are not perfect because of the noise injected during the
    # generation of the unique informative feature.
    coef = model_with_cv[-1].coef_
    assert model_with_cv.score(X_train, y_train) > 0.5, coef
    assert model_with_cv.score(X_test, y_test) > 0.5, coef

    # The target encoder recovers the linear relationship with slope 1 between
    # the target encoded unique informative predictor and the target. Since the
    # target encoding of the 2 other features is not informative thanks to the
    # use of internal cross-validation, the multivariate linear regressor
    # assigns a coef of 1 to the first feature and 0 to the other 2.
    assert coef[0] == pytest.approx(1, abs=1e-2)
    assert (np.abs(coef[1:]) < 0.2).all()

    # Let's now disable the internal cross-validation by calling fit and then
    # transform separately on the training set:
    target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit(
        X_train, y_train
    )
    X_enc_no_cv_train = target_encoder.transform(X_train)
    X_enc_no_cv_test = target_encoder.transform(X_test)
    model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train)

    # The linear regression model should always overfit because it assigns
    # too much weight to the extremely high cardinality feature relatively to
    # the informative feature. Note that this is the case even when using
    # the empirical Bayes smoothing which is not enough to prevent such
    # overfitting alone.
    coef = model_no_cv.coef_
    assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef
    assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef

    # The model overfits because it assigns too much weight to the high
    # cardinality yet non-informative feature instead of the lower
    # cardinality yet informative feature:
    assert abs(coef[0]) < abs(coef[2])
def test_pandas_copy_on_write():
    """
    Test target-encoder cython code when y is read-only.

    The numpy array underlying df["y"] is read-only when copy-on-write is
    enabled. Non-regression test for gh-27879.
    """
    pd = pytest.importorskip("pandas", minversion="2.0")
    with pd.option_context("mode.copy_on_write", True):
        frame = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]})
        encoder = TargetEncoder(target_type="continuous")
        # Must not raise even though frame["y"]'s buffer is read-only.
        encoder.fit(frame[["x"]], frame["y"])