Add README

This commit is contained in:
2026-01-09 10:28:44 +11:00
commit edaf914b73
13417 changed files with 2952119 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
"""Methods for scaling, centering, normalization, binarization, and more."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from ._data import (
Binarizer,
KernelCenterer,
MaxAbsScaler,
MinMaxScaler,
Normalizer,
PowerTransformer,
QuantileTransformer,
RobustScaler,
StandardScaler,
add_dummy_feature,
binarize,
maxabs_scale,
minmax_scale,
normalize,
power_transform,
quantile_transform,
robust_scale,
scale,
)
from ._discretization import KBinsDiscretizer
from ._encoders import OneHotEncoder, OrdinalEncoder
from ._function_transformer import FunctionTransformer
from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize
from ._polynomial import PolynomialFeatures, SplineTransformer
from ._target_encoder import TargetEncoder
# Explicit public API of ``sklearn.preprocessing``.
# Entries are kept in ASCII-sorted order (classes before functions).
__all__ = [
    "Binarizer",
    "FunctionTransformer",
    "KBinsDiscretizer",
    "KernelCenterer",
    "LabelBinarizer",
    "LabelEncoder",
    "MaxAbsScaler",
    "MinMaxScaler",
    "MultiLabelBinarizer",
    "Normalizer",
    "OneHotEncoder",
    "OrdinalEncoder",
    "PolynomialFeatures",
    "PowerTransformer",
    "QuantileTransformer",
    "RobustScaler",
    "SplineTransformer",
    "StandardScaler",
    "TargetEncoder",
    "add_dummy_feature",
    "binarize",
    "label_binarize",
    "maxabs_scale",
    "minmax_scale",
    "normalize",
    "power_transform",
    "quantile_transform",
    "robust_scale",
    "scale",
]

View File

@@ -0,0 +1,258 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from ..utils._typedefs cimport uint8_t, int64_t, intp_t
# Compact boolean-like flag type (e.g. for ``interaction_only``).
ctypedef uint8_t FLAG_t

# We use the following verbatim block to determine whether the current
# platform's compiler supports 128-bit integer values intrinsically.
# This should work for GCC and CLANG on 64-bit architectures, but doesn't for
# MSVC on any architecture. We prefer to use 128-bit integers when possible
# because the intermediate calculations have a non-trivial risk of overflow. It
# is, however, very unlikely to come up on an average use case, hence 64-bit
# integers (i.e. `long long`) are "good enough" for most common cases. There is
# not much we can do to efficiently mitigate the overflow risk on the Windows
# platform at this time. Consider this a "best effort" design decision that
# could be revisited later in case someone comes up with a safer option that
# does not hurt the performance of the common cases.
# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations.
cdef extern from *:
    """
    #ifdef __SIZEOF_INT128__
        typedef __int128 LARGEST_INT_t;
    #elif (__clang__ || __EMSCRIPTEN__) && !__i386__
        typedef _BitInt(128) LARGEST_INT_t;
    #else
        typedef long long LARGEST_INT_t;
    #endif
    """
    # Cython-side declaration; the actual width is selected by the C
    # preprocessor block above (128-bit when available, else 64-bit).
    ctypedef long long LARGEST_INT_t

# Determine the size of `LARGEST_INT_t` at runtime.
# Used in `test_sizeof_LARGEST_INT_t`.
def _get_sizeof_LARGEST_INT_t():
    """Return ``sizeof(LARGEST_INT_t)`` as selected at compile time."""
    return sizeof(LARGEST_INT_t)
# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved:
# https://github.com/cython/cython/issues/5230
# Numeric dtypes accepted for the CSR "data" array.
ctypedef fused DATA_t:
    float
    double
    int
    long long

# INDEX_{A,B}_t are defined to generate a proper Cartesian product
# of types through Cython fused-type expansion.
# A: input index dtype, B: output index dtype.
ctypedef fused INDEX_A_t:
    signed int
    signed long long
ctypedef fused INDEX_B_t:
    signed int
    signed long long
cdef inline int64_t _deg2_column(
    LARGEST_INT_t n_features,
    LARGEST_INT_t i,
    LARGEST_INT_t j,
    FLAG_t interaction_only
) nogil:
    """Compute the index of the column for a degree 2 expansion

    n_features is the dimensionality of the input data, i and j are the indices
    for the columns involved in the expansion.
    """
    # NOTE: `/` on C integer operands is C (truncating) division; the operands
    # here are non-negative so it coincides with floor division. Intermediate
    # products are computed in LARGEST_INT_t to reduce overflow risk.
    if interaction_only:
        return n_features * i - i * (i + 3) / 2 - 1 + j
    else:
        return n_features * i - i * (i + 1) / 2 + j
cdef inline int64_t _deg3_column(
    LARGEST_INT_t n_features,
    LARGEST_INT_t i,
    LARGEST_INT_t j,
    LARGEST_INT_t k,
    FLAG_t interaction_only
) nogil:
    """Compute the index of the column for a degree 3 expansion

    n_features is the dimensionality of the input data, i, j and k are the indices
    for the columns involved in the expansion.
    """
    # As in `_deg2_column`, `/` is C integer division on non-negative values.
    if interaction_only:
        return (
            (
                (3 * n_features) * (n_features * i - i**2)
                + i * (i**2 + 11) - (3 * j) * (j + 3)
            ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k
        )
    else:
        return (
            (
                (3 * n_features) * (n_features * i - i**2)
                + i ** 3 - i - (3 * j) * (j + 1)
            ) / 6 + n_features * j + k
        )
def py_calc_expanded_nnz_deg2(n, interaction_only):
    """Number of degree-2 expansion terms for ``n`` non-zero entries.

    Pure-Python fallback: relies on arbitrary-precision integers, so it
    cannot overflow. ``interaction_only`` (0 or 1) removes the ``n``
    squared terms from the count.
    """
    pairs_with_repetition = n * (n + 1) // 2
    return pairs_with_repetition - interaction_only * n
def py_calc_expanded_nnz_deg3(n, interaction_only):
    """Number of degree-3 expansion terms for ``n`` non-zero entries.

    Pure-Python fallback: relies on arbitrary-precision integers, so it
    cannot overflow. ``interaction_only`` (0 or 1) removes the ``n**2``
    terms containing a repeated index from the count.
    """
    triples_with_repetition = n * (n**2 + 3 * n + 2) // 6
    return triples_with_repetition - interaction_only * n**2
cpdef int64_t _calc_expanded_nnz(
    LARGEST_INT_t n,
    FLAG_t interaction_only,
    LARGEST_INT_t degree
):
    """
    Calculates the number of non-zero interaction terms generated by the
    non-zero elements of a single row.
    """
    # This is the maximum value before the intermediate computation
    # d**2 + d overflows
    # Solution to d**2 + d = maxint64
    # SymPy: solve(x**2 + x - int64_max, x)
    cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499
    # This is the maximum value before the intermediate computation
    # d**3 + 3 * d**2 + 2*d overflows
    # Solution to d**3 + 3 * d**2 + 2*d = maxint64
    # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x)
    cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151
    # NOTE(review): when 128-bit integers are available (sizeof == 16) both
    # branches below fall through to the Python big-int fallback rather than
    # the fast C path — presumably intentional/safe, but confirm against the
    # "Only need to check when not using 128-bit integers" comments.
    if degree == 2:
        # Only need to check when not using 128-bit integers
        if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2:
            return n * (n + 1) / 2 - interaction_only * n
        # Python integers cannot overflow; cast the exact result back to int64.
        return <int64_t> py_calc_expanded_nnz_deg2(n, interaction_only)
    else:
        # Only need to check when not using 128-bit integers
        if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3:
            return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2
        return <int64_t> py_calc_expanded_nnz_deg3(n, interaction_only)
cpdef int64_t _calc_total_nnz(
    INDEX_A_t[:] indptr,
    FLAG_t interaction_only,
    int64_t degree,
):
    """
    Calculates the number of non-zero interaction terms generated by the
    non-zero elements across all rows for a single degree.
    """
    cdef int64_t total_nnz=0
    cdef intp_t row_idx
    # `indptr[r + 1] - indptr[r]` is the number of stored (non-zero) entries
    # in CSR row `r`; sum the per-row expansion sizes.
    for row_idx in range(len(indptr) - 1):
        total_nnz += _calc_expanded_nnz(
            indptr[row_idx + 1] - indptr[row_idx],
            interaction_only,
            degree
        )
    return total_nnz
cpdef void _csr_polynomial_expansion(
    const DATA_t[:] data,            # IN READ-ONLY
    const INDEX_A_t[:] indices,      # IN READ-ONLY
    const INDEX_A_t[:] indptr,       # IN READ-ONLY
    INDEX_A_t n_features,
    DATA_t[:] result_data,           # OUT
    INDEX_B_t[:] result_indices,     # OUT
    INDEX_B_t[:] result_indptr,      # OUT
    FLAG_t interaction_only,
    FLAG_t degree
):
    """
    Perform a second or third degree polynomial or interaction expansion on a
    compressed sparse row (CSR) matrix. The method used only takes products of
    non-zero features. For a matrix with density :math:`d`, this results in a
    speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of
    the expansion, assuming all rows are of similar density.

    The output buffers must be pre-allocated by the caller (sized via
    `_calc_total_nnz`); this routine only fills them in.

    Parameters
    ----------
    data : memory view on nd-array
        The "data" attribute of the input CSR matrix.

    indices : memory view on nd-array
        The "indices" attribute of the input CSR matrix.

    indptr : memory view on nd-array
        The "indptr" attribute of the input CSR matrix.

    n_features : int
        The dimensionality of the input CSR matrix.

    result_data : nd-array
        The output CSR matrix's "data" attribute.
        It is modified by this routine.

    result_indices : nd-array
        The output CSR matrix's "indices" attribute.
        It is modified by this routine.

    result_indptr : nd-array
        The output CSR matrix's "indptr" attribute.
        It is modified by this routine.

    interaction_only : int
        0 for a polynomial expansion, 1 for an interaction expansion.

    degree : int
        The degree of the expansion. This must be either 2 or 3.

    References
    ----------
    "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR
    Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
    """
    # Make the arrays that will form the CSR matrix of the expansion.
    cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr

    cdef INDEX_B_t expanded_index=0, num_cols_in_row, col

    with nogil:
        result_indptr[0] = indptr[0]
        for row_i in range(indptr.shape[0]-1):
            row_starts = indptr[row_i]
            row_ends = indptr[row_i + 1]
            num_cols_in_row = 0
            for i_ptr in range(row_starts, row_ends):
                i = indices[i_ptr]
                # Starting at `i_ptr + interaction_only` skips the i == j
                # (squared) terms when interaction_only is 1.
                for j_ptr in range(i_ptr + interaction_only, row_ends):
                    j = indices[j_ptr]
                    if degree == 2:
                        col = <INDEX_B_t> _deg2_column(
                            n_features,
                            i, j,
                            interaction_only
                        )
                        result_indices[expanded_index] = col
                        result_data[expanded_index] = (
                            data[i_ptr] * data[j_ptr]
                        )
                        expanded_index += 1
                        num_cols_in_row += 1
                    else:
                        # degree == 3
                        for k_ptr in range(j_ptr + interaction_only, row_ends):
                            k = indices[k_ptr]
                            col = <INDEX_B_t> _deg3_column(
                                n_features,
                                i, j, k,
                                interaction_only
                            )
                            result_indices[expanded_index] = col
                            result_data[expanded_index] = (
                                data[i_ptr] * data[j_ptr] * data[k_ptr]
                            )
                            expanded_index += 1
                            num_cols_in_row += 1
            # Standard CSR bookkeeping: cumulative count of stored entries.
            result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row
    return

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,548 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral
import numpy as np
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import resample
from ..utils._param_validation import Interval, Options, StrOptions
from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
from ..utils.validation import (
_check_feature_names_in,
_check_sample_weight,
check_array,
check_is_fitted,
validate_data,
)
from ._encoders import OneHotEncoder
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    quantile_method : {"inverted_cdf", "averaged_inverted_cdf",
            "closest_observation", "interpolated_inverted_cdf", "hazen",
            "weibull", "linear", "median_unbiased", "normal_unbiased"},
            default="linear"
        Method to pass on to np.percentile calculation when using
        strategy="quantile". Only `averaged_inverted_cdf` and `inverted_cdf`
        support the use of `sample_weight != None` when subsampling is not
        active.

        .. versionadded:: 1.7

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default=200_000
        Maximum number of samples, used to fit the model, for computational
        efficiency.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity,
        it is recommended to use subsampling on datasets with a
        very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
        Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4, -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2, 0.5],
    ...      [ 1, 4, -1, 2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform'
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1., 0., 1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5, 1.5, -3.5, -0.5],
           [-0.5, 2.5, -2.5, -0.5],
           [ 0.5, 3.5, -1.5, 0.5],
           [ 0.5, 3.5, -1.5, 1.5]])
    """

    # Declarative parameter validation consumed by `_fit_context`.
    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "quantile_method": [
            StrOptions(
                {
                    "warn",
                    "inverted_cdf",
                    "averaged_inverted_cdf",
                    "closest_observation",
                    "interpolated_inverted_cdf",
                    "hazen",
                    "weibull",
                    "linear",
                    "median_unbiased",
                    "normal_unbiased",
                }
            )
        ],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        quantile_method="warn",
        dtype=None,
        subsample=200_000,
        random_state=None,
    ):
        # Store parameters unmodified (scikit-learn convention: validation
        # is deferred to `fit`).
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.quantile_method = quantile_method
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.

            .. versionadded:: 1.3

            .. versionchanged:: 1.7
                Added support for strategy="uniform".

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        if self.subsample is not None and n_samples > self.subsample:
            # Take a subsample of `X`
            # When resampling, it is important to subsample **with replacement** to
            # preserve the distribution, in particular in the presence of a few data
            # points with large weights. You can check this by setting `replace=False`
            # in sklearn.utils.test.test_indexing.test_resample_weighted and check that
            # it fails as a justification for this claim.
            X = resample(
                X,
                replace=True,
                n_samples=self.subsample,
                random_state=self.random_state,
                sample_weight=sample_weight,
            )
            # Since we already used the weights when resampling when provided,
            # we set them back to `None` to avoid accounting for the weights twice
            # in subsequent operations to compute weight-aware bin edges with
            # quantiles or k-means.
            sample_weight = None

        # Re-read: resampling above may have replaced `X`.
        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        # One edge array per feature; lengths may differ, hence dtype=object.
        bin_edges = np.zeros(n_features, dtype=object)

        # TODO(1.9): remove and switch to quantile_method="averaged_inverted_cdf"
        # by default.
        quantile_method = self.quantile_method
        if self.strategy == "quantile" and quantile_method == "warn":
            warnings.warn(
                "The current default behavior, quantile_method='linear', will be "
                "changed to quantile_method='averaged_inverted_cdf' in "
                "scikit-learn version 1.9 to naturally support sample weight "
                "equivalence properties by default. Pass "
                "quantile_method='averaged_inverted_cdf' explicitly to silence this "
                "warning.",
                FutureWarning,
            )
            quantile_method = "linear"

        if (
            self.strategy == "quantile"
            and quantile_method not in ["inverted_cdf", "averaged_inverted_cdf"]
            and sample_weight is not None
        ):
            raise ValueError(
                "When fitting with strategy='quantile' and sample weights, "
                "quantile_method should either be set to 'averaged_inverted_cdf' or "
                f"'inverted_cdf', got quantile_method='{quantile_method}' instead."
            )

        if self.strategy != "quantile" and sample_weight is not None:
            # Prepare a mask to filter out zero-weight samples when extracting
            # the min and max values of each columns which are needed for the
            # "uniform" and "kmeans" strategies.
            nnz_weight_mask = sample_weight != 0
        else:
            # Otherwise, all samples are used. Use a slice to avoid creating a
            # new array.
            nnz_weight_mask = slice(None)

        for jj in range(n_features):
            column = X[:, jj]
            col_min = column[nnz_weight_mask].min()
            col_max = column[nnz_weight_mask].max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                percentile_levels = np.linspace(0, 100, n_bins[jj] + 1)

                # method="linear" is the implicit default for any numpy
                # version. So we keep it version independent in that case by
                # using an empty param dict.
                percentile_kwargs = {}
                if quantile_method != "linear" and sample_weight is None:
                    percentile_kwargs["method"] = quantile_method

                if sample_weight is None:
                    bin_edges[jj] = np.asarray(
                        np.percentile(column, percentile_levels, **percentile_kwargs),
                        dtype=np.float64,
                    )
                else:
                    # TODO: make _weighted_percentile and
                    # _averaged_weighted_percentile accept an array of
                    # quantiles instead of calling it multiple times and
                    # sorting the column multiple times as a result.
                    percentile_func = {
                        "inverted_cdf": _weighted_percentile,
                        "averaged_inverted_cdf": _averaged_weighted_percentile,
                    }[quantile_method]
                    bin_edges[jj] = np.asarray(
                        [
                            percentile_func(column, sample_weight, percentile_rank=p)
                            for p in percentile_levels
                        ],
                        dtype=np.float64,
                    )

            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                # Edges are midpoints between consecutive centers, padded
                # with the column min/max.
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width are too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width are too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            # Scalar: broadcast the same bin count to every feature.
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

        # `n_bins != orig_bins` catches non-integral values truncated by the
        # int cast above.
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            # Map values to bin ids against the interior edges only, so
            # out-of-range values fall into the first/last bin.
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, X):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        X_original : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            X = self._encoder.inverse_transform(X)

        Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            # Each bin id is mapped back to the midpoint of its bin.
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            # One-hot encodings: delegate to the fitted encoder, which knows
            # the per-feature bin categories.
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,449 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from functools import partial
import numpy as np
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils._param_validation import StrOptions
from ..utils._repr_html.estimator import _VisualBlock
from ..utils._set_output import (
_get_adapter_from_container,
_get_output_config,
)
from ..utils.metaestimators import available_if
from ..utils.validation import (
_allclose_dense_sparse,
_check_feature_names,
_check_feature_names_in,
_check_n_features,
_get_feature_names,
_is_pandas_df,
_is_polars_df,
check_array,
validate_data,
)
def _identity(X):
"""The identity function."""
return X
class FunctionTransformer(TransformerMixin, BaseEstimator):
"""Constructs a transformer from an arbitrary callable.
A FunctionTransformer forwards its X (and optionally y) arguments to a
user-defined function or function object and returns the result of this
function. This is useful for stateless transformations such as taking the
log of frequencies, doing custom scaling, etc.
Note: If a lambda is used as the function, then the resulting
transformer will not be pickleable.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <function_transformer>`.
Parameters
----------
func : callable, default=None
The callable to use for the transformation. This will be passed
the same arguments as transform, with args and kwargs forwarded.
If func is None, then func will be the identity function.
inverse_func : callable, default=None
The callable to use for the inverse transformation. This will be
passed the same arguments as inverse transform, with args and
kwargs forwarded. If inverse_func is None, then inverse_func
will be the identity function.
validate : bool, default=False
Indicate that the input X array should be checked before calling
``func``. The possibilities are:
- If False, there is no input validation.
- If True, then X will be converted to a 2-dimensional NumPy array or
sparse matrix. If the conversion is not possible an exception is
raised.
.. versionchanged:: 0.22
The default of ``validate`` changed from True to False.
accept_sparse : bool, default=False
Indicate that func accepts a sparse matrix as input. If validate is
False, this has no effect. Otherwise, if accept_sparse is false,
sparse matrix inputs will cause an exception to be raised.
check_inverse : bool, default=True
Whether to check that or ``func`` followed by ``inverse_func`` leads to
the original inputs. It can be used for a sanity check, raising a
warning when the condition is not fulfilled.
.. versionadded:: 0.20
feature_names_out : callable, 'one-to-one' or None, default=None
Determines the list of feature names that will be returned by the
`get_feature_names_out` method. If it is 'one-to-one', then the output
feature names will be equal to the input feature names. If it is a
callable, then it must take two positional arguments: this
`FunctionTransformer` (`self`) and an array-like of input feature names
(`input_features`). It must return an array-like of output feature
names. The `get_feature_names_out` method is only defined if
`feature_names_out` is not None.
See ``get_feature_names_out`` for more details.
.. versionadded:: 1.1
kw_args : dict, default=None
Dictionary of additional keyword arguments to pass to func.
.. versionadded:: 0.18
inv_kw_args : dict, default=None
Dictionary of additional keyword arguments to pass to inverse_func.
.. versionadded:: 0.18
Attributes
----------
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X` has feature
names that are all strings.
.. versionadded:: 1.0
See Also
--------
MaxAbsScaler : Scale each feature by its maximum absolute value.
StandardScaler : Standardize features by removing the mean and
scaling to unit variance.
LabelBinarizer : Binarize labels in a one-vs-all fashion.
MultiLabelBinarizer : Transform between iterable of iterables
and a multilabel format.
Notes
-----
If `func` returns an output with a `columns` attribute, then the columns is enforced
to be consistent with the output of `get_feature_names_out`.
Examples
--------
>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> transformer.transform(X)
array([[0. , 0.6931],
[1.0986, 1.3862]])
"""
_parameter_constraints: dict = {
"func": [callable, None],
"inverse_func": [callable, None],
"validate": ["boolean"],
"accept_sparse": ["boolean"],
"check_inverse": ["boolean"],
"feature_names_out": [callable, StrOptions({"one-to-one"}), None],
"kw_args": [dict, None],
"inv_kw_args": [dict, None],
}
    def __init__(
        self,
        func=None,
        inverse_func=None,
        *,
        validate=False,
        accept_sparse=False,
        check_inverse=True,
        feature_names_out=None,
        kw_args=None,
        inv_kw_args=None,
    ):
        # Store parameters unmodified (scikit-learn convention: any
        # validation happens in `fit`). See the class docstring for the
        # meaning of each parameter.
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.feature_names_out = feature_names_out
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args
    def _check_input(self, X, *, reset):
        """Validate ``X`` when ``self.validate`` is True.

        When ``validate`` is False, ``X`` is returned untouched; if ``reset``
        is True the input metadata (``n_features_in_`` /
        ``feature_names_in_``) is still recorded.
        """
        if self.validate:
            return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
        elif reset:
            # Set feature_names_in_ and n_features_in_ even if validate=False
            # We run this only when reset==True to store the attributes but not
            # validate them, because validate=False
            _check_n_features(self, X, reset=reset)
            _check_feature_names(self, X, reset=reset)
        return X
    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are the inverse.

        A subsample of roughly 100 rows is round-tripped through
        ``transform`` then ``inverse_transform``; a ``UserWarning`` is
        emitted if the result does not match the input.
        """
        # Stride-based subsample: at most ~100 rows are round-tripped.
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

        if hasattr(X, "dtype"):
            dtypes = [X.dtype]
        elif hasattr(X, "dtypes"):
            # Dataframes can have multiple dtypes
            dtypes = X.dtypes
        # NOTE(review): if X has neither `dtype` nor `dtypes`, `dtypes` is
        # unbound below (NameError) — presumably unreachable for the inputs
        # this is called with; confirm.

        # Not all dtypes are numpy dtypes, they can be pandas dtypes as well
        if not all(
            isinstance(d, np.dtype) and np.issubdtype(d, np.number) for d in dtypes
        ):
            raise ValueError(
                "'check_inverse' is only supported when all the elements in `X` is"
                " numerical."
            )

        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn(
                (
                    "The provided functions are not strictly"
                    " inverse of each other. If you are sure you"
                    " want to proceed regardless, set"
                    " 'check_inverse=False'."
                ),
                UserWarning,
            )
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FunctionTransformer class instance.
        """
        X = self._check_input(X, reset=True)
        # The round-trip check only makes sense when both directions are
        # user-provided; a None side defaults to identity and is skipped.
        if self.check_inverse and not (self.func is None or self.inverse_func is None):
            self._check_inverse_transform(X)
        return self
    def transform(self, X):
        """Transform X using the forward function.
        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.
        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        X = self._check_input(X, reset=False)
        out = self._transform(X, func=self.func, kw_args=self.kw_args)
        # Dense output container requested via set_output:
        # "default", "pandas" or "polars".
        output_config = _get_output_config("transform", self)["dense"]
        if hasattr(out, "columns") and self.feature_names_out is not None:
            # check the consistency between the column provided by `transform` and
            # the column names provided by `get_feature_names_out`.
            feature_names_out = self.get_feature_names_out()
            if list(out.columns) != list(feature_names_out):
                # we can override the column names of the output if it is inconsistent
                # with the column names provided by `get_feature_names_out` in the
                # following cases:
                # * `func` preserved the column names between the input and the output
                # * the input column names are all numbers
                # * the output is requested to be a DataFrame (pandas or polars)
                feature_names_in = getattr(
                    X, "feature_names_in_", _get_feature_names(X)
                )
                same_feature_names_in_out = feature_names_in is not None and list(
                    feature_names_in
                ) == list(out.columns)
                not_all_str_columns = not all(
                    isinstance(col, str) for col in out.columns
                )
                if same_feature_names_in_out or not_all_str_columns:
                    # Rebuild the container with the authoritative column
                    # names from `get_feature_names_out`.
                    adapter = _get_adapter_from_container(out)
                    out = adapter.create_container(
                        X_output=out,
                        X_original=out,
                        columns=feature_names_out,
                        inplace=False,
                    )
                else:
                    raise ValueError(
                        "The output generated by `func` have different column names "
                        "than the ones provided by `get_feature_names_out`. "
                        f"Got output with columns names: {list(out.columns)} and "
                        "`get_feature_names_out` returned: "
                        f"{list(self.get_feature_names_out())}. "
                        "The column names can be overridden by setting "
                        "`set_output(transform='pandas')` or "
                        "`set_output(transform='polars')` such that the column names "
                        "are set to the names provided by `get_feature_names_out`."
                    )
        if self.feature_names_out is None:
            # Warn (do not fail) when the configured output container cannot
            # be honored because `func` returned a different type and no
            # `feature_names_out` is available to build column names.
            warn_msg = (
                "When `set_output` is configured to be '{0}', `func` should return "
                "a {0} DataFrame to follow the `set_output` API or `feature_names_out`"
                " should be defined."
            )
            if output_config == "pandas" and not _is_pandas_df(out):
                warnings.warn(warn_msg.format("pandas"))
            elif output_config == "polars" and not _is_polars_df(out):
                warnings.warn(warn_msg.format("polars"))
        return out
def inverse_transform(self, X):
"""Transform X using the inverse function.
Parameters
----------
X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
if `validate=True` else any object that `inverse_func` can handle
Input array.
Returns
-------
X_original : array-like, shape (n_samples, n_features)
Transformed input.
"""
if self.validate:
X = check_array(X, accept_sparse=self.accept_sparse)
return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)
    @available_if(lambda self: self.feature_names_out is not None)
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.
        This method is only defined if `feature_names_out` is not None.
        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names.
            - If `input_features` is None, then `feature_names_in_` is
              used as the input feature names. If `feature_names_in_` is not
              defined, then names are generated:
              `[x0, x1, ..., x(n_features_in_ - 1)]`.
            - If `input_features` is array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.
        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
            - If `feature_names_out` is 'one-to-one', the input feature names
              are returned (see `input_features` above). This requires
              `feature_names_in_` and/or `n_features_in_` to be defined, which
              is done automatically if `validate=True`. Alternatively, you can
              set them in `func`.
            - If `feature_names_out` is a callable, then it is called with two
              arguments, `self` and `input_features`, and its return value is
              returned by this method.
        """
        # Only resolve/validate input names when we know the number of input
        # features or the caller supplied names explicitly.
        if hasattr(self, "n_features_in_") or input_features is not None:
            input_features = _check_feature_names_in(self, input_features)
        if self.feature_names_out == "one-to-one":
            names_out = input_features
        elif callable(self.feature_names_out):
            names_out = self.feature_names_out(self, input_features)
        else:
            raise ValueError(
                f"feature_names_out={self.feature_names_out!r} is invalid. "
                'It must either be "one-to-one" or a callable with two '
                "arguments: the function transformer and an array-like of "
                "input feature names. The callable must return an array-like "
                "of output feature names."
            )
        return np.asarray(names_out, dtype=object)
def _transform(self, X, func=None, kw_args=None):
if func is None:
func = _identity
return func(X, **(kw_args if kw_args else {}))
def __sklearn_is_fitted__(self):
"""Return True since FunctionTransfomer is stateless."""
return True
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.no_validation = not self.validate
tags.requires_fit = False
tags.input_tags.sparse = not self.validate or self.accept_sparse
return tags
def set_output(self, *, transform=None):
"""Set output container.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
Parameters
----------
transform : {"default", "pandas", "polars"}, default=None
Configure output of `transform` and `fit_transform`.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `"polars"`: Polars output
- `None`: Transform configuration is unchanged
.. versionadded:: 1.4
`"polars"` option was added.
Returns
-------
self : estimator instance
Estimator instance.
"""
if not hasattr(self, "_sklearn_output_config"):
self._sklearn_output_config = {}
self._sklearn_output_config["transform"] = transform
return self
def _get_function_name(self):
"""Get the name display of the `func` used in HTML representation."""
if hasattr(self.func, "__name__"):
return self.func.__name__
if isinstance(self.func, partial):
return self.func.func.__name__
return f"{self.func.__class__.__name__}(...)"
def _sk_visual_block_(self):
return _VisualBlock(
"single",
self,
names=self._get_function_name(),
name_details=str(self),
name_caption="FunctionTransformer",
doc_link_label="FunctionTransformer",
)

View File

@@ -0,0 +1,963 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import array
import itertools
import warnings
from collections import defaultdict
from numbers import Integral
import numpy as np
import scipy.sparse as sp
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import column_or_1d
from ..utils._array_api import device, get_namespace, xpx
from ..utils._encode import _encode, _unique
from ..utils._param_validation import Interval, validate_params
from ..utils.multiclass import type_of_target, unique_labels
from ..utils.sparsefuncs import min_max_axis
from ..utils.validation import _num_samples, check_array, check_is_fitted
__all__ = [
"LabelBinarizer",
"LabelEncoder",
"MultiLabelBinarizer",
"label_binarize",
]
class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels with value between 0 and n_classes-1.
    This transformer should be used to encode target values, *i.e.* `y`, and
    not the input `X`.
    Read more in the :ref:`User Guide <preprocessing_targets>`.
    .. versionadded:: 0.12
    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.
    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.
    Examples
    --------
    `LabelEncoder` can be used to normalize labels.
    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])
    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.
    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')]
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')]
    """
    def fit(self, y):
        """Fit label encoder.
        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.
        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        y = column_or_1d(y, warn=True)
        self.classes_ = _unique(y)
        return self
    def fit_transform(self, y):
        """Fit label encoder and return encoded labels.
        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.
        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        y = column_or_1d(y, warn=True)
        # Single pass: `return_inverse=True` yields the encoded labels
        # alongside the unique classes, avoiding a second encoding step.
        self.classes_, y = _unique(y, return_inverse=True)
        return y
    def transform(self, y):
        """Transform labels to normalized encoding.
        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.
        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        xp, _ = get_namespace(y)
        y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return xp.asarray([])
        return _encode(y, uniques=self.classes_)
    def inverse_transform(self, y):
        """Transform labels back to original encoding.
        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.
        Returns
        -------
        y_original : ndarray of shape (n_samples,)
            Original encoding.
        """
        check_is_fitted(self)
        xp, _ = get_namespace(y)
        y = column_or_1d(y, warn=True)
        # inverse transform of empty array is empty array
        if _num_samples(y) == 0:
            return xp.asarray([])
        # Any encoded value outside [0, n_classes) was never produced by
        # transform, so reject it explicitly.
        diff = xpx.setdiff1d(
            y,
            xp.arange(self.classes_.shape[0], device=device(y)),
            xp=xp,
        )
        if diff.shape[0]:
            raise ValueError("y contains previously unseen labels: %s" % str(diff))
        y = xp.asarray(y)
        return xp.take(self.classes_, y, axis=0)
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # Targets are 1d label vectors; 2d X-style validation does not apply.
        tags.array_api_support = True
        tags.input_tags.two_d_array = False
        tags.target_tags.one_d_labels = True
        return tags
class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.
    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.
    At learning time, this simply consists in learning one regressor
    or binary classifier per class. In doing so, one needs to convert
    multi-class labels to binary labels (belong or does not belong
    to the class). `LabelBinarizer` makes this process easy with the
    transform method.
    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. `LabelBinarizer` makes this easy
    with the :meth:`inverse_transform` method.
    Read more in the :ref:`User Guide <preprocessing_targets>`.
    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.
    pos_label : int, default=1
        Value with which positive labels must be encoded.
    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.
    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.
    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.
    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.
    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.
    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])
    Binary targets transform to a column vector
    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    Passing a 2D matrix for multilabel classification
    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """
    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }
    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.
        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.
        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if self.neg_label >= self.pos_label:
            raise ValueError(
                f"neg_label={self.neg_label} must be strictly less than "
                f"pos_label={self.pos_label}."
            )
        # Sparse CSR output cannot represent a non-zero neg_label (the zeros
        # are implicit), hence the restriction below.
        if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
            )
        self.y_type_ = type_of_target(y, input_name="y")
        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
        # Remember input sparseness so inverse_transform can mirror it.
        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self
    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.
        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.
        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.
        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        return self.fit(y).transform(y)
    def transform(self, y):
        """Transform multi-class labels to binary labels.
        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.
        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.
        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        check_is_fitted(self)
        y_is_multilabel = type_of_target(y).startswith("multilabel")
        if y_is_multilabel and not self.y_type_.startswith("multilabel"):
            raise ValueError("The object was not fitted with multilabel input.")
        # Delegate the actual encoding to the functional form with the
        # classes learned during fit.
        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )
    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.
        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.
        threshold : float, default=None
            Threshold used in the binary and multi-label cases.
            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.
            If None, the threshold is assumed to be half way between
            neg_label and pos_label.
        Returns
        -------
        y_original : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.
        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)
        if threshold is None:
            # Default: midpoint between the two encoded label values.
            threshold = (self.pos_label + self.neg_label) / 2.0
        if self.y_type_ == "multiclass":
            y_inv = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            y_inv = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )
        # Mirror the sparseness of the data seen at fit time.
        if self.sparse_input_:
            y_inv = sp.csr_matrix(y_inv)
        elif sp.issparse(y_inv):
            y_inv = y_inv.toarray()
        return y_inv
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.two_d_array = False
        tags.target_tags.one_d_labels = True
        return tags
@validate_params(
    {
        "y": ["array-like", "sparse matrix"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.
    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.
    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.
    Parameters
    ----------
    y : array-like or sparse matrix
        Sequence of integer labels or multilabel data to encode.
    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.
    neg_label : int, default=0
        Value with which negative labels must be encoded.
    pos_label : int, default=1
        Value with which positive labels must be encoded.
    sparse_output : bool, default=False,
        Set to true if output binary array is desired in CSR sparse format.
    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.
    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.
    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])
    The class ordering is preserved:
    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])
    Binary targets transform to a column vector
    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if not isinstance(y, list):
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )
    else:
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
    if neg_label >= pos_label:
        raise ValueError(
            "neg_label={0} must be strictly less than pos_label={1}.".format(
                neg_label, pos_label
            )
        )
    # CSR output stores only pos_label entries; implicit zeros play the role
    # of neg_label, hence the restriction below.
    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            "pos_label={0} and neg_label={1}"
            "".format(pos_label, neg_label)
        )
    # To account for pos_label == 0 in the dense case
    pos_switch = pos_label == 0
    if pos_switch:
        pos_label = -neg_label
    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")
    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)
    if y_type == "binary":
        if n_classes == 1:
            # Degenerate single-class case: every sample is "negative".
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            else:
                Y = np.zeros((len(y), 1), dtype=int)
                Y += neg_label
                return Y
        elif len(classes) >= 3:
            y_type = "multiclass"
    sorted_class = np.sort(classes)
    if y_type == "multilabel-indicator":
        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != y_n_classes:
            raise ValueError(
                "classes {0} mismatch with the labels {1} found in the data".format(
                    classes, unique_labels(y)
                )
            )
    if y_type in ("binary", "multiclass"):
        y = column_or_1d(y)
        # pick out the known labels from y
        y_in_classes = np.isin(y, classes)
        y_seen = y[y_in_classes]
        indices = np.searchsorted(sorted_class, y_seen)
        # One row per sample; samples with an unknown label contribute no
        # stored entry (their cumsum increment is 0).
        indptr = np.hstack((0, np.cumsum(y_in_classes)))
        data = np.empty_like(indices)
        data.fill(pos_label)
        Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
    elif y_type == "multilabel-indicator":
        Y = sp.csr_matrix(y)
        if pos_label != 1:
            data = np.empty_like(Y.data)
            data.fill(pos_label)
            Y.data = data
    else:
        raise ValueError(
            "%s target data is not supported with label binarization" % y_type
        )
    if not sparse_output:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)
        if neg_label != 0:
            Y[Y == 0] = neg_label
        if pos_switch:
            Y[Y == pos_label] = 0
    else:
        Y.data = Y.data.astype(int, copy=False)
    # preserve label ordering
    if np.any(classes != sorted_class):
        indices = np.searchsorted(sorted_class, classes)
        Y = Y[:, indices]
    if y_type == "binary":
        # Binary targets are reported as a single column: the last column,
        # i.e. the positive class in sorted order.
        if sparse_output:
            Y = Y[:, [-1]]
        else:
            Y = Y[:, -1].reshape((-1, 1))
    return Y
def _inverse_binarize_multiclass(y, classes):
"""Inverse label binarization transformation for multiclass.
Multiclass uses the maximal score instead of a threshold.
"""
classes = np.asarray(classes)
if sp.issparse(y):
# Find the argmax for each row in y where y is a CSR matrix
y = y.tocsr()
n_samples, n_outputs = y.shape
outputs = np.arange(n_outputs)
row_max = min_max_axis(y, 1)[1]
row_nnz = np.diff(y.indptr)
y_data_repeated_max = np.repeat(row_max, row_nnz)
# picks out all indices obtaining the maximum per row
y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)
# For corner case where last row has a max of 0
if row_max[-1] == 0:
y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])
# Gets the index of the first argmax in each row from y_i_all_argmax
index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
# first argmax of each row
y_ind_ext = np.append(y.indices, [0])
y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
# Handle rows of all 0
y_i_argmax[np.where(row_nnz == 0)[0]] = 0
# Handles rows with max of 0 that contain negative numbers
samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
for i in samples:
ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]
return classes[y_i_argmax]
else:
return classes.take(y.argmax(axis=1), mode="clip")
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
"""Inverse label binarization transformation using thresholding."""
if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))
if output_type != "binary" and y.shape[1] != len(classes):
raise ValueError(
"The number of class is not equal to the number of dimension of y."
)
classes = np.asarray(classes)
# Perform thresholding
if sp.issparse(y):
if threshold > 0:
if y.format not in ("csr", "csc"):
y = y.tocsr()
y.data = np.array(y.data > threshold, dtype=int)
y.eliminate_zeros()
else:
y = np.array(y.toarray() > threshold, dtype=int)
else:
y = np.array(y > threshold, dtype=int)
# Inverse transform data
if output_type == "binary":
if sp.issparse(y):
y = y.toarray()
if y.ndim == 2 and y.shape[1] == 2:
return classes[y[:, 1]]
else:
if len(classes) == 1:
return np.repeat(classes[0], len(y))
else:
return classes[y.ravel()]
elif output_type == "multilabel-indicator":
return y
else:
raise ValueError("{0} format is not supported".format(output_type))
class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
"""Transform between iterable of iterables and a multilabel format.
Although a list of sets or tuples is a very intuitive format for multilabel
data, it is unwieldy to process. This transformer converts between this
intuitive format and the supported multilabel format: a (samples x classes)
binary matrix indicating the presence of a class label.
Parameters
----------
classes : array-like of shape (n_classes,), default=None
Indicates an ordering for the class labels.
All entries should be unique (cannot contain duplicate classes).
sparse_output : bool, default=False
Set to True if output binary array is desired in CSR sparse format.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
A copy of the `classes` parameter when provided.
Otherwise it corresponds to the sorted set of classes found
when fitting.
See Also
--------
OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
scheme.
Examples
--------
>>> from sklearn.preprocessing import MultiLabelBinarizer
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
[0, 0, 1]])
>>> mlb.classes_
array([1, 2, 3])
>>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
array([[0, 1, 1],
[1, 0, 0]])
>>> list(mlb.classes_)
['comedy', 'sci-fi', 'thriller']
A common mistake is to pass in a list, which leads to the following issue:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
MultiLabelBinarizer()
>>> mlb.classes_
array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
'y'], dtype=object)
To correct this, the list of labels should be passed in as:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
MultiLabelBinarizer()
>>> mlb.classes_
array(['comedy', 'sci-fi', 'thriller'], dtype=object)
"""
_parameter_constraints: dict = {
"classes": ["array-like", None],
"sparse_output": ["boolean"],
}
def __init__(self, *, classes=None, sparse_output=False):
self.classes = classes
self.sparse_output = sparse_output
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, y):
"""Fit the label sets binarizer, storing :term:`classes_`.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
self : object
Fitted estimator.
"""
self._cached_dict = None
if self.classes is None:
classes = sorted(set(itertools.chain.from_iterable(y)))
elif len(set(self.classes)) < len(self.classes):
raise ValueError(
"The classes argument contains duplicate "
"classes. Remove these duplicates before passing "
"them to MultiLabelBinarizer."
)
else:
classes = self.classes
dtype = int if all(isinstance(c, int) for c in classes) else object
self.classes_ = np.empty(len(classes), dtype=dtype)
self.classes_[:] = classes
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, y):
"""Fit the label sets binarizer and transform the given label sets.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
format.
"""
if self.classes is not None:
return self.fit(y).transform(y)
self._cached_dict = None
# Automatically increment on new class
class_mapping = defaultdict(int)
class_mapping.default_factory = class_mapping.__len__
yt = self._transform(y, class_mapping)
# sort classes and reorder columns
tmp = sorted(class_mapping, key=class_mapping.get)
# (make safe for tuples)
dtype = int if all(isinstance(c, int) for c in tmp) else object
class_mapping = np.empty(len(tmp), dtype=dtype)
class_mapping[:] = tmp
self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
# ensure yt.indices keeps its current dtype
yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)
if not self.sparse_output:
yt = yt.toarray()
return yt
def transform(self, y):
"""Transform the given label sets.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
y_indicator : array or CSR matrix, shape (n_samples, n_classes)
A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
`y[i]`, and 0 otherwise.
"""
check_is_fitted(self)
class_to_index = self._build_cache()
yt = self._transform(y, class_to_index)
if not self.sparse_output:
yt = yt.toarray()
return yt
def _build_cache(self):
if self._cached_dict is None:
self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))
return self._cached_dict
def _transform(self, y, class_mapping):
"""Transforms the label sets with a given mapping.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
class_mapping : Mapping
Maps from label to column index in label indicator matrix.
Returns
-------
y_indicator : sparse matrix of shape (n_samples, n_classes)
Label indicator matrix. Will be of CSR format.
"""
indices = array.array("i")
indptr = array.array("i", [0])
unknown = set()
for labels in y:
index = set()
for label in labels:
try:
index.add(class_mapping[label])
except KeyError:
unknown.add(label)
indices.extend(index)
indptr.append(len(indices))
if unknown:
warnings.warn(
"unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
)
data = np.ones(len(indices), dtype=int)
return sp.csr_matrix(
(data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
)
def inverse_transform(self, yt):
"""Transform the given indicator matrix into label sets.
Parameters
----------
yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
A matrix containing only 1s ands 0s.
Returns
-------
y_original : list of tuples
The set of labels for each sample such that `y[i]` consists of
`classes_[j]` for each `yt[i, j] == 1`.
"""
check_is_fitted(self)
if yt.shape[1] != len(self.classes_):
raise ValueError(
"Expected indicator for {0} classes, but got {1}".format(
len(self.classes_), yt.shape[1]
)
)
if sp.issparse(yt):
yt = yt.tocsr()
if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
raise ValueError("Expected only 0s and 1s in label indicator.")
return [
tuple(self.classes_.take(yt.indices[start:end]))
for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
]
else:
unexpected = np.setdiff1d(yt, [0, 1])
if len(unexpected) > 0:
raise ValueError(
"Expected only 0s and 1s in label indicator. Also got {0}".format(
unexpected
)
)
return [tuple(self.classes_.compress(indicators)) for indicators in yt]
    def __sklearn_tags__(self):
        """Return estimator tags adjusted for multilabel input and targets."""
        tags = super().__sklearn_tags__()
        # Input is an iterable of label sets, not a 2d feature array.
        tags.input_tags.two_d_array = False
        # Targets are 2d label-indicator structures.
        tags.target_tags.two_d_labels = True
        return tags

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,534 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral, Real
import numpy as np
from ..base import OneToOneFeatureMixin, _fit_context
from ..utils._param_validation import Interval, StrOptions
from ..utils.multiclass import type_of_target
from ..utils.validation import (
_check_feature_names_in,
_check_y,
check_consistent_length,
check_is_fitted,
)
from ._encoders import _BaseEncoder
from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth
class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2,

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float
        The overall mean of the target. This value is only used in :meth:`transform`
        to encode categories.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
        Contrary to TargetEncoder, this encoding is not supervised. Treating the
        resulting encoding as a numerical feature therefore leads to arbitrarily
        ordered values and typically to lower predictive performance
        when used as preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27-32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)
    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    np.float64(44.3)
    >>> enc_high_smooth.encodings_
    [array([44.1, 44.4, 44.3])]
    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([21, 80.8, 43.2])]
    """

    # Declarative parameter constraints checked by scikit-learn's validation
    # machinery (triggered through the `_fit_context` decorator below).
    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
        "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
        "cv": [Interval(Integral, 2, None, closed="left")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        categories="auto",
        target_type="auto",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=None,
    ):
        self.categories = categories
        self.smooth = smooth
        self.target_type = target_type
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the :class:`TargetEncoder` to X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        self._fit_encodings_all(X, y)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y):
        """Fit :class:`TargetEncoder` and transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
(n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)
        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
        else:
            cv = StratifiedKFold(
                self.cv, shuffle=self.shuffle, random_state=self.random_state
            )
        # If 'multiclass' multiply axis=1 by num classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)
        # Cross fitting: encodings learnt on each training fold are applied
        # only to the corresponding held-out fold, so no row is encoded with
        # statistics computed from its own target value.
        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)
            if self.target_type_ == "multiclass":
                encodings = self._fit_encoding_multiclass(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            else:
                encodings = self._fit_encoding_binary_or_continuous(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                ~X_known_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out

    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
(n_samples, (n_features * n_classes))
            Transformed input.
        """
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", ensure_all_finite="allow-nan"
        )
        # If 'multiclass' multiply axis=1 by num of classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)
        # Unlike fit_transform, all rows are encoded with the full-data
        # encodings learnt in fit (`encodings_` / `target_mean_`).
        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out

    def _fit_encodings_all(self, X, y):
        """Fit a target encoding with all the data.

        Returns the ordinal-encoded X, the known-category mask, the encoded
        target, and the per-feature category counts so that `fit_transform`
        can reuse them for its per-fold encodings.
        """
        # avoid circular import
        from ..preprocessing import (
            LabelBinarizer,
            LabelEncoder,
        )

        check_consistent_length(X, y)
        self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan")
        if self.target_type == "auto":
            accepted_target_types = ("binary", "multiclass", "continuous")
            inferred_type_of_target = type_of_target(y, input_name="y")
            if inferred_type_of_target not in accepted_target_types:
                raise ValueError(
                    "Unknown label type: Target type was inferred to be "
                    f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                    "supported."
                )
            self.target_type_ = inferred_type_of_target
        else:
            self.target_type_ = self.target_type
        self.classes_ = None
        if self.target_type_ == "binary":
            # Binary targets become {0, 1} so their mean is P(y == classes_[1]).
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            self.classes_ = label_encoder.classes_
        elif self.target_type_ == "multiclass":
            # One-vs-all binarization: y becomes (n_samples, n_classes).
            label_binarizer = LabelBinarizer()
            y = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
        else:  # continuous
            y = _check_y(y, y_numeric=True, estimator=self)
        self.target_mean_ = np.mean(y, axis=0)
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", ensure_all_finite="allow-nan"
        )
        n_categories = np.fromiter(
            (len(category_for_feature) for category_for_feature in self.categories_),
            dtype=np.int64,
            count=len(self.categories_),
        )
        if self.target_type_ == "multiclass":
            encodings = self._fit_encoding_multiclass(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        else:
            encodings = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        self.encodings_ = encodings
        return X_ordinal, X_known_mask, y, n_categories

    def _fit_encoding_binary_or_continuous(
        self, X_ordinal, y, n_categories, target_mean
    ):
        """Learn target encodings.

        Dispatches to the Cython helpers: empirical-Bayes smoothing when
        `smooth="auto"`, fixed-`smooth` shrinkage otherwise.
        """
        if self.smooth == "auto":
            y_variance = np.var(y)
            encodings = _fit_encoding_fast_auto_smooth(
                X_ordinal,
                y,
                n_categories,
                target_mean,
                y_variance,
            )
        else:
            encodings = _fit_encoding_fast(
                X_ordinal,
                y,
                n_categories,
                self.smooth,
                target_mean,
            )
        return encodings

    def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
        """Learn multiclass encodings.

        Learn encodings for each class (c) then reorder encodings such that
        the same features (f) are grouped together. `reorder_index` enables
        converting from:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
        to:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
        """
        n_features = self.n_features_in_
        n_classes = len(self.classes_)
        encodings = []
        for i in range(n_classes):
            # One binary encoding problem per one-vs-all column of y.
            y_class = y[:, i]
            encoding = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y_class,
                n_categories,
                target_mean[i],
            )
            encodings.extend(encoding)
        reorder_index = (
            idx
            for start in range(n_features)
            for idx in range(start, (n_classes * n_features), n_features)
        )
        return [encodings[idx] for idx in reorder_index]

    def _transform_X_ordinal(
        self,
        X_out,
        X_ordinal,
        X_unknown_mask,
        row_indices,
        encodings,
        target_mean,
    ):
        """Transform X_ordinal using encodings.

        In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
        (axis=1) size `n_features`, while `encodings` has length of size
        `n_features * n_classes`. `feat_idx` deals with this by repeating
        feature indices by `n_classes` E.g., for 3 features, 2 classes:
        0,0,1,1,2,2

        Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx`
        cycles through 0 to `n_classes` - 1, `n_features` times.
        """
        if self.target_type_ == "multiclass":
            n_classes = len(self.classes_)
            for e_idx, encoding in enumerate(encodings):
                # Repeat feature indices by n_classes
                feat_idx = e_idx // n_classes
                # Cycle through each class
                mean_idx = e_idx % n_classes
                X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]]
                # Unknown categories fall back to the (per-class) target mean.
                X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx]
        else:
            for e_idx, encoding in enumerate(encodings):
                X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]]
                X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names. `feature_names_in_` is used unless it is
            not defined, in which case the following input feature names are
            generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            When `type_of_target_` is "multiclass" the names are of the format
            '<feature_name>_<class_name>'.
        """
        check_is_fitted(self, "n_features_in_")
        feature_names = _check_feature_names_in(self, input_features)
        if self.target_type_ == "multiclass":
            feature_names = [
                f"{feature_name}_{class_name}"
                for feature_name in feature_names
                for class_name in self.classes_
            ]
            return np.asarray(feature_names, dtype=object)
        else:
            return feature_names

    def __sklearn_tags__(self):
        """Return estimator tags; this encoder requires a target `y` to fit."""
        tags = super().__sklearn_tags__()
        tags.target_tags.required = True
        return tags

View File

@@ -0,0 +1,167 @@
from libc.math cimport isnan
from libcpp.vector cimport vector
from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t
import numpy as np
ctypedef fused INT_DTYPE:
int64_t
int32_t
ctypedef fused Y_DTYPE:
int64_t
int32_t
float64_t
float32_t
def _fit_encoding_fast(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double smooth,
    double y_mean,
):
    """Fit a target encoding on X_int and y.

    This implementation uses Eq 7 from [1] to compute the encoding.
    As stated in the paper, Eq 7 is the same as Eq 3.

    The fixed `smooth` acts as a pseudo-count of `smooth` observations at
    `y_mean` added to each category before averaging.

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
    categorical attributes in classification and prediction problems"
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        double smooth_sum = smooth * y_mean
        # Scratch buffers sized for the widest feature, reused per feature.
        int64_t max_n_cats = np.max(n_categories)
        double[::1] sums = np.empty(max_n_cats, dtype=np.float64)
        double[::1] counts = np.empty(max_n_cats, dtype=np.float64)
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        # Allocate the per-feature output array; keep both a raw pointer
        # (for nogil writes) and the ndarray view returned to the caller.
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            # Seed accumulators with the smoothing pseudo-counts.
            for cat_idx in range(n_cats):
                sums[cat_idx] = smooth_sum
                counts[cat_idx] = smooth

            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                sums[X_int_tmp] += y[sample_idx]
                counts[X_int_tmp] += 1.0

            for cat_idx in range(n_cats):
                # counts can only be 0 when smooth == 0 and the category is
                # absent from X_int; fall back to the global mean then.
                if counts[cat_idx] == 0:
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx]
    return encodings
def _fit_encoding_fast_auto_smooth(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double y_mean,
    double y_variance,
):
    """Fit a target encoding on X_int and y with auto smoothing.

    This implementation uses Eq 5 and 6 from [1]: each category's encoding is
    `lambda * category_mean + (1 - lambda) * y_mean`, where the shrinkage
    factor `lambda` is derived from the between/within-category variances.

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
    categorical attributes in classification and prediction problems"
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        double diff
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        # Scratch buffers sized for the widest feature, reused per feature.
        int64_t max_n_cats = np.max(n_categories)
        double[::1] means = np.empty(max_n_cats, dtype=np.float64)
        int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
        double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
        double lambda_
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        # Allocate the per-feature output array; keep both a raw pointer
        # (for nogil writes) and the ndarray view returned to the caller.
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's
    # probably good to parallelize the outer loop. When n_features is too small,
    # then it would probably better to parallelize the nested loops on n_samples and
    # n_cats, but the code to handle thread-local temporary variables might be
    # significantly more complex.
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] = 0.0
                counts[cat_idx] = 0
                sum_of_squared_diffs[cat_idx] = 0.0

            # first pass to compute the mean
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                counts[X_int_tmp] += 1
                means[X_int_tmp] += y[sample_idx]

            for cat_idx in range(n_cats):
                # An unseen category gives 0/0 = nan here; handled via the
                # isnan check on lambda_ below.
                means[cat_idx] /= counts[cat_idx]

            # second pass to compute the sum of squared differences
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                if X_int_tmp == -1:
                    continue
                diff = y[sample_idx] - means[X_int_tmp]
                sum_of_squared_diffs[X_int_tmp] += diff * diff

            for cat_idx in range(n_cats):
                # Eq 5/6: shrink the category mean towards the global mean in
                # proportion to how noisy the within-category estimate is.
                lambda_ = (
                    y_variance * counts[cat_idx] /
                    (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
                     counts[cat_idx])
                )
                if isnan(lambda_):
                    # A nan can happen when:
                    # 1. counts[cat_idx] == 0
                    # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = (
                        lambda_ * means[cat_idx] + (1 - lambda_) * y_mean
                    )
    return encodings

View File

@@ -0,0 +1,13 @@
# Compiled extension modules for sklearn.preprocessing.
py.extension_module(
  '_csr_polynomial_expansion',
  [cython_gen.process('_csr_polynomial_expansion.pyx'), utils_cython_tree],
  subdir: 'sklearn/preprocessing',
  install: true
)

# Built with the C++ Cython generator: the .pyx uses libcpp (std::vector).
py.extension_module(
  '_target_encoder_fast',
  [cython_gen_cpp.process('_target_encoder_fast.pyx'), utils_cython_tree],
  subdir: 'sklearn/preprocessing',
  install: true
)

View File

@@ -0,0 +1,187 @@
import warnings
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
MaxAbsScaler,
MinMaxScaler,
PowerTransformer,
QuantileTransformer,
RobustScaler,
StandardScaler,
maxabs_scale,
minmax_scale,
power_transform,
quantile_transform,
robust_scale,
scale,
)
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import (
BSR_CONTAINERS,
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DIA_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
# Shared fixture: the iris dataset, loaded once at module import time.
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that each scaler/transformer passes NaN through untouched.

    Verifies: NaNs in, NaNs out (and only them); the functional API matches
    the estimator API; inverse_transform preserves NaN; per-column fitting on
    NaN-free data still transforms NaN inputs; and sparse inputs (when
    supported) agree with dense results.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    # Scatter NaNs at random (row, col) positions.
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        # box-cox requires strictly positive input.
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))
    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for kwarg in omit_kwargs:
        _ = kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [i]])
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])
    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        for sparse_container in (
            BSR_CONTAINERS
            + COO_CONTAINERS
            + CSC_CONTAINERS
            + CSR_CONTAINERS
            + DIA_CONTAINERS
            + DOK_CONTAINERS
            + LIL_CONTAINERS
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_container(X_train)
            X_test_sp = sparse_container(X_test)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (
            PowerTransformer("box-cox"),
            power_transform,
        ),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    """Check that pandas nullable-integer columns (pd.NA) transform like
    the equivalent NumPy array with np.nan."""
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip("pandas")
    X = np.array(
        [
            [1, 2, 3, np.nan, np.nan, 4, 5, 1],
            [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
        ]
    ).T
    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    # Column "c" has no missing values, so it can hold a plain int dtype.
    X_df["c"] = X_df["c"].astype("int")
    X_trans = est.fit_transform(X)
    X_df_trans = est.fit_transform(X_df)
    assert_allclose(X_trans, X_df_trans)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,665 @@
import warnings
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn import clone
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
# Shared 4x4 fixture (4 samples, 4 features) used by the discretizer tests.
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            None,
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [0, 1, 1, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
    ],
)
def test_fit_transform(strategy, quantile_method, expected, sample_weight):
    """Check ordinal-encoded bin assignments for each binning strategy,
    with and without sample weights."""
    est = KBinsDiscretizer(
        n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method
    )
    with ignore_warnings(category=UserWarning):
        # Ignore the warning on removed small bins.
        est.fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)
def test_valid_n_bins():
    """Check that valid `n_bins` values (Python int and NumPy integer scalar)
    are accepted and that the fitted `n_bins_` has integer dtype."""
    KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X)
    # A NumPy integer scalar must work like a plain int.
    KBinsDiscretizer(
        n_bins=np.array([2])[0], quantile_method="averaged_inverted_cdf"
    ).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit(
        X
    ).n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins_array():
    """Check the error messages raised for malformed `n_bins` arrays."""
    # Bad shape
    n_bins = np.full((2, 4), 2.0)
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = (
        "KBinsDiscretizer received an invalid number of bins "
        "at indices 0, 3. Number of bins must be at least 2, "
        "and must be an int."
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = (
        "KBinsDiscretizer received an invalid number of bins "
        "at indices 0, 2. Number of bins must be at least 2, "
        "and must be an int."
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "linear",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
            [0, 1, 3, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
    ],
)
def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight):
    """Check per-feature `n_bins` arrays: bin assignments and the shape of the
    fitted `bin_edges_` for each strategy."""
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3],
        encode="ordinal",
        strategy=strategy,
        quantile_method=quantile_method,
    ).fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)
    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        # Each feature gets n_bins + 1 edges.
        assert bin_edges.shape == (n_bins + 1,)
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
def test_kbinsdiscretizer_effect_sample_weight():
    """Check the impact of `sample_weight` on computed quantiles."""
    X = np.array([[-2], [-1], [1], [3], [500], [1000]])
    # add a large number of bins such that each sample with a non-null weight
    # will be used as bin edge
    est = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0])
    # The zero-weight samples (500, 1000) contribute no bin edge.
    assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3])
    assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]])
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
    """Make sure that `sample_weight` is not changed in place."""
    extra_params = {}
    if strategy == "quantile":
        # quantile strategy needs an explicit quantile_method to avoid the
        # FutureWarning about the changing default.
        extra_params["quantile_method"] = "averaged_inverted_cdf"
    est = KBinsDiscretizer(
        n_bins=3, encode="ordinal", strategy=strategy, **extra_params
    )
    sample_weight = np.array([1, 3, 1, 2], dtype=np.float64)
    original_weights = sample_weight.copy()
    est.fit(X, sample_weight=sample_weight)
    # The weights passed in must be byte-for-byte untouched after fitting.
    assert_allclose(sample_weight, original_weights)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """A constant feature collapses to a single bin and transforms to zero."""
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    if strategy == "quantile":
        est = KBinsDiscretizer(
            strategy=strategy,
            n_bins=3,
            encode="ordinal",
            quantile_method="averaged_inverted_cdf",
        )
    else:
        est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
    warning_message = "Feature 0 is constant and will be replaced with 0."
    # Scope the "always" filter so it does not leak into other tests: the
    # original bare warnings.simplefilter("always") mutated process-wide
    # warning state for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        with pytest.warns(UserWarning, match=warning_message):
            est.fit(X)
    # The constant feature is assigned a single bin.
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
    """1-D input must be rejected by both fit and transform."""
    X = np.arange(4)
    # Fitting directly on a 1-D array is invalid.
    with pytest.raises(ValueError):
        KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit(X)
    # Even after fitting on a valid 2-D reshape, transforming 1-D data fails.
    est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
    est.fit(X.reshape(-1, 1))
    with pytest.raises(ValueError):
        est.transform(X)
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Binning is scale-invariant down to nano-scale magnitudes."""
    X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    expected_codes = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
    # Shrink the data by a power of ten; the bin assignment must not change.
    X_scaled = X_init / 10**i
    est = KBinsDiscretizer(
        n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf"
    )
    assert_array_equal(expected_codes, est.fit_transform(X_scaled))
def test_encode_options():
    """The three encodings (ordinal / onehot-dense / onehot) agree pairwise."""
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="ordinal", quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3],
        encode="onehot-dense",
        quantile_method="averaged_inverted_cdf",
    ).fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    # onehot-dense output equals one-hot encoding of the ordinal codes.
    assert_array_equal(
        OneHotEncoder(
            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False
        ).fit_transform(Xt_1),
        Xt_2,
    )
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="onehot", quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    # sparse onehot output matches the dense one-hot of the ordinal codes.
    assert_array_equal(
        OneHotEncoder(
            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True
        )
        .fit_transform(Xt_1)
        .toarray(),
        Xt_3.toarray(),
    )
@pytest.mark.parametrize(
    "strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        (
            "quantile",
            "averaged_inverted_cdf",
            [0, 0, 0, 1, 1, 1],
            [0, 0, 1, 1, 2, 2],
            [0, 1, 2, 3, 4, 4],
        ),
    ],
)
def test_nonuniform_strategies(
    strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins
):
    """Each strategy bins the same skewed data in its own expected way."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
    # with 2 bins
    est = KBinsDiscretizer(
        n_bins=2, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())
    # with 3 bins
    est = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())
    # with 5 bins
    est = KBinsDiscretizer(
        n_bins=5, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_5bins, Xt.ravel())
@pytest.mark.parametrize(
    "strategy, expected_inv,quantile_method",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
            "averaged_inverted_cdf",
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv, quantile_method):
    """inverse_transform maps codes back to the per-strategy bin centers."""
    kbd = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode
    )
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Values outside the fitted range are clipped into the edge bins."""
    X = np.array([0, 1, 2, 3])[:, None]
    params = dict(n_bins=4, strategy=strategy, encode="ordinal")
    if strategy == "quantile":
        # Silence the changing-default FutureWarning for the quantile strategy.
        params["quantile_method"] = "averaged_inverted_cdf"
    kbd = KBinsDiscretizer(**params)
    kbd.fit(X)
    # One value below the fitted range, one above it.
    X_outside = np.array([-2, 5])[:, None]
    Xt_outside = kbd.transform(X_outside)
    # The out-of-range values fall into the last and first bins respectively.
    assert_array_equal(Xt_outside.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt_outside.min(axis=0), [0])
def test_overwrite():
    """fit_transform and inverse_transform must not mutate their inputs."""
    X = np.array([0, 1, 2, 3])[:, None]
    X_original = X.copy()
    est = KBinsDiscretizer(
        n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal"
    )
    Xt = est.fit_transform(X)
    # The raw input is untouched by fit_transform.
    assert_array_equal(X, X_original)
    Xt_original = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    # The transformed codes are untouched by inverse_transform.
    assert_array_equal(Xt, Xt_original)
    # The inverse maps each code to the center of its bin.
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
    "strategy, expected_bin_edges, quantile_method",
    [
        ("quantile", [0, 1.5, 3], "averaged_inverted_cdf"),
        ("kmeans", [0, 1.5, 3], "warn"),
    ],
)
def test_redundant_bins(strategy, expected_bin_edges, quantile_method):
    """Duplicate-heavy data collapses redundant bins with a warning."""
    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None
    )
    warning_message = "Consider decreasing the number of bins."
    with pytest.warns(UserWarning, match=warning_message):
        kbd.fit(X)
    # Only two effective bins remain after removing duplicate edges.
    assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
    """Percentile-based edges stay numerically stable on near-degenerate data."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)
    kbd = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="linear",
    )
    ## TODO: change to averaged inverted cdf, but that means we only get bin
    ## edges of 0.05 and 0.95 and nothing in between
    # Redundant-bin warning is expected: 10 bins on 3 samples.
    warning_message = "Consider decreasing the number of bins."
    with pytest.warns(UserWarning, match=warning_message):
        kbd.fit(X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Output dtype follows the `dtype` parameter, falling back to the input."""
    X_input = np.array(X, dtype=in_dtype)
    kbd = KBinsDiscretizer(
        n_bins=3,
        encode=encode,
        quantile_method="averaged_inverted_cdf",
        dtype=out_dtype,
    )
    kbd.fit(X_input)
    # test output dtype
    if out_dtype is not None:
        expected_dtype = out_dtype
    elif out_dtype is None and X_input.dtype == np.float16:
        # wrong numeric input dtype are cast in np.float64
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype
    Xt = kbd.transform(X_input)
    assert Xt.dtype == expected_dtype
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """float32 and float64 output dtypes produce numerically equal results."""
    # TODO this check is redundant with common checks and can be removed
    # once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)
    outputs = []
    for output_dtype in (np.float32, np.float64):
        kbd = KBinsDiscretizer(
            n_bins=3,
            encode=encode,
            quantile_method="averaged_inverted_cdf",
            dtype=output_dtype,
        )
        kbd.fit(X_input)
        outputs.append(kbd.transform(X_input))
    # 32-bit and 64-bit outputs must match (dense or sparse).
    assert_allclose_dense_sparse(outputs[0], outputs[1])
def test_kbinsdiscretizer_subsample_default():
    """With fewer than 2e5 samples, the default subsample has no effect."""
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    kbd_default = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    kbd_default.fit(X)
    kbd_without_subsampling = clone(kbd_default)
    kbd_without_subsampling.set_params(subsample=None)
    kbd_without_subsampling.fit(X)
    # Edges computed with and without subsampling must match element-wise.
    assert_allclose(kbd_default.bin_edges_[0], kbd_without_subsampling.bin_edges_[0])
    assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
# NOTE(review): "discrtizer" in the name is a typo; kept to preserve the
# public test identifier.
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Check get_feature_names_out for different settings.
    Non-regression test for #22731
    """
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    kbd = KBinsDiscretizer(
        n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt = kbd.transform(X)
    input_features = [f"feat{i}" for i in range(3)]
    output_names = kbd.get_feature_names_out(input_features)
    # One output name per transformed column.
    assert Xt.shape[1] == output_names.shape[0]
    assert_array_equal(output_names, expected_names)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
    """Bin edges with subsampling are close to those computed on all data."""
    # Check that the bin edges are almost the same when subsampling is used.
    X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1
    if strategy == "quantile":
        kbd_subsampling = KBinsDiscretizer(
            strategy=strategy,
            subsample=50000,
            random_state=global_random_seed,
            quantile_method="averaged_inverted_cdf",
        )
    else:
        kbd_subsampling = KBinsDiscretizer(
            strategy=strategy, subsample=50000, random_state=global_random_seed
        )
    kbd_subsampling.fit(X)
    kbd_no_subsampling = clone(kbd_subsampling)
    kbd_no_subsampling.set_params(subsample=None)
    kbd_no_subsampling.fit(X)
    # We use a large tolerance because we can't expect the bin edges to be exactly the
    # same when subsampling is used.
    assert_allclose(
        kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2
    )
def test_quantile_method_future_warnings():
    """The default quantile_method warns about the upcoming 1.9 change."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_warning = (
        "The current default behavior, quantile_method='linear', will be "
        "changed to quantile_method='averaged_inverted_cdf' in "
        "scikit-learn version 1.9 to naturally support sample weight "
        "equivalence properties by default. Pass "
        "quantile_method='averaged_inverted_cdf' explicitly to silence this "
        "warning."
    )
    with pytest.warns(FutureWarning, match=expected_warning):
        KBinsDiscretizer(strategy="quantile").fit(X)
def test_invalid_quantile_method_with_sample_weight():
    """quantile_method='linear' is rejected when sample weights are given."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_msg = (
        "When fitting with strategy='quantile' and sample weights, "
        "quantile_method should either be set to 'averaged_inverted_cdf' or "
        "'inverted_cdf', got quantile_method='linear' instead."
    )
    est = KBinsDiscretizer(strategy="quantile", quantile_method="linear")
    with pytest.raises(ValueError, match=expected_msg):
        est.fit(X, sample_weight=[1, 1, 2, 2])

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,579 @@
import warnings
import numpy as np
import pytest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.utils._testing import (
_convert_container,
assert_allclose_dense_sparse,
assert_array_equal,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
    """transform forwards only X (no extra args/kwargs) to the user's func."""
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))
    assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X,
        "transform should have returned X unchanged",
    )
    # The function should only have received X.
    assert args_store == [X], (
        "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    )
    assert not kwargs_store, (
        "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
    )
    # reset the argument stores.
    args_store[:] = []
    kwargs_store.clear()
    transformed = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    ).transform(X)
    assert_array_equal(
        transformed, X, err_msg="transform should have returned X unchanged"
    )
    # The function should have received X
    assert args_store == [X], (
        "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    )
    assert not kwargs_store, (
        "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
    )
def test_np_log():
    """FunctionTransformer with np.log1p matches calling np.log1p directly."""
    X = np.arange(10).reshape((5, 2))
    transformer = FunctionTransformer(np.log1p)
    assert_array_equal(transformer.transform(X), np.log1p(X))
def test_kw_arg():
    """kw_args supplied at construction are forwarded to func on transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    assert_array_equal(transformer.transform(X), np.around(X, decimals=3))
def test_kw_arg_update():
    """Mutating kw_args in place after construction affects later transforms."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    transformer.kw_args["decimals"] = 1
    assert_array_equal(transformer.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
    """Reassigning kw_args entirely replaces the previous settings."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    transformer = FunctionTransformer(np.around, kw_args={"decimals": 3})
    transformer.kw_args = {"decimals": 1}
    assert_array_equal(transformer.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
    """inverse_transform applies inverse_func with inv_kw_args."""
    X = np.array([1, 4, 9, 16]).reshape((2, 2))
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args={"decimals": 3},
    )
    # sqrt forward, then around(..., decimals=3) on the way back.
    round_trip = transformer.inverse_transform(transformer.transform(X))
    assert_array_equal(round_trip, np.around(np.sqrt(X), decimals=3))
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_check_inverse(sparse_container):
    """check_inverse warns for non-inverse pairs and is silent otherwise."""
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    if sparse_container is not None:
        X = sparse_container(X)
    # sqrt and around are not inverses of each other: expect a warning.
    trans = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )
    with pytest.warns(UserWarning, match=warning_message):
        trans.fit(X)
    # expm1 and log1p are true inverses: fitting must raise no warning.
    trans = FunctionTransformer(
        func=np.expm1,
        inverse_func=np.log1p,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        Xt = trans.fit_transform(X)
    assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
def test_check_inverse_func_or_inverse_not_provided():
    """No inverse check runs when either func or inverse_func is missing."""
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    for forward, backward in [(np.expm1, None), (None, np.expm1)]:
        trans = FunctionTransformer(
            func=forward, inverse_func=backward, check_inverse=True, validate=True
        )
        # Escalate UserWarning to an error: fitting must stay silent.
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.fit(X)
def test_function_transformer_frame():
    """The identity FunctionTransformer preserves pandas DataFrames."""
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(np.random.randn(100, 10))
    result = FunctionTransformer().fit_transform(X_df)
    # A DataFrame (exposes .loc) should come back, not a converted ndarray.
    assert hasattr(result, "loc")
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"
    data = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype)
    def func(X):
        # Forward map: element-wise lookup, output stays object dtype.
        return np.array([mapping[X[i]] for i in range(X.size)], dtype=object)
    def inverse_func(X):
        # Backward map through the inverted dictionary.
        return _convert_container(
            [inverse_mapping[x] for x in X],
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )
    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
# NOTE(review): "nummerical" in the name is a typo; kept to preserve the
# public test identifier.
def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True():
    """Check support for dataframes with only numerical values."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    transformer = FunctionTransformer(
        func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True
    )
    # Does not raise an error
    df_out = transformer.fit_transform(df)
    assert_allclose_dense_sparse(df_out, df + 2)
def test_function_transformer_with_dataframe_and_check_inverse_True():
    """Check an error is raised for mixed-dtype frames when check_inverse=True.
    Non-regression test for gh-25261.
    """
    pd = pytest.importorskip("pandas")
    transformer = FunctionTransformer(
        func=lambda x: x, inverse_func=lambda x: x, check_inverse=True
    )
    # Column "b" is non-numerical, which check_inverse cannot handle.
    df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(df_mixed)
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy inputs, default behavior: generate names
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: use input feature names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input, feature_names_out=callable
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, feature_names_out=callable default input_features
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, feature_names_out=callable default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input, input_features=list of names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input, input_features=list of names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable, input_features=list
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, feature_names_out=callable, input_features=list
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
@pytest.mark.parametrize("validate", [True, False])
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected, validate
):
    """get_feature_names_out covers one-to-one/callable x array/frame inputs."""
    # Dict cases stand in for pandas DataFrames (built lazily to allow skip).
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)
    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=validate
    )
    transformer.fit(X)
    names = transformer.get_feature_names_out(input_features)
    # Names are always returned as an object ndarray.
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def test_function_transformer_get_feature_names_out_without_validation():
    """Feature names pass through one-to-one even when validate=False."""
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    transformer.fit_transform(np.random.rand(100, 2))
    names = transformer.get_feature_names_out(("a", "b"))
    # Names come back as an object ndarray mirroring the given input names.
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))
def test_function_transformer_feature_names_out_is_None():
    """get_feature_names_out is unavailable when feature_names_out is None."""
    transformer = FunctionTransformer()
    transformer.fit_transform(np.random.rand(100, 2))
    expected_msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=expected_msg):
        transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_uses_estimator():
    """A feature_names_out callable can read state from the transformer."""
    def add_n_random_features(X, n):
        # Append `n` random columns to X.
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)
    def feature_names_out(transformer, input_features):
        # Derive the number of generated names from the transformer's kw_args.
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]
    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)
    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""
    def add_constant_feature(X):
        # Append a column of ones, so the output has one extra feature.
        X_one = np.ones((X.shape[0], 1))
        return np.concatenate((X, X_one), axis=1)
    def inverse_add_constant(X):
        # Drop the appended constant column.
        return X[:, :-1]
    X = np.array([[1, 2], [3, 4], [3, 4]])
    trans = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )
    X_trans = trans.fit_transform(X)
    assert trans.n_features_in_ == X.shape[1]
    # inverse_transform sees X.shape[1] + 1 columns but must not overwrite
    # the fitted n_features_in_.
    trans.inverse_transform(X_trans)
    assert trans.n_features_in_ == X.shape[1]
@pytest.mark.parametrize(
    "feature_names_out, expected",
    [
        ("one-to-one", ["pet", "color"]),
        [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]],
    ],
)
@pytest.mark.parametrize("in_pipeline", [True, False])
def test_get_feature_names_out_dataframe_with_string_data(
    feature_names_out, expected, in_pipeline
):
    """Check that get_feature_names_out works with DataFrames with string data."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]})
    def func(X):
        # Rename columns so the output matches what feature_names_out reports.
        if feature_names_out == "one-to-one":
            return X
        else:
            name = feature_names_out(None, X.columns)
            return X.rename(columns=dict(zip(X.columns, name)))
    transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
    if in_pipeline:
        transformer = make_pipeline(transformer)
    X_trans = transformer.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def test_set_output_func():
    """Check behavior of set_output with different settings."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
    ft = FunctionTransformer(np.log, feature_names_out="one-to-one")
    # no warning is raised when feature_names_out is defined
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft.set_output(transform="pandas")
        X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])
    ft = FunctionTransformer(lambda x: 2 * x)
    ft.set_output(transform="pandas")
    # no warning is raised when func returns a panda dataframe
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        X_trans = ft.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])
    # Warning is raised when func returns a ndarray
    ft_np = FunctionTransformer(lambda x: np.asarray(x))
    for transform in ("pandas", "polars"):
        ft_np.set_output(transform=transform)
        msg = (
            f"When `set_output` is configured to be '{transform}'.*{transform} "
            "DataFrame.*"
        )
        with pytest.warns(UserWarning, match=msg):
            ft_np.fit_transform(X)
    # default transform does not warn
    ft_np.set_output(transform="default")
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        ft_np.fit_transform(X)
def test_consistence_column_name_between_steps():
    """Check consistency between the feature names produced by
    `FunctionTransformer` and the feature names expected by the next step in
    the pipeline.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27695
    """
    pd = pytest.importorskip("pandas")
    def with_suffix(_, names):
        # Rename every feature with a "__log" suffix.
        return [name + "__log" for name in names]
    pipeline = make_pipeline(
        FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler()
    )
    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"])
    X_trans = pipeline.fit_transform(df)
    assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"]
    # StandardScaler will convert to a numpy array
    assert isinstance(X_trans, np.ndarray)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"])
def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output):
    """Check that we overwrite the column names when we should."""
    lib = pytest.importorskip(dataframe_lib)
    # Only skip when the output is backed by an actual dataframe library.
    # The previous check compared against "numpy" (a value never
    # parametrized), so importorskip("default") was invoked and silently
    # skipped every transform_output="default" case.
    if transform_output != "default":
        pytest.importorskip(transform_output)
    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
    def with_suffix(_, names):
        # Rename every feature with a "__log" suffix.
        return [name + "__log" for name in names]
    transformer = FunctionTransformer(feature_names_out=with_suffix).set_output(
        transform=transform_output
    )
    X_trans = transformer.fit_transform(df)
    assert_array_equal(np.asarray(X_trans), np.asarray(df))
    # The output columns and get_feature_names_out must both carry the suffix.
    feature_names = transformer.get_feature_names_out()
    assert list(X_trans.columns) == with_suffix(None, df.columns)
    assert feature_names.tolist() == with_suffix(None, df.columns)
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_overwrite_column_names_numerical(feature_names_out):
    """Check the same as `test_function_transformer_overwrite_column_names`
    but for the specific case of pandas where column names can be numerical."""
    pd = pytest.importorskip("pandas")
    # Columns are integers, not strings.
    df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]})
    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    X_trans = transformer.fit_transform(df)
    assert_array_equal(np.asarray(X_trans), np.asarray(df))
    # Output column labels must agree with get_feature_names_out.
    feature_names = transformer.get_feature_names_out()
    assert list(X_trans.columns) == list(feature_names)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_error_column_inconsistent(
    dataframe_lib, feature_names_out
):
    """Check that we raise an error when `func` returns a dataframe with new
    column names that become inconsistent with `get_feature_names_out`."""
    lib = pytest.importorskip(dataframe_lib)
    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
    def func(df):
        # Rename a column so func's output disagrees with feature_names_out.
        if dataframe_lib == "pandas":
            return df.rename(columns={"a": "c"})
        else:
            return df.rename({"a": "c"})
    transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
    err_msg = "The output generated by `func` have different column names"
    with pytest.raises(ValueError, match=err_msg):
        transformer.fit_transform(df).columns

View File

@@ -0,0 +1,748 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn import config_context, datasets
from sklearn.preprocessing._label import (
LabelBinarizer,
LabelEncoder,
MultiLabelBinarizer,
_inverse_binarize_multiclass,
_inverse_binarize_thresholding,
label_binarize,
)
from sklearn.utils._array_api import (
_convert_to_numpy,
_get_namespace_device_dtype_ids,
get_namespace,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
_array_api_for_tests,
assert_array_equal,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _to_object_array
# Shared fixture: the iris dataset, loaded once at module import time.
iris = datasets.load_iris()
def toarray(a):
    """Return `a` densified: sparse inputs go through .toarray(), others pass."""
    return a.toarray() if hasattr(a, "toarray") else a
def test_label_binarizer():
    """Round-trip LabelBinarizer on one-, two- and multi-class dense/sparse."""
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)
    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)
    # inverse_transform also accepts a full two-column indicator matrix.
    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)
    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    got = lb.fit_transform(inp)
    # Classes are stored in lexicographic order.
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
    """Labels absent at fit time are encoded as all-zero rows by transform."""
    binarizer = LabelBinarizer()
    # Fitting on three distinct classes yields the 3x3 identity indicator.
    fitted = binarizer.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted)
    # Unseen labels ("a", "c", "f") map to rows of zeros.
    transformed = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    expected_rows = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    assert_array_equal(expected_rows, transformed)
def test_label_binarizer_set_label_encoding():
    """Custom neg_label/pos_label values appear verbatim in the binarized output."""
    lb = LabelBinarizer(neg_label=-2, pos_label=0)
    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
    lb = LabelBinarizer(neg_label=-2, pos_label=2)
    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("unique_first", [True, False])
def test_label_binarizer_pandas_nullable(dtype, unique_first):
    """Checks that LabelBinarizer works with pandas nullable dtypes.
    Non-regression test for gh-25637.
    """
    pd = pytest.importorskip("pandas")
    y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
    if unique_first:
        # Calling unique creates a pandas array which has a different interface
        # compared to a pandas Series. Specifically, pandas arrays do not have "iloc".
        y_true = y_true.unique()
    lb = LabelBinarizer().fit(y_true)
    # Binary target: a single output column with 1 for the positive class.
    y_out = lb.transform([1, 0])
    assert_array_equal(y_out, [[1], [0]])
def test_label_binarizer_errors():
    """Invalid inputs and parameter combinations raise informative errors."""
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)
    # Legacy sequence-of-sequences multilabel format is rejected.
    multi_label = [(2, 3), (0,), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)
    # Using an unfitted estimator must raise (caught here as ValueError).
    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])
    # neg_label must be strictly smaller than pos_label.
    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    # Sparse output only supports neg_label == 0 with a nonzero pos_label.
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)
    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )
    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_label_binarizer_sparse_errors(csr_container):
    """_inverse_binarize_thresholding validates output_type and class count."""
    # Fail on y_type
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )
    # Fail on the number of classes
    # (3 classes vs 2 columns; this check fires before the format check)
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Fit/transform/inverse_transform round-trip across input dtypes."""
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])
    # Values not present during fit are rejected.
    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
def test_label_encoder_negative_ints():
    """Negative integers sort into classes_ and round-trip correctly."""
    encoder = LabelEncoder().fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    labels = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(encoder.transform(labels), codes)
    assert_array_equal(encoder.inverse_transform(codes), labels)
    # A label never seen during fit must be rejected.
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """Passing a bare string (0-d input) to transform raises a shape error."""
    encoder = LabelEncoder()
    encoder.fit(np.array(["apple", "orange"], dtype=dtype))
    with pytest.raises(ValueError, match="should be a 1d array"):
        encoder.transform("apple")
def test_label_encoder_errors():
    """Unfitted use, unseen labels and scalar input all raise ValueError."""
    # Check that invalid arguments yield ValueError
    le = LabelEncoder()
    with pytest.raises(ValueError):
        le.transform([])
    with pytest.raises(ValueError):
        le.inverse_transform([])
    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2, -3, -4])
    # Fail on inverse_transform("") — a 0-d input is reported with its shape.
    msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform("")
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """transform and inverse_transform on empty input return empty arrays."""
    encoder = LabelEncoder().fit(values)
    empty = np.array([])
    # Empty transform round-trips to an empty result.
    assert_array_equal(empty, encoder.transform([]))
    # Same for the inverse direction.
    assert_array_equal(empty, encoder.inverse_transform([]))
def test_sparse_output_multilabel_binarizer():
    """Sparse and dense MultiLabelBinarizer agree on iterable-of-iterables input."""
    # test input as iterable of iterables
    # (inputs are factories so one-shot iterators can be recreated per use)
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for sparse_output in [True, False]:
        for inp in inputs:
            # With fit_transform
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit_transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
            # With fit
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit(inp()).transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_output_multilabel_binarizer_errors(csr_container):
    """inverse_transform rejects sparse matrices with non-binary entries."""
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(iter([iter((2, 3)), iter((1,)), {1, 2}]))
    # The value 2 makes this indicator matrix non-binary.
    non_binary = csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(non_binary)
def test_multilabel_binarizer():
    """MultiLabelBinarizer round-trips lists, sets and iterators of labels."""
    # test input as iterable of iterables
    # (inputs are factories so one-shot iterators can be recreated per use)
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for inp in inputs:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        got = mlb.fit_transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
        # With fit
        mlb = MultiLabelBinarizer()
        got = mlb.fit(inp()).transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels becomes an all-zero indicator row."""
    labels = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    assert_array_equal(MultiLabelBinarizer().fit_transform(labels), expected)
def test_multilabel_binarizer_unknown_class():
    """Transforming labels unseen at fit warns and ignores the unknown class."""
    mlb = MultiLabelBinarizer()
    y = [[1, 2]]
    Y = np.array([[1, 0], [0, 1]])
    warning_message = "unknown class.* will be ignored"
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    # Bug fix: this result was previously computed but never checked — the
    # unknown labels 4 and 0 must be dropped from the indicator matrix.
    assert_array_equal(matrix, Y)
    # With explicit classes=[1, 2, 3] the output keeps a column for the
    # never-observed class 3 while still ignoring 4 and 0.
    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
    """Explicit `classes` fixes the column order of the indicator matrix."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])
    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])
    # ensure works with extra class: class 4 yields a leading all-zero column
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    assert_array_equal(
        mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
    )
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])
    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    # ensure a ValueError is thrown if given duplicate classes
    err_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
    """Refitting after mutating the `classes` parameter honours the new order."""
    samples = [(2, 3), (1,), (1, 2)]
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    # First fit uses the [1, 3, 2] column order.
    assert_array_equal(
        mlb.fit_transform(samples), np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    )
    # Changing the parameter and refitting switches the columns to [1, 2, 3].
    mlb.classes = [1, 2, 3]
    assert_array_equal(
        mlb.fit_transform(samples), np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    )
def test_multilabel_binarizer_same_length_sequence():
    """Equal-length label sequences are not mistaken for a 2-d array."""
    sequences = [[1], [0], [2]]
    indicator = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    # fit_transform path
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(sequences), indicator)
    assert_array_equal(binarizer.inverse_transform(indicator), sequences)
    # separate fit then transform path
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit(sequences).transform(sequences), indicator)
    assert_array_equal(binarizer.inverse_transform(indicator), sequences)
def test_multilabel_binarizer_non_integer_labels():
    """String and tuple labels are binarized just like integer labels."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    inputs = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    for inp, classes in inputs:
        # fit_transform()
        mlb = MultiLabelBinarizer()
        inp = np.array(inp, dtype=object)
        assert_array_equal(mlb.fit_transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)
        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)
    # dict labels are invalid and raise TypeError
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
    """Duplicate labels within one sample are only counted once."""
    repeated = [(1, 1, 1, 0)]
    result = MultiLabelBinarizer().fit_transform(repeated)
    assert_array_equal(result, np.array([[1, 1]]))
def test_multilabel_binarizer_inverse_validation():
    """inverse_transform validates that input is binary with the right width."""
    inp = [(1, 1, 1, 0)]
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(inp)
    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))
    # The following binary cases are fine, however
    mlb.inverse_transform(np.array([[0, 0]]))
    mlb.inverse_transform(np.array([[1, 1]]))
    mlb.inverse_transform(np.array([[1, 0]]))
    # Wrong shape: the fitted width is 2 columns
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
    """Output column order follows the given `classes` order exactly."""
    # Sorted class order.
    assert_array_equal(
        label_binarize([1, 6], classes=[1, 2, 4, 6]),
        np.array([[1, 0, 0, 0], [0, 0, 0, 1]]),
    )
    # A permuted class order moves the hot columns accordingly.
    assert_array_equal(
        label_binarize([1, 6], classes=[1, 6, 4, 2]),
        np.array([[1, 0, 0, 0], [0, 1, 0, 0]]),
    )
    assert_array_equal(
        label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]),
        np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]),
    )
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Helper asserting label_binarize and LabelBinarizer both yield `expected`.

    Exercises the dense and sparse output paths and checks that inverting the
    binarized matrix recovers `y`. Parameter combinations that sparse output
    does not support (pos_label == 0 or neg_label != 0) must raise ValueError.
    """
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue
        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            # threshold is halfway between the two output label values
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        assert_array_equal(toarray(inversed), toarray(y))
        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
    """Binary targets binarize to a single column holding pos/neg label values."""
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 2
    neg_label = -1
    # Only the second column of the 2-class indicator is kept for binary y.
    expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
    check_binarized_results(y, classes, pos_label, neg_label, expected)
    # Binary case where sparse_output = True will not result in a ValueError
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 3
    neg_label = 0
    expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
    check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
    """Multiclass binarization scales the one-hot indicator by pos_label."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    # pos_label=2 with neg_label=0 yields 2 * identity.
    check_binarized_results(y, classes, 2, 0, 2 * np.eye(3))
    # Sparse output is incompatible with a nonzero neg_label.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=2, sparse_output=True
        )
@pytest.mark.parametrize(
    "arr_type",
    [np.array]
    + COO_CONTAINERS
    + CSC_CONTAINERS
    + CSR_CONTAINERS
    + DOK_CONTAINERS
    + LIL_CONTAINERS,
)
def test_label_binarize_multilabel(arr_type):
    """Multilabel indicators binarize elementwise for dense and sparse inputs."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    # The indicator matrix is simply rescaled by pos_label.
    expected = pos_label * y_ind
    y = arr_type(y_ind)
    check_binarized_results(y, classes, pos_label, neg_label, expected)
    # Sparse output requires neg_label == 0.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
def test_invalid_input_label_binarize():
    """label_binarize rejects inconsistent label/class/parameter combinations."""
    # neg_label must be strictly less than pos_label.
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    # Continuous targets are unsupported.
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    # Indicator input whose width mismatches the number of classes.
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inverse_binarize_multiclass(csr_container):
    """Multiclass inversion copes with negative scores and all-zero rows."""
    scores = csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]])
    recovered = _inverse_binarize_multiclass(scores, np.arange(3))
    assert_array_equal(recovered, np.array([1, 1, 0]))
def test_nan_label_encoder():
    """Check that label encoder encodes nans in transform.

    Non-regression test for #22628.
    """
    encoder = LabelEncoder().fit(["a", "a", "b", np.nan])
    # NaN gets its own class; with classes ["a", "b", nan] its code is 2.
    encoded = encoder.transform([np.nan])
    assert_array_equal(encoded, [2])
@pytest.mark.parametrize(
    "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
)
def test_label_encoders_do_not_have_set_output(encoder):
    """Check that label encoders do not define set_output and work with y as a kwarg.
    Non-regression test for #26854.
    """
    assert not hasattr(encoder, "set_output")
    # fit_transform must accept the target positionally AND as the `y` keyword.
    y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
    y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
    assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)
@pytest.mark.parametrize(
    "array_namespace, device, dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "y",
    [
        np.array([2, 1, 3, 1, 3]),
        np.array([1, 1, 4, 5, -1, 0]),
        np.array([3, 5, 9, 5, 9, 3]),
    ],
)
def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype):
    """LabelEncoder keeps outputs in the input's array namespace and matches numpy."""
    xp = _array_api_for_tests(array_namespace, device)
    xp_y = xp.asarray(y, device=device)
    with config_context(array_api_dispatch=True):
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_label = xp_label.fit(xp_y)
        xp_transformed = xp_label.transform(xp_y)
        xp_inv_transformed = xp_label.inverse_transform(xp_transformed)
        np_label = np_label.fit(y)
        np_transformed = np_label.transform(y)
        # Results must stay in the same array namespace as the input...
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        # ...and agree numerically with the plain numpy code path.
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)
        # Same checks for the fit_transform shortcut.
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_transformed = xp_label.fit_transform(xp_y)
        np_transformed = np_label.fit_transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,714 @@
import re
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
KFold,
ShuffleSplit,
StratifiedKFold,
cross_val_score,
train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
KBinsDiscretizer,
LabelBinarizer,
LabelEncoder,
TargetEncoder,
)
def _encode_target(X_ordinal, y_numeric, n_categories, smooth):
"""Simple Python implementation of target encoding."""
cur_encodings = np.zeros(n_categories, dtype=np.float64)
y_mean = np.mean(y_numeric)
if smooth == "auto":
y_variance = np.var(y_numeric)
for c in range(n_categories):
y_subset = y_numeric[X_ordinal == c]
n_i = y_subset.shape[0]
if n_i == 0:
cur_encodings[c] = y_mean
continue
y_subset_variance = np.var(y_subset)
m = y_subset_variance / y_variance
lambda_ = n_i / (n_i + m)
cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean
return cur_encodings
else: # float
for c in range(n_categories):
y_subset = y_numeric[X_ordinal == c]
current_sum = np.sum(y_subset) + y_mean * smooth
current_cnt = y_subset.shape[0] + smooth
cur_encodings[c] = current_sum / current_cnt
return cur_encodings
@pytest.mark.parametrize(
    "categories, unknown_value",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], 4),
        ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0),
        ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"),
        ("auto", 3),
    ],
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
    """Check encoding for binary and continuous targets.
    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    """
    n_categories = 3
    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_samples = X_train_int_array.shape[0]
    if categories == "auto":
        X_train = X_train_int_array
        X_test = X_test_int_array
    else:
        # Map the ordinal codes onto the custom category values.
        X_train = categories[0][X_train_int_array]
        X_test = categories[0][X_test_int_array]
    # Append a value that never occurs during training.
    X_test = np.concatenate((X_test, [[unknown_value]]))
    data_rng = np.random.RandomState(global_random_seed)
    n_splits = 3
    if target_type == "binary":
        y_numeric = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_numeric]
    else:
        assert target_type == "continuous"
        y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_numeric
    # Shuffle so category blocks are not contiguous within CV folds.
    shuffled_idx = data_rng.permutation(n_samples)
    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_numeric = y_numeric[shuffled_idx]
    # Define our CV splitting strategy
    if target_type == "binary":
        cv = StratifiedKFold(
            n_splits=n_splits, random_state=global_random_seed, shuffle=True
        )
    else:
        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)
    # Compute the expected values using our reference Python implementation of
    # target encoding:
    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)
    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
        X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
            X_train_int_array[test_idx, 0]
        ]
    # Check that we can obtain the same encodings by calling `fit_transform` on
    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
        smooth=smooth,
        categories=categories,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)
    assert target_encoder.target_type_ == target_type
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(target_encoder.encodings_) == 1
    if target_type == "binary":
        assert_array_equal(target_encoder.classes_, target_names)
    else:
        assert target_encoder.classes_ is None
    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_numeric)
    expected_encodings = _encode_target(
        X_train_int_array[:, 0], y_numeric, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)
    # Transform on test data, the last value is unknown so it is encoded as the target
    # mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
    ).reshape(-1, 1)
    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "categories, unknown_values",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], "auto"),
        ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]),
    ],
)
@pytest.mark.parametrize(
    "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])]
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
def test_encoding_multiclass(
    global_random_seed, categories, unknown_values, target_labels, smooth
):
    """Check encoding for multiclass targets.

    The transformed output has one column per (feature, class) pair, ordered
    feature-major: [f0/c0, f0/c1, f0/c2, f1/c0, f1/c1, f1/c2].
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 80
    n_features = 2
    feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples))
    feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples))
    feat_1 = categories[0][feat_1_int]
    feat_2 = categories[0][feat_2_int]
    X_train = np.column_stack((feat_1, feat_2))
    X_train_int = np.column_stack((feat_1_int, feat_2_int))
    categories_ = [[0, 1], [0, 1, 2]]
    n_classes = 3
    y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples))
    y_train = target_labels[y_train_int]
    # One-vs-rest indicator of the target, one column per class.
    y_train_enc = LabelBinarizer().fit_transform(y_train)
    n_splits = 3
    cv = StratifiedKFold(
        n_splits=n_splits, random_state=global_random_seed, shuffle=True
    )
    # Manually compute encodings for cv splits to validate `fit_transform`
    expected_X_fit_transform = np.empty(
        (X_train_int.shape[0], X_train_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            for train_idx, test_idx in cv.split(X_train, y_train):
                y_class = y_train_enc[:, c_idx]
                X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx]
                current_encoding = _encode_target(X_, y_, len(cats), smooth)
                # f_idx: 0, 0, 0, 1, 1, 1
                # c_idx: 0, 1, 2, 0, 1, 2
                # exp_idx: 0, 1, 2, 3, 4, 5
                exp_idx = c_idx + (f_idx * n_classes)
                expected_X_fit_transform[test_idx, exp_idx] = current_encoding[
                    X_train_int[test_idx, f_idx]
                ]
    target_encoder = TargetEncoder(
        smooth=smooth,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)
    assert target_encoder.target_type_ == "multiclass"
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    # Manually compute encoding to validate `transform`
    expected_encodings = []
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            y_class = y_train_enc[:, c_idx]
            current_encoding = _encode_target(
                X_train_int[:, f_idx], y_class, len(cats), smooth
            )
            expected_encodings.append(current_encoding)
    assert len(target_encoder.encodings_) == n_features * n_classes
    for i in range(n_features * n_classes):
        assert_allclose(target_encoder.encodings_[i], expected_encodings[i])
    assert_array_equal(target_encoder.classes_, target_labels)
    # Include unknown values at the end
    X_test_int = np.array([[0, 1], [1, 2], [4, 5]])
    if unknown_values == "auto":
        X_test = X_test_int
    else:
        X_test = np.empty_like(X_test_int[:-1, :], dtype=object)
        for column_idx in range(X_test_int.shape[1]):
            X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]]
        # Add unknown values at end
        X_test = np.vstack((X_test, unknown_values))
    y_mean = np.mean(y_train_enc, axis=0)
    expected_X_test_transform = np.empty(
        (X_test_int.shape[0], X_test_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    n_rows = X_test_int.shape[0]
    f_idx = [0, 0, 0, 1, 1, 1]
    # Last row are unknowns, dealt with later
    for row_idx in range(n_rows - 1):
        for i, enc in enumerate(expected_encodings):
            expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]]
    # Unknowns encoded as target mean for each class
    # `y_mean` contains target mean for each class, thus cycle through mean of
    # each class, `n_features` times
    mean_idx = [0, 1, 2, 0, 1, 2]
    for i in range(n_classes * n_features):
        expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]]
    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "X, categories",
    [
        (
            np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T,  # 3 is unknown
            [[0, 1, 2]],
        ),
        (
            np.array(
                [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object
            ).T,  # snake is unknown
            [["dog", "cat", "cow"]],
        ),
    ],
)
@pytest.mark.parametrize("smooth", [4.0, "auto"])
def test_custom_categories(X, categories, smooth):
    """Custom categories with unknown categories that are not in training data."""
    rng = np.random.RandomState(0)
    y = rng.uniform(low=-10, high=20, size=X.shape[0])
    enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y)
    # The last element is unknown and encoded as the mean
    y_mean = y.mean()
    X_trans = enc.transform(X[-1:])
    assert X_trans[0, 0] == pytest.approx(y_mean)
    assert len(enc.encodings_) == 1
    # custom category that is not in training data
    # (its encoding falls back to the global target mean)
    assert enc.encodings_[0][-1] == pytest.approx(y_mean)
@pytest.mark.parametrize(
    "y, msg",
    [
        ([1, 2, 0, 1], "Found input variables with inconsistent"),
        (
            np.array([[1, 2, 0], [1, 2, 3]]).T,
            "Target type was inferred to be 'multiclass-multioutput'",
        ),
    ],
)
def test_errors(y, msg):
    """Invalid targets (wrong length or multioutput) raise ValueError."""
    X_single_feature = np.array([[1, 0, 1]]).T
    encoder = TargetEncoder()
    with pytest.raises(ValueError, match=msg):
        encoder.fit_transform(X_single_feature, y)
def test_use_regression_target():
    """Check inferred and specified `target_type` on regression target."""
    X = np.array([[0, 1, 0, 1, 0, 1]]).T
    y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0])
    # With few distinct float values, type inference treats y as multiclass,
    # so the stratified CV warns about tiny classes.
    enc = TargetEncoder(cv=2)
    with pytest.warns(
        UserWarning,
        match=re.escape(
            "The least populated class in y has only 1 members, which is less than"
            " n_splits=2."
        ),
    ):
        enc.fit_transform(X, y)
    assert enc.target_type_ == "multiclass"
    # Explicitly passing target_type avoids the misinference.
    enc = TargetEncoder(cv=2, target_type="continuous")
    enc.fit_transform(X, y)
    assert enc.target_type_ == "continuous"
@pytest.mark.parametrize(
    "y, feature_names",
    [
        ([1, 2] * 10, ["A", "B"]),
        ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]),
        (
            ["y1", "y2", "y3"] * 6 + ["y1", "y2"],
            ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"],
        ),
    ],
)
def test_feature_names_out_set_output(y, feature_names):
    """Check TargetEncoder works with set_output."""
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10})
    enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_default.set_output(transform="default")
    enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0)
    enc_pandas.set_output(transform="pandas")
    X_default = enc_default.fit_transform(X_df, y)
    X_pandas = enc_pandas.fit_transform(X_df, y)
    # The pandas output carries the same values and exposes the expected
    # feature names (suffixed per class for multiclass targets) as columns.
    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(enc_pandas.get_feature_names_out(), feature_names)
    assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns)
@pytest.mark.parametrize("to_pandas", [True, False])
@pytest.mark.parametrize("smooth", [1.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"])
def test_multiple_features_quick(to_pandas, smooth, target_type):
    """Check target encoder with multiple features.

    Expected encodings are recomputed manually with `_encode_target` — per CV
    fold for `fit_transform`, on the full data for `transform` — and compared
    against the estimator's output for binary and continuous targets, with
    ndarray and (optionally) pandas inputs.
    """
    X_ordinal = np.array(
        [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64
    )
    # Mirror TargetEncoder's internal CV choice: stratified folds for
    # classification targets, plain KFold for a continuous target.
    if target_type == "binary-str":
        y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    elif target_type == "binary-ints":
        y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    else:
        y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32)
        y_integer = y_train
        cv = KFold(2, random_state=0, shuffle=True)
    y_mean = np.mean(y_integer)
    categories = [[0, 1, 2], [0, 1]]
    X_test = np.array(
        [
            [0, 1],
            [3, 0],  # 3 is unknown
            [1, 10],  # 10 is unknown
        ],
        dtype=np.int64,
    )
    if to_pandas:
        pd = pytest.importorskip("pandas")
        # convert second feature to an object (string) column
        X_train = pd.DataFrame(
            {
                "feat0": X_ordinal[:, 0],
                "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]],
            }
        )
        # "snake" is unknown
        X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]})
    else:
        X_train = X_ordinal
    # Manually compute the expected encoding for fit_transform: each fold's
    # rows are encoded with statistics learned on the complementary fold.
    expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64)
    for f_idx, cats in enumerate(categories):
        for train_idx, test_idx in cv.split(X_ordinal, y_integer):
            X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx]
            current_encoding = _encode_target(X_, y_, len(cats), smooth)
            expected_X_fit_transform[test_idx, f_idx] = current_encoding[
                X_ordinal[test_idx, f_idx]
            ]
    # Manually compute the expected encoding for transform, which uses
    # statistics learned on the full training set (no CV).
    expected_encodings = []
    for f_idx, cats in enumerate(categories):
        current_encoding = _encode_target(
            X_ordinal[:, f_idx], y_integer, len(cats), smooth
        )
        expected_encodings.append(current_encoding)
    # Unknown categories at transform time fall back to the global target mean.
    expected_X_test_transform = np.array(
        [
            [expected_encodings[0][0], expected_encodings[1][1]],
            [y_mean, expected_encodings[1][0]],
            [expected_encodings[0][1], y_mean],
        ],
        dtype=np.float64,
    )
    enc = TargetEncoder(smooth=smooth, cv=2, random_state=0)
    X_fit_transform = enc.fit_transform(X_train, y_train)
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(enc.encodings_) == 2
    for i in range(2):
        assert_allclose(enc.encodings_[i], expected_encodings[i])
    X_test_transform = enc.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
@pytest.mark.parametrize(
    "y, y_mean",
    [
        (np.array([3.4] * 20), 3.4),
        (np.array([0] * 20), 0),
        (np.array(["a"] * 20, dtype=object), 0),
    ],
    ids=["continuous", "binary", "binary-string"],
)
@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0])
def test_constant_target_and_feature(y, y_mean, smooth):
    """Check edge case where both the feature and the target are constant."""
    n_samples = 20
    X = np.ones((n_samples, 1), dtype=np.int64)

    encoder = TargetEncoder(cv=2, smooth=smooth, random_state=0)
    X_encoded = encoder.fit_transform(X, y)

    # A constant feature carries no information: every sample is encoded with
    # the (constant) target mean.
    assert_allclose(X_encoded, np.full((n_samples, 1), y_mean, dtype=np.float64))
    assert encoder.encodings_[0][0] == pytest.approx(y_mean)
    assert encoder.target_mean_ == pytest.approx(y_mean)

    # Unknown categories at transform time also map to the target mean.
    X_test_encoded = encoder.transform(np.array([[1], [0]]))
    assert_allclose(X_test_encoded, np.full((2, 1), y_mean, dtype=np.float64))
def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not(
    global_random_seed,
):
    """A non-informative categorical must stay non-informative after encoding.

    Even when the rows are sorted by the target (a worst case for leakage),
    the shuffled inner CV of `fit_transform` must prevent information about
    `y_train` from leaking into the encoded feature.
    """
    cardinality = 30  # not too large, otherwise we need a very large n_samples
    n_samples = 3000
    rng = np.random.RandomState(global_random_seed)
    y_train = rng.normal(size=n_samples)
    X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1)

    # Sort the rows by y_train to attempt to cause a leak.
    sorter = y_train.argsort()
    y_train = y_train[sorter]
    X_train = X_train[sorter]

    encoder_shuffled = TargetEncoder(shuffle=True, random_state=global_random_seed)
    X_encoded_shuffled = encoder_shuffled.fit_transform(X_train, y_train)
    encoder_unshuffled = TargetEncoder(shuffle=False)
    X_encoded_unshuffled = encoder_unshuffled.fit_transform(X_train, y_train)

    # Check that no information about y_train has leaked into X_train:
    regressor = RandomForestRegressor(
        n_estimators=10, min_samples_leaf=20, random_state=global_random_seed
    )
    cv = ShuffleSplit(n_splits=50, random_state=global_random_seed)

    # It's impossible to learn a good predictive model on the training set
    # when using the original representation X_train or the target encoded
    # representation with shuffled inner CV. For the latter, no information
    # about y_train has inadvertently leaked into the per-fold priors.
    assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1
    shuffled_score = cross_val_score(
        regressor, X_encoded_shuffled, y_train, cv=cv
    ).mean()
    assert shuffled_score < 0.1

    # Without inner CV shuffling, a lot of information about y_train goes into
    # the per-fold y_train.mean() priors: shrinkage is no longer effective and
    # cannot prevent downstream over-fitting.
    unshuffled_score = cross_val_score(
        regressor, X_encoded_unshuffled, y_train, cv=cv
    ).mean()
    assert unshuffled_score > 0.5
def test_smooth_zero():
    """Check edge case with zero smoothing and a category missing from a fold."""
    X = np.array([0] * 5 + [1] * 5, dtype=np.int64).reshape(-1, 1)
    y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0])

    encoder = TargetEncoder(smooth=0.0, shuffle=False, cv=2)
    X_encoded = encoder.fit_transform(X, y)

    # With an unshuffled 2-fold CV, category 0 does not exist in the second
    # half, so the first half's rows are encoded as the second half's mean.
    assert_allclose(X_encoded[0], np.mean(y[5:]))
    # Symmetrically, category 1 does not exist in the first half, so it is
    # encoded as the mean of the first half.
    assert_allclose(X_encoded[-1], np.mean(y[:5]))
@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    # The encoding must not depend on which integers label the categories,
    # only on the grouping they induce. This is a fairly trivial property but
    # it is helpful to understand the following test.
    rng = np.random.RandomState(global_random_seed)

    # A random target and an informative categorical feature keep the test
    # non-trivial when smoothing is active.
    y = rng.normal(size=1000)
    n_categories = 30
    X = KBinsDiscretizer(
        n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal"
    ).fit_transform(y.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    # Relabel the categories through a random permutation of the integers.
    permuted_labels = rng.permutation(n_categories)
    X_train_permuted = permuted_labels[X_train.astype(np.int32)]
    X_test_permuted = permuted_labels[X_test.astype(np.int32)]

    target_encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed)
    X_train_encoded = target_encoder.fit_transform(X_train, y_train)
    X_test_encoded = target_encoder.transform(X_test)

    X_train_permuted_encoded = target_encoder.fit_transform(X_train_permuted, y_train)
    X_test_permuted_encoded = target_encoder.transform(X_test_permuted)

    # Encodings before and after relabeling must coincide exactly.
    assert_allclose(X_train_encoded, X_train_permuted_encoded)
    assert_allclose(X_test_encoded, X_test_permuted_encoded)
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    """Check statistical properties of target-encoded features in a Ridge fit.

    Three categorical features are encoded: one informative for the target,
    one shuffled copy (independent of the target), and one with near-unique
    cardinality. With the internal cross-fitting of ``fit_transform``, the
    downstream linear model should rely only on the informative feature;
    with a plain fit-then-transform on the same data, the high-cardinality
    feature should cause catastrophic overfitting instead.
    """
    # Check some expected statistical properties when fitting a linear
    # regression model on target encoded features depending on their relation
    # with that target.
    # In this test, we use the Ridge class with the "lsqr" solver and a little
    # bit of regularization to implement a linear regression model that
    # converges quickly for large `n_samples` and robustly in case of
    # correlated features. Since we will fit this model on a mean centered
    # target, we do not need to fit an intercept and this will help simplify
    # the analysis with respect to the expected coefficients.
    linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)
    # Construct a random target variable. We need a large number of samples for
    # this test to be stable across all values of the random seed.
    n_samples = 50_000
    rng = np.random.RandomState(global_random_seed)
    y = rng.randn(n_samples)
    # Generate a single informative ordinal feature with medium cardinality.
    # Inject some irreducible noise to make it harder for a multivariate model
    # to identify the informative feature from other pure noise features.
    noise = 0.8 * rng.randn(n_samples)
    n_categories = 100
    X_informative = KBinsDiscretizer(
        n_bins=n_categories,
        encode="ordinal",
        strategy="uniform",
        random_state=rng,
    ).fit_transform((y + noise).reshape(-1, 1))
    # Let's permute the labels to hide the fact that this feature is
    # informative to naive linear regression model trained on the raw ordinal
    # values. As highlighted in the previous test, the target encoding should be
    # invariant to such a permutation.
    permutated_labels = rng.permutation(n_categories)
    X_informative = permutated_labels[X_informative.astype(np.int32)]
    # Generate a shuffled copy of the informative feature to destroy the
    # relationship with the target.
    X_shuffled = rng.permutation(X_informative)
    # Also include a very high cardinality categorical feature that is by
    # itself independent of the target variable: target encoding such a feature
    # without internal cross-validation should cause catastrophic overfitting
    # for the downstream regressor, even with shrinkage. This kind of features
    # typically represents near unique identifiers of samples. In general they
    # should be removed from a machine learning datasets but here we want to
    # study the ability of the default behavior of TargetEncoder to mitigate
    # them automatically.
    X_near_unique_categories = rng.choice(
        int(0.9 * n_samples), size=n_samples, replace=True
    ).reshape(-1, 1)
    # Assemble the dataset and do a train-test split:
    X = np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Let's first check that a linear regression model trained on the raw
    # features underfits because of the meaning-less ordinal encoding of the
    # labels.
    raw_model = linear_regression.fit(X_train, y_train)
    assert raw_model.score(X_train, y_train) < 0.1
    assert raw_model.score(X_test, y_test) < 0.1
    # Now do the same with target encoding using the internal CV mechanism
    # implemented when using fit_transform.
    model_with_cv = make_pipeline(
        TargetEncoder(smooth=smooth, random_state=rng), linear_regression
    ).fit(X_train, y_train)
    # This model should be able to fit the data well and also generalise to the
    # test data (assuming that the binning is fine-grained enough). The R2
    # scores are not perfect because of the noise injected during the
    # generation of the unique informative feature.
    coef = model_with_cv[-1].coef_
    assert model_with_cv.score(X_train, y_train) > 0.5, coef
    assert model_with_cv.score(X_test, y_test) > 0.5, coef
    # The target encoder recovers the linear relationship with slope 1 between
    # the target encoded unique informative predictor and the target. Since the
    # target encoding of the 2 other features is not informative thanks to the
    # use of internal cross-validation, the multivariate linear regressor
    # assigns a coef of 1 to the first feature and 0 to the other 2.
    assert coef[0] == pytest.approx(1, abs=1e-2)
    assert (np.abs(coef[1:]) < 0.2).all()
    # Let's now disable the internal cross-validation by calling fit and then
    # transform separately on the training set:
    target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit(
        X_train, y_train
    )
    X_enc_no_cv_train = target_encoder.transform(X_train)
    X_enc_no_cv_test = target_encoder.transform(X_test)
    model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train)
    # The linear regression model should always overfit because it assigns
    # too much weight to the extremely high cardinality feature relatively to
    # the informative feature. Note that this is the case even when using
    # the empirical Bayes smoothing which is not enough to prevent such
    # overfitting alone.
    coef = model_no_cv.coef_
    assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef
    assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef
    # The model overfits because it assigns too much weight to the high
    # cardinality yet non-informative feature instead of the lower
    # cardinality yet informative feature:
    assert abs(coef[0]) < abs(coef[2])
def test_pandas_copy_on_write():
    """
    Test target-encoder cython code when y is read-only.

    The numpy array underlying df["y"] is read-only when copy-on-write is enabled.
    Non-regression test for gh-27879.
    """
    pd = pytest.importorskip("pandas", minversion="2.0")
    with pd.option_context("mode.copy_on_write", True):
        frame = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]})
        features = frame[["x"]]
        target = frame["y"]
        # Must not raise even though the target's buffer is read-only.
        TargetEncoder(target_type="continuous").fit(features, target)