add read me
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
"""Methods for scaling, centering, normalization, binarization, and more."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from ._data import (
|
||||
Binarizer,
|
||||
KernelCenterer,
|
||||
MaxAbsScaler,
|
||||
MinMaxScaler,
|
||||
Normalizer,
|
||||
PowerTransformer,
|
||||
QuantileTransformer,
|
||||
RobustScaler,
|
||||
StandardScaler,
|
||||
add_dummy_feature,
|
||||
binarize,
|
||||
maxabs_scale,
|
||||
minmax_scale,
|
||||
normalize,
|
||||
power_transform,
|
||||
quantile_transform,
|
||||
robust_scale,
|
||||
scale,
|
||||
)
|
||||
from ._discretization import KBinsDiscretizer
|
||||
from ._encoders import OneHotEncoder, OrdinalEncoder
|
||||
from ._function_transformer import FunctionTransformer
|
||||
from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize
|
||||
from ._polynomial import PolynomialFeatures, SplineTransformer
|
||||
from ._target_encoder import TargetEncoder
|
||||
|
||||
# Public API of the `sklearn.preprocessing` package, kept in lexicographic
# (ASCII) order: classes first, then the function aliases.
__all__ = [
    "Binarizer",
    "FunctionTransformer",
    "KBinsDiscretizer",
    "KernelCenterer",
    "LabelBinarizer",
    "LabelEncoder",
    "MaxAbsScaler",
    "MinMaxScaler",
    "MultiLabelBinarizer",
    "Normalizer",
    "OneHotEncoder",
    "OrdinalEncoder",
    "PolynomialFeatures",
    "PowerTransformer",
    "QuantileTransformer",
    "RobustScaler",
    "SplineTransformer",
    "StandardScaler",
    "TargetEncoder",
    "add_dummy_feature",
    "binarize",
    "label_binarize",
    "maxabs_scale",
    "minmax_scale",
    "normalize",
    "power_transform",
    "quantile_transform",
    "robust_scale",
    "scale",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,258 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from ..utils._typedefs cimport uint8_t, int64_t, intp_t
|
||||
|
||||
ctypedef uint8_t FLAG_t
|
||||
|
||||
# We use the following verbatim block to determine whether the current
|
||||
# platform's compiler supports 128-bit integer values intrinsically.
|
||||
# This should work for GCC and CLANG on 64-bit architectures, but doesn't for
|
||||
# MSVC on any architecture. We prefer to use 128-bit integers when possible
|
||||
# because the intermediate calculations have a non-trivial risk of overflow. It
|
||||
# is, however, very unlikely to come up on an average use case, hence 64-bit
|
||||
# integers (i.e. `long long`) are "good enough" for most common cases. There is
|
||||
# not much we can do to efficiently mitigate the overflow risk on the Windows
|
||||
# platform at this time. Consider this a "best effort" design decision that
|
||||
# could be revisited later in case someone comes up with a safer option that
|
||||
# does not hurt the performance of the common cases.
|
||||
# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations.
|
||||
cdef extern from *:
|
||||
"""
|
||||
#ifdef __SIZEOF_INT128__
|
||||
typedef __int128 LARGEST_INT_t;
|
||||
#elif (__clang__ || __EMSCRIPTEN__) && !__i386__
|
||||
typedef _BitInt(128) LARGEST_INT_t;
|
||||
#else
|
||||
typedef long long LARGEST_INT_t;
|
||||
#endif
|
||||
"""
|
||||
ctypedef long long LARGEST_INT_t
|
||||
|
||||
|
||||
# Determine the size of `LARGEST_INT_t` at runtime.
# Used in `test_sizeof_LARGEST_INT_t`.
def _get_sizeof_LARGEST_INT_t():
    """Return sizeof(LARGEST_INT_t) in bytes (16 iff a 128-bit type was selected
    by the verbatim C block above, 8 for the `long long` fallback)."""
    return sizeof(LARGEST_INT_t)
|
||||
|
||||
|
||||
# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved:
|
||||
# https://github.com/cython/cython/issues/5230
|
||||
ctypedef fused DATA_t:
|
||||
float
|
||||
double
|
||||
int
|
||||
long long
|
||||
# INDEX_{A,B}_t are defined to generate a proper Cartesian product
|
||||
# of types through Cython fused-type expansion.
|
||||
ctypedef fused INDEX_A_t:
|
||||
signed int
|
||||
signed long long
|
||||
ctypedef fused INDEX_B_t:
|
||||
signed int
|
||||
signed long long
|
||||
|
||||
cdef inline int64_t _deg2_column(
    LARGEST_INT_t n_features,
    LARGEST_INT_t i,
    LARGEST_INT_t j,
    FLAG_t interaction_only
) nogil:
    """Compute the index of the column for a degree 2 expansion

    n_features is the dimensionality of the input data, i and j are the indices
    for the columns involved in the expansion.

    Operands are typed `LARGEST_INT_t` (128-bit where available, see the
    verbatim block above) because intermediate products such as
    `n_features * i` can exceed 64 bits even when the final index fits
    in `int64_t`.
    """
    if interaction_only:
        # Closed-form column index with the diagonal (i == j) products
        # excluded; `/` truncates on C integer operands.
        return n_features * i - i * (i + 3) / 2 - 1 + j
    else:
        return n_features * i - i* (i + 1) / 2 + j
|
||||
|
||||
|
||||
cdef inline int64_t _deg3_column(
    LARGEST_INT_t n_features,
    LARGEST_INT_t i,
    LARGEST_INT_t j,
    LARGEST_INT_t k,
    FLAG_t interaction_only
) nogil:
    """Compute the index of the column for a degree 3 expansion

    n_features is the dimensionality of the input data, i, j and k are the indices
    for the columns involved in the expansion.

    Operands are typed `LARGEST_INT_t` because the cubic intermediate terms
    (e.g. `n_features * (n_features * i - i**2)`) can overflow 64 bits even
    when the final index fits in `int64_t`.
    """
    if interaction_only:
        # Closed-form index with repeated-feature terms excluded;
        # `/` truncates on C integer operands.
        return (
            (
                (3 * n_features) * (n_features * i - i**2)
                + i * (i**2 + 11) - (3 * j) * (j + 3)
            ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k
        )
    else:
        return (
            (
                (3 * n_features) * (n_features * i - i**2)
                + i ** 3 - i - (3 * j) * (j + 1)
            ) / 6 + n_features * j + k
        )
|
||||
|
||||
|
||||
def py_calc_expanded_nnz_deg2(n, interaction_only):
    """Arbitrary-precision (pure Python) count of degree-2 expansion terms.

    ``n * (n + 1) // 2`` pairs drawn with replacement from ``n`` non-zero
    features, minus the ``n`` squared terms when ``interaction_only`` is 1.
    """
    pair_count = n * (n + 1) // 2
    return pair_count - interaction_only * n
|
||||
|
||||
|
||||
def py_calc_expanded_nnz_deg3(n, interaction_only):
    """Arbitrary-precision (pure Python) count of degree-3 expansion terms.

    ``n * (n**2 + 3*n + 2) // 6`` triples drawn with replacement from ``n``
    non-zero features, minus ``n**2`` terms when ``interaction_only`` is 1.
    """
    triple_count = n * (n**2 + 3 * n + 2) // 6
    return triple_count - interaction_only * n**2
|
||||
|
||||
|
||||
cpdef int64_t _calc_expanded_nnz(
    LARGEST_INT_t n,
    FLAG_t interaction_only,
    LARGEST_INT_t degree
):
    """
    Calculates the number of non-zero interaction terms generated by the
    non-zero elements of a single row.

    Fast C integer arithmetic is used when `n` is small enough for the
    intermediate products to fit in 64 bits; otherwise the arbitrary-precision
    pure-Python helpers are used.
    """
    # This is the maximum value before the intermediate computation
    # d**2 + d overflows
    # Solution to d**2 + d = maxint64
    # SymPy: solve(x**2 + x - int64_max, x)
    cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499

    # This is the maximum value before the intermediate computation
    # d**3 + 3 * d**2 + 2*d overflows
    # Solution to d**3 + 3 * d**2 + 2*d = maxint64
    # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x)
    cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151

    if degree == 2:
        # Only need to check when not using 128-bit integers
        # NOTE(review): when 128-bit integers ARE available (sizeof == 16)
        # this condition is false and the Python fallback is always taken;
        # the result is still correct, but confirm whether the C fast path
        # was intended there too.
        if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2:
            return n * (n + 1) / 2 - interaction_only * n
        return <int64_t> py_calc_expanded_nnz_deg2(n, interaction_only)
    else:
        # Only need to check when not using 128-bit integers
        if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3:
            return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2
        return <int64_t> py_calc_expanded_nnz_deg3(n, interaction_only)
|
||||
|
||||
cpdef int64_t _calc_total_nnz(
    INDEX_A_t[:] indptr,
    FLAG_t interaction_only,
    int64_t degree,
):
    """
    Calculates the number of non-zero interaction terms generated by the
    non-zero elements across all rows for a single degree.
    """
    cdef int64_t total_nnz=0
    cdef intp_t row_idx
    # `indptr` has n_rows + 1 entries (CSR convention); the difference of
    # consecutive entries is the number of stored values in that row.
    for row_idx in range(len(indptr) - 1):
        total_nnz += _calc_expanded_nnz(
            indptr[row_idx + 1] - indptr[row_idx],
            interaction_only,
            degree
        )
    return total_nnz
|
||||
|
||||
|
||||
cpdef void _csr_polynomial_expansion(
    const DATA_t[:] data,  # IN READ-ONLY
    const INDEX_A_t[:] indices,  # IN READ-ONLY
    const INDEX_A_t[:] indptr,  # IN READ-ONLY
    INDEX_A_t n_features,
    DATA_t[:] result_data,  # OUT
    INDEX_B_t[:] result_indices,  # OUT
    INDEX_B_t[:] result_indptr,  # OUT
    FLAG_t interaction_only,
    FLAG_t degree
):
    """
    Perform a second or third degree polynomial or interaction expansion on a
    compressed sparse row (CSR) matrix. The method used only takes products of
    non-zero features. For a matrix with density :math:`d`, this results in a
    speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of
    the expansion, assuming all rows are of similar density.

    The output arrays must be pre-allocated by the caller (e.g. sized with
    `_calc_total_nnz`); this routine only fills them in.

    Parameters
    ----------
    data : memory view on nd-array
        The "data" attribute of the input CSR matrix.

    indices : memory view on nd-array
        The "indices" attribute of the input CSR matrix.

    indptr : memory view on nd-array
        The "indptr" attribute of the input CSR matrix.

    n_features : int
        The dimensionality of the input CSR matrix.

    result_data : nd-array
        The output CSR matrix's "data" attribute.
        It is modified by this routine.

    result_indices : nd-array
        The output CSR matrix's "indices" attribute.
        It is modified by this routine.

    result_indptr : nd-array
        The output CSR matrix's "indptr" attribute.
        It is modified by this routine.

    interaction_only : int
        0 for a polynomial expansion, 1 for an interaction expansion.

    degree : int
        The degree of the expansion. This must be either 2 or 3.

    References
    ----------
    "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR
    Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
    """

    # Make the arrays that will form the CSR matrix of the expansion.
    cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr

    cdef INDEX_B_t expanded_index=0, num_cols_in_row, col

    with nogil:
        result_indptr[0] = indptr[0]
        for row_i in range(indptr.shape[0]-1):
            row_starts = indptr[row_i]
            row_ends = indptr[row_i + 1]
            num_cols_in_row = 0
            # Enumerate (i, j[, k]) combinations of the row's stored columns.
            # Adding `interaction_only` to the inner range start skips the
            # equal-index (squared/cubed) terms for interaction expansions.
            for i_ptr in range(row_starts, row_ends):
                i = indices[i_ptr]
                for j_ptr in range(i_ptr + interaction_only, row_ends):
                    j = indices[j_ptr]
                    if degree == 2:
                        col = <INDEX_B_t> _deg2_column(
                            n_features,
                            i, j,
                            interaction_only
                        )
                        result_indices[expanded_index] = col
                        result_data[expanded_index] = (
                            data[i_ptr] * data[j_ptr]
                        )
                        expanded_index += 1
                        num_cols_in_row += 1
                    else:
                        # degree == 3
                        for k_ptr in range(j_ptr + interaction_only, row_ends):
                            k = indices[k_ptr]
                            col = <INDEX_B_t> _deg3_column(
                                n_features,
                                i, j, k,
                                interaction_only
                            )
                            result_indices[expanded_index] = col
                            result_data[expanded_index] = (
                                data[i_ptr] * data[j_ptr] * data[k_ptr]
                            )
                            expanded_index += 1
                            num_cols_in_row += 1

            result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row
    return
|
||||
3706
venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py
Normal file
3706
venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,548 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
|
||||
import warnings
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import resample
|
||||
from ..utils._param_validation import Interval, Options, StrOptions
|
||||
from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
|
||||
from ..utils.validation import (
|
||||
_check_feature_names_in,
|
||||
_check_sample_weight,
|
||||
check_array,
|
||||
check_is_fitted,
|
||||
validate_data,
|
||||
)
|
||||
from ._encoders import OneHotEncoder
|
||||
|
||||
|
||||
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
|
||||
"""
|
||||
Bin continuous data into intervals.
|
||||
|
||||
Read more in the :ref:`User Guide <preprocessing_discretization>`.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_bins : int or array-like of shape (n_features,), default=5
|
||||
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
|
||||
|
||||
encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
|
||||
Method used to encode the transformed result.
|
||||
|
||||
- 'onehot': Encode the transformed result with one-hot encoding
|
||||
and return a sparse matrix. Ignored features are always
|
||||
stacked to the right.
|
||||
- 'onehot-dense': Encode the transformed result with one-hot encoding
|
||||
and return a dense array. Ignored features are always
|
||||
stacked to the right.
|
||||
- 'ordinal': Return the bin identifier encoded as an integer value.
|
||||
|
||||
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
|
||||
Strategy used to define the widths of the bins.
|
||||
|
||||
- 'uniform': All bins in each feature have identical widths.
|
||||
- 'quantile': All bins in each feature have the same number of points.
|
||||
- 'kmeans': Values in each bin have the same nearest center of a 1D
|
||||
k-means cluster.
|
||||
|
||||
For an example of the different strategies see:
|
||||
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.
|
||||
|
||||
quantile_method : {"inverted_cdf", "averaged_inverted_cdf",
|
||||
"closest_observation", "interpolated_inverted_cdf", "hazen",
|
||||
"weibull", "linear", "median_unbiased", "normal_unbiased"},
|
||||
default="linear"
|
||||
Method to pass on to np.percentile calculation when using
|
||||
strategy="quantile". Only `averaged_inverted_cdf` and `inverted_cdf`
|
||||
support the use of `sample_weight != None` when subsampling is not
|
||||
active.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
dtype : {np.float32, np.float64}, default=None
|
||||
The desired data-type for the output. If None, output dtype is
|
||||
consistent with input dtype. Only np.float32 and np.float64 are
|
||||
supported.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
subsample : int or None, default=200_000
|
||||
Maximum number of samples, used to fit the model, for computational
|
||||
efficiency.
|
||||
`subsample=None` means that all the training samples are used when
|
||||
computing the quantiles that determine the binning thresholds.
|
||||
Since quantile computation relies on sorting each column of `X` and
|
||||
that sorting has an `n log(n)` time complexity,
|
||||
it is recommended to use subsampling on datasets with a
|
||||
very large number of samples.
|
||||
|
||||
.. versionchanged:: 1.3
|
||||
The default value of `subsample` changed from `None` to `200_000` when
|
||||
`strategy="quantile"`.
|
||||
|
||||
.. versionchanged:: 1.5
|
||||
The default value of `subsample` changed from `None` to `200_000` when
|
||||
`strategy="uniform"` or `strategy="kmeans"`.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation for subsampling.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See the `subsample` parameter for more details.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
Attributes
|
||||
----------
|
||||
bin_edges_ : ndarray of ndarray of shape (n_features,)
|
||||
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
|
||||
Ignored features will have empty arrays.
|
||||
|
||||
n_bins_ : ndarray of shape (n_features,), dtype=np.int64
|
||||
Number of bins per feature. Bins whose width are too small
|
||||
(i.e., <= 1e-8) are removed with a warning.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
Binarizer : Class used to bin values as ``0`` or
|
||||
``1`` based on a parameter ``threshold``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
For a visualization of discretization on different datasets refer to
|
||||
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
|
||||
On the effect of discretization on linear models see:
|
||||
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.
|
||||
|
||||
In bin edges for feature ``i``, the first and last values are used only for
|
||||
``inverse_transform``. During transform, bin edges are extended to::
|
||||
|
||||
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
|
||||
|
||||
You can combine ``KBinsDiscretizer`` with
|
||||
:class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
|
||||
part of the features.
|
||||
|
||||
``KBinsDiscretizer`` might produce constant features (e.g., when
|
||||
``encode = 'onehot'`` and certain bins do not contain any data).
|
||||
These features can be removed with feature selection algorithms
|
||||
(e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.preprocessing import KBinsDiscretizer
|
||||
>>> X = [[-2, 1, -4, -1],
|
||||
... [-1, 2, -3, -0.5],
|
||||
... [ 0, 3, -2, 0.5],
|
||||
... [ 1, 4, -1, 2]]
|
||||
>>> est = KBinsDiscretizer(
|
||||
... n_bins=3, encode='ordinal', strategy='uniform'
|
||||
... )
|
||||
>>> est.fit(X)
|
||||
KBinsDiscretizer(...)
|
||||
>>> Xt = est.transform(X)
|
||||
>>> Xt # doctest: +SKIP
|
||||
array([[ 0., 0., 0., 0.],
|
||||
[ 1., 1., 1., 0.],
|
||||
[ 2., 2., 2., 1.],
|
||||
[ 2., 2., 2., 2.]])
|
||||
|
||||
Sometimes it may be useful to convert the data back into the original
|
||||
feature space. The ``inverse_transform`` function converts the binned
|
||||
data into the original feature space. Each value will be equal to the mean
|
||||
of the two bin edges.
|
||||
|
||||
>>> est.bin_edges_[0]
|
||||
array([-2., -1., 0., 1.])
|
||||
>>> est.inverse_transform(Xt)
|
||||
array([[-1.5, 1.5, -3.5, -0.5],
|
||||
[-0.5, 2.5, -2.5, -0.5],
|
||||
[ 0.5, 3.5, -1.5, 0.5],
|
||||
[ 0.5, 3.5, -1.5, 1.5]])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
|
||||
"encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
|
||||
"strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
|
||||
"quantile_method": [
|
||||
StrOptions(
|
||||
{
|
||||
"warn",
|
||||
"inverted_cdf",
|
||||
"averaged_inverted_cdf",
|
||||
"closest_observation",
|
||||
"interpolated_inverted_cdf",
|
||||
"hazen",
|
||||
"weibull",
|
||||
"linear",
|
||||
"median_unbiased",
|
||||
"normal_unbiased",
|
||||
}
|
||||
)
|
||||
],
|
||||
"dtype": [Options(type, {np.float64, np.float32}), None],
|
||||
"subsample": [Interval(Integral, 1, None, closed="left"), None],
|
||||
"random_state": ["random_state"],
|
||||
}
|
||||
|
||||
    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        quantile_method="warn",
        dtype=None,
        subsample=200_000,
        random_state=None,
    ):
        # Hyper-parameters are stored as-is (scikit-learn convention);
        # validation happens in `fit` via `_parameter_constraints` and the
        # `@_fit_context` decorator.
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.quantile_method = quantile_method
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state
|
||||
|
||||
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Computes `bin_edges_` (one edge array per feature) and `n_bins_`
        according to `strategy`, and pre-fits the internal one-hot encoder
        when `encode` requests one.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.

            .. versionadded:: 1.3

            .. versionchanged:: 1.7
                Added support for strategy="uniform".

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X, dtype="numeric")

        # `output_dtype` is only used for the one-hot encoder created below.
        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        if self.subsample is not None and n_samples > self.subsample:
            # Take a subsample of `X`
            # When resampling, it is important to subsample **with replacement** to
            # preserve the distribution, in particular in the presence of a few data
            # points with large weights. You can check this by setting `replace=False`
            # in sklearn.utils.test.test_indexing.test_resample_weighted and check that
            # it fails as a justification for this claim.
            X = resample(
                X,
                replace=True,
                n_samples=self.subsample,
                random_state=self.random_state,
                sample_weight=sample_weight,
            )
            # Since we already used the weights when resampling when provided,
            # we set them back to `None` to avoid accounting for the weights twice
            # in subsequent operations to compute weight-aware bin edges with
            # quantiles or k-means.
            sample_weight = None

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        # One edge array per feature; dtype=object because per-feature edge
        # arrays may end up with different lengths after pruning below.
        bin_edges = np.zeros(n_features, dtype=object)

        # TODO(1.9): remove and switch to quantile_method="averaged_inverted_cdf"
        # by default.
        quantile_method = self.quantile_method
        if self.strategy == "quantile" and quantile_method == "warn":
            warnings.warn(
                "The current default behavior, quantile_method='linear', will be "
                "changed to quantile_method='averaged_inverted_cdf' in "
                "scikit-learn version 1.9 to naturally support sample weight "
                "equivalence properties by default. Pass "
                "quantile_method='averaged_inverted_cdf' explicitly to silence this "
                "warning.",
                FutureWarning,
            )
            quantile_method = "linear"

        if (
            self.strategy == "quantile"
            and quantile_method not in ["inverted_cdf", "averaged_inverted_cdf"]
            and sample_weight is not None
        ):
            raise ValueError(
                "When fitting with strategy='quantile' and sample weights, "
                "quantile_method should either be set to 'averaged_inverted_cdf' or "
                f"'inverted_cdf', got quantile_method='{quantile_method}' instead."
            )

        if self.strategy != "quantile" and sample_weight is not None:
            # Prepare a mask to filter out zero-weight samples when extracting
            # the min and max values of each columns which are needed for the
            # "uniform" and "kmeans" strategies.
            nnz_weight_mask = sample_weight != 0
        else:
            # Otherwise, all samples are used. Use a slice to avoid creating a
            # new array.
            nnz_weight_mask = slice(None)

        for jj in range(n_features):
            column = X[:, jj]
            col_min = column[nnz_weight_mask].min()
            col_max = column[nnz_weight_mask].max()

            # Constant feature: a single degenerate bin covering everything.
            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                percentile_levels = np.linspace(0, 100, n_bins[jj] + 1)

                # method="linear" is the implicit default for any numpy
                # version. So we keep it version independent in that case by
                # using an empty param dict.
                percentile_kwargs = {}
                if quantile_method != "linear" and sample_weight is None:
                    percentile_kwargs["method"] = quantile_method

                if sample_weight is None:
                    bin_edges[jj] = np.asarray(
                        np.percentile(column, percentile_levels, **percentile_kwargs),
                        dtype=np.float64,
                    )
                else:
                    # TODO: make _weighted_percentile and
                    # _averaged_weighted_percentile accept an array of
                    # quantiles instead of calling it multiple times and
                    # sorting the column multiple times as a result.
                    percentile_func = {
                        "inverted_cdf": _weighted_percentile,
                        "averaged_inverted_cdf": _averaged_weighted_percentile,
                    }[quantile_method]
                    bin_edges[jj] = np.asarray(
                        [
                            percentile_func(column, sample_weight, percentile_rank=p)
                            for p in percentile_levels
                        ],
                        dtype=np.float64,
                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width are too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width are too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self
|
||||
|
||||
def _validate_n_bins(self, n_features):
|
||||
"""Returns n_bins_, the number of bins per feature."""
|
||||
orig_bins = self.n_bins
|
||||
if isinstance(orig_bins, Integral):
|
||||
return np.full(n_features, orig_bins, dtype=int)
|
||||
|
||||
n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)
|
||||
|
||||
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
|
||||
raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
|
||||
|
||||
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
|
||||
|
||||
violating_indices = np.where(bad_nbins_value)[0]
|
||||
if violating_indices.shape[0] > 0:
|
||||
indices = ", ".join(str(i) for i in violating_indices)
|
||||
raise ValueError(
|
||||
"{} received an invalid number "
|
||||
"of bins at indices {}. Number of bins "
|
||||
"must be at least 2, and must be an int.".format(
|
||||
KBinsDiscretizer.__name__, indices
|
||||
)
|
||||
)
|
||||
return n_bins
|
||||
|
||||
    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            # Outermost edges are dropped, so values below the first / above
            # the last edge still map to the first / last bin (the edges act
            # as -inf/+inf during transform, cf. the class Notes section).
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            # Temporarily align the encoder's output dtype with the data.
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc
|
||||
|
||||
    def inverse_transform(self, X):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        X_original : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """

        check_is_fitted(self)

        if "onehot" in self.encode:
            # First recover ordinal bin identifiers from the one-hot encoding.
            X = self._encoder.inverse_transform(X)

        Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            # Each bin identifier maps back to the midpoint of its bin edges.
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv
|
||||
|
||||
    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # `_encoder` only exists when a one-hot encoding was fitted; delegate
        # naming to it in that case.
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features
|
||||
1698
venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py
Normal file
1698
venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,449 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils._param_validation import StrOptions
|
||||
from ..utils._repr_html.estimator import _VisualBlock
|
||||
from ..utils._set_output import (
|
||||
_get_adapter_from_container,
|
||||
_get_output_config,
|
||||
)
|
||||
from ..utils.metaestimators import available_if
|
||||
from ..utils.validation import (
|
||||
_allclose_dense_sparse,
|
||||
_check_feature_names,
|
||||
_check_feature_names_in,
|
||||
_check_n_features,
|
||||
_get_feature_names,
|
||||
_is_pandas_df,
|
||||
_is_polars_df,
|
||||
check_array,
|
||||
validate_data,
|
||||
)
|
||||
|
||||
|
||||
def _identity(X):
|
||||
"""The identity function."""
|
||||
return X
|
||||
|
||||
|
||||
class FunctionTransformer(TransformerMixin, BaseEstimator):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X (and optionally y) arguments to a
    user-defined function or function object and returns the result of this
    function. This is useful for stateless transformations such as taking the
    log of frequencies, doing custom scaling, etc.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <function_transformer>`.

    Parameters
    ----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.

    inverse_func : callable, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.

    validate : bool, default=False
        Indicate that the input X array should be checked before calling
        ``func``. The possibilities are:

        - If False, there is no input validation.
        - If True, then X will be converted to a 2-dimensional NumPy array or
          sparse matrix. If the conversion is not possible an exception is
          raised.

        .. versionchanged:: 0.22
           The default of ``validate`` changed from True to False.

    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled.

        .. versionadded:: 0.20

    feature_names_out : callable, 'one-to-one' or None, default=None
        Determines the list of feature names that will be returned by the
        `get_feature_names_out` method. If it is 'one-to-one', then the output
        feature names will be equal to the input feature names. If it is a
        callable, then it must take two positional arguments: this
        `FunctionTransformer` (`self`) and an array-like of input feature names
        (`input_features`). It must return an array-like of output feature
        names. The `get_feature_names_out` method is only defined if
        `feature_names_out` is not None.

        See ``get_feature_names_out`` for more details.

        .. versionadded:: 1.1

    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.

        .. versionadded:: 0.18

    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.

        .. versionadded:: 0.18

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MaxAbsScaler : Scale each feature by its maximum absolute value.
    StandardScaler : Standardize features by removing the mean and
        scaling to unit variance.
    LabelBinarizer : Binarize labels in a one-vs-all fashion.
    MultiLabelBinarizer : Transform between iterable of iterables
        and a multilabel format.

    Notes
    -----
    If `func` returns an output with a `columns` attribute, then the columns is enforced
    to be consistent with the output of `get_feature_names_out`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import FunctionTransformer
    >>> transformer = FunctionTransformer(np.log1p)
    >>> X = np.array([[0, 1], [2, 3]])
    >>> transformer.transform(X)
    array([[0.    , 0.6931],
           [1.0986, 1.3862]])
    """

    _parameter_constraints: dict = {
        "func": [callable, None],
        "inverse_func": [callable, None],
        "validate": ["boolean"],
        "accept_sparse": ["boolean"],
        "check_inverse": ["boolean"],
        "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
        "kw_args": [dict, None],
        "inv_kw_args": [dict, None],
    }

    def __init__(
        self,
        func=None,
        inverse_func=None,
        *,
        validate=False,
        accept_sparse=False,
        check_inverse=True,
        feature_names_out=None,
        kw_args=None,
        inv_kw_args=None,
    ):
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.feature_names_out = feature_names_out
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args

    def _check_input(self, X, *, reset):
        """Validate `X` if requested, and record input metadata on `fit`."""
        if self.validate:
            return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
        elif reset:
            # Set feature_names_in_ and n_features_in_ even if validate=False
            # We run this only when reset==True to store the attributes but not
            # validate them, because validate=False
            _check_n_features(self, X, reset=reset)
            _check_feature_names(self, X, reset=reset)
        return X

    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are the inverse."""
        # Round-trip only a subsample (at most ~100 rows) to keep the check cheap.
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

        if hasattr(X, "dtype"):
            dtypes = [X.dtype]
        elif hasattr(X, "dtypes"):
            # Dataframes can have multiple dtypes
            dtypes = X.dtypes
        # NOTE(review): if `X` has neither `dtype` nor `dtypes`, `dtypes` is
        # unbound below — presumably callers always pass array/dataframe-like
        # inputs here; confirm upstream validation guarantees this.

        # Not all dtypes are numpy dtypes, they can be pandas dtypes as well
        if not all(
            isinstance(d, np.dtype) and np.issubdtype(d, np.number) for d in dtypes
        ):
            raise ValueError(
                "'check_inverse' is only supported when all the elements in `X` is"
                " numerical."
            )

        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn(
                (
                    "The provided functions are not strictly"
                    " inverse of each other. If you are sure you"
                    " want to proceed regardless, set"
                    " 'check_inverse=False'."
                ),
                UserWarning,
            )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FunctionTransformer class instance.
        """
        X = self._check_input(X, reset=True)
        # The round-trip check only makes sense when both directions are
        # user-supplied; an identity on either side would trivially pass.
        if self.check_inverse and not (self.func is None or self.inverse_func is None):
            self._check_inverse_transform(X)
        return self

    def transform(self, X):
        """Transform X using the forward function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        X = self._check_input(X, reset=False)
        out = self._transform(X, func=self.func, kw_args=self.kw_args)
        output_config = _get_output_config("transform", self)["dense"]

        if hasattr(out, "columns") and self.feature_names_out is not None:
            # check the consistency between the column provided by `transform` and
            # the column names provided by `get_feature_names_out`.
            feature_names_out = self.get_feature_names_out()
            if list(out.columns) != list(feature_names_out):
                # we can override the column names of the output if it is inconsistent
                # with the column names provided by `get_feature_names_out` in the
                # following cases:
                # * `func` preserved the column names between the input and the output
                # * the input column names are all numbers
                # * the output is requested to be a DataFrame (pandas or polars)
                feature_names_in = getattr(
                    X, "feature_names_in_", _get_feature_names(X)
                )
                same_feature_names_in_out = feature_names_in is not None and list(
                    feature_names_in
                ) == list(out.columns)
                not_all_str_columns = not all(
                    isinstance(col, str) for col in out.columns
                )
                if same_feature_names_in_out or not_all_str_columns:
                    adapter = _get_adapter_from_container(out)
                    out = adapter.create_container(
                        X_output=out,
                        X_original=out,
                        columns=feature_names_out,
                        inplace=False,
                    )
                else:
                    raise ValueError(
                        "The output generated by `func` have different column names "
                        "than the ones provided by `get_feature_names_out`. "
                        f"Got output with columns names: {list(out.columns)} and "
                        "`get_feature_names_out` returned: "
                        f"{list(self.get_feature_names_out())}. "
                        "The column names can be overridden by setting "
                        "`set_output(transform='pandas')` or "
                        "`set_output(transform='polars')` such that the column names "
                        "are set to the names provided by `get_feature_names_out`."
                    )

        if self.feature_names_out is None:
            # Without `feature_names_out` we cannot wrap `out` into the
            # requested container ourselves; warn instead of silently
            # ignoring the `set_output` configuration.
            warn_msg = (
                "When `set_output` is configured to be '{0}', `func` should return "
                "a {0} DataFrame to follow the `set_output` API or `feature_names_out`"
                " should be defined."
            )
            if output_config == "pandas" and not _is_pandas_df(out):
                warnings.warn(warn_msg.format("pandas"))
            elif output_config == "polars" and not _is_polars_df(out):
                warnings.warn(warn_msg.format("polars"))

        return out

    def inverse_transform(self, X):
        """Transform X using the inverse function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `inverse_func` can handle
            Input array.

        Returns
        -------
        X_original : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        if self.validate:
            X = check_array(X, accept_sparse=self.accept_sparse)
        return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)

    @available_if(lambda self: self.feature_names_out is not None)
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        This method is only defined if `feature_names_out` is not None.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names.

            - If `input_features` is None, then `feature_names_in_` is
              used as the input feature names. If `feature_names_in_` is not
              defined, then names are generated:
              `[x0, x1, ..., x(n_features_in_ - 1)]`.
            - If `input_features` is array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

            - If `feature_names_out` is 'one-to-one', the input feature names
              are returned (see `input_features` above). This requires
              `feature_names_in_` and/or `n_features_in_` to be defined, which
              is done automatically if `validate=True`. Alternatively, you can
              set them in `func`.
            - If `feature_names_out` is a callable, then it is called with two
              arguments, `self` and `input_features`, and its return value is
              returned by this method.
        """
        # `input_features` is only validated when fit metadata is available or
        # the caller provides explicit names; otherwise it is forwarded as-is
        # (possibly None) to a callable `feature_names_out`.
        if hasattr(self, "n_features_in_") or input_features is not None:
            input_features = _check_feature_names_in(self, input_features)
        if self.feature_names_out == "one-to-one":
            names_out = input_features
        elif callable(self.feature_names_out):
            names_out = self.feature_names_out(self, input_features)
        else:
            raise ValueError(
                f"feature_names_out={self.feature_names_out!r} is invalid. "
                'It must either be "one-to-one" or a callable with two '
                "arguments: the function transformer and an array-like of "
                "input feature names. The callable must return an array-like "
                "of output feature names."
            )
        return np.asarray(names_out, dtype=object)

    def _transform(self, X, func=None, kw_args=None):
        """Apply `func` (or the identity when None) to `X` with `kw_args`."""
        if func is None:
            func = _identity

        return func(X, **(kw_args if kw_args else {}))

    def __sklearn_is_fitted__(self):
        """Return True since FunctionTransfomer is stateless."""
        return True

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # Without validation, any input type is acceptable to `func`.
        tags.no_validation = not self.validate
        tags.requires_fit = False
        tags.input_tags.sparse = not self.validate or self.accept_sparse
        return tags

    def set_output(self, *, transform=None):
        """Set output container.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        Parameters
        ----------
        transform : {"default", "pandas", "polars"}, default=None
            Configure output of `transform` and `fit_transform`.

            - `"default"`: Default output format of a transformer
            - `"pandas"`: DataFrame output
            - `"polars"`: Polars output
            - `None`: Transform configuration is unchanged

            .. versionadded:: 1.4
                `"polars"` option was added.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        if not hasattr(self, "_sklearn_output_config"):
            self._sklearn_output_config = {}

        self._sklearn_output_config["transform"] = transform
        return self

    def _get_function_name(self):
        """Get the name display of the `func` used in HTML representation."""
        if hasattr(self.func, "__name__"):
            return self.func.__name__
        if isinstance(self.func, partial):
            # Unwrap `functools.partial` to show the wrapped function's name.
            return self.func.func.__name__
        return f"{self.func.__class__.__name__}(...)"

    def _sk_visual_block_(self):
        return _VisualBlock(
            "single",
            self,
            names=self._get_function_name(),
            name_details=str(self),
            name_caption="FunctionTransformer",
            doc_link_label="FunctionTransformer",
        )
|
||||
@@ -0,0 +1,963 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import array
|
||||
import itertools
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import column_or_1d
|
||||
from ..utils._array_api import device, get_namespace, xpx
|
||||
from ..utils._encode import _encode, _unique
|
||||
from ..utils._param_validation import Interval, validate_params
|
||||
from ..utils.multiclass import type_of_target, unique_labels
|
||||
from ..utils.sparsefuncs import min_max_axis
|
||||
from ..utils.validation import _num_samples, check_array, check_is_fitted
|
||||
|
||||
# Public API of this module; star imports and documentation tooling rely on it.
__all__ = [
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
    "label_binarize",
]
|
||||
|
||||
|
||||
class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels with value between 0 and n_classes-1.

    This transformer should be used to encode target values, *i.e.* `y`, and
    not the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')]
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')]
    """

    def fit(self, y):
        """Fit label encoder.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        y = column_or_1d(y, warn=True)
        # `_unique` returns the sorted unique labels; their positions define
        # the integer encoding used by `transform`.
        self.classes_ = _unique(y)
        return self

    def fit_transform(self, y):
        """Fit label encoder and return encoded labels.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        y = column_or_1d(y, warn=True)
        # Single pass: unique labels and the inverse mapping (the encoding).
        self.classes_, y = _unique(y, return_inverse=True)
        return y

    def transform(self, y):
        """Transform labels to normalized encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        xp, _ = get_namespace(y)
        y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return xp.asarray([])

        return _encode(y, uniques=self.classes_)

    def inverse_transform(self, y):
        """Transform labels back to original encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y_original : ndarray of shape (n_samples,)
            Original encoding.
        """
        check_is_fitted(self)
        xp, _ = get_namespace(y)
        y = column_or_1d(y, warn=True)
        # inverse transform of empty array is empty array
        if _num_samples(y) == 0:
            return xp.asarray([])

        # Reject any encoded value outside [0, n_classes) before indexing.
        diff = xpx.setdiff1d(
            y,
            xp.arange(self.classes_.shape[0], device=device(y)),
            xp=xp,
        )
        if diff.shape[0]:
            raise ValueError("y contains previously unseen labels: %s" % str(diff))
        y = xp.asarray(y)
        return xp.take(self.classes_, y, axis=0)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.array_api_support = True
        # Labels are 1-d targets, not a 2-d feature matrix.
        tags.input_tags.two_d_array = False
        tags.target_tags.one_d_labels = True
        return tags
|
||||
|
||||
|
||||
class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    At learning time, this simply consists in learning one regressor
    or binary classifier per class. In doing so, one needs to convert
    multi-class labels to binary labels (belong or does not belong
    to the class). `LabelBinarizer` makes this process easy with the
    transform method.

    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. `LabelBinarizer` makes this easy
    with the :meth:`inverse_transform` method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if self.neg_label >= self.pos_label:
            raise ValueError(
                f"neg_label={self.neg_label} must be strictly less than "
                f"pos_label={self.pos_label}."
            )

        # CSR storage only represents non-zero entries, so sparse output
        # requires neg_label == 0 and a non-zero pos_label.
        if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
            )

        self.y_type_ = type_of_target(y, input_name="y")

        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)

        # Remember sparseness so inverse_transform can restore the input format.
        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        return self.fit(y).transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        check_is_fitted(self)

        # Multilabel input is only valid if the binarizer was fitted on
        # multilabel targets (the class set would not match otherwise).
        y_is_multilabel = type_of_target(y).startswith("multilabel")
        if y_is_multilabel and not self.y_type_.startswith("multilabel"):
            raise ValueError("The object was not fitted with multilabel input.")

        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y_original : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)

        if threshold is None:
            # Midpoint between the two encoded label values.
            threshold = (self.pos_label + self.neg_label) / 2.0

        if self.y_type_ == "multiclass":
            # Multiclass: pick the column with the greatest score per row.
            y_inv = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            # Binary / multilabel: threshold each score independently.
            y_inv = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )

        # Mirror the sparseness of the data seen at fit time.
        if self.sparse_input_:
            y_inv = sp.csr_matrix(y_inv)
        elif sp.issparse(y_inv):
            y_inv = y_inv.toarray()

        return y_inv

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # Labels are 1-d targets, not a 2-d feature matrix.
        tags.input_tags.two_d_array = False
        tags.target_tags.one_d_labels = True
        return tags
|
||||
|
||||
|
||||
@validate_params(
    {
        "y": ["array-like", "sparse matrix"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.

    Parameters
    ----------
    y : array-like or sparse matrix
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if not isinstance(y, list):
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )
    else:
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
    if neg_label >= pos_label:
        raise ValueError(
            "neg_label={0} must be strictly less than pos_label={1}.".format(
                neg_label, pos_label
            )
        )

    # CSR output can only represent implicit zeros as the negative label,
    # hence the restriction to neg_label == 0 and a non-zero pos_label.
    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            "pos_label={0} and neg_label={1}"
            "".format(pos_label, neg_label)
        )

    # To account for pos_label == 0 in the dense case
    pos_switch = pos_label == 0
    if pos_switch:
        # Temporarily encode positives with a value distinct from 0 and
        # neg_label; swapped back to 0 after the dense relabeling below.
        pos_label = -neg_label

    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if y_type == "binary":
        if n_classes == 1:
            # Degenerate single-class case: every sample is "negative".
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            else:
                Y = np.zeros((len(y), 1), dtype=int)
                Y += neg_label
                return Y
        elif len(classes) >= 3:
            # More classes requested than present in y: treat as multiclass.
            y_type = "multiclass"

    sorted_class = np.sort(classes)
    if y_type == "multilabel-indicator":
        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != y_n_classes:
            raise ValueError(
                "classes {0} mismatch with the labels {1} found in the data".format(
                    classes, unique_labels(y)
                )
            )

    if y_type in ("binary", "multiclass"):
        y = column_or_1d(y)

        # pick out the known labels from y
        y_in_classes = np.isin(y, classes)
        y_seen = y[y_in_classes]
        indices = np.searchsorted(sorted_class, y_seen)
        # One stored entry per sample whose label is known; samples with an
        # unknown label contribute an empty CSR row.
        indptr = np.hstack((0, np.cumsum(y_in_classes)))

        data = np.empty_like(indices)
        data.fill(pos_label)
        Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
    elif y_type == "multilabel-indicator":
        Y = sp.csr_matrix(y)
        if pos_label != 1:
            data = np.empty_like(Y.data)
            data.fill(pos_label)
            Y.data = data
    else:
        raise ValueError(
            "%s target data is not supported with label binarization" % y_type
        )

    if not sparse_output:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)

        if neg_label != 0:
            Y[Y == 0] = neg_label

        if pos_switch:
            # Undo the temporary -neg_label encoding: positives become 0.
            Y[Y == pos_label] = 0
    else:
        Y.data = Y.data.astype(int, copy=False)

    # preserve label ordering
    if np.any(classes != sorted_class):
        indices = np.searchsorted(sorted_class, classes)
        Y = Y[:, indices]

    if y_type == "binary":
        # Binary problems are represented as a single column for the
        # positive class.
        if sparse_output:
            Y = Y[:, [-1]]
        else:
            Y = Y[:, -1].reshape((-1, 1))

    return Y
|
||||
|
||||
|
||||
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold.
    """
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix

        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        # Per-row maximum of the stored values (implicit zeros excluded).
        row_max = min_max_axis(y, 1)[1]
        # Number of stored entries per row.
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        # (extra trailing 0 guards the sentinel index appended above)
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers
        # (the true argmax is then an implicit zero, i.e. a column NOT
        # stored in that row).
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
            y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

        return classes[y_i_argmax]
    else:
        return classes.take(y.argmax(axis=1), mode="clip")
|
||||
|
||||
|
||||
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
|
||||
"""Inverse label binarization transformation using thresholding."""
|
||||
|
||||
if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
|
||||
raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))
|
||||
|
||||
if output_type != "binary" and y.shape[1] != len(classes):
|
||||
raise ValueError(
|
||||
"The number of class is not equal to the number of dimension of y."
|
||||
)
|
||||
|
||||
classes = np.asarray(classes)
|
||||
|
||||
# Perform thresholding
|
||||
if sp.issparse(y):
|
||||
if threshold > 0:
|
||||
if y.format not in ("csr", "csc"):
|
||||
y = y.tocsr()
|
||||
y.data = np.array(y.data > threshold, dtype=int)
|
||||
y.eliminate_zeros()
|
||||
else:
|
||||
y = np.array(y.toarray() > threshold, dtype=int)
|
||||
else:
|
||||
y = np.array(y > threshold, dtype=int)
|
||||
|
||||
# Inverse transform data
|
||||
if output_type == "binary":
|
||||
if sp.issparse(y):
|
||||
y = y.toarray()
|
||||
if y.ndim == 2 and y.shape[1] == 2:
|
||||
return classes[y[:, 1]]
|
||||
else:
|
||||
if len(classes) == 1:
|
||||
return np.repeat(classes[0], len(y))
|
||||
else:
|
||||
return classes[y.ravel()]
|
||||
|
||||
elif output_type == "multilabel-indicator":
|
||||
return y
|
||||
|
||||
else:
|
||||
raise ValueError("{0} format is not supported".format(output_type))
|
||||
|
||||
|
||||
class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    _parameter_constraints: dict = {
        "classes": ["array-like", None],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Invalidate the class -> column-index cache built by _build_cache.
        self._cached_dict = None

        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes
        # Keep an int dtype when all labels are ints; fall back to object
        # (safe for strings, tuples, mixed types).
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        if self.classes is not None:
            return self.fit(y).transform(y)

        self._cached_dict = None

        # Automatically increment on new class
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # sort classes and reorder columns
        tmp = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in tmp) else object
        class_mapping = np.empty(len(tmp), dtype=dtype)
        class_mapping[:] = tmp
        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
        # ensure yt.indices keeps its current dtype
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        # Lazily build (and memoize) the class -> column-index mapping.
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        # Build CSR structure incrementally; all stored values are 1.
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    # Label not in the mapping: collect it for a single
                    # aggregated warning below rather than failing.
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s ands 0s.

        Returns
        -------
        y_original : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            # Each CSR row's stored column indices are the positive labels.
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def __sklearn_tags__(self):
        # This transformer consumes iterables of label sets (2-D labels),
        # not a 2-D feature matrix.
        tags = super().__sklearn_tags__()
        tags.input_tags.two_d_array = False
        tags.target_tags.two_d_labels = True
        return tags
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,534 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import OneToOneFeatureMixin, _fit_context
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.multiclass import type_of_target
|
||||
from ..utils.validation import (
|
||||
_check_feature_names_in,
|
||||
_check_y,
|
||||
check_consistent_length,
|
||||
check_is_fitted,
|
||||
)
|
||||
from ._encoders import _BaseEncoder
|
||||
from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth
|
||||
|
||||
|
||||
class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
|
||||
"""Target Encoder for regression and classification targets.
|
||||
|
||||
Each category is encoded based on a shrunk estimate of the average target
|
||||
values for observations belonging to the category. The encoding scheme mixes
|
||||
the global target mean with the target mean conditioned on the value of the
|
||||
category (see [MIC]_).
|
||||
|
||||
When the target type is "multiclass", encodings are based
|
||||
on the conditional probability estimate for each class. The target is first
|
||||
binarized using the "one-vs-all" scheme via
|
||||
:class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
|
||||
value for each class and each category is used for encoding, resulting in
|
||||
`n_features` * `n_classes` encoded output features.
|
||||
|
||||
:class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
|
||||
as another category and encodes them like any other category. Categories
|
||||
that are not seen during :meth:`fit` are encoded with the target mean, i.e.
|
||||
`target_mean_`.
|
||||
|
||||
For a demo on the importance of the `TargetEncoder` internal cross-fitting,
|
||||
see
|
||||
:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
|
||||
For a comparison of different encoders, refer to
|
||||
:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
|
||||
more in the :ref:`User Guide <target_encoder>`.
|
||||
|
||||
.. note::
|
||||
`fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
|
||||
:term:`cross fitting` scheme is used in `fit_transform` for encoding.
|
||||
See the :ref:`User Guide <target_encoder>` for details.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
Parameters
|
||||
----------
|
||||
categories : "auto" or list of shape (n_features,) of array-like, default="auto"
|
||||
Categories (unique values) per feature:
|
||||
|
||||
- `"auto"` : Determine categories automatically from the training data.
|
||||
- list : `categories[i]` holds the categories expected in the i-th column. The
|
||||
passed categories should not mix strings and numeric values within a single
|
||||
feature, and should be sorted in case of numeric values.
|
||||
|
||||
The used categories are stored in the `categories_` fitted attribute.
|
||||
|
||||
target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
|
||||
Type of target.
|
||||
|
||||
- `"auto"` : Type of target is inferred with
|
||||
:func:`~sklearn.utils.multiclass.type_of_target`.
|
||||
- `"continuous"` : Continuous target
|
||||
- `"binary"` : Binary target
|
||||
- `"multiclass"` : Multiclass target
|
||||
|
||||
.. note::
|
||||
The type of target inferred with `"auto"` may not be the desired target
|
||||
type used for modeling. For example, if the target consisted of integers
|
||||
between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
|
||||
will infer the target as `"multiclass"`. In this case, setting
|
||||
`target_type="continuous"` will specify the target as a regression
|
||||
problem. The `target_type_` attribute gives the target type used by the
|
||||
encoder.
|
||||
|
||||
.. versionchanged:: 1.4
|
||||
Added the option 'multiclass'.
|
||||
|
||||
smooth : "auto" or float, default="auto"
|
||||
The amount of mixing of the target mean conditioned on the value of the
|
||||
category with the global target mean. A larger `smooth` value will put
|
||||
more weight on the global target mean.
|
||||
If `"auto"`, then `smooth` is set to an empirical Bayes estimate.
|
||||
|
||||
cv : int, default=5
|
||||
Determines the number of folds in the :term:`cross fitting` strategy used in
|
||||
:meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
|
||||
and for continuous targets, `KFold` is used.
|
||||
|
||||
shuffle : bool, default=True
|
||||
Whether to shuffle the data in :meth:`fit_transform` before splitting into
|
||||
folds. Note that the samples within each split will not be shuffled.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
When `shuffle` is True, `random_state` affects the ordering of the
|
||||
indices, which controls the randomness of each fold. Otherwise, this
|
||||
parameter has no effect.
|
||||
Pass an int for reproducible output across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
|
||||
ndarray
|
||||
Encodings learnt on all of `X`.
|
||||
For feature `i`, `encodings_[i]` are the encodings matching the
|
||||
categories listed in `categories_[i]`. When `target_type_` is
|
||||
"multiclass", the encoding for feature `i` and class `j` is stored in
|
||||
`encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
|
||||
3 classes (c), encodings are ordered:
|
||||
f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2,
|
||||
|
||||
categories_ : list of shape (n_features,) of ndarray
|
||||
The categories of each input feature determined during fitting or
|
||||
specified in `categories`
|
||||
(in order of the features in `X` and corresponding with the output
|
||||
of :meth:`transform`).
|
||||
|
||||
target_type_ : str
|
||||
Type of target.
|
||||
|
||||
target_mean_ : float
|
||||
The overall mean of the target. This value is only used in :meth:`transform`
|
||||
to encode categories.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
classes_ : ndarray or None
|
||||
If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
|
||||
otherwise `None`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
|
||||
Contrary to TargetEncoder, this encoding is not supervised. Treating the
|
||||
resulting encoding as a numerical features therefore lead arbitrarily
|
||||
ordered values and therefore typically lead to lower predictive performance
|
||||
when used as preprocessing for a classifier or regressor.
|
||||
OneHotEncoder : Performs a one-hot encoding of categorical features. This
|
||||
unsupervised encoding is better suited for low cardinality categorical
|
||||
variables as it generate one new feature per unique category.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
|
||||
categorical attributes in classification and prediction problems"
|
||||
SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`
|
||||
|
||||
Examples
|
||||
--------
|
||||
With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.preprocessing import TargetEncoder
|
||||
>>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
|
||||
>>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
|
||||
>>> enc_auto = TargetEncoder(smooth="auto")
|
||||
>>> X_trans = enc_auto.fit_transform(X, y)
|
||||
|
||||
>>> # A high `smooth` parameter puts more weight on global mean on the categorical
|
||||
>>> # encodings:
|
||||
>>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
|
||||
>>> enc_high_smooth.target_mean_
|
||||
np.float64(44.3)
|
||||
>>> enc_high_smooth.encodings_
|
||||
[array([44.1, 44.4, 44.3])]
|
||||
|
||||
>>> # On the other hand, a low `smooth` parameter puts more weight on target
|
||||
>>> # conditioned on the value of the categorical:
|
||||
>>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
|
||||
>>> enc_low_smooth.encodings_
|
||||
[array([21, 80.8, 43.2])]
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"categories": [StrOptions({"auto"}), list],
|
||||
"target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
|
||||
"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
|
||||
"cv": [Interval(Integral, 2, None, closed="left")],
|
||||
"shuffle": ["boolean"],
|
||||
"random_state": ["random_state"],
|
||||
}
|
||||
|
||||
    def __init__(
        self,
        categories="auto",
        target_type="auto",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=None,
    ):
        # Store constructor parameters unmodified (scikit-learn convention:
        # validation happens at fit time, not here).
        self.categories = categories
        self.smooth = smooth
        self.target_type = target_type
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state
|
||||
|
||||
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the :class:`TargetEncoder` to X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        # Learn encodings on the full data; the extra return values are only
        # needed by fit_transform's cross-fitting and are discarded here.
        self._fit_encodings_all(X, y)
        return self
|
||||
|
||||
    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y):
        """Fit :class:`TargetEncoder` and transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>`. for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)

        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
        else:
            cv = StratifiedKFold(
                self.cv, shuffle=self.shuffle, random_state=self.random_state
            )

        # If 'multiclass' multiply axis=1 by num classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        # Cross fitting: each fold's rows are encoded with statistics
        # learnt on the *other* folds only, to limit target leakage.
        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)

            if self.target_type_ == "multiclass":
                encodings = self._fit_encoding_multiclass(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            else:
                encodings = self._fit_encoding_binary_or_continuous(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            # Write the encoded values for this fold's test rows into X_out;
            # unknown categories fall back to the fold's target mean.
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                ~X_known_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out
|
||||
|
||||
    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>`. for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        # Ordinal-encode X; X_known_mask flags categories seen during fit.
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", ensure_all_finite="allow-nan"
        )

        # If 'multiclass' multiply axis=1 by num of classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        # Encode all rows with the full-data encodings; unknown categories
        # fall back to the overall target mean.
        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out
|
||||
|
||||
def _fit_encodings_all(self, X, y):
    """Fit a target encoding with all the data.

    Infers/validates the target type, encodes ``y`` accordingly
    (label-encoded for binary, one-hot for multiclass, numeric check for
    continuous), then learns the per-category encodings on the full data.

    Returns the ordinal-encoded ``X``, the known-category mask, the
    (possibly re-encoded) ``y`` and the per-feature category counts, so
    callers (e.g. ``fit_transform``) can reuse them.
    """
    # avoid circular import
    from ..preprocessing import (
        LabelBinarizer,
        LabelEncoder,
    )

    check_consistent_length(X, y)
    self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan")

    if self.target_type == "auto":
        accepted_target_types = ("binary", "multiclass", "continuous")
        inferred_type_of_target = type_of_target(y, input_name="y")
        if inferred_type_of_target not in accepted_target_types:
            raise ValueError(
                "Unknown label type: Target type was inferred to be "
                f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                "supported."
            )
        self.target_type_ = inferred_type_of_target
    else:
        self.target_type_ = self.target_type

    # classes_ stays None for continuous targets.
    self.classes_ = None
    if self.target_type_ == "binary":
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
    elif self.target_type_ == "multiclass":
        # One-hot encode so each class column can be target-encoded
        # independently in _fit_encoding_multiclass.
        label_binarizer = LabelBinarizer()
        y = label_binarizer.fit_transform(y)
        self.classes_ = label_binarizer.classes_
    else:  # continuous
        y = _check_y(y, y_numeric=True, estimator=self)

    # Per-class means for multiclass (y is 2D), scalar mean otherwise.
    self.target_mean_ = np.mean(y, axis=0)

    X_ordinal, X_known_mask = self._transform(
        X, handle_unknown="ignore", ensure_all_finite="allow-nan"
    )
    # Number of categories per feature, as int64 for the Cython helpers.
    n_categories = np.fromiter(
        (len(category_for_feature) for category_for_feature in self.categories_),
        dtype=np.int64,
        count=len(self.categories_),
    )
    if self.target_type_ == "multiclass":
        encodings = self._fit_encoding_multiclass(
            X_ordinal,
            y,
            n_categories,
            self.target_mean_,
        )
    else:
        encodings = self._fit_encoding_binary_or_continuous(
            X_ordinal,
            y,
            n_categories,
            self.target_mean_,
        )
    self.encodings_ = encodings

    return X_ordinal, X_known_mask, y, n_categories
|
||||
|
||||
def _fit_encoding_binary_or_continuous(
    self, X_ordinal, y, n_categories, target_mean
):
    """Learn target encodings for a single (binary or continuous) target.

    Dispatches to the Cython helper matching the smoothing mode:
    empirical-Bayes shrinkage when ``smooth="auto"``, fixed-strength
    smoothing otherwise.
    """
    if self.smooth == "auto":
        # The auto-smoothing scheme additionally needs the overall
        # target variance.
        return _fit_encoding_fast_auto_smooth(
            X_ordinal,
            y,
            n_categories,
            target_mean,
            np.var(y),
        )
    return _fit_encoding_fast(
        X_ordinal,
        y,
        n_categories,
        self.smooth,
        target_mean,
    )
|
||||
|
||||
def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
    """Learn multiclass encodings.

    One binary-style encoding is learned per class column of the one-hot
    encoded ``y``, which yields encodings in class-major order:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
    The result is then reordered feature-major so all encodings of a
    feature are adjacent:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
    """
    n_features = self.n_features_in_
    n_classes = len(self.classes_)

    # Learn one set of per-feature encodings for every class column.
    class_major = []
    for class_idx in range(n_classes):
        class_major.extend(
            self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y[:, class_idx],
                n_categories,
                target_mean[class_idx],
            )
        )

    # Transpose the flat class-major list into feature-major order.
    feature_major = []
    for feat_idx in range(n_features):
        feature_major.extend(
            class_major[feat_idx + class_idx * n_features]
            for class_idx in range(n_classes)
        )
    return feature_major
|
||||
|
||||
def _transform_X_ordinal(
    self,
    X_out,
    X_ordinal,
    X_unknown_mask,
    row_indices,
    encodings,
    target_mean,
):
    """Transform X_ordinal using encodings.

    In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
    (axis=1) size `n_features`, while `encodings` has length of size
    `n_features * n_classes`. `feat_idx` deals with this by repeating
    feature indices by `n_classes` E.g., for 3 features, 2 classes:
    0,0,1,1,2,2

    Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx`
    cycles through 0 to `n_classes` - 1, `n_features` times.

    ``X_out`` is filled in place; rows selected by ``row_indices`` get
    the learned encoding, and unknown categories fall back to the target
    mean.
    """
    if self.target_type_ == "multiclass":
        n_classes = len(self.classes_)
        for e_idx, encoding in enumerate(encodings):
            # Repeat feature indices by n_classes
            feat_idx = e_idx // n_classes
            # Cycle through each class
            mean_idx = e_idx % n_classes
            X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]]
            # Unknown categories get the per-class target mean.
            X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx]
    else:
        for e_idx, encoding in enumerate(encodings):
            X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]]
            # Unknown categories get the global target mean.
            X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Not used, present here for API consistency by convention.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names. `feature_names_in_` is used unless it is
        not defined, in which case the following input feature names are
        generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        When `type_of_target_` is "multiclass" the names are of the format
        '<feature_name>_<class_name>'.
    """
    check_is_fitted(self, "n_features_in_")
    feature_names = _check_feature_names_in(self, input_features)
    if self.target_type_ != "multiclass":
        return feature_names
    # One output column per (feature, class) pair, feature-major order.
    expanded_names = []
    for feature_name in feature_names:
        for class_name in self.classes_:
            expanded_names.append(f"{feature_name}_{class_name}")
    return np.asarray(expanded_names, dtype=object)
|
||||
|
||||
def __sklearn_tags__(self):
    """Return estimator tags; target encoders require `y` to fit."""
    estimator_tags = super().__sklearn_tags__()
    estimator_tags.target_tags.required = True
    return estimator_tags
|
||||
Binary file not shown.
@@ -0,0 +1,167 @@
|
||||
from libc.math cimport isnan
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
ctypedef fused INT_DTYPE:
|
||||
int64_t
|
||||
int32_t
|
||||
|
||||
ctypedef fused Y_DTYPE:
|
||||
int64_t
|
||||
int32_t
|
||||
float64_t
|
||||
float32_t
|
||||
|
||||
|
||||
def _fit_encoding_fast(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double smooth,
    double y_mean,
):
    """Fit a target encoding on X_int and y.

    This implementation uses Eq 7 from [1] to compute the encoding.
    As stated in the paper, Eq 7 is the same as Eq 3.

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
    categorical attributes in classification and prediction problems"

    Returns a list with one float64 ndarray per feature holding the
    per-category encodings.
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        # The smoothing prior contributes `smooth` pseudo-observations at
        # the global mean to every category.
        double smooth_sum = smooth * y_mean
        int64_t max_n_cats = np.max(n_categories)
        double[::1] sums = np.empty(max_n_cats, dtype=np.float64)
        double[::1] counts = np.empty(max_n_cats, dtype=np.float64)
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        # Fixed: the address-of operator was garbled to "¤t_encoding"
        # by an HTML-entity mangling of "&current_encoding".
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            # Reset the scratch buffers to the smoothing prior.
            for cat_idx in range(n_cats):
                sums[cat_idx] = smooth_sum
                counts[cat_idx] = smooth

            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                sums[X_int_tmp] += y[sample_idx]
                counts[X_int_tmp] += 1.0

            for cat_idx in range(n_cats):
                # counts can only be 0 when smooth == 0 and the category is
                # unobserved; fall back to the global mean in that case.
                if counts[cat_idx] == 0:
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx]

    return encodings
|
||||
|
||||
|
||||
def _fit_encoding_fast_auto_smooth(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double y_mean,
    double y_variance,
):
    """Fit a target encoding on X_int and y with auto smoothing.

    This implementation uses Eq 5 and 6 from [1].

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
    categorical attributes in classification and prediction problems"

    Returns a list with one float64 ndarray per feature holding the
    per-category encodings, shrunk towards `y_mean` by the data-driven
    factor `lambda_`.
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        double diff
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        int64_t max_n_cats = np.max(n_categories)
        double[::1] means = np.empty(max_n_cats, dtype=np.float64)
        int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
        double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
        double lambda_
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        # Fixed: the address-of operator was garbled to "¤t_encoding"
        # by an HTML-entity mangling of "&current_encoding".
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's
    # probably good to parallelize the outer loop. When n_features is too small,
    # then it would probably better to parallelize the nested loops on n_samples and
    # n_cats, but the code to handle thread-local temporary variables might be
    # significantly more complex.
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] = 0.0
                counts[cat_idx] = 0
                sum_of_squared_diffs[cat_idx] = 0.0

            # first pass to compute the mean
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]

                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                counts[X_int_tmp] += 1
                means[X_int_tmp] += y[sample_idx]

            for cat_idx in range(n_cats):
                # C division: 0/0 yields nan, which is handled below.
                means[cat_idx] /= counts[cat_idx]

            # second pass to compute the sum of squared differences
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                if X_int_tmp == -1:
                    continue
                diff = y[sample_idx] - means[X_int_tmp]
                sum_of_squared_diffs[X_int_tmp] += diff * diff

            for cat_idx in range(n_cats):
                lambda_ = (
                    y_variance * counts[cat_idx] /
                    (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
                     counts[cat_idx])
                )
                if isnan(lambda_):
                    # A nan can happen when:
                    # 1. counts[cat_idx] == 0
                    # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = (
                        lambda_ * means[cat_idx] + (1 - lambda_) * y_mean
                    )

    return encodings
|
||||
@@ -0,0 +1,13 @@
|
||||
# Cython extension for sparse (CSR) polynomial feature expansion.
py.extension_module(
    '_csr_polynomial_expansion',
    [cython_gen.process('_csr_polynomial_expansion.pyx'), utils_cython_tree],
    subdir: 'sklearn/preprocessing',
    install: true
)

# C++ Cython extension with the fast target-encoding fitting routines;
# built with the C++ generator because the .pyx uses libcpp.vector.
py.extension_module(
    '_target_encoder_fast',
    [cython_gen_cpp.process('_target_encoder_fast.pyx'), utils_cython_tree],
    subdir: 'sklearn/preprocessing',
    install: true
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,187 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import (
|
||||
MaxAbsScaler,
|
||||
MinMaxScaler,
|
||||
PowerTransformer,
|
||||
QuantileTransformer,
|
||||
RobustScaler,
|
||||
StandardScaler,
|
||||
maxabs_scale,
|
||||
minmax_scale,
|
||||
power_transform,
|
||||
quantile_transform,
|
||||
robust_scale,
|
||||
scale,
|
||||
)
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
from sklearn.utils.fixes import (
|
||||
BSR_CONTAINERS,
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DIA_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
|
||||
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
|
||||
"""Get non NaN samples in column of X"""
|
||||
return X[:, [col]][~np.isnan(X[:, col])]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """NaN values pass through each scaler untouched and without warnings."""
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        # box-cox requires strictly positive inputs.
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for kwarg in omit_kwargs:
        _ = kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [i]])
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

        for sparse_container in (
            BSR_CONTAINERS
            + COO_CONTAINERS
            + CSC_CONTAINERS
            + CSR_CONTAINERS
            + DIA_CONTAINERS
            + DOK_CONTAINERS
            + LIL_CONTAINERS
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_container(X_train)
            X_test_sp = sparse_container(X_test)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)

            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)

            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (
            PowerTransformer("box-cox"),
            power_transform,
        ),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    """Scalers give the same result on a pd.NA IntegerArray as on ndarray."""
    # Test pandas IntegerArray with pd.NA
    # NOTE(review): `func` is unused in the body; presumably kept so the
    # parametrization mirrors test_missing_value_handling — confirm.
    pd = pytest.importorskip("pandas")

    X = np.array(
        [
            [1, 2, 3, np.nan, np.nan, 4, 5, 1],
            [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
        ]
    ).T

    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    X_df["c"] = X_df["c"].astype("int")

    X_trans = est.fit_transform(X)
    X_df_trans = est.fit_transform(X_df)

    assert_allclose(X_trans, X_df_trans)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,665 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            None,
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 2, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [0, 1, 1, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
    ],
)
def test_fit_transform(strategy, quantile_method, expected, sample_weight):
    """Ordinal binning matches the expected codes per strategy / weighting."""
    est = KBinsDiscretizer(
        n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method
    )
    with ignore_warnings(category=UserWarning):
        # Ignore the warning on removed small bins.
        est.fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)
|
||||
|
||||
|
||||
def test_valid_n_bins():
    """Valid `n_bins` values (Python int and numpy integer) are accepted."""
    # Plain Python int.
    KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X)
    # A numpy integer scalar must work as well.
    numpy_two = np.array([2])[0]
    KBinsDiscretizer(
        n_bins=numpy_two, quantile_method="averaged_inverted_cdf"
    ).fit_transform(X)
    # The fitted per-feature bin counts are stored with integer dtype.
    fitted = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit(X)
    assert fitted.n_bins_.dtype == np.dtype(int)
|
||||
|
||||
|
||||
def test_invalid_n_bins_array():
    """Invalid `n_bins` arrays raise informative ValueErrors."""
    # Bad shape
    n_bins = np.full((2, 4), 2.0)
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = (
        "KBinsDiscretizer received an invalid number of bins "
        "at indices 0, 3. Number of bins must be at least 2, "
        "and must be an int."
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf")
    err_msg = (
        "KBinsDiscretizer received an invalid number of bins "
        "at indices 0, 2. Number of bins must be at least 2, "
        "and must be an int."
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, quantile_method, expected, sample_weight",
    [
        (
            "uniform",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "linear",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            None,
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 1, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
            [0, 1, 3, 1],
        ),
        (
            "quantile",
            "averaged_inverted_cdf",
            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]],
            [1, 1, 3, 1],
        ),
        (
            "kmeans",
            "warn",  # default, will not warn when strategy != "quantile"
            [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]],
            [1, 0, 3, 1],
        ),
    ],
)
def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight):
    """Per-feature `n_bins` arrays bin correctly and shape `bin_edges_`."""
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3],
        encode="ordinal",
        strategy=strategy,
        quantile_method=quantile_method,
    ).fit(X, sample_weight=sample_weight)
    assert_array_equal(est.transform(X), expected)

    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1,)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
def test_kbinsdiscretizer_effect_sample_weight():
    """Check the impact of `sample_weight` on computed quantiles."""
    X = np.array([[-2], [-1], [1], [3], [500], [1000]])
    # add a large number of bins such that each sample with a non-null weight
    # will be used as bin edge
    est = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    # Samples with weight 0 (the two largest values) must not produce edges.
    est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0])
    assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3])
    assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
    """Make sure that `sample_weight` is not changed in place."""
    kwargs = {"n_bins": 3, "encode": "ordinal", "strategy": strategy}
    if strategy == "quantile":
        kwargs["quantile_method"] = "averaged_inverted_cdf"
    est = KBinsDiscretizer(**kwargs)

    original_weights = np.array([1, 3, 1, 2], dtype=np.float64)
    weights_before_fit = np.copy(original_weights)
    est.fit(X, sample_weight=original_weights)
    assert_allclose(original_weights, weights_before_fit)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """A constant feature warns and is encoded as all zeros (one bin)."""
    warnings.simplefilter("always")
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    if strategy == "quantile":
        est = KBinsDiscretizer(
            strategy=strategy,
            n_bins=3,
            encode="ordinal",
            quantile_method="averaged_inverted_cdf",
        )
    else:
        est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
    warning_message = "Feature 0 is constant and will be replaced with 0."
    with pytest.warns(UserWarning, match=warning_message):
        est.fit(X)
    # The constant feature collapses to a single bin.
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_transform_1d_behavior():
    """1D input is rejected by both `fit` and `transform`."""
    X_1d = np.arange(4)

    # Fitting directly on a 1D array must fail.
    with pytest.raises(ValueError):
        KBinsDiscretizer(
            n_bins=2, quantile_method="averaged_inverted_cdf"
        ).fit(X_1d)

    # A discretizer fitted on a 2D column must still reject 1D input.
    est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf")
    est.fit(X_1d.reshape(-1, 1))
    with pytest.raises(ValueError):
        est.transform(X_1d)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Binning stays correct for very small magnitudes (down to 1e-8)."""
    X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    X = X_init / 10**i
    Xt = KBinsDiscretizer(
        n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf"
    ).fit_transform(X)
    assert_array_equal(Xt_expected, Xt)
|
||||
|
||||
|
||||
def test_encode_options():
    """The onehot/onehot-dense encodings match OneHotEncoder on the ordinal codes."""
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="ordinal", quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3],
        encode="onehot-dense",
        quantile_method="averaged_inverted_cdf",
    ).fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(
        OneHotEncoder(
            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False
        ).fit_transform(Xt_1),
        Xt_2,
    )
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="onehot", quantile_method="averaged_inverted_cdf"
    ).fit(X)
    Xt_3 = est.transform(X)
    # "onehot" must produce a sparse matrix.
    assert sp.issparse(Xt_3)
    assert_array_equal(
        OneHotEncoder(
            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True
        )
        .fit_transform(Xt_1)
        .toarray(),
        Xt_3.toarray(),
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        (
            "quantile",
            "averaged_inverted_cdf",
            [0, 0, 0, 1, 1, 1],
            [0, 0, 1, 1, 2, 2],
            [0, 1, 2, 3, 4, 4],
        ),
    ],
)
def test_nonuniform_strategies(
    strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins
):
    """Each strategy produces its expected codes on skewed 1D data."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(
        n_bins=2, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())

    # with 5 bins
    est = KBinsDiscretizer(
        n_bins=5, strategy=strategy, quantile_method=quantile_method, encode="ordinal"
    )
    Xt = est.fit_transform(X)
    assert_array_equal(expected_5bins, Xt.ravel())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_inv,quantile_method",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
            "warn",  # default, will not warn when strategy != "quantile"
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
            "averaged_inverted_cdf",
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv, quantile_method):
    """transform + inverse_transform maps each sample to its bin center."""
    discretizer = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode
    )
    X_binned = discretizer.fit_transform(X)
    assert_array_almost_equal(expected_inv, discretizer.inverse_transform(X_binned))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Values outside the fitted range must clip into the first/last bin."""
    X = np.array([0, 1, 2, 3])[:, None]

    params = dict(n_bins=4, strategy=strategy, encode="ordinal")
    if strategy == "quantile":
        # Silence the FutureWarning about the changing quantile_method default.
        params["quantile_method"] = "averaged_inverted_cdf"
    kbd = KBinsDiscretizer(**params)
    kbd.fit(X)

    X_outside = np.array([-2, 5])[:, None]
    codes = kbd.transform(X_outside)
    # -2 falls in the lowest bin (code 0), 5 in the highest (code n_bins_ - 1).
    assert_array_equal(codes.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(codes.min(axis=0), [0])
|
||||
|
||||
|
||||
def test_overwrite():
    """fit_transform and inverse_transform must not modify their inputs in place."""
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    discretizer = KBinsDiscretizer(
        n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal"
    )
    X_binned = discretizer.fit_transform(X)
    assert_array_equal(X, X_before)

    X_binned_before = X_binned.copy()
    X_recovered = discretizer.inverse_transform(X_binned)
    assert_array_equal(X_binned, X_binned_before)
    assert_array_equal(X_recovered, np.array([[0.5], [1.5], [2.5], [2.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_bin_edges, quantile_method",
    [
        ("quantile", [0, 1.5, 3], "averaged_inverted_cdf"),
        ("kmeans", [0, 1.5, 3], "warn"),
    ],
)
def test_redundant_bins(strategy, expected_bin_edges, quantile_method):
    """Heavily duplicated samples collapse bins: warn and deduplicate the edges."""
    X = [[0], [0], [0], [0], [3], [3]]
    discretizer = KBinsDiscretizer(
        n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None
    )
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        discretizer.fit(X)

    assert_array_almost_equal(discretizer.bin_edges_[0], expected_bin_edges)
|
||||
|
||||
|
||||
def test_percentile_numeric_stability():
    """Quantile bin edges on few distinct values must stay numerically stable."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_codes = np.array([0, 0, 4]).reshape(-1, 1)
    # TODO: change to averaged inverted cdf, but that means we only get bin
    # edges of 0.05 and 0.95 and nothing in between
    discretizer = KBinsDiscretizer(
        n_bins=10, encode="ordinal", strategy="quantile", quantile_method="linear"
    )

    # Redundant bins are expected here (only two distinct values for 10 bins).
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        discretizer.fit(X)

    assert_array_almost_equal(discretizer.bin_edges_[0], expected_edges)
    assert_array_almost_equal(discretizer.transform(X), expected_codes)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Output dtype follows `dtype` when set, else the (validated) input dtype."""
    X_input = np.array(X, dtype=in_dtype)
    discretizer = KBinsDiscretizer(
        n_bins=3,
        encode=encode,
        quantile_method="averaged_inverted_cdf",
        dtype=out_dtype,
    )
    discretizer.fit(X_input)

    if out_dtype is not None:
        expected_dtype = out_dtype
    elif X_input.dtype == np.float16:
        # unsupported numeric input dtypes are cast to np.float64
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype

    assert discretizer.transform(X_input).dtype == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """float32 and float64 outputs must agree for the same input."""
    # TODO this check is redundant with common checks and can be removed
    # once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    # Fit/transform once per output dtype, then compare the two results.
    outputs = []
    for output_dtype in (np.float32, np.float64):
        discretizer = KBinsDiscretizer(
            n_bins=3,
            encode=encode,
            quantile_method="averaged_inverted_cdf",
            dtype=output_dtype,
        )
        discretizer.fit(X_input)
        outputs.append(discretizer.transform(X_input))

    assert_allclose_dense_sparse(outputs[0], outputs[1])
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_default():
    # Since the size of X is small (< 2e5), subsampling will not take place.
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    discretizer = KBinsDiscretizer(
        n_bins=10,
        encode="ordinal",
        strategy="quantile",
        quantile_method="averaged_inverted_cdf",
    )
    discretizer.fit(X)

    # Same estimator with subsampling explicitly disabled must be equivalent.
    no_subsampling = clone(discretizer)
    no_subsampling.set_params(subsample=None)
    no_subsampling.fit(X)

    assert discretizer.bin_edges_.shape == no_subsampling.bin_edges_.shape
    np.testing.assert_allclose(
        discretizer.bin_edges_[0], no_subsampling.bin_edges_[0]
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Check get_feature_names_out for different settings.

    Non-regression test for #22731
    """
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]

    discretizer = KBinsDiscretizer(
        n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf"
    ).fit(X)
    X_binned = discretizer.transform(X)

    output_names = discretizer.get_feature_names_out(
        [f"feat{i}" for i in range(3)]
    )
    # One output name per produced column.
    assert X_binned.shape[1] == output_names.shape[0]
    assert_array_equal(output_names, expected_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
    # Check that the bin edges are almost the same when subsampling is used.
    X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1

    params = dict(strategy=strategy, subsample=50000, random_state=global_random_seed)
    if strategy == "quantile":
        # Avoid the FutureWarning about the changing quantile_method default.
        params["quantile_method"] = "averaged_inverted_cdf"
    subsampled = KBinsDiscretizer(**params)
    subsampled.fit(X)

    full_fit = clone(subsampled)
    full_fit.set_params(subsample=None)
    full_fit.fit(X)

    # We use a large tolerance because we can't expect the bin edges to be exactly the
    # same when subsampling is used.
    assert_allclose(subsampled.bin_edges_[0], full_fit.bin_edges_[0], rtol=1e-2)
|
||||
|
||||
|
||||
def test_quantile_method_future_warnings():
    """The implicit quantile_method='linear' default must emit a FutureWarning."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_warning = (
        "The current default behavior, quantile_method='linear', will be "
        "changed to quantile_method='averaged_inverted_cdf' in "
        "scikit-learn version 1.9 to naturally support sample weight "
        "equivalence properties by default. Pass "
        "quantile_method='averaged_inverted_cdf' explicitly to silence this "
        "warning."
    )
    with pytest.warns(FutureWarning, match=expected_warning):
        KBinsDiscretizer(strategy="quantile").fit(X)
|
||||
|
||||
|
||||
def test_invalid_quantile_method_with_sample_weight():
    """quantile_method='linear' is rejected when sample weights are provided."""
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    expected_msg = (
        "When fitting with strategy='quantile' and sample weights, "
        "quantile_method should either be set to 'averaged_inverted_cdf' or "
        "'inverted_cdf', got quantile_method='linear' instead."
    )
    with pytest.raises(ValueError, match=expected_msg):
        KBinsDiscretizer(strategy="quantile", quantile_method="linear").fit(
            X, sample_weight=[1, 1, 2, 2]
        )
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,579 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import FunctionTransformer, StandardScaler
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
|
||||
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
|
||||
def _func(X, *args, **kwargs):
|
||||
args_store.append(X)
|
||||
args_store.extend(args)
|
||||
kwargs_store.update(kwargs)
|
||||
return func(X)
|
||||
|
||||
return _func
|
||||
|
||||
|
||||
def test_delegate_to_func():
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))

    def check_only_X_received():
        # The function should only have received X.
        assert args_store == [X], (
            "Incorrect positional arguments passed to func: {args}".format(
                args=args_store
            )
        )
        assert not kwargs_store, (
            "Unexpected keyword arguments passed to func: {args}".format(
                args=kwargs_store
            )
        )

    assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X,
        "transform should have returned X unchanged",
    )
    check_only_X_received()

    # reset the argument stores.
    args_store[:] = []
    kwargs_store.clear()
    transformed = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    ).transform(X)

    assert_array_equal(
        transformed, X, err_msg="transform should have returned X unchanged"
    )
    check_only_X_received()
|
||||
|
||||
|
||||
def test_np_log():
    """The numpy.log1p doc example keeps working."""
    X = np.arange(10).reshape((5, 2))
    assert_array_equal(FunctionTransformer(np.log1p).transform(X), np.log1p(X))


def test_kw_arg():
    """kw_args are forwarded to func."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    assert_array_equal(rounder.transform(X), np.around(X, decimals=3))


def test_kw_arg_update():
    """Mutating the kw_args dict in place takes effect on later transforms."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    rounder.kw_args["decimals"] = 1
    assert_array_equal(rounder.transform(X), np.around(X, decimals=1))


def test_kw_arg_reset():
    """Re-assigning kw_args entirely takes effect on later transforms."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    rounder.kw_args = dict(decimals=1)
    assert_array_equal(rounder.transform(X), np.around(X, decimals=1))
|
||||
|
||||
|
||||
def test_inverse_transform():
    """inverse_func (with inv_kw_args) is applied by inverse_transform."""
    X = np.array([1, 4, 9, 16]).reshape((2, 2))
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=dict(decimals=3),
    )
    round_trip = transformer.inverse_transform(transformer.transform(X))
    assert_array_equal(round_trip, np.around(np.sqrt(X), decimals=3))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_check_inverse(sparse_container):
    """check_inverse warns for a non-inverse pair and stays silent for a true one."""
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    if sparse_container is not None:
        X = sparse_container(X)

    # sqrt / around are NOT inverses of each other -> a warning is expected.
    not_inverse = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )
    with pytest.warns(UserWarning, match=warning_message):
        not_inverse.fit(X)

    # expm1 / log1p ARE inverses -> no UserWarning may be raised.
    true_inverse = FunctionTransformer(
        func=np.expm1,
        inverse_func=np.log1p,
        accept_sparse=sparse_container is not None,
        check_inverse=True,
        validate=True,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        Xt = true_inverse.fit_transform(X)

    assert_allclose_dense_sparse(X, true_inverse.inverse_transform(Xt))
|
||||
|
||||
|
||||
def test_check_inverse_func_or_inverse_not_provided():
    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    for forward, backward in ((np.expm1, None), (None, np.expm1)):
        transformer = FunctionTransformer(
            func=forward, inverse_func=backward, check_inverse=True, validate=True
        )
        # Fitting must not emit any UserWarning since no round trip is possible.
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            transformer.fit(X)
|
||||
|
||||
|
||||
def test_function_transformer_frame():
    """The identity FunctionTransformer passes DataFrames through unchanged."""
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformed = FunctionTransformer().fit_transform(X_df)
    # Still a pandas object (exposes .loc), not converted to a numpy array.
    assert hasattr(transformed, "loc")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"

    data = _convert_container(
        ["one", "two", "three", "one", "one", 5, 6],
        X_type,
        columns_name=["value"],
        dtype=dtype,
    )

    def func(X):
        return np.array([mapping[X[i]] for i in range(X.size)], dtype=object)

    def inverse_func(X):
        return _convert_container(
            [inverse_mapping[x] for x in X],
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
|
||||
|
||||
|
||||
def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True():
    """Check support for dataframes with only numerical values."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    transformer = FunctionTransformer(
        func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True
    )

    # All-numerical frame: fitting must not raise.
    shifted = transformer.fit_transform(df)
    assert_allclose_dense_sparse(shifted, df + 2)


def test_function_transformer_with_dataframe_and_check_inverse_True():
    """Check error is raised when check_inverse=True.

    Non-regression test for gh-25261.
    """
    pd = pytest.importorskip("pandas")
    transformer = FunctionTransformer(
        func=lambda x: x, inverse_func=lambda x: x, check_inverse=True
    )

    df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(df_mixed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy inputs, default behavior: generate names
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: use input feature names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input, feature_names_out=callable
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, feature_names_out=callable – default input_features
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, feature_names_out=callable – default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input, input_features=list of names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input, input_features=list of names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable, input_features=list
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, feature_names_out=callable, input_features=list
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
@pytest.mark.parametrize("validate", [True, False])
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected, validate
):
    """get_feature_names_out covers numpy/pandas inputs, callables, and
    explicit input_features, with and without validation."""
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=validate
    )
    transformer.fit(X)

    names = transformer.get_feature_names_out(input_features)
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
|
||||
|
||||
|
||||
def test_function_transformer_get_feature_names_out_without_validation():
    """Explicit input features work for get_feature_names_out when validate=False."""
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    transformer.fit_transform(np.random.rand(100, 2))

    names = transformer.get_feature_names_out(("a", "b"))
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))


def test_function_transformer_feature_names_out_is_None():
    """Without feature_names_out, get_feature_names_out is not available."""
    transformer = FunctionTransformer()
    transformer.fit_transform(np.random.rand(100, 2))

    msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=msg):
        transformer.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_uses_estimator():
    """The feature_names_out callable receives the fitted transformer itself."""

    def _append_random_features(X, n):
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)

    def _names_out(transformer, input_features):
        # Read `n` back from the fitted transformer to name the new columns.
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]

    transformer = FunctionTransformer(
        func=_append_random_features,
        feature_names_out=_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)

    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
|
||||
|
||||
|
||||
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""

    def _append_ones_column(X):
        return np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)

    def _drop_last_column(X):
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    transformer = FunctionTransformer(
        func=_append_ones_column,
        inverse_func=_drop_last_column,
        validate=True,
    )
    X_trans = transformer.fit_transform(X)
    assert transformer.n_features_in_ == X.shape[1]

    # inverse_transform must not re-validate and overwrite n_features_in_.
    transformer.inverse_transform(X_trans)
    assert transformer.n_features_in_ == X.shape[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "feature_names_out, expected",
    [
        ("one-to-one", ["pet", "color"]),
        [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]],
    ],
)
@pytest.mark.parametrize("in_pipeline", [True, False])
def test_get_feature_names_out_dataframe_with_string_data(
    feature_names_out, expected, in_pipeline
):
    """Check that get_feature_names_out works with DataFrames with string data."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]})

    def func(X):
        if feature_names_out == "one-to-one":
            return X
        # Apply the same renaming the callable advertises.
        renamed = feature_names_out(None, X.columns)
        return X.rename(columns=dict(zip(X.columns, renamed)))

    transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out)
    if in_pipeline:
        transformer = make_pipeline(transformer)

    X_trans = transformer.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)

    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
|
||||
|
||||
|
||||
def test_set_output_func():
    """Check behavior of set_output with different settings."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    log_transformer = FunctionTransformer(np.log, feature_names_out="one-to-one")

    # no warning is raised when feature_names_out is defined
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        log_transformer.set_output(transform="pandas")

    X_trans = log_transformer.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])

    doubler = FunctionTransformer(lambda x: 2 * x)
    doubler.set_output(transform="pandas")

    # no warning is raised when func returns a panda dataframe
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        X_trans = doubler.fit_transform(X)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, ["a", "b"])

    # Warning is raised when func returns a ndarray
    to_ndarray = FunctionTransformer(lambda x: np.asarray(x))

    for transform in ("pandas", "polars"):
        to_ndarray.set_output(transform=transform)
        msg = (
            f"When `set_output` is configured to be '{transform}'.*{transform} "
            "DataFrame.*"
        )
        with pytest.warns(UserWarning, match=msg):
            to_ndarray.fit_transform(X)

    # default transform does not warn
    to_ndarray.set_output(transform="default")
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        to_ndarray.fit_transform(X)
|
||||
|
||||
|
||||
def test_consistence_column_name_between_steps():
    """Feature names out of `FunctionTransformer` must match the feature names
    expected by the next step in the pipeline.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27695
    """
    pd = pytest.importorskip("pandas")

    def with_suffix(_, names):
        return [name + "__log" for name in names]

    pipeline = make_pipeline(
        FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler()
    )

    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"])
    X_trans = pipeline.fit_transform(df)
    assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"]
    # StandardScaler will convert to a numpy array
    assert isinstance(X_trans, np.ndarray)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"])
def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output):
    """Check that we overwrite the column names when we should.

    Runs the identity FunctionTransformer with a renaming ``feature_names_out``
    for every combination of input dataframe library and ``set_output`` target.
    """
    lib = pytest.importorskip(dataframe_lib)
    # Bug fix: only actual dataframe libraries need an import guard. The
    # previous condition (`transform_output != "numpy"`) attempted
    # `pytest.importorskip("default")` for the "default" parametrization; no
    # module named "default" exists, so that case was always skipped silently
    # and the default-output path was never exercised.
    if transform_output != "default":
        pytest.importorskip(transform_output)

    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    def with_suffix(_, names):
        return [name + "__log" for name in names]

    transformer = FunctionTransformer(feature_names_out=with_suffix).set_output(
        transform=transform_output
    )
    X_trans = transformer.fit_transform(df)
    # Identity func: values untouched, only the column names change.
    assert_array_equal(np.asarray(X_trans), np.asarray(df))

    feature_names = transformer.get_feature_names_out()
    assert list(X_trans.columns) == with_suffix(None, df.columns)
    assert feature_names.tolist() == with_suffix(None, df.columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_overwrite_column_names_numerical(feature_names_out):
    """Check the same as `test_function_transformer_overwrite_column_names`
    but for the specific case of pandas where column names can be numerical."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]})

    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    X_trans = transformer.fit_transform(df)
    # Identity transform: the data is unchanged, only the naming may differ.
    assert_array_equal(np.asarray(X_trans), np.asarray(df))

    assert list(X_trans.columns) == list(transformer.get_feature_names_out())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
@pytest.mark.parametrize(
    "feature_names_out",
    ["one-to-one", lambda _, names: [f"{name}_log" for name in names]],
)
def test_function_transformer_error_column_inconsistent(
    dataframe_lib, feature_names_out
):
    """Check that we raise an error when `func` returns a dataframe with new
    column names that become inconsistent with `get_feature_names_out`."""
    lib = pytest.importorskip(dataframe_lib)

    df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})

    def rename_first_column(df):
        # pandas and polars spell `rename` differently.
        if dataframe_lib == "pandas":
            return df.rename(columns={"a": "c"})
        return df.rename({"a": "c"})

    transformer = FunctionTransformer(
        func=rename_first_column, feature_names_out=feature_names_out
    )
    err_msg = "The output generated by `func` have different column names"
    with pytest.raises(ValueError, match=err_msg):
        transformer.fit_transform(df).columns
|
||||
@@ -0,0 +1,748 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.preprocessing._label import (
|
||||
LabelBinarizer,
|
||||
LabelEncoder,
|
||||
MultiLabelBinarizer,
|
||||
_inverse_binarize_multiclass,
|
||||
_inverse_binarize_thresholding,
|
||||
label_binarize,
|
||||
)
|
||||
from sklearn.utils._array_api import (
|
||||
_convert_to_numpy,
|
||||
_get_namespace_device_dtype_ids,
|
||||
get_namespace,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._testing import (
|
||||
_array_api_for_tests,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
from sklearn.utils.validation import _to_object_array
|
||||
|
||||
# Shared fixture: the Iris dataset, loaded once at import time for the tests below.
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
    """Densify *a* via its ``toarray`` method when present, else return it as-is."""
    return a.toarray() if hasattr(a, "toarray") else a
|
||||
|
||||
|
||||
def test_label_binarizer():
    """Check LabelBinarizer round-trips for one-class, two-class and
    multi-class inputs, in both dense and sparse output modes."""
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)

    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
    """Labels unseen at fit time must map to all-zero rows in transform."""
    lb = LabelBinarizer()

    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    got = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, got)

    # "a", "c" and "f" were never seen: their rows are all zeros.
    expected = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    got = lb.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, got)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
    """Check custom neg_label/pos_label values are used in the encoding."""
    lb = LabelBinarizer(neg_label=-2, pos_label=0)

    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    lb = LabelBinarizer(neg_label=-2, pos_label=2)

    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("unique_first", [True, False])
def test_label_binarizer_pandas_nullable(dtype, unique_first):
    """Checks that LabelBinarizer works with pandas nullable dtypes.

    Non-regression test for gh-25637.
    """
    pd = pytest.importorskip("pandas")

    y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
    if unique_first:
        # Calling unique creates a pandas array which has a different interface
        # compared to a pandas Series. Specifically, pandas arrays do not have "iloc".
        y_true = y_true.unique()
    lb = LabelBinarizer().fit(y_true)
    y_out = lb.transform([1, 0])

    assert_array_equal(y_out, [[1], [0]])
|
||||
|
||||
|
||||
def test_label_binarizer_errors():
    """Check that invalid arguments and inputs raise informative ValueErrors."""
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    # Legacy sequence-of-sequences multilabel format is rejected.
    multi_label = [(2, 3), (0,), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)

    # Using an unfitted binarizer raises a NotFittedError (a ValueError).
    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])

    # neg_label must be strictly less than pos_label.
    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Sparse output requires pos_label != 0 and neg_label == 0.
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_label_binarizer_sparse_errors(csr_container):
    """Check _inverse_binarize_thresholding errors on sparse input."""
    # Fail on y_type
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Fail on the number of classes
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_container([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Test LabelEncoder's transform, fit_transform and inverse_transform
    methods, and that unseen labels raise at transform time."""
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
    """Negative integer labels are sorted and encoded like any others."""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(
        le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1]
    )
    # 6 was never seen during fit.
    with pytest.raises(ValueError):
        le.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """A scalar string passed to transform must raise a 1d-array error."""
    le = LabelEncoder()
    le.fit(np.array(["apple", "orange"], dtype=dtype))
    msg = "should be a 1d array"
    with pytest.raises(ValueError, match=msg):
        le.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
    """Check that invalid arguments yield ValueError."""
    # Unfitted encoder cannot transform or inverse_transform.
    le = LabelEncoder()
    with pytest.raises(ValueError):
        le.transform([])
    with pytest.raises(ValueError):
        le.inverse_transform([])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2, -3, -4])

    # Fail on inverse_transform("")
    msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """Transforming an empty array must return an empty array, not error."""
    le = LabelEncoder()
    le.fit(values)
    # test empty transform
    transformed = le.transform([])
    assert_array_equal(np.array([]), transformed)
    # test empty inverse transform
    inverse_transformed = le.inverse_transform([])
    assert_array_equal(np.array([]), inverse_transformed)
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
    """Check MultiLabelBinarizer round-trips for several iterable input
    types, with both sparse and dense output."""
    # test input as iterable of iterables; use factories so single-use
    # iterators can be recreated for each run
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    inverse = inputs[0]()
    for sparse_output in [True, False]:
        for inp in inputs:
            # With fit_transform
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit_transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse

            # With fit
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            got = mlb.fit(inp()).transform(inp())
            assert issparse(got) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert got.indices.dtype == got.indptr.dtype
                got = got.toarray()
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_output_multilabel_binarizer_errors(csr_container):
    """inverse_transform must reject sparse matrices with non-binary values."""
    inp = iter([iter((2, 3)), iter((1,)), {1, 2}])
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(inp)
    with pytest.raises(ValueError):
        mlb.inverse_transform(
            csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
        )
|
||||
|
||||
|
||||
def test_multilabel_binarizer():
    """Check MultiLabelBinarizer fit/transform round-trips for lists,
    sets and one-shot iterators of labels."""
    # test input as iterable of iterables; factories recreate iterators
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = inputs[0]()
    for inp in inputs:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        got = mlb.fit_transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse

        # With fit
        mlb = MultiLabelBinarizer()
        got = mlb.fit(inp()).transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
|
||||
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels must yield an all-zero indicator row."""
    mlb = MultiLabelBinarizer()
    y = [[1, 2], [1], []]
    Y = np.array([[1, 1], [1, 0], [0, 0]])
    assert_array_equal(mlb.fit_transform(y), Y)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_unknown_class():
    """Labels unseen at fit time warn and are ignored by transform."""
    mlb = MultiLabelBinarizer()
    y = [[1, 2]]
    Y = np.array([[1, 0], [0, 1]])
    warning_message = "unknown class.* will be ignored"
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    # Bug fix: the first transform's result was computed but never checked.
    # 4 and 0 are unknown (fitted classes are [1, 2]) and must be dropped.
    assert_array_equal(matrix, Y)

    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_given_classes():
    """An explicit `classes` list fixes both column order and class set."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    assert_array_equal(
        mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
    )
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    err_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_multiple_calls():
    """Changing the `classes` parameter between fits must be honored."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])

    indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # first call
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    # second call change class
    mlb.classes = [1, 2, 3]
    assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_same_length_sequence():
    """Ensure sequences of the same length are not interpreted as a 2-d array."""
    inp = [[1], [0], [2]]
    indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

    # fit().transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_integer_labels():
    """String and tuple labels binarize like integers; unhashable labels
    (dicts) must raise TypeError."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    inputs = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    for inp, classes in inputs:
        # fit_transform()
        mlb = MultiLabelBinarizer()
        inp = np.array(inp, dtype=object)
        assert_array_equal(mlb.fit_transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)

        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(indicator_mat_inv, inp)

    # dicts are not hashable labels
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample collapse to a single indicator."""
    inp = [(1, 1, 1, 0)]
    indicator_mat = np.array([[1, 1]])
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_inverse_validation():
    """inverse_transform must validate both values (binary only) and shape."""
    inp = [(1, 1, 1, 0)]
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(inp)
    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))
    # The following binary cases are fine, however
    mlb.inverse_transform(np.array([[0, 0]]))
    mlb.inverse_transform(np.array([[1, 1]]))
    mlb.inverse_transform(np.array([[1, 0]]))

    # Wrong shape
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 1, 1]]))
|
||||
|
||||
|
||||
def test_label_binarize_with_class_order():
    """Column order of the output follows the given `classes` order."""
    out = label_binarize([1, 6], classes=[1, 2, 4, 6])
    expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
    assert_array_equal(out, expected)

    # Modified class order
    out = label_binarize([1, 6], classes=[1, 6, 4, 2])
    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
    assert_array_equal(out, expected)

    out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
    assert_array_equal(out, expected)
|
||||
|
||||
|
||||
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Shared helper: validate both `label_binarize` and `LabelBinarizer`
    against `expected`, for dense and sparse output, including inverses.

    Invalid sparse configurations (pos_label == 0 or neg_label != 0) are
    expected to raise ValueError instead.
    """
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue

        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            # Threshold midway between the two label values.
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
|
||||
|
||||
|
||||
def test_label_binarize_binary():
    """Binary targets: one output column, with custom pos/neg labels."""
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 2
    neg_label = -1
    expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Binary case where sparse_output = True will not result in a ValueError
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 3
    neg_label = 0
    expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)
|
||||
|
||||
|
||||
def test_label_binarize_multiclass():
    """Multiclass targets: one-vs-all columns scaled by pos_label."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Sparse output with a non-zero neg_label is unsupported.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arr_type",
    [np.array]
    + COO_CONTAINERS
    + CSC_CONTAINERS
    + CSR_CONTAINERS
    + DOK_CONTAINERS
    + LIL_CONTAINERS,
)
def test_label_binarize_multilabel(arr_type):
    """Multilabel indicator input: every dense/sparse container works."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y = arr_type(y_ind)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Sparse output with a non-zero neg_label is unsupported.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
def test_invalid_input_label_binarize():
    """label_binarize must reject invalid label configs and target types."""
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inverse_binarize_multiclass(csr_container):
    """Argmax-based inversion works on sparse input with negative scores."""
    got = _inverse_binarize_multiclass(
        csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
    )
    assert_array_equal(got, np.array([1, 1, 0]))
|
||||
|
||||
|
||||
def test_nan_label_encoder():
    """Check that label encoder encodes nans in transform.

    Non-regression test for #22628.
    """
    le = LabelEncoder()
    le.fit(["a", "a", "b", np.nan])

    # NaN sorts after the string classes, so it gets code 2.
    y_trans = le.transform([np.nan])
    assert_array_equal(y_trans, [2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
)
def test_label_encoders_do_not_have_set_output(encoder):
    """Check that label encoders do not define set_output and work with y as a kwarg.

    Non-regression test for #26854.
    """
    assert not hasattr(encoder, "set_output")
    y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
    y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
    assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_namespace, device, dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "y",
    [
        np.array([2, 1, 3, 1, 3]),
        np.array([1, 1, 4, 5, -1, 0]),
        np.array([3, 5, 9, 5, 9, 3]),
    ],
)
def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype):
    """LabelEncoder under array_api_dispatch must stay in the input's
    namespace and agree numerically with the NumPy code path."""
    xp = _array_api_for_tests(array_namespace, device)
    xp_y = xp.asarray(y, device=device)
    with config_context(array_api_dispatch=True):
        # fit / transform / inverse_transform path
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_label = xp_label.fit(xp_y)
        xp_transformed = xp_label.transform(xp_y)
        xp_inv_transformed = xp_label.inverse_transform(xp_transformed)
        np_label = np_label.fit(y)
        np_transformed = np_label.transform(y)
        # Outputs and fitted attributes stay in the array namespace.
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)

        # fit_transform path
        xp_label = LabelEncoder()
        np_label = LabelEncoder()
        xp_transformed = xp_label.fit_transform(xp_y)
        np_transformed = np_label.fit_transform(y)
        assert get_namespace(xp_transformed)[0].__name__ == xp.__name__
        assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__
        assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed)
        assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,714 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.model_selection import (
|
||||
KFold,
|
||||
ShuffleSplit,
|
||||
StratifiedKFold,
|
||||
cross_val_score,
|
||||
train_test_split,
|
||||
)
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import (
|
||||
KBinsDiscretizer,
|
||||
LabelBinarizer,
|
||||
LabelEncoder,
|
||||
TargetEncoder,
|
||||
)
|
||||
|
||||
|
||||
def _encode_target(X_ordinal, y_numeric, n_categories, smooth):
|
||||
"""Simple Python implementation of target encoding."""
|
||||
cur_encodings = np.zeros(n_categories, dtype=np.float64)
|
||||
y_mean = np.mean(y_numeric)
|
||||
|
||||
if smooth == "auto":
|
||||
y_variance = np.var(y_numeric)
|
||||
for c in range(n_categories):
|
||||
y_subset = y_numeric[X_ordinal == c]
|
||||
n_i = y_subset.shape[0]
|
||||
|
||||
if n_i == 0:
|
||||
cur_encodings[c] = y_mean
|
||||
continue
|
||||
|
||||
y_subset_variance = np.var(y_subset)
|
||||
m = y_subset_variance / y_variance
|
||||
lambda_ = n_i / (n_i + m)
|
||||
|
||||
cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean
|
||||
return cur_encodings
|
||||
else: # float
|
||||
for c in range(n_categories):
|
||||
y_subset = y_numeric[X_ordinal == c]
|
||||
current_sum = np.sum(y_subset) + y_mean * smooth
|
||||
current_cnt = y_subset.shape[0] + smooth
|
||||
cur_encodings[c] = current_sum / current_cnt
|
||||
return cur_encodings
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "categories, unknown_value",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], 4),
        ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0),
        ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"),
        ("auto", 3),
    ],
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
    """Check encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    """

    n_categories = 3
    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_samples = X_train_int_array.shape[0]

    if categories == "auto":
        X_train = X_train_int_array
        X_test = X_test_int_array
    else:
        X_train = categories[0][X_train_int_array]
        X_test = categories[0][X_test_int_array]

    # Append a value never seen during fit to exercise the unknown path.
    X_test = np.concatenate((X_test, [[unknown_value]]))

    data_rng = np.random.RandomState(global_random_seed)
    n_splits = 3
    if target_type == "binary":
        y_numeric = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_numeric]

    else:
        assert target_type == "continuous"
        y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_numeric

    shuffled_idx = data_rng.permutation(n_samples)
    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_numeric = y_numeric[shuffled_idx]

    # Define our CV splitting strategy
    if target_type == "binary":
        cv = StratifiedKFold(
            n_splits=n_splits, random_state=global_random_seed, shuffle=True
        )
    else:
        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)

    # Compute the expected values using our reference Python implementation of
    # target encoding:
    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
        X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
            X_train_int_array[test_idx, 0]
        ]

    # Check that we can obtain the same encodings by calling `fit_transform` on
    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
        smooth=smooth,
        categories=categories,
        cv=n_splits,
        random_state=global_random_seed,
    )

    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == target_type
    assert_allclose(X_fit_transform, expected_X_fit_transform)
    assert len(target_encoder.encodings_) == 1
    if target_type == "binary":
        assert_array_equal(target_encoder.classes_, target_names)
    else:
        assert target_encoder.classes_ is None

    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_numeric)
    expected_encodings = _encode_target(
        X_train_int_array[:, 0], y_numeric, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)

    # Transform on test data, the last value is unknown so it is encoded as the target
    # mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
    ).reshape(-1, 1)

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "categories, unknown_values",
    [
        ([np.array([0, 1, 2], dtype=np.int64)], "auto"),
        ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]),
    ],
)
@pytest.mark.parametrize(
    "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])]
)
@pytest.mark.parametrize("smooth", [5.0, "auto"])
def test_encoding_multiclass(
    global_random_seed, categories, unknown_values, target_labels, smooth
):
    """Check encoding for multiclass targets.

    The expected encodings are recomputed with the reference Python
    implementation `_encode_target`, per CV fold, per feature and per class,
    and compared against the estimator's `fit_transform` and `transform`
    outputs. Unknown categories at transform time must be encoded as the
    per-class target mean.
    """
    rng = np.random.RandomState(global_random_seed)

    n_samples = 80
    n_features = 2
    # Two integer-coded features with cardinalities 2 and 3; the readable
    # feature values are looked up from the parametrized `categories`.
    feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples))
    feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples))
    feat_1 = categories[0][feat_1_int]
    feat_2 = categories[0][feat_2_int]
    X_train = np.column_stack((feat_1, feat_2))
    X_train_int = np.column_stack((feat_1_int, feat_2_int))
    categories_ = [[0, 1], [0, 1, 2]]

    n_classes = 3
    y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples))
    y_train = target_labels[y_train_int]
    # One-vs-rest binarized target of shape (n_samples, n_classes): the
    # multiclass encoding is computed class-by-class on these columns.
    y_train_enc = LabelBinarizer().fit_transform(y_train)

    n_splits = 3
    cv = StratifiedKFold(
        n_splits=n_splits, random_state=global_random_seed, shuffle=True
    )

    # Manually compute encodings for cv splits to validate `fit_transform`
    expected_X_fit_transform = np.empty(
        (X_train_int.shape[0], X_train_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            for train_idx, test_idx in cv.split(X_train, y_train):
                y_class = y_train_enc[:, c_idx]
                X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx]
                current_encoding = _encode_target(X_, y_, len(cats), smooth)
                # Output columns are ordered feature-major, class-minor:
                # f_idx: 0, 0, 0, 1, 1, 1
                # c_idx: 0, 1, 2, 0, 1, 2
                # exp_idx: 0, 1, 2, 3, 4, 5
                exp_idx = c_idx + (f_idx * n_classes)
                expected_X_fit_transform[test_idx, exp_idx] = current_encoding[
                    X_train_int[test_idx, f_idx]
                ]

    target_encoder = TargetEncoder(
        smooth=smooth,
        cv=n_splits,
        random_state=global_random_seed,
    )
    X_fit_transform = target_encoder.fit_transform(X_train, y_train)

    assert target_encoder.target_type_ == "multiclass"
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    # Manually compute encoding to validate `transform` (fit on all data,
    # no CV splitting involved here)
    expected_encodings = []
    for f_idx, cats in enumerate(categories_):
        for c_idx in range(n_classes):
            y_class = y_train_enc[:, c_idx]
            current_encoding = _encode_target(
                X_train_int[:, f_idx], y_class, len(cats), smooth
            )
            expected_encodings.append(current_encoding)

    assert len(target_encoder.encodings_) == n_features * n_classes
    for i in range(n_features * n_classes):
        assert_allclose(target_encoder.encodings_[i], expected_encodings[i])
    assert_array_equal(target_encoder.classes_, target_labels)

    # Include unknown values at the end
    X_test_int = np.array([[0, 1], [1, 2], [4, 5]])
    if unknown_values == "auto":
        X_test = X_test_int
    else:
        # Map the known rows back to the readable category values, then
        # append a row of never-seen values.
        X_test = np.empty_like(X_test_int[:-1, :], dtype=object)
        for column_idx in range(X_test_int.shape[1]):
            X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]]
        # Add unknown values at end
        X_test = np.vstack((X_test, unknown_values))

    # Per-class target mean used as the fallback encoding for unknowns.
    y_mean = np.mean(y_train_enc, axis=0)
    expected_X_test_transform = np.empty(
        (X_test_int.shape[0], X_test_int.shape[1] * n_classes),
        dtype=np.float64,
    )
    n_rows = X_test_int.shape[0]
    f_idx = [0, 0, 0, 1, 1, 1]
    # Last row are unknowns, dealt with later
    for row_idx in range(n_rows - 1):
        for i, enc in enumerate(expected_encodings):
            expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]]

    # Unknowns encoded as target mean for each class
    # `y_mean` contains target mean for each class, thus cycle through mean of
    # each class, `n_features` times
    mean_idx = [0, 1, 2, 0, 1, 2]
    for i in range(n_classes * n_features):
        expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]]

    X_test_transform = target_encoder.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, categories",
    [
        (
            np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T,  # 3 is unknown
            [[0, 1, 2]],
        ),
        (
            np.array(
                [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object
            ).T,  # snake is unknown
            [["dog", "cat", "cow"]],
        ),
    ],
)
@pytest.mark.parametrize("smooth", [4.0, "auto"])
def test_custom_categories(X, categories, smooth):
    """Custom categories with unknown categories that are not in training data."""
    rng = np.random.RandomState(0)
    y = rng.uniform(low=-10, high=20, size=X.shape[0])
    encoder = TargetEncoder(categories=categories, smooth=smooth, random_state=0)
    encoder.fit(X, y)

    # The final sample carries a value not listed in `categories`, so it must
    # be encoded as the global target mean.
    target_mean = y.mean()
    transformed = encoder.transform(X[-1:])
    assert transformed[0, 0] == pytest.approx(target_mean)

    assert len(encoder.encodings_) == 1
    # A user-declared category that never occurred during training is likewise
    # encoded as the global target mean.
    assert encoder.encodings_[0][-1] == pytest.approx(target_mean)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, msg",
    [
        ([1, 2, 0, 1], "Found input variables with inconsistent"),
        (
            np.array([[1, 2, 0], [1, 2, 3]]).T,
            "Target type was inferred to be 'multiclass-multioutput'",
        ),
    ],
)
def test_errors(y, msg):
    """Check that invalid inputs raise a ValueError with an informative message."""
    X = np.array([[1, 0, 1]]).T

    encoder = TargetEncoder()
    # Length mismatch and unsupported target types must both be rejected.
    with pytest.raises(ValueError, match=msg):
        encoder.fit_transform(X, y)
|
||||
|
||||
|
||||
def test_use_regression_target():
    """Check inferred and specified `target_type` on regression target."""
    X = np.array([[0, 1, 0, 1, 0, 1]]).T
    y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0])

    # Without an explicit `target_type`, this float target with few unique
    # values is inferred as multiclass; the stratified CV then warns because
    # some inferred classes have a single member.
    encoder = TargetEncoder(cv=2)
    expected_warning = re.escape(
        "The least populated class in y has only 1 members, which is less than"
        " n_splits=2."
    )
    with pytest.warns(UserWarning, match=expected_warning):
        encoder.fit_transform(X, y)
    assert encoder.target_type_ == "multiclass"

    # Passing `target_type="continuous"` bypasses the type inference.
    encoder = TargetEncoder(cv=2, target_type="continuous")
    encoder.fit_transform(X, y)
    assert encoder.target_type_ == "continuous"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, feature_names",
    [
        ([1, 2] * 10, ["A", "B"]),
        ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]),
        (
            ["y1", "y2", "y3"] * 6 + ["y1", "y2"],
            ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"],
        ),
    ],
)
def test_feature_names_out_set_output(y, feature_names):
    """Check TargetEncoder works with set_output."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10})

    def build_encoder(output_container):
        # Two identically-configured encoders, differing only in the
        # requested output container.
        encoder = TargetEncoder(cv=2, smooth=3.0, random_state=0)
        encoder.set_output(transform=output_container)
        return encoder

    enc_default = build_encoder("default")
    enc_pandas = build_encoder("pandas")

    X_default = enc_default.fit_transform(X_df, y)
    X_pandas = enc_pandas.fit_transform(X_df, y)

    # Values are identical regardless of the output container, and the pandas
    # column names agree with `get_feature_names_out`.
    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(enc_pandas.get_feature_names_out(), feature_names)
    assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("to_pandas", [True, False])
@pytest.mark.parametrize("smooth", [1.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"])
def test_multiple_features_quick(to_pandas, smooth, target_type):
    """Check target encoder with multiple features.

    A small, hand-sized dataset with two categorical features is encoded both
    by the estimator and by the reference implementation `_encode_target`
    (per feature, per CV fold), for binary and continuous targets, with and
    without a pandas input container.
    """
    X_ordinal = np.array(
        [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64
    )
    # `y_integer` is the numeric view of the target that the reference
    # implementation consumes; for binary targets it is the label-encoded y.
    if target_type == "binary-str":
        y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    elif target_type == "binary-ints":
        y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4])
        y_integer = LabelEncoder().fit_transform(y_train)
        cv = StratifiedKFold(2, random_state=0, shuffle=True)
    else:
        y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32)
        y_integer = y_train
        cv = KFold(2, random_state=0, shuffle=True)
    y_mean = np.mean(y_integer)
    categories = [[0, 1, 2], [0, 1]]

    X_test = np.array(
        [
            [0, 1],
            [3, 0],  # 3 is unknown
            [1, 10],  # 10 is unknown
        ],
        dtype=np.int64,
    )

    if to_pandas:
        pd = pytest.importorskip("pandas")
        # convert second feature to an object
        X_train = pd.DataFrame(
            {
                "feat0": X_ordinal[:, 0],
                "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]],
            }
        )
        # "snake" is unknown
        X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]})
    else:
        X_train = X_ordinal

    # manually compute encoding for fit_transform: each fold's rows are
    # encoded with the encoding learned on the complementary fold
    expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64)
    for f_idx, cats in enumerate(categories):
        for train_idx, test_idx in cv.split(X_ordinal, y_integer):
            X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx]
            current_encoding = _encode_target(X_, y_, len(cats), smooth)
            expected_X_fit_transform[test_idx, f_idx] = current_encoding[
                X_ordinal[test_idx, f_idx]
            ]

    # manually compute encoding for transform (fit on the full data, no CV)
    expected_encodings = []
    for f_idx, cats in enumerate(categories):
        current_encoding = _encode_target(
            X_ordinal[:, f_idx], y_integer, len(cats), smooth
        )
        expected_encodings.append(current_encoding)

    # Unknown categories in X_test fall back to the overall target mean.
    expected_X_test_transform = np.array(
        [
            [expected_encodings[0][0], expected_encodings[1][1]],
            [y_mean, expected_encodings[1][0]],
            [expected_encodings[0][1], y_mean],
        ],
        dtype=np.float64,
    )

    enc = TargetEncoder(smooth=smooth, cv=2, random_state=0)
    X_fit_transform = enc.fit_transform(X_train, y_train)
    assert_allclose(X_fit_transform, expected_X_fit_transform)

    assert len(enc.encodings_) == 2
    for i in range(2):
        assert_allclose(enc.encodings_[i], expected_encodings[i])

    X_test_transform = enc.transform(X_test)
    assert_allclose(X_test_transform, expected_X_test_transform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, y_mean",
    [
        (np.array([3.4] * 20), 3.4),
        (np.array([0] * 20), 0),
        (np.array(["a"] * 20, dtype=object), 0),
    ],
    ids=["continuous", "binary", "binary-string"],
)
@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0])
def test_constant_target_and_feature(y, y_mean, smooth):
    """Check edge case where feature and target is constant."""
    X = np.array([[1] * 20]).T
    n_samples = X.shape[0]

    encoder = TargetEncoder(cv=2, smooth=smooth, random_state=0)
    X_transformed = encoder.fit_transform(X, y)

    # Every training sample is encoded as the (constant) target mean.
    assert_allclose(X_transformed, np.repeat([[y_mean]], n_samples, axis=0))
    assert encoder.encodings_[0][0] == pytest.approx(y_mean)
    assert encoder.target_mean_ == pytest.approx(y_mean)

    # Both the seen category (1) and an unseen one (0) map to the mean.
    X_new = np.array([[1], [0]])
    assert_allclose(encoder.transform(X_new), np.repeat([[y_mean]], 2, axis=0))
|
||||
|
||||
|
||||
def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not(
    global_random_seed,
):
    """Check that the internal CV shuffling prevents target leakage.

    The categorical feature is independent of the target, but the rows are
    sorted by target value so that a non-shuffled internal CV would leak
    y-ordering into the per-fold encodings.
    """
    cardinality = 30  # not too large, otherwise we need a very large n_samples
    n_samples = 3000
    rng = np.random.RandomState(global_random_seed)
    y_train = rng.normal(size=n_samples)
    X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1)

    # Sort by y_train to attempt to cause a leak
    y_sorted_indices = y_train.argsort()
    y_train = y_train[y_sorted_indices]
    X_train = X_train[y_sorted_indices]

    target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed)
    X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train)

    target_encoder = TargetEncoder(shuffle=False)
    X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train)

    # Check that no information about y_train has leaked into X_train:
    regressor = RandomForestRegressor(
        n_estimators=10, min_samples_leaf=20, random_state=global_random_seed
    )

    # It's impossible to learn a good predictive model on the training set when
    # using the original representation X_train or the target encoded
    # representation with shuffled inner CV. For the latter, no information
    # about y_train has inadvertently leaked into the prior used to generate
    # `X_encoded_train_shuffled`:
    cv = ShuffleSplit(n_splits=50, random_state=global_random_seed)
    assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1
    assert (
        cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean()
        < 0.1
    )

    # Without the inner CV shuffling, a lot of information about y_train goes
    # into the per-fold y_train.mean() priors: shrinkage is no longer effective
    # in this case and would no longer be able to prevent downstream
    # over-fitting.
    assert (
        cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean()
        > 0.5
    )
|
||||
|
||||
|
||||
def test_smooth_zero():
    """Check edge case with zero smoothing and cv does not contain category."""
    X = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]).T
    y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0])

    encoder = TargetEncoder(smooth=0.0, shuffle=False, cv=2)
    encoded = encoder.fit_transform(X, y)

    # With `shuffle=False` and two folds, the first half of the data contains
    # only category 0 and the second half only category 1. Category 0 is thus
    # absent from the fold used to encode the first rows, so it falls back to
    # the mean of the second half...
    assert_allclose(encoded[0], y[5:].mean())

    # ...and symmetrically, category 1 is absent from the other fold and falls
    # back to the mean of the first half.
    assert_allclose(encoded[-1], y[:5].mean())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    """The encoding must not depend on the integer values of the category labels.

    This is a rather trivial property, but it is helpful to understand the
    following test.
    """
    rng = np.random.RandomState(global_random_seed)

    # Random y with an informative categorical X derived from it makes the
    # test non-trivial when smoothing is used.
    y = rng.normal(size=1000)
    n_categories = 30
    X = KBinsDiscretizer(
        n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal"
    ).fit_transform(y.reshape(-1, 1))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    # Relabel the categories through a random permutation of the integer
    # labels; the encoding should be invariant to this relabeling.
    permutation = rng.permutation(n_categories)
    X_train_permuted = permutation[X_train.astype(np.int32)]
    X_test_permuted = permutation[X_test.astype(np.int32)]

    encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed)
    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_test_encoded = encoder.transform(X_test)

    X_train_permuted_encoded = encoder.fit_transform(X_train_permuted, y_train)
    X_test_permuted_encoded = encoder.transform(X_test_permuted)

    assert_allclose(X_train_encoded, X_train_permuted_encoded)
    assert_allclose(X_test_encoded, X_test_permuted_encoded)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    """Check expected statistical properties of a linear regression model
    fit on target encoded features, depending on their relation with the
    target (informative, shuffled, or near-unique identifier)."""

    # In this test, we use the Ridge class with the "lsqr" solver and a little
    # bit of regularization to implement a linear regression model that
    # converges quickly for large `n_samples` and robustly in case of
    # correlated features. Since we will fit this model on a mean centered
    # target, we do not need to fit an intercept and this will help simplify
    # the analysis with respect to the expected coefficients.
    linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

    # Construct a random target variable. We need a large number of samples for
    # this test to be stable across all values of the random seed.
    n_samples = 50_000
    rng = np.random.RandomState(global_random_seed)
    y = rng.randn(n_samples)

    # Generate a single informative ordinal feature with medium cardinality.
    # Inject some irreducible noise to make it harder for a multivariate model
    # to identify the informative feature from other pure noise features.
    noise = 0.8 * rng.randn(n_samples)
    n_categories = 100
    X_informative = KBinsDiscretizer(
        n_bins=n_categories,
        encode="ordinal",
        strategy="uniform",
        random_state=rng,
    ).fit_transform((y + noise).reshape(-1, 1))

    # Let's permute the labels to hide the fact that this feature is
    # informative to naive linear regression model trained on the raw ordinal
    # values. As highlighted in the previous test, the target encoding should be
    # invariant to such a permutation.
    permutated_labels = rng.permutation(n_categories)
    X_informative = permutated_labels[X_informative.astype(np.int32)]

    # Generate a shuffled copy of the informative feature to destroy the
    # relationship with the target.
    X_shuffled = rng.permutation(X_informative)

    # Also include a very high cardinality categorical feature that is by
    # itself independent of the target variable: target encoding such a feature
    # without internal cross-validation should cause catastrophic overfitting
    # for the downstream regressor, even with shrinkage. This kind of features
    # typically represents near unique identifiers of samples. In general they
    # should be removed from a machine learning datasets but here we want to
    # study the ability of the default behavior of TargetEncoder to mitigate
    # them automatically.
    X_near_unique_categories = rng.choice(
        int(0.9 * n_samples), size=n_samples, replace=True
    ).reshape(-1, 1)

    # Assemble the dataset and do a train-test split:
    X = np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Let's first check that a linear regression model trained on the raw
    # features underfits because of the meaning-less ordinal encoding of the
    # labels.
    raw_model = linear_regression.fit(X_train, y_train)
    assert raw_model.score(X_train, y_train) < 0.1
    assert raw_model.score(X_test, y_test) < 0.1

    # Now do the same with target encoding using the internal CV mechanism
    # implemented when using fit_transform.
    model_with_cv = make_pipeline(
        TargetEncoder(smooth=smooth, random_state=rng), linear_regression
    ).fit(X_train, y_train)

    # This model should be able to fit the data well and also generalise to the
    # test data (assuming that the binning is fine-grained enough). The R2
    # scores are not perfect because of the noise injected during the
    # generation of the unique informative feature.
    coef = model_with_cv[-1].coef_
    assert model_with_cv.score(X_train, y_train) > 0.5, coef
    assert model_with_cv.score(X_test, y_test) > 0.5, coef

    # The target encoder recovers the linear relationship with slope 1 between
    # the target encoded unique informative predictor and the target. Since the
    # target encoding of the 2 other features is not informative thanks to the
    # use of internal cross-validation, the multivariate linear regressor
    # assigns a coef of 1 to the first feature and 0 to the other 2.
    assert coef[0] == pytest.approx(1, abs=1e-2)
    assert (np.abs(coef[1:]) < 0.2).all()

    # Let's now disable the internal cross-validation by calling fit and then
    # transform separately on the training set:
    target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit(
        X_train, y_train
    )
    X_enc_no_cv_train = target_encoder.transform(X_train)
    X_enc_no_cv_test = target_encoder.transform(X_test)
    model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train)

    # The linear regression model should always overfit because it assigns
    # too much weight to the extremely high cardinality feature relatively to
    # the informative feature. Note that this is the case even when using
    # the empirical Bayes smoothing which is not enough to prevent such
    # overfitting alone.
    coef = model_no_cv.coef_
    assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef
    assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef

    # The model overfits because it assigns too much weight to the high
    # cardinality yet non-informative feature instead of the lower
    # cardinality yet informative feature:
    assert abs(coef[0]) < abs(coef[2])
|
||||
|
||||
|
||||
def test_pandas_copy_on_write():
    """
    Test target-encoder cython code when y is read-only.

    With pandas copy-on-write enabled, the numpy array underlying ``df["y"]``
    is read-only. Non-regression test for gh-27879.
    """
    pd = pytest.importorskip("pandas", minversion="2.0")
    with pd.option_context("mode.copy_on_write", True):
        frame = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]})
        encoder = TargetEncoder(target_type="continuous")
        # Fitting must not fail on the read-only target column.
        encoder.fit(frame[["x"]], frame["y"])
|
||||
Reference in New Issue
Block a user