add read me

This commit is contained in:
2026-01-09 10:28:44 +11:00
commit edaf914b73
13417 changed files with 2952119 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
"""Semi-supervised learning algorithms.
These algorithms utilize small amounts of labeled data and large amounts of unlabeled
data for classification tasks.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from ._label_propagation import LabelPropagation, LabelSpreading
from ._self_training import SelfTrainingClassifier
__all__ = ["LabelPropagation", "LabelSpreading", "SelfTrainingClassifier"]

View File

@@ -0,0 +1,630 @@
# coding=utf8
"""
Label propagation in the context of this module refers to a set of
semi-supervised classification algorithms. At a high level, these algorithms
work by forming a fully-connected graph between all points given and solving
for the steady-state distribution of labels at each point.
These algorithms perform very well in practice. Running them can be very
expensive, at approximately O(N^3) where N is the number of (labeled and
unlabeled) points. The theory (why they perform so well) is motivated by
intuitions from random walk algorithms and geometric relationships in the data.
For more information see the references below.
Model Features
--------------
Label clamping:
The algorithm tries to learn distributions of labels over the dataset given
label assignments over an initial subset. In one variant, the algorithm does
not allow for any errors in the initial assignment (hard-clamping) while
in another variant, the algorithm allows for some wiggle room for the initial
assignments, allowing them to change by a fraction alpha in each iteration
(soft-clamping).
Kernel:
A function which projects a vector into some higher dimensional space. This
implementation supports RBF and KNN kernels. Using the RBF kernel generates
a dense matrix of size O(N^2). KNN kernel will generate a sparse matrix of
size O(k*N) which will run much faster. See the documentation for SVMs for
more info on kernels.
Examples
--------
>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.semi_supervised import LabelPropagation
>>> label_prop_model = LabelPropagation()
>>> iris = datasets.load_iris()
>>> rng = np.random.RandomState(42)
>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
>>> labels = np.copy(iris.target)
>>> labels[random_unlabeled_points] = -1
>>> label_prop_model.fit(iris.data, labels)
LabelPropagation(...)
Notes
-----
References:
[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised
Learning (2006), pp. 193-216
[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient
Non-Parametric Function Induction in Semi-Supervised Learning. AISTATS 2005
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from abc import ABCMeta, abstractmethod
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from ..base import BaseEstimator, ClassifierMixin, _fit_context
from ..exceptions import ConvergenceWarning
from ..metrics.pairwise import rbf_kernel
from ..neighbors import NearestNeighbors
from ..utils._param_validation import Interval, StrOptions
from ..utils.extmath import safe_sparse_dot
from ..utils.fixes import laplacian as csgraph_laplacian
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted, validate_data
class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for label propagation module.
Parameters
----------
kernel : {'knn', 'rbf'} or callable, default='rbf'
String identifier for kernel function to use or the kernel function
itself. Only 'rbf' and 'knn' strings are valid inputs. The function
passed should take two inputs, each of shape (n_samples, n_features),
and return a (n_samples, n_samples) shaped weight matrix.
gamma : float, default=20
Parameter for rbf kernel.
n_neighbors : int, default=7
Parameter for knn kernel. Needs to be strictly positive.
alpha : float, default=1.0
Clamping factor.
max_iter : int, default=30
Maximum number of iterations allowed.
tol : float, default=1e-3
Convergence tolerance: threshold to consider the system at steady
state.
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
"""
# Declarative constraints on the constructor parameters; every key is the
# name of a parameter accepted by `__init__`.
# NOTE(review): presumably enforced through the `_fit_context` decorator
# applied to `fit` — confirm.
_parameter_constraints: dict = {
"kernel": [StrOptions({"knn", "rbf"}), callable],
"gamma": [Interval(Real, 0, None, closed="left")],
"n_neighbors": [Interval(Integral, 0, None, closed="neither")],
# `None` is accepted because `LabelPropagation.__init__` passes `alpha=None`.
"alpha": [None, Interval(Real, 0, 1, closed="neither")],
"max_iter": [Interval(Integral, 0, None, closed="neither")],
"tol": [Interval(Real, 0, None, closed="left")],
"n_jobs": [None, Integral],
}
def __init__(
    self,
    kernel="rbf",
    *,
    gamma=20,
    n_neighbors=7,
    alpha=1,
    max_iter=30,
    tol=1e-3,
    n_jobs=None,
):
    """Store the hyper-parameters unchanged; nothing is validated here."""
    # Convergence controls.
    self.max_iter, self.tol = max_iter, tol
    # Kernel choice and the parameters of the built-in kernels.
    self.kernel, self.gamma, self.n_neighbors = kernel, gamma, n_neighbors
    # Clamping factor.
    self.alpha = alpha
    self.n_jobs = n_jobs
def _get_kernel(self, X, y=None):
    """Evaluate the configured kernel.

    Parameters
    ----------
    X : array-like or sparse matrix
        First operand. For the ``"knn"`` kernel it is also the data the
        neighbors model is fitted on when ``self.nn_fit`` is ``None``.
    y : array-like or sparse matrix, default=None
        Second operand (samples, not labels). When ``None`` the kernel of
        `X` with itself is evaluated.

    Returns
    -------
    kernel_values : object
        - ``"rbf"``: the result of ``rbf_kernel`` with ``gamma=self.gamma``.
        - ``"knn"``: the connectivity graph of the fitted samples when `y`
          is ``None``, otherwise the neighbor indices of every row of `y`.
        - callable: whatever ``self.kernel`` returns.
        - ``None`` for any other value of ``self.kernel``.
    """
    kernel = self.kernel

    if kernel == "rbf":
        other = X if y is None else y
        return rbf_kernel(X, other, gamma=self.gamma)

    if kernel == "knn":
        # Fit the neighbors model lazily and cache it on the instance.
        if self.nn_fit is None:
            self.nn_fit = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs
            ).fit(X)
        if y is not None:
            return self.nn_fit.kneighbors(y, return_distance=False)
        return self.nn_fit.kneighbors_graph(
            self.nn_fit._fit_X, self.n_neighbors, mode="connectivity"
        )

    if callable(kernel):
        other = X if y is None else y
        return kernel(X, other)

    return None
@abstractmethod
def _build_graph(self):
    """Return the graph matrix consumed by `fit`; subclasses must override."""
    message = (
        "Graph construction must be implemented to fit a label propagation model."
    )
    raise NotImplementedError(message)
def predict(self, X):
    """Perform inductive inference across the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix.

    Returns
    -------
    y : ndarray of shape (n_samples,)
        For every sample, the entry of `classes_` whose probability
        returned by `predict_proba` is the largest.
    """
    # Note: `predict` does not accept semi-supervised labels as input, so
    # `fit(X, y).predict(X) != fit(X, y).transduction_` and `fit_predict`
    # is deliberately not implemented.
    # See https://github.com/scikit-learn/scikit-learn/pull/24898
    class_probabilities = self.predict_proba(X)
    best_class_index = np.argmax(class_probabilities, axis=1)
    predicted_labels = self.classes_[best_class_index]
    return predicted_labels.ravel()
def predict_proba(self, X):
    """Predict probability for each possible outcome.

    Compute the probability estimates for each single sample in X
    and each possible outcome seen during training (categorical
    distribution).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix.

    Returns
    -------
    probabilities : ndarray of shape (n_samples, n_classes)
        Normalized probability distributions across class labels. A row
        whose unnormalized weights sum to zero is returned as all zeros
        instead of NaN.
    """
    check_is_fitted(self)

    X_2d = validate_data(
        self,
        X,
        accept_sparse=["csc", "csr", "coo", "dok", "bsr", "lil", "dia"],
        reset=False,
    )
    weight_matrices = self._get_kernel(self.X_, X_2d)
    if self.kernel == "knn":
        # For the knn kernel each entry holds the training-sample indices of
        # one query point; sum the label distributions of those neighbors.
        probabilities = np.array(
            [
                np.sum(self.label_distributions_[weight_matrix], axis=0)
                for weight_matrix in weight_matrices
            ]
        )
    else:
        # Kernel values are laid out (training samples, query samples);
        # transpose so that every row corresponds to one query sample.
        weight_matrices = weight_matrices.T
        probabilities = safe_sparse_dot(weight_matrices, self.label_distributions_)
    normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
    # Avoid 0/0 -> NaN for rows without any weight (e.g. kernel values that
    # underflow to zero); this mirrors the guard used when normalizing in
    # `fit`.
    normalizer[normalizer == 0] = 1
    probabilities /= normalizer
    return probabilities
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
    """Fit a semi-supervised label propagation model to X.

    The input samples (labeled and unlabeled) are provided by matrix X,
    and target labels are provided by matrix y. We conventionally apply the
    label -1 to unlabeled samples in matrix y in a semi-supervised
    classification.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data, where `n_samples` is the number of samples
        and `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target class values with unlabeled points marked as -1.
        All unlabeled samples will be transductively assigned labels
        internally, which are stored in `transduction_`.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    X, y = validate_data(
        self,
        X,
        y,
        accept_sparse=["csr", "csc"],
        reset=True,
    )
    self.X_ = X
    check_classification_targets(y)

    # Graph construction is delegated to the concrete subclass.
    graph_matrix = self._build_graph()

    # The label -1 marks unlabeled samples and is not a class of its own.
    classes = np.unique(y)
    classes = classes[classes != -1]
    self.classes_ = classes

    n_samples, n_classes = len(y), len(classes)

    y = np.asarray(y)
    unlabeled = y == -1

    # One-hot rows for labeled samples, all-zero rows for unlabeled ones.
    self.label_distributions_ = np.zeros((n_samples, n_classes))
    for label in classes:
        self.label_distributions_[y == label, classes == label] = 1

    is_propagation = self._variant == "propagation"

    y_static = np.copy(self.label_distributions_)
    if is_propagation:
        # LabelPropagation
        y_static[unlabeled] = 0
    else:
        # LabelSpreading
        y_static *= 1 - self.alpha

    previous_distributions = np.zeros((self.X_.shape[0], n_classes))

    # Column-shaped mask so it broadcasts across the class axis.
    unlabeled_column = unlabeled[:, np.newaxis]
    if sparse.issparse(graph_matrix):
        graph_matrix = graph_matrix.tocsr()

    # The loop variable is the attribute itself, so `n_iter_` always holds
    # the index of the last iteration that was started.
    for self.n_iter_ in range(self.max_iter):
        change = np.abs(self.label_distributions_ - previous_distributions).sum()
        if change < self.tol:
            break

        previous_distributions = self.label_distributions_
        self.label_distributions_ = safe_sparse_dot(
            graph_matrix, self.label_distributions_
        )

        if is_propagation:
            row_sums = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
            row_sums[row_sums == 0] = 1
            self.label_distributions_ /= row_sums
            # Keep the propagated values for unlabeled rows only; all other
            # rows are reset to their initial distribution.
            self.label_distributions_ = np.where(
                unlabeled_column, self.label_distributions_, y_static
            )
        else:
            # clamp
            self.label_distributions_ = (
                np.multiply(self.alpha, self.label_distributions_) + y_static
            )
    else:
        # Reached only when the loop was never left through `break`.
        warnings.warn(
            "max_iter=%d was reached without convergence." % self.max_iter,
            category=ConvergenceWarning,
        )
        self.n_iter_ += 1

    # Final row normalization; all-zero rows are left untouched.
    row_sums = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
    row_sums[row_sums == 0] = 1
    self.label_distributions_ /= row_sums

    # set the transduction item
    most_likely = np.argmax(self.label_distributions_, axis=1)
    self.transduction_ = self.classes_[most_likely].ravel()
    return self
def __sklearn_tags__(self):
    """Return the inherited tags with sparse-input support switched on."""
    inherited_tags = super().__sklearn_tags__()
    inherited_tags.input_tags.sparse = True
    return inherited_tags
class LabelPropagation(BaseLabelPropagation):
"""Label Propagation classifier.
Read more in the :ref:`User Guide <label_propagation>`.
Parameters
----------
kernel : {'knn', 'rbf'} or callable, default='rbf'
String identifier for kernel function to use or the kernel function
itself. Only 'rbf' and 'knn' strings are valid inputs. The function
passed should take two inputs, each of shape (n_samples, n_features),
and return a (n_samples, n_samples) shaped weight matrix.
gamma : float, default=20
Parameter for rbf kernel.
n_neighbors : int, default=7
Parameter for knn kernel which needs to be strictly positive.
max_iter : int, default=1000
Maximum number of iterations allowed.
tol : float, default=1e-3
Convergence tolerance: threshold to consider the system at steady
state.
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
X_ : {array-like, sparse matrix} of shape (n_samples, n_features)
Input array.
classes_ : ndarray of shape (n_classes,)
The distinct labels used in classifying instances.
label_distributions_ : ndarray of shape (n_samples, n_classes)
Categorical distribution for each item.
transduction_ : ndarray of shape (n_samples,)
Label assigned to each item during :term:`fit`.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
Number of iterations run.
See Also
--------
LabelSpreading : Alternate label propagation strategy more robust to noise.
References
----------
Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data
with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon
University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf
Examples
--------
>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.semi_supervised import LabelPropagation
>>> label_prop_model = LabelPropagation()
>>> iris = datasets.load_iris()
>>> rng = np.random.RandomState(42)
>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
>>> labels = np.copy(iris.target)
>>> labels[random_unlabeled_points] = -1
>>> label_prop_model.fit(iris.data, labels)
LabelPropagation(...)
"""
# Selects the "propagation" branches of `BaseLabelPropagation.fit`.
_variant = "propagation"
# Copy the base-class constraints and drop `alpha`, which is not a
# constructor parameter of this class.
_parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints}
_parameter_constraints.pop("alpha")
def __init__(
    self,
    kernel="rbf",
    *,
    gamma=20,
    n_neighbors=7,
    max_iter=1000,
    tol=1e-3,
    n_jobs=None,
):
    """Forward the hyper-parameters to the base class with ``alpha=None``."""
    # This class exposes no clamping factor, so the base class always
    # receives `alpha=None`.
    super().__init__(
        kernel,
        gamma=gamma,
        n_neighbors=n_neighbors,
        alpha=None,
        max_iter=max_iter,
        tol=tol,
        n_jobs=n_jobs,
    )
def _build_graph(self):
"""Matrix representing a fully connected graph between each sample

This basic implementation creates a non-stochastic affinity matrix, so
class distributions will exceed 1 (normalization may be desired).

Returns
-------
affinity_matrix : ndarray or sparse matrix
The kernel of `self.X_` with itself, rescaled in place using its
column sums.
"""
if self.kernel == "knn":
# Drop any cached neighbors model so `_get_kernel` refits it on `self.X_`.
self.nn_fit = None
affinity_matrix = self._get_kernel(self.X_)
# Column sums of the affinity matrix.
normalizer = affinity_matrix.sum(axis=0)
if sparse.issparse(affinity_matrix):
# NOTE(review): if `normalizer` is 2-D with a single row here (as sums of
# scipy sparse matrices usually are — confirm), `np.diag` returns only its
# first element, so every stored value is divided by one scalar and not by
# a per-row or per-column sum. Verify that this is the intended scaling.
affinity_matrix.data /= np.diag(np.array(normalizer))
else:
# Row i is divided by the i-th column sum.
affinity_matrix /= normalizer[:, np.newaxis]
return affinity_matrix
def fit(self, X, y):
    """Fit a semi-supervised label propagation model to X.

    This override only delegates to the base-class implementation.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data, where `n_samples` is the number of samples
        and `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target class values with unlabeled points marked as -1.
        All unlabeled samples will be transductively assigned labels
        internally, which are stored in `transduction_`.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    fitted = super().fit(X, y)
    return fitted
class LabelSpreading(BaseLabelPropagation):
"""LabelSpreading model for semi-supervised learning.
This model is similar to the basic Label Propagation algorithm,
but uses affinity matrix based on the normalized graph Laplacian
and soft clamping across the labels.
Read more in the :ref:`User Guide <label_propagation>`.
Parameters
----------
kernel : {'knn', 'rbf'} or callable, default='rbf'
String identifier for kernel function to use or the kernel function
itself. Only 'rbf' and 'knn' strings are valid inputs. The function
passed should take two inputs, each of shape (n_samples, n_features),
and return a (n_samples, n_samples) shaped weight matrix.
gamma : float, default=20
Parameter for rbf kernel.
n_neighbors : int, default=7
Parameter for knn kernel which is a strictly positive integer.
alpha : float, default=0.2
Clamping factor. A value in (0, 1) that specifies the relative amount
that an instance should adopt the information from its neighbors as
opposed to its initial label.
alpha=0 means keeping the initial label information; alpha=1 means
replacing all initial information.
max_iter : int, default=30
Maximum number of iterations allowed.
tol : float, default=1e-3
Convergence tolerance: threshold to consider the system at steady
state.
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
X_ : ndarray of shape (n_samples, n_features)
Input array.
classes_ : ndarray of shape (n_classes,)
The distinct labels used in classifying instances.
label_distributions_ : ndarray of shape (n_samples, n_classes)
Categorical distribution for each item.
transduction_ : ndarray of shape (n_samples,)
Label assigned to each item during :term:`fit`.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
Number of iterations run.
See Also
--------
LabelPropagation : Unregularized graph based semi-supervised learning.
References
----------
`Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston,
Bernhard Schoelkopf. Learning with local and global consistency (2004)
<https://citeseerx.ist.psu.edu/doc_view/pid/d74c37aabf2d5cae663007cbd8718175466aea8c>`_
Examples
--------
>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.semi_supervised import LabelSpreading
>>> label_prop_model = LabelSpreading()
>>> iris = datasets.load_iris()
>>> rng = np.random.RandomState(42)
>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
>>> labels = np.copy(iris.target)
>>> labels[random_unlabeled_points] = -1
>>> label_prop_model.fit(iris.data, labels)
LabelSpreading(...)
"""
# Any value other than "propagation" selects the clamping (`else`) branches
# of `BaseLabelPropagation.fit`.
_variant = "spreading"
# Copy the base-class constraints and override `alpha`: unlike in the base
# class, `None` is not accepted here.
_parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints}
_parameter_constraints["alpha"] = [Interval(Real, 0, 1, closed="neither")]
def __init__(
    self,
    kernel="rbf",
    *,
    gamma=20,
    n_neighbors=7,
    alpha=0.2,
    max_iter=30,
    tol=1e-3,
    n_jobs=None,
):
    """Forward every hyper-parameter, including `alpha`, to the base class."""
    # Only the default values differ from the base-class constructor.
    super().__init__(
        kernel,
        gamma=gamma,
        n_neighbors=n_neighbors,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        n_jobs=n_jobs,
    )
def _build_graph(self):
    """Graph matrix for Label Spreading computes the graph laplacian.

    Returns
    -------
    graph : ndarray or sparse matrix
        The negated normalized Laplacian of the affinity matrix of
        `self.X_`, with its diagonal entries set to zero.
    """
    if self.kernel == "knn":
        # Drop any cached neighbors model so `_get_kernel` refits it.
        self.nn_fit = None
    n_samples = self.X_.shape[0]

    # compute affinity matrix (or gram matrix)
    affinity_matrix = self._get_kernel(self.X_)
    graph = -csgraph_laplacian(affinity_matrix, normed=True)

    if not sparse.issparse(graph):
        # In the flattened square matrix the diagonal entries are
        # `n_samples + 1` positions apart.
        diagonal_stride = n_samples + 1
        graph.flat[::diagonal_stride] = 0.0
        return graph

    # Sparse result: blank the stored entries that lie on the diagonal.
    on_diagonal = graph.row == graph.col
    graph.data[on_diagonal] = 0.0
    return graph

View File

@@ -0,0 +1,625 @@
import warnings
from numbers import Integral, Real
from warnings import warn
import numpy as np
from ..base import (
BaseEstimator,
ClassifierMixin,
MetaEstimatorMixin,
_fit_context,
clone,
)
from ..utils import Bunch, get_tags, safe_mask
from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
from ..utils.metadata_routing import (
MetadataRouter,
MethodMapping,
_raise_for_params,
_routing_enabled,
process_routing,
)
from ..utils.metaestimators import available_if
from ..utils.validation import _estimator_has, check_is_fitted, validate_data
__all__ = ["SelfTrainingClassifier"]
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
class SelfTrainingClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
"""Self-training classifier.
This :term:`metaestimator` allows a given supervised classifier to function as a
semi-supervised classifier, allowing it to learn from unlabeled data. It
does this by iteratively predicting pseudo-labels for the unlabeled data
and adding them to the training set.
The classifier will continue iterating until either max_iter is reached, or
no pseudo-labels were added to the training set in the previous iteration.
Read more in the :ref:`User Guide <self_training>`.
Parameters
----------
estimator : estimator object
An estimator object implementing `fit` and `predict_proba`.
Invoking the `fit` method will fit a clone of the passed estimator,
which will be stored in the `estimator_` attribute.
.. versionadded:: 1.6
`estimator` was added to replace `base_estimator`.
base_estimator : estimator object
An estimator object implementing `fit` and `predict_proba`.
Invoking the `fit` method will fit a clone of the passed estimator,
which will be stored in the `estimator_` attribute.
.. deprecated:: 1.6
`base_estimator` was deprecated in 1.6 and will be removed in 1.8.
Use `estimator` instead.
threshold : float, default=0.75
The decision threshold for use with `criterion='threshold'`.
Should be in [0, 1). When using the `'threshold'` criterion, a
:ref:`well calibrated classifier <calibration>` should be used.
criterion : {'threshold', 'k_best'}, default='threshold'
The selection criterion used to select which labels to add to the
training set. If `'threshold'`, pseudo-labels with prediction
probabilities above `threshold` are added to the dataset. If `'k_best'`,
the `k_best` pseudo-labels with highest prediction probabilities are
added to the dataset. When using the 'threshold' criterion, a
:ref:`well calibrated classifier <calibration>` should be used.
k_best : int, default=10
The number of samples to add in each iteration. Only used when
`criterion='k_best'`.
max_iter : int or None, default=10
Maximum number of iterations allowed. Should be greater than or equal
to 0. If it is `None`, the classifier will continue to predict labels
until no new pseudo-labels are added, or all unlabeled samples have
been labeled.
verbose : bool, default=False
Enable verbose output.
Attributes
----------
estimator_ : estimator object
The fitted estimator.
classes_ : ndarray or list of ndarray of shape (n_classes,)
Class labels for each output. (Taken from the trained
`estimator_`).
transduction_ : ndarray of shape (n_samples,)
The labels used for the final fit of the classifier, including
pseudo-labels added during fit.
labeled_iter_ : ndarray of shape (n_samples,)
The iteration in which each sample was labeled. When a sample has
iteration 0, the sample was already labeled in the original dataset.
When a sample has iteration -1, the sample was not labeled in any
iteration.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
The number of rounds of self-training, that is the number of times the
base estimator is fitted on relabeled variants of the training set.
termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}
The reason that fitting was stopped.
- `'max_iter'`: `n_iter_` reached `max_iter`.
- `'no_change'`: no new labels were predicted.
- `'all_labeled'`: all unlabeled samples were labeled before `max_iter`
was reached.
See Also
--------
LabelPropagation : Label propagation classifier.
LabelSpreading : Label spreading model for semi-supervised learning.
References
----------
:doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling
supervised methods. In Proceedings of the 33rd annual meeting on
Association for Computational Linguistics (ACL '95). Association for
Computational Linguistics, Stroudsburg, PA, USA, 189-196.
<10.3115/981658.981684>`
Examples
--------
>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.semi_supervised import SelfTrainingClassifier
>>> from sklearn.svm import SVC
>>> rng = np.random.RandomState(42)
>>> iris = datasets.load_iris()
>>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
>>> iris.target[random_unlabeled_points] = -1
>>> svc = SVC(probability=True, gamma="auto")
>>> self_training_model = SelfTrainingClassifier(svc)
>>> self_training_model.fit(iris.data, iris.target)
SelfTrainingClassifier(...)
"""
# Declarative constraints on the constructor parameters.
_parameter_constraints: dict = {
# We don't require `predict_proba` here to allow passing a meta-estimator
# that only exposes `predict_proba` after fitting.
# TODO(1.8) remove None option
"estimator": [None, HasMethods(["fit"])],
# TODO(1.8) remove
"base_estimator": [
HasMethods(["fit"]),
# The string "deprecated" is the constructor default for this parameter.
Hidden(StrOptions({"deprecated"})),
],
"threshold": [Interval(Real, 0.0, 1.0, closed="left")],
"criterion": [StrOptions({"threshold", "k_best"})],
"k_best": [Interval(Integral, 1, None, closed="left")],
"max_iter": [Interval(Integral, 0, None, closed="left"), None],
"verbose": ["verbose"],
}
def __init__(
    self,
    estimator=None,
    base_estimator="deprecated",
    threshold=0.75,
    criterion="threshold",
    k_best=10,
    max_iter=10,
    verbose=False,
):
    """Store the constructor arguments as-is; nothing is validated here."""
    self.estimator = estimator
    # Pseudo-label selection settings.
    self.threshold, self.criterion, self.k_best = threshold, criterion, k_best
    # Iteration control and reporting.
    self.max_iter, self.verbose = max_iter, verbose

    # TODO(1.8) remove
    self.base_estimator = base_estimator
def _get_estimator(self):
    """Get the estimator.

    Returns
    -------
    estimator_ : estimator object
        The cloned estimator object.

    Raises
    ------
    ValueError
        If neither `estimator` nor `base_estimator` was given, or if both
        were given.
    """
    # TODO(1.8): remove and only keep clone(self.estimator)
    base_estimator_given = self.base_estimator != "deprecated"

    if self.estimator is None:
        # TODO(1.8) remove
        if not base_estimator_given:
            raise ValueError(
                "You must pass an estimator to SelfTrainingClassifier. Use `estimator`."
            )
        # Deprecated path: still honoured, but with a warning.
        estimator_ = clone(self.base_estimator)
        warn(
            (
                "`base_estimator` has been deprecated in 1.6 and will be removed"
                " in 1.8. Please use `estimator` instead."
            ),
            FutureWarning,
        )
        return estimator_

    if base_estimator_given:
        raise ValueError(
            "You must pass only one estimator to SelfTrainingClassifier."
            " Use `estimator`."
        )

    return clone(self.estimator)
@_fit_context(
    # SelfTrainingClassifier.estimator is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y, **params):
    """
    Fit self-training classifier using `X`, `y` as training data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    y : {array-like, sparse matrix} of shape (n_samples,)
        Array representing the labels. Unlabeled samples should have the
        label -1.

    **params : dict
        Parameters to pass to the underlying estimators.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    _raise_for_params(params, self, "fit")

    self.estimator_ = self._get_estimator()

    # Row slicing support is needed for sparse matrices, but the costly
    # finiteness check can be delegated to the base estimator.
    X, y = validate_data(
        self,
        X,
        y,
        accept_sparse=["csr", "csc", "lil", "dok"],
        ensure_all_finite=False,
    )

    if y.dtype.kind in ["U", "S"]:
        raise ValueError(
            "y has dtype string. If you wish to predict on "
            "string targets, use dtype object, and use -1"
            " as the label for unlabeled samples."
        )

    has_label = y != -1

    if np.all(has_label):
        warnings.warn("y contains no unlabeled samples", UserWarning)

    if self.criterion == "k_best":
        n_unlabeled = X.shape[0] - np.sum(has_label)
        if self.k_best > n_unlabeled:
            warnings.warn(
                (
                    "k_best is larger than the amount of unlabeled "
                    "samples. All unlabeled samples will be labeled in "
                    "the first iteration"
                ),
                UserWarning,
            )

    routed_params = (
        process_routing(self, "fit", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(fit={}))
    )

    self.transduction_ = np.copy(y)
    self.labeled_iter_ = np.full_like(y, -1)
    self.labeled_iter_[has_label] = 0

    def fit_on_labeled():
        # `has_label` is only ever mutated in place, so this closure always
        # sees the current set of (pseudo-)labeled samples.
        self.estimator_.fit(
            X[safe_mask(X, has_label)],
            self.transduction_[has_label],
            **routed_params.estimator.fit,
        )

    self.n_iter_ = 0

    while not np.all(has_label):
        if self.max_iter is not None and self.n_iter_ >= self.max_iter:
            break

        self.n_iter_ += 1
        fit_on_labeled()

        # Predict on the unlabeled samples
        unlabeled_mask = ~has_label
        prob = self.estimator_.predict_proba(X[safe_mask(X, unlabeled_mask)])
        pred = self.estimator_.classes_[np.argmax(prob, axis=1)]
        max_proba = np.max(prob, axis=1)

        # Select new labeled samples
        if self.criterion == "threshold":
            selected = max_proba > self.threshold
        else:
            n_candidates = max_proba.shape[0]
            n_to_select = min(self.k_best, n_candidates)
            if n_to_select == n_candidates:
                selected = np.ones_like(max_proba, dtype=bool)
            else:
                # NB these are indices, not a mask
                selected = np.argpartition(-max_proba, n_to_select)[:n_to_select]

        # Map selected indices into original array
        selected_full = np.nonzero(unlabeled_mask)[0][selected]

        # Add newly labeled confident predictions to the dataset
        self.transduction_[selected_full] = pred[selected]
        has_label[selected_full] = True
        self.labeled_iter_[selected_full] = self.n_iter_

        if selected_full.shape[0] == 0:
            # no changed labels
            self.termination_condition_ = "no_change"
            break

        if self.verbose:
            print(
                f"End of iteration {self.n_iter_},"
                f" added {selected_full.shape[0]} new labels."
            )

    # The later assignment wins when both conditions hold.
    if self.n_iter_ == self.max_iter:
        self.termination_condition_ = "max_iter"
    if np.all(has_label):
        self.termination_condition_ = "all_labeled"

    # Final refit on everything that carries a (pseudo-)label.
    fit_on_labeled()
    self.classes_ = self.estimator_.classes_
    return self
@available_if(_estimator_has("predict"))
def predict(self, X, **params):
    """Predict the classes of `X`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's ``predict`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray of shape (n_samples,)
        Array with predicted labels.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict")

    # Without metadata routing nothing is forwarded to the estimator.
    routed_params = (
        process_routing(self, "predict", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(predict={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    forwarded = routed_params.estimator.predict
    return self.estimator_.predict(X, **forwarded)
@available_if(_estimator_has("predict_proba"))
def predict_proba(self, X, **params):
    """Predict probability for each possible outcome.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``predict_proba`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray of shape (n_samples, n_classes)
        Array with prediction probabilities, exactly as returned by the
        fitted estimator.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict_proba")

    # Without metadata routing nothing is forwarded to the estimator.
    routed_params = (
        process_routing(self, "predict_proba", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(predict_proba={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    forwarded = routed_params.estimator.predict_proba
    return self.estimator_.predict_proba(X, **forwarded)
@available_if(_estimator_has("decision_function"))
def decision_function(self, X, **params):
    """Call decision function of the `estimator`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``decision_function`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray
        Result of the decision function of the `estimator`; its shape is
        whatever the fitted estimator returns.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "decision_function")

    # Without metadata routing nothing is forwarded to the estimator.
    routed_params = (
        process_routing(self, "decision_function", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(decision_function={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    forwarded = routed_params.estimator.decision_function
    return self.estimator_.decision_function(X, **forwarded)
@available_if(_estimator_has("predict_log_proba"))
def predict_log_proba(self, X, **params):
    """Delegate to ``predict_log_proba`` of the fitted `estimator_`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``predict_log_proba`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray
        Array with log prediction probabilities, exactly as returned by
        the wrapped estimator's ``predict_log_proba``.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict_log_proba")

    if not _routing_enabled():
        # Routing disabled: nothing is forwarded to the wrapped estimator.
        routed_params = Bunch(estimator=Bunch(predict_log_proba={}))
    else:
        # metadata routing is enabled.
        routed_params = process_routing(self, "predict_log_proba", **params)

    # reset=False: X is validated without resetting the state recorded by fit.
    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    forwarded = routed_params.estimator.predict_log_proba
    return self.estimator_.predict_log_proba(X, **forwarded)
@available_if(_estimator_has("score"))
def score(self, X, y, **params):
    """Delegate to ``score`` of the fitted `estimator_`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    y : array-like of shape (n_samples,)
        Array representing the labels.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's ``score`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    score : float
        Result of calling score on the `estimator`.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "score")

    if not _routing_enabled():
        # Routing disabled: nothing is forwarded to the wrapped estimator.
        routed_params = Bunch(estimator=Bunch(score={}))
    else:
        # metadata routing is enabled.
        routed_params = process_routing(self, "score", **params)

    # Only X is validated here; y is handed to the wrapped estimator as-is.
    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    forwarded = routed_params.estimator.score
    return self.estimator_.score(X, y, **forwarded)
def get_metadata_routing(self):
    """Get metadata routing of this object.

    Please check :ref:`User Guide <metadata_routing>` on how the routing
    mechanism works.

    .. versionadded:: 1.6

    Returns
    -------
    routing : MetadataRouter
        A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
        routing information.
    """
    # (callee, caller) pairs, registered in this order. Note that the
    # estimator's ``score`` is mapped to both ``fit`` and ``score``.
    routes = (
        ("fit", "fit"),
        ("score", "fit"),
        ("predict", "predict"),
        ("predict_proba", "predict_proba"),
        ("decision_function", "decision_function"),
        ("predict_log_proba", "predict_log_proba"),
        ("score", "score"),
    )
    method_mapping = MethodMapping()
    for callee, caller in routes:
        # Re-bind to the return value, mirroring a chained ``.add(...)`` call.
        method_mapping = method_mapping.add(callee=callee, caller=caller)

    router = MetadataRouter(owner=self.__class__.__name__)
    router.add(estimator=self.estimator, method_mapping=method_mapping)
    return router
def __sklearn_tags__(self):
    """Return the parent tags, copying sparse-input support from `estimator`."""
    tags = super().__sklearn_tags__()
    wrapped = self.estimator
    # TODO(1.8): remove the condition check together with base_estimator
    if wrapped is None:
        return tags
    tags.input_tags.sparse = get_tags(wrapped).input_tags.sparse
    return tags

View File

@@ -0,0 +1,238 @@
"""test the label propagation module"""
import warnings
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.semi_supervised import _label_propagation as label_propagation
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
assert_array_equal,
)
# Container types X is converted to in ``test_convergence_speed``.
CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc")

# (estimator class, constructor kwargs) pairs shared by the parametrized tests.
# Each class is paired, in order, with the "rbf" kernel, the "knn" kernel and a
# callable kernel. The inner tuple is re-evaluated for every class, so each
# entry gets its own dict (and its own lambda).
ESTIMATORS = [
    (estimator_cls, kernel_params)
    for estimator_cls in (
        label_propagation.LabelPropagation,
        label_propagation.LabelSpreading,
    )
    for kernel_params in (
        {"kernel": "rbf"},
        {"kernel": "knn", "n_neighbors": 2},
        {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)},
    )
]
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_fit_transduction(global_dtype, Estimator, parameters):
    """Check that the unlabeled third sample is transduced to class 1."""
    X = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    y = [0, 1, -1]  # -1 marks the unlabeled sample
    model = Estimator(**parameters)
    fitted = model.fit(X, y)
    assert fitted.transduction_[2] == 1
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_distribution(global_dtype, Estimator, parameters):
    """Check the label distribution of the unlabeled sample is about 50/50."""
    kernel = parameters["kernel"]
    if kernel == "knn":
        pytest.skip(
            "Unstable test for this configuration: changes in k-NN ordering break it."
        )

    X = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype)
    y = [0, 1, -1]  # -1 marks the unlabeled sample
    fitted = Estimator(**parameters).fit(X, y)
    unlabeled_distribution = fitted.label_distributions_[2]
    assert_allclose(unlabeled_distribution, [0.5, 0.5], atol=1e-2)
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict(global_dtype, Estimator, parameters):
    """Check that a new point near the class-1 sample is predicted as 1."""
    X = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    y = [0, 1, -1]  # -1 marks the unlabeled sample
    fitted = Estimator(**parameters).fit(X, y)
    predicted = fitted.predict([[0.5, 2.5]])
    assert_array_equal(predicted, np.array([1]))
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict_proba(global_dtype, Estimator, parameters):
    """Check that ``predict_proba`` returns 0.5 / 0.5 for the query point."""
    X = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype)
    y = [0, 1, -1]  # -1 marks the unlabeled sample
    fitted = Estimator(**parameters).fit(X, y)
    probabilities = fitted.predict_proba([[1.0, 1.0]])
    assert_allclose(probabilities, np.array([[0.5, 0.5]]))
@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9])
def test_label_spreading_closed_form(global_dtype, alpha):
    """Compare iterative ``LabelSpreading`` with its closed-form solution.

    The body only ever builds ``LabelSpreading`` with its default kernel, so
    the test is parametrized over ``alpha`` alone; cross-parametrizing over
    ``ESTIMATORS`` merely repeated the identical computation for every entry.
    """
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1  # mark every third sample as unlabeled

    gamma = 0.1
    # This first fit is only used to obtain the graph matrix below.
    clf = label_propagation.LabelSpreading(gamma=gamma).fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()

    # One-hot encode y with one extra column: the -1 (unlabeled) entries index
    # that last column, which is dropped right after.
    Y = np.zeros((len(y), n_classes + 1), dtype=X.dtype)
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]

    expected = np.dot(np.linalg.inv(np.eye(len(S), dtype=S.dtype) - alpha * S), Y)
    expected /= expected.sum(axis=1)[:, np.newaxis]  # normalize each row

    clf = label_propagation.LabelSpreading(
        max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma
    )
    clf.fit(X, y)
    assert_allclose(expected, clf.label_distributions_)
def test_label_propagation_closed_form(global_dtype):
    """Compare iterative ``LabelPropagation`` with the closed-form solution."""
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1  # mark every third sample as unlabeled
    n_samples = len(y)

    # One-hot encode y with one extra column: the -1 (unlabeled) entries index
    # that last column, so it doubles as an "is unlabeled" indicator.
    Y = np.zeros((n_samples, n_classes + 1))
    Y[np.arange(n_samples), y] = 1
    unlabeled_flag = Y[:, -1]
    unlabelled_idx = np.flatnonzero(unlabeled_flag)
    labelled_idx = np.flatnonzero(unlabeled_flag == 0)

    clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1)
    clf.fit(X, y)

    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    uu_grid = tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))
    ul_grid = tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))
    Tuu = T_bar[uu_grid]
    Tul = T_bar[ul_grid]

    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    inv_term = np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu)
    Y_u = np.dot(np.dot(inv_term, Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1)[:, np.newaxis]  # normalize each row

    assert_allclose(expected, clf.label_distributions_, atol=1e-4)
@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"])
@pytest.mark.parametrize("index_dtype", [np.int32, np.int64])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_sparse_input_types(
    accepted_sparse_type, index_dtype, dtype, Estimator, parameters
):
    """Check fit/predict on sparse input for several value and index dtypes."""
    # This is a non-regression test for #17085
    X = _convert_container([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], accepted_sparse_type)
    # Cast the value buffer and both index buffers to the requested dtypes.
    for attr, target_dtype in (
        ("data", dtype),
        ("indices", index_dtype),
        ("indptr", index_dtype),
    ):
        setattr(X, attr, getattr(X, attr).astype(target_dtype, copy=False))

    y = [0, 1, -1]  # -1 marks the unlabeled sample
    fitted = Estimator(**parameters).fit(X, y)
    assert_array_equal(fitted.predict([[0.5, 2.5]]), np.array([1]))
@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES)
def test_convergence_speed(constructor_type):
    """Check ``LabelSpreading`` converges in fewer than 10 iterations."""
    # This is a non-regression test for #5774
    raw_samples = [[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]
    X = _convert_container(raw_samples, constructor_type)
    y = np.array([0, 1, -1])  # -1 marks the unlabeled sample

    model = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000)
    model.fit(X, y)

    # this should converge quickly:
    assert model.n_iter_ < 10
    assert_array_equal(model.predict(X), [0, 1, 1])
def test_convergence_warning():
    """Check ``ConvergenceWarning`` is raised with max_iter=1 but not with 500."""
    # This is a non-regression test for #5774
    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]])
    y = np.array([0, 1, -1])  # -1 marks the unlabeled sample
    estimator_classes = (
        label_propagation.LabelSpreading,
        label_propagation.LabelPropagation,
    )

    # A single iteration is expected to warn, with n_iter_ equal to max_iter.
    warn_msg = "max_iter=1 was reached without convergence."
    for estimator_cls in estimator_classes:
        mdl = estimator_cls(kernel="rbf", max_iter=1)
        with pytest.warns(ConvergenceWarning, match=warn_msg):
            mdl.fit(X, y)
        assert mdl.n_iter_ == mdl.max_iter

    # With 500 iterations, any ConvergenceWarning is turned into an error.
    for estimator_cls in estimator_classes:
        mdl = estimator_cls(kernel="rbf", max_iter=500)
        with warnings.catch_warnings():
            warnings.simplefilter("error", ConvergenceWarning)
            mdl.fit(X, y)
@pytest.mark.parametrize(
    "LabelPropagationCls",
    [label_propagation.LabelSpreading, label_propagation.LabelPropagation],
)
def test_label_propagation_non_zero_normalizer(LabelPropagationCls):
    """Check that fitting emits no ``RuntimeWarning`` on duplicated points."""
    # check that we don't divide by zero in case of null normalizer
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/15946
    # https://github.com/scikit-learn/scikit-learn/issues/9292
    y = np.array([0, 1, -1, -1])
    X = np.array([[100.0, 100.0], [100.0, 100.0], [0.0, 0.0], [0.0, 0.0]])
    model = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1)
    with warnings.catch_warnings():
        # Any RuntimeWarning raised during fit becomes an error.
        warnings.simplefilter("error", RuntimeWarning)
        model.fit(X, y)
def test_predict_sparse_callable_kernel(global_dtype):
    """Check both estimators score >= 0.9 with a callable sparse kernel."""
    # This is a non-regression test for #15866

    # Custom sparse kernel (top-K RBF)
    def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
        # Honour the ``n_neighbors`` argument instead of a hard-coded 10; the
        # default keeps the kernel used by this test unchanged.
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", n_jobs=2)
        nn.fit(X)
        # exp(-gamma * d^2), applied to the stored entries of the graph only.
        W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma
        np.exp(W.data, out=W.data)
        assert issparse(W)
        return W.T

    n_classes = 4
    n_samples = 500
    n_test = 10
    X, y = make_classification(
        n_classes=n_classes,
        n_samples=n_samples,
        n_features=20,
        n_informative=20,
        n_redundant=0,
        n_repeated=0,
        random_state=0,
    )
    X = X.astype(global_dtype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=0
    )

    model = label_propagation.LabelSpreading(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

    model = label_propagation.LabelPropagation(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

View File

@@ -0,0 +1,395 @@
from math import ceil
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.datasets import load_iris, make_blobs
from sklearn.ensemble import StackingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.tests.test_pipeline import SimpleEstimator
from sklearn.tree import DecisionTreeClassifier
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Shared fixtures: split the iris dataset (shuffled with a fixed seed) into
# train and test parts.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# Only the first `n_labeled_samples` training labels are kept; the remaining
# ones are replaced by -1 (unlabeled).
n_labeled_samples = 50
y_train_missing_labels = np.copy(y_train)
y_train_missing_labels[n_labeled_samples:] = -1

# Same targets with string class names, stored in an object array so that the
# unlabeled entries can be reset to the integer -1 afterwards.
mapping = {0: "A", 1: "B", 2: "C", -1: "-1"}
y_train_missing_strings = np.array(
    [mapping[label] for label in y_train_missing_labels], dtype=object
)
y_train_missing_strings[y_train_missing_labels == -1] = -1
def test_warns_k_best():
    """Check a ``UserWarning`` is raised for ``k_best=1000`` on this dataset."""
    self_training = SelfTrainingClassifier(
        KNeighborsClassifier(), criterion="k_best", k_best=1000
    )
    with pytest.warns(UserWarning, match="k_best is larger than"):
        self_training.fit(X_train, y_train_missing_labels)

    assert self_training.termination_condition_ == "all_labeled"
@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)],
)
@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"])
def test_classification(estimator, selection_crit):
    """Check self-training on numeric and string labels gives matching results.

    Also checks the consistency of ``labeled_iter_``, ``n_iter_``,
    ``transduction_`` and ``max_iter``.
    """
    max_iter = 10
    threshold = 0.75
    shared_kwargs = dict(
        max_iter=max_iter, threshold=threshold, criterion=selection_crit
    )

    # Fit on the integer-encoded targets.
    st = SelfTrainingClassifier(estimator, **shared_kwargs)
    st.fit(X_train, y_train_missing_labels)
    pred = st.predict(X_test)
    proba = st.predict_proba(X_test)

    # Fit an identically configured model on the string-encoded targets.
    st_string = SelfTrainingClassifier(estimator, **shared_kwargs)
    st_string.fit(X_train, y_train_missing_strings)
    pred_string = st_string.predict(X_test)
    proba_string = st_string.predict_proba(X_test)

    # Predictions must agree once integer labels are mapped to their strings.
    assert_array_equal(np.vectorize(mapping.get)(pred), pred_string)
    assert_array_equal(proba, proba_string)
    assert st.termination_condition_ == st_string.termination_condition_

    # Check consistency between labeled_iter, n_iter and max_iter
    labeled = y_train_missing_labels != -1
    # assert that labeled samples have labeled_iter = 0
    assert_array_equal(st.labeled_iter_ == 0, labeled)
    # assert that labeled samples do not change label during training
    assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled])

    # the largest labeling iteration is bounded by n_iter_, itself bounded by
    # max_iter
    for model in (st, st_string):
        assert np.max(model.labeled_iter_) <= model.n_iter_ <= max_iter

    # check shapes
    for model in (st, st_string):
        assert model.labeled_iter_.shape == model.transduction_.shape
def test_k_best():
    """Check the number of samples labeled per iteration with ``k_best``.

    Starting from a single labeled sample, every iteration but the last is
    expected to label ``k_best`` samples and the last one the remainder.
    """
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=k_best,
        max_iter=None,
    )
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1  # keep only the first label

    n_samples = y_train.shape[0]
    n_unlabeled = n_samples - 1
    n_expected_iter = ceil(n_unlabeled / k_best)
    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == k_best
    # When k_best divides n_unlabeled exactly the modulo is 0, but the last
    # iteration then labels a full batch of k_best samples.
    n_last_iter = n_unlabeled % k_best or k_best
    assert np.sum(st.labeled_iter_ == n_expected_iter) == n_last_iter
    assert st.termination_condition_ == "all_labeled"
def test_sanity_classification():
    """Check self-training differs from, and beats, a supervised baseline."""
    estimator = SVC(gamma="scale", probability=True)
    # NOTE(review): the baseline is fit on X_train[n_labeled_samples:], i.e. the
    # samples that are marked unlabeled in y_train_missing_labels, using their
    # true labels -- confirm this slice is intended.
    estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    st = SelfTrainingClassifier(estimator)
    st.fit(X_train, y_train_missing_labels)

    pred1, pred2 = estimator.predict(X_test), st.predict(X_test)
    assert not np.array_equal(pred1, pred2)

    # accuracy_score expects (y_true, y_pred); pass the arguments in that order.
    score_supervised = accuracy_score(y_test, estimator.predict(X_test))
    score_self_training = accuracy_score(y_test, st.predict(X_test))
    assert score_self_training > score_supervised
def test_none_iter():
    """Check ``max_iter=None`` labels all samples in fewer than 10 iterations."""
    # Check that the all samples were labeled after a 'reasonable' number of
    # iterations.
    self_training = SelfTrainingClassifier(
        KNeighborsClassifier(), threshold=0.55, max_iter=None
    )
    self_training.fit(X_train, y_train_missing_labels)

    assert self_training.n_iter_ < 10
    assert self_training.termination_condition_ == "all_labeled"
@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)],
)
@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings])
def test_zero_iterations(estimator, y):
    """Check ``max_iter=0`` predicts like the plain supervised estimator."""
    # Fitting a SelfTrainingClassifier with zero iterations should give the
    # same results as fitting a supervised classifier.
    # This also asserts that string arrays work as expected.
    self_training = SelfTrainingClassifier(estimator, max_iter=0)
    self_training.fit(X_train, y)

    # Supervised reference: the same estimator fit on the labeled head only.
    labeled_X = X_train[:n_labeled_samples]
    labeled_y = y[:n_labeled_samples]
    supervised = estimator.fit(labeled_X, labeled_y)

    assert_array_equal(self_training.predict(X_test), supervised.predict(X_test))
    assert self_training.termination_condition_ == "max_iter"
def test_prefitted_throws_error():
    """Check ``predict`` raises ``NotFittedError`` before the wrapper is fit."""
    # Test that passing a pre-fitted classifier and calling predict throws an
    # error
    prefitted = KNeighborsClassifier()
    prefitted.fit(X_train, y_train)

    self_training = SelfTrainingClassifier(prefitted)
    expected_msg = "This SelfTrainingClassifier instance is not fitted yet"
    with pytest.raises(NotFittedError, match=expected_msg):
        self_training.predict(X_train)
@pytest.mark.parametrize("max_iter", range(1, 5))
def test_labeled_iter(max_iter):
    """Check ``labeled_iter_`` against the labeled count and ``max_iter``."""
    self_training = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter)
    self_training.fit(X_train, y_train_missing_labels)
    labeled_iter = self_training.labeled_iter_

    # Check that the amount of datapoints labeled in iteration 0 is equal to
    # the amount of labeled datapoints we passed.
    amount_iter_0 = np.count_nonzero(labeled_iter == 0)
    assert amount_iter_0 == n_labeled_samples

    # the largest labeling iteration is bounded by n_iter_, itself bounded by
    # max_iter
    assert np.max(labeled_iter) <= self_training.n_iter_ <= max_iter
def test_no_unlabeled():
    """Check a fully labeled fit warns and predicts like the bare classifier."""
    # Test that training on a fully labeled dataset produces the same results
    # as training the classifier by itself.
    reference = KNeighborsClassifier()
    reference.fit(X_train, y_train)

    self_training = SelfTrainingClassifier(reference)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        self_training.fit(X_train, y_train)

    assert_array_equal(reference.predict(X_test), self_training.predict(X_test))
    # Assert that all samples were labeled in iteration 0 (since there were no
    # unlabeled samples).
    assert np.all(self_training.labeled_iter_ == 0)
    assert self_training.termination_condition_ == "all_labeled"
def test_early_stopping():
    """Check training stops after one iteration with ``"no_change"``."""
    X_train_easy = [[1], [0], [1], [0.5]]
    y_train_easy = [1, 0, -1, -1]  # the last two samples are unlabeled

    self_training = SelfTrainingClassifier(SVC(gamma="scale", probability=True))
    # X = [[0.5]] cannot be predicted on with a high confidence, so training
    # stops early
    self_training.fit(X_train_easy, y_train_easy)

    assert self_training.n_iter_ == 1
    assert self_training.termination_condition_ == "no_change"
def test_strings_dtype():
    """Check that fitting on a string-typed ``y`` raises a ``ValueError``."""
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    labels_multiclass = ["one", "two", "three"]
    # np.take on a list of str yields a NumPy string array, not an object array.
    y_strings = np.take(labels_multiclass, y)

    self_training = SelfTrainingClassifier(KNeighborsClassifier())
    with pytest.raises(ValueError, match="dtype"):
        self_training.fit(X, y_strings)
@pytest.mark.parametrize("verbose", [True, False])
def test_verbose(capsys, verbose):
    """Check "iteration" is printed to stdout only when ``verbose`` is True."""
    self_training = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    self_training.fit(X_train, y_train_missing_labels)

    stdout = capsys.readouterr().out
    mentions_iteration = "iteration" in stdout
    assert mentions_iteration == verbose
def test_verbose_k_best(capsys):
    """Check the per-iteration messages printed with ``criterion="k_best"``.

    Starting from a single labeled sample, every iteration but the last is
    expected to report ``k_best`` new labels and the last one the remainder.
    """
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=k_best,
        verbose=True,
        max_iter=None,
    )
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1  # keep only the first label

    n_samples = y_train.shape[0]
    n_unlabeled = n_samples - 1
    n_expected_iter = ceil(n_unlabeled / k_best)
    st.fit(X_train, y_train_only_one_label)
    captured = capsys.readouterr()

    msg = "End of iteration {}, added {} new labels."
    for i in range(1, n_expected_iter):
        assert msg.format(i, k_best) in captured.out
    # When k_best divides n_unlabeled exactly the modulo is 0, but the last
    # iteration then adds a full batch of k_best labels.
    n_last_iter = n_unlabeled % k_best or k_best
    assert msg.format(n_expected_iter, n_last_iter) in captured.out
def test_k_best_selects_best():
    """Check one ``k_best`` iteration labels the 10 most confident samples."""
    # Tests that the labels added by st really are the 10 best labels.
    svc = SVC(gamma="scale", probability=True, random_state=0)
    st = SelfTrainingClassifier(svc, criterion="k_best", max_iter=1, k_best=10)
    has_label = y_train_missing_labels != -1
    is_unlabeled = ~has_label
    st.fit(X_train, y_train_missing_labels)

    # Samples that started unlabeled and received a label during fit.
    got_label = is_unlabeled & (st.transduction_ != -1)
    added_by_st = X_train[got_label].tolist()

    # Reference: fit the SVC on the labeled part only and rank the unlabeled
    # samples by their highest predicted class probability.
    svc.fit(X_train[has_label], y_train_missing_labels[has_label])
    unlabeled_X = X_train[is_unlabeled]
    max_proba = np.max(svc.predict_proba(unlabeled_X), axis=1)
    top_ten = np.argsort(max_proba)[-10:]
    most_confident_svc = unlabeled_X[top_ten]

    for row in most_confident_svc.tolist():
        assert row in added_by_st
def test_estimator_meta_estimator():
    """Check self-training around a ``StackingClassifier`` meta-estimator."""
    # Check that a meta-estimator relying on an estimator implementing
    # `predict_proba` will work even if it does not expose this method before being
    # fitted.
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/19119

    def make_stacking(probability):
        # Every SVC in the stack shares the same `probability` setting.
        return StackingClassifier(
            estimators=[
                ("svc_1", SVC(probability=probability)),
                ("svc_2", SVC(probability=probability)),
            ],
            final_estimator=SVC(probability=probability),
            cv=2,
        )

    # With probability=True the stack exposes predict_proba and fitting works.
    estimator = make_stacking(True)
    assert hasattr(estimator, "predict_proba")
    clf = SelfTrainingClassifier(estimator=estimator)
    clf.fit(X_train, y_train_missing_labels)
    clf.predict_proba(X_test)

    # With probability=False it does not, and fit raises AttributeError.
    estimator = make_stacking(False)
    assert not hasattr(estimator, "predict_proba")
    clf = SelfTrainingClassifier(estimator=estimator)
    with pytest.raises(AttributeError):
        clf.fit(X_train, y_train_missing_labels)
def test_self_training_estimator_attribute_error():
    """Check the AttributeErrors raised for methods the `estimator` lacks.

    Covers `predict_proba`, which is called from within `fit`, and
    `decision_function`, which is decorated with `available_if`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28108
    """
    # `SVC` with `probability=False` does not implement 'predict_proba' that
    # is required internally in `fit` of `SelfTrainingClassifier`. We expect
    # an AttributeError to be raised.
    svc_without_proba = SVC(probability=False, gamma="scale")
    self_training = SelfTrainingClassifier(svc_without_proba)
    with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"):
        self_training.fit(X_train, y_train_missing_labels)

    # `DecisionTreeClassifier` does not implement 'decision_function' and
    # should raise an AttributeError
    outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'"
    inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'"
    self_training = SelfTrainingClassifier(estimator=DecisionTreeClassifier())
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        self_training.fit(X_train, y_train_missing_labels).decision_function(X_train)

    # The outer error must chain the estimator's own AttributeError.
    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg in str(cause)
# TODO(1.8): remove in 1.8
def test_deprecation_warning_base_estimator():
    """Check the `base_estimator` deprecation warning and argument errors."""
    # Passing only `base_estimator` emits a FutureWarning.
    warn_msg = "`base_estimator` has been deprecated in 1.6 and will be removed"
    deprecated_usage = SelfTrainingClassifier(base_estimator=DecisionTreeClassifier())
    with pytest.warns(FutureWarning, match=warn_msg):
        deprecated_usage.fit(X_train, y_train_missing_labels)

    # Passing no estimator at all is an error.
    error_msg = "You must pass an estimator to SelfTrainingClassifier"
    with pytest.raises(ValueError, match=error_msg):
        SelfTrainingClassifier().fit(X_train, y_train_missing_labels)

    # Passing both `base_estimator` and `estimator` is an error as well.
    error_msg = "You must pass only one estimator to SelfTrainingClassifier."
    with pytest.raises(ValueError, match=error_msg):
        SelfTrainingClassifier(
            base_estimator=DecisionTreeClassifier(), estimator=DecisionTreeClassifier()
        ).fit(X_train, y_train_missing_labels)
# Metadata routing tests
# =================================================================
@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning")
@pytest.mark.parametrize(
    "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"]
)
def test_routing_passed_metadata_not_supported(method):
    """Test that the right error message is raised when metadata is passed while
    not supported when `enable_metadata_routing=False`."""
    expected_msg = "is only supported if enable_metadata_routing=True"

    # Metadata passed to `fit`.
    est = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(ValueError, match=expected_msg):
        est.fit([[1], [1]], [1, 1], sample_weight=[1], prop="a")

    # Metadata passed to the parametrized prediction method.
    est = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(ValueError, match=expected_msg):
        # make sure that the estimator thinks it is already fitted
        est.fitted_params_ = True
        bound_method = getattr(est, method)
        bound_method([[1]], sample_weight=[1], prop="a")
# End of routing tests
# ====================