add read me
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
"""Semi-supervised learning algorithms.
|
||||
|
||||
These algorithms utilize small amounts of labeled data and large amounts of unlabeled
|
||||
data for classification tasks.
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from ._label_propagation import LabelPropagation, LabelSpreading
|
||||
from ._self_training import SelfTrainingClassifier
|
||||
|
||||
__all__ = ["LabelPropagation", "LabelSpreading", "SelfTrainingClassifier"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,630 @@
|
||||
# coding=utf8
|
||||
"""
|
||||
Label propagation in the context of this module refers to a set of
|
||||
semi-supervised classification algorithms. At a high level, these algorithms
|
||||
work by forming a fully-connected graph between all points given and solving
|
||||
for the steady-state distribution of labels at each point.
|
||||
|
||||
These algorithms perform very well in practice. The cost of running can be very
|
||||
expensive, at approximately O(N^3) where N is the number of (labeled and
|
||||
unlabeled) points. The theory (why they perform so well) is motivated by
|
||||
intuitions from random walk algorithms and geometric relationships in the data.
|
||||
For more information see the references below.
|
||||
|
||||
Model Features
|
||||
--------------
|
||||
Label clamping:
|
||||
The algorithm tries to learn distributions of labels over the dataset given
|
||||
label assignments over an initial subset. In one variant, the algorithm does
|
||||
not allow for any errors in the initial assignment (hard-clamping) while
|
||||
in another variant, the algorithm allows for some wiggle room for the initial
|
||||
assignments, allowing them to change by a fraction alpha in each iteration
|
||||
(soft-clamping).
|
||||
|
||||
Kernel:
|
||||
A function which projects a vector into some higher dimensional space. This
|
||||
implementation supports RBF and KNN kernels. Using the RBF kernel generates
|
||||
a dense matrix of size O(N^2). KNN kernel will generate a sparse matrix of
|
||||
size O(k*N) which will run much faster. See the documentation for SVMs for
|
||||
more info on kernels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn import datasets
|
||||
>>> from sklearn.semi_supervised import LabelPropagation
|
||||
>>> label_prop_model = LabelPropagation()
|
||||
>>> iris = datasets.load_iris()
|
||||
>>> rng = np.random.RandomState(42)
|
||||
>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
|
||||
>>> labels = np.copy(iris.target)
|
||||
>>> labels[random_unlabeled_points] = -1
|
||||
>>> label_prop_model.fit(iris.data, labels)
|
||||
LabelPropagation(...)
|
||||
|
||||
Notes
|
||||
-----
|
||||
References:
|
||||
[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised
|
||||
Learning (2006), pp. 193-216
|
||||
|
||||
[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient
|
||||
Non-Parametric Function Induction in Semi-Supervised Learning. AISTATS 2005
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from ..base import BaseEstimator, ClassifierMixin, _fit_context
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..metrics.pairwise import rbf_kernel
|
||||
from ..neighbors import NearestNeighbors
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.extmath import safe_sparse_dot
|
||||
from ..utils.fixes import laplacian as csgraph_laplacian
|
||||
from ..utils.multiclass import check_classification_targets
|
||||
from ..utils.validation import check_is_fitted, validate_data
|
||||
|
||||
|
||||
class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for label propagation module.

    Concrete subclasses must define a ``_variant`` class attribute
    (``"propagation"`` selects hard clamping in :meth:`fit`, any other value
    selects soft clamping driven by ``alpha``) and implement
    :meth:`_build_graph`.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape (n_samples, n_features),
        and return a (n_samples, n_samples) shaped weight matrix.

    gamma : float, default=20
        Parameter for rbf kernel.

    n_neighbors : int, default=7
        Parameter for knn kernel. Needs to be strictly positive.

    alpha : float, default=1.0
        Clamping factor. Only read by :meth:`fit` when ``_variant`` is not
        ``"propagation"``.

    max_iter : int, default=30
        Maximum number of iterations allowed.

    tol : float, default=1e-3
        Convergence tolerance: threshold to consider the system at steady
        state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """

    _parameter_constraints: dict = {
        "kernel": [StrOptions({"knn", "rbf"}), callable],
        "gamma": [Interval(Real, 0, None, closed="left")],
        "n_neighbors": [Interval(Integral, 0, None, closed="neither")],
        "alpha": [None, Interval(Real, 0, 1, closed="neither")],
        "max_iter": [Interval(Integral, 0, None, closed="neither")],
        "tol": [Interval(Real, 0, None, closed="left")],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        kernel="rbf",
        *,
        gamma=20,
        n_neighbors=7,
        alpha=1,
        max_iter=30,
        tol=1e-3,
        n_jobs=None,
    ):
        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs

    def _get_kernel(self, X, y=None):
        """Evaluate the configured kernel.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Reference samples.

        y : {array-like, sparse matrix} of shape (n_queries, n_features), \
                default=None
            Optional second set of samples (not a label vector). When
            ``None`` the kernel is evaluated between `X` and itself.

        Returns
        -------
        result : ndarray, sparse matrix or None
            - ``kernel='rbf'``: ``rbf_kernel(X, y or X, gamma=self.gamma)``.
            - ``kernel='knn'``: with ``y=None`` the connectivity
              k-neighbors graph of the samples the neighbors model was
              fitted on; otherwise the neighbor indices returned by
              ``kneighbors(y, return_distance=False)``.
            - callable kernel: ``self.kernel(X, y or X)``.
            - ``None`` if `kernel` is none of the above.
        """
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            # `_build_graph` resets `nn_fit` to None so that the neighbors
            # model is refitted lazily on the current training data.
            if self.nn_fit is None:
                self.nn_fit = NearestNeighbors(
                    n_neighbors=self.n_neighbors, n_jobs=self.n_jobs
                ).fit(X)
            if y is None:
                return self.nn_fit.kneighbors_graph(
                    self.nn_fit._fit_X, self.n_neighbors, mode="connectivity"
                )
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)

    @abstractmethod
    def _build_graph(self):
        """Return the graph matrix that :meth:`fit` multiplies the label
        distributions by at every iteration. Must be overridden."""
        raise NotImplementedError(
            "Graph construction must be implemented to fit a label propagation model."
        )

    def predict(self, X):
        """Perform inductive inference across the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Predictions for input data.
        """
        # Note: since `predict` does not accept semi-supervised labels as input,
        # `fit(X, y).predict(X) != fit(X, y).transduction_`.
        # Hence, `fit_predict` is not implemented.
        # See https://github.com/scikit-learn/scikit-learn/pull/24898
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        probabilities : ndarray of shape (n_samples, n_classes)
            Normalized probability distributions across
            class labels. A row whose unnormalized scores sum to zero is
            returned as all zeros instead of NaN.
        """
        check_is_fitted(self)

        X_2d = validate_data(
            self,
            X,
            accept_sparse=["csc", "csr", "coo", "dok", "bsr", "lil", "dia"],
            reset=False,
        )
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == "knn":
            # For knn, `_get_kernel` returns neighbor indices per query
            # sample: sum the label distributions of those neighbors.
            probabilities = np.array(
                [
                    np.sum(self.label_distributions_[weight_matrix], axis=0)
                    for weight_matrix in weight_matrices
                ]
            )
        else:
            # Kernel is (n_train, n_queries); transpose to weight the
            # training distributions for each query sample.
            weight_matrices = weight_matrices.T
            probabilities = safe_sparse_dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        # Same guard as in `fit`: a query sample with zero total weight would
        # otherwise produce a 0/0 division and a row of NaN.
        normalizer[normalizer == 0] = 1
        probabilities /= normalizer
        return probabilities

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit a semi-supervised label propagation model to X.

        The input samples (labeled and unlabeled) are provided by matrix X,
        and target labels are provided by matrix y. We conventionally apply the
        label -1 to unlabeled samples in matrix y in a semi-supervised
        classification.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target class values with unlabeled points marked as -1.
            All unlabeled samples will be transductively assigned labels
            internally, which are stored in `transduction_`.

        Returns
        -------
        self : object
            Returns the instance itself.

        Warns
        -----
        ConvergenceWarning
            If `max_iter` iterations are performed without the total absolute
            change of the label distributions dropping below `tol`.
        """
        X, y = validate_data(
            self,
            X,
            y,
            accept_sparse=["csr", "csc"],
            reset=True,
        )
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = classes[classes != -1]
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions: one-hot rows for labeled samples,
        # all-zero rows for unlabeled ones
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == "propagation":
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - self.alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        # column vector so that it broadcasts over classes in `np.where`
        unlabeled = unlabeled[:, np.newaxis]
        if sparse.issparse(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        # `n_iter_` is the loop variable itself; the `else` clause only runs
        # when the loop was exhausted without hitting `break`.
        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_
            )

            if self._variant == "propagation":
                normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
                # avoid 0/0 for samples that received no label mass
                normalizer[normalizer == 0] = 1
                self.label_distributions_ /= normalizer
                # hard clamping: labeled rows are reset to their initial value
                self.label_distributions_ = np.where(
                    unlabeled, self.label_distributions_, y_static
                )
            else:
                # clamp
                self.label_distributions_ = (
                    np.multiply(self.alpha, self.label_distributions_) + y_static
                )
        else:
            warnings.warn(
                "max_iter=%d was reached without convergence." % self.max_iter,
                category=ConvergenceWarning,
            )
            # range() stops at max_iter - 1; report the number of iterations run
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        normalizer[normalizer == 0] = 1
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)]
        self.transduction_ = transduction.ravel()
        return self

    def __sklearn_tags__(self):
        """Extend the inherited estimator tags by declaring sparse input support."""
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags
|
||||
|
||||
|
||||
class LabelPropagation(BaseLabelPropagation):
    """Label Propagation classifier.

    Read more in the :ref:`User Guide <label_propagation>`.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape (n_samples, n_features),
        and return a (n_samples, n_samples) shaped weight matrix.

    gamma : float, default=20
        Parameter for rbf kernel.

    n_neighbors : int, default=7
        Parameter for knn kernel which needs to be strictly positive.

    max_iter : int, default=1000
        Maximum number of iterations allowed.

    tol : float, default=1e-3
        Convergence tolerance: threshold to consider the system at steady
        state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    X_ : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input array.

    classes_ : ndarray of shape (n_classes,)
        The distinct labels used in classifying instances.

    label_distributions_ : ndarray of shape (n_samples, n_classes)
        Categorical distribution for each item.

    transduction_ : ndarray of shape (n_samples,)
        Label assigned to each item during :term:`fit`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations run.

    See Also
    --------
    LabelSpreading : Alternate label propagation strategy more robust to noise.

    References
    ----------
    Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data
    with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon
    University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets
    >>> from sklearn.semi_supervised import LabelPropagation
    >>> label_prop_model = LabelPropagation()
    >>> iris = datasets.load_iris()
    >>> rng = np.random.RandomState(42)
    >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
    >>> labels = np.copy(iris.target)
    >>> labels[random_unlabeled_points] = -1
    >>> label_prop_model.fit(iris.data, labels)
    LabelPropagation(...)
    """

    _variant = "propagation"

    # `alpha` is not a public parameter of this estimator, so it must not be
    # validated: work on a copy of the base constraints and drop the entry.
    _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints}
    _parameter_constraints.pop("alpha")

    def __init__(
        self,
        kernel="rbf",
        *,
        gamma=20,
        n_neighbors=7,
        max_iter=1000,
        tol=1e-3,
        n_jobs=None,
    ):
        super().__init__(
            kernel=kernel,
            gamma=gamma,
            n_neighbors=n_neighbors,
            max_iter=max_iter,
            tol=tol,
            n_jobs=n_jobs,
            alpha=None,
        )

    def _build_graph(self):
        """Build the normalized affinity matrix between all training samples.

        The affinity matrix is obtained from the configured kernel and then
        scaled so that it can be used as a transition matrix by `fit`:

        - sparse result (e.g. ``kernel='knn'``): every row is divided by its
          own sum; rows summing to zero are left untouched.
        - dense result: row ``i`` is divided by the sum of column ``i``
          (identical to row normalization when the kernel is symmetric,
          as is the case for the rbf kernel).

        Returns
        -------
        affinity_matrix : ndarray or sparse matrix of shape \
                (n_samples, n_samples)
            The normalized affinity matrix.
        """
        if self.kernel == "knn":
            # force `_get_kernel` to refit the neighbors model on `X_`
            self.nn_fit = None
        affinity_matrix = self._get_kernel(self.X_)
        if sparse.issparse(affinity_matrix):
            # CSR stores the entries of row i in data[indptr[i]:indptr[i + 1]],
            # so repeating each row sum by the row's entry count yields one
            # divisor per stored value.
            affinity_matrix = affinity_matrix.tocsr()
            row_sums = np.asarray(affinity_matrix.sum(axis=1)).ravel()
            row_sums[row_sums == 0] = 1  # avoid 0/0 on empty rows
            affinity_matrix.data /= np.repeat(
                row_sums, np.diff(affinity_matrix.indptr)
            )
        else:
            normalizer = affinity_matrix.sum(axis=0)
            affinity_matrix /= normalizer[:, np.newaxis]
        return affinity_matrix

    def fit(self, X, y):
        """Fit a semi-supervised label propagation model to X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target class values with unlabeled points marked as -1.
            All unlabeled samples will be transductively assigned labels
            internally, which are stored in `transduction_`.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # Thin override: only the docstring differs from the base class.
        return super().fit(X, y)
|
||||
|
||||
|
||||
class LabelSpreading(BaseLabelPropagation):
    """LabelSpreading model for semi-supervised learning.

    A close relative of the basic Label Propagation algorithm: the graph
    matrix is derived from the normalized graph Laplacian and the initial
    labels are soft-clamped rather than held fixed.

    Read more in the :ref:`User Guide <label_propagation>`.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        Either the name of a built-in kernel ('rbf' or 'knn', the only
        accepted strings) or a kernel function. A kernel function receives
        two inputs, each of shape (n_samples, n_features), and must return
        a weight matrix of shape (n_samples, n_samples).

    gamma : float, default=20
        Parameter of the rbf kernel.

    n_neighbors : int, default=7
        Parameter of the knn kernel; a strictly positive integer.

    alpha : float, default=0.2
        Clamping factor, a value in (0, 1). It sets how much of an
        instance's label information is taken from its neighbors rather
        than from its initial label: alpha=0 would keep the initial label
        information, alpha=1 would replace all of it.

    max_iter : int, default=30
        Upper bound on the number of iterations.

    tol : float, default=1e-3
        Convergence tolerance: threshold below which the system is
        considered to be at steady state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    X_ : ndarray of shape (n_samples, n_features)
        Input array.

    classes_ : ndarray of shape (n_classes,)
        The distinct labels used in classifying instances.

    label_distributions_ : ndarray of shape (n_samples, n_classes)
        Categorical distribution for each item.

    transduction_ : ndarray of shape (n_samples,)
        Label assigned to each item during :term:`fit`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations run.

    See Also
    --------
    LabelPropagation : Unregularized graph based semi-supervised learning.

    References
    ----------
    `Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston,
    Bernhard Schoelkopf. Learning with local and global consistency (2004)
    <https://citeseerx.ist.psu.edu/doc_view/pid/d74c37aabf2d5cae663007cbd8718175466aea8c>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets
    >>> from sklearn.semi_supervised import LabelSpreading
    >>> label_prop_model = LabelSpreading()
    >>> iris = datasets.load_iris()
    >>> rng = np.random.RandomState(42)
    >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
    >>> labels = np.copy(iris.target)
    >>> labels[random_unlabeled_points] = -1
    >>> label_prop_model.fit(iris.data, labels)
    LabelSpreading(...)
    """

    _variant = "spreading"

    # Same constraints as the base class, except that `alpha` is mandatory
    # here (no `None` option) and must lie strictly between 0 and 1.
    _parameter_constraints: dict = {
        **BaseLabelPropagation._parameter_constraints,
        "alpha": [Interval(Real, 0, 1, closed="neither")],
    }

    def __init__(
        self,
        kernel="rbf",
        *,
        gamma=20,
        n_neighbors=7,
        alpha=0.2,
        max_iter=30,
        tol=1e-3,
        n_jobs=None,
    ):
        # Defaults differ from the base class, hence the explicit forwarding.
        forwarded = dict(
            kernel=kernel,
            gamma=gamma,
            n_neighbors=n_neighbors,
            alpha=alpha,
            max_iter=max_iter,
            tol=tol,
            n_jobs=n_jobs,
        )
        super().__init__(**forwarded)

    def _build_graph(self):
        """Return the negated normalized graph Laplacian with a zero diagonal."""
        if self.kernel == "knn":
            # drop any previous neighbors model so it is refitted on `X_`
            self.nn_fit = None

        # affinity (gram) matrix -> normalized Laplacian -> sign flip
        graph = -csgraph_laplacian(self._get_kernel(self.X_), normed=True)

        if not sparse.issparse(graph):
            # stepping through the flattened array by n_samples + 1 visits
            # exactly the diagonal entries
            graph.flat[:: self.X_.shape[0] + 1] = 0.0
            return graph

        on_diagonal = graph.row == graph.col
        graph.data[on_diagonal] = 0.0
        return graph
|
||||
@@ -0,0 +1,625 @@
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
from warnings import warn
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassifierMixin,
|
||||
MetaEstimatorMixin,
|
||||
_fit_context,
|
||||
clone,
|
||||
)
|
||||
from ..utils import Bunch, get_tags, safe_mask
|
||||
from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
|
||||
from ..utils.metadata_routing import (
|
||||
MetadataRouter,
|
||||
MethodMapping,
|
||||
_raise_for_params,
|
||||
_routing_enabled,
|
||||
process_routing,
|
||||
)
|
||||
from ..utils.metaestimators import available_if
|
||||
from ..utils.validation import _estimator_has, check_is_fitted, validate_data
|
||||
|
||||
__all__ = ["SelfTrainingClassifier"]
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
|
||||
class SelfTrainingClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
|
||||
"""Self-training classifier.
|
||||
|
||||
This :term:`metaestimator` allows a given supervised classifier to function as a
|
||||
semi-supervised classifier, allowing it to learn from unlabeled data. It
|
||||
does this by iteratively predicting pseudo-labels for the unlabeled data
|
||||
and adding them to the training set.
|
||||
|
||||
The classifier will continue iterating until either max_iter is reached, or
|
||||
no pseudo-labels were added to the training set in the previous iteration.
|
||||
|
||||
Read more in the :ref:`User Guide <self_training>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator object
|
||||
An estimator object implementing `fit` and `predict_proba`.
|
||||
Invoking the `fit` method will fit a clone of the passed estimator,
|
||||
which will be stored in the `estimator_` attribute.
|
||||
|
||||
.. versionadded:: 1.6
|
||||
`estimator` was added to replace `base_estimator`.
|
||||
|
||||
base_estimator : estimator object
|
||||
An estimator object implementing `fit` and `predict_proba`.
|
||||
Invoking the `fit` method will fit a clone of the passed estimator,
|
||||
which will be stored in the `estimator_` attribute.
|
||||
|
||||
.. deprecated:: 1.6
|
||||
`base_estimator` was deprecated in 1.6 and will be removed in 1.8.
|
||||
Use `estimator` instead.
|
||||
|
||||
threshold : float, default=0.75
|
||||
The decision threshold for use with `criterion='threshold'`.
|
||||
Should be in [0, 1). When using the `'threshold'` criterion, a
|
||||
:ref:`well calibrated classifier <calibration>` should be used.
|
||||
|
||||
criterion : {'threshold', 'k_best'}, default='threshold'
|
||||
The selection criterion used to select which labels to add to the
|
||||
training set. If `'threshold'`, pseudo-labels with prediction
|
||||
probabilities above `threshold` are added to the dataset. If `'k_best'`,
|
||||
the `k_best` pseudo-labels with highest prediction probabilities are
|
||||
added to the dataset. When using the 'threshold' criterion, a
|
||||
:ref:`well calibrated classifier <calibration>` should be used.
|
||||
|
||||
k_best : int, default=10
|
||||
The amount of samples to add in each iteration. Only used when
|
||||
`criterion='k_best'`.
|
||||
|
||||
max_iter : int or None, default=10
|
||||
Maximum number of iterations allowed. Should be greater than or equal
|
||||
to 0. If it is `None`, the classifier will continue to predict labels
|
||||
until no new pseudo-labels are added, or all unlabeled samples have
|
||||
been labeled.
|
||||
|
||||
verbose : bool, default=False
|
||||
Enable verbose output.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
estimator_ : estimator object
|
||||
The fitted estimator.
|
||||
|
||||
classes_ : ndarray or list of ndarray of shape (n_classes,)
|
||||
Class labels for each output. (Taken from the trained
|
||||
`estimator_`).
|
||||
|
||||
transduction_ : ndarray of shape (n_samples,)
|
||||
The labels used for the final fit of the classifier, including
|
||||
pseudo-labels added during fit.
|
||||
|
||||
labeled_iter_ : ndarray of shape (n_samples,)
|
||||
The iteration in which each sample was labeled. When a sample has
|
||||
iteration 0, the sample was already labeled in the original dataset.
|
||||
When a sample has iteration -1, the sample was not labeled in any
|
||||
iteration.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_iter_ : int
|
||||
The number of rounds of self-training, that is the number of times the
|
||||
base estimator is fitted on relabeled variants of the training set.
|
||||
|
||||
termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}
|
||||
The reason that fitting was stopped.
|
||||
|
||||
- `'max_iter'`: `n_iter_` reached `max_iter`.
|
||||
- `'no_change'`: no new labels were predicted.
|
||||
- `'all_labeled'`: all unlabeled samples were labeled before `max_iter`
|
||||
was reached.
|
||||
|
||||
See Also
|
||||
--------
|
||||
LabelPropagation : Label propagation classifier.
|
||||
LabelSpreading : Label spreading model for semi-supervised learning.
|
||||
|
||||
References
|
||||
----------
|
||||
:doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling
|
||||
supervised methods. In Proceedings of the 33rd annual meeting on
|
||||
Association for Computational Linguistics (ACL '95). Association for
|
||||
Computational Linguistics, Stroudsburg, PA, USA, 189-196.
|
||||
<10.3115/981658.981684>`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn import datasets
|
||||
>>> from sklearn.semi_supervised import SelfTrainingClassifier
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> rng = np.random.RandomState(42)
|
||||
>>> iris = datasets.load_iris()
|
||||
>>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
|
||||
>>> iris.target[random_unlabeled_points] = -1
|
||||
>>> svc = SVC(probability=True, gamma="auto")
|
||||
>>> self_training_model = SelfTrainingClassifier(svc)
|
||||
>>> self_training_model.fit(iris.data, iris.target)
|
||||
SelfTrainingClassifier(...)
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
# We don't require `predict_proba` here to allow passing a meta-estimator
|
||||
# that only exposes `predict_proba` after fitting.
|
||||
# TODO(1.8) remove None option
|
||||
"estimator": [None, HasMethods(["fit"])],
|
||||
# TODO(1.8) remove
|
||||
"base_estimator": [
|
||||
HasMethods(["fit"]),
|
||||
Hidden(StrOptions({"deprecated"})),
|
||||
],
|
||||
"threshold": [Interval(Real, 0.0, 1.0, closed="left")],
|
||||
"criterion": [StrOptions({"threshold", "k_best"})],
|
||||
"k_best": [Interval(Integral, 1, None, closed="left")],
|
||||
"max_iter": [Interval(Integral, 0, None, closed="left"), None],
|
||||
"verbose": ["verbose"],
|
||||
}
|
||||
|
||||
def __init__(
    self,
    estimator=None,
    base_estimator="deprecated",
    threshold=0.75,
    criterion="threshold",
    k_best=10,
    max_iter=10,
    verbose=False,
):
    """Store the constructor arguments unchanged on the instance."""
    self.estimator = estimator
    # TODO(1.8) remove
    self.base_estimator = base_estimator

    # pseudo-label selection settings
    self.criterion = criterion
    self.threshold = threshold
    self.k_best = k_best

    # iteration control and reporting
    self.max_iter = max_iter
    self.verbose = verbose
|
||||
|
||||
def _get_estimator(self):
    """Return a fresh clone of the estimator to be self-trained.

    Returns
    -------
    estimator_ : estimator object
        A clone of `estimator`, or of the deprecated `base_estimator`
        when only that one was supplied (a `FutureWarning` is emitted
        in that case).

    Raises
    ------
    ValueError
        If neither `estimator` nor `base_estimator` was supplied, or if
        both were.
    """
    # TODO(1.8): remove and only keep clone(self.estimator)
    legacy_supplied = self.base_estimator != "deprecated"

    if self.estimator is None:
        if not legacy_supplied:
            # TODO(1.8) remove
            raise ValueError(
                "You must pass an estimator to SelfTrainingClassifier. Use `estimator`."
            )
        # Only the deprecated argument was given: honour it, then warn.
        cloned = clone(self.base_estimator)
        deprecation_msg = (
            "`base_estimator` has been deprecated in 1.6 and will be removed"
            " in 1.8. Please use `estimator` instead."
        )
        warn(deprecation_msg, FutureWarning)
        return cloned

    if legacy_supplied:
        raise ValueError(
            "You must pass only one estimator to SelfTrainingClassifier."
            " Use `estimator`."
        )

    return clone(self.estimator)
|
||||
|
||||
@_fit_context(
    # SelfTrainingClassifier.estimator is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y, **params):
    """
    Fit self-training classifier using `X`, `y` as training data.

    The wrapped estimator is repeatedly fitted on the currently labeled
    samples and used to pseudo-label some of the unlabeled ones, until every
    sample is labeled, `max_iter` is reached, or an iteration adds no label.
    It is then fitted one last time on all samples that ended up labeled.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    y : {array-like, sparse matrix} of shape (n_samples,)
        Array representing the labels. Unlabeled samples should have the
        label -1.

    **params : dict
        Parameters to pass to the underlying estimators.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    self : object
        Fitted estimator.

    Raises
    ------
    ValueError
        If `y` has a string dtype (kind "U" or "S").
    """
    _raise_for_params(params, self, "fit")

    self.estimator_ = self._get_estimator()

    # we need row slicing support for sparse matrices, but costly finiteness check
    # can be delegated to the base estimator.
    X, y = validate_data(
        self,
        X,
        y,
        accept_sparse=["csr", "csc", "lil", "dok"],
        ensure_all_finite=False,
    )

    if y.dtype.kind in ["U", "S"]:
        raise ValueError(
            "y has dtype string. If you wish to predict on "
            "string targets, use dtype object, and use -1"
            " as the label for unlabeled samples."
        )

    # Boolean mask over samples; it is updated in place as pseudo-labels are
    # assigned inside the loop below.
    has_label = y != -1

    if np.all(has_label):
        warnings.warn("y contains no unlabeled samples", UserWarning)

    if self.criterion == "k_best" and (
        self.k_best > X.shape[0] - np.sum(has_label)
    ):
        warnings.warn(
            (
                "k_best is larger than the amount of unlabeled "
                "samples. All unlabeled samples will be labeled in "
                "the first iteration"
            ),
            UserWarning,
        )

    if _routing_enabled():
        routed_params = process_routing(self, "fit", **params)
    else:
        # No routing: the wrapped estimator's fit receives no extra kwargs.
        routed_params = Bunch(estimator=Bunch(fit={}))

    # transduction_ starts as a copy of y and receives the pseudo-labels.
    # labeled_iter_ is 0 for initially labeled samples, -1 while a sample is
    # unlabeled, and otherwise the iteration at which it got its label.
    self.transduction_ = np.copy(y)
    self.labeled_iter_ = np.full_like(y, -1)
    self.labeled_iter_[has_label] = 0

    self.n_iter_ = 0

    while not np.all(has_label) and (
        self.max_iter is None or self.n_iter_ < self.max_iter
    ):
        self.n_iter_ += 1
        self.estimator_.fit(
            X[safe_mask(X, has_label)],
            self.transduction_[has_label],
            **routed_params.estimator.fit,
        )

        # Predict on the unlabeled samples
        prob = self.estimator_.predict_proba(X[safe_mask(X, ~has_label)])
        pred = self.estimator_.classes_[np.argmax(prob, axis=1)]
        max_proba = np.max(prob, axis=1)

        # Select new labeled samples
        if self.criterion == "threshold":
            # Strict comparison: a probability equal to the threshold is not
            # selected.
            selected = max_proba > self.threshold
        else:
            n_to_select = min(self.k_best, max_proba.shape[0])
            if n_to_select == max_proba.shape[0]:
                # Everything is selected; this also avoids calling
                # argpartition with kth equal to the array length.
                selected = np.ones_like(max_proba, dtype=bool)
            else:
                # NB these are indices, not a mask
                selected = np.argpartition(-max_proba, n_to_select)[:n_to_select]

        # Map selected indices into original array
        selected_full = np.nonzero(~has_label)[0][selected]

        # Add newly labeled confident predictions to the dataset
        self.transduction_[selected_full] = pred[selected]
        has_label[selected_full] = True
        self.labeled_iter_[selected_full] = self.n_iter_

        if selected_full.shape[0] == 0:
            # no changed labels
            self.termination_condition_ = "no_change"
            break

        if self.verbose:
            print(
                f"End of iteration {self.n_iter_},"
                f" added {selected_full.shape[0]} new labels."
            )

    # These two checks run in order, so "max_iter" overrides a "no_change"
    # set in the loop, and "all_labeled" overrides both.
    if self.n_iter_ == self.max_iter:
        self.termination_condition_ = "max_iter"
    if np.all(has_label):
        self.termination_condition_ = "all_labeled"

    # Final fit on every sample that ended up with a (pseudo-)label.
    self.estimator_.fit(
        X[safe_mask(X, has_label)],
        self.transduction_[has_label],
        **routed_params.estimator.fit,
    )
    self.classes_ = self.estimator_.classes_
    return self
|
||||
|
||||
@available_if(_estimator_has("predict"))
def predict(self, X, **params):
    """Predict the classes of `X` with the fitted wrapped estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's ``predict`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray of shape (n_samples,)
        Array with predicted labels.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict")

    # With metadata routing disabled, no extra keyword argument is forwarded.
    routed_params = (
        process_routing(self, "predict", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(predict={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    return self.estimator_.predict(X, **routed_params.estimator.predict)
|
||||
|
||||
@available_if(_estimator_has("predict_proba"))
def predict_proba(self, X, **params):
    """Predict probability for each possible outcome.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``predict_proba`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray of shape (n_samples, n_classes)
        Array with prediction probabilities, one column per class of the
        fitted estimator.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict_proba")

    # With metadata routing disabled, no extra keyword argument is forwarded.
    routed_params = (
        process_routing(self, "predict_proba", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(predict_proba={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    return self.estimator_.predict_proba(X, **routed_params.estimator.predict_proba)
|
||||
|
||||
@available_if(_estimator_has("decision_function"))
def decision_function(self, X, **params):
    """Call decision function of the `estimator`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``decision_function`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray
        Result of the decision function of the `estimator`, returned
        unchanged; its shape is whatever the fitted estimator produces.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "decision_function")

    # With metadata routing disabled, no extra keyword argument is forwarded.
    routed_params = (
        process_routing(self, "decision_function", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(decision_function={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    return self.estimator_.decision_function(
        X, **routed_params.estimator.decision_function
    )
|
||||
|
||||
@available_if(_estimator_has("predict_log_proba"))
def predict_log_proba(self, X, **params):
    """Predict log probability for each possible outcome.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's
        ``predict_log_proba`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    y : ndarray
        Array with log prediction probabilities, returned unchanged from
        the fitted estimator (typically of shape (n_samples, n_classes)).
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "predict_log_proba")

    # With metadata routing disabled, no extra keyword argument is forwarded.
    routed_params = (
        process_routing(self, "predict_log_proba", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(predict_log_proba={}))
    )

    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    return self.estimator_.predict_log_proba(
        X, **routed_params.estimator.predict_log_proba
    )
|
||||
|
||||
@available_if(_estimator_has("score"))
def score(self, X, y, **params):
    """Call score on the `estimator`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Array representing the data.

    y : array-like of shape (n_samples,)
        Array representing the labels.

    **params : dict of str -> object
        Parameters to pass to the underlying estimator's ``score`` method.

        .. versionadded:: 1.6
            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

    Returns
    -------
    score : float
        Result of calling score on the `estimator`.
    """
    check_is_fitted(self)
    _raise_for_params(params, self, "score")

    # With metadata routing disabled, no extra keyword argument is forwarded.
    routed_params = (
        process_routing(self, "score", **params)
        if _routing_enabled()
        else Bunch(estimator=Bunch(score={}))
    )

    # Only X is validated here; y is handed to the estimator as received.
    X = validate_data(
        self, X, accept_sparse=True, ensure_all_finite=False, reset=False
    )
    return self.estimator_.score(X, y, **routed_params.estimator.score)
|
||||
|
||||
def get_metadata_routing(self):
    """Get metadata routing of this object.

    Please check :ref:`User Guide <metadata_routing>` on how the routing
    mechanism works.

    .. versionadded:: 1.6

    Returns
    -------
    routing : MetadataRouter
        A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
        routing information.
    """
    # (callee, caller) pairs, registered in this order: which method of the
    # wrapped estimator receives metadata from which method of this object.
    routes = (
        ("fit", "fit"),
        ("score", "fit"),
        ("predict", "predict"),
        ("predict_proba", "predict_proba"),
        ("decision_function", "decision_function"),
        ("predict_log_proba", "predict_log_proba"),
        ("score", "score"),
    )
    method_mapping = MethodMapping()
    for callee, caller in routes:
        method_mapping = method_mapping.add(callee=callee, caller=caller)

    router = MetadataRouter(owner=self.__class__.__name__)
    router.add(estimator=self.estimator, method_mapping=method_mapping)
    return router
|
||||
|
||||
def __sklearn_tags__(self):
    # Start from the parent tags and mirror the wrapped estimator's sparse
    # input support when an estimator has been provided.
    tags = super().__sklearn_tags__()
    # TODO(1.8): remove the condition check together with base_estimator
    wrapped = self.estimator
    if wrapped is None:
        return tags
    tags.input_tags.sparse = get_tags(wrapped).input_tags.sparse
    return tags
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,238 @@
|
||||
"""test the label propagation module"""
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.semi_supervised import _label_propagation as label_propagation
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
# Container kinds passed to `_convert_container` by tests parametrized over
# the input type (dense array and two sparse formats).
CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc")

# (estimator class, constructor kwargs) pairs used to parametrize the tests.
# Each class is paired with the "rbf" kernel, the "knn" kernel and a callable
# kernel wrapping `rbf_kernel` with gamma=20.
ESTIMATORS = [
    (label_propagation.LabelPropagation, {"kernel": "rbf"}),
    (label_propagation.LabelPropagation, {"kernel": "knn", "n_neighbors": 2}),
    (
        label_propagation.LabelPropagation,
        {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)},
    ),
    (label_propagation.LabelSpreading, {"kernel": "rbf"}),
    (label_propagation.LabelSpreading, {"kernel": "knn", "n_neighbors": 2}),
    (
        label_propagation.LabelSpreading,
        {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)},
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_fit_transduction(global_dtype, Estimator, parameters):
    # The single unlabeled sample (index 2) must be transduced to class 1.
    X = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    y = [0, 1, -1]
    model = Estimator(**parameters)
    fitted = model.fit(X, y)
    assert fitted.transduction_[2] == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_distribution(global_dtype, Estimator, parameters):
    # The unlabeled sample at [1, 1] should get an (approximately) even label
    # distribution between the two labeled samples.
    uses_knn = parameters["kernel"] == "knn"
    if uses_knn:
        pytest.skip(
            "Unstable test for this configuration: changes in k-NN ordering break it."
        )
    X = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype)
    y = [0, 1, -1]
    model = Estimator(**parameters)
    fitted = model.fit(X, y)
    assert_allclose(fitted.label_distributions_[2], [0.5, 0.5], atol=1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict(global_dtype, Estimator, parameters):
    # A query point near the class-1 sample must be predicted as class 1.
    X = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    y = [0, 1, -1]
    model = Estimator(**parameters)
    fitted = model.fit(X, y)
    predicted = fitted.predict([[0.5, 2.5]])
    assert_array_equal(predicted, np.array([1]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict_proba(global_dtype, Estimator, parameters):
    # The query point [1, 1] must get equal probability for both classes.
    X = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype)
    y = [0, 1, -1]
    model = Estimator(**parameters)
    fitted = model.fit(X, y)
    probabilities = fitted.predict_proba([[1.0, 1.0]])
    assert_allclose(probabilities, np.array([[0.5, 0.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9])
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_label_spreading_closed_form(global_dtype, Estimator, parameters, alpha):
    # Compare the iterative LabelSpreading solution with the closed form
    # (I - alpha * S)^-1 Y, row-normalized.
    # NOTE(review): `Estimator` and `parameters` are parametrized but never
    # used below; the test always runs LabelSpreading. Confirm whether the
    # parametrization is intended.
    n_classes, gamma = 2, 0.1
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1
    n_samples = len(y)

    graph_model = label_propagation.LabelSpreading(gamma=gamma).fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = graph_model._build_graph()

    # One-hot labels; unlabeled samples (-1) land in the extra last column,
    # which is then dropped.
    Y = np.zeros((n_samples, n_classes + 1), dtype=X.dtype)
    Y[np.arange(n_samples), y] = 1
    Y = Y[:, :-1]

    identity = np.eye(len(S), dtype=S.dtype)
    propagator = np.linalg.inv(identity - alpha * S)
    expected = np.dot(propagator, Y)
    expected /= expected.sum(axis=1, keepdims=True)

    iterative = label_propagation.LabelSpreading(
        max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma
    )
    iterative.fit(X, y)

    assert_allclose(expected, iterative.label_distributions_)
|
||||
|
||||
|
||||
def test_label_propagation_closed_form(global_dtype):
    # Compare the iterative LabelPropagation solution with the closed form
    # Y_u = (I - T_uu)^-1 T_ul Y_l.
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1
    n_samples = len(y)

    # One-hot labels; the extra last column flags unlabeled samples (-1).
    Y = np.zeros((n_samples, n_classes + 1))
    Y[np.arange(n_samples), y] = 1
    unlabelled_idx = np.flatnonzero(Y[:, -1])
    labelled_idx = np.flatnonzero(Y[:, -1] == 0)

    clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    Tuu = T_bar[np.ix_(unlabelled_idx, unlabelled_idx)]
    Tul = T_bar[np.ix_(unlabelled_idx, labelled_idx)]
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]

    inverse = np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu)
    Y_u = np.dot(np.dot(inverse, Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1, keepdims=True)

    assert_allclose(expected, clf.label_distributions_, atol=1e-4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"])
@pytest.mark.parametrize("index_dtype", [np.int32, np.int64])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_sparse_input_types(
    accepted_sparse_type, index_dtype, dtype, Estimator, parameters
):
    # This is non-regression test for #17085
    rows = [[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]]
    X = _convert_container(rows, accepted_sparse_type)
    # Force the requested value and index dtypes on the sparse container.
    X.data = X.data.astype(dtype, copy=False)
    for index_attr in ("indices", "indptr"):
        converted = getattr(X, index_attr).astype(index_dtype, copy=False)
        setattr(X, index_attr, converted)

    y = [0, 1, -1]
    fitted = Estimator(**parameters).fit(X, y)
    assert_array_equal(fitted.predict([[0.5, 2.5]]), np.array([1]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES)
def test_convergence_speed(constructor_type):
    # This is a non-regression test for #5774
    rows = [[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]
    X = _convert_container(rows, constructor_type)
    y = np.array([0, 1, -1])

    model = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000)
    model.fit(X, y)

    # this should converge quickly:
    assert model.n_iter_ < 10
    predicted = model.predict(X)
    assert_array_equal(predicted, [0, 1, 1])
|
||||
|
||||
|
||||
def test_convergence_warning():
    # This is a non-regression test for #5774
    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]])
    y = np.array([0, 1, -1])
    estimator_classes = (
        label_propagation.LabelSpreading,
        label_propagation.LabelPropagation,
    )

    # A single iteration is not enough: both estimators must warn.
    warn_msg = "max_iter=1 was reached without convergence."
    for Estimator in estimator_classes:
        mdl = Estimator(kernel="rbf", max_iter=1)
        with pytest.warns(ConvergenceWarning, match=warn_msg):
            mdl.fit(X, y)
        assert mdl.n_iter_ == mdl.max_iter

    # With a generous budget, a ConvergenceWarning is turned into an error,
    # so fitting must not emit one.
    for Estimator in estimator_classes:
        mdl = Estimator(kernel="rbf", max_iter=500)
        with warnings.catch_warnings():
            warnings.simplefilter("error", ConvergenceWarning)
            mdl.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "LabelPropagationCls",
    [label_propagation.LabelSpreading, label_propagation.LabelPropagation],
)
def test_label_propagation_non_zero_normalizer(LabelPropagationCls):
    # check that we don't divide by zero in case of null normalizer
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/15946
    # https://github.com/scikit-learn/scikit-learn/issues/9292
    far_point, origin = [100.0, 100.0], [0.0, 0.0]
    X = np.array([far_point, far_point, origin, origin])
    y = np.array([0, 1, -1, -1])
    model = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1)
    with warnings.catch_warnings():
        # Any RuntimeWarning raised while fitting fails the test.
        warnings.simplefilter("error", RuntimeWarning)
        model.fit(X, y)
|
||||
|
||||
|
||||
def test_predict_sparse_callable_kernel(global_dtype):
    # This is a non-regression test for #15866

    # Custom sparse kernel (top-K RBF)
    def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
        # Sparse RBF affinities restricted to the `n_neighbors` nearest
        # neighbours in X of each row of Y. The argument is now forwarded to
        # NearestNeighbors instead of a hard-coded 10; the default keeps the
        # previous behaviour.
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", n_jobs=2)
        nn.fit(X)
        # exp(-gamma * d^2) is applied to the stored distances only.
        W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma
        np.exp(W.data, out=W.data)
        assert issparse(W)
        return W.T

    n_classes = 4
    n_samples = 500
    n_test = 10
    X, y = make_classification(
        n_classes=n_classes,
        n_samples=n_samples,
        n_features=20,
        n_informative=20,
        n_redundant=0,
        n_repeated=0,
        random_state=0,
    )
    X = X.astype(global_dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=0
    )

    # Both estimators must reach the same accuracy bar with the sparse kernel.
    model = label_propagation.LabelSpreading(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

    model = label_propagation.LabelPropagation(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9
|
||||
@@ -0,0 +1,395 @@
|
||||
from math import ceil
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.datasets import load_iris, make_blobs
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.semi_supervised import SelfTrainingClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tests.test_pipeline import SimpleEstimator
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
# load the iris dataset and randomly permute it
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# Only the first `n_labeled_samples` training samples keep their label.
n_labeled_samples = 50

# Integer targets where every sample after the labeled prefix is marked -1.
y_train_missing_labels = y_train.copy()
y_train_missing_labels[n_labeled_samples:] = -1
# Same targets with string class names, stored with object dtype. The
# unlabeled marker is reset to the integer -1 on the last line, replacing the
# "-1" string produced by the mapping.
mapping = {0: "A", 1: "B", 2: "C", -1: "-1"}
y_train_missing_strings = np.vectorize(mapping.get)(y_train_missing_labels).astype(
    object
)
y_train_missing_strings[y_train_missing_labels == -1] = -1
|
||||
|
||||
|
||||
def test_warns_k_best():
    # A k_best larger than the number of unlabeled samples must warn, and
    # every sample then ends up labeled.
    self_training = SelfTrainingClassifier(
        KNeighborsClassifier(), criterion="k_best", k_best=1000
    )
    with pytest.warns(UserWarning, match="k_best is larger than"):
        self_training.fit(X_train, y_train_missing_labels)

    assert self_training.termination_condition_ == "all_labeled"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)],
)
@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"])
def test_classification(estimator, selection_crit):
    # Check classification for various parameter settings.
    # Also assert that predictions for strings and numerical labels are equal.
    max_iter = 10
    common_kwargs = dict(max_iter=max_iter, threshold=0.75, criterion=selection_crit)

    st = SelfTrainingClassifier(estimator, **common_kwargs)
    st.fit(X_train, y_train_missing_labels)
    pred = st.predict(X_test)
    proba = st.predict_proba(X_test)

    st_string = SelfTrainingClassifier(estimator, **common_kwargs)
    st_string.fit(X_train, y_train_missing_strings)
    pred_string = st_string.predict(X_test)
    proba_string = st_string.predict_proba(X_test)

    # Integer and string targets must lead to the same model.
    assert_array_equal(np.vectorize(mapping.get)(pred), pred_string)
    assert_array_equal(proba, proba_string)
    assert st.termination_condition_ == st_string.termination_condition_

    # Check consistency between labeled_iter, n_iter and max_iter
    labeled = y_train_missing_labels != -1
    # exactly the originally labeled samples have labeled_iter = 0
    assert_array_equal(st.labeled_iter_ == 0, labeled)
    # originally labeled samples keep their label during training
    assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled])

    for model in (st, st_string):
        # no sample is labeled at an iteration beyond the number of
        # iterations run, which itself is bounded by max_iter
        assert np.max(model.labeled_iter_) <= model.n_iter_ <= max_iter

    for model in (st, st_string):
        # check shapes
        assert model.labeled_iter_.shape == model.transduction_.shape
|
||||
|
||||
|
||||
def test_k_best():
    # With a single labeled sample and criterion="k_best", each iteration
    # must label exactly `k_best` samples, except the last one which labels
    # whatever remains.
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=k_best,
        max_iter=None,
    )
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_unlabeled = y_train.shape[0] - 1

    n_expected_iter = ceil(n_unlabeled / k_best)
    # Remainder labeled in the final iteration. Unlike `n_unlabeled % k_best`
    # this stays correct (equal to k_best) when the unlabeled count is an
    # exact multiple of k_best.
    n_last_iter = n_unlabeled - k_best * (n_expected_iter - 1)

    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == k_best
    assert np.sum(st.labeled_iter_ == n_expected_iter) == n_last_iter
    assert st.termination_condition_ == "all_labeled"
|
||||
|
||||
|
||||
def test_sanity_classification():
    # Self-training must differ from, and score better than, a purely
    # supervised baseline.
    # NOTE(review): the baseline is fitted on X_train[n_labeled_samples:],
    # i.e. on the samples whose labels are hidden from the self-training
    # model, not on the labeled prefix -- confirm this is intended.
    baseline = SVC(gamma="scale", probability=True)
    baseline.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    self_training = SelfTrainingClassifier(baseline)
    self_training.fit(X_train, y_train_missing_labels)

    pred_baseline = baseline.predict(X_test)
    pred_self_training = self_training.predict(X_test)
    assert not np.array_equal(pred_baseline, pred_self_training)

    score_supervised = accuracy_score(baseline.predict(X_test), y_test)
    score_self_training = accuracy_score(self_training.predict(X_test), y_test)
    assert score_self_training > score_supervised
|
||||
|
||||
|
||||
def test_none_iter():
    # Check that the all samples were labeled after a 'reasonable' number of
    # iterations.
    self_training = SelfTrainingClassifier(
        KNeighborsClassifier(), threshold=0.55, max_iter=None
    )
    self_training.fit(X_train, y_train_missing_labels)

    assert self_training.n_iter_ < 10
    assert self_training.termination_condition_ == "all_labeled"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)],
)
@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings])
def test_zero_iterations(estimator, y):
    # Check classification for zero iterations.
    # Fitting a SelfTrainingClassifier with zero iterations should give the
    # same results as fitting a supervised classifier.
    # This also asserts that string arrays work as expected.
    self_training = SelfTrainingClassifier(estimator, max_iter=0)
    self_training.fit(X_train, y)

    # Supervised reference: the estimator fitted on the labeled prefix only.
    supervised = estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples])

    pred_self_training = self_training.predict(X_test)
    pred_supervised = supervised.predict(X_test)
    assert_array_equal(pred_self_training, pred_supervised)
    assert self_training.termination_condition_ == "max_iter"
|
||||
|
||||
|
||||
def test_prefitted_throws_error():
    # Test that passing a pre-fitted classifier and calling predict throws an
    # error
    prefitted = KNeighborsClassifier()
    prefitted.fit(X_train, y_train)
    self_training = SelfTrainingClassifier(prefitted)

    expected_msg = "This SelfTrainingClassifier instance is not fitted yet"
    with pytest.raises(NotFittedError, match=expected_msg):
        self_training.predict(X_train)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_iter", range(1, 5))
def test_labeled_iter(max_iter):
    # Check that the amount of datapoints labeled in iteration 0 is equal to
    # the amount of labeled datapoints we passed.
    self_training = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter)
    self_training.fit(X_train, y_train_missing_labels)

    labeled_iter = self_training.labeled_iter_
    amount_iter_0 = np.count_nonzero(labeled_iter == 0)
    assert amount_iter_0 == n_labeled_samples
    # No sample is labeled at an iteration beyond the number of iterations
    # run, which itself is bounded by max_iter.
    assert np.max(labeled_iter) <= self_training.n_iter_ <= max_iter
|
||||
|
||||
|
||||
def test_no_unlabeled():
    # Test that training on a fully labeled dataset produces the same results
    # as training the classifier by itself.
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)

    self_training = SelfTrainingClassifier(knn)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        self_training.fit(X_train, y_train)

    pred_knn = knn.predict(X_test)
    pred_self_training = self_training.predict(X_test)
    assert_array_equal(pred_knn, pred_self_training)
    # Assert that all samples were labeled in iteration 0 (since there were no
    # unlabeled samples).
    assert np.all(self_training.labeled_iter_ == 0)
    assert self_training.termination_condition_ == "all_labeled"
|
||||
|
||||
|
||||
def test_early_stopping():
    # X = [[0.5]] cannot be predicted on with a high confidence, so training
    # stops early
    X_train_easy = [[1], [0], [1], [0.5]]
    y_train_easy = [1, 0, -1, -1]

    self_training = SelfTrainingClassifier(SVC(gamma="scale", probability=True))
    self_training.fit(X_train_easy, y_train_easy)

    assert self_training.n_iter_ == 1
    assert self_training.termination_condition_ == "no_change"
|
||||
|
||||
|
||||
def test_strings_dtype():
    # Targets with a string dtype must be rejected by `fit`.
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    class_names = np.asarray(["one", "two", "three"])
    y_strings = class_names[y]

    self_training = SelfTrainingClassifier(KNeighborsClassifier())
    with pytest.raises(ValueError, match="dtype"):
        self_training.fit(X, y_strings)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("verbose", [True, False])
def test_verbose(capsys, verbose):
    # Progress messages are printed to stdout only when verbose is enabled.
    self_training = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    self_training.fit(X_train, y_train_missing_labels)

    stdout = capsys.readouterr().out
    mentions_iteration = "iteration" in stdout
    assert mentions_iteration == verbose
|
||||
|
||||
|
||||
def test_verbose_k_best(capsys):
    """Check the per-iteration messages printed with the `k_best` criterion.

    Only the first training sample keeps its label. Every iteration except the
    last is expected to report `k_best` new labels; the last one is expected to
    report whatever remains.
    """
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=k_best,
        verbose=True,
        max_iter=None,
    )

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_unlabeled = y_train.shape[0] - 1

    n_expected_iter = ceil(n_unlabeled / k_best)
    st.fit(X_train, y_train_only_one_label)

    captured = capsys.readouterr()

    msg = "End of iteration {}, added {} new labels."
    for i in range(1, n_expected_iter):
        assert msg.format(i, k_best) in captured.out

    # When the number of unlabeled samples is an exact multiple of `k_best`,
    # the remainder is 0 but the last iteration still adds a full batch, so
    # fall back to `k_best` instead of expecting "added 0 new labels".
    n_last_added = n_unlabeled % k_best or k_best
    assert msg.format(n_expected_iter, n_last_added) in captured.out
|
||||
|
||||
|
||||
def test_k_best_selects_best():
    """Check that the labels added by self-training really are the 10 best ones."""
    svc = SVC(gamma="scale", probability=True, random_state=0)
    self_trainer = SelfTrainingClassifier(
        svc, criterion="k_best", max_iter=1, k_best=10
    )

    has_label = y_train_missing_labels != -1
    lacks_label = ~has_label
    self_trainer.fit(X_train, y_train_missing_labels)

    # Samples that started unlabeled and received a label during the fit.
    newly_labeled = lacks_label & (self_trainer.transduction_ != -1)

    # Rank the unlabeled samples by the confidence of a stand-alone SVC
    # trained on the labeled subset only.
    svc.fit(X_train[has_label], y_train_missing_labels[has_label])
    X_unlabeled = X_train[lacks_label]
    confidence = np.max(svc.predict_proba(X_unlabeled), axis=1)
    top_ten = np.argsort(confidence)[-10:]

    expected_rows = X_unlabeled[top_ten].tolist()
    rows_added = X_train[newly_labeled].tolist()

    for expected_row in expected_rows:
        assert expected_row in rows_added
|
||||
|
||||
|
||||
def test_estimator_meta_estimator():
    """Check self-training on top of a meta-estimator.

    A meta-estimator relying on an estimator implementing `predict_proba`
    must work even if it does not expose this method before being fitted.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19119
    """

    def make_stacking(probability):
        # All SVCs in the stack share the same `probability` setting.
        return StackingClassifier(
            estimators=[
                ("svc_1", SVC(probability=probability)),
                ("svc_2", SVC(probability=probability)),
            ],
            final_estimator=SVC(probability=probability),
            cv=2,
        )

    # Stack exposing `predict_proba`: fit and predict must go through.
    stack_with_proba = make_stacking(probability=True)
    assert hasattr(stack_with_proba, "predict_proba")
    self_trainer = SelfTrainingClassifier(estimator=stack_with_proba)
    self_trainer.fit(X_train, y_train_missing_labels)
    self_trainer.predict_proba(X_test)

    # Stack without `predict_proba`: fitting must fail.
    stack_without_proba = make_stacking(probability=False)
    assert not hasattr(stack_without_proba, "predict_proba")
    self_trainer = SelfTrainingClassifier(estimator=stack_without_proba)
    with pytest.raises(AttributeError):
        self_trainer.fit(X_train, y_train_missing_labels)
|
||||
|
||||
|
||||
def test_self_training_estimator_attribute_error():
    """Check the AttributeErrors raised for estimators missing required methods.

    Covers an `estimator` that does not implement `predict_proba`, which is
    called from within `fit`, and one that does not implement
    `decision_function`, which is decorated with `available_if`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28108
    """
    # `SVC` with `probability=False` does not implement 'predict_proba', which
    # `SelfTrainingClassifier.fit` requires internally, so an AttributeError
    # is expected.
    svc_trainer = SelfTrainingClassifier(SVC(probability=False, gamma="scale"))
    with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"):
        svc_trainer.fit(X_train, y_train_missing_labels)

    # `DecisionTreeClassifier` does not implement 'decision_function' and
    # should raise an AttributeError.
    tree_trainer = SelfTrainingClassifier(estimator=DecisionTreeClassifier())
    inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'"
    outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'"

    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        fitted_trainer = tree_trainer.fit(X_train, y_train_missing_labels)
        fitted_trainer.decision_function(X_train)

    # The outer error must chain the inner estimator's AttributeError.
    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg in str(cause)
|
||||
|
||||
|
||||
# TODO(1.8): remove in 1.8
def test_deprecation_warning_base_estimator():
    """Check the warning and errors tied to the deprecated `base_estimator`."""
    # Only `base_estimator` given: a FutureWarning is expected.
    deprecation_pattern = (
        "`base_estimator` has been deprecated in 1.6 and will be removed"
    )
    with pytest.warns(FutureWarning, match=deprecation_pattern):
        deprecated_usage = SelfTrainingClassifier(
            base_estimator=DecisionTreeClassifier()
        )
        deprecated_usage.fit(X_train, y_train_missing_labels)

    # Neither argument given: a ValueError is expected.
    missing_pattern = "You must pass an estimator to SelfTrainingClassifier"
    with pytest.raises(ValueError, match=missing_pattern):
        no_estimator = SelfTrainingClassifier()
        no_estimator.fit(X_train, y_train_missing_labels)

    # Both arguments given: a ValueError is expected.
    duplicate_pattern = "You must pass only one estimator to SelfTrainingClassifier."
    with pytest.raises(ValueError, match=duplicate_pattern):
        both_estimators = SelfTrainingClassifier(
            base_estimator=DecisionTreeClassifier(),
            estimator=DecisionTreeClassifier(),
        )
        both_estimators.fit(X_train, y_train_missing_labels)
|
||||
|
||||
|
||||
# Metadata routing tests
|
||||
# =================================================================
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning")
@pytest.mark.parametrize(
    "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"]
)
def test_routing_passed_metadata_not_supported(method):
    """Check the error raised when metadata is passed without routing enabled.

    The error is expected both from `fit` and from the parametrized `method`
    when `enable_metadata_routing=False`.
    """
    expected_error = "is only supported if enable_metadata_routing=True"
    metadata = {"sample_weight": [1], "prop": "a"}

    unfitted = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(ValueError, match=expected_error):
        unfitted.fit([[1], [1]], [1, 1], **metadata)

    pretend_fitted = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(ValueError, match=expected_error):
        # make sure that the estimator thinks it is already fitted
        pretend_fitted.fitted_params_ = True
        method_under_test = getattr(pretend_fitted, method)
        method_under_test([[1]], **metadata)
|
||||
|
||||
|
||||
# End of routing tests
|
||||
# ====================
|
||||
Reference in New Issue
Block a user