add read me
@@ -0,0 +1,46 @@
"""Methods and algorithms to robustly estimate covariance.

They estimate the covariance of features at given sets of points, as well as the
precision matrix defined as the inverse of the covariance. Covariance estimation is
closely related to the theory of Gaussian graphical models.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from ._elliptic_envelope import EllipticEnvelope
from ._empirical_covariance import (
    EmpiricalCovariance,
    empirical_covariance,
    log_likelihood,
)
from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso
from ._robust_covariance import MinCovDet, fast_mcd
from ._shrunk_covariance import (
    OAS,
    LedoitWolf,
    ShrunkCovariance,
    ledoit_wolf,
    ledoit_wolf_shrinkage,
    oas,
    shrunk_covariance,
)

__all__ = [
    "OAS",
    "EllipticEnvelope",
    "EmpiricalCovariance",
    "GraphicalLasso",
    "GraphicalLassoCV",
    "LedoitWolf",
    "MinCovDet",
    "ShrunkCovariance",
    "empirical_covariance",
    "fast_mcd",
    "graphical_lasso",
    "ledoit_wolf",
    "ledoit_wolf_shrinkage",
    "log_likelihood",
    "oas",
    "shrunk_covariance",
]
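A minimal usage sketch (editor's addition, not part of the package source) of the
estimators exported above, assuming a small synthetic dataset:

import numpy as np
from sklearn.covariance import EmpiricalCovariance, LedoitWolf

rng = np.random.RandomState(0)
X = rng.randn(100, 3)                   # 100 samples, 3 features
emp = EmpiricalCovariance().fit(X)      # maximum likelihood estimate
lw = LedoitWolf().fit(X)                # shrunk, better-conditioned estimate
print(emp.covariance_.shape)            # (3, 3)
print(lw.get_precision().shape)         # (3, 3), estimated inverse covariance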
6 binary files not shown.
@@ -0,0 +1,266 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from numbers import Real

import numpy as np

from ..base import OutlierMixin, _fit_context
from ..metrics import accuracy_score
from ..utils._param_validation import Interval
from ..utils.validation import check_is_fitted
from ._robust_covariance import MinCovDet


class EllipticEnvelope(OutlierMixin, MinCovDet):
    """An object for detecting outliers in a Gaussian distributed dataset.

    Read more in the :ref:`User Guide <outlier_detection>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of robust location and covariance estimates
        is computed, and a covariance estimate is recomputed from it,
        without centering the data.
        Useful when working with data whose mean is almost, but not exactly,
        zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. If None, the minimum value of support_fraction will
        be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`.
        Range is (0, 1).

    contamination : float, default=0.1
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5].

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling
        the data. Pass an int for reproducible results across multiple function
        calls. See :term:`Glossary <random_state>`.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated robust location.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute the
        robust estimates of location and shape.

    offset_ : float
        Offset used to define the decision function from the raw scores.
        We have the relation: ``decision_function = score_samples - offset_``.
        The offset depends on the contamination parameter and is defined in
        such a way that we obtain the expected number of outliers (samples with
        decision function < 0) in training.

        .. versionadded:: 0.20

    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    Outlier detection from covariance estimation may break or not
    perform well in high-dimensional settings. In particular, one should
    always take care to work with ``n_samples > n_features ** 2``.

    References
    ----------
    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
       minimum covariance determinant estimator" Technometrics 41(3), 212
       (1999)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EllipticEnvelope
    >>> true_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
    ...                                                  cov=true_cov,
    ...                                                  size=500)
    >>> cov = EllipticEnvelope(random_state=0).fit(X)
    >>> # predict returns 1 for an inlier and -1 for an outlier
    >>> cov.predict([[0, 0],
    ...              [3, 3]])
    array([ 1, -1])
    >>> cov.covariance_
    array([[0.7411, 0.2535],
           [0.2535, 0.3053]])
    >>> cov.location_
    array([0.0813 , 0.0427])
    """
    _parameter_constraints: dict = {
        **MinCovDet._parameter_constraints,
        "contamination": [Interval(Real, 0, 0.5, closed="right")],
    }

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        contamination=0.1,
        random_state=None,
    ):
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
            support_fraction=support_fraction,
            random_state=random_state,
        )
        self.contamination = contamination

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the EllipticEnvelope model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        super().fit(X)
        self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination)
        return self

    def decision_function(self, X):
        """Compute the decision function of the given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        decision : ndarray of shape (n_samples,)
            Decision function of the samples.
            It is equal to the shifted Mahalanobis distances.
            The threshold for being an outlier is 0, which ensures
            compatibility with other outlier detection algorithms.
        """
        check_is_fitted(self)
        negative_mahal_dist = self.score_samples(X)
        return negative_mahal_dist - self.offset_

    def score_samples(self, X):
        """Compute the negative Mahalanobis distances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        negative_mahal_distances : array-like of shape (n_samples,)
            Opposite of the Mahalanobis distances.
        """
        check_is_fitted(self)
        return -self.mahalanobis(X)

    def predict(self, X):
        """
        Predict labels (1 inlier, -1 outlier) of X according to fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        values = self.decision_function(X)
        is_inlier = np.full(values.shape[0], -1, dtype=int)
        is_inlier[values >= 0] = 1

        return is_inlier

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy,
        which is a harsh metric since it requires, for each sample, that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
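A hedged sketch (editor's addition) of how ``contamination`` and ``offset_``
interact: the offset is the ``contamination`` quantile of the negated
Mahalanobis training distances, so roughly that fraction of the training data
ends up flagged as outlying.

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.8, 0.3], [0.3, 0.4]], size=500)
clf = EllipticEnvelope(contamination=0.1, random_state=0).fit(X)
pred = clf.predict(X)
print(np.mean(pred == -1))  # close to 0.1 by construction of offset_
print(np.allclose(clf.decision_function(X), clf.score_samples(X) - clf.offset_))  # True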
@@ -0,0 +1,370 @@
"""Maximum likelihood covariance estimator."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings

import numpy as np
from scipy import linalg

from sklearn.utils import metadata_routing

from .. import config_context
from ..base import BaseEstimator, _fit_context
from ..metrics.pairwise import pairwise_distances
from ..utils import check_array
from ..utils._param_validation import validate_params
from ..utils.extmath import fast_logdet
from ..utils.validation import validate_data


@validate_params(
    {
        "emp_cov": [np.ndarray],
        "precision": [np.ndarray],
    },
    prefer_skip_nested_validation=True,
)
def log_likelihood(emp_cov, precision):
    """Compute the sample mean of the log_likelihood under a covariance model.

    Computes the empirical expected log-likelihood, allowing for universal
    comparison (beyond this software package), and accounts for normalization
    terms and scaling.

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Maximum Likelihood Estimator of covariance.

    precision : ndarray of shape (n_features, n_features)
        The precision matrix of the covariance model to be tested.

    Returns
    -------
    log_likelihood_ : float
        Sample mean of the log-likelihood.
    """
    p = precision.shape[0]
    log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision)
    log_likelihood_ -= p * np.log(2 * np.pi)
    log_likelihood_ /= 2.0
    return log_likelihood_
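# --- Editor's sketch (illustration only; would live in an example script,
# not in this module) ---
# A quick numeric check of log_likelihood: with the identity matrix as both
# the empirical covariance and the model precision in p = 2 dimensions, the
# formula above reduces to -(p + p * log(2 * pi)) / 2.
import numpy as np
from sklearn.covariance import log_likelihood

identity = np.eye(2)
expected = -(2 + 2 * np.log(2 * np.pi)) / 2
print(np.isclose(log_likelihood(identity, identity), expected))  # True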
@validate_params(
    {
        "X": ["array-like"],
        "assume_centered": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def empirical_covariance(X, *, assume_centered=False):
    """Compute the Maximum likelihood covariance estimator.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If `True`, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly,
        zero.
        If `False`, data will be centered before computation.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        Empirical covariance (Maximum Likelihood Estimator).

    Examples
    --------
    >>> from sklearn.covariance import empirical_covariance
    >>> X = [[1, 1, 1], [1, 1, 1], [1, 1, 1],
    ...      [0, 0, 0], [0, 0, 0], [0, 0, 0]]
    >>> empirical_covariance(X)
    array([[0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25]])
    """
    X = check_array(X, ensure_2d=False, ensure_all_finite=False)

    if X.ndim == 1:
        X = np.reshape(X, (1, -1))

    if X.shape[0] == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )

    if assume_centered:
        covariance = np.dot(X.T, X) / X.shape[0]
    else:
        covariance = np.cov(X.T, bias=1)

    if covariance.ndim == 0:
        covariance = np.array([[covariance]])
    return covariance
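# --- Editor's sketch (illustration only; would live in an example script,
# not in this module) ---
# The assume_centered switch: with centering disabled, the function returns
# the raw second-moment matrix X.T @ X / n_samples instead of the covariance
# of the mean-centered data.
import numpy as np
from sklearn.covariance import empirical_covariance

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
print(empirical_covariance(X))                        # covariance after centering
print(empirical_covariance(X, assume_centered=True))  # X.T @ X / 3, no centering
print(np.allclose(empirical_covariance(X, assume_centered=True), X.T @ X / 3))  # True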
class EmpiricalCovariance(BaseEstimator):
    """Maximum likelihood covariance estimator.

    Read more in the :ref:`User Guide <covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specifies if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo-inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EmpiricalCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = EmpiricalCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7569, 0.2818],
           [0.2818, 0.3928]])
    >>> cov.location_
    array([0.0622, 0.0193])
    """

    # X_test should have been called X
    __metadata_request__score = {"X_test": metadata_routing.UNUSED}

    _parameter_constraints: dict = {
        "store_precision": ["boolean"],
        "assume_centered": ["boolean"],
    }

    def __init__(self, *, store_precision=True, assume_centered=False):
        self.store_precision = store_precision
        self.assume_centered = assume_centered
    def _set_covariance(self, covariance):
        """Save the covariance and precision estimates.

        Storage is done according to `self.store_precision`.
        Precision is stored only if invertible.

        Parameters
        ----------
        covariance : array-like of shape (n_features, n_features)
            Estimated covariance matrix to be stored, and from which precision
            is computed.
        """
        covariance = check_array(covariance)
        # set covariance
        self.covariance_ = covariance
        # set precision
        if self.store_precision:
            self.precision_ = linalg.pinvh(covariance, check_finite=False)
        else:
            self.precision_ = None

    def get_precision(self):
        """Getter for the precision matrix.

        Returns
        -------
        precision_ : array-like of shape (n_features, n_features)
            The precision matrix associated to the current covariance object.
        """
        if self.store_precision:
            precision = self.precision_
        else:
            precision = linalg.pinvh(self.covariance_, check_finite=False)
        return precision

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the maximum likelihood covariance estimator to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        covariance = empirical_covariance(X, assume_centered=self.assume_centered)
        self._set_covariance(covariance)

        return self
    def score(self, X_test, y=None):
        """Compute the log-likelihood of `X_test` under the estimated Gaussian model.

        The Gaussian model is defined by its mean and covariance matrix, which are
        represented respectively by `self.location_` and `self.covariance_`.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test data of which we compute the likelihood, where `n_samples` is
            the number of samples and `n_features` is the number of features.
            `X_test` is assumed to be drawn from the same distribution as
            the data used in fit (including centering).

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        res : float
            The log-likelihood of `X_test` with `self.location_` and `self.covariance_`
            as estimators of the Gaussian model mean and covariance matrix respectively.
        """
        X_test = validate_data(self, X_test, reset=False)
        # compute empirical covariance of the test set
        test_cov = empirical_covariance(X_test - self.location_, assume_centered=True)
        # compute log likelihood
        res = log_likelihood(test_cov, self.get_precision())

        return res
    def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True):
        """Compute the Mean Squared Error between two covariance estimators.

        Parameters
        ----------
        comp_cov : array-like of shape (n_features, n_features)
            The covariance to compare with.

        norm : {"frobenius", "spectral"}, default="frobenius"
            The type of norm used to compute the error. Available error types:
            - 'frobenius' (default): sqrt(tr(A^t.A))
            - 'spectral': sqrt(max(eigenvalues(A^t.A)))
            where A is the error ``(comp_cov - self.covariance_)``.

        scaling : bool, default=True
            If True (default), the squared error norm is divided by n_features.
            If False, the squared error norm is not rescaled.

        squared : bool, default=True
            Whether to compute the squared error norm or the error norm.
            If True (default), the squared error norm is returned.
            If False, the error norm is returned.

        Returns
        -------
        result : float
            The Mean Squared Error (in the sense of the chosen norm) between
            `self` and `comp_cov` covariance estimators.
        """
        # compute the error
        error = comp_cov - self.covariance_
        # compute the error norm
        if norm == "frobenius":
            squared_norm = np.sum(error**2)
        elif norm == "spectral":
            squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))
        else:
            raise NotImplementedError(
                "Only spectral and frobenius norms are implemented"
            )
        # optionally scale the error norm
        if scaling:
            squared_norm = squared_norm / error.shape[0]
        # finally get either the squared norm or the norm
        if squared:
            result = squared_norm
        else:
            result = np.sqrt(squared_norm)

        return result
    def mahalanobis(self, X):
        """Compute the squared Mahalanobis distances of given observations.

        For a detailed example of how outliers affect the Mahalanobis distance,
        see :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The observations, whose squared Mahalanobis distances we
            compute. Observations are assumed to be drawn from the same
            distribution as the data used in fit.

        Returns
        -------
        dist : ndarray of shape (n_samples,)
            Squared Mahalanobis distances of the observations.
        """
        X = validate_data(self, X, reset=False)

        precision = self.get_precision()
        with config_context(assume_finite=True):
            # compute mahalanobis distances
            dist = pairwise_distances(
                X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision
            )

        return np.reshape(dist, (len(X),)) ** 2
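# --- Editor's sketch (illustration only; would live in an example script,
# not in this module) ---
# The value returned by mahalanobis() matches the quadratic form
# (x - mu)^T Sigma^{-1} (x - mu) computed by hand from the fitted estimator.
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_train = rng.randn(200, 2)
cov = EmpiricalCovariance().fit(X_train)
x = np.array([[1.0, -1.0]])
diff = (x - cov.location_).ravel()
by_hand = diff @ cov.get_precision() @ diff
print(np.allclose(cov.mahalanobis(x)[0], by_hand))  # True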
venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py (new file, 1145 lines)
File diff suppressed because it is too large
@@ -0,0 +1,874 @@
"""
Robust location and covariance estimators.

Here are implemented estimators that are resistant to outliers.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from scipy import linalg
from scipy.stats import chi2

from ..base import _fit_context
from ..utils import check_array, check_random_state
from ..utils._param_validation import Interval
from ..utils.extmath import fast_logdet
from ..utils.validation import validate_data
from ._empirical_covariance import EmpiricalCovariance, empirical_covariance


# Minimum Covariance Determinant
# Implementation of an algorithm by Rousseeuw & Van Driessen described in
# (A Fast Algorithm for the Minimum Covariance Determinant Estimator,
# 1999, American Statistical Association and the American Society
# for Quality, TECHNOMETRICS)
# XXX Is this really a public function? It's not listed in the docs or
# exported by sklearn.covariance. Deprecate?
def c_step(
    X,
    n_support,
    remaining_iterations=30,
    initial_estimates=None,
    verbose=False,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """C_step procedure described in [Rouseeuw1999]_ aiming at computing MCD.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data set in which we look for the n_support observations whose
        scatter matrix has minimum determinant.

    n_support : int
        Number of observations to compute the robust estimates of location
        and covariance from. This parameter must be greater than
        `n_samples / 2`.

    remaining_iterations : int, default=30
        Number of iterations to perform.
        According to [Rouseeuw1999]_, two iterations are sufficient to get
        close to the minimum, and we never need more than 30 to reach
        convergence.

    initial_estimates : tuple of shape (2,), default=None
        Initial estimates of location and shape from which to run the c_step
        procedure:
        - initial_estimates[0]: an initial location estimate
        - initial_estimates[1]: an initial covariance estimate

    verbose : bool, default=False
        Verbose mode.

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    location : ndarray of shape (n_features,)
        Robust location estimates.

    covariance : ndarray of shape (n_features, n_features)
        Robust covariance estimates.

    det : float
        Log-determinant of the covariance estimate.

    support : ndarray of shape (n_samples,)
        A mask for the `n_support` observations whose scatter matrix has
        minimum determinant.

    dist : ndarray of shape (n_samples,)
        Mahalanobis distances of the observations from the robust location
        estimate.

    References
    ----------
    .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    """
    X = np.asarray(X)
    random_state = check_random_state(random_state)
    return _c_step(
        X,
        n_support,
        remaining_iterations=remaining_iterations,
        initial_estimates=initial_estimates,
        verbose=verbose,
        cov_computation_method=cov_computation_method,
        random_state=random_state,
    )
def _c_step(
    X,
    n_support,
    random_state,
    remaining_iterations=30,
    initial_estimates=None,
    verbose=False,
    cov_computation_method=empirical_covariance,
):
    n_samples, n_features = X.shape
    dist = np.inf

    # Initialisation
    if initial_estimates is None:
        # compute initial robust estimates from a random subset
        support_indices = random_state.permutation(n_samples)[:n_support]
    else:
        # get initial robust estimates from the function parameters
        location = initial_estimates[0]
        covariance = initial_estimates[1]
        # run a special iteration for that case (to get an initial support_indices)
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(1)
        # compute new estimates
        support_indices = np.argpartition(dist, n_support - 1)[:n_support]

    X_support = X[support_indices]
    location = X_support.mean(0)
    covariance = cov_computation_method(X_support)

    # Iterative procedure for Minimum Covariance Determinant computation
    det = fast_logdet(covariance)
    # If the data already has singular covariance, calculate the precision,
    # as the loop below will not be entered.
    if np.isinf(det):
        precision = linalg.pinvh(covariance)

    previous_det = np.inf
    while det < previous_det and remaining_iterations > 0 and not np.isinf(det):
        # save old estimates values
        previous_location = location
        previous_covariance = covariance
        previous_det = det
        previous_support_indices = support_indices
        # compute a new support_indices from the full data set mahalanobis distances
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
        # compute new estimates
        support_indices = np.argpartition(dist, n_support - 1)[:n_support]
        X_support = X[support_indices]
        location = X_support.mean(axis=0)
        covariance = cov_computation_method(X_support)
        det = fast_logdet(covariance)
        # update remaining iterations for early stopping
        remaining_iterations -= 1

    previous_dist = dist
    dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
    # Check if best fit already found (det => 0, logdet => -inf)
    if np.isinf(det):
        results = location, covariance, det, support_indices, dist
    # Check convergence
    if np.allclose(det, previous_det):
        # c_step procedure converged
        if verbose:
            print(
                "Optimal couple (location, covariance) found before"
                " ending iterations (%d left)" % (remaining_iterations)
            )
        results = location, covariance, det, support_indices, dist
    elif det > previous_det:
        # determinant has increased (should not happen)
        warnings.warn(
            "Determinant has increased; this should not happen: "
            "log(det) > log(previous_det) (%.15f > %.15f). "
            "You may want to try with a higher value of "
            "support_fraction (current value: %.3f)."
            % (det, previous_det, n_support / n_samples),
            RuntimeWarning,
        )
        results = (
            previous_location,
            previous_covariance,
            previous_det,
            previous_support_indices,
            previous_dist,
        )

    # Check early stopping
    if remaining_iterations == 0:
        if verbose:
            print("Maximum number of iterations reached")
        results = location, covariance, det, support_indices, dist

    location, covariance, det, support_indices, dist = results
    # Convert from list of indices to boolean mask.
    support = np.bincount(support_indices, minlength=n_samples).astype(bool)
    return location, covariance, det, support, dist
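# --- Editor's sketch (illustration only; would live in an example script,
# not in this module; note the import from a private module) ---
# One call to the c_step helper on contaminated 2-D data: it returns the
# robust location/covariance, the log-determinant, the support mask, and the
# Mahalanobis distances. The planted outliers are typically left out of the
# support.
import numpy as np
from sklearn.covariance._robust_covariance import c_step

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(95, 2), 10 + rng.randn(5, 2)])  # 5 gross outliers
location, covariance, det, support, dist = c_step(X, n_support=75, random_state=0)
print(support.sum())       # 75 observations retained
print(support[-5:].any())  # usually False: the outliers fall outside the support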
def select_candidates(
    X,
    n_support,
    n_trials,
    select=1,
    n_iter=30,
    verbose=False,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """Find the best pure subsets of observations from which to compute the MCD.

    The purpose of this function is to find the best sets of n_support
    observations with respect to a minimization of their covariance
    matrix determinant. Equivalently, it removes n_samples-n_support
    observations to construct what we call a pure data set (i.e. not
    containing outliers). The list of the observations of the pure
    data set is referred to as the `support`.

    Starting from a random support, the pure data set is found by the
    c_step procedure introduced by Rousseeuw and Van Driessen in
    [RV]_.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data (sub)set in which we look for the n_support purest observations.

    n_support : int
        The number of samples the pure data set must contain.
        This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.

    n_trials : int or tuple of shape (2,)
        Number of different initial sets of observations from which to
        run the algorithm. This parameter should be a strictly positive
        integer.
        Instead of giving a number of trials to perform, one can provide a
        list of initial estimates that will be used to iteratively run
        c_step procedures. In this case:
        - n_trials[0]: array-like, shape (n_trials, n_features)
          is the list of `n_trials` initial location estimates
        - n_trials[1]: array-like, shape (n_trials, n_features, n_features)
          is the list of `n_trials` initial covariances estimates

    select : int, default=1
        Number of best candidates results to return. This parameter must be
        a strictly positive integer.

    n_iter : int, default=30
        Maximum number of iterations for the c_step procedure.
        (2 is enough to be close to the final solution. "Never" exceeds 20).
        This parameter must be a strictly positive integer.

    verbose : bool, default=False
        Control the output verbosity.

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return an array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    See Also
    --------
    c_step

    Returns
    -------
    best_locations : ndarray of shape (select, n_features)
        The `select` location estimates computed from the `select` best
        supports found in the data set (`X`).

    best_covariances : ndarray of shape (select, n_features, n_features)
        The `select` covariance estimates computed from the `select`
        best supports found in the data set (`X`).

    best_supports : ndarray of shape (select, n_samples)
        The `select` best supports found in the data set (`X`).

    best_ds : ndarray of shape (select, n_samples)
        The Mahalanobis distances associated with the `select` best
        supports found in the data set (`X`).

    References
    ----------
    .. [RV] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    """
    random_state = check_random_state(random_state)

    if isinstance(n_trials, Integral):
        run_from_estimates = False
    elif isinstance(n_trials, tuple):
        run_from_estimates = True
        estimates_list = n_trials
        n_trials = estimates_list[0].shape[0]
    else:
        raise TypeError(
            "Invalid 'n_trials' parameter, expected tuple or integer, got %s (%s)"
            % (n_trials, type(n_trials))
        )

    # compute `n_trials` location and shape estimates candidates in the subset
    all_estimates = []
    if not run_from_estimates:
        # perform `n_trials` computations from random initial supports
        for j in range(n_trials):
            all_estimates.append(
                _c_step(
                    X,
                    n_support,
                    remaining_iterations=n_iter,
                    verbose=verbose,
                    cov_computation_method=cov_computation_method,
                    random_state=random_state,
                )
            )
    else:
        # perform computations from every given initial estimates
        for j in range(n_trials):
            initial_estimates = (estimates_list[0][j], estimates_list[1][j])
            all_estimates.append(
                _c_step(
                    X,
                    n_support,
                    remaining_iterations=n_iter,
                    initial_estimates=initial_estimates,
                    verbose=verbose,
                    cov_computation_method=cov_computation_method,
                    random_state=random_state,
                )
            )
    all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = zip(
        *all_estimates
    )
    # find the `select` best results among the `n_trials` ones
    index_best = np.argsort(all_dets_sub)[:select]
    best_locations = np.asarray(all_locs_sub)[index_best]
    best_covariances = np.asarray(all_covs_sub)[index_best]
    best_supports = np.asarray(all_supports_sub)[index_best]
    best_ds = np.asarray(all_ds_sub)[index_best]

    return best_locations, best_covariances, best_supports, best_ds
def fast_mcd(
    X,
    support_fraction=None,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """Estimate the Minimum Covariance Determinant matrix.

    Read more in the :ref:`User Guide <robust_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix, with p features and n samples.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is `None`, which implies that the minimum
        value of `support_fraction` will be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`. This parameter must be
        in the range (0, 1).

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return an array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    location : ndarray of shape (n_features,)
        Robust location of the data.

    covariance : ndarray of shape (n_features, n_features)
        Robust covariance of the features.

    support : ndarray of shape (n_samples,), dtype=bool
        A mask of the observations that have been used to compute
        the robust location and covariance estimates of the data set.

    dist : ndarray of shape (n_samples,)
        Mahalanobis distances of the observations from the robust location
        estimate.

    Notes
    -----
    The FastMCD algorithm has been introduced by Rousseeuw and Van Driessen
    in "A Fast Algorithm for the Minimum Covariance Determinant Estimator,
    1999, American Statistical Association and the American Society
    for Quality, TECHNOMETRICS".
    The principle is to compute robust estimates on random subsets before
    pooling them into larger subsets, and finally into the full data set.
    Depending on the size of the initial sample, we have one, two or three
    such computation levels.

    Note that only raw estimates are returned. If one is interested in
    the correction and reweighting steps described in [RouseeuwVan]_,
    see the MinCovDet object.

    References
    ----------

    .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance
        Determinant Estimator, 1999, American Statistical Association
        and the American Society for Quality, TECHNOMETRICS

    .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,
        Asymptotics For The Minimum Covariance Determinant Estimator,
        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
    """
    random_state = check_random_state(random_state)

    X = check_array(X, ensure_min_samples=2, estimator="fast_mcd")
    n_samples, n_features = X.shape

    # minimum breakdown value
    if support_fraction is None:
        n_support = min(int(np.ceil(0.5 * (n_samples + n_features + 1))), n_samples)
    else:
        n_support = int(support_fraction * n_samples)

    # 1-dimensional case quick computation
    # (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
    # Regression and Outlier Detection, John Wiley & Sons, chapter 4)
    if n_features == 1:
        if n_support < n_samples:
            # find the sample shortest halves
            X_sorted = np.sort(np.ravel(X))
            diff = X_sorted[n_support:] - X_sorted[: (n_samples - n_support)]
            halves_start = np.where(diff == np.min(diff))[0]
            # take the middle points' mean to get the robust location estimate
            location = (
                0.5
                * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean()
            )
            support = np.zeros(n_samples, dtype=bool)
            X_centered = X - location
            support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True
            covariance = np.asarray([[np.var(X[support])]])
            location = np.array([location])
            # get precision matrix in an optimized way
            precision = linalg.pinvh(covariance)
            dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
        else:
            support = np.ones(n_samples, dtype=bool)
            covariance = np.asarray([[np.var(X)]])
            location = np.asarray([np.mean(X)])
            X_centered = X - location
            # get precision matrix in an optimized way
            precision = linalg.pinvh(covariance)
            dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
    # Starting FastMCD algorithm for p-dimensional case
    if (n_samples > 500) and (n_features > 1):
        # 1. Find candidate supports on subsets
        # a. split the set in subsets of size ~ 300
        n_subsets = n_samples // 300
        n_samples_subsets = n_samples // n_subsets
        samples_shuffle = random_state.permutation(n_samples)
        h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples))))
        # b. perform a total of 500 trials
        n_trials_tot = 500
        # c. select 10 best (location, covariance) for each subset
        n_best_sub = 10
        n_trials = max(10, n_trials_tot // n_subsets)
        n_best_tot = n_subsets * n_best_sub
        all_best_locations = np.zeros((n_best_tot, n_features))
        try:
            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
        except MemoryError:
            # The above is too big. Let's try with something much smaller
            # (and less optimal)
            n_best_tot = 10
            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
            n_best_sub = 2
        for i in range(n_subsets):
            low_bound = i * n_samples_subsets
            high_bound = low_bound + n_samples_subsets
            current_subset = X[samples_shuffle[low_bound:high_bound]]
            best_locations_sub, best_covariances_sub, _, _ = select_candidates(
                current_subset,
                h_subset,
                n_trials,
                select=n_best_sub,
                n_iter=2,
                cov_computation_method=cov_computation_method,
                random_state=random_state,
            )
            subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)
            all_best_locations[subset_slice] = best_locations_sub
            all_best_covariances[subset_slice] = best_covariances_sub
        # 2. Pool the candidate supports into a merged set
        # (possibly the full dataset)
        n_samples_merged = min(1500, n_samples)
        h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples))))
        if n_samples > 1500:
            n_best_merged = 10
        else:
            n_best_merged = 1
        # find the best couples (location, covariance) on the merged set
        selection = random_state.permutation(n_samples)[:n_samples_merged]
        locations_merged, covariances_merged, supports_merged, d = select_candidates(
            X[selection],
            h_merged,
            n_trials=(all_best_locations, all_best_covariances),
            select=n_best_merged,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        # 3. Finally get the overall best (locations, covariance) couple
        if n_samples < 1500:
            # directly get the best couple (location, covariance)
            location = locations_merged[0]
            covariance = covariances_merged[0]
            support = np.zeros(n_samples, dtype=bool)
            dist = np.zeros(n_samples)
            support[selection] = supports_merged[0]
            dist[selection] = d[0]
        else:
            # select the best couple on the full dataset
            locations_full, covariances_full, supports_full, d = select_candidates(
                X,
                n_support,
                n_trials=(locations_merged, covariances_merged),
                select=1,
                cov_computation_method=cov_computation_method,
                random_state=random_state,
            )
            location = locations_full[0]
            covariance = covariances_full[0]
            support = supports_full[0]
            dist = d[0]
    elif n_features > 1:
        # 1. Find the 10 best couples (location, covariance)
        # considering two iterations
        n_trials = 30
        n_best = 10
        locations_best, covariances_best, _, _ = select_candidates(
            X,
            n_support,
            n_trials=n_trials,
            select=n_best,
            n_iter=2,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        # 2. Select the best couple on the full dataset amongst the 10
        locations_full, covariances_full, supports_full, d = select_candidates(
            X,
            n_support,
            n_trials=(locations_best, covariances_best),
            select=1,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        location = locations_full[0]
        covariance = covariances_full[0]
        support = supports_full[0]
        dist = d[0]

    return location, covariance, support, dist
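# --- Editor's sketch (illustration only; would live in an example script,
# not in this module) ---
# fast_mcd on contaminated 2-D data returns raw (uncorrected) robust
# estimates; MinCovDet wraps this with the consistency correction and
# re-weighting steps defined below.
import numpy as np
from sklearn.covariance import fast_mcd

rng = np.random.RandomState(42)
X = np.vstack([rng.randn(90, 2), 8 + rng.randn(10, 2)])  # 10% gross outliers
location, covariance, support, dist = fast_mcd(X, random_state=0)
print(np.round(location, 1))  # near [0, 0] despite the outliers
print(support.sum())          # size of the raw MCD support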
class MinCovDet(EmpiricalCovariance):
    """Minimum Covariance Determinant (MCD): robust estimator of covariance.

    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data
    drawn from a unimodal, symmetric distribution. It is not meant to be used
    with multi-modal data (the algorithm used to fit a MinCovDet object is
    likely to fail in such a case).
    One should consider projection pursuit methods to deal with multi-modal
    datasets.

    Read more in the :ref:`User Guide <robust_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful when working with data whose mean is almost, but not exactly,
        zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum
        value of support_fraction will be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`. The parameter must be
        in the range (0, 1].

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    location_ : ndarray of shape (n_features,)
        Estimated robust location.

        For an example of comparing raw robust estimates with
        the true location and covariance, refer to
        :ref:`sphx_glr_auto_examples_covariance_plot_robust_vs_empirical_covariance.py`.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the robust estimates of location and shape.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    References
    ----------

    .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.
        J. Am Stat Ass, 79:871, 1984.
    .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,
        Asymptotics For The Minimum Covariance Determinant Estimator,
        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import MinCovDet
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = MinCovDet(random_state=0).fit(X)
    >>> cov.covariance_
    array([[0.7411, 0.2535],
           [0.2535, 0.3053]])
    >>> cov.location_
    array([0.0813 , 0.0427])
    """
    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "support_fraction": [Interval(Real, 0, 1, closed="right"), None],
        "random_state": ["random_state"],
    }
    _nonrobust_covariance = staticmethod(empirical_covariance)

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        random_state=None,
    ):
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit a Minimum Covariance Determinant with the FastMCD algorithm.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X, ensure_min_samples=2, estimator="MinCovDet")
        random_state = check_random_state(self.random_state)
        n_samples, n_features = X.shape
        # check that the empirical covariance is full rank
        if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features:
            warnings.warn(
                "The covariance matrix associated to your dataset is not full rank"
            )
        # compute and store raw estimates
        raw_location, raw_covariance, raw_support, raw_dist = fast_mcd(
            X,
            support_fraction=self.support_fraction,
            cov_computation_method=self._nonrobust_covariance,
            random_state=random_state,
        )
        if self.assume_centered:
            raw_location = np.zeros(n_features)
            raw_covariance = self._nonrobust_covariance(
                X[raw_support], assume_centered=True
            )
            # get precision matrix in an optimized way
            precision = linalg.pinvh(raw_covariance)
            raw_dist = np.sum(np.dot(X, precision) * X, 1)
        self.raw_location_ = raw_location
        self.raw_covariance_ = raw_covariance
        self.raw_support_ = raw_support
        self.location_ = raw_location
        self.support_ = raw_support
        self.dist_ = raw_dist
        # obtain consistency at normal models
        self.correct_covariance(X)
        # re-weight estimator
        self.reweight_covariance(X)

        return self
    def correct_covariance(self, data):
        """Apply a correction to raw Minimum Covariance Determinant estimates.

        Correction using the empirical correction factor suggested
        by Rousseeuw and Van Driessen in [RVD]_.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)
            The data matrix, with p features and n samples.
            The data set must be the one which was used to compute
            the raw estimates.

        Returns
        -------
        covariance_corrected : ndarray of shape (n_features, n_features)
            Corrected robust covariance estimate.

        References
        ----------

        .. [RVD] A Fast Algorithm for the Minimum Covariance
            Determinant Estimator, 1999, American Statistical Association
            and the American Society for Quality, TECHNOMETRICS
        """

        # Check that the covariance of the support data is not equal to 0.
        # Otherwise self.dist_ = 0 and thus correction = 0.
        n_samples = len(self.dist_)
        n_support = np.sum(self.support_)
        if n_support < n_samples and np.allclose(self.raw_covariance_, 0):
            raise ValueError(
                "The covariance matrix of the support data "
                "is equal to 0, try to increase support_fraction"
            )
        correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
        covariance_corrected = self.raw_covariance_ * correction
        self.dist_ /= correction
        return covariance_corrected
def reweight_covariance(self, data):
|
||||
"""Re-weight raw Minimum Covariance Determinant estimates.
|
||||
|
||||
Re-weight observations using Rousseeuw's method (equivalent to
|
||||
deleting outlying observations from the data set before
|
||||
computing location and covariance estimates) described
|
||||
in [RVDriessen]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like of shape (n_samples, n_features)
|
||||
The data matrix, with p features and n samples.
|
||||
The data set must be the one which was used to compute
|
||||
the raw estimates.
|
||||
|
||||
Returns
|
||||
-------
|
||||
location_reweighted : ndarray of shape (n_features,)
|
||||
Re-weighted robust location estimate.
|
||||
|
||||
covariance_reweighted : ndarray of shape (n_features, n_features)
|
||||
Re-weighted robust covariance estimate.
|
||||
|
||||
support_reweighted : ndarray of shape (n_samples,), dtype=bool
|
||||
A mask of the observations that have been used to compute
|
||||
the re-weighted robust location and covariance estimates.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
.. [RVDriessen] A Fast Algorithm for the Minimum Covariance
|
||||
Determinant Estimator, 1999, American Statistical Association
|
||||
and the American Society for Quality, TECHNOMETRICS
|
||||
"""
|
||||
n_samples, n_features = data.shape
|
||||
mask = self.dist_ < chi2(n_features).isf(0.025)
|
||||
if self.assume_centered:
|
||||
location_reweighted = np.zeros(n_features)
|
||||
else:
|
||||
location_reweighted = data[mask].mean(0)
|
||||
covariance_reweighted = self._nonrobust_covariance(
|
||||
data[mask], assume_centered=self.assume_centered
|
||||
)
|
||||
support_reweighted = np.zeros(n_samples, dtype=bool)
|
||||
support_reweighted[mask] = True
|
||||
self._set_covariance(covariance_reweighted)
|
||||
self.location_ = location_reweighted
|
||||
self.support_ = support_reweighted
|
||||
X_centered = data - self.location_
|
||||
self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1)
|
||||
return location_reweighted, covariance_reweighted, support_reweighted
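

# A minimal usage sketch (commented out so the module stays importable),
# assuming only that scikit-learn is installed; `X_demo` is an illustrative
# name. `fit` chains both post-processing steps above, so `covariance_`
# already reflects the corrected, reweighted estimate:
#
#   import numpy as np
#   from sklearn.covariance import MinCovDet
#   rng = np.random.RandomState(0)
#   X_demo = rng.multivariate_normal([0, 0], [[1.0, 0.3], [0.3, 0.5]], size=200)
#   X_demo[:20] += 6.0  # shift a few points to act as outliers
#   mcd = MinCovDet(random_state=0).fit(X_demo)
#   print(mcd.raw_covariance_)  # plain MCD fit, before correction/reweighting
#   print(mcd.covariance_)      # after both steps; closer to the inlier spread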
@@ -0,0 +1,822 @@
"""
Covariance estimators using shrinkage.

Shrinkage corresponds to regularising `cov` using a convex combination:
shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate.

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# avoid division truncation
import warnings
from numbers import Integral, Real

import numpy as np

from ..base import _fit_context
from ..utils import check_array
from ..utils._param_validation import Interval, validate_params
from ..utils.validation import validate_data
from . import EmpiricalCovariance, empirical_covariance


def _ledoit_wolf(X, *, assume_centered, block_size):
    """Estimate the shrunk Ledoit-Wolf covariance matrix."""
    # for only one feature, the result is the same whatever the shrinkage
    if len(X.shape) == 2 and X.shape[1] == 1:
        if not assume_centered:
            X = X - X.mean()
        return np.atleast_2d((X**2).mean()), 0.0
    n_features = X.shape[1]

    # get Ledoit-Wolf shrinkage
    shrinkage = ledoit_wolf_shrinkage(
        X, assume_centered=assume_centered, block_size=block_size
    )
    emp_cov = empirical_covariance(X, assume_centered=assume_centered)
    mu = np.sum(np.trace(emp_cov)) / n_features
    shrunk_cov = (1.0 - shrinkage) * emp_cov
    shrunk_cov.flat[:: n_features + 1] += shrinkage * mu

    return shrunk_cov, shrinkage
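

# A small commented sketch of the identity used above: adding `shrinkage * mu`
# along the diagonal of `(1 - shrinkage) * emp_cov` equals the convex
# combination with the scaled identity target (`S` is an arbitrary symmetric
# matrix chosen for illustration):
#
#   import numpy as np
#   S = np.array([[2.0, 0.5], [0.5, 1.0]])
#   shrinkage, p = 0.3, S.shape[0]
#   mu = np.trace(S) / p
#   a = (1.0 - shrinkage) * S
#   a.flat[:: p + 1] += shrinkage * mu  # in-place diagonal update
#   b = (1.0 - shrinkage) * S + shrinkage * mu * np.identity(p)
#   assert np.allclose(a, b)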


def _oas(X, *, assume_centered=False):
    """Estimate covariance with the Oracle Approximating Shrinkage algorithm.

    The formulation is based on [1]_.
    [1] "Shrinkage algorithms for MMSE covariance estimation.",
        Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
        IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
        https://arxiv.org/pdf/0907.4698.pdf
    """
    if len(X.shape) == 2 and X.shape[1] == 1:
        # for only one feature, the result is the same whatever the shrinkage
        if not assume_centered:
            X = X - X.mean()
        return np.atleast_2d((X**2).mean()), 0.0

    n_samples, n_features = X.shape

    emp_cov = empirical_covariance(X, assume_centered=assume_centered)

    # The shrinkage is defined as:
    # shrinkage = min(
    #     (trace(S @ S.T) + trace(S)**2)
    #     / ((n + 1) * (trace(S @ S.T) - trace(S)**2 / p)),
    #     1,
    # )
    # where n and p are n_samples and n_features, respectively (cf. Eq. 23 in [1]).
    # The factor 2 / p is omitted since it does not impact the value of the estimator
    # for large p.

    # Instead of computing trace(S)**2, we can compute the average of the squared
    # elements of S that is equal to trace(S)**2 / p**2.
    # See the definition of the Frobenius norm:
    # https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm
    alpha = np.mean(emp_cov**2)
    mu = np.trace(emp_cov) / n_features
    mu_squared = mu**2

    # The factor 1 / p**2 will cancel out since it is in both the numerator and
    # denominator
    num = alpha + mu_squared
    den = (n_samples + 1) * (alpha - mu_squared / n_features)
    shrinkage = 1.0 if den == 0 else min(num / den, 1.0)

    # The shrunk covariance is defined as:
    # (1 - shrinkage) * S + shrinkage * F (cf. Eq. 4 in [1])
    # where S is the empirical covariance and F is the shrinkage target defined as
    # F = trace(S) / n_features * np.identity(n_features) (cf. Eq. 3 in [1])
    shrunk_cov = (1.0 - shrinkage) * emp_cov
    shrunk_cov.flat[:: n_features + 1] += shrinkage * mu

    return shrunk_cov, shrinkage
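

# A commented sketch of the Frobenius-norm simplification noted above:
# `np.mean(S**2)` equals `trace(S @ S.T) / p**2`, which is why the 1 / p**2
# factor cancels between numerator and denominator (`S` is illustrative):
#
#   import numpy as np
#   S = np.array([[2.0, 0.5], [0.5, 1.0]])
#   p = S.shape[0]
#   assert np.isclose(np.mean(S**2), np.trace(S @ S.T) / p**2)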


###############################################################################
# Public API
# ShrunkCovariance estimator


@validate_params(
    {
        "emp_cov": ["array-like"],
        "shrinkage": [Interval(Real, 0, 1, closed="both")],
    },
    prefer_skip_nested_validation=True,
)
def shrunk_covariance(emp_cov, shrinkage=0.1):
    """Calculate covariance matrices shrunk on the diagonal.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    emp_cov : array-like of shape (..., n_features, n_features)
        Covariance matrices to be shrunk, at least 2D ndarray.

    shrinkage : float, default=0.1
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    Returns
    -------
    shrunk_cov : ndarray of shape (..., n_features, n_features)
        Shrunk covariance matrices.

    Notes
    -----
    The regularized (shrunk) covariance is given by::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> from sklearn.covariance import empirical_covariance, shrunk_covariance
    >>> real_cov = np.array([[.8, .3], [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
    >>> shrunk_covariance(empirical_covariance(X))
    array([[0.739, 0.254],
           [0.254, 0.411]])
    """
    emp_cov = check_array(emp_cov, allow_nd=True)
    n_features = emp_cov.shape[-1]

    shrunk_cov = (1.0 - shrinkage) * emp_cov
    mu = np.trace(emp_cov, axis1=-2, axis2=-1) / n_features
    mu = np.expand_dims(mu, axis=tuple(range(mu.ndim, emp_cov.ndim)))
    shrunk_cov += shrinkage * mu * np.eye(n_features)

    return shrunk_cov
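

# Because `emp_cov` may carry leading batch dimensions, a whole stack of
# covariance matrices can be shrunk in one call; a commented sketch (stack
# values are arbitrary):
#
#   import numpy as np
#   from sklearn.covariance import shrunk_covariance
#   stack = np.stack([np.eye(3), 2.0 * np.eye(3)])  # shape (2, 3, 3)
#   shrunk = shrunk_covariance(stack, shrinkage=0.5)
#   assert shrunk.shape == (2, 3, 3)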


class ShrunkCovariance(EmpiricalCovariance):
    """Covariance estimator with shrinkage.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data will be centered before computation.

    shrinkage : float, default=0.1
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.

    Notes
    -----
    The regularized covariance is given by::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import ShrunkCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = ShrunkCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7387, 0.2536],
           [0.2536, 0.4110]])
    >>> cov.location_
    array([0.0622, 0.0193])
    """

    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "shrinkage": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1):
        super().__init__(
            store_precision=store_precision, assume_centered=assume_centered
        )
        self.shrinkage = shrinkage

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the shrunk covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X)
        # Not calling the parent object to fit, to avoid a potential
        # matrix inversion when setting the precision
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        covariance = empirical_covariance(X, assume_centered=self.assume_centered)
        covariance = shrunk_covariance(covariance, self.shrinkage)
        self._set_covariance(covariance)

        return self


# Ledoit-Wolf estimator


@validate_params(
    {
        "X": ["array-like"],
        "assume_centered": ["boolean"],
        "block_size": [Interval(Integral, 1, None, closed="left")],
    },
    prefer_skip_nested_validation=True,
)
def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
    """Estimate the shrunk Ledoit-Wolf covariance matrix.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split.

    Returns
    -------
    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularized (shrunk) covariance is::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import ledoit_wolf_shrinkage
    >>> real_cov = np.array([[.4, .2], [.2, .8]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
    >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X)
    >>> shrinkage_coefficient
    np.float64(0.23)
    """
    X = check_array(X)
    # for only one feature, the result is the same whatever the shrinkage
    if len(X.shape) == 2 and X.shape[1] == 1:
        return 0.0
    if X.ndim == 1:
        X = np.reshape(X, (1, -1))

    if X.shape[0] == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )
    n_samples, n_features = X.shape

    # optionally center data
    if not assume_centered:
        X = X - X.mean(0)

    # A non-blocked version of the computation is present in the tests
    # in tests/test_covariance.py

    # number of blocks to split the covariance matrix into
    n_splits = int(n_features / block_size)
    X2 = X**2
    emp_cov_trace = np.sum(X2, axis=0) / n_samples
    mu = np.sum(emp_cov_trace) / n_features
    beta_ = 0.0  # sum of the coefficients of <X2.T, X2>
    delta_ = 0.0  # sum of the *squared* coefficients of <X.T, X>
    # starting block computation
    for i in range(n_splits):
        for j in range(n_splits):
            rows = slice(block_size * i, block_size * (i + 1))
            cols = slice(block_size * j, block_size * (j + 1))
            beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))
            delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2)
        rows = slice(block_size * i, block_size * (i + 1))
        beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits :]))
        delta_ += np.sum(np.dot(X.T[rows], X[:, block_size * n_splits :]) ** 2)
    for j in range(n_splits):
        cols = slice(block_size * j, block_size * (j + 1))
        beta_ += np.sum(np.dot(X2.T[block_size * n_splits :], X2[:, cols]))
        delta_ += np.sum(np.dot(X.T[block_size * n_splits :], X[:, cols]) ** 2)
    delta_ += np.sum(
        np.dot(X.T[block_size * n_splits :], X[:, block_size * n_splits :]) ** 2
    )
    delta_ /= n_samples**2
    beta_ += np.sum(
        np.dot(X2.T[block_size * n_splits :], X2[:, block_size * n_splits :])
    )
    # use delta_ to compute beta
    beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_)
    # delta is the sum of the squared coefficients of (<X.T, X> - mu*Id) / p
    delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2
    delta /= n_features
    # get final beta as the min between beta and delta
    # We do this to prevent shrinking more than "1", which would invert
    # the value of covariances
    beta = min(beta, delta)
    # finally get shrinkage
    shrinkage = 0 if beta == 0 else beta / delta
    return shrinkage
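

# The block loops above only tile the Gram-matrix sums; a commented sketch of
# the invariant they rely on (summing squared entries block by block leaves
# the total unchanged; sizes are arbitrary):
#
#   import numpy as np
#   X = np.random.RandomState(0).normal(size=(8, 5))
#   full = np.sum(np.dot(X.T, X) ** 2)
#   b = 2  # block size smaller than n_features
#   parts = sum(
#       np.sum(np.dot(X.T[i : i + b], X[:, j : j + b]) ** 2)
#       for i in range(0, 5, b)
#       for j in range(0, 5, b)
#   )
#   assert np.isclose(full, parts)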


@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
    """Estimate the shrunk Ledoit-Wolf covariance matrix.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split.
        This is purely a memory optimization and does not affect results.

    Returns
    -------
    shrunk_cov : ndarray of shape (n_features, n_features)
        Shrunk covariance.

    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularized (shrunk) covariance is::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import empirical_covariance, ledoit_wolf
    >>> real_cov = np.array([[.4, .2], [.2, .8]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
    >>> covariance, shrinkage = ledoit_wolf(X)
    >>> covariance
    array([[0.44, 0.16],
           [0.16, 0.80]])
    >>> shrinkage
    np.float64(0.23)
    """
    estimator = LedoitWolf(
        assume_centered=assume_centered,
        block_size=block_size,
        store_precision=False,
    ).fit(X)

    return estimator.covariance_, estimator.shrinkage_
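

# The function is a thin wrapper around the estimator defined below; a
# commented sketch of the equivalence (random data, arbitrary seed):
#
#   import numpy as np
#   from sklearn.covariance import LedoitWolf, ledoit_wolf
#   X = np.random.RandomState(0).normal(size=(60, 4))
#   cov_f, shrink_f = ledoit_wolf(X)
#   est = LedoitWolf().fit(X)
#   assert np.allclose(cov_f, est.covariance_)
#   assert shrink_f == est.shrinkage_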


class LedoitWolf(EmpiricalCovariance):
    """LedoitWolf Estimator.

    Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
    coefficient is computed using O. Ledoit and M. Wolf's formula as
    described in "A Well-Conditioned Estimator for Large-Dimensional
    Covariance Matrices", Ledoit and Wolf, Journal of Multivariate
    Analysis, Volume 88, Issue 2, February 2004, pages 365-411.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split
        during its Ledoit-Wolf estimation. This is purely a memory
        optimization and does not affect results.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    The regularized covariance is::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features` and shrinkage is given by the
    Ledoit and Wolf formula (see References).

    References
    ----------
    "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
    Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
    February 2004, pages 365-411.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import LedoitWolf
    >>> real_cov = np.array([[.4, .2],
    ...                      [.2, .8]])
    >>> np.random.seed(0)
    >>> X = np.random.multivariate_normal(mean=[0, 0],
    ...                                   cov=real_cov,
    ...                                   size=50)
    >>> cov = LedoitWolf().fit(X)
    >>> cov.covariance_
    array([[0.4406, 0.1616],
           [0.1616, 0.8022]])
    >>> cov.location_
    array([ 0.0595 , -0.0075])

    See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py`
    and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py`
    for more detailed examples.
    """

    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "block_size": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self, *, store_precision=True, assume_centered=False, block_size=1000
    ):
        super().__init__(
            store_precision=store_precision, assume_centered=assume_centered
        )
        self.block_size = block_size

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the Ledoit-Wolf shrunk covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # Not calling the parent object to fit, to avoid computing the
        # covariance matrix (and potentially the precision)
        X = validate_data(self, X)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        covariance, shrinkage = _ledoit_wolf(
            X - self.location_, assume_centered=True, block_size=self.block_size
        )
        self.shrinkage_ = shrinkage
        self._set_covariance(covariance)

        return self


# OAS estimator
@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def oas(X, *, assume_centered=False):
    """Estimate covariance with the Oracle Approximating Shrinkage.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    Returns
    -------
    shrunk_cov : array-like of shape (n_features, n_features)
        Shrunk covariance.

    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularized covariance is::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features` and shrinkage is given by the OAS
    formula (see [1]_).

    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
    the original article, formula (23) states that 2/p (p being the number of
    features) is multiplied by Trace(cov*cov) in both the numerator and
    denominator, but this operation is omitted because for a large p, the value
    of 2/p is so small that it doesn't affect the value of the estimator.

    References
    ----------
    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
           <0907.4698>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import oas
    >>> rng = np.random.RandomState(0)
    >>> real_cov = [[.8, .3], [.3, .4]]
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
    >>> shrunk_cov, shrinkage = oas(X)
    >>> shrunk_cov
    array([[0.7533, 0.2763],
           [0.2763, 0.3964]])
    >>> shrinkage
    np.float64(0.0195)
    """
    estimator = OAS(
        assume_centered=assume_centered,
    ).fit(X)
    return estimator.covariance_, estimator.shrinkage_
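

# A commented sketch checking the Notes formula against the function output
# (data and covariance values are arbitrary):
#
#   import numpy as np
#   from sklearn.covariance import empirical_covariance, oas
#   rng = np.random.RandomState(0)
#   X = rng.multivariate_normal([0, 0], [[0.8, 0.3], [0.3, 0.4]], size=500)
#   shrunk, s = oas(X)
#   emp = empirical_covariance(X)
#   mu = np.trace(emp) / emp.shape[0]
#   assert np.allclose(shrunk, (1 - s) * emp + s * mu * np.identity(2))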


class OAS(EmpiricalCovariance):
    """Oracle Approximating Shrinkage Estimator.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    The regularized covariance is::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features` and shrinkage is given by the OAS
    formula (see [1]_).

    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
    the original article, formula (23) states that 2/p (p being the number of
    features) is multiplied by Trace(cov*cov) in both the numerator and
    denominator, but this operation is omitted because for a large p, the value
    of 2/p is so small that it doesn't affect the value of the estimator.

    References
    ----------
    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
           <0907.4698>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import OAS
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> oas = OAS().fit(X)
    >>> oas.covariance_
    array([[0.7533, 0.2763],
           [0.2763, 0.3964]])
    >>> oas.precision_
    array([[ 1.7833, -1.2431],
           [-1.2431,  3.3889]])
    >>> oas.shrinkage_
    np.float64(0.0195)

    See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py`
    and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py`
    for more detailed examples.
    """

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the Oracle Approximating Shrinkage covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X)
        # Not calling the parent object to fit, to avoid computing the
        # covariance matrix (and potentially the precision)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)

        covariance, shrinkage = _oas(X - self.location_, assume_centered=True)
        self.shrinkage_ = shrinkage
        self._set_covariance(covariance)

        return self
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,374 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import pytest

from sklearn import datasets
from sklearn.covariance import (
    OAS,
    EmpiricalCovariance,
    LedoitWolf,
    ShrunkCovariance,
    empirical_covariance,
    ledoit_wolf,
    ledoit_wolf_shrinkage,
    oas,
    shrunk_covariance,
)
from sklearn.covariance._shrunk_covariance import _ledoit_wolf
from sklearn.utils._testing import (
    assert_allclose,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)

from .._shrunk_covariance import _oas

X, _ = datasets.load_diabetes(return_X_y=True)
X_1d = X[:, 0]
n_samples, n_features = X.shape


def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm="foo")
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)

    assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))


@pytest.mark.parametrize("n_matrices", [1, 3])
def test_shrunk_covariance_func(n_matrices):
    """Check `shrunk_covariance` function."""

    n_features = 2
    cov = np.ones((n_features, n_features))
    cov_target = np.array([[1, 0.5], [0.5, 1]])

    if n_matrices > 1:
        cov = np.repeat(cov[np.newaxis, ...], n_matrices, axis=0)
        cov_target = np.repeat(cov_target[np.newaxis, ...], n_matrices, axis=0)

    cov_shrunk = shrunk_covariance(cov, 0.5)
    assert_allclose(cov_shrunk, cov_target)


def test_shrunk_covariance():
    """Check consistency between `ShrunkCovariance` and `shrunk_covariance`."""

    # Tests ShrunkCovariance module on a simple dataset.
    # compare shrunk covariance obtained from data and from MLE estimate
    cov = ShrunkCovariance(shrinkage=0.5)
    cov.fit(X)
    assert_array_almost_equal(
        shrunk_covariance(empirical_covariance(X), shrinkage=0.5), cov.covariance_, 4
    )

    # same test with shrinkage not provided
    cov = ShrunkCovariance()
    cov.fit(X)
    assert_array_almost_equal(
        shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4
    )

    # same test with shrinkage = 0 (<==> empirical_covariance)
    cov = ShrunkCovariance(shrinkage=0.0)
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = ShrunkCovariance(shrinkage=0.3)
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    cov = ShrunkCovariance(shrinkage=0.5, store_precision=False)
    cov.fit(X)
    assert cov.precision_ is None


def test_ledoit_wolf():
    # Tests LedoitWolf module on a simple dataset.
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_centered)
    shrinkage_ = lw.shrinkage_

    score_ = lw.score(X_centered)
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_
    )
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6),
        shrinkage_,
    )
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(
        X_centered, assume_centered=True
    )
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d**2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False, assume_centered=True)
    lw.fit(X_centered)
    assert_almost_equal(lw.score(X_centered), score_, 4)
    assert lw.precision_ is None

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, shrinkage_, 4)
    assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(
        lw.shrinkage_, _ledoit_wolf(X=X, assume_centered=False, block_size=10000)[1]
    )
    assert_almost_equal(lw.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    assert_allclose(
        X_1d.var(ddof=0),
        _ledoit_wolf(X=X_1d, assume_centered=False, block_size=10000)[0],
    )
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test with one sample
    # warning should be raised when using only 1 sample
    X_1sample = np.arange(5).reshape(1, 5)
    lw = LedoitWolf()

    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        lw.fit(X_1sample)

    assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64))

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), score_, 4)
    assert lw.precision_ is None


def _naive_ledoit_wolf_shrinkage(X):
    # A simple implementation of the formulas from Ledoit & Wolf

    # The computation below achieves the following computations of the
    # "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for
    # Large-Dimensional Covariance Matrices"
    # beta and delta are given in the beginning of section 3.2
    n_samples, n_features = X.shape
    emp_cov = empirical_covariance(X, assume_centered=False)
    mu = np.trace(emp_cov) / n_features
    delta_ = emp_cov.copy()
    delta_.flat[:: n_features + 1] -= mu
    delta = (delta_**2).sum() / n_features
    X2 = X**2
    beta_ = (
        1.0
        / (n_features * n_samples)
        * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov**2)
    )

    beta = min(beta_, delta)
    shrinkage = beta / delta
    return shrinkage
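

# Reading guide for the naive implementation above (terms from section 3.2 of
# the Ledoit & Wolf paper cited in the comments): `delta` estimates the squared
# Frobenius distance between the sample covariance and its shrinkage target
# mu * I, `beta_` estimates the sampling error of the sample covariance itself,
# and the shrinkage coefficient is their ratio after capping beta at delta so
# the combination stays convex.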


def test_ledoit_wolf_small():
    # Compare our blocked implementation to the naive implementation
    X_small = X[:, :4]
    lw = LedoitWolf()
    lw.fit(X_small)
    shrinkage_ = lw.shrinkage_

    assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))


def test_ledoit_wolf_large():
    # test that ledoit_wolf doesn't error on data that is wider than block_size
    rng = np.random.RandomState(0)
    # use a number of features that is larger than the block-size
    X = rng.normal(size=(10, 20))
    lw = LedoitWolf(block_size=10).fit(X)
    # check that covariance is about diagonal (random normal noise)
    assert_almost_equal(lw.covariance_, np.eye(20), 0)
    cov = lw.covariance_

    # check that the result is consistent with not splitting data into blocks.
    lw = LedoitWolf(block_size=25).fit(X)
    assert_almost_equal(lw.covariance_, cov)


@pytest.mark.parametrize(
    "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage]
)
def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function):
    """Check that we validate X and raise proper error with 0-sample array."""
    X_empty = np.zeros((0, 2))
    with pytest.raises(ValueError, match="Found array with 0 sample"):
        ledoit_wolf_fitting_function(X_empty)


def test_oas():
    # Tests OAS module on a simple dataset.
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    oa = OAS(assume_centered=True)
    oa.fit(X_centered)
    shrinkage_ = oa.shrinkage_
    score_ = oa.score(X_centered)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0:1]
    oa = OAS(assume_centered=True)
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal((X_1d**2).sum() / n_samples, oa.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False, assume_centered=True)
    oa.fit(X_centered)
    assert_almost_equal(oa.score(X_centered), score_, 4)
    assert oa.precision_ is None

    # Same tests without assuming centered data--------------------------------
    # test shrinkage coeff on a simple data set
    oa = OAS()
    oa.fit(X)
    assert_almost_equal(oa.shrinkage_, shrinkage_, 4)
    assert_almost_equal(oa.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    oa = OAS()
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4)

    # test with one sample
    # warning should be raised when using only 1 sample
    X_1sample = np.arange(5).reshape(1, 5)
    oa = OAS()
    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        oa.fit(X_1sample)

    assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64))

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False)
    oa.fit(X)
    assert_almost_equal(oa.score(X), score_, 4)
    assert oa.precision_ is None

    # test function _oas without assuming centered data
    X_1f = X[:, 0:1]
    oa = OAS()
    oa.fit(X_1f)
    # compare shrunk covariance obtained from data and from MLE estimate
    _oa_cov_from_mle, _oa_shrinkage_from_mle = _oas(X_1f)
    assert_array_almost_equal(_oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(_oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal((X_1f**2).sum() / n_samples, oa.covariance_, 4)


def test_EmpiricalCovariance_validates_mahalanobis():
    """Checks that EmpiricalCovariance validates data with mahalanobis."""
    cov = EmpiricalCovariance().fit(X)

    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
    with pytest.raises(ValueError, match=msg):
        cov.mahalanobis(X[:, :2])
@@ -0,0 +1,52 @@
"""
Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
"""

import numpy as np
import pytest

from sklearn.covariance import EllipticEnvelope
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)


def test_elliptic_envelope(global_random_seed):
    rnd = np.random.RandomState(global_random_seed)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    with pytest.raises(NotFittedError):
        clf.predict(X)
    with pytest.raises(NotFittedError):
        clf.decision_function(X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(
        clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0
    )
    assert sum(y_pred == -1) == sum(decisions < 0)


def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
    clf2 = EllipticEnvelope().fit(X_train)
    assert_array_equal(
        clf1.score_samples([[2.0, 2.0]]),
        clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
    )
    assert_array_equal(
        clf2.score_samples([[2.0, 2.0]]),
        clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
    )
    assert_array_equal(
        clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
    )
@@ -0,0 +1,318 @@
|
||||
"""Test the graphical_lasso module."""
|
||||
|
||||
import sys
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.covariance import (
|
||||
GraphicalLasso,
|
||||
GraphicalLassoCV,
|
||||
empirical_covariance,
|
||||
graphical_lasso,
|
||||
)
|
||||
from sklearn.datasets import make_sparse_spd_matrix
|
||||
from sklearn.model_selection import GroupKFold
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_array_almost_equal,
|
||||
assert_array_less,
|
||||
)
|
||||
|
||||
|
||||
def test_graphical_lassos(random_state=1):
|
||||
"""Test the graphical lasso solvers.
|
||||
|
||||
This checks is unstable for some random seeds where the covariance found with "cd"
|
||||
and "lars" solvers are different (4 cases / 100 tries).
|
||||
"""
|
||||
# Sample data from a sparse multivariate normal
|
||||
dim = 20
|
||||
n_samples = 100
|
||||
random_state = check_random_state(random_state)
|
||||
prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=random_state)
|
||||
cov = linalg.inv(prec)
|
||||
X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
|
||||
emp_cov = empirical_covariance(X)
|
||||
|
||||
for alpha in (0.0, 0.1, 0.25):
|
||||
covs = dict()
|
||||
icovs = dict()
|
||||
for method in ("cd", "lars"):
|
||||
cov_, icov_, costs = graphical_lasso(
|
||||
emp_cov, return_costs=True, alpha=alpha, mode=method
|
||||
)
|
||||
covs[method] = cov_
|
||||
icovs[method] = icov_
|
||||
costs, dual_gap = np.array(costs).T
|
||||
# Check that the costs always decrease (doesn't hold if alpha == 0)
|
||||
if not alpha == 0:
|
||||
# use 1e-12 since the cost can be exactly 0
|
||||
assert_array_less(np.diff(costs), 1e-12)
|
||||
# Check that the 2 approaches give similar results
|
||||
assert_allclose(covs["cd"], covs["lars"], atol=5e-4)
|
||||
assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4)
|
||||
|
||||
# Smoke test the estimator
|
||||
model = GraphicalLasso(alpha=0.25).fit(X)
|
||||
model.score(X)
|
||||
assert_array_almost_equal(model.covariance_, covs["cd"], decimal=4)
|
||||
assert_array_almost_equal(model.covariance_, covs["lars"], decimal=4)
|
||||
|
||||
# For a centered matrix, assume_centered could be chosen True or False
|
||||
# Check that this returns indeed the same result for centered data
|
||||
Z = X - X.mean(0)
|
||||
precs = list()
|
||||
for assume_centered in (False, True):
|
||||
prec_ = GraphicalLasso(assume_centered=assume_centered).fit(Z).precision_
|
||||
precs.append(prec_)
|
||||
assert_array_almost_equal(precs[0], precs[1])
|
||||
|
||||
|
||||
def test_graphical_lasso_when_alpha_equals_0():
|
||||
"""Test graphical_lasso's early return condition when alpha=0."""
|
||||
X = np.random.randn(100, 10)
|
||||
emp_cov = empirical_covariance(X, assume_centered=True)
|
||||
|
||||
model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov)
|
||||
assert_allclose(model.precision_, np.linalg.inv(emp_cov))
|
||||
|
||||
_, precision = graphical_lasso(emp_cov, alpha=0)
|
||||
assert_allclose(precision, np.linalg.inv(emp_cov))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mode", ["cd", "lars"])
|
||||
def test_graphical_lasso_n_iter(mode):
|
||||
X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0)
|
||||
emp_cov = empirical_covariance(X)
|
||||
|
||||
_, _, n_iter = graphical_lasso(
|
||||
emp_cov, 0.2, mode=mode, max_iter=2, return_n_iter=True
|
||||
)
|
||||
assert n_iter == 2
|
||||
|
||||
|
||||
def test_graphical_lasso_iris():
|
||||
# Hard-coded solution from R glasso package for alpha=1.0
|
||||
# (need to set penalize.diagonal to FALSE)
|
||||
cov_R = np.array(
|
||||
[
|
||||
[0.68112222, 0.0000000, 0.265820, 0.02464314],
|
||||
[0.00000000, 0.1887129, 0.000000, 0.00000000],
|
||||
[0.26582000, 0.0000000, 3.095503, 0.28697200],
|
||||
[0.02464314, 0.0000000, 0.286972, 0.57713289],
|
||||
]
|
||||
)
|
||||
icov_R = np.array(
|
||||
[
|
||||
[1.5190747, 0.000000, -0.1304475, 0.0000000],
|
||||
[0.0000000, 5.299055, 0.0000000, 0.0000000],
|
||||
[-0.1304475, 0.000000, 0.3498624, -0.1683946],
|
||||
[0.0000000, 0.000000, -0.1683946, 1.8164353],
|
||||
]
|
||||
)
|
||||
X = datasets.load_iris().data
|
||||
emp_cov = empirical_covariance(X)
|
||||
for method in ("cd", "lars"):
|
||||
cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False, mode=method)
|
||||
assert_array_almost_equal(cov, cov_R)
|
||||
assert_array_almost_equal(icov, icov_R)
|
||||
|
||||
|
||||
def test_graph_lasso_2D():
|
||||
# Hard-coded solution from Python skggm package
|
||||
# obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`
|
||||
cov_skggm = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]])
|
||||
|
||||
icov_skggm = np.array([[1.52836773, -3.14334831], [-3.14334831, 8.19753385]])
|
||||
X = datasets.load_iris().data[:, 2:]
|
||||
emp_cov = empirical_covariance(X)
|
||||
for method in ("cd", "lars"):
|
||||
cov, icov = graphical_lasso(emp_cov, alpha=0.1, return_costs=False, mode=method)
|
||||
assert_array_almost_equal(cov, cov_skggm)
|
||||
assert_array_almost_equal(icov, icov_skggm)
|
||||
|
||||
|
||||
def test_graphical_lasso_iris_singular():
|
||||
# Small subset of rows to test the rank-deficient case
|
||||
# Need to choose samples such that none of the variances are zero
|
||||
indices = np.arange(10, 13)
|
||||
|
||||
# Hard-coded solution from R glasso package for alpha=0.01
|
||||
cov_R = np.array(
|
||||
[
|
||||
[0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
|
||||
[0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
|
||||
[0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
|
||||
[0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222],
|
||||
]
|
||||
)
|
||||
icov_R = np.array(
|
||||
[
|
||||
[24.42244057, -16.831679593, 0.0, 0.0],
|
||||
[-16.83168201, 24.351841681, -6.206896552, -12.5],
|
||||
[0.0, -6.206896171, 153.103448276, 0.0],
|
||||
[0.0, -12.499999143, 0.0, 462.5],
|
||||
]
|
||||
)
|
||||
X = datasets.load_iris().data[indices, :]
|
||||
emp_cov = empirical_covariance(X)
|
||||
for method in ("cd", "lars"):
|
||||
cov, icov = graphical_lasso(
|
||||
emp_cov, alpha=0.01, return_costs=False, mode=method
|
||||
)
|
||||
assert_array_almost_equal(cov, cov_R, decimal=5)
|
||||
assert_array_almost_equal(icov, icov_R, decimal=5)
|
||||
|
||||
|
||||
def test_graphical_lasso_cv(random_state=1):
|
||||
# Sample data from a sparse multivariate normal
|
||||
dim = 5
|
||||
n_samples = 6
|
||||
random_state = check_random_state(random_state)
|
||||
prec = make_sparse_spd_matrix(dim, alpha=0.96, random_state=random_state)
|
||||
cov = linalg.inv(prec)
|
||||
X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
|
||||
# Capture stdout, to smoke test the verbose mode
|
||||
orig_stdout = sys.stdout
|
||||
try:
|
||||
sys.stdout = StringIO()
|
||||
# We need verbose very high so that Parallel prints on stdout
|
||||
GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
|
||||
finally:
|
||||
sys.stdout = orig_stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"])
|
||||
def test_graphical_lasso_cv_alphas_iterable(alphas_container_type):
|
||||
"""Check that we can pass an array-like to `alphas`.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/22489
|
||||
"""
|
||||
true_cov = np.array(
|
||||
[
|
||||
[0.8, 0.0, 0.2, 0.0],
|
||||
[0.0, 0.4, 0.0, 0.0],
|
||||
[0.2, 0.0, 0.3, 0.1],
|
||||
[0.0, 0.0, 0.1, 0.7],
|
||||
]
|
||||
)
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)
|
||||
alphas = _convert_container([0.02, 0.03], alphas_container_type)
|
||||
GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X)


@pytest.mark.parametrize(
    "alphas,err_type,err_msg",
    [
        ([-0.02, 0.03], ValueError, "must be > 0"),
        ([0, 0.03], ValueError, "must be > 0"),
        (["not_number", 0.03], TypeError, "must be an instance of float"),
    ],
)
def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg):
    """Check that if an array-like containing a value outside of (0, inf] is
    passed to `alphas`, a ValueError is raised. Check that if a string is
    passed, a TypeError is raised.
    """
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)

    with pytest.raises(err_type, match=err_msg):
        GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X)


def test_graphical_lasso_cv_scores():
    splits = 4
    n_alphas = 5
    n_refinements = 3
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)
    cov = GraphicalLassoCV(
        cv=splits, alphas=n_alphas, n_refinements=n_refinements
    ).fit(X)

    _assert_graphical_lasso_cv_scores(
        cov=cov,
        n_splits=splits,
        n_refinements=n_refinements,
        n_alphas=n_alphas,
    )


@config_context(enable_metadata_routing=True)
def test_graphical_lasso_cv_scores_with_routing(global_random_seed):
    """Check that `GraphicalLassoCV` internally dispatches metadata to the
    splitter.
    """
    splits = 5
    n_alphas = 5
    n_refinements = 3
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(global_random_seed)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=300)
    n_samples = X.shape[0]
    groups = rng.randint(0, 5, n_samples)
    params = {"groups": groups}
    cv = GroupKFold(n_splits=splits)
    cv.set_split_request(groups=True)

    cov = GraphicalLassoCV(
        cv=cv, alphas=n_alphas, n_refinements=n_refinements
    ).fit(X, **params)

    _assert_graphical_lasso_cv_scores(
        cov=cov,
        n_splits=splits,
        n_refinements=n_refinements,
        n_alphas=n_alphas,
    )
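    # Editorial note on the routing mechanics exercised above: because the
    # splitter requested `groups` via `set_split_request(groups=True)` and
    # metadata routing is enabled by the decorator, `fit(X, groups=groups)`
    # forwards `groups` to `GroupKFold.split`.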


def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas):
    cv_results = cov.cv_results_
    # `cv_results_` stores the "alphas" key plus one test-score key per split;
    # every key holds one entry per explored alpha, i.e.
    # `n_refinements * n_alphas + 1` of them.
    total_alphas = n_refinements * n_alphas + 1
    keys = ["alphas"]
    split_keys = [f"split{i}_test_score" for i in range(n_splits)]
    for key in keys + split_keys:
        assert key in cv_results
        assert len(cv_results[key]) == total_alphas

    cv_scores = np.asarray([cov.cv_results_[key] for key in split_keys])
    expected_mean = cv_scores.mean(axis=0)
    expected_std = cv_scores.std(axis=0)

    assert_allclose(cov.cv_results_["mean_test_score"], expected_mean)
    assert_allclose(cov.cv_results_["std_test_score"], expected_std)
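    # Worked example (editorial): with n_refinements=3 and n_alphas=5, as in
    # the callers above, each cv_results_ key holds 3 * 5 + 1 = 16 entries.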
@@ -0,0 +1,171 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import itertools

import numpy as np
import pytest

from sklearn import datasets
from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd
from sklearn.utils._testing import assert_array_almost_equal

X = datasets.load_iris().data
X_1d = X[:, 0]
n_samples, n_features = X.shape


def test_mcd(global_random_seed):
    # Tests the FastMCD algorithm implementation
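    # (Editorial note: the positional arguments below are, in order,
    # n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed;
    # see launch_mcd_on_dataset.)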
    # Small data set
    # test without outliers (random independent normal data)
    launch_mcd_on_dataset(100, 5, 0, 0.02, 0.1, 75, global_random_seed)
    # test with a contaminated data set (medium contamination)
    launch_mcd_on_dataset(100, 5, 20, 0.3, 0.3, 65, global_random_seed)
    # test with a contaminated data set (strong contamination)
    launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50, global_random_seed)

    # Medium data set
    launch_mcd_on_dataset(1000, 5, 450, 0.1, 0.1, 540, global_random_seed)

    # Large data set
    launch_mcd_on_dataset(1700, 5, 800, 0.1, 0.1, 870, global_random_seed)

    # 1D data set
    launch_mcd_on_dataset(500, 1, 100, 0.02, 0.02, 350, global_random_seed)

    # n_samples == n_features
    launch_mcd_on_dataset(20, 20, 0, 0.1, 0.1, 15, global_random_seed)


def test_fast_mcd_on_invalid_input():
    X = np.arange(100)
    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        fast_mcd(X)


def test_mcd_class_on_invalid_input():
    X = np.arange(100)
    mcd = MinCovDet()
    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        mcd.fit(X)
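    # For contrast, a minimal valid call (editorial sketch, not part of the
    # original test): reshaping the same data into a 2D column vector makes
    # it acceptable input.
    MinCovDet().fit(np.arange(100, dtype=float).reshape(-1, 1))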


def launch_mcd_on_dataset(
    n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed
):
    rand_gen = np.random.RandomState(seed)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10.0 * (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples, dtype=bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=seed).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert error_cov < tol_cov
    assert np.sum(H) >= tol_support
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)


def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)


def test_mcd_issue3367(global_random_seed):
    # Check that MCD completes when the covariance matrix is singular, i.e.
    # when one of the rows and columns is all zeros
    rand_gen = np.random.RandomState(global_random_seed)

    # Think of these as the values for X and Y -> 10 values between -5 and 5
    data_values = np.linspace(-5, 5, 10).tolist()
    # Get the cartesian product of all possible coordinate pairs from above set
    data = np.array(list(itertools.product(data_values, data_values)))

    # Add a third column that's all zeros to make our data a set of points
    # within a plane, which means that the covariance matrix will be singular
    data = np.hstack((data, np.zeros((data.shape[0], 1))))

    # The line of code below would raise an exception if singular covariance
    # matrices were not handled. As a further test, since we have points in
    # XYZ, the principal components (eigenvectors) of these directly relate to
    # the geometry of the points. Since it's a plane, we should be able to
    # test that the eigenvector that corresponds to the smallest eigenvalue is
    # the plane normal, specifically [0, 0, 1], since everything is in the XY
    # plane (as set up above). To do this, one would start with:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # after which we would assert that `normal` equals [0, 0, 1] up to a small
    # floating point tolerance (e.g. 1e-12).
    MinCovDet(random_state=rand_gen).fit(data)
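    # A runnable sketch of that eigenvector check (editorial addition, not
    # part of the original regression test): the eigenvector paired with the
    # smallest eigenvalue should be the XY-plane normal, up to sign.
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    normal = evecs[:, np.argmin(evals)]
    assert np.allclose(np.abs(normal), [0.0, 0.0, 1.0], atol=1e-12)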


def test_mcd_support_covariance_is_zero():
    # Check that MCD raises a ValueError with an informative message when the
    # covariance of the support data is equal to 0.
    X_1 = np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1])
    X_1 = X_1.reshape(-1, 1)
    X_2 = np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3])
    X_2 = X_2.reshape(-1, 1)
    msg = (
        "The covariance matrix of the support data is equal to 0, try to "
        "increase support_fraction"
    )
    for X in [X_1, X_2]:
        with pytest.raises(ValueError, match=msg):
            MinCovDet().fit(X)


def test_mcd_increasing_det_warning(global_random_seed):
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.

    X = [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5.0, 3.6, 1.4, 0.2],
        [4.6, 3.4, 1.4, 0.3],
        [5.0, 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3.0, 1.4, 0.1],
        [4.3, 3.0, 1.1, 0.1],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [4.6, 3.6, 1.0, 0.2],
        [5.0, 3.0, 1.6, 0.2],
        [5.2, 3.5, 1.5, 0.2],
    ]

    mcd = MinCovDet(support_fraction=0.5, random_state=global_random_seed)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)