add read me
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,374 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.covariance import (
|
||||
OAS,
|
||||
EmpiricalCovariance,
|
||||
LedoitWolf,
|
||||
ShrunkCovariance,
|
||||
empirical_covariance,
|
||||
ledoit_wolf,
|
||||
ledoit_wolf_shrinkage,
|
||||
oas,
|
||||
shrunk_covariance,
|
||||
)
|
||||
from sklearn.covariance._shrunk_covariance import _ledoit_wolf
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
from .._shrunk_covariance import _oas
|
||||
|
||||
X, _ = datasets.load_diabetes(return_X_y=True)
|
||||
X_1d = X[:, 0]
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
|
||||
def test_covariance():
    """Check `EmpiricalCovariance` against `empirical_covariance` on simple data."""
    # The estimator and the plain function must produce the same matrix.
    estimator = EmpiricalCovariance()
    estimator.fit(X)
    reference = empirical_covariance(X)
    assert_array_almost_equal(reference, estimator.covariance_, 4)

    # The error of the fitted covariance with respect to itself is zero,
    # whichever norm / scaling / squaring option is requested.
    assert_almost_equal(estimator.error_norm(reference), 0)
    for option in (
        {"norm": "spectral"},
        {"norm": "frobenius"},
        {"scaling": False},
        {"squared": False},
    ):
        assert_almost_equal(estimator.error_norm(reference, **option), 0)

    # An unsupported norm name is rejected.
    with pytest.raises(NotImplementedError):
        estimator.error_norm(reference, norm="foo")

    # Mahalanobis distances of the training samples must all be positive.
    distances = estimator.mahalanobis(X)
    assert np.amin(distances) > 0

    # Input with a single feature.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = EmpiricalCovariance()
    estimator.fit(single_feature)
    single_cov = empirical_covariance(single_feature)
    assert_array_almost_equal(single_cov, estimator.covariance_, 4)
    assert_almost_equal(estimator.error_norm(single_cov), 0)
    assert_almost_equal(estimator.error_norm(single_cov, norm="spectral"), 0)

    # Input with a single sample (1 sample, 5 features): fitting warns and the
    # resulting covariance is all zeros.
    one_sample = np.arange(5).reshape(1, 5)
    estimator = EmpiricalCovariance()
    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        estimator.fit(one_sample)
    assert_array_almost_equal(
        estimator.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Integer-typed input.
    integer_data = np.asarray([[0, 1], [1, 0]])
    expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(integer_data), expected)

    # With assume_centered=True the stored location is the zero vector.
    estimator = EmpiricalCovariance(assume_centered=True)
    estimator.fit(X)
    assert_array_equal(estimator.location_, np.zeros(X.shape[1]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_matrices", [1, 3])
def test_shrunk_covariance_func(n_matrices):
    """Check `shrunk_covariance` on one matrix and on a stack of matrices."""
    dim = 2
    covariance = np.ones((dim, dim))
    expected = np.array([[1, 0.5], [0.5, 1]])

    if n_matrices > 1:
        # Repeat the input and the expected output along a new leading axis.
        covariance = np.repeat(covariance[np.newaxis, ...], n_matrices, axis=0)
        expected = np.repeat(expected[np.newaxis, ...], n_matrices, axis=0)

    assert_allclose(shrunk_covariance(covariance, 0.5), expected)
|
||||
|
||||
|
||||
def test_shrunk_covariance():
    """Check that `ShrunkCovariance` agrees with `shrunk_covariance`."""
    emp_cov = empirical_covariance(X)

    # Explicit shrinkage: estimator fitted on the data versus the function
    # applied to the MLE estimate.
    model = ShrunkCovariance(shrinkage=0.5).fit(X)
    assert_array_almost_equal(
        shrunk_covariance(emp_cov, shrinkage=0.5), model.covariance_, 4
    )

    # Same comparison with the default shrinkage on both sides.
    model = ShrunkCovariance().fit(X)
    assert_array_almost_equal(shrunk_covariance(emp_cov), model.covariance_, 4)

    # shrinkage = 0 must reproduce the empirical covariance.
    model = ShrunkCovariance(shrinkage=0.0).fit(X)
    assert_array_almost_equal(emp_cov, model.covariance_, 4)

    # Input with a single feature.
    single_feature = X[:, 0].reshape((-1, 1))
    model = ShrunkCovariance(shrinkage=0.3).fit(single_feature)
    assert_array_almost_equal(
        empirical_covariance(single_feature), model.covariance_, 4
    )

    # With store_precision=False no precision matrix is kept.
    model = ShrunkCovariance(shrinkage=0.5, store_precision=False).fit(X)
    assert model.precision_ is None
|
||||
|
||||
|
||||
def test_ledoit_wolf():
    """Check `LedoitWolf` against the function API and `ShrunkCovariance`."""
    # ---- Explicitly centered data, assume_centered=True --------------------
    centered = X - X.mean(axis=0)
    model = LedoitWolf(assume_centered=True).fit(centered)
    # Reference values reused by the later sections of this test.
    ref_shrinkage = model.shrinkage_
    ref_score = model.score(centered)

    # The standalone shrinkage function must agree, with the default block
    # size as well as with a small one.
    assert_almost_equal(
        ledoit_wolf_shrinkage(centered, assume_centered=True), ref_shrinkage
    )
    assert_almost_equal(
        ledoit_wolf_shrinkage(centered, assume_centered=True, block_size=6),
        ref_shrinkage,
    )

    # Estimator versus the `ledoit_wolf` function.
    func_cov, func_shrinkage = ledoit_wolf(centered, assume_centered=True)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)

    # Estimator versus ShrunkCovariance using the same shrinkage.
    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_, assume_centered=True)
    shrunk.fit(centered)
    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)

    # Input with a single feature.
    single_feature = X[:, 0].reshape((-1, 1))
    model = LedoitWolf(assume_centered=True).fit(single_feature)
    func_cov, func_shrinkage = ledoit_wolf(single_feature, assume_centered=True)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)
    assert_array_almost_equal(
        (single_feature**2).sum() / n_samples, model.covariance_, 4
    )

    # Without storing the precision the score is unchanged.
    model = LedoitWolf(store_precision=False, assume_centered=True).fit(centered)
    assert_almost_equal(model.score(centered), ref_score, 4)
    assert model.precision_ is None

    # ---- Same checks without assuming centered data ------------------------
    model = LedoitWolf().fit(X)
    assert_almost_equal(model.shrinkage_, ref_shrinkage, 4)
    assert_almost_equal(model.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(model.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(
        model.shrinkage_,
        _ledoit_wolf(X=X, assume_centered=False, block_size=10000)[1],
    )
    assert_almost_equal(model.score(X), ref_score, 4)

    # Estimator versus the `ledoit_wolf` function.
    func_cov, func_shrinkage = ledoit_wolf(X)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)

    # Estimator versus ShrunkCovariance using the same shrinkage.
    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_)
    shrunk.fit(X)
    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)

    # Input with a single feature.
    single_feature = X[:, 0].reshape((-1, 1))
    model = LedoitWolf().fit(single_feature)
    assert_allclose(
        single_feature.var(ddof=0),
        _ledoit_wolf(X=single_feature, assume_centered=False, block_size=10000)[0],
    )
    func_cov, func_shrinkage = ledoit_wolf(single_feature)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)
    assert_array_almost_equal(
        empirical_covariance(single_feature), model.covariance_, 4
    )

    # Input with a single sample: fitting warns and the covariance is zero.
    one_sample = np.arange(5).reshape(1, 5)
    model = LedoitWolf()
    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        model.fit(one_sample)
    assert_array_almost_equal(
        model.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Without storing the precision the score is unchanged.
    model = LedoitWolf(store_precision=False).fit(X)
    assert_almost_equal(model.score(X), ref_score, 4)
    assert model.precision_ is None
|
||||
|
||||
|
||||
def _naive_ledoit_wolf_shrinkage(X):
    """Compute the Ledoit-Wolf shrinkage coefficient with the plain formulas.

    Straightforward (non-blocked) transcription of the formulas of
    O. Ledoit and M. Wolf, "A Well-Conditioned Estimator for
    Large-Dimensional Covariance Matrices"; beta and delta are given at the
    beginning of section 3.2.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data. It is centered internally, so it does not have to be
        centered by the caller.

    Returns
    -------
    shrinkage : float
        The ratio ``min(beta, delta) / delta``.
    """
    n_samples, n_features = X.shape
    # The covariance below is taken around the sample mean, so the
    # fourth-order term (built from X**2) must use the same centered data;
    # otherwise the two terms of beta are inconsistent for uncentered input.
    X = X - X.mean(axis=0)
    emp_cov = empirical_covariance(X, assume_centered=False)

    # delta: squared distance between the covariance and mu * identity.
    mu = np.trace(emp_cov) / n_features
    delta_ = emp_cov.copy()
    # Stepping by n_features + 1 through the flat view addresses the diagonal.
    delta_.flat[:: n_features + 1] -= mu
    delta = (delta_**2).sum() / n_features

    # beta: dispersion term, capped by delta so that the result is at most 1.
    X2 = X**2
    beta_ = (
        1.0
        / (n_features * n_samples)
        * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov**2)
    )
    beta = min(beta_, delta)

    shrinkage = beta / delta
    return shrinkage
|
||||
|
||||
|
||||
def test_ledoit_wolf_small():
    """Compare the blocked shrinkage computation with the naive reference."""
    first_four = X[:, :4]
    fitted = LedoitWolf().fit(first_four)
    assert_almost_equal(fitted.shrinkage_, _naive_ledoit_wolf_shrinkage(first_four))
|
||||
|
||||
|
||||
def test_ledoit_wolf_large():
    """Check `LedoitWolf` on data that is wider than `block_size`."""
    rng = np.random.RandomState(0)
    # 20 features: more than the block size of 10 used for the first fit.
    wide = rng.normal(size=(10, 20))
    blocked_cov = LedoitWolf(block_size=10).fit(wide).covariance_
    # The data are random normal noise, so the estimate is roughly diagonal.
    assert_almost_equal(blocked_cov, np.eye(20), 0)

    # A block size that covers every feature (no splitting into blocks) must
    # give a consistent result.
    unblocked_cov = LedoitWolf(block_size=25).fit(wide).covariance_
    assert_almost_equal(unblocked_cov, blocked_cov)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage]
)
def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function):
    """Check that a 0-sample array is validated and rejected with a clear error."""
    no_samples = np.zeros((0, 2))
    expected_error = pytest.raises(ValueError, match="Found array with 0 sample")
    with expected_error:
        ledoit_wolf_fitting_function(no_samples)
|
||||
|
||||
|
||||
def test_oas():
    """Check `OAS` against the function API and `ShrunkCovariance`."""
    # ---- Explicitly centered data, assume_centered=True --------------------
    centered = X - X.mean(axis=0)
    model = OAS(assume_centered=True).fit(centered)
    # Reference values reused by the later sections of this test.
    ref_shrinkage = model.shrinkage_
    ref_score = model.score(centered)

    # Estimator versus the `oas` function.
    func_cov, func_shrinkage = oas(centered, assume_centered=True)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)

    # Estimator versus ShrunkCovariance using the same shrinkage.
    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_, assume_centered=True)
    shrunk.fit(centered)
    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)

    # Input with a single feature.
    single_feature = X[:, 0:1]
    model = OAS(assume_centered=True).fit(single_feature)
    func_cov, func_shrinkage = oas(single_feature, assume_centered=True)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)
    assert_array_almost_equal(
        (single_feature**2).sum() / n_samples, model.covariance_, 4
    )

    # Without storing the precision the score is unchanged.
    model = OAS(store_precision=False, assume_centered=True).fit(centered)
    assert_almost_equal(model.score(centered), ref_score, 4)
    assert model.precision_ is None

    # ---- Same checks without assuming centered data ------------------------
    model = OAS().fit(X)
    assert_almost_equal(model.shrinkage_, ref_shrinkage, 4)
    assert_almost_equal(model.score(X), ref_score, 4)

    # Estimator versus the `oas` function.
    func_cov, func_shrinkage = oas(X)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)

    # Estimator versus ShrunkCovariance using the same shrinkage.
    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_)
    shrunk.fit(X)
    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)

    # Input with a single feature.
    single_feature = X[:, 0].reshape((-1, 1))
    model = OAS().fit(single_feature)
    func_cov, func_shrinkage = oas(single_feature)
    assert_array_almost_equal(func_cov, model.covariance_, 4)
    assert_almost_equal(func_shrinkage, model.shrinkage_)
    assert_array_almost_equal(
        empirical_covariance(single_feature), model.covariance_, 4
    )

    # Input with a single sample: fitting warns and the covariance is zero.
    one_sample = np.arange(5).reshape(1, 5)
    model = OAS()
    warn_msg = "Only one sample available. You may want to reshape your data array"
    with pytest.warns(UserWarning, match=warn_msg):
        model.fit(one_sample)
    assert_array_almost_equal(
        model.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Without storing the precision the score is unchanged.
    model = OAS(store_precision=False).fit(X)
    assert_almost_equal(model.score(X), ref_score, 4)
    assert model.precision_ is None

    # ---- Private `_oas` function, without assuming centered data -----------
    first_column = X[:, 0:1]
    model = OAS().fit(first_column)
    private_cov, private_shrinkage = _oas(first_column)
    assert_array_almost_equal(private_cov, model.covariance_, 4)
    assert_almost_equal(private_shrinkage, model.shrinkage_)
    assert_array_almost_equal(
        (first_column**2).sum() / n_samples, model.covariance_, 4
    )
|
||||
|
||||
|
||||
def test_EmpiricalCovariance_validates_mahalanobis():
    """Check that `mahalanobis` rejects data with the wrong number of features."""
    fitted = EmpiricalCovariance().fit(X)
    # Only two of the columns the estimator was fitted on.
    first_two = X[:, :2]

    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
    with pytest.raises(ValueError, match=msg):
        fitted.mahalanobis(first_two)
|
||||
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.covariance import EllipticEnvelope
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
|
||||
def test_elliptic_envelope(global_random_seed):
    """Check basic consistency of the `EllipticEnvelope` outputs."""
    rng = np.random.RandomState(global_random_seed)
    data = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Prediction and decision function are unavailable before fitting.
    for unfitted_method in (envelope.predict, envelope.decision_function):
        with pytest.raises(NotFittedError):
            unfitted_method(data)

    envelope.fit(data)
    labels = envelope.predict(data)
    sample_scores = envelope.score_samples(data)
    decision_values = envelope.decision_function(data)

    # Scores are the negated Mahalanobis distances, which are stored in dist_.
    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(data))
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)

    # Against all-ones labels, the score is the fraction not predicted as -1.
    n_flagged = labels[labels == -1].size
    assert_almost_equal(envelope.score(data, np.ones(100)), (100 - n_flagged) / 100.0)

    # Samples predicted as -1 are exactly those with a negative decision value.
    assert sum(labels == -1) == sum(decision_values < 0)
|
||||
|
||||
|
||||
def test_score_samples():
    """Check `score_samples` against `decision_function` and `offset_`."""
    train = [[1, 1], [1, 2], [2, 1]]
    query = [[2.0, 2.0]]
    with_contamination = EllipticEnvelope(contamination=0.2).fit(train)
    default_model = EllipticEnvelope().fit(train)

    # For each model: score_samples == decision_function + offset_.
    for model in (with_contamination, default_model):
        assert_array_equal(
            model.score_samples(query), model.decision_function(query) + model.offset_
        )

    # The contamination setting must not change the raw scores.
    assert_array_equal(
        with_contamination.score_samples(query), default_model.score_samples(query)
    )
|
||||
@@ -0,0 +1,318 @@
|
||||
"""Test the graphical_lasso module."""
|
||||
|
||||
import sys
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.covariance import (
|
||||
GraphicalLasso,
|
||||
GraphicalLassoCV,
|
||||
empirical_covariance,
|
||||
graphical_lasso,
|
||||
)
|
||||
from sklearn.datasets import make_sparse_spd_matrix
|
||||
from sklearn.model_selection import GroupKFold
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_array_almost_equal,
|
||||
assert_array_less,
|
||||
)
|
||||
|
||||
|
||||
def test_graphical_lassos(random_state=1):
    """Test the graphical lasso solvers.

    This check is unstable for some random seeds, for which the covariances
    found with the "cd" and "lars" solvers differ (4 cases / 100 tries).
    """
    # Draw samples from a multivariate normal with a sparse precision matrix.
    n_dim = 20
    n_draws = 100
    rng = check_random_state(random_state)
    true_precision = make_sparse_spd_matrix(n_dim, alpha=0.95, random_state=rng)
    true_cov = linalg.inv(true_precision)
    X = rng.multivariate_normal(np.zeros(n_dim), true_cov, size=n_draws)
    emp_cov = empirical_covariance(X)

    solvers = ("cd", "lars")
    for alpha in (0.0, 0.1, 0.25):
        covs = {}
        icovs = {}
        for solver in solvers:
            covs[solver], icovs[solver], cost_history = graphical_lasso(
                emp_cov, return_costs=True, alpha=alpha, mode=solver
            )
            # Each history entry is a (cost, dual gap) pair.
            objective, _ = np.array(cost_history).T
            # The cost must never increase (this doesn't hold if alpha == 0);
            # 1e-12 is used since the cost difference can be exactly 0.
            if alpha != 0:
                assert_array_less(np.diff(objective), 1e-12)
        # Both solvers must give similar results.
        assert_allclose(covs["cd"], covs["lars"], atol=5e-4)
        assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4)

    # Smoke test the estimator; `covs` still holds the alpha=0.25 results.
    model = GraphicalLasso(alpha=0.25).fit(X)
    model.score(X)
    for solver in solvers:
        assert_array_almost_equal(model.covariance_, covs[solver], decimal=4)

    # On centered data, assume_centered=True and assume_centered=False must
    # return the same precision matrix.
    centered = X - X.mean(0)
    precisions = [
        GraphicalLasso(assume_centered=flag).fit(centered).precision_
        for flag in (False, True)
    ]
    assert_array_almost_equal(precisions[0], precisions[1])
|
||||
|
||||
|
||||
def test_graphical_lasso_when_alpha_equals_0():
    """Test graphical_lasso's early return condition when alpha=0."""
    # Use a seeded local generator rather than the global `np.random` state so
    # that the test data, and therefore any failure, is reproducible.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    emp_cov = empirical_covariance(X, assume_centered=True)
    expected_precision = np.linalg.inv(emp_cov)

    # Estimator API with a precomputed covariance.
    model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov)
    assert_allclose(model.precision_, expected_precision)

    # Function API.
    _, precision = graphical_lasso(emp_cov, alpha=0)
    assert_allclose(precision, expected_precision)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mode", ["cd", "lars"])
def test_graphical_lasso_n_iter(mode):
    """Check the iteration count reported when `max_iter` is set to 2."""
    features, _ = datasets.make_classification(
        n_samples=5_000, n_features=20, random_state=0
    )
    emp_cov = empirical_covariance(features)

    iteration_cap = 2
    _, _, iterations = graphical_lasso(
        emp_cov, 0.2, mode=mode, max_iter=iteration_cap, return_n_iter=True
    )
    assert iterations == iteration_cap
|
||||
|
||||
|
||||
def test_graphical_lasso_iris():
    """Compare both solvers on iris with a hard-coded R glasso solution.

    The reference matrices come from the R glasso package for alpha=1.0
    (`penalize.diagonal` needs to be set to FALSE).
    """
    expected_cov = np.array(
        [
            [0.68112222, 0.0000000, 0.265820, 0.02464314],
            [0.00000000, 0.1887129, 0.000000, 0.00000000],
            [0.26582000, 0.0000000, 3.095503, 0.28697200],
            [0.02464314, 0.0000000, 0.286972, 0.57713289],
        ]
    )
    expected_precision = np.array(
        [
            [1.5190747, 0.000000, -0.1304475, 0.0000000],
            [0.0000000, 5.299055, 0.0000000, 0.0000000],
            [-0.1304475, 0.000000, 0.3498624, -0.1683946],
            [0.0000000, 0.000000, -0.1683946, 1.8164353],
        ]
    )
    emp_cov = empirical_covariance(datasets.load_iris().data)

    for solver in ("cd", "lars"):
        estimated_cov, estimated_precision = graphical_lasso(
            emp_cov, alpha=1.0, return_costs=False, mode=solver
        )
        assert_array_almost_equal(estimated_cov, expected_cov)
        assert_array_almost_equal(estimated_precision, expected_precision)
|
||||
|
||||
|
||||
def test_graph_lasso_2D():
    """Compare both solvers on two iris features with a skggm solution.

    The reference matrices come from the Python skggm package and were
    obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`.
    """
    expected_cov = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]])
    expected_precision = np.array(
        [[1.52836773, -3.14334831], [-3.14334831, 8.19753385]]
    )
    # Keep only the columns from index 2 onwards.
    last_columns = datasets.load_iris().data[:, 2:]
    emp_cov = empirical_covariance(last_columns)

    for solver in ("cd", "lars"):
        estimated_cov, estimated_precision = graphical_lasso(
            emp_cov, alpha=0.1, return_costs=False, mode=solver
        )
        assert_array_almost_equal(estimated_cov, expected_cov)
        assert_array_almost_equal(estimated_precision, expected_precision)
|
||||
|
||||
|
||||
def test_graphical_lasso_iris_singular():
    """Compare both solvers with R glasso in the rank-deficient case.

    A small subset of rows is used; the samples need to be chosen such that
    none of the variances are zero. The reference matrices come from the R
    glasso package for alpha=0.01.
    """
    row_subset = np.arange(10, 13)

    expected_cov = np.array(
        [
            [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
            [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
            [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
            [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222],
        ]
    )
    expected_precision = np.array(
        [
            [24.42244057, -16.831679593, 0.0, 0.0],
            [-16.83168201, 24.351841681, -6.206896552, -12.5],
            [0.0, -6.206896171, 153.103448276, 0.0],
            [0.0, -12.499999143, 0.0, 462.5],
        ]
    )
    emp_cov = empirical_covariance(datasets.load_iris().data[row_subset, :])

    for solver in ("cd", "lars"):
        estimated_cov, estimated_precision = graphical_lasso(
            emp_cov, alpha=0.01, return_costs=False, mode=solver
        )
        assert_array_almost_equal(estimated_cov, expected_cov, decimal=5)
        assert_array_almost_equal(estimated_precision, expected_precision, decimal=5)
|
||||
|
||||
|
||||
def test_graphical_lasso_cv(random_state=1):
    """Smoke test `GraphicalLassoCV` in verbose mode."""
    # Draw a few samples from a multivariate normal with a sparse precision.
    n_dim = 5
    n_draws = 6
    rng = check_random_state(random_state)
    true_precision = make_sparse_spd_matrix(n_dim, alpha=0.96, random_state=rng)
    true_cov = linalg.inv(true_precision)
    data = rng.multivariate_normal(np.zeros(n_dim), true_cov, size=n_draws)

    # Swap stdout for an in-memory buffer while fitting so the verbose output
    # is captured, and always restore the real stream afterwards.
    saved_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        # Verbosity has to be very high for Parallel to print on stdout.
        GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(data)
    finally:
        sys.stdout = saved_stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"])
def test_graphical_lasso_cv_alphas_iterable(alphas_container_type):
    """Check that `alphas` accepts an array-like.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/22489
    """
    covariance = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    samples = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=covariance, size=200)

    # The same two values wrapped in the parametrized container type.
    alpha_values = _convert_container([0.02, 0.03], alphas_container_type)
    GraphicalLassoCV(alphas=alpha_values, tol=1e-1, n_jobs=1).fit(samples)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alphas,err_type,err_msg",
    [
        ([-0.02, 0.03], ValueError, "must be > 0"),
        ([0, 0.03], ValueError, "must be > 0"),
        (["not_number", 0.03], TypeError, "must be an instance of float"),
    ],
)
def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg):
    """Check the errors raised for invalid entries in an array-like `alphas`.

    A value outside of (0, inf] must raise a ValueError and a string must
    raise a TypeError.
    """
    covariance = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    samples = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=covariance, size=200)

    estimator = GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1)
    with pytest.raises(err_type, match=err_msg):
        estimator.fit(samples)
|
||||
|
||||
|
||||
def test_graphical_lasso_cv_scores():
    """Check the layout and summary statistics of `cv_results_`."""
    n_folds = 4
    alpha_grid_size = 5
    refinement_rounds = 3
    covariance = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    samples = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=covariance, size=200)

    estimator = GraphicalLassoCV(
        cv=n_folds, alphas=alpha_grid_size, n_refinements=refinement_rounds
    )
    fitted = estimator.fit(samples)

    _assert_graphical_lasso_cv_scores(
        cov=fitted,
        n_splits=n_folds,
        n_refinements=refinement_rounds,
        n_alphas=alpha_grid_size,
    )
|
||||
|
||||
|
||||
@config_context(enable_metadata_routing=True)
def test_graphical_lasso_cv_scores_with_routing(global_random_seed):
    """Check that `GraphicalLassoCV` passes `groups` on to its CV splitter."""
    n_folds = 5
    alpha_grid_size = 5
    refinement_rounds = 3
    covariance = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(global_random_seed)
    samples = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=covariance, size=300)
    # One random group label (0 to 4) per sample.
    group_labels = rng.randint(0, 5, samples.shape[0])

    # The splitter explicitly requests the `groups` metadata.
    splitter = GroupKFold(n_splits=n_folds)
    splitter.set_split_request(groups=True)

    estimator = GraphicalLassoCV(
        cv=splitter, alphas=alpha_grid_size, n_refinements=refinement_rounds
    )
    fitted = estimator.fit(samples, groups=group_labels)

    _assert_graphical_lasso_cv_scores(
        cov=fitted,
        n_splits=n_folds,
        n_refinements=refinement_rounds,
        n_alphas=alpha_grid_size,
    )
|
||||
|
||||
|
||||
def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas):
    """Assert that `cov.cv_results_` has the expected entries and statistics.

    Parameters
    ----------
    cov : object
        Object exposing a `cv_results_` mapping.
    n_splits : int
        Number of `split{i}_test_score` entries expected.
    n_refinements : int
        Combined with `n_alphas` to derive the expected entry length.
    n_alphas : int
        Combined with `n_refinements` to derive the expected entry length.
    """
    results = cov.cv_results_
    expected_length = n_refinements * n_alphas + 1
    score_keys = [f"split{i}_test_score" for i in range(n_splits)]

    # The alphas and every per-split score list must be present with the
    # expected number of entries.
    for name in ["alphas", *score_keys]:
        assert name in results
        assert len(results[name]) == expected_length

    # The stored mean / std must match statistics recomputed across splits.
    per_split = np.asarray([results[name] for name in score_keys])
    assert_allclose(results["mean_test_score"], per_split.mean(axis=0))
    assert_allclose(results["std_test_score"], per_split.std(axis=0))
|
||||
@@ -0,0 +1,171 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
X = datasets.load_iris().data
|
||||
X_1d = X[:, 0]
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
|
||||
def test_mcd(global_random_seed):
    """Run the FastMCD checks on a range of synthetic data sets."""
    # Each row: (n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support)
    scenarios = [
        # Small data set, without outliers (random independent normal data).
        (100, 5, 0, 0.02, 0.1, 75),
        # Small data set, medium contamination.
        (100, 5, 20, 0.3, 0.3, 65),
        # Small data set, strong contamination.
        (100, 5, 40, 0.1, 0.1, 50),
        # Medium data set.
        (1000, 5, 450, 0.1, 0.1, 540),
        # Large data set.
        (1700, 5, 800, 0.1, 0.1, 870),
        # 1D data set.
        (500, 1, 100, 0.02, 0.02, 350),
        # n_samples == n_features.
        (20, 20, 0, 0.1, 0.1, 15),
    ]
    for scenario in scenarios:
        launch_mcd_on_dataset(*scenario, global_random_seed)
|
||||
|
||||
|
||||
def test_fast_mcd_on_invalid_input():
    """Check that `fast_mcd` rejects a 1D array."""
    flat = np.arange(100)
    expected_error = pytest.raises(
        ValueError, match="Expected 2D array, got 1D array instead"
    )
    with expected_error:
        fast_mcd(flat)
|
||||
|
||||
|
||||
def test_mcd_class_on_invalid_input():
    """Check that `MinCovDet.fit` rejects a 1D array."""
    flat = np.arange(100)
    estimator = MinCovDet()
    expected_error = pytest.raises(
        ValueError, match="Expected 2D array, got 1D array instead"
    )
    with expected_error:
        estimator.fit(flat)
|
||||
|
||||
|
||||
def launch_mcd_on_dataset(
    n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed
):
    """Fit `MinCovDet` on contaminated normal data and check its estimates.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the generated data.
    n_outliers : int
        Number of rows that get shifted by a +/- 5 offset per feature.
    tol_loc, tol_cov : float
        Upper bounds on the mean squared error of the location and of the
        covariance, both measured against the estimates from the inliers.
    tol_support : int
        Minimum number of samples expected in the MCD support.
    seed : int
        Seed for the data generator and for the estimator.
    """
    rng = np.random.RandomState(seed)
    samples = rng.randn(n_samples, n_features)

    # Contaminate a random subset of rows.
    outlier_rows = rng.permutation(n_samples)[:n_outliers]
    shifts = 10.0 * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    samples[outlier_rows] += shifts
    is_inlier = np.ones(n_samples, dtype=bool)
    is_inlier[outlier_rows] = False
    inliers = samples[is_inlier]

    # Compute the MCD by fitting an estimator object.
    fitted = MinCovDet(random_state=seed).fit(samples)

    # Compare with the estimates learnt from the inliers only.
    location_error = np.mean((inliers.mean(0) - fitted.location_) ** 2)
    assert location_error < tol_loc
    covariance_error = np.mean(
        (empirical_covariance(inliers) - fitted.covariance_) ** 2
    )
    assert covariance_error < tol_cov
    assert np.sum(fitted.support_) >= tol_support

    # Distances recomputed on the training data must match the stored ones.
    assert_array_almost_equal(fitted.mahalanobis(samples), fitted.dist_)
|
||||
|
||||
|
||||
def test_mcd_issue1127():
    """Check that fitting does not break with X.shape = (3, 1).

    This is the case where n_support = n_samples.
    """
    rng = np.random.RandomState(0)
    tiny = rng.normal(size=(3, 1))
    estimator = MinCovDet()
    estimator.fit(tiny)
|
||||
|
||||
|
||||
def test_mcd_issue3367(global_random_seed):
    """Check that MCD completes when the covariance matrix is singular.

    Here "singular" means that one of the rows and columns is all zeros.
    """
    rng = np.random.RandomState(global_random_seed)

    # Values used for both the X and the Y coordinate: 10 values in [-5, 5].
    axis_values = np.linspace(-5, 5, 10).tolist()
    # Cartesian product of all possible coordinate pairs.
    grid = np.array(list(itertools.product(axis_values, axis_values)))

    # An all-zero third column puts every point in the XY plane, which makes
    # the covariance matrix singular.
    planar = np.hstack((grid, np.zeros((grid.shape[0], 1))))

    # The fit below is the actual check: it must complete without raising on
    # this singular covariance.
    #
    # Possible further test: the eigenvector associated with the smallest
    # eigenvalue of the fitted covariance should be the plane normal
    # [0, 0, 1], since all points lie in the XY plane. One would start with:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # and then assert that `normal` equals [0, 0, 1]. Because of floating
    # point error, it is best to subtract the two and compare against a small
    # tolerance (e.g. 1e-12).
    MinCovDet(random_state=rng).fit(planar)
|
||||
|
||||
|
||||
def test_mcd_support_covariance_is_zero():
    """Check the error raised when the support data has zero covariance.

    MCD must raise a ValueError with an informative message in that case.
    """
    # Two single-feature data sets in which one value is repeated many times.
    first = np.array(
        [0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]
    ).reshape(-1, 1)
    second = np.array(
        [0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]
    ).reshape(-1, 1)
    msg = (
        "The covariance matrix of the support data is equal to 0, try to "
        "increase support_fraction"
    )
    for dataset in (first, second):
        with pytest.raises(ValueError, match=msg):
            MinCovDet().fit(dataset)
|
||||
|
||||
|
||||
def test_mcd_increasing_det_warning(global_random_seed):
    """Check the warning raised when determinants increase during the c_step.

    In theory the sequence of determinants should be decreasing. Increasing
    determinants are likely due to ill-conditioned covariance matrices that
    result in poor precision matrices.
    """
    samples = [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5.0, 3.6, 1.4, 0.2],
        [4.6, 3.4, 1.4, 0.3],
        [5.0, 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3.0, 1.4, 0.1],
        [4.3, 3.0, 1.1, 0.1],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [4.6, 3.6, 1.0, 0.2],
        [5.0, 3.0, 1.6, 0.2],
        [5.2, 3.5, 1.5, 0.2],
    ]

    estimator = MinCovDet(support_fraction=0.5, random_state=global_random_seed)
    expected_warning = pytest.warns(RuntimeWarning, match="Determinant has increased")
    with expected_warning:
        estimator.fit(samples)
|
||||
Reference in New Issue
Block a user