add read me
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,200 @@
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal
|
||||
|
||||
from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import _convert_container
|
||||
from sklearn.utils.validation import check_array
|
||||
|
||||
# Module-level fixtures shared by the ball-tree tests.
rng = np.random.RandomState(10)

# Symmetric matrix obtained by multiplying a random 3x3 matrix by its transpose.
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = V_mahalanobis @ V_mahalanobis.T

DIMENSION = 3

# Metric name -> extra keyword arguments forwarded to the tree constructor.
METRICS = {
    "euclidean": {},
    "manhattan": {},
    "minkowski": {"p": 3},
    "chebyshev": {},
}

DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"]

BOOLEAN_METRICS = [
    "jaccard",
    "dice",
    "rogerstanimoto",
    "russellrao",
    "sokalmichener",
    "sokalsneath",
]

# Tree implementations exercised by the parametrized tests.
BALL_TREE_CLASSES = [BallTree64, BallTree32]
|
||||
|
||||
|
||||
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Reference k-nearest-neighbors search based on the full distance matrix.

    Both inputs are validated with ``check_array``. Distances from every row
    of ``Y`` to every row of ``X`` are computed with the requested
    ``DistanceMetric`` (built with ``**kwargs``), and for each row of ``Y``
    the ``k`` smallest ones are kept in increasing order.

    Returns
    -------
    dist, ind : the selected distances and the matching column indices,
        one row per row of ``Y`` and ``k`` columns each.
    """
    from sklearn.metrics import DistanceMetric

    X = check_array(X)
    Y = check_array(Y)

    distance_metric = DistanceMetric.get_metric(metric, **kwargs)
    pairwise = distance_metric.pairwise(Y, X)

    nearest = np.argsort(pairwise, axis=1)[:, :k]
    row_selector = np.arange(Y.shape[0])[:, None]
    return pairwise[row_selector, nearest], nearest
|
||||
|
||||
|
||||
def test_BallTree_is_BallTree64_subclass():
    """The public ``BallTree`` must derive from the float64 implementation."""
    assert issubclass(BallTree, BallTree64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS))
@pytest.mark.parametrize("array_type", ["list", "array"])
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation):
    """Compare tree query distances with brute force for boolean/discrete metrics.

    Only the distances are compared; the returned indices are not checked.
    """
    random_state = check_random_state(0)

    # Boolean metrics get rounded values in [0, 1], discrete ones in [0, 4].
    if metric in BOOLEAN_METRICS:
        scale = 1
    elif metric in DISCRETE_METRICS:
        scale = 4

    X = (scale * random_state.random_sample((40, 10))).round(0)
    Y = (scale * random_state.random_sample((10, 10))).round(0)
    X = _convert_container(X, array_type)
    Y = _convert_container(Y, array_type)

    k = 5

    tree = BallTreeImplementation(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, k)
    reference_dist, _ = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(tree_dist, reference_dist)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5])
)
def test_query_haversine(BallTreeImplementation, decimal_tol):
    """Check haversine queries against the brute-force reference.

    Each implementation in ``BALL_TREE_CLASSES`` is paired with its own
    decimal tolerance for the distance comparison.
    """
    random_state = check_random_state(0)
    X = 2 * np.pi * random_state.random_sample((40, 2))

    tree = BallTreeImplementation(X, leaf_size=1, metric="haversine")
    tree_dist, tree_ind = tree.query(X, k=5)
    ref_dist, ref_ind = brute_force_neighbors(X, X, k=5, metric="haversine")

    assert_array_almost_equal(tree_dist, ref_dist, decimal=decimal_tol)
    assert_array_almost_equal(tree_ind, ref_ind)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_array_object_type(BallTreeImplementation):
    """Check that we do not accept object dtype array."""
    # Rows of different lengths, stored as an object array.
    ragged_rows = [(1, 2, 3), (2, 5), (5, 5, 1, 2)]
    X = np.array(ragged_rows, dtype=object)

    expected_msg = "setting an array element with a sequence"
    with pytest.raises(ValueError, match=expected_msg):
        BallTreeImplementation(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_bad_pyfunc_metric(BallTreeImplementation):
    """Check the ``TypeError`` raised for ill-formed user-defined metrics."""

    def wrong_returned_value(x, y):
        return "1"

    def one_arg_func(x):
        return 1.0  # pragma: no cover

    X = np.ones((5, 2))

    # (callable passed as metric, regex expected in the error message)
    expectations = [
        (
            wrong_returned_value,
            "Custom distance function must accept two vectors and return a float.",
        ),
        (one_arg_func, "takes 1 positional argument but 2 were given"),
    ]
    for bad_metric, msg in expectations:
        with pytest.raises(TypeError, match=msg):
            BallTreeImplementation(X, metric=bad_metric)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS))
def test_ball_tree_numerical_consistency(global_random_seed, metric):
    """Results on float64 and float32 versions of a dataset must be close."""
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(
        random_seed=global_random_seed, features=50
    )

    # Metrics missing from METRICS (the boolean ones) take no extra parameter.
    params = METRICS.get(metric, {})
    tree_64 = BallTree64(X_64, leaf_size=1, metric=metric, **params)
    tree_32 = BallTree32(X_32, leaf_size=1, metric=metric, **params)

    # `query`: same neighbors, close distances, dtype matching the tree.
    k = 5
    knn_dist_64, knn_ind_64 = tree_64.query(Y_64, k=k)
    knn_dist_32, knn_ind_32 = tree_32.query(Y_32, k=k)
    assert_allclose(knn_dist_64, knn_dist_32, rtol=1e-5)
    assert_equal(knn_ind_64, knn_ind_32)
    assert knn_dist_64.dtype == np.float64
    assert knn_dist_32.dtype == np.float32

    # `query_radius` returning indices only.
    r = 2.38
    radius_ind_64 = tree_64.query_radius(Y_64, r=r)
    radius_ind_32 = tree_32.query_radius(Y_32, r=r)
    for row_64, row_32 in zip(radius_ind_64, radius_ind_32):
        assert_equal(row_64, row_32)

    # `query_radius` returning indices and distances.
    radius_ind_64, radius_dist_64 = tree_64.query_radius(
        Y_64, r=r, return_distance=True
    )
    radius_ind_32, radius_dist_32 = tree_32.query_radius(
        Y_32, r=r, return_distance=True
    )
    per_query = zip(radius_ind_64, radius_ind_32, radius_dist_64, radius_dist_32)
    for row_ind_64, row_ind_32, row_dist_64, row_dist_32 in per_query:
        assert_equal(row_ind_64, row_ind_32)
        assert_allclose(row_dist_64, row_dist_32, rtol=1e-5)
        assert row_dist_64.dtype == np.float64
        assert row_dist_32.dtype == np.float32
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS))
def test_kernel_density_numerical_consistency(global_random_seed, metric):
    """`kernel_density` must agree between the float64 and float32 trees."""
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)

    params = METRICS.get(metric, {})
    tree_64 = BallTree64(X_64, leaf_size=1, metric=metric, **params)
    tree_32 = BallTree32(X_32, leaf_size=1, metric=metric, **params)

    kde_kwargs = {"h": 0.1, "kernel": "gaussian", "breadth_first": True}
    density64 = tree_64.kernel_density(Y_64, **kde_kwargs)
    density32 = tree_32.kernel_density(Y_32, **kde_kwargs)

    assert_allclose(density64, density32, rtol=1e-5)
    # Each tree returns densities in its own floating point precision.
    assert density64.dtype == np.float64
    assert density32.dtype == np.float32
|
||||
|
||||
|
||||
def test_two_point_correlation_numerical_consistency(global_random_seed):
    """`two_point_correlation` must agree between float64 and float32 trees."""
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)

    tree_64 = BallTree64(X_64, leaf_size=10)
    tree_32 = BallTree32(X_32, leaf_size=10)

    # Ten evenly spaced radii between 0 and 1.
    radii = np.linspace(0, 1, 10)

    counts_64 = tree_64.two_point_correlation(Y_64, r=radii, dualtree=True)
    counts_32 = tree_32.two_point_correlation(Y_32, r=radii, dualtree=True)
    assert_allclose(counts_64, counts_32)
|
||||
|
||||
|
||||
def get_dataset_for_binary_tree(random_seed, features=3, n_samples=100, n_queries=5):
    """Build matching float64 / float32 datasets for binary-tree tests.

    Parameters
    ----------
    random_seed : int
        Seed of the ``np.random.RandomState`` used to draw the data.
    features : int, default=3
        Number of columns of both generated arrays.
    n_samples : int, default=100
        Number of rows of the training array ``X``.
    n_queries : int, default=5
        Number of rows of the query array ``Y``.

    Returns
    -------
    X_64, X_32, Y_64, Y_32 : ndarray
        ``X`` and ``Y`` drawn uniformly in [0, 1), each provided both as
        float64 and as float32 so that results obtained on the two
        precisions can be compared.
    """
    rng = np.random.RandomState(random_seed)
    # Draw X first, then Y, so the stream is consumed in a fixed order.
    _X = rng.rand(n_samples, features)
    _Y = rng.rand(n_queries, features)

    X_64 = _X.astype(dtype=np.float64, copy=False)
    Y_64 = _Y.astype(dtype=np.float64, copy=False)

    X_32 = _X.astype(dtype=np.float32, copy=False)
    Y_32 = _Y.astype(dtype=np.float32, copy=False)

    return X_64, X_32, Y_64, Y_32
|
||||
@@ -0,0 +1,101 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.metrics import euclidean_distances
|
||||
from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer
|
||||
from sklearn.neighbors._base import _is_sorted_by_data
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
|
||||
def test_transformer_result():
    """Check shape, stored entries, format and ordering of the returned graphs."""
    n_neighbors = 5
    n_samples_fit = 20
    n_queries = 18
    n_features = 10

    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_queries, n_features)
    radius = np.percentile(euclidean_distances(X), 10)

    modes = ["distance", "connectivity"]

    # with n_neighbors: the number of stored entries is fully determined
    for mode in modes:
        # one extra entry per row is expected in "distance" mode
        entries_per_row = n_neighbors + (mode == "distance")
        nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)

        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape == (n_samples_fit * entries_per_row,)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape == (n_queries * entries_per_row,)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)

    # with radius: the number of stored entries must differ from the
    # fixed count obtained with n_neighbors
    for mode in modes:
        entries_per_row = n_neighbors + (mode == "distance")
        nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)

        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape != (n_samples_fit * entries_per_row,)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape != (n_queries * entries_per_row,)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)
|
||||
|
||||
|
||||
def _has_explicit_diagonal(X):
    """Return True if the diagonal is explicitly stored.

    ``X`` is converted to COO format and the stored entries whose row index
    equals their column index are counted. The diagonal of a matrix has
    ``min(n_rows, n_cols)`` entries, so that is the count to reach; comparing
    against the number of rows alone could never succeed for a matrix with
    more rows than columns.
    """
    X = X.tocoo()
    n_stored_on_diagonal = int(np.count_nonzero(X.row == X.col))
    return n_stored_on_diagonal == min(X.shape)
|
||||
|
||||
|
||||
def test_explicit_diagonal():
    """Test that the diagonal is explicitly stored in the sparse graph."""
    n_neighbors = 5
    n_samples_fit, n_samples_transform, n_features = 20, 18, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_samples_transform, n_features)

    nnt = KNeighborsTransformer(n_neighbors=n_neighbors)

    def check_graph_on_fitted_data(graph):
        # Every row stores n_neighbors + 1 values and the first one is zero.
        assert _has_explicit_diagonal(graph)
        per_row = graph.data.reshape(n_samples_fit, n_neighbors + 1)
        assert np.all(per_row[:, 0] == 0)

    check_graph_on_fitted_data(nnt.fit_transform(X))
    check_graph_on_fitted_data(nnt.transform(X))

    # Using transform on new data should not always have zero diagonal
    X2t = nnt.transform(X2)
    assert not _has_explicit_diagonal(X2t)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer])
def test_graph_feature_names_out(Klass):
    """Check `get_feature_names_out` for transformers defined in `_graph.py`."""
    n_samples_fit, n_features = 20, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)

    est = Klass().fit(X)
    names_out = est.get_feature_names_out()

    # Expected names: lower-cased class name followed by the sample index.
    prefix = Klass.__name__.lower()
    expected_names_out = np.array(
        [prefix + str(i) for i in range(est.n_samples_fit_)],
        dtype=object,
    )
    assert_array_equal(names_out, expected_names_out)
|
||||
@@ -0,0 +1,100 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_equal
|
||||
|
||||
from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64
|
||||
from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree
|
||||
from sklearn.utils.parallel import Parallel, delayed
|
||||
|
||||
DIMENSION = 3

# Metric name -> extra keyword arguments forwarded to the tree constructor.
METRICS = {
    "euclidean": {},
    "manhattan": {},
    "chebyshev": {},
    "minkowski": {"p": 3},
}

# Tree implementations exercised by the parametrized tests.
KD_TREE_CLASSES = [KDTree64, KDTree32]
|
||||
|
||||
|
||||
def test_KDTree_is_KDTree64_subclass():
    """The public ``KDTree`` must derive from the float64 implementation."""
    assert issubclass(KDTree, KDTree64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES)
def test_array_object_type(BinarySearchTree):
    """Check that we do not accept object dtype array."""
    # Rows of different lengths, stored as an object array.
    ragged_rows = [(1, 2, 3), (2, 5), (5, 5, 1, 2)]
    X = np.array(ragged_rows, dtype=object)

    expected_msg = "setting an array element with a sequence"
    with pytest.raises(ValueError, match=expected_msg):
        BinarySearchTree(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES)
def test_kdtree_picklable_with_joblib(BinarySearchTree):
    """Make sure that KDTree queries work when joblib memmaps.

    Non-regression test for #21685 and #21228."""
    random_state = np.random.RandomState(0)
    X = random_state.random_sample((10, 3))
    tree = BinarySearchTree(X, leaf_size=2)

    # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that
    # used to raise "ValueError: buffer source array is read-only" in a
    # previous version of the Cython code.
    queries = [X, X]
    run_in_parallel = Parallel(n_jobs=2, max_nbytes=1)
    run_in_parallel(delayed(tree.query)(data) for data in queries)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", METRICS)
def test_kd_tree_numerical_consistency(global_random_seed, metric):
    """Results on float64 and float32 versions of a dataset must be close."""
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(
        random_seed=global_random_seed, features=50
    )

    params = METRICS.get(metric, {})
    tree_64 = KDTree64(X_64, leaf_size=2, metric=metric, **params)
    tree_32 = KDTree32(X_32, leaf_size=2, metric=metric, **params)

    # `query`: same neighbors, close distances, dtype matching the tree.
    k = 4
    knn_dist_64, knn_ind_64 = tree_64.query(Y_64, k=k)
    knn_dist_32, knn_ind_32 = tree_32.query(Y_32, k=k)
    assert_allclose(knn_dist_64, knn_dist_32, rtol=1e-5)
    assert_equal(knn_ind_64, knn_ind_32)
    assert knn_dist_64.dtype == np.float64
    assert knn_dist_32.dtype == np.float32

    # `query_radius` returning indices only.
    r = 2.38
    radius_ind_64 = tree_64.query_radius(Y_64, r=r)
    radius_ind_32 = tree_32.query_radius(Y_32, r=r)
    for row_64, row_32 in zip(radius_ind_64, radius_ind_32):
        assert_equal(row_64, row_32)

    # `query_radius` returning indices and distances.
    radius_ind_64, radius_dist_64 = tree_64.query_radius(
        Y_64, r=r, return_distance=True
    )
    radius_ind_32, radius_dist_32 = tree_32.query_radius(
        Y_32, r=r, return_distance=True
    )
    per_query = zip(radius_ind_64, radius_ind_32, radius_dist_64, radius_dist_32)
    for row_ind_64, row_ind_32, row_dist_64, row_dist_32 in per_query:
        assert_equal(row_ind_64, row_ind_32)
        assert_allclose(row_dist_64, row_dist_32, rtol=1e-5)
        assert row_dist_64.dtype == np.float64
        assert row_dist_32.dtype == np.float32
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", METRICS)
def test_kernel_density_numerical_consistency(global_random_seed, metric):
    """`kernel_density` must agree between the float64 and float32 trees."""
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)

    params = METRICS.get(metric, {})
    tree_64 = KDTree64(X_64, leaf_size=2, metric=metric, **params)
    tree_32 = KDTree32(X_32, leaf_size=2, metric=metric, **params)

    kde_kwargs = {"h": 0.1, "kernel": "gaussian", "breadth_first": True}
    density64 = tree_64.kernel_density(Y_64, **kde_kwargs)
    density32 = tree_32.kernel_density(Y_32, **kde_kwargs)

    assert_allclose(density64, density32, rtol=1e-5)
    # Each tree returns densities in its own floating point precision.
    assert density64.dtype == np.float64
    assert density32.dtype == np.float32
|
||||
@@ -0,0 +1,252 @@
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors
|
||||
from sklearn.neighbors._ball_tree import kernel_norm
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
# XXX Duplicated in test_neighbors_tree, test_kde
|
||||
def compute_kernel_slow(Y, X, kernel, h):
    """Reference kernel density of ``X`` evaluated at the rows of ``Y``.

    ``h`` is either a number or one of the strings ``"scott"`` /
    ``"silverman"``, in which case the bandwidth is derived from the shape of
    ``X``. Euclidean distances between every row of ``Y`` and every row of
    ``X`` are computed densely, the kernel is applied, and the contributions
    are summed per row of ``Y`` and scaled by ``kernel_norm(...) / n_samples``.

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the six supported names.
    """
    n_samples, n_features = X.shape[0], X.shape[1]

    if h == "scott":
        h = n_samples ** (-1 / (n_features + 4))
    elif h == "silverman":
        h = (n_samples * (n_features + 2) / 4) ** (-1 / (n_features + 4))

    # d[i, j] is the Euclidean distance between Y[i] and X[j].
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
    norm = kernel_norm(h, n_features, kernel) / n_samples

    if kernel == "gaussian":
        contributions = np.exp(-0.5 * (d * d) / (h * h))
    elif kernel == "tophat":
        contributions = d < h
    elif kernel == "epanechnikov":
        contributions = (1.0 - (d * d) / (h * h)) * (d < h)
    elif kernel == "exponential":
        contributions = np.exp(-d / h)
    elif kernel == "linear":
        contributions = (1 - d / h) * (d < h)
    elif kernel == "cosine":
        contributions = np.cos(0.5 * np.pi * d / h) * (d < h)
    else:
        raise ValueError("kernel not recognized")

    return norm * contributions.sum(-1)
|
||||
|
||||
|
||||
def check_results(
    kernel, bandwidth, atol, rtol, X, Y, dens_true, breadth_first=True
):
    """Compare a ``KernelDensity`` fitted on ``X`` with the reference density.

    Parameters
    ----------
    kernel, bandwidth, atol, rtol
        Forwarded to the ``KernelDensity`` constructor; ``atol`` and ``rtol``
        are also the tolerances of the comparisons (``rtol`` floored at 1e-7).
    X, Y
        Training data and evaluation points.
    dens_true
        Expected density at each row of ``Y``.
    breadth_first : bool, default=True
        Tree traversal strategy forwarded to ``KernelDensity``.
    """
    kde = KernelDensity(
        kernel=kernel,
        bandwidth=bandwidth,
        atol=atol,
        rtol=rtol,
        breadth_first=breadth_first,
    )
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol))
    # `score` is compared, once exponentiated, with the product of the
    # expected densities.
    assert_allclose(
        np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol)
    )


@pytest.mark.parametrize(
    "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
)
@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"])
def test_kernel_density(kernel, bandwidth):
    """Check ``KernelDensity`` against the slow reference implementation.

    Every combination of tolerances and traversal strategy is exercised.
    """
    n_samples, n_features = (100, 3)

    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_features)

    dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)

    for rtol in [0, 1e-5]:
        for atol in [1e-6, 1e-2]:
            for breadth_first in (True, False):
                # The traversal strategy is forwarded explicitly; without it
                # both iterations would check the exact same estimator.
                check_results(
                    kernel,
                    bandwidth,
                    atol,
                    rtol,
                    X,
                    Y,
                    dens_true,
                    breadth_first=breadth_first,
                )
|
||||
|
||||
|
||||
def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check ``KernelDensity.sample`` for supported and unsupported kernels.

    For the gaussian and tophat kernels the drawn samples must have the shape
    of the training data and lie close to it; the other kernels must raise
    ``NotImplementedError``. A final non-regression check covers the shape of
    a default single draw.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    # Index over the training data, used to measure how far each drawn
    # sample lies from its closest training point.
    nbrs = NearestNeighbors(n_neighbors=1).fit(X)

    for kernel in ["gaussian", "tophat"]:
        # draw as many samples as training points; the seed is fixed so the
        # range checks below are deterministic
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(n_samples, random_state=0)
        assert X.shape == samp.shape

        # check that samples are in the right range: query the *samples*
        # (querying X against itself would always give a zero distance)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == "tophat":
            assert np.all(dist < bandwidth)
        elif kernel == "gaussian":
            # 5 standard deviations is safe for 100 samples.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        with pytest.raises(NotImplementedError):
            kde.sample(100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"])
@pytest.mark.parametrize(
    "metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"]
)
def test_kde_algorithm_metric_choice(algorithm, metric):
    """Smoke test for various metrics and algorithms."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)  # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    kde = KernelDensity(algorithm=algorithm, metric=metric)

    unsupported = algorithm == "kd_tree" and metric not in KDTree.valid_metrics
    if unsupported:
        # A metric the KD tree does not list as valid must be rejected at fit.
        with pytest.raises(ValueError, match="invalid metric"):
            kde.fit(X)
        return

    kde.fit(X)
    y_dens = kde.score_samples(Y)
    # One score per row of Y.
    assert y_dens.shape == Y.shape[:1]
|
||||
|
||||
|
||||
def test_kde_score(n_samples=100, n_features=3):
    """Placeholder: this test currently performs no check."""
    # FIXME: not implemented yet. The intended fixtures were:
    # rng = np.random.RandomState(0)
    # X = rng.random_sample((n_samples, n_features))
    # Y = rng.random_sample((n_samples, n_features))
|
||||
|
||||
|
||||
def test_kde_sample_weights_error():
    """``fit`` must raise ``ValueError`` for 2D or negated sample weights."""
    kde = KernelDensity()

    # sample_weight with the same 2D shape as the data
    with pytest.raises(ValueError):
        kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10)))

    # sample_weight obtained by negating random draws
    with pytest.raises(ValueError):
        kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200))
|
||||
|
||||
|
||||
def test_kde_pipeline_gridsearch():
    """Test that kde plays nice in pipelines and grid-searches."""
    blob_centers = [[0, 1], [1, 0], [0, 0]]
    X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=blob_centers)

    scaler = StandardScaler(with_mean=False, with_std=False)
    pipe1 = make_pipeline(scaler, KernelDensity(kernel="gaussian"))

    param_grid = {"kerneldensity__bandwidth": [0.001, 0.01, 0.1, 1, 10]}
    search = GridSearchCV(pipe1, param_grid=param_grid)
    search.fit(X)

    assert search.best_params_["kerneldensity__bandwidth"] == 0.1
|
||||
|
||||
|
||||
def test_kde_sample_weights():
    """Check the effect of ``sample_weight`` on scores and drawn samples.

    For several dimensions, algorithms and metrics: constant weights must be
    neutral, integer weights must match repeating the rows, weights must
    change the scores, and rescaling the weights must not.
    """
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.0)
    algorithms = ["auto", "ball_tree", "kd_tree"]
    metrics = ["euclidean", "minkowski", "manhattan", "chebyshev"]

    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        # Integer weights, always >= 1, derived from the row sums.
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)

        for algorithm in algorithms:
            for metric in metrics:
                # Skip metrics the KD tree does not list as valid.
                if algorithm == "kd_tree" and metric not in KDTree.valid_metrics:
                    continue

                kde = KernelDensity(algorithm=algorithm, metric=metric)

                # Test that adding a constant sample weight has no effect
                kde.fit(X, sample_weight=weights_neutral)
                scores_const_weight = kde.score_samples(test_points)
                sample_const_weight = kde.sample(random_state=1234)
                kde.fit(X)
                scores_no_weight = kde.score_samples(test_points)
                sample_no_weight = kde.sample(random_state=1234)
                assert_allclose(scores_const_weight, scores_no_weight)
                assert_allclose(sample_const_weight, sample_no_weight)

                # Test equivalence between sampling and (integer) weights
                kde.fit(X, sample_weight=weights)
                scores_weight = kde.score_samples(test_points)
                sample_weight = kde.sample(random_state=1234)
                kde.fit(X_repetitions)
                scores_ref_sampling = kde.score_samples(test_points)
                sample_ref_sampling = kde.sample(random_state=1234)
                assert_allclose(scores_weight, scores_ref_sampling)
                assert_allclose(sample_weight, sample_ref_sampling)

                # Test that sample weights has a non-trivial effect
                largest_gap = np.max(np.abs(scores_no_weight - scores_weight))
                assert largest_gap > 0.001

                # Test invariance with respect to arbitrary scaling
                scale_factor = rng.rand()
                kde.fit(X, sample_weight=(scale_factor * weights))
                scores_scaled_weight = kde.score_samples(test_points)
                assert_allclose(scores_scaled_weight, scores_weight)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]])
def test_pickling(tmpdir, sample_weight):
    """Predictions must be the same before and after a dump/load round trip.

    Used to be a bug because sample_weights wasn't pickled and the resulting
    tree would miss some info.
    """
    training_data = np.reshape([1.0, 2.0, 3.0], (-1, 1))
    kde = KernelDensity()
    kde.fit(training_data, sample_weight=sample_weight)

    X = np.reshape([1.1, 2.1], (-1, 1))
    scores = kde.score_samples(X)

    # Round trip through a joblib dump in the temporary directory.
    file_path = str(tmpdir.join("dump.pkl"))
    joblib.dump(kde, file_path)
    reloaded = joblib.load(file_path)
    scores_pickled = reloaded.score_samples(X)

    assert_allclose(scores, scores_pickled)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["score_samples", "sample"])
def test_check_is_fitted(method):
    """Calling ``method`` on an unfitted estimator must raise NotFittedError."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    unfitted_kde = KernelDensity()

    bound_method = getattr(unfitted_kde, method)
    with pytest.raises(NotFittedError):
        bound_method(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1])
def test_bandwidth(bandwidth):
    """Check output shapes and the value stored in ``bandwidth_``."""
    n_samples, n_features = (100, 3)
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    kde = KernelDensity(bandwidth=bandwidth).fit(X)
    samp = kde.sample(100)
    kde_sc = kde.score_samples(X)
    assert X.shape == samp.shape
    assert kde_sc.shape == (n_samples,)

    # Test that the attribute self.bandwidth_ has the expected value
    n_rows, n_cols = X.shape[0], X.shape[1]
    if bandwidth == "scott":
        expected_bandwidth = n_rows ** (-1 / (n_cols + 4))
    elif bandwidth == "silverman":
        expected_bandwidth = (n_rows * (n_cols + 2) / 4) ** (-1 / (n_cols + 4))
    else:
        # A numeric bandwidth is expected to be stored as given.
        expected_bandwidth = bandwidth
    assert kde.bandwidth_ == pytest.approx(expected_bandwidth)
|
||||
@@ -0,0 +1,394 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import re
|
||||
from math import sqrt
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import metrics, neighbors
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
from sklearn.utils.estimator_checks import (
|
||||
check_outlier_corruption,
|
||||
parametrize_with_checks,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# Load the iris dataset and shuffle its samples with a fixed seed; data and
# target are reordered with the same permutation.
rng = check_random_state(0)
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]
|
||||
|
||||
|
||||
def test_lof(global_dtype):
    """Check scores and predictions of LocalOutlierFactor on a toy dataset."""
    # Toy sample (the last two samples are outliers):
    toy_points = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]
    X = np.asarray(toy_points, dtype=global_dtype)

    # Test LocalOutlierFactor:
    clf = neighbors.LocalOutlierFactor(n_neighbors=5)
    score = clf.fit(X).negative_outlier_factor_
    assert_array_equal(clf._fit_X, X)

    # Assert largest outlier score is smaller than smallest inlier score:
    inlier_scores, outlier_scores = score[:-2], score[-2:]
    assert np.min(inlier_scores) > np.max(outlier_scores)

    # Assert predict() works:
    clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X)
    expected_predictions = [1] * 6 + [-1] * 2
    assert_array_equal(clf._predict(), expected_predictions)
    assert_array_equal(clf.fit_predict(X), expected_predictions)
|
||||
|
||||
|
||||
def test_lof_performance(global_dtype):
    """Novelty detection must separate regular points from uniform outliers."""
    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False)
    X_train, X_regular_test = X[:100], X[100:]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_outliers = X_outliers.astype(global_dtype, copy=False)

    # 20 regular points labelled 0 followed by 20 outliers labelled 1.
    X_test = np.r_[X_regular_test, X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model for novelty detection
    clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that roc_auc is good
    assert roc_auc_score(y_test, y_pred) > 0.99
|
||||
|
||||
|
||||
def test_lof_values(global_dtype):
    """Compare LOF scores with closed-form values on a three-point dataset."""
    # toy samples:
    X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype)
    clf1 = neighbors.LocalOutlierFactor(
        n_neighbors=2, contamination=0.1, novelty=True
    ).fit(X_train)
    clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train)

    # Expected (positive) outlier factors.
    s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0))
    s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2)))

    # Both estimators (with and without explicit contamination) must agree.
    for clf in (clf1, clf2):
        # scores of the training samples
        assert_allclose(-clf.negative_outlier_factor_, [s_0, s_1, s_1])

    for clf in (clf1, clf2):
        # score of one sample not in train
        assert_allclose(-clf.score_samples([[2.0, 2.0]]), [s_0])

    for clf in (clf1, clf2):
        # score of one sample already in train
        assert_allclose(-clf.score_samples([[1.0, 1.0]]), [s_1])
|
||||
|
||||
|
||||
def test_lof_precomputed(global_dtype, random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4)).astype(global_dtype, copy=False)
    Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False)
    dist_X_X = metrics.pairwise_distances(X, metric="euclidean")
    dist_Y_X = metrics.pairwise_distances(Y, X, metric="euclidean")

    # As a feature matrix (n_samples by n_features)
    lof_features = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_features.fit(X)
    pred_X_X = lof_features._predict()
    pred_X_Y = lof_features.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_distances = neighbors.LocalOutlierFactor(
        n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True
    )
    lof_distances.fit(dist_X_X)
    pred_D_X = lof_distances._predict()
    pred_D_Y = lof_distances.predict(dist_Y_X)

    # Both representations must lead to the same predictions.
    assert_allclose(pred_X_X, pred_D_X)
    assert_allclose(pred_X_Y, pred_D_Y)
|
||||
|
||||
|
||||
def test_n_neighbors_attribute():
    """``n_neighbors_`` is capped at ``n_samples - 1`` and a warning is issued."""
    X = iris.data
    expected_n_neighbors = X.shape[0] - 1

    clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
    assert clf.n_neighbors_ == expected_n_neighbors

    clf = neighbors.LocalOutlierFactor(n_neighbors=500)
    msg = "n_neighbors will be set to (n_samples - 1)"
    # The message contains parentheses, hence the escaping for the regex match.
    with pytest.warns(UserWarning, match=re.escape(msg)):
        clf.fit(X)
    assert clf.n_neighbors_ == expected_n_neighbors
|
||||
|
||||
|
||||
def test_score_samples(global_dtype):
    """Check score_samples == decision_function + offset_, for any contamination."""
    train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype)
    probe = np.asarray([[2.0, 2.0]], dtype=global_dtype)

    estimators = (
        neighbors.LocalOutlierFactor(
            n_neighbors=2, contamination=0.1, novelty=True
        ).fit(train),
        neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(train),
    )

    all_scores = []
    for estimator in estimators:
        scores = estimator.score_samples(probe)
        decisions = estimator.decision_function(probe)
        assert_allclose(scores, decisions + estimator.offset_)
        all_scores.append(scores)

    # The two estimators differ only by `contamination`: raw scores must agree.
    assert_allclose(all_scores[0], all_scores[1])
|
||||
|
||||
|
||||
def test_novelty_errors():
    """Check the AttributeError raised for methods hidden by `novelty`."""
    data = iris.data

    # novelty=False: predict, decision_function and score_samples are hidden.
    outlier_lof = neighbors.LocalOutlierFactor()
    outlier_lof.fit(data)
    for method in ["predict", "decision_function", "score_samples"]:
        outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'"
        inner_msg = "{} is not available when novelty=False".format(method)
        with pytest.raises(AttributeError, match=outer_msg) as exec_info:
            getattr(outlier_lof, method)

        # The raised error must be chained to a more explicit AttributeError.
        cause = exec_info.value.__cause__
        assert isinstance(cause, AttributeError)
        assert inner_msg in str(cause)

    # novelty=True: fit_predict is the hidden one (checked without fitting).
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)

    outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'"
    inner_msg = "fit_predict is not available when novelty=True"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        getattr(novelty_lof, "fit_predict")

    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg in str(cause)
|
||||
|
||||
|
||||
def test_novelty_training_scores(global_dtype):
    """Check `negative_outlier_factor_` is the same with novelty on or off."""
    data = iris.data.astype(global_dtype)

    # Training scores of the default estimator (novelty=False).
    outlier_lof = neighbors.LocalOutlierFactor()
    outlier_lof.fit(data)

    # Training scores of the novelty estimator.
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    novelty_lof.fit(data)

    assert_allclose(
        outlier_lof.negative_outlier_factor_, novelty_lof.negative_outlier_factor_
    )
|
||||
|
||||
|
||||
def test_hasattr_prediction():
    """Check which prediction methods `hasattr` reports for each novelty value."""
    samples = [[1, 1], [1, 2], [2, 1]]
    novelty_methods = ("predict", "decision_function", "score_samples")

    # novelty=True exposes the three prediction methods but not fit_predict.
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    novelty_lof.fit(samples)
    for name in novelty_methods:
        assert hasattr(novelty_lof, name)
    assert not hasattr(novelty_lof, "fit_predict")

    # novelty=False is the mirror image: only fit_predict is exposed.
    outlier_lof = neighbors.LocalOutlierFactor(novelty=False)
    outlier_lof.fit(samples)
    assert hasattr(outlier_lof, "fit_predict")
    for name in novelty_methods:
        assert not hasattr(outlier_lof, name)
|
||||
|
||||
|
||||
@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
def test_novelty_true_common_tests(estimator, check):
    """Run the common estimator checks on LOF configured with novelty=True.

    The common tests are otherwise only run for the default LOF
    (novelty=False).
    """
    check(estimator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expected_outliers", [30, 53])
def test_predicted_outlier_number(expected_outliers):
    """Check that the number of predicted outliers follows `contamination`.

    The predicted count should equal the expected count unless there are
    ties in the abnormality scores.
    """
    data = iris.data
    contamination = float(expected_outliers) / data.shape[0]

    detector = neighbors.LocalOutlierFactor(contamination=contamination)
    labels = detector.fit_predict(data)

    found = np.sum(labels != 1)
    if found == expected_outliers:
        return

    # On a mismatch, the scores are handed to the shared helper for checking.
    check_outlier_corruption(
        found, expected_outliers, detector.negative_outlier_factor_
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse(csr_container):
    """Smoke test: LocalOutlierFactor must support CSR inputs."""
    # TODO: compare results on dense and sparse data as proposed in:
    # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
    sparse_data = csr_container(iris.data)

    # Novelty mode: fit, then call each prediction method on the sparse input.
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    novelty_lof.fit(sparse_data)
    for name in ("predict", "score_samples", "decision_function"):
        getattr(novelty_lof, name)(sparse_data)

    # Outlier-detection mode only offers fit_predict.
    neighbors.LocalOutlierFactor(novelty=False).fit_predict(sparse_data)
|
||||
|
||||
|
||||
def test_lof_error_n_neighbors_too_large():
    """Check the error raised when n_neighbors is too large for the fitted data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/17207
    """
    data = np.ones((7, 7))

    # Fitting one sample with n_neighbors=1 must already fail.
    fit_msg = (
        "Expected n_neighbors < n_samples_fit, but n_neighbors = 1, "
        "n_samples_fit = 1, n_samples = 1"
    )
    with pytest.raises(ValueError, match=fit_msg):
        neighbors.LocalOutlierFactor(n_neighbors=1).fit(data[:1])

    lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(data[:2])
    assert lof.n_samples_fit_ == 2

    # Querying the training set itself (X=None): n_neighbors=2 is rejected.
    self_query_msg = (
        "Expected n_neighbors < n_samples_fit, but n_neighbors = 2, "
        "n_samples_fit = 2, n_samples = 2"
    )
    with pytest.raises(ValueError, match=self_query_msg):
        lof.kneighbors(None, n_neighbors=2)

    # ... whereas n_neighbors=1 is accepted.
    distances, indices = lof.kneighbors(None, n_neighbors=1)
    assert distances.shape == (2, 1)
    assert indices.shape == (2, 1)

    # Querying other samples: n_neighbors=3 is rejected.
    query_msg = (
        "Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, "
        "n_samples_fit = 2, n_samples = 7"
    )
    with pytest.raises(ValueError, match=query_msg):
        lof.kneighbors(data, n_neighbors=3)

    # ... whereas n_neighbors=2 is accepted.
    distances, indices = lof.kneighbors(data, n_neighbors=2)
    assert distances.shape == (7, 2)
    assert indices.shape == (7, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
@pytest.mark.parametrize("novelty", [True, False])
@pytest.mark.parametrize("contamination", [0.5, "auto"])
def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty):
    """Check that fitted attributes and scores keep the dtype of X."""
    data = iris.data.astype(global_dtype, copy=False)

    lof = neighbors.LocalOutlierFactor(
        n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty
    )
    lof.fit(data)

    assert lof.negative_outlier_factor_.dtype == global_dtype

    # Only check the scoring methods this configuration actually exposes.
    exposed = [
        name for name in ("score_samples", "decision_function") if hasattr(lof, name)
    ]
    for name in exposed:
        scores = getattr(lof, name)(data)
        assert scores.dtype == global_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
@pytest.mark.parametrize("novelty", [True, False])
@pytest.mark.parametrize("contamination", [0.5, "auto"])
def test_lof_dtype_equivalence(algorithm, novelty, contamination):
    """Check that float32 and float64 inputs lead to equivalent results."""
    # Setosa samples are really distinct from the others, so the few
    # virginica samples appended to them will be considered as outliers.
    stacked = np.concatenate([iris.data[:50], iris.data[-5:]], axis=0)
    # Lower the precision first so that both runs start from the same values.
    stacked = stacked.astype(np.float32)

    fitted = {}
    for dtype in (np.float32, np.float64):
        data = stacked.astype(dtype, copy=True)
        estimator = neighbors.LocalOutlierFactor(
            algorithm=algorithm, novelty=novelty, contamination=contamination
        )
        estimator.fit(data)
        fitted[dtype] = (estimator, data)

    lof_32, X_32 = fitted[np.float32]
    lof_64, X_64 = fitted[np.float64]

    assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_)

    for method in ("score_samples", "decision_function", "predict", "fit_predict"):
        if not hasattr(lof_32, method):
            continue
        result_32 = getattr(lof_32, method)(X_32)
        result_64 = getattr(lof_64, method)(X_64)
        assert_allclose(result_32, result_64, atol=0.0002)
|
||||
|
||||
|
||||
def test_lof_duplicate_samples():
    """Check the warning raised when duplicate training values skew LOF.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27839
    """
    rng = np.random.default_rng(0)

    constant_values = [0.1] * 1000
    dense_values = np.linspace(0.1, 0.3, num=3000)
    clear_outliers = rng.random(500) * 100
    shuffled = rng.permutation(
        np.hstack([constant_values, dense_values, clear_outliers])
    )
    X = shuffled.reshape(-1, 1)

    error_msg = (
        "Duplicate values are leading to incorrect results. "
        "Increase the number of neighbors for more accurate results."
    )

    detector = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1)

    # The fit must emit the warning about duplicates.
    with pytest.warns(UserWarning, match=re.escape(error_msg)):
        detector.fit_predict(X)
|
||||
@@ -0,0 +1,563 @@
|
||||
"""
|
||||
Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca)
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_almost_equal, assert_array_equal
|
||||
from scipy.optimize import check_grad
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.datasets import load_iris, make_blobs, make_classification
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.neighbors import NeighborhoodComponentsAnalysis
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils.validation import validate_data
|
||||
|
||||
# Module-level data shared by the tests: a shuffled copy of the iris dataset.
rng = check_random_state(0)
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris_data, iris_target = iris.data[perm], iris.target[perm]
# Make the arrays read-only so that tests cannot depend on each other
# through modified test data.
iris_data.flags.writeable = False
iris_target.flags.writeable = False
EPS = np.finfo(float).eps
|
||||
|
||||
|
||||
def test_simple_example():
    """Check NCA on four points whose nearest neighbors have opposite labels.

    In the input space each point sits next to a point of the other class.
    After the transformation, the nearest neighbor of every sample should be
    the other sample of its own class.
    """
    points = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    labels = np.array([1, 0, 1, 0])

    nca = NeighborhoodComponentsAnalysis(
        n_components=2, init="identity", random_state=42
    )
    nca.fit(points, labels)
    embedded = nca.transform(points)

    # Column 0 of the argsort is the sample itself, column 1 its nearest neighbor.
    nearest = pairwise_distances(embedded).argsort()[:, 1]
    assert_array_equal(nearest, np.array([2, 3, 0, 1]))
|
||||
|
||||
|
||||
def test_toy_example_collapse_points():
    """Test on a toy example of three points that should collapse

    We build a simple example: two points from the same class and a point from
    a different class in the middle of them. On this simple example, the new
    (transformed) points should all collapse into one single point. Indeed, the
    objective is 2/(1 + exp(d/2)), with d the euclidean distance between the
    two samples from the same class. This is maximized for d=0 (because d>=0),
    with an objective equal to 1 (loss=-1.).

    """
    rng = np.random.RandomState(42)
    input_dim = 5
    two_points = rng.randn(2, input_dim)
    # Third sample: the midpoint of the two others, with a different label.
    X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
    y = [0, 0, 1]

    class LossStorer:
        """Callback holder recording the loss of the last transformation seen."""

        def __init__(self, X, y):
            self.loss = np.inf  # initialize the loss to very high
            # Initialize a fake NCA and variables needed to compute the loss:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2)
            y = LabelEncoder().fit_transform(y)
            # Boolean matrix, True where two samples share the same label.
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the loss function"""
            self.loss, _ = self.fake_nca._loss_grad_lbfgs(
                transformation, self.X, self.same_class_mask, -1.0
            )

    loss_storer = LossStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(random_state=42, callback=loss_storer.callback)
    X_t = nca.fit_transform(X, y)
    # test that points are collapsed into one point
    assert_array_almost_equal(X_t - X_t[0], 0.0)
    assert abs(loss_storer.loss + 1) < 1e-10
|
||||
|
||||
|
||||
def test_finite_differences(global_random_seed):
    """Check the analytical gradient of the loss against finite differences."""
    # Random data, and a random transformation with between 1 and
    # n_features rows.
    rng = np.random.RandomState(global_random_seed)
    X, y = make_classification(random_state=global_random_seed)
    n_features = X.shape[1]
    M = rng.randn(rng.randint(1, n_features + 1), n_features)

    nca = NeighborhoodComponentsAnalysis()
    nca.n_iter_ = 0
    same_class = y[:, np.newaxis] == y[np.newaxis, :]

    def loss_and_grad(flat_transformation):
        return nca._loss_grad_lbfgs(flat_transformation, X, same_class)

    # check_grad returns the discrepancy between the supplied gradient and
    # its finite difference approximation.
    diff = check_grad(
        lambda flat: loss_and_grad(flat)[0],
        lambda flat: loss_and_grad(flat)[1],
        M.ravel(),
    )
    assert diff == pytest.approx(0.0, abs=1e-4)
|
||||
|
||||
|
||||
def test_params_validation():
    """Check that inconsistent `init` / `n_components` raise a ValueError."""
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    rng = np.random.RandomState(42)

    # `init` with more rows (outputs) than columns (inputs).
    init = rng.rand(5, 3)
    msg = (
        f"The output dimensionality ({init.shape[0]}) "
        "of the given linear transformation `init` cannot be "
        f"greater than its input dimensionality ({init.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # `n_components` larger than the number of features of X.
    n_components = 10
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) cannot be greater "
        f"than the given data dimensionality ({X.shape[1]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NeighborhoodComponentsAnalysis(n_components=n_components).fit(X, y)
|
||||
|
||||
|
||||
def test_transformation_dimensions():
    """Check which shapes of a user-provided `init` matrix are accepted."""
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]

    rejected_inits = [
        # Input dimension (2) does not match the dimension of X (3).
        np.array([[1, 2], [3, 4]]),
        # Output dimension (3 rows) larger than input dimension (2 columns).
        np.array([[1, 2], [3, 4], [5, 6]]),
    ]
    for transformation in rejected_inits:
        with pytest.raises(ValueError):
            NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)

    # A 3x3 transformation fits without error.
    transformation = np.arange(9).reshape(3, 3)
    NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)
|
||||
|
||||
|
||||
def test_n_components():
    """Check the validation of `n_components` against `init` and the data."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]

    init = rng.rand(X.shape[1] - 1, 3)

    # Case 1: n_components == X.shape[1], but `init` has one row fewer.
    n_components = X.shape[1]
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) does not match the output "
        "dimensionality of the given linear transformation "
        f"`init` ({init.shape[0]})!"
    )
    mismatched = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    with pytest.raises(ValueError, match=re.escape(msg)):
        mismatched.fit(X, y)

    # Case 2: n_components > X.shape[1].
    n_components = X.shape[1] + 2
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) cannot be greater than "
        f"the given data dimensionality ({X.shape[1]})!"
    )
    too_large = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    with pytest.raises(ValueError, match=re.escape(msg)):
        too_large.fit(X, y)

    # Case 3: n_components < X.shape[1] fits without error.
    NeighborhoodComponentsAnalysis(n_components=2, init="identity").fit(X, y)
|
||||
|
||||
|
||||
def test_init_transformation():
    """Check the named initializations and the validation of array inits."""
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    # Every named initialization must fit without error.
    for init_name in ("identity", "random", "auto", "pca", "lda"):
        NeighborhoodComponentsAnalysis(init=init_name).fit(X, y)

    # So must a square user-provided transformation.
    init = rng.rand(X.shape[1], X.shape[1])
    NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(X.shape[1], X.shape[1] + 1)
    msg = (
        f"The input dimensionality ({init.shape[1]}) of the given "
        "linear transformation `init` must match the "
        f"dimensionality of the given inputs `X` ({X.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(X.shape[1] + 1, X.shape[1])
    msg = (
        f"The output dimensionality ({init.shape[0]}) of the given "
        "linear transformation `init` cannot be "
        f"greater than its input dimensionality ({init.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[0] must match n_components
    init = rng.rand(X.shape[1], X.shape[1])
    n_components = X.shape[1] - 2
    msg = (
        "The preferred dimensionality of the "
        f"projected space `n_components` ({n_components}) "
        "does not match the output dimensionality of the given "
        f"linear transformation `init` ({init.shape[0]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NeighborhoodComponentsAnalysis(init=init, n_components=n_components).fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples", [3, 5, 7, 11])
@pytest.mark.parametrize("n_features", [3, 5, 7, 11])
@pytest.mark.parametrize("n_classes", [5, 7, 11])
@pytest.mark.parametrize("n_components", [3, 5, 7, 11])
def test_auto_init(n_samples, n_features, n_classes, n_components):
    """Check that init="auto" picks the expected initialization.

    Every ordering of n_samples, n_features, n_classes and n_components is
    covered by the parametrization.
    """
    rng = np.random.RandomState(42)
    nca_base = NeighborhoodComponentsAnalysis(
        init="auto", n_components=n_components, max_iter=1, random_state=rng
    )

    if n_classes >= n_samples:
        # n_classes > n_samples is impossible, and n_classes == n_samples
        # throws an error from lda but is an absurd case
        return

    X = rng.randn(n_samples, n_features)
    y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]

    if n_components > n_features:
        # this would return a ValueError, which is already tested in
        # test_params_validation
        return

    nca = clone(nca_base)
    nca.fit(X, y)

    # Name of the explicit initialization that "auto" is expected to match.
    if n_components <= min(n_classes - 1, n_features):
        expected_init = "lda"
    elif n_components < min(n_features, n_samples):
        expected_init = "pca"
    else:
        expected_init = "identity"

    nca_other = clone(nca_base).set_params(init=expected_init)
    nca_other.fit(X, y)
    assert_array_almost_equal(nca.components_, nca_other.components_)
|
||||
|
||||
|
||||
def test_warm_start_validation():
    """Check that a warm-started refit rejects data with fewer features."""
    shared_kwargs = dict(n_samples=30, n_classes=4, n_redundant=0, random_state=0)

    X, y = make_classification(n_features=5, n_informative=5, **shared_kwargs)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(
        n_features=4, n_informative=4, **shared_kwargs
    )
    msg = (
        f"The new inputs dimensionality ({X_less_features.shape[1]}) "
        "does not match the input dimensionality of the previously learned "
        f"transformation ({nca.components_.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X_less_features, y)
|
||||
|
||||
|
||||
def test_warm_start_effectiveness():
    """Check that warm starting changes the result less than a cold restart.

    A 1-iteration second fit on same data should give almost same result
    with warm starting, and quite different result without warm starting.
    """

    def change_after_one_more_iteration(warm_start):
        # Fit fully, refit with max_iter=1, and return the summed absolute
        # difference between the two learned transformations.
        nca = NeighborhoodComponentsAnalysis(warm_start=warm_start, random_state=0)
        nca.fit(iris_data, iris_target)
        first_components = nca.components_
        nca.max_iter = 1
        nca.fit(iris_data, iris_target)
        return np.sum(np.abs(nca.components_ - first_components))

    diff_warm = change_after_one_more_iteration(warm_start=True)
    diff_cold = change_after_one_more_iteration(warm_start=False)

    assert diff_warm < 3.0, (
        "Transformer changed significantly after one "
        "iteration even though it was warm-started."
    )

    assert diff_cold > diff_warm, (
        "Cold-started transformer changed less "
        "significantly than warm-started "
        "transformer after one iteration."
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "init_name", ["pca", "lda", "identity", "random", "precomputed"]
)
def test_verbose(init_name, capsys):
    """Check the progress report printed on stdout when `verbose=1`.

    Every initialization is exercised except "auto", because "auto" will
    call one of the others. "precomputed" stands for a user-provided array.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    regexp_init = r"... done in \ *\d+\.\d{2}s"
    msgs = {
        "pca": "Finding principal components" + regexp_init,
        "lda": "Finding most discriminative components" + regexp_init,
    }
    if init_name == "precomputed":
        init = rng.randn(X.shape[1], X.shape[1])
    else:
        init = init_name
    nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    nca.fit(X, y)
    out, _ = capsys.readouterr()

    # check output
    lines = re.split("\n+", out)
    # if pca or lda init, an additional line is printed, so we test
    # it and remove it to test the rest equally among initializations
    if init_name in ["pca", "lda"]:
        assert re.match(msgs[init_name], lines[0])
        lines = lines[1:]
    assert lines[0] == "[NeighborhoodComponentsAnalysis]"
    header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value", "Time(s)")
    assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header)
    assert lines[2] == "[NeighborhoodComponentsAnalysis] {}".format("-" * len(header))
    for line in lines[3:-2]:
        # Each iteration line holds the iteration number, the objective value
        # in scientific notation and the elapsed time, e.g.
        # "<prefix> 0 6.988936e+01 0.01" (with padding between the fields).
        # The exponent sign class is `[+-]`: the former `[+|-]` also accepted
        # a literal "|".
        assert re.match(
            r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e"
            r"[+-]\d+\ *\d+\.\d{2}",
            line,
        )
    assert re.match(
        r"\[NeighborhoodComponentsAnalysis\] Training took\ *\d+\.\d{2}s\.",
        lines[-2],
    )
    # The captured output ends with a newline, hence an empty last item.
    assert lines[-1] == ""
|
||||
|
||||
|
||||
def test_no_verbose(capsys):
    """Check that fitting prints nothing to stdout by default (verbose=0)."""
    NeighborhoodComponentsAnalysis().fit(iris_data, iris_target)
    captured_stdout, _ = capsys.readouterr()
    assert captured_stdout == ""
|
||||
|
||||
|
||||
def test_singleton_class():
    """Check that NCA fits when some or all classes have a single sample."""
    X = iris_data.copy()
    y = iris_target.copy()

    # One singleton class: every sample of class 1 but the first is moved to
    # class 2.
    singleton_class = 1
    members = np.flatnonzero(y == singleton_class)
    y[members] = 2
    y[members[0]] = singleton_class

    NeighborhoodComponentsAnalysis(max_iter=30).fit(X, y)

    # One non-singleton class: classes 1 and 2 keep only their first sample,
    # all their other samples are moved to class 0.
    for label in (1, 2):
        members = np.flatnonzero(y == label)
        y[members] = 0
        y[members[0]] = label

    NeighborhoodComponentsAnalysis(max_iter=30).fit(X, y)

    # Only singleton classes: keep the first sample of each class.
    first_of_each = [np.flatnonzero(y == label)[0] for label in (0, 1, 2)]
    X = X[first_of_each]
    y = y[first_of_each]

    nca = NeighborhoodComponentsAnalysis(init="identity", max_iter=30)
    nca.fit(X, y)
    # Starting from the identity, the data must come out untransformed.
    assert_array_equal(X, nca.transform(X))
|
||||
|
||||
|
||||
def test_one_class():
    """Check that with a single class NCA keeps the identity initialization."""
    is_first_class = iris_target == 0
    X = iris_data[is_first_class]
    y = iris_target[is_first_class]

    nca = NeighborhoodComponentsAnalysis(
        max_iter=30, n_components=X.shape[1], init="identity"
    )
    nca.fit(X, y)
    # The transformed data must be exactly the input data.
    assert_array_equal(X, nca.transform(X))
|
||||
|
||||
|
||||
def test_callback(capsys):
    """Check that the user callback is called during fit."""
    max_iter = 10
    n_features = iris_data.shape[1]

    def my_cb(transformation, n_iter):
        # The callback must receive the transformation as a flat vector.
        assert transformation.shape == (n_features**2,)
        rem_iter = max_iter - n_iter
        print("{} iterations remaining...".format(rem_iter))

    nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1)
    nca.fit(iris_data, iris_target)
    captured_stdout, _ = capsys.readouterr()

    # The message printed by the callback must be in the captured output.
    expected = "{} iterations remaining...".format(max_iter - 1)
    assert expected in captured_stdout
|
||||
|
||||
|
||||
def test_expected_transformation_shape():
    """Check the size of the transformation handed to the callback."""
    X = iris_data
    y = iris_target

    class TransformationStorer:
        """Callback holder remembering the last transformation it received."""

        def __init__(self, X, y):
            # Set up a fake NCA and the variables needed to call the loss
            # function.
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2)
            encoded = LabelEncoder().fit_transform(y)
            self.same_class_mask = encoded[:, np.newaxis] == encoded[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Keep the transformation taken as input by the optimizer."""
            self.transformation = transformation

    storer = TransformationStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=storer.callback)
    nca.fit(X, y)
    # Default n_components: expect n_features * n_features entries.
    assert storer.transformation.size == X.shape[1] ** 2
|
||||
|
||||
|
||||
def test_convergence_warning():
    """Check the ConvergenceWarning raised when fitting with max_iter=2."""
    nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
    # The warning message is prefixed with the estimator class name.
    msg = "[{}] NCA did not converge".format(nca.__class__.__name__)
    with pytest.warns(ConvergenceWarning, match=re.escape(msg)):
        nca.fit(iris_data, iris_target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "param, value",
    [
        ("n_components", np.int32(3)),
        ("max_iter", np.int32(100)),
        ("tol", np.float32(0.0001)),
    ],
)
def test_parameters_valid_types(param, value):
    """Check that numpy integer and floating scalars are accepted as parameters."""
    estimator = NeighborhoodComponentsAnalysis(**{param: value})
    # Fitting must not raise.
    estimator.fit(iris_data, iris_target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_components", [None, 2])
def test_nca_feature_names_out(n_components):
    """Check `get_feature_names_out` for `NeighborhoodComponentsAnalysis`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28293
    """
    X, y = iris_data, iris_target

    est = NeighborhoodComponentsAnalysis(n_components=n_components).fit(X, y)
    names_out = est.get_feature_names_out()

    # Names are the lowercased class name followed by the output index.
    prefix = est.__class__.__name__.lower()
    # With n_components=None, one name per input feature is expected.
    n_expected = X.shape[1] if n_components is None else n_components
    expected_names_out = np.array(
        [f"{prefix}{i}" for i in range(n_expected)],
        dtype=object,
    )

    assert_array_equal(names_out, expected_names_out)
|
||||
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
Testing for the nearest centroid module.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.neighbors import NearestCentroid
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# toy sample
|
||||
# Toy dataset: two clusters of three points, labelled -1 and 1.
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
# Query points and the values the tests expect for them.
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]
true_result_prior1 = [-1, 1, 1]

true_discriminant_scores = [-32, 64, 80]
true_proba = [[1, 1.26642e-14], [1.60381e-28, 1], [1.80485e-35, 1]]


# Also load the iris dataset and apply a seeded random permutation to it.
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_classification_toy(csr_container):
    """Check classification on the toy dataset for dense and sparse inputs."""
    X_sparse, T_sparse = csr_container(X), csr_container(T)

    # Default, uniform and empirical priors are all expected to produce the
    # same predictions, discriminant scores and probabilities on this data.
    for params in ({}, {"priors": "uniform"}, {"priors": "empirical"}):
        model = NearestCentroid(**params)
        model.fit(X, y)
        assert_array_equal(model.predict(T), true_result)
        assert_array_almost_equal(
            model.decision_function(T), true_discriminant_scores
        )
        assert_array_almost_equal(model.predict_proba(T), true_proba)

    # Custom priors.
    model = NearestCentroid(priors=[0.25, 0.75])
    model.fit(X, y)
    assert_array_equal(model.predict(T), true_result_prior1)

    # (training data, query data) pairs, in order: sparse/sparse,
    # sparse/dense, dense/sparse, and non-CSR sparse formats (COO / LIL).
    fit_predict_inputs = [
        (X_sparse, T_sparse),
        (X_sparse, T),
        (X, T_sparse),
        (X_sparse.tocoo(), T_sparse.tolil()),
    ]
    for train, query in fit_predict_inputs:
        model = NearestCentroid()
        model.fit(train, y)
        assert_array_equal(model.predict(query), true_result)
|
||||
|
||||
|
||||
def test_iris():
    """Check training-set accuracy on iris for both supported metrics."""
    for metric in ("euclidean", "manhattan"):
        model = NearestCentroid(metric=metric)
        model.fit(iris.data, iris.target)
        predictions = model.predict(iris.data)
        score = np.mean(predictions == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)
|
||||
|
||||
|
||||
def test_iris_shrinkage():
    """Check iris training accuracy for each metric / shrink threshold pair."""
    thresholds = [None, 0.1, 0.5]
    for metric in ("euclidean", "manhattan"):
        for shrink_threshold in thresholds:
            model = NearestCentroid(
                metric=metric, shrink_threshold=shrink_threshold
            ).fit(iris.data, iris.target)
            predictions = model.predict(iris.data)
            score = np.mean(predictions == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
|
||||
|
||||
|
||||
def test_pickle():
    """Check that a fitted NearestCentroid survives a pickle round trip.

    The restored estimator must have exactly the same type as the original
    and reach the same score on the iris data.
    """
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)

    obj2 = pickle.loads(pickle.dumps(obj))
    # Exact-type checks compare by identity rather than with `==`.
    assert type(obj2) is type(obj)

    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(
        score,
        score2,
        err_msg="Failed to generate same score after pickling (classification).",
    )
|
||||
|
||||
|
||||
def test_shrinkage_correct():
    """Check the shrunken centroids against reference values.

    The expected result is calculated by R (pamr), which is implemented by
    the author of the original paper. (One needs to modify the code to
    output the new centroid in pamr.predict.)
    """
    samples = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    labels = np.array([1, 1, 2, 2, 2])
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])

    model = NearestCentroid(shrink_threshold=0.1)
    model.fit(samples, labels)

    np.testing.assert_array_almost_equal(model.centroids_, expected_result)
|
||||
|
||||
|
||||
def test_shrinkage_threshold_decoded_y():
    """Check that centroids do not depend on how the two labels are encoded."""
    model = NearestCentroid(shrink_threshold=0.01)

    # Same labels as `y`, with the -1 class re-encoded as 0.
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0

    centroid_encoded = model.fit(X, y_ind).centroids_
    centroid_original = model.fit(X, y).centroids_

    assert_array_equal(centroid_encoded, centroid_original)
|
||||
|
||||
|
||||
def test_predict_translated_data():
    """Check that NearestCentroid predicts the same labels on translated data."""
    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
    # Broadcasting adds the same offset vector to every sample.
    X_noise = X + noise

    reference = NearestCentroid(shrink_threshold=0.1)
    reference.fit(X, y)
    y_init = reference.predict(X)

    translated = NearestCentroid(shrink_threshold=0.1)
    translated.fit(X_noise, y)
    y_translate = translated.predict(X_noise)

    assert_array_equal(y_init, y_translate)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_manhattan_metric(csr_container):
    """Check manhattan-metric centroids for dense and sparse training data."""
    model = NearestCentroid(metric="manhattan")

    dense_centroid = model.fit(X, y).centroids_
    sparse_centroid = model.fit(csr_container(X), y).centroids_

    assert_array_equal(sparse_centroid, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
|
||||
|
||||
|
||||
def test_features_zero_var():
    """Check that fitting features with zero variance raises a ValueError."""
    # Every sample carries the same two feature values.
    constant_features = np.empty((10, 2))
    constant_features[:] = [-0.13725701, -0.9853293]

    labels = np.zeros((10))
    labels[0] = 1

    model = NearestCentroid(shrink_threshold=0.1)
    with pytest.raises(ValueError):
        model.fit(constant_features, labels)
|
||||
|
||||
|
||||
def test_negative_priors_error():
    """Check that negative user-defined priors are rejected when fitting."""
    negative_priors = [-2, 4]
    model = NearestCentroid(priors=negative_priors)
    with pytest.raises(ValueError, match="priors must be non-negative"):
        model.fit(X, y)
|
||||
|
||||
|
||||
def test_warn_non_normalized_priors():
    """Check that user-defined priors which do not sum to 1 raise a warning
    and are normalized.
    """
    priors = [2, 4]
    expected_msg = "The priors do not sum to 1. Normalizing such that it sums to one."

    model = NearestCentroid(priors=priors)
    with pytest.warns(UserWarning, match=expected_msg):
        model.fit(X, y)

    priors_arr = np.asarray(priors)
    assert_allclose(model.class_prior_, priors_arr / priors_arr.sum())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_method_not_available_with_manhattan(response_method):
    """Check that calling a non-thresholded response method raises an
    AttributeError when the Manhattan metric is used.
    """
    model = NearestCentroid(metric="manhattan")
    model.fit(X, y)
    # Both the attribute lookup and the call happen inside the context.
    with pytest.raises(AttributeError):
        getattr(model, response_method)(T)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_constructor", [np.array] + CSR_CONTAINERS)
def test_error_zero_variances(array_constructor):
    """Check that fitting raises when every feature has zero variance."""
    # First column is all ones, second column all twos.
    constant = np.ones((len(y), 2))
    constant[:, 1] *= 2
    data = array_constructor(constant)

    model = NearestCentroid()
    with pytest.raises(ValueError, match="All features have zero variance"):
        model.fit(data, y)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
This is testing the equivalence between some estimators with internal nearest
|
||||
neighbors computations, and the corresponding pipeline versions with
|
||||
KNeighborsTransformer or RadiusNeighborsTransformer to precompute the
|
||||
neighbors.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.cluster import DBSCAN, SpectralClustering
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold import TSNE, Isomap, SpectralEmbedding
|
||||
from sklearn.neighbors import (
|
||||
KNeighborsRegressor,
|
||||
KNeighborsTransformer,
|
||||
LocalOutlierFactor,
|
||||
RadiusNeighborsRegressor,
|
||||
RadiusNeighborsTransformer,
|
||||
)
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
|
||||
def test_spectral_clustering():
    """Check that KNeighborsTransformer chained with a precomputed-affinity
    SpectralClustering gives the same labels as the compact estimator.
    """
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # Settings common to the chained and the compact clustering.
    shared = dict(n_neighbors=n_neighbors, random_state=42)
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
        SpectralClustering(affinity="precomputed", **shared),
    )
    est_compact = SpectralClustering(affinity="nearest_neighbors", **shared)

    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
|
||||
|
||||
|
||||
def test_spectral_embedding():
    """Check that KNeighborsTransformer chained with a precomputed-affinity
    SpectralEmbedding gives the same embedding as the compact estimator.
    """
    n_neighbors = 5
    n_samples = 1000
    centers = np.array(
        [
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ]
    )
    # Only the samples are needed; the blob labels are discarded.
    S, _ = make_blobs(
        n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
    )

    shared = dict(n_neighbors=n_neighbors, random_state=42)
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
        SpectralEmbedding(affinity="precomputed", **shared),
    )
    est_compact = SpectralEmbedding(affinity="nearest_neighbors", **shared)

    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
|
||||
|
||||
|
||||
def test_dbscan():
    """Check that RadiusNeighborsTransformer chained with a precomputed-metric
    DBSCAN gives the same labels as DBSCAN on the raw data.
    """
    radius = 0.3
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    radius_graph = RadiusNeighborsTransformer(radius=radius, mode="distance")
    est_chain = make_pipeline(radius_graph, DBSCAN(metric="precomputed", eps=radius))
    est_compact = DBSCAN(eps=radius)

    labels_chain = est_chain.fit_predict(X)
    labels_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
|
||||
|
||||
|
||||
def test_isomap():
    """Check that KNeighborsTransformer chained with a precomputed-metric
    Isomap matches the compact Isomap, on training data and on new data.
    """
    algorithm = "auto"
    n_neighbors = 10

    X, _ = make_blobs(random_state=0)
    X2, _ = make_blobs(random_state=1)

    knn_graph = KNeighborsTransformer(
        n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
    )
    est_chain = make_pipeline(
        knn_graph, Isomap(n_neighbors=n_neighbors, metric="precomputed")
    )
    est_compact = Isomap(n_neighbors=n_neighbors, neighbors_algorithm=algorithm)

    # Embedding of the training data.
    train_chain = est_chain.fit_transform(X)
    train_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(train_chain, train_compact)

    # Embedding of data not seen during fit.
    new_chain = est_chain.transform(X2)
    new_compact = est_compact.transform(X2)
    assert_array_almost_equal(new_chain, new_compact)
|
||||
|
||||
|
||||
def test_tsne():
    """Check that KNeighborsTransformer chained with a precomputed-metric
    TSNE gives the same embedding as TSNE run directly on the data.
    """
    max_iter = 250
    perplexity = 5
    n_neighbors = int(3.0 * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    # Settings shared by both TSNE instances; only `metric` differs.
    tsne_params = dict(
        init="random",
        perplexity=perplexity,
        method="barnes_hut",
        random_state=42,
        max_iter=max_iter,
    )

    for metric in ["minkowski", "sqeuclidean"]:
        est_chain = make_pipeline(
            KNeighborsTransformer(
                n_neighbors=n_neighbors, mode="distance", metric=metric
            ),
            TSNE(metric="precomputed", **tsne_params),
        )
        est_compact = TSNE(metric=metric, **tsne_params)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_lof_novelty_false():
    """Check that KNeighborsTransformer chained with a precomputed-metric
    LocalOutlierFactor (novelty=False) matches the compact estimator.
    """
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # Settings shared by the chained and the compact estimator.
    lof_params = dict(n_neighbors=n_neighbors, novelty=False, contamination="auto")
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(metric="precomputed", **lof_params),
    )
    est_compact = LocalOutlierFactor(**lof_params)

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
|
||||
|
||||
|
||||
def test_lof_novelty_true():
    """Check that KNeighborsTransformer chained with a precomputed-metric
    LocalOutlierFactor (novelty=True) matches the compact estimator when
    predicting on data other than the training data.
    """
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X1 = rng.randn(40, 2)
    X2 = rng.randn(40, 2)

    # Settings shared by the chained and the compact estimator.
    lof_params = dict(n_neighbors=n_neighbors, novelty=True, contamination="auto")
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(metric="precomputed", **lof_params),
    )
    est_compact = LocalOutlierFactor(**lof_params)

    pred_chain = est_chain.fit(X1).predict(X2)
    pred_compact = est_compact.fit(X1).predict(X2)
    assert_array_almost_equal(pred_chain, pred_compact)
|
||||
|
||||
|
||||
def test_kneighbors_regressor():
    """Check neighbors regressors fed by precomputing transformers.

    Each (transformer, regressor) pair is run as a pipeline with the
    regressor switched to `metric="precomputed"`, and its predictions are
    compared with those of the same regressor fitted on the raw data.
    """
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, to have equivalence between
    # k-neighbors estimator after radius-neighbors transformer, and vice-versa.
    factor = 2

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"), k_reg),
        (
            KNeighborsTransformer(
                n_neighbors=int(n_neighbors * factor), mode="distance"
            ),
            r_reg,
        ),
        (RadiusNeighborsTransformer(radius=radius, mode="distance"), r_reg),
        (
            RadiusNeighborsTransformer(radius=int(radius * factor), mode="distance"),
            k_reg,
        ),
    ]

    for trans, reg in test_list:
        # Fresh clones so that no fitted state is shared between iterations.
        reg_compact = clone(reg)
        reg_precomp = clone(reg).set_params(metric="precomputed")
        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
|
||||
@@ -0,0 +1,296 @@
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import itertools
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal
|
||||
|
||||
from sklearn.metrics import DistanceMetric
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
BallTree,
|
||||
kernel_norm,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
NeighborsHeap64 as NeighborsHeapBT,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
nodeheap_sort as nodeheap_sort_bt,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
simultaneous_sort as simultaneous_sort_bt,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
KDTree,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
NeighborsHeap64 as NeighborsHeapKDT,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
nodeheap_sort as nodeheap_sort_kdt,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
simultaneous_sort as simultaneous_sort_kdt,
|
||||
)
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
# Fixed-seed generator used to build the metric parameters below.
rng = np.random.RandomState(42)
# Product of a random matrix with its own transpose.
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = V_mahalanobis @ V_mahalanobis.T

DIMENSION = 3

# Metric name -> extra keyword arguments passed along with that metric.
METRICS = {
    "euclidean": {},
    "manhattan": {},
    "minkowski": {"p": 3},
    "chebyshev": {},
    "seuclidean": {"V": rng.random_sample(DIMENSION)},
    "mahalanobis": {"V": V_mahalanobis},
}

KD_TREE_METRICS = ["euclidean", "manhattan", "chebyshev", "minkowski"]
BALL_TREE_METRICS = list(METRICS)
|
||||
|
||||
|
||||
def dist_func(x1, x2, p):
    """Return the Minkowski distance of order `p` between `x1` and `x2`.

    Parameters
    ----------
    x1, x2 : array-like
        Points to compare; they must be subtractable element-wise.
    p : int or float
        Order of the norm.

    Returns
    -------
    float
        ``(sum(|x1 - x2| ** p)) ** (1 / p)``.
    """
    # Take the absolute difference first: without it, odd orders let signed
    # differences cancel each other (or make the sum negative), which is not
    # a distance. For even `p` the result is unchanged.
    return np.sum(np.abs(x1 - x2) ** p) ** (1.0 / p)
|
||||
|
||||
|
||||
def compute_kernel_slow(Y, X, kernel, h):
    """Compute a kernel density by brute force, as a reference for the trees.

    Parameters
    ----------
    Y : ndarray
        Query points, one per row.
    X : ndarray
        Data points, one per row, with as many columns as `Y`.
    kernel : str
        One of "gaussian", "tophat", "epanechnikov", "exponential",
        "linear" or "cosine".
    h : float
        Kernel bandwidth.

    Returns
    -------
    ndarray
        For each row of `Y`, the normalized sum of kernel values over `X`.

    Raises
    ------
    ValueError
        If `kernel` is not one of the names listed above.
    """
    # Euclidean distance between every query point and every data point.
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel)
    # Mask of the pairs lying strictly within one bandwidth.
    within = d < h

    if kernel == "gaussian":
        weights = np.exp(-0.5 * (d * d) / (h * h))
    elif kernel == "tophat":
        weights = within
    elif kernel == "epanechnikov":
        weights = (1.0 - (d * d) / (h * h)) * within
    elif kernel == "exponential":
        weights = np.exp(-d / h)
    elif kernel == "linear":
        weights = (1 - d / h) * within
    elif kernel == "cosine":
        weights = np.cos(0.5 * np.pi * d / h) * within
    else:
        raise ValueError("kernel not recognized")

    return norm * weights.sum(-1)
|
||||
|
||||
|
||||
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find the `k` nearest rows of `X` for each row of `Y` by brute force.

    Parameters
    ----------
    X : ndarray
        Data points, one per row.
    Y : ndarray
        Query points, one per row.
    k : int
        Number of neighbors to keep per query point.
    metric : str or callable
        Metric handed to `DistanceMetric.get_metric`.
    **kwargs
        Extra arguments for the metric.

    Returns
    -------
    dist, ind : ndarray
        Distances to, and indices of, the `k` nearest neighbors of each
        query point, ordered by increasing distance.
    """
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)
    pairwise = dist_metric.pairwise(Y, X)

    ind = np.argsort(pairwise, axis=1)[:, :k]
    # Column vector of row numbers so that each row picks its own neighbors.
    rows = np.arange(Y.shape[0])[:, None]
    dist = pairwise[rows, ind]
    return dist, ind
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize(
    "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
)
@pytest.mark.parametrize("h", [0.01, 0.1, 1])
@pytest.mark.parametrize("rtol", [0, 1e-5])
@pytest.mark.parametrize("atol", [1e-6, 1e-2])
@pytest.mark.parametrize("breadth_first", [True, False])
def test_kernel_density(
    Cls, kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3
):
    """Compare the tree's kernel density with the brute-force computation."""
    rng = check_random_state(1)
    shape = (n_samples, n_features)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)

    expected = compute_kernel_slow(Y, X, kernel, h)
    estimated = Cls(X, leaf_size=10).kernel_density(
        Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first
    )

    # The comparison uses a relative tolerance of at least 1e-7, even when
    # the tree was queried with rtol=0.
    assert_allclose(estimated, expected, atol=atol, rtol=max(rtol, 1e-7))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10):
    """Check the indices returned by `query_radius` around the origin."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # roundoff error can cause test to fail
    tree = Cls(X, leaf_size=5)
    # Distance of every data point to the query point.
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        found = tree.query_radius([query_pt], r + eps)[0]
        expected = np.where(rad <= r + eps)[0]

        # Sort both so that the comparison ignores the order of the results.
        found.sort()
        expected.sort()

        assert_array_almost_equal(expected, found)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius_distance(Cls, n_samples=100, n_features=10):
    """Check the distances returned by `query_radius` around the origin."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # roundoff error can cause test to fail
    tree = Cls(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = tree.query_radius([query_pt], r + eps, return_distance=True)
        # A single query point was passed, so keep only its results.
        ind, dist = ind[0], dist[0]

        # Recompute the distances of the returned points directly.
        expected = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(expected, dist)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize("dualtree", (True, False))
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
    """Compare `two_point_correlation` with counts from pairwise distances."""
    rng = check_random_state(0)
    shape = (n_samples, n_features)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)
    r = np.linspace(0, 1, 10)
    tree = Cls(X, leaf_size=10)

    # For each radius, count the (Y, X) pairs no farther apart than it.
    pairwise = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(pairwise <= radius).sum() for radius in r]

    counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(counts, counts_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT])
def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10):
    """Check that each heap row keeps the `n_nbrs` smallest pushed distances."""
    heap = NeighborsHeap(n_pts, n_nbrs)
    rng = check_random_state(0)
    # Push twice as many candidates as a row is asked to keep.
    n_candidates = 2 * n_nbrs

    for row in range(n_pts):
        distances = rng.random_sample(n_candidates).astype(np.float64, copy=False)
        indices = np.arange(n_candidates, dtype=np.intp)
        for candidate_dist, candidate_idx in zip(distances, indices):
            heap.push(row, candidate_dist, candidate_idx)

        order = np.argsort(distances)
        d_heap, i_heap = heap.get_arrays(sort=True)

        assert_array_almost_equal(distances[order][:n_nbrs], d_heap[row])
        assert_array_almost_equal(indices[order][:n_nbrs], i_heap[row])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt])
def test_node_heap(nodeheap_sort, n_nodes=50):
    """Compare `nodeheap_sort` with numpy's argsort on random values."""
    rng = check_random_state(0)
    vals = rng.random_sample(n_nodes).astype(np.float64, copy=False)

    expected_order = np.argsort(vals)
    sorted_vals, order = nodeheap_sort(vals)

    assert_array_almost_equal(expected_order, order)
    assert_array_almost_equal(vals[expected_order], sorted_vals)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt]
)
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
    """Compare `simultaneous_sort` with a row-wise numpy argsort."""
    rng = check_random_state(0)
    dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False)
    ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False)

    # Reference result from numpy, taken from the still-unsorted input.
    # Fancy indexing yields new arrays, so they are unaffected by the call
    # below.
    order = np.argsort(dist, axis=1)
    rows = np.arange(n_rows)[:, None]
    expected_dist = dist[rows, order]
    expected_ind = ind[rows, order]

    # The function's return value is not used: `dist` and `ind` themselves
    # are compared afterwards.
    simultaneous_sort(dist, ind)

    assert_array_almost_equal(dist, expected_dist)
    assert_array_almost_equal(ind, expected_ind)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_gaussian_kde(Cls, n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)
    # The trees take 2D input: one column, one sample per row.
    train, query = x_in[:, None], x_out[:, None]

    for h in [0.01, 0.1, 1]:
        tree = Cls(train)
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        # The tree's result is divided by the sample count before comparing.
        dens_tree = tree.kernel_density(query, h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Cls, metric",
    itertools.chain(
        [(KDTree, metric) for metric in KD_TREE_METRICS],
        [(BallTree, metric) for metric in BALL_TREE_METRICS],
    ),
)
@pytest.mark.parametrize("k", (1, 3, 5))
@pytest.mark.parametrize("dualtree", (True, False))
@pytest.mark.parametrize("breadth_first", (True, False))
def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first):
    """Compare tree k-neighbors distances with the brute-force distances."""
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    metric_params = METRICS[metric]
    tree = Cls(X, leaf_size=1, metric=metric, **metric_params)

    tree_dist, _ = tree.query(Y, k, dualtree=dualtree, breadth_first=breadth_first)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **metric_params)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match. Distances should not have this problem.
    assert_array_almost_equal(tree_dist, brute_dist)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Cls, metric",
    [(KDTree, "euclidean"), (BallTree, "euclidean"), (BallTree, dist_func)],
)
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_pickle(Cls, metric, protocol):
    """Check that a tree gives the same query results after a pickle round
    trip, for each pickle protocol and for named and callable metrics.
    """
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    # A callable metric is called with an extra `p` argument.
    kwargs = {"p": 2} if callable(metric) else {}

    tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs)

    # `query` returns the distances first and the indices second.
    dist1, ind1 = tree1.query(X)

    s = pickle.dumps(tree1, protocol=protocol)
    tree2 = pickle.loads(s)

    dist2, ind2 = tree2.query(X)

    assert_array_almost_equal(ind1, ind2)
    assert_array_almost_equal(dist1, dist2)

    assert isinstance(tree2, Cls)
|
||||
@@ -0,0 +1,144 @@
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.neighbors._quad_tree import _QuadTree
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
|
||||
def test_quadtree_boundary_computation():
    """Build quad trees from points whose boundaries are not easy to compute
    and check each tree's coherence.
    """
    cases = [
        # check a random case
        [[-1, 1], [-4, -1]],
        # check the case where only 0 are inserted
        [[0, 0], [0, 0]],
        # check the case where only negative are inserted
        [[-1, -2], [-4, 0]],
        # check the case where only small numbers are inserted
        [[-1e-6, 1e-6], [-4e-6, -1e-6]],
    ]

    for points in cases:
        X = np.array(points, dtype=np.float32)
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(X)
        tree._check_coherence()
|
||||
|
||||
|
||||
def test_quadtree_similar_point():
    """Insert a point into a quad tree where a similar point already exists.

    Test will hang if it doesn't complete.
    """
    cases = [
        # check the case where points are actually different
        [[1, 2], [3, 4]],
        # check the case where points are the same on X axis
        [[1.0, 2.0], [1.0, 3.0]],
        # check the case where points are arbitrarily close on X axis
        [[1.00001, 2.0], [1.00002, 3.0]],
        # check the case where points are the same on Y axis
        [[1.0, 2.0], [3.0, 2.0]],
        # check the case where points are arbitrarily close on Y axis
        [[1.0, 2.00001], [3.0, 2.00002]],
        # check the case where points are arbitrarily close on both axes
        [[1.00001, 2.00001], [1.00002, 2.00002]],
        # check the case where points are arbitrarily close on both axes
        # close to machine epsilon - x axis
        [[1, 0.0003817754041], [2, 0.0003817753750]],
        # check the case where points are arbitrarily close on both axes
        # close to machine epsilon - y axis
        [[0.0003817754041, 1.0], [0.0003817753750, 2.0]],
    ]

    for points in cases:
        X = np.array(points, dtype=np.float32)
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(X)
        tree._check_coherence()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_dimensions", (2, 3))
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_quad_tree_pickle(n_dimensions, protocol):
    """Check that a pickled quad tree maps every point to the same cell."""
    rng = check_random_state(0)
    X = rng.random_sample((10, n_dimensions))

    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(X)

    restored = pickle.loads(pickle.dumps(tree, protocol=protocol))

    for point in X:
        original_cell = tree.get_cell(point)
        restored_cell = restored.get_cell(point)
        assert original_cell == restored_cell
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_dimensions", (2, 3))
def test_qt_insert_duplicate(n_dimensions):
    """Check the leaf sizes of a quad tree built with duplicated points."""
    rng = check_random_state(0)

    X = rng.random_sample((10, n_dimensions))
    # The first five points are inserted a second time.
    Xd = np.r_[X, X[:5]]
    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(Xd)

    cumulative_size = tree.cumulative_size
    leafs = tree.leafs

    # Assert that the first 5 are indeed duplicated and that the next
    # ones are single point leaf
    for i, point in enumerate(X):
        cell_id = tree.get_cell(point)
        expected_size = 2 if i < 5 else 1
        assert leafs[cell_id]
        assert cumulative_size[cell_id] == expected_size
|
||||
|
||||
|
||||
def test_summarize():
    """Simple check for the quad tree's `_py_summarize`.

    One point sits far away from a cluster of three. Seen from that point,
    a wide angle is expected to summarize the cluster as a single node,
    while ``angle=0`` is expected to report each clustered point separately.
    """
    angle = 0.9
    X = np.array(
        [[-10.0, -10.0], [9.0, 10.0], [10.0, 9.0], [10.0, 10.0]], dtype=np.float32
    )
    query_pt = X[0, :]
    n_dimensions = X.shape[1]
    # Number of entries each node takes in `summary`. Within a node's
    # entries, index `n_dimensions` is read as the distance and
    # `n_dimensions + 1` as the size.
    offset = n_dimensions + 2

    qt = _QuadTree(n_dimensions, verbose=0)
    qt.build_tree(X)

    idx, summary = qt._py_summarize(query_pt, X, angle)

    node_dist = summary[n_dimensions]
    node_size = summary[n_dimensions + 1]

    # Summary should contain only 1 node with size 3 and distance to
    # X[1:] barycenter (compared as a squared distance)
    barycenter = X[1:].mean(axis=0)
    ds2c = ((X[0] - barycenter) ** 2).sum()

    assert idx == offset
    assert node_size == 3, "summary size = {}".format(node_size)
    assert np.isclose(node_dist, ds2c)

    # Summary should contain all 3 node with size 1 and distance to
    # each point in X[1:] for ``angle=0``
    idx, summary = qt._py_summarize(query_pt, X, 0.0)

    assert idx == 3 * (offset)
    for i in range(3):
        node_dist = summary[i * offset + n_dimensions]
        node_size = summary[i * offset + n_dimensions + 1]

        ds2c = ((X[0] - X[i + 1]) ** 2).sum()

        assert node_size == 1, "summary size = {}".format(node_size)
        assert np.isclose(node_dist, ds2c)
|
||||
Reference in New Issue
Block a user