add read me
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
"""Feature extraction from raw data."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from . import image, text
|
||||
from ._dict_vectorizer import DictVectorizer
|
||||
from ._hash import FeatureHasher
|
||||
from .image import grid_to_graph, img_to_graph
|
||||
|
||||
__all__ = [
|
||||
"DictVectorizer",
|
||||
"FeatureHasher",
|
||||
"grid_to_graph",
|
||||
"image",
|
||||
"img_to_graph",
|
||||
"text",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,459 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from array import array
|
||||
from collections.abc import Iterable, Mapping
|
||||
from numbers import Number
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils import metadata_routing
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import check_is_fitted
|
||||
|
||||
|
||||
class DictVectorizer(TransformerMixin, BaseEstimator):
    """Transform lists of feature-value mappings into arrays or sparse matrices.

    Each sample is a mapping (dict-like) from feature names to feature
    values. String values are one-hot ("one-of-K") encoded: a feature "f"
    taking the values "ham" and "spam" becomes the two output features
    "f=ham" and "f=spam". If a value is a sequence or set of strings, the
    occurrences of each string are counted instead. Numeric values are
    passed through unchanged; features absent from a sample get a zero
    entry in the output.

    Note that only string values get the binary one-hot treatment. For
    categorical features represented as numbers or iterables of strings,
    follow this transformer with
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete the binary
    one-hot encoding.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        Value type of the produced array / scipy.sparse matrix; forwarded
        to the constructors as the dtype argument.
    separator : str, default="="
        String inserted between a feature name and its string value when
        building one-hot feature names.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be sorted
        when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g.,
        "f=ham" and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
    """

    # This isn't something that people should be routing / using in a pipeline.
    __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED}

    _parameter_constraints: dict = {
        "dtype": "no_validation",  # validation delegated to numpy,
        "separator": [str],
        "sparse": ["boolean"],
        "sort": ["boolean"],
    }

    def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True):
        self.dtype = dtype
        self.separator = separator
        self.sparse = sparse
        self.sort = sort
|
||||
|
||||
def _add_iterable_element(
    self,
    f,
    v,
    feature_names,
    vocab,
    *,
    fitting=True,
    transforming=False,
    indices=None,
    values=None,
):
    """Register (and optionally encode) each string of an iterable value.

    Every string ``item`` in ``v`` contributes a one-hot feature named
    "<f><separator><item>" with an implicit value of 1. Non-string items
    are rejected. When ``fitting``, unseen names grow the vocabulary;
    when ``transforming``, known names are appended to the CSR buffers.
    """
    for item in v:
        # Guard clause: only iterables of strings are accepted.
        if not isinstance(item, str):
            raise TypeError(
                f"Unsupported type {type(item)} in iterable "
                "value. Only iterables of string are "
                "supported."
            )
        feature_name = "%s%s%s" % (f, self.separator, item)

        if fitting and feature_name not in vocab:
            vocab[feature_name] = len(feature_names)
            feature_names.append(feature_name)

        if transforming and feature_name in vocab:
            indices.append(vocab[feature_name])
            values.append(self.dtype(1))
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """Learn a list of feature name -> indices mappings.

    Parameters
    ----------
    X : Mapping or iterable over Mappings
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

        .. versionchanged:: 0.24
           Accepts multiple string values for one categorical feature.

    y : (ignored)
        Ignored parameter.

    Returns
    -------
    self : object
        DictVectorizer class instance.

    Raises
    ------
    TypeError
        If a feature value has an unsupported type (a Mapping, or a
        non-numeric, non-iterable object).
    """
    feature_names = []
    vocab = {}

    for x in X:
        for f, v in x.items():
            if isinstance(v, str):
                # One-hot encode string values as "<name><separator><value>".
                feature_name = "%s%s%s" % (f, self.separator, v)
            elif isinstance(v, Number) or (v is None):
                feature_name = f
            elif isinstance(v, Mapping):
                raise TypeError(
                    f"Unsupported value type {type(v)} "
                    f"for {f}: {v}.\n"
                    "Mapping objects are not supported."
                )
            elif isinstance(v, Iterable):
                feature_name = None
                self._add_iterable_element(f, v, feature_names, vocab)
            else:
                # BUGFIX: this case previously fell through without
                # assigning feature_name, causing an UnboundLocalError on
                # the first feature or silently reusing the previous
                # feature's name. Raise explicitly, mirroring _transform.
                raise TypeError(
                    f"Unsupported value type {type(v)} "
                    f"for {f}: {v}.\n"
                    f"{type(v)} objects are not supported."
                )

            if feature_name is not None:
                if feature_name not in vocab:
                    vocab[feature_name] = len(feature_names)
                    feature_names.append(feature_name)

    if self.sort:
        feature_names.sort()
        vocab = {f: i for i, f in enumerate(feature_names)}

    self.feature_names_ = feature_names
    self.vocabulary_ = vocab

    return self
|
||||
|
||||
def _transform(self, X, fitting):
    """Shared implementation of transform and fit_transform.

    Builds a CSR matrix in a single pass over X. When ``fitting`` is
    True the vocabulary is grown on the fly; otherwise features unknown
    to the fitted vocabulary are silently dropped.
    """
    # Python's array module cannot explicitly request the signed 32-bit
    # integers scipy.sparse needs; typecode "i" (int) is the closest
    # match, but np.frombuffer misreads the buffer if sizeof(int) != 4.
    assert array("i").itemsize == 4, (
        "sizeof(int) != 4 on your platform; please report this at"
        " https://github.com/scikit-learn/scikit-learn/issues and"
        " include the output from platform.platform() in your bug report"
    )

    dtype = self.dtype
    if fitting:
        feature_names = []
        vocab = {}
    else:
        feature_names = self.feature_names_
        vocab = self.vocabulary_

    transforming = True

    # A single Mapping is treated as a one-sample sequence.
    X = [X] if isinstance(X, Mapping) else X

    indices = array("i")
    indptr = [0]
    # XXX we could change values to an array.array as well, but it
    # would require (heuristic) conversion of dtype to typecode...
    values = []

    # Collect the feature names and assemble the CSR triplet
    # (values, indices, indptr) in the same pass.
    for sample in X:
        for feat, val in sample.items():
            if isinstance(val, str):
                feature_name = "%s%s%s" % (feat, self.separator, val)
                val = 1
            elif isinstance(val, Number) or (val is None):
                feature_name = feat
            elif not isinstance(val, Mapping) and isinstance(val, Iterable):
                feature_name = None
                self._add_iterable_element(
                    feat,
                    val,
                    feature_names,
                    vocab,
                    fitting=fitting,
                    transforming=transforming,
                    indices=indices,
                    values=values,
                )
            else:
                raise TypeError(
                    f"Unsupported value Type {type(val)} "
                    f"for {feat}: {val}.\n"
                    f"{type(val)} objects are not supported."
                )

            if feature_name is not None:
                if fitting and feature_name not in vocab:
                    vocab[feature_name] = len(feature_names)
                    feature_names.append(feature_name)

                if feature_name in vocab:
                    indices.append(vocab[feature_name])
                    values.append(self.dtype(val))

        indptr.append(len(indices))

    if len(indptr) == 1:
        raise ValueError("Sample sequence X is empty.")

    indices = np.frombuffer(indices, dtype=np.intc)
    shape = (len(indptr) - 1, len(vocab))

    result_matrix = sp.csr_matrix(
        (values, indices, indptr), shape=shape, dtype=dtype
    )

    # When fitting with sort=True, permute the columns so that they follow
    # the lexicographic order of the feature names.
    if fitting and self.sort:
        feature_names.sort()
        map_index = np.empty(len(feature_names), dtype=np.int32)
        for new_idx, feat in enumerate(feature_names):
            map_index[new_idx] = vocab[feat]
            vocab[feat] = new_idx
        result_matrix = result_matrix[:, map_index]

    if self.sparse:
        result_matrix.sort_indices()
    else:
        result_matrix = result_matrix.toarray()

    if fitting:
        self.feature_names_ = feature_names
        self.vocabulary_ = vocab

    return result_matrix
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None):
    """Learn the feature name -> index mapping and transform X in one pass.

    Equivalent to fit(X) followed by transform(X), but X is iterated only
    once, so it does not need to be materialized in memory.

    Parameters
    ----------
    X : Mapping or iterable over Mappings
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

        .. versionchanged:: 0.24
           Accepts multiple string values for one categorical feature.

    y : (ignored)
        Ignored parameter.

    Returns
    -------
    Xa : {array, sparse matrix}
        Feature vectors; always 2-d.
    """
    return self._transform(X, fitting=True)
|
||||
|
||||
def inverse_transform(self, X, dict_type=dict):
    """Transform array or sparse matrix X back to feature mappings.

    X must have been produced by this DictVectorizer's transform or
    fit_transform method; it may only have passed through transformers
    that preserve the number of features and their order.

    In the case of one-hot/one-of-K coding, the constructed feature
    names and values are returned rather than the original ones.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample matrix.
    dict_type : type, default=dict
        Constructor for feature mappings. Must conform to the
        collections.Mapping API.

    Returns
    -------
    X_original : list of dict_type objects of shape (n_samples,)
        Feature mappings for the samples in X.
    """
    check_is_fitted(self, "feature_names_")

    # COO matrix is not subscriptable
    X = check_array(X, accept_sparse=["csr", "csc"])
    n_samples = X.shape[0]

    names = self.feature_names_
    dicts = [dict_type() for _ in range(n_samples)]

    if sp.issparse(X):
        # Extract all stored coordinates and values in one vectorized
        # pass rather than one O(log nnz) sparse lookup per entry.
        rows, cols = X.nonzero()
        vals = np.ravel(X[rows, cols])
        for i, j, v in zip(rows, cols, vals):
            dicts[i][names[j]] = v
    else:
        for i, d in enumerate(dicts):
            for j, v in enumerate(X[i, :]):
                if v != 0:
                    # v is already X[i, j]; avoid a redundant second read.
                    d[names[j]] = v

    return dicts
|
||||
|
||||
def transform(self, X):
    """Transform feature->value dicts to array or sparse matrix.

    Feature names that were not encountered during fit or fit_transform
    are silently ignored.

    Parameters
    ----------
    X : Mapping or iterable over Mappings of shape (n_samples,)
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

    Returns
    -------
    Xa : {array, sparse matrix}
        Feature vectors; always 2-d.
    """
    check_is_fitted(self, ["feature_names_", "vocabulary_"])
    return self._transform(X, fitting=False)
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Not used, present here for API consistency by convention.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    check_is_fitted(self, "feature_names_")
    # Feature names may be arbitrary hashables; coerce to str only when
    # at least one of them is not already a string.
    if all(isinstance(name, str) for name in self.feature_names_):
        feature_names = self.feature_names_
    else:
        feature_names = [str(name) for name in self.feature_names_]
    return np.asarray(feature_names, dtype=object)
|
||||
|
||||
def restrict(self, support, indices=False):
    """Restrict the features to those in support using feature selection.

    This function modifies the estimator in-place.

    Parameters
    ----------
    support : array-like
        Boolean mask or list of indices (as returned by the get_support
        member of feature selectors).
    indices : bool, default=False
        Whether support is a list of indices.

    Returns
    -------
    self : object
        DictVectorizer class instance.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> v = DictVectorizer()
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
    >>> v.get_feature_names_out()
    array(['bar', 'baz', 'foo'], ...)
    >>> v.restrict(support.get_support())
    DictVectorizer()
    >>> v.get_feature_names_out()
    array(['bar', 'foo'], ...)
    """
    check_is_fitted(self, "feature_names_")

    # Normalize a boolean mask into a list of retained column indices.
    if not indices:
        support = np.where(support)[0]

    names = self.feature_names_
    new_vocab = {names[i]: pos for pos, i in enumerate(support)}

    self.vocabulary_ = new_vocab
    # Rebuild feature_names_ ordered by the new column positions.
    self.feature_names_ = [
        f for f, i in sorted(new_vocab.items(), key=itemgetter(1))
    ]

    return self
|
||||
|
||||
def __sklearn_tags__(self):
    """Declare input tags: this estimator consumes dicts, not 2-d arrays."""
    tags = super().__sklearn_tags__()
    tags.input_tags.two_d_array = False
    tags.input_tags.dict = True
    return tags
|
||||
@@ -0,0 +1,209 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import chain
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils import metadata_routing
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ._hashing_fast import transform as _hashing_transform
|
||||
|
||||
|
||||
def _iteritems(d):
|
||||
"""Like d.iteritems, but accepts any collections.Mapping."""
|
||||
return d.iteritems() if hasattr(d, "iteritems") else d.items()
|
||||
|
||||
|
||||
class FeatureHasher(TransformerMixin, BaseEstimator):
    """Implements feature hashing, aka the hashing trick.

    Sequences of symbolic feature names (strings) are turned into
    scipy.sparse matrices; the column for each name is computed with the
    signed 32-bit variant of MurmurHash3. Byte strings are hashed as-is;
    unicode strings are first encoded to UTF-8, without any Unicode
    normalization. Feature values must be (finite) numbers.

    This class is a low-memory alternative to DictVectorizer and
    CountVectorizer, intended for large-scale (online) learning and
    situations where memory is tight, e.g. when running prediction code
    on embedded devices.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <feature_hashing>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    n_features : int, default=2**20
        The number of features (columns) in the output matrices. Small
        numbers of features are likely to cause hash collisions, but large
        numbers will cause larger coefficient dimensions in linear learners.
    input_type : str, default='dict'
        Choose a string from {'dict', 'pair', 'string'}.
        Either "dict" (the default) to accept dictionaries over
        (feature_name, value); "pair" to accept pairs of
        (feature_name, value); or "string" to accept single strings.
        feature_name should be a string, while value should be a number.
        In the case of "string", a value of 1 is implied.
        The feature_name is hashed to find the appropriate column for the
        feature. The value's sign might be flipped in the output (but see
        non_negative, below).
    dtype : numpy dtype, default=np.float64
        The type of feature values. Passed to scipy.sparse matrix
        constructors as the dtype argument. Do not set this to bool,
        np.boolean or any unsigned integer type.
    alternate_sign : bool, default=True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even
        for small n_features. This approach is similar to sparse random
        projection.

        .. versionchanged:: 0.19
            ``alternate_sign`` replaces the now deprecated ``non_negative``
            parameter.

    See Also
    --------
    DictVectorizer : Vectorizes string-valued features using a hash table.
    sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features.

    Notes
    -----
    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend to call :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.feature_extraction import FeatureHasher
    >>> h = FeatureHasher(n_features=10)
    >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
    >>> f = h.transform(D)
    >>> f.toarray()
    array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
           [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])

    With `input_type="string"`, the input must be an iterable over iterables of
    strings:

    >>> h = FeatureHasher(n_features=8, input_type="string")
    >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
    >>> f = h.transform(raw_X)
    >>> f.toarray()
    array([[ 0.,  0.,  0., -1.,  0., -1.,  0.,  1.],
           [ 0.,  0.,  0., -1.,  0., -1.,  0.,  0.],
           [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  1.]])
    """

    # raw_X should have been called X
    __metadata_request__transform = {"raw_X": metadata_routing.UNUSED}

    _parameter_constraints: dict = {
        "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")],
        "input_type": [StrOptions({"dict", "pair", "string"})],
        "dtype": "no_validation",  # delegate to numpy
        "alternate_sign": ["boolean"],
    }

    def __init__(
        self,
        n_features=(2**20),
        *,
        input_type="dict",
        dtype=np.float64,
        alternate_sign=True,
    ):
        self.dtype = dtype
        self.input_type = input_type
        self.n_features = n_features
        self.alternate_sign = alternate_sign
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X=None, y=None):
    """Validate the estimator's parameters; no state is learned.

    FeatureHasher is stateless, so this method only exists to (i) run
    parameter validation and (ii) keep the scikit-learn transformer API
    consistent.

    Parameters
    ----------
    X : Ignored
        Not used, present here for API consistency by convention.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        FeatureHasher class instance.
    """
    return self
|
||||
|
||||
def transform(self, raw_X):
    """Transform a sequence of instances to a scipy.sparse matrix.

    Parameters
    ----------
    raw_X : iterable over iterable over raw features, length = n_samples
        Samples. Each sample must be iterable an (e.g., a list or tuple)
        containing/generating feature names (and optionally values, see
        the input_type constructor argument) which will be hashed.
        raw_X need not support the len function, so it can be the result
        of a generator; n_samples is determined on the fly.

    Returns
    -------
    X : sparse matrix of shape (n_samples, n_features)
        Feature matrix, for use with estimators or further transformers.
    """
    raw_X = iter(raw_X)
    if self.input_type == "dict":
        # Normalize mappings into (name, value) pair iterators.
        raw_X = (_iteritems(d) for d in raw_X)
    elif self.input_type == "string":
        # Peek at the first sample to reject a bare string early.
        first_sample = next(raw_X)
        if isinstance(first_sample, str):
            raise ValueError(
                "Samples can not be a single string. The input must be an iterable"
                " over iterables of strings."
            )
        rebuilt = chain([first_sample], raw_X)
        # Each string feature carries an implicit value of 1.
        raw_X = (((f, 1) for f in sample) for sample in rebuilt)

    indices, indptr, values = _hashing_transform(
        raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
    )
    n_samples = indptr.shape[0] - 1

    if n_samples == 0:
        raise ValueError("Cannot vectorize empty sequence.")

    X = sp.csr_matrix(
        (values, indices, indptr),
        dtype=self.dtype,
        shape=(n_samples, self.n_features),
    )
    X.sum_duplicates()  # also sorts the indices

    return X
|
||||
|
||||
def __sklearn_tags__(self):
    """Declare input tags based on the configured input_type; no fit needed."""
    tags = super().__sklearn_tags__()
    tags.input_tags.two_d_array = False
    if self.input_type == "string":
        tags.input_tags.string = True
    elif self.input_type == "dict":
        tags.input_tags.dict = True
    tags.requires_fit = False
    return tags
|
||||
Binary file not shown.
@@ -0,0 +1,89 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from libc.stdlib cimport abs
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
cimport numpy as cnp
|
||||
import numpy as np
|
||||
from ..utils._typedefs cimport int32_t, int64_t
|
||||
from ..utils.murmurhash cimport murmurhash3_bytes_s32
|
||||
from ..utils._vector_sentinel cimport vector_to_nd_array
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
def transform(raw_X, Py_ssize_t n_features, dtype,
              bint alternate_sign=1, unsigned int seed=0):
    """Guts of FeatureHasher.transform.

    Returns
    -------
    n_samples : integer
    indices, indptr, values : lists
        For constructing a scipy.sparse.csr_matrix.

    """
    cdef int32_t h
    cdef double value

    # CSR buffers: column indices grow per non-zero entry, indptr gets one
    # entry per sample marking where its row ends.
    cdef vector[int32_t] indices
    cdef vector[int64_t] indptr
    indptr.push_back(0)

    # Since Python array does not understand Numpy dtypes, we grow the indices
    # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
    cdef Py_ssize_t capacity = 8192  # arbitrary
    cdef int64_t size = 0
    cdef cnp.ndarray values = np.empty(capacity, dtype=dtype)

    for x in raw_X:
        for f, v in x:
            if isinstance(v, (str, unicode)):
                # String values are one-hot encoded: the value becomes part
                # of the hashed feature name, with an implicit count of 1.
                f = "%s%s%s" % (f, '=', v)
                value = 1
            else:
                value = v

            # Zero-valued features produce no stored entry.
            if value == 0:
                continue

            if isinstance(f, unicode):
                f = (<unicode>f).encode("utf-8")
            # Need explicit type check because Murmurhash does not propagate
            # all exceptions. Add "except *" there?
            elif not isinstance(f, bytes):
                raise TypeError("feature names must be strings")

            h = murmurhash3_bytes_s32(<bytes>f, seed)

            if h == - 2147483648:
                # abs(-2**31) is undefined behavior because h is a `np.int32`
                # The following is defined such that it is equal to: abs(-2**31) % n_features
                indices.push_back((2147483647 - (n_features - 1)) % n_features)
            else:
                indices.push_back(abs(h) % n_features)
            # improve inner product preservation in the hashed space
            if alternate_sign:
                value *= (h >= 0) * 2 - 1
            values[size] = value
            size += 1

            if size == capacity:
                # Amortized doubling of the values buffer.
                capacity *= 2
                # can't use resize member because there might be multiple
                # references to the arrays due to Cython's error checking
                values = np.resize(values, capacity)

        indptr.push_back(size)

    # Hand the C++ vectors to numpy without copying.
    indices_array = vector_to_nd_array(&indices)
    indptr_array = vector_to_nd_array(&indptr)

    if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
        # both indices and indptr have the same dtype in CSR arrays
        indices_array = indices_array.astype(np.int64, copy=False)
    else:
        indptr_array = indptr_array.astype(np.int32, copy=False)

    # values is truncated to the entries actually written.
    return (indices_array, indptr_array, values[:size])
|
||||
@@ -0,0 +1,328 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
# This list of English stop words is taken from the "Glasgow Information
|
||||
# Retrieval Group". The original list can be found at
|
||||
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
|
||||
# Glasgow Information Retrieval Group English stop-word list (see the
# module header for the source URL). Stored as a frozenset for O(1)
# membership tests; built from a whitespace-separated literal to keep the
# source compact.
ENGLISH_STOP_WORDS = frozenset(
    """
    a about above across after afterwards again against all almost alone
    along already also although always am among amongst amoungst amount an
    and another any anyhow anyone anything anyway anywhere are around as at
    back be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond bill both bottom but by
    call can cannot cant co con could couldnt cry de describe detail do done
    down due during each eg eight either eleven else elsewhere empty enough
    etc even ever every everyone everything everywhere except few fifteen
    fifty fill find fire first five for former formerly forty found four
    from front full further get give go had has hasnt have he hence her here
    hereafter hereby herein hereupon hers herself him himself his how
    however hundred i ie if in inc indeed interest into is it its itself
    keep last latter latterly least less ltd made many may me meanwhile
    might mill mine more moreover most mostly move much must my myself name
    namely neither never nevertheless next nine no nobody none noone nor not
    nothing now nowhere of off often on once one only onto or other others
    otherwise our ours ourselves out over own part per perhaps please put
    rather re same see seem seemed seeming seems serious several she should
    show side since sincere six sixty so some somehow someone something
    sometime sometimes somewhere still such system take ten than that the
    their them themselves then thence there thereafter thereby therefore
    therein thereupon these they thick thin third this those though three
    through throughout thru thus to together too top toward towards twelve
    twenty two un under until up upon us very via was we well were what
    whatever when whence whenever where whereafter whereas whereby wherein
    whereupon wherever whether which while whither who whoever whole whom
    whose why will with within without would yet you your yours yourself
    yourselves
    """.split()
)
|
||||
@@ -0,0 +1,687 @@
|
||||
"""Utilities to extract features from images."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import product
|
||||
from numbers import Integral, Number, Real
|
||||
|
||||
import numpy as np
|
||||
from numpy.lib.stride_tricks import as_strided
|
||||
from scipy import sparse
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import check_array, check_random_state
|
||||
from ..utils._param_validation import Hidden, Interval, RealNotInt, validate_params
|
||||
|
||||
__all__ = [
|
||||
"PatchExtractor",
|
||||
"extract_patches_2d",
|
||||
"grid_to_graph",
|
||||
"img_to_graph",
|
||||
"reconstruct_from_patches_2d",
|
||||
]
|
||||
|
||||
from ..utils.validation import validate_data
|
||||
|
||||
###############################################################################
|
||||
# From an image to a graph
|
||||
|
||||
|
||||
def _make_edges_3d(n_x, n_y, n_z=1):
|
||||
"""Returns a list of edges for a 3D image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_x : int
|
||||
The size of the grid in the x direction.
|
||||
n_y : int
|
||||
The size of the grid in the y direction.
|
||||
n_z : integer, default=1
|
||||
The size of the grid in the z direction, defaults to 1
|
||||
"""
|
||||
vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z))
|
||||
edges_deep = np.vstack((vertices[:, :, :-1].ravel(), vertices[:, :, 1:].ravel()))
|
||||
edges_right = np.vstack((vertices[:, :-1].ravel(), vertices[:, 1:].ravel()))
|
||||
edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel()))
|
||||
edges = np.hstack((edges_deep, edges_right, edges_down))
|
||||
return edges
|
||||
|
||||
|
||||
def _compute_gradient_3d(edges, img):
|
||||
_, n_y, n_z = img.shape
|
||||
gradient = np.abs(
|
||||
img[
|
||||
edges[0] // (n_y * n_z),
|
||||
(edges[0] % (n_y * n_z)) // n_z,
|
||||
(edges[0] % (n_y * n_z)) % n_z,
|
||||
]
|
||||
- img[
|
||||
edges[1] // (n_y * n_z),
|
||||
(edges[1] % (n_y * n_z)) // n_z,
|
||||
(edges[1] % (n_y * n_z)) % n_z,
|
||||
]
|
||||
)
|
||||
return gradient
|
||||
|
||||
|
||||
# XXX: Why mask the image after computing the weights?
|
||||
|
||||
|
||||
def _mask_edges_weights(mask, edges, weights=None):
    """Apply a mask to edges (weighted or not).

    Drops every edge with at least one endpoint outside the mask and
    renumbers the surviving vertices to a contiguous 0..n-1 range.
    """
    # Flat indices of the voxels kept by the mask.
    inds = np.arange(mask.size)
    inds = inds[mask.ravel()]
    # Keep only the edges whose *both* endpoints survive the mask.
    ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds))
    edges = edges[:, ind_mask]
    if weights is not None:
        weights = weights[ind_mask]
    if len(edges.ravel()):
        maxval = edges.max()
    else:
        maxval = 0
    # order[i] is the rank of flat index i among the kept voxels, so
    # indexing with the old edge endpoints renumbers them contiguously.
    order = np.searchsorted(np.flatnonzero(mask), np.arange(maxval + 1))
    edges = order[edges]
    if weights is None:
        return edges
    else:
        return edges, weights
|
||||
|
||||
|
||||
def _to_graph(
    n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_matrix, dtype=None
):
    """Auxiliary function for img_to_graph and grid_to_graph.

    Builds a symmetric adjacency matrix over the (optionally masked) voxel
    grid; edge weights are intensity gradients when ``img`` is given, unit
    weights otherwise.
    """
    edges = _make_edges_3d(n_x, n_y, n_z)

    if dtype is None:  # To not overwrite input dtype
        if img is None:
            dtype = int
        else:
            dtype = img.dtype

    if img is not None:
        # Weighted graph: edge weights are absolute intensity gradients.
        img = np.atleast_3d(img)
        weights = _compute_gradient_3d(edges, img)
        if mask is not None:
            edges, weights = _mask_edges_weights(mask, edges, weights)
            diag = img.squeeze()[mask]
        else:
            diag = img.ravel()
        n_voxels = diag.size
    else:
        # Connectivity-only graph: every edge gets a unit weight.
        if mask is not None:
            mask = mask.astype(dtype=bool, copy=False)
            edges = _mask_edges_weights(mask, edges)
            n_voxels = np.sum(mask)
        else:
            n_voxels = n_x * n_y * n_z
        weights = np.ones(edges.shape[1], dtype=dtype)
        diag = np.ones(n_voxels, dtype=dtype)

    diag_idx = np.arange(n_voxels)
    # Duplicate every edge in both directions so the matrix is symmetric,
    # then append the diagonal entries.
    i_idx = np.hstack((edges[0], edges[1]))
    j_idx = np.hstack((edges[1], edges[0]))
    graph = sparse.coo_matrix(
        (
            np.hstack((weights, weights, diag)),
            (np.hstack((i_idx, diag_idx)), np.hstack((j_idx, diag_idx))),
        ),
        (n_voxels, n_voxels),
        dtype=dtype,
    )
    if return_as is np.ndarray:
        return graph.toarray()
    return return_as(graph)
|
||||
|
||||
|
||||
@validate_params(
    {
        "img": ["array-like"],
        "mask": [None, np.ndarray],
        "return_as": [type],
        "dtype": "no_validation",  # validation delegated to numpy
    },
    prefer_skip_nested_validation=True,
)
def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None):
    """Graph of the pixel-to-pixel gradient connections.

    Edges are weighted with the gradient values.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    img : array-like of shape (height, width) or (height, width, channel)
        2D or 3D image.
    mask : ndarray of shape (height, width) or \
            (height, width, channel), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=None
        The data of the returned sparse matrix. By default it is the
        dtype of img.

    Returns
    -------
    graph : ndarray or a sparse matrix class
        The computed adjacency matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.feature_extraction.image import img_to_graph
    >>> img = np.array([[0, 0], [0, 1]])
    >>> img_to_graph(img, return_as=np.ndarray)
    array([[0, 0, 0, 0],
           [0, 0, 0, 1],
           [0, 0, 0, 1],
           [0, 1, 1, 1]])
    """
    # Promote 2D grayscale images to 3D with a single channel so the
    # generic 3D helper handles both cases uniformly.
    img = np.atleast_3d(img)
    n_x, n_y, n_z = img.shape
    return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype)
|
||||
|
||||
|
||||
@validate_params(
    {
        "n_x": [Interval(Integral, left=1, right=None, closed="left")],
        "n_y": [Interval(Integral, left=1, right=None, closed="left")],
        "n_z": [Interval(Integral, left=1, right=None, closed="left")],
        "mask": [None, np.ndarray],
        "return_as": [type],
        "dtype": "no_validation",  # validation delegated to numpy
    },
    prefer_skip_nested_validation=True,
)
def grid_to_graph(
    n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int
):
    """Graph of the pixel-to-pixel connections.

    Edges exist if 2 voxels are connected.

    Read more in the :ref:`User Guide <connectivity_graph_image>`.

    Parameters
    ----------
    n_x : int
        Dimension in x axis.
    n_y : int
        Dimension in y axis.
    n_z : int, default=1
        Dimension in z axis.
    mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=int
        The data of the returned sparse matrix. By default it is int.

    Returns
    -------
    graph : np.ndarray or a sparse matrix class
        The computed adjacency matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.feature_extraction.image import grid_to_graph
    >>> shape_img = (4, 4, 1)
    >>> mask = np.zeros(shape=shape_img, dtype=bool)
    >>> mask[[1, 2], [1, 2], :] = True
    >>> graph = grid_to_graph(*shape_img, mask=mask)
    >>> print(graph)
    <COOrdinate sparse matrix of dtype 'int64'
        with 2 stored elements and shape (2, 2)>
      Coords        Values
      (0, 0)        1
      (1, 1)        1
    """
    # No image is passed, so the helper builds a connectivity-only graph
    # with unit edge weights.
    return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as, dtype=dtype)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# From an image to a set of small image patches
|
||||
|
||||
|
||||
def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
|
||||
"""Compute the number of patches that will be extracted in an image.
|
||||
|
||||
Read more in the :ref:`User Guide <image_feature_extraction>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
i_h : int
|
||||
The image height
|
||||
i_w : int
|
||||
The image with
|
||||
p_h : int
|
||||
The height of a patch
|
||||
p_w : int
|
||||
The width of a patch
|
||||
max_patches : int or float, default=None
|
||||
The maximum number of patches to extract. If `max_patches` is a float
|
||||
between 0 and 1, it is taken to be a proportion of the total number
|
||||
of patches. If `max_patches` is None, all possible patches are extracted.
|
||||
"""
|
||||
n_h = i_h - p_h + 1
|
||||
n_w = i_w - p_w + 1
|
||||
all_patches = n_h * n_w
|
||||
|
||||
if max_patches:
|
||||
if isinstance(max_patches, (Integral)) and max_patches < all_patches:
|
||||
return max_patches
|
||||
elif isinstance(max_patches, (Integral)) and max_patches >= all_patches:
|
||||
return all_patches
|
||||
elif isinstance(max_patches, (Real)) and 0 < max_patches < 1:
|
||||
return int(max_patches * all_patches)
|
||||
else:
|
||||
raise ValueError("Invalid value for max_patches: %r" % max_patches)
|
||||
else:
|
||||
return all_patches
|
||||
|
||||
|
||||
def _extract_patches(arr, patch_shape=8, extraction_step=1):
|
||||
"""Extracts patches of any n-dimensional array in place using strides.
|
||||
|
||||
Given an n-dimensional array it will return a 2n-dimensional array with
|
||||
the first n dimensions indexing patch position and the last n indexing
|
||||
the patch content. This operation is immediate (O(1)). A reshape
|
||||
performed on the first n dimensions will cause numpy to copy data, leading
|
||||
to a list of extracted patches.
|
||||
|
||||
Read more in the :ref:`User Guide <image_feature_extraction>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : ndarray
|
||||
n-dimensional array of which patches are to be extracted
|
||||
|
||||
patch_shape : int or tuple of length arr.ndim.default=8
|
||||
Indicates the shape of the patches to be extracted. If an
|
||||
integer is given, the shape will be a hypercube of
|
||||
sidelength given by its value.
|
||||
|
||||
extraction_step : int or tuple of length arr.ndim, default=1
|
||||
Indicates step size at which extraction shall be performed.
|
||||
If integer is given, then the step is uniform in all dimensions.
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
patches : strided ndarray
|
||||
2n-dimensional array indexing patches on first n dimensions and
|
||||
containing patches on the last n dimensions. These dimensions
|
||||
are fake, but this way no data is copied. A simple reshape invokes
|
||||
a copying operation to obtain a list of patches:
|
||||
result.reshape([-1] + list(patch_shape))
|
||||
"""
|
||||
|
||||
arr_ndim = arr.ndim
|
||||
|
||||
if isinstance(patch_shape, Number):
|
||||
patch_shape = tuple([patch_shape] * arr_ndim)
|
||||
if isinstance(extraction_step, Number):
|
||||
extraction_step = tuple([extraction_step] * arr_ndim)
|
||||
|
||||
patch_strides = arr.strides
|
||||
|
||||
slices = tuple(slice(None, None, st) for st in extraction_step)
|
||||
indexing_strides = arr[slices].strides
|
||||
|
||||
patch_indices_shape = (
|
||||
(np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step)
|
||||
) + 1
|
||||
|
||||
shape = tuple(list(patch_indices_shape) + list(patch_shape))
|
||||
strides = tuple(list(indexing_strides) + list(patch_strides))
|
||||
|
||||
patches = as_strided(arr, shape=shape, strides=strides)
|
||||
return patches
|
||||
|
||||
|
||||
@validate_params(
    {
        "image": [np.ndarray],
        "patch_size": [tuple, list],
        "max_patches": [
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(Integral, 1, None, closed="left"),
            None,
        ],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None):
    """Reshape a 2D image into a collection of patches.

    The resulting patches are allocated in a dedicated array.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    image : ndarray of shape (image_height, image_width) or \
        (image_height, image_width, n_channels)
        The original image data. For color images, the last dimension specifies
        the channel: a RGB image would have `n_channels=3`.

    patch_size : tuple of int (patch_height, patch_width)
        The dimensions of one patch.

    max_patches : int or float, default=None
        The maximum number of patches to extract. If `max_patches` is a float
        between 0 and 1, it is taken to be a proportion of the total number
        of patches. If `max_patches` is None it corresponds to the total number
        of patches that can be extracted.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches` is not None. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    patches : array of shape (n_patches, patch_height, patch_width) or \
        (n_patches, patch_height, patch_width, n_channels)
        The collection of patches extracted from the image, where `n_patches`
        is either `max_patches` or the total number of patches that can be
        extracted.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the first image in this dataset:
    >>> one_image = load_sample_image("china.jpg")
    >>> print('Image shape: {}'.format(one_image.shape))
    Image shape: (427, 640, 3)
    >>> patches = image.extract_patches_2d(one_image, (2, 2))
    >>> print('Patches shape: {}'.format(patches.shape))
    Patches shape: (272214, 2, 2, 3)
    >>> # Here are just two of these patches:
    >>> print(patches[1])
    [[[174 201 231]
      [174 201 231]]
     [[173 200 230]
      [173 200 230]]]
    >>> print(patches[800])
    [[[187 214 243]
      [188 215 244]]
     [[187 214 243]
      [188 215 244]]]
    """
    i_h, i_w = image.shape[:2]
    p_h, p_w = patch_size

    if p_h > i_h:
        raise ValueError(
            "Height of the patch should be less than the height of the image."
        )

    if p_w > i_w:
        raise ValueError(
            "Width of the patch should be less than the width of the image."
        )

    image = check_array(image, allow_nd=True)
    # Normalize to 3D (height, width, n_channels) so grayscale and color
    # images share the same code path.
    image = image.reshape((i_h, i_w, -1))
    n_colors = image.shape[-1]

    # Zero-copy strided view over every possible patch position.
    extracted_patches = _extract_patches(
        image, patch_shape=(p_h, p_w, n_colors), extraction_step=1
    )

    n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
    if max_patches:
        # Sample patch origins uniformly at random (with replacement).
        rng = check_random_state(random_state)
        i_s = rng.randint(i_h - p_h + 1, size=n_patches)
        j_s = rng.randint(i_w - p_w + 1, size=n_patches)
        patches = extracted_patches[i_s, j_s, 0]
    else:
        patches = extracted_patches

    # The reshape copies the selected patches out of the strided view.
    patches = patches.reshape(-1, p_h, p_w, n_colors)
    # remove the color dimension if useless
    if patches.shape[-1] == 1:
        return patches.reshape((n_patches, p_h, p_w))
    else:
        return patches
|
||||
|
||||
|
||||
@validate_params(
    {"patches": [np.ndarray], "image_size": [tuple, Hidden(list)]},
    prefer_skip_nested_validation=True,
)
def reconstruct_from_patches_2d(patches, image_size):
    """Reconstruct the image from all of its patches.

    Patches are assumed to overlap and the image is constructed by filling in
    the patches from left to right, top to bottom, averaging the overlapping
    regions.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    patches : ndarray of shape (n_patches, patch_height, patch_width) or \
        (n_patches, patch_height, patch_width, n_channels)
        The complete set of patches. If the patches contain colour information,
        channels are indexed along the last dimension: RGB patches would
        have `n_channels=3`.

    image_size : tuple of int (image_height, image_width) or \
        (image_height, image_width, n_channels)
        The size of the image that will be reconstructed.

    Returns
    -------
    image : ndarray of shape image_size
        The reconstructed image.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> from sklearn.feature_extraction import image
    >>> one_image = load_sample_image("china.jpg")
    >>> print('Image shape: {}'.format(one_image.shape))
    Image shape: (427, 640, 3)
    >>> image_patches = image.extract_patches_2d(image=one_image, patch_size=(10, 10))
    >>> print('Patches shape: {}'.format(image_patches.shape))
    Patches shape: (263758, 10, 10, 3)
    >>> image_reconstructed = image.reconstruct_from_patches_2d(
    ...     patches=image_patches,
    ...     image_size=one_image.shape
    ... )
    >>> print(f"Reconstructed shape: {image_reconstructed.shape}")
    Reconstructed shape: (427, 640, 3)
    """
    i_h, i_w = image_size[:2]
    p_h, p_w = patches.shape[1:3]
    img = np.zeros(image_size)
    # compute the dimensions of the patches array
    n_h = i_h - p_h + 1
    n_w = i_w - p_w + 1
    # Accumulate every patch at its origin; overlapping regions add up and
    # are normalized afterwards.
    for p, (i, j) in zip(patches, product(range(n_h), range(n_w))):
        img[i : i + p_h, j : j + p_w] += p

    for i in range(i_h):
        for j in range(i_w):
            # divide by the amount of overlap
            # XXX: is this the most efficient way? memory-wise yes, cpu wise?
            # min(i + 1, p_h, i_h - i) counts how many patches cover row i
            # (fewer near the borders), and likewise for column j.
            img[i, j] /= float(min(i + 1, p_h, i_h - i) * min(j + 1, p_w, i_w - j))
    return img
|
||||
|
||||
|
||||
class PatchExtractor(TransformerMixin, BaseEstimator):
    """Extracts patches from a collection of images.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    .. versionadded:: 0.9

    Parameters
    ----------
    patch_size : tuple of int (patch_height, patch_width), default=None
        The dimensions of one patch. If set to None, the patch size will be
        automatically set to `(img_height // 10, img_width // 10)`, where
        `img_height` and `img_width` are the dimensions of the input images.

    max_patches : int or float, default=None
        The maximum number of patches per image to extract. If `max_patches` is
        a float in (0, 1), it is taken to mean a proportion of the total number
        of patches. If set to None, extract all possible patches.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches is not None`. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    See Also
    --------
    reconstruct_from_patches_2d : Reconstruct image from all of its patches.

    Notes
    -----
    This estimator is stateless and does not need to be fitted. However, we
    recommend to call :meth:`fit_transform` instead of :meth:`transform`, as
    parameter validation is only performed in :meth:`fit`.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_images
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the second image in this dataset:
    >>> X = load_sample_images().images[1]
    >>> X = X[None, ...]
    >>> print(f"Image shape: {X.shape}")
    Image shape: (1, 427, 640, 3)
    >>> pe = image.PatchExtractor(patch_size=(10, 10))
    >>> pe_trans = pe.transform(X)
    >>> print(f"Patches shape: {pe_trans.shape}")
    Patches shape: (263758, 10, 10, 3)
    >>> X_reconstructed = image.reconstruct_from_patches_2d(pe_trans, X.shape[1:])
    >>> print(f"Reconstructed shape: {X_reconstructed.shape}")
    Reconstructed shape: (427, 640, 3)
    """

    _parameter_constraints: dict = {
        "patch_size": [tuple, None],
        "max_patches": [
            None,
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(Integral, 1, None, closed="left"),
        ],
        "random_state": ["random_state"],
    }

    def __init__(self, *, patch_size=None, max_patches=None, random_state=None):
        self.patch_size = patch_size
        self.max_patches = max_patches
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Only validate the parameters of the estimator.

        This method allows to: (i) validate the parameters of the estimator and
        (ii) be consistent with the scikit-learn transformer API.

        Parameters
        ----------
        X : ndarray of shape (n_samples, image_height, image_width) or \
                (n_samples, image_height, image_width, n_channels)
            Array of images from which to extract patches. For color images,
            the last dimension specifies the channel: a RGB image would have
            `n_channels=3`.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        return self

    def transform(self, X):
        """Transform the image samples in `X` into a matrix of patch data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, image_height, image_width) or \
                (n_samples, image_height, image_width, n_channels)
            Array of images from which to extract patches. For color images,
            the last dimension specifies the channel: a RGB image would have
            `n_channels=3`.

        Returns
        -------
        patches : array of shape (n_patches, patch_height, patch_width) or \
                (n_patches, patch_height, patch_width, n_channels)
            The collection of patches extracted from the images, where
            `n_patches` is either `n_samples * max_patches` or the total
            number of patches that can be extracted.
        """
        X = validate_data(
            self,
            X=X,
            ensure_2d=False,
            allow_nd=True,
            ensure_min_samples=1,
            ensure_min_features=1,
            reset=False,
        )
        random_state = check_random_state(self.random_state)
        n_imgs, img_height, img_width = X.shape[:3]
        if self.patch_size is None:
            # Default patch size: a tenth of the image along each axis.
            patch_size = img_height // 10, img_width // 10
        else:
            if len(self.patch_size) != 2:
                raise ValueError(
                    "patch_size must be a tuple of two integers. Got"
                    f" {self.patch_size} instead."
                )
            patch_size = self.patch_size

        # Normalize to 4D (n_imgs, height, width, n_channels) so grayscale
        # and color images share the same code path. The reshape keeps the
        # first three dimensions, so the unpacked sizes above remain valid
        # (the original code recomputed them redundantly here).
        X = np.reshape(X, (n_imgs, img_height, img_width, -1))
        n_channels = X.shape[-1]

        # compute the dimensions of the patches array
        patch_height, patch_width = patch_size
        n_patches = _compute_n_patches(
            img_height, img_width, patch_height, patch_width, self.max_patches
        )
        patches_shape = (n_imgs * n_patches,) + patch_size
        if n_channels > 1:
            patches_shape += (n_channels,)

        # extract the patches
        patches = np.empty(patches_shape)
        for ii, image in enumerate(X):
            patches[ii * n_patches : (ii + 1) * n_patches] = extract_patches_2d(
                image,
                patch_size,
                max_patches=self.max_patches,
                random_state=random_state,
            )
        return patches

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.two_d_array = False
        tags.input_tags.three_d_array = True
        tags.requires_fit = False
        return tags
|
||||
@@ -0,0 +1,7 @@
|
||||
# Build the `_hashing_fast` extension module from its Cython source
# (compiled to C++ via cython_gen_cpp) and install it into the
# sklearn/feature_extraction package directory.
py.extension_module(
  '_hashing_fast',
  [cython_gen_cpp.process('_hashing_fast.pyx'), utils_cython_tree],
  dependencies: [np_dep],
  subdir: 'sklearn/feature_extraction',
  install: true
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,261 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from random import Random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse", (True, False))
@pytest.mark.parametrize("dtype", (int, np.float32, np.int16))
@pytest.mark.parametrize("sort", (True, False))
@pytest.mark.parametrize("iterable", (True, False))
def test_dictvectorizer(sparse, dtype, sort, iterable):
    # Round-trip check of DictVectorizer for every combination of output
    # sparsity, dtype, feature sorting, and iterable vs. list input.
    D = [{"foo": 1, "bar": 3}, {"bar": 4, "baz": 2}, {"bar": 1, "quux": 1, "quuux": 2}]

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(iter(D) if iterable else D)

    assert sp.issparse(X) == sparse
    assert X.shape == (3, 5)
    assert X.sum() == 14
    # inverse_transform should recover the original feature dicts.
    assert v.inverse_transform(X) == D

    if sparse:
        # CSR matrices can't be compared for equality
        assert_array_equal(
            X.toarray(), v.transform(iter(D) if iterable else D).toarray()
        )
    else:
        assert_array_equal(X, v.transform(iter(D) if iterable else D))

    if sort:
        assert v.feature_names_ == sorted(v.feature_names_)
|
||||
|
||||
|
||||
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)], useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)], useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        # restrict() must keep only the selected features, whether the
        # support is given as a boolean mask or as integer indices.
        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_array_equal(v.get_feature_names_out(), ["useful1", "useful2"])
|
||||
|
||||
|
||||
def test_one_of_k():
    # String feature values trigger one-of-K encoding as "name=value"
    # features; numeric values are kept as-is.
    D_in = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": 0.3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert X.shape == (3, 5)

    D_out = v.inverse_transform(X)
    assert D_out[0] == {"version=1": 1, "ham": 2}

    # Only the expanded "name=value" names appear, not the raw name.
    names = v.get_feature_names_out()
    assert "version=2" in names
    assert "version" not in names
|
||||
|
||||
|
||||
def test_iterable_value():
    # An iterable of strings as a feature value is expanded into counts of
    # "name=value" features (e.g. ["1", "2", "1"] -> version=1: 2, version=2: 1).
    D_names = ["ham", "spam", "version=1", "version=2", "version=3"]
    X_expected = [
        [2.0, 0.0, 2.0, 1.0, 0.0],
        [0.0, 0.3, 0.0, 1.0, 0.0],
        [0.0, -1.0, 0.0, 0.0, 1.0],
    ]
    D_in = [
        {"version": ["1", "2", "1"], "ham": 2},
        {"version": "2", "spam": 0.3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    X = X.toarray()
    assert_array_equal(X, X_expected)

    D_out = v.inverse_transform(X)
    assert D_out[0] == {"version=1": 2, "version=2": 1, "ham": 2}

    names = v.get_feature_names_out()

    assert_array_equal(names, D_names)
|
||||
|
||||
|
||||
def test_iterable_not_string_error():
    # Non-string items inside an iterable feature value must be rejected
    # with a TypeError carrying this exact message.
    error_value = (
        "Unsupported type <class 'int'> in iterable value. "
        "Only iterables of string are supported."
    )
    D2 = [{"foo": "1", "bar": "2"}, {"foo": "3", "baz": "1"}, {"foo": [1, "three"]}]
    v = DictVectorizer(sparse=False)
    with pytest.raises(TypeError) as error:
        v.fit(D2)
    assert str(error.value) == error_value
|
||||
|
||||
|
||||
def test_mapping_error():
    # Nested mappings are not supported as feature values and must raise
    # a TypeError with this exact message.
    error_value = (
        "Unsupported value type <class 'dict'> "
        "for foo: {'one': 1, 'three': 3}.\n"
        "Mapping objects are not supported."
    )
    D2 = [
        {"foo": "1", "bar": "2"},
        {"foo": "3", "baz": "1"},
        {"foo": {"one": 1, "three": 3}},
    ]
    v = DictVectorizer(sparse=False)
    with pytest.raises(TypeError) as error:
        v.fit(D2)
    assert str(error.value) == error_value
|
||||
|
||||
|
||||
def test_unseen_or_no_features():
    # Features unseen at fit time are silently dropped at transform time,
    # and an empty dict yields an all-zero row; an empty sequence of
    # samples is an error.
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        with pytest.raises(ValueError, match="empty"):
            v.transform([])
|
||||
|
||||
|
||||
def test_deterministic_vocabulary(global_random_seed):
    # The learned vocabulary should depend only on the dict contents,
    # not on the insertion order of the keys.
    # Generate equal dictionaries with different memory layouts
    items = [("%03d" % i, i) for i in range(1000)]
    rng = Random(global_random_seed)
    d_sorted = dict(items)
    rng.shuffle(items)
    d_shuffled = dict(items)

    # check that the memory layout does not impact the resulting vocabulary
    v_1 = DictVectorizer().fit([d_sorted])
    v_2 = DictVectorizer().fit([d_shuffled])

    assert v_1.vocabulary_ == v_2.vocabulary_
|
||||
|
||||
|
||||
def test_n_features_in():
    # For vectorizers, n_features_in_ does not make sense and does not exist.
    dv = DictVectorizer()
    assert not hasattr(dv, "n_features_in_")
    d = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
    dv.fit(d)
    # Fitting must not create the attribute either.
    assert not hasattr(dv, "n_features_in_")
|
||||
|
||||
|
||||
def test_dictvectorizer_dense_sparse_equivalence():
    """Check the equivalence between between sparse and dense DictVectorizer.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19978
    """
    movie_entry_fit = [
        {"category": ["thriller", "drama"], "year": 2003},
        {"category": ["animation", "family"], "year": 2011},
        {"year": 1974},
    ]
    movie_entry_transform = [{"category": ["thriller"], "unseen_feature": "3"}]
    dense_vectorizer = DictVectorizer(sparse=False)
    sparse_vectorizer = DictVectorizer(sparse=True)

    dense_vector_fit = dense_vectorizer.fit_transform(movie_entry_fit)
    sparse_vector_fit = sparse_vectorizer.fit_transform(movie_entry_fit)

    assert not sp.issparse(dense_vector_fit)
    assert sp.issparse(sparse_vector_fit)

    # fit_transform must produce the same values in both representations.
    assert_allclose(dense_vector_fit, sparse_vector_fit.toarray())

    dense_vector_transform = dense_vectorizer.transform(movie_entry_transform)
    sparse_vector_transform = sparse_vectorizer.transform(movie_entry_transform)

    assert not sp.issparse(dense_vector_transform)
    assert sp.issparse(sparse_vector_transform)

    # transform must agree as well, including unseen-feature handling.
    assert_allclose(dense_vector_transform, sparse_vector_transform.toarray())

    dense_inverse_transform = dense_vectorizer.inverse_transform(dense_vector_transform)
    sparse_inverse_transform = sparse_vectorizer.inverse_transform(
        sparse_vector_transform
    )

    # And so must the inverse transform of the transformed entries.
    expected_inverse = [{"category=thriller": 1.0}]
    assert dense_inverse_transform == expected_inverse
    assert sparse_inverse_transform == expected_inverse
|
||||
|
||||
|
||||
def test_dict_vectorizer_unsupported_value_type():
    """A TypeError is raised for feature values DictVectorizer cannot encode.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19489
    """

    class UnsupportedObject:
        pass

    samples = [{"foo": UnsupportedObject()}]
    with pytest.raises(TypeError, match="Unsupported value Type"):
        DictVectorizer(sparse=True).fit_transform(samples)
|
||||
|
||||
|
||||
def test_dict_vectorizer_get_feature_names_out():
    """Integer feature names are converted to strings in feature_names_out."""
    vectorizer = DictVectorizer(sparse=False).fit([{1: 2, 3: 4}, {2: 4}])

    names = vectorizer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ["1", "2", "3"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, input",
    [
        ("transform", [{1: 2, 3: 4}, {2: 4}]),
        ("inverse_transform", [{1: 2, 3: 4}, {2: 4}]),
        ("restrict", [True, False, True]),
    ],
)
def test_dict_vectorizer_not_fitted_error(method, input):
    """Check that unfitted DictVectorizer instance raises NotFittedError.

    This should be part of the common test but currently they test estimator accepting
    text input.
    """
    unfitted = DictVectorizer(sparse=False)
    bound_method = getattr(unfitted, method)
    with pytest.raises(NotFittedError):
        bound_method(input)
|
||||
@@ -0,0 +1,175 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.feature_extraction import FeatureHasher
|
||||
from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform
|
||||
|
||||
|
||||
def test_feature_hasher_dicts():
    """The default input_type is dict and it matches the pair encoding."""
    assert FeatureHasher(n_features=16).input_type == "dict"

    samples = [{"foo": "bar", "dada": 42, "tzara": 37}, {"foo": "baz", "gaga": "string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(samples)
    pair_iters = (iter(d.items()) for d in samples)
    from_pairs = FeatureHasher(n_features=16, input_type="pair").transform(pair_iters)
    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
|
||||
|
||||
|
||||
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2**exponent

        hasher = FeatureHasher(
            n_features=n_features, input_type="string", alternate_sign=False
        )
        # feed an iterable rather than a list to exercise the generic path
        X = hasher.transform(sample for sample in samples)

        assert X.shape == (len(samples), n_features)

        # duplicate "foo" in row 0 accumulates, so sums count raw tokens
        assert X[0].sum() == 4
        assert X[1].sum() == 3

        assert X.nnz == 6
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw_X",
    [
        ["my_string", "another_string"],
        (x for x in ["my_string", "another_string"]),
    ],
    ids=["list", "generator"],
)
def test_feature_hasher_single_string(raw_X):
    """FeatureHasher raises error when a sample is a single string.

    Non-regression test for gh-13199.
    """
    hasher = FeatureHasher(n_features=10, input_type="string")
    with pytest.raises(ValueError, match="Samples can not be a single string"):
        hasher.transform(raw_X)
|
||||
|
||||
|
||||
def test_hashing_transform_seed():
    # check the influence of the seed when computing the hashes
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]

    def as_pairs():
        # fresh generator of (token, weight) pairs for each call
        return (((token, 1) for token in sample) for sample in samples)

    indices, indptr, _ = _hashing_transform(as_pairs(), 2**7, str, False)

    # seed=0 is the default: identical output expected
    indices_0, indptr_0, _ = _hashing_transform(as_pairs(), 2**7, str, False, seed=0)
    assert_array_equal(indices, indices_0)
    assert_array_equal(indptr, indptr_0)

    # a different seed must relocate the hashed features
    indices_1, _, _ = _hashing_transform(as_pairs(), 2**7, str, False, seed=1)
    with pytest.raises(AssertionError):
        assert_array_equal(indices, indices_1)
|
||||
|
||||
|
||||
def test_feature_hasher_pairs():
    samples = [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}]
    hasher = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    # compare magnitudes only: alternate_sign may flip individual entries
    assert sorted(np.abs(x1[x1 != 0])) == [1, 2]
    assert sorted(np.abs(x2[x2 != 0])) == [1, 3, 4]
|
||||
|
||||
|
||||
def test_feature_hasher_pairs_with_string_values():
    hasher = FeatureHasher(n_features=16, input_type="pair")

    samples = [{"foo": 1, "bar": "a"}, {"baz": "abc", "quux": 4, "foo": -1}]
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    # string values are hashed with an implicit weight of 1
    assert sorted(np.abs(x1[x1 != 0])) == [1, 1]
    assert sorted(np.abs(x2[x2 != 0])) == [1, 1, 4]

    # identical rows must hash identically
    samples = [{"bax": "abc"}, {"bax": "abc"}]
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    assert [1] == np.abs(x1[x1 != 0])
    assert [1] == np.abs(x2[x2 != 0])
    assert_array_equal(x1, x2)
|
||||
|
||||
|
||||
def test_hash_empty_input():
    """Empty samples of any iterable type map to all-zero rows."""
    n_features = 16
    empty_samples = [[], (), iter(range(0))]

    X = FeatureHasher(n_features=n_features, input_type="string").transform(
        empty_samples
    )

    assert_array_equal(X.toarray(), np.zeros((len(empty_samples), n_features)))
|
||||
|
||||
|
||||
def test_hasher_zeros():
    """Zero-valued features must not be materialized in the sparse output."""
    Xt = FeatureHasher().transform([{"foo": 0}])
    assert Xt.data.shape == (0,)
|
||||
|
||||
|
||||
def test_hasher_alternate_sign():
    """alternate_sign=True yields mixed signs; False keeps everything positive."""
    tokens = [list("Thequickbrownfoxjumped")]

    signed = FeatureHasher(alternate_sign=True, input_type="string").fit_transform(
        tokens
    )
    assert signed.data.min() < 0 and signed.data.max() > 0

    unsigned = FeatureHasher(alternate_sign=False, input_type="string").fit_transform(
        tokens
    )
    assert unsigned.data.min() > 0
|
||||
|
||||
|
||||
def test_hash_collisions():
    tokens = [list("Thequickbrownfoxjumped")]

    # with a single bucket every token collides; alternating signs means
    # some contributions cancel out
    signed = FeatureHasher(
        alternate_sign=True, n_features=1, input_type="string"
    ).fit_transform(tokens)
    assert abs(signed.data[0]) < len(tokens[0])

    # without sign alternation the collisions simply accumulate
    unsigned = FeatureHasher(
        alternate_sign=False, n_features=1, input_type="string"
    ).fit_transform(tokens)
    assert unsigned.data[0] == len(tokens[0])
|
||||
|
||||
|
||||
def test_feature_hasher_requires_fit_tag():
    """Test that FeatureHasher has requires_fit=False tag."""
    assert not FeatureHasher().__sklearn_tags__().requires_fit
|
||||
|
||||
|
||||
def test_feature_hasher_transform_without_fit():
    """Test that FeatureHasher can transform without fitting."""
    samples = [{"dog": 1, "cat": 2}, {"dog": 2, "run": 5}]
    hashed = FeatureHasher(n_features=10).transform(samples)
    assert hashed.shape == (2, 10)
|
||||
@@ -0,0 +1,359 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import ndimage
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from sklearn.feature_extraction.image import (
|
||||
PatchExtractor,
|
||||
_extract_patches,
|
||||
extract_patches_2d,
|
||||
grid_to_graph,
|
||||
img_to_graph,
|
||||
reconstruct_from_patches_2d,
|
||||
)
|
||||
|
||||
|
||||
def test_img_to_graph():
    x, y = np.mgrid[:4, :4] - 10
    graph_x = img_to_graph(x)
    graph_y = img_to_graph(y)
    assert graph_x.nnz == graph_y.nnz
    # Negative elements are the diagonal: the elements of the original
    # image. Positive elements are the values of the gradient, they
    # should all be equal on grad_x and grad_y
    positive_x = graph_x.data[graph_x.data > 0]
    positive_y = graph_y.data[graph_y.data > 0]
    np.testing.assert_array_equal(positive_x, positive_y)
|
||||
|
||||
|
||||
def test_img_to_graph_sparse():
    # Check that the edges are in the right position
    # when using a sparse image with a singleton component
    mask = np.zeros((2, 3), dtype=bool)
    mask[0, 0] = True
    mask[:, 2] = True

    image = np.zeros((2, 3))
    image[0, 0], image[0, 2], image[1, 2] = 1, -1, -2

    graph = img_to_graph(image, mask=mask).todense()
    expected = np.array([[1, 0, 0], [0, -1, 1], [0, 1, -2]])
    np.testing.assert_array_equal(graph, expected)
|
||||
|
||||
|
||||
def test_grid_to_graph():
    # Checking that the function works with graphs containing no edges
    size = 2
    roi_size = 1
    # Generating two convex parts with one vertex
    # Thus, edges will be empty in _to_graph
    mask = np.zeros((size, size), dtype=bool)
    mask[:roi_size, :roi_size] = True
    mask[-roi_size:, -roi_size:] = True
    A = grid_to_graph(
        n_x=size, n_y=size, mask=mask.reshape(size**2), return_as=np.ndarray
    )
    assert connected_components(A)[0] == 2

    # check ordering
    mask = np.zeros((2, 3), dtype=bool)
    mask[0, 0] = True
    mask[:, 2] = True
    graph = grid_to_graph(2, 3, 1, mask=mask.ravel()).todense()
    expected = np.array([[1, 0, 0], [0, 1, 1], [0, 1, 1]])
    np.testing.assert_array_equal(graph, expected)

    # Checking that the function works whatever the type of mask is
    int_mask = np.ones((size, size), dtype=np.int16)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=int_mask)
    assert connected_components(A)[0] == 1

    # Checking dtype of the graph
    float_mask = np.ones((size, size))
    for requested_dtype in (bool, int, np.float64):
        A = grid_to_graph(
            n_x=size, n_y=size, n_z=size, mask=float_mask, dtype=requested_dtype
        )
        assert A.dtype == requested_dtype
|
||||
|
||||
|
||||
def test_connect_regions(raccoon_face_fxt):
    # subsample by 4 to reduce run time
    face = raccoon_face_fxt[::4, ::4]
    for threshold in (50, 150):
        mask = face > threshold
        graph = img_to_graph(face, mask=mask)
        # the graph components must match the labeled image regions
        assert ndimage.label(mask)[1] == connected_components(graph)[0]
|
||||
|
||||
|
||||
def test_connect_regions_with_grid(raccoon_face_fxt):
    # subsample by 4 to reduce run time
    face = raccoon_face_fxt[::4, ::4]

    mask = face > 50
    graph = grid_to_graph(*face.shape, mask=mask)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]

    # also exercise the dtype=None code path
    mask = face > 150
    graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]
|
||||
|
||||
|
||||
@pytest.fixture
def downsampled_face(raccoon_face_fxt):
    """Raccoon face downsampled twice by 2x2 block summation, scaled to float32."""
    face = raccoon_face_fxt
    for _ in range(2):
        face = face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2] + face[1::2, 1::2]
    face = face.astype(np.float32)
    face /= 16.0
    return face
|
||||
|
||||
|
||||
@pytest.fixture
def orange_face(downsampled_face):
    """Three-channel color image derived from the grayscale downsampled face."""
    rgb = np.zeros(downsampled_face.shape + (3,))
    # each channel attenuates the face by a different factor
    for channel, divisor in enumerate((1, 2, 4)):
        rgb[:, :, channel] = 256 - downsampled_face / divisor
    return rgb
|
||||
|
||||
|
||||
def _make_images(face):
|
||||
# make a collection of faces
|
||||
images = np.zeros((3,) + face.shape)
|
||||
images[0] = face
|
||||
images[1] = face + 1
|
||||
images[2] = face + 2
|
||||
return images
|
||||
|
||||
|
||||
@pytest.fixture
def downsampled_face_collection(downsampled_face):
    """Collection of three shifted copies of the downsampled face."""
    collection = _make_images(downsampled_face)
    return collection
|
||||
|
||||
|
||||
def test_extract_patches_all(downsampled_face):
    """All overlapping patches are extracted when max_patches is unset."""
    img_h, img_w = downsampled_face.shape
    patch_h, patch_w = 16, 16
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)
    patches = extract_patches_2d(downsampled_face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_extract_patches_all_color(orange_face):
    """Color images keep their channel axis in the extracted patches."""
    img_h, img_w = orange_face.shape[:2]
    patch_h, patch_w = 16, 16
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)
    patches = extract_patches_2d(orange_face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w, 3)
|
||||
|
||||
|
||||
def test_extract_patches_all_rect(downsampled_face):
    """Non-square patches on a non-square image are extracted exhaustively."""
    face = downsampled_face[:, 32:97]
    img_h, img_w = face.shape
    patch_h, patch_w = 16, 12
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)

    patches = extract_patches_2d(face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_extract_patches_max_patches(downsampled_face):
    face = downsampled_face
    img_h, img_w = face.shape
    patch_h, patch_w = 16, 16

    # an integer max_patches caps the count exactly
    patches = extract_patches_2d(face, (patch_h, patch_w), max_patches=100)
    assert patches.shape == (100, patch_h, patch_w)

    # a float max_patches is a proportion of all possible patches
    n_expected = int(0.5 * (img_h - patch_h + 1) * (img_w - patch_w + 1))
    patches = extract_patches_2d(face, (patch_h, patch_w), max_patches=0.5)
    assert patches.shape == (n_expected, patch_h, patch_w)

    # out-of-range values are rejected
    for bad_value in (2.0, -1.0):
        with pytest.raises(ValueError):
            extract_patches_2d(face, (patch_h, patch_w), max_patches=bad_value)
|
||||
|
||||
|
||||
def test_extract_patch_same_size_image(downsampled_face):
    # Request patches of the same size as image
    # Should return just the single patch a.k.a. the image
    patches = extract_patches_2d(
        downsampled_face, downsampled_face.shape, max_patches=2
    )
    assert patches.shape[0] == 1
|
||||
|
||||
|
||||
def test_extract_patches_less_than_max_patches(downsampled_face):
    """max_patches above the possible count returns every patch."""
    img_h, img_w = downsampled_face.shape
    patch_h, patch_w = 3 * img_h // 4, 3 * img_w // 4
    # this is 3185
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)

    patches = extract_patches_2d(
        downsampled_face, (patch_h, patch_w), max_patches=4000
    )
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_reconstruct_patches_perfect(downsampled_face):
    """Averaging all overlapping patches reconstructs the image exactly."""
    patch_size = (16, 16)
    patches = extract_patches_2d(downsampled_face, patch_size)
    rebuilt = reconstruct_from_patches_2d(patches, downsampled_face.shape)
    np.testing.assert_array_almost_equal(downsampled_face, rebuilt)
|
||||
|
||||
|
||||
def test_reconstruct_patches_perfect_color(orange_face):
    """Patch reconstruction is lossless for multi-channel images too."""
    patch_size = (16, 16)
    patches = extract_patches_2d(orange_face, patch_size)
    rebuilt = reconstruct_from_patches_2d(patches, orange_face.shape)
    np.testing.assert_array_almost_equal(orange_face, rebuilt)
|
||||
|
||||
|
||||
def test_patch_extractor_fit(downsampled_face_collection, global_random_seed):
    """fit is stateless and returns the estimator itself."""
    extractor = PatchExtractor(
        patch_size=(8, 8), max_patches=100, random_state=global_random_seed
    )
    assert extractor.fit(downsampled_face_collection) == extractor
|
||||
|
||||
|
||||
def test_patch_extractor_max_patches(downsampled_face_collection, global_random_seed):
    faces = downsampled_face_collection
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8

    def extracted_shape(max_patches):
        # build a fresh extractor for each max_patches setting
        extractor = PatchExtractor(
            patch_size=(patch_h, patch_w),
            max_patches=max_patches,
            random_state=global_random_seed,
        )
        return extractor.transform(faces).shape

    # an integer max_patches yields that many patches per image
    assert extracted_shape(100) == (len(faces) * 100, patch_h, patch_w)

    # a float max_patches is a proportion of all patches per image
    n_expected = len(faces) * int((img_h - patch_h + 1) * (img_w - patch_w + 1) * 0.5)
    assert extracted_shape(0.5) == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_patch_extractor_max_patches_default(
    downsampled_face_collection, global_random_seed
):
    """Without patch_size, PatchExtractor picks 1/10th of each dimension."""
    faces = downsampled_face_collection
    extractor = PatchExtractor(max_patches=100, random_state=global_random_seed)
    patches = extractor.transform(faces)
    assert patches.shape == (len(faces) * 100, 19, 25)
|
||||
|
||||
|
||||
def test_patch_extractor_all_patches(downsampled_face_collection, global_random_seed):
    """Without max_patches, every patch of every image is extracted."""
    faces = downsampled_face_collection
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8
    n_expected = len(faces) * (img_h - patch_h + 1) * (img_w - patch_w + 1)
    extractor = PatchExtractor(
        patch_size=(patch_h, patch_w), random_state=global_random_seed
    )
    assert extractor.transform(faces).shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_patch_extractor_color(orange_face, global_random_seed):
    """PatchExtractor preserves the channel axis of color images."""
    faces = _make_images(orange_face)
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8
    n_expected = len(faces) * (img_h - patch_h + 1) * (img_w - patch_w + 1)
    extractor = PatchExtractor(
        patch_size=(patch_h, patch_w), random_state=global_random_seed
    )
    assert extractor.transform(faces).shape == (n_expected, patch_h, patch_w, 3)
|
||||
|
||||
|
||||
def test_extract_patches_strided():
    # Each case: (image_shape, patch_size, patch_step, expected_view, last_patch)
    cases_1D = [
        ((10,), (1,), (1,), (10,), (10,)),
        ((10,), (2,), (1,), (9,), (8,)),
        ((11,), (3,), (4,), (3,), (8,)),
        ((10,), (8,), (2,), (2,), (2,)),
    ]
    cases_2D = [
        ((10, 20), (2, 2), (5, 5), (2, 4), (5, 15)),
        ((10, 20), (10, 10), (3, 10), (1, 2), (0, 10)),
        ((10, 20), (10, 11), (3, 4), (1, 3), (0, 8)),
        ((11, 20), (6, 6), (4, 2), (2, 8), (4, 14)),
    ]
    cases_3D = [
        ((5, 4, 3), (2, 2, 3), (1, 2, 10), (4, 2, 1), (3, 2, 0)),
        ((3, 3, 3), (2, 2, 2), (1, 1, 1), (2, 2, 2), (1, 1, 1)),
        ((7, 8, 9), (1, 7, 3), (2, 1, 3), (4, 2, 3), (6, 1, 6)),
        ((7, 8, 9), (1, 3, 3), (3, 3, 4), (3, 2, 2), (6, 3, 4)),
    ]

    for image_shape, patch_size, patch_step, expected_view, last_patch in (
        cases_1D + cases_2D + cases_3D
    ):
        image = np.arange(np.prod(image_shape)).reshape(image_shape)
        patches = _extract_patches(
            image, patch_shape=patch_size, extraction_step=patch_step
        )

        ndim = len(image_shape)

        # leading axes enumerate the patch grid
        assert patches.shape[:ndim] == expected_view

        # the final patch must start at the expected offset in the image
        last_patch_slices = tuple(
            slice(start, start + length, None)
            for start, length in zip(last_patch, patch_size)
        )
        assert (
            patches[(-1, None, None) * ndim] == image[last_patch_slices].squeeze()
        ).all()
|
||||
|
||||
|
||||
def test_extract_patches_square(downsampled_face):
    # test same patch size for all dimensions
    img_h, img_w = downsampled_face.shape
    patch_side = 8
    patches = _extract_patches(downsampled_face, patch_shape=patch_side)
    assert patches.shape == (
        img_h - patch_side + 1,
        img_w - patch_side + 1,
        patch_side,
        patch_side,
    )
|
||||
|
||||
|
||||
def test_width_patch():
    # width and height of the patch should be less than the image
    image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    for bad_patch_size in ((4, 1), (1, 4)):
        with pytest.raises(ValueError):
            extract_patches_2d(image, bad_patch_size)
|
||||
|
||||
|
||||
def test_patch_extractor_wrong_input(orange_face):
    """Check that an informative error is raised if the patch_size is not valid."""
    faces = _make_images(orange_face)
    extractor = PatchExtractor(patch_size=(8, 8, 8))
    with pytest.raises(ValueError, match="patch_size must be a tuple of two integers"):
        extractor.transform(faces)
|
||||
File diff suppressed because it is too large
Load Diff
2137
venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py
Normal file
2137
venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user