add read me
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
"""Feature extraction from raw data."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from . import image, text
|
||||
from ._dict_vectorizer import DictVectorizer
|
||||
from ._hash import FeatureHasher
|
||||
from .image import grid_to_graph, img_to_graph
|
||||
|
||||
__all__ = [
|
||||
"DictVectorizer",
|
||||
"FeatureHasher",
|
||||
"grid_to_graph",
|
||||
"image",
|
||||
"img_to_graph",
|
||||
"text",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,459 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from array import array
|
||||
from collections.abc import Iterable, Mapping
|
||||
from numbers import Number
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils import metadata_routing
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import check_is_fitted
|
||||
|
||||
|
||||
class DictVectorizer(TransformerMixin, BaseEstimator):
    """Transform lists of feature-value mappings into arrays or sparse matrices.

    Each sample is a mapping (dict-like) from feature names to feature
    values. String values are one-hot ("one-of-K") encoded: a feature "f"
    taking the values "ham" and "spam" becomes the two output features
    "f=ham" and "f=spam". If a value is a sequence or set of strings, the
    occurrences of each string are counted instead. Numeric values are
    passed through unchanged; features absent from a sample get a zero
    entry in the output.

    Note that only string values get the binary one-hot treatment. For
    categorical features represented as numbers or iterables of strings,
    follow this transformer with
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete the binary
    one-hot encoding.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        Value type of the produced array / scipy.sparse matrix; forwarded
        to the constructors as the dtype argument.
    separator : str, default="="
        String inserted between a feature name and its string value when
        building one-hot feature names.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be sorted
        when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g.,
        "f=ham" and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
    """

    # This isn't something that people should be routing / using in a pipeline.
    __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED}

    _parameter_constraints: dict = {
        "dtype": "no_validation",  # validation delegated to numpy,
        "separator": [str],
        "sparse": ["boolean"],
        "sort": ["boolean"],
    }

    def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True):
        self.dtype = dtype
        self.separator = separator
        self.sparse = sparse
        self.sort = sort
|
||||
|
||||
def _add_iterable_element(
    self,
    f,
    v,
    feature_names,
    vocab,
    *,
    fitting=True,
    transforming=False,
    indices=None,
    values=None,
):
    """Register (and optionally encode) each string of an iterable value.

    Every string ``item`` in ``v`` contributes a one-hot feature named
    "<f><separator><item>" with an implicit value of 1. Non-string items
    are rejected. When ``fitting``, unseen names grow the vocabulary;
    when ``transforming``, known names are appended to the CSR buffers.
    """
    for item in v:
        # Guard clause: only iterables of strings are accepted.
        if not isinstance(item, str):
            raise TypeError(
                f"Unsupported type {type(item)} in iterable "
                "value. Only iterables of string are "
                "supported."
            )
        feature_name = "%s%s%s" % (f, self.separator, item)

        if fitting and feature_name not in vocab:
            vocab[feature_name] = len(feature_names)
            feature_names.append(feature_name)

        if transforming and feature_name in vocab:
            indices.append(vocab[feature_name])
            values.append(self.dtype(1))
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """Learn a list of feature name -> indices mappings.

    Parameters
    ----------
    X : Mapping or iterable over Mappings
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

        .. versionchanged:: 0.24
           Accepts multiple string values for one categorical feature.

    y : (ignored)
        Ignored parameter.

    Returns
    -------
    self : object
        DictVectorizer class instance.

    Raises
    ------
    TypeError
        If a feature value has an unsupported type (a Mapping, or a
        non-numeric, non-iterable object).
    """
    feature_names = []
    vocab = {}

    for x in X:
        for f, v in x.items():
            if isinstance(v, str):
                # One-hot encode string values as "<name><separator><value>".
                feature_name = "%s%s%s" % (f, self.separator, v)
            elif isinstance(v, Number) or (v is None):
                feature_name = f
            elif isinstance(v, Mapping):
                raise TypeError(
                    f"Unsupported value type {type(v)} "
                    f"for {f}: {v}.\n"
                    "Mapping objects are not supported."
                )
            elif isinstance(v, Iterable):
                feature_name = None
                self._add_iterable_element(f, v, feature_names, vocab)
            else:
                # BUGFIX: this case previously fell through without
                # assigning feature_name, causing an UnboundLocalError on
                # the first feature or silently reusing the previous
                # feature's name. Raise explicitly, mirroring _transform.
                raise TypeError(
                    f"Unsupported value type {type(v)} "
                    f"for {f}: {v}.\n"
                    f"{type(v)} objects are not supported."
                )

            if feature_name is not None:
                if feature_name not in vocab:
                    vocab[feature_name] = len(feature_names)
                    feature_names.append(feature_name)

    if self.sort:
        feature_names.sort()
        vocab = {f: i for i, f in enumerate(feature_names)}

    self.feature_names_ = feature_names
    self.vocabulary_ = vocab

    return self
|
||||
|
||||
def _transform(self, X, fitting):
    """Shared implementation of transform and fit_transform.

    Builds a CSR matrix in a single pass over X. When ``fitting`` is
    True the vocabulary is grown on the fly; otherwise features unknown
    to the fitted vocabulary are silently dropped.
    """
    # Python's array module cannot explicitly request the signed 32-bit
    # integers scipy.sparse needs; typecode "i" (int) is the closest
    # match, but np.frombuffer misreads the buffer if sizeof(int) != 4.
    assert array("i").itemsize == 4, (
        "sizeof(int) != 4 on your platform; please report this at"
        " https://github.com/scikit-learn/scikit-learn/issues and"
        " include the output from platform.platform() in your bug report"
    )

    dtype = self.dtype
    if fitting:
        feature_names = []
        vocab = {}
    else:
        feature_names = self.feature_names_
        vocab = self.vocabulary_

    transforming = True

    # A single Mapping is treated as a one-sample sequence.
    X = [X] if isinstance(X, Mapping) else X

    indices = array("i")
    indptr = [0]
    # XXX we could change values to an array.array as well, but it
    # would require (heuristic) conversion of dtype to typecode...
    values = []

    # Collect the feature names and assemble the CSR triplet
    # (values, indices, indptr) in the same pass.
    for sample in X:
        for feat, val in sample.items():
            if isinstance(val, str):
                feature_name = "%s%s%s" % (feat, self.separator, val)
                val = 1
            elif isinstance(val, Number) or (val is None):
                feature_name = feat
            elif not isinstance(val, Mapping) and isinstance(val, Iterable):
                feature_name = None
                self._add_iterable_element(
                    feat,
                    val,
                    feature_names,
                    vocab,
                    fitting=fitting,
                    transforming=transforming,
                    indices=indices,
                    values=values,
                )
            else:
                raise TypeError(
                    f"Unsupported value Type {type(val)} "
                    f"for {feat}: {val}.\n"
                    f"{type(val)} objects are not supported."
                )

            if feature_name is not None:
                if fitting and feature_name not in vocab:
                    vocab[feature_name] = len(feature_names)
                    feature_names.append(feature_name)

                if feature_name in vocab:
                    indices.append(vocab[feature_name])
                    values.append(self.dtype(val))

        indptr.append(len(indices))

    if len(indptr) == 1:
        raise ValueError("Sample sequence X is empty.")

    indices = np.frombuffer(indices, dtype=np.intc)
    shape = (len(indptr) - 1, len(vocab))

    result_matrix = sp.csr_matrix(
        (values, indices, indptr), shape=shape, dtype=dtype
    )

    # When fitting with sort=True, permute the columns so that they follow
    # the lexicographic order of the feature names.
    if fitting and self.sort:
        feature_names.sort()
        map_index = np.empty(len(feature_names), dtype=np.int32)
        for new_idx, feat in enumerate(feature_names):
            map_index[new_idx] = vocab[feat]
            vocab[feat] = new_idx
        result_matrix = result_matrix[:, map_index]

    if self.sparse:
        result_matrix.sort_indices()
    else:
        result_matrix = result_matrix.toarray()

    if fitting:
        self.feature_names_ = feature_names
        self.vocabulary_ = vocab

    return result_matrix
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None):
    """Learn the feature name -> index mapping and transform X in one pass.

    Equivalent to fit(X) followed by transform(X), but X is iterated only
    once, so it does not need to be materialized in memory.

    Parameters
    ----------
    X : Mapping or iterable over Mappings
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

        .. versionchanged:: 0.24
           Accepts multiple string values for one categorical feature.

    y : (ignored)
        Ignored parameter.

    Returns
    -------
    Xa : {array, sparse matrix}
        Feature vectors; always 2-d.
    """
    return self._transform(X, fitting=True)
|
||||
|
||||
def inverse_transform(self, X, dict_type=dict):
    """Transform array or sparse matrix X back to feature mappings.

    X must have been produced by this DictVectorizer's transform or
    fit_transform method; it may only have passed through transformers
    that preserve the number of features and their order.

    In the case of one-hot/one-of-K coding, the constructed feature
    names and values are returned rather than the original ones.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample matrix.
    dict_type : type, default=dict
        Constructor for feature mappings. Must conform to the
        collections.Mapping API.

    Returns
    -------
    X_original : list of dict_type objects of shape (n_samples,)
        Feature mappings for the samples in X.
    """
    check_is_fitted(self, "feature_names_")

    # COO matrix is not subscriptable
    X = check_array(X, accept_sparse=["csr", "csc"])
    n_samples = X.shape[0]

    names = self.feature_names_
    dicts = [dict_type() for _ in range(n_samples)]

    if sp.issparse(X):
        # Extract all stored coordinates and values in one vectorized
        # pass rather than one O(log nnz) sparse lookup per entry.
        rows, cols = X.nonzero()
        vals = np.ravel(X[rows, cols])
        for i, j, v in zip(rows, cols, vals):
            dicts[i][names[j]] = v
    else:
        for i, d in enumerate(dicts):
            for j, v in enumerate(X[i, :]):
                if v != 0:
                    # v is already X[i, j]; avoid a redundant second read.
                    d[names[j]] = v

    return dicts
|
||||
|
||||
def transform(self, X):
    """Transform feature->value dicts to array or sparse matrix.

    Feature names that were not encountered during fit or fit_transform
    are silently ignored.

    Parameters
    ----------
    X : Mapping or iterable over Mappings of shape (n_samples,)
        Dict(s) or Mapping(s) from feature names (arbitrary Python
        objects) to feature values (strings or convertible to dtype).

    Returns
    -------
    Xa : {array, sparse matrix}
        Feature vectors; always 2-d.
    """
    check_is_fitted(self, ["feature_names_", "vocabulary_"])
    return self._transform(X, fitting=False)
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Not used, present here for API consistency by convention.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    check_is_fitted(self, "feature_names_")
    # Feature names may be arbitrary hashables; coerce to str only when
    # at least one of them is not already a string.
    if all(isinstance(name, str) for name in self.feature_names_):
        feature_names = self.feature_names_
    else:
        feature_names = [str(name) for name in self.feature_names_]
    return np.asarray(feature_names, dtype=object)
|
||||
|
||||
def restrict(self, support, indices=False):
    """Restrict the features to those in support using feature selection.

    This function modifies the estimator in-place.

    Parameters
    ----------
    support : array-like
        Boolean mask or list of indices (as returned by the get_support
        member of feature selectors).
    indices : bool, default=False
        Whether support is a list of indices.

    Returns
    -------
    self : object
        DictVectorizer class instance.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> v = DictVectorizer()
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
    >>> v.get_feature_names_out()
    array(['bar', 'baz', 'foo'], ...)
    >>> v.restrict(support.get_support())
    DictVectorizer()
    >>> v.get_feature_names_out()
    array(['bar', 'foo'], ...)
    """
    check_is_fitted(self, "feature_names_")

    # Normalize a boolean mask into a list of retained column indices.
    if not indices:
        support = np.where(support)[0]

    names = self.feature_names_
    new_vocab = {names[i]: pos for pos, i in enumerate(support)}

    self.vocabulary_ = new_vocab
    # Rebuild feature_names_ ordered by the new column positions.
    self.feature_names_ = [
        f for f, i in sorted(new_vocab.items(), key=itemgetter(1))
    ]

    return self
|
||||
|
||||
def __sklearn_tags__(self):
    """Declare input tags: this estimator consumes dicts, not 2-d arrays."""
    tags = super().__sklearn_tags__()
    tags.input_tags.two_d_array = False
    tags.input_tags.dict = True
    return tags
|
||||
@@ -0,0 +1,209 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import chain
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils import metadata_routing
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ._hashing_fast import transform as _hashing_transform
|
||||
|
||||
|
||||
def _iteritems(d):
|
||||
"""Like d.iteritems, but accepts any collections.Mapping."""
|
||||
return d.iteritems() if hasattr(d, "iteritems") else d.items()
|
||||
|
||||
|
||||
class FeatureHasher(TransformerMixin, BaseEstimator):
    """Implements feature hashing, aka the hashing trick.

    Sequences of symbolic feature names (strings) are turned into
    scipy.sparse matrices; the column for each name is computed with the
    signed 32-bit variant of MurmurHash3. Byte strings are hashed as-is;
    unicode strings are first encoded to UTF-8, without any Unicode
    normalization. Feature values must be (finite) numbers.

    This class is a low-memory alternative to DictVectorizer and
    CountVectorizer, intended for large-scale (online) learning and
    situations where memory is tight, e.g. when running prediction code
    on embedded devices.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <feature_hashing>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    n_features : int, default=2**20
        The number of features (columns) in the output matrices. Small
        numbers of features are likely to cause hash collisions, but large
        numbers will cause larger coefficient dimensions in linear learners.
    input_type : str, default='dict'
        Choose a string from {'dict', 'pair', 'string'}.
        Either "dict" (the default) to accept dictionaries over
        (feature_name, value); "pair" to accept pairs of
        (feature_name, value); or "string" to accept single strings.
        feature_name should be a string, while value should be a number.
        In the case of "string", a value of 1 is implied.
        The feature_name is hashed to find the appropriate column for the
        feature. The value's sign might be flipped in the output (but see
        non_negative, below).
    dtype : numpy dtype, default=np.float64
        The type of feature values. Passed to scipy.sparse matrix
        constructors as the dtype argument. Do not set this to bool,
        np.boolean or any unsigned integer type.
    alternate_sign : bool, default=True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even
        for small n_features. This approach is similar to sparse random
        projection.

        .. versionchanged:: 0.19
            ``alternate_sign`` replaces the now deprecated ``non_negative``
            parameter.

    See Also
    --------
    DictVectorizer : Vectorizes string-valued features using a hash table.
    sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features.

    Notes
    -----
    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend to call :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.feature_extraction import FeatureHasher
    >>> h = FeatureHasher(n_features=10)
    >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
    >>> f = h.transform(D)
    >>> f.toarray()
    array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
           [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])

    With `input_type="string"`, the input must be an iterable over iterables of
    strings:

    >>> h = FeatureHasher(n_features=8, input_type="string")
    >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
    >>> f = h.transform(raw_X)
    >>> f.toarray()
    array([[ 0.,  0.,  0., -1.,  0., -1.,  0.,  1.],
           [ 0.,  0.,  0., -1.,  0., -1.,  0.,  0.],
           [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  1.]])
    """

    # raw_X should have been called X
    __metadata_request__transform = {"raw_X": metadata_routing.UNUSED}

    _parameter_constraints: dict = {
        "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")],
        "input_type": [StrOptions({"dict", "pair", "string"})],
        "dtype": "no_validation",  # delegate to numpy
        "alternate_sign": ["boolean"],
    }

    def __init__(
        self,
        n_features=(2**20),
        *,
        input_type="dict",
        dtype=np.float64,
        alternate_sign=True,
    ):
        self.dtype = dtype
        self.input_type = input_type
        self.n_features = n_features
        self.alternate_sign = alternate_sign
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X=None, y=None):
    """Validate the estimator's parameters; no state is learned.

    FeatureHasher is stateless, so this method only exists to (i) run
    parameter validation and (ii) keep the scikit-learn transformer API
    consistent.

    Parameters
    ----------
    X : Ignored
        Not used, present here for API consistency by convention.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        FeatureHasher class instance.
    """
    return self
|
||||
|
||||
def transform(self, raw_X):
    """Transform a sequence of instances to a scipy.sparse matrix.

    Parameters
    ----------
    raw_X : iterable over iterable over raw features, length = n_samples
        Samples. Each sample must be iterable an (e.g., a list or tuple)
        containing/generating feature names (and optionally values, see
        the input_type constructor argument) which will be hashed.
        raw_X need not support the len function, so it can be the result
        of a generator; n_samples is determined on the fly.

    Returns
    -------
    X : sparse matrix of shape (n_samples, n_features)
        Feature matrix, for use with estimators or further transformers.
    """
    raw_X = iter(raw_X)
    if self.input_type == "dict":
        # Normalize mappings into (name, value) pair iterators.
        raw_X = (_iteritems(d) for d in raw_X)
    elif self.input_type == "string":
        # Peek at the first sample to reject a bare string early.
        first_sample = next(raw_X)
        if isinstance(first_sample, str):
            raise ValueError(
                "Samples can not be a single string. The input must be an iterable"
                " over iterables of strings."
            )
        rebuilt = chain([first_sample], raw_X)
        # Each string feature carries an implicit value of 1.
        raw_X = (((f, 1) for f in sample) for sample in rebuilt)

    indices, indptr, values = _hashing_transform(
        raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
    )
    n_samples = indptr.shape[0] - 1

    if n_samples == 0:
        raise ValueError("Cannot vectorize empty sequence.")

    X = sp.csr_matrix(
        (values, indices, indptr),
        dtype=self.dtype,
        shape=(n_samples, self.n_features),
    )
    X.sum_duplicates()  # also sorts the indices

    return X
|
||||
|
||||
def __sklearn_tags__(self):
    """Declare input tags based on the configured input_type; no fit needed."""
    tags = super().__sklearn_tags__()
    tags.input_tags.two_d_array = False
    if self.input_type == "string":
        tags.input_tags.string = True
    elif self.input_type == "dict":
        tags.input_tags.dict = True
    tags.requires_fit = False
    return tags
|
||||
Binary file not shown.
@@ -0,0 +1,89 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from libc.stdlib cimport abs
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
cimport numpy as cnp
|
||||
import numpy as np
|
||||
from ..utils._typedefs cimport int32_t, int64_t
|
||||
from ..utils.murmurhash cimport murmurhash3_bytes_s32
|
||||
from ..utils._vector_sentinel cimport vector_to_nd_array
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
def transform(raw_X, Py_ssize_t n_features, dtype,
              bint alternate_sign=1, unsigned int seed=0):
    """Guts of FeatureHasher.transform.

    Returns
    -------
    n_samples : integer
    indices, indptr, values : lists
        For constructing a scipy.sparse.csr_matrix.

    """
    cdef int32_t h
    cdef double value

    # CSR buffers: column indices grow per non-zero entry, indptr gets one
    # entry per sample marking where its row ends.
    cdef vector[int32_t] indices
    cdef vector[int64_t] indptr
    indptr.push_back(0)

    # Since Python array does not understand Numpy dtypes, we grow the indices
    # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
    cdef Py_ssize_t capacity = 8192  # arbitrary
    cdef int64_t size = 0
    cdef cnp.ndarray values = np.empty(capacity, dtype=dtype)

    for x in raw_X:
        for f, v in x:
            if isinstance(v, (str, unicode)):
                # String values are one-hot encoded: the value becomes part
                # of the hashed feature name, with an implicit count of 1.
                f = "%s%s%s" % (f, '=', v)
                value = 1
            else:
                value = v

            # Zero-valued features produce no stored entry.
            if value == 0:
                continue

            if isinstance(f, unicode):
                f = (<unicode>f).encode("utf-8")
            # Need explicit type check because Murmurhash does not propagate
            # all exceptions. Add "except *" there?
            elif not isinstance(f, bytes):
                raise TypeError("feature names must be strings")

            h = murmurhash3_bytes_s32(<bytes>f, seed)

            if h == - 2147483648:
                # abs(-2**31) is undefined behavior because h is a `np.int32`
                # The following is defined such that it is equal to: abs(-2**31) % n_features
                indices.push_back((2147483647 - (n_features - 1)) % n_features)
            else:
                indices.push_back(abs(h) % n_features)
            # improve inner product preservation in the hashed space
            if alternate_sign:
                value *= (h >= 0) * 2 - 1
            values[size] = value
            size += 1

            if size == capacity:
                # Amortized doubling of the values buffer.
                capacity *= 2
                # can't use resize member because there might be multiple
                # references to the arrays due to Cython's error checking
                values = np.resize(values, capacity)

        indptr.push_back(size)

    # Hand the C++ vectors to numpy without copying.
    indices_array = vector_to_nd_array(&indices)
    indptr_array = vector_to_nd_array(&indptr)

    if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
        # both indices and indptr have the same dtype in CSR arrays
        indices_array = indices_array.astype(np.int64, copy=False)
    else:
        indptr_array = indptr_array.astype(np.int32, copy=False)

    # values is truncated to the entries actually written.
    return (indices_array, indptr_array, values[:size])
|
||||
@@ -0,0 +1,328 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
# This list of English stop words is taken from the "Glasgow Information
|
||||
# Retrieval Group". The original list can be found at
|
||||
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
|
||||
# Glasgow Information Retrieval Group English stop-word list (see the
# module header for the source URL). Stored as a frozenset for O(1)
# membership tests; built from a whitespace-separated literal to keep the
# source compact.
ENGLISH_STOP_WORDS = frozenset(
    """
    a about above across after afterwards again against all almost alone
    along already also although always am among amongst amoungst amount an
    and another any anyhow anyone anything anyway anywhere are around as at
    back be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond bill both bottom but by
    call can cannot cant co con could couldnt cry de describe detail do done
    down due during each eg eight either eleven else elsewhere empty enough
    etc even ever every everyone everything everywhere except few fifteen
    fifty fill find fire first five for former formerly forty found four
    from front full further get give go had has hasnt have he hence her here
    hereafter hereby herein hereupon hers herself him himself his how
    however hundred i ie if in inc indeed interest into is it its itself
    keep last latter latterly least less ltd made many may me meanwhile
    might mill mine more moreover most mostly move much must my myself name
    namely neither never nevertheless next nine no nobody none noone nor not
    nothing now nowhere of off often on once one only onto or other others
    otherwise our ours ourselves out over own part per perhaps please put
    rather re same see seem seemed seeming seems serious several she should
    show side since sincere six sixty so some somehow someone something
    sometime sometimes somewhere still such system take ten than that the
    their them themselves then thence there thereafter thereby therefore
    therein thereupon these they thick thin third this those though three
    through throughout thru thus to together too top toward towards twelve
    twenty two un under until up upon us very via was we well were what
    whatever when whence whenever where whereafter whereas whereby wherein
    whereupon wherever whether which while whither who whoever whole whom
    whose why will with within without would yet you your yours yourself
    yourselves
    """.split()
)
|
||||
@@ -0,0 +1,687 @@
|
||||
"""Utilities to extract features from images."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import product
|
||||
from numbers import Integral, Number, Real
|
||||
|
||||
import numpy as np
|
||||
from numpy.lib.stride_tricks import as_strided
|
||||
from scipy import sparse
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _fit_context
|
||||
from ..utils import check_array, check_random_state
|
||||
from ..utils._param_validation import Hidden, Interval, RealNotInt, validate_params
|
||||
|
||||
__all__ = [
|
||||
"PatchExtractor",
|
||||
"extract_patches_2d",
|
||||
"grid_to_graph",
|
||||
"img_to_graph",
|
||||
"reconstruct_from_patches_2d",
|
||||
]
|
||||
|
||||
from ..utils.validation import validate_data
|
||||
|
||||
###############################################################################
|
||||
# From an image to a graph
|
||||
|
||||
|
||||
def _make_edges_3d(n_x, n_y, n_z=1):
|
||||
"""Returns a list of edges for a 3D image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_x : int
|
||||
The size of the grid in the x direction.
|
||||
n_y : int
|
||||
The size of the grid in the y direction.
|
||||
n_z : integer, default=1
|
||||
The size of the grid in the z direction, defaults to 1
|
||||
"""
|
||||
vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z))
|
||||
edges_deep = np.vstack((vertices[:, :, :-1].ravel(), vertices[:, :, 1:].ravel()))
|
||||
edges_right = np.vstack((vertices[:, :-1].ravel(), vertices[:, 1:].ravel()))
|
||||
edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel()))
|
||||
edges = np.hstack((edges_deep, edges_right, edges_down))
|
||||
return edges
|
||||
|
||||
|
||||
def _compute_gradient_3d(edges, img):
|
||||
_, n_y, n_z = img.shape
|
||||
gradient = np.abs(
|
||||
img[
|
||||
edges[0] // (n_y * n_z),
|
||||
(edges[0] % (n_y * n_z)) // n_z,
|
||||
(edges[0] % (n_y * n_z)) % n_z,
|
||||
]
|
||||
- img[
|
||||
edges[1] // (n_y * n_z),
|
||||
(edges[1] % (n_y * n_z)) // n_z,
|
||||
(edges[1] % (n_y * n_z)) % n_z,
|
||||
]
|
||||
)
|
||||
return gradient
|
||||
|
||||
|
||||
# XXX: Why mask the image after computing the weights?
|
||||
|
||||
|
||||
def _mask_edges_weights(mask, edges, weights=None):
    """Apply a mask to edges (weighted or not).

    Drops every edge with at least one endpoint outside the mask and
    renumbers the surviving vertices to a contiguous 0..n-1 range.
    """
    # Flat indices of the voxels kept by the mask.
    inds = np.arange(mask.size)
    inds = inds[mask.ravel()]
    # Keep only the edges whose *both* endpoints survive the mask.
    ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds))
    edges = edges[:, ind_mask]
    if weights is not None:
        weights = weights[ind_mask]
    if len(edges.ravel()):
        maxval = edges.max()
    else:
        maxval = 0
    # order[i] is the rank of flat index i among the kept voxels, so
    # indexing with the old edge endpoints renumbers them contiguously.
    order = np.searchsorted(np.flatnonzero(mask), np.arange(maxval + 1))
    edges = order[edges]
    if weights is None:
        return edges
    else:
        return edges, weights
|
||||
|
||||
|
||||
def _to_graph(
    n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_matrix, dtype=None
):
    """Auxiliary function for img_to_graph and grid_to_graph.

    Builds a symmetric adjacency matrix over the (optionally masked) voxel
    grid; edge weights are intensity gradients when ``img`` is given, unit
    weights otherwise.
    """
    edges = _make_edges_3d(n_x, n_y, n_z)

    if dtype is None:  # To not overwrite input dtype
        if img is None:
            dtype = int
        else:
            dtype = img.dtype

    if img is not None:
        # Weighted graph: edge weights are absolute intensity gradients.
        img = np.atleast_3d(img)
        weights = _compute_gradient_3d(edges, img)
        if mask is not None:
            edges, weights = _mask_edges_weights(mask, edges, weights)
            diag = img.squeeze()[mask]
        else:
            diag = img.ravel()
        n_voxels = diag.size
    else:
        # Connectivity-only graph: every edge gets a unit weight.
        if mask is not None:
            mask = mask.astype(dtype=bool, copy=False)
            edges = _mask_edges_weights(mask, edges)
            n_voxels = np.sum(mask)
        else:
            n_voxels = n_x * n_y * n_z
        weights = np.ones(edges.shape[1], dtype=dtype)
        diag = np.ones(n_voxels, dtype=dtype)

    diag_idx = np.arange(n_voxels)
    # Duplicate every edge in both directions so the matrix is symmetric,
    # then append the diagonal entries.
    i_idx = np.hstack((edges[0], edges[1]))
    j_idx = np.hstack((edges[1], edges[0]))
    graph = sparse.coo_matrix(
        (
            np.hstack((weights, weights, diag)),
            (np.hstack((i_idx, diag_idx)), np.hstack((j_idx, diag_idx))),
        ),
        (n_voxels, n_voxels),
        dtype=dtype,
    )
    if return_as is np.ndarray:
        return graph.toarray()
    return return_as(graph)
|
||||
|
||||
|
||||
@validate_params(
    {
        "img": ["array-like"],
        "mask": [None, np.ndarray],
        "return_as": [type],
        "dtype": "no_validation",  # validation delegated to numpy
    },
    prefer_skip_nested_validation=True,
)
def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None):
    """Graph of the pixel-to-pixel gradient connections.

    Edges are weighted with the gradient values.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    img : array-like of shape (height, width) or (height, width, channel)
        2D or 3D image.
    mask : ndarray of shape (height, width) or \
            (height, width, channel), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=None
        The data of the returned sparse matrix. By default it is the
        dtype of img.

    Returns
    -------
    graph : ndarray or a sparse matrix class
        The computed adjacency matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.feature_extraction.image import img_to_graph
    >>> img = np.array([[0, 0], [0, 1]])
    >>> img_to_graph(img, return_as=np.ndarray)
    array([[0, 0, 0, 0],
           [0, 0, 0, 1],
           [0, 0, 0, 1],
           [0, 1, 1, 1]])
    """
    # Promote 2D grayscale images to 3D with a single channel so the
    # generic 3D helper handles both cases uniformly.
    img = np.atleast_3d(img)
    n_x, n_y, n_z = img.shape
    return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype)
|
||||
|
||||
|
||||
@validate_params(
    {
        "n_x": [Interval(Integral, left=1, right=None, closed="left")],
        "n_y": [Interval(Integral, left=1, right=None, closed="left")],
        "n_z": [Interval(Integral, left=1, right=None, closed="left")],
        "mask": [None, np.ndarray],
        "return_as": [type],
        "dtype": "no_validation",  # validation delegated to numpy
    },
    prefer_skip_nested_validation=True,
)
def grid_to_graph(
    n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int
):
    """Graph of the pixel-to-pixel connections.

    Edges exist if 2 voxels are connected.

    Read more in the :ref:`User Guide <connectivity_graph_image>`.

    Parameters
    ----------
    n_x : int
        Dimension in x axis.
    n_y : int
        Dimension in y axis.
    n_z : int, default=1
        Dimension in z axis.
    mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=int
        The data of the returned sparse matrix. By default it is int.

    Returns
    -------
    graph : np.ndarray or a sparse matrix class
        The computed adjacency matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.feature_extraction.image import grid_to_graph
    >>> shape_img = (4, 4, 1)
    >>> mask = np.zeros(shape=shape_img, dtype=bool)
    >>> mask[[1, 2], [1, 2], :] = True
    >>> graph = grid_to_graph(*shape_img, mask=mask)
    >>> print(graph)
    <COOrdinate sparse matrix of dtype 'int64'
        with 2 stored elements and shape (2, 2)>
      Coords        Values
      (0, 0)        1
      (1, 1)        1
    """
    # No image is passed, so the helper builds a connectivity-only graph
    # with unit edge weights.
    return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as, dtype=dtype)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# From an image to a set of small image patches
|
||||
|
||||
|
||||
def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
|
||||
"""Compute the number of patches that will be extracted in an image.
|
||||
|
||||
Read more in the :ref:`User Guide <image_feature_extraction>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
i_h : int
|
||||
The image height
|
||||
i_w : int
|
||||
The image with
|
||||
p_h : int
|
||||
The height of a patch
|
||||
p_w : int
|
||||
The width of a patch
|
||||
max_patches : int or float, default=None
|
||||
The maximum number of patches to extract. If `max_patches` is a float
|
||||
between 0 and 1, it is taken to be a proportion of the total number
|
||||
of patches. If `max_patches` is None, all possible patches are extracted.
|
||||
"""
|
||||
n_h = i_h - p_h + 1
|
||||
n_w = i_w - p_w + 1
|
||||
all_patches = n_h * n_w
|
||||
|
||||
if max_patches:
|
||||
if isinstance(max_patches, (Integral)) and max_patches < all_patches:
|
||||
return max_patches
|
||||
elif isinstance(max_patches, (Integral)) and max_patches >= all_patches:
|
||||
return all_patches
|
||||
elif isinstance(max_patches, (Real)) and 0 < max_patches < 1:
|
||||
return int(max_patches * all_patches)
|
||||
else:
|
||||
raise ValueError("Invalid value for max_patches: %r" % max_patches)
|
||||
else:
|
||||
return all_patches
|
||||
|
||||
|
||||
def _extract_patches(arr, patch_shape=8, extraction_step=1):
|
||||
"""Extracts patches of any n-dimensional array in place using strides.
|
||||
|
||||
Given an n-dimensional array it will return a 2n-dimensional array with
|
||||
the first n dimensions indexing patch position and the last n indexing
|
||||
the patch content. This operation is immediate (O(1)). A reshape
|
||||
performed on the first n dimensions will cause numpy to copy data, leading
|
||||
to a list of extracted patches.
|
||||
|
||||
Read more in the :ref:`User Guide <image_feature_extraction>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : ndarray
|
||||
n-dimensional array of which patches are to be extracted
|
||||
|
||||
patch_shape : int or tuple of length arr.ndim.default=8
|
||||
Indicates the shape of the patches to be extracted. If an
|
||||
integer is given, the shape will be a hypercube of
|
||||
sidelength given by its value.
|
||||
|
||||
extraction_step : int or tuple of length arr.ndim, default=1
|
||||
Indicates step size at which extraction shall be performed.
|
||||
If integer is given, then the step is uniform in all dimensions.
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
patches : strided ndarray
|
||||
2n-dimensional array indexing patches on first n dimensions and
|
||||
containing patches on the last n dimensions. These dimensions
|
||||
are fake, but this way no data is copied. A simple reshape invokes
|
||||
a copying operation to obtain a list of patches:
|
||||
result.reshape([-1] + list(patch_shape))
|
||||
"""
|
||||
|
||||
arr_ndim = arr.ndim
|
||||
|
||||
if isinstance(patch_shape, Number):
|
||||
patch_shape = tuple([patch_shape] * arr_ndim)
|
||||
if isinstance(extraction_step, Number):
|
||||
extraction_step = tuple([extraction_step] * arr_ndim)
|
||||
|
||||
patch_strides = arr.strides
|
||||
|
||||
slices = tuple(slice(None, None, st) for st in extraction_step)
|
||||
indexing_strides = arr[slices].strides
|
||||
|
||||
patch_indices_shape = (
|
||||
(np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step)
|
||||
) + 1
|
||||
|
||||
shape = tuple(list(patch_indices_shape) + list(patch_shape))
|
||||
strides = tuple(list(indexing_strides) + list(patch_strides))
|
||||
|
||||
patches = as_strided(arr, shape=shape, strides=strides)
|
||||
return patches
|
||||
|
||||
|
||||
@validate_params(
    {
        "image": [np.ndarray],
        "patch_size": [tuple, list],
        "max_patches": [
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(Integral, 1, None, closed="left"),
            None,
        ],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None):
    """Reshape a 2D image into a collection of patches.

    The resulting patches are allocated in a dedicated array.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    image : ndarray of shape (image_height, image_width) or \
        (image_height, image_width, n_channels)
        The original image data. For color images, the last dimension specifies
        the channel: a RGB image would have `n_channels=3`.

    patch_size : tuple of int (patch_height, patch_width)
        The dimensions of one patch.

    max_patches : int or float, default=None
        The maximum number of patches to extract. If `max_patches` is a float
        between 0 and 1, it is taken to be a proportion of the total number
        of patches. If `max_patches` is None it corresponds to the total number
        of patches that can be extracted.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches` is not None. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    patches : array of shape (n_patches, patch_height, patch_width) or \
        (n_patches, patch_height, patch_width, n_channels)
        The collection of patches extracted from the image, where `n_patches`
        is either `max_patches` or the total number of patches that can be
        extracted.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the first image in this dataset:
    >>> one_image = load_sample_image("china.jpg")
    >>> print('Image shape: {}'.format(one_image.shape))
    Image shape: (427, 640, 3)
    >>> patches = image.extract_patches_2d(one_image, (2, 2))
    >>> print('Patches shape: {}'.format(patches.shape))
    Patches shape: (272214, 2, 2, 3)
    >>> # Here are just two of these patches:
    >>> print(patches[1])
    [[[174 201 231]
      [174 201 231]]
     [[173 200 230]
      [173 200 230]]]
    >>> print(patches[800])
    [[[187 214 243]
      [188 215 244]]
     [[187 214 243]
      [188 215 244]]]
    """
    i_h, i_w = image.shape[:2]
    p_h, p_w = patch_size

    if p_h > i_h:
        raise ValueError(
            "Height of the patch should be less than the height of the image."
        )

    if p_w > i_w:
        raise ValueError(
            "Width of the patch should be less than the width of the image."
        )

    image = check_array(image, allow_nd=True)
    # Normalize to 3D (height, width, n_channels) so grayscale and color
    # images share the same code path.
    image = image.reshape((i_h, i_w, -1))
    n_colors = image.shape[-1]

    # Zero-copy strided view over every possible patch position.
    extracted_patches = _extract_patches(
        image, patch_shape=(p_h, p_w, n_colors), extraction_step=1
    )

    n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
    if max_patches:
        # Sample patch origins uniformly at random (with replacement).
        rng = check_random_state(random_state)
        i_s = rng.randint(i_h - p_h + 1, size=n_patches)
        j_s = rng.randint(i_w - p_w + 1, size=n_patches)
        patches = extracted_patches[i_s, j_s, 0]
    else:
        patches = extracted_patches

    # The reshape copies the selected patches out of the strided view.
    patches = patches.reshape(-1, p_h, p_w, n_colors)
    # remove the color dimension if useless
    if patches.shape[-1] == 1:
        return patches.reshape((n_patches, p_h, p_w))
    else:
        return patches
|
||||
|
||||
|
||||
@validate_params(
    {"patches": [np.ndarray], "image_size": [tuple, Hidden(list)]},
    prefer_skip_nested_validation=True,
)
def reconstruct_from_patches_2d(patches, image_size):
    """Reconstruct the image from all of its patches.

    Patches are assumed to overlap and the image is constructed by filling in
    the patches from left to right, top to bottom, averaging the overlapping
    regions.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    patches : ndarray of shape (n_patches, patch_height, patch_width) or \
        (n_patches, patch_height, patch_width, n_channels)
        The complete set of patches. If the patches contain colour information,
        channels are indexed along the last dimension: RGB patches would
        have `n_channels=3`.

    image_size : tuple of int (image_height, image_width) or \
        (image_height, image_width, n_channels)
        The size of the image that will be reconstructed.

    Returns
    -------
    image : ndarray of shape image_size
        The reconstructed image.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> from sklearn.feature_extraction import image
    >>> one_image = load_sample_image("china.jpg")
    >>> print('Image shape: {}'.format(one_image.shape))
    Image shape: (427, 640, 3)
    >>> image_patches = image.extract_patches_2d(image=one_image, patch_size=(10, 10))
    >>> print('Patches shape: {}'.format(image_patches.shape))
    Patches shape: (263758, 10, 10, 3)
    >>> image_reconstructed = image.reconstruct_from_patches_2d(
    ...     patches=image_patches,
    ...     image_size=one_image.shape
    ... )
    >>> print(f"Reconstructed shape: {image_reconstructed.shape}")
    Reconstructed shape: (427, 640, 3)
    """
    i_h, i_w = image_size[:2]
    p_h, p_w = patches.shape[1:3]
    img = np.zeros(image_size)
    # compute the dimensions of the patches array
    n_h = i_h - p_h + 1
    n_w = i_w - p_w + 1
    # Accumulate every patch at its origin; overlapping regions add up and
    # are normalized afterwards.
    for p, (i, j) in zip(patches, product(range(n_h), range(n_w))):
        img[i : i + p_h, j : j + p_w] += p

    for i in range(i_h):
        for j in range(i_w):
            # divide by the amount of overlap
            # XXX: is this the most efficient way? memory-wise yes, cpu wise?
            # min(i + 1, p_h, i_h - i) counts how many patches cover row i
            # (fewer near the borders), and likewise for column j.
            img[i, j] /= float(min(i + 1, p_h, i_h - i) * min(j + 1, p_w, i_w - j))
    return img
|
||||
|
||||
|
||||
class PatchExtractor(TransformerMixin, BaseEstimator):
    """Extracts patches from a collection of images.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    .. versionadded:: 0.9

    Parameters
    ----------
    patch_size : tuple of int (patch_height, patch_width), default=None
        The dimensions of one patch. If set to None, the patch size will be
        automatically set to `(img_height // 10, img_width // 10)`, where
        `img_height` and `img_width` are the dimensions of the input images.

    max_patches : int or float, default=None
        The maximum number of patches per image to extract. If `max_patches` is
        a float in (0, 1), it is taken to mean a proportion of the total number
        of patches. If set to None, extract all possible patches.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches is not None`. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    See Also
    --------
    reconstruct_from_patches_2d : Reconstruct image from all of its patches.

    Notes
    -----
    This estimator is stateless and does not need to be fitted. However, we
    recommend to call :meth:`fit_transform` instead of :meth:`transform`, as
    parameter validation is only performed in :meth:`fit`.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_images
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the second image in this dataset:
    >>> X = load_sample_images().images[1]
    >>> X = X[None, ...]
    >>> print(f"Image shape: {X.shape}")
    Image shape: (1, 427, 640, 3)
    >>> pe = image.PatchExtractor(patch_size=(10, 10))
    >>> pe_trans = pe.transform(X)
    >>> print(f"Patches shape: {pe_trans.shape}")
    Patches shape: (263758, 10, 10, 3)
    >>> X_reconstructed = image.reconstruct_from_patches_2d(pe_trans, X.shape[1:])
    >>> print(f"Reconstructed shape: {X_reconstructed.shape}")
    Reconstructed shape: (427, 640, 3)
    """

    _parameter_constraints: dict = {
        "patch_size": [tuple, None],
        "max_patches": [
            None,
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(Integral, 1, None, closed="left"),
        ],
        "random_state": ["random_state"],
    }

    def __init__(self, *, patch_size=None, max_patches=None, random_state=None):
        self.patch_size = patch_size
        self.max_patches = max_patches
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Only validate the parameters of the estimator.

        This method allows to: (i) validate the parameters of the estimator and
        (ii) be consistent with the scikit-learn transformer API.

        Parameters
        ----------
        X : ndarray of shape (n_samples, image_height, image_width) or \
                (n_samples, image_height, image_width, n_channels)
            Array of images from which to extract patches. For color images,
            the last dimension specifies the channel: a RGB image would have
            `n_channels=3`.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        return self

    def transform(self, X):
        """Transform the image samples in `X` into a matrix of patch data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, image_height, image_width) or \
                (n_samples, image_height, image_width, n_channels)
            Array of images from which to extract patches. For color images,
            the last dimension specifies the channel: a RGB image would have
            `n_channels=3`.

        Returns
        -------
        patches : array of shape (n_patches, patch_height, patch_width) or \
                (n_patches, patch_height, patch_width, n_channels)
            The collection of patches extracted from the images, where
            `n_patches` is either `n_samples * max_patches` or the total
            number of patches that can be extracted.
        """
        X = validate_data(
            self,
            X=X,
            ensure_2d=False,
            allow_nd=True,
            ensure_min_samples=1,
            ensure_min_features=1,
            reset=False,
        )
        random_state = check_random_state(self.random_state)
        n_imgs, img_height, img_width = X.shape[:3]
        if self.patch_size is None:
            # Default patch size: a tenth of the image along each axis.
            patch_size = img_height // 10, img_width // 10
        else:
            if len(self.patch_size) != 2:
                raise ValueError(
                    "patch_size must be a tuple of two integers. Got"
                    f" {self.patch_size} instead."
                )
            patch_size = self.patch_size

        # Normalize to 4D (n_imgs, height, width, n_channels) so grayscale
        # and color images share the same code path. The reshape keeps the
        # first three dimensions, so the unpacked sizes above remain valid
        # (the original code recomputed them redundantly here).
        X = np.reshape(X, (n_imgs, img_height, img_width, -1))
        n_channels = X.shape[-1]

        # compute the dimensions of the patches array
        patch_height, patch_width = patch_size
        n_patches = _compute_n_patches(
            img_height, img_width, patch_height, patch_width, self.max_patches
        )
        patches_shape = (n_imgs * n_patches,) + patch_size
        if n_channels > 1:
            patches_shape += (n_channels,)

        # extract the patches
        patches = np.empty(patches_shape)
        for ii, image in enumerate(X):
            patches[ii * n_patches : (ii + 1) * n_patches] = extract_patches_2d(
                image,
                patch_size,
                max_patches=self.max_patches,
                random_state=random_state,
            )
        return patches

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.two_d_array = False
        tags.input_tags.three_d_array = True
        tags.requires_fit = False
        return tags
|
||||
@@ -0,0 +1,7 @@
|
||||
# Build the `_hashing_fast` extension module from its Cython source
# (compiled to C++ via cython_gen_cpp) and install it into the
# sklearn/feature_extraction package directory.
py.extension_module(
  '_hashing_fast',
  [cython_gen_cpp.process('_hashing_fast.pyx'), utils_cython_tree],
  dependencies: [np_dep],
  subdir: 'sklearn/feature_extraction',
  install: true
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,261 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from random import Random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse", (True, False))
@pytest.mark.parametrize("dtype", (int, np.float32, np.int16))
@pytest.mark.parametrize("sort", (True, False))
@pytest.mark.parametrize("iterable", (True, False))
def test_dictvectorizer(sparse, dtype, sort, iterable):
    # Round-trip check of DictVectorizer for every combination of output
    # sparsity, dtype, feature sorting, and iterable vs. list input.
    D = [{"foo": 1, "bar": 3}, {"bar": 4, "baz": 2}, {"bar": 1, "quux": 1, "quuux": 2}]

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(iter(D) if iterable else D)

    assert sp.issparse(X) == sparse
    assert X.shape == (3, 5)
    assert X.sum() == 14
    # inverse_transform should recover the original feature dicts.
    assert v.inverse_transform(X) == D

    if sparse:
        # CSR matrices can't be compared for equality
        assert_array_equal(
            X.toarray(), v.transform(iter(D) if iterable else D).toarray()
        )
    else:
        assert_array_equal(X, v.transform(iter(D) if iterable else D))

    if sort:
        assert v.feature_names_ == sorted(v.feature_names_)
|
||||
|
||||
|
||||
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)], useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)], useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        # restrict() must keep only the selected features, whether the
        # support is given as a boolean mask or as integer indices.
        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_array_equal(v.get_feature_names_out(), ["useful1", "useful2"])
|
||||
|
||||
|
||||
def test_one_of_k():
    # String feature values trigger one-of-K encoding as "name=value"
    # features; numeric values are kept as-is.
    D_in = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": 0.3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert X.shape == (3, 5)

    D_out = v.inverse_transform(X)
    assert D_out[0] == {"version=1": 1, "ham": 2}

    # Only the expanded "name=value" names appear, not the raw name.
    names = v.get_feature_names_out()
    assert "version=2" in names
    assert "version" not in names
|
||||
|
||||
|
||||
def test_iterable_value():
    # An iterable of strings as a feature value is expanded into counts of
    # "name=value" features (e.g. ["1", "2", "1"] -> version=1: 2, version=2: 1).
    D_names = ["ham", "spam", "version=1", "version=2", "version=3"]
    X_expected = [
        [2.0, 0.0, 2.0, 1.0, 0.0],
        [0.0, 0.3, 0.0, 1.0, 0.0],
        [0.0, -1.0, 0.0, 0.0, 1.0],
    ]
    D_in = [
        {"version": ["1", "2", "1"], "ham": 2},
        {"version": "2", "spam": 0.3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    X = X.toarray()
    assert_array_equal(X, X_expected)

    D_out = v.inverse_transform(X)
    assert D_out[0] == {"version=1": 2, "version=2": 1, "ham": 2}

    names = v.get_feature_names_out()

    assert_array_equal(names, D_names)
|
||||
|
||||
|
||||
def test_iterable_not_string_error():
    # Non-string items inside an iterable feature value must be rejected
    # with a TypeError carrying this exact message.
    error_value = (
        "Unsupported type <class 'int'> in iterable value. "
        "Only iterables of string are supported."
    )
    D2 = [{"foo": "1", "bar": "2"}, {"foo": "3", "baz": "1"}, {"foo": [1, "three"]}]
    v = DictVectorizer(sparse=False)
    with pytest.raises(TypeError) as error:
        v.fit(D2)
    assert str(error.value) == error_value
|
||||
|
||||
|
||||
def test_mapping_error():
    # Nested mappings are not supported as feature values and must raise
    # a TypeError with this exact message.
    error_value = (
        "Unsupported value type <class 'dict'> "
        "for foo: {'one': 1, 'three': 3}.\n"
        "Mapping objects are not supported."
    )
    D2 = [
        {"foo": "1", "bar": "2"},
        {"foo": "3", "baz": "1"},
        {"foo": {"one": 1, "three": 3}},
    ]
    v = DictVectorizer(sparse=False)
    with pytest.raises(TypeError) as error:
        v.fit(D2)
    assert str(error.value) == error_value
|
||||
|
||||
|
||||
def test_unseen_or_no_features():
    # Features unseen at fit time are silently dropped at transform time,
    # and an empty dict yields an all-zero row; an empty sequence of
    # samples is an error.
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        with pytest.raises(ValueError, match="empty"):
            v.transform([])
|
||||
|
||||
|
||||
def test_deterministic_vocabulary(global_random_seed):
    # The learned vocabulary should depend only on the dict contents,
    # not on the insertion order of the keys.
    # Generate equal dictionaries with different memory layouts
    items = [("%03d" % i, i) for i in range(1000)]
    rng = Random(global_random_seed)
    d_sorted = dict(items)
    rng.shuffle(items)
    d_shuffled = dict(items)

    # check that the memory layout does not impact the resulting vocabulary
    v_1 = DictVectorizer().fit([d_sorted])
    v_2 = DictVectorizer().fit([d_shuffled])

    assert v_1.vocabulary_ == v_2.vocabulary_
|
||||
|
||||
|
||||
def test_n_features_in():
    # For vectorizers, n_features_in_ does not make sense and does not exist.
    dv = DictVectorizer()
    assert not hasattr(dv, "n_features_in_")
    d = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
    dv.fit(d)
    # Fitting must not create the attribute either.
    assert not hasattr(dv, "n_features_in_")
|
||||
|
||||
|
||||
def test_dictvectorizer_dense_sparse_equivalence():
    """Check the equivalence between between sparse and dense DictVectorizer.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19978
    """
    movie_entry_fit = [
        {"category": ["thriller", "drama"], "year": 2003},
        {"category": ["animation", "family"], "year": 2011},
        {"year": 1974},
    ]
    movie_entry_transform = [{"category": ["thriller"], "unseen_feature": "3"}]
    dense_vectorizer = DictVectorizer(sparse=False)
    sparse_vectorizer = DictVectorizer(sparse=True)

    dense_vector_fit = dense_vectorizer.fit_transform(movie_entry_fit)
    sparse_vector_fit = sparse_vectorizer.fit_transform(movie_entry_fit)

    assert not sp.issparse(dense_vector_fit)
    assert sp.issparse(sparse_vector_fit)

    # fit_transform must produce the same values in both representations.
    assert_allclose(dense_vector_fit, sparse_vector_fit.toarray())

    dense_vector_transform = dense_vectorizer.transform(movie_entry_transform)
    sparse_vector_transform = sparse_vectorizer.transform(movie_entry_transform)

    assert not sp.issparse(dense_vector_transform)
    assert sp.issparse(sparse_vector_transform)

    # transform must agree as well, including unseen-feature handling.
    assert_allclose(dense_vector_transform, sparse_vector_transform.toarray())

    dense_inverse_transform = dense_vectorizer.inverse_transform(dense_vector_transform)
    sparse_inverse_transform = sparse_vectorizer.inverse_transform(
        sparse_vector_transform
    )

    # And so must the inverse transform of the transformed entries.
    expected_inverse = [{"category=thriller": 1.0}]
    assert dense_inverse_transform == expected_inverse
    assert sparse_inverse_transform == expected_inverse
|
||||
|
||||
|
||||
def test_dict_vectorizer_unsupported_value_type():
    """A TypeError is raised for feature values DictVectorizer cannot encode.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19489
    """

    class UnsupportedObject:
        pass

    samples = [{"foo": UnsupportedObject()}]
    with pytest.raises(TypeError, match="Unsupported value Type"):
        DictVectorizer(sparse=True).fit_transform(samples)
|
||||
|
||||
|
||||
def test_dict_vectorizer_get_feature_names_out():
    """Integer feature names are converted to strings in feature_names_out."""
    vectorizer = DictVectorizer(sparse=False).fit([{1: 2, 3: 4}, {2: 4}])

    names = vectorizer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ["1", "2", "3"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, input",
    [
        ("transform", [{1: 2, 3: 4}, {2: 4}]),
        ("inverse_transform", [{1: 2, 3: 4}, {2: 4}]),
        ("restrict", [True, False, True]),
    ],
)
def test_dict_vectorizer_not_fitted_error(method, input):
    """Check that unfitted DictVectorizer instance raises NotFittedError.

    This should be part of the common test but currently they test estimator accepting
    text input.
    """
    unfitted = DictVectorizer(sparse=False)
    bound_method = getattr(unfitted, method)
    with pytest.raises(NotFittedError):
        bound_method(input)
|
||||
@@ -0,0 +1,175 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.feature_extraction import FeatureHasher
|
||||
from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform
|
||||
|
||||
|
||||
def test_feature_hasher_dicts():
    """The default input_type is dict and it matches the pair encoding."""
    assert FeatureHasher(n_features=16).input_type == "dict"

    samples = [{"foo": "bar", "dada": 42, "tzara": 37}, {"foo": "baz", "gaga": "string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(samples)
    pair_iters = (iter(d.items()) for d in samples)
    from_pairs = FeatureHasher(n_features=16, input_type="pair").transform(pair_iters)
    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
|
||||
|
||||
|
||||
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2**exponent

        hasher = FeatureHasher(
            n_features=n_features, input_type="string", alternate_sign=False
        )
        # feed an iterable rather than a list to exercise the generic path
        X = hasher.transform(sample for sample in samples)

        assert X.shape == (len(samples), n_features)

        # duplicate "foo" in row 0 accumulates, so sums count raw tokens
        assert X[0].sum() == 4
        assert X[1].sum() == 3

        assert X.nnz == 6
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw_X",
    [
        ["my_string", "another_string"],
        (x for x in ["my_string", "another_string"]),
    ],
    ids=["list", "generator"],
)
def test_feature_hasher_single_string(raw_X):
    """FeatureHasher raises error when a sample is a single string.

    Non-regression test for gh-13199.
    """
    hasher = FeatureHasher(n_features=10, input_type="string")
    with pytest.raises(ValueError, match="Samples can not be a single string"):
        hasher.transform(raw_X)
|
||||
|
||||
|
||||
def test_hashing_transform_seed():
    # check the influence of the seed when computing the hashes
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]

    def as_pairs():
        # fresh generator of (token, weight) pairs for each call
        return (((token, 1) for token in sample) for sample in samples)

    indices, indptr, _ = _hashing_transform(as_pairs(), 2**7, str, False)

    # seed=0 is the default: identical output expected
    indices_0, indptr_0, _ = _hashing_transform(as_pairs(), 2**7, str, False, seed=0)
    assert_array_equal(indices, indices_0)
    assert_array_equal(indptr, indptr_0)

    # a different seed must relocate the hashed features
    indices_1, _, _ = _hashing_transform(as_pairs(), 2**7, str, False, seed=1)
    with pytest.raises(AssertionError):
        assert_array_equal(indices, indices_1)
|
||||
|
||||
|
||||
def test_feature_hasher_pairs():
    samples = [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}]
    hasher = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    # compare magnitudes only: alternate_sign may flip individual entries
    assert sorted(np.abs(x1[x1 != 0])) == [1, 2]
    assert sorted(np.abs(x2[x2 != 0])) == [1, 3, 4]
|
||||
|
||||
|
||||
def test_feature_hasher_pairs_with_string_values():
    hasher = FeatureHasher(n_features=16, input_type="pair")

    samples = [{"foo": 1, "bar": "a"}, {"baz": "abc", "quux": 4, "foo": -1}]
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    # string values are hashed with an implicit weight of 1
    assert sorted(np.abs(x1[x1 != 0])) == [1, 1]
    assert sorted(np.abs(x2[x2 != 0])) == [1, 1, 4]

    # identical rows must hash identically
    samples = [{"bax": "abc"}, {"bax": "abc"}]
    x1, x2 = hasher.transform(iter(d.items()) for d in samples).toarray()
    assert [1] == np.abs(x1[x1 != 0])
    assert [1] == np.abs(x2[x2 != 0])
    assert_array_equal(x1, x2)
|
||||
|
||||
|
||||
def test_hash_empty_input():
    """Empty samples of any iterable type map to all-zero rows."""
    n_features = 16
    empty_samples = [[], (), iter(range(0))]

    X = FeatureHasher(n_features=n_features, input_type="string").transform(
        empty_samples
    )

    assert_array_equal(X.toarray(), np.zeros((len(empty_samples), n_features)))
|
||||
|
||||
|
||||
def test_hasher_zeros():
    """Zero-valued features must not be materialized in the sparse output."""
    Xt = FeatureHasher().transform([{"foo": 0}])
    assert Xt.data.shape == (0,)
|
||||
|
||||
|
||||
def test_hasher_alternate_sign():
    """alternate_sign=True yields mixed signs; False keeps everything positive."""
    tokens = [list("Thequickbrownfoxjumped")]

    signed = FeatureHasher(alternate_sign=True, input_type="string").fit_transform(
        tokens
    )
    assert signed.data.min() < 0 and signed.data.max() > 0

    unsigned = FeatureHasher(alternate_sign=False, input_type="string").fit_transform(
        tokens
    )
    assert unsigned.data.min() > 0
|
||||
|
||||
|
||||
def test_hash_collisions():
    tokens = [list("Thequickbrownfoxjumped")]

    # with a single bucket every token collides; alternating signs means
    # some contributions cancel out
    signed = FeatureHasher(
        alternate_sign=True, n_features=1, input_type="string"
    ).fit_transform(tokens)
    assert abs(signed.data[0]) < len(tokens[0])

    # without sign alternation the collisions simply accumulate
    unsigned = FeatureHasher(
        alternate_sign=False, n_features=1, input_type="string"
    ).fit_transform(tokens)
    assert unsigned.data[0] == len(tokens[0])
|
||||
|
||||
|
||||
def test_feature_hasher_requires_fit_tag():
    """Test that FeatureHasher has requires_fit=False tag."""
    assert not FeatureHasher().__sklearn_tags__().requires_fit
|
||||
|
||||
|
||||
def test_feature_hasher_transform_without_fit():
    """Test that FeatureHasher can transform without fitting."""
    samples = [{"dog": 1, "cat": 2}, {"dog": 2, "run": 5}]
    hashed = FeatureHasher(n_features=10).transform(samples)
    assert hashed.shape == (2, 10)
|
||||
@@ -0,0 +1,359 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import ndimage
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from sklearn.feature_extraction.image import (
|
||||
PatchExtractor,
|
||||
_extract_patches,
|
||||
extract_patches_2d,
|
||||
grid_to_graph,
|
||||
img_to_graph,
|
||||
reconstruct_from_patches_2d,
|
||||
)
|
||||
|
||||
|
||||
def test_img_to_graph():
    x, y = np.mgrid[:4, :4] - 10
    graph_x = img_to_graph(x)
    graph_y = img_to_graph(y)
    assert graph_x.nnz == graph_y.nnz
    # Negative elements are the diagonal: the elements of the original
    # image. Positive elements are the values of the gradient, they
    # should all be equal on grad_x and grad_y
    positive_x = graph_x.data[graph_x.data > 0]
    positive_y = graph_y.data[graph_y.data > 0]
    np.testing.assert_array_equal(positive_x, positive_y)
|
||||
|
||||
|
||||
def test_img_to_graph_sparse():
    # Check that the edges are in the right position
    # when using a sparse image with a singleton component
    mask = np.zeros((2, 3), dtype=bool)
    mask[0, 0] = True
    mask[:, 2] = True

    image = np.zeros((2, 3))
    image[0, 0], image[0, 2], image[1, 2] = 1, -1, -2

    graph = img_to_graph(image, mask=mask).todense()
    expected = np.array([[1, 0, 0], [0, -1, 1], [0, 1, -2]])
    np.testing.assert_array_equal(graph, expected)
|
||||
|
||||
|
||||
def test_grid_to_graph():
    # Checking that the function works with graphs containing no edges
    size = 2
    roi_size = 1
    # Generating two convex parts with one vertex
    # Thus, edges will be empty in _to_graph
    mask = np.zeros((size, size), dtype=bool)
    mask[:roi_size, :roi_size] = True
    mask[-roi_size:, -roi_size:] = True
    A = grid_to_graph(
        n_x=size, n_y=size, mask=mask.reshape(size**2), return_as=np.ndarray
    )
    assert connected_components(A)[0] == 2

    # check ordering
    mask = np.zeros((2, 3), dtype=bool)
    mask[0, 0] = True
    mask[:, 2] = True
    graph = grid_to_graph(2, 3, 1, mask=mask.ravel()).todense()
    expected = np.array([[1, 0, 0], [0, 1, 1], [0, 1, 1]])
    np.testing.assert_array_equal(graph, expected)

    # Checking that the function works whatever the type of mask is
    int_mask = np.ones((size, size), dtype=np.int16)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=int_mask)
    assert connected_components(A)[0] == 1

    # Checking dtype of the graph
    float_mask = np.ones((size, size))
    for requested_dtype in (bool, int, np.float64):
        A = grid_to_graph(
            n_x=size, n_y=size, n_z=size, mask=float_mask, dtype=requested_dtype
        )
        assert A.dtype == requested_dtype
|
||||
|
||||
|
||||
def test_connect_regions(raccoon_face_fxt):
    # subsample by 4 to reduce run time
    face = raccoon_face_fxt[::4, ::4]
    for threshold in (50, 150):
        mask = face > threshold
        graph = img_to_graph(face, mask=mask)
        # the graph components must match the labeled image regions
        assert ndimage.label(mask)[1] == connected_components(graph)[0]
|
||||
|
||||
|
||||
def test_connect_regions_with_grid(raccoon_face_fxt):
    # subsample by 4 to reduce run time
    face = raccoon_face_fxt[::4, ::4]

    mask = face > 50
    graph = grid_to_graph(*face.shape, mask=mask)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]

    # also exercise the dtype=None code path
    mask = face > 150
    graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]
|
||||
|
||||
|
||||
@pytest.fixture
def downsampled_face(raccoon_face_fxt):
    """Raccoon face downsampled twice by 2x2 block summation, scaled to float32."""
    face = raccoon_face_fxt
    for _ in range(2):
        face = face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2] + face[1::2, 1::2]
    face = face.astype(np.float32)
    face /= 16.0
    return face
|
||||
|
||||
|
||||
@pytest.fixture
def orange_face(downsampled_face):
    """Three-channel color image derived from the grayscale downsampled face."""
    rgb = np.zeros(downsampled_face.shape + (3,))
    # each channel attenuates the face by a different factor
    for channel, divisor in enumerate((1, 2, 4)):
        rgb[:, :, channel] = 256 - downsampled_face / divisor
    return rgb
|
||||
|
||||
|
||||
def _make_images(face):
|
||||
# make a collection of faces
|
||||
images = np.zeros((3,) + face.shape)
|
||||
images[0] = face
|
||||
images[1] = face + 1
|
||||
images[2] = face + 2
|
||||
return images
|
||||
|
||||
|
||||
@pytest.fixture
def downsampled_face_collection(downsampled_face):
    """Collection of three shifted copies of the downsampled face."""
    collection = _make_images(downsampled_face)
    return collection
|
||||
|
||||
|
||||
def test_extract_patches_all(downsampled_face):
    """All overlapping patches are extracted when max_patches is unset."""
    img_h, img_w = downsampled_face.shape
    patch_h, patch_w = 16, 16
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)
    patches = extract_patches_2d(downsampled_face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_extract_patches_all_color(orange_face):
    """Color images keep their channel axis in the extracted patches."""
    img_h, img_w = orange_face.shape[:2]
    patch_h, patch_w = 16, 16
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)
    patches = extract_patches_2d(orange_face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w, 3)
|
||||
|
||||
|
||||
def test_extract_patches_all_rect(downsampled_face):
    """Non-square patches on a non-square image are extracted exhaustively."""
    face = downsampled_face[:, 32:97]
    img_h, img_w = face.shape
    patch_h, patch_w = 16, 12
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)

    patches = extract_patches_2d(face, (patch_h, patch_w))
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_extract_patches_max_patches(downsampled_face):
    face = downsampled_face
    img_h, img_w = face.shape
    patch_h, patch_w = 16, 16

    # an integer max_patches caps the count exactly
    patches = extract_patches_2d(face, (patch_h, patch_w), max_patches=100)
    assert patches.shape == (100, patch_h, patch_w)

    # a float max_patches is a proportion of all possible patches
    n_expected = int(0.5 * (img_h - patch_h + 1) * (img_w - patch_w + 1))
    patches = extract_patches_2d(face, (patch_h, patch_w), max_patches=0.5)
    assert patches.shape == (n_expected, patch_h, patch_w)

    # out-of-range values are rejected
    for bad_value in (2.0, -1.0):
        with pytest.raises(ValueError):
            extract_patches_2d(face, (patch_h, patch_w), max_patches=bad_value)
|
||||
|
||||
|
||||
def test_extract_patch_same_size_image(downsampled_face):
    # Request patches of the same size as image
    # Should return just the single patch a.k.a. the image
    patches = extract_patches_2d(
        downsampled_face, downsampled_face.shape, max_patches=2
    )
    assert patches.shape[0] == 1
|
||||
|
||||
|
||||
def test_extract_patches_less_than_max_patches(downsampled_face):
    """max_patches above the possible count returns every patch."""
    img_h, img_w = downsampled_face.shape
    patch_h, patch_w = 3 * img_h // 4, 3 * img_w // 4
    # this is 3185
    n_expected = (img_h - patch_h + 1) * (img_w - patch_w + 1)

    patches = extract_patches_2d(
        downsampled_face, (patch_h, patch_w), max_patches=4000
    )
    assert patches.shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_reconstruct_patches_perfect(downsampled_face):
    """Averaging all overlapping patches reconstructs the image exactly."""
    patch_size = (16, 16)
    patches = extract_patches_2d(downsampled_face, patch_size)
    rebuilt = reconstruct_from_patches_2d(patches, downsampled_face.shape)
    np.testing.assert_array_almost_equal(downsampled_face, rebuilt)
|
||||
|
||||
|
||||
def test_reconstruct_patches_perfect_color(orange_face):
    """Patch reconstruction is lossless for multi-channel images too."""
    patch_size = (16, 16)
    patches = extract_patches_2d(orange_face, patch_size)
    rebuilt = reconstruct_from_patches_2d(patches, orange_face.shape)
    np.testing.assert_array_almost_equal(orange_face, rebuilt)
|
||||
|
||||
|
||||
def test_patch_extractor_fit(downsampled_face_collection, global_random_seed):
    """fit is stateless and returns the estimator itself."""
    extractor = PatchExtractor(
        patch_size=(8, 8), max_patches=100, random_state=global_random_seed
    )
    assert extractor.fit(downsampled_face_collection) == extractor
|
||||
|
||||
|
||||
def test_patch_extractor_max_patches(downsampled_face_collection, global_random_seed):
    faces = downsampled_face_collection
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8

    def extracted_shape(max_patches):
        # build a fresh extractor for each max_patches setting
        extractor = PatchExtractor(
            patch_size=(patch_h, patch_w),
            max_patches=max_patches,
            random_state=global_random_seed,
        )
        return extractor.transform(faces).shape

    # an integer max_patches yields that many patches per image
    assert extracted_shape(100) == (len(faces) * 100, patch_h, patch_w)

    # a float max_patches is a proportion of all patches per image
    n_expected = len(faces) * int((img_h - patch_h + 1) * (img_w - patch_w + 1) * 0.5)
    assert extracted_shape(0.5) == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_patch_extractor_max_patches_default(
    downsampled_face_collection, global_random_seed
):
    """Without patch_size, PatchExtractor picks 1/10th of each dimension."""
    faces = downsampled_face_collection
    extractor = PatchExtractor(max_patches=100, random_state=global_random_seed)
    patches = extractor.transform(faces)
    assert patches.shape == (len(faces) * 100, 19, 25)
|
||||
|
||||
|
||||
def test_patch_extractor_all_patches(downsampled_face_collection, global_random_seed):
    """Without max_patches, every patch of every image is extracted."""
    faces = downsampled_face_collection
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8
    n_expected = len(faces) * (img_h - patch_h + 1) * (img_w - patch_w + 1)
    extractor = PatchExtractor(
        patch_size=(patch_h, patch_w), random_state=global_random_seed
    )
    assert extractor.transform(faces).shape == (n_expected, patch_h, patch_w)
|
||||
|
||||
|
||||
def test_patch_extractor_color(orange_face, global_random_seed):
    """PatchExtractor preserves the channel axis of color images."""
    faces = _make_images(orange_face)
    img_h, img_w = faces.shape[1:3]
    patch_h, patch_w = 8, 8
    n_expected = len(faces) * (img_h - patch_h + 1) * (img_w - patch_w + 1)
    extractor = PatchExtractor(
        patch_size=(patch_h, patch_w), random_state=global_random_seed
    )
    assert extractor.transform(faces).shape == (n_expected, patch_h, patch_w, 3)
|
||||
|
||||
|
||||
def test_extract_patches_strided():
    # Each case: (image_shape, patch_size, patch_step, expected_view, last_patch)
    cases_1D = [
        ((10,), (1,), (1,), (10,), (10,)),
        ((10,), (2,), (1,), (9,), (8,)),
        ((11,), (3,), (4,), (3,), (8,)),
        ((10,), (8,), (2,), (2,), (2,)),
    ]
    cases_2D = [
        ((10, 20), (2, 2), (5, 5), (2, 4), (5, 15)),
        ((10, 20), (10, 10), (3, 10), (1, 2), (0, 10)),
        ((10, 20), (10, 11), (3, 4), (1, 3), (0, 8)),
        ((11, 20), (6, 6), (4, 2), (2, 8), (4, 14)),
    ]
    cases_3D = [
        ((5, 4, 3), (2, 2, 3), (1, 2, 10), (4, 2, 1), (3, 2, 0)),
        ((3, 3, 3), (2, 2, 2), (1, 1, 1), (2, 2, 2), (1, 1, 1)),
        ((7, 8, 9), (1, 7, 3), (2, 1, 3), (4, 2, 3), (6, 1, 6)),
        ((7, 8, 9), (1, 3, 3), (3, 3, 4), (3, 2, 2), (6, 3, 4)),
    ]

    for image_shape, patch_size, patch_step, expected_view, last_patch in (
        cases_1D + cases_2D + cases_3D
    ):
        image = np.arange(np.prod(image_shape)).reshape(image_shape)
        patches = _extract_patches(
            image, patch_shape=patch_size, extraction_step=patch_step
        )

        ndim = len(image_shape)

        # leading axes enumerate the patch grid
        assert patches.shape[:ndim] == expected_view

        # the final patch must start at the expected offset in the image
        last_patch_slices = tuple(
            slice(start, start + length, None)
            for start, length in zip(last_patch, patch_size)
        )
        assert (
            patches[(-1, None, None) * ndim] == image[last_patch_slices].squeeze()
        ).all()
|
||||
|
||||
|
||||
def test_extract_patches_square(downsampled_face):
    # test same patch size for all dimensions
    img_h, img_w = downsampled_face.shape
    patch_side = 8
    patches = _extract_patches(downsampled_face, patch_shape=patch_side)
    assert patches.shape == (
        img_h - patch_side + 1,
        img_w - patch_side + 1,
        patch_side,
        patch_side,
    )
|
||||
|
||||
|
||||
def test_width_patch():
    # width and height of the patch should be less than the image
    image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    for bad_patch_size in ((4, 1), (1, 4)):
        with pytest.raises(ValueError):
            extract_patches_2d(image, bad_patch_size)
|
||||
|
||||
|
||||
def test_patch_extractor_wrong_input(orange_face):
    """Check that an informative error is raised if the patch_size is not valid."""
    faces = _make_images(orange_face)
    extractor = PatchExtractor(patch_size=(8, 8, 8))
    with pytest.raises(ValueError, match="patch_size must be a tuple of two integers"):
        extractor.transform(faces)
|
||||
File diff suppressed because it is too large
Load Diff
2137
venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py
Normal file
2137
venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user