Add README
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""Models based on neural networks."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from ._multilayer_perceptron import MLPClassifier, MLPRegressor
|
||||
from ._rbm import BernoulliRBM
|
||||
|
||||
__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,287 @@
|
||||
"""Utilities for the neural network modules"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import expit as logistic_sigmoid
|
||||
from scipy.special import xlogy
|
||||
|
||||
|
||||
def inplace_identity(X):
    """Leave the input array untouched (identity activation).

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Data, where `n_samples` is the number of samples
        and `n_features` is the number of features.
    """
    # The identity activation is a no-op; X is intentionally left as-is.
|
||||
|
||||
|
||||
def inplace_exp(X):
    """Overwrite X with exp(X), element-wise.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data; modified in place.
    """
    # Write the result directly into X to avoid allocating a new array.
    np.exp(X, out=X)
|
||||
|
||||
|
||||
def inplace_logistic(X):
    """Overwrite X with the logistic sigmoid of X, element-wise.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data; modified in place.
    """
    # scipy's expit supports an `out` argument, so no temporary is needed.
    logistic_sigmoid(X, out=X)
|
||||
|
||||
|
||||
def inplace_tanh(X):
    """Overwrite X with tanh(X), element-wise.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data; modified in place.
    """
    np.tanh(X, out=X)
|
||||
|
||||
|
||||
def inplace_relu(X):
    """Overwrite X with max(X, 0), element-wise (rectified linear unit).

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data; modified in place.
    """
    # Clamp negative entries to zero without allocating a new array.
    np.maximum(X, 0, out=X)
|
||||
|
||||
|
||||
def inplace_softmax(X):
    """Overwrite each row of X with its K-way softmax.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data; modified in place.
    """
    # Subtract the row-wise maximum first so exp() cannot overflow;
    # softmax is invariant under a per-row constant shift.
    X -= X.max(axis=1, keepdims=True)
    np.exp(X, out=X)
    X /= X.sum(axis=1, keepdims=True)
|
||||
|
||||
|
||||
# Dispatch table: activation name -> inplace implementation defined above.
ACTIVATIONS = {
    "identity": inplace_identity,
    "exp": inplace_exp,
    "tanh": inplace_tanh,
    "logistic": inplace_logistic,
    "relu": inplace_relu,
    "softmax": inplace_softmax,
}
|
||||
|
||||
|
||||
def inplace_identity_derivative(Z, delta):
    """Apply the derivative of the identity function: do nothing.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the identity activation function during
        the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
        The backpropagated error signal to be modified inplace.
    """
    # d/dx identity(x) == 1, so delta is already correct.
|
||||
|
||||
|
||||
def inplace_logistic_derivative(Z, delta):
    """Apply the derivative of the logistic sigmoid function.

    It exploits the fact that the derivative is a simple function of the output
    value from logistic function.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the logistic activation function during
        the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
        The backpropagated error signal to be modified inplace.
    """
    # sigma'(x) = sigma(x) * (1 - sigma(x)) = Z * (1 - Z)
    delta *= Z * (1 - Z)
|
||||
|
||||
|
||||
def inplace_tanh_derivative(Z, delta):
    """Apply the derivative of the hyperbolic tanh function.

    It exploits the fact that the derivative is a simple function of the output
    value from hyperbolic tangent.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the hyperbolic tangent activation
        function during the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
        The backpropagated error signal to be modified inplace.
    """
    # tanh'(x) = 1 - tanh(x)^2 = 1 - Z^2
    np.multiply(delta, 1 - Z**2, out=delta)
|
||||
|
||||
|
||||
def inplace_relu_derivative(Z, delta):
    """Apply the derivative of the relu function.

    It exploits the fact that the derivative is a simple function of the output
    value from rectified linear units activation function.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the rectified linear units activation
        function during the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
        The backpropagated error signal to be modified inplace.
    """
    # Z holds relu(x); where it is exactly zero the unit was inactive,
    # so no gradient flows through it.
    inactive = Z == 0
    delta[inactive] = 0
|
||||
|
||||
|
||||
# Dispatch table: activation name -> inplace derivative defined above.
# Note: no entry for "softmax"/"exp"; only hidden-layer activations need one.
DERIVATIVES = {
    "identity": inplace_identity_derivative,
    "tanh": inplace_tanh_derivative,
    "logistic": inplace_logistic_derivative,
    "relu": inplace_relu_derivative,
}
|
||||
|
||||
|
||||
def squared_loss(y_true, y_pred, sample_weight=None):
    """Compute the squared loss for regression.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) values.

    y_pred : array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    squared_residuals = (y_true - y_pred) ** 2
    # Weighted average over samples, then plain mean over outputs; the 1/2
    # factor makes the gradient simply (y_pred - y_true).
    per_output = np.average(squared_residuals, weights=sample_weight, axis=0)
    return 0.5 * per_output.mean()
|
||||
|
||||
|
||||
def poisson_loss(y_true, y_pred, sample_weight=None):
    """Compute (half of the) Poisson deviance loss for regression.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_pred : array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    # TODO: Decide what to do with the term `xlogy(y_true, y_true) - y_true`. For now,
    # it is included. But the _loss module doesn't use it (for performance reasons) and
    # only adds it as return of constant_to_optimal_zero (mainly for testing).
    # xlogy handles y_true == 0 gracefully (0 * log(0) -> 0) and the full
    # expression is exactly zero when y_pred == y_true.
    deviance = xlogy(y_true, y_true / y_pred) - y_true + y_pred
    return np.average(deviance, weights=sample_weight, axis=0).sum()
|
||||
|
||||
|
||||
def log_loss(y_true, y_prob, sample_weight=None):
    """Compute Logistic loss for classification.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    # Clip probabilities away from 0 and 1 so the logarithm stays finite.
    eps = np.finfo(y_prob.dtype).eps
    y_prob = np.clip(y_prob, eps, 1 - eps)

    # A single column encodes P(class 1); expand both arrays into the
    # explicit two-column [P(class 0), P(class 1)] layout.
    if y_prob.shape[1] == 1:
        y_prob = np.hstack([1 - y_prob, y_prob])
    if y_true.shape[1] == 1:
        y_true = np.hstack([1 - y_true, y_true])

    return -np.average(xlogy(y_true, y_prob), weights=sample_weight, axis=0).sum()
|
||||
|
||||
|
||||
def binary_log_loss(y_true, y_prob, sample_weight=None):
    """Compute binary logistic loss for classification.

    This is identical to log_loss in binary classification case,
    but is kept for its use in multilabel case.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_prob : array-like of float, shape = (n_samples, 1)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    # Clip probabilities away from 0 and 1 so the logarithm stays finite.
    eps = np.finfo(y_prob.dtype).eps
    y_prob = np.clip(y_prob, eps, 1 - eps)
    # Per-element Bernoulli log-likelihood: y*log(p) + (1-y)*log(1-p).
    log_likelihood = xlogy(y_true, y_prob) + xlogy(1 - y_true, 1 - y_prob)
    return -np.average(log_likelihood, weights=sample_weight, axis=0).sum()
|
||||
|
||||
|
||||
# Dispatch table: loss name -> loss function defined above.
LOSS_FUNCTIONS = {
    "squared_error": squared_loss,
    "poisson": poisson_loss,
    "log_loss": log_loss,
    "binary_log_loss": binary_log_loss,
}
|
||||
File diff suppressed because it is too large
Load Diff
445
venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py
Normal file
445
venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py
Normal file
@@ -0,0 +1,445 @@
|
||||
"""Restricted Boltzmann Machine"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import time
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
from scipy.special import expit # logistic function
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from ..utils import check_random_state, gen_even_slices
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import safe_sparse_dot
|
||||
from ..utils.validation import check_is_fitted, validate_data
|
||||
|
||||
|
||||
class BernoulliRBM(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Bernoulli Restricted Boltzmann Machine (RBM).

    A Restricted Boltzmann Machine with binary visible units and
    binary hidden units. Parameters are estimated using Stochastic Maximum
    Likelihood (SML), also known as Persistent Contrastive Divergence (PCD)
    [2].

    The time complexity of this implementation is ``O(d ** 2)`` assuming
    d ~ n_features ~ n_components.

    Read more in the :ref:`User Guide <rbm>`.

    Parameters
    ----------
    n_components : int, default=256
        Number of binary hidden units.

    learning_rate : float, default=0.1
        The learning rate for weight updates. It is *highly* recommended
        to tune this hyper-parameter. Reasonable values are in the
        10**[0., -3.] range.

    batch_size : int, default=10
        Number of examples per minibatch.

    n_iter : int, default=10
        Number of iterations/sweeps over the training dataset to perform
        during training.

    verbose : int, default=0
        The verbosity level. The default, zero, means silent mode. Range
        of values is [0, inf].

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for:

        - Gibbs sampling from visible and hidden layers.

        - Initializing components, sampling from layers during fit.

        - Corrupting the data when scoring samples.

        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    intercept_hidden_ : array-like of shape (n_components,)
        Biases of the hidden units.

    intercept_visible_ : array-like of shape (n_features,)
        Biases of the visible units.

    components_ : array-like of shape (n_components, n_features)
        Weight matrix, where `n_features` is the number of
        visible units and `n_components` is the number of hidden units.

    h_samples_ : array-like of shape (batch_size, n_components)
        Hidden Activation sampled from the model distribution,
        where `batch_size` is the number of examples per minibatch and
        `n_components` is the number of hidden units.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor.
    sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier.
    sklearn.decomposition.PCA : An unsupervised linear dimensionality
        reduction model.

    References
    ----------

    [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for
        deep belief nets. Neural Computation 18, pp 1527-1554.
        https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf

    [2] Tieleman, T. Training Restricted Boltzmann Machines using
        Approximations to the Likelihood Gradient. International Conference
        on Machine Learning (ICML) 2008

    Examples
    --------

    >>> import numpy as np
    >>> from sklearn.neural_network import BernoulliRBM
    >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    >>> model = BernoulliRBM(n_components=2)
    >>> model.fit(X)
    BernoulliRBM(n_components=2)

    For a more detailed example usage, see
    :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py`.
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "learning_rate": [Interval(Real, 0, None, closed="neither")],
        "batch_size": [Interval(Integral, 1, None, closed="left")],
        "n_iter": [Interval(Integral, 0, None, closed="left")],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_components=256,
        *,
        learning_rate=0.1,
        batch_size=10,
        n_iter=10,
        verbose=0,
        random_state=None,
    ):
        self.n_components = n_components
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.n_iter = n_iter
        self.verbose = verbose
        self.random_state = random_state

    def transform(self, X):
        """Compute the hidden layer activation probabilities, P(h=1|v=X).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to be transformed.

        Returns
        -------
        h : ndarray of shape (n_samples, n_components)
            Latent representations of the data.
        """
        check_is_fitted(self)

        X = validate_data(
            self, X, accept_sparse="csr", reset=False, dtype=(np.float64, np.float32)
        )
        return self._mean_hiddens(X)

    def _mean_hiddens(self, v):
        """Computes the probabilities P(h=1|v).

        Parameters
        ----------
        v : ndarray of shape (n_samples, n_features)
            Values of the visible layer.

        Returns
        -------
        h : ndarray of shape (n_samples, n_components)
            Corresponding mean field values for the hidden layer.
        """
        p = safe_sparse_dot(v, self.components_.T)
        p += self.intercept_hidden_
        # expit writes into p, avoiding an extra allocation.
        return expit(p, out=p)

    def _sample_hiddens(self, v, rng):
        """Sample from the distribution P(h|v).

        Parameters
        ----------
        v : ndarray of shape (n_samples, n_features)
            Values of the visible layer to sample from.

        rng : RandomState instance
            Random number generator to use.

        Returns
        -------
        h : ndarray of shape (n_samples, n_components)
            Values of the hidden layer.
        """
        p = self._mean_hiddens(v)
        # Bernoulli sampling: each unit is 1 with probability p.
        return rng.uniform(size=p.shape) < p

    def _sample_visibles(self, h, rng):
        """Sample from the distribution P(v|h).

        Parameters
        ----------
        h : ndarray of shape (n_samples, n_components)
            Values of the hidden layer to sample from.

        rng : RandomState instance
            Random number generator to use.

        Returns
        -------
        v : ndarray of shape (n_samples, n_features)
            Values of the visible layer.
        """
        p = np.dot(h, self.components_)
        p += self.intercept_visible_
        expit(p, out=p)
        return rng.uniform(size=p.shape) < p

    def _free_energy(self, v):
        """Computes the free energy F(v) = - log sum_h exp(-E(v,h)).

        Parameters
        ----------
        v : ndarray of shape (n_samples, n_features)
            Values of the visible layer.

        Returns
        -------
        free_energy : ndarray of shape (n_samples,)
            The value of the free energy.
        """
        return -safe_sparse_dot(v, self.intercept_visible_) - np.logaddexp(
            0, safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_
        ).sum(axis=1)

    def gibbs(self, v):
        """Perform one Gibbs sampling step.

        Parameters
        ----------
        v : ndarray of shape (n_samples, n_features)
            Values of the visible layer to start from.

        Returns
        -------
        v_new : ndarray of shape (n_samples, n_features)
            Values of the visible layer after one Gibbs step.
        """
        check_is_fitted(self)
        if not hasattr(self, "random_state_"):
            self.random_state_ = check_random_state(self.random_state)
        # One full Gibbs step: v -> h -> v.
        h_ = self._sample_hiddens(v, self.random_state_)
        v_ = self._sample_visibles(h_, self.random_state_)

        return v_

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None):
        """Fit the model to the partial segment of the data X.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
            Target values (None for unsupervised transformations).

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        first_pass = not hasattr(self, "components_")
        X = validate_data(
            self, X, accept_sparse="csr", dtype=np.float64, reset=first_pass
        )
        if not hasattr(self, "random_state_"):
            self.random_state_ = check_random_state(self.random_state)
        if not hasattr(self, "components_"):
            self.components_ = np.asarray(
                self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])),
                order="F",
            )
            self._n_features_out = self.components_.shape[0]
        if not hasattr(self, "intercept_hidden_"):
            self.intercept_hidden_ = np.zeros(
                self.n_components,
            )
        if not hasattr(self, "intercept_visible_"):
            self.intercept_visible_ = np.zeros(
                X.shape[1],
            )
        if not hasattr(self, "h_samples_"):
            self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        self._fit(X, self.random_state_)
        # Fixed: the docstring promises `self` but the method previously
        # returned None, breaking call chaining.
        return self

    def _fit(self, v_pos, rng):
        """Inner fit for one mini-batch.

        Adjust the parameters to maximize the likelihood of v using
        Stochastic Maximum Likelihood (SML).

        Parameters
        ----------
        v_pos : ndarray of shape (n_samples, n_features)
            The data to use for training.

        rng : RandomState instance
            Random number generator to use for sampling.
        """
        # Positive phase from the data, negative phase from the persistent
        # fantasy particles (PCD).
        h_pos = self._mean_hiddens(v_pos)
        v_neg = self._sample_visibles(self.h_samples_, rng)
        h_neg = self._mean_hiddens(v_neg)

        lr = float(self.learning_rate) / v_pos.shape[0]
        update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T
        update -= np.dot(h_neg.T, v_neg)
        self.components_ += lr * update
        self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0))
        self.intercept_visible_ += lr * (
            np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0)
        )

        h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0  # sample binomial
        # floor with out=h_neg turns the remaining probabilities into 0s,
        # leaving a binary sample for the next persistent chain step.
        self.h_samples_ = np.floor(h_neg, h_neg)

    def score_samples(self, X):
        """Compute the pseudo-likelihood of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Values of the visible layer. Must be all-boolean (not checked).

        Returns
        -------
        pseudo_likelihood : ndarray of shape (n_samples,)
            Value of the pseudo-likelihood (proxy for likelihood).

        Notes
        -----
        This method is not deterministic: it computes a quantity called the
        free energy on X, then on a randomly corrupted version of X, and
        returns the log of the logistic function of the difference.
        """
        check_is_fitted(self)

        v = validate_data(self, X, accept_sparse="csr", reset=False)
        rng = check_random_state(self.random_state)

        # Randomly corrupt one feature in each sample in v.
        ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0]))
        if sp.issparse(v):
            data = -2 * v[ind] + 1
            if isinstance(data, np.matrix):  # v is a sparse matrix
                v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)
            else:  # v is a sparse array
                v_ = v + sp.csr_array((data.ravel(), ind), shape=v.shape)
        else:
            v_ = v.copy()
            v_[ind] = 1 - v_[ind]

        fe = self._free_energy(v)
        fe_ = self._free_energy(v_)
        # log(expit(x)) = log(1 / (1 + exp(-x)) = -np.logaddexp(0, -x)
        return -v.shape[1] * np.logaddexp(0, -(fe_ - fe))

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
            Target values (None for unsupervised transformations).

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X = validate_data(self, X, accept_sparse="csr", dtype=(np.float64, np.float32))
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(
            rng.normal(0, 0.01, (self.n_components, X.shape[1])),
            order="F",
            dtype=X.dtype,
        )
        self._n_features_out = self.components_.shape[0]
        self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype)
        self.intercept_visible_ = np.zeros(X.shape[1], dtype=X.dtype)
        self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype)

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(
            gen_even_slices(n_batches * self.batch_size, n_batches, n_samples=n_samples)
        )
        verbose = self.verbose
        begin = time.time()
        for iteration in range(1, self.n_iter + 1):
            for batch_slice in batch_slices:
                self._fit(X[batch_slice], rng)

            if verbose:
                end = time.time()
                print(
                    "[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs"
                    % (
                        type(self).__name__,
                        iteration,
                        self.score_samples(X).mean(),
                        end - begin,
                    )
                )
                begin = end

        return self

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
        return tags
|
||||
@@ -0,0 +1,287 @@
|
||||
"""Stochastic optimization methods for MLP"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class BaseOptimizer:
    """Base (Stochastic) gradient descent optimizer

    Parameters
    ----------
    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    Attributes
    ----------
    learning_rate : float
        the current learning rate
    """

    def __init__(self, learning_rate_init=0.1):
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)

    def update_params(self, params, grads):
        """Update parameters with given gradients

        Parameters
        ----------
        params : list of length = len(coefs_) + len(intercepts_)
            The concatenated list containing coefs_ and intercepts_ in MLP
            model. Used for initializing velocities and updating params

        grads : list of length = len(params)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params
        """
        updates = self._get_updates(grads)
        # In-place += so the caller's coef/intercept arrays are mutated.
        # (The original wrapped `params` in a redundant generator expression;
        # zip iterates it directly.)
        for param, update in zip(params, updates):
            param += update

    def iteration_ends(self, time_step):
        """Perform update to learning rate and potentially other states at the
        end of an iteration
        """
        pass

    def trigger_stopping(self, msg, verbose):
        """Decides whether it is time to stop training

        Parameters
        ----------
        msg : str
            Message passed in for verbose output

        verbose : bool
            Print message to stdin if True

        Returns
        -------
        is_stopping : bool
            True if training needs to stop
        """
        if verbose:
            print(msg + " Stopping.")
        return True
|
||||
|
||||
|
||||
class SGDOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with momentum

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
        Learning rate schedule for weight updates.

        -'constant', is a constant learning rate given by
         'learning_rate_init'.

        -'invscaling' gradually decreases the learning rate 'learning_rate_' at
          each time step 't' using an inverse scaling exponent of 'power_t'.
          learning_rate_ = learning_rate_init / pow(t, power_t)

        -'adaptive', keeps the learning rate constant to
         'learning_rate_init' as long as the training keeps decreasing.
         Each time 2 consecutive epochs fail to decrease the training loss by
         tol, or fail to increase validation score by tol if 'early_stopping'
         is on, the current learning rate is divided by 5.

    momentum : float, default=0.9
        Value of momentum used, must be larger than or equal to 0

    nesterov : bool, default=True
        Whether to use nesterov's momentum or not. Use nesterov's if True

    power_t : float, default=0.5
        Power of time step 't' in inverse scaling. See `lr_schedule` for
        more details.

    Attributes
    ----------
    learning_rate : float
        the current learning rate

    velocities : list, length = len(params)
        velocities that are used to update params
    """

    def __init__(
        self,
        params,
        learning_rate_init=0.1,
        lr_schedule="constant",
        momentum=0.9,
        nesterov=True,
        power_t=0.5,
    ):
        super().__init__(learning_rate_init)

        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.nesterov = nesterov
        self.power_t = power_t
        # One velocity buffer per parameter array, all starting at zero.
        self.velocities = [np.zeros_like(p) for p in params]

    def iteration_ends(self, time_step):
        """Perform updates to learning rate and potential other states at the
        end of an iteration

        Parameters
        ----------
        time_step : int
            number of training samples trained on so far, used to update
            learning rate for 'invscaling'
        """
        # Only the inverse-scaling schedule decays the rate here; 'adaptive'
        # is handled in trigger_stopping and 'constant' never changes.
        if self.lr_schedule != "invscaling":
            return
        self.learning_rate = (
            float(self.learning_rate_init) / (time_step + 1) ** self.power_t
        )

    def trigger_stopping(self, msg, verbose):
        """Decide whether to stop, or shrink the rate and keep going.

        Non-adaptive schedules stop immediately. The adaptive schedule
        divides the learning rate by 5 and continues until the rate drops
        to 1e-6 or below.
        """
        if self.lr_schedule != "adaptive":
            if verbose:
                print(msg + " Stopping.")
            return True

        if self.learning_rate <= 1e-6:
            if verbose:
                print(msg + " Learning rate too small. Stopping.")
            return True

        self.learning_rate /= 5.0
        if verbose:
            print(msg + " Setting learning rate to %f" % self.learning_rate)
        return False

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
        mom = self.momentum
        lr = self.learning_rate
        # Classical momentum step: v <- mom * v - lr * grad.
        self.velocities = [
            mom * vel - lr * grad for vel, grad in zip(self.velocities, grads)
        ]

        if self.nesterov:
            # Nesterov look-ahead: apply the same rule once more on top of
            # the freshly updated velocities to form the returned step.
            return [
                mom * vel - lr * grad for vel, grad in zip(self.velocities, grads)
            ]

        return self.velocities
|
||||
|
||||
|
||||
class AdamOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with Adam

    Note: All default values are from the original Adam paper

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.001
        The initial learning rate used. It controls the step-size in updating
        the weights

    beta_1 : float, default=0.9
        Exponential decay rate for estimates of first moment vector, should be
        in [0, 1)

    beta_2 : float, default=0.999
        Exponential decay rate for estimates of second moment vector, should be
        in [0, 1)

    epsilon : float, default=1e-8
        Value for numerical stability

    Attributes
    ----------
    learning_rate : float
        The current learning rate

    t : int
        Timestep

    ms : list, length = len(params)
        First moment vectors

    vs : list, length = len(params)
        Second moment vectors

    References
    ----------
    :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for
    stochastic optimization." <1412.6980>
    """

    def __init__(
        self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
    ):
        super().__init__(learning_rate_init)

        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        # Moment estimates start at zero; `t` counts completed update steps.
        self.t = 0
        self.ms = [np.zeros_like(p) for p in params]
        self.vs = [np.zeros_like(p) for p in params]

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
        self.t += 1
        b1 = self.beta_1
        b2 = self.beta_2
        # Exponential moving averages of the gradient and its square
        # (biased first and second moment estimates).
        self.ms = [b1 * m + (1 - b1) * grad for m, grad in zip(self.ms, grads)]
        self.vs = [b2 * v + (1 - b2) * (grad**2) for v, grad in zip(self.vs, grads)]
        # Fold the bias correction of both moments into an adjusted learning
        # rate, as suggested in section 2 of the Adam paper.
        self.learning_rate = (
            self.learning_rate_init
            * np.sqrt(1 - b2**self.t)
            / (1 - b1**self.t)
        )
        return [
            -self.learning_rate * m / (np.sqrt(v) + self.epsilon)
            for m, v in zip(self.ms, self.vs)
        ]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,52 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn._loss import HalfPoissonLoss
|
||||
from sklearn.neural_network._base import binary_log_loss, log_loss, poisson_loss
|
||||
|
||||
|
||||
def test_binary_log_loss_1_prob_finite():
    """binary_log_loss must stay finite when y_proba contains exact ones."""
    # A predicted probability of exactly 1.0 would naively produce log(0).
    targets = np.array([[0, 0, 1]]).T
    probabilities = np.array([[0.9, 1.0, 1.0]]).T

    assert np.isfinite(binary_log_loss(targets, probabilities))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y_true, y_prob",
    [
        (
            np.array([[1, 0, 0], [0, 1, 0]]),
            np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]),
        ),
        (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T),
    ],
)
def test_log_loss_1_prob_finite(y_true, y_prob):
    """log_loss must remain finite even for predicted probabilities of 1."""
    assert np.isfinite(log_loss(y_true, y_prob))
|
||||
|
||||
|
||||
def test_poisson_loss(global_random_seed):
    """Check poisson_loss against the thoroughly tested HalfPoissonLoss."""
    n_samples = 1000
    rng = np.random.default_rng(global_random_seed)
    targets = rng.integers(low=0, high=10, size=n_samples).astype(float)
    raw = rng.standard_normal(n_samples)
    predictions = np.exp(raw)
    weights = rng.uniform(low=0.1, high=10, size=n_samples)

    # Make sure the zero-count edge case is exercised.
    assert 0 in targets

    loss = poisson_loss(y_true=targets, y_pred=predictions, sample_weight=weights)
    half_loss = HalfPoissonLoss()
    expected = (
        half_loss(y_true=targets, raw_prediction=raw, sample_weight=weights)
        + half_loss.constant_to_optimal_zero(
            y_true=targets, sample_weight=weights
        ).mean()
        / weights.mean()
    )

    assert loss == pytest.approx(expected, rel=1e-12)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,251 @@
|
||||
import re
|
||||
import sys
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.neural_network import BernoulliRBM
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
|
||||
from sklearn.utils.validation import assert_all_finite
|
||||
|
||||
Xdigits, _ = load_digits(return_X_y=True)
|
||||
Xdigits -= Xdigits.min()
|
||||
Xdigits /= Xdigits.max()
|
||||
|
||||
|
||||
def test_fit():
    """BernoulliRBM.fit should reach the expected mean pseudo-likelihood."""
    data = Xdigits.copy()

    model = BernoulliRBM(
        n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9
    )
    model.fit(data)

    assert_almost_equal(model.score_samples(data).mean(), -21.0, decimal=0)

    # The in-place tricks used during training must not mutate the input.
    assert_array_equal(data, Xdigits)
|
||||
|
||||
|
||||
def test_partial_fit():
    """Repeated partial_fit over mini-batches should match full-fit quality."""
    data = Xdigits.copy()
    model = BernoulliRBM(
        n_components=64, learning_rate=0.1, batch_size=20, random_state=9
    )
    n_batches = int(np.ceil(float(data.shape[0]) / model.batch_size))
    batches = np.array_split(data, n_batches)

    for _ in range(7):
        for minibatch in batches:
            model.partial_fit(minibatch)

    assert_almost_equal(model.score_samples(data).mean(), -21.0, decimal=0)
    # Input must be left untouched.
    assert_array_equal(data, Xdigits)
|
||||
|
||||
|
||||
def test_transform():
    """transform should return the hidden-unit conditional means."""
    data = Xdigits[:100]
    model = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    model.fit(data)

    assert_array_equal(model.transform(data), model._mean_hiddens(data))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_small_sparse(csr_container):
    """Fitting a tiny sparse matrix must not raise."""
    BernoulliRBM().fit(csr_container(Xdigits[:4]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_small_sparse_partial_fit(sparse_container):
    """partial_fit should behave the same for sparse and dense input."""
    data_sparse = sparse_container(Xdigits[:100])
    data_dense = Xdigits[:100].copy()

    model_sparse = BernoulliRBM(
        n_components=64, learning_rate=0.1, batch_size=10, random_state=9
    )
    model_dense = BernoulliRBM(
        n_components=64, learning_rate=0.1, batch_size=10, random_state=9
    )

    model_sparse.partial_fit(data_sparse)
    model_dense.partial_fit(data_dense)

    assert_almost_equal(
        model_sparse.score_samples(data_dense).mean(),
        model_dense.score_samples(data_dense).mean(),
        decimal=0,
    )
|
||||
|
||||
|
||||
def test_sample_hiddens():
    """Sampled hidden units should average to their conditional means."""
    rng = np.random.RandomState(0)
    data = Xdigits[:100]
    model = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42)
    model.fit(data)

    mean_hidden = model._mean_hiddens(data[0])
    sampled_mean = np.mean([model._sample_hiddens(data[0], rng) for _ in range(100)], 0)

    assert_almost_equal(mean_hidden, sampled_mean, decimal=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_fit_gibbs(csc_container):
    # XXX: this test is very seed-dependent! It probably needs to be rewritten.

    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]]
    # from the same input.
    rng = np.random.RandomState(42)
    data = np.array([[0.0], [1.0]])
    model_dense = BernoulliRBM(
        n_components=2, batch_size=2, n_iter=42, random_state=rng
    )
    # you need that much iters
    model_dense.fit(data)
    assert_almost_equal(
        model_dense.components_, np.array([[0.02649814], [0.02009084]]), decimal=4
    )
    assert_almost_equal(model_dense.gibbs(data), data)

    # The same reconstruction must work when the input is sparse, and the
    # learned components must agree with the dense fit.
    rng = np.random.RandomState(42)
    data_sparse = csc_container([[0.0], [1.0]])
    model_sparse = BernoulliRBM(
        n_components=2, batch_size=2, n_iter=42, random_state=rng
    )
    model_sparse.fit(data_sparse)
    assert_almost_equal(
        model_sparse.components_, np.array([[0.02649814], [0.02009084]]), decimal=4
    )
    assert_almost_equal(model_sparse.gibbs(data_sparse), data_sparse.toarray())
    assert_almost_equal(model_dense.components_, model_sparse.components_)
|
||||
|
||||
|
||||
def test_gibbs_smoke():
    """Gibbs sampling on the digits data must stay finite and be stochastic."""
    model = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42)
    model.fit(Xdigits)

    first_draw = model.gibbs(Xdigits)
    assert_all_finite(first_draw)
    second_draw = model.gibbs(Xdigits)
    # Every row must differ somewhere between two independent draws.
    assert np.all((first_draw != second_draw).max(axis=1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS)
def test_score_samples(lil_containers):
    """score_samples (pseudo-likelihood) must be computed without clipping.

    See Fabian's blog, http://bit.ly/1iYefRk
    """
    rng = np.random.RandomState(42)
    data = np.vstack([np.zeros(1000), np.ones(1000)])
    model = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng)
    model.fit(data)
    assert (model.score_samples(data) < -300).all()

    # Sparse vs. dense input must give identical scores; this also exercises
    # sparse input validation.
    model.random_state = 42
    dense_scores = model.score_samples(data)
    model.random_state = 42
    sparse_scores = model.score_samples(lil_containers(data))
    assert_almost_equal(dense_scores, sparse_scores)

    # Numerical stability (#2785): this used to overflow to infinity and
    # crash with an exception.
    with np.errstate(under="ignore"):
        model.score_samples([np.arange(1000) * 100])
|
||||
|
||||
|
||||
def test_rbm_verbose():
    """Verbose fitting must not raise; swallow the progress output."""
    model = BernoulliRBM(n_iter=2, verbose=10)
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.fit(Xdigits)
    finally:
        sys.stdout = saved_stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_sparse_and_verbose(csc_container):
    """Verbose fitting on sparse input should print a sane progress line."""
    saved_stdout = sys.stdout
    sys.stdout = StringIO()

    data = csc_container([[0.0], [1.0]])
    model = BernoulliRBM(
        n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True
    )
    try:
        model.fit(data)
        printed = sys.stdout.getvalue()
        # The line must report iteration, pseudo-likelihood and elapsed time.
        assert re.match(
            r"\[BernoulliRBM\] Iteration 1,"
            r" pseudo-likelihood = -?(\d)+(\.\d+)?,"
            r" time = (\d|\.)+s",
            printed,
        )
    finally:
        sys.stdout = saved_stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype_in, dtype_out",
    [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)],
)
def test_transformer_dtypes_casting(dtype_in, dtype_out):
    """fit_transform should preserve float dtypes and upcast integers."""
    data = Xdigits[:100].astype(dtype_in)
    model = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    transformed = model.fit_transform(data)

    assert transformed.dtype == dtype_out, (
        "transform dtype: {} - original dtype: {}".format(transformed.dtype, data.dtype)
    )
|
||||
|
||||
|
||||
def test_convergence_dtype_consistency():
    """32-bit and 64-bit fits should converge to close solutions."""
    # float64 transformer
    data_64 = Xdigits[:100].astype(np.float64)
    model_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    out_64 = model_64.fit_transform(data_64)

    # float32 transformer
    data_32 = Xdigits[:100].astype(np.float32)
    model_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    out_32 = model_32.fit_transform(data_32)

    # Results and learned attributes must agree between the two precisions.
    assert_allclose(out_64, out_32, rtol=1e-06, atol=0)
    assert_allclose(
        model_64.intercept_hidden_, model_32.intercept_hidden_, rtol=1e-06, atol=0
    )
    assert_allclose(
        model_64.intercept_visible_, model_32.intercept_visible_, rtol=1e-05, atol=0
    )
    assert_allclose(model_64.components_, model_32.components_, rtol=1e-03, atol=0)
    assert_allclose(model_64.h_samples_, model_32.h_samples_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["fit", "partial_fit"])
def test_feature_names_out(method):
    """Check `get_feature_names_out` for `BernoulliRBM`."""
    n_components = 10
    model = BernoulliRBM(n_components=n_components)
    getattr(model, method)(Xdigits)

    assert_array_equal(
        [f"bernoullirbm{i}" for i in range(n_components)],
        model.get_feature_names_out(),
    )
|
||||
@@ -0,0 +1,112 @@
|
||||
import numpy as np
|
||||
|
||||
from sklearn.neural_network._stochastic_optimizers import (
|
||||
AdamOptimizer,
|
||||
BaseOptimizer,
|
||||
SGDOptimizer,
|
||||
)
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
shapes = [(4, 6), (6, 8), (7, 8, 9)]
|
||||
|
||||
|
||||
def test_base_optimizer():
    """BaseOptimizer.trigger_stopping should always request a stop."""
    for learning_rate in (10**exponent for exponent in range(-3, 4)):
        assert BaseOptimizer(learning_rate).trigger_stopping("", False)
|
||||
|
||||
|
||||
def test_sgd_optimizer_no_momentum():
    """Momentum-free SGD must update params as ``param -= lr * grad``."""
    params = [np.zeros(shape) for shape in shapes]
    rng = np.random.RandomState(0)

    for learning_rate in [10**exponent for exponent in range(-3, 4)]:
        optimizer = SGDOptimizer(params, learning_rate, momentum=0, nesterov=False)
        grads = [rng.random_sample(shape) for shape in shapes]
        expected = [p - learning_rate * g for p, g in zip(params, grads)]
        optimizer.update_params(params, grads)

        for want, got in zip(expected, params):
            assert_array_equal(want, got)
|
||||
|
||||
|
||||
def test_sgd_optimizer_momentum():
    """Classic momentum: update = momentum * velocity - lr * grad."""
    params = [np.zeros(shape) for shape in shapes]
    learning_rate = 0.1
    rng = np.random.RandomState(0)

    for momentum in np.arange(0.5, 0.9, 0.1):
        optimizer = SGDOptimizer(
            params, learning_rate, momentum=momentum, nesterov=False
        )
        velocities = [rng.random_sample(shape) for shape in shapes]
        optimizer.velocities = velocities
        grads = [rng.random_sample(shape) for shape in shapes]
        steps = [
            momentum * v - learning_rate * g for v, g in zip(velocities, grads)
        ]
        expected = [p + s for p, s in zip(params, steps)]
        optimizer.update_params(params, grads)

        for want, got in zip(expected, params):
            assert_array_equal(want, got)
|
||||
|
||||
|
||||
def test_sgd_optimizer_trigger_stopping():
    """Adaptive schedule: the lr is divided by 5 once, then stopping fires."""
    params = [np.zeros(shape) for shape in shapes]
    learning_rate = 2e-6
    optimizer = SGDOptimizer(params, learning_rate, lr_schedule="adaptive")
    # First call shrinks the learning rate instead of stopping.
    assert not optimizer.trigger_stopping("", False)
    assert learning_rate / 5 == optimizer.learning_rate
    # Once the rate is already tiny, stopping is requested.
    assert optimizer.trigger_stopping("", False)
|
||||
|
||||
|
||||
def test_sgd_optimizer_nesterovs_momentum():
    """Nesterov momentum applies the momentum correction a second time."""
    params = [np.zeros(shape) for shape in shapes]
    learning_rate = 0.1
    rng = np.random.RandomState(0)

    for momentum in np.arange(0.5, 0.9, 0.1):
        optimizer = SGDOptimizer(
            params, learning_rate, momentum=momentum, nesterov=True
        )
        velocities = [rng.random_sample(shape) for shape in shapes]
        optimizer.velocities = velocities
        grads = [rng.random_sample(shape) for shape in shapes]
        steps = [
            momentum * v - learning_rate * g for v, g in zip(velocities, grads)
        ]
        # Look-ahead correction characteristic of Nesterov momentum.
        steps = [
            momentum * s - learning_rate * g for s, g in zip(steps, grads)
        ]
        expected = [p + s for p, s in zip(params, steps)]
        optimizer.update_params(params, grads)

        for want, got in zip(expected, params):
            assert_array_equal(want, got)
|
||||
|
||||
|
||||
def test_adam_optimizer():
    """Adam updates must match the hand-computed moment recursion."""
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.001
    epsilon = 1e-8
    rng = np.random.RandomState(0)

    for beta_1 in np.arange(0.9, 1.0, 0.05):
        for beta_2 in np.arange(0.995, 1.0, 0.001):
            optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon)
            ms = [rng.random_sample(shape) for shape in shapes]
            vs = [rng.random_sample(shape) for shape in shapes]
            t = 10
            # Pre-load optimizer state one step behind the reference.
            optimizer.ms = ms
            optimizer.vs = vs
            optimizer.t = t - 1
            grads = [rng.random_sample(shape) for shape in shapes]

            # Reference computation of one Adam step.
            ms = [beta_1 * m + (1 - beta_1) * g for m, g in zip(ms, grads)]
            vs = [beta_2 * v + (1 - beta_2) * (g**2) for v, g in zip(vs, grads)]
            corrected_lr = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
            steps = [
                -corrected_lr * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs)
            ]
            expected = [p + s for p, s in zip(params, steps)]

            optimizer.update_params(params, grads)
            for want, got in zip(expected, params):
                assert_array_equal(want, got)
|
||||
Reference in New Issue
Block a user