Source code for mplearn.graphical_model.base_graph._base_graph

import sys

import numpy as np
import joblib
sys.modules['sklearn.externals.joblib'] = joblib
import sklearn.utils._testing
sys.modules['sklearn.utils.testing'] = sklearn.utils._testing
from sklearn.utils.extmath import fast_logdet
from inverse_covariance import QuicGraphicalLasso

from ...common import BaseLearner


def _hard_thresh_matrix(mat, tau_k):
    """ Hard-threshold a matrix elementwise at a threshold value.

    Parameters
    ----------
    mat : ndarray of shape (m, m)
        The initial graph estimate from a minipatch.

    tau_k : float
        Every element of `mat` whose absolute value is less than
        or equal to `tau_k` will be set to zero.

    Returns
    -------

    """

    mat_copy = mat.copy()

    mat_copy[np.absolute(mat) <= tau_k] = 0

    return mat_copy


def _compute_ebic(precision, covariance, n, m, gamma):
    """ Compute the EBIC score for Gaussian graphical model.

    Parameters
    ----------
    precision : ndarray of shape (m, m)
        The estimated precision matrix on the minipatch.

    covariance : ndarray of shape (m, m)
        The sample covariance matrix on the minipatch.

    n : int
        The number of observations in the minipatch.

    m : int
        The number of nodes in the minipatch.

    gamma : float
        The gamma parameter in the extended BIC criterion.

    Returns
    -------

    """

    ln = (n / 2.) * (fast_logdet(precision) - np.sum(covariance * precision))

    if np.isinf(ln) or np.isnan(ln):
        return 1e10

    E_card = (np.sum(np.abs(precision.flat) > np.finfo(precision.dtype).eps) - m) / 2.0

    return (-2 * ln) + (E_card * np.log(n)) + (4 * E_card * gamma * np.log(m))


[docs]class ThresholdedGraphicalLasso(BaseLearner): """Gaussian graphical model selection with the Thresholded Graphical Lasso estimator. This class is designed to be used as a base graph selector on the minipatches with the `mplearn.graphical_model.MPGraph` class. At a high level, this estimator first gets an initial graph estimate using the Graphical Lasso at a small amount of regularization. After that, it hard-thresholds the initial graph estimate at a sequence of threshold values and then chooses the thresholded graph with the best EBIC score as the final graph estimate. See the original paper [1] for more details. Parameters ---------- lambda0_scale : float, default=1 Controls the amount of regularization for Graphical Lasso to get the initial graphical model estimate. Specifically, the Graphical Lasso is fit at regularization `lambda0_scale * sqrt(log(n_features)/n_samples)`. Hence a larger value means a sparser initial graph estimate. Note that `lambda0_scale` needs to be larger than 0. threshold_seq : ndarray, default=None The sequence of threshold values at which to hard-threshold the initial graph estimate from the Graphical Lasso estimator. The elements of this array should be in the interval (0.0, 1]. If set to `None`, the program will automatically set `threshold_seq=numpy.linspace(0.1, 0.5, 9)`. ebic_gamma : float, default=0.5 The gamma parameter in the extended BIC criterion [2]. A larger value encourages sparser graph estimates. Note that `ebic_gamma` needs to be larger than or equal to 0. Attributes ---------- Theta_tilde_ : ndarray of shape (n_features, n_features) The final precision matrix estimate corresponding to the thresholded graph with the best EBIC score. References ---------- .. [1] Yao, T. and Wang, M. and Allen, G. I., "Gaussian Graphical Model Selection for Huge Data via Minipatch Learning", arXiv:2110.12067. .. [2] Foygel, R. and Drton, M., "Extended Bayesian Information Criteria for Gaussian Graphical Models", Neural Information Processing Systems 2010. """ def __init__(self, *, lambda0_scale=1, threshold_seq=None, ebic_gamma=0.5): if threshold_seq is None: threshold_seq = np.linspace(0.1, 0.5, 9) self.lambda0_scale = lambda0_scale self.threshold_seq = threshold_seq self.ebic_gamma = ebic_gamma
[docs] def fit(self, X, y=None): """Fit the Thresholded Graphical Lasso model to X. Parameters ---------- X : ndarray of shape (n_samples, n_features) Data from which to infer the graphical model structure. y : Ignored. Not used, present for API consistency. Returns ------- self : object Fitted estimator. """ assert (self.ebic_gamma >= 0), "ebic_gamma needs to be larger than or equal to 0." assert (self.lambda0_scale > 0), "lambda0_scale needs to be a float number larger than 0." n, m = X.shape lambda0 = self.lambda0_scale * np.sqrt(np.log(m) / n) # get an initial graph estimate using the Graphical Lasso estimator glasso = QuicGraphicalLasso(lam=lambda0, mode='default', verbose=0, init_method='cov', auto_scale=False, tol=1e-3, max_iter=1000).fit(X) Theta_hat_glasso = glasso.precision_.astype(np.float32) Sigma_hat = glasso.sample_covariance_.astype(np.float32) del glasso del X Theta_hat_max = 1 ebic_seq = np.zeros(self.threshold_seq.size) # hard-threshold the initial graph estimate at different threshold and compute corresponding EBIC score for i in range(self.threshold_seq.size): ebic_seq[i] = _compute_ebic(_hard_thresh_matrix(Theta_hat_glasso, self.threshold_seq[i] * Theta_hat_max), Sigma_hat, n, m, self.ebic_gamma) min_indices = np.where(np.absolute(ebic_seq - ebic_seq.min()) < 1e-10) # choose the thresholded graph with the best EBIC score as the final graph estimate self.Theta_tilde_ = _hard_thresh_matrix(Theta_hat_glasso, self.threshold_seq[np.min(min_indices)] * Theta_hat_max) return self