Source code for mplearn.feature_selection._adaptive_stable_minipatch_selection

import numbers
from warnings import warn

import numpy as np
from sklearn.utils.validation import check_is_fitted
from sklearn.base import clone
from sklearn.utils import safe_mask

from ..common import BaseLearner
from .utils_feature_selection import (
    manual_sort,
    kde_based_pi_thr,
    visualize_selection_frequency_versus_iteration
)


def _adaptive_feature_sampling_exploitation_exploration(sampling_options,
                                                        rng,
                                                        gamma,
                                                        m,
                                                        M,
                                                        G,
                                                        keep_all_iters,
                                                        max_iters_to_keep,
                                                        Pi_hat_seq,
                                                        prev_partition,
                                                        k,
                                                        verbose):
    """Exploitation & Exploration adaptive feature sampling procedure.

    Parameters
    ----------
    sampling_options : dict
        Dictionary with parameter names (`str`) as keys and specific parameter
        settings as values. See the documentation of `AdaSTAMPS` for details.

    rng : RandomState instance
        The random number generator used for sampling. The same RandomState instance must be shared across all iterations.

    gamma : float
        Controls the proportion of features sampled from the active set.

    m : int
        The number of features in a minipatch.

    M : int
        The total number of input features.

    G : int
        Number of iterations for each burn-in epoch.

    keep_all_iters : bool
        See the documentation of `AdaSTAMPS` for details.

    max_iters_to_keep : int
        See the documentation of `AdaSTAMPS` for details.

    Pi_hat_seq : array of shape (n_features, num_iterations)
        Contains the selection frequency of features over previous iterations.

    prev_partition : list of arrays
        Each element contains the indices of features that are sampled into a minipatch.

    k : int
        The current iteration number.

    verbose : {0, 1, 2}
        Controls the verbosity.

    Returns
    -------
    Fk : array of shape (m, )
        Contains the integer indices of the features from X that are in the current minipatch.

    rng : RandomState instance

    gamma : float
        The gamma value from the current iteration.

    feature_set_partition : list of arrays
        This is only returned during the burn-in stage.

    """

    # Specify options for the sampling procedure
    gamma_max = sampling_options["gamma_max"]  # usually defaults to 1

    # Initialize array to store index for all M features
    full_feature_idx = np.arange(M)

    # Subsample features using Adaptive Feature Sampling Exploitation and Exploration
    if k <= (sampling_options["E"] * G):  # during burn-in
        if int(k % G) == 1:
            # Reshuffle the full index set and partition at the beginning of an epoch (during burn-in only)
            feature_set_partition = np.array_split(rng.permutation(full_feature_idx), G)
            if verbose == 2:
                print("Reshuffling index set at k = %.i and partitioning into G=%.i groups is completed" % (k, G))
        else:
            feature_set_partition = prev_partition

        Fk_current = feature_set_partition[int(k % G)]
        if Fk_current.size == m:
            Fk = Fk_current
        elif Fk_current.size < m:
            if int(k % G) < (len(feature_set_partition) - 1):
                # take Fk_append from the next block
                Fk_append = rng.choice(feature_set_partition[int(k % G) + 1],
                                       size=int(m - Fk_current.size),
                                       replace=False)
            elif int(k % G) == (len(feature_set_partition) - 1):
                Fk_append = rng.choice(feature_set_partition[0],
                                       size=int(m - Fk_current.size),
                                       replace=False)
            Fk = np.concatenate((Fk_current, Fk_append))
    else:  # after burn-in

        if keep_all_iters:
            above_thr_mask = Pi_hat_seq[:, (k - 1)] >= sampling_options["active_set_thr"]
        else:
            above_thr_mask = Pi_hat_seq[:, int((k - 2) % max_iters_to_keep)] >= sampling_options["active_set_thr"]
        active_set_above_thr = full_feature_idx[above_thr_mask]
        active_set_size = np.sum(above_thr_mask)
        if verbose == 2:
            print("The active set above threshold |A| = %.i at iteration k = %.i" % (active_set_size, k))

        # Increase the proportion of active set sampled in the subsequent minipatch
        gamma = np.minimum((sampling_options["gamma_multiplier"] * gamma), gamma_max)
        num_var_from_active = np.minimum(m, int(gamma * active_set_size))
        if verbose == 2:
            print("%.i variables are sampled from active set at iteration k = %.i" % (num_var_from_active, k))
        Fk_from_active = rng.choice(active_set_above_thr, size=num_var_from_active, replace=False)

        if (m - num_var_from_active) > 0:
            candidate_set_below_thr = full_feature_idx[np.logical_not(above_thr_mask)]
            Fk_from_rest = rng.choice(candidate_set_below_thr, size=int(m - num_var_from_active), replace=False)
            Fk = np.sort(np.concatenate((Fk_from_active, Fk_from_rest)))
        else:
            Fk = Fk_from_active

    if k <= (sampling_options["E"] * G):
        return Fk, rng, gamma, feature_set_partition
    else:
        return Fk, rng, gamma
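
# Minimal sketch of the burn-in partitioning used above (illustration only, not part of the
# module; the sizes below are assumptions). M features are split into G blocks, and a block
# shorter than m is padded with indices borrowed from the next block so that every minipatch
# holds exactly m features:
#
#   >>> import numpy as np
#   >>> rng = np.random.RandomState(0)
#   >>> partition = np.array_split(rng.permutation(np.arange(10)), 4)   # M=10, G=4
#   >>> [block.size for block in partition]
#   [3, 3, 2, 2]
#   >>> # with m = 3, a block of size 2 borrows m - 2 = 1 index (without replacement)
#   >>> # from the following block, exactly as in the elif branch above.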


def _adaptive_feature_sampling_probabilistic(sampling_options,
                                             rng,
                                             m,
                                             M,
                                             G,
                                             keep_all_iters,
                                             max_iters_to_keep,
                                             Pi_hat_seq,
                                             prev_partition,
                                             k,
                                             verbose):
    """Probabilistic adaptive feature sampling procedure.

    Parameters
    ----------
    sampling_options : dict
        Dictionary with parameter names (`str`) as keys and specific parameter
        settings as values. See the documentation of `AdaSTAMPS` for details.

    rng : RandomState instance
        The random number generator used for sampling. The same RandomState instance must be shared across all iterations.

    m : int
        The number of features in a minipatch.

    M : int
        The total number of input features.

    G : int
        Number of iterations for each burn-in epoch.

    keep_all_iters : bool
        See the documentation of `AdaSTAMPS` for details.

    max_iters_to_keep : int
        See the documentation of `AdaSTAMPS` for details.

    Pi_hat_seq : array of shape (n_features, num_iterations)
        Contains the selection frequency of features over previous iterations.

    prev_partition : list of arrays
        Each element contains the indices of features that are sampled into a minipatch.

    k : int
        The current iteration number.

    verbose : {0, 1, 2}
        Controls the verbosity.

    Returns
    -------
    Fk : array of shape (m, )
        Contains the integer indices of the features from X that are in the current minipatch.

    rng : RandomState instance

    feature_set_partition : list of arrays
        This is only returned during the burn-in stage.

    """

    # Initialize array to store index for all M features
    full_feature_idx = np.arange(M)

    if k <= (sampling_options["E"] * G):  # during burn-in
        if int(k % G) == 1:
            # Reshuffle the full index set and partition at the beginning of an epoch (during burn-in only)
            feature_set_partition = np.array_split(rng.permutation(full_feature_idx), G)
            if verbose == 2:
                print("Reshuffling index set at k = %.i and partitioning into G=%.i groups is completed" % (k, G))
        else:
            feature_set_partition = prev_partition

        Fk_current = feature_set_partition[int(k % G)]
        if Fk_current.size == m:
            Fk = Fk_current
        elif Fk_current.size < m:
            if int(k % G) < (len(feature_set_partition) - 1):
                # take Fk_append from the next block
                Fk_append = rng.choice(feature_set_partition[int(k % G) + 1],
                                       size=int(m - Fk_current.size), replace=False)
            elif int(k % G) == (len(feature_set_partition) - 1):
                Fk_append = rng.choice(feature_set_partition[0],
                                       size=int(m - Fk_current.size), replace=False)
            Fk = np.concatenate((Fk_current, Fk_append))

    else:  # after burn-in

        if keep_all_iters:
            feature_prob = Pi_hat_seq[:, (k - 1)] / np.sum(Pi_hat_seq[:, (k - 1)])
        else:
            feature_prob = (Pi_hat_seq[:, int((k - 2) % max_iters_to_keep)]
                            / np.sum(Pi_hat_seq[:, int((k - 2) % max_iters_to_keep)]))

        Fk = np.sort(rng.choice(M, size=m, replace=False, p=feature_prob))

    if k <= (sampling_options["E"] * G):
        return Fk, rng, feature_set_partition
    else:
        return Fk, rng
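
# Minimal sketch of the post-burn-in probabilistic step above (illustration only; the
# frequency values are assumptions). Selection frequencies are normalized into a probability
# vector and feature indices are then drawn without replacement in proportion to it:
#
#   >>> import numpy as np
#   >>> rng = np.random.RandomState(0)
#   >>> pi_hat = np.array([0.9, 0.1, 0.5, 0.5])     # current selection frequencies
#   >>> p = pi_hat / pi_hat.sum()                   # feature_prob in the code above
#   >>> Fk = np.sort(rng.choice(4, size=2, replace=False, p=p))
#   >>> Fk.size
#   2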


class AdaSTAMPS(BaseLearner):
    """Feature selection with Adaptive Stable Minipatch Selection.

    This is a meta-algorithm that repeatedly fits base feature selectors to many random or
    adaptively chosen subsets of both observations and features (minipatches) and ensembles the
    selection events from all these base selectors. At the end of the algorithm, the final
    selection frequency is computed for each input feature as the number of times it is sampled
    and then selected by base selectors divided by the number of times it is sampled into
    minipatches. The selection frequency signifies the importance of features. The algorithm
    eventually selects the set of features whose selection frequency is above a certain
    threshold (either user-specified or determined automatically).

    Important note: `AdaSTAMPS` assumes that all necessary pre-processing steps prior to
    feature selection have already been carried out on the input data (X, y). For instance,
    data standardization (centering and/or scaling) needs to be performed on the raw data prior
    to calling `fit()` if such data pre-processing is deemed necessary by the users.

    Parameters
    ----------
    base_selector : ``Estimator`` instance
        A feature selector with a ``fit`` method that provides binary selection indicators
        (1 for selected features and 0 for unselected features). See **Notes** for more details.

    minipatch_m_ratio : float, default=0.05
        The proportion of features to draw from X randomly without replacement to train each
        base selector. Specifically, `round(minipatch_m_ratio * X.shape[1])` features are
        randomly drawn into each minipatch. Note that the same feature won't appear twice in a
        given minipatch (due to sampling without replacement), but a feature can appear in
        multiple minipatches. `minipatch_m_ratio` should be in the interval (0.0, 1.0].
        See **Notes** for more details.

    minipatch_n_ratio : float, default=0.5
        The proportion of observations to draw from X uniformly at random without replacement
        to train each base selector. Specifically, `round(minipatch_n_ratio * X.shape[0])`
        observations are randomly drawn into each minipatch. Note that the same observation
        won't appear twice in a given minipatch (due to sampling without replacement), but an
        observation can appear in multiple minipatches. `minipatch_n_ratio` should be in the
        interval (0.0, 1.0]. See **Notes** for more details.

    sampling_options : dict, default=None
        Dictionary with parameter names (`str`) as keys and specific parameter settings as
        values. This specifies the randomization scheme used to sample features into
        minipatches. Unless set to `None`, `sampling_options` is required to have a key named
        `'mode'`, whose value must be one of {'ee', 'prob', 'uniform'}. It is recommended to
        set `sampling_options` to `None` for starters, which uses the default Exploitation &
        Exploration adaptive feature sampling scheme with parameter values set to the
        respective recommended values as described below. See **Notes** for more details.

        - If `'mode'` has value 'ee', it uses the Exploitation & Exploration scheme to
          adaptively sample features into minipatches. In this case, `sampling_options` is
          required to have the following parameters as additional keys:

          - `'E'` : int. The number of burn-in epochs during which every feature is sampled
            exactly `'E'` times to get an initial guess of feature importance before starting
            the adaptive sampling of features. A value of 10 generally works well for many
            problems. Note that a larger `'E'` generally requires increasing the maximum number
            of iterations (`'max_k'`) in `stopping_criteria_options`.
          - `'active_set_thr'` : float. The selection frequency threshold above which a feature
            is put into the active set during the adaptive sampling stage. A value of 0.1
            generally works well for many problems. Note that its value should be in the
            interval (0.0, 1.0). A larger value generally means fewer features in the active set.
          - `'gamma_min'` : float. The minimum proportion of features in the active set to
            sample into minipatches at the beginning of the adaptive sampling stage. It is
            recommended to fix its value to 0.5. Note that its value should be in the interval
            (0.0, 1.0), and should not exceed the value of `'gamma_max'`.
          - `'gamma_max'` : float. The maximum proportion of features in the active set to
            sample into minipatches as the adaptive sampling scheme proceeds. It is recommended
            to fix its value to 1.0. Note that its value should be in the interval (0.0, 1.0].
          - `'gamma_len'` : int. The number of iterations it takes for the adaptive feature
            sampler to go from `'gamma_min'` to `'gamma_max'`. This controls the trade-off
            between exploiting the active set and exploring the remaining input feature space.
            In general, a smaller value favors exploitation while a larger value favors
            exploration. A value in the range [50, 500] generally works well for many problems.

        - If `'mode'` has value 'prob', it uses the Probabilistic scheme to adaptively sample
          features into minipatches. In this case, `sampling_options` is required to have the
          following parameters as additional keys:

          - `'E'` : int. This is the same as the `'E'` parameter in the case of `'mode'` being
            'ee'. See the descriptions above for details.

        - If `'mode'` has value 'uniform', it samples features uniformly at random without
          replacement into minipatches. In this case, `sampling_options` does not need to have
          other key-value pairs.

    stopping_criteria_options : dict, default=None
        Dictionary with parameter names (`str`) as keys and specific parameter settings as
        values. This specifies parameter values for the data-driven stopping rule, which stops
        the meta-algorithm when the rank ordering of the top features in terms of selection
        frequency remains unchanged for the past `'num_last_iterations'` iterations. Unless set
        to `None`, `stopping_criteria_options` is required to have 4 keys: `'tau_u'`,
        `'tau_l'`, `'max_k'`, and `'num_last_iterations'`. It is recommended to set
        `stopping_criteria_options` to `None` for starters, which sets the parameter values to
        the respective defaults as described below.

        - `'tau_u'` : int. This specifies the maximum number of top features whose rank
          orderings should be considered when assessing the stopping rule. It is recommended to
          set its value to well exceed the expected number of truly informative features. The
          default value is set to 30. Note that its value should be much smaller than the total
          number of input features.
        - `'tau_l'` : int. This specifies the minimum number of top features whose rank
          orderings should be considered when assessing the stopping rule. It is recommended to
          set its value to well exceed the expected number of truly informative features. The
          default value is set to 15. Note that its value should be much smaller than `'tau_u'`.
        - `'num_last_iterations'` : int. The algorithm stops when the rank ordering of the top
          features in terms of selection frequency remains unchanged for the past
          `'num_last_iterations'` iterations. It is recommended to fix its value to 100. Note
          that an unreasonably large value could render the stopping rule ineffective.
        - `'max_k'` : int. The maximum number of iterations to run the meta-algorithm if the
          data-driven stopping rule has not stopped it earlier. The default value is set to
          5000. If `'mode'` of `sampling_options` is set to {'ee', 'prob'} and `'max_k'` is set
          to `None`, the algorithm will automatically compute `'max_k'` to be 5 times the
          number of burn-in iterations.

    random_state : int, default=0
        Controls both the randomness of sampling observations and sampling features into
        minipatches.

    keep_all_iters : bool, default=True
        Whether to store and output intermediate feature selection frequency across all
        iterations. It could be useful to visualize selection frequency of all features versus
        iteration number for qualitatively discovering informative features. However, if the
        number of input features is large (e.g. hundreds of thousands), then it is recommended
        to set this to `False` to avoid consuming too much memory. If set to `False`, it is
        required to set a value for `max_iters_to_keep`.

    max_iters_to_keep : int, default=None
        This value is ignored if `keep_all_iters` is `True`. Otherwise, this specifies the
        number of iterations (counting backwards from the last iteration) for which feature
        selection frequency should be stored and output. Note that `max_iters_to_keep` should
        be at least as large as the `'num_last_iterations'` of `stopping_criteria_options`
        (which is the default when `max_iters_to_keep` is set to `None`).

    verbose : {0, 1, 2}
        Controls the verbosity: the higher, the more messages are displayed.

    Attributes
    ----------
    last_k_ : int
        The total number of iterations for which the meta-algorithm has run.

    Pi_hat_last_k_ : ndarray of shape (n_features,)
        The final selection frequency for each of the input features. Each element is in the
        interval [0.0, 1.0]. A larger value indicates that the corresponding feature is more
        informative, and vice versa.

    full_Pi_hat_seq_ : ndarray of shape (n_features, `last_k_`) or (n_features, `max_iters_to_keep`)
        If `keep_all_iters` is `True`, then this is a ndarray of shape (n_features, `last_k_`)
        containing the selection frequency of all input features from the first iteration to
        the last. If `keep_all_iters` is `False`, then this is a ndarray of shape
        (n_features, `max_iters_to_keep`) containing the selection frequency of all input
        features for the last `max_iters_to_keep` iterations.

    full_Pi_hat_k_seq_ : ndarray of shape (`last_k_`,) or (`max_iters_to_keep`,)
        This contains the iteration numbers corresponding to the columns of `full_Pi_hat_seq_`.

    burn_in_length_ : int
        The total number of iterations spent in the burn-in stage. If `'mode'` of
        `sampling_options` is set to 'uniform' or if `keep_all_iters` is `False`, this is set
        to `None`.

    Notes
    -----
    - More details about `base_selector`:
      The AdaSTAMPS meta-algorithm can be employed with a wide variety of feature selection
      techniques as the base selector on minipatches. This package currently provides two
      highly efficient base selector classes -
      `mplearn.feature_selection.base_selector.ThresholdedOLS` for regression problems and
      `mplearn.feature_selection.base_selector.DecisionTreeSelector` for both regression and
      classification problems. However, a user-supplied selector is also allowed as long as the
      selector class follows the same structure as the two base selectors mentioned above
      (i.e. has a ``fit`` method that accepts minipatch feature indices and provides binary
      selection indicators (1 for selected features and 0 for unselected features)).
    - More details about choice of minipatch size:
      Suppose the data X has N observations (rows) and M features (columns). Following the
      notation of [1], a minipatch is obtained by subsampling n observations and m features
      simultaneously without replacement from X using some form of randomization. The parameter
      `minipatch_m_ratio` represents :math:`m/M` and `minipatch_n_ratio` represents
      :math:`n/N`. As demonstrated in [1], the performance of the meta-algorithm is robust for
      a sensible range of n and m values. The general rule of thumb is to take m to well exceed
      the expected number of true informative features (e.g. 3-10 times the expected number of
      true informative features) and then pick n relative to m such that it well exceeds the
      sample complexity of the base selector used.
    - We refer the users to the original paper [1] for detailed algorithms for the various
      sampling procedures and the stopping rule.

    References
    ----------
    .. [1] Yao, T. and Allen, G. I., "Feature Selection for Huge Data via Minipatch Learning",
           arXiv:2010.08529.

    Examples
    --------
    The following example shows how to retrieve the 4 truly informative features in the sparse
    regression dataset.

    >>> from sklearn.datasets import make_sparse_uncorrelated
    >>> from mplearn.feature_selection.base_selector import ThresholdedOLS
    >>> from mplearn.feature_selection import AdaSTAMPS
    >>> X, y = make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=0)
    >>> thresholded_ols = ThresholdedOLS(num_features_to_select=None, screening_thresh=None)
    >>> selector = AdaSTAMPS(base_selector=thresholded_ols,
    ...                      minipatch_m_ratio=0.5,
    ...                      minipatch_n_ratio=0.5,
    ...                      random_state=123,
    ...                      verbose=0)
    >>> fitted_selector = selector.fit(X, y)
    >>> fitted_selector.get_support(indices=True, pi_thr=0.5)
    array([0, 1, 2, 3])
    >>> X_new = fitted_selector.transform(X, pi_thr=0.5)
    >>> X_new.shape
    (100, 4)
    >>> fitted_selector.visualize_selection_frequency(max_features_to_plot=None)  # doctest: +SKIP
    """

    def __init__(self,
                 base_selector,
                 *,
                 minipatch_m_ratio=0.05,
                 minipatch_n_ratio=0.5,
                 sampling_options=None,
                 stopping_criteria_options=None,
                 random_state=0,
                 keep_all_iters=True,
                 max_iters_to_keep=None,
                 verbose=0):

        if sampling_options is None:
            sampling_options = {"mode": "ee",
                                "E": 10,
                                "active_set_thr": 0.1,
                                "gamma_min": 0.5,
                                "gamma_max": 1.,
                                "gamma_len": 50}

        if stopping_criteria_options is None:
            stopping_criteria_options = {"tau_u": 30,
                                         "tau_l": 15,
                                         "max_k": 5000,
                                         "num_last_iterations": 100}

        self.base_selector = base_selector
        self.minipatch_m_ratio = minipatch_m_ratio
        self.minipatch_n_ratio = minipatch_n_ratio
        self.sampling_options = sampling_options
        self.stopping_criteria_options = stopping_criteria_options
        self.random_state = random_state
        self.keep_all_iters = keep_all_iters
        self.max_iters_to_keep = max_iters_to_keep
        self.verbose = verbose
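
    # Sizing sketch (illustration only; the concrete numbers are assumptions based on the
    # defaults minipatch_m_ratio=0.05 and E=10, with a hypothetical M = 1000 features):
    #   m = round(0.05 * 1000) = 50 features per minipatch,
    #   G = ceil(1 / 0.05)     = 20 iterations per burn-in epoch,
    #   burn-in length = E * G = 10 * 20 = 200 iterations,
    # so every feature is sampled exactly E = 10 times before adaptive sampling starts.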

    def fit(self, X, y):
        """Fit the AdaSTAMPS model to data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The training input samples. Note that data frame or sparse matrix formats are not
            allowed. Also, the dtype of X has to be numeric (e.g. float, int). The algorithm
            expects that all appropriate preprocessing steps on X have been completed prior to
            calling `fit`.

        y : ndarray of shape (n_samples,)
            The target values. Note that for classification problems (categorical y), the input
            y should contain integers denoting class labels instead of actual class names
            (`str`). In other words, the dtype of y has to be numeric (e.g. float, int).

        Returns
        -------
        self : object
            Fitted estimator.

        Notes
        -----
        Allows NaN/Inf in the input if the underlying base selector does as well.
        """

        # Specify minipatch size
        m_ratio = self.minipatch_m_ratio  # m/M
        n_ratio = self.minipatch_n_ratio  # n/N

        # Dimensions of data matrix
        N, M = X.shape
        n = int(np.round(n_ratio * N))
        m = int(np.round(m_ratio * M))

        # Specify the maximum number of iterations to run if the data-driven stopping criterion is not satisfied
        max_k = self.stopping_criteria_options["max_k"]
        if max_k is None:
            if self.sampling_options["mode"] != "uniform":
                max_k = int((np.ceil(1. / m_ratio) * self.sampling_options["E"]) * 5)
            else:
                raise Exception('Setting max_k to None is not allowed when sampling mode is uniform.')

        # Initialize parameters for the feature sampling procedure
        if self.sampling_options["mode"] != "uniform":
            G = int(np.ceil(1. / m_ratio))
            iter_to_start_checking_criterion = int(self.sampling_options["E"] * G)
            burn_in_length = int(self.sampling_options["E"] * G)
            prev_partition = np.array_split(np.arange(M), G)
            if self.sampling_options["mode"] == "ee":
                gamma_seq_after_burn_in = np.geomspace(self.sampling_options["gamma_min"],
                                                       self.sampling_options["gamma_max"],
                                                       num=self.sampling_options["gamma_len"])
                self.sampling_options["gamma_multiplier"] = gamma_seq_after_burn_in[1] / gamma_seq_after_burn_in[0]
                # Initialize gamma to start from the minimum value
                gamma = self.sampling_options["gamma_min"]
        else:
            iter_to_start_checking_criterion = np.minimum(500, max_k)
            burn_in_length = None

        # Specify parameter values for the data-driven stopping criteria
        num_last_iterations = self.stopping_criteria_options["num_last_iterations"]
        num_top_var_to_check_lower_bound = np.minimum(self.stopping_criteria_options["tau_l"], int(0.5 * M))
        stop_criterion_matrix = np.zeros((self.stopping_criteria_options["tau_u"], num_last_iterations),
                                         dtype=np.int64)
        stop_criterion_met = False
        stop_criterion_counter = -1 * iter_to_start_checking_criterion

        # Initialize array to store feature selection frequency over iterations
        if self.keep_all_iters:
            Pi_hat_seq = np.zeros((M, (max_k + 1)))
        else:
            if self.max_iters_to_keep is None:
                self.max_iters_to_keep = self.stopping_criteria_options["num_last_iterations"]
            Pi_hat_seq = np.zeros((M, self.max_iters_to_keep))
            Pi_hat_k_seq = np.zeros(self.max_iters_to_keep)

        # Initialize random number generator
        rng = np.random.RandomState(self.random_state)

        # Initialize iteration counter
        k = 0

        # Get a clean copy of the base selector
        base_selector = clone(self.base_selector)

        # Keep a running tally of the number of times each variable is subsampled and selected
        total_times_subsampled_selected = np.zeros(M)
        total_times_subsampled = np.zeros(M)

        continue_alg = True
        while continue_alg:
            # Increment iteration counter
            k += 1

            # Initialize temporary storage for results (cleared after each iteration)
            var_subsampled_indicator = np.zeros(M)
            var_subsampled_selected_indicator = np.zeros(M)

            # Subsample n observations from {1,...,N} uniformly at random without replacement
            Ik = np.sort(rng.choice(N, size=n, replace=False))

            if self.sampling_options["mode"] == "uniform":  # use uniform sampling of features
                Fk = np.sort(rng.choice(M, size=m, replace=False))
            elif self.sampling_options["mode"] == "ee":  # use Exploitation and Exploration Adaptive Feature Sampling
                if k <= burn_in_length:
                    Fk, rng, gamma, prev_partition = _adaptive_feature_sampling_exploitation_exploration(
                        self.sampling_options, rng, gamma, m, M, G, self.keep_all_iters,
                        self.max_iters_to_keep, Pi_hat_seq, prev_partition, k, self.verbose)
                else:
                    Fk, rng, gamma = _adaptive_feature_sampling_exploitation_exploration(
                        self.sampling_options, rng, gamma, m, M, G, self.keep_all_iters,
                        self.max_iters_to_keep, Pi_hat_seq, prev_partition, k, self.verbose)
            elif self.sampling_options["mode"] == "prob":  # use Probabilistic Adaptive Feature Sampling
                if k <= burn_in_length:
                    Fk, rng, prev_partition = _adaptive_feature_sampling_probabilistic(
                        self.sampling_options, rng, m, M, G, self.keep_all_iters,
                        self.max_iters_to_keep, Pi_hat_seq, prev_partition, k, self.verbose)
                else:
                    Fk, rng = _adaptive_feature_sampling_probabilistic(
                        self.sampling_options, rng, m, M, G, self.keep_all_iters,
                        self.max_iters_to_keep, Pi_hat_seq, prev_partition, k, self.verbose)
            else:
                raise Exception('Other sampling procedures not implemented.')

            stop_criterion_counter += 1
            var_subsampled_indicator[Fk] = 1

            base_selector_on_minipatch = clone(base_selector)
            selection_on_minipatch = base_selector_on_minipatch.fit(X[np.ix_(list(Ik), list(Fk))], y[Ik], Fk)

            # Set the j-th entry to 1 if \beta_j^{(k)} ~= 0 and j \in Fk
            var_subsampled_selected_indicator[selection_on_minipatch.Fk_] = selection_on_minipatch.selection_indicator_

            # Keep a running tally of each feature's stability score up until the kth iteration (inclusive)
            total_times_subsampled_selected = total_times_subsampled_selected + var_subsampled_selected_indicator
            total_times_subsampled = total_times_subsampled + var_subsampled_indicator

            if self.keep_all_iters:
                Pi_hat_seq[:, k] = total_times_subsampled_selected / np.maximum(1, total_times_subsampled)
                if stop_criterion_counter >= 1:
                    # estimate size of possible stable set whose selection frequencies are above 0.5
                    num_var_above_tau_thr = np.sum(Pi_hat_seq[:, k] >= 0.5)
                    num_top_var_to_check = np.minimum(self.stopping_criteria_options["tau_u"],
                                                      np.maximum(num_var_above_tau_thr,
                                                                 num_top_var_to_check_lower_bound))
            else:
                Pi_hat_seq[:, int((k - 1) % self.max_iters_to_keep)] = (total_times_subsampled_selected
                                                                        / np.maximum(1, total_times_subsampled))
                Pi_hat_k_seq[int((k - 1) % self.max_iters_to_keep)] = k
                if stop_criterion_counter >= 1:
                    num_var_above_tau_thr = np.sum(Pi_hat_seq[:, int((k - 1) % self.max_iters_to_keep)] >= 0.5)
                    num_top_var_to_check = np.minimum(self.stopping_criteria_options["tau_u"],
                                                      np.maximum(num_var_above_tau_thr,
                                                                 num_top_var_to_check_lower_bound))

            # Compute the stopping criterion at the kth iteration
            if k >= np.maximum(iter_to_start_checking_criterion + 1, 10):
                if self.keep_all_iters:
                    top_Pi_hat_descend_idx = manual_sort(Pi_hat_seq[:, k], num_top_var_to_check,
                                                         self.stopping_criteria_options["tau_u"])
                    stop_criterion_matrix[:, int((stop_criterion_counter - 1) % num_last_iterations)] = top_Pi_hat_descend_idx
                else:
                    top_Pi_hat_descend_idx = manual_sort(Pi_hat_seq[:, int((k - 1) % self.max_iters_to_keep)],
                                                         num_top_var_to_check,
                                                         self.stopping_criteria_options["tau_u"])
                    stop_criterion_matrix[:, int((stop_criterion_counter - 1) % num_last_iterations)] = top_Pi_hat_descend_idx

                # check the stopping criterion after the stop_criterion_matrix has been fully filled out for the first time
                if k >= np.maximum(iter_to_start_checking_criterion + num_last_iterations + 1, 10):
                    unique_top_var_sets = np.unique(stop_criterion_matrix, axis=1)
                    if unique_top_var_sets.shape[1] == 1:
                        stop_criterion_met = True
                    else:
                        if self.verbose == 2:
                            print("=== Number of unique set of top var in the last %.i iterations is %.i"
                                  % (num_last_iterations, unique_top_var_sets.shape[1]))

            if (k + 1) > max_k:
                continue_alg = False
                print("==== Maximum number of iteration reached. ======")
                last_k = k
            elif k >= np.maximum(iter_to_start_checking_criterion + num_last_iterations + 1, 10) and stop_criterion_met:
                continue_alg = False
                print("=== Stopping criterion reached at k = " + str(k))
                last_k = k
            else:
                if self.verbose == 2:
                    print("========= Currently finished iteration k = " + str(k) + "=========")

        if self.keep_all_iters:
            Pi_hat_last_k = Pi_hat_seq[:, last_k]
        else:
            Pi_hat_last_k = Pi_hat_seq[:, int((last_k - 1) % self.max_iters_to_keep)]

        if self.keep_all_iters:
            output_Pi_hat_seq = Pi_hat_seq[:, 1:(last_k + 1)]
            output_Pi_hat_k_seq = np.arange(1, (last_k + 1))
            # clear memory
            Pi_hat_seq = None
        else:
            if last_k > self.max_iters_to_keep:
                output_Pi_hat_seq = np.concatenate((Pi_hat_seq[:, int(last_k % self.max_iters_to_keep):],
                                                    Pi_hat_seq[:, :int(last_k % self.max_iters_to_keep)]), axis=1)
                # clear memory
                Pi_hat_seq = None
                output_Pi_hat_k_seq = np.concatenate((Pi_hat_k_seq[int(last_k % self.max_iters_to_keep):],
                                                      Pi_hat_k_seq[:int(last_k % self.max_iters_to_keep)]))
            else:
                output_Pi_hat_seq = Pi_hat_seq[:, :last_k]
                output_Pi_hat_k_seq = Pi_hat_k_seq[:last_k]
                # clear memory
                Pi_hat_seq = None

        if not self.keep_all_iters:
            burn_in_length = None

        self.last_k_ = last_k
        self.Pi_hat_last_k_ = Pi_hat_last_k
        self.full_Pi_hat_seq_ = output_Pi_hat_seq
        self.full_Pi_hat_k_seq_ = output_Pi_hat_k_seq
        self.burn_in_length_ = burn_in_length

        return self
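
    # Worked sketch of the selection frequency update in `fit` (illustration only; the tallies
    # below are assumptions). After k iterations, each feature j has
    #   Pi_hat_j = (# times j was sampled and selected) / max(1, # times j was sampled):
    #
    #   >>> import numpy as np
    #   >>> times_selected = np.array([8., 1., 0.])
    #   >>> times_sampled = np.array([10., 10., 0.])
    #   >>> times_selected / np.maximum(1, times_sampled)
    #   array([0.8, 0.1, 0. ])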

    def get_support(self, indices=False, pi_thr=None):
        """Get a mask, or integer index, of the features selected by the meta-algorithm.

        Parameters
        ----------
        indices : bool, default=False
            If True, the return value will be an array of integers, rather than a boolean mask.

        pi_thr : float, default=None
            The selection frequency threshold above which a feature is considered selected. A
            larger threshold indicates a more stringent criterion. By default (`None`), a
            data-driven procedure is run to choose this threshold automatically. This is
            generally recommended; however, this procedure might take a long time if
            [# input features] is large (e.g. hundreds of thousands). For many problems,
            setting this threshold to 0.5 is a reasonable choice. Note that this threshold must
            be within (0.0, 1.0).

        Returns
        -------
        support : ndarray
            If `indices` is `False`, this is a boolean array of shape [# input features], in
            which an element is True iff its corresponding feature is selected by the
            algorithm. If `indices` is `True`, this is an integer array of shape
            [# output features] whose values are indices into the input feature vector.
        """
        check_is_fitted(self)

        error_msg = (
            "pi_thr must be either None "
            "or a float in (0.0, 1.0)."
        )

        if pi_thr is None:
            pi_thr = kde_based_pi_thr(self.Pi_hat_last_k_)
        elif pi_thr <= 0.0:
            raise ValueError(error_msg)
        elif pi_thr >= 1.0:
            raise ValueError(error_msg)

        support_ = np.ones(self.Pi_hat_last_k_.size, dtype=bool)
        support_[self.Pi_hat_last_k_ < pi_thr] = False

        return support_ if not indices else np.where(support_)[0]

    def visualize_selection_frequency(self, max_features_to_plot=None):
        """Visualize the selection frequency of the input features.

        It is generally useful to visualize the selection frequency of the input features
        versus the number of iterations for better insights into the estimated importance of
        the features.

        Parameters
        ----------
        max_features_to_plot : int, default=None
            Controls the maximum number of features whose selection frequency over iterations
            is visualized. By default (`None`), all input features are shown. However, such
            visualization might consume too much memory if the number of input features is too
            large (e.g. 5000). In such cases, consider setting `max_features_to_plot` to be
            much smaller than n_features, which will only plot a small fraction of the features
            whose selection frequency is below 0.3 to save on memory consumption.

        Returns
        -------
        None
        """
        check_is_fitted(self)

        error_msg = (
            "max_features_to_plot must be either None "
            "or a positive integer in (0, M]."
        )

        if max_features_to_plot is None:
            max_features_to_plot = self.Pi_hat_last_k_.size
        elif max_features_to_plot <= 0:
            raise ValueError(error_msg)
        elif not isinstance(max_features_to_plot, numbers.Integral):
            raise ValueError(error_msg)

        visualize_selection_frequency_versus_iteration(self.full_Pi_hat_seq_,
                                                       self.full_Pi_hat_k_seq_,
                                                       self.burn_in_length_,
                                                       max_features_to_plot)

    def transform(self, X, pi_thr=None):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The input samples. Note that data frame or sparse matrix formats are not allowed.
            Also, the dtype of X has to be numeric (e.g. float, int).

        pi_thr : float, default=None
            The selection frequency threshold above which a feature is considered selected. See
            the documentation of `get_support` for details.

        Returns
        -------
        X_r : ndarray of shape (n_samples, n_selected_features)
            The input samples with only the selected features.
        """
        check_is_fitted(self)

        mask = self.get_support(pi_thr=pi_thr)
        if not mask.any():
            warn(
                "No features were selected: either the data is"
                " too noisy or the selection threshold is too strict.",
                UserWarning,
            )
            return np.empty(0).reshape((X.shape[0], 0))
        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")
        return X[:, safe_mask(X, mask)]
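

# Illustrative note on the 'ee' gamma schedule (not part of the module; values assume the
# default gamma_min=0.5, gamma_max=1.0, gamma_len=50). `fit` derives a per-iteration
# multiplier from a geometric sequence, so gamma grows from gamma_min toward gamma_max over
# roughly gamma_len iterations and is then capped at gamma_max:
#
#   >>> import numpy as np
#   >>> seq = np.geomspace(0.5, 1.0, num=50)
#   >>> multiplier = seq[1] / seq[0]   # about 1.014, applied once per iteration after burn-in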