Source code for mplearn.feature_selection.base_selector._base_selector

import numbers

import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from ...common import BaseLearner


class ThresholdedOLS(BaseLearner):
    """Feature selection with the thresholded OLS selector.

    This class is designed to be used as a base feature selector on the
    minipatches with the `mplearn.feature_selection.AdaSTAMPS` class.

    Parameters
    ----------
    num_features_to_select : int or float, default=None
        The number of features to select from the m features in a minipatch.

        - If `None`, the Bonferroni procedure described in [1] is used to
          automatically decide the number of features to select on the
          minipatch.
        - If a positive integer, it is the absolute number of features to
          select on a minipatch.
        - If a float in the interval (0.0, 1.0], it is the fraction of the
          m features in a minipatch to select.

    screening_thresh : float, default=None
        Ignored if the minipatch has more observations n than features m.
        For a high-dimensional minipatch (n < m), `screening_thresh` must be
        a float in the interval (0.0, 1.0); an efficient screening rule from
        [1] is first applied to reduce the number of features in the
        minipatch to `round(screening_thresh * n)`.

    Attributes
    ----------
    selection_indicator_ : ndarray of shape (m,) or (`round(screening_thresh * n)`,)
        A binary selection indicator for the features in the minipatch
        (1 for selected features and 0 for unselected features). For a
        low-dimensional minipatch (n > m), the shape is (m,). Otherwise,
        the shape is (`round(screening_thresh * n)`,).

    Fk_ : ndarray of shape (m,) or (`round(screening_thresh * n)`,)
        The integer indices of the features in `selection_indicator_`.
        Note that these indices correspond to these features' column indices
        in the full data X_full (N observations and M features).

    References
    ----------
    .. [1] Giurcanu, M. "Thresholding least-squares inference in
       high-dimensional regression models." Electron. J. Statist.
       10 (2) 2124-2156, 2016.
    """

    def __init__(self, *, num_features_to_select=None, screening_thresh=None):
        self.num_features_to_select = num_features_to_select
        self.screening_thresh = screening_thresh

    def fit(self, X, y, Fk):
        """Fit the thresholded OLS base selector to a minipatch.

        Parameters
        ----------
        X : ndarray of shape (n, m)
            The data matrix corresponding to the minipatch
            (n observations and m features).

        y : ndarray of shape (n,)
            The target values corresponding to the minipatch.

        Fk : ndarray of shape (m,)
            The integer indices of the features in the minipatch. Note that
            these indices correspond to these features' column indices in
            the full data X_full. For example, `X = X_full[:, Fk]`.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        n, m = X.shape

        # If the minipatch has more features than observations, apply the
        # screening rule to keep only (screening_thresh * n) features.
        if n < m:
            if self.screening_thresh is None:
                raise ValueError(
                    "n < m; please specify screening_thresh as a float "
                    "in (0.0, 1.0).")

            # Compute the componentwise least-squares estimate.
            y_n_m = np.tile(y, (m, 1)).transpose()
            beta_tilde = np.sum(y_n_m * X, axis=0) / np.sum(X ** 2, axis=0)
            beta_tilde_n_m = np.tile(beta_tilde, (n, 1))
            sigma_tilde_squared = (1. / (n * (n - 1))) * np.sum(
                np.square(y_n_m - (X * beta_tilde_n_m)), axis=0)
            gamma_tilde = np.absolute(beta_tilde) / np.sqrt(sigma_tilde_squared)

            # Keep the q features with the largest componentwise statistics.
            sorted_gamma_tilde_descend = np.sort(gamma_tilde)[::-1]
            q = int(self.screening_thresh * n)
            gamma_tilde_thres = sorted_gamma_tilde_descend[q]
            Fk_to_keep_mask = gamma_tilde > gamma_tilde_thres
            X = X[:, Fk_to_keep_mask]
            n, m = X.shape
            Fk = Fk[Fk_to_keep_mask]

        lm = LinearRegression(fit_intercept=False).fit(X, y)
        beta_hat_ols = lm.coef_
        beta_hat_ols_thresholded = beta_hat_ols.copy()

        if self.num_features_to_select is None:
            # Bonferroni-style thresholding of the OLS coefficients.
            t_alpha_quantile = 1. / (2 * np.log(n))
            t_scale = stats.t(df=(n - m)).ppf(1 - (t_alpha_quantile / m))
            error_var_hat = (1. / (n - m)) * (
                np.linalg.norm(y - np.matmul(X, beta_hat_ols)) ** 2)
            try:
                Omega_inv = np.linalg.inv((1. / n) * np.matmul(X.transpose(), X))
            except np.linalg.LinAlgError:
                Omega_inv = np.linalg.pinv((1. / n) * np.matmul(X.transpose(), X))
            sigma_bar_jj = ((1. / np.sqrt(n)) * np.sqrt(error_var_hat)) * np.sqrt(
                np.absolute(np.diagonal(Omega_inv)))
            beta_hat_ols_thresholded[
                np.absolute(beta_hat_ols) <= (sigma_bar_jj * t_scale)] = 0
        else:
            error_msg = (
                "num_features_to_select must be either None, a positive "
                "integer representing the absolute number of features to "
                "select on a minipatch, or a float in (0.0, 1.0] representing "
                "the fraction of the m features in a minipatch to select."
            )
            if self.num_features_to_select < 0:
                raise ValueError(error_msg)
            elif isinstance(self.num_features_to_select, numbers.Integral):
                num_features_to_select = self.num_features_to_select
            elif self.num_features_to_select > 1.0:
                raise ValueError(error_msg)
            else:
                num_features_to_select = int(m * self.num_features_to_select)

            # Keep the num_features_to_select largest OLS coefficients in
            # absolute value and threshold the rest to zero.
            sorted_beta_hat_ols = np.sort(np.absolute(beta_hat_ols))[::-1]
            beta_hat_ols_thresholded[
                np.absolute(beta_hat_ols)
                <= sorted_beta_hat_ols[num_features_to_select]] = 0

        self.selection_indicator_ = np.absolute(np.sign(beta_hat_ols_thresholded))
        self.Fk_ = Fk

        return self

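# Illustrative sketch (not part of the original source): how ThresholdedOLS is
# meant to be fit on a single minipatch. The synthetic data, the minipatch
# sizes (n=50, m=10), and the helper name `_demo_thresholded_ols` are
# assumptions chosen for illustration; only the fit(X, y, Fk) signature and
# the fitted attributes come from the class above.
def _demo_thresholded_ols(seed=0):
    rng = np.random.default_rng(seed)
    # Full data: N=200 observations, M=40 features, 3 truly relevant features.
    X_full = rng.standard_normal((200, 40))
    beta = np.zeros(40)
    beta[[3, 7, 21]] = [2.0, -1.5, 1.0]
    y_full = X_full @ beta + 0.1 * rng.standard_normal(200)
    # One random minipatch: 50 rows and 10 columns of the full data.
    I_k = rng.choice(200, size=50, replace=False)
    F_k = rng.choice(40, size=10, replace=False)
    # With num_features_to_select=None the Bonferroni-style threshold is used;
    # n > m here, so the screening rule is skipped.
    selector = ThresholdedOLS().fit(X_full[np.ix_(I_k, F_k)], y_full[I_k], F_k)
    # Map the binary indicator back to column indices of X_full.
    return selector.Fk_[selector.selection_indicator_ == 1]
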
class DecisionTreeSelector(BaseLearner):
    """Feature selection with the decision tree selector.

    This class is designed to be used as a base feature selector on the
    minipatches with the `mplearn.feature_selection.AdaSTAMPS` class. It is
    a wrapper around the DecisionTreeClassifier and the DecisionTreeRegressor
    from the sklearn package.

    Parameters
    ----------
    mode : {'classifier', 'regressor'}, default='classifier'
        Controls the type of decision tree model to use.

    max_depth : int, default=5
        The maximum depth of the tree. If `None`, nodes are expanded until
        all leaves are pure or until all leaves contain fewer than
        min_samples_split samples.

    criterion : str, default='gini'
        The criterion used to measure the quality of a split. If
        `mode='classifier'`, this must be one of {'gini', 'entropy'}.
        If `mode='regressor'`, this must be one of {'squared_error',
        'friedman_mse', 'absolute_error', 'poisson'}.

    num_features_to_select : int or float, default=0.1
        The number of features to select from the m features in a minipatch.

        - If a positive integer, it is the absolute number of features to
          select on a minipatch.
        - If a float in the interval (0.0, 1.0], it is the fraction of the
          m features in a minipatch to select.

    random_state : int, default=0
        Controls the randomness of the decision tree model.

    Attributes
    ----------
    selection_indicator_ : ndarray of shape (m,)
        A binary selection indicator for the features in the minipatch
        (1 for selected features and 0 for unselected features).

    Fk_ : ndarray of shape (m,)
        The integer indices of the features in `selection_indicator_`.
        Note that these indices correspond to these features' column indices
        in the full data X_full (N observations and M features).
    """

    def __init__(self, *, mode='classifier', max_depth=5, criterion='gini',
                 num_features_to_select=0.1, random_state=0):
        self.mode = mode
        self.max_depth = max_depth
        self.criterion = criterion
        self.num_features_to_select = num_features_to_select
        self.random_state = random_state

    def fit(self, X, y, Fk):
        """Fit the decision tree base selector to a minipatch.

        Parameters
        ----------
        X : ndarray of shape (n, m)
            The data matrix corresponding to the minipatch
            (n observations and m features).

        y : ndarray of shape (n,)
            The target values corresponding to the minipatch.

        Fk : ndarray of shape (m,)
            The integer indices of the features in the minipatch. Note that
            these indices correspond to these features' column indices in
            the full data X_full. For example, `X = X_full[:, Fk]`.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        n, m = X.shape

        if self.mode == 'classifier':
            if self.criterion not in ['gini', 'entropy']:
                raise ValueError(
                    "criterion must be one of {'gini', 'entropy'} for "
                    "classification.")
            estimator = DecisionTreeClassifier(
                criterion=self.criterion, max_depth=self.max_depth,
                random_state=self.random_state).fit(X, y)
        elif self.mode == 'regressor':
            if self.criterion not in ['squared_error', 'friedman_mse',
                                      'absolute_error', 'poisson']:
                raise ValueError(
                    "criterion must be one of {'squared_error', "
                    "'friedman_mse', 'absolute_error', 'poisson'} for "
                    "regression.")
            estimator = DecisionTreeRegressor(
                criterion=self.criterion, max_depth=self.max_depth,
                random_state=self.random_state).fit(X, y)
        else:
            raise ValueError("mode must be either 'classifier' or 'regressor'.")

        feature_importance_scores = estimator.feature_importances_

        error_msg = (
            "num_features_to_select must be either a positive integer "
            "representing the absolute number of features to select on a "
            "minipatch, or a float in (0.0, 1.0] representing the fraction "
            "of the m features in a minipatch to select."
        )
        if self.num_features_to_select < 0:
            raise ValueError(error_msg)
        elif isinstance(self.num_features_to_select, numbers.Integral):
            num_features_to_select = self.num_features_to_select
        elif self.num_features_to_select > 1.0:
            raise ValueError(error_msg)
        else:
            num_features_to_select = int(m * self.num_features_to_select)

        # Select the features with the highest impurity-based importances.
        feature_importance_scores_sort_descend_idx = np.argsort(
            feature_importance_scores)[::-1]
        hat_nonzero_indicator = np.zeros(m)
        hat_nonzero_indicator[
            feature_importance_scores_sort_descend_idx[:num_features_to_select]] = 1

        self.selection_indicator_ = hat_nonzero_indicator
        self.Fk_ = Fk

        return self

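# Illustrative sketch (not part of the original source): fitting
# DecisionTreeSelector on a classification minipatch. The synthetic labels,
# the minipatch sizes (n=60, m=12), and the helper name
# `_demo_decision_tree_selector` are assumptions chosen for illustration;
# only the constructor arguments and the fit(X, y, Fk) signature come from
# the class above.
def _demo_decision_tree_selector(seed=0):
    rng = np.random.default_rng(seed)
    # Full data: N=200 observations, M=30 features; labels depend on feature 5.
    X_full = rng.standard_normal((200, 30))
    y_full = (X_full[:, 5] > 0).astype(int)
    # One random minipatch: 60 rows and 12 columns of the full data.
    I_k = rng.choice(200, size=60, replace=False)
    F_k = rng.choice(30, size=12, replace=False)
    # Select the top 25% most important features within the minipatch,
    # i.e. int(12 * 0.25) = 3 features.
    selector = DecisionTreeSelector(mode='classifier', criterion='gini',
                                    num_features_to_select=0.25)
    selector.fit(X_full[np.ix_(I_k, F_k)], y_full[I_k], F_k)
    # Map the binary indicator back to column indices of X_full.
    return selector.Fk_[selector.selection_indicator_ == 1]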