Source code for mplearn.feature_selection.base_selector._base_selector

import numbers

import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from ...common import BaseLearner


class ThresholdedOLS(BaseLearner):
    """Feature selection with the thresholded OLS selector.

    This class is designed to be used as a base feature selector on the
    minipatches with the `mplearn.feature_selection.AdaSTAMPS` class.

    Parameters
    ----------
    num_features_to_select : int or float, default=None
        The number of features to select from the m features in a minipatch.

        - If `None`, the Bonferroni procedure described in [1] is used to
          automatically decide the number of features to select on the
          minipatch.
        - If a positive integer, it is the absolute number of features to
          select on a minipatch.
        - If a float in the interval (0.0, 1.0], it is the fraction of the
          m features in a minipatch to select.

    screening_thresh : float, default=None
        Ignored if the minipatch has more observations n than features m.
        For a high-dimensional minipatch (n < m), `screening_thresh` must be
        a float in the interval (0.0, 1.0); an efficient screening rule from
        [1] is first applied to reduce the number of features in the
        minipatch to `round(screening_thresh * n)`.

    Attributes
    ----------
    selection_indicator_ : ndarray of shape (m,) or (`round(screening_thresh * n)`,)
        A binary selection indicator for the features in the minipatch
        (1 for selected features and 0 for unselected features). For a
        low-dimensional minipatch (n > m), the shape is (m,). Otherwise,
        the shape is (`round(screening_thresh * n)`,).

    Fk_ : ndarray of shape (m,) or (`round(screening_thresh * n)`,)
        The integer indices of the features in `selection_indicator_`.
        Note that these indices correspond to these features' column indices
        in the full data X_full (N observations and M features).

    References
    ----------
    .. [1] Giurcanu, M. "Thresholding least-squares inference in
       high-dimensional regression models." Electron. J. Statist.
       10 (2) 2124-2156, 2016.
    """

    def __init__(self, *, num_features_to_select=None, screening_thresh=None):
        self.num_features_to_select = num_features_to_select
        self.screening_thresh = screening_thresh

    def fit(self, X, y, Fk):
        """Fit the thresholded OLS base selector to a minipatch.

        Parameters
        ----------
        X : ndarray of shape (n, m)
            The data matrix corresponding to the minipatch
            (n observations and m features).

        y : ndarray of shape (n,)
            The target values corresponding to the minipatch.

        Fk : ndarray of shape (m,)
            The integer indices of the features in the minipatch. Note that
            these indices correspond to these features' column indices in
            the full data X_full. For example, `X = X_full[:, Fk]`.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        n, m = X.shape

        # If the minipatch has more features than observations, apply the
        # screening rule to keep only (screening_thresh * n) features.
        if n < m:
            if self.screening_thresh is None:
                raise ValueError(
                    "n < m; please specify screening_thresh as a float "
                    "in (0.0, 1.0).")

            # Compute the componentwise least-squares estimate.
            y_n_m = np.tile(y, (m, 1)).transpose()
            beta_tilde = np.sum(y_n_m * X, axis=0) / np.sum(X ** 2, axis=0)
            beta_tilde_n_m = np.tile(beta_tilde, (n, 1))
            sigma_tilde_squared = (1. / (n * (n - 1))) * np.sum(
                np.square(y_n_m - (X * beta_tilde_n_m)), axis=0)
            gamma_tilde = np.absolute(beta_tilde) / np.sqrt(sigma_tilde_squared)

            # Keep the q features with the largest componentwise statistics.
            sorted_gamma_tilde_descend = np.sort(gamma_tilde)[::-1]
            q = int(self.screening_thresh * n)
            gamma_tilde_thres = sorted_gamma_tilde_descend[q]
            Fk_to_keep_mask = gamma_tilde > gamma_tilde_thres
            X = X[:, Fk_to_keep_mask]
            n, m = X.shape
            Fk = Fk[Fk_to_keep_mask]

        lm = LinearRegression(fit_intercept=False).fit(X, y)
        beta_hat_ols = lm.coef_
        beta_hat_ols_thresholded = beta_hat_ols.copy()

        if self.num_features_to_select is None:
            # Bonferroni-style thresholding of the OLS coefficients.
            t_alpha_quantile = 1. / (2 * np.log(n))
            t_scale = stats.t(df=(n - m)).ppf(1 - (t_alpha_quantile / m))
            error_var_hat = (1. / (n - m)) * (
                np.linalg.norm(y - np.matmul(X, beta_hat_ols)) ** 2)
            try:
                Omega_inv = np.linalg.inv((1. / n) * np.matmul(X.transpose(), X))
            except np.linalg.LinAlgError:
                Omega_inv = np.linalg.pinv((1. / n) * np.matmul(X.transpose(), X))
            sigma_bar_jj = ((1. / np.sqrt(n)) * np.sqrt(error_var_hat)) * np.sqrt(
                np.absolute(np.diagonal(Omega_inv)))
            beta_hat_ols_thresholded[
                np.absolute(beta_hat_ols) <= (sigma_bar_jj * t_scale)] = 0
        else:
            error_msg = (
                "num_features_to_select must be either None, a positive "
                "integer representing the absolute number of features to "
                "select on a minipatch, or a float in (0.0, 1.0] representing "
                "the fraction of the m features in a minipatch to select."
            )
            if self.num_features_to_select < 0:
                raise ValueError(error_msg)
            elif isinstance(self.num_features_to_select, numbers.Integral):
                num_features_to_select = self.num_features_to_select
            elif self.num_features_to_select > 1.0:
                raise ValueError(error_msg)
            else:
                num_features_to_select = int(m * self.num_features_to_select)

            # Keep the num_features_to_select largest OLS coefficients in
            # absolute value and threshold the rest to zero.
            sorted_beta_hat_ols = np.sort(np.absolute(beta_hat_ols))[::-1]
            beta_hat_ols_thresholded[
                np.absolute(beta_hat_ols)
                <= sorted_beta_hat_ols[num_features_to_select]] = 0

        self.selection_indicator_ = np.absolute(np.sign(beta_hat_ols_thresholded))
        self.Fk_ = Fk

        return self

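# Illustrative sketch (not part of the original source): how ThresholdedOLS is
# meant to be fit on a single minipatch. The synthetic data, the minipatch
# sizes (n=50, m=10), and the helper name `_demo_thresholded_ols` are
# assumptions chosen for illustration; only the fit(X, y, Fk) signature and
# the fitted attributes come from the class above.
def _demo_thresholded_ols(seed=0):
    rng = np.random.default_rng(seed)
    # Full data: N=200 observations, M=40 features, 3 truly relevant features.
    X_full = rng.standard_normal((200, 40))
    beta = np.zeros(40)
    beta[[3, 7, 21]] = [2.0, -1.5, 1.0]
    y_full = X_full @ beta + 0.1 * rng.standard_normal(200)
    # One random minipatch: 50 rows and 10 columns of the full data.
    I_k = rng.choice(200, size=50, replace=False)
    F_k = rng.choice(40, size=10, replace=False)
    # With num_features_to_select=None the Bonferroni-style threshold is used;
    # n > m here, so the screening rule is skipped.
    selector = ThresholdedOLS().fit(X_full[np.ix_(I_k, F_k)], y_full[I_k], F_k)
    # Map the binary indicator back to column indices of X_full.
    return selector.Fk_[selector.selection_indicator_ == 1]
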
class DecisionTreeSelector(BaseLearner):
    """Feature selection with the decision tree selector.

    This class is designed to be used as a base feature selector on the
    minipatches with the `mplearn.feature_selection.AdaSTAMPS` class. It is
    a wrapper around the DecisionTreeClassifier and the DecisionTreeRegressor
    from the sklearn package.

    Parameters
    ----------
    mode : {'classifier', 'regressor'}, default='classifier'
        Controls the type of decision tree model to use.

    max_depth : int, default=5
        The maximum depth of the tree. If `None`, nodes are expanded until
        all leaves are pure or until all leaves contain fewer than
        min_samples_split samples.

    criterion : str, default='gini'
        The criterion used to measure the quality of a split. If
        `mode='classifier'`, this must be one of {'gini', 'entropy'}.
        If `mode='regressor'`, this must be one of {'squared_error',
        'friedman_mse', 'absolute_error', 'poisson'}.

    num_features_to_select : int or float, default=0.1
        The number of features to select from the m features in a minipatch.

        - If a positive integer, it is the absolute number of features to
          select on a minipatch.
        - If a float in the interval (0.0, 1.0], it is the fraction of the
          m features in a minipatch to select.

    random_state : int, default=0
        Controls the randomness of the decision tree model.

    Attributes
    ----------
    selection_indicator_ : ndarray of shape (m,)
        A binary selection indicator for the features in the minipatch
        (1 for selected features and 0 for unselected features).

    Fk_ : ndarray of shape (m,)
        The integer indices of the features in `selection_indicator_`.
        Note that these indices correspond to these features' column indices
        in the full data X_full (N observations and M features).
    """

    def __init__(self, *, mode='classifier', max_depth=5, criterion='gini',
                 num_features_to_select=0.1, random_state=0):
        self.mode = mode
        self.max_depth = max_depth
        self.criterion = criterion
        self.num_features_to_select = num_features_to_select
        self.random_state = random_state

    def fit(self, X, y, Fk):
        """Fit the decision tree base selector to a minipatch.

        Parameters
        ----------
        X : ndarray of shape (n, m)
            The data matrix corresponding to the minipatch
            (n observations and m features).

        y : ndarray of shape (n,)
            The target values corresponding to the minipatch.

        Fk : ndarray of shape (m,)
            The integer indices of the features in the minipatch. Note that
            these indices correspond to these features' column indices in
            the full data X_full. For example, `X = X_full[:, Fk]`.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        n, m = X.shape

        if self.mode == 'classifier':
            if self.criterion not in ['gini', 'entropy']:
                raise ValueError(
                    "criterion must be one of {'gini', 'entropy'} for "
                    "classification.")
            estimator = DecisionTreeClassifier(
                criterion=self.criterion, max_depth=self.max_depth,
                random_state=self.random_state).fit(X, y)
        elif self.mode == 'regressor':
            if self.criterion not in ['squared_error', 'friedman_mse',
                                      'absolute_error', 'poisson']:
                raise ValueError(
                    "criterion must be one of {'squared_error', "
                    "'friedman_mse', 'absolute_error', 'poisson'} for "
                    "regression.")
            estimator = DecisionTreeRegressor(
                criterion=self.criterion, max_depth=self.max_depth,
                random_state=self.random_state).fit(X, y)
        else:
            raise ValueError("mode must be either 'classifier' or 'regressor'.")

        feature_importance_scores = estimator.feature_importances_

        error_msg = (
            "num_features_to_select must be either a positive integer "
            "representing the absolute number of features to select on a "
            "minipatch, or a float in (0.0, 1.0] representing the fraction "
            "of the m features in a minipatch to select."
        )
        if self.num_features_to_select < 0:
            raise ValueError(error_msg)
        elif isinstance(self.num_features_to_select, numbers.Integral):
            num_features_to_select = self.num_features_to_select
        elif self.num_features_to_select > 1.0:
            raise ValueError(error_msg)
        else:
            num_features_to_select = int(m * self.num_features_to_select)

        # Select the features with the highest impurity-based importances.
        feature_importance_scores_sort_descend_idx = np.argsort(
            feature_importance_scores)[::-1]
        hat_nonzero_indicator = np.zeros(m)
        hat_nonzero_indicator[
            feature_importance_scores_sort_descend_idx[:num_features_to_select]] = 1

        self.selection_indicator_ = hat_nonzero_indicator
        self.Fk_ = Fk

        return self

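# Illustrative sketch (not part of the original source): fitting
# DecisionTreeSelector on a classification minipatch. The synthetic labels,
# the minipatch sizes (n=60, m=12), and the helper name
# `_demo_decision_tree_selector` are assumptions chosen for illustration;
# only the constructor arguments and the fit(X, y, Fk) signature come from
# the class above.
def _demo_decision_tree_selector(seed=0):
    rng = np.random.default_rng(seed)
    # Full data: N=200 observations, M=30 features; labels depend on feature 5.
    X_full = rng.standard_normal((200, 30))
    y_full = (X_full[:, 5] > 0).astype(int)
    # One random minipatch: 60 rows and 12 columns of the full data.
    I_k = rng.choice(200, size=60, replace=False)
    F_k = rng.choice(30, size=12, replace=False)
    # Select the top 25% most important features within the minipatch,
    # i.e. int(12 * 0.25) = 3 features.
    selector = DecisionTreeSelector(mode='classifier', criterion='gini',
                                    num_features_to_select=0.25)
    selector.fit(X_full[np.ix_(I_k, F_k)], y_full[I_k], F_k)
    # Map the binary indicator back to column indices of X_full.
    return selector.Fk_[selector.selection_indicator_ == 1]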