Finding Common Origins of Milky Way Stars
Validation: Stability across Alternative Data Preprocessing
To begin validating the consensus clusters obtained from K-means with \(k = 8\), we assess whether the resulting clusters would have remained stable had the same clustering pipeline been applied with alternative data preprocessing choices. Importantly, some data preprocessing choices (e.g., mean- versus RF-imputation, the set of chemical abundance features, and the dimension reduction method) have already been accounted for in the consensus clustering step discussed previously. However, due to the high computational burden, there are often more data preprocessing choices worth exploring than are feasible to include in the consensus step. For example, we have yet to investigate whether the resulting clusters would have been similar had alternative quality control (QC) filtering thresholds been used.
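As a brief reminder of what the consensus step does, the following sketch illustrates the general idea on placeholder inputs (a hypothetical list cluster_labels_list of label vectors, one per preprocessing pipeline); it is a simplified stand-in for, not the implementation of, the consensus helpers used later in this document.

# Minimal sketch of consensus clustering across several pipeline results.
# cluster_labels_list is a hypothetical list of integer label vectors, one per
# preprocessing pipeline, all defined over the same n stars.
n <- length(cluster_labels_list[[1]])
consensus_mat <- matrix(0, nrow = n, ncol = n)
for (labels in cluster_labels_list) {
  # add 1 for every pair of stars that this pipeline places in the same cluster
  consensus_mat <- consensus_mat + outer(labels, labels, FUN = "==")
}
consensus_mat <- consensus_mat / length(cluster_labels_list)

# cluster the consensus (co-clustering) matrix, here via hierarchical clustering
hc <- hclust(as.dist(1 - consensus_mat), method = "average")
consensus_clusters <- cutree(hc, k = 8)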
Below, we repeat the clustering pipeline on the training data, but with alternative quality control filtering thresholds. Specifically, we marginally sweep across the following parameters, varying one parameter at a time while holding the others at their default values:
- Signal-to-noise (S/N) ratio threshold = 30, 50, 70\(^*\), 90, 110, 130, 150
- Effective temperature \(T_{eff}\) range width = 500, 1000\(^*\), 1500, 2000
- Logarithm of the surface gravity \(\log g\) = 3.0, 3.3, 3.6\(^*\), 3.9, 4.2
- GC membership probability = 0.5, 0.7, 0.9\(^*\), 0.99
We denote the default QC thresholds (used previously in the main analysis) with an asterisk (\(^*\)).
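To make the sweep concrete, the sketch below shows how a one-parameter-at-a-time grid can be assembled for the S/N threshold while all other thresholds remain at their defaults; it is an illustration of the idea only (the full sweep over all four parameters appears in the code further below).

# Illustrative one-parameter-at-a-time ("marginal") sweep for the S/N threshold
defaults <- list(snr = 70, vb = 0.9, logg = 3.6, teff_width = 1000)
snr_candidates <- c(30, 50, 70, 90, 110, 130, 150)

sweep_settings <- lapply(snr_candidates, function(value) {
  settings <- defaults   # keep all thresholds at their default values ...
  settings$snr <- value  # ... and vary only the S/N threshold
  settings
})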
For each of these alternative choices, we compare the similarity between the original clusters and these new clusters using the adjusted Rand index (ARI). We report these results below. We find that the ARI at non-default filtering thresholds is both high and very similar to the ARI at the default filtering thresholds, indicating that the clusters are highly robust to these alternative data preprocessing choices.
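As a refresher on how the ARI behaves, the toy example below (using made-up label vectors, not our data) shows that the ARI equals 1 for identical partitions regardless of how the labels are permuted, and drops below 1 once the partitions disagree.

# Toy illustration of the adjusted Rand index (ARI); label vectors are made up
labels_a <- c(1, 1, 2, 2, 3, 3)  # "original" clusters
labels_b <- c(2, 2, 3, 3, 1, 1)  # same partition, permuted labels
labels_c <- c(1, 2, 1, 2, 3, 3)  # a different partition

mclust::adjustedRandIndex(labels_a, labels_b)  # 1: identical up to relabeling
mclust::adjustedRandIndex(labels_a, labels_c)  # < 1: partitions disagree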
Show Code to Evaluate Cluster Stability Across Alternative Data Preprocessing Pipelines
# Default QC thresholds
default_grid <- list(
  snr = 70,
  vb = 0.9,
  logg = 3.6,
  teff_width = 1000  # yields [3500, 5500]
)

# Parameter grids to sweep
vary_param_grid <- list(
  snr = c(30, 50, 70, 90, 110, 130, 150),
  vb = c(0.5, 0.7, 0.9, 0.99),
  logg = c(3.0, 3.3, 3.6, 3.9, 4.2),
  teff_width = c(500, 1000, 1500, 2000)
)

# Load data
data_mean_imputed_train <- dplyr::bind_cols(
  metadata$train, data_mean_imputed$train
)
data_rf_imputed_train <- dplyr::bind_cols(
  metadata$train, data_rf_imputed$train
)

# Get original cluster fit
orig_clusters <- readRDS(
  file.path(RESULTS_PATH, "consensus_clusters_train.rds")
)[[best_clust_method_name]]$cluster_ids

stability_results_fname <- file.path(
  RESULTS_PATH, "clustering_fits_stability.rds"
)
if (!file.exists(stability_results_fname)) {
  stability_fits_ls <- list()
  for (param_name in names(vary_param_grid)) {
    param_grid_ls <- default_grid
    param_grid_ls[[param_name]] <- vary_param_grid[[param_name]]
    param_grid <- expand.grid(param_grid_ls)
    stability_fits_ls[[param_name]] <- purrr::map(
      1:nrow(param_grid),
      function(i) {
        snr <- param_grid$snr[[i]]
        vb <- param_grid$vb[[i]]
        logg <- param_grid$logg[[i]]
        teff_width <- param_grid$teff_width[[i]]

        # get new QC-filtered data
        data_mean_imputed_qc <- apply_qc_filtering(
          data_mean_imputed_train,
          snr_threshold = snr,
          teff_thresholds = c(4500 - teff_width, 4500 + teff_width),
          logg_threshold = logg,
          starflag = 0,
          vb_threshold = vb
        )
        data_rf_imputed_qc <- apply_qc_filtering(
          data_rf_imputed_train,
          snr_threshold = snr,
          teff_thresholds = c(4500 - teff_width, 4500 + teff_width),
          logg_threshold = logg,
          starflag = 0,
          vb_threshold = vb
        )

        # get sample ids for new QC-filtered data
        keep_sample_idxs <- tibble::tibble(
          APOGEE_ID = data_mean_imputed_qc$meta$APOGEE_ID
        ) |>
          dplyr::left_join(
            dplyr::bind_cols(.id = 1:nrow(metadata$train), metadata$train),
            by = "APOGEE_ID"
          ) |>
          dplyr::pull(.id)

        # fit clustering pipeline on new QC-filtered data
        cluster_ids <- fit_best_pipeline(
          data_ls = list(
            "Mean-imputed" = data_mean_imputed_qc$abundance,
            "RF-imputed" = data_rf_imputed_qc$abundance
          ),
          k = best_k
        )

        # evaluate stability between original clusters and new clusters
        stability <- mclust::adjustedRandIndex(
          cluster_ids, orig_clusters[keep_sample_idxs]
        )
        return(list(
          sample_ids = keep_sample_idxs,
          cluster_ids = cluster_ids,
          stability = stability
        ))
      }
    ) |>
      setNames(vary_param_grid[[param_name]])
  }
  saveRDS(stability_fits_ls, stability_results_fname)
} else {
  stability_fits_ls <- readRDS(stability_results_fname)
}
stability_results_ls <- purrr::imap(
  stability_fits_ls,
  function(x, key) {
    purrr::map(x, ~ data.frame(stability = .x$stability)) |>
      dplyr::bind_rows(.id = key) |>
      dplyr::mutate(
        {{key}} := as.numeric(.data[[key]])
      )
  }
)
plt_ls <- purrr::imap(
  stability_results_ls,
  function(stability_df, key) {
    param_title <- dplyr::case_when(
      key == "snr" ~ "SNR Threshold",
      key == "vb" ~ "VB Threshold",
      key == "logg" ~ "Log(g) Threshold",
      key == "teff_width" ~ "Teff Width"
    )
    stability_df |>
      ggplot2::ggplot() +
      ggplot2::aes(x = !!rlang::sym(key), y = stability) +
      ggplot2::geom_line(linewidth = 1.2, color = "steelblue") +
      ggplot2::geom_point(size = 2.5, color = "steelblue") +
      ggplot2::ylim(c(0, 1)) +
      ggplot2::labs(
        x = param_title,
        y = "ARI Stability"
      ) +
      ggplot2::theme_minimal(base_size = 14) +
      ggplot2::theme(
        panel.grid.major = ggplot2::element_line(color = "grey80"),
        panel.grid.minor = ggplot2::element_blank(),
        axis.title.y.left = ggplot2::element_text(color = "steelblue")
      )
  }
)
plt <- patchwork::wrap_plots(plt_ls, nrow = 1) +
  patchwork::plot_layout(axes = "collect")
subchunkify(
  plt, fig_height = 4, fig_width = 10,
  caption = "'Stability of cluster labels across different choices of data preprocessing pipelines.'"
)
Validation: Cluster Generalizability
We next assess the overall generalizability of our estimated clusters using the held-out test set and the idea of cluster predictability. Specifically, we (i) apply the same consensus K-means clustering pipeline (with \(k = 8\)) to the training and test data separately, (ii) fit a random forest (RF) on the training covariate data to predict the cluster labels of the training set, (iii) use this trained RF to predict cluster labels for the test set, and (iv) evaluate the concordance (via ARI) between the test cluster labels and the RF-predicted cluster labels on the test set. Note that this procedure relies on some preprocessed version of the covariate (i.e., abundance) data, so we report the generalizability under various data preprocessing pipelines below. In general, the choice of data preprocessing does not substantially impact the overall cluster generalizability, which is consistently between 0.39 and 0.43 ARI.
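The sketch below illustrates steps (ii)-(iv) using the generic randomForest package and placeholder objects (X_train, X_test, clusters_train, clusters_test); the cached code further below relies on the project's cluster_generalizability() helper rather than this exact snippet.

# Sketch of prediction-based cluster generalizability (placeholder objects)
rf_fit <- randomForest::randomForest(
  x = X_train,                   # preprocessed training abundances (placeholder)
  y = as.factor(clusters_train)  # consensus cluster labels on the training set (placeholder)
)
clusters_pred <- predict(rf_fit, newdata = X_test)  # predicted labels on the test set

# overall generalizability: agreement between RF-predicted and estimated test clusters
mclust::adjustedRandIndex(clusters_pred, clusters_test)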
Though this ARI is not particularly high, there are select clusters that are far more generalizable than others. This can be seen by examining the confusion tables below (under the Confusion Tables
tab). For example, the cluster labeled “Predicted 8” (i.e., cluster 8 from the training data) is highly generalizable and maps directly to cluster 7 in the test data. On the other hand, the cluster labeled “Predicted 3” (i.e., cluster 3 from the training data) is often confused with test clusters 2, 3, and 4.
To more rigorously quantify this cluster-specific notion of generalizability, we define a local generalizability metric, computed as the precision (i.e., TP/(TP + FP)) of each cluster. (Note: since the cluster labels are arbitrary, we align the RF-predicted clusters and the estimated test clusters based on the pairing with the largest overlap and report the precision given that mapping.) These local generalizability scores clearly indicate that clusters 1, 2, 6, and 8 from the training dataset are highly predictable and generalizable, while other clusters, such as cluster 3, are less so.
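A sketch of this per-cluster precision computation is given below, again with placeholder label vectors; here the alignment simply matches each predicted cluster to the test cluster it overlaps most, which may differ in detail from the matching routine used to produce the reported scores.

# Sketch of local generalizability (per-cluster precision); labels are placeholders
confusion <- table(Predicted = clusters_pred, Test = clusters_test)

# align each predicted cluster with the test cluster it overlaps most
best_match <- apply(confusion, 1, which.max)

# precision = TP / (TP + FP) for each predicted cluster under this alignment
local_generalizability <- mapply(
  function(row, col) confusion[row, col] / sum(confusion[row, ]),
  row = seq_len(nrow(confusion)),
  col = best_match
)
names(local_generalizability) <- rownames(confusion)
local_generalizability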
Show Code to Evaluate Cluster Generalizability Using Test Data
## this code chunk evaluates cluster generalizability using test data
#### choose imputation methods ####
test_data_ls <- list(
  "Mean-imputed" = data_mean_imputed$test,
  "RF-imputed" = data_rf_imputed$test
)
train_data_ls <- list(
  "Mean-imputed" = data_mean_imputed$train,
  "RF-imputed" = data_rf_imputed$train
)
#### choose number of features ####
feature_modes <- list(
  "Small" = 7,
  "Medium" = 11,
  "Big" = 19
)

#### choose ks ####
ks <- c(2, 8, 9)
#### choose dimension reduction methods ####
# raw data
<- list("Raw" = function(x) x)
identity_fun_ls
# pca
<- list("PCA" = purrr::partial(fit_pca, ndim = 4))
pca_fun_ls
# tsne
<- c(30, 100)
tsne_perplexities <- purrr::map(
tsne_fun_ls
tsne_perplexities,~ purrr::partial(fit_tsne, dims = 2, perplexity = .x)
|>
) setNames(sprintf("tSNE (perplexity = %d)", tsne_perplexities))
# putting it together
<- c(
dr_fun_ls
identity_fun_ls,
pca_fun_ls,
tsne_fun_ls
)
#### choose clustering methods ####
# kmeans
<- list("K-means" = purrr::partial(fit_kmeans, ks = ks))
kmeans_fun_ls
# spectral clustering
<- c(60, 100)
n_neighbors <- purrr::map(
spectral_fun_ls
n_neighbors,~ purrr::partial(
fit_spectral_clustering, ks = ks,
affinity = "nearest_neighbors",
n_neighbors = .x
)|>
) setNames(sprintf("Spectral (n_neighbors = %s)", n_neighbors))
# putting it together
<- c(
clust_fun_ls
kmeans_fun_ls,
spectral_fun_ls
)
#### Fit Clustering Pipelines ####
pipe_tib <- tidyr::expand_grid(
  train_data = train_data_ls,
  test_data = test_data_ls,
  feature_mode = feature_modes,
  dr_method = dr_fun_ls,
  clust_method = clust_fun_ls
) |>
  dplyr::mutate(
    impute_mode_name = names(train_data),
    feature_mode_name = names(feature_mode),
    dr_method_name = names(dr_method),
    clust_method_name = names(clust_method),
    name = stringr::str_glue(
      "{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
    )
  ) |>
  # remove some clustering pipelines to reduce computation burden
  dplyr::filter(
    # remove all big feature set + dimension-reduction runs
    !((dr_method_name != "Raw") & (feature_mode_name == "Big"))
  )
pipe_ls <- split(pipe_tib, seq_len(nrow(pipe_tib))) |>
  setNames(pipe_tib$name)

fit_results_fname <- file.path(RESULTS_PATH, "clustering_fits_test.rds")
if (!file.exists(fit_results_fname)) {
  library(future)
  plan(multisession, workers = NCORES)

  # fit clustering pipelines (if not already cached)
  clust_fit_ls <- furrr::future_map(
    pipe_ls,
    function(pipe_df) {
      g <- create_preprocessing_pipeline(
        feature_mode = pipe_df$feature_mode[[1]],
        preprocess_fun = pipe_df$dr_method[[1]]
      )
      clust_out <- pipe_df$clust_method[[1]](
        data = pipe_df$test_data[[1]], preprocess_fun = g
      )
      return(clust_out)
    },
    .options = furrr::furrr_options(
      seed = TRUE,
      globals = list(
        ks = ks,
        create_preprocessing_pipeline = create_preprocessing_pipeline,
        get_abundance_data = get_abundance_data,
        tsne_perplexities = tsne_perplexities,
        n_neighbors = n_neighbors,
        fit_kmeans = fit_kmeans,
        fit_spectral_clustering = fit_spectral_clustering
      )
    )
  )
  # save fitted clustering pipelines
  saveRDS(clust_fit_ls, file = fit_results_fname)
  # evaluate generalizability of test clusters
  clust_fit_ls <- clust_fit_ls |>
    purrr::map(~ .x$cluster_ids) |>
    purrr::list_flatten(name_spec = "{inner}: {outer}")
  clust_fit_df <- tibble::tibble(
    name = names(clust_fit_ls),
    cluster_ids = clust_fit_ls
  ) |>
    annotate_clustering_results() |>
    dplyr::group_by(k, clust_method_name) |>
    dplyr::summarise(
      cluster_list = list(cluster_ids),
      .groups = "drop"
    )

  train_consensus_out_ls <- readRDS(
    file.path(RESULTS_PATH, "consensus_clusters_train.rds")
  )
  test_nbhd_mat_ls <- list()
  test_consensus_out_ls <- list()
  gen_errs <- list()
  for (i in 1:length(best_clust_methods)) {
    clust_method_name <- best_clust_methods[[i]]$clust_method_name
    k <- best_clust_methods[[i]]$k
    key <- names(best_clust_methods)[i]

    keep_clust_ls <- clust_fit_df |>
      dplyr::filter(
        k == !!k,
        clust_method_name == !!clust_method_name
      ) |>
      dplyr::pull(cluster_list)
    keep_clust_ls <- keep_clust_ls[[1]]

    # compute neighborhood matrix
    test_nbhd_mat_ls[[key]] <- get_consensus_neighborhood_matrix(keep_clust_ls)
    # aggregate stable clusters using consensus clustering
    test_nbhd_mat <- test_nbhd_mat_ls[[key]]
    test_consensus_out_ls[[key]] <- fit_consensus_clusters(test_nbhd_mat, k = k)

    # assess generalizability
    data_pipe_tib <- pipe_tib |>
      dplyr::distinct(impute_mode_name, feature_mode_name, .keep_all = TRUE) |>
      dplyr::mutate(
        name = sprintf("%s + %s", impute_mode_name, feature_mode_name)
      )
    data_pipe_ls <- split(data_pipe_tib, seq_len(nrow(data_pipe_tib))) |>
      setNames(data_pipe_tib$name)
    gen_errs[[key]] <- purrr::map(
      data_pipe_ls,
      function(data_pipe_df) {
        g <- create_preprocessing_pipeline(
          feature_mode = data_pipe_df$feature_mode[[1]],
          preprocess_fun = identity_fun_ls[[1]]
        )
        X_train <- g(data_pipe_df$train_data[[1]])
        X_test <- g(data_pipe_df$test_data[[1]])
        cluster_generalizability(
          X_train = X_train,
          cluster_train = train_consensus_out_ls[[key]]$cluster_ids,
          X_test = X_test,
          cluster_test = test_consensus_out_ls[[key]]$cluster_ids
        )
      }
    )
  }
  saveRDS(
    test_consensus_out_ls,
    file.path(RESULTS_PATH, "consensus_clusters_test.rds")
  )
  saveRDS(
    test_nbhd_mat_ls,
    file.path(RESULTS_PATH, "consensus_neighborhood_matrices_test.rds")
  )
  saveRDS(
    gen_errs,
    file.path(RESULTS_PATH, "generalizability_errors_test.rds")
  )
} else {
  # read in results (if already cached)
  gen_errs <- readRDS(
    file.path(RESULTS_PATH, "generalizability_errors_test.rds")
  )
}
Overall Generalizability
Overall generalizability, computed as the ARI between the estimated test cluster labels and the predicted cluster labels.
Local Generalizability
Local generalizability, defined as the precision for each cluster (after appropriately aligning the cluster labels).