Finding Common Origins of Milky Way Stars

Author

Andersen Chang, Tiffany M. Tang, Tarek M. Zikry, Genevera I. Allen

Published

June 4, 2025

Fit Tuned Clustering Pipeline(s) on Full Data

We lastly re-fit the consensus K-means (\(k = 8\)) clustering pipeline on the full training and test data together to obtain the final clusters.

Show Code to Fit Final (tuned) Clustering Pipeline on Full Data

## this code chunk fits the final (tuned) clustering pipeline on the full data

#### choose imputation methods ####
data_ls <- list(
  "Mean-imputed" = rbind(data_mean_imputed$train, data_mean_imputed$test),
  "RF-imputed" = rbind(data_rf_imputed$train, data_rf_imputed$test)
)

#### choose number of features ####
feature_modes <- list(
  "Small" = 7,
  "Medium" = 11,
  "Big" = 19
)

#### choose dimension reduction methods ####
# raw data
identity_fun_ls <- list("Raw" = function(x) x)

# pca
pca_fun_ls <- list("PCA" = purrr::partial(fit_pca, ndim = 4))

# tsne
tsne_perplexities <- c(30, 100)
tsne_fun_ls <- purrr::map(
  tsne_perplexities,
  ~ purrr::partial(fit_tsne, dims = 2, perplexity = .x)
) |> 
  setNames(sprintf("tSNE (perplexity = %d)", tsne_perplexities))

# putting it together
dr_fun_ls <- c(
  identity_fun_ls,
  pca_fun_ls,
  tsne_fun_ls
)

#### choose clustering methods ####
# kmeans
kmeans_fun_ls <- list("K-means" = purrr::partial(fit_kmeans, ks = ks))

# spectral clustering
n_neighbors <- c(60, 100)
spectral_fun_ls <- purrr::map(
  n_neighbors,
  ~ purrr::partial(
    fit_spectral_clustering, 
    ks = ks,
    affinity = "nearest_neighbors",
    n_neighbors = .x
  )
) |> 
  setNames(sprintf("Spectral (n_neighbors = %s)", n_neighbors))

# putting it together
clust_fun_ls <- c(
  kmeans_fun_ls,
  spectral_fun_ls
)

#### Fit Clustering Pipelines ####
pipe_tib <- tidyr::expand_grid(
  data = data_ls,
  feature_mode = feature_modes,
  dr_method = dr_fun_ls,
  clust_method = clust_fun_ls
) |> 
  dplyr::mutate(
    impute_mode_name = names(data),
    feature_mode_name = names(feature_mode),
    dr_method_name = names(dr_method),
    clust_method_name = names(clust_method),
    name = stringr::str_glue(
      "{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
    )
  ) |> 
  # remove some clustering pipelines to reduce computation burden
  dplyr::filter(
    # remove all big feature set + dimension-reduction runs
    !((dr_method_name != "Raw") & (feature_mode_name == "Big")),
    # restrict to tuned models
    clust_method_name == !!best_clust_method_name
  )
pipe_ls <- split(pipe_tib, seq_len(nrow(pipe_tib))) |> 
  setNames(pipe_tib$name)

fit_results_fname <- file.path(RESULTS_PATH, "clustering_fits_final.rds")
consensus_clusters_results_path <- file.path(
  RESULTS_PATH, "consensus_clusters_final.rds"
)
consensus_nbhd_results_path <- file.path(
  RESULTS_PATH, "consensus_neighborhood_matrices_final.rds"
)
if (!file.exists(fit_results_fname) ||
    !file.exists(consensus_clusters_results_path) ||
    !file.exists(consensus_nbhd_results_path)) {
  library(future)
  plan(multisession, workers = NCORES)
  
  # fit clustering pipelines (if not already cached)
  clust_fit_ls <- furrr::future_map(
    pipe_ls,
    function(pipe_df) {
      g <- create_preprocessing_pipeline(
        feature_mode = pipe_df$feature_mode[[1]],
        preprocess_fun = pipe_df$dr_method[[1]]
      )
      clust_out <- pipe_df$clust_method[[1]](
        data = pipe_df$data[[1]], preprocess_fun = g
      )
      return(clust_out)
    },
    .options = furrr::furrr_options(
      seed = TRUE, 
      globals = list(
        ks = best_k,
        create_preprocessing_pipeline = create_preprocessing_pipeline,
        get_abundance_data = get_abundance_data,
        tsne_perplexities = tsne_perplexities,
        n_neighbors = n_neighbors,
        fit_kmeans = fit_kmeans,
        fit_spectral_clustering = fit_spectral_clustering
      )
    )
  )
  # save fitted clustering pipelines
  saveRDS(clust_fit_ls, file = fit_results_fname)
  
  # estimate consensus clusters
  clust_fit_ls <- purrr::map(clust_fit_ls, ~ .x$cluster_ids) |> 
    purrr::list_flatten(name_spec = "{inner}: {outer}")
  nbhd_mat <- get_consensus_neighborhood_matrix(clust_fit_ls)
  consensus_out <- fit_consensus_clusters(nbhd_mat, k = best_k)
  saveRDS(consensus_out, file = consensus_clusters_results_path)
  saveRDS(nbhd_mat, file = consensus_nbhd_results_path)
} else {
  # read in results (if already cached)
  clust_fit_ls <- readRDS(fit_results_fname)
  consensus_out <- readRDS(consensus_clusters_results_path)
  nbhd_mat <- readRDS(consensus_nbhd_results_path)
}

Interpreting the Final Clusters

Below, we show:

Consensus neighborhood heatmap, showing the proportion of times each pair of stars were clustered together across the different choices of data preprocessing pipelines
GC composition for each estimated cluster, illuminating several GCs that are fully contained within a single cluster, e.g., NGC6121 (dark green bar) in cluster 1 and NGC0104 (orange bar) in cluster 2
Local stability values for each star, providing a more granular measure of how consistently each star is clustered together across the different data preprocessing pipelines
Distribution of abundances per estimated cluster, revealing distinctive chemical properties of each cluster (e.g., cluster 2 is iron-rich)

For additional discussion of these clustering results, we refer interested readers to the main text.

k = 8: K-means

Consensus Neighborhood Heatmap

GCs per Cluster

GC composition for each estimated consensus cluster.

On Galactic Coordinates

Galactic coordinates of stars, colored by its local cluster stability.

On Dimension Reduction

Dimension reduction visualizations of stars, colored by its local cluster stability.

Abundance of features per estimated consensus cluster.