## this code chunk fits the final (tuned) clustering pipeline on the full data
#### choose imputation methods ####
data_ls <- list (
"Mean-imputed" = rbind (data_mean_imputed$ train, data_mean_imputed$ test),
"RF-imputed" = rbind (data_rf_imputed$ train, data_rf_imputed$ test)
)
#### choose number of features ####
feature_modes <- list (
"Small" = 7 ,
"Medium" = 11 ,
"Big" = 19
)
#### choose dimension reduction methods ####
# raw data
identity_fun_ls <- list ("Raw" = function (x) x)
# pca
pca_fun_ls <- list ("PCA" = purrr:: partial (fit_pca, ndim = 4 ))
# tsne
tsne_perplexities <- c (30 , 100 )
tsne_fun_ls <- purrr:: map (
tsne_perplexities,
~ purrr:: partial (fit_tsne, dims = 2 , perplexity = .x)
) |>
setNames (sprintf ("tSNE (perplexity = %d)" , tsne_perplexities))
# putting it together
dr_fun_ls <- c (
identity_fun_ls,
pca_fun_ls,
tsne_fun_ls
)
#### choose clustering methods ####
# kmeans
kmeans_fun_ls <- list ("K-means" = purrr:: partial (fit_kmeans, ks = ks))
# spectral clustering
n_neighbors <- c (60 , 100 )
spectral_fun_ls <- purrr:: map (
n_neighbors,
~ purrr:: partial (
fit_spectral_clustering,
ks = ks,
affinity = "nearest_neighbors" ,
n_neighbors = .x
)
) |>
setNames (sprintf ("Spectral (n_neighbors = %s)" , n_neighbors))
# putting it together
clust_fun_ls <- c (
kmeans_fun_ls,
spectral_fun_ls
)
#### Fit Clustering Pipelines ####
pipe_tib <- tidyr:: expand_grid (
data = data_ls,
feature_mode = feature_modes,
dr_method = dr_fun_ls,
clust_method = clust_fun_ls
) |>
dplyr:: mutate (
impute_mode_name = names (data),
feature_mode_name = names (feature_mode),
dr_method_name = names (dr_method),
clust_method_name = names (clust_method),
name = stringr:: str_glue (
"{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
)
) |>
# remove some clustering pipelines to reduce computation burden
dplyr:: filter (
# remove all big feature set + dimension-reduction runs
! ((dr_method_name != "Raw" ) & (feature_mode_name == "Big" )),
# restrict to tuned models
clust_method_name == !! best_clust_method_name
)
pipe_ls <- split (pipe_tib, seq_len (nrow (pipe_tib))) |>
setNames (pipe_tib$ name)
fit_results_fname <- file.path (RESULTS_PATH, "clustering_fits_final.rds" )
consensus_clusters_results_path <- file.path (
RESULTS_PATH, "consensus_clusters_final.rds"
)
consensus_nbhd_results_path <- file.path (
RESULTS_PATH, "consensus_neighborhood_matrices_final.rds"
)
if (! file.exists (fit_results_fname) ||
! file.exists (consensus_clusters_results_path) ||
! file.exists (consensus_nbhd_results_path)) {
library (future)
plan (multisession, workers = NCORES)
# fit clustering pipelines (if not already cached)
clust_fit_ls <- furrr:: future_map (
pipe_ls,
function (pipe_df) {
g <- create_preprocessing_pipeline (
feature_mode = pipe_df$ feature_mode[[1 ]],
preprocess_fun = pipe_df$ dr_method[[1 ]]
)
clust_out <- pipe_df$ clust_method[[1 ]](
data = pipe_df$ data[[1 ]], preprocess_fun = g
)
return (clust_out)
},
.options = furrr:: furrr_options (
seed = TRUE ,
globals = list (
ks = best_k,
create_preprocessing_pipeline = create_preprocessing_pipeline,
get_abundance_data = get_abundance_data,
tsne_perplexities = tsne_perplexities,
n_neighbors = n_neighbors,
fit_kmeans = fit_kmeans,
fit_spectral_clustering = fit_spectral_clustering
)
)
)
# save fitted clustering pipelines
saveRDS (clust_fit_ls, file = fit_results_fname)
# estimate consensus clusters
clust_fit_ls <- purrr:: map (clust_fit_ls, ~ .x$ cluster_ids) |>
purrr:: list_flatten (name_spec = "{inner}: {outer}" )
nbhd_mat <- get_consensus_neighborhood_matrix (clust_fit_ls)
consensus_out <- fit_consensus_clusters (nbhd_mat, k = best_k)
saveRDS (consensus_out, file = consensus_clusters_results_path)
saveRDS (nbhd_mat, file = consensus_nbhd_results_path)
} else {
# read in results (if already cached)
clust_fit_ls <- readRDS (fit_results_fname)
consensus_out <- readRDS (consensus_clusters_results_path)
nbhd_mat <- readRDS (consensus_nbhd_results_path)
}