Show Code
# Load data
data_orig <- load_astro_data(data_dir = DATA_PATH)Let us begin by loading in the APOGEE DR17 data, which was originally downloaded from SDSS (or direct download link here). A detailed data dictionary can be found here.
# Load data
data_orig <- load_astro_data(data_dir = DATA_PATH)This data contains the chemical abundances from 19 different elements as well as metadata about the quality control and star properties. Here, we have also supplemented the data with the APOGEE DR17 value-added catalog of globular clusters (GCs) (Schiavon et al. 2024). GCs are sites of dense star formation, comprised of \(10^5 - 10^7\) stars that were roughly formed around the same time. GCs hence provide a convenient starting point for understanding and assessing the chemical origins of stars in the Milky Way. Nonetheless, this is only a starting point — we note that many stars in the APOGEE survey have been assigned to GCs based on proximity while there is also the broader open question of discovering deeper shared origins of possibly distant bodies.
We next perform some basic quality control filtering. Specifically, following previous literature (Pagnini et al. 2025), we subset the stars to those that meet the criteria below:
This quality control filtering resulted in a sample of 3,286 observations (or stars). We will investigate alternative, but equally-reasonable quality control filtering choices in a future section.
# QC filtering choices
SNR_THRESHOLD <- 70
MIN_TEFF_THRESHOLD <- 3500
MAX_TEFF_THRESHOLD <- 5500
LOGG_THRESHOLD <- 3.6
STARFLAG <- 0
VB_THRESHOLD <- 0.9
# Apply QC filter
data_qc_filtered <- apply_qc_filtering(
data_orig,
snr_threshold = SNR_THRESHOLD,
teff_thresholds = c(MIN_TEFF_THRESHOLD, MAX_TEFF_THRESHOLD),
logg_threshold = LOGG_THRESHOLD,
starflag = STARFLAG,
vb_threshold = VB_THRESHOLD
)Below, we provide a quick overview of the data after quality control filtering.
abundance_df <- data_qc_filtered$abundance
skimr::skim(abundance_df)| Name | abundance_df |
| Number of rows | 3286 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| numeric | 19 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| C_FE | 4 | 1.00 | -0.17 | 0.31 | -1.48 | -0.39 | -0.16 | 0.04 | 0.85 | ▁▂▇▆▁ |
| CI_FE | 50 | 0.98 | -0.07 | 0.35 | -1.44 | -0.29 | -0.07 | 0.14 | 0.97 | ▁▂▇▆▁ |
| N_FE | 3 | 1.00 | 0.61 | 0.47 | -0.55 | 0.27 | 0.67 | 0.94 | 1.96 | ▂▅▇▅▁ |
| O_FE | 18 | 0.99 | 0.28 | 0.20 | -0.57 | 0.18 | 0.30 | 0.40 | 0.86 | ▁▂▆▇▁ |
| NA_FE | 729 | 0.78 | 0.10 | 0.51 | -1.61 | -0.19 | 0.14 | 0.43 | 2.33 | ▁▅▇▁▁ |
| MG_FE | 2 | 1.00 | 0.22 | 0.17 | -0.50 | 0.15 | 0.25 | 0.33 | 0.81 | ▁▂▇▇▁ |
| AL_FE | 20 | 0.99 | 0.25 | 0.42 | -0.67 | -0.12 | 0.24 | 0.53 | 1.38 | ▃▆▇▃▂ |
| SI_FE | 2 | 1.00 | 0.26 | 0.09 | -0.30 | 0.21 | 0.26 | 0.31 | 0.58 | ▁▁▅▇▁ |
| S_FE | 106 | 0.97 | 0.40 | 0.23 | -0.74 | 0.30 | 0.41 | 0.53 | 1.12 | ▁▁▆▇▁ |
| K_FE | 189 | 0.94 | 0.24 | 0.28 | -1.02 | 0.14 | 0.27 | 0.38 | 1.31 | ▁▁▇▃▁ |
| CA_FE | 68 | 0.98 | 0.23 | 0.18 | -0.76 | 0.16 | 0.24 | 0.32 | 0.90 | ▁▁▇▇▁ |
| TI_FE | 302 | 0.91 | 0.03 | 0.20 | -0.75 | -0.08 | 0.03 | 0.14 | 0.96 | ▁▃▇▁▁ |
| TIII_FE | 208 | 0.94 | 0.19 | 0.25 | -0.78 | 0.08 | 0.22 | 0.33 | 1.02 | ▁▁▇▅▁ |
| V_FE | 467 | 0.86 | 0.09 | 0.41 | -1.32 | -0.15 | 0.10 | 0.34 | 1.62 | ▁▃▇▂▁ |
| CR_FE | 192 | 0.94 | 0.02 | 0.37 | -1.07 | -0.20 | 0.01 | 0.22 | 1.45 | ▁▆▇▂▁ |
| MN_FE | 508 | 0.85 | -0.28 | 0.21 | -1.01 | -0.41 | -0.27 | -0.17 | 0.81 | ▁▇▇▁▁ |
| FE_H | 2 | 1.00 | -1.44 | 0.39 | -2.42 | -1.72 | -1.50 | -1.16 | 0.33 | ▁▇▃▁▁ |
| CO_FE | 286 | 0.91 | -0.08 | 0.44 | -1.42 | -0.42 | -0.07 | 0.17 | 1.52 | ▁▅▇▂▁ |
| NI_FE | 60 | 0.98 | 0.00 | 0.11 | -0.70 | -0.06 | 0.00 | 0.06 | 0.62 | ▁▁▇▁▁ |
metadata_df <- data_qc_filtered$meta |>
dplyr::mutate(
dplyr::across(where(is.character), as.factor)
)
skimr::skim(metadata_df)| Name | metadata_df |
| Number of rows | 3286 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| X_H | 0 | 1.00 | FALSE | 3286 | (-0: 1, (-0: 1, (-0: 1, (-0: 1 |
| GC_NAME | 0 | 1.00 | FALSE | 57 | NGC: 1221, NGC: 233, NGC: 228, NGC: 171 |
| APSTAR_ID | 0 | 1.00 | FALSE | 3286 | apo: 1, apo: 1, apo: 1, apo: 1 |
| TELESCOPE | 0 | 1.00 | FALSE | 2 | lco: 2829, apo: 457 |
| TARGFLAGS | 146 | 0.96 | FALSE | 68 | APO: 1141, APO: 959, APO: 224, APO: 171 |
| APOGEE_ID | 0 | 1.00 | FALSE | 3286 | 2M0: 1, 2M0: 1, 2M0: 1, 2M0: 1 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| GLON | 0 | 1 | 231.41 | 130.85 | 0.00 | 59.20 | 308.90 | 309.17 | 357.63 | ▃▁▁▁▇ |
| GLAT | 0 | 1 | 4.10 | 28.34 | -89.47 | -11.25 | 14.84 | 15.09 | 79.90 | ▁▃▅▇▁ |
| LOCATION_ID | 0 | 1 | 5438.53 | 766.57 | 2011.00 | 5293.00 | 5298.00 | 5644.00 | 7254.00 | ▁▁▁▇▁ |
| STARFLAG | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| VHELIO_AVG | 0 | 1 | 104.38 | 158.94 | -257.40 | -23.62 | 110.43 | 231.11 | 501.77 | ▃▅▃▇▁ |
| TEFF_SPEC | 0 | 1 | 4571.09 | 303.00 | 3502.70 | 4382.40 | 4621.60 | 4783.35 | 5445.70 | ▁▃▇▇▁ |
| LOGG_SPEC | 0 | 1 | 1.56 | 0.56 | -0.19 | 1.16 | 1.59 | 1.98 | 3.20 | ▁▅▇▆▁ |
| VB_PROB | 0 | 1 | 1.00 | 0.00 | 0.90 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| SNREV | 0 | 1 | 209.99 | 143.20 | 70.03 | 119.26 | 162.09 | 249.67 | 1310.13 | ▇▁▁▁▁ |
data_qc_filtered$meta |>
dplyr::group_by(GC_NAME) |>
dplyr::summarise(
n = dplyr::n(),
.groups = "drop"
) |>
dplyr::arrange(dplyr::desc(n)) |>
vthemes::pretty_DT()Before proceeding any further, we randomly split the data into training and test using an 80-20% split. This led to a training set of 2,628 observations and a test set of 658 observations.
# Split data into training and test sets
TRAIN_PROP <- 0.8
train_idxs <- sample(
1:nrow(data_qc_filtered$abundance),
size = round(TRAIN_PROP * nrow(data_qc_filtered$abundance)),
replace = FALSE
)
train_data <- data_qc_filtered$abundance[train_idxs, ]
test_data <- data_qc_filtered$abundance[-train_idxs, ]
train_metadata <- data_qc_filtered$meta[train_idxs, ]
test_metadata <- data_qc_filtered$meta[-train_idxs, ]We next impute missing abundance values in the data and standardize each elemental abundance feature to have mean 0 and standard deviation 1. Given that there are various ways to perform missing data imputation, we implement two different imputation methods: (1) mean imputation and (2) random forest imputation using MissForest (Stekhoven and Bühlmann 2012).
# impute missing values and standardize data
data_fpath <- file.path(DATA_PATH, "astro_cleaned_data.RData")
if (!file.exists(data_fpath)) { # if not already cached
metadata <- list(
train = train_metadata,
test = test_metadata
)
data_mean_imputed <- clean_astro_data(
train_data = train_data,
test_data = test_data,
impute_mode = "mean"
)
data_rf_imputed <- clean_astro_data(
train_data = train_data,
test_data = test_data,
impute_mode = "rf"
)
save(
metadata, data_mean_imputed, data_rf_imputed,
file = file.path(DATA_PATH, "astro_cleaned_data.RData")
)
} else { # read from cache
load(data_fpath)
}