# Build (or reload) the modelling data: the stratified train/test split and the
# cross-validation folds derived from the raw "data/h243.dta" file.
#
# If the cache file exists, only the four saved objects (fyc_split, fyc_train,
# fyc_test, cv_folds) are restored; fyc_raw, vars_select, fyc and
# recode_meps_missing are created only on the rebuild path. Delete the cache
# file to force a rebuild after changing any preprocessing step below.
cache_path <- "data/fyc_split.RData"

if (file.exists(cache_path)) {
  load(cache_path)
} else {
  # Replace negative values of `x` with NA; non-negative values pass through.
  # ifelse() drops attributes, so the result is a plain vector.
  recode_meps_missing <- function(x) {
    ifelse(x < 0, NA, x)
  }

  fyc_raw <- read_dta("data/h243.dta")

  # Columns carried forward from the raw file.
  vars_select <- c(
    "DUPERSID", "PERWT22F", "VARSTR", "VARPSU",
    "AGE22X", "SEX", "RACETHX", "EDUCYR", "REGION22",
    "FAMINC22", "POVCAT22",
    "INSCOV22",
    "RTHLTH53", "MNHLTH53",
    "DIABDX_M18", "HIBPDX", "ASTHDX", "ARTHDX", "CANCERDX", "STRKDX",
    "DLAYCA42", "AFRDCA42",
    "OBTOTV22", "ERTOT22", "IPDIS22", "HHTOTD22",
    "TOTEXP22", "TOTSLF22", "TOTMCR22", "TOTMCD22", "TOTPRV22",
    "TOTVA22", "TOTTRI22", "TOTOFD22", "TOTWCP22", "TOTOSR22", "TOTOTH22"
  )

  fyc <- fyc_raw %>%
    select(all_of(vars_select)) %>%
    # Keep rows with a positive weight, positive family income and age >= 18.
    # FAMINC22 > 0 also keeps the burden_ratio denominator strictly positive.
    filter(PERWT22F > 0, FAMINC22 > 0, AGE22X >= 18) %>%
    mutate(
      # Self-paid spending (TOTSLF22) as a share of family income.
      burden_ratio = TOTSLF22 / FAMINC22,
      # Outcome: "Yes" when the ratio exceeds 0.10.
      high_burden = factor(
        burden_ratio > 0.10,
        levels = c(FALSE, TRUE),
        labels = c("No", "Yes")
      )
    ) %>%
    # Negative codes in these columns become NA before factors and counts are
    # derived from them.
    mutate(
      across(
        c(
          RTHLTH53, MNHLTH53, EDUCYR, DIABDX_M18, HIBPDX, ASTHDX,
          ARTHDX, CANCERDX, STRKDX, DLAYCA42, AFRDCA42,
          OBTOTV22, ERTOT22, IPDIS22, HHTOTD22
        ),
        recode_meps_missing
      )
    ) %>%
    # Labelled factors; any value not listed in `levels` becomes NA.
    mutate(
      sex = factor(SEX, levels = 1:2, labels = c("Male", "Female")),
      race_eth = factor(
        RACETHX,
        levels = 1:5,
        labels = c(
          "Hispanic", "NH White", "NH Black",
          "NH Asian", "NH Other/Multiple"
        )
      ),
      # REGION22 is not recoded above, so -1 is kept as its own level.
      region = factor(
        REGION22,
        levels = c(-1, 1:4),
        labels = c("Unknown", "Northeast", "Midwest", "South", "West")
      ),
      poverty = factor(
        POVCAT22,
        levels = 1:5,
        labels = c(
          "Poor/Negative", "Near Poor", "Low Income",
          "Middle Income", "High Income"
        )
      ),
      insurance = factor(
        INSCOV22,
        levels = 1:3,
        labels = c("Private", "Public Only", "Uninsured")
      ),
      health_phys = factor(
        RTHLTH53,
        levels = 1:5,
        labels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
      ),
      health_ment = factor(
        MNHLTH53,
        levels = 1:5,
        labels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
      ),
      # Diagnosis flags: code 2 -> "No" (first level), code 1 -> "Yes".
      diabetes = factor(DIABDX_M18, levels = c(2, 1), labels = c("No", "Yes")),
      highbp = factor(HIBPDX, levels = c(2, 1), labels = c("No", "Yes")),
      asthma = factor(ASTHDX, levels = c(2, 1), labels = c("No", "Yes")),
      arthritis = factor(ARTHDX, levels = c(2, 1), labels = c("No", "Yes")),
      cancer = factor(CANCERDX, levels = c(2, 1), labels = c("No", "Yes")),
      stroke = factor(STRKDX, levels = c(2, 1), labels = c("No", "Yes")),
      # NOTE(review): these two use Yes/No level order, unlike the No/Yes order
      # of the diagnosis factors above -- confirm this is intended.
      delayed_care = factor(
        DLAYCA42,
        levels = c(1, 2),
        labels = c("Yes", "No")
      ),
      forgone_care = factor(
        AFRDCA42,
        levels = c(1, 2),
        labels = c("Yes", "No")
      ),
      # cut() intervals are right-closed: (17,25], (25,35], ..., (55,65],
      # (65,Inf]. Age 65 therefore falls in "56-65" and the open-ended bin
      # starts at 66, so it is labelled "66+".
      age_group = cut(
        AGE22X,
        breaks = c(17, 25, 35, 45, 55, 65, Inf),
        labels = c("18-25", "26-35", "36-45", "46-55", "56-65", "66+")
      )
    ) %>%
    mutate(
      # Number of the six diagnosis columns equal to 1; NA does not count.
      n_chronic = rowSums(
        across(
          c(DIABDX_M18, HIBPDX, ASTHDX, ARTHDX, CANCERDX, STRKDX),
          ~ .x == 1
        ),
        na.rm = TRUE
      )
    )

  # 80/20 split stratified on the outcome; the seed is set before each
  # resampling call so both are reproducible.
  set.seed(231)
  fyc_split <- initial_split(fyc, prop = 0.80, strata = high_burden)
  fyc_train <- training(fyc_split)
  fyc_test <- testing(fyc_split)

  # 10-fold CV on the training set, stratified the same way.
  set.seed(231)
  cv_folds <- vfold_cv(fyc_train, v = 10, strata = high_burden)

  save(fyc_split, fyc_train, fyc_test, cv_folds, file = cache_path)
}