---
title: "Dying on the Margin: How Hospital Capacity Utilization Predicts Risk-Adjusted Mortality"
subtitle: "Math 425 - Applied Linear Regression | BYU-Idaho"
author: "Markuss Saule"
date: today
format:
html:
theme: flatly
toc: true
toc-depth: 3
toc-location: left
code-fold: true
code-tools: true
embed-resources: true
fig-width: 9
fig-height: 6
execute:
warning: false
message: false
cache: true
---
<style>
mjx-container[display="true"] {
overflow-x: auto;
overflow-y: hidden;
max-width: 100%;
}
</style>
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.align = "center"
)
library(tidyverse)
library(knitr)
library(scales)
library(broom)
theme_set(theme_minimal(base_size = 12))
options(scipen = 999)
options(knitr.kable.NA = "")
```
```{r helpers, include=FALSE}
download_if_missing <- function(url, path) {
if (!file.exists(path) || file.info(path)$size == 0) {
download.file(
url,
destfile = path,
mode = "wb",
quiet = TRUE,
method = "libcurl"
)
}
path
}
extract_if_missing <- function(zip_path, member, exdir) {
dir.create(exdir, recursive = TRUE, showWarnings = FALSE)
out_path <- file.path(exdir, member)
if (!file.exists(out_path)) {
unzip(zip_path, files = member, exdir = exdir)
}
out_path
}
filter_hcris_s3_lines <- function(input_path, output_path, chunk_size = 200000L) {
if (file.exists(output_path) && file.info(output_path)$size > 0) {
return(output_path)
}
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)
in_con <- file(input_path, open = "rt")
out_con <- file(output_path, open = "wt")
on.exit(close(in_con), add = TRUE)
on.exit(close(out_con), add = TRUE)
target_pattern <- ",S300001,01400,(00200|00300|00800|01500),"
repeat {
lines <- readLines(in_con, n = chunk_size, warn = FALSE)
if (length(lines) == 0) {
break
}
keep <- grepl(target_pattern, lines)
if (any(keep)) {
writeLines(lines[keep], out_con)
}
}
output_path
}
fmt_num <- function(x, digits = 2) {
scales::number(x, accuracy = 10^(-digits), big.mark = ",")
}
fmt_pct <- function(x, digits = 1) {
scales::percent(x, accuracy = 10^(-digits))
}
fmt_p <- function(p) {
ifelse(is.na(p), "", ifelse(p < 0.001, "< 0.001", sprintf("%.3f", p)))
}
show_summary <- function(model, label) {
sum_lines <- capture.output(summary(model))
cat(
"<details><summary><b>", label, "</b></summary>\n\n```text\n",
paste(sum_lines, collapse = "\n"),
"\n```\n\n</details>\n",
sep = ""
)
}
mode_value <- function(x) {
ux <- na.omit(x)
names(sort(table(ux), decreasing = TRUE))[1]
}
```
```{r data-build, include=FALSE}
data_dir <- file.path(getwd(), "data-cache")
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
mortality_url <- "https://data.nber.org/cms/compare/hospital/2025/2/complications_and_deaths_hospital.csv"
hospital_info_url <- "https://data.nber.org/cms/compare/hospital/2025/2/hospital_general_information.csv"
pos_url <- "https://data.nber.org/cms/pos/csv/2024/posotherdec2024.csv"
hcris_zip_url <- "https://downloads.cms.gov/FILES/HCRIS/HOSP10FY2023.ZIP"
mortality_path <- download_if_missing(
mortality_url,
file.path(data_dir, "complications_and_deaths_hospital_2025_2.csv")
)
hospital_info_path <- download_if_missing(
hospital_info_url,
file.path(data_dir, "hospital_general_information_2025_2.csv")
)
pos_path <- download_if_missing(
pos_url,
file.path(data_dir, "posotherdec2024.csv")
)
hcris_zip_path <- download_if_missing(
hcris_zip_url,
file.path(data_dir, "HOSP10FY2023.ZIP")
)
hcris_extract_dir <- file.path(data_dir, "hcris_2023")
hcris_rpt_path <- extract_if_missing(hcris_zip_path, "HOSP10_2023_rpt.csv", hcris_extract_dir)
hcris_nmrc_path <- extract_if_missing(hcris_zip_path, "HOSP10_2023_nmrc.csv", hcris_extract_dir)
hcris_nmrc_filtered_path <- filter_hcris_s3_lines(
hcris_nmrc_path,
file.path(hcris_extract_dir, "HOSP10_2023_nmrc_s3_line14.csv")
)
mortality_measures <- c("MORT_30_AMI", "MORT_30_HF", "MORT_30_PN")
primary_measure <- "MORT_30_HF"
primary_score_var <- "mort_30_hf_score"
primary_denom_var <- "mort_30_hf_denominator"
primary_measure_label <- "risk-adjusted 30-day heart-failure mortality"
mortality_long <- readr::read_csv(
mortality_path,
show_col_types = FALSE,
col_select = c(
facility_id,
facility_name,
state,
measure_id,
measure_name,
compared_to_national,
denominator,
score,
lower_estimate,
higher_estimate,
start_date,
end_date
)
) %>%
filter(
!is.na(facility_id),
measure_id %in% mortality_measures,
!is.na(score)
) %>%
mutate(
ccn = sprintf("%06d", as.integer(facility_id))
) %>%
group_by(ccn, facility_name, state, measure_id, measure_name) %>%
summarise(
denominator = max(denominator, na.rm = TRUE),
score = mean(score, na.rm = TRUE),
.groups = "drop"
)
mortality_wide <- mortality_long %>%
pivot_wider(
id_cols = c(ccn, facility_name, state),
names_from = measure_id,
values_from = c(score, denominator),
names_glue = "{tolower(measure_id)}_{.value}"
)
hospital_info <- readr::read_csv(
hospital_info_path,
show_col_types = FALSE,
col_select = c(
facility_id,
facility_name,
state,
hospital_type,
hospital_ownership
)
) %>%
mutate(ccn = sprintf("%06d", as.integer(facility_id))) %>%
filter(hospital_type == "Acute Care Hospitals") %>%
dplyr::select(ccn, facility_name, state, hospital_ownership)
pos_info <- readr::read_csv(
pos_path,
show_col_types = FALSE,
col_select = c(
prvdr_num,
prvdr_ctgry_cd,
cbsa_urbn_rrl_ind,
mdcl_schl_afltn_cd
)
) %>%
transmute(
ccn = stringr::str_pad(prvdr_num, width = 6, side = "left", pad = "0"),
prvdr_ctgry_cd = prvdr_ctgry_cd,
is_rural = as.integer(cbsa_urbn_rrl_ind == "R"),
is_teaching = as.integer(mdcl_schl_afltn_cd %in% c(1, 2, 3))
) %>%
filter(prvdr_ctgry_cd == "01") %>%
distinct(ccn, .keep_all = TRUE)
hcris_rpt <- readr::read_csv(
hcris_rpt_path,
col_names = FALSE,
show_col_types = FALSE,
col_types = cols(.default = col_character())
) %>%
transmute(
rpt_rec_num = X1,
ccn = stringr::str_pad(X3, width = 6, side = "left", pad = "0"),
fy_bgn_dt = as.Date(X6, format = "%m/%d/%Y"),
fy_end_dt = as.Date(X7, format = "%m/%d/%Y"),
proc_dt = as.Date(X8, format = "%m/%d/%Y")
)
hcris_s3 <- readr::read_csv(
hcris_nmrc_filtered_path,
col_names = FALSE,
show_col_types = FALSE,
col_types = cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_character(),
X4 = col_character(),
X5 = col_double()
)
) %>%
transmute(
rpt_rec_num = X1,
clmn_num = X4,
value = X5
) %>%
pivot_wider(names_from = clmn_num, values_from = value) %>%
transmute(
rpt_rec_num = rpt_rec_num,
total_beds = .data[["00200"]],
bed_days_available = .data[["00300"]],
inpatient_days = .data[["00800"]],
total_discharges = .data[["01500"]]
)
hcris_occ <- hcris_rpt %>%
inner_join(hcris_s3, by = "rpt_rec_num") %>%
mutate(
occupancy_rate = inpatient_days / bed_days_available
) %>%
arrange(ccn, desc(fy_end_dt), desc(proc_dt)) %>%
distinct(ccn, .keep_all = TRUE)
analytic_base <- hospital_info %>%
inner_join(pos_info, by = "ccn") %>%
inner_join(hcris_occ, by = "ccn") %>%
left_join(mortality_wide, by = c("ccn", "facility_name", "state")) %>%
mutate(
ownership = case_when(
stringr::str_detect(hospital_ownership, "Voluntary non-profit") ~ "Nonprofit",
hospital_ownership == "Proprietary" ~ "For-profit",
stringr::str_detect(hospital_ownership, "Government") ~ "Government",
TRUE ~ "Other"
),
ownership = forcats::fct_relevel(
factor(ownership),
"Nonprofit",
"For-profit",
"Government",
"Other"
),
mortality_composite = rowMeans(
cbind(mort_30_ami_score, mort_30_hf_score, mort_30_pn_score),
na.rm = TRUE
)
)
make_analysis_data <- function(outcome_var, min_beds = 25, drop_ccn = character()) {
analytic_base %>%
mutate(mortality_rate = .data[[outcome_var]]) %>%
filter(
!is.na(mortality_rate),
occupancy_rate > 0.1,
occupancy_rate < 1.15,
total_beds >= min_beds,
!ccn %in% drop_ccn
) %>%
mutate(
occupancy_c = occupancy_rate - mean(occupancy_rate, na.rm = TRUE),
occupancy_c_sq = occupancy_c^2,
occupancy_c_cu = occupancy_c^3,
log_beds = log(total_beds)
)
}
analytic <- make_analysis_data(primary_score_var)
sample_flow <- tibble(
Step = c(
"Acute care hospitals in Hospital General Information",
"Matched to acute-care POS records",
"Matched to 2023 HCRIS occupancy records",
"Hospitals with available heart-failure mortality scores",
"Final analytic sample after occupancy and bed filters"
),
Hospitals = c(
nrow(hospital_info),
nrow(inner_join(hospital_info, pos_info, by = "ccn")),
nrow(inner_join(inner_join(hospital_info, pos_info, by = "ccn"), hcris_occ, by = "ccn")),
analytic_base %>% filter(!is.na(.data[[primary_score_var]])) %>% nrow(),
nrow(analytic)
)
)
missing_tbl <- analytic_base %>%
summarise(
heart_failure_mortality = mean(is.na(mort_30_hf_score)),
pneumonia_mortality = mean(is.na(mort_30_pn_score)),
ami_mortality = mean(is.na(mort_30_ami_score)),
occupancy_rate = mean(is.na(occupancy_rate)),
total_beds = mean(is.na(total_beds)),
is_teaching = mean(is.na(is_teaching)),
is_rural = mean(is.na(is_rural))
) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "MissingRate") %>%
mutate(MissingRate = percent(MissingRate, accuracy = 0.1))
summary_table <- analytic %>%
summarise(
`Heart-failure mortality (%)` = mean(mortality_rate),
`Occupancy rate` = mean(occupancy_rate),
`Total beds` = mean(total_beds),
`Bed days available` = mean(bed_days_available),
`Inpatient days` = mean(inpatient_days),
`Discharges` = mean(total_discharges),
`Teaching hospital share` = mean(is_teaching),
`Rural hospital share` = mean(is_rural, na.rm = TRUE)
) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Mean") %>%
mutate(
Mean = case_when(
Variable %in% c("Occupancy rate", "Teaching hospital share", "Rural hospital share") ~ percent(Mean, accuracy = 0.1),
TRUE ~ number(Mean, accuracy = 0.1, big.mark = ",")
)
)
```
```{r models-and-results, include=FALSE}
mod1 <- lm(mortality_rate ~ occupancy_rate, data = analytic)
mod2 <- lm(
mortality_rate ~ occupancy_rate + log_beds + is_teaching + is_rural + ownership,
data = analytic
)
mod3 <- lm(
mortality_rate ~ occupancy_c + occupancy_c_sq + log_beds + is_teaching + is_rural + ownership,
data = analytic
)
mod4 <- lm(
mortality_rate ~ occupancy_c * is_rural + occupancy_c_sq + log_beds + is_teaching + ownership,
data = analytic
)
quad_row <- coef(summary(mod3))["occupancy_c_sq", ]
linear_row <- coef(summary(mod3))["occupancy_c", ]
anova_23 <- anova(mod2, mod3)
interaction_row <- coef(summary(mod4))["occupancy_c:is_rural", ]
model_comp <- tibble(
Model = c("Simple", "Multiple", "Quadratic", "Rural interaction"),
`Adj R^2` = c(
summary(mod1)$adj.r.squared,
summary(mod2)$adj.r.squared,
summary(mod3)$adj.r.squared,
summary(mod4)$adj.r.squared
),
Sigma = c(
summary(mod1)$sigma,
summary(mod2)$sigma,
summary(mod3)$sigma,
summary(mod4)$sigma
),
AIC = c(AIC(mod1), AIC(mod2), AIC(mod3), AIC(mod4))
) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
coef_table_mod3 <- broom::tidy(mod3, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
coef_table_mod4 <- broom::tidy(mod4, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
mean_occ <- mean(analytic$occupancy_rate, na.rm = TRUE)
ref_profile <- tibble(
log_beds = median(analytic$log_beds, na.rm = TRUE),
is_teaching = median(analytic$is_teaching, na.rm = TRUE),
is_rural = 0,
ownership = factor("Nonprofit", levels = levels(analytic$ownership))
)
hero_grid <- tibble(
occupancy_rate = seq(
min(analytic$occupancy_rate, na.rm = TRUE),
max(analytic$occupancy_rate, na.rm = TRUE),
length.out = 300
)
) %>%
mutate(
occupancy_c = occupancy_rate - mean_occ,
occupancy_c_sq = occupancy_c^2,
log_beds = ref_profile$log_beds,
is_teaching = ref_profile$is_teaching,
is_rural = ref_profile$is_rural,
ownership = ref_profile$ownership
)
hero_pred <- predict(mod3, newdata = hero_grid, se.fit = TRUE)
hero_grid <- hero_grid %>%
mutate(
fit = hero_pred$fit,
lower = hero_pred$fit - 1.96 * hero_pred$se.fit,
upper = hero_pred$fit + 1.96 * hero_pred$se.fit
)
lowess_fit <- lowess(analytic$occupancy_rate, analytic$mortality_rate)
lowess_df <- tibble(
occupancy_rate = lowess_fit$x,
mortality_rate = lowess_fit$y
)
rural_grid <- expand_grid(
occupancy_rate = seq(
min(analytic$occupancy_rate, na.rm = TRUE),
max(analytic$occupancy_rate, na.rm = TRUE),
length.out = 200
),
is_rural = c(0, 1)
) %>%
mutate(
occupancy_c = occupancy_rate - mean_occ,
occupancy_c_sq = occupancy_c^2,
log_beds = median(analytic$log_beds, na.rm = TRUE),
is_teaching = median(analytic$is_teaching, na.rm = TRUE),
ownership = factor("Nonprofit", levels = levels(analytic$ownership)),
location = if_else(is_rural == 1, "Rural", "Urban"),
predicted = predict(mod4, newdata = cur_data())
)
cook_cutoff <- 4 / nrow(analytic)
cook_tbl <- analytic %>%
mutate(cooks_d = cooks.distance(mod3)) %>%
arrange(desc(cooks_d))
influential_tbl <- cook_tbl %>%
filter(cooks_d > cook_cutoff) %>%
dplyr::select(facility_name, state, occupancy_rate, mortality_rate, total_beds, cooks_d) %>%
mutate(
occupancy_rate = percent(occupancy_rate, accuracy = 0.1),
mortality_rate = number(mortality_rate, accuracy = 0.1),
total_beds = scales::comma(total_beds),
cooks_d = round(cooks_d, 4)
)
analytic_no_influential <- make_analysis_data(
primary_score_var,
drop_ccn = cook_tbl %>% filter(cooks_d > cook_cutoff) %>% pull(ccn)
)
mod3_no_influential <- lm(
mortality_rate ~ occupancy_c + occupancy_c_sq + log_beds + is_teaching + is_rural + ownership,
data = analytic_no_influential
)
mod3_cubic <- lm(
mortality_rate ~ occupancy_c + occupancy_c_sq + occupancy_c_cu +
log_beds + is_teaching + is_rural + ownership,
data = analytic
)
analytic_100 <- make_analysis_data(primary_score_var, min_beds = 100)
mod3_100 <- lm(
mortality_rate ~ occupancy_c + occupancy_c_sq + log_beds + is_teaching + is_rural + ownership,
data = analytic_100
)
sensitivity_tbl <- purrr::map_dfr(
c("mort_30_ami_score", "mort_30_hf_score", "mort_30_pn_score"),
function(outcome_var) {
dat <- make_analysis_data(outcome_var)
fit <- lm(
mortality_rate ~ occupancy_c + occupancy_c_sq + log_beds + is_teaching + is_rural + ownership,
data = dat
)
tibble(
Outcome = case_when(
outcome_var == "mort_30_ami_score" ~ "AMI mortality",
outcome_var == "mort_30_hf_score" ~ "Heart-failure mortality",
TRUE ~ "Pneumonia mortality"
),
N = nrow(dat),
QuadraticBeta = coef(fit)[["occupancy_c_sq"]],
QuadraticP = coef(summary(fit))["occupancy_c_sq", "Pr(>|t|)"],
AdjR2 = summary(fit)$adj.r.squared
)
}
) %>%
mutate(
QuadraticBeta = round(QuadraticBeta, 4),
QuadraticP = ifelse(QuadraticP < 0.001, "< 0.001", sprintf("%.3f", QuadraticP)),
AdjR2 = round(AdjR2, 4)
)
appendix_tbl <- tibble(
Analysis = c(
"Primary model without Cook's D > 4/n hospitals",
"Primary model with cubic occupancy term",
"Primary model restricted to hospitals with 100+ beds"
),
N = c(
nrow(analytic_no_influential),
nrow(analytic),
nrow(analytic_100)
),
`Key occupancy term` = c(
"occupancy_c_sq",
"occupancy_c_cu",
"occupancy_c_sq"
),
Estimate = c(
coef(mod3_no_influential)[["occupancy_c_sq"]],
coef(mod3_cubic)[["occupancy_c_cu"]],
coef(mod3_100)[["occupancy_c_sq"]]
),
`p-value` = c(
coef(summary(mod3_no_influential))["occupancy_c_sq", "Pr(>|t|)"],
coef(summary(mod3_cubic))["occupancy_c_cu", "Pr(>|t|)"],
coef(summary(mod3_100))["occupancy_c_sq", "Pr(>|t|)"]
)
) %>%
mutate(
Estimate = round(Estimate, 4),
`p-value` = ifelse(`p-value` < 0.001, "< 0.001", sprintf("%.3f", `p-value`))
)
```
# Abstract
This paper examines whether annual hospital bed occupancy is associated with risk-adjusted mortality in national CMS data. I merged `r nrow(hospital_info)` acute-care hospitals from CMS Hospital General Information with 2024 Provider of Services characteristics and 2023 HCRIS bed-use data, then modeled `r primary_measure_label` as a function of annual bed occupancy, size, teaching status, rurality, and ownership. In the final analytic sample of `r scales::comma(nrow(analytic))` hospitals, the quadratic occupancy term was statistically significant in the primary model (`r fmt_p(quad_row["Pr(>|t|)"])`) and negative (`r round(quad_row["Estimate"], 4)`), indicating downward curvature in the fitted relationship. The LOWESS smoother and the fitted quadratic curve told the same basic story: the relationship bends downward rather than turning sharply upward at high occupancy. The main conclusion is that these annual hospital-level CMS data do not reveal a high-occupancy mortality spike; instead, lower-occupancy hospitals in this cross section tend to have worse risk-adjusted heart-failure mortality.
# Introduction
Hospital crowding matters because hospitals cannot add staffed beds instantly when demand spikes. Once inpatient units operate close to capacity, nurses, physicians, transport systems, and discharge processes all have less slack. At the same time, annual occupancy may also capture broader differences in hospital scale, organization, and referral patterns rather than pure operational strain. That makes hospital occupancy a useful variable for a discovery analysis: it may reveal either a harmful threshold pattern or a more complicated cross-sectional relationship.
The research question is:
**Is there a nonlinear relationship between hospital bed occupancy rate and risk-adjusted 30-day mortality, and if so, what shape does that relationship take?**
For the primary model, I test the quadratic occupancy effect in the form used in class:
$$
H_0: \beta_2 = 0
$$
$$
H_a: \beta_2 \neq 0
$$
$$
\alpha = 0.05
$$
with
$$
Y_i = \beta_0 + \beta_1 \cdot \text{Occupancy}_{c,i} + \beta_2 \cdot \text{Occupancy}_{c,i}^2 + \beta_3 \cdot \log(\text{Beds})_i + \beta_4 \cdot \text{Teaching}_i + \beta_5 \cdot \text{Rural}_i + \beta_6 \cdot \text{Ownership}_i + \epsilon_i
$$
where $Y_i$ is the hospital's risk-adjusted 30-day heart-failure mortality rate and $\text{Occupancy}_{c,i}$ is the centered annual bed occupancy rate.
For the rural interaction, I test:
$$
H_0: \beta_{\text{Rural:Occupancy}} = 0
$$
$$
H_a: \beta_{\text{Rural:Occupancy}} \neq 0
$$
$$
\alpha = 0.05
$$
I also planned a staffing interaction, but I did not estimate it in the final model because the public POS staffing headcounts are not aligned cleanly with the inpatient census denominator used for occupancy. In this one-file reproducible workflow, that staffing variable would have introduced more measurement noise than signal.
# Literature Review
The hospital crowding literature usually argues that strain should worsen outcomes, but it does so through several overlapping mechanisms. Sprivulis et al. (2006) linked emergency department overcrowding with higher short-term mortality among patients admitted through Western Australian EDs. Hoot and Aronsky (2008) reviewed the ED crowding literature and summarized recurring downstream effects such as treatment delay, ambulance diversion, and mortality risk. Those papers are not direct hospital-wide occupancy studies, but they motivate the broader idea that capacity strain can translate into clinical harm.
Nurse staffing studies push the same general logic from another angle. Needleman et al. (2002) showed that higher nurse staffing was associated with better quality measures in hospitals, while Aiken et al. (2002) connected heavier nurse workloads with higher mortality, burnout, and dissatisfaction. If occupancy grows faster than staff and beds can support it, those staffing papers suggest a plausible mechanism for worse outcomes.
More directly, Schilling et al. (2010) found that high hospital occupancy on admission was independently associated with in-hospital mortality in older Michigan patients, even when weekend admission, influenza, and staffing were considered simultaneously. Madsen et al. (2014) reported higher inpatient and 30-day mortality at high bed occupancy in Danish medical departments. Those studies make the present project worth doing, but they also leave room for a U.S. hospital-level replication using publicly reproducible federal data and a course-standard regression workflow built around `lm()`, residual diagnostics, and formal hypothesis testing.
# Data
I used four public sources:
1. CMS Care Compare Complications and Deaths - Hospital for risk-adjusted 30-day mortality measures.
2. CMS Hospital General Information for hospital ownership and acute-care hospital filtering.
3. The CMS Provider of Services file for rural and teaching indicators.
4. CMS HCRIS Hospital 2552-10 FY2023 cost reports for total beds, bed days available, inpatient days, and discharges.
The direct CMS compare-file hashes rotate, so I used the current NBER mirror for the Care Compare flat files. The HCRIS occupancy data came from the official CMS 2023 hospital zip. For Worksheet S-3 Part I in the CMS-2552-10 instructions, line 14 column 2 is total beds, column 3 is available bed days, column 8 is total inpatient days, and column 15 is total discharges. I used:
$$
\text{Occupancy Rate} = \frac{\text{Inpatient Days}}{\text{Bed Days Available}}
$$
## Download Code
```{r data-download-code}
mortality_url
hospital_info_url
pos_url
hcris_zip_url
```
```{r data-wrangling-code, eval=FALSE}
mortality_long <- readr::read_csv(mortality_path, show_col_types = FALSE) %>%
filter(
!is.na(facility_id),
measure_id %in% c("MORT_30_AMI", "MORT_30_HF", "MORT_30_PN"),
!is.na(score)
) %>%
mutate(ccn = sprintf("%06d", as.integer(facility_id))) %>%
group_by(ccn, measure_id) %>%
summarise(
denominator = max(denominator, na.rm = TRUE),
score = mean(score, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(
names_from = measure_id,
values_from = c(score, denominator),
names_glue = "{tolower(measure_id)}_{.value}"
)
analytic <- analytic_base %>%
mutate(mortality_rate = mort_30_hf_score) %>%
filter(
!is.na(mortality_rate),
occupancy_rate > 0.1,
occupancy_rate < 1.15,
total_beds >= 25
) %>%
mutate(
occupancy_c = occupancy_rate - mean(occupancy_rate, na.rm = TRUE),
occupancy_c_sq = occupancy_c^2,
log_beds = log(total_beds)
)
```
## Sample Construction
```{r sample-flow}
sample_flow %>%
kable(
caption = "Sample construction for the primary heart-failure mortality analysis"
)
```
The analysis starts with `r scales::comma(nrow(hospital_info))` acute-care hospitals from the Hospital General Information file. After matching to POS characteristics and 2023 HCRIS occupancy records, `r scales::comma(sample_flow$Hospitals[3])` hospitals remained. Of those, `r scales::comma(sample_flow$Hospitals[4])` had an available risk-adjusted heart-failure mortality score, and `r scales::comma(nrow(analytic))` survived the quality filters on occupancy and bed count.
## Data Quality
```{r data-quality}
missing_tbl %>%
kable(caption = "Missingness rates in the merged hospital-level file")
```
```{r summary-stats}
summary_table %>%
kable(
caption = "Table 1. Descriptive statistics for the final analytic sample"
)
```
The average hospital in the final analytic sample operated at about `r fmt_pct(mean(analytic$occupancy_rate), 1)` occupancy and had `r scales::comma(round(mean(analytic$total_beds)))` beds. About `r fmt_pct(mean(analytic$is_teaching), 1)` were teaching-affiliated under the POS medical-school coding, and about `r fmt_pct(mean(analytic$is_rural, na.rm = TRUE), 1)` were rural. The average risk-adjusted 30-day heart-failure mortality rate was `r fmt_num(mean(analytic$mortality_rate), 2)` percent.
I chose heart-failure mortality as the primary outcome because it offered a large national sample, clinically broad medical volume, and a direct CMS risk-adjusted measure. AMI and pneumonia models appear in the appendix as sensitivity checks.
# Methods
This paper frames the regression around `lm()`, `summary()`, and direct interpretation of coefficients.
## Model 1: Simple Linear Regression
$$
Y_i = \beta_0 + \beta_1 \cdot \text{Occupancy}_i + \epsilon_i
$$
This model asks whether there is any average linear relationship between annual occupancy and heart-failure mortality before adding controls.
## Model 2: Multiple Linear Regression
$$
Y_i = \beta_0 + \beta_1 \cdot \text{Occupancy}_i + \beta_2 \cdot \log(\text{Beds})_i + \beta_3 \cdot \text{Teaching}_i + \beta_4 \cdot \text{Rural}_i + \beta_5 \cdot \text{Ownership}_i + \epsilon_i
$$
This model adjusts for hospital size and structure before testing curvature.
## Model 3: Quadratic Model
$$
Y_i = \beta_0 + \beta_1 \cdot \text{Occupancy}_{c,i} + \beta_2 \cdot \text{Occupancy}_{c,i}^2 + \beta_3 \cdot \log(\text{Beds})_i + \beta_4 \cdot \text{Teaching}_i + \beta_5 \cdot \text{Rural}_i + \beta_6 \cdot \text{Ownership}_i + \epsilon_i
$$
I centered occupancy before squaring it so the linear and quadratic terms would be easier to interpret together:
$$
\text{Occupancy}_{c,i} = \text{Occupancy}_i - \overline{\text{Occupancy}}
$$
The primary test is the coefficient on $\text{Occupancy}_{c,i}^2$. I also use the partial F-test comparing Model 2 to Model 3:
$$
H_0: \beta_2 = 0 \qquad \text{vs.} \qquad H_a: \beta_2 \neq 0
$$
The partial F-test and the t-test on the quadratic term both assess whether curvature is present. The sign of the estimated quadratic coefficient then tells whether the fitted relationship bends upward or downward.
## Model 4: Rural Interaction
$$
Y_i = \beta_0 + \beta_1 \cdot \text{Occupancy}_{c,i} + \beta_2 \cdot \text{Occupancy}_{c,i}^2 + \beta_3 \cdot \text{Rural}_i + \beta_4 \cdot \left(\text{Occupancy}_{c,i} \times \text{Rural}_i\right) + \cdots + \epsilon_i
$$
This tests whether rural hospitals respond differently to crowding.
I also use a LOWESS curve as a visual check for whether a straight line is reasonable before moving to the quadratic model. That follows the textbook approach of using LOWESS to look for bending in the scatterplot before deciding that a more flexible model is needed.
## Diagnostic Plan
For the primary quadratic model, I check:
1. Linearity with Residuals vs Fitted and the LOWESS overlay.
2. Constant variance with Residuals vs Fitted.
3. Normality with the Q-Q plot.
4. Independence with Residuals vs Order.
5. Fixed X by arguing that the administrative predictor values can reasonably be treated as fixed for this cross-sectional analysis.
# Results
## Visual Pattern
```{r hero-figure, fig.height=6.5, fig.cap="Figure 1. Heart-failure mortality changes nonlinearly across the annual occupancy distribution in the national hospital cross section. The blue curve is the adjusted quadratic fit from Model 3 with a 95% confidence band, and the dashed red line is the raw LOWESS smoother. Both curves slope downward rather than showing an upward mortality spike at high occupancy. Predictions are shown for a median-size, urban, non-teaching, nonprofit hospital."}
ggplot(analytic, aes(x = occupancy_rate, y = mortality_rate)) +
geom_point(color = "gray35", alpha = 0.18, size = 1.4) +
geom_ribbon(
data = hero_grid,
aes(x = occupancy_rate, ymin = lower, ymax = upper),
inherit.aes = FALSE,
fill = "#9ecae1",
alpha = 0.35
) +
geom_line(
data = hero_grid,
aes(x = occupancy_rate, y = fit),
inherit.aes = FALSE,
color = "#08519c",
linewidth = 1.2
) +
geom_line(
data = lowess_df,
aes(x = occupancy_rate, y = mortality_rate),
inherit.aes = FALSE,
color = "#cb181d",
linewidth = 1,
linetype = "dashed"
) +
annotate(
"label",
x = quantile(analytic$occupancy_rate, 0.20, na.rm = TRUE),
y = max(analytic$mortality_rate, na.rm = TRUE) - 0.4,
label = "LOWESS and quadratic fit\nboth bend downward",
hjust = 0,
size = 3.6,
fill = "white"
) +
labs(
title = "Heart-Failure Mortality Changes Nonlinearly Across the Occupancy Distribution",
subtitle = paste0(
"Risk-adjusted 30-day heart-failure mortality vs annual occupancy, N = ",
scales::comma(nrow(analytic)),
" U.S. acute care hospitals"
),
x = "Bed occupancy rate",
y = "Risk-adjusted 30-day heart-failure mortality (%)"
) +
scale_x_continuous(labels = label_percent(accuracy = 1)) +
scale_y_continuous(labels = label_number(accuracy = 0.1))
```
Figure 1 already shows the main result: the fitted curve does not turn upward at high occupancy. Instead, both the adjusted quadratic model and the LOWESS smoother slope downward. In this national cross section, the main pattern is nonlinear decline rather than an upward mortality spike at high occupancy.
## Regression Summaries
```{r model-summaries, results='asis'}
show_summary(mod1, "summary(mod1): simple linear regression")
show_summary(mod2, "summary(mod2): multiple linear regression with controls")
show_summary(mod3, "summary(mod3): primary quadratic model")
show_summary(mod4, "summary(mod4): rural interaction model")
```
```{r model-comparison}
model_comp %>%
kable(
caption = "Table 2. Model comparison for the primary heart-failure outcome"
)
```
```{r mod3-coefs}
coef_table_mod3 %>%
kable(
caption = "Table 3. Coefficients for the primary quadratic model (Model 3)"
)
```
### Interpreting the Primary Model
Model 1 already shows a negative linear association between occupancy and heart-failure mortality. The estimated slope is `r round(coef(mod1)[["occupancy_rate"]], 4)`, with p = `r fmt_p(coef(summary(mod1))["occupancy_rate", "Pr(>|t|)"])`. So before any controls, higher annual occupancy is associated with lower risk-adjusted heart-failure mortality in this cross section.
In Model 3, the centered linear term is `r round(linear_row["Estimate"], 4)` and the quadratic term is `r round(quad_row["Estimate"], 4)`. Because occupancy was centered before squaring, the linear term describes the pattern near the average occupancy level in the data, and the quadratic term tells whether the fitted line bends upward or downward. Since the quadratic term is negative, the fitted relationship bends downward rather than upward as occupancy increases.
The bed-size coefficient is also negative: larger hospitals tend to have lower heart-failure mortality after adjustment. Rural hospitals have higher mortality than otherwise similar urban hospitals. Teaching status is negative but not statistically significant at the 0.05 level in this specification.
## Primary Hypothesis Test
The primary null and alternative were:
$$
H_0: \beta_2 = 0 \qquad \text{vs.} \qquad H_a: \beta_2 \neq 0
$$
The partial F-test comparing Model 2 and Model 3 gave p = `r fmt_p(anova_23[2, 6])`, so the quadratic term improves model fit. The estimated quadratic coefficient is negative, which means the fitted curve bends downward rather than upward. Therefore, I reject $H_0$ at $\alpha = 0.05$ and conclude that the occupancy relationship is nonlinear in this sample.
In plain English: the national hospital cross section does not provide evidence that mortality accelerates upward once occupancy gets high. Instead, the fitted annual relationship curves downward.
## Rural Interaction
```{r mod4-coefs}
coef_table_mod4 %>%
kable(
caption = "Table 4. Coefficients for the rural interaction model (Model 4)"
)
```
```{r rural-figure, fig.cap="Figure 2. Predicted heart-failure mortality curves from the rural interaction model. The rural line stays above the urban line, but the occupancy-by-rural interaction is not statistically significant at alpha = 0.05."}
ggplot(rural_grid, aes(x = occupancy_rate, y = predicted, color = location)) +
geom_line(linewidth = 1.2) +
scale_color_manual(values = c("Urban" = "#08519c", "Rural" = "#cb181d")) +
scale_x_continuous(labels = label_percent(accuracy = 1)) +
scale_y_continuous(labels = label_number(accuracy = 0.1)) +
labs(
title = "Rural Hospitals Have Higher Predicted Mortality, but Not a Different Occupancy Slope",
subtitle = "Predictions from Model 4 for a median-size, nonprofit, non-teaching hospital",
x = "Bed occupancy rate",
y = "Predicted heart-failure mortality (%)",
color = "Hospital location"
)
```
The occupancy-by-rural interaction estimate in Model 4 is `r round(interaction_row["Estimate"], 4)` with p = `r fmt_p(interaction_row["Pr(>|t|)"])`. At $\alpha = 0.05$, I fail to reject $H_0$ for the rural interaction. Rural hospitals have higher predicted mortality overall, but the data do not provide strong evidence that their occupancy response is different from urban hospitals once the rest of the model is included.
## Diagnostics
```{r diagnostics-figure, fig.height=7, fig.width=9, fig.cap="Figure 3. Diagnostic plots for the primary quadratic model. The panel includes Residuals vs Fitted, Normal Q-Q, Cook's distance, and Residuals vs Order."}
par(mfrow = c(2, 2))
plot(mod3, which = 1)
plot(mod3, which = 2)
plot(mod3, which = 4)
plot(
mod3$residuals ~ seq_along(mod3$residuals),
xlab = "Hospital order in analytic file",
ylab = "Residual",
main = "Residuals vs Order"
)
abline(h = 0, lty = 2)
par(mfrow = c(1, 1))
```
The Residuals vs Fitted plot shows a mostly horizontal cloud centered around zero once the quadratic term is included. There is some uneven spread at the lower end of the fitted values, but there is not a strong remaining curve, so the model form looks acceptable for this class-level analysis.
The vertical spread of residuals is reasonably stable across the fitted range. I do not see a strong funnel shape, so the constant-variance assumption looks reasonable for this analysis.
The Q-Q plot is fairly straight through the center, with some tail departure at the extremes. With `r scales::comma(nrow(analytic))` hospitals, those mild tail deviations do not appear severe enough to undermine the regression inference in a practical sense.
The Residuals vs Order plot does not show an obvious run pattern or periodic structure. Since the units are hospitals rather than repeated measurements through time, that visual check is weaker than it would be in a time series, but there is still no clear sign of dependence.
The fixed-$X$ assumption is also reasonable here. Occupancy, beds, rurality, and teaching status are observed administrative characteristics taken from CMS files. In the textbook sense, treating those predictor values as fixed is a reasonable approximation for this hospital-level regression.
```{r cooks-table}
influential_tbl %>%
slice_head(n = 12) %>%
kable(
caption = "Table 5. Highest Cook's distance hospitals in the primary model"
)
```
Using the common cutoff $4/n$, `r scales::comma(nrow(influential_tbl))` hospitals were flagged by Cook's distance. Because $n$ is large, that cutoff is very small here, so it is more of a screening rule than a sign that the model is dominated by a handful of hospitals. The appendix re-fits the model after removing those hospitals.
# Discussion
The most important result is that the occupancy-mortality relationship is nonlinear, but not in the form many readers might expect. In these annual hospital-level public data, higher occupancy is associated with lower heart-failure mortality after adjustment, and the curvature is negative rather than positive. That is still a useful discovery because it shows that annual average occupancy is not behaving like a simple hospital stress-threshold signal in this national cross section.
There are several plausible reasons for that mismatch. First, annual average occupancy is much smoother than the day-to-day or shift-to-shift strain that clinicians experience. A hospital can have a moderate annual occupancy average while still suffering dangerous daily spikes, and the CMS heart-failure mortality measure averages over long windows too. Second, higher-occupancy hospitals in this sample also tend to be larger and more urban, which may correlate with specialty coverage, transfer networks, and rescue capacity that are not fully captured by the regression controls. Third, the CMS outcome is already risk-adjusted, which is appropriate, but it also absorbs part of the variability that raw mortality might show.
The rural result fits that interpretation. Rural hospitals had higher mortality overall, but I did not find strong evidence that their occupancy slope was different. So the cross-sectional disadvantage seems more like a level shift than a different crowding response in this specification.
This project still matters for hospital administrators. If a manager used annual occupancy alone as a hospital safety alarm, this analysis suggests that the signal would be too blunt. The better interpretation is that occupancy must be paired with more proximate operational measures such as daily boarding, nurse staffing, transfer delays, or ICU strain. Public annual hospital aggregates appear too coarse to reveal short-run crowding effects cleanly.
## Limitations
1. The study is cross-sectional and observational, so it cannot support a causal claim that occupancy changes mortality.
2. The occupancy measure is an annual average from HCRIS. It misses the daily spikes that probably matter most for real crowding.
3. The outcome is a hospital-level CMS risk-adjusted rate, so ecological fallacy is possible.
4. Unmeasured confounding remains likely, especially around staffing, referral patterns, specialty services, and case-mix complexity beyond what CMS already adjusted.
5. The tertiary staffing interaction was not estimated because the public staffing headcounts did not align well with the occupancy denominator in a defensible one-file workflow.
# Conclusion
Using national CMS public data and a course-standard applied linear regression workflow, I found that annual hospital bed occupancy did predict risk-adjusted 30-day heart-failure mortality, but the fitted relationship bent downward rather than upward. The quadratic term improved model fit, and the LOWESS curve supported the same general shape. For hospital operations, the lesson is that annual occupancy alone is too coarse to act as a reliable safety threshold in this setting. In this discovery analysis, occupancy captured cross-sectional hospital differences more clearly than it captured a high-occupancy mortality threshold.
# Appendix
## A1. Different Mortality Measures
```{r appendix-measures}
sensitivity_tbl %>%
kable(
caption = "Appendix Table A1. Quadratic occupancy results across CMS mortality measures"
)
```
The heart-failure and AMI models both showed statistically significant negative quadratic terms, while pneumonia did not. Across the condition-specific models, the curvature was either negative or statistically indistinguishable from zero.
## A2. Excluding Influential Hospitals
```{r appendix-influential}
appendix_tbl %>%
kable(
caption = "Appendix Table A2. Sensitivity analyses for the primary heart-failure model"
)
```
After removing hospitals with Cook's distance above $4/n$, the quadratic term remained negative, so the overall story did not change.
## A3. Cubic Term
The cubic occupancy term estimate was `r round(coef(mod3_cubic)[['occupancy_c_cu']], 4)` with p = `r fmt_p(coef(summary(mod3_cubic))['occupancy_c_cu', 'Pr(>|t|)'])`. I therefore fail to reject $H_0$ for the cubic term, which suggests that the quadratic specification is flexible enough for this assignment.
## A4. Hospitals with 100+ Beds Only
Among hospitals with at least 100 beds, the quadratic term remained negative (`r round(coef(mod3_100)[['occupancy_c_sq']], 4)`, p = `r fmt_p(coef(summary(mod3_100))['occupancy_c_sq', 'Pr(>|t|)'])`). So restricting the sample to larger hospitals did not reverse the core result.
## A5. Optional Pairs Plot
```{r appendix-pairs, fig.height=8, fig.width=8, fig.cap="Figure A1. Pairs plot for selected analytic variables. The plot shows the basic cross-sectional relationships among occupancy, heart-failure mortality, beds, and discharges."}
pairs(
analytic %>%
dplyr::select(occupancy_rate, mortality_rate, total_beds, total_discharges) %>%
rename(
Occupancy = occupancy_rate,
Mortality = mortality_rate,
Beds = total_beds,
Discharges = total_discharges
),
panel = panel.smooth
)
```
# References
1. Aiken, L. H., Clarke, S. P., Sloane, D. M., Sochalski, J., & Silber, J. H. (2002). Hospital nurse staffing and patient mortality, nurse burnout, and job dissatisfaction. *JAMA*, 288(16), 1987-1993. https://pubmed.ncbi.nlm.nih.gov/12387650/
2. Hoot, N. R., & Aronsky, D. (2008). Systematic review of emergency department crowding: causes, effects, and solutions. *Annals of Emergency Medicine*, 52(2), 126-136. https://pubmed.ncbi.nlm.nih.gov/18433933/
3. Madsen, F., Ladelund, S., & Linneberg, A. (2014). High levels of bed occupancy associated with increased inpatient and thirty-day hospital mortality in Denmark. *Health Affairs*, 33(7), 1236-1244. https://pubmed.ncbi.nlm.nih.gov/25006151/
4. Needleman, J., Buerhaus, P., Mattke, S., Stewart, M., & Zelevinsky, K. (2002). Nurse-staffing levels and the quality of care in hospitals. *New England Journal of Medicine*, 346(22), 1715-1722. https://pubmed.ncbi.nlm.nih.gov/12037152/
5. Schilling, P. L., Campbell, D. A., Englesbe, M. J., & Davis, M. M. (2010). A comparison of in-hospital mortality risk conferred by high hospital occupancy, differences in nurse staffing levels, weekend admission, and seasonal influenza. *Medical Care*, 48(3), 224-232. https://pubmed.ncbi.nlm.nih.gov/20168260/
6. Sprivulis, P. C., Da Silva, J.-A., Jacobs, I. G., Frazer, A. R. L., & Jelinek, G. A. (2006). Increase in patient mortality at 10 days associated with emergency department overcrowding. *Medical Journal of Australia*, 184(5), 213-216. https://pubmed.ncbi.nlm.nih.gov/16515430/
7. Saunders, G. (n.d.). *Applied Linear Regression*. https://saundersg.github.io/Applied-Linear-Regression/
8. CMS Care Compare Complications and Deaths - Hospital, current public file mirrored by NBER. https://data.nber.org/cms/compare/hospital/2025/2/complications_and_deaths_hospital.csv
9. CMS Hospital General Information, current public file mirrored by NBER. https://data.nber.org/cms/compare/hospital/2025/2/hospital_general_information.csv
10. CMS Provider of Services File, December 2024, mirrored by NBER. https://data.nber.org/cms/pos/csv/2024/posotherdec2024.csv
11. CMS HCRIS Hospital 2552-10 FY2023 cost report zip. https://downloads.cms.gov/FILES/HCRIS/HOSP10FY2023.ZIP
12. CMS-2552-10 Worksheet S-3 Part I instructions used to verify beds, bed days available, inpatient days, and discharges. https://www.cms.gov/Regulations-and-Guidance/Guidance/Transmittals/Downloads/R8P240s.pdf