## ----include = FALSE----------------------------------------------------------
# Merge each chunk's source and output into one block, prefixing output
# lines with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# Render gtsummary tables through the gt print engine
options(gtsummary.print_engine = "gt")

## ----setup--------------------------------------------------------------------
# library(sumExtras)
# library(gtsummary)
# library(dplyr)
# library(gt)
# 
# # Apply the recommended JAMA theme
# use_jama_theme()

## ----setup2-------------------------------------------------------------------
library(sumExtras)
library(gtsummary)
library(dplyr)
library(gt)
library(ggplot2)

# Apply the recommended JAMA theme before any tables are built
# NOTE(review): presumably this sets the active gtsummary theme for the rest
# of the session -- confirm against the sumExtras documentation
use_jama_theme()

## -----------------------------------------------------------------------------
# Work on a copy of the `trial` data so the original stays untouched
trial_example <- trial

# Attach a human-readable label to the age column via its "label" attribute
attr(trial_example[["age"]], "label") <- "Age at Enrollment (years)"

# Read the attribute back to confirm it was stored
attr(trial_example[["age"]], "label")

## -----------------------------------------------------------------------------
# Dictionary for the trial dataset: one row per variable, pairing the column
# name (`Variable`) with the label to display (`Description`)
dictionary <- tibble::tibble(
  Variable = c(
    "trt", "age", "marker", "stage", "grade", "response", "death"
  ),
  Description = c(
    "Chemotherapy Treatment",
    "Age at Enrollment (years)",
    "Marker Level (ng/mL)",
    "T Stage",
    "Tumor Grade",
    "Tumor Response",
    "Patient Died"
  )
)

dictionary

## ----eval=FALSE---------------------------------------------------------------
# # Typically at the top of your analysis script
# dictionary <- readr::read_csv("data/variable_dictionary.csv")

## -----------------------------------------------------------------------------
# Summarise three variables by treatment arm and label them from the
# explicitly supplied dictionary
tbl_summary(
  data = trial,
  by = trt,
  include = c(age, grade, marker)
) |>
  add_auto_labels(dictionary = dictionary) |>
  extras()

## -----------------------------------------------------------------------------
# No `dictionary` argument is passed here: the `dictionary` object created
# earlier is still in the environment and add_auto_labels() finds it there
tbl_summary(
  data = trial,
  by = trt,
  include = c(age, stage, response)
) |>
  add_auto_labels() |>
  extras()

## -----------------------------------------------------------------------------
# Copy the data and label two columns through their "label" attribute
labeled_trial <- trial
attr(labeled_trial[["age"]], "label") <- "Patient Age at Baseline"
attr(labeled_trial[["marker"]], "label") <- "Biomarker Concentration (ng/mL)"

# No dictionary is supplied: the labels are read from the attributes set above
tbl_summary(
  data = labeled_trial,
  by = trt,
  include = c(age, marker)
) |>
  add_auto_labels()

## -----------------------------------------------------------------------------
tbl_summary(
  data = trial,
  by = trt,
  include = c(age, grade, marker),
  # A label given directly to tbl_summary() overrides the dictionary and
  # any label attribute for that variable
  label = list(age ~ "Age (Custom Label)")
) |>
  add_auto_labels(dictionary = dictionary) |>
  extras()

## -----------------------------------------------------------------------------
# Fit a linear model for marker level, tabulate the fit, and label the
# model terms from the dictionary
tbl_regression(
  x = lm(formula = marker ~ age + grade + stage, data = trial)
) |>
  add_auto_labels(dictionary = dictionary)

## -----------------------------------------------------------------------------
# Store the dictionary descriptions on the data as label attributes
trial_labeled <- apply_labels_from_dictionary(trial, dictionary = dictionary)

# Inspect two columns to confirm the labels were set
attr(trial_labeled[["age"]], "label")
attr(trial_labeled[["marker"]], "label")

## -----------------------------------------------------------------------------
# The label attributes already stored on `trial_labeled` are picked up by
# add_auto_labels() without any further arguments
tbl_summary(
  data = trial_labeled,
  by = trt,
  include = c(age, marker, grade)
) |>
  add_auto_labels() |>
  extras()

## ----fig.width=7, fig.height=4------------------------------------------------
# Scatter plot of marker against age, coloured by treatment; the axis and
# legend titles come from the label attributes rather than being set by hand
ggplot(
  data = trial_labeled,
  mapping = aes(x = age, y = marker, colour = trt)
) +
  geom_point(alpha = 0.6) +
  theme_minimal()

## -----------------------------------------------------------------------------
# Give `age` a label attribute ...
trial_both <- trial
attr(trial_both[["age"]], "label") <- "Age from Attribute"

# ... and a competing one-row dictionary entry for the same variable
dictionary_conflict <- tibble::tibble(
  Variable = "age",
  Description = "Age from Dictionary"
)

# Default precedence: the attribute wins, so the table shows
# "Age from Attribute"
tbl_summary(data = trial_both, by = trt, include = age) |>
  add_auto_labels(dictionary = dictionary_conflict) |>
  extras()

## -----------------------------------------------------------------------------
# Prioritize dictionary over attributes.
# options() invisibly returns the previous values of the options it changes,
# so keep them in order to put the session back exactly as it was.
old_opts <- options(sumExtras.preferDictionary = TRUE)

trial_both |>
  tbl_summary(by = trt, include = age) |>
  add_auto_labels(dictionary = dictionary_conflict) |>
  extras()
# Shows: "Age from Dictionary"

# Restore the previous setting (including "unset") for the rest of the
# vignette instead of hard-coding FALSE
options(old_opts)

## ----fig.width=7, fig.height=5------------------------------------------------
# 1. Define the dictionary once: variable names paired with display labels
my_dictionary <- tibble::tibble(
  Variable = c("age", "marker", "trt", "grade", "stage"),
  Description = c(
    "Age at Enrollment (years)",
    "Marker Level (ng/mL)",
    "Treatment Group",
    "Tumor Grade",
    "T Stage"
  )
)

# 2. Apply the dictionary to the data
trial_final <- apply_labels_from_dictionary(trial, my_dictionary)

# 3. Build the gtsummary table from the labelled data
tbl_summary(
  data = trial_final,
  by = trt,
  include = c(age, marker, grade, stage)
) |>
  add_auto_labels() |>
  extras()

# 4. Plot the same labelled data with ggplot2: marker against age with a
#    linear fit, one panel per treatment, rows with missing marker removed
ggplot(
  data = filter(trial_final, !is.na(marker)),
  mapping = aes(x = age, y = marker)
) +
  geom_point(mapping = aes(colour = grade), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  facet_wrap(~trt) +
  theme_minimal() +
  theme(legend.position = "bottom")

## ----fig.width=8, fig.height=6------------------------------------------------
# Step 1: Define the master dictionary
# (in a real project this would be read from a CSV file)
study_dictionary <- tibble::tibble(
  Variable = c(
    "trt", "age", "marker", "stage", "grade", "response", "death"
  ),
  Description = c(
    "Treatment Assignment",
    "Age at Baseline (years)",
    "Biomarker Level (ng/mL)",
    "Clinical Stage",
    "Tumor Grade",
    "Treatment Response",
    "Patient Died"
  )
)

# Step 2: Label the data once, up front
trial_study <- apply_labels_from_dictionary(trial, study_dictionary)

# Step 3: Build several tables that all share the same labels

# Table 1: overall summary (overall column on, p-values off)
tbl_summary(
  data = trial_study,
  include = c(age, marker, stage, grade)
) |>
  add_auto_labels() |>
  extras(overall = TRUE, pval = FALSE)

# Table 2: comparison by treatment
tbl_summary(
  data = trial_study,
  by = trt,
  include = c(age, marker, response)
) |>
  add_auto_labels() |>
  extras()

# Table 3: regression of marker level on age, grade and stage
tbl_regression(
  x = lm(formula = marker ~ age + grade + stage, data = trial_study)
) |>
  add_auto_labels()

# Step 4: Draw plots from the same labelled data

# Plot 1: age distribution per treatment (legend hidden)
ggplot(
  data = trial_study,
  mapping = aes(x = trt, y = age, fill = trt)
) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  theme(legend.position = "none")

# Plot 2: marker against age with a loess smoother per treatment,
# rows with missing marker removed
ggplot(
  data = filter(trial_study, !is.na(marker)),
  mapping = aes(x = age, y = marker, colour = trt)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  theme_minimal()

# Plot 3: Response rates by grade and treatment
# The rate is the share of non-missing responses equal to 1 within each
# grade-by-treatment cell. Summarising directly (rather than counting and
# then keeping only the response == 1 rows) keeps a cell with zero
# responders as a 0% bar instead of dropping it, and `.groups = "drop"`
# passes an ungrouped data frame on to ggplot.
trial_study |>
  filter(!is.na(response)) |>
  group_by(grade, trt) |>
  summarise(prop = mean(response == 1), .groups = "drop") |>
  ggplot(aes(x = grade, y = prop, fill = trt)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "Response Rate") +
  theme_minimal()

## -----------------------------------------------------------------------------
# Keep only T1/T2 patients and four columns of the labelled data
trial_subset <- filter(trial_labeled, stage %in% c("T1", "T2")) |>
  select(age, marker, stage, trt)

# The subset is summarised with the same auto-labelling step as before
tbl_summary(data = trial_subset, by = trt) |>
  add_auto_labels() |>
  extras()

## -----------------------------------------------------------------------------
# Labels persist through mutations
trial_labeled |>
  mutate(
    # cut() closes intervals on the right by default, so these breaks give
    # (0, 50], (50, 70] and (70, 100]; the labels spell out those bounds so
    # that age 50 is not shown in a "<50" group
    age_group = cut(age, breaks = c(0, 50, 70, 100),
                    labels = c("<=50", ">50-70", ">70"))
  ) |>
  select(age, age_group, marker, trt) |>
  # Only the original, labelled columns are tabulated
  tbl_summary(by = trt, include = c(age, marker)) |>
  add_auto_labels() |>
  extras()

## -----------------------------------------------------------------------------
# Dictionary covering the demographic variables
demographics_dict <- tibble::tibble(
  Variable = c("age", "sex"),
  Description = c("Age at Enrollment (years)", "Biological Sex")
)

# Dictionary covering the clinical variables
clinical_dict <- tibble::tibble(
  Variable = c("marker", "stage", "grade"),
  Description = c("Marker Level (ng/mL)", "T Stage", "Tumor Grade")
)

# Stack the two dictionaries into a single one
combined_dict <- bind_rows(demographics_dict, clinical_dict)

# Label an overall summary from the combined dictionary
tbl_summary(data = trial, include = c(age, marker, grade)) |>
  add_auto_labels(dictionary = combined_dict) |>
  extras()

## -----------------------------------------------------------------------------
# Print the structure of the age column; any attributes it carries,
# such as "label", are listed in the output
str(trial_labeled[["age"]])