## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup, message=FALSE, warning=FALSE--------------------------------------
library(pep725)

# Download the synthetic dataset
pep <- pep_download()

# For this vignette, we'll focus on two well-represented species:
# 1. Apple (Malus domestica) - excellent coverage across Europe
# 2. Grapevine (Vitis vinifera) - longest historical records

apple <- pep[species == "Malus domestica"]
vine <- pep[species == "Vitis vinifera"]

# Verify subsets contain data
stopifnot(nrow(apple) > 0, nrow(vine) > 0)

# Quick overview of our data
cat("=== Apple (Malus domestica) ===\n")
cat("Observations:", nrow(apple), "\n")
cat("Year range:", min(apple$year), "-", max(apple$year), "\n")
cat("Stations:", length(unique(apple$s_id)), "\n\n")

cat("=== Grapevine (Vitis vinifera) ===\n")
cat("Observations:", nrow(vine), "\n")
cat("Year range:", min(vine$year), "-", max(vine$year), "\n")
cat("Stations:", length(unique(vine$s_id)), "\n")

## ----normals-basic------------------------------------------------------------
# Calculate normals for apple across countries and phases
# Using 1990-2015 for illustration (ensures enough data in synthetic dataset)
normals <- pheno_normals(
  pep = apple,
  period = 1990:2015,
  by = c("country", "phase_id"),
  min_years = 10
)

print(normals)

## ----normals-summary----------------------------------------------------------
# Get a summary of the normals
summary(normals)

## ----normals-plot, fig.width=7, fig.height=5----------------------------------
plot(normals)

## ----normals-filtered---------------------------------------------------------
# Normals for apple first flowering (BBCH 60) only
flowering_normals <- pheno_normals(
  pep = pep,                          # Can use full dataset
  period = 1990:2015,
  species = "Malus domestica",         # Filter by species
  phase_id = 60,                      # Filter by phase (first flowers)
  by = "country",                     # Group only by country
  min_years = 5                       # Lower threshold for more countries
)

print(flowering_normals)

## ----normals-compare, warning=FALSE, message=FALSE----------------------------
# Calculate normals for two periods
# Period 1: Historical (1961-1990)
period1 <- pheno_normals(
  pep,
  period = 1961:1990,
  species = "Malus",
  phase_id = 60,
  by = "country",
  min_years = 10
)

# Period 2: Recent (1991-2020)
period2 <- pheno_normals(
  pep,
  period = 1991:2020,
  species = "Malus",
  phase_id = 60,
  by = "country",
  min_years = 10
)

# Compare the two periods by merging on country
# Negative shift = flowering got earlier
# Positive shift = flowering got later
if (nrow(period1) > 0 && nrow(period2) > 0) {
  comparison <- merge(
    period1[, .(country, mean_doy_early = mean_doy)],
    period2[, .(country, mean_doy_recent = mean_doy)],
    by = "country"
  )
  comparison[, shift := mean_doy_recent - mean_doy_early]

  cat("Countries compared:", nrow(comparison), "\n")
  cat("Mean shift:", round(mean(comparison$shift, na.rm = TRUE), 1), "days\n")
  cat("Range of shifts:", round(min(comparison$shift, na.rm = TRUE), 1), "to",
      round(max(comparison$shift, na.rm = TRUE), 1), "days\n")

  if (mean(comparison$shift, na.rm = TRUE) < 0) {
    cat("\nInterpretation: On average, flowering has shifted EARLIER\n")
  } else {
    cat("\nInterpretation: On average, flowering has shifted LATER\n")
  }
}

## ----normals-vine, warning=FALSE, message=FALSE-------------------------------
# Calculate normals for grapevine flowering
vine_normals <- pheno_normals(
  pep = vine,
  period = 1990:2015,
  phase_id = 65,  # Full flowering
  by = "country",
  min_years = 5
)

# Compare with apple for countries that have both
if (nrow(vine_normals) > 0) {
  cat("Grapevine flowering normals (BBCH 65):\n")
  print(vine_normals[!is.na(mean_doy), .(country, n_years, mean_doy, sd_doy)])
}

## ----anomalies-basic----------------------------------------------------------
# Calculate anomalies for apple
# Using 1990-2010 as baseline period
anomalies <- pheno_anomaly(
  pep = apple,
  baseline_period = 1990:2010,
  by = c("country", "phase_id"),
  min_years = 5
)

print(anomalies)

## ----anomalies-extreme--------------------------------------------------------
# Find extreme years (is_extreme == TRUE)
extreme <- anomalies[is_extreme == TRUE]

if (nrow(extreme) > 0) {
  cat("Total extreme years found:", nrow(extreme), "\n\n")

  # Show the most extreme early years
  cat("Most extreme EARLY years:\n")
  early <- extreme[direction == "early"][order(anomaly_days)]
  if (nrow(early) > 0) {
    print(early[1:min(5, nrow(early)),
          .(country, phase_id, year, anomaly_days, z_score)])
  }

  # Show the most extreme late years
  cat("\nMost extreme LATE years:\n")
  late <- extreme[direction == "late"][order(-anomaly_days)]
  if (nrow(late) > 0) {
    print(late[1:min(5, nrow(late)),
          .(country, phase_id, year, anomaly_days, z_score)])
  }
} else {
  cat("No extreme years detected in this dataset\n")
}

## ----anomalies-summary--------------------------------------------------------
summary(anomalies)

## ----anomalies-plot, fig.width=7, fig.height=5--------------------------------
plot(anomalies)

## ----quality-basic------------------------------------------------------------
# Assess quality at the station level
quality <- pep_quality(
  pep = apple,
  by = c("s_id", "phase_id")
)

print(quality)

## ----quality-summary----------------------------------------------------------
# Summary of quality across all stations
summary(quality)

## ----quality-plot, fig.height=5, fig.width=11---------------------------------
# Overview plot: grade distribution + station map (requires pep for coordinates)
plot(quality, pep = apple)

## ----quality-plot-grades, fig.height=4----------------------------------------
# Just the grade distribution (no pep data needed)
plot(quality, which = "grades")

## ----quality-plot-map, fig.height=5-------------------------------------------
# Just the map of station quality
plot(quality, which = "map", pep = apple)

## ----quality-filter-----------------------------------------------------------
# Get high-quality stations
high_quality <- quality[quality_grade %in% c("A", "B")]

cat("Quality distribution:\n")
print(table(quality$quality_grade))

cat("\nHigh quality (A or B):", nrow(high_quality), "of", nrow(quality),
    "(", round(100 * nrow(high_quality) / nrow(quality), 1), "%)\n")

# Filter original data to these stations
if (nrow(high_quality) > 0) {
  good_stations <- unique(high_quality$s_id)
  apple_hq <- apple[s_id %in% good_stations]

  cat("\nOriginal apple observations:", nrow(apple), "\n")
  cat("After quality filter:", nrow(apple_hq),
      "(", round(100 * nrow(apple_hq) / nrow(apple), 1), "%)\n")
}

## ----quality-country----------------------------------------------------------
# Country-level quality (coarser assessment)
country_quality <- pep_quality(
  pep = pep,
  by = c("country", "species", "phase_id")
)

print(country_quality)

## ----workflow-----------------------------------------------------------------
cat("=== STEP 1: Data Quality Assessment ===\n\n")

# Assess quality
quality <- pep_quality(apple, by = c("s_id", "phase_id"))
cat("Quality grades:\n")
print(table(quality$quality_grade))

cat("\n=== STEP 2: Filter to Good Quality Data ===\n\n")

# Keep grades A, B, and C (exclude only grade D)
good_stations <- quality[quality_grade %in% c("A", "B", "C"), s_id]
apple_clean <- apple[s_id %in% good_stations]

cat("Original observations:", nrow(apple), "\n")
cat("After quality filter:", nrow(apple_clean), "\n")
cat("Retained:", round(100 * nrow(apple_clean) / nrow(apple), 1), "%\n")

stopifnot(nrow(apple_clean) > 0)

cat("\n=== STEP 3: Calculate Baseline Normals ===\n\n")

normals <- pheno_normals(
  apple_clean,
  period = 1990:2010,
  by = c("country", "phase_id"),
  min_years = 5
)

cat("Normals calculated for", sum(!is.na(normals$mean_doy)), "groups\n")
cat("Mean DOY range:", round(min(normals$mean_doy, na.rm = TRUE), 0),
    "to", round(max(normals$mean_doy, na.rm = TRUE), 0), "\n")

cat("\n=== STEP 4: Calculate Anomalies ===\n\n")

anomalies <- pheno_anomaly(
  apple_clean,
  baseline_period = 1990:2010,
  by = c("country", "phase_id"),
  min_years = 5
)

# Convert to plain data.table so grouped aggregation works correctly
# (the pheno_anomaly S3 class can interfere with [.data.table dispatch)
anomalies_dt <- as.data.table(anomalies)

cat("Anomalies calculated for", sum(!is.na(anomalies_dt$anomaly_days)), "year-groups\n")
cat("Extreme years:", sum(anomalies_dt$is_extreme, na.rm = TRUE), "\n")

cat("\n=== STEP 5: Analyze Trends by Decade ===\n\n")

decade_summary <- anomalies_dt[
  !is.na(anomaly_days),
  .(
    mean_anomaly = round(mean(anomaly_days, na.rm = TRUE), 1),
    sd_anomaly = round(sd(anomaly_days, na.rm = TRUE), 1),
    n_obs = .N,
    pct_extreme = round(100 * sum(is_extreme, na.rm = TRUE) / .N, 1)
  ),
  by = .(decade = floor(year / 10) * 10)
][order(decade)]

print(decade_summary)

cat("\nInterpretation:\n")
first_decade <- decade_summary$mean_anomaly[1]
last_decade <- decade_summary$mean_anomaly[nrow(decade_summary)]
change <- last_decade - first_decade

if (change < -3) {
  cat("- Strong shift toward EARLIER phenology\n")
} else if (change < 0) {
  cat("- Moderate shift toward earlier phenology\n")
} else if (change > 3) {
  cat("- Strong shift toward LATER phenology\n")
} else {
  cat("- No clear directional change\n")
}

## ----timeseries-basic, fig.height=5-------------------------------------------
# Prepare aggregated data for visualization
apple_annual <- apple[, .(
  day = mean(day, na.rm = TRUE),
  n = .N
), by = .(year, species, phase_id)]

# Add human-readable phase labels
apple_annual[, phase := factor(phase_id,
  levels = c(60, 65, 100),
  labels = c("First flowers", "Full flowering", "Harvest"))]

# Remove rows where phase mapping failed
apple_annual <- apple_annual[!is.na(phase)]

# Plot time series with trend lines
if (nrow(apple_annual) > 0) {
  pheno_plot_timeseries(
    apple_annual,
    color_by = "phase",
    smooth = TRUE,
    title = "Apple Phenology Trends"
  )
}

## ----trends-turning, fig.height=5---------------------------------------------
# Get apple flowering data
apple_flowering <- pep[species == "Malus domestica" & phase_id == 60]

# Detect turning points
turning <- pheno_trend_turning(apple_flowering, min_years = 10)
print(turning)

# Visualize the analysis
plot(turning)

## ----kendall-simple-----------------------------------------------------------
# Aggregate to annual means
annual_doy <- apple_flowering[, .(
  mean_doy = mean(day, na.rm = TRUE)
), by = year][order(year)]

# Calculate Mann-Kendall test statistic
# Note: kendall_tau() returns the standardized z-statistic, not Kendall's
# tau correlation. Values beyond ±1.96 indicate significant trends.
tau <- kendall_tau(annual_doy$mean_doy)

cat("Mann-Kendall z-statistic:", round(tau, 3), "\n")
cat("\nInterpretation:\n")
if (tau < -2.58) {
  cat("- Highly significant earlier trend (p < 0.01)\n")
} else if (tau < -1.96) {
  cat("- Significant earlier trend (p < 0.05)\n")
} else if (tau > 2.58) {
  cat("- Highly significant later trend (p < 0.01)\n")
} else if (tau > 1.96) {
  cat("- Significant later trend (p < 0.05)\n")
} else {
  cat("- No statistically significant trend\n")
}

## ----regional-climate, eval=FALSE---------------------------------------------
# # Compile regional data and link to temperature anomalies
# regional_data <- pheno_regional(
#   pep = pep,
#   species_name = "Malus domestica",
#   year_min = 1961
# )
# 
# # Visualize phenology alongside temperature anomalies
# pheno_plot(regional_data)

## ----eval = FALSE-------------------------------------------------------------
# vignette("spatial-patterns", package = "pep725")

## ----eval = FALSE-------------------------------------------------------------
# vignette("data-quality", package = "pep725")

## ----session------------------------------------------------------------------
sessionInfo()