---
title: "How has ENEM performance evolved by state?"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{How has ENEM performance evolved by state?}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Default knitr options for the chunks that follow in this vignette.
knitr::opts_chunk$set(
  # Merge each chunk's source and its output into a single block.
  collapse = TRUE,
  # Prefix printed output lines with "#>".
  comment  = "#>",
  # Show the code without running it; the figures below are embedded as
  # pre-rendered PNG files instead of being produced at build time.
  eval     = FALSE,
  # Keep messages and warnings out of the rendered document.
  message  = FALSE,
  warning  = FALSE
)
# NOTE(review): presumably loaded up front to avoid font/text-shaping issues
# when the vignette is rendered -- confirm these two packages are still needed.
suppressPackageStartupMessages(library(systemfonts))
suppressPackageStartupMessages(library(textshaping))
```

This vignette shows how to use educabR to analyze the evolution of ENEM
scores across Brazilian states over time.

```{r setup}
library(educabR)
library(dplyr)
library(tidyr)
library(ggplot2)
```

## Downloading ENEM data for multiple years

ENEM microdata files are large (1-3 GB each), so we use `n_max` to limit each
year to a 50,000-row sample. For a full analysis, drop the `n_max` argument.

```{r download}
# WARNING: every requested year triggers a ~600 MB download from INEP,
# so running this chunk may take several minutes.
years <- c(2019, 2021, 2022, 2023)

# Fetch one ENEM edition (capped with n_max) and tag its rows with the
# edition year so the editions can be told apart after stacking.
read_enem_sample <- function(y) {
  mutate(get_enem(year = y, n_max = 50000), year = y)
}

# One data frame per year, stacked into a single table.
enem <- bind_rows(purrr::map(years, read_enem_sample))
```

## Average math score by state and year

```{r math-by-state}
# Mean math score and row count per state and year. Rows missing either the
# math score or the state code are dropped before aggregating.
scores_by_state <-
  enem |>
  filter(!is.na(nu_nota_mt), !is.na(sg_uf_prova)) |>
  summarise(
    mean_math = mean(nu_nota_mt, na.rm = TRUE),
    n = n(),
    .by = c(sg_uf_prova, year)
  )

# Top 5 states in the most recent year.
# with_ties = FALSE caps the result at exactly five states: by default
# slice_max() keeps every row tied with the fifth value, which could put
# more than five lines on a chart titled "Top 5 States".
top_states <-
  scores_by_state |>
  filter(year == max(year)) |>
  slice_max(mean_math, n = 5, with_ties = FALSE) |>
  pull(sg_uf_prova)

# One line per top state; year is mapped as a factor so only the editions
# actually present in the data appear on the x axis.
scores_by_state |>
  filter(sg_uf_prova %in% top_states) |>
  ggplot(
    aes(
      x     = factor(year),
      y     = mean_math,
      color = sg_uf_prova,
      group = sg_uf_prova
    )
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "ENEM Math Score Evolution - Top 5 States",
    x     = "Year",
    y     = "Average Math Score",
    color = "State"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-math-top5.png)

## Score gap between regions

```{r region-gap}
# Lookup table: two-letter state code -> region name (27 entries).
region_map <- c(
  AC = "North",     AP = "North",     AM = "North",     PA = "North",
  RO = "North",     RR = "North",     TO = "North",
  AL = "Northeast", BA = "Northeast", CE = "Northeast",
  MA = "Northeast", PB = "Northeast", PE = "Northeast",
  PI = "Northeast", RN = "Northeast", SE = "Northeast",
  DF = "Midwest",   GO = "Midwest",   MT = "Midwest",   MS = "Midwest",
  ES = "Southeast", MG = "Southeast", RJ = "Southeast", SP = "Southeast",
  PR = "South",     RS = "South",     SC = "South"
)

# Mean math score per region and year, drawn as one line per region.
enem |>
  filter(!is.na(nu_nota_mt), !is.na(sg_uf_prova)) |>
  # Look regions up by the state code's *label*: as.character() guards
  # against a factor column (a factor index would select by integer code,
  # not by name), and unname() keeps the lookup names from being carried
  # into the new column.
  mutate(region = unname(region_map[as.character(sg_uf_prova)])) |>
  summarise(
    mean_math = mean(nu_nota_mt, na.rm = TRUE),
    .by = c(region, year)
  ) |>
  ggplot(
    aes(x = factor(year), y = mean_math, color = region, group = region)
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "ENEM Math Score by Region",
    x     = "Year",
    y     = "Average Math Score",
    color = "Region"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-math-by-region.png)

## All five scores compared

```{r all-scores}
# Yearly mean of each of the five score columns, one row per year.
# Each mean ignores missing values in its own column only.
national_means <- summarise(
  filter(enem, !is.na(sg_uf_prova)),
  math       = mean(nu_nota_mt, na.rm = TRUE),
  languages  = mean(nu_nota_lc, na.rm = TRUE),
  humanities = mean(nu_nota_ch, na.rm = TRUE),
  sciences   = mean(nu_nota_cn, na.rm = TRUE),
  essay      = mean(nu_nota_redacao, na.rm = TRUE),
  .by = year
)

# Reshape to one row per (year, subject) so subject can drive the colour.
national_long <- pivot_longer(
  national_means,
  cols      = -year,
  names_to  = "subject",
  values_to = "mean_score"
)

# One line per subject across the available years.
ggplot(
  national_long,
  aes(x = factor(year), y = mean_score, color = subject, group = subject)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "National Average ENEM Scores by Subject",
    x     = "Year",
    y     = "Average Score",
    color = "Subject"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-scores-by-subject.png)