---
title: "How has ENEM performance evolved by state?"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{How has ENEM performance evolved by state?}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunks are shown but not evaluated (eval = FALSE); the figures below are
# pre-rendered images referenced from man/figures.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE,
  message = FALSE,
  warning = FALSE
)
suppressPackageStartupMessages(library(systemfonts))
suppressPackageStartupMessages(library(textshaping))
```

This vignette shows how to use educabR to analyze the evolution of ENEM
scores across Brazilian states over time.

```{r setup}
library(educabR)
library(dplyr)
library(tidyr)
library(ggplot2)
```

## Downloading ENEM data for multiple years

ENEM microdata files are large (1-3 GB each), so we use `n_max` to work with
samples. For a full analysis, remove the `n_max` parameter.

```{r download}
# WARNING: each year downloads ~600 MB from INEP. This may take several minutes.
years <- c(2019, 2021, 2022, 2023)

# Read each year, tag its rows with the exam year, then stack everything into
# a single data frame.
enem <- years |>
  purrr::map(\(y) get_enem(year = y, n_max = 50000) |> mutate(year = y)) |>
  bind_rows()
```

## Average math score by state and year

```{r math-by-state}
# Mean math score (and number of contributing rows) per state and year.
scores_by_state <- enem |>
  filter(!is.na(nu_nota_mt), !is.na(sg_uf_prova)) |>
  summarise(
    mean_math = mean(nu_nota_mt, na.rm = TRUE),
    n = n(),
    .by = c(sg_uf_prova, year)
  )

# Top 5 states in the most recent year.
# `with_ties = FALSE` guarantees exactly five states even when means tie,
# matching the plot title below.
top_states <- scores_by_state |>
  filter(year == max(year)) |>
  slice_max(mean_math, n = 5, with_ties = FALSE) |>
  pull(sg_uf_prova)

# `factor(year)` puts the years on a discrete axis, so they are evenly spaced.
scores_by_state |>
  filter(sg_uf_prova %in% top_states) |>
  ggplot(
    aes(
      x = factor(year),
      y = mean_math,
      color = sg_uf_prova,
      group = sg_uf_prova
    )
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "ENEM Math Score Evolution - Top 5 States",
    x = "Year",
    y = "Average Math Score",
    color = "State"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-math-top5.png)

## Score gap between regions

```{r region-gap}
# Lookup table: state abbreviation -> region name.
region_map <- c(
  AC = "North", AP = "North", AM = "North", PA = "North",
  RO = "North", RR = "North", TO = "North",
  AL = "Northeast", BA = "Northeast", CE = "Northeast",
  MA = "Northeast", PB = "Northeast", PE = "Northeast",
  PI = "Northeast", RN = "Northeast", SE = "Northeast",
  DF = "Midwest", GO = "Midwest", MT = "Midwest", MS = "Midwest",
  ES = "Southeast", MG = "Southeast", RJ = "Southeast", SP = "Southeast",
  PR = "South", RS = "South", SC = "South"
)

enem |>
  filter(!is.na(nu_nota_mt), !is.na(sg_uf_prova)) |>
  # Index by name, not position: `as.character()` protects against a factor
  # column being used as integer codes, and `unname()` keeps the state codes
  # from leaking into the new column as vector names.
  mutate(region = unname(region_map[as.character(sg_uf_prova)])) |>
  summarise(
    mean_math = mean(nu_nota_mt, na.rm = TRUE),
    .by = c(region, year)
  ) |>
  ggplot(
    aes(x = factor(year), y = mean_math, color = region, group = region)
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "ENEM Math Score by Region",
    x = "Year",
    y = "Average Math Score",
    color = "Region"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-math-by-region.png)

## All five scores compared

```{r all-scores}
# One mean per score column and year, then reshape to long format so each
# subject becomes its own line in the plot.
enem |>
  filter(!is.na(sg_uf_prova)) |>
  summarise(
    math = mean(nu_nota_mt, na.rm = TRUE),
    languages = mean(nu_nota_lc, na.rm = TRUE),
    humanities = mean(nu_nota_ch, na.rm = TRUE),
    sciences = mean(nu_nota_cn, na.rm = TRUE),
    essay = mean(nu_nota_redacao, na.rm = TRUE),
    .by = year
  ) |>
  pivot_longer(-year, names_to = "subject", values_to = "mean_score") |>
  ggplot(
    aes(x = factor(year), y = mean_score, color = subject, group = subject)
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "National Average ENEM Scores by Subject",
    x = "Year",
    y = "Average Score",
    color = "Subject"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-enem-scores-by-subject.png)