--- title: "What does school infrastructure look like across Brazil?" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{What does school infrastructure look like across Brazil?} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE, message = FALSE, warning = FALSE ) suppressPackageStartupMessages(library(systemfonts)) suppressPackageStartupMessages(library(textshaping)) ``` This vignette shows how to use educabR to explore school infrastructure across Brazil using the School Census. We look at internet access, libraries, science labs, sports courts, and accessibility features -- and how they vary by region, administrative type, and location. ```{r setup} library(educabR) library(dplyr) library(tidyr) library(ggplot2) ``` ## Downloading School Census data The School Census contains one row per school (~217,000 schools in 2023) with over 400 variables covering infrastructure, staffing, and programs. ```{r download} # Download all schools for 2023 escolas <- get_censo_escolar(year = 2023) # Or filter by state for faster exploration escolas_sp <- get_censo_escolar(year = 2023, uf = "SP") ``` > **Note:** the full national file is about 30 MB compressed. When > filtering by state, all rows are read before filtering, so the first > call may take a moment. ## Overview: key infrastructure indicators The School Census uses binary columns (`1` = yes, `0` = no) for each infrastructure item. Let us compute the percentage of schools that have each resource nationwide. ```{r overview} indicators <- c( "in_internet", "in_banda_larga", "in_biblioteca", "in_laboratorio_informatica", "in_laboratorio_ciencias", "in_quadra_esportes", "in_agua_potavel", "in_esgoto_rede_publica" ) infra_summary <- escolas |> summarise(across(all_of(indicators), ~ mean(. == 1, na.rm = TRUE) * 100)) |> pivot_longer(everything(), names_to = "indicator", values_to = "pct") |> mutate( label = c( "Internet", "Broadband", "Library", "Computer lab", "Science lab", "Sports court", "Drinking water", "Public sewage" ) ) ggplot(infra_summary, aes(x = reorder(label, pct), y = pct)) + geom_col(fill = "steelblue") + coord_flip() + labs( title = "Percentage of Schools with Key Infrastructure (2023)", x = NULL, y = "% of schools" ) + theme_minimal() ``` ![](../man/figures/vignette-infra-overview.png) ## Infrastructure by administrative type Federal, state, municipal, and private schools have very different resource levels. The `tp_dependencia` column encodes the administrative type. ```{r by-admin} admin_labels <- c( "1" = "Federal", "2" = "State", "3" = "Municipal", "4" = "Private" ) infra_admin <- escolas |> mutate(admin = admin_labels[as.character(tp_dependencia)]) |> group_by(admin) |> summarise( Internet = mean(in_internet == 1, na.rm = TRUE) * 100, Library = mean(in_biblioteca == 1, na.rm = TRUE) * 100, `Computer lab` = mean(in_laboratorio_informatica == 1, na.rm = TRUE) * 100, `Science lab` = mean(in_laboratorio_ciencias == 1, na.rm = TRUE) * 100, .groups = "drop" ) |> pivot_longer(-admin, names_to = "resource", values_to = "pct") ggplot(infra_admin, aes(x = resource, y = pct, fill = admin)) + geom_col(position = "dodge") + labs( title = "School Infrastructure by Administrative Type (2023)", x = NULL, y = "% of schools", fill = "Type" ) + theme_minimal() + theme(axis.text.x = element_text(angle = 20, hjust = 1)) ``` ![](../man/figures/vignette-infra-by-admin.png) ## Regional inequality Northern and Northeastern states typically have fewer resources than the South and Southeast. Grouping by region reveals the gap. ```{r by-region} region_labels <- c( "Norte" = "North", "Nordeste" = "Northeast", "Sudeste" = "Southeast", "Sul" = "South", "Centro-Oeste" = "Central-West" ) infra_region <- escolas |> mutate(region = region_labels[no_regiao]) |> group_by(region) |> summarise( Internet = mean(in_internet == 1, na.rm = TRUE) * 100, Library = mean(in_biblioteca == 1, na.rm = TRUE) * 100, `Science lab` = mean(in_laboratorio_ciencias == 1, na.rm = TRUE) * 100, `Sports court` = mean(in_quadra_esportes == 1, na.rm = TRUE) * 100, .groups = "drop" ) |> pivot_longer(-region, names_to = "resource", values_to = "pct") ggplot(infra_region, aes(x = region, y = pct, fill = resource)) + geom_col(position = "dodge") + labs( title = "School Infrastructure by Region (2023)", x = NULL, y = "% of schools", fill = NULL ) + theme_minimal() ``` ![](../man/figures/vignette-infra-by-region.png) ## Urban vs rural schools The `tp_localizacao` column distinguishes urban (1) from rural (2) schools. The infrastructure gap between them is one of the starkest in Brazilian education. ```{r urban-rural} infra_location <- escolas |> mutate( location = ifelse(tp_localizacao == 1, "Urban", "Rural") ) |> group_by(location) |> summarise( Internet = mean(in_internet == 1, na.rm = TRUE) * 100, Broadband = mean(in_banda_larga == 1, na.rm = TRUE) * 100, Library = mean(in_biblioteca == 1, na.rm = TRUE) * 100, `Computer lab` = mean(in_laboratorio_informatica == 1, na.rm = TRUE) * 100, `Science lab` = mean(in_laboratorio_ciencias == 1, na.rm = TRUE) * 100, `Sports court` = mean(in_quadra_esportes == 1, na.rm = TRUE) * 100, .groups = "drop" ) |> pivot_longer(-location, names_to = "resource", values_to = "pct") ggplot(infra_location, aes(x = resource, y = pct, fill = location)) + geom_col(position = "dodge") + scale_fill_manual(values = c("Urban" = "steelblue", "Rural" = "coral")) + labs( title = "School Infrastructure: Urban vs Rural (2023)", x = NULL, y = "% of schools", fill = NULL ) + theme_minimal() + theme(axis.text.x = element_text(angle = 20, hjust = 1)) ``` ![](../man/figures/vignette-infra-urban-rural.png) ## Accessibility features The School Census tracks specific accessibility features. The `in_acessibilidade_inexistente` flag marks schools with no accessibility at all. ```{r accessibility} access_cols <- c( "in_acessibilidade_rampas", "in_acessibilidade_corrimao", "in_acessibilidade_elevador", "in_acessibilidade_pisos_tateis", "in_acessibilidade_sinal_sonoro", "in_acessibilidade_sinal_tatil", "in_acessibilidade_sinal_visual", "in_acessibilidade_inexistente" ) access_labels <- c( "Ramps", "Handrails", "Elevator", "Tactile floors", "Sound signals", "Tactile signals", "Visual signals", "None" ) access_summary <- escolas |> summarise(across(all_of(access_cols), ~ mean(. == 1, na.rm = TRUE) * 100)) |> pivot_longer(everything(), names_to = "feature", values_to = "pct") |> mutate(label = access_labels) ggplot(access_summary, aes(x = reorder(label, pct), y = pct)) + geom_col(fill = "#2a9d8f") + coord_flip() + labs( title = "School Accessibility Features (2023)", x = NULL, y = "% of schools" ) + theme_minimal() ``` ![](../man/figures/vignette-infra-accessibility.png) ## Internet access by state A per-state view highlights which states are lagging behind in digital connectivity. ```{r internet-by-state} internet_uf <- escolas |> group_by(sg_uf) |> summarise( pct_internet = mean(in_internet == 1, na.rm = TRUE) * 100, .groups = "drop" ) |> arrange(pct_internet) ggplot(internet_uf, aes(x = reorder(sg_uf, pct_internet), y = pct_internet)) + geom_col(fill = "steelblue") + coord_flip() + labs( title = "Schools with Internet Access by State (2023)", x = NULL, y = "% of schools" ) + theme_minimal() ``` ![](../man/figures/vignette-infra-internet-state.png)