---
title: "International Survey Compatibility"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{International Survey Compatibility}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk options applied to every chunk in this vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE
)

# Availability flags for the optional companion packages. Chunks that need one
# of these packages pass the matching flag to their `eval =` option, so the
# section is skipped when the package is not installed.
has_eph <- requireNamespace("eph", quietly = TRUE)
has_pnad <- requireNamespace("PNADcIBGE", quietly = TRUE)
has_ipumsr <- requireNamespace("ipumsr", quietly = TRUE)
# NOTE(review): no chunk visible in this vignette uses has_haven (the DHS chunk
# is `eval = FALSE`) -- confirm whether it is still needed.
has_haven <- requireNamespace("haven", quietly = TRUE)

# Handle strata with a single PSU (common in example/sample data)
options(survey.lonely.psu = "adjust")
```

## Introduction

metasurvey's `Survey` class and step pipeline (`step_compute`, `step_recode`,
`step_rename`, `step_remove`, `step_join`) are survey-agnostic: they work with
any tabular data. This vignette demonstrates compatibility with 7 household
surveys from 6 countries, using real bundled data where possible.

For each survey we show the full flow: load data, create a `Survey`, apply
steps, estimate with `workflow()`, and package the pipeline as a `Recipe`.

### Compatibility matrix

| Feature | ECH | EPH | CASEN | PNADc | CPS | ENIGH | DHS |
|---------|:---:|:---:|:-----:|:-----:|:---:|:-----:|:---:|
| `step_compute` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `step_recode` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `step_rename` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `step_remove` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `add_weight` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `strata` | N/A | N/A | ✓ | ✓ | N/A | ✓ | ✓ |
| `psu` | N/A | N/A | ✓ | ✓ | N/A | ✓ | ✓ |
| `add_replicate` | ✓ | N/A | N/A | ✓ | ✓ | N/A | N/A |
| `workflow` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| `Recipe` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |

### Installing companion packages

Most packages are on CRAN. The `casen` package is only available from GitHub.
Sections for unavailable packages are skipped automatically.
Install them manually to see all examples:

```r
# CRAN packages (haven is loaded by the DHS example to read the Stata file)
install.packages(c("eph", "PNADcIBGE", "ipumsr", "rdhs", "haven"))

# GitHub-only packages
# install.packages("remotes")
remotes::install_github("pachadotdev/casen")
```

## ECH -- Uruguay

The ECH (Encuesta Continua de Hogares) is the primary survey metasurvey was
built for. A bundled sample is included in the package.

```{r ech}
library(metasurvey)
library(data.table)

# Read the ECH sample file that ships inside the metasurvey package
dt_ech <- fread(
  system.file("extdata", "ech_2023_sample.csv", package = "metasurvey")
)

# Weight-only design: no PSU is declared, W_ANO is registered as the annual
# weight
svy_ech <- Survey$new(
  data = dt_ech,
  edition = "2023",
  type = "ech",
  psu = NULL,
  engine = "data.table",
  weight = add_weight(annual = "W_ANO")
)

# NOTE(review): only POBPCOAC codes 1:10 are mapped below; confirm against the
# ECH codebook that no further category (e.g. an 11th "other inactive" code)
# exists, otherwise those rows are left unlabelled.
svy_ech <- svy_ech |>
  step_recode(labor_status,
    POBPCOAC == 2 ~ "Employed",
    POBPCOAC %in% 3:5 ~ "Unemployed",
    POBPCOAC %in% c(6:10, 1) ~ "Inactive or under 14",
    comment = "ILO labor force status"
  ) |>
  step_compute(
    income_pc = HT11 / nper,
    comment = "Per capita household income"
  ) |>
  bake_steps()

# Weighted mean of HT11 using the annual weight
workflow(
  list(svy_ech),
  survey::svymean(~HT11, na.rm = TRUE),
  estimation_type = "annual"
)
```

For the complete ECH pipeline, see `vignette("ech-case-study")`.

## EPH -- Argentina

The EPH (Encuesta Permanente de Hogares) is Argentina's quarterly labor force
survey. The `eph` package
([CRAN](https://cran.r-project.org/package=eph),
[GitHub](https://github.com/ropensci/eph)) includes a bundled toybase.

```{r eph, eval = has_eph}
library(eph)

# Load the example individual-level dataset bundled with the eph package and
# convert it to a data.table for the "data.table" engine
data("toybase_individual_2016_04", package = "eph")
dt_eph <- data.table(toybase_individual_2016_04)

# Weight-only design: no PSU is declared, PONDERA is registered as the
# quarterly weight
svy_eph <- Survey$new(
  data = dt_eph,
  edition = "201604",
  type = "eph",
  psu = NULL,
  engine = "data.table",
  weight = add_weight(quarterly = "PONDERA")
)

# Two recodes (labor status, sex) plus a 0/1 employment flag; any code not
# listed in a recode falls back to NA via `.default`
svy_eph <- svy_eph |>
  step_recode(labor_status,
    ESTADO == 1 ~ "Employed",
    ESTADO == 2 ~ "Unemployed",
    ESTADO == 3 ~ "Inactive",
    .default = NA_character_,
    comment = "Labor force status (INDEC)"
  ) |>
  step_recode(sex,
    CH04 == 1 ~ "Male",
    CH04 == 2 ~ "Female",
    .default = NA_character_,
    comment = "Sex from CH04"
  ) |>
  step_compute(
    employed = ifelse(ESTADO == 1, 1L, 0L),
    comment = "Employment indicator"
  ) |>
  bake_steps()

# Employment rate: the weighted mean of the 0/1 `employed` flag
workflow(
  list(svy_eph),
  survey::svymean(~employed, na.rm = TRUE),
  estimation_type = "quarterly"
)
```

To download actual EPH microdata use
`eph::get_microdata(year = 2023, period = 3, type = "individual")`.

## CASEN -- Chile

The CASEN (Encuesta de Caracterizacion Socioeconomica Nacional) is Chile's main
socioeconomic survey. It uses stratified cluster sampling. The `casen` package
([GitHub](https://github.com/pachadotdev/casen)) includes a bundled sample from
the Los Rios region. Install with
`remotes::install_github("pachadotdev/casen")`.

```r
# Requires: remotes::install_github("pachadotdev/casen")
# casen is never attached in this vignette, so the package must be named
# explicitly for data() to find the bundled sample
data("casen_2017_los_rios", package = "casen")
dt_casen <- data.table(casen_2017_los_rios)

# Stratified cluster design: varunit is the PSU, varstrat the stratum and
# expc the annual expansion weight
svy_casen <- Survey$new(
  data = dt_casen,
  edition = "2017",
  type = "casen",
  psu = "varunit",
  strata = "varstrat",
  engine = "data.table",
  weight = add_weight(annual = "expc")
)

svy_casen <- svy_casen |>
  step_recode(sex,
    sexo == 1 ~ "Male",
    sexo == 2 ~ "Female",
    .default = NA_character_,
    comment = "Sex (MDS codebook)"
  ) |>
  step_recode(poverty_status,
    pobreza == 1 ~ "Extreme poverty",
    pobreza == 2 ~ "Non-extreme poverty",
    pobreza == 3 ~ "Not poor",
    .default = NA_character_,
    comment = "Poverty status"
  ) |>
  step_compute(
    # +1 keeps zero incomes finite under the log
    log_income = log(ytotcorh + 1),
    comment = "Log household income"
  ) |>
  bake_steps()

# Mean household income
workflow(
  list(svy_casen),
  survey::svymean(~ytotcorh, na.rm = TRUE),
  estimation_type = "annual"
)
```

For the full CASEN microdata, use `descargar_casen_github(2017, tempdir())`
from the casen package.

## PNADc -- Brazil

The PNADc (Pesquisa Nacional por Amostra de Domicilios Continua) is Brazil's
quarterly labor force survey with stratified cluster sampling. The `PNADcIBGE`
package ([CRAN](https://cran.r-project.org/package=PNADcIBGE)) includes bundled
example microdata.

```{r pnadc, eval = has_pnad}
library(PNADcIBGE)

# Read the example microdata and its input layout, both bundled with PNADcIBGE
dt_pnadc <- data.table(read_pnadc(
  microdata = system.file("extdata", "exampledata.txt", package = "PNADcIBGE"),
  input_txt = system.file("extdata", "input_example.txt", package = "PNADcIBGE")
))

# Stratified cluster design: UPA is the PSU, Estrato the stratum and V1028
# the quarterly weight
svy_pnadc <- Survey$new(
  data = dt_pnadc,
  edition = "202301",
  type = "pnadc",
  psu = "UPA",
  strata = "Estrato",
  engine = "data.table",
  weight = add_weight(quarterly = "V1028")
)

svy_pnadc <- svy_pnadc |>
  step_recode(sex,
    V2007 == 1 ~ "Male",
    V2007 == 2 ~ "Female",
    .default = NA_character_,
    comment = "Sex (V2007)"
  ) |>
  step_compute(
    age = as.integer(V2009),
    comment = "Age in years"
  ) |>
  bake_steps()

# Weighted mean age using the quarterly weight
workflow(
  list(svy_pnadc),
  survey::svymean(~age, na.rm = TRUE),
  estimation_type = "quarterly"
)
```

For real PNADc microdata: `PNADcIBGE::get_pnadc(year = 2023, quarter = 1)`.
The download is approximately 200 MB.

## CPS -- United States

The CPS (Current Population Survey) is the US monthly labor force survey.
The `ipumsr` package
([CRAN](https://cran.r-project.org/package=ipumsr),
[GitHub](https://github.com/ipums/ipumsr)) includes a bundled CPS extract.

```{r cps, eval = has_ipumsr}
library(ipumsr)

# The DDI XML describes the extract; read_ipums_micro() uses it to load the
# matching microdata file
ddi <- read_ipums_ddi(
  system.file("extdata", "cps_00160.xml", package = "ipumsr")
)
dt_cps <- data.table(read_ipums_micro(ddi, verbose = FALSE))

# Weight-only design: no PSU is declared, ASECWT is registered as the annual
# weight
svy_cps <- Survey$new(
  data = dt_cps,
  edition = "2011",
  type = "cps",
  psu = NULL,
  engine = "data.table",
  weight = add_weight(annual = "ASECWT")
)

svy_cps <- svy_cps |>
  step_recode(health_status,
    HEALTH == 1 ~ "Excellent",
    HEALTH == 2 ~ "Very good",
    HEALTH == 3 ~ "Good",
    HEALTH == 4 ~ "Fair",
    HEALTH == 5 ~ "Poor",
    .default = NA_character_,
    comment = "Self-reported health status"
  ) |>
  step_compute(
    log_income = log(INCTOT + 1),
    comment = "Log total income"
  ) |>
  bake_steps()

# Weighted mean of total income using the annual weight
workflow(
  list(svy_cps),
  survey::svymean(~INCTOT, na.rm = TRUE),
  estimation_type = "annual"
)
```

IPUMS data requires a free account at <https://cps.ipums.org>. The DDI XML file
provides variable metadata for labeling.

## ENIGH -- Mexico

The ENIGH (Encuesta Nacional de Ingresos y Gastos de los Hogares) is Mexico's
income and expenditure survey. There is no dedicated R package; data is
available from [INEGI](https://www.inegi.org.mx/programas/enigh/). Below is a
synthetic example that mirrors the real structure.

```{r enigh}
# Fixed seed so the synthetic data (and the estimate below) are reproducible
set.seed(42)

# 200 synthetic households: 40 PSUs of 5 households each, nested inside
# 10 strata of 20 households (i.e. 4 PSUs per stratum)
dt_enigh <- data.table(
  id = 1:200,
  upm = rep(1:40, each = 5),
  est_dis = rep(1:10, each = 20),
  factor = runif(200, 100, 500),
  sexo_jefe = sample(1:2, 200, replace = TRUE),
  edad_jefe = sample(18:80, 200, replace = TRUE),
  ing_cor = rlnorm(200, 10, 1),
  tam_hog = sample(1:8, 200, replace = TRUE)
)

# Stratified cluster design: upm is the PSU, est_dis the stratum and factor
# the annual expansion weight
svy_enigh <- Survey$new(
  data = dt_enigh,
  edition = "2022",
  type = "enigh",
  psu = "upm",
  strata = "est_dis",
  engine = "data.table",
  weight = add_weight(annual = "factor")
)

svy_enigh <- svy_enigh |>
  step_recode(sex_head,
    sexo_jefe == 1 ~ "Male",
    sexo_jefe == 2 ~ "Female",
    .default = NA_character_,
    comment = "Sex of household head"
  ) |>
  step_compute(
    income_pc = ing_cor / tam_hog,
    comment = "Per capita household income"
  ) |>
  bake_steps()

# Weighted mean of the computed per capita income
workflow(
  list(svy_enigh),
  survey::svymean(~income_pc, na.rm = TRUE),
  estimation_type = "annual"
)
```

## DHS -- International

The DHS (Demographic and Health Surveys) program covers 90+ countries.
The `rdhs` package
([CRAN](https://cran.r-project.org/package=rdhs),
[GitHub](https://github.com/ropensci/rdhs)) provides API access.
Model datasets (no authentication needed) are available at
<https://dhsprogram.com/data/model-datasets.cfm>.

```{r dhs, eval = FALSE}
library(haven)

# Download the model Individual Recode (no credentials needed)
tf <- tempfile(fileext = ".zip")
download.file(
  "https://dhsprogram.com/data/model_data/dhs/zzir62dt.zip",
  tf,
  mode = "wb", quiet = TRUE
)

# Extract into a dedicated directory and search only the files that came out of
# this archive, so unrelated .dta files already sitting in tempdir() can never
# be picked up by mistake. unzip() creates `exdir` if needed and returns the
# extracted paths.
td <- file.path(tempdir(), "dhs_model")
extracted <- unzip(tf, exdir = td)
dta_file <- grep("\\.DTA$", extracted, value = TRUE, ignore.case = TRUE)
# Fail early with a clear error if the archive did not contain a Stata file
stopifnot(length(dta_file) > 0)
dt_dhs <- data.table(read_dta(dta_file[1]))

# DHS weights must be divided by 1,000,000
dt_dhs[, wt := as.numeric(v005) / 1e6]

# Stratified cluster design: v001 is the PSU, v023 the stratum and the
# rescaled wt column the annual weight
svy_dhs <- Survey$new(
  data = dt_dhs,
  edition = "2020",
  type = "dhs",
  psu = "v001",
  strata = "v023",
  engine = "data.table",
  weight = add_weight(annual = "wt")
)

svy_dhs <- svy_dhs |>
  step_recode(education,
    v106 == 0 ~ "No education",
    v106 == 1 ~ "Primary",
    v106 == 2 ~ "Secondary",
    v106 == 3 ~ "Higher",
    .default = NA_character_,
    comment = "Education level (v106)"
  ) |>
  step_compute(
    children = as.numeric(v201),
    comment = "Children ever born"
  ) |>
  bake_steps()

# Weighted mean number of children ever born
workflow(
  list(svy_dhs),
  survey::svymean(~children, na.rm = TRUE),
  estimation_type = "annual"
)
```

DHS data requires registration at <https://dhsprogram.com/>. The `rdhs` package
handles API authentication. Weights (v005) must be divided by 1,000,000 before
use.

## Recipe portability

The same `Recipe` structure works regardless of the source survey:

```{r recipe-portability}
# Fixed seed so the synthetic demo data are reproducible
set.seed(42)

# 100 synthetic respondents with an age, an income and a weight column
dt_demo <- data.table(
  id = 1:100,
  age = sample(18:65, 100, replace = TRUE),
  income = round(runif(100, 1000, 5000), 2),
  w = round(runif(100, 0.5, 2), 4)
)

# Weight-only design: no PSU is declared, w is registered as the annual weight
svy_demo <- Survey$new(
  data = dt_demo,
  edition = "2023",
  type = "demo",
  psu = NULL,
  engine = "data.table",
  weight = add_weight(annual = "w")
)

# NOTE(review): bake_steps() is not called in this chunk; presumably the
# recorded steps alone are enough to build the recipe -- confirm.
svy_demo <- svy_demo |>
  step_compute(indicator = ifelse(age > 30, 1L, 0L)) |>
  step_recode(age_group,
    age < 30 ~ "Young",
    age >= 30 ~ "Adult",
    .default = NA_character_
  )

# Package the steps attached to svy_demo, together with descriptive metadata,
# into a Recipe
my_recipe <- steps_to_recipe(
  name = "Demo Indicators",
  user = "Research Team",
  svy = svy_demo,
  description = "Reusable demographic indicators",
  steps = get_steps(svy_demo),
  topic = "demographics"
)

# Print the input and output variable names reported by the recipe's doc()
doc <- my_recipe$doc()
cat("Inputs:", paste(doc$input_variables, collapse = ", "), "\n")
cat("Outputs:", paste(doc$output_variables, collapse = ", "), "\n")
```

Recipes capture *what* transformations to apply, not *which* survey they came
from. A recipe built for EPH can be adapted for PNADc by simply renaming
variables.

## Next steps

- [Getting Started](getting-started.html) -- Survey objects and steps
- [Survey Designs and Validation](complex-designs.html) -- Stratified and clustered designs, replicate weights
- [Rotating Panels](panel-analysis.html) -- `RotativePanelSurvey` and `PoolSurvey`
- [Recipes](recipes.html) -- Creating, saving, and sharing recipes
- [ECH Case Study](ech-case-study.html) -- Full Uruguay ECH pipeline