--- title: "canpumf Pipeline Architecture" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{canpumf Pipeline Architecture} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) ``` This document describes how `get_pumf()` turns a Statistics Canada PUMF zip file into a lazy DuckDB-backed `dplyr::tbl()`, covering every choice and fallback along the way. The LFS has its own accumulating pipeline, described separately at the end. --- ## High-level flow ```{r pipeline-diagram, echo=FALSE, out.width="100%", fig.alt="canpumf pipeline: get_pumf dispatches LFS vs. the three-stage pipeline (locate/download, parse metadata, build DuckDB), then registers provenance and returns a lazy tbl."} # The diagram is rendered to a static image (SVG for HTML, PNG for PDF) via # Graphviz so it renders identically and reliably in every output format, # without relying on JavaScript htmlwidgets (which never render in PDF). dot <- ' digraph pipeline { graph [rankdir = TB, fontname = "Helvetica", nodesep = 0.30, ranksep = 0.40, compound = true]; node [fontname = "Helvetica", fontsize = 10, style = "filled", fillcolor = "#eef3f8", color = "#5b7da3", margin = "0.09,0.05"]; edge [fontname = "Helvetica", fontsize = 9, color = "#666666", arrowsize = 0.7]; A [label = "get_pumf(series, version, lang)", shape = box, style = "filled,rounded", fillcolor = "#d9ead3"]; LFS [label = "series == LFS?", shape = diamond, fillcolor = "#fce8b2"]; LFSP [label = "lfs_get_pumf()", shape = box, style = "filled,rounded", fillcolor = "#d9ead3"]; A -> LFS; LFS -> LFSP [label = "yes"]; LFS -> CHK [label = "no", lhead = cluster_s1]; subgraph cluster_s1 { label = "Stage 1 — locate / download"; labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11; style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc"; CHK [label = "version dir exists?", shape = diamond, fillcolor = "#fce8b2"]; COL [label = "look up collection URL", shape = box]; EFT [label = "EFT-only?", shape = diamond, fillcolor = "#fce8b2"]; ERR [label = "stop: deposit zip manually", shape = box, fillcolor = "#f4cccc"]; DL [label = "download zip", shape = box]; UZ [label = "robust_unzip()", shape = box]; EXTR [label = "zip already extracted?", shape = diamond, fillcolor = "#fce8b2"]; CHK -> EXTR [label = "yes, not refresh"]; CHK -> COL [label = "no / refresh"]; COL -> EFT; EFT -> ERR [label = "yes"]; EFT -> DL [label = "no"]; DL -> UZ; UZ -> EXTR; EXTR -> UZ [label = "no"]; } subgraph cluster_s2 { label = "Stage 2 — parse metadata"; labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11; style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc"; MC [label = "metadata already exists?", shape = diamond, fillcolor = "#fce8b2"]; DF [label = "detect_formats()", shape = box]; P1 [label = "LFS codebook.csv", shape = box]; P2 [label = "CPSS variables.csv", shape = box]; P3 [label = "SAS cards (.lay + .lbe)", shape = box]; P4 [label = "SPSS split (vare/vale/_i)", shape = box]; P5 [label = "SPSS mono (.sps / SPSS.txt / .xmf)", shape = box]; P6 [label = "SPSS .sav", shape = box]; P7 [label = "PDF Dictionary", shape = box]; P8 [label = "PDF frequency codebook", shape = box]; MRG [label = "merge_metadata()", shape = box]; WR [label = "write variables.csv / codes.csv / layout.csv", shape = box]; MC -> DF [label = "no / refresh"]; DF -> P1; DF -> P2; DF -> P3; DF -> P4; DF -> P5; DF -> P6; DF -> P7; DF -> P8; P1 -> MRG; P2 -> MRG; P3 -> MRG; P4 -> MRG; P5 -> MRG; P6 -> MRG; P7 -> MRG; P8 -> MRG; MRG -> WR; } EXTR -> MC [label = "yes", lhead = cluster_s2]; subgraph cluster_s3 { label = "Stage 3 — build DuckDB"; labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11; style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc"; TB [label = "table already in DuckDB?", shape = diamond, fillcolor = "#fce8b2"]; FF [label = "find data file", shape = box]; FWF [label = "layout.csv exists\nand file not .csv?", shape = diamond, fillcolor = "#fce8b2"]; RFW [label = "read_fwf", shape = box]; RCS [label = "read_csv", shape = box]; JNK [label = "drop trailing junk rows", shape = box]; FX [label = "apply data fixups\n(str_pad, rename, cols_swap, force_*)", shape = box]; BSW [label = "BSW mask in registry?", shape = diamond, fillcolor = "#fce8b2"]; RBW [label = "join bootstrap weights", shape = box]; NC [label = "numeric conversion\n(missing ranges + na_values)", shape = box]; CL [label = "code labels to factors", shape = box]; WD [label = "write DuckDB table", shape = box]; EN [label = "enforce ENUM / force_* types", shape = box]; OD [label = "open read-only connection", shape = box]; TB -> FF [label = "no / refresh"]; FF -> FWF; FWF -> RFW [label = "yes (FWF)"]; FWF -> RCS [label = "no (CSV)"]; RFW -> JNK; RCS -> JNK; JNK -> FX; FX -> BSW; BSW -> RBW [label = "yes"]; BSW -> NC [label = "no"]; RBW -> NC; NC -> CL; CL -> WD; WD -> EN; EN -> OD; } WR -> TB [lhead = cluster_s3]; MC -> TB [label = "yes, not refresh"]; TB -> OD [label = "yes, not refresh"]; REG [label = "register provenance (series, version, lang)", shape = box, style = "filled,rounded", fillcolor = "#d9ead3"]; TBL [label = "return lazy dplyr::tbl()", shape = box, style = "filled,rounded", fillcolor = "#d9ead3"]; OD -> REG; REG -> TBL; } ' have_render <- requireNamespace("DiagrammeR", quietly = TRUE) && requireNamespace("DiagrammeRsvg", quietly = TRUE) if (have_render) { svg <- DiagrammeRsvg::export_svg(DiagrammeR::grViz(dot)) if (knitr::is_latex_output()) { if (requireNamespace("rsvg", quietly = TRUE)) { png <- knitr::fig_path(".png") dir.create(dirname(png), recursive = TRUE, showWarnings = FALSE) rsvg::rsvg_png(charToRaw(svg), png, width = 1800) knitr::include_graphics(png) } else { message("Install rsvg to render the pipeline diagram in PDF output.") } } else { svg_file <- knitr::fig_path(".svg") dir.create(dirname(svg_file), recursive = TRUE, showWarnings = FALSE) writeLines(svg, svg_file) knitr::include_graphics(svg_file) } } else { cat("Install DiagrammeR and DiagrammeRsvg to render this diagram.") } ``` --- ## Stage 1 — Locate or download `pumf_locate_or_download()` ensures the version directory exists with extracted content before any parsing begins. **Cache layout:** ``` / / / .zip # retained after extraction / metadata/ # written by Stage 2 _.duckdb ``` **Decision sequence:** 1. **`refresh = TRUE`** — delete the `.duckdb` file(s) and `metadata/` subdirectory, leaving the zip and extracted content untouched. Stages 2 and 3 then re-run without re-downloading. 2. **`redownload = TRUE`** — wipe the *entire* version directory first, then proceed as a first-time run. Implies `refresh`. 3. **Already extracted?** — `version_is_extracted()` returns `TRUE` if any subdirectory (other than `metadata/`) or non-zip non-duckdb file is present. If `TRUE`, the zip step is skipped even when the zip is still on disk. 4. **Download** — the URL is looked up in `list_canpumf_collection()`. Surveys distributed only via Statistics Canada's EFT portal have the marker `"(EFT)"` instead of a URL; the function stops with instructions to deposit the zip manually. 5. **Extract** — `robust_unzip()` handles two edge cases: - *Naming collision*: some zips contain a single top-level directory with the same name as the archive (e.g. `2025-CSV.zip/`). The colliding directory is renamed to strip `.zip` before being moved into the version directory. - *Encoding*: older StatCan zips store filenames without the UTF-8 flag (General Purpose Bit Flag bit 11). `grep`/`sub` calls on zip entry names use `useBytes = TRUE` to avoid "invalid in this locale" warnings. ### Version resolution `pumf_resolve_version()` canonicalises Census version strings before any registry lookup. Any string starting with a four-digit year is parsed flexibly: the file type is detected by grepping for `"hierarchical"`, `"household"`, or `"famil"` (defaulting to `"individuals"`), and CMA vs provincial by grepping for `"cma"`. The registry is then probed to determine the correct canonical format for that year. Examples: | User input | Resolved | |---|---| | `"2021"` | `"2021 (individuals)"` | | `"1971"` | `"1971/individuals_prov"` | | `"1971 CMA"` | `"1971/individuals_cma"` | | `"1971 households CMA"` | `"1971/households_cma"` | | `"1986 families"` | `"1986/families"` | | `"2001 households"` | `"2001 (households)"` | --- ## Stage 2 — Parse metadata `pumf_parse_metadata()` converts raw SPSS/SAS command files into three canonical CSVs. The function is idempotent: it does nothing if `metadata/variables.csv` already exists and `refresh = FALSE`. ### Format detection `detect_formats()` scans the entire version directory recursively and identifies which parser(s) apply. **Multiple parsers can fire for the same survey** (e.g. SPSS split for layout/codes and SAS cards for BSW weights). | Priority | Format | Detection rule | |---|---|---| | 1 | **LFS codebook CSV** | filename matches `codebook\.csv` (case-insensitive) | | 2 | **CPSS variables CSV** | filename is exactly `variables.csv` | | 3 | **SAS reading cards** | directory contains both a `.lay` and a `.lbe` file | | 4 | **SPSS split-file** | any `.sps` file whose name ends in `vare`, `vale`, or `_i` | | 5 | **SPSS monolithic** | `.sps` file, `*SPSS.txt` file, or `.xmf` file whose content contains `VALUE LABELS` **or `DATA LIST`** (checked with `useBytes = TRUE` to tolerate CP850/Latin-1 data); `VARIABLE LABELS` is optional | | 6 | **SPSS `.sav`** | a `.sav` binary file readable by haven | | 7 | **PDF Data Dictionary** | `*Dictionary.pdf` present and `pdftools` installed; supplements label-only surveys where the SPSS file has `DATA LIST` but no `VARIABLE LABELS` or `VALUE LABELS` | | 8 | **PDF frequency codebook** | a bilingual StatCan frequency codebook PDF (per-variable `Variable Name:` / `Answer Categories` blocks) under a `Codebook`/`LivreDesCodes` path, content-verified; `pdftools` installed. A **last-resort** fallback consulted only when no command file or codebook CSV was found — recovers labels for surveys whose only machine-readable companion is the data file (e.g. CPSS cycle 1) | Detection for case 5 also searches for a parallel French file — any candidate in the same set whose path includes `/fran` or `/french` (case-insensitive). ### Parsers #### SPSS monolithic (`parse_spss_mono`) Handles the single-file SPSS format used by Census (2001–2021), SFS 1999, SHS, and others. The file typically contains `DATA LIST`, `VARIABLE LABELS`, `VALUE LABELS`, and sometimes `MISSING VALUES` and `FORMATS` sections. `VARIABLE LABELS` is optional (e.g. Census 2011 individuals omits it). Older releases like SFS 1999 have only `DATA LIST` with no label sections at all — these produce a fully importable table with raw codes but no human-readable factor levels. Key parsing details: - **Column ranges** — `DATA LIST` ranges may have spaces on either side of the dash (`129-135`, `129 - 135`, or `129- 135`). All three are normalised by the regex `(\\d+)\\s*-\\s*(\\d+)` before tokenisation. - **Record-group marker** — a leading `/` on the first variable line (e.g. `/PROVP 1-2`) is stripped, not discarded, so the variable is retained. - **Section terminator** — the `DATA LIST` section ends at the first blank line, `.` line, or occurrence of `VARIABLE LABELS`, `VALUE LABELS`, `MISSING VALUES`, `FORMATS`, or `EXECUTE` at the start of a line. The keyword check is the reliable terminator for older files (e.g. 1991 XMF) that have no blank line between `DATA LIST` and `VARIABLE LABELS`. - **DATA LIST type annotations** — the `(A)` suffix after a column range marks a character-type variable. The parser records an `is_char` flag per column and uses it to populate `variables.csv` types when no `VARIABLE LABELS` section is present. - **Sentinel detection** — variables whose only VALUE LABELS are sentinel phrases ("Not applicable", "Valid skip", "Don't know", "Data not available", etc.) are classified as `numeric` with a `missing_low/missing_high` range, not as `character`. This prevents spurious NA warnings when numeric values fall outside the label set. - **Zero-padded codes** — unquoted SPSS numeric codes like `01`, `02` are normalised via `as.numeric()` → `as.character()` so they match bare integer values in CSV data. - **Multi-variable VALUE LABELS blocks** — `/VAR1 VAR2 VAR3` headers (possibly spanning continuation lines) are fully parsed so all listed variables receive the code/label pairs. #### SPSS split-file (`parse_spss_split`) Used by SFS, CPSS, and similar surveys that ship separate files for variable labels (`*vare.sps`), value labels (`*vale.sps`), missing values (`*miss.sps`), and layout (`*_i.sps`). The `layout_mask` from the registry disambiguates when a single directory holds multiple sets (e.g. individual vs. household files). #### SAS reading cards (`parse_sas_cards`) `.lay` files supply the fixed-width column positions; `.lbe` files supply the value labels in `PROC FORMAT` syntax. Variable labels come from a companion `.sas` file if present. This parser reuses `parse_spss_split`'s layout parser since the `.lay` format is identical. #### LFS codebook CSV (`parse_lfs_codebook`) The LFS ships a single `*codebook.csv` with one row per code value. Columns are always read as CP1252 regardless of the `metadata_encoding` registry field. #### CPSS variables CSV (`parse_cpss_csv`) The Canadian Perspectives Survey Series ships a `variables.csv` with variable metadata only; no layout or codes. The encoding defaults to Latin-1 (CP1252 if the registry overrides). #### SPSS `.sav` (`parse_spss_sav`) Haven is used for binary `.sav` files when no text-format command file is available. This is a fallback for surveys that do not ship SPSS syntax. #### PDF Data Dictionary (`parse_pdf_dictionary`) StatCan PDF Data Dictionaries follow a standard bilingual format. Variable blocks start with ` Position: N Character/Numeric(w)`. The parser extracts variable long-names (`Long name:` / `Long nom:`) and code-value labels (`Codes:` / `Domaine:`). Reserved codes (`Reserved Codes:` / `Codes Réservés:`) set `missing_low/missing_high` ranges. This parser produces only `variables` and `codes` (no `layout`), and fires only when `pdftools` is installed and a matching `*Dictionary.pdf` is found. It is used as a label-only supplement for surveys like SFS 1999 where the SPSS file is `DATA LIST`-only. #### PDF frequency codebook (`parse_pdf_codebook`) A second, distinct StatCan PDF layout, used when a survey ships *no* machine-readable command file or codebook CSV — only a bilingual frequency codebook PDF. Variable blocks start with `Variable Name:` / `Nom de la variable :` and carry the label on the `Concept:` line; an `Answer Categories` / `Catégories de réponse` frequency table supplies the value labels (parsed from a right-anchored code-row regex that tolerates comma- and space-grouped counts and rejoins wrapped answer text). Produces only `variables` and `codes`. Like the dictionary parser it requires `pdftools`, but detection is a **fallback of last resort** — only consulted when no command file or codebook CSV was found, and only for PDFs under a `Codebook`/`LivreDesCodes` path that content-verify for the `Variable Name:` + `Answer Categories` signature. This is what gives CPSS cycle 1 (the only cycle without a `variables.csv`) full bilingual labels. ### Metadata encoding The registry `metadata_encoding` field sets the encoding for all text-format parsers. Default is `"CP1252"` (a superset of Latin-1 that correctly decodes Windows-era en-dashes and curly quotes). Exceptions: | Surveys | Encoding | Reason | |---|---|---| | Census 2021, 2021 hierarchical | `"UTF-8"` | Command files shipped as UTF-8 | | Census 1991 (individuals) | `"CP850"` | DOS-era IBM Code Page 850 | ### Merge `merge_metadata()` takes the list of parser outputs and produces a single `list(variables, codes, layout)`. Conflicts are resolved: later parsers win on duplicate variable names. If a layout is present in only some parsers, the function checks that every variable with a layout entry also appears in `variables`, stopping with a diagnostic otherwise. The final result is written to: - `metadata/variables.csv` — one row per variable (name, label_en, label_fr, type, decimals, missing_low, missing_high) - `metadata/codes.csv` — one row per code value (name, val, label_en, label_fr) - `metadata/layout.csv` — one row per fixed-width column (name, start, end); absent for CSV-format surveys --- ## Stage 3 — Build DuckDB `pumf_build_duckdb()` reads the canonical CSVs from `metadata/`, reads the raw data file, applies transformations, and writes a `.duckdb` file. The function skips the build if the target table already exists and `refresh = FALSE`. ### Data file selection `find_pumf_data_file()` searches the version directory recursively. **Extension pre-filter** — derived from the registry `file_mask`: | `file_mask` ends in | Pre-filter | |---|---| | `.csv` | only files matching `\.csv$` | | `.txt` or `.dat` | only files matching `\.(txt\|dat)$` | | other / unusual (e.g. `.INDIV`) | all files (relies on `file_mask` alone) | | absent + layout exists | `\.(txt\|dat)$` (FWF inferred from layout) | | absent + no layout | `\.csv$` | Several subdirectories are always excluded from the search: `metadata/`, `SPSS/`, `Command/`, `Syntax/`, `Layout/`, `SpssCard/`, `Reading_cards/`, `Documents/`. Bootstrap weight (`_BSW.`) files are also excluded; they are handled separately. When multiple candidates survive, the `file_mask` regex narrows the list. If more than one still remains, the function stops with a message listing the ambiguous files and asks to set `file_mask` in the registry. ### FWF vs. CSV After the data file is identified: - **FWF** when `metadata/layout.csv` exists *and* the data file does not end in `.csv`. This handles the edge case (e.g. CHS) where the SPSS DATA LIST produces a layout but the actual data ships as CSV. - **CSV** otherwise. Both paths read all columns as character (`col_types = cols(.default = "c")`) to preserve leading zeros and avoid premature type coercion. Numeric conversion happens explicitly in the next step. ### Trailing junk row removal (FWF only) After reading a fixed-width file, any row where fewer than two columns are non-NA is dropped. FWF files from older StatCan archives often end with `\r\n\x1a` (a DOS EOF marker), which the FWF reader interprets as a one-character row with a single non-NA field; this step removes it silently. CSV files are not affected — CSV parsers handle trailing newlines correctly. ### Data fixups (pre-label) Registry `data_fixups` entries are applied to the raw character data before label mapping: - **`str_pad`** — left- or right-pad specified columns to a target width. Used to zero-pad codes that arrive without leading zeros in some CSV formats (e.g. SFS). - **`rename`** — rename a column; applied only when the old name is present (safe for surveys that ship in multiple release variants, e.g. Census 2021 RELIG/RELIGION_DER). - **`cols_swap`** — named character vector `c(A = "B", C = "D")` swapping pairs of column names. Used for surveys where the DATA LIST variable names are transposed relative to the PDF documentation (e.g. WKACTMA/WKACTFA and FAOCC81/MAOCC81 in Census 1981 individuals). - **`force_numeric`** — character vector of column names to treat as numeric regardless of how many VALUE LABELS are declared. Used when a variable carries boundary or top-code labels (e.g. `"85 years and over"`) alongside otherwise-continuous values, or is an integer index the SPSS file mis-classifies as categorical (e.g. SUBSAMPL in Census 1971). The codes are dropped, but any **true-missing** sentinel codes (Not stated, Don't know, Valid skip, … — *not* zero-value labels like "None") are first converted into a per-variable `missing_low/missing_high` range so those sentinels still become `NA`. An existing missing range (from `MISSING VALUES` or a split-SPSS miss file) takes precedence. - **`force_character` / `force_integer` / `force_bigint`** — character vectors of variable names whose **DuckDB storage type** is overridden. Unlike the conversions above, the raw string values are kept verbatim (no numeric conversion, no code labeling), so geographic codes retain leading zeros and out-of-`int`-range IDs survive. `force_character` keeps the column VARCHAR; `force_integer` / `force_bigint` cast it to INTEGER / BIGINT via `ALTER COLUMN` *after* the table is written (an INTEGER cast that overflows 2^31 errors — use `force_bigint`). A variable may appear in at most one `force_*` set (including `force_numeric`); this is validated at build time. LFS sources its `SURVYEAR` / `SURVMNTH` / `REC_NUM` integer-forcing through this mechanism from the shared LFS registry entry. - **`codes_supplement`** — named list of `data.frame`s injecting code-label rows absent from the SPSS command files (values present in the data but not declared in the command files, e.g. the CHS `PPROV` territories code). Each data frame has columns `val`, `label_en`, `label_fr`. Setting `label_en = NA` marks a value as intentionally missing (produces a silent `NA` factor entry without a warning, and without introducing a spurious factor level). All entries are verified in the override ledger (`tests/testthat/override_verification.csv`). - **`na_values`** — character vector of raw string sentinels that become `NA`. In numeric columns they are exact-matched and NA'd during numeric conversion; in labeled (factor) columns they are silently blanked. Used for undeclared Census income sentinels and SAS-style `"."` missing markers. - **`labels_supplement`** — named list `c(VAR = c(label_en =, label_fr =))` supplying *variable* labels the source metadata leaves blank (e.g. CPSS 1 ships only a PDF codebook whose weight variable `COVID_WT` has an empty `Concept:` line in both languages). Applied in both Stage 3 and `label_pumf_columns()` / `pumf_var_labels()`, and fills only `NA` labels, so genuine source labels always win. ### Bootstrap weight join (BSW) When the registry has `bsw_mask` + `bsw_join_key` + `bsw_file_mask`, the BSW file is found, read (CSV or FWF), and left-joined onto the main data by the join key before numeric conversion. ### Numeric conversion `apply_numeric_conversion()` converts character columns typed `"numeric"` in `variables.csv`: 1. `as.numeric()` on the raw character values. 2. **Missing range** — values in `[missing_low, missing_high]` become `NA`. This handles SPSS-declared `MISSING VALUES` blocks. 3. **`na_values` fixup** — additional raw string sentinels from the registry (`data_fixups$na_values`) are set to `NA` via `trimws(raw) %in% na_values`. Used for undeclared income sentinels in older Census files. The two mechanisms complement each other: the SPSS `MISSING VALUES` block catches sentinels declared in the command file; `na_values` catches those that StatCan omitted from the command file but documents in the user guide. **Census income sentinel widths** (confirmed from SPSS DATA LIST sections): | Census years | Income field width | Sentinels (`na_values`) | |---|---|---| | 2016, 2021 | 8 chars | `"99999999"`, `"88888888"` | | 1991–2011 | 7 chars | `"9999999"`, `"8888888"` | | 1986 and earlier | unverified | none applied | The two sets are kept separate: applying the 7-digit sentinel to an 8-char field would incorrectly NA out a valid $9,999,999 income value (stored as `" 9999999"` which trims to `"9999999"`). ### Code labels → factors `apply_code_labels()` maps raw character values to R factors using `codes.csv`. The factor levels are the complete ordered set from the codes table, not just the values present in the data. Unmatched raw values become `NA` with a warning showing the first five offending values. An exception is made for values that appear in `codes.csv` with `label_en = NA` (injected via `codes_supplement`): these are treated as *intentionally* NA and silently produce `NA` factor entries without a warning. When `lang = "fra"`, any missing French label falls back per-row to the English label. ### DuckDB write and ENUM enforcement The labelled data frame is written to DuckDB with `dbWriteTable()`. Factor columns are stored as DuckDB `ENUM` types. DuckDB >= 1.5.2 does this automatically; for older versions `ensure_enum_columns()` runs `ALTER TABLE ... ALTER COLUMN ... TYPE ENUM(...)` for each factor column. A separate DuckDB table is created per language (table names `eng` and `fra`, or `eng_` for surveys with multiple file types). The write connection is shut down before `pumf_open_duckdb()` re-opens the file in read-only mode, preventing in-process lock conflicts when building both language tables in the same session. --- ## Multi-module surveys Some surveys ship several linked files that share a respondent key and are meant to be joined for analysis (GSS cycle 16 / Aging and Social Support 2002, the GSS Time Use cycles, the Survey of Household Spending 2017, the Giving/Volunteering/ Participating cycles). `canpumf` models these as **several tables inside one DuckDB file** — not separate databases, which could not be joined on a single connection. A registry entry declares `modules = list(MAIN = ..., CG4 = ...)`; each module carries its own `layout_mask`, `file_mask`, `data_fixups`, and bootstrap-weight config. One module is the **primary** (the respondent-level file that carries the survey weight); its config is auto-derived to the entry's top level so all the single-table code paths above keep working unchanged. The entry also records `module_key` — the shared key the modules join on (it varies: `RECID`, `PUMFID`, `MICRO_ID`, `CASEID`, `IDNUM`). `pumf_run_pipeline()` loops the modules, running Stage 2 and Stage 3 once per module so every table lands in the **one** DuckDB file. Each module parses its metadata into `metadata//` (the primary uses `metadata/`) and joins its **own** bootstrap weights, so e.g. the SHS Interview replicate weights are not mis-joined onto the Diary table. The primary module's tbl is returned. User-facing, `get_pumf()` returns the primary module and emits a one-time message listing the sibling modules; `pumf_module(tbl, "")` opens a sibling on the **same** connection so the two are joinable. The dedicated [*Working with multi-module PUMF surveys*](submodules.html) vignette covers the user-facing workflow in full. --- ## LFS pipeline The LFS is handled by `lfs_get_pumf()` (delegated directly from `get_pumf()` without going through `get_pumf_connection()`). Instead of one DuckDB per version, all LFS versions share a single `/LFS/LFS.duckdb` with accumulating tables `lfs_eng` and `lfs_fra`. Key differences from the standard pipeline: - **Schema evolution** — when a new LFS version adds a variable absent from earlier versions, the column is added via `ALTER TABLE ADD COLUMN`. When a variable changes type (e.g. VARCHAR → ENUM), `ALTER COLUMN SET DATA TYPE` is used. - **Annual supersedes monthly** — if annual and monthly versions for the same year are both loaded, the annual version supersedes the monthly rows for that year. - **Version tracking** — a `lfs_versions` table in the shared DuckDB records which versions have been downloaded and parsed, so `refresh = "auto"` downloads only new versions. - **Read-only fast path** — when the requested version is already in the database, `lfs_get_pumf()` opens only a read-only connection and returns immediately. No write lock is acquired unless new data actually needs to be written. - **`get_pumf()` return** — when a specific version is requested, the function applies a `dplyr::filter()` on `SURVYEAR` (and `SURVMNTH` for monthly requests) over the full shared table. Calling `get_pumf("LFS")` without a version returns the unfiltered table. - **`label_pumf_columns()` for LFS** — because the shared schema is the union of all loaded versions, variables introduced in later years (e.g. `GENDER` added ~2020) are absent from older versions' `variables.csv`. `label_pumf_columns()` therefore reads and merges metadata from *every* loaded version directory in chronological order, with the most-recent label winning on conflicts. --- ## Connection provenance registry `get_pumf()` registers `(series, version, cache_path, lang)` in a package-level environment keyed by the DuckDB connection's C++ external-pointer address: ``` .pumf_con_registry <- new.env(hash = TRUE, parent = emptyenv()) key = format(con@conn_ref) # stable across R-level S4 copies ``` This key survives `dplyr` tbl transformations and `select()`/`filter()` calls because those operations do not create new connections. `label_pumf_columns()` uses `.pumf_lookup_con()` to retrieve the provenance; `close_pumf()` removes the entry and disconnects. This internal provenance registry is distinct from the **RStudio Connections pane**. Whether the DuckDB connection is advertised to that pane is controlled separately by the `register_connection` argument to `get_pumf()` (default `getOption("canpumf.register_connection", TRUE)`); set it to `FALSE` to keep the pane from being spammed when opening and closing many connections programmatically. --- ## Registry configuration `pumf_registry_lookup(series, version)` returns a named list that controls every per-survey choice in the pipeline. Surveys without an entry use auto-detection with defaults (see *Newest-sibling inheritance* below for the one exception). | Field | Purpose | Default | |---|---|---| | `file_mask` | regex to select the data file | `NULL` (auto) | | `layout_mask` | SPSS file disambiguator for split-file surveys | `NULL` | | `data_encoding` | encoding of the raw data file | `"CP1252"` | | `metadata_encoding` | encoding of SPSS/SAS command files | `"CP1252"` | | `bsw_mask` | `layout_mask` for BSW-specific SPSS files | `NULL` | | `bsw_file_mask` | filename pattern for the BSW data file | `NULL` | | `bsw_join_key` | column(s) to join BSW onto the main data | `NULL` | | `bsw_drop_cols` | BSW columns to drop before joining | `character(0)` | | `data_fixups` | list of `str_pad`, `rename`, `cols_swap`, `force_numeric`, `force_character`, `force_integer`, `force_bigint`, `codes_supplement`, `na_values`, `labels_supplement` transforms | `list()` | | `missing_supplement` | named list of `c(lo, hi)` pairs — explicit missing-range overrides for sentinels no generic pattern can classify (e.g. non-integer sentinels like `999.5`) | `NULL` | | `doc_mask` | regex applied to PDF filenames to filter a shared documentation directory to the relevant file type (e.g. `"Family\|Familles"` for 1986 Census families) | `NULL` | | `modules` / `module_key` | for multi-module surveys: per-module config (`layout_mask`, `file_mask`, `data_fixups`, BSW) and the shared respondent key the modules join on (see *Multi-module surveys* above) | `NULL` | ### Newest-sibling inheritance Surveys without a registry entry normally fall back to pure auto-detection, with one exception. When the requested version is a bare four-digit year and the same series already has at least one other year-keyed entry, `pumf_registry_lookup()` inherits the configuration of the newest registered sibling whose year is <= the requested year (or the oldest sibling if the requested year predates them all). This lets a freshly released year deposited in the cache reuse the prior year's config — which works cleanly now that recent `file_mask`s use a generic `\d{4}` year placeholder rather than a hard-coded year. A `message()` fires once per session so the implicit reuse is discoverable; a genuinely changed release (new file layout, codes, or BSW join) still needs its own explicit entry. Inheritance is **skipped** for multi-part versions (e.g. Census `2021 (individuals)`) and for LFS, which has its own shared registry entry.