## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

## ----pipeline-diagram, echo=FALSE, out.width="100%", fig.alt="canpumf pipeline: get_pumf dispatches LFS vs. the three-stage pipeline (locate/download, parse metadata, build DuckDB), then registers provenance and returns a lazy tbl."----
# The diagram is rendered to a static image (SVG for HTML, PNG for PDF) via
# Graphviz so it renders identically and reliably in every output format,
# without relying on JavaScript htmlwidgets (which never render in PDF).

# Graphviz DOT source for the pipeline flowchart. `compound = true` is what
# lets the `lhead = cluster_*` edges stop at a stage's cluster border instead
# of at the node inside it. The `\n` escapes are expanded by R, so Graphviz
# receives real line breaks inside those labels.
dot <- '
digraph pipeline {
  graph [rankdir = TB, fontname = "Helvetica", nodesep = 0.30, ranksep = 0.40,
         compound = true];
  node  [fontname = "Helvetica", fontsize = 10, style = "filled",
         fillcolor = "#eef3f8", color = "#5b7da3", margin = "0.09,0.05"];
  edge  [fontname = "Helvetica", fontsize = 9, color = "#666666",
         arrowsize = 0.7];

  A    [label = "get_pumf(series, version, lang)", shape = box,
        style = "filled,rounded", fillcolor = "#d9ead3"];
  LFS  [label = "series == LFS?", shape = diamond, fillcolor = "#fce8b2"];
  LFSP [label = "lfs_get_pumf()", shape = box,
        style = "filled,rounded", fillcolor = "#d9ead3"];

  A -> LFS;
  LFS -> LFSP [label = "yes"];
  LFS -> CHK  [label = "no", lhead = cluster_s1];

  subgraph cluster_s1 {
    label = "Stage 1 — locate / download";
    labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11;
    style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc";

    CHK  [label = "version dir exists?", shape = diamond, fillcolor = "#fce8b2"];
    COL  [label = "look up collection URL", shape = box];
    EFT  [label = "EFT-only?", shape = diamond, fillcolor = "#fce8b2"];
    ERR  [label = "stop: deposit zip manually", shape = box, fillcolor = "#f4cccc"];
    DL   [label = "download zip", shape = box];
    UZ   [label = "robust_unzip()", shape = box];
    EXTR [label = "zip already extracted?", shape = diamond, fillcolor = "#fce8b2"];

    CHK -> EXTR [label = "yes, not refresh"];
    CHK -> COL  [label = "no / refresh"];
    COL -> EFT;
    EFT -> ERR  [label = "yes"];
    EFT -> DL   [label = "no"];
    DL -> UZ;
    UZ -> EXTR;
    EXTR -> UZ  [label = "no"];
  }

  subgraph cluster_s2 {
    label = "Stage 2 — parse metadata";
    labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11;
    style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc";

    MC  [label = "metadata already exists?", shape = diamond, fillcolor = "#fce8b2"];
    DF  [label = "detect_formats()", shape = box];
    P1  [label = "LFS codebook.csv", shape = box];
    P2  [label = "CPSS variables.csv", shape = box];
    P3  [label = "SAS cards (.lay + .lbe)", shape = box];
    P4  [label = "SPSS split (vare/vale/_i)", shape = box];
    P5  [label = "SPSS mono (.sps / SPSS.txt / .xmf)", shape = box];
    P6  [label = "SPSS .sav", shape = box];
    P7  [label = "PDF Dictionary", shape = box];
    P8  [label = "PDF frequency codebook", shape = box];
    MRG [label = "merge_metadata()", shape = box];
    WR  [label = "write variables.csv / codes.csv / layout.csv", shape = box];

    MC -> DF [label = "no / refresh"];
    DF -> P1; DF -> P2; DF -> P3; DF -> P4;
    DF -> P5; DF -> P6; DF -> P7; DF -> P8;
    P1 -> MRG; P2 -> MRG; P3 -> MRG; P4 -> MRG;
    P5 -> MRG; P6 -> MRG; P7 -> MRG; P8 -> MRG;
    MRG -> WR;
  }

  EXTR -> MC [label = "yes", lhead = cluster_s2];

  subgraph cluster_s3 {
    label = "Stage 3 — build DuckDB";
    labeljust = "l"; fontname = "Helvetica-Bold"; fontsize = 11;
    style = "rounded,filled"; fillcolor = "#fbfdff"; color = "#9fb6cc";

    TB  [label = "table already in DuckDB?", shape = diamond, fillcolor = "#fce8b2"];
    FF  [label = "find data file", shape = box];
    FWF [label = "layout.csv exists\nand file not .csv?", shape = diamond, fillcolor = "#fce8b2"];
    RFW [label = "read_fwf", shape = box];
    RCS [label = "read_csv", shape = box];
    JNK [label = "drop trailing junk rows", shape = box];
    FX  [label = "apply data fixups\n(str_pad, rename, cols_swap, force_*)", shape = box];
    BSW [label = "BSW mask in registry?", shape = diamond, fillcolor = "#fce8b2"];
    RBW [label = "join bootstrap weights", shape = box];
    NC  [label = "numeric conversion\n(missing ranges + na_values)", shape = box];
    CL  [label = "code labels to factors", shape = box];
    WD  [label = "write DuckDB table", shape = box];
    EN  [label = "enforce ENUM / force_* types", shape = box];
    OD  [label = "open read-only connection", shape = box];

    TB -> FF   [label = "no / refresh"];
    FF -> FWF;
    FWF -> RFW [label = "yes (FWF)"];
    FWF -> RCS [label = "no (CSV)"];
    RFW -> JNK;
    RCS -> JNK;
    JNK -> FX;
    FX -> BSW;
    BSW -> RBW [label = "yes"];
    BSW -> NC  [label = "no"];
    RBW -> NC;
    NC -> CL;
    CL -> WD;
    WD -> EN;
    EN -> OD;
  }

  WR -> TB [lhead = cluster_s3];
  MC -> TB [label = "yes, not refresh"];
  TB -> OD [label = "yes, not refresh"];

  REG [label = "register provenance (series, version, lang)", shape = box,
       style = "filled,rounded", fillcolor = "#d9ead3"];
  TBL [label = "return lazy dplyr::tbl()", shape = box,
       style = "filled,rounded", fillcolor = "#d9ead3"];

  OD -> REG;
  REG -> TBL;
}
'

# Both packages are optional dependencies: DiagrammeR lays the graph out and
# DiagrammeRsvg exports the result as an SVG string.
have_render <- requireNamespace("DiagrammeR", quietly = TRUE) &&
  requireNamespace("DiagrammeRsvg", quietly = TRUE)

# The value of this if/else chain is the chunk's visible result, so each
# successful branch must end with knitr::include_graphics().
if (have_render) {
  svg <- DiagrammeRsvg::export_svg(DiagrammeR::grViz(dot))
  if (knitr::is_latex_output()) {
    # PDF output: rasterise the SVG to PNG, which needs the optional rsvg
    # package.
    if (requireNamespace("rsvg", quietly = TRUE)) {
      png <- knitr::fig_path(".png")
      dir.create(dirname(png), recursive = TRUE, showWarnings = FALSE)
      rsvg::rsvg_png(charToRaw(svg), png, width = 1800)
      knitr::include_graphics(png)
    } else {
      message("Install rsvg to render the pipeline diagram in PDF output.")
    }
  } else {
    # Other output formats: embed the SVG file directly.
    svg_file <- knitr::fig_path(".svg")
    dir.create(dirname(svg_file), recursive = TRUE, showWarnings = FALSE)
    # useBytes = TRUE writes the string's bytes untouched. Without it, R
    # re-encodes marked strings to the native encoding, so on a non-UTF-8
    # locale the em-dashes in the stage labels would no longer be written as
    # UTF-8. The PDF branch already passes raw bytes via charToRaw().
    writeLines(svg, svg_file, useBytes = TRUE)
    knitr::include_graphics(svg_file)
  }
} else {
  cat("Install DiagrammeR and DiagrammeRsvg to render this diagram.")
}