## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  message = FALSE,
  warning = FALSE
)

## ----setup--------------------------------------------------------------------
library(tidylearn)
library(dplyr)

## ----dispatcher-concept, eval = FALSE-----------------------------------------
# # Format is auto-detected from the file extension
# data <- tl_read("sales.csv")
# data <- tl_read("results.xlsx", sheet = "Q1")
# data <- tl_read("experiment.parquet")
# data <- tl_read("config.json")
# data <- tl_read("model_data.rds")
#
# # Override format detection when the extension is ambiguous
# data <- tl_read("export.txt", format = "tsv")

## ----print-demo---------------------------------------------------------------
tmp <- tempfile(fileext = ".csv")
write.csv(mtcars, tmp, row.names = FALSE)
data <- tl_read(tmp, .quiet = TRUE)
data

## ----cleanup-1, include = FALSE-----------------------------------------------
unlink(tmp)

## ----csv-demo-----------------------------------------------------------------
# Create example files
tmp_csv <- tempfile(fileext = ".csv")
tmp_tsv <- tempfile(fileext = ".tsv")
write.csv(iris, tmp_csv, row.names = FALSE)
write.table(iris, tmp_tsv, sep = "\t", row.names = FALSE)

csv_data <- tl_read_csv(tmp_csv)
tsv_data <- tl_read_tsv(tmp_tsv)
nrow(csv_data)

## ----cleanup-csv, include = FALSE---------------------------------------------
unlink(c(tmp_csv, tmp_tsv))

## ----excel-demo---------------------------------------------------------------
library(readxl)
path <- readxl_example("datasets.xlsx")
excel_data <- tl_read_excel(path, sheet = "mtcars")
head(excel_data, 3)

## ----parquet-demo-------------------------------------------------------------
library(nanoparquet)
tmp_pq <- tempfile(fileext = ".parquet")
write_parquet(iris, tmp_pq)
pq_data <- tl_read_parquet(tmp_pq)
nrow(pq_data)

## ----cleanup-pq, include = FALSE----------------------------------------------
unlink(tmp_pq)

## ----json-demo----------------------------------------------------------------
library(jsonlite)
tmp_json <- tempfile(fileext = ".json")
write_json(mtcars[1:5, ], tmp_json)
json_data <- tl_read_json(tmp_json)
json_data

## ----cleanup-json, include = FALSE--------------------------------------------
unlink(tmp_json)

## ----rds-demo-----------------------------------------------------------------
tmp_rds <- tempfile(fileext = ".rds")
saveRDS(iris, tmp_rds)
rds_data <- tl_read_rds(tmp_rds)
nrow(rds_data)

## ----rdata-demo---------------------------------------------------------------
tmp_rdata <- tempfile(fileext = ".rdata")
my_data <- mtcars
save(my_data, file = tmp_rdata)

# Name is auto-detected when there is a single data frame
rdata_data <- tl_read_rdata(tmp_rdata)
nrow(rdata_data)

## ----cleanup-rds, include = FALSE---------------------------------------------
unlink(c(tmp_rds, tmp_rdata))

## ----sqlite-demo--------------------------------------------------------------
library(DBI)
library(RSQLite)

# Create an example database
tmp_db <- tempfile(fileext = ".sqlite")
conn <- dbConnect(SQLite(), tmp_db)
dbWriteTable(conn, "iris_tbl", iris)
dbDisconnect(conn)

# Read with tl_read_sqlite
db_data <- tl_read_sqlite(
  tmp_db,
  "SELECT * FROM iris_tbl WHERE Species = 'setosa'"
)
nrow(db_data)

## ----cleanup-sqlite, include = FALSE------------------------------------------
unlink(tmp_db)

## ----db-demo------------------------------------------------------------------
conn <- dbConnect(SQLite(), ":memory:")
dbWriteTable(conn, "mtcars_tbl", mtcars)

sql <- "SELECT mpg, wt, hp FROM mtcars_tbl WHERE mpg > 20"
db_result <- tl_read_db(conn, sql)
db_result

dbDisconnect(conn)

## ----remote-db, eval = FALSE--------------------------------------------------
# # PostgreSQL
# pg_data <- tl_read_postgres(
#   dsn = "localhost",
#   query = "SELECT * FROM sales WHERE year = 2025",
#   dbname = "analytics",
#   user = "myuser",
#   password = "mypass"
# )
#
# # MySQL / MariaDB # nolint: commented_code_linter.
# mysql_data <- tl_read_mysql(
#   dsn = "mysql://user:pass@host:3306/mydb",
#   query = "SELECT * FROM customers LIMIT 1000"
# )
#
# # BigQuery
# bq_data <- tl_read_bigquery(
#   project = "my-gcp-project",
#   query = "SELECT * FROM `dataset.table` LIMIT 1000"
# )

## ----s3, eval = FALSE---------------------------------------------------------
# data <- tl_read_s3("s3://my-bucket/data/sales_2025.csv")
# data <- tl_read_s3("s3://my-bucket/data/results.parquet", region = "eu-west-1")

## ----github, eval = FALSE-----------------------------------------------------
# # Read a CSV from a public GitHub repository
# data <- tl_read_github("tidyverse/dplyr",
#   path = "data-raw/starwars.csv", ref = "main"
# )

## ----kaggle, eval = FALSE-----------------------------------------------------
# data <- tl_read_kaggle("zillow/zecon", file = "Zip_time_series.csv")
# data <- tl_read_kaggle("titanic", file = "train.csv", type = "competition")

## ----multi-path---------------------------------------------------------------
dir <- tempdir()
write.csv(iris[1:50, ], file.path(dir, "batch1.csv"), row.names = FALSE)
write.csv(iris[51:100, ], file.path(dir, "batch2.csv"), row.names = FALSE)

paths <- file.path(dir, c("batch1.csv", "batch2.csv"))
combined <- tl_read(paths, .quiet = TRUE)
table(combined$source_file)

## ----cleanup-multi, include = FALSE-------------------------------------------
unlink(file.path(dir, c("batch1.csv", "batch2.csv")))

## ----dir-demo-----------------------------------------------------------------
dir <- tempfile(pattern = "tl_vignette_")
dir.create(dir)
write.csv(iris[1:50, ], file.path(dir, "jan.csv"), row.names = FALSE)
write.csv(iris[51:100, ], file.path(dir, "feb.csv"), row.names = FALSE)
write.csv(iris[101:150, ], file.path(dir, "mar.csv"), row.names = FALSE)

# Read all CSVs from the directory
all_data <- tl_read_dir(dir, format = "csv", .quiet = TRUE)
nrow(all_data)
table(all_data$source_file)

## ----dir-pattern--------------------------------------------------------------
# Filter with a regex pattern
subset <- tl_read_dir(dir, pattern = "^(jan|feb)", .quiet = TRUE)
nrow(subset)

## ----cleanup-dir, include = FALSE---------------------------------------------
unlink(dir, recursive = TRUE)

## ----dir-dispatch, eval = FALSE-----------------------------------------------
# data <- tl_read("data/monthly_exports/")

## ----zip-demo-----------------------------------------------------------------
# Create an example zip
dir <- tempfile(pattern = "tl_zip_src_")
dir.create(dir)
write.csv(iris, file.path(dir, "iris.csv"), row.names = FALSE)

zip_path <- tempfile(fileext = ".zip")
# zip() stores paths as given, so archive from inside `dir` to get a flat
# "iris.csv" entry. setwd() returns the previous directory invisibly; restore
# it in `finally` so a failing zip() cannot leave the session stranded in the
# temp directory.
old_wd <- setwd(dir)
tryCatch(
  utils::zip(zip_path, "iris.csv"),
  finally = setwd(old_wd)
)

zip_data <- tl_read_zip(zip_path, .quiet = TRUE)
nrow(zip_data)
attr(zip_data, "tl_format")

## ----cleanup-zip, include = FALSE---------------------------------------------
unlink(c(dir, zip_path), recursive = TRUE)

## ----zip-dispatch, eval = FALSE-----------------------------------------------
# data <- tl_read("download.zip")
# data <- tl_read("download.zip", file = "train.csv")

## ----class-demo---------------------------------------------------------------
tmp <- tempfile(fileext = ".csv")
write.csv(mtcars, tmp, row.names = FALSE)
data <- tl_read(tmp, .quiet = TRUE)

# Check metadata
attr(data, "tl_format")

# Works with dplyr
data %>%
  filter(mpg > 20) %>%
  select(mpg, wt, hp) %>%
  head(3)

## ----cleanup-class, include = FALSE-------------------------------------------
unlink(tmp)

## ----pipeline-----------------------------------------------------------------
# 1. Ingest
tmp <- tempfile(fileext = ".csv")
write.csv(iris, tmp, row.names = FALSE)
data <- tl_read(tmp, .quiet = TRUE)

# CSV files lose factor information, so convert character columns as needed
data <- data %>% mutate(Species = as.factor(Species))

# 2. Split
split <- tl_split(data, prop = 0.7, stratify = "Species", seed = 42)

# 3. Model
model <- tl_model(split$train, Species ~ ., method = "forest")

# 4. Evaluate
eval_result <- tl_evaluate(model, new_data = split$test)
eval_result

## ----cleanup-pipeline, include = FALSE----------------------------------------
unlink(tmp)