--- title: "Introduction to birddog" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to birddog} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, tidy = FALSE, comment = "#>", warning = FALSE, message = FALSE, fig.width = 9, fig.height = 7 ) set.seed(888L) local_data <- "~/Sync/birddog/data-biogas/rawfiles-direct" has_local_data <- dir.exists(local_data) knitr::opts_chunk$set(eval = has_local_data) # When local data is absent (e.g. CRAN), no chunk runs; the pre-built HTML ships instead. ``` ## Overview `birddog` helps you detect emergence and trace trajectories in scientific literature and patents. It reads datasets from OpenAlex and Web of Science (WoS), builds citation-based networks, identifies groups, and summarizes their dynamics. The stable release is on CRAN. The development version is available on GitHub: <https://github.com/roneyfraga/birddog>. ### Methodological workflow ![](https://roneyfraga.com/volume/keep_it/cnpq-pq-horizontal.svg){width=99%} ## Installation ```{r eval = FALSE} # stable version (CRAN) install.packages("birddog") # development version (GitHub) # install.packages("remotes") remotes::install_github("roneyfraga/birddog") ``` ```{r} library(birddog) ``` ## Data sources `birddog` supports two main data sources: - [OpenAlex](https://openalex.org/): browser search with CSV export, or API via `{openalexR}`. - [Web of Science](https://www.webofscience.com/wos/woscc/smart-search): multiple export formats (`.bib`, `.ris`, plain-text `.txt`, tab-delimited `.txt`). 
### OpenAlex via API or CSV ```{r eval = FALSE} library(openalexR) # Fetch works from OpenAlex API url_api <- "https://api.openalex.org/works?page=1&filter=primary_location.source.id:s121026525" openalexR::oa_request(query_url = url_api) |> openalexR::oa2df(entity = "works") |> birddog::read_openalex(format = "api") -> M # Or from a CSV export M <- birddog::read_openalex("path/to/openalex-export.csv", format = "csv") ``` ### Web of Science (WoS) ```{r eval = FALSE} # BibTeX M <- birddog::read_wos("path/to/savedrecs.bib", format = "bib") # RIS M <- birddog::read_wos("path/to/savedrecs.ris", format = "ris") # Plain text M <- birddog::read_wos("path/to/savedrecs.txt", format = "txt-plain-text") # Tab-delimited M <- birddog::read_wos("path/to/savedrecs.txt", format = "txt-tab-delimited") ``` ## Example dataset We use a biogas dataset from OpenAlex with 57,734 documents as a running example. ```{r eval = FALSE} # Download from OpenAlex (~15 min) query_oa <- "( biogas )" openalexR::oa_fetch( entity = "works", title_and_abstract.search = query_oa, verbose = TRUE ) -> papers M <- birddog::read_openalex(papers, format = "api") ``` ```{r eval = FALSE} # Pre-computed dataset url_m <- "https://roneyfraga.com/volume/keep_it/biogas-data/M.rds" M <- readRDS(url(url_m)) ``` ```{r include = FALSE, } M <- readRDS(file.path(local_data, "M.rds")) ``` ```{r } dplyr::glimpse(M) ``` ## Citation network Build a citation network to map the relationships between documents. Direct citation captures time-ordered influence; bibliographic coupling groups papers that share references. ```{r eval = FALSE} net <- birddog::sniff_network(M, type = "direct citation") ``` ```{r include = FALSE, } net <- readRDS(file.path(local_data, "net.rds")) ``` ```{r } net |> tidygraph::activate(nodes) |> dplyr::select(name, AU, PY, TI, TC) |> dplyr::arrange(dplyr::desc(TC)) ``` ## Components Identify connected components to eliminate disconnected documents that do not share the same bibliographic references. 
```{r eval = FALSE} comps <- birddog::sniff_components(net) ``` ```{r include = FALSE, } comps <- readRDS(file.path(local_data, "comps.rds")) ``` ```{r } comps$components |> dplyr::slice_head(n = 5) |> gt::gt() ``` ## Groups (community detection) Detect research communities within the citation network. Each group represents a cluster of related publications. ```{r eval = FALSE} groups <- birddog::sniff_groups( comps, algorithm = "fast_greedy", min_group_size = 30, seed = 888L ) ``` ```{r include = FALSE, } groups <- readRDS(file.path(local_data, "groups.rds")) ``` ```{r } groups$aggregate |> gt::gt() ``` ### Group attributes Summarize group-level statistics including publication trends and growth rates. ```{r eval = FALSE} # ~2 min groups_attributes <- birddog::sniff_groups_attributes( groups, growth_rate_period = 2010:2024, show_results = FALSE ) ``` ```{r include = FALSE, } groups_attributes <- readRDS(file.path(local_data, "groups_attributes.rds")) ``` ```{r } groups_attributes$attributes_table ``` ### Group keywords Explore the most frequent keywords in each group. ```{r } groups_keywords <- birddog::sniff_groups_keywords(groups) groups_keywords |> dplyr::filter(group %in% c('c1g1', 'c1g2', 'c1g3')) |> gt::gt() ``` ### Group NLP terms Extract key phrases from abstracts using natural language processing. ```{r eval = FALSE} # ~30 min groups_terms <- birddog::sniff_groups_terms(groups, algorithm = "phrase") ``` ```{r include = FALSE, } groups_terms <- readRDS(file.path(local_data, "groups_terms.rds")) ``` ```{r } groups_terms$terms_table |> dplyr::slice_head(n = 3) |> gt::gt() ``` ### Hubs Classify documents by their role in the network using the Zi-Pi method. Hub documents connect different research communities. 
```{r eval = FALSE} # ~20 min groups_hubs <- birddog::sniff_groups_hubs(groups) ``` ```{r include = FALSE, } groups_hubs <- readRDS(file.path(local_data, "groups_hubs.rds")) ``` ```{r } groups_hubs |> dplyr::filter(zone != "noHub") |> dplyr::mutate(Zi = round(Zi, 2), Pi = round(Pi, 2)) |> dplyr::arrange(dplyr::desc(zone), dplyr::desc(Zi)) |> dplyr::slice_head(n = 15) |> gt::gt() |> gt::text_transform( locations = gt::cells_body(columns = name), fn = function(x) { glue::glue('{x}') } ) ``` ## Indexes: Citations Cycle Time Measure the pace of change in each group by tracking how old the cited references are over time. ```{r eval = FALSE} # ~1.5 min groups_cct <- birddog::sniff_citations_cycle_time( groups, scope = "groups", start_year = 2000, end_year = 2024 ) groups_cct$plots[["c1g3"]] ``` ```{r include = FALSE, } groups_cct <- readRDS(file.path(local_data, "groups_cct.rds")) ``` ![](cct-c1g3.png) ## Indexes: Entropy Track keyword diversity within each group over time. Increasing entropy signals thematic diversification; decreasing entropy signals convergence. ```{r eval = FALSE} groups_entropy <- birddog::sniff_entropy( groups, scope = "groups", start_year = 2000, end_year = 2024 ) groups_entropy$plots[["c1g3"]] ``` ![](entropy-c1g3.png) ## Group trajectories Track how research communities evolve over time by building cumulative networks at each year and following groups through consecutive periods. 
```{r eval = FALSE} # ~2 min groups_cumulative <- birddog::sniff_groups_cumulative(groups) ``` ```{r include = FALSE, } groups_cumulative <- readRDS(file.path(local_data, "groups_cumulative.rds")) ``` ```{r eval = FALSE} suppressMessages({ groups_cumulative_trajectories <- birddog::sniff_groups_trajectories(groups_cumulative) }) birddog::plot_group_trajectories_2d( groups_cumulative_trajectories, group = "c1g3", label_vertical_position = -2 ) birddog::plot_group_trajectories_3d( groups_cumulative_trajectories, group = "c1g3" ) ``` ![](group-evolution-2d-c1g3.png){width=99%} ![](group-evolution-3d-c1g3.png) ### Trajectory detection and variable-width lines Automatically detect the highest-scoring temporal paths within a group using dynamic programming, then display them as variable-width lines that grow with cumulative paper counts. ```{r eval = FALSE, warning = FALSE} traj_data <- birddog::detect_main_trajectories( groups_cumulative_trajectories, group = "c1g3" ) traj_filtered <- birddog::filter_trajectories( traj_data$trajectories, top_n = 3 ) birddog::plot_group_trajectories_lines_2d( traj_data = traj_data, traj_filtered = traj_filtered, title = "c1g3" ) birddog::plot_group_trajectories_lines_3d( traj_data = traj_data, traj_filtered = traj_filtered, group_id = "c1g3" ) ``` ![](group_trajectories_lines_2d-c1g3.png){width=99%} ![](group_trajectories_lines_3d-c1g3.png) ## Citation growth per document Track how individual documents accumulate citations over time to identify fast-growing papers. ```{r eval = FALSE} # ~11 min groups_cumulative_citations <- birddog::sniff_groups_cumulative_citations( groups, min_citations = 2 ) ``` ## Main Path Analysis Identify the key route through the citation network, revealing the most influential chain of documents over time. 
```{r eval = FALSE} groups_key_route <- birddog::sniff_key_route(groups, scope = "groups") groups_key_route[["c1g3"]]$plot groups_key_route[["c1g3"]]$data |> dplyr::select(-name) |> gt::gt() ``` ![](key-route-c1g3.png){width=99%} ```{r include = FALSE, } key_route_c1g3_data <- readRDS(file.path(local_data, "key_route_c1g3_data.rds")) ``` ```{r } key_route_c1g3_data |> dplyr::select(document = name, name2, title = TI) |> gt::gt() |> gt::text_transform( locations = gt::cells_body(columns = document), fn = function(x) { glue::glue('{x}') } ) ``` ## Topic modeling (STM) Detect topics within a group using Structural Topic Modeling, creating sub-groups based on linguistic similarities. ```{r eval = FALSE} # Prepare STM data (~30 min) groups_stm_prepare <- birddog::sniff_groups_stm_prepare( groups, group_to_stm = "c1g3" ) ``` ```{r include = FALSE, } groups_stm_prepare <- readRDS(file.path(local_data, "groups_stm_prepare.rds")) ``` **17 topics** is the best fit. ```{r eval = FALSE} groups_stm_prepare$plots[['metrics_by_k']] groups_stm_prepare$plots[['exclusivity_vs_coherence']] ``` ```{r eval = FALSE} # Run STM (~35 sec) groups_stm_run <- birddog::sniff_groups_stm_run( groups_stm_prepare, k_topics = 17, n_top_documents = 20 ) ``` ```{r eval = FALSE} groups_stm_run$topic_proportion |> dplyr::mutate(topic_proportion = round(topic_proportion, 3)) |> gt::gt() groups_stm_run$top_documents |> dplyr::group_by(topic) |> dplyr::arrange(dplyr::desc(gamma)) |> dplyr::slice_head(n = 3) |> dplyr::select(-DI) |> gt::gt() |> gt::text_transform( locations = gt::cells_body(columns = document), fn = function(x) { glue::glue('{x}') } ) ``` ## Session info ```{r} sessionInfo() ``` ## Hardware - Hostname: `r Sys.info()[["nodename"]]` - Processor: `r tryCatch(benchmarkme::get_cpu()[[2]], error = function(e) "N/A")` - RAM: `r tryCatch(paste0(round(as.numeric(benchmarkme::get_ram()) / 1073741824, digits = 1), " GB"), error = function(e) "N/A")` - Storage: 2 SSD’s in `raid0` for data and 1 SSD 
for the OS.