## ----global_options, include = FALSE------------------------------------------
knitr::opts_chunk$set(fig.width = 8, fig.height = 3, fig.align = 'center',
                      echo = TRUE, warning = FALSE, message = FALSE,
                      eval = FALSE, tidy = FALSE)

## ----setup--------------------------------------------------------------------
# # The packages we will need
# # install.packages("dplyr")
# # install.packages("lubridate")
# # install.packages("ggplot2")
# # install.packages("tidync")
# # install.packages("doParallel")
# # install.packages("rerddap")
# # install.packages("stringr") # Only called via stringr:: below, so it need only be installed
# # install.packages("plyr") # Note that this library should never be loaded, only installed
# 
# # The packages we will use
# library(dplyr) # A staple for modern data management in R
# library(lubridate) # Useful functions for dealing with dates
# library(ggplot2) # The preferred library for data visualisation
# library(tidync) # For easily dealing with NetCDF data
# library(rerddap) # For easily downloading subsets of data
# library(doParallel) # For parallel processing

## ----erddap-info--------------------------------------------------------------
# # The information for the NOAA OISST data
# rerddap::info(datasetid = "ncdcOisst21Agg_LonPM180", url = "https://coastwatch.pfeg.noaa.gov/erddap/")
# 
# # Note that there is also a version with lon values from 0 to 360
# rerddap::info(datasetid = "ncdcOisst21Agg", url = "https://coastwatch.pfeg.noaa.gov/erddap/")

## ----download-func------------------------------------------------------------
# # This function downloads and prepares data based on user provided start and end dates
# OISST_sub_dl <- function(time_df){
#   OISST_dat <- rerddap::griddap(datasetx = "ncdcOisst21Agg_LonPM180",
#                                 url = "https://coastwatch.pfeg.noaa.gov/erddap/",
#                                 time = c(time_df$start, time_df$end),
#                                 zlev = c(0, 0),
#                                 latitude = c(-40, -35),
#                                 longitude = c(15, 21),
#                                 fields = "sst")$data |>
#     dplyr::mutate(time = base::as.Date(stringr::str_remove(time, "T12:00:00Z"))) |>
#     dplyr::rename(t = time, temp = sst, lon = longitude, lat = latitude) |>
#     dplyr::select(lon, lat, t, temp) |>
#     stats::na.omit()
# }

## ----year-index---------------------------------------------------------------
# # Date download range by start and end dates per year
# dl_years <- data.frame(date_index = 1:5,
#                        start = c("1982-01-01", "1990-01-01",
#                                  "1998-01-01", "2006-01-01", "2014-01-01"),
#                        end = c("1989-12-31", "1997-12-31",
#                                "2005-12-31", "2013-12-31", "2019-12-31"))

## ----download-data-------------------------------------------------------------
# # Download all of the data with one nested request
# # The time this takes will vary greatly based on connection speed
# base::system.time(
#   OISST_data <- dl_years |>
#     dplyr::group_by(date_index) |>
#     dplyr::group_modify(~OISST_sub_dl(.x)) |>
#     dplyr::ungroup() |>
#     dplyr::select(lon, lat, t, temp)
# ) # 518 seconds, ~100 seconds per batch

## ----SA-visual-----------------------------------------------------------------
# OISST_data |>
#   dplyr::filter(t == "2019-12-01") |>
#   ggplot2::ggplot(aes(x = lon, y = lat)) +
#   ggplot2::geom_tile(aes(fill = temp)) +
#   # ggplot2::borders() + # Activate this line to see the global map
#   ggplot2::scale_fill_viridis_c() +
#   ggplot2::coord_quickmap(expand = F) +
#   ggplot2::labs(x = NULL, y = NULL, fill = "SST (°C)") +
#   ggplot2::theme(legend.position = "bottom")

## ----prep-data-----------------------------------------------------------------
# # Save the data as an .Rds file because it has a much better compression rate than .RData
# base::saveRDS(OISST_data, file = "~/Desktop/OISST_vignette.Rds")
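
## ----read-data-----------------------------------------------------------------
# # A minimal sketch of picking the data up again in a later session
# # NB: The file path here simply mirrors the saveRDS() call above and should be
# # adjusted to wherever the file was actually saved
# OISST_data <- base::readRDS("~/Desktop/OISST_vignette.Rds")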
"~/Desktop/OISST_vignette.Rds") ## ----NOAA-info---------------------------------------------------------------- # # First we tell R where the data are on the interwebs # OISST_base_url <- "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/" # # Note that one may go to this URL in any web browser to manually inspect the files # # # Now we create a data.frame that contains all of the dates we want to download # # NB: In order to change the dates download changes the dates in the following line # OISST_dates <- base::data.frame(t = seq(as.Date("2019-12-01"), as.Date("2019-12-31"), by = "day")) # # # To finish up this step we add some text to those dates so they match the OISST file names # OISST_files <- OISST_dates |> # dplyr::mutate(t_day = base::gsub("-", "", t), # t_month = base::substr(t_day, 1, 6), # t_year = lubridate::year(t), # file_name = base::paste0(OISST_base_url, t_month, "/", "oisst-avhrr-v02r01.", t_day ,".nc")) ## ----NOAA-dl------------------------------------------------------------------ # # This function will go about downloading each day of data as a NetCDF file # # Note that this will download files into a 'data/OISST' folder in the root directory # # If this folder does not exist it will create it # # If it does not automatically create the folder it will need to be done manually # # The folder that is created must be a new folder with no other files in it # # A possible bug with netCDF files in R is they won't load correctly from # # existing folders with other file types in them # # This function will also check if the file has been previously downloaded # # If it has it will not download it again # OISST_url_daily_dl <- function(target_URL){ # base::dir.create("~/data/OISST", showWarnings = F) # file_name <- base::paste0("~/data/OISST/",base::sapply(base::strsplit(target_URL, split = "/"), "[[", 10)) # if(!base::file.exists(file_name)) utils::download.file(url = target_URL, method = "libcurl", destfile = file_name) # } # # # The more cores used, the faster the data may be downloaded # # It is best practice to not use all of the cores on one's machine # # The laptop on which I am running this code has 8 cores, so I use 7 here # doParallel::registerDoParallel(cores = 7) # # # And with that we are clear for take off # base::system.time(plyr::l_ply(OISST_files$file_name, .fun = OISST_url_daily_dl, .parallel = T)) # ~15 seconds # # # In roughly 15 seconds a user may have a full month of global data downloaded # # This scales well into years and decades, and is much faster with more cores # # Download speeds will also depend on the speed of the users internet connection ## ----NOAA-load---------------------------------------------------------------- # # This function will load and subset daily data into one data.frame # # Note that the subsetting by lon/lat is done before the data are loaded # # This means it will use much less RAM and is viable for use on most laptops # # Assuming one's study area is not too large # OISST_load <- function(file_name, lon1, lon2, lat1, lat2){ # OISST_dat <- tidync::tidync(file_name) |> # tidync::hyper_filter(lon = dplyr::between(lon, lon1, lon2), # lat = dplyr::between(lat, lat1, lat2)) |> # tidync::hyper_tibble(select_var = c("sst"), drop = FALSE) |> # dplyr::select(lon, lat, time, sst) |> # dplyr::rename(t = time, temp = sst) |> # dplyr::mutate(t = as.Date(t), # lon = as.numeric(lon), # lat = as.numeric(lat)) # return(OISST_dat) # } # # # Locate the files that will be loaded # OISST_files <- 
dir("~/data/OISST", full.names = T) # # # Load the data in parallel # OISST_dat <- plyr::ldply(.data = OISST_files[1:365], .fun = OISST_load, .parallel = T, # lon1 = 270, lon2 = 320, lat1 = 30, lat2 = 50) # # # It should only take a few seconds to load one month of data depending on the size of the lon/lat extent chosen ## ----NOAA-visual-------------------------------------------------------------- # OISST_dat |> # dplyr::filter(t == "2019-12-01") |> # ggplot2::ggplot(aes(x = lon, y = lat)) + # ggplot2::geom_tile(aes(fill = temp)) + # ggplot2::scale_fill_viridis_c() + # ggplot2::coord_quickmap(expand = F) + # ggplot2::labs(x = NULL, y = NULL, fill = "SST (°C)") + # ggplot2::theme(legend.position = "bottom")