---
title: "1. Data preparation"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{1. Data preparation}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
---

```{r, include = FALSE}
# Optional dependencies: chunks that need them are gated on these flags via
# their `eval` option, so the vignette still builds when they are absent.
has_terra <- requireNamespace("terra", quietly = TRUE)
has_ggplot2 <- requireNamespace("ggplot2", quietly = TRUE)

# Defaults applied to every later chunk: merge source and output into one
# block, prefix printed output with "#>", and set the default figure size.
knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>",
  fig.width = 7, fig.height = 5, out.width = "90%"
)
```

`prepare_bean()` cleans raw occurrence records and attaches environmental
values to each point. The function:

1. drops records with missing coordinates;
2. optionally standardises (`"scale"`) or PCA-rotates (`"pca"`) the
   environmental rasters before extraction;
3. extracts environmental values for every occurrence;
4. drops records that fall outside the raster extent.

## Load raw data

```{r, eval = has_terra, message = FALSE, warning = FALSE}
library(bean)
library(terra)

# Locate the example files shipped with the package. `mustWork = TRUE` makes a
# missing file fail right here with a clear error, instead of handing an empty
# path ("") to the readers below.
occ_file <- system.file(
  "extdata", "Rusa_unicolor.csv",
  package = "bean", mustWork = TRUE
)
env_file <- system.file(
  "extdata", "thai_env.tif",
  package = "bean", mustWork = TRUE
)

# Raw occurrence table and the environmental raster.
occ_data_raw <- read.csv(occ_file)
env <- terra::rast(env_file)

# Preview the first records and print the raster summary.
head(occ_data_raw)
env
```

## Visualise the environmental layers

```{r, eval = has_terra, fig.width = 9, fig.height = 7}
# Plot the environmental raster with custom plot margins.
panel_margins <- c(1.5, 1.5, 2, 4)
plot(env, mar = panel_margins)
```

## Visualise the raw occurrence points

The raw points are clearly clustered along roads and around cities — a
classic example of spatial sampling bias.

```{r, eval = has_terra && has_ggplot2, fig.width = 7, fig.height = 6}
library(ggplot2)

# Background for the maps: the first environmental layer as an x/y/value
# data frame. `env_df` is reused by the later "prepared points" chunk.
env_df <- as.data.frame(env[[1]], xy = TRUE)
bg_layer <- names(env)[1]
n_raw <- nrow(occ_data_raw)

# Grey raster underneath, raw occurrence records on top.
raw_map <- ggplot(occ_data_raw, aes(x, y)) +
  geom_raster(data = env_df, aes(x, y, fill = .data[[bg_layer]])) +
  geom_point(alpha = 0.6, colour = "darkred", size = 1.4) +
  scale_fill_gradient(low = "grey95", high = "grey55", guide = "none") +
  coord_fixed() +
  labs(
    title = "Raw occurrence points",
    subtitle = sprintf("n = %d records", n_raw)
  ) +
  theme_classic()

raw_map
```

## Run `prepare_bean()`

```{r, eval = has_terra}
# Clean the raw records and extract environmental values; "scale" requests
# standardised rasters before extraction.
prepared <- prepare_bean(
  data = occ_data_raw, env_rasters = env,
  longitude = "x", latitude = "y",
  transform = "scale"
)

head(prepared)

# Summarise everything except the first three columns.
# NOTE(review): presumably those three are non-environmental (e.g. an
# identifier plus the coordinates) -- confirm against prepare_bean() output.
env_values <- prepared[, -seq_len(3)]
summary(env_values)
```

## Visualise the prepared points

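After cleaning, the records that survived are drawn in blue on top of the raw
records, which are drawn in red. Any point that still shows as red was
therefore dropped because it falls outside the raster extent. Records with
missing coordinates are dropped too, but they cannot be placed on the map.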

```{r, eval = has_terra && has_ggplot2, fig.width = 7, fig.height = 6}
# Compare raw and prepared records on the same background raster.
# Layer order matters: all raw points are drawn first in red, then the
# retained points are overplotted in blue, so only dropped records stay red.
# `env_df` is the background data frame built in the raw-points chunk.
ggplot() +
  geom_raster(data = env_df, aes(x, y, fill = .data[[names(env)[1]]])) +
  geom_point(data = occ_data_raw, aes(x, y),
             colour = "red", size = 1.4, alpha = 0.5) +
  geom_point(data = prepared, aes(x, y),
             colour = "#118ab2", size = 1.4, alpha = 0.8) +
  scale_fill_gradient(low = "grey95", high = "grey55", guide = "none") +
  coord_fixed() +
  # The title names the colours actually used above (raw points are red).
  labs(title = "Prepared occurrence points (blue) vs. raw (red)",
       subtitle = sprintf("retained %d of %d records",
                          nrow(prepared), nrow(occ_data_raw))) +
  theme_classic()
```

## The shipped, pre-computed dataset

For users without `terra`, the package also ships the result of running the
same pipeline on the bundled rasters:

```{r}
# Load the pre-computed dataset bundled with the package, then preview its
# first rows.
data("origin_dat_prepared", package = "bean")
head(origin_dat_prepared, n = 6L)
```

This is the object used in the next two vignettes.