## ----import_library, warning = FALSE, message = FALSE-------------------------
library(o2geosocial)
## We used the data.table syntax throughout this example
library(data.table)
data("toy_outbreak_short")
print(toy_outbreak_short$cases, nrows = 10)
# Extract dataset
dt_cases <- toy_outbreak_short[["cases"]]
dt_cases <- dt_cases[order(Date), ]

## ----plot_ref, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap="Figure 2: Cluster size distribution of the simulated dataset"----
# Reference cluster size distribution
hist(table(dt_cases$cluster), breaks = 0:max(table(dt_cases$cluster)), 
     ylab = "Number of clusters", xlab = "Cluster size", main = "", las=1)

## ----distrib------------------------------------------------------------------
# Distribution of the latent period
f_dens <- dgamma(x = 1:100, scale = 0.43, shape = 26.83)
# Distribution of the generation time
w_dens <- dnorm(x = 1:100, mean = 11.7, sd = 2.0)

## ----age, message = FALSE, warning = FALSE------------------------------------
# From the list toy_outbreak_short  
a_dens <- toy_outbreak_short$age_contact
# Alternatively, from POLYMOD:
polymod_matrix <-
  t(socialmixr::contact_matrix(socialmixr::polymod, 
                               countries="United Kingdom",
                               age.limits=seq(0, 70, by=5),
                               missing.contact.age="remove",
                               estimated.contact.age="mean", 
                               missing.participant.age="remove")$matrix)
polymod_matrix <-data.table::as.data.table(polymod_matrix)
# Compute the proportion of connection to each age group
a_dens <- t(t(polymod_matrix)/colSums(polymod_matrix))

## ----distance_pop, warning = FALSE--------------------------------------------
# Extract all regions in the territory
dt_regions <- toy_outbreak_short[["dt_regions"]]
# Extract the population vector
pop_vect <- dt_regions$population
# Create the matrices of coordinates for each region (one "from"; one "to")
mat_dist_from <- matrix(c(rep(dt_regions$long, nrow(dt_regions)),
                          rep(dt_regions$lat, nrow(dt_regions))), ncol = 2)
mat_dist_to <- matrix(c(rep(dt_regions$long, each = nrow(dt_regions)), 
                        rep(dt_regions$lat, each = nrow(dt_regions))),
                      ncol = 2)
# Compute all the distances between the two matrices
all_dist <- geosphere::distGeo(mat_dist_from, mat_dist_to)
# Compile into a distance matrix
dist_mat <- matrix(all_dist/1000, nrow = nrow(dt_regions))
# Rename the matrix columns and rows, and the population vector
names(pop_vect) <- rownames(dist_mat) <- colnames(dist_mat) <- 
  dt_regions$region

## ----first_model, message = FALSE, warning = FALSE----------------------------
# Default moves, likelihoods and priors
moves <- custom_moves()
likelihoods <- custom_likelihoods()
priors <- custom_priors()
# Data and config, model 1
data1 <- outbreaker_data(dates = dt_cases$Date, #Onset dates
                         age_group = dt_cases$age_group, #Age group
                         region = dt_cases$Cens_tract, #Location
                         genotype = dt_cases$Genotype, #Genotype
                         w_dens = w_dens, #Serial interval
                         f_dens = f_dens, #Latent period
                         a_dens = a_dens, #Age stratified contact matrix
                         population = pop_vect, #Population 
                         distance = dist_mat #Distance matrix
)
config1 <- create_config(data = data1, 
                         n_iter = 5000, #Iteration number: main run
                         n_iter_import = 1500, #Iteration number: short run
                         burnin = 500, #burnin period: first run
                         outlier_relative = T, #Absolute / relative threshold 
                         outlier_threshold = 0.9 #Value of the threshold
)
# Run model 1
out1 <- outbreaker(data = data1, config = config1, moves = moves, 
                   priors = priors, likelihoods = likelihoods)
# Set data and config for model 2
data2 <- outbreaker_data(dates = dt_cases$Date, 
                         age_group = dt_cases$age_group,
                         region = dt_cases$Cens_tract,
                         genotype = dt_cases$Genotype, w_dens = w_dens, 
                         f_dens = f_dens, a_dens = a_dens,
                         population = pop_vect, distance = dist_mat,
                         import = dt_cases$import #Import status of the cases
)
config2 <- create_config(data = data2, 
                         find_import = FALSE, # No inference of import status
                         n_iter = 5000, 
                         sample_every = 50, # 1 in 50 iterations is kept
                         burnin = 500)
# Run model 2
out2 <- outbreaker(data = data2, config = config2, moves = moves, 
                   priors = priors, likelihoods = likelihoods)


## ----params, warning = FALSE--------------------------------------------------
burnin <- config1$burnin
# Summary parameters a and b
#Model 1
print(summary(out1, burnin = burnin)$a)
print(summary(out1, burnin = burnin)$b)
# Model 2
print(summary(out2, burnin = burnin)$a)
print(summary(out2, burnin = burnin)$b)

## ----plot_first, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap= "Figure 3: Comparison of inferred cluster size distribution with the reference data"----
# We create groups of cluster size: initialise the breaks for each group
group_cluster <- c(1, 2, 3, 5, 10, 15, 40, 100) - 1
# Reference data: h$counts
h <- hist(table(dt_cases$cluster), breaks = group_cluster, plot = FALSE)

# Grouped cluster size distribution in each run
clust_infer1 <- summary(out1, burnin = burnin, 
                        group_cluster = group_cluster)$cluster
clust_infer2 <- summary(out2, group_cluster = group_cluster, 
                        burnin = burnin)$cluster
# Merge inferred and reference cluster size distributions into one matrix
clust_size_matrix <- rbind(clust_infer1["Median",], clust_infer2["Median",],
                           h$counts)
# Plot  
b <- barplot(clust_size_matrix, names.arg = colnames(clust_infer1), las=1,
             ylab = "Number of clusters", xlab = "Cluster size", main = "", 
             beside = T, ylim = c(0, max(c(clust_infer1, clust_infer2))))
# Add the 50% CI
arrows(b[1,], clust_infer1["1st Qu.",], b[1,], clust_infer1["3rd Qu.",], 
       angle = 90, code = 3, length = 0.1)
arrows(b[2,], clust_infer2["1st Qu.",], b[2,], clust_infer2["3rd Qu.",], 
       angle = 90, code = 3, length = 0.1)
# Add legend
legend("topright", fill = grey.colors(3), bty = "n",
       legend = c("Inferred import status", 
                  "Known import status", "Simulated dataset"))

## ----index_infer, warning = FALSE---------------------------------------------
#' Title: Compute the proportion of iterations in the outbreaker() output 
#` where the inferred index matches the actual index in dt_cases
#'
#' @param dt_cases: reference dataset
#' @param out: Matrix output of outbreaker()
#' @param burnin: Numeric, length of the burnin phase
#'
#' @return Numeric vector showing the proportion of iterations pointing to
#' the correct index case
index_infer <- function(dt_cases, out, burnin){
  out_index <- out[out$step > burnin, grep("alpha", colnames(out))]
  ID_index <- matrix(dt_cases[unlist(out_index), ID], ncol = nrow(dt_cases))
  match_infer_data <- t(ID_index) == dt_cases$infector_ID
  match_infer_data[is.na(t(ID_index)) & is.na(dt_cases$infector_ID)] <- TRUE
  prop_correct <- rowSums(match_infer_data, na.rm = T)/ncol(match_infer_data)
  
  return(prop_correct)
}
# Same as index_infer, except it returns the proportion of inferred indexes
# who are in the same reference cluster as the case
index_clust <- function(dt_cases, out, burnin){
  out_index <- out[out$step > burnin, grep("alpha", colnames(out))]
  clust_index <- matrix(dt_cases[unlist(out_index), cluster], 
                        ncol = nrow(dt_cases))
  match_infer_data <- t(clust_index) == dt_cases$cluster
  match_infer_data <- match_infer_data[!is.na(dt_cases$infector_ID),]
  
  
  prop_correct <- rowSums(match_infer_data, na.rm = T)/ncol(match_infer_data)
  
  return(prop_correct)
}
# Run index_infer for each model
index_infer1 <- index_infer(dt_cases = dt_cases, out = out1, burnin = burnin)
index_infer2 <- index_infer(dt_cases = dt_cases, out = out2, burnin = burnin)
# Run index_clust for each model
index_clust1 <- index_clust(dt_cases = dt_cases, out = out1, burnin = burnin)
index_clust2 <- index_clust(dt_cases = dt_cases, out = out2, burnin = burnin)

## ----plot index_infer, warning = FALSE, fig.width = 7, fig.height = 3, fig.cap = "Figure 4: Panel A: Proportion of iterations with the correct index for each case; Panel B: Proportion of iterations where the index is from the correct cluster"----
# Plot the sorted proportion in each model
oldpar <- par(no.readonly =TRUE)
par(bty = "n", mfrow = c(1, 2), mar = c(5,4,2,0), oma = c(0, 0, 0, 0))
# Panel A
plot(sort(index_infer1), type = "l", ylab = "Proportion", xlab = "Case", 
     main =  "A", las=1, col = grey.colors(3)[1], lwd = 3)
lines(sort(index_infer2), col = grey.colors(3)[2], lwd = 3)

# Panel B
plot(sort(index_clust1), type = "l", xlab = "Case", ylab = "", 
     main =  "B", las=1, col = grey.colors(3)[1], lwd = 3)
lines(sort(index_clust2), col = grey.colors(3)[2], lwd = 3)
legend("bottomright", col = grey.colors(3)[1:2], lwd = 3, bty = "n",
       legend = c("Inferred import status", 
                  "Known import status"))

## ----import_sf_file, message = FALSE, warning = FALSE, fig.width = 6.5, fig.height = 2.5----
library(ggplot2)
# Read the shapefile and create one map for each model
map1 <- tigris::tracts(state = "Ohio", class = "sf", progress_bar = FALSE)
map1$INTPTLON <- as.numeric(map1$INTPTLON)
map1$INTPTLAT <- as.numeric(map1$INTPTLAT)
map2 <- map1
map1$model <- "Model 1"
map2$model <- "Model 2"


## ----n_import_reg_per_case, warning = FALSE-----------------------------------
# Add the proportion of iteration in model 1 where each case is an import
dt_cases[, prop_infer1 := summary(out1, burnin = burnin)$tree$import]
# Add the proportion of iteration in model 2 where each case is an import
dt_cases[, prop_infer2 := summary(out2, burnin = burnin)$tree$import]

## ----n_import_reg_per_reg, warning = FALSE------------------------------------
# Number of import per region in model 1
prop_reg1 <- dt_cases[, .(prop_per_reg = sum(prop_infer1)), 
                      by = Cens_tract]$prop_per_reg
# Number of import per region in model 2
prop_reg2 <- dt_cases[, .(prop_per_reg = sum(prop_infer2)), 
                      by = Cens_tract]$prop_per_reg
names(prop_reg1) <- names(prop_reg2) <- unique(dt_cases$Cens_tract)

# Add the number of imports in each region to the maps
map1$prop_reg <- prop_reg1[as.character(map1$GEOID)]
map2$prop_reg <- prop_reg2[as.character(map2$GEOID)]

## ----create_map1, warning = FALSE, fig.width = 6.5, fig.height = 2.5, fig.cap = "Figure 5: Average number of import cases per census tract"----
# Merge maps
maps <- rbind(map1, map2)

limits_long <- c(-84, -82)
limits_lat <- c(40, 41.5)
maps <- maps[maps$INTPTLON > limits_long[1] & maps$INTPTLON < limits_long[2],]
maps <- maps[maps$INTPTLAT > limits_lat[1] & maps$INTPTLAT < limits_lat[2],]

# Plot: number of imports per region, two panels
ggplot(maps) +  geom_sf(aes(fill = prop_reg)) + facet_grid(~model)  +     
  scale_fill_gradient2(na.value = "lightgrey", midpoint = 0.8, 
                       breaks = c(0, 0.5, 1, 1.5), name = "Nb imports",
                       low = "white", mid = "lightblue", high = "darkblue") + 
  coord_sf(xlim = c(-83.8, -82.2), ylim = c(40.2, 41.3)) +
  theme_classic(base_size = 9)

## ----n_sec_region, warning = FALSE--------------------------------------------
#' Title: Compute the number of secondary cases per case in each region
#'
#' @param dt_cases: reference dataset
#' @param out: Matrix output of outbreaker()
#' @param burnin: Numeric, length of the burnin phase
#'
#' @return A numeric matrix: the first column is the census tract ID, the
#' other columns show the number of secondary cases per case. Each row 
#' corresponds to a different iteration.
n_sec_per_reg <- function(dt_cases, out, burnin){
  ## Number of secondary cases per case
  n_sec <- apply(out[out$step > burnin, grep("alpha", colnames(out))], 1, 
                 function(X){
                   X <- factor(X, 1:length(X))
                   return(table(X))})
  ## Aggregate by region
  tot_n_sec_reg <- aggregate(n_sec, list(dt_cases$Cens_tract), sum)
  ## Divide by the number of cases in each region
  tot_n_sec_reg <- cbind(tot_n_sec_reg[, 1], 
                         tot_n_sec_reg[, -1] / table(dt_cases$Cens_tract))
  return(tot_n_sec_reg)
}
  
n_sec_tot1 <- n_sec_per_reg(dt_cases = dt_cases, out = out1, burnin = burnin)
n_sec_tot2 <- n_sec_per_reg(dt_cases = dt_cases, out = out2, burnin = burnin)
n_sec1 <- apply(n_sec_tot1[,-1], 1, median)
n_sec2 <- apply(n_sec_tot2[,-1], 1, median)
names(n_sec1) <- names(n_sec2) <- unique(dt_cases$Cens_tract)

## ----add_variables_sec, warning = FALSE---------------------------------------
map1$n_sec <- as.numeric(n_sec1[as.character(map1$GEOID)])
map2$n_sec <- as.numeric(n_sec2[as.character(map2$GEOID)])

## ----create_maps2, warning = FALSE, fig.width = 6.5, fig.height = 2.5, fig.cap = "Figure 6: Median number of secondary transmission per case in each census tract"----
# Merge maps
maps <- rbind(map1, map2)

limits_long <- c(-84, -82)
limits_lat <- c(40, 41.5)
maps <- maps[maps$INTPTLON > limits_long[1] & maps$INTPTLON < limits_long[2],]
maps <- maps[maps$INTPTLAT > limits_lat[1] & maps$INTPTLAT < limits_lat[2],]

# Plot the geographical distribution of the number of secondary cases
ggplot(maps) +  geom_sf(aes(fill = n_sec)) + facet_grid(~model)  +     
  scale_fill_gradient2(na.value = "lightgrey", mid = "lightblue",
                       low = "white", midpoint = 1, high = "darkblue",
                       breaks = seq(0, 5, 0.5),name = "Sec cases") +
  coord_sf(xlim = c(-83.8, -82.2), ylim = c(40.2, 41.3)) +
  theme_classic(base_size = 9) 

## ----create_stouff_matrix, message = FALSE, warning = FALSE-------------------
# For every column of the distance matrix, use the cumulative sum of the 
# population vector ordered by the distance. Remove the values where 
# the distance between the regions is above gamma
dist_mat_stouffer <- apply(dist_mat, 2, function(X){
  pop_X <- cumsum(pop_vect[order(X)])
  omega_X <- pop_X[names(X)]
  # omega_X is set to -1 if the distance between two regions is above gamma
  omega_X[X > config1$gamma] <- -1
  return(omega_X)
})
# The new value of gamma is equal to the maximum of dist_mat_stouffer + 1
gamma <- max(dist_mat_stouffer) + 1
# The values previously set to -1 are now set to the new value of gamma
dist_mat_stouffer[dist_mat_stouffer == -1] <- max(dist_mat_stouffer) * 2

## ----moves_priors, message = FALSE, warning = FALSE---------------------------
moves3 <- custom_moves(a = cpp_stouffer_move_a)

f_null <- function(param) {
  return(0.0)
}
priors3 <- custom_priors(b = f_null)

## ----out_stouffer, message = FALSE, warning = FALSE---------------------------
# Set data and config lists
data3 <- outbreaker_data(dates = dt_cases$Date, #Onset dates
                         age_group = dt_cases$age_group, #Age group
                         region = dt_cases$Cens_tract, #Location
                         genotype = dt_cases$Genotype, #Genotype
                         w_dens = w_dens, #Serial interval
                         f_dens = f_dens, #Latent period
                         a_dens = a_dens, #Age stratified contact matrix
                         population = pop_vect, #Population 
                         distance = dist_mat_stouffer #Distance matrix
)
config3 <- create_config(data = data3, 
                         gamma = gamma,
                         move_b = FALSE, # b is not estimated
                         init_b = 0, 
                         spatial_method = "power-law",
                         n_iter = 5000, #Iteration number: main run
                         n_iter_import = 1500, #Iteration number: short run
                         burnin = 500, #burnin period: first run
                         outlier_relative = T, #Absolute / relative threshold
                         outlier_threshold = 0.9 #Value of the threshold
)
# Run the model using the Stouffer's rank method
out_stouffer <- outbreaker(data = data3, config = config3, moves = moves3, 
                           priors = priors3, likelihoods = likelihoods)

## ----stouffer_cluster, message = FALSE, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap = "Figure 7: Comparison of inferred cluster size distribution with the reference data"----
# Grouped cluster size distribution in the Stouffer's rank model
clust_infer_stouf <- summary(out_stouffer, burnin = burnin, 
                             group_cluster = group_cluster)$cluster
# Merge inferred and reference cluster size distributions
clust_size_matrix <- rbind(clust_infer_stouf["Median",], h$counts) 
# Plot the two distributions
b <- barplot(clust_size_matrix, names.arg = colnames(clust_infer_stouf), 
             beside = T)
# Add CIs
arrows(b[1,], clust_infer_stouf["1st Qu.",], b[1,], 
       clust_infer_stouf["3rd Qu.",], angle = 90, code = 3, length = 0.1)
legend("topright", fill = grey.colors(2), bty = "n",
       legend = c("Inferred import status, Stouffer's rank method", 
                  "Simulated dataset"))

## ----match_stouffer, message = FALSE, warning = FALSE, fig.width = 7, fig.height = 3, fig.cap = "Figure 8: Panel A: Proportion of iterations with the correct index for each case; Panel B: Proportion of iterations where the index is from the correct cluster"----
index_infer_stouf <- index_infer(dt_cases = dt_cases, out = out_stouffer, 
                                 burnin = config1$burnin)
index_clust_stouf <- index_clust(dt_cases = dt_cases, out = out_stouffer, 
                                 burnin = config1$burnin)
# Plot the sorted proportion in each model
par(bty = "n", mfrow = c(1, 2), mar = c(5,4,2,0), oma = c(0, 0, 0, 0))
# Panel A
plot(sort(index_infer_stouf), type = "l", xlab = "Case", ylab = "", 
     main =  "A", las=1, col = grey.colors(2)[1], lwd = 3)
# Panel B
plot(sort(index_clust_stouf), type = "l", ylab = "Proportion", xlab = "Case", 
     main =  "B", las=1, col = grey.colors(2)[1], lwd = 3)
par(oldpar)