## ----setup, include = FALSE---------------------------------------------------
# Global knitr chunk defaults for this document:
#   - prefix printed output with "#" and collapse source and output together
#   - evaluate and echo every chunk unless the chunk itself overrides it
#   - keep warnings and messages out of the rendered output
knitr::opts_chunk$set(
  comment = "#",
  collapse = TRUE,
  eval = TRUE,
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)

## ----Requisite Packages-------------------------------------------------------
library(e1071)
library(future)
library(modeltuning) # devtools::install_github("dmolitor/modeltuning")
library(parallelly)
library(paws)
library(rsample)
library(yardstick)

## ----Iris Big-----------------------------------------------------------------
# Build a larger binary-classification dataset from `iris`:
#   1. stack 10 copies of `iris` on top of each other,
#   2. add a small amount of random noise (`jitter()`, factor 0.1) to the four
#      numeric measurements so the copies are not exact duplicates,
#   3. recode `Species` as a two-level factor ("FALSE"/"TRUE") that flags the
#      virginica rows.
# NOTE: no seed is set in this script, so the jitter and the shuffle below
# give different results on every run.
iris_new <- do.call(
  what = rbind,
  args = replicate(n = 10, iris, simplify = FALSE)
) |>
  transform(
    Sepal.Length = jitter(Sepal.Length, factor = 0.1),
    Sepal.Width = jitter(Sepal.Width, factor = 0.1),
    Petal.Length = jitter(Petal.Length, factor = 0.1),
    Petal.Width = jitter(Petal.Width, factor = 0.1),
    Species = factor(Species == "virginica")
  )

# Shuffle the dataset. `sample.int(n)` draws a random permutation of 1..n and,
# unlike `1:n`, does not misbehave when the data frame has zero rows.
iris_new <- iris_new[sample.int(nrow(iris_new)), ]

# Quick overview of the four numeric columns
summary(iris_new[, 1:4])

## ----Grid Search--------------------------------------------------------------
# Splitter used by the grid search: partitions `data` into 5 cross-validation
# folds with `vfold_cv()` and returns a list holding the `in_id` element of
# each split (presumably the row indices of that fold's training portion --
# verify against the rsample documentation).
splitter_fn <- function(data) {
  folds <- vfold_cv(data, v = 5)
  lapply(folds$splits, function(split) split$in_id)
}

# Set up a grid search over SVM hyper-parameters. This call only constructs the
# search object; the models are estimated later via `iris_grid$fit()`.
iris_grid <- GridSearchCV$new(
  # Model-fitting function (presumably e1071::svm, attached above)
  learner = svm,
  # Candidate hyper-parameter values: 6 costs and 3 kernels.
  # NOTE(review): presumably every cost/kernel combination is tried -- confirm.
  tune_params = list(
    cost = c(0.01, 0.1, 0.5, 1, 3, 6),
    kernel = c("polynomial", "radial", "sigmoid")
  ),
  # Fixed arguments, presumably forwarded to `learner` on every fit -- confirm
  learner_args = list(
    scale = TRUE,
    type = "C-classification",
    probability = TRUE
  ),
  # Function that produces the CV folds (see `splitter_fn` above)
  splitter = splitter_fn,
  # Named scoring functions; the same names (accuracy, f_measure, auc) are
  # used as keys in `prediction_args` and `convert_predictions` below
  scorer = list(
    accuracy = accuracy_vec,
    f_measure = f_meas_vec,
    auc = roc_auc_vec
  ),
  # Per-scorer prediction arguments; only `auc` sets any (probability = TRUE)
  prediction_args = list(
    accuracy = NULL,
    f_measure = NULL,
    auc = list(probability = TRUE)
  ),
  # Per-scorer transformation applied to the predictions; only `auc` sets one
  convert_predictions = list(
    accuracy = NULL,
    f_measure = NULL,
    # Extract the "FALSE" column of the predictions' "probabilities" attribute.
    # NOTE(review): presumably "FALSE" is used because it is the first level of
    # `Species` and the scorer treats the first level as the event -- confirm.
    auc = function(.x) attr(.x, "probabilities")[, "FALSE"]
  ),
  # NOTE(review): presumably selects the parameter set with the highest score
  optimize_score = "max"
)

## ----N-Models-----------------------------------------------------------------
# Print the number of rows in the grid's `tune_params`, i.e. how many models
# the message says will be estimated.
cat(
  "We will estimate",
  nrow(iris_grid$tune_params),
  "SVM models\n"
)

## ----Launch Instances, eval = FALSE-------------------------------------------
# ec2_client <- ec2()
# 
# # Request Instances
# instance_req <- ec2_client$run_instances(
#   ImageId = "ami-06dd49fc9e3a5acee",
#   InstanceType = "t2.large",
#   KeyName = key_name,
#   MaxCount = 6,
#   MinCount = 6,
#   InstanceInitiatedShutdownBehavior = "terminate",
#   SecurityGroupIds = security_group,
#   # This names the instances
#   TagSpecifications = list(
#     list(
#       ResourceType = "instance",
#       Tags = list(
#         list(
#           Key = "Name",
#           Value = "Worker Node"
#         )
#       )
#     )
#   )
# )

## ----Wait for Instances, eval = FALSE-----------------------------------------
# # Define a small helper that returns the instance IDs from our request
# instance_ids <- function(response) {
#   vapply(response$Instances, function(i) i$InstanceId, character(1))
# }
# 
# # Wait for instances to all respond as 'running'
# while(
#   !all(
#     vapply(
#       ec2_client$
#       describe_instances(InstanceIds = instance_ids(instance_req))$
#       Reservations[[1]]$
#       Instances,
#       function(i) i$State$Name,
#       character(1)
#     ) == "running"
#   )
# ) {
#   Sys.sleep(5)
# }
# 
# # Rough heuristic -- give the instances an additional 45 seconds to initialize
# Sys.sleep(45)

## ----IPs, eval = FALSE--------------------------------------------------------
# # Get public IPs
# inst_public_ips <- vapply(
#   ec2_client$
#     describe_instances(InstanceIds = instance_ids(instance_req))$
#     Reservations[[1]]$
#     Instances,
#   function(i) i$PublicIpAddress,
#   character(1)
# )

## ----Compute Cluster, eval = FALSE--------------------------------------------
# cl <- makeClusterPSOCK(
#   workers = inst_public_ips,
#   user = "ubuntu",
#   rshopts = c("-o", "StrictHostKeyChecking=no",
#               "-o", "IdentitiesOnly=yes",
#               "-i", pem_fp), # Local filepath to private SSH key-pair
#   connectTimeout = 25,
#   tries = 3
# )

## ----Parallel plan, eval = FALSE----------------------------------------------
# plan(
#   list(
#     tweak(cluster, workers = cl),
#     multisession
#   )
# )

## ----Estimate Models----------------------------------------------------------
# Run the grid search on the shuffled data, modelling `Species` from all other
# columns, with `progress = TRUE` (presumably enables progress reporting).
# NOTE(review): the cluster and `plan()` chunks above are commented out
# (eval = FALSE), so as written this script sets no parallel plan before
# fitting -- confirm how the fit is expected to run in that case.
iris_grid_fitted <- iris_grid$fit(
  formula = Species ~ .,
  data = iris_new,
  progress = TRUE
)

## ----Best Model Info----------------------------------------------------------
# Pull the index of the best model and the metric values off the fitted object
best_idx <- iris_grid_fitted$best_idx
metrics <- iris_grid_fitted$metrics

# Report the best model's scores: accuracy and F-measure as percentages
# (rounded to 2 decimals), AUC on its raw scale (rounded to 4 decimals)
cat(
  " Accuracy:",
  round(metrics$accuracy[[best_idx]] * 100, digits = 2),
  "%\nF-Measure:",
  round(metrics$f_measure[[best_idx]] * 100, digits = 2),
  "%\n      AUC:",
  round(metrics$auc[[best_idx]], digits = 4),
  "\n"
)

params <- iris_grid_fitted$best_params

# Report the hyper-parameter values stored in `best_params`
cat(
  "  Optimal Cost:",
  params[["cost"]],
  "\nOptimal Kernel:",
  params[["kernel"]],
  "\n"
)

## ----Kill Instances, eval = FALSE---------------------------------------------
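# # NOTE: `stop_instances()` only stops the instances; stopped instances still
# # exist (and their EBS volumes are still billed). Use `terminate_instances()`
# # instead if the workers should be removed permanently.
# ec2_client$stop_instances(
#   InstanceIds = instance_ids(instance_req)
# )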