## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# Stop knitting early when 'bigmemory' is not installed; the rest of the
# script attaches it unconditionally.
if (!requireNamespace("bigmemory", quietly = TRUE)) {
  cat("This vignette requires the 'bigmemory' package.\n")
  knitr::knit_exit()
}

library(bigKNN)
library(bigmemory)

## ----helpers, include=FALSE---------------------------------------------------
# Reshape a k-nearest-neighbour result into a long data frame.
#
# Arguments:
#   result    - object exposing `$k`, `$index` and `$distance`; row `i` of
#               `$index` / `$distance` is read for the i-th query.
#   query_ids - labels for the queries, one per row of the result.
#
# Returns the per-query data frames (columns `query`, `rank`, `neighbor`,
# `distance`) stacked with rbind(); distances are rounded to 5 significant
# digits for display.
knn_table <- function(result, query_ids) {
  do.call(rbind, lapply(seq_along(query_ids), function(i) {
    data.frame(
      query = query_ids[i],
      rank = seq_len(result$k),
      neighbor = result$index[i, ],
      distance = signif(result$distance[i, ], 5),
      row.names = NULL
    )
  }))
}

# Extract the matches of the i-th query from flattened radius-search output.
#
# Arguments:
#   index, distance - flat vectors holding the matches of all queries.
#   offset          - vector in which `offset[i]` is the first position of
#                     query `i` and `offset[i + 1] - 1` is its last position.
#   query_ids       - labels for the queries.
#   i               - position of the query to extract.
#
# Returns a data frame with columns `query`, `neighbor` and `distance`; it has
# zero rows when the slice for query `i` is empty.
radius_slice_table <- function(index, distance, offset, query_ids, i) {
  start <- as.integer(offset[i])
  end <- as.integer(offset[i + 1L] - 1L)

  if (start > end) {
    # Every column must have length zero here: pairing a length-1 `query`
    # with zero-length columns makes data.frame() fail with "arguments imply
    # differing number of rows". `query_ids[0L]` keeps the column's type.
    return(data.frame(
      query = query_ids[0L],
      neighbor = integer(0),
      distance = numeric(0)
    ))
  }

  data.frame(
    query = query_ids[i],
    neighbor = as.integer(index[start:end]),
    distance = signif(as.numeric(distance[start:end]), 5),
    row.names = NULL
  )
}

## ----create-data--------------------------------------------------------------
# Deterministic 160 x 4 reference matrix built from modular patterns.
i <- seq_len(160)
reference_matrix <- cbind(
  x1 = i,
  x2 = (i %% 7) + 1,
  x3 = (i %% 11) + 0.5,
  x4 = (i %% 13) + 2
)
reference <- as.big.matrix(reference_matrix)

# Four queries, each a small perturbation of one reference row.
dense_query <- rbind(
  reference_matrix[5, ] + c(0.2, 0.0, 0.1, 0.0),
  reference_matrix[50, ] + c(-0.3, 0.2, 0.0, 0.1),
  reference_matrix[120, ] + c(0.4, -0.1, 0.2, 0.0),
  reference_matrix[151, ] + c(0.1, 0.2, -0.2, 0.3)
)
query_ids <- paste0("q", seq_len(nrow(dense_query)))

dim(reference_matrix)
dense_query

## ----build-plan---------------------------------------------------------------
plan <- knn_plan_bigmatrix(
  reference,
  metric = "euclidean",
  memory_budget = "64KB",
  num_threads = 2L,
  progress = FALSE
)
plan

## ----plan-comparison----------------------------------------------------------
# Same settings as `plan`, but with a smaller and a larger memory budget, so
# the reported budget and block size can be compared side by side.
plan_small <- knn_plan_bigmatrix(
  reference,
  metric = "euclidean",
  memory_budget = "4KB",
  num_threads = 2L,
  progress = FALSE
)
plan_large <- knn_plan_bigmatrix(
  reference,
  metric = "euclidean",
  memory_budget = "1MB",
  num_threads = 2L,
  progress = FALSE
)

data.frame(
  memory_budget = c(
    plan_small$memory_budget,
    plan$memory_budget,
    plan_large$memory_budget
  ),
  block_size = c(
    plan_small$block_size,
    plan$block_size,
    plan_large$block_size
  ),
  row.names = NULL
)

## ----planned-search-----------------------------------------------------------
planned_knn <- knn_bigmatrix(
  reference,
  query = dense_query,
  k = 3,
  plan = plan,
  exclude_self = FALSE
)
planned_knn
knn_table(planned_knn, query_ids = query_ids)

## ----stream-knn---------------------------------------------------------------
# Destination big.matrix objects: one row per query, one column per neighbour.
index_store <- big.matrix(nrow(dense_query), 3, type = "integer")
distance_store <- big.matrix(nrow(dense_query), 3, type = "double")

streamed_knn <- knn_stream_bigmatrix(
  reference,
  query = dense_query,
  xpIndex = index_store,
  xpDistance = distance_store,
  k = 3,
  plan = plan,
  exclude_self = FALSE
)

bigmemory::as.matrix(streamed_knn$index)
round(bigmemory::as.matrix(streamed_knn$distance), 4)

## ----stream-knn-compare-------------------------------------------------------
# The streamed output is compared against the in-memory result from above.
identical(bigmemory::as.matrix(streamed_knn$index), planned_knn$index)
all.equal(bigmemory::as.matrix(streamed_knn$distance), planned_knn$distance)

## ----stream-radius-counts-----------------------------------------------------
radius_counts <- count_within_radius_bigmatrix(
  reference,
  query = dense_query,
  radius = 2.2,
  plan = plan,
  exclude_self = FALSE
)
radius_counts

# The total is used below to size the flat output stores.
total_matches <- sum(radius_counts)
total_matches

## ----stream-radius------------------------------------------------------------
# Single-column stores: one row per match for index/distance, and one more
# row than there are queries for the offsets.
radius_index_store <- big.matrix(total_matches, 1, type = "integer")
radius_distance_store <- big.matrix(total_matches, 1, type = "double")
radius_offset_store <- big.matrix(
  length(radius_counts) + 1L,
  1,
  type = "double"
)

streamed_radius <- radius_stream_bigmatrix(
  reference,
  query = dense_query,
  xpIndex = radius_index_store,
  xpDistance = radius_distance_store,
  xpOffset = radius_offset_store,
  radius = 2.2,
  plan = plan,
  exclude_self = FALSE
)

streamed_radius
streamed_radius$n_match

## ----stream-radius-offsets----------------------------------------------------
# Flatten the single-column stores into plain vectors before slicing.
radius_offset <- as.vector(bigmemory::as.matrix(streamed_radius$offset))
radius_index <- as.vector(bigmemory::as.matrix(streamed_radius$index))
radius_distance <- as.vector(bigmemory::as.matrix(streamed_radius$distance))

radius_offset
radius_slice_table(radius_index, radius_distance, radius_offset, query_ids, 1)
radius_slice_table(radius_index, radius_distance, radius_offset, query_ids, 2)

## ----sparse-queries-----------------------------------------------------------
# Repeat the planned search with the same queries stored as a sparse Matrix
# and compare against the dense result.
sparse_query <- Matrix::Matrix(dense_query, sparse = TRUE)

sparse_knn <- knn_bigmatrix(
  reference,
  query = sparse_query,
  k = 3,
  plan = plan,
  exclude_self = FALSE
)

identical(sparse_knn$index, planned_knn$index)
all.equal(sparse_knn$distance, planned_knn$distance)