## ----setup, include=FALSE-----------------------------------------------------
# Collapse source and output in rendered chunks and prefix output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# bigmemory is checked before anything is attached; when it is missing the
# vignette prints a short notice and stops knitting at this point.
if (!requireNamespace("bigmemory", quietly = TRUE)) {
  cat("This vignette requires the 'bigmemory' package.\n")
  knitr::knit_exit()
}

library(bigKNN)
library(bigmemory)

## ----helpers, include=FALSE---------------------------------------------------
# Turn a neighbour index matrix into a long data frame for display.
#
# Arguments:
#   index     Matrix of positions into `ref_ids`; row i belongs to
#             `query_ids[i]` and column j is the rank-j neighbour.
#   query_ids Labels for the queries, one per row of `index`.
#   ref_ids   Labels for the reference rows, looked up by the values in `index`.
#   distance  Optional matrix indexed like `index`; when supplied, a `distance`
#             column rounded to 5 significant digits is appended.
#
# Returns a data frame with columns `query`, `rank`, `neighbor` (and
# `distance` when given), with the per-query pieces stacked in query order.
# With no queries there is nothing to stack and the result is NULL.
neighbor_table <- function(index, query_ids, ref_ids, distance = NULL) {
  rows_for_query <- function(i) {
    rows <- data.frame(
      query = query_ids[i],
      rank = seq_len(ncol(index)),
      neighbor = ref_ids[index[i, ]],
      row.names = NULL
    )
    if (!is.null(distance)) {
      rows$distance <- signif(distance[i, ], 5)
    }
    rows
  }

  # One data frame per query, collected in a preallocated list and then
  # bound together in a single rbind call.
  pieces <- vector("list", length(query_ids))
  for (i in seq_along(query_ids)) {
    pieces[[i]] <- rows_for_query(i)
  }
  do.call(rbind, pieces)
}

## ----create-data--------------------------------------------------------------
# Eight labelled reference points in three dimensions.
reference_points <- data.frame(
  id = paste0("r", 1:8),
  x1 = c(0.0, 0.2, 1.0, 1.2, 3.0, 3.2, 4.0, 4.2),
  x2 = c(0.0, 0.1, 0.9, 1.0, 3.0, 3.1, 4.0, 4.1),
  x3 = c(0.5, 0.4, 1.2, 1.1, 2.8, 2.9, 3.8, 3.9)
)

# Three labelled query points with the same coordinate columns.
query_points <- data.frame(
  id = paste0("q", 1:3),
  x1 = c(0.1, 1.1, 3.1),
  x2 = c(0.0, 1.0, 3.0),
  x3 = c(0.45, 1.15, 2.85)
)

# Only the coordinate columns go into the matrices; the id column is kept in
# the data frames for labelling. The reference set becomes a big.matrix, the
# queries stay an ordinary matrix.
reference <- as.big.matrix(
  as.matrix(reference_points[, c("x1", "x2", "x3")])
)
query_matrix <- as.matrix(query_points[, c("x1", "x2", "x3")])

reference_points
query_points

## ----exact-truth--------------------------------------------------------------
# Exact Euclidean 3-nearest-neighbour search, used below as ground truth.
exact <- knn_bigmatrix(
  reference,
  query = query_matrix,
  k = 3,
  metric = "euclidean",
  exclude_self = FALSE
)

exact

neighbor_table(
  exact$index,
  query_ids = query_points$id,
  ref_ids = reference_points$id,
  distance = exact$distance
)

## ----approx-top3--------------------------------------------------------------
# Hand-written stand-in for an approximate top-3 result: one row per query,
# each entry a row position in the reference set.
approx_top3 <- matrix(
  c(
    8, 3, 1,
    7, 4, 2,
    6, 2, 5
  ),
  nrow = 3,
  byrow = TRUE
)

neighbor_table(
  approx_top3,
  query_ids = query_points$id,
  ref_ids = reference_points$id
)

## ----recall-------------------------------------------------------------------
# Score the approximate neighbours against the exact result at k = 3.
recall_before <- recall_against_exact(exact, approx_top3, k = 3)

recall_before
# Per-query recall of the approximate top-3, labelled with the query ids.
data.frame(
  query = query_points$id,
  recall = recall_before$per_query,
  row.names = NULL
)

## ----candidate-pool-----------------------------------------------------------
# Wider candidate pool: five candidate reference rows per query. The first
# three columns repeat the approximate top-3 used earlier in the vignette.
candidate_pool <- matrix(
  c(
    8, 3, 1, 2, 6,
    7, 4, 2, 3, 1,
    6, 2, 5, 7, 1
  ),
  nrow = 3,
  byrow = TRUE
)

neighbor_table(
  candidate_pool,
  query_ids = query_points$id,
  ref_ids = reference_points$id
)

## ----rerank-------------------------------------------------------------------
# Rerank the five candidates per query under the Euclidean metric and keep
# the best three.
reranked <- rerank_candidates_bigmatrix(
  reference,
  query = query_matrix,
  candidate_index = candidate_pool,
  metric = "euclidean",
  top_k = 3
)

reranked

neighbor_table(
  reranked$index,
  query_ids = query_points$id,
  ref_ids = reference_points$id,
  distance = reranked$distance
)

## ----compare-before-after-----------------------------------------------------
# Overall recall at k = 3 before and after reranking the 5-candidate pool.
recall_after <- recall_against_exact(exact, reranked, k = 3)

data.frame(
  stage = c("Approximate top-3", "Reranked top-3 from 5 candidates"),
  overall_recall = c(recall_before$overall, recall_after$overall),
  row.names = NULL
)

## ----rerank-limit-------------------------------------------------------------
# Same reranking call, but the candidates are only the original approximate
# top-3, so the pool offers no additional reference rows to choose from.
reranked_limited <- rerank_candidates_bigmatrix(
  reference,
  query = query_matrix,
  candidate_index = approx_top3,
  metric = "euclidean",
  top_k = 3
)

recall_after_limited <- recall_against_exact(exact, reranked_limited, k = 3)

data.frame(
  stage = c("Approximate top-3", "Reranked top-3 from same 3 candidates"),
  overall_recall = c(recall_before$overall, recall_after_limited$overall),
  row.names = NULL
)