% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/obn_sketch.R
\name{ob_numerical_sketch}
\alias{ob_numerical_sketch}
\title{Optimal Binning for Numerical Variables using Sketch-based Algorithm}
\usage{
ob_numerical_sketch(
  feature,
  target,
  min_bins = 3,
  max_bins = 5,
  bin_cutoff = 0.05,
  max_n_prebins = 20,
  monotonic = TRUE,
  convergence_threshold = 1e-06,
  max_iterations = 1000,
  sketch_k = 200
)
}
\arguments{
\item{feature}{Numeric vector of feature values. Missing values (NA) are \strong{not
permitted} and will trigger an error. Infinite values (Inf, -Inf) and NaN are also
not allowed.}

\item{target}{Integer vector of binary target values (must contain only 0 and 1).
Must have the same length as \code{feature}. Missing values are not permitted.}

\item{min_bins}{Minimum number of bins (default: 3). Must be at least 2.}

\item{max_bins}{Maximum number of bins (default: 5). Must be >= \code{min_bins}.}

\item{bin_cutoff}{Minimum fraction of total observations per bin (default: 0.05).
Must be in (0, 1). Bins with fewer observations will be merged with neighbors.}

\item{max_n_prebins}{Maximum number of pre-bins to generate from quantiles (default: 20).
This parameter controls the initial granularity of binning candidates. Higher values
provide more flexibility but increase computational cost.}

\item{monotonic}{Logical flag to enforce WoE monotonicity (default: TRUE). Uses
PAVA (Pool Adjacent Violators Algorithm) for enforcement. Direction
(increasing/decreasing) is automatically detected from the data.}

\item{convergence_threshold}{Convergence threshold for IV change (default: 1e-6).
Optimization stops when the change in total IV between iterations falls below this value.}

\item{max_iterations}{Maximum iterations for bin optimization (default: 1000).
Prevents infinite loops in the optimization process.}

\item{sketch_k}{Integer parameter controlling sketch accuracy (default: 200).
Larger values improve quantile precision but increase memory usage.
\strong{Approximation error}: \eqn{\epsilon \approx 1/k} (e.g., \code{sketch_k = 200} gives roughly 0.5\% error).
\strong{Valid range}: [10, 1000]. Typical values: 50 (fast), 200 (balanced), 500 (precise).}
}
\value{
A list of class \code{c("OptimalBinningSketch", "OptimalBinning")} containing:
\describe{
  \item{id}{Numeric vector of bin identifiers (1-based indexing).}
  \item{bin_lower}{Numeric vector of lower bin boundaries (inclusive).}
  \item{bin_upper}{Numeric vector of upper bin boundaries (inclusive for last bin,
    exclusive for others).}
  \item{woe}{Numeric vector of Weight of Evidence values. Monotonic if
    \code{monotonic = TRUE}.}
  \item{iv}{Numeric vector of Information Value contributions per bin.}
  \item{count}{Integer vector of total observations per bin.}
  \item{count_pos}{Integer vector of positive class (target = 1) counts per bin.}
  \item{count_neg}{Integer vector of negative class (target = 0) counts per bin.}
  \item{cutpoints}{Numeric vector of bin split points (length = number of bins - 1).
    These are the internal boundaries between bins.}
  \item{converged}{Logical flag indicating whether optimization converged.}
  \item{iterations}{Integer number of optimization iterations performed.}
}
}
\description{
Implements optimal binning using the \strong{KLL Sketch} (Karnin, Lang, Liberty, 2016),
a probabilistic data structure for quantile approximation in data streams. This is
the \strong{only method in the package} built on a streaming algorithm rather than
on batch processing (as used by MOB, MDLP, etc.).

The sketch-based approach enables:
\itemize{
  \item \strong{Sublinear space complexity}: \eqn{O(k \log N)} vs. \eqn{O(N)} for batch methods
  \item \strong{Single-pass processing}: Suitable for streaming data
  \item \strong{Provable approximation guarantees}: Quantile error \eqn{\epsilon = O(1/k)}
}

The method combines KLL Sketch for candidate generation with either Dynamic Programming
(for small N <= 50) or greedy IV-based selection (for larger datasets), followed by
monotonicity enforcement via the Pool Adjacent Violators Algorithm (PAVA).
}
\details{
\strong{Algorithm Overview}

The sketch-based binning algorithm executes in four phases:

\strong{Phase 1: KLL Sketch Construction}

The KLL Sketch maintains a compressed, multi-level representation of the data distribution:

\deqn{\text{Sketch} = \{\text{Compactor}_0, \text{Compactor}_1, \ldots, \text{Compactor}_L\}}

where each \eqn{\text{Compactor}_\ell} stores items with weight \eqn{2^\ell}. When a
compactor exceeds capacity \eqn{k} (controlled by \code{sketch_k}), it is compacted.

\strong{Theoretical Guarantees} (Karnin et al., 2016):

For a quantile level \eqn{q \in [0, 1]} with estimated value \eqn{\hat{q}}, the rank error over \eqn{N} observations is bounded by:

\deqn{|\text{rank}(\hat{q}) - q \cdot N| \le \epsilon \cdot N}

where \eqn{\epsilon = O(1/k)} and space complexity is \eqn{O(k \log(N/k))}.

\strong{Phase 2: Candidate Extraction}

Approximately 40 quantiles are extracted from the sketch using a non-uniform grid
with higher resolution in distribution tails.

\strong{Phase 3: Optimal Cutpoint Selection}

For small datasets (N <= 50), Dynamic Programming maximizes total IV. For larger
datasets, a greedy IV-based selection is used.

\strong{Phase 4: Bin Refinement}

Bins are refined through frequency constraint enforcement, monotonicity enforcement
(if requested), and bin count optimization to minimize IV loss.

\strong{Computational Complexity}

\itemize{
  \item \strong{Time}: \eqn{O(N \log k + N \times C + k^2 \times I)}
  \item \strong{Space}: \eqn{O(k \log N)} for large N
}

\strong{When to Use Sketch-based Binning}

\itemize{
  \item \strong{Use}: Large datasets (\eqn{N > 10^6}) with memory constraints or streaming data
  \item \strong{Avoid}: Small datasets (\eqn{N < 1000}) where approximation error may dominate
}
}
\examples{
\donttest{
# Example 1: Basic usage with simulated data
set.seed(123)
feature <- rnorm(500, mean = 100, sd = 20)
target <- rbinom(500, 1, prob = plogis((feature - 100) / 20))

result <- ob_numerical_sketch(
  feature = feature,
  target = target,
  min_bins = 3,
  max_bins = 5
)

# Display results
print(data.frame(
  Bin = result$id,
  Count = result$count,
  WoE = round(result$woe, 4),
  IV = round(result$iv, 4)
))

# Example 2: Comparing different sketch_k values
set.seed(456)
x <- rnorm(1000, 50, 15)
y <- rbinom(1000, 1, prob = 0.3)

result_k50 <- ob_numerical_sketch(x, y, sketch_k = 50)
result_k200 <- ob_numerical_sketch(x, y, sketch_k = 200)

cat("K=50 IV:", sum(result_k50$iv), "\n")
cat("K=200 IV:", sum(result_k200$iv), "\n")
}

}
\references{
\itemize{
  \item Karnin, Z., Lang, K., & Liberty, E. (2016). "Optimal Quantile Approximation in
    Streams". \emph{Proceedings of the 57th Annual IEEE Symposium on Foundations of
    Computer Science (FOCS)}, 71-78. \doi{10.1109/FOCS.2016.20}
  \item Greenwald, M., & Khanna, S. (2001). "Space-efficient online computation of
    quantile summaries". \emph{ACM SIGMOD Record}, 30(2), 58-66.
    \doi{10.1145/376284.375670}
  \item Barlow, R. E., Bartholomew, D. J., Bremner, J. M., & Brunk, H. D. (1972).
    \emph{Statistical Inference Under Order Restrictions}. Wiley.
  \item Siddiqi, N. (2006). \emph{Credit Risk Scorecards: Developing and Implementing
    Intelligent Credit Scoring}. Wiley. \doi{10.1002/9781119201731}
}
}
\seealso{
\code{\link{ob_numerical_mdlp}}, \code{\link{ob_numerical_mblp}}
}
\author{
Lopes, J. E.
}
