## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## ----data-types---------------------------------------------------------------
# library(ggmlR)
# 
# # Standard floating point
# GGML_TYPE_F32   # 32-bit float (4 bytes per element)
# GGML_TYPE_F16   # 16-bit float (2 bytes per element)
# 
# # Integer
# GGML_TYPE_I32   # 32-bit integer
# 
# # Quantized types
# GGML_TYPE_Q4_0  # 4-bit quantization, type 0
# GGML_TYPE_Q4_1  # 4-bit quantization, type 1
# GGML_TYPE_Q8_0  # 8-bit quantization

## ----memory-comparison---------------------------------------------------------
# ctx <- ggml_init(64 * 1024 * 1024)
# 
# # Create tensors of the same logical size with different types
# n <- 1000000  # 1M elements
# 
# f32_tensor <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n)
# f16_tensor <- ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n)
# q8_tensor  <- ggml_new_tensor_1d(ctx, GGML_TYPE_Q8_0, n)
# q4_tensor  <- ggml_new_tensor_1d(ctx, GGML_TYPE_Q4_0, n)
# 
# cat("Memory usage for", n, "elements:\n")
# cat("  F32: ", ggml_nbytes(f32_tensor) / 1024^2, "MB\n")
# cat("  F16: ", ggml_nbytes(f16_tensor) / 1024^2, "MB\n")
# cat("  Q8_0:", ggml_nbytes(q8_tensor) / 1024^2, "MB\n")
# cat("  Q4_0:", ggml_nbytes(q4_tensor) / 1024^2, "MB\n")
# 
# ggml_free(ctx)

## ----init-quant----------------------------------------------------------------
# # Initialize quantization tables (required before first use)
# ggml_quantize_init(GGML_TYPE_Q4_0)
# ggml_quantize_init(GGML_TYPE_Q8_0)

## ----quantize-data--------------------------------------------------------------
# ctx <- ggml_init(16 * 1024 * 1024)
# 
# # Create source data (F32)
# n <- 256  # Must be a multiple of the block size (32 for Q4_0)
# src <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n)
# ggml_set_f32(src, rnorm(n))
# 
# # Extract numeric data from the tensor
# src_data <- ggml_get_f32(src)
# 
# # Quantize to Q4_0
# quantized <- ggml_quantize_chunk(
#   type = GGML_TYPE_Q4_0,
#   src = src_data,
#   nrows = 1,
#   n_per_row = n
# )
# 
# cat("Original size:", length(src_data) * 4, "bytes\n")  # F32 = 4 bytes per element
# cat("Quantized size:", length(quantized), "bytes\n")
# cat("Compression ratio:", round(ggml_nbytes(src) / length(quantized), 1), "x\n")
# 
# ggml_free(ctx)

## ----dequantize------------------------------------------------------------------
# # Q4_0 dequantization
# q4_data <- quantized  # From the previous example
# dequantized <- dequantize_row_q4_0(q4_data, n)
# 
# # Compare with the original
# error <- mean(abs(src_data - dequantized))
# cat("Mean absolute error:", error, "\n")

## ----block-info------------------------------------------------------------------
# # Get block information for quantized types
# q4_info <- ggml_quant_block_info(GGML_TYPE_Q4_0)
# cat("Q4_0 block size:", q4_info$blck_size, "elements\n")
# cat("Q4_0 type size:", q4_info$type_size, "bytes per block\n")
# 
# q8_info <- ggml_quant_block_info(GGML_TYPE_Q8_0)
# cat("Q8_0 block size:", q8_info$blck_size, "elements\n")
# cat("Q8_0 type size:", q8_info$type_size, "bytes per block\n")
# 
# # Check whether a type is quantized
# cat("\nIs Q4_0 quantized?", ggml_is_quantized(GGML_TYPE_Q4_0), "\n")
# cat("Is F32 quantized?", ggml_is_quantized(GGML_TYPE_F32), "\n")
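## ----block-bytes-sketch----------------------------------------------------------
# # A minimal sketch of how the block info above translates into storage: a
# # quantized row is stored as n / blck_size blocks of type_size bytes each, so
# # for a 1-D tensor the expected total should match ggml_nbytes(). The chunk
# # name and the value of n are illustrative; n is assumed to be a multiple of
# # the Q4_0 block size.
# ctx <- ggml_init(16 * 1024 * 1024)
# 
# n <- 1024
# q4_info <- ggml_quant_block_info(GGML_TYPE_Q4_0)
# expected_bytes <- n / q4_info$blck_size * q4_info$type_size
# 
# q4_tensor <- ggml_new_tensor_1d(ctx, GGML_TYPE_Q4_0, n)
# cat("Expected bytes:", expected_bytes, "\n")
# cat("ggml_nbytes() :", ggml_nbytes(q4_tensor), "\n")
# 
# ggml_free(ctx)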
## ----compute-quantized-----------------------------------------------------------
# ctx <- ggml_init(32 * 1024 * 1024)
# 
# # Weight matrix for a neural-network-style layer
# weight_rows <- 256
# weight_cols <- 128
# 
# # In practice, you would load pre-quantized weights; here we create F32
# # weights, and the computation handles mixed types
# weights <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, weight_cols, weight_rows)
# input <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, weight_cols)
# 
# # Matrix-vector multiplication works with mixed types
# output <- ggml_mul_mat(ctx, weights, input)
# 
# graph <- ggml_build_forward_expand(ctx, output)
# 
# # Initialize data
# ggml_set_f32(weights, rnorm(weight_rows * weight_cols, sd = 0.1))
# ggml_set_f32(input, rnorm(weight_cols))
# 
# ggml_graph_compute(ctx, graph)
# 
# cat("Output shape:", ggml_tensor_shape(output), "\n")
# cat("Output sample:", head(ggml_get_f32(output), 5), "\n")
# 
# ggml_free(ctx)

## ----dequant-functions-----------------------------------------------------------
# # Standard quantization
# # dequantize_row_q4_0()  - 4-bit, type 0
# # dequantize_row_q4_1()  - 4-bit, type 1
# # dequantize_row_q5_0()  - 5-bit, type 0
# # dequantize_row_q5_1()  - 5-bit, type 1
# # dequantize_row_q8_0()  - 8-bit, type 0
# 
# # K-quants (better quality)
# # dequantize_row_q2_K()  - 2-bit K-quant
# # dequantize_row_q3_K()  - 3-bit K-quant
# # dequantize_row_q4_K()  - 4-bit K-quant
# # dequantize_row_q5_K()  - 5-bit K-quant
# # dequantize_row_q6_K()  - 6-bit K-quant
# # dequantize_row_q8_K()  - 8-bit K-quant
# 
# # I-quants (importance matrix)
# # dequantize_row_iq2_xxs(), dequantize_row_iq2_xs(), dequantize_row_iq2_s()
# # dequantize_row_iq3_xxs(), dequantize_row_iq3_s()
# # dequantize_row_iq4_nl(), dequantize_row_iq4_xs()
# 
# # Special types
# # dequantize_row_tq1_0()  - Ternary quantization
# # dequantize_row_tq2_0()

## ----imatrix------------------------------------------------------------------
# # Check whether a type requires an importance matrix
# cat("Q4_0 requires imatrix:", ggml_quantize_requires_imatrix(GGML_TYPE_Q4_0),
#     "\n")
# 
# # I-quants typically require an importance matrix for best results
# # The imatrix captures which weights are most important for model quality

## ----cleanup------------------------------------------------------------------
# # Free quantization tables
# ggml_quantize_free()
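## ----quant-error-sketch----------------------------------------------------------
# # A minimal, self-contained sketch comparing reconstruction error for two of
# # the quantization types listed above (Q4_0 vs Q8_0), using only functions
# # already shown in this vignette. The random data and the resulting error
# # values are illustrative; ggml_quantize_init() is called again here because
# # the tables were freed in the cleanup chunk above.
# ggml_quantize_init(GGML_TYPE_Q4_0)
# ggml_quantize_init(GGML_TYPE_Q8_0)
# 
# n <- 256  # multiple of the 32-element block size
# x <- rnorm(n)
# 
# q4 <- ggml_quantize_chunk(type = GGML_TYPE_Q4_0, src = x, nrows = 1, n_per_row = n)
# q8 <- ggml_quantize_chunk(type = GGML_TYPE_Q8_0, src = x, nrows = 1, n_per_row = n)
# 
# # Higher bit-width should reconstruct the data more closely
# cat("Q4_0 mean abs error:", mean(abs(x - dequantize_row_q4_0(q4, n))), "\n")
# cat("Q8_0 mean abs error:", mean(abs(x - dequantize_row_q8_0(q8, n))), "\n")
# 
# ggml_quantize_free()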