Working with Quantized Models

Introduction

Quantization reduces model size and memory usage by representing weights with fewer bits. GGML supports several quantization formats that trade a small, usually acceptable loss of accuracy for substantially lower memory requirements.
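
As a rough back-of-envelope sketch (assuming the standard GGML Q4_0 block layout, which stores each block of 32 values in 18 bytes: a 16-bit scale plus 32 packed 4-bit values), the per-element cost drops from 4 bytes to about 0.56 bytes:

# Approximate per-element storage cost, using the block layout described above
f32_bytes_per_element  <- 4
q4_0_bytes_per_element <- 18 / 32   # ~0.56 bytes
cat("Q4_0 compression vs F32:",
    round(f32_bytes_per_element / q4_0_bytes_per_element, 1), "x\n")  # ~7.1x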

Supported Data Types

ggmlR supports the standard GGML data types; the most commonly used ones are:

library(ggmlR)

# Standard floating point
GGML_TYPE_F32   # 32-bit float (4 bytes per element)
GGML_TYPE_F16   # 16-bit float (2 bytes per element)

# Integer
GGML_TYPE_I32   # 32-bit integer

# Quantized types
GGML_TYPE_Q4_0  # 4-bit quantization, type 0
GGML_TYPE_Q4_1  # 4-bit quantization, type 1
GGML_TYPE_Q8_0  # 8-bit quantization

Memory Savings

Quantization provides significant memory savings:

ctx <- ggml_init(64 * 1024 * 1024)

# Create tensors of the same logical size with different data types
n <- 1000000  # 1M elements

f32_tensor <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n)
f16_tensor <- ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n)
q8_tensor  <- ggml_new_tensor_1d(ctx, GGML_TYPE_Q8_0, n)
q4_tensor  <- ggml_new_tensor_1d(ctx, GGML_TYPE_Q4_0, n)

cat("Memory usage for", n, "elements:\n")
cat("  F32:", ggml_nbytes(f32_tensor) / 1024^2, "MB\n")
cat("  F16:", ggml_nbytes(f16_tensor) / 1024^2, "MB\n")
cat("  Q8_0:", ggml_nbytes(q8_tensor) / 1024^2, "MB\n")
cat("  Q4_0:", ggml_nbytes(q4_tensor) / 1024^2, "MB\n")

ggml_free(ctx)
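
With the standard GGML block layouts this should report roughly 3.8 MB for F32, 1.9 MB for F16, about 1.0 MB for Q8_0, and about 0.54 MB for Q4_0; the quantized formats cost slightly more than their raw bit width suggests because each block also stores a scale factor.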

Quantization Functions

Initialize Quantization Tables

Before quantizing, initialize the quantization tables:

# Initialize quantization (required before first use)
ggml_quantize_init(GGML_TYPE_Q4_0)
ggml_quantize_init(GGML_TYPE_Q8_0)

Quantize Data

Use ggml_quantize_chunk() to quantize floating-point data:

ctx <- ggml_init(16 * 1024 * 1024)

# Create source data (F32)
n <- 256  # Must be multiple of block size (32 for Q4_0)
src <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n)
ggml_set_f32(src, rnorm(n))

# Extract numeric data from tensor
src_data <- ggml_get_f32(src)

# Quantize to Q4_0
quantized <- ggml_quantize_chunk(
  type = GGML_TYPE_Q4_0,
  src = src_data,
  nrows = 1,
  n_per_row = n
)

cat("Original size:", length(src_data) * 4, "bytes\n")  # F32 = 4 bytes
cat("Quantized size:", length(quantized), "bytes\n")
cat("Compression ratio:", round(ggml_nbytes(src) / length(quantized), 1), "x\n")

ggml_free(ctx)

Dequantize Data

To convert quantized data back to float:

# Q4_0 dequantization
q4_data <- quantized  # From previous example
dequantized <- dequantize_row_q4_0(q4_data, n)

# Compare with original
error <- mean(abs(src_data - dequantized))
cat("Mean absolute error:", error, "\n")

Block Sizes and Alignment

Quantized types have specific block sizes:

# Get block information for quantized types
q4_info <- ggml_quant_block_info(GGML_TYPE_Q4_0)
cat("Q4_0 block size:", q4_info$blck_size, "elements\n")
cat("Q4_0 type size:", q4_info$type_size, "bytes per block\n")

q8_info <- ggml_quant_block_info(GGML_TYPE_Q8_0)
cat("Q8_0 block size:", q8_info$blck_size, "elements\n")
cat("Q8_0 type size:", q8_info$type_size, "bytes per block\n")

# Check if type is quantized
cat("\nIs Q4_0 quantized?", ggml_is_quantized(GGML_TYPE_Q4_0), "\n")
cat("Is F32 quantized?", ggml_is_quantized(GGML_TYPE_F32), "\n")

Using Quantized Tensors in Computations

GGML automatically handles dequantization during computation:

ctx <- ggml_init(32 * 1024 * 1024)

# Weight matrix for a small linear layer (e.g., in a neural network)
weight_rows <- 256
weight_cols <- 128

# In practice you would load pre-quantized weights; here we use F32 weights
# for simplicity. ggml_mul_mat accepts quantized weight types as well, and
# dequantization happens transparently during the computation.
weights <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, weight_cols, weight_rows)
input <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, weight_cols)

# Matrix-vector multiplication works with mixed types
output <- ggml_mul_mat(ctx, weights, input)

graph <- ggml_build_forward_expand(ctx, output)

# Initialize data
ggml_set_f32(weights, rnorm(weight_rows * weight_cols, sd = 0.1))
ggml_set_f32(input, rnorm(weight_cols))

ggml_graph_compute(ctx, graph)

cat("Output shape:", ggml_tensor_shape(output), "\n")
cat("Output sample:", head(ggml_get_f32(output), 5), "\n")

ggml_free(ctx)

Available Dequantization Functions

ggmlR provides dequantization for all GGML quantized types:

# Standard quantization
# dequantize_row_q4_0()  - 4-bit, type 0
# dequantize_row_q4_1()  - 4-bit, type 1
# dequantize_row_q5_0()  - 5-bit, type 0
# dequantize_row_q5_1()  - 5-bit, type 1
# dequantize_row_q8_0()  - 8-bit, type 0

# K-quants (better quality)
# dequantize_row_q2_K()  - 2-bit K-quant
# dequantize_row_q3_K()  - 3-bit K-quant
# dequantize_row_q4_K()  - 4-bit K-quant
# dequantize_row_q5_K()  - 5-bit K-quant
# dequantize_row_q6_K()  - 6-bit K-quant
# dequantize_row_q8_K()  - 8-bit K-quant

# I-quants (importance matrix)
# dequantize_row_iq2_xxs(), dequantize_row_iq2_xs(), dequantize_row_iq2_s()
# dequantize_row_iq3_xxs(), dequantize_row_iq3_s()
# dequantize_row_iq4_nl(), dequantize_row_iq4_xs()

# Special types
# dequantize_row_tq1_0()  - Ternary quantization
# dequantize_row_tq2_0()
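
All of these follow the same calling pattern as dequantize_row_q4_0() above: a vector of quantized bytes plus the number of elements to decode. A minimal K-quant sketch, assuming the GGML_TYPE_Q4_K constant is exported like the other type constants (K-quants use 256-element super-blocks, so the length must be a multiple of 256):

# Hypothetical Q4_K round trip (constant name and signature assumed as noted above)
ggml_quantize_init(GGML_TYPE_Q4_K)
q4k_bytes <- ggml_quantize_chunk(
  type = GGML_TYPE_Q4_K,
  src = src_data,   # 256 elements, from the quantization example above
  nrows = 1,
  n_per_row = n
)
values <- dequantize_row_q4_K(q4k_bytes, n)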

Importance Matrix Quantization

Some quantization types require an importance matrix for better quality:

# Check if type requires importance matrix
cat("Q4_0 requires imatrix:", ggml_quantize_requires_imatrix(GGML_TYPE_Q4_0),
    "\n")

# I-quants typically require importance matrix for best results
# The imatrix captures which weights are most important for model quality
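
For example, an I-quant type can be queried the same way; the GGML_TYPE_IQ2_XXS constant name below is an assumption, chosen to mirror the underlying GGML enum:

# Hypothetical check for an I-quant type (constant name assumed, see above)
cat("IQ2_XXS requires imatrix:",
    ggml_quantize_requires_imatrix(GGML_TYPE_IQ2_XXS), "\n")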

Cleanup

Always free quantization resources when done:

# Free quantization tables
ggml_quantize_free()

Performance Considerations

When to Use Quantization

Quantization pays off when weights dominate memory use: large models, memory-constrained hardware, or several models kept in memory at once. For small tensors, or when exact numerical results matter, stay with F32 or F16.

Choosing Quantization Type

  Type   Bits   Quality   Speed     Use Case
  Q8_0   8      High      Fast      When quality matters
  Q4_K   4      Good      Fast      Balanced choice
  Q4_0   4      Medium    Fastest   Maximum compression
  Q2_K   2      Lower     Fast      Extreme compression

Tips

  1. Start with Q4_K or Q5_K for a good balance of quality and size
  2. Use Q8_0 when quality is critical
  3. Test accuracy after quantization on your specific use case
  4. Align tensor sizes to block sizes for optimal performance (see the sketch after this list)
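
A small sketch of tip 4, rounding a length up to the next multiple of the block size using only functions shown earlier:

# Round a tensor length up to the next multiple of the Q4_0 block size
blck      <- ggml_quant_block_info(GGML_TYPE_Q4_0)$blck_size  # 32 for Q4_0
n_raw     <- 1000
n_aligned <- ceiling(n_raw / blck) * blck
cat("Aligned length:", n_aligned, "\n")  # 1024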

See Also