% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{bdRemoveMAF_hdf5}
\alias{bdRemoveMAF_hdf5}
\title{Remove SNPs Based on Minor Allele Frequency}
\usage{
bdRemoveMAF_hdf5(
  filename,
  group,
  dataset,
  outgroup,
  outdataset,
  maf,
  bycols,
  blocksize,
  overwrite = NULL
)
}
\arguments{
\item{filename}{Character string. Path to the HDF5 file.}

\item{group}{Character string. Path to the group containing input dataset.}

\item{dataset}{Character string. Name of the dataset to filter.}

\item{outgroup}{Character string. Output group path for filtered data.}

\item{outdataset}{Character string. Output dataset name for filtered data.}

\item{maf}{Numeric (optional). MAF threshold for filtering (0-1).
Default is 0.05. SNPs with MAF above this threshold are removed.}

\item{bycols}{Logical (optional). Whether to process by columns (TRUE) or
rows (FALSE). Default is FALSE.}

\item{blocksize}{Integer (optional). Block size for processing. Default is 100.
Larger values use more memory but may be faster.}

\item{overwrite}{Logical (optional). Whether to overwrite existing dataset.
Default is FALSE.}
}
\value{
A list with the following components (if an error occurs, all string values are returned as empty strings, ""):
\describe{
\item{fn}{Character string with the HDF5 filename}
\item{ds}{Character string with the full dataset path to the filtered dataset (group/dataset)}
\item{nremoved}{Integer with the number of SNPs removed by the Minor Allele Frequency (MAF) filter}
}
}
\description{
Filters SNPs (Single Nucleotide Polymorphisms) based on Minor Allele
Frequency (MAF) in genomic data stored in HDF5 format.
}
\details{
This function provides efficient MAF-based filtering capabilities with:
\itemize{
\item Filtering options:
\itemize{
\item MAF threshold-based filtering
\item Row-wise or column-wise processing
\item Block-based processing
}
\item Implementation features:
\itemize{
\item Memory-efficient processing
\item Block-based operations
\item Safe file operations
\item Progress reporting
}
}

The function supports both in-place modification and creation of new datasets.
}
\examples{
\dontrun{
library(BigDataStatMeth)

# Create test SNP data
snps <- matrix(sample(c(0, 1, 2), 1000, replace = TRUE,
                     prob = c(0.7, 0.2, 0.1)), 100, 10)

# Save to HDF5
fn <- "snp_data.hdf5"
bdCreate_hdf5_matrix(fn, snps, "genotype", "raw_snps",
                     overwriteFile = TRUE)

# Remove SNPs with high MAF
bdRemoveMAF_hdf5(
  filename = fn,
  group = "genotype",
  dataset = "raw_snps",
  outgroup = "genotype_filtered",
  outdataset = "filtered_snps",
  maf = 0.1,
  bycols = TRUE,
  blocksize = 50
)

# Cleanup
if (file.exists(fn)) {
  file.remove(fn)
}
}

}
\references{
\itemize{
\item The HDF Group. (2000-2010). HDF5 User's Guide.
\item Marees, A. T., et al. (2018). A tutorial on conducting genome-wide
association studies: Quality control and statistical analysis. International
Journal of Methods in Psychiatric Research, 27(2), e1608.
}
}
\seealso{
\itemize{
\item \code{\link{bdRemovelowdata_hdf5}} for removing low-representation SNPs
\item \code{\link{bdImputeSNPs_hdf5}} for imputing missing SNP values
}
}
