% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cdisc_validate.R
\name{cdisc_compare}
\alias{cdisc_compare}
\title{Compare Two Datasets with CDISC Validation}
\usage{
cdisc_compare(
  df1,
  df2,
  domain = NULL,
  standard = NULL,
  id_vars = NULL,
  vars = NULL,
  ts_data = NULL,
  detect_outliers = FALSE,
  tolerance = 0,
  where = NULL
)
}
\arguments{
\item{df1}{First data frame to compare, or a file path (character string
ending in \code{.xpt}, \code{.sas7bdat}, \code{.csv}, or \code{.rds}).
When a file path is provided, the dataset is loaded automatically.
Domain is auto-detected from filename if not specified (e.g.,
\code{"dm.xpt"} sets domain to \code{"DM"}).}

\item{df2}{Second data frame to compare, or a file path.}

\item{domain}{Optional character string specifying the CDISC domain code or dataset name
(e.g., \code{"DM"}, \code{"AE"}, \code{"ADSL"}). Strongly recommended, because auto-detection can be
ambiguous for datasets that share common columns. If \code{NULL}, the domain is auto-detected from \code{df1}.}

\item{standard}{Optional character string: \code{"SDTM"} or \code{"ADaM"}. If \code{NULL}, the standard is auto-detected from \code{df1}.}

\item{id_vars}{Optional character vector of ID variable names (e.g.,
\code{c("USUBJID", "VISITNUM")}) used to match rows between datasets.
When provided, rows are joined by these keys instead of matched by position.
Unmatched rows are reported separately. When \code{NULL} (default) and
domain is known, CDISC-standard keys are auto-detected (e.g.,
\code{STUDYID}, \code{USUBJID}, and \code{<DOMAIN>SEQ} for SDTM). Only variables present in
both datasets are used. To add extra keys on top of the defaults, pass
\code{"+"} as the first element: e.g., \code{id_vars = c("+", "AETOXGR")} appends AETOXGR
to the standard keys. To override the defaults completely, omit the \code{"+"} element.}

\item{vars}{Optional character vector of variable names to compare. Only these columns are included in value comparison. Structural and CDISC validation still covers all columns.}

\item{ts_data}{Optional data frame of the TS (Trial Summary) domain.
When provided, CDISC standard versions (e.g., SDTM IG 3.4, ADaM IG 1.3)
are extracted and included in the results and reports. If NULL (default),
version information is omitted.}

\item{detect_outliers}{Logical. When \code{TRUE}, runs z-score outlier detection
(|z| > 3) on numeric columns and includes results in the output. Defaults to \code{FALSE}.}

\item{tolerance}{Numeric tolerance value for floating-point comparisons (default 0).
When tolerance > 0, numeric values are considered equal if their absolute
difference is within the tolerance threshold. Character and factor columns
always use exact matching regardless of tolerance.}

\item{where}{Optional filter expression as a string (e.g., \code{"AESEV == 'SEVERE'"}).
Applied to both datasets before comparison. Equivalent to a WHERE clause.}
}
\value{
A list containing:
\item{domain}{Character: detected or supplied CDISC domain}
\item{standard}{Character: detected or supplied CDISC standard (SDTM/ADaM)}
\item{nrow_df1}{Integer: number of rows in df1}
\item{ncol_df1}{Integer: number of columns in df1}
\item{nrow_df2}{Integer: number of rows in df2}
\item{ncol_df2}{Integer: number of columns in df2}
\item{id_vars}{Character vector of ID variables used for matching (NULL if
positional matching was used)}
\item{comparison}{Result of \code{\link[=compare_datasets]{compare_datasets()}} function}
\item{variable_comparison}{Result of \code{\link[=compare_variables]{compare_variables()}} function}
\item{metadata_comparison}{List of metadata differences: type_mismatches,
label_mismatches, length_mismatches, format_mismatches, column ordering}
\item{observation_comparison}{Result of \code{\link[=compare_observations]{compare_observations()}} if dimensions match,
otherwise NULL with explanatory message}
\item{unified_comparison}{Data frame combining attribute and value differences
per variable. Columns: variable, attribute, base_value, compare_value,
and optionally id columns and row when value differences exist}
\item{unmatched_rows}{List with df1_only and df2_only data frames of rows that
could not be matched by id_vars (NULL when id_vars is not used)}
\item{cdisc_validation_df1}{CDISC validation results for df1}
\item{cdisc_validation_df2}{CDISC validation results for df2}
\item{cdisc_conformance_comparison}{Data frame showing which CDISC issues are unique
to df1, unique to df2, or common to both}
\item{outlier_notes}{Data frame of z-score outliers (|z| > 3) found in
numeric columns of either dataset (NULL when detect_outliers is FALSE)}
\item{cdisc_version}{List of CDISC version information extracted from TS
domain (NULL when ts_data is not provided). See \code{\link[=extract_cdisc_version]{extract_cdisc_version()}}}
}
\description{
Flagship function that compares two datasets AND runs CDISC validation on both.
Combines dataset comparison with CDISC conformance analysis to provide comprehensive
insights into both differences and regulatory compliance.
}
\examples{
\donttest{
# Create sample SDTM DM domains
dm1 <- data.frame(
  STUDYID = "STUDY001",
  USUBJID = c("SUBJ001", "SUBJ002"),
  DMSEQ = c(1, 1),
  RACE = c("WHITE", "BLACK OR AFRICAN AMERICAN"),
  stringsAsFactors = FALSE
)

dm2 <- data.frame(
  STUDYID = "STUDY001",
  USUBJID = c("SUBJ001", "SUBJ003"),
  DMSEQ = c(1, 1),
  RACE = c("WHITE", "ASIAN"),
  ETHNIC = c("NOT HISPANIC", "NOT HISPANIC"),
  stringsAsFactors = FALSE
)

# Default matching (id_vars = NULL): with a known domain, CDISC-standard keys are auto-detected
result <- cdisc_compare(dm1, dm2, domain = "DM", standard = "SDTM")

# Key-based matching by ID variables
result <- cdisc_compare(dm1, dm2, domain = "DM", id_vars = c("USUBJID"))
names(result)
}
}
