This vignette demonstrates how to compute diagFDR
diagnostics from a generic mzIdentML file
(.mzid).
Many search engines can export identifications in mzIdentML format.
When explicit q-values and/or PEPs are not present in the mzIdentML,
diagFDR can:
Note: mzIdentML is flexible and tool-dependent. If the score CV term or decoy labeling is not encoded consistently, you may need to adjust
score_accession_preference,score_direction, and/ordecoy_regex.
This section provides a small toy dataset that mimics the
PSM-level competed universe returned by
read_dfdr_mzid() after choosing a score and reconstructing
TDC q-values.
library(diagFDR)
set.seed(3)
n <- 7000
pi_decoy <- 0.03
# Decoy indicator
is_decoy <- sample(c(FALSE, TRUE), n, replace = TRUE, prob = c(1 - pi_decoy, pi_decoy))
# Targets are a mixture: some null-like (close to decoys), some true (higher score)
# This yields realistic separation and non-trivial discoveries at 1% FDR.
is_true_target <- (!is_decoy) & (runif(n) < 0.30) # 30% of targets are "true"
is_null_target <- (!is_decoy) & (!is_true_target)
score <- numeric(n)
score[is_decoy] <- rnorm(sum(is_decoy), mean = 0.0, sd = 1.0)
score[is_null_target] <- rnorm(sum(is_null_target), mean = 0.2, sd = 1.0)
score[is_true_target] <- rnorm(sum(is_true_target), mean = 3.0, sd = 1.0)
toy <- data.frame(
id = paste0("psm", seq_len(n)),
is_decoy = is_decoy,
run = sample(paste0("run", 1:4), n, replace = TRUE),
score = score,
pep = NA_real_
)
# Score-based TDC q-values (higher score is better)
toy <- toy[order(toy$score, decreasing = TRUE), ]
toy$D_cum <- cumsum(toy$is_decoy)
toy$T_cum <- cumsum(!toy$is_decoy)
toy$FDR_hat <- (toy$D_cum + 1) / pmax(toy$T_cum, 1)
toy$q <- rev(cummin(rev(toy$FDR_hat)))
toy <- toy[, c("id","is_decoy","q","pep","run","score")]
x_toy <- as_dfdr_tbl(
toy,
unit = "psm",
scope = "global",
q_source = "toy TDC from score",
q_max_export = 1,
provenance = list(tool = "toy")
)
diag <- dfdr_run_all(
xs = list(mzid_PSM = x_toy),
alpha_main = 0.01,
alphas = c(1e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1),
eps = 0.2,
win_rel = 0.2,
truncation = "warn_drop",
low_conf = c(0.2, 0.5)
)diag$tables$headline
#> # A tibble: 1 × 24
#> alpha T_alpha D_alpha FDR_hat CV_hat FDR_minus1 FDR_plus1 FDR_minusK FDR_plusK
#> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.01 2819 27 0.00958 0.192 0.00922 0.00993 0.00603 0.0131
#> # ℹ 15 more variables: k2sqrtD <int>, FDR_minus2sqrtD <dbl>,
#> # FDR_plus2sqrtD <dbl>, list <chr>, D_alpha_win <int>, effect_abs <dbl>,
#> # IPE <dbl>, flag_Dalpha <chr>, flag_CV <chr>, flag_Dwin <chr>,
#> # flag_IPE <chr>, flag_FDR <chr>, flag_equalchance <chr>, status <chr>,
#> # interpretation <chr>
if (nrow(diag$tables$headline) > 0 && diag$tables$headline$T_alpha[1] == 0) {
cat("\nNote: No discoveries at alpha_main for this toy run. ",
"For demonstration, consider using alpha_main = 0.02.\n", sep = "")
}diag$plots$elasticity
#> Warning: Removed 3 rows containing missing values or values outside the scale range
#> (`geom_line()`).
#> Warning: Removed 3 rows containing missing values or values outside the scale range
#> (`geom_point()`).diag$tables$equal_chance_pooled
#> # A tibble: 1 × 12
#> qmax_export low_lo low_hi N_test N_D_test pi_D_hat effect_abs ci95_lo ci95_hi
#> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.2 0.5 0 0 NA NA NA NA
#> # ℹ 3 more variables: p_value_binom <dbl>, pass_minN <lgl>, list <chr>
diag$plots$equal_chance__mzid_PSMThe code below shows how to run diagFDR directly
from a .mzid file.
library(diagFDR)
mzid_path <- "path/to/search_results.mzid"
x_mzid <- read_dfdr_mzid(
mzid_path = mzid_path,
rank = 1L, # competed universe: take rank-1 SpectrumIdentificationItem
# Choose a score CV term (priority list) and interpret its direction
score_accession_preference = c(
"MS:1002257", # MS-GF:RawScore (higher is better)
"MS:1001330", # Mascot:score (higher is better)
"MS:1001328", # SEQUEST:xcorr (higher is better)
"MS:1002052",
"MS:1002049",
"MS:1001331",
"MS:1001171",
"MS:1001950",
"MS:1002466"
),
score_direction = "auto", # or "higher_better"/"lower_better" if auto fails
# TDC correction: FDR_hat = (D + add_decoy)/T
add_decoy = 1L,
# Strict by default: require score for all PSMs (set <1 to allow missing)
min_score_coverage = 1.0,
# Fallback decoy inference if PeptideEvidence@isDecoy is not informative
decoy_regex = "(^##|_REVERSED$|^REV_|^DECOY_)",
unit = "psm",
scope = "global",
provenance = list(file = basename(mzid_path))
)
diag <- dfdr_run_all(
xs = list(mzid_PSM = x_mzid),
alpha_main = 0.01,
alphas = c(1e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1),
eps = 0.2,
win_rel = 0.2,
truncation = "warn_drop",
low_conf = c(0.2, 0.5)
)
# Export outputs
dfdr_write_report(
diag,
out_dir = "diagFDR_mzid_out",
formats = c("csv", "png", "manifest", "readme", "summary")
)
# Render a single HTML report (requires rmarkdown in Suggests)
dfdr_render_report(diag, out_dir = "diagFDR_mzid_out")If you want to run the p-value diagnostics (calibration plots, Storey pi0, BH stability), you can map scores to pseudo-p-values using the empirical decoy null tail.
x_mzid$p <- score_to_pvalue(
score = x_mzid$score,
method = "decoy_ecdf",
is_decoy = x_mzid$is_decoy
)
attr(x_mzid, "meta")$p_source <- "score_to_pvalue(method='decoy_ecdf' on mzid score)"
diag_p <- dfdr_run_all(
xs = list(mzid_PSM = x_mzid),
alpha_main = 0.01
)
dfdr_write_report(
diag_p,
out_dir = "diagFDR_mzid_out_with_p",
formats = c("csv","png","manifest","readme","summary")
)
dfdr_render_report(diag_p, out_dir = "diagFDR_mzid_out_with_p")Score selection matters: mzIdentML files can
contain multiple score CV terms. If read_dfdr_mzid() picks
an unexpected score, adjust score_accession_preference
and/or set score_direction explicitly.
Decoy labeling matters: ideally, mzIdentML
includes PeptideEvidence@isDecoy. If not,
read_dfdr_mzid() falls back to decoy_regex
applied to protein accessions. If your pipeline uses a different decoy
naming convention, update decoy_regex.
Missing scores: if some PSMs are missing the
chosen score CV term, you can relax min_score_coverage
(e.g., 0.95) but then interpret results carefully, as the hypothesis
universe changes.
Scope: this vignette constructs a PSM-level universe. Peptide- or protein-level claims require additional aggregation/inference.