4. Latent and Mixed-Scale Correlation

Scope

This vignette covers the latent-correlation estimators used when the observed data are binary, ordinal, or mixed. These methods do not target the same quantity as an ordinary Pearson correlation on coded categories. They are designed for settings where the observed variables are treated as thresholded versions of latent continuous variables.

The relevant functions are:

A small latent-data example

library(matrixCorr)

set.seed(30)
n <- 500
Sigma <- matrix(c(
  1.00, 0.55, 0.35, 0.20,
  0.55, 1.00, 0.40, 0.30,
  0.35, 0.40, 1.00, 0.45,
  0.20, 0.30, 0.45, 1.00
), 4, 4, byrow = TRUE)

Z <- matrix(rnorm(n * 4), n, 4) %*% chol(Sigma)

X_bin <- data.frame(
  b1 = Z[, 1] > qnorm(0.70),
  b2 = Z[, 2] > qnorm(0.55),
  b3 = Z[, 3] > qnorm(0.50)
)

X_ord <- data.frame(
  o1 = ordered(cut(Z[, 2], breaks = c(-Inf, -0.5, 0.4, Inf),
    labels = c("low", "mid", "high")
  )),
  o2 = ordered(cut(Z[, 3], breaks = c(-Inf, -1, 0, 1, Inf),
    labels = c("1", "2", "3", "4")
  ))
)

X_cont <- data.frame(x1 = Z[, 1], x2 = Z[, 4])

Binary-binary and ordinal-ordinal settings

tetrachoric() is used for binary variables. polychoric() is used for ordered categorical variables.

fit_tet <- tetrachoric(X_bin, ci = TRUE, p_value = TRUE)
fit_pol <- polychoric(X_ord, ci = TRUE, p_value = TRUE)

print(fit_tet, digits = 2)
#> Tetrachoric correlation matrix
#>   method      : tetrachoric
#>   dimensions  : 3 x 3
#>   ci          : yes
#>   p_values    : yes
#> 
#>      b1   b2   b3
#> b1 1.00 0.62 0.42
#> b2 0.62 1.00 0.48
#> b3 0.42 0.48 1.00
summary(fit_pol)
#> Polychoric correlation summary
#>   output      : matrix
#>   dimensions  : 2 x 2
#>   retained_pairs: 1
#>   threshold   : 0.0000
#>   diag        : included
#>   estimate    : 0.4757
#>   thresholds  : 2 set(s)
#>   inference   : wald_z_polychoric
#>   ci          : 95%
#>   ci_method   : wald_information_polychoric
#>   ci_width    : 0.164
#> 
#>  item1 item2 estimate n   95% CI           p_value
#>  o1    o2    0.4757   500 [0.3940, 0.5575] 0.0000
estimate(fit_tet)
#>           b1        b2        b3
#> b1 1.0000000 0.6166173 0.4201956
#> b2 0.6166173 1.0000000 0.4750753
#> b3 0.4201956 0.4750753 1.0000000
confint(fit_tet)
#>   item1 item2       lwr       upr
#> 1    b1    b2 0.5119264 0.7213082
#> 2    b1    b3 0.2922621 0.5481292
#> 3    b2    b3 0.3596730 0.5904775
ci(fit_tet)
#> $est
#>           b1        b2        b3
#> b1 1.0000000 0.6166173 0.4201956
#> b2 0.6166173 1.0000000 0.4750753
#> b3 0.4201956 0.4750753 1.0000000
#> attr(,"method")
#> [1] "tetrachoric"
#> attr(,"description")
#> [1] "Pairwise tetrachoric correlation matrix"
#> attr(,"package")
#> [1] "matrixCorr"
#> attr(,"output")
#> [1] "matrix"
#> attr(,"threshold")
#> [1] 0
#> attr(,"diag")
#> [1] TRUE
#> attr(,"diagnostics")
#> attr(,"diagnostics")$zero_cells
#>    b1 b2 b3
#> b1  0  0  0
#> b2  0  0  0
#> b3  0  0  0
#> 
#> attr(,"diagnostics")$n_complete
#>     b1  b2  b3
#> b1 500 500 500
#> b2 500 500 500
#> b3 500 500 500
#> 
#> attr(,"diagnostics")$corrected
#>       b1    b2    b3
#> b1 FALSE FALSE FALSE
#> b2 FALSE FALSE FALSE
#> b3 FALSE FALSE FALSE
#> 
#> attr(,"diagnostics")$boundary
#>       b1    b2    b3
#> b1 FALSE FALSE FALSE
#> b2 FALSE FALSE FALSE
#> b3 FALSE FALSE FALSE
#> 
#> attr(,"diagnostics")$near_boundary
#>       b1    b2    b3
#> b1 FALSE FALSE FALSE
#> b2 FALSE FALSE FALSE
#> b3 FALSE FALSE FALSE
#> 
#> attr(,"diagnostics")$converged
#>      b1   b2   b3
#> b1 TRUE TRUE TRUE
#> b2 TRUE TRUE TRUE
#> b3 TRUE TRUE TRUE
#> 
#> attr(,"diagnostics")$optimizer_tol
#> [1] 1e-08
#> 
#> attr(,"corr_result")
#> [1] TRUE
#> attr(,"corr_output_class")
#> [1] "corr_matrix"
#> attr(,"corr_estimator_class")
#> [1] "tetrachoric_corr"
#> attr(,"corr_dim")
#> [1] 3 3
#> attr(,"corr_dimnames")
#> attr(,"corr_dimnames")[[1]]
#> [1] "b1" "b2" "b3"
#> 
#> attr(,"corr_dimnames")[[2]]
#> [1] "b1" "b2" "b3"
#> 
#> attr(,"corr_symmetric")
#> [1] TRUE
#> attr(,"thresholds")
#> attr(,"thresholds")$b1
#> [1] 0.5769104
#> 
#> attr(,"thresholds")$b2
#> [1] 0.1408354
#> 
#> attr(,"thresholds")$b3
#> [1] 0.04513463
#> 
#> attr(,"correct")
#> [1] 0.5
#> 
#> $lwr.ci
#>           b1        b2        b3
#> b1        NA 0.5119264 0.2922621
#> b2 0.5119264        NA 0.3596730
#> b3 0.2922621 0.3596730        NA
#> 
#> $upr.ci
#>           b1        b2        b3
#> b1        NA 0.7213082 0.5481292
#> b2 0.7213082        NA 0.5904775
#> b3 0.5481292 0.5904775        NA
#> 
#> $conf.level
#> [1] 0.95
#> 
#> $ci.method
#> [1] "wald_information_tetrachoric"
tidy(fit_tet)
#>   item1 item2  estimate       lwr       upr statistic      p_value
#> 1    b1    b2 0.6166173 0.5119264 0.7213082 11.543960 7.919380e-31
#> 2    b1    b3 0.4201956 0.2922621 0.5481292  6.437469 1.214821e-10
#> 3    b2    b3 0.4750753 0.3596730 0.5904775  8.068563 7.113027e-16

These estimators assume a latent-normal threshold model. That assumption should be stated whenever the results are reported, because the interpretation is not simply “correlation between coded categories.”

It is often useful to compare that latent estimate with a naive Pearson correlation computed on coded categories.

fit_bin_naive <- pearson_corr(data.frame(lapply(X_bin[, 1:2], as.numeric)))
fit_ord_naive <- pearson_corr(data.frame(lapply(X_ord, as.numeric)))

round(c(
  b1_b2_pearson = fit_bin_naive[1, 2],
  b1_b2_tetrachoric = fit_tet[1, 2],
  o1_o2_pearson = fit_ord_naive[1, 2],
  o1_o2_polychoric = fit_pol[1, 2]
), 2)
#>     b1_b2_pearson b1_b2_tetrachoric     o1_o2_pearson  o1_o2_polychoric 
#>              0.40              0.62              0.40              0.48

Those numbers need not agree. The latent estimators target the association between the underlying continuous variables, not the correlation between arbitrarily coded categories.

Mixed continuous-discrete settings

polyserial() is used when one variable is continuous and the other is ordinal. biserial() is used when one variable is continuous and the other is binary.

fit_ps <- polyserial(X_cont, X_ord, ci = TRUE, p_value = TRUE)
fit_bis <- biserial(X_cont, X_bin[, 1:2], ci = TRUE, p_value = TRUE)

summary(fit_ps)
#> Polyserial correlation summary
#>   output      : matrix
#>   dimensions  : 2 x 2
#>   retained_pairs: 4
#>   threshold   : 0.0000
#>   diag        : included
#>   estimate    : 0.3282 to 0.6043
#>   inference   : wald_z_polyserial
#>   ci          : 95%
#>   ci_method   : wald_information_polyserial
#>   ci_width    : 0.123 to 0.175
#> 
#>  item1 item2 estimate n   95% CI           p_value
#>  x1    o1    0.6043   500 [0.5426, 0.6660] 0.0000 
#>  x2    o2    0.4654   500 [0.3943, 0.5365] 0.0000 
#>  x1    o2    0.4183   500 [0.3432, 0.4934] 0.0000 
#>  x2    o1    0.3282   500 [0.2407, 0.4158] 0.0000
summary(fit_bis)
#> Biserial correlation summary
#>   output      : matrix
#>   dimensions  : 2 x 2
#>   retained_pairs: 4
#>   threshold   : 0.0000
#>   diag        : included
#>   estimate    : 0.2120 to 1.0000
#>   inference   : biserial_t_test
#>   ci          : 95%
#>   ci_method   : fisher_z_biserial
#>   ci_width    : 0.000 to 0.168
#> 
#>  item1 item2 estimate n   95% CI           p_value
#>  x1    b1    1.0000   500 [1.0000, 1.0000] 0.0000 
#>  x1    b2    0.6024   500 [0.5435, 0.6555] 0.0000 
#>  x2    b2    0.3191   500 [0.2381, 0.3957] 0.0000 
#>  x2    b1    0.2120   500 [0.1267, 0.2942] 0.0000

Confidence intervals and p-values

These functions now follow the same user-facing pattern as the rest of the package:

The important point is that inference is tied to the fitted latent model rather than to an ordinary Pearson-correlation formula applied to coded categories.

Practical guidance

These estimators are appropriate when the scientific question is explicitly about latent association under a threshold model.

If the variables are nominal rather than ordered, these latent-correlation functions are not the right tools.