% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/kendall_corr.R
\name{kendall_tau}
\alias{kendall_tau}
\alias{print.kendall_matrix}
\alias{plot.kendall_matrix}
\title{Pairwise (or Two-Vector) Kendall's Tau Rank Correlation}
\usage{
kendall_tau(data, y = NULL, check_na = TRUE)

\method{print}{kendall_matrix}(x, digits = 4, max_rows = NULL, max_cols = NULL, ...)

\method{plot}{kendall_matrix}(
  x,
  title = "Kendall's Tau correlation heatmap",
  low_color = "indianred1",
  high_color = "steelblue1",
  mid_color = "white",
  value_text_size = 4,
  ...
)
}
\arguments{
\item{data}{For matrix/data-frame mode, a numeric matrix or a data frame with
at least two numeric columns; all non-numeric columns are excluded.
For two-vector mode, a numeric vector (paired with \code{y}).}

\item{y}{Optional numeric vector \code{y} of the same length as \code{data}
when \code{data} is a vector. If supplied, the function computes the
Kendall correlation \emph{between \code{data} and \code{y}} using a
low-overhead scalar path and returns a single number.}

\item{check_na}{Logical (default \code{TRUE}). If \code{TRUE}, inputs must
be free of missing/undefined values. Use \code{FALSE} only when you have
already filtered or imputed them.}

\item{x}{An object of class \code{kendall_matrix}.}

\item{digits}{Integer; number of decimal places to print.}

\item{max_rows}{Optional integer; maximum number of rows to display.
If \code{NULL}, all rows are shown.}

\item{max_cols}{Optional integer; maximum number of columns to display.
If \code{NULL}, all columns are shown.}

\item{...}{Additional arguments passed to \code{ggplot2::theme()} or other
\code{ggplot2} layers.}

\item{title}{Plot title. Default is
\code{"Kendall's Tau correlation heatmap"}.}

\item{low_color}{Color for the minimum tau value. Default is
\code{"indianred1"}.}

\item{high_color}{Color for the maximum tau value. Default is
\code{"steelblue1"}.}

\item{mid_color}{Color for zero correlation. Default is \code{"white"}.}

\item{value_text_size}{Font size for displaying correlation values. Default
is \code{4}.}
}
\value{
\itemize{
\item If \code{y} is \code{NULL} and \code{data} is a matrix/data frame: a
symmetric numeric matrix where entry \code{(i, j)} is the Kendall's tau
correlation between the \code{i}-th and \code{j}-th numeric columns.
\item If \code{y} is provided (two-vector mode): a single numeric scalar,
the Kendall's tau correlation between \code{data} and \code{y}.
}

\code{print.kendall_matrix()} invisibly returns the \code{kendall_matrix}
object.

\code{plot.kendall_matrix()} returns a \code{ggplot} object representing the
heatmap.
}
\description{
Computes Kendall's tau rank correlation either for
\strong{all pairs of numeric columns} in a matrix/data frame, or
for \strong{two numeric vectors} directly (scalar path).

This function uses a scalable algorithm implemented in 'C++' to
compute Kendall's tau-b (tie-robust). When there are no ties, tau-b reduces
to tau-a. The implementation follows the Knight (1966) \eqn{O(n \log n)}
scheme: a single sort on one variable, in-block sorting of the paired
variable within tie groups, and a global merge-sort–based inversion count
with closed-form tie corrections.

Prints a summary of the Kendall's tau correlation matrix,
including description and method metadata.

Generates a ggplot2-based heatmap of the Kendall's tau
correlation matrix.
}
\details{
Kendall's tau is a rank-based measure of association between two variables.
For a dataset with \eqn{n} observations on variables \eqn{X} and \eqn{Y},
let \eqn{n_0 = n(n - 1)/2} be the number of unordered pairs, \eqn{C} the
number of concordant pairs, and \eqn{D} the number of discordant pairs.
Let \eqn{T_x = \sum_g t_g (t_g - 1)/2} and \eqn{T_y = \sum_h u_h (u_h - 1)/2}
be the numbers of tied pairs within \eqn{X} and within \eqn{Y}, respectively,
where \eqn{t_g} and \eqn{u_h} are tie-group sizes in \eqn{X} and \eqn{Y}.

The tie-robust Kendall's tau-b is:
\deqn{ \tau_b = \frac{C - D}{\sqrt{(n_0 - T_x)\,(n_0 - T_y)}}. }
When there are no ties (\eqn{T_x = T_y = 0}), this reduces to tau-a:
\deqn{ \tau_a = \frac{C - D}{n(n-1)/2}. }

The function automatically handles ties. In degenerate cases where a
variable is constant (\eqn{n_0 = T_x} or \eqn{n_0 = T_y}), the tau-b
denominator is zero and the correlation is undefined (returned as \code{NA}).

\strong{Performance:}
\itemize{
\item In the \strong{two-vector mode} (\code{y} supplied), the C++ backend uses a
raw-double path (no intermediate \eqn{2 \times 2}{2 x 2} matrix, no discretisation).
\item In the \strong{matrix/data-frame mode}, columns are discretised once and all
pairwise correlations are computed via the Knight \eqn{O(n \log n)}
procedure; where available, pairs are evaluated in parallel.
}
}
\note{
Missing values are not allowed when \code{check_na = TRUE}. Columns
with fewer than two observations are excluded.
}
\examples{
# Basic usage with a matrix
mat <- cbind(a = rnorm(100), b = rnorm(100), c = rnorm(100))
kt <- kendall_tau(mat)
print(kt)
plot(kt)

# Two-vector mode (scalar path)
x <- rnorm(1000); y <- 0.5 * x + rnorm(1000)
kendall_tau(x, y)

# With a large data frame
df <- data.frame(x = rnorm(1e4), y = rnorm(1e4), z = rnorm(1e4))
kendall_tau(df)

# Including ties
tied_df <- data.frame(
  v1 = rep(1:5, each = 20),
  v2 = rep(5:1, each = 20),
  v3 = rnorm(100)
)
kt <- kendall_tau(tied_df)
print(kt)
plot(kt)

}
\references{
Kendall, M. G. (1938). A New Measure of Rank Correlation. \emph{Biometrika},
30(1/2), 81–93.

Knight, W. R. (1966). A Computer Method for Calculating Kendall's Tau with
Ungrouped Data. \emph{Journal of the American Statistical Association},
61(314), 436–439.
}
\seealso{
\code{\link{print.kendall_matrix}}, \code{\link{plot.kendall_matrix}}
}
\author{
Thiago de Paula Oliveira
}
