%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{tbm} %\VignetteDepends{tram, tbm, mboost, survival, lattice, latticeExtra, partykit,TH.data, colorspace,gamlss.data,trtf} \documentclass[article,nojss,shortnames]{jss} %% packages \usepackage{thumbpdf} %%% \usepackage{animate} \usepackage{amsfonts,amstext,amsmath,amssymb,amsthm} \usepackage{accents} \usepackage{color} \usepackage{rotating} \usepackage{verbatim} \usepackage[utf8]{inputenc} %%\usepackage[nolists]{endfloat} \newcommand{\cmd}[1]{\texttt{#1()}} <>= set.seed(290875) dir <- system.file("applications", package = "tbm") datadir <- system.file("rda", package = "TH.data") sapply(c("tram", "survival", "tbm", "mboost", "lattice", "latticeExtra", "partykit"), library, char = TRUE) trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7), box.rectangle = list(col=1), box.umbrella = list(lty=1, col=1), strip.background = list(col = "white"))) ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme ltheme$strip.background$col <- "transparent" ## change strip bg lattice.options(default.theme = ltheme) knitr::opts_chunk$set(echo = TRUE, results = 'markup', error = FALSE, warning = FALSE, message = FALSE, tidy = FALSE, cache = FALSE, size = "small", fig.width = 6, fig.height = 4, fig.align = "center", out.width = NULL, ###'.6\\linewidth', out.height = NULL, fig.scap = NA) knitr::render_sweave() # use Sweave environments knitr::set_header(highlight = '') # do not \usepackage{Sweave} ## R settings options(prompt = "R> ", continue = "+ ", useFancyQuotes = FALSE) # JSS style options(width = 75) library("colorspace") col <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90)) fill <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90), alpha = .3) @ \newcommand{\TODO}[1]{{\color{red} #1}} \newcommand\Torsten[1]{{\color{blue}Torsten: ``#1''}} % File with math commands etc. \input{defs.tex} \renewcommand{\thefootnote}{} %% code commands \newcommand{\Rclass}[1]{`\code{#1}'} %% JSS \author{Torsten Hothorn \\ Universit\"at Z\"urich} \Plainauthor{Hothorn} \title{Transformation Boosting Machines: Empirical Evaluation of the \textbf{tbm} Package} \Plaintitle{Transformation Boosting Machines} \Shorttitle{The \pkg{tbm} Package} \Abstract{ This document discusses technical details on the empirical evaluation of the reference implementation of transformation boosting machines in the \pkg{tbm} package. Model estimation, interpretation, and criticism is illustrated for eight life science applications. Setup and detailed results of a simulation study based on artificial data generating processes are presented. } \Keywords{Conditional Transformation Model, Shift Transformation Model} \Plainkeywords{Conditional Transformation Model, Shift Transformation Model} \Address{ Torsten Hothorn\\ Institut f\"ur Epidemiologie, Biostatistik und Pr\"avention \\ Universit\"at Z\"urich \\ Hirschengraben 84, CH-8001 Z\"urich, Switzerland \\ \texttt{Torsten.Hothorn@uzh.ch} \\ \url{http://tiny.uzh.ch/bh} \\ } \begin{document} <>= year <- substr(packageDescription("tbm")$Date, 1, 4) version <- packageDescription("tbm")$Version @ %\footnote{Please cite this document as: Torsten Hothorn (\Sexpr{year}) %Transformation Boosting Machines: Empirical Evaluation of the \textbf{tbm} %Package. R package vignette version \Sexpr{version}, %URL \url{https://CRAN.R-project.org/package=tram}.} % \input{todo} \section{Introduction} Transformation boosting machines estimate conditional transformation models (CTMs) and shift transformation models (STMs) by optimising the corresponding log-likelihood. In a nutshell, transformation models are fully parameterised by appropriate basis functions those parameters are (in total or partially) linked to predictor variables. The method is applicable to a broad range of problems because (1) the likelihood allows to handle discrete and continuous responses under all forms or random censoring and truncation and (2) simple linear, more complex nonlinear additive, or completely unstructured (tree-based) conditional paramater functions can be estimated by using appropriate basis functions for the predictor variables. %The algorithms employ the statistical view on boosting and iteratively %improve the fit estimating (relatively) simple models to updated gradients. %A simple illustration (for the Body Fat Mass application presented in %Section~\ref{sec:bodyfat}) is given in Figure~\ref{fig:movie}. The inital %frame of this animation shows the density obtained from the unconditional %maximum likelihood estimator in the model %\begin{eqnarray*} %\Prob(\rY \le \ry) = \Phi(\basisy(\ry)^\top \hat{\parm}_\text{ML}). %\end{eqnarray*} %%% this needs Adobe Acroread but they don't serve Linux anymore %\begin{figure} %\begin{center} %\animategraphics[controls,width=\linewidth]{1}{fit_movie}{1}{101} %\caption{Iterative boosting for the Body Fat Mass data. The left panel shows %the unconditional (first frame) or conditional densities (other frames). %Blue dots correspond to the likelihood of observations. The %right panel shows the increasing in-sample log-likelihood as the model gets %better and better. Use Adobe AcroRead to watch the animation. \label{fig:movie}} %\end{center} %\end{figure} %The Bernstein basis $\basisy$ allows a relatively flexible transformation %function, and thus density, to be estimated (the initial frame shows a %bimodel unconditional distribution). The quality of this model is assessed by %the multivariate gradient of the log-likelihood contributions with respect to %$\hat{\parm}_\text{ML}$. \CTM~starts by fitting potentially nonlinear %functions of each predictor variable to this gradient and updates the %parameter $\parm(\rx)$. Starting with the second frame, the left panel of %Figure~\ref{fig:movie} depicts the conditional densities for all %observations (the blue dots representing the observation) and the right %panel shows the increasing total in-sample log-likelihood. This document describes technical details of the empirical evaluation of transformation boosting machines by means of eight applications (Section~\ref{sec:app}) and artificial data generating processes (Section~\ref{sec:dgp}). For each of the eight applications, the exact model setup, the best performing model, and the corresponding model interpretation and model criticism is presented. The source code for these benchmark analysis is available from the directory <>= system.file("applications", package = "tbm") @ Simulation experiments based on the data generating processes presented in Section~\ref{sec:dgp} can be reproduced using the source code in <>= system.file("simulations", package = "tbm") @ This document also contains graphical representations of the out-of-sample log-likelihoods for all settings in the simulation experiments (Section~\ref{sec:results}). \section{Applications} \label{sec:app} <>= ds <- c("Beetles Exctinction Risk", "Birth Weight Prediction", "Body Fat Mass", "CAO/ARO/AIO-04 DFS", "Childhood Malnutrition", "Head Circumference", "PRO-ACT ALSFRS", "PRO-ACT OS") names(ds) <- c("ex_beetles.rda", "ex_fetus.rda", "ex_bodyfat.rda", "ex_CAO.rda", "ex_india.rda", "ex_heads.rda", "ex_ALSFRS.rda", "ex_ALSsurv.rda") models <- c(expression(paste("L ", bold(vartheta)(bold(x)))), expression(paste("L ", beta(bold(x)))), expression(paste("N ", bold(vartheta)(bold(x)))), expression(paste("N ", beta(bold(x)))), expression(paste("T ", bold(vartheta)(bold(x)))), expression(paste("T ", beta(bold(x)))), expression(paste("Tree ", bold(vartheta)(bold(x)))), expression(paste("F ", bold(vartheta)(bold(x))))) names(models) <- c("glm_ctm", "glm_tram", "gam_ctm", "gam_tram", "tree_ctm", "tree_tram", "trtf_tree", "trtf_forest") mor <- c("gam_ctm", "glm_ctm", "tree_ctm", "gam_tram", "glm_tram", "tree_tram", "trtf_tree", "trtf_forest") pit <- function(object, data) { cf <- predict(object, newdata = data, coef = TRUE) tmp <- object$model ret <- numeric(NROW(data)) for (i in 1:NROW(data)) { coef(tmp) <- cf[i,] ret[i] <- predict(tmp, newdata = data[i,,drop = FALSE], type = "distribution") } ret } pitplot <- function(object, data, main = "") { u <- pit(object, data) plot(1:length(u) / length(u), sort(u), xlab = "Theoretical Uniform Quantiles", ylab = "PIT Quantiles", main = main, ylim = c(0, 1), xlim = c(0, 1)) abline(a = 0, b = 1, col = "red") } plot.ctm <- function(x, which = sort(unique(selected(object))), obs = TRUE, ...) { object <- x q <- mkgrid(x$model, n = 100) X <- model.matrix(x$model$model, data = q) pfun <- function(which) { stopifnot(length(which) == 1) df <- model.frame(object, which = which)[[1]] df <- lapply(df, function(x) seq(from = min(x), to = max(x), length = 50)) df2 <- do.call("expand.grid", c(df, q)) class(object) <- class(object)[-1] tmp <- predict(object, newdata = df, which = which) CS <- diag(ncol(tmp[[1]])) CS[upper.tri(CS)] <- 1 cf <- tmp[[1]] %*% CS df2$pr <- as.vector(t(X %*% t(cf))) ret <- cbind(df2, which = names(df)[1]) names(ret)[1:2] <- c("x", "y") ret } ret <- do.call("rbind", lapply(which, pfun)) at <- quantile(ret$pr, prob = 0:10/10) pf <- function(x, y, z, subscripts, ...) { xname <- as.character(unique(ret[subscripts, "which"])) xo <- model.frame(object, which = xname)[[1]][[xname]] yo <- object$response panel.levelplot.raster(x, y, z, subscripts, ...) panel.contourplot(x, y, z, subscripts, contour = TRUE, ...) if (obs) panel.xyplot(x = xo, y = yo, pch = 20) } levelplot(pr ~ x + y | which, data = ret, panel = pf, scales = list(x = list(relation = "free")), region = TRUE, at = at, col.regions = grey.colors(length(at)), ...) } ### source("movie.R") @ For eight prediction problems, the out-of-sample log-likelihood was evalulated based on $100$ subsamples ($3/4 N$ as learning sample and $1/4 N$ as test sample). For problems with categorical responses or right-censored responses, subsamples were stratified with respect to the response class or censoring status (such that the distribution of the response or the the censoring rate were the same in learning and test samples). The out-of-sample log-likelihood was centered by the out-of-sample log-likelihood of the uninformative model which was estimated and tested on the very same folds. Transformation boosting machines were estimated using package \pkg{tbm} \citep{pkg:tbm}. Transformation trees and forests as implemented in package \pkg{trtf} \citep{pkg:trtf} served as main competitors. \CTM~(parameter $\parm(\rx)$) with nonlinear (N, B-spline) basis functions, linear (L) basis functions, and tree-based (T, depth two) basis functions. \STM~(parameter $\eshiftparm(\rx)$) with nonlinear (N, B-spline) basis functions, linear (L) basis functions, and tree-based (T, depth two) basis functions. \subsection{Beetles Exctinction Risk} <>= x <- risk[,mor] - ll0 med <- apply(x, 2, median, na.rm = TRUE) boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE, ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1], ylim = ylim) abline(v = c(3.5, 6.5), col = "lightgrey") abline(h = 0, col = "lightgrey") abline(h = max(med), col = col[2]) axis(1, at = 1:ncol(x), labels = models[colnames(x)]) axis(2) box() @ \begin{figure}[t] <>= load(file.path(dir, file <- "ex_beetles.rda")) ylim <- c(-.5, .5) <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "beetles.rda")) ldata <- bmodel ldata$TS <- NULL X <- model.matrix(~ species - 1, data = ldata) ldata$species <- NULL nvars <- c("mean_body_size", "mean_elev", "flowers", "niche_decay", "niche_diam", "niche_canopy", "distribution") fvars <- colnames(ldata)[!colnames(ldata) %in% c("RL", nvars)] xvars <- c(nvars, fvars) ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c( "ctm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 2) + ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 1) + ", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c( "ctm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 2) +", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 1) +", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+")))) fm_tree <- as.formula(paste("RL ~ ", paste(xvars, collapse = "+"))) mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ This problem aims at the prediction of the exctinction risk of beetles \citep{Seibold_Brandl_Schmidl_2014}. The response is the Red List status ranging from 0 (least concern) to 5 (regionally extinct) of $N = \Sexpr{N}$ saproxylic beetles, where for each species $\Sexpr{p}$ ($\Sexpr{p - xf}$ numeric and $\Sexpr{xf}$ categorical) predictors are available. We start with an unconditional proportional odds model (using $\pZ = \text{expit}$ and one parameter for each but the highest Red List category): <>= m_mlt <- Polr(RL ~ 1, data = ldata) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= q <- mkgrid(m_mlt, 200)[[1]] p <- c(predict(m_mlt, newdata = data.frame(RL = q), type = "density")) names(p) <- as.character(q) barplot(p, xlab = "Red List Status", ylab = "Distribution") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that a tree-structured model for $\eshiftparm(\rx)$ had the best performance; this model was fitted using <>= fm_tree bm <- stmboost(m_mlt, formula = fm_tree, data = ldata, method = quote(mboost::blackboost))[626] @ (the default tree was grown to a depth of two). The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}}. \begin{figure} <>= plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l") @ \caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} \end{figure} The model consists of trees with two-way interactions, basically all variables seemed to play a role in the model (the numbers giving the absolute number of splits in each of these variables): <>= x <- tabulate(do.call("c", sapply(get("ens", environment(bm$predict)), function(x) nodeapply(x$model, ids = nodeids(x$model, terminal = FALSE), FUN = function(x) split_node(x)$varid - 1))), nbins = length(vars)) names(x) <- vars x @ For six selected beetle species, the conditional distribution of extinction as predicted from the tree-based model is given in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. \begin{figure} <>= idx <- sapply(levels(ldata$RL), function(x) which(ldata$RL == x)[1]) q <- mkgrid(m_mlt, n = 200)[[1]] d <- predict(bm, newdata = ldata[idx,], type = "density", q = q) layout(matrix(1:6, nrow = 3, byrow = TRUE)) out <- sapply(1:6, function(x) barplot(d[,x], xlab = "Red List", ylab = "Density", main = paste(gsub("_", " ", rownames(ldata)[idx[x]]), " (RL:", ldata$RL[idx[x]], ")", sep = ""))) @ \caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} \end{figure} In this special case, the algorithm is identical to boosting for proportional odds models as implemented in the \cmd{PropOdds} family \citep{Schmid_Hothorn_Maloney_2011}; and thus the in-sample risks are essentially identical: <>= po <- blackboost(fm_tree, data = ldata, family = PropOdds())[626] max(abs(risk(bm) - risk(po))) @ \subsection{Birth Weight Prediction} Models shall be used to improve birth weight prediction in small fetuses (weighting $\le 1600$ g at birth) based on measurements obtained using three-dimensional (3D) sonography \citep{Schild_Maringa_Siemer_2008}. \begin{figure}[t] <>= load(file.path(dir, file <- "ex_fetus.rda")) ylim <- c(0, 1.7) <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "fetus.rda")) fetus$birthweight <- as.double(fetus$birthweight) ldata <- fetus xvars <- colnames(ldata) xvars <- xvars[xvars != "birthweight"] ldata[xvars] <- lapply(xvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c("ctm" = as.formula(paste("birthweight ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", xvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("birthweight ~ ", paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"), "+", paste("bbs(", xvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c("ctm" = as.formula(paste("birthweight ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("birthweight ~ ", paste("bols(", xvars, ", intercept = FALSE)", collapse = "+")))) fm_tree <- birthweight ~ . mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ The response is the birth weight in gram of $N = \Sexpr{N}$ singleton fetuses, where for each fetus $\Sexpr{p}$ numeric predictors are available. We start with an unconditional model (using $\pZ = \Phi$ and an Bernstein polynomial of order six): <>= m_mlt <- BoxCox(birthweight ~ 1, data = ldata, extrapolate = TRUE) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200, xlab = "Birth Weight (in gram)", ylab = "Density", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that an additive smooth model for $\eshiftparm(\rx)$ had the best performance; this model was fitted using <>= fm_gam[["stm"]] bm <- stmboost(m_mlt, formula = fm_gam[["stm"]], data = ldata, method = quote(mboost::mboost))[253] @ The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}} and a QQ-plot based on the probability-integral transform in Figure~\ref{\Sexpr{paste0("fig", file, "qq")}}. It is important to note that the term $\eshiftparm(\rx)$ must not contain an intercept. The model was therefore composed of linear model terms (without intercept) and smooth deviations from linear functions (all with the same degree of freedom to facilitate unbiasedness). Thus, the model selects between linear and smooth additive functions. \begin{figure} <>= <> @ \caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} \end{figure} \begin{figure}[t] <>= pitplot(bm, ldata, main = ds[file]) @ \caption{\Sexpr{ds[file]}. Probability-integral transform QQ-plot. \label{\Sexpr{paste0("fig", file, "qq")}}} \end{figure} The selected terms were <>= tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1) rownames(tab) <- names(bm$baselearner) colnames(tab) <- "" tab @ and the corresponding partial contributions are plotted in Figure~\ref{\Sexpr{paste0("fig", file, "partial")}}. Only minor deviations from linearity can be observed (and the out-of-sample risk of the linear model was almost the same). In addition, the baseline transformation (Figure~\ref{\Sexpr{paste0("fig", file, "base")}}) is almost linear, indicating that a normal linear model \citep[reported by][]{Schild_Maringa_Siemer_2008} might be a good compromise between prediction accuracy and interpretability for this problem. \begin{figure} <>= w <- sort(unique(selected(bm))) mbm <- bm class(mbm) <- class(mbm)[-1] nc <- ceiling(length(w) / 2) * 2 layout(matrix(1:nc, ncol = 2)) plot(mbm, which = w, ylim = c(-3, 3)) @ \caption{\Sexpr{ds[file]}. Partial contributions to additive predictor $\eshiftparm(\rx)$. \label{\Sexpr{paste0("fig", file, "partial")}}} \end{figure} \begin{figure} <>= tmp <- as.mlt(m_mlt) coef(tmp) <- nuisance(bm) plot(tmp, newdata = data.frame(1), K = 200, type = "trafo", xlab = "Birth weight (in gram)", ylab = "Transformation h", col = "black") @ \caption{\Sexpr{ds[file]}. Baseline transformation. \label{\Sexpr{paste0("fig", file, "base")}}} \end{figure} For five selected observations, the conditional distribution of birth weight is shown in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. The blue dots indicate the actual observations. Except for the right-most density, the predicted conditional densities are almost perfectly symmetric (as a consequence of the linear baseline transformation). \begin{figure} <>= idx <- order(ldata$birthweight)[floor(c(.1, .25, .5, .75, .9) * nrow(ldata))] q <- mkgrid(m_mlt, n = 200)[[1]] matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "density", q = q), type = "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Birth Weight (in gram)", ylab = "Density") j <- sapply(ldata[idx, "birthweight"], function(y) which.min((q - y)^2)) points(ldata[idx, "birthweight"], sapply(1:5, function(i) d[j[i], i]), pch = 19, col = col[1]) @ \caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} \end{figure} \subsection{Body Fat Mass} \label{sec:bodyfat} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_bodyfat.rda")) ylim <- c(0, 1.7) <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= data("bodyfat", package = "TH.data") ldata <- bodyfat xvars <- colnames(ldata) xvars <- xvars[xvars != "DEXfat"] ldata[xvars] <- lapply(xvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c("ctm" = as.formula(paste("DEXfat ~ ", paste("bbs(", xvars, ")", collapse = "+"))), "stm" = as.formula(paste("DEXfat ~ ", paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"), "+", paste("bbs(", xvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c("ctm" = as.formula(paste("DEXfat ~ ", paste("bols(", xvars, ")", collapse = "+"))), "stm" = as.formula(paste("DEXfat ~ ", paste("bols(", xvars, ", intercept = FALSE)", collapse = "+")))) fm_tree <- DEXfat ~ . mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ The response is the body fat mass (in kilogram) of $N = \Sexpr{N}$ study participants, where for each subject $\Sexpr{p}$ numeric predictors are available \citep{Garcia_Wagner_Hothorn_2005}. We start with an unconditional model (using $\pZ = \Phi$ and an Bernstein polynomial of order six): <>= m_mlt <- BoxCox(DEXfat ~ 1, data = ldata, prob = c(.1, .99)) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200, xlab = "Body Fat Mass (in kilogram)", ylab = "Density", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that an additive smooth model for $\parm(\rx)$ had the best performance; this model was fitted using <>= fm_gam[["ctm"]] bm <- ctmboost(m_mlt, formula = fm_gam[["ctm"]], data = ldata, method = quote(mboost::mboost))[1000] @ The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}} and a QQ-plot based on the probability-integral transform in Figure~\ref{\Sexpr{paste0("fig", file, "qq")}}. \begin{figure} <>= <> @ \caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} \end{figure} \begin{figure}[t] <>= pitplot(bm, ldata, main = ds[file]) @ \caption{\Sexpr{ds[file]}. Probability-integral transform QQ-plot. \label{\Sexpr{paste0("fig", file, "qq")}}} \end{figure} The model is additive in the sense that $\h(\ry \mid \rx) = h_0(\ry) + \sum_{j = 1}^J \h_j(\ry \mid x_j)$. The partial transformation functions $\h_j$ are plotted in Figure~\ref{\Sexpr{paste0("fig", file, "ctm")}}. \begin{figure} \begin{center} <>= plot.ctm(bm, ylab = "Body Fat Mass") @ \caption{\Sexpr{ds[file]}. Scatterplots of all predictor variables and the response, overlaid with partial transformation functions. \label{\Sexpr{paste0("fig", file, "ctm")}}} \end{center} \end{figure} <>= tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1) rownames(tab) <- names(bm$baselearner) colnames(tab) <- "" tab @ For five selected observations, the conditional distribution of body fat mass is shown in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. The blue dots indicate the actual observations. \begin{figure} <>= idx <- order(ldata$DEXfat)[floor(c(.1, .25, .5, .75, .9) * nrow(ldata))] q <- mkgrid(m_mlt, n = 200)[[1]] matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "density", q = q), type = "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Body Fat Mass (in kilogram)", ylab = "Density") j <- sapply(ldata[idx, "DEXfat"], function(y) which.min((q - y)^2)) points(ldata[idx, "DEXfat"], sapply(1:5, function(i) d[j[i], i]), pch = 19, col = col[1]) @ \caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} \end{figure} \subsection{CAO/ARO/AIO-04 DFS} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_CAO.rda")) ylim <- NULL <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "Primary_endpoint_data.rda")) ldata <- CAOsurv xvars <- c("strat_t", "strat_n", "randarm", "geschlecht", "age", "groesse", "gewicht", "ecog_b", "mason", "gesamt_t", "gesamt_n", "histo", "grading_b", "ap_anlage", "stenose", "gesamt_n_col", "UICC", "bentf") ldata <- ldata[, c("DFS", xvars)] ldata <- ldata[complete.cases(ldata),] ldata$UICC <- ldata$UICC[,drop = TRUE] ldata$ecog_b <- ldata$ecog_b[,drop = TRUE] nvars <- c("age", "groesse", "gewicht") fvars <- xvars[!xvars %in% nvars] xvars <- c(nvars, fvars) ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c( "ctm" = as.formula(paste("DFS ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("DFS ~ ", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c( "ctm" = as.formula(paste("DFS ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("DFS ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+")))) fm_tree <- as.formula(paste("DFS ~ ", paste(xvars, collapse = "+"))) mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ The response is the disease-free survival time of rectal cancer patients from the CAO/ARO/AIO-04 randomised controlled clinical trial \citep{Roedel_Graeven_Fietkau_2015}. For $N = \Sexpr{N}$ patients, $\Sexpr{p}$ ($\Sexpr{p - xf}$ numeric and $\Sexpr{xf}$ categorical) baseline predictors are available. We start with an unconditional Cox model ($\pZ = 1 - \exp(-\exp())$ and a Bernstein polynomial of order six): <>= m_mlt <- Coxph(DFS ~ 1, data = ldata, prob = c(0, .9)) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "survivor", K = 200, xlab = "Time (in days)", ylab = "Time", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that a linear model for $\eshiftparm(\rx)$ had the best performance, this model was fitted using <>= fm_glm[["stm"]] bm <- stmboost(m_mlt, formula = fm_glm[["stm"]], data = ldata, method = quote(mboost::mboost))[963] @ This is, in fact, a linear Cox model which could have also been fitted using <>= bmCoxPH <- mboost(fm_glm[["stm"]], data = ldata, family = CoxPH())[963] @ %%% this takes too long in a package vignette %The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}}. %\begin{figure} %<>= %<> %@ %\caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} %\end{figure} % %The model coefficients are %<>= %mboost:::coef.mboost(bm) %@ % %The negative coefficients are practically equivalent %<>= %coef(bmCoxPH) %@ % %For five selected observations, the conditional survivor functions %are shown in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. % %\begin{figure} %<>= %idx <- sample(1:nrow(ldata), 5) %q <- mkgrid(m_mlt, n = 200)[[1]] %matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "survivor", q = q), type = % "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Time (in days)", ylab = "Probability") %@ %\caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} %\end{figure} \subsection{Childhood Malnutrition} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_india.rda")) ylim <- NULL <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "india.rda")) xvars <- c("cage", "breastfeeding", "mbmi", "mage", "medu", "edupartner", "csex", "ctwin", "cbirthorder", "munemployed", "mreligion", "mresidence", "deadchildren", "electricity", "radio", "television", "refrigerator", "bicycle", "motorcycle", "car") fvars <- xvars[sapply(xvars, function(x) length(unique(kids[[x]]))) < 6] nvars <- xvars[!xvars %in% fvars] kids[fvars] <- lapply(fvars, function(f) factor(kids[[f]])) kids[nvars] <- lapply(nvars, function(n) scale(kids[[n]])) fm_gam <- c( "ctm" = as.formula(paste("stunting ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("stunting ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c( "ctm" = as.formula(paste("stunting ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("stunting ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+")))) fm_tree <- as.formula(paste("stunting ~ ", paste(xvars, collapse = "+"))) kids$stunting <- as.double(kids$stunting) ldata <- kids mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ The aim is to predict childhood malnutrition, here measured as stunting, that is, insufficient height for age. The data for $N = \Sexpr{N}$ children from India were compiled by \cite{Fenske_Kneib_Hothorn_2011} and include $\Sexpr{p}$ ($\Sexpr{p - xf}$ numeric and $\Sexpr{xf}$ categorical) predictors. We start with an unconditional model (using $\pZ = \Phi$ and an Bernstein polynomial of order six): <>= m_mlt <- BoxCox(stunting ~ 1, data = ldata, prob = c(.05, .975), extrapolate = TRUE) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200, xlab = "Stunting", ylab = "Density", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation %% (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that a tree-based model for $\parm(\rx)$ had the best performance; this model was fitted using <>= fm_tree bm <- ctmboost(m_mlt, formula = fm_tree, data = ldata, method = quote(mboost::blackboost))[515] @ %%% this takes too long in a package vignette %The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}} %and a QQ-plot based on the probability-integral transform in Figure~\ref{\Sexpr{paste0("fig", file, "qq")}}. % %\begin{figure} %<>= %<> %@ %\caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} %\end{figure} % %\begin{figure}[t] %<>= %pitplot(bm, ldata, main = ds[file]) %@ %\caption{\Sexpr{ds[file]}. Probability-integral transform QQ-plot. \label{\Sexpr{paste0("fig", file, "qq")}}} %\end{figure} % % %The model consists of trees with two-way interactions, most splits occured %in age and breastfeeding, birthorder, and religion: %<>= %x <- tabulate(do.call("c", sapply(get("ens", environment(bm$predict)), % function(x) nodeapply(x$model, ids = nodeids(x$model, terminal = FALSE), FUN = function(x) split_node(x)$varid - %1))), nbins = length(all.vars(fm_tree)) - 1) %names(x) <- all.vars(fm_tree)[-1] %x %@ % %For five selected observations, the conditional distribution of stunting %is shown in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. The blue dots %indicate the actual observations. % %\begin{figure} %<>= %idx <- order(ldata$stunting)[floor(c(.1, .25, .5, .75, .9) * nrow(ldata))] %q <- mkgrid(m_mlt, n = 200)[[1]] %matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "density", q = q), type = % "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Stunting", ylab = "Density") %j <- sapply(ldata[idx, "stunting"], function(y) which.min((q - y)^2)) %points(ldata[idx, "stunting"], sapply(1:5, function(i) d[j[i], i]), pch = 19, col = col[1]) %@ %\caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} %\end{figure} % \subsection{Head Circumference} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_heads.rda")) ylim <- NULL <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} The Fourth Dutch Growth Study \citep{Fredriks_Buuren_Burgmeijer_2000} is a cross-sectional study that measures growth and development of the Dutch population between the ages of 0 and 22 years. The study measured, among other variables, head circumference (HC) and age of 7482 males and 7018 females. \cite{Stasinopoulos_Rigby_2007} analysed the head circumference of $7040$ males with explanatory variable age using a GAMLSS model with a Box-Cox $t$ distribution describing the first four moments of head circumference conditionally on age. The models show evidence of kurtosis, especially for older boys. <>= ### Paul Eilers: Using 1/2 is better data("db", package = "gamlss.data") db$lage <- with(db, age^(1/3)) ldata <- db fm_gam <- c("ctm" = head ~ bbs(lage), "stm" = head ~ bols(lage, intercept = FALSE) + bbs(lage, center = TRUE, df = 1)) fm_glm <- c("ctm" = head ~ bols(lage), "stm" = head ~ bols(lage, intercept = FALSE)) fm_tree <- head ~ . N <- nrow(ldata) @ We start with an unconditional model (using $\pZ = \Phi$ and an Bernstein polynomial of order six): <>= m_mlt <- BoxCox(head ~ 1, data = ldata) logLik(m_mlt) @ Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that an additive smooth model for $\parm(\rx)$ had the best performance; this model was fitted using <>= fm_gam[["ctm"]] bm <- ctmboost(m_mlt, formula = fm_gam[["ctm"]], data = ldata, method = quote(mboost::mboost))[1000] @ The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}} and a QQ-plot based on the probability-integral transform in Figure~\ref{\Sexpr{paste0("fig", file, "qq")}}. \begin{figure} <>= <> @ \caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} \end{figure} \begin{figure}[t] <>= pitplot(bm, ldata, main = ds[file]) @ \caption{\Sexpr{ds[file]}. Probability-integral transform QQ-plot. \label{\Sexpr{paste0("fig", file, "qq")}}} \end{figure} Figure~\ref{heads-plot} shows the model as a growth curve model. The observations are overlaid with quantile curves obtained via inversion of the estimated conditional distributions. The figure can be directly compared with Figure~16 of \cite{Stasinopoulos_Rigby_2007} (the quantile curves in these two plots are essentially equivalent) and also indicates a certain asymmetry towards older boys. \begin{figure}[t] <>= l <- with(db, seq(from = min(lage), to = max(lage), length = 100)) q <- mkgrid(m_mlt, n = 200)[[1]] pr <- expand.grid(head = q, lage = l^3) pr$p <- c(predict(bm, newdata = data.frame(lage = l), q = q, type = "distribution")) pr$cut <- factor(pr$lage > 2.5) levels(pr$cut) <- c("Age < 2.5 yrs", "Age > 2.5 yrs") pfun <- function(x, y, z, subscripts, at, ...) { panel.contourplot(x, y, z, subscripts, at = c(0.4, 2, 10, 25, 50, 75, 90, 98, 99.6)/ 100, ...) panel.xyplot(x = db$age, y = db$head, pch = 20, col = rgb(.1, .1, .1, .1), ...) } print(contourplot(p ~ lage + head | cut, data = pr, panel = pfun, region = FALSE, xlab = "Age (years)", ylab = "Head circumference (cm)", scales = list(x = list(relation = "free")))) @ \caption{\Sexpr{ds[file]}. Observed head circumference and age for $7040$ boys with estimated quantile curves for $\tau = 0.04, 0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98, 0.996$. \label{heads-plot}} \end{figure} \subsection{PRO-ACT ALSFRS} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_ALSFRS.rda")) ylim <- NULL <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "ALSFRSdata.rda")) ldata <- ALSFRSdata[complete.cases(ALSFRSdata),] xvars <- names(ldata) xvars <- xvars[xvars != "ALSFRS.halfYearAfter"] fvars <- xvars[sapply(ldata[xvars], is.factor)] nvars <- xvars[!xvars %in% fvars] ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c( "ctm" = as.formula(paste("ALSFRS.halfYearAfter ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("ALSFRS.halfYearAfter ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c( "ctm" = as.formula(paste("ALSFRS.halfYearAfter ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("ALSFRS.halfYearAfter ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+")))) fm_tree <- as.formula(paste("ALSFRS.halfYearAfter ~ ", paste(xvars, collapse = "+"))) mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ This data origines from the PRO-ACT Prize4Life challenge 2012 (\url{http://prize4life.org.il/en/prediction-prize/}), comprising clinical trials data from $N = \Sexpr{N}$ patients suffering Amypthropic Lateral Sclerosis (ALS). The response is the ALS-Functional Rating Scale (ALSFRS) six months after diagnosis. For each patient, $\Sexpr{p}$ ($\Sexpr{p - xf}$ numeric and $\Sexpr{xf}$ categorical) predictors are available. An overview on the challenge and winning algorithms is available from \cite{Kueffner_Zach_Norel_2014}. The response is an ordinal variable, ranging from $0$ to $40$. We start with an unconditional proportional odds model featuring a continuous basis transformation of the response ($\pZ = \text{expit}$ and a Bernstein polynomial of order six): <>= m_mlt <- Colr(ALSFRS.halfYearAfter ~ 1, data = ldata, prob = c(.05, .9), extrapolate = TRUE) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200, xlab = "ALSFRS at 6 months", ylab = "Density", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that a linear model for $\eshiftparm(\rx)$ (also known as ``distribution regression'') had the best performance; this model was fitted using <>= bm <- ctmboost(m_mlt, formula = fm_glm[["ctm"]], data = ldata, method = quote(mboost::mboost))[613] @ The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}}. \begin{figure} <>= <> @ \caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} \end{figure} Time since onset and ALSFRS at time of diagnosis are the two most important variables (roughly assessed by the selection frequency): <>= tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1) rownames(tab) <- names(bm$baselearner) colnames(tab) <- "" tab[tab > 0,] @ The response-varying coefficients for these two variables are given in Figure~\ref{ALSFRS-plot}. \begin{figure} <>= cf <- mboost:::coef.mboost(bm) q <- mkgrid(m_mlt, n = 100)[[1]] X <- model.matrix(m_mlt$model, data = data.frame(ALSFRS.halfYearAfter = q)) vars <- c("time_onset_treatment", "ALSFRS.Start") layout(matrix(1:length(vars), ncol = 2)) out <- sapply(vars, function(v) { cf <- matrix(cf[[grep(v, names(cf))]], ncol = 2, byrow = TRUE) cf <- cumsum(cf[,2]) plot(q, X %*% cf, main = v, xlab = "ALSFRS at 6 months", ylab = "Varying coefficient", type = "l") }) @ \caption{\Sexpr{ds[file]}. Response-varying coefficients. \label{ALSFRS-plot}} \end{figure} \subsection{PRO-ACT OS} \begin{figure}[t] <>= load(file.path(dir, file <- "ex_ALSsurv.rda")) ylim <- NULL <> @ \caption{\Sexpr{ds[file]}. Out-of-sample log-likelihood (centered by out-of-sample log-likelihood of unconditional model). \label{\Sexpr{paste0("fig", file, "oob")}}} \end{figure} <>= load(file.path(datadir, "ALSsurvdata.rda")) ldata <- ALSsurvdata[complete.cases(ALSsurvdata),] ldata$y <- with(ldata, Surv(survival.time, cens)) ldata$survival.time <- NULL ldata$cens <- NULL nvars <- c("time_onset_treatment", "age", "height") fvars <- names(ldata)[!names(ldata) %in% c("y", nvars)] xvars <- c(nvars, fvars) ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]]))) fm_gam <- c( "ctm" = as.formula(paste("y ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))), "stm" = as.formula(paste("y ~ ", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+", paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+")))) fm_glm <- c( "ctm" = as.formula(paste("y ~ ", paste("bols(", xvars, ", df = 2)", collapse = "+"))), "stm" = as.formula(paste("y ~", paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+")))) fm_tree <- as.formula(paste("y ~ ", paste(xvars, collapse = "+"))) mf <- model.frame(fm_tree, data = ldata) xf <- sum(sapply(mf[,-1], is.factor)) N <- dim(mf)[1] p <- dim(mf)[2] - 1 vars <- names(mf)[-1] @ Overall survival data from ALS patients were compiled from the PRO-ACT database by \cite{Seibold_Zeileis_Hothorn_2017}. For each of $N = \Sexpr{N}$ patients, $\Sexpr{p}$ ($\Sexpr{p - xf}$ numeric and $\Sexpr{xf}$ categorical) predictors are available. We start with an unconditional Cox model ($\pZ = 1 - \exp(-\exp())$ and a Bernstein polynomial of order six): <>= m_mlt <- Coxph(y ~ 1, data = ldata) logLik(m_mlt) @ The corresponding unconditional distribution is depicted in Figure~\ref{\Sexpr{paste0("fig", file, "0")}}. \begin{figure} <>= plot(as.mlt(m_mlt), newdata = data.frame(1), type = "survivor", K = 200, xlab = "ALS Overall Survival (in days)", ylab = "Probability", col = "black") @ \caption{\Sexpr{ds[file]}. Unconditional response distribution. \label{\Sexpr{paste0("fig", file, "0")}}} \end{figure} Model evaluation %%% (Figure~\ref{\Sexpr{paste0("fig", file, "oob")}}) indicated that tree-based model for $\eshiftparm(\rx)$ had the best performance; this model was fitted using <>= fm_tree bm <- stmboost(m_mlt, formula = fm_tree, data = ldata, control = boost_control(nu = 0.01), method = quote(mboost::blackboost))[451] @ This model is roughly equivalent to a boosted Cox-model using trees as baselearners: <>= blackboost(fm_tree, data = ldata, family = CoxPH()) @ %%% this takes too long for a package vignette %The in-sample log-likelihood is shown in Figure~\ref{\Sexpr{paste0("fig", file, "ib")}}. % %\begin{figure} %<>= %<> %@ %\caption{\Sexpr{ds[file]}. In-sample log-likelihood maximisation. \label{\Sexpr{paste0("fig", file, "ib")}}} %\end{figure} % %For five selected observations, the conditional survivor curves %are shown in Figure~\ref{\Sexpr{paste0("fig", file, "1")}}. % %\begin{figure} %<>= %idx <- sample(1:nrow(ldata), 5) %q <- mkgrid(m_mlt, n = 200)[[1]] %matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "survivor", q = q), type = % "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "ALS Overall Survival (in days)", ylab = "Probability") %@ %\caption{\Sexpr{ds[file]}. Conditional response distribution. \label{\Sexpr{paste0("fig", file, "1")}}} %\end{figure} \section{Artificial Data Generating Processes} \label{sec:dgp} \subsection{Simulation Models} Data were generated based on two groups and one numeric predictor variable $x \in [0, 1]$ based on transformation models of the form \begin{eqnarray*} \Prob(\rY \le \ry \mid \text{Group}, x) = \Phi(h(\ry \mid \text{Group}, x)) \end{eqnarray*} where the conditional transformation function $\h(\ry \mid \text{Group}, x) = \Phi^{-1}(\Prob(\rY \le \ry \mid \text{Group}, x))$ for four data generating processes is given in Table~\ref{tab:dgp}. \begin{table} \begin{tabular}{lll} \hline DGP & $\Phi^{-1}(\Prob(\rY \le \ry \mid \text{Group 1}, x))$ & $\Phi^{-1}(\Prob(\rY \le \ry \mid \text{Group 2}, x))$ \\ \hline \hline Linear $\eshiftparm(\rx)$ & $\h_\rY(\ry) - 2 x$ & $\h_\rY(\ry) + 2 - x$ \\ Nonlinear $\eshiftparm(\rx)$ & $\h_\rY(\ry) - 2 g(x)$ & $\h_\rY(\ry) + 2 - g(x)$ \\ Linear $\parm(\rx)$ & $\h_\rY(\ry) - \eshiftparm_1(\ry) - \eshiftparm_2(\ry) x$ & $\h_\rY(\ry) + \eshiftparm_1(\ry) - (\eshiftparm_2(\ry) + \eshiftparm_3(\ry)) x$ \\ Nonlinear $\parm(\rx)$ & $\h_\rY(\ry) - \eshiftparm_1(\ry) - \eshiftparm_2(\ry) g(x)$ & $\h_\rY(\ry) + \eshiftparm_1(\ry) - (\eshiftparm_2(\ry) + \eshiftparm_3(\ry)) g(x)$ \\ \hline \end{tabular} \caption{Artificial Data Generating Processes (DGPs). Description of four simulation models. $g(x) = \sin(2 \pi x)(1 + x)$, $\h_\rY, \eshiftparm_1, \eshiftparm_2, \eshiftparm_3$ are Bernstein polynomials of order six. \label{tab:dgp}} \end{table} General parameters of the simulation were defined as <>= library("tram") ### set-up RNG set.seed(27031105) ### order of Bernstein polynomials order <- 6 ### support of distributions sup <- c(-4, 6) ### bounds (essentially for plots) bds <- c(-8, 8) ### shift effects: main effect of grp, x, and interaction effect beta <- c(-2, 2, -1) @ Group information and $x$ along with $g(x)$ was generated via <>= ### two groups grp <- gl(2, 1) ### uniform predictor x <- seq(from = 0, to = 1, length.out = 1000) d <- expand.grid(grp = grp, x = x) ### transformation of x: sin(2 pi x) (1 + x) d$g <- with(d, sin(x * 2 * pi) * (1 + x)) ### generate some response (this is NOT the ### response used for the simulations) X <- model.matrix(~ grp * x, data = d)[,-1] d$y <- rnorm(nrow(d), mean = X %*% beta) @ For the first model (``Linear $\eshiftparm(\rx)$''), the baseline transformation $\h_\rY$ is a nonlinear function, parameterised as a Bernstein polynomial with the following coefficients <>= ### h_Y (cf0 <- seq(from = sup[1], to = sup[2], length = order + 1) + sin(seq(from = 0, to = 2*pi, length = order + 1)) * (seq(from = 0, to = 2*pi, length = order + 1) <= pi) * 2) m1 <- BoxCox(y ~ grp * x, data = d, order = order, model_only = TRUE, support = sup, bounds = bds) cf <- coef(m1) (cf[] <- c(cf0, beta)) coef(m1) <- cf @ The linear part simply consists of a main effect of group, a main effect of $x$ and an interaction effect. This is a linear transformation model with nonnormal response. The second model (``Nonlinear $\eshiftparm(\rx)$'') is based on the same coefficients, however, after the transformation $g(x)$ <>= m2 <- BoxCox(y ~ grp * g, data = d, order = order, model_only = TRUE, support = sup, bounds = bds) cf <- coef(m2) cf[] <- c(cf0, beta) coef(m2) <- cf @ The third model (``Linear $\parm(\rx)$'') is a distribution regression model featuring response-varying coefficients: <>= m3 <- BoxCox(y | grp * x ~ 1, data = d, order = order, model_only = TRUE, support = sup, bounds = bds) cf <- coef(m3) ### beta_1 (cf1 <- sin(seq(from = 0, to = pi / 2, length.out = order + 1)) * beta[1]) ### beta_2 (cf2 <- sqrt(seq(from = 0, to = 2, length.out = order + 1)) / sqrt(2) * beta[2]) ### beta_3 (cf21 <- sin(seq(from = 0, to = pi / 2, length.out = order + 1)) * beta[3]) cf[] <- c(cf0, cf1, cf2, cf21) coef(m3) <- cf @ In the last model (``Nonlinear $\parm(\rx)$''), the same response-varying coefficients are used after the transformation $g(x)$: <>= m4 <- BoxCox(y | grp * g ~ 1, data = d, order = order, model_only = TRUE, support = sup, bounds = bds) cf <- coef(m4) cf[] <- c(cf0, cf1, cf2, cf21) coef(m4) <- cf @ The conditional transformation functions are depicted in Figure~\ref{fig:dgp}. \begin{figure} \begin{center} <>= xlim <- c(-5, 5) q <- mkgrid(m1, n = 500)[["y"]] x <- 0:10 / 10 nd <- expand.grid(grp = grp, x = x) ndq <- expand.grid(y = q, grp = grp, x = x) ndq$xconfig <- with(ndq, interaction(grp, factor(x))) nd$g <- with(nd, sin(x * 2 * pi) * (1 + x)) ndq$d1 <- c(predict(m1, newdata = nd, type = "density", q = q)) ndq$d2 <- c(predict(m2, newdata = nd, type = "density", q = q)) ndq$d3 <- c(predict(m3, newdata = nd, type = "density", q = q)) ndq$d4 <- c(predict(m4, newdata = nd, type = "density", q = q)) ndq$t1 <- c(predict(m1, newdata = nd, type = "trafo", q = q)) ndq$t2 <- c(predict(m2, newdata = nd, type = "trafo", q = q)) ndq$t3 <- c(predict(m3, newdata = nd, type = "trafo", q = q)) ndq$t4 <- c(predict(m4, newdata = nd, type = "trafo", q = q)) colr <- rep(grey.colors(length(x), alpha = .9), each = 2) xlim <- bds ret <- do.call("rbind", list(ndq)[rep(1, 4)]) ret$d <- with(ndq, c(d1, d2, d3, d4)) ret$t <- with(ndq, c(t1, t2, t3, t4)) ret$model <- gl(4, nrow(ndq)) levels(ret$model) <- mlev <- c("GLM Tram", "GAM Tram", "DR", "CTM") ret$model <- factor(as.character(ret$model), levels = rev(mlev), labels = rev(mlev)) levels(ret$grp) <- c("Group 1", "Group 2") lev <- c(expression(paste("Nonlinear ", bold(vartheta)(bold(x)))), expression(paste("Linear ", bold(vartheta)(bold(x)))), expression(paste("Nonlinear ", beta(bold(x)))), expression(paste("Linear ", beta(bold(x))))) sc <- function(which.given, ..., factor.levels) { if (which.given == 1) { strip.default(which.given = which.given, ..., factor.levels) } else { if (which.given == 2) strip.default(which.given = 2, ..., factor.levels = lev) } } obj <- (xyplot(t ~ y | grp + model, data = ret, type = "l", groups = xconfig, col = colr, strip = sc, lwd = 2, lty = 1, xlim = xlim, layout = c(4, 2), ylab = "Transformation h", xlab = "Response Y", scales = list(y = list(relation = "free")))) obj <- useOuterStrips(obj, strip.left = function(which.given, ..., factor.levels) strip.custom(horizontal = FALSE)(which.given = which.given, ..., factor.levels = lev)) plot(obj <- update(obj, legend = list(right = list(fun = "draw.colorkey", args = list(list(at = 1:22 / 22, col = colr, title = "x")))))) trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE) grid.text(expression(x), 0.5, 1.07, hjust=0.5, vjust=1) trellis.unfocus() @ \caption{Artificial Data Generating Processes. Conditional transformation functions $h(\ry \mid \text{Group}, x)$ given two groups (left and right panel) and $x \in [0, 1]$ (grey color coding) for four different types of transformation models. \label{fig:dgp}} \end{center} \end{figure} Samples from the conditional distributions described by these four models were drawn as follows: <>= y1 <- simulate(m1, newdata = d, n = 100) y2 <- simulate(m2, newdata = d, n = 100) y3 <- simulate(m3, newdata = d, n = 100) y4 <- simulate(m4, newdata = d, n = 100) @ For each combination of data generating process, sample size ($N = 75, 150, 300)$, number of additional uninformative predictor variables ($J^+ = 0, 5, 25$), two choices of $\pZ$ (standard normal and standard logistic in the specification of the boosting procedure, the data were always generated using $\pZ = \Phi$), and two choices of the order of the Bernstein polynomials (six and $12$, also only for the specification of the boosting procedures ), the out-of-sample log-likelihood was estimated for $100$ simulation runs. Very extreme outliers (centered log-likelihoods smaller than $-10^5$) were not drawn. \subsection{Results} \label{sec:results} The following figures depict the out-of-sample log-likelihoods (centered by the out-of-sample log-likelihood of the true model) for each combination of distribution function $\pZ$, number of additional noninformative uniform predictors $J^+$, and order $M$ of the Bernstein polynomial. The panels correspond to the different DGPs (columns) and sample sizes (rows). Abbreviations of the boosting procedures and basis functions used are given at the x-axis. The boxplot of the best performing model is highlighted in yellow. <>= dir <- system.file("simulations", package = "tbm") ctm <- list.files(path = dir, pattern = "ctm.*rda", full = TRUE) tram <- list.files(path = dir, pattern = "tram.*rda", full = TRUE) out <- c() for (f in c(ctm, tram)) { load(f) out <- rbind(out, ret) } out$model <- factor(out$model, levels = c("ctm_gam", "ctm_glm", "ctm_tree", "tram_gam", "tram_glm", "tram_tree"), labels = c("gam_ctm", "glm_ctm", "tree_ctm", "gam_tram", "glm_tram", "tree_tram")) out$PNON <- factor(out$PNON) nlab <- paste("N =", nlev <- rev(sort(unique(out$NOBS)))) out$NOBS <- factor(out$NOBS, levels = nlev, labels = nlab) out$m <- factor(out$m) out$todistr <- factor(out$todistr) out$order <- factor(out$order) out$boost_ll[grep("rror", out$boost_ll)] <- NA out$boost_ll <- as.double(out$boost_ll) out$true_ll <- as.double(out$true_ll) out$mlt_ll <- as.double(out$mlt_ll) out$mstop <- as.double(out$mstop) out$todistr <- factor(as.character(out$todistr), levels = c("normal", "logistic", "minextrval"), labels = c("Normal", "Logistic", "MEV")) lev <- rev(c(expression(paste("Nonlinear ", bold(vartheta)(bold(x)))), expression(paste("Linear ", bold(vartheta)(bold(x)))), expression(paste("Nonlinear ", beta(bold(x)))), expression(paste("Linear ", beta(bold(x)))))) sc <- function(which.given, ..., factor.levels) { if (which.given == 2) { strip.default(which.given = which.given, ..., factor.levels) } else { if (which.given == 1) strip.default(which.given = 1, ..., factor.levels = lev) } } mypanel <- function(x, y, groups, subscripts, ...) { med <- tapply(y, x, median, na.rm = TRUE) panel.bwplot(x = x, y = y, col = col[(med == max(med)) + 1], fill = col[(med == max(med)) + 1], pch = "|", ...) panel.abline(v = c(3.5), col = "lightgrey") panel.abline(h = 0, col = "lightgrey") panel.abline(h = max(med), col = col[2]) panel.bwplot(x = x, y = y, col = col[(med == max(med)) + 1], fill = col[(med == max(med)) + 1], pch = "|", ...) # tapply(1:length(y), groups[subscripts], function(i) { # llines(x = 1:nlevels(x), y = y[i][order(x[i])], # col = rgb(0.1, 0.1, 0.1, 0.1)) # }) } @ <>= lto <- levels(out$todistr) lp <- levels(out$PNON) lo <- levels(out$order) ret <- c() i <- 1 for (l1 in lto) { for(l2 in lp) { for (l3 in lo) { os <- subset(out, todistr == l1 & PNON == l2 & order == l3) os <- subset(os, boost_ll - true_ll > -10000) txt <- paste("J = ", l2, "M = ", l3) if (l1 == "Normal") { main <- expression(F[Z] == Phi) } else if (l1 == "Logistic") { main <- expression(F[Z] == expit) } else { main <- expression(F[Z] == MEV) } pdf(fig <- paste("sfig", i, ".pdf", sep = ""), width = 12, height = 8) pl <- bwplot(I(boost_ll - true_ll) ~ model | m + NOBS, data = os, main = main, strip = sc, ylab = "Out-of-sample log-likelihood (centered)", xlab = txt, panel = mypanel, scales = list(y = list(relation = "free"), x = list(at = 1:6, label = models[levels(os$model)]))) plot(pl) dev.off() ret <- c(ret, "\\begin{sidewaysfigure}", "\\begin{center}", paste("\\includegraphics{", fig, "}", sep = ""), "\\end{center}", "\\end{sidewaysfigure}", " ") i <- i + 1 } } } writeLines(ret) @ \clearpage \bibliography{mlt,appl,packages} <>= if (file.exists("packages.bib")) file.remove("packages.bib") pkgversion <- function(pkg) { pkgbib(pkg) packageDescription(pkg)$Version } pkgbib <- function(pkg) { x <- citation(package = pkg, auto = TRUE)[[1]] b <- toBibtex(x) b <- gsub("Buehlmann", "B{\\\\\"u}hlmann", b) i <- grep("title = \\{ICsurv", b) if (length(i) == 1) b[i] <- " title = {ICsurv: A Package for Semiparametric Regression Analysis of Interval-censored Data}," i <- grep("title = \\{dynsurv", b) if (length(i) == 1) b[i] <- " title = {dynsurv: Dynamic Models for Survival Data}," i <- grep("title = \\{np", b) if (length(i) == 1) b[i] <- " title = {np: Nonparametric Kernel Smoothing Methods for Mixed Data Types}," i <- grep("Kenneth Hess", b) if (length(i) == 1) b[i] <- " author = {Kenneth Hess and Robert Gentleman}," b <- gsub("with contributions from", "and", b) b <- gsub("Göran Broström", "G\\\\\"oran Brostr\\\\\"om", b) b <- gsub(" The publishers web page is", "},", b) b <- gsub("http://www.crcpress.com/product/isbn/9781584885740},", "", b) b <- gsub("R package", "\\\\proglang{R} package", b) b[1] <- paste("@Manual{pkg:", pkg, ",", sep = "") if (is.na(b["url"])) { b[length(b)] <- paste(" URL = {http://CRAN.R-project.org/package=", pkg, "}", sep = "") b <- c(b, "}") } cat(b, sep = "\n", file = "packages.bib", append = TRUE) } pkg <- function(pkg) paste("\\\\pkg{", pkg, "} \\\\citep[version~", pkgversion(pkg), ",][]{pkg:", pkg, "}", sep = "") pkgs <- c("mlt", "variables", "basefun", "survival", "trtf") library("trtf") out <- sapply(pkgs, pkg) cat(c("@Manual{pkg:tbm,", " title = {tbm: Transformation Boosting Machines},", " author = {Torsten Hothorn},", paste(" year = ", substr(packageDescription("tbm")$Date, 1, 4), ",", sep = ""), paste(" note = {{R} package version ", packageDescription("tbm")$Version, "},", sep = ""), " url = {https://r-forge.r-project.org/projects/ctm/},", "}"), file = "packages.bib", append = TRUE, sep = "\n") x <- readLines("packages.bib") for (p in pkgs) x <- gsub(paste("\\{", p, ":", sep = ""), paste("\\{\\\\pkg{", p, "}:", sep = ""), x) cat(x, sep = "\n", file = "packages.bib", append = FALSE) @ <>= sessionInfo() @ \end{document}