%\VignetteIndexEntry{Residuals} 
%\VignettePackage{MARSS}
\documentclass[]{article}
%set margins to 1in without fullsty
	\addtolength{\oddsidemargin}{-.875in}
	\addtolength{\evensidemargin}{-.875in}
	\addtolength{\textwidth}{1.75in}

	\addtolength{\topmargin}{-.875in}
	\addtolength{\textheight}{1.75in}
%\usepackage{fullpage} %more standardized margins

% choose options for [] as required from the list
% in the Reference Guide, Sect. 2.2

\usepackage{multirow}
\usepackage[bottom]{footmisc}% places footnotes at page bottom
\usepackage[round]{natbib}

% Math stuff
\usepackage{amsmath} % the standard math package
\usepackage{amsfonts} % the standard math package
%%%% bold maths symbol system:
\def\uupsilon{\pmb{\upsilon}}
\def\llambda{\pmb{\lambda}}
\def\bbeta{\pmb{\beta}}
\def\aalpha{\pmb{\alpha}}
\def\zzeta{\pmb{\zeta}}
\def\etaeta{\mbox{\boldmath $\eta$}}
\def\xixi{\mbox{\boldmath $\xi$}}
\def\pipi{\pmb{\pi}}
\def\ep{\mbox{\boldmath $\epsilon$}}
\def\DEL{\mbox{\boldmath $\Delta$}}
\def\PHI{\mbox{\boldmath $\Phi$}}
\def\PI{\mbox{\boldmath $\Pi$}}
\def\LAM{\mbox{\boldmath $\Lambda$}}
\def\LAMm{\mathbb{L}}
\def\GAM{\mbox{\boldmath $\Gamma$}}
\def\OMG{\mbox{\boldmath $\Omega$}}
\def\SI{\mbox{\boldmath $\Sigma$}}
\def\TH{\mbox{\boldmath $\Theta$}}
\def\UPS{\mbox{\boldmath $\Upsilon$}}
\def\XI{\mbox{\boldmath $\Xi$}}
\def\AA{\mbox{$\mathbf A$}}	\def\aa{\mbox{$\mathbf a$}}
\def\Ab{\mbox{$\mathbf D$}} \def\Aa{\mbox{$\mathbf d$}} \def\Am{\PI}
\def\BB{\mbox{$\mathbf B$}}	\def\bb{\mbox{$\mathbf b$}} \def\Bb{\mbox{$\mathbf J$}} \def\Ba{\mbox{$\mathbf L$}} \def\Bm{\UPS}
\def\CC{\mbox{$\mathbf C$}}	\def\cc{\mbox{$\mathbf c$}}
\def\Ca{\Delta} \def\Cb{\GAM}
\def\DD{\mbox{$\mathbf D$}}	\def\dd{\mbox{$\mathbf d$}}
\def\EE{\mbox{$\mathbf E$}}	\def\ee{\mbox{$\mathbf e$}}
\def\E{\,\textup{\textrm{E}}}	
\def\EXy{\,\textup{\textrm{E}}_{\text{{\bf XY}}}}
\def\FF{\mbox{$\mathbf F$}} \def\ff{\mbox{$\mathbf f$}}
\def\GG{\mbox{$\mathbf G$}}	\def\gg{\mbox{$\mathbf g$}}
\def\HH{\mbox{$\mathbf H$}}	\def\hh{\mbox{$\mathbf h$}}
\def\II{\mbox{$\mathbf I$}} \def\ii{\mbox{$\mathbf i$}}
\def\IIm{\mbox{$\mathbf I$}}
\def\JJ{\mbox{$\mathbf J$}}
\def\KK{\mbox{$\mathbf K$}}
\def\LL{\mbox{$\mathbf L$}}	\def\ll{\mbox{$\mathbf l$}}
\def\MM{\mbox{$\mathbf M$}}  \def\mm{\mbox{$\mathbf m$}}
\def\N{\,\textup{\textrm{N}}}
\def\MVN{\,\textup{\textrm{MVN}}}
\def\OO{\mbox{$\mathbf O$}}
\def\PP{\mbox{$\mathbf P$}}  \def\pp{\mbox{$\mathbf p$}}
\def\QQ{\mbox{$\mathbf Q$}}	 \def\qq{\mbox{$\mathbf q$}} \def\Qb{\mbox{$\mathbf G$}}  \def\Qm{\mathbb{Q}}
\def\RR{\mbox{$\mathbf R$}}	 \def\rr{\mbox{$\mathbf r$}} \def\Rb{\mbox{$\mathbf H$}}	\def\Rm{\mathbb{R}}
\def\Ss{\mbox{$\mathbf S$}}
\def\UU{\mbox{$\mathbf U$}}	\def\uu{\mbox{$\mathbf u$}}
\def\Ub{\mbox{$\mathbf C$}} \def\Ua{\mbox{$\mathbf c$}} \def\Um{\UPS}
%\def\VV{\mbox{$\mathbf V$}}	\def\vv{\mbox{$\mathbf v$}}
\def\VV{\mbox{$\pmb{V}$}}	\def\vv{\mbox{$\pmb{v}$}}
%\def\WW{\mbox{$\mathbf W$}}	\def\ww{\mbox{$\mathbf w$}}
\def\WW{\mbox{$\pmb{W}$}}	\def\ww{\mbox{$\pmb{w}$}}
%\def\XX{\mbox{$\mathbf X$}}
\def\XX{\mbox{$\pmb{X}$}}	\def\xx{\mbox{$\pmb{x}$}}
%\def\xx{\mbox{$\mathbf x$}}
%\def\YY{\mbox{$\mathbf Y$}}
\def\YY{\mbox{$\pmb{Y}$}}	\def\yy{\mbox{$\pmb{y}$}}
%\def\yy{\mbox{$\mathbf y$}}
\def\ZZ{\mbox{$\mathbf Z$}}	\def\zz{\mbox{$\mathbf z$}}	\def\Zb{\mbox{$\mathbf M$}} \def\Za{\mbox{$\mathbf N$}} \def\Zm{\XI}
\def\zer{\mbox{\boldmath $0$}}
\def\chol{\,\textup{\textrm{chol}}}
\def\vec{\,\textup{\textrm{vec}}}
\def\var{\,\textup{\textrm{var}}}
\def\cov{\,\textup{\textrm{cov}}}
\def\diag{\,\textup{\textrm{diag}}}
\def\trace{\,\textup{\textrm{trace}}}
\def\dg{\,\textup{\textrm{dg}}}
\def\hatxtT{\widetilde{\xx}_t^T}
\def\hatxtpT{\widetilde{\xx}_{t+1}^T}
\def\hatxtt{\widetilde{\xx}_t^{t}}
\def\hatxttm{\widetilde{\xx}_t^{t-1}}
\def\hatxone{\widetilde{\xx}_1}
\def\hatxzero{\widetilde{\xx}_0}
\def\hatxtmT{\widetilde{\xx}_{t-1}^T}
\def\hatxtmt1{\widetilde{\xx}_{t-1}^{t-1}}
\def\hatxtpt1{\widetilde{\xx}_{t+1}^{t-1}}
\def\hatxQtm{\widetilde{\mathbb{x}}_{t-1}}
\def\hatyt{\widetilde{\yy}_t}
\def\hatyyt{\widetilde{\mbox{$\mathbf y$}\mbox{$\mathbf y$}^\top}_t}
\def\hatyone{\widetilde{\yy}_1}
\def\hatOt{\widetilde{\OO}_t}
\def\hatWt{\widehat{\WW}_t}
\def\hatWtp{\widehat{\WW}_{t+1}}
\def\hatwt{\widehat{\ww}_t}
\def\hatwtp{\widehat{\ww}_{t+1}}
\def\checkWt{\overline{\WW}_{t}}
\def\checkWtp{\overline{\WW}_{t+1}}
\def\checkwt{\overline{\ww}_t}
\def\checkwtp{\overline{\ww}_{t+1}}
\def\hatYXt{\widetilde{\mbox{$\yy\xx$}}_t}
\def\hatXYt{\widetilde{\mbox{$\xx\yy$}}_t}
\def\hatYXttm{\widetilde{\mbox{$\yy\xx$}}_{t,t-1}}
\def\hatPt{\widetilde{\PP}_t}
\def\hatPtm{\widetilde{\PP}_{t-1}}
\def\hatPQtm{\widetilde{\mathbb{P}}_{t-1}}
\def\hatPttm{\widetilde{\PP}_{t,t-1}}
\def\hatPQttm{\widetilde{\mathbb{P}}_{t,t-1}}
\def\hatPtmt{\widetilde{\PP}_{t-1,t}}
\def\hatVt{\widehat{\VV}_t}
\def\hatvt{\widehat{\vv}_t}
\def\checkvt{\overline{\vv}_t}
\def\checkvtp{\overline{\vv}_{t+1}}
\def\checkVtp{\overline{\VV}_{t+1}}
\def\checkVt{\overline{\VV}_t}
\def\dotvt{\dot{\vv}_t}
\def\dotvtp{\dot{\vv}_{t+1}}
\def\dotVtp{\dot{\VV}_{t+1}}
\def\dotVt{\dot{\VV}_t}
\def\hatVtT{\widetilde{\VV}_t^T}
\def\hatVttm{\widetilde{\VV}_t^{t-1}}
\def\hatVtt{\widetilde{\VV}_t^{t}}
\def\hatVtmT{\widetilde{\VV}_{t-1}^T}
\def\hatVtmt1{\widetilde{\VV}_{t-1}^{t-1}}
\def\hatVttmT{\widetilde{\VV}_{t,t-1}^T}
\def\hatVtmtT{\widetilde{\VV}_{t-1,t}^T}
\def\hatVttmt1{\widetilde{\VV}_{t,t-1}^{t-1}}
\def\hatVtpT{\widetilde{\VV}_{t+1}^T}
\def\hatVttpT{\widetilde{\VV}_{t,t+1}^T}
\def\hatVttpt1{\widetilde{\VV}_{t,t+1}^{t-1}}
\def\hatVtptT{\widetilde{\VV}_{t+1,t}^T}
\def\hatVtptt1{\widetilde{\VV}_{t+1,t}^{t-1}}
\def\hatUtT{\widetilde{\UU}_t^T}
\def\hatUtt1{\widetilde{\UU}_t^{t-1}}
\def\hatStT{\widetilde{\Ss}_t^T}
\def\hatSttm{\widetilde{\Ss}_t^{t-1}}
\def\hatStt{\widetilde{\Ss}_t^{t}}
\def\hatSttmT{\widetilde{\Ss}_{t,t-1}^T}
\def\hatSttmt1{\widetilde{\Ss}_{t,t-1}^{t-1}}
\def\hatSttpT{\widetilde{\Ss}_{t,t+1}^T}
\def\hatSttpt1{\widetilde{\Ss}_{t,t+1}^{t-1}}
\def\hatBmt{\widetilde{\Bm}_t}
\def\hatCat{\widetilde{\Ca}_t}
\def\hatCbt{\widetilde{\Cb}_t}
\def\hatZmt{\widetilde{\Zm}_t}
\def\YYr{\dot{\mbox{$\pmb{Y}$}}}
\def\yyr{\dot{\mbox{$\pmb{y}$}}}
\def\aar{\dot{\mbox{$\mathbf a$}}}
\def\ZZr{\dot{\mbox{$\mathbf Z$}}}
\def\RRr{\dot{\mbox{$\mathbf R$}}}
\def\IR{\nabla}
\usepackage[round]{natbib} % to get references that are like in ecology papers
% \citet{} for inline citation name (year); \citep for citation in parens (name year)

%allow lines in matrix
\makeatletter
\renewcommand*\env@matrix[1][*\c@MaxMatrixCols c]{%
  \hskip -\arraycolsep
  \let\@ifnextchar\new@ifnextchar
  \array{#1}}
\makeatother
\setcounter{tocdepth}{1} %no subsections in toc

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\SweaveOpts{concordance=TRUE}
\author{E. E. Holmes\footnote{Northwest Fisheries Science Center, NOAA Fisheries, Seattle, WA 98112, 
       eli.holmes@noaa.gov, http://faculty.washington.edu/eeholmes}}
\title{Computation of Standardized Residuals for MARSS Models}
\date{original 31 Oct 2014 \\ 
\small{\emph{added normalization calculations, 20 Oct 2020}}}
\maketitle
\begin{abstract}
This report walks through a derivation of the variance-covariance matrix for the joint conditional model and state residuals for multivariate autoregressive Gaussian state-space (MARSS) models. The bulk of the report focuses on `smoothations' residuals \citep{Harveyetal1998}, which are the residuals conditioned on all the data $t=1$ to $T$. The final part of the report covers `innovations' residuals, which are residuals conditioned on the data $t=1$ to $t-1$. 

The MARSS model can be written: $\xx_t=\BB\xx_{t-1}+\uu+\ww_t$, $\yy_t=\ZZ\xx_t+\zz+\vv_t$, where $\ww_t$ and $\vv_t$ are independent multivariate Gaussian error-terms with variance-covariance matrices $\QQ_t$ and $\RR_t$ respectively. The joint conditional residuals are the $\ww_t$ and $\vv_t$ conditioned on the observed data, which may be incomplete (missing values). Harvey, Koopman and Penzer (1998) show a recursive algorithm for the smoothation residuals (residuals conditioned on all the data). I show an alternate algorithm to compute these residuals using the conditional variances of the states and the conditional covariance between unobserved data and states. This allows one to compute the variance of un-observed smoothation residuals (residuals associated with missing or left-out data), which is needed for leave-one-out cross-validation tests using smoothation residuals. I show how to modify the Harvey et al. algorithm in the case of missing values and how to modify it to return the non-normalized conditional residuals.
\end{abstract}
Keywords: Time-series analysis, Kalman filter, residuals, maximum-likelihood, vector autoregressive model, dynamic linear model, parameter estimation, state-space
\vfill
{\noindent \small citation: Holmes, E. E. 2014. Computation of standardized residuals for (MARSS) models. Technical Report. arXiv:1411.0045 }
 \newpage
 
\section{Overview of MARSS residuals}

This report discusses the computation of the variance of the conditional model and state residuals for MARSS models of the  form:
\begin{equation}\label{eq:residsMARSS}
\begin{gathered}
\xx_t = \BB_t\xx_{t-1} + \uu_t + \ww_t, \text{ where } \WW_t \sim \MVN(0,\QQ_t)\\
\yy_t = \ZZ_t\xx_t + \aa_t + \vv_t, \text{ where } \VV_t \sim \MVN(0,\RR_t)\\
\XX_0 \sim \MVN(\xixi,\LAM) \text{ or } \xx_0 = \pipi .
\end{gathered}
\end{equation}
The state and model residuals are respectively
\begin{equation}\label{eq:resids}
\begin{gathered}
\ww_t = \xx_t - \BB_t\xx_{t-1} - \uu_t\\
\vv_t = \yy_t - \ZZ_t\xx_t - \aa_t .
\end{gathered}
\end{equation}
The model (and state) residuals are a random variables since $\yy_t$ and $\xx_t$ are drawn from the joint multivariate distribution of $\YY_t$ and $\XX_t$ defined by the MARSS equations (Equation \ref{eq:residsMARSS}).
The unconditional\footnote{meaning not conditioning on any particular set of observed data but rather taking the expectation across all possible values of $\yy_t$ and $\xx_t$.} variance of the model residuals is
\begin{equation}\label{eq:unconditiondistofVt}
\var_{XY}[\VV_t] = \var_{XY}[\YY_t - (\ZZ_t \XX_t + \aa_t)] = \RR_t\\
\end{equation}
based on the distribution of $\VV_t$ in Equation \ref{eq:residsMARSS}.  $\var_{XY}$ indicates that the integration is over the joint unconditional distribution of $\XX$ and $\YY$. 

Once we have data, $\RR_t$ is not the variance-covariance matrix of our model residuals because our residuals are now conditioned\footnote{`conditioned' means that the probability distribution of the residual has changed. The distribution is now the distribution given that $\YY=\yy$, say. Expectations and variances $\var[ ]$ are integrals over the value that a random variable might take multiplied by the probability of that value. When presenting an   `expectation', the probability distribution is normally implicit but for derivations involving conditional expectations, it is important to be explicit about the distribution that is being integrated over.} on a set of observed data. There are two types of conditional model residuals used in MARSS analyses: innovations and smoothations.  Innovations are the model residuals at time $t$ using the expected value of $\XX_t$ conditioned on the data from 1 to $t-1$.  Smoothations  are the model residuals using the expected value of $\XX_t$ conditioned on all the data, $t=1$ to $T$.  Smoothations are used in computing standardized residuals for outlier and structural break detection \citep{HarveyKoopman1992, Harveyetal1998, deJongPenzer1998, CommandeurKoopman2007}.

It should be noted that all the calculations discussed here are conditioned on the MARSS parameters: $\BB$, $\QQ$, $\UU$, $\RR$, $\ZZ$ and $\AA$. These are treated as known. This is different than standard discussions of residual distributions for linear models where the uncertainty in the model parameters enters into the calculations (as it enters into the calculation of the influence of $\yy$ on the expected (or fitted) value of $\yy$). In the calculations in this report, $\yy$ does not affect the estimates of the parameters (which are fixed, perhaps at estimated values) but does affect the expected value of $\YY_t$ by affecting the estimate of the expected value and variance of $\XX_t$. 

\section{Distribution of MARSS smoothation residuals}\label{sec:smoothations}

This section discusses computation of the variance of the model and state residuals conditioned on all the data from $t=1$ to $T$.  These MARSS residuals are often used for outlier detection and shock detection, and in this case you only need the distribution of the model residuals for the observed values.  However if you wanted to do a leave-one-out cross-validation, you would need to know the distribution of the residuals for data points you left out (treated as unobserved).  The equations in this report give you the former and the latter, while the algorithm by \citet{Harveyetal1998} gives only the former. These equations for residuals for `left-out` data are different that other (typical) discussions of state-space cross-validation \citep{deJong1988} in that they are conditioned on all the data (smoothations residuals) rather than conditioned on data up to $t-1$ (innovations residuals).

\subsection{Notation and relations}

Throughout, I follow the convention that capital letters are random variables and small letters are a realization from the random variable.  This only applies to random variables; parameters are not random variables\footnote{in a frequentist framework}. Parameters are shown in Roman font while while random variables are bold slanted font. Parameters written as capital letters are matrices, while parameters written in small letters are strictly column matrices. 

In this report, the distribution over which the integration is done in an expectation or variance  is given by the subscript; e.g., $\E_A[f(A)]$ indicates an unconditional expectation over the distribution of $A$ without conditioning on another random variable while $\E_{A|b}[f(A)|b]$ would indicate an expectation over the distribution of $A$ conditioned on $B=b$; presumably $A$ and $B$ are not independent otherwise $B=b$ would have no effect on $A$. $\E_{A|b}[f(A)|b]$ is a fixed value, not random. It is the expected value when $B=b$. In contrast, $\E_{A|B}[f(A)|B]$ denotes the random variable over all the possible $\E_{A|b}[f(A)|b]$ given all the possible $b$ values that $B$ might take. The variance of $\E_{A|B}[f(A)|B]$ is the variance of this random variable. The variance of $\E_{A|b}[f(A)|b]$ in contrast is 0 since it is a fixed value.  We will often be working with the random variables, $\E_{A|B}[f(A)|B]$ or $\var_{A|B}[f(A)|B]$, inside an expectation or variance: such as $\var_B[\E_{A|B}[f(A)|B]]$.

\subsubsection{Law of total variance}

The ``law of total variance'' can be written
\begin{equation}\label{eq:lawoftotvar}
\var_A[A] = \var_B[\E_{A|B}[A|B]] + \E_B[\var_{A|B}[A|B]] .
\end{equation}
The subscripts on the inner expectations make it explicit that the expectations are being taken over the conditional distributions.  $\var_{A|B}[A|B]$ and $\E_{A|B}[A|B]$ are random variables because the $B$ in the conditional is a random variable. We take the expectation or variance with $B$ fixed at one value, $b$, but $B$ can take other values of $b$ also.  

Going forward, I will write the law or total variance more succinctly as
\begin{equation}
\var[A] = \var_B[\E[A|B]] + \E_B[\var[A|B]] .
\end{equation}
I leave off the subscript on the inner conditional expectation or variance. Just remember that when you see a conditional in an expectation or variance, the integration is over over the conditional distribution of $A$ conditioned on $B=b$. Even when you see $A|B$, the conditioning is on $B=b$ and the $B$ indicates that this is a random variable because $B$ can take different $b$ values. When computing $\var_B[\E_{A|B}[A|B]]$, we will typically compute $\E_{A|b}[A|b]$ and then compute (or infer) the variance or expectation of that over all possible values of $b$. 

The law of total variance will appear in this report in the following form:
\begin{equation}
\var_{XY}[f(\YY,\XX)] = \var_{Y^{(1)}}[\E_{XY|Y^{(1)}}[f(\YY,\XX)|\YY^{(1)}]] + \E_{Y^{(1)}}[\var_{XY|Y^{(1)}}[f(\YY,\XX)|\YY^{(1)}]] ,
\end{equation}
where $f(\YY_t,\XX_t)$ is some function of $\XX_t$ and $\YY_t$ and $\YY^{(1)}$ is the observed data from $t=1$ to $T$ ($\YY^{(2)}$ is the unobserved data). 


\subsection{Model residuals conditioned on all the data}

Define the smoothations $\hatvt$ as:
\begin{equation}\label{eq:vtT}
\hatvt = \yy_t - \ZZ_t\hatxtT - \aa_t,
\end{equation}
where  $\hatxtT$ is $\E[\XX_t|\yy^{(1)}]$. The smoothation is different from $\vv_t$ because it uses $\hatxtT$ not $\xx_t$; $\xx_t$ is not known, and $\hatxtT$ is its estimate. $\hatxtT$ is output by the Kalman smoother. $\yy^{(1)}$ means all the observed data from $t=1$ to $T$. $\yy^{(1)}$ is a sample from the random variable $\YY^{(1)}$. The unobserved $\yy$ will be termed $\yy^{(2)}$ and is a sample from the random variable $\YY^{(2)}$. When $\YY$ appears without a superscript, it means both $\YY^{(1)}$ and $\YY^{(2)}$ together. Similarly $\yy$ means both $\yy^{(1)}$ and $\yy^{(2)}$ together---the observed data that we use to estimate $\hatxtT$ and the unobserved data that we do not use and may or may not know. $\hatvt$ exists for both $\yy^{(1)}$ and $\yy^{(2)}$, though we might not know $\yy^{(2)}$ and thus might not know its corresponding $\hatvt$. In some cases, however, we do know $\yy^{(2)}$; they are data that we left out of our model fitting, in say a k-fold or leave-one-out cross-validation.

$\hatvt$ is a sample from the random variable $\hatVt$.  We want to compute the mean and variance of this random variable over all possibles values that $\XX_t$ and $\YY_t$ might take. The mean of $\hatVt$ is 0 and we are concerned only with computing the variance:
\begin{equation}\label{eq:var.vtT}
\var[\hatVt] = \var_{XY}[\YY_t - \ZZ_t\E[\XX_t|\YY^{(1)}] - \aa_t] .
\end{equation}
Notice we have an unconditional variance over ${\XX,\YY}$ on the outside and a conditional expectation over a specific value of $\YY^{(1)}$ on the inside (in the $\E[\;]$).

From the law of total variance (Equation \ref{eq:lawoftotvar}), we can write the variance of the model residuals as
\begin{equation}\label{eq:varvvtgeneral}
\var[\hatVt] = \var_{Y^{(1)}}[\E[\hatVt|\YY^{(1)}]] + \E_{Y^{(1)}}[\var[\hatVt|\YY^{(1)}]] .
\end{equation}

\subsubsection{First term on right hand side of Equation \ref{eq:varvvtgeneral}}

The random variable inside the $\var[\;]$ in the first term is
\begin{equation}\label{eq:first.term.rhs.varvvtgeneral}
\E[\hatVt|\YY^{(1)}]= \E[(\YY_t + \ZZ_t \E[\XX_t|\YY^{(1)}] + \aa_t)|\YY^{(1)}] .
\end{equation}
Let's consider this for a specific value $\YY^{(1)}=\yy^{(1)}$.
\begin{equation}
\E[\hatVt|\yy^{(1)}] = \E[(\YY_t + \ZZ_t \E[\XX_t|\yy^{(1)}] + \aa_t)|\yy^{(1)}] =
\E[\YY_t|\yy^{(1)}] + \ZZ_t \E[\E[\XX_t|\yy^{(1)}]|\yy^{(1)}] + \E[\aa_t|\yy^{(1)}] .
\end{equation}
$\E[\XX_t|\yy^{(1)}]$ is a fixed value, and the expected value of a fixed value is itself. So $\E[\E[\XX_t|\yy^{(1)}]|\yy^{(1)}]=\E[\XX_t|\yy^{(1)}]$. 
Thus,
\begin{equation}
\E[\hatVt|\yy^{(1)}] = \E[\YY_t|\yy^{(1)}] + \ZZ_t \E[\XX_t|\yy^{(1)}] + \E[\aa_t|\yy^{(1)}] .
\end{equation}
We can move the conditional out and write
\begin{equation}
\E[\hatVt|\yy^{(1)}]= \E[(\YY_t + \ZZ_t \XX_t + \aa_t)|\yy^{(1)}]=\E[\VV_t|\yy^{(1)}].
\end{equation}
The right side is $\E[\VV_t|\yy^{(1)}]$, no `hat' on the $\VV_t$, and this applies for all $\yy^{(1)}$.  This means that the first term in Equation \ref{eq:varvvtgeneral} can be written with no hat on $\VV$:
\begin{equation}\label{eq:no.hat.on.V}
\var_{Y^{(1)}}[\E[\hatVt|\YY^{(1)}]] = \var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}]] .
\end{equation}
If $\YY_t$ were completely observed (no missing values), this would be zero since $\E[\VV_t|\yy]$ would be a fixed value in that case. But $\YY_t$ is not assumed to be fully observed; it may have $\YY^{(2)}_t$ which is unobserved, or more precisely, not included in the estimation of $VV_t$ for whatever reason (`unknown" because it was unobserved being one reason). The derivation of $\E[\YY_t|\yy^{(1)}]$ is given in \citet{Holmes2010}\footnote{$\E[\YY^{(2)}_t|\yy^{(1)}]$ is not $\ZZ_t \hatxtT + \aa_t$ in general since $\YY^{(2)}_t$ and $\YY^{(1)}_t$ may be correlated through $\RR$ in addition to being correlated through $\hatxtT$}.

Using the law of total variance, we can re-write $\var[\VV_t]$ as:
\begin{equation}\label{eq:varianceVt}
\var[\VV_t] = \var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}]] + \E_{Y^{(1)}}[\var[\VV_t|\YY^{(1)}]].
\end{equation}
From Equation \ref{eq:varianceVt}, we can solve for $\var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}]]$:
\begin{equation}\label{eq:var.E.vtT}
\var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}]] = \var[\VV_t] - \E_{Y^{(1)}}[\var[\VV_t|\YY^{(1)}]].
\end{equation}
From Equation \ref{eq:unconditiondistofVt}, we know that $\var[\VV_t]=\RR_t$ (this is the unconditional variance). Thus,
\begin{equation}\label{eq:var.E.vtT.R}
\var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}]] = \RR_t - \E_{Y^{(1)}}[\var[\VV_t|\YY^{(1)}]].
\end{equation}

The second term in Equation \ref{eq:var.E.vtT.R} to the right of the equal sign and inside the expectation is $\var[\VV_t|\YY^{(1)}]$. This is the variance of $\VV_t$ with $\YY^{(1)}$ held at a specific fixed $\yy^{(1)}$. The variability in $\var[\VV_t|\yy^{(1)}]$ (notice $\yy^{(1)}$ not $\YY^{(1)}$ now) comes from $\XX_t$ and $\YY^{(2)}$ which are random variables. Let's compute this variance for a specific $\yy^{(1)}$ value.
\begin{equation}\label{eq:varvtcondy}
\var[\VV_t|\yy^{(1)}] = \var[ \YY_t - \ZZ_t\XX_t-\aa_t | \yy^{(1)} ].
\end{equation}
Notice that there is no $\E$ (expectation) on the $\XX_t$; this is $\VV_t$ not $\hatVt$. $\aa_t$ is a fixed value and can be dropped.

Equation \ref{eq:varvtcondy} can be written as\footnote{$\var(A+B)=\var(A)+\var(B)+\cov(A,B)+\cov(B,A)$}:
\begin{equation}\label{eq:var.Vt.yy}
\begin{split}
\var[\VV_t|\yy^{(1)}] &= \var[ \YY_t - \ZZ_t\XX_t | \yy^{(1)} ]\\
&=\var[ - \ZZ_t\XX_t | \yy^{(1)} ] + \var[ \YY_t|\yy^{(1)}] + \cov[ \YY_t, - \ZZ_t\XX_t | \yy^{(1)} ] + \cov[ - \ZZ_t\XX_t, \YY_t | \yy^{(1)} ]\\
&=\ZZ_t \hatVtT \ZZ_t^\top + \hatUtT - \hatStT\ZZ_t^\top - \ZZ_t(\hatStT)^\top .
\end{split}
\end{equation}
$\hatVtT = \var[ \XX_t | \yy^{(1)} ]$ and is output by the Kalman smoother. $\hatUtT=\var[\YY_t|\yy^{(1)}]$ and $\hatStT=\cov[\YY_t,\XX_t|\yy^{(1)}]$. The equations for these are given in \citet{Holmes2010} and are output by the \verb@MARSShatyt()@ function in the \{MARSS\} package\footnote{$\hatUtT$ is  \texttt{OtT - tcrossprod(ytT)} in the \texttt{MARSShatyt()} output.}.  If there were no missing data, i.e., if $\yy^{(1)}=\yy$, then $\hatUtT$ and $\hatStT$ would be zero because $\YY_t$ would be fixed at $\yy_t$. This would reduce Equation \ref{eq:var.Vt.yy} to $\ZZ_t \hatVtT \ZZ_t^\top$. But we are concerned with the case where there are missing values. Those missing values need not be for all $t$. That is, there may be some observed $y$ at time t and some missing $y$. $\yy_t$ is multivariate.

From Equation \ref{eq:var.Vt.yy}, we know $\var[\VV_t|\yy^{(1)}]$ for a specific $\yy^{(1)}$. We want $\E_{Y^{(1)}}[\var[\VV_t|\YY^{(1)}]]$ which is its expected value over all possible values of $\yy^{(1)}$.  $\VV_t$ is a multivariate normal random variable with two random variables $\YY^{(1)}$ and $\YY^{(2)}$. The conditional variance of a multivariate Normal does not depend on the value that you are conditioning on. Let the $\AA$ be a N-dimensional multivariate Normal random variable partitioned into $\AA_1$ and $\AA_2$ with variance-covariance matrix $\Sigma = \begin{bmatrix}
\Sigma_1 & \Sigma_{12} \\
\Sigma_21 & \Sigma_{2}
\end{bmatrix}$.  The variance-covariance matrix of $\AA$ conditioned on $\AA_1=\aa$ is $\Sigma = \begin{bmatrix}
0 & 0 \\
0 & \Sigma_2 - \Sigma_{12}\Sigma_{1}\Sigma_{21}
\end{bmatrix}$. Notice that $\aa$ does not appear in the conditional variance matrix. This means that $\var[\VV_t|\yy^{(1)}]$ does not depend on $\yy^{(1)}$. Its variance only depends on the MARSS model parameters\footnote{This also implies that $\hatVtT$, $\hatUtT$ and $\hatStT$ do not depend on $\yy^{(1)}$, which is rather counter-intuitive since $\yy^{(1)}$ is part of the algorithm (Kalman filter and smoother) used to compute these values. The value of $\yy^{(1)}$ affects the expected values of $\XX_t$ but only its presence\ (versus absence) affects $\XX_t$'s conditional variance.}.

Because $\var[\VV_t|\yy^{(1)}]$ only depends on the MARSS parameters values, $\QQ$, $\BB$, $\RR$, etc., the second term in Equation \ref{eq:var.E.vtT}, $\E_{Y^{(1)}}[\var[\VV_t|\YY^{(1)}]]$, is equal to $\var[\VV_t|\yy^{(1)}]$ (Equation \ref{eq:var.Vt.yy}). Putting this into Equation \ref{eq:var.E.vtT.R}, we have
\begin{equation}\label{eq:conditionalvtfinala}
\var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)})]]  = \RR_t - \var[\VV_t|\yy^{(1)}] = \RR_t - \ZZ_t \hatVtT \ZZ_t^\top - \hatUtT + \hatStT\ZZ_t^\top + \ZZ_t(\hatStT)^\top.
\end{equation}
Since $\var_{Y^{(1)}}[\E[\VV_t|\YY^{(1)})]] = \var_{Y^{(1)}}[\E[\hatVt|\YY^{(1)})]]$ (Equation \ref{eq:no.hat.on.V}), this means that the first term in Equation \ref{eq:varvvtgeneral} is
\begin{equation}\label{eq:conditionalvtfinal}
\var_{Y^{(1)}}[\E[\hatVt|\YY^{(1)})]]  =  \RR_t - \ZZ_t \hatVtT \ZZ_t^\top - \hatUtT + \hatStT\ZZ_t^\top + \ZZ_t(\hatStT)^\top.
\end{equation}

\subsubsection{Second term on right hand side of Equation \ref{eq:varvvtgeneral}}

Consider the second term in Equation \ref{eq:varvvtgeneral}.  This term is 
\begin{equation}\label{eq:second.term.rhs.9}
\E_{Y^{(1)}}[\var[\hatVt|\YY^{(1)}]] = \E_{Y^{(1)}}[\var[(\YY_t-\ZZ_t\E[\XX_t|\YY^{(1)}]-\aa_t)|\YY^{(1)}]] .
\end{equation}
The middle term is:
\begin{equation}
\E_{Y^{(1)}}[\var[\E[\XX_t|\YY^{(1)}]|\YY^{(1)}]].
\end{equation}
Let's solve the inner part for a specific $\YY^{(1)}=\yy^{(1)}$. $\E[\XX_t|\yy^{(1)}]$ is a fixed value. Thus  $\var[\E[\XX_t|\yy^{(1)}]|\yy^{(1)}]=0$ since the variance of a fixed value is 0. This is true for all $\yy^{(1)}$ so the middle term reduces to 0. $\aa_t$ is also fixed and its variance is also 0. The covariance between a random variable and a fixed value is 0\footnote{$\var[A+B] = \var[A]+\var[B]+\cov[A,B]$}. Thus for a specific $\YY^{(1)}=\yy^{(1)}$, the inside of the right hand side expectation reduces to $\var[\YY_t|\yy^{(1)}]$ which is $\hatUtT$. As noted in the previous section, $\hatUtT$ is only a function of the MARSS parameters; it is not a function of $\yy^{(1)}$ and $\var[\YY_t|\yy^{(1)}]=\hatUtT$ for all $\yy^{(1)}$. Thus the second term in Equation \ref{eq:varvvtgeneral} is simply $\hatUtT$:
\begin{equation}\label{eq:second.term.rhs.9final}
\E_{Y^{(1)}}[\var[\hatVt|\YY^{(1)}]] = \var[\hatVt|\yy^{(1)}] = \hatUtT .
\end{equation}

\subsubsection{Putting together the first and second terms}

We can now put the first and second terms in Equation \ref{eq:varvvtgeneral} together (Equations \ref{eq:conditionalvtfinal} and \ref{eq:second.term.rhs.9final}) and write out the variance of the model residuals:
\begin{equation}\label{eq:first.and.secons.vvtgeneral}
\begin{split}
\var[\hatVt] &= \RR_t - \ZZ_t \hatVtT \ZZ_t^\top - \hatUtT + \hatStT\ZZ_t^\top + \ZZ_t(\hatStT)^\top + \hatUtT\\
&= \RR_t - \ZZ_t \hatVtT \ZZ_t^\top + \hatStT\ZZ_t^\top + \ZZ_t(\hatStT)^\top .
\end{split}
\end{equation}
Equation \ref{eq:first.and.secons.vvtgeneral} will reduce to $\RR_t - \ZZ_t \hatVtT \ZZ_t^\top$ if $\yy_t$ has no missing values since $\hatStT = 0$ in this case.  If $\yy_t$ is all missing values, $\hatStT = \ZZ_t \hatVtT$ because 
\begin{equation}\label{eq:cov.Yt.Xt.no.missing.vals}
\cov[\YY_t, \XX_t|\yy^{(1)}] = \cov[\ZZ_t \XX_t + \aa_t + \VV_t, \XX_t|\yy^{(1)}] = \cov[\ZZ_t \XX_t, \XX_t|\yy^{(1)}] = \ZZ_t \cov[\XX_t, \XX_t|\yy^{(1)}] = \ZZ_t \hatVtT .
\end{equation}
The reduction in Equation \ref{eq:cov.Yt.Xt.no.missing.vals} occurs because $\VV_t$ and $\WW_t$ and by extension $\VV_t$ and $\XX_t$ are independent in the form of MARSS model used in this report (Equation \ref{eq:residsMARSS})\footnote{This is not the case for the \citet{Harveyetal1998} form of the MARSS model where $\VV_t$ and $\WW_t$ are allowed to be correlated.}. Thus when $\yy_t$ is all missing values, Equation \ref{eq:first.and.secons.vvtgeneral} will reduce to $\RR_t + \ZZ_t \hatVtT \ZZ_t^\top$ . The behavior if $\yy_t$ has some missing and some not missing values depends on whether $\RR_t$ is a diagonal matrix or not, i.e., if the $\yy_t^{(1)}$ and $\yy_t^{(2)}$ are correlated.

\subsection{State residuals conditioned on the data}

The state residuals are $\xx_t - (\BB_t \xx_{t-1} + \uu_t)=\ww_t$.  The unconditional expected value of the state residuals is $\E[\XX_t - (\BB_t \XX_{t-1} + \uu_t)] = \E[\WW_t] = 0$ and the unconditional variance of the state residuals is
\begin{equation}
\var[\XX_t - (\BB_t \XX_{t-1} + \uu_t)] = \var[\WW_t] = \QQ_t
\end{equation}
based on the definition of $\WW_t$ in Equation \ref{eq:residsMARSS}.
The conditional state residuals (conditioned on the data from $t=1$ to $t=T$) are defined as
\begin{equation}
\hatwt = \hatxtT - \BB_t\hatxtmT - \uu_t.
\end{equation}
where $\hatxtT = E[\XX_t|\yy^{(1)}]$ and $\hatxtmT = E[\XX_{t-1}|\yy^{(1)}]$.  $\hatwt$ is a sample from the random variable $\hatWt$; random over different possible data sets.  The expected value of $\hatWt$ is 0, and we are concerned with computing its variance.

We can write the variance of $\WW_t$ (no hat) using the law of total variance.
\begin{equation}\label{eq:Wlawoftotvar}
\var[\WW_t] = \var_{Y^{(1)}}[\E[\WW_t|\YY^{(1)}]] + \E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]] .
\end{equation}
Notice that
\begin{equation}
\E[\WW_t|\yy^{(1)}] = \E[(\XX_t - \BB_t \XX_{t-1} - \uu_t)|\yy^{(1)}] =  \hatxtT - \BB_t \hatxtmT - \uu_t = \E[\hatWt|\yy^{(1)}] = \hatwt .
\end{equation}
This is true for all $\yy^{(1)}$, thus $\E[\WW_t|\YY^{(1)}]$ is $\hatWt$, and $\var_{Y^{(1)}}[\E[\WW_t|\YY^{(1)}]] = \var[\hatWt]$. Equation \ref{eq:Wlawoftotvar} can thus be written
\begin{equation}
\var[\WW_t] = \var[\hatWt] + \E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]] .
\end{equation}
Solve for $\var[\hatWt]$:
\begin{equation}\label{eq:varwwt}
\var[\hatWt] = \var[\WW_t] - \E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]].
\end{equation}

The variance in the expectation on the far right for a specific $\YY^{(1)}=\yy^{(1)}$ is
\begin{equation}
\var[\WW_t|\yy^{(1)}] = \var[ (\XX_t - \BB_t\XX_{t-1}-\uu_t) | \yy^{(1)} ] .
\end{equation}
$\uu_t$  is not a random variable and can be dropped. Thus\footnote{$\var[A-B]=\var[A]+\var[B]+\cov[A,-B]+\cov[-B,A]$. Be careful about the signs in this case as they are a little non-intuitive.},
\begin{equation}\label{eq:var.W.cond.y1}
\begin{split}
\var[\WW_t&|\yy^{(1)}] = \var[ (\XX_t - \BB_t\XX_{t-1}) | \yy^{(1)} ] \\
& = \var[ \XX_t | \yy^{(1)} ] + \var[\BB_t\XX_{t-1} | \yy^{(1)} ] + \cov[\XX_t, -\BB_t\XX_{t-1} | \yy^{(1)} ] + \cov[ -\BB_t\XX_{t-1}, \XX_t | \yy^{(1)} ]\\
& = \hatVtT + \BB_t \hatVtmT \BB_t^\top - \hatVttmT\BB_t^\top - \BB_t\hatVtmtT .
\end{split}
\end{equation}
Again this is conditional multivariate normal variance, and its value does not depend on the value, $\yy^{(1)}$ that we are conditioning on.  It depends only on the parameters values, $\QQ$, $\BB$, $\RR$, etc., and is the same for all values of $\yy^{(1)}$. So $\E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]] = \var[\WW_t|\yy^{(1)}]$, using any value of $\yy^{(1)}$. Thus
\begin{equation}\label{eq:E.var.Wt.yt}
\E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]] =  \hatVtT + \BB_t\hatVtmT\BB_t^\top - \hatVttmT\BB_t^\top - \BB_t\hatVtmtT .
\end{equation}

Putting $\E_{Y^{(1)}}[\var[\WW_t|\YY^{(1)}]]$ from Equation \ref{eq:E.var.Wt.yt} and $\var[\WW_t]=\QQ_t$ into Equation \ref{eq:varwwt}, the variance of the conditional state residuals is
\begin{equation}
\var[\hatWt] = \QQ_t - \hatVtT - \BB_t\hatVtmT\BB_t^\top + \hatVttmT\BB_t^\top + \BB_t\hatVtmtT .
\end{equation}

\subsection{Covariance of the conditional model and state residuals}

The unconditional model and state residuals, $\VV_t$ and $\WW_t$, are independent by definition\footnote{This independence is specific to the way the MARSS model for this report (Equation \ref{eq:residsMARSS}). It is possible for the model and state residuals to covary. In the MARSS model written in \citet{Harveyetal1998} form, they do covary.} (in Equation \ref{eq:residsMARSS}), i.e., $\cov[\VV_t,\WW_t]=0$.  However the conditional model and state residuals, $\cov[\hatVt,\hatWt]$, are not independent since both depend on $\yy^{(1)}$.  
Using the law of total covariance, we can write
\begin{equation}\label{eq:covhatVtWt1}
\cov[\hatVt,\hatWt] = 
\cov_{Y^{(1)}}[\E[\hatVt|\YY^{(1)}],\E[\hatWt|\YY^{(1)}]] + \E_{Y^{(1)}}[\cov[\hatVt, \hatWt|\YY^{(1)}]] .
\end{equation}

For a specific value of $\YY^{(1)}=\yy^{(1)}$, the covariance in the second term on the right is $\cov[\hatVt, \hatWt|\yy^{(1)}]$. Conditioned on a specific value of $\YY^{(1)}$, $\hatWt$ is a fixed value, $\hatwt = \hatxtT - \BB_t\hatxtmT - \uu_t$, and conditioned on $\yy^{(1)}$, $\hatxtT$ and $\hatxtmT$ are fixed values. $\uu_t$ is also fixed; it is a parameter. $\hatVt$ is not a fixed value because it has $\YY_t^{(2)}$ and that is a random variable.  Thus $\cov[\hatVt, \hatWt|\yy^{(1)}]$ is the covariance between a random variable and a fixed variable and thus the covariance is 0. This is true for all $\yy^{(1)}$.
Thus the second right-side term in Equation \ref{eq:covhatVtWt1} is zero, and the equation reduces to
\begin{equation}\label{eq:covhatVtWt3}
\cov[\hatVt,\hatWt] = \cov_{Y^{(1)}}[\E[\hatVt|\YY^{(1)}],\E[\hatWt|\YY^{(1)}]].
\end{equation}
Notice that $\E[\hatWt|\yy^{(1)}]=\E[\WW_t|\yy^{(1)}]$ and $\E[\hatVt|\yy^{(1)}]=\E[\VV_t|\yy^{(1)}]$ since
\begin{equation}
\E[\WW_t|\yy^{(1)}]= \E[\XX_t|\yy^{(1)}]-\BB_t\E[\XX_{t-1}|\yy^{(1)}] - \uu_t = \hatxtT-\BB_t\hatxtmT - \uu_t = \hatwt = \E[\hatWt|\yy^{(1)}]
\end{equation}
and
\begin{equation}
\E[\VV_t|\yy^{(1)}]= \E[\YY_t|\yy^{(1)}]-\ZZ_t\E[\XX_{t}|\yy^{(1)}] - \aa_t = \E[\YY_t|\yy^{(1)}]-\ZZ_t \hatxtT - \aa_t = \E[\hatVt|\yy^{(1)}] .
\end{equation}
Thus the right side of Equation \ref{eq:covhatVtWt3} can be written in terms of $\VV_t$ and $\WW_t$ instead of $\hatVt$ and $\hatWt$:
\begin{equation}\label{eq:covhatVtWt2}
\cov[\hatVt,\hatWt] = \cov_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}],\E[\WW_t|\YY^{(1)}]].
\end{equation}

Using the law of total covariance, we can write:
\begin{equation}\label{eq:covVtWt}
\cov[\VV_t, \WW_t] = \E_{Y^{(1)}}[\cov[\VV_t, \WW_t|\YY^{(1)}]] + \cov_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}],\E[\WW_t|\YY^{(1)}]] .
\end{equation}
The unconditional covariance of $\VV_t$ and $\WW_t$  is 0. Thus the left side of Equation \ref{eq:covVtWt} is 0 and we can rearrange the equation as
\begin{equation}\label{eq:covVtWt2}
\cov_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}],\E[\WW_t|\YY^{(1)}]] = - \E_{Y^{(1)}}[\cov[\VV_t, \WW_t|\YY^{(1)}]] .
\end{equation}
Combining Equation \ref{eq:covhatVtWt2} and \ref{eq:covVtWt2}, we get
\begin{equation}\label{eq:conditionalcovvtwt}
\cov[\hatVt,\hatWt] = - \E_{Y^{(1)}}[ \cov[\VV_t, \WW_t|\YY^{(1)}] ] ,
\end{equation}
and our problem reduces to solving for the conditional covariance of the model and state residuals (right side of Equation \ref{eq:conditionalcovvtwt}).  

For a specific $\YY^{(1)}=\yy^{(1)}$, the conditional covariance $\cov[\VV_t, \WW_t|\yy^{(1)}]$ can be written out as
\begin{equation}
\cov[\VV_t, \WW_t|\yy^{(1)}] = \cov[\YY_t-\ZZ_t\XX_t-\aa_t,\, \XX_t-\BB_t\XX_{t-1}-\uu_t|\yy^{(1)}] .
\end{equation}
$\aa_t$ and $\uu_t$ are fixed values and can be dropped. Thus\footnote{$\cov[\BB \mathbf{A},\CC \mathbf{D}]=\BB\cov[\mathbf{A},\mathbf{D}]\CC^\top$.}
\begin{equation}
\begin{split}
\cov&[\VV_t, \WW_t|\yy^{(1)}] =\cov[\YY_t-\ZZ_t\XX_t, \XX_t-\BB_t\XX_{t-1}|\yy^{(1)}] \\
& =\cov[\YY_t,\XX_t|\yy^{(1)}] + \cov[\YY_t,-\BB_t\XX_{t-1}|\yy^{(1)}] + \cov[-\ZZ_t\XX_t,\XX_t|\yy^{(1)}] + \cov[-\ZZ_t\XX_t,-\BB_t\XX_{t-1}|\yy^{(1)}]\\
& = \hatStT - \hatSttmT\BB_t^\top - \ZZ_t \hatVtT + \ZZ_t\hatVttmT\BB_t^\top ,
\end{split}
\end{equation}
where $\hatStT=\cov[\YY_t,\XX_t|\yy^{(1)}]$ and $\hatSttmT=\cov[\YY_t,\XX_{t-1}|\yy^{(1)}]$; the equations for $\hatStT$ and $\hatSttmT$ are given in \citet{Holmes2010} and are output by the \verb@MARSShatyt()@ function in the \{MARSS\} package.

Both $\VV_t$ and $\WW_t$ are multivariate normal random variables that depend on $\YY^{(1)}$ and $\YY^{(2)}$ and the conditional covariance is not a function of the variable that we condition on (in this case $\yy^{(1)}$). The conditional covariance is only a function of the MARSS parameters\footnote{By extension, this is also the case for $\hatVtT$, $\hatVttmT$, $\hatStT$ and $\hatSttmT$ which may seem counter-intuitive, but you can show it is true by working through the Kalman filter and smoother equations starting at t=1. Or run a Kalman filter/smoother algorithm with different data and the same parameters and you will see that the variances do not change with different data.}. Thus 
\begin{equation}
\E_{Y^{(1)}}[ \cov[\VV_t, \WW_t|\YY^{(1)}] ]= \cov[\VV_t, \WW_t|\yy^{(1)}] = \hatStT - \hatSttmT\BB_t^\top - \ZZ_t \hatVtT + \ZZ_t\hatVttmT\BB_t^\top .
\end{equation}
$\cov[\hatVt,\hatWt]$ is the negative of this (Equation \ref{eq:conditionalcovvtwt}), thus
\begin{equation}
\cov[\hatVt,\hatWt] = - \hatStT + \hatSttmT\BB_t^\top + \ZZ_t \hatVtT - \ZZ_t\hatVttmT\BB_t^\top .
\end{equation}

The Harvey et al. algorithm (next section) gives the joint distribution of the model residuals at time $t$ and state residuals at time $t+1$.  Using the law of total covariance as above, the covariance in this case is
\begin{equation}
\cov_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}],\E[\WW_{t+1}|\YY^{(1)}]] = - \E_{Y^{(1)}}[ \cov[\VV_t, \WW_{t+1}|\YY^{(1)}] ] 
\end{equation}
and
\begin{equation}
\begin{split}
\cov[\VV_t, \WW_{t+1}|\yy^{(1)}] & =\cov[\YY_t-\ZZ_t\XX_t-\aa_t,\, \XX_{t+1}-\BB_{t+1}\XX_t-\uu_{t+1}|\yy^{(1)}] \\
& =\cov[\YY_t-\ZZ_t\XX_t,\, \XX_{t+1}-\BB_{t+1}\XX_t|\yy^{(1)}] \\
& = \hatSttpT - \hatStT\BB_{t+1}^\top - \ZZ_t\hatVttpT + \ZZ_t \hatVtT \BB_{t+1}^\top .
\end{split}
\end{equation}
Thus,
\begin{equation}
\begin{split}
\cov_{Y^{(1)}}[\E[\VV_t|\YY^{(1)}],\E[\WW_{t+1}|\YY^{(1)}]] & = - \E_{Y^{(1)}}[ \cov[\VV_t, \WW_{t+1}|\YY^{(1)}] ] \\ 
& = - \hatSttpT + \hatStT\BB_{t+1}^\top + \ZZ_t\hatVttpT - \ZZ_t\hatVtT\BB_{t+1}^\top.
\end{split}
\end{equation}


\subsection{Joint distribution of the conditional residuals}

We now can write the variance of the joint distribution of the conditional residuals. Define
\begin{equation}
\widehat{\varepsilon}_t = \begin{bmatrix}\hatvt\\ \hatwt\end{bmatrix} =
\begin{bmatrix}\yy_t - \ZZ_t\hatxtT - \aa_t\\ \hatxtT - \BB_t\hatxtmT - \uu_t \end{bmatrix}.
\end{equation}
$\widehat{\varepsilon}_t$ is a sample drawn from the distribution of the random variable $\widehat{\mathcal{E}}_t$.  The expected value of $\widehat{\mathcal{E}}_t$ over all possible $\yy$ is 0 and the variance of $\widehat{\mathcal{E}}_t$  is
\begin{equation}
\widehat{\Sigma}_t = \var[\widehat{\mathcal{E}}_t] = \begin{bmatrix}[c|c]
 \var[\hatVt]&
 \cov[\hatVt, \hatWt] \\
 \rule[.5ex]{20ex}{0.25pt} & \rule[.5ex]{20ex}{0.25pt} \\
 (\cov[\hatVt, \hatWt])^\top& 
 \var[\hatWt] \end{bmatrix}
\end{equation}
which is
\begin{equation}\label{eq:jointcondresid1general}
 \begin{bmatrix}[c|c]
 \RR_t - \ZZ_t\hatVtT\ZZ_t^\top + \hatStT\ZZ_t^\top+\ZZ_t(\hatStT)^\top&
 - \hatStT + \hatSttmT\BB_t^\top + \ZZ_t\hatVtT - \ZZ_t\hatVttmT\BB_t^\top\\
 \rule[.5ex]{40ex}{0.25pt} & \rule[.5ex]{50ex}{0.25pt} \\
 (- \hatStT + \hatSttmT\BB_t^\top + \ZZ_t\hatVtT  - \ZZ_t\hatVttmT\BB_t^\top)^\top& 
 \QQ_t - \hatVtT - \BB_t\hatVtmT\BB_t^\top + \hatVttmT\BB_t^\top + \BB_t\hatVtmtT \end{bmatrix},
\end{equation}
where $\hatStT=\cov[\YY_t,\XX_t|\yy^{(1)}]$, $\hatSttmT=\cov[\YY_t,\XX_{t-1}|\yy^{(1)}]$, $\hatVtT=\var[\XX_t|\yy^{(1)}]$, $\hatVttmT=\cov[\XX_t,\XX_{t-1}|\yy^{(1)}]$, and $\hatVtmtT=\cov[\XX_{t-1},\XX_t|\yy^{(1)}]$. This gives the variance of both `observed' model residuals (the ones associated with $\yy^{(1)}$) and the unobserved model residuals (the ones associated with $\yy^{(2)}$).  
When there are no missing values in $\yy_t$, the $\hatStT$ and $\hatSttmT$ terms equal 0 and drop out.

If the residuals are defined as in \citet{Harveyetal1998} with $\hatvt$ on top and $\hatwtp$ on the bottom instead of $\hatwt$, then
\begin{equation}
\widehat{\varepsilon}_t = \begin{bmatrix}\hatvt\\ \hatwtp \end{bmatrix} =
\begin{bmatrix}\yy_t - \ZZ_t\hatxtT - \aa_t\\ \hatxtpT - \BB_{t+1}\hatxtT - \uu_{t+1} \end{bmatrix}
\end{equation}
and the variance of $\widehat{\mathcal{E}}_t$ is
\begin{equation}
 \begin{bmatrix}[c|c]
 \var[\hatVt]&
 \cov[\hatVt, \hatWtp] \\
 \rule[.5ex]{20ex}{0.25pt} & \rule[.5ex]{20ex}{0.25pt} \\
 (\cov[\hatVt, \hatWtp])^\top& 
 \var[\hatWtp] \end{bmatrix}
\end{equation}
which is
\begin{equation}\label{eq:jointcondresid2}
\begin{bmatrix}[c|c]
\RR_t - \ZZ_t\hatVtT\ZZ_t^\top + \hatStT\ZZ_t^\top + \ZZ_t(\hatStT)^\top&
- \hatSttpT + \hatStT\BB_{t+1}^\top + \ZZ_t\hatVttpT - \ZZ_t\hatVtT\BB_{t+1}^\top \\
\rule[.5ex]{40ex}{0.25pt} & \rule[.5ex]{50ex}{0.25pt} \\
(- \hatSttpT + \hatStT\BB_{t+1}^\top + \ZZ_t\hatVttpT - \ZZ_t\hatVtT\BB_{t+1}^\top)^\top& 
\QQ_{t+1} - \hatVtpT - \BB_{t+1}\hatVtT\BB_{t+1}^\top + \hatVtptT \BB_{t+1}^\top + \BB_{t+1}\hatVttpT \end{bmatrix} .
\end{equation}


\section{Harvey et al. 1998 algorithm for the conditional residuals}
\citet[pgs 112-113]{Harveyetal1998} give a recursive algorithm for computing the variance of the conditional residuals when the time-varying MARSS equation is written as: 
\begin{equation}\label{eq:residsMARSSHarvey}
\begin{gathered}
\xx_{t+1} = \BB_{t+1}\xx_t + \uu_{t+1} + \HH_{t+1}\epsilon_{t},\\
\yy_t = \ZZ_t\xx_t + \aa_t + \GG_t\epsilon_t,\\
\mbox{ where } \epsilon_t \sim \MVN(0,\II_{m+n \times m+n}) \\
\HH_t\HH_t^\top=\QQ_t, \GG_t\GG_t^\top=\RR_t, \text{ and } \HH_t\GG_t^\top = \cov[\WW_t, \VV_t]
\end{gathered}
\end{equation}
The $\HH_t$ and $\GG_t$ matrices specify the variance and covariance of $\WW_t$ and $\VV_t$. $\HH_t$ has $m$  rows and $m+n$ columns and $\GG_t$ has $n$ rows and $m+n$ columns. In the MARSS equation for this report (Equation \ref{eq:residsMARSS}), $\WW_t$ and $\VV_t$ are independent. To achieve this in the Harvey et al. form (Equation \ref{eq:residsMARSSHarvey}), the first $n$ columns of $\HH_t$ are all 0 and the last $m$ columns of $\GG_t$ are all zero.  

The algorithm in \citet{Harveyetal1998} gives the variance of the `normalized' residuals, the $\epsilon_t$.  I have modified their algorithm so it  returns the `non-normalized' residuals:
$$\varepsilon_t=\begin{bmatrix}\GG_t\epsilon_t\\ \HH_{t+1}\epsilon_t\end{bmatrix}=\begin{bmatrix}\vv_t\\ \ww_{t+1} \end{bmatrix}.$$

The Harvey et al. algorithm is a backwards recursion using the following output from the Kalman filter: the one-step ahead prediction covariance $\FF_t$, the Kalman gain $\KK_t$, $\hatxttm=\E[\XX_t|\yy^{(1),1:{t-1}}]$ and $\hatVttm=\var[\XX_t|\yy^{(1),1:{t-1}}]$. In the \{MARSS\} package, these are output from \texttt{MARSSkfss()} in \texttt{Sigma}, \texttt{Kt}, \texttt{xtt1} and \texttt{Vtt1}.

\subsection{Algorithm for the non-normalized residuals}

Start from $t=T$ and work backwards to $t=1$. At time $T$, $r_T=0_{1 \times m}$ and $N_T=0_{m \times m}$. $\BB_{t+1}$ and $\QQ_{t+1}$ can be set to NA or 0. They will not appear in the algorithm at time $T$ since $r_T=0$ and $N_T=0$. Note that the $\ww$ residual and its associated variance and covariance with $\vv$ at time $T$ is NA since this residual would be for $\xx_T$ to $\xx_{T+1}$.

% algorithm with \GG_t and \HH_t
% \begin{equation}\label{eq:Harveyalgo}
% \begin{gathered}
% \FF_t = \ZZ_t\hatVt\ZZ_t^\top+\RR_t\\
% G_t= \GG_t\QQ_t\GG_t^\top, \mbox{  }H_t = \HH_t\RR_t\HH_t^\top,  \mbox{ } K_t = \BB_t\KK^{*}_t\\
% L_t = \BB_t - K_t\ZZ_t, \mbox{ } J_t= H_t - K_t G_t, \mbox{ } u_t = \FF_t^{-1} - K_t^\top r_t\\
% r_{t-1} = \ZZ_t^\top u_t + \BB_t^\top r_t, \mbox{ } N_{t-1} = K_t^\top N_t K_t + L_t^\top N_t L_t
% \end{gathered}
% \end{equation}

\begin{equation}\label{eq:Harveyalgo}
\begin{gathered}
\QQ^\prime_{t+1}=\begin{bmatrix}0_{m \times n}&\QQ_{t+1}\end{bmatrix}, \mbox{    } \RR^\prime_t=\begin{bmatrix}\RR_t^* & 0_{n \times m}\end{bmatrix}\\
\FF_t = \ZZ_t^*\hatVttm{\ZZ_t^*}^\top+\RR_t^* , \,\, n \times n \\
K_t = \BB_{t+1}\KK_t = \BB_{t+1} \hatVttm{\ZZ_t^*}^\top \FF_t^{-1}  , \,\, m \times n  \\
L_t = \BB_{t+1} - K_t\ZZ_t^*  , \,\, m \times m \\
J_t= \QQ^\prime_{t+1} - K_t \RR^\prime_t  , \,\, m \times (n+m) \\
v_t = \yy_t^* - \ZZ_t\hatxttm - \aa_t , \,\, n \times 1 \\
u_t = \FF_t^{-1} v_t - K_t^\top r_t , \,\, n \times 1 \\
r_{t-1} = {\ZZ_t^*}^\top u_t + \BB_{t+1}^\top r_t , \,\, m \times 1  \\
N_{t-1} = {\ZZ_t^*}^\top \FF_t^{-1} \ZZ_t^* + L_t^\top N_t L_t   , \,\, m \times m .
\end{gathered}
\end{equation}
$\yy_t^*$ is the observed data at time $t$ with the $i$-th rows set to 0 if the $i$-th $y$ is missing. 
Bolded terms are the same as in Equation \ref{eq:residsMARSSHarvey} (and are output by \texttt{MARSSkfss()}).  Unbolded terms are terms used in \citet{Harveyetal1998}.  The * on $\ZZ_t$ and $\RR_t$, indicates that they are the missing value modified versions  discussed in \citet[section 6.4]{ShumwayStoffer2006} and \citet{Holmes2010}: to construct $\ZZ_t^*$ and $\RR_t^*$, the rows of $\ZZ_t$ corresponding to missing rows of $\yy_t$ are set to zero and the $(i,j)$ and $(j,i)$ terms of $\RR_t$ corresponding the missing rows of $\yy_t$ are set to zero.  For the latter, this means if the $i$-th row of $\yy_t$ is missing, then then the $i$-th row and column (including the value on the diagonal) in $\RR_t$ are set to 0. Notice that $\FF_t$ will have 0's on the diagonal if there are missing values. A modified inverse of $\FF_t$ is used: any 0's on the diagonal of $\FF_t$ are replaced with 1, the inverse is taken, and 1s on diagonals is replaced back with 0s.

The residuals \citep[eqn 24]{Harveyetal1998} are
\begin{equation}\label{eq:Harveyresiduals}
\widehat{\varepsilon}_t = \begin{bmatrix}\hatvt\\ \hatwtp \end{bmatrix} =({\RR^\prime_t})^\top u_t + ({\QQ^\prime_{t+1}})^\top r_t
\end{equation}
The expected value of $\widehat{\mathcal{E}}_t$ is 0 and its variance is
\begin{equation}\label{eq:Harveyvariance}
\widehat{\Sigma]}_t = \var[\widehat{\mathcal{E}}_t] ={\RR^\prime_t}^\top \FF_t^{-1} \RR^\prime_t + J_t^\top N_t J_t .
\end{equation}
These $\widehat{\varepsilon}_t$ and $\widehat{\Sigma]}_t$ are for both the non-missing and missing $\yy_t$. This is a modification to the \citet{Harveyetal1998} algorithm which does not give the variance for missing $\yy$.

\subsection{Difference in notation}

In Equation 20 in \citet{Harveyetal1998}, their $T_t$ is my $\BB_{t+1}$ and their $H_t H_t^\top$ is my $\QQ_{t+1}$.  Notice the difference in the time indexing. My time indexing on $\BB$ and $\QQ$ matches the left $\xx$ while in theirs, $T$ and $H$ indexing matches the right $\xx$. Thus in my implementation of their algorithm \citep[eqns. 21-24]{Harveyetal1998}, $\BB_{t+1}$ appears in place of $T_t$ and $\QQ_{t+1}$ appears in place of $H_t$. See comments below on normalization and the difference between $\QQ$ and $H$. 

\citet[eqns. 19, 20]{Harveyetal1998} use $G_t$ to refer to the $\chol(\RR_t)^\top$ (non-zero part of the $n \times n+m$ matrix) and $H_t$ to refer to $\chol(\QQ_t)^\top$.  I have replaced these with $\RR_t^\prime$ and $\QQ_t^\prime$ (Equation \ref{eq:Harveyalgo}) which causes my variant of their algorithm (Equation \ref{eq:Harveyalgo}) to give the `non-normalized' variance of the residuals. The residuals function in the \{MARSS\} package has an option to give either normalized or non-normalized residuals.

$\KK_t$ is the Kalman gain output by the \{MARSS\} package \verb@MARSSkf()@ function.  The Kalman gain as used in the \citet{Harveyetal1998} algorithm is $K_t=\BB_{t+1}\KK_t$. Notice that Equation 21 in \citet{Harveyetal1998} has $H_t G_t^\top$ in the equation for $K_t$. This is the covariance of the state and observation errors, which is allowed to be non-zero given the way Harvey et al. write the errors in their Equations 19 and 20. The way the \{MARSS\} package model is written, the state and observation errors are independent of each other. Thus $H_t G_t^\top = 0$ and this term drops out of the $K_t$ equation in Equation \ref{eq:Harveyalgo}.

\subsection{Computing the normalized residuals}

To compute the normalized residuals and residuals variance, a block diagonal matrix with the inverse of the $\RR$ and $\QQ$ matrices is used. The normalized residuals are:
\begin{equation}
\begin{bmatrix}\RR^{-1}_t&0\\0&\QQ^{-1}_t\end{bmatrix} \widehat{\varepsilon}_t
\end{equation}
The variance of the normalized residuals is
\begin{equation}
\begin{bmatrix}\RR^{-1}_t&0\\0&\QQ^{-1}_t\end{bmatrix} \widehat{\Sigma]}_t \begin{bmatrix}\RR^{-1}_t&0\\0&\QQ^{-1}_t\end{bmatrix}
\end{equation}

\subsection{Computing the Cholesky standardized residuals}

The Cholesky standardized residuals are computed by multiplying $\widehat{\varepsilon}_t$ by $(\widehat{\Sigma}_t)^{-1/2}$ (the inverse of the Cholesky decomposition of  the variance-covariance matrix for $\widehat{\varepsilon}_t$):
\begin{equation}\label{eq:std.resid}
\widehat{\varepsilon}_{t, std} = (\widehat{\Sigma}_t)^{-1/2}\widehat{\varepsilon}_t .
\end{equation}
These residuals are uncorrelated (across the residuals at time $t$ in $\widehat{\varepsilon}_t$).  See \citet{HarveyKoopman1992} and \citet[section V]{Harveyetal1998} for a discussion on how to use these residuals for outlier detection and structural break detection.

It is also common to use the marginal standardized residuals, which would be $\widehat{\varepsilon}_t$ multiplied by the inverse of the square-root of $\dg(\widehat{\Sigma}_t)$, where $\dg(\AA)$ is a diagonal matrix formed from the diagonal of the square matrix $\AA$.
\begin{equation}
\widehat{\varepsilon}_{t, mar} = \dg(\widehat{\Sigma}_t)^{-1/2}\widehat{\varepsilon}_t 
\end{equation}
Marginal standardized residuals may be correlated at time $t$ (unlike the Cholesky standardized residuals) but are a bit easier to interpret when there is correlation across the model residuals. 

\section{Distribution of the MARSS innovation residuals}

One-step-ahead predictions (innovations) are often shown for MARSS models and these are used for likelihood calculations. Innovations are the difference between the data at time $t$ minus the prediction of $\yy_t$ given data up to $t-1$. This section gives the residual variance for the innovations and the analogous values for the states.   Innovations residuals are the more common residuals discussed for state-space models; these are also known as the `one-step-ahead` prediction residuals. 

\subsection{One-step-ahead model residuals}

Define the innovations $\checkvt$ as:
\begin{equation}\label{eq:vtt1}
\checkvt = E[\YY_t|\yy^{(1)}_t] - \ZZ_t\hatxttm - \aa_t,
\end{equation}
where $\hatxttm$ is $\E[\XX_t|\yy^{(1),t-1}]$ (expected value of $\XX_t$ conditioned on the data up to time $t-1$). The random variable, innovations over all possible $\yy_t$, is $\checkVt$. Its mean is 0 and we want to find its variance. 

This is conceptually different than the observed `innovations'. First, this is the random variable `innovation'.  $\yy^{(1)}_t$ here is not the actual data that you observe (the one data set that you have). It's the data that you could observe. $\yy_t$ is a sample from the random variable $\YY_t$ and $\checkvt$ is a sample from the innovations you could observe. Second, $\checkvt$ includes both $\yy_t^{(1)}$ and $\yy_t^{(2)}$ (observed and unobserved $\yy$). Normally, the innovations for missing data would appear as 0s, e.g., from a call to \verb@MARSSfss()@. For missing data, $\checkvt$ is not necessarily 0. For example if $\yy$ is multivariate and correlated with each other through $\RR$ or a shared $\xx$ dependency.

The derivation of the variance of $\checkVt$ follows the exact same steps as the smoothations $\hatVt$, except that we condition on the data up to $t-1$ not up to $T$. Thus using Equation \ref{eq:first.and.secons.vvtgeneral}, we can write the variance directly as:
\begin{equation}\label{eq:innov.model}
\var[\checkVt] = \RR_t - \ZZ_t \hatVttm \ZZ_t^\top + \hatSttm\ZZ_t^\top + \ZZ_t(\hatSttm)^\top
\end{equation}
where the $\hatVttm$ and $\hatSttm$ are now conditioned on only the data from 1 to $t-1$.
$\hatSttm = \cov[\YY_t,\XX_t|\yy^{(1), t-1}] = \cov[\ZZ_t \XX_t + \aa_t+\VV_t,\XX_t|\yy^{(1), t-1}]$. 
$\yy_t$ is not in the conditional since it only includes data up to $t-1$. Without $\yy_t$ in the conditional, $\VV_t$ and $\WW_t$ and by extension $\VV_t$ and $\XX_t$ are independent\footnote{This is only true given the way the MARSS equation is written in this report where $V_t$ and $W_t$ are independent. This is not the case for the more general Harvey et al. MARSS model which allows covariance between $V_t$ and $W_t$.} and $\cov[\ZZ_t \XX_t + \aa_t + \VV_t,\XX_t|\yy^{(1), t-1}] = \cov[\ZZ_t \XX_t,\XX_t|\yy^{(1), t-1}] = \ZZ_t \hatVttm$. Therefore, $\ZZ_t(\hatSttm)^\top =  \ZZ_t \hatVttm \ZZ_t^\top = \hatSttm(\ZZ_t)^\top$. Thus Equation \ref{eq:innov.model} reduces to
\begin{equation}\label{eq:innov.model2}
\var[\checkVt] = \RR_t + \ZZ_t \hatVttm \ZZ_t^\top.
\end{equation}

Note $\var[\checkVt]$ is a standard output from Kalman filter functions and is used to compute the likelihood of the data (conditioned on a set of parameters). In the Kalman filter in the \{MARSS\} package, it is output as \texttt{Sigma} from \texttt{MARSSkfss()}.


\subsection{One-step ahead state residuals}

Define the state residuals conditioned on the data from 1 to $t-1$ as $\checkwt$.
\begin{equation}\label{eq:wtt1}
\checkwt = \hatxtt - \BB_t\hatxtmt1 - \uu_t,
\end{equation}
where $\hatxtmt1$ is $\E[\XX_{t-1}|\yy^{(1),t-1}]$ (expected value of $\XX_{t-1}$ conditioned on the data up to time $t-1$) and $\hatxtt$ is $\E[\XX_{t}|\yy^{(1),t}]$ (expected value of $\XX_{t}$ conditioned on the data up to time $t$). 
From the Kalman filter equations:
\begin{equation}\label{eq:wtt1.2}
\hatxtt = \BB_t\hatxtmt1 + \uu_t + \KK_t \checkvt
\end{equation}
Thus, $\checkwt$ is a transformed $\checkvt$:
\begin{equation}\label{eq:wtt1.3}
\checkwt = \KK_t \checkvt,
\end{equation}
where $\KK_t$ is the Kalman gain. $\KK_t = \hatVttm \ZZ_t^\top[\ZZ_t \hatVttm \ZZ_t^\top + \RR_t]^{-1}$ and $\ZZ_t$ is the missing values modified $\ZZ_t$ where if the $i$-th $\yy_t$ is missing, the $i$-th row of $\ZZ_t$ is set to all 0s (Shumway and Stoffer 2006, equation 6.78).
Thus the variance of $\checkWt$ is 
\begin{equation}\label{eq:var.Wt}
\var[\checkWt] = \KK_t \var[\checkVt] \KK_t^\top
\end{equation}

\subsection{Joint distribution of the conditional one-step-ahead residuals}

\subsubsection{with the state residuals defined from $t-1$ to $t$}

Define the one-step ahead residuals as
\begin{equation}
\overline{\varepsilon}_t = \begin{bmatrix}\checkvt\\ \checkwt \end{bmatrix}
\end{equation}

The covariance of $\checkVt$ and $\checkWt$ is
\begin{equation}\label{eq:covhatVtWtp.onestep}
\cov[\checkVt,\checkWtp] = \cov[\checkVt,\KK_{t}\checkVt] = \var[\checkVt]\KK_{t}^\top
\end{equation}


The joint variance-covariance matrix is
\begin{equation}\label{eq:jointcondresid.onestep}
\overline{\Sigma}_t = \var[\overline{\varepsilon}_t] = \begin{bmatrix}[c|c]
 \var[\checkVt]&  \var[\checkVt]\KK_{t}^\top\\
 \rule[.5ex]{20ex}{0.25pt} & \rule[.5ex]{20ex}{0.25pt} \\
\KK_{t}\var[\checkVt]& \KK_{t}\var[\checkVt]\KK_{t}^\top \end{bmatrix},
\end{equation}

Since $\checkwt=\KK_t \checkvt$, the state one-step-ahead residuals are perfectly correlated with the model one-step-ahead residuals so the joint distribution is not useful (all the information is in the variance of $\checkVt$).


The Cholesky standardized residuals for $\overline{\varepsilon}_t$ are
\begin{equation}
\overline{\Sigma}_t^{-1/2} \overline{\varepsilon}_t
\end{equation}
However the Cholesky standardized joint residuals cannot be computed since $\overline{\Sigma}_t$ is not positive definite. Because $\checkwt$ equals $\KK_t \checkvt$, the state residuals are completely explained by the model residuals. However we can compute the Cholesky standardized model residuals using
\begin{equation}
\var[\checkVt]^{-1/2} \checkvt
\end{equation}
$\var[\checkVt]^{-1/2}$ is the inverse of the lower triangle of the Cholesky decomposition of $\var[\checkVt]$.

The marginal standardized joint residuals for $\overline{\varepsilon}_t$ (both model and state one-step ahead residuals) can be computed with the inverse of the square root of the diagonal of $\overline{\Sigma}_t$:
\begin{equation}
\dg(\overline{\Sigma}_t)^{-1/2} \overline{\varepsilon}_t
\end{equation}
where $dg(\AA)$ is a diagonal matrix formed from the diagonal of the square matrix $\AA$.

\subsubsection{with the state residuals defined from $t$ to $t+1$}

\emph{Note that the model residual is conditioned on data 1 to $t-1$ and the state residual on data 1 to $t$ in this case.}

Define the one-step ahead residuals as
\begin{equation}
\overline{\varepsilon}_t^* = \begin{bmatrix}\checkvt\\ \checkwtp \end{bmatrix}
\end{equation}

The covariance of $\checkVt$ and $\checkWtp$ is
\begin{equation}\label{eq:covhatVtWtp.onestep.2}
\cov[\checkVt,\checkWtp] = \cov[\checkVt,\KK_{t+1}\checkVtp] = \cov[\checkVt, \checkVtp]\KK_{t+1}^\top
\end{equation}
Innovations residuals are temporally uncorrelated, thus $\cov[\checkVt,\checkVtp]=0$ and thus $\cov[\checkVt,\checkWtp]=0$.

The joint variance-covariance matrix is
\begin{equation}\label{eq:jointcondresid.onestep2}
\overline{\Sigma}_t^* = \var[\overline{\varepsilon}_t^*] = \begin{bmatrix}[c|c]
 \var[\checkVt]&  0\\
 \rule[.5ex]{20ex}{0.25pt} & \rule[.5ex]{20ex}{0.25pt} \\
0 & \KK_{t+1}\var[\checkVtp]\KK_{t+1}^\top \end{bmatrix},
\end{equation}

The Cholesky standardized residuals for $\overline{\varepsilon}_t^*$ are
\begin{equation}
(\overline{\Sigma}_t^*)^{-1/2} \overline{\varepsilon}_t^*
\end{equation}
 $\overline{\Sigma}_t^*$ is positive definite so its Cholesky decomposition can be computed.

The marginal standardized joint residuals for $\overline{\varepsilon}_t^*$ are:
\begin{equation}
\dg(\overline{\Sigma}_t^*)^{-1/2} \overline{\varepsilon}_t^*
\end{equation}
where $dg(\AA)$ is a diagonal matrix formed from the diagonal of the square matrix $\AA$.

\section{Distribution of the MARSS contemporaneous model residuals}

Contemporaneous model residuals are the difference between the data at time $t$ minus the prediction of $\yy_t$ given data up to $t$. This section gives the residual variance for these residuals. There are no state residuals for this case as that would require the expected value of $\XX_t$ conditioned on the data up to $t+1$. 

Define the contemporaneous model residuals $\dotvt$ as:
\begin{equation}\label{eq:vtt}
\dotvt = E[\YY_t|\yy^{(1)}_t] - \ZZ_t\hatxtt - \aa_t,
\end{equation}
where $\hatxtt$ is $\E[\XX_t|\yy^{(1),t}]$ (expected value of $\XX_t$ conditioned on the data up to time $t$). The random variable, contemporaneous model residuals over all possible $\yy_t$, is $\dotVt$. Its mean is 0 and we want to find its variance. 

The derivation of the variance of $\dotVt$ follows the exact same steps as the smoothations $\hatVt$, except that we condition on the data up to $t$ not up to $T$. Thus using Equation \ref{eq:first.and.secons.vvtgeneral}, we can write the variance directly as:
\begin{equation}\label{eq:contemp.model}
\var[\dotVt] = \RR_t - \ZZ_t \hatVtt \ZZ_t^\top + \hatStt\ZZ_t^\top + \ZZ_t(\hatStt)^\top
\end{equation}
where the $\hatVtt$ and $\hatStt$ are now conditioned on only the data from 1 to $t$.
$\hatStt = \cov[\YY_t,\XX_t|\yy^{(1), t}] = \cov[\ZZ_t \XX_t + \aa_t+\VV_t,\XX_t|\yy^{(1), t}]$. 
If $\yy_t$ has no missing values, this reduces to $\RR_t - \ZZ_t \hatVtt \ZZ_t^\top$ while if $\yy_t$ is all missing values, this reduces to $\RR_t + \ZZ_t \hatVtt \ZZ_t^\top$. See discussion of this after Equation \ref{eq:first.and.secons.vvtgeneral}.

Equation \ref{eq:contemp.model} gives the equation for the case where $\yy_t$ is partially observed. $\hatStt$ is output by the \verb@MARSShatyt()@ function in the \{MARSS\} package and $\hatVtt$ is output by the \verb@MARSSkfss()@ function.

\bibliography{./EMDerivation}
\bibliographystyle{apalike}

\end{document}