---
title: "Ensemble Quadratic MCMC algorithm detailed example for fitting a Weibull survival function"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Ensemble Quadratic MCMC algorithm detailed example for fitting a Weibull survival function}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = (4/6)*7
)
```

```{r setup}
library(QAEnsemble)
```
Load the 'svMisc' package to use the progress bar in the Ensemble MCMC algorithm and load the 'coda' package to return a 'mcmc.list' object from the Ensemble MCMC algorithm that can be used with various MCMC diagnostic functions in the 'coda' package.

```{r eval=TRUE}
library(svMisc)
library(coda)
```
Set the seed for this example.

```{r eval=TRUE}
set.seed(10)
```
Assume the true parameters in the Weibull survival function 
$S(t) = e^{- \big( \frac{t}{\sigma} \big)^a}$ are shape $a = 8$ and scale 
$\sigma = 1100$.This Weibull lifetime survival function is in terms of months 
and this Weibull lifetime survival function is extended for 1320 months (110 
years).

```{r eval=TRUE}
a_shape = 8
sigma_scale = 1100

time_use = 12*c(1:110)

plot(time_use, exp(-1*((time_use/sigma_scale)^a_shape)), xlab="Months (t)", ylab="Survival probability (S)", type = "l", col = "red")
```
Then we take 50 random samples from this Weibull distribution, 
$T \sim \textrm{WEI}(a = 8, \sigma = 1100)$, and these random samples provide 
mortality times. From these mortality time random samples, $t_i$ for 
$i = 1, ..., 50$, a life table is created for every 12 month interval (every 
year interval) up to 1320 months (110 years) and the estimated survival 
probability is calculated for every 12 month interval. It is assumed that only 
the life table information is known for every 12 month interval, and the actual 
mortality times from the Weibull distribution random samples, $t_i$, are 
unknown. So, the life table information provides interval-censored mortality 
data.

```{r eval=TRUE}

#Input for the 'create_life_table_info' function:

#interval_time_input: the start and end times for each time interval (Note: it 
#is assumed in the 'create_life_table_info' function that the first start time 
#is at month 0)

#num_ran_samples_input: the number of random samples from the Weibull 
#distribution

#data_weibull_input: the Weibull distribution mortality time samples

#Output for the 'create_life_table_info' is a list containing two data frames. 
#The first data frame contains the life table information: time intervals, the 
#number of people at risk in the interval, number of deaths that occur in the 
#interval, risk of death in the interval, probability of surviving in the 
#interval, and the survival probabilities. The second data frame contains the 
#interval-censored information in a format easy to use in the likelihood 
#function: interval-censored mortality time observation number, interval left 
#bound, and interval right bound.

create_life_table_info <- function(interval_time_input, num_ran_samples_input, data_weibull_input)
{
  num_at_risk_during_interval_output = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  num_deaths_during_interval_output = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  risk_death_during_interval_output = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  prob_surv_during_interval_output = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  s_values_output = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  
  obs_num_output = matrix(1:num_ran_samples_input, nrow = 1, ncol = num_ran_samples_input)
  left_endpoint_output = matrix(NA, nrow = 1, ncol = num_ran_samples_input)
  right_endpoint_output = matrix(NA, nrow = 1, ncol = num_ran_samples_input)
  
  for(t in 1:length(interval_time_input))
  {
  if(t == 1)
  {
    num_at_risk_during_interval_output[t] = num_ran_samples_input
    
    num_deaths_during_interval_output[t] = length(data_weibull_input[0 <= data_weibull_input & data_weibull_input <= interval_time_input[1]])
    
    risk_death_during_interval_output[t] = num_deaths_during_interval_output[t]/num_at_risk_during_interval_output[t]
    
    prob_surv_during_interval_output[t] = 1 - risk_death_during_interval_output[t]
    
    s_values_output[t] = 1*prob_surv_during_interval_output[t]
  }
  
  if(t != 1)
  {
    if(num_at_risk_during_interval_output[t-1] - num_deaths_during_interval_output[t-1] > 0)
    {
    num_at_risk_during_interval_output[t] = num_at_risk_during_interval_output[t-1] - num_deaths_during_interval_output[t-1]
    
    num_deaths_during_interval_output[t] = length(data_weibull_input[interval_time_input[t-1] < data_weibull_input & data_weibull_input <= interval_time_input[t]])
    
    risk_death_during_interval_output[t] = num_deaths_during_interval_output[t]/num_at_risk_during_interval_output[t]
    
    prob_surv_during_interval_output[t] = 1 - risk_death_during_interval_output[t]
    
    s_values_output[t] = s_values_output[t-1]*prob_surv_during_interval_output[t]
    }

    if(num_at_risk_during_interval_output[t-1] - num_deaths_during_interval_output[t-1] <= 0)
    {
    num_at_risk_during_interval_output[t] = 0
    
    num_deaths_during_interval_output[t] = 0
    
    risk_death_during_interval_output[t] = 1
    
    prob_surv_during_interval_output[t] = 1 - risk_death_during_interval_output[t]
    
    s_values_output[t] = s_values_output[t-1]*prob_surv_during_interval_output[t]
    }
  }
  
  }
  
  string_time_interval_names = matrix(NA, nrow = 1, ncol = length(interval_time_input))
  
  i = 1
  
  for(t in 1:length(interval_time_input))
  {
    if(t == 1)
    {
      string_time_interval_names[1] = paste(0, "-", 12, " mo", " (", 0, "-", 1, " yr)", sep = "")
      
      if(num_deaths_during_interval_output[t] > 0)
      {
        for(j in 1:num_deaths_during_interval_output[t])
        {
          left_endpoint_output[i] = 0
          right_endpoint_output[i] = 12
        
          i = i + 1
        }
      }
    }
    
    if(t != 1)
    {
      string_time_interval_names[t] = paste(interval_time_input[t-1], "-", interval_time_input[t], " mo", " (", interval_time_input[t-1]/12, "-", interval_time_input[t]/12, " yr)", sep = "")
      
      if(num_deaths_during_interval_output[t] > 0)
      {
        for(j in 1:num_deaths_during_interval_output[t])
        {
          left_endpoint_output[i] = interval_time_input[t-1]
          right_endpoint_output[i] = interval_time_input[t]
        
          i = i + 1
        }
      }
    }
  }
  
  life_table_df = data.frame(t(string_time_interval_names), t(num_at_risk_during_interval_output), t(num_deaths_during_interval_output), t(risk_death_during_interval_output), t(prob_surv_during_interval_output), t(s_values_output))
  
  names(life_table_df)[1] = "Time interval"
  names(life_table_df)[2] = "Number of people at risk in the interval"
  names(life_table_df)[3] = "Number of deaths that occur in the interval"
  names(life_table_df)[4] = "Risk of death in the interval"
  names(life_table_df)[5] = "Probability of surviving in the interval"
  names(life_table_df)[6] = "Survival probability"
  
  interval_censored_df = data.frame(t(obs_num_output), t(left_endpoint_output), t(right_endpoint_output))
  
  names(interval_censored_df)[1] = "Interval-censored mortality time observation number"
  names(interval_censored_df)[2] = "Interval left bound"
  names(interval_censored_df)[3] = "Interval right bound"

  return(list("life_table_df" = life_table_df, "interval_censored_df" = interval_censored_df))
}

```

```{r eval=TRUE}

num_ran_samples = 50

data_weibull = rweibull(num_ran_samples, shape = a_shape, scale = sigma_scale)

info_use = create_life_table_info(time_use, num_ran_samples, data_weibull)

knitr::kable(info_use$life_table_df, caption = "Life table data", format = "pipe", padding = 0)
```
```{r eval=TRUE}
knitr::kable(info_use$interval_censored_df, caption = "Interval-censored data", format = "pipe", padding = 0)
```
```{r eval=TRUE}
#Plot the survival probability from the life table data
plot(time_use, info_use$life_table_df[[6]], xlab="Months (t)", ylab="Survival probability (S)", type = "s")

```
Now, we want to fit a Weibull distribution with shape parameter $a$ and scale 
parameter $\sigma$ to this interval-censored data using 
the Ensemble Quadratic MCMC algorithm and the prior knowledge that the shape 
parameter $a \sim \textrm{U}(0.01, 100)$ and scale parameter 
$\sigma \sim \textrm{U}(1, 10000)$.

The likelihood function is the following:

$L(\theta) = \prod_{i = 1}^{50} P(L_i \leq T_i \leq R_i) = \prod_{i = 1}^{50} S(L_i) - S(R_i) = \prod_{i = 1}^{50} e^{- \big( \frac{L_i}{\sigma} \big)^a} - e^{- \big( \frac{R_i}{\sigma} \big)^a}$,

where $\theta = \begin{bmatrix} a \\ \sigma \end{bmatrix}$ and $[L_i, R_i]$ is the time interval in the life table where the $T_i$ death occurred.

For more information about formulating the likelihood for interval censored data, please see the following journal articles:

Guure CB, Ibrahim NA, Adam MB (2013) Bayesian inference of the Weibull model based on interval-censored survival data. Comput Math Methods Med. 2013:849520. https://doi.org/10.1155/2013/849520

Pan C, Cai B, Wang L (2020) A Bayesian approach for analyzing partly interval-censored data under the proportional hazards model. Stat Methods Med Res. 29(11):3192-3204. https://doi.org/10.1177/0962280220921552

The log prior function and the log likelihood function for $a$ and $\sigma$ are coded below.

```{r eval=TRUE}

logp <- function(param)
{
  a_shape_use = param[1]
  sigma_scale_use = param[2]

  logp_val = dunif(a_shape_use, min = 1e-2, max = 1e2, log = TRUE) + dunif(sigma_scale_use, min = 1, max = 1e4, log = TRUE)

  return(logp_val)
}

logl <- function(param)
{
  a_shape_use = param[1]
  sigma_scale_use = param[2]
  
  logl_val = 0
  
  interval_l_bound = info_use$interval_censored_df[[2]]
  interval_r_bound = info_use$interval_censored_df[[3]]
  
  for(i in 1:num_ran_samples)
  {

  logl_val = logl_val + log(exp(-1*((interval_l_bound[i]/sigma_scale_use)^a_shape_use)) - exp(-1*((interval_r_bound[i]/sigma_scale_use)^a_shape_use)))
    
  }
  
  if(is.na(logl_val) || is.infinite(logl_val))
  {
    logl_val = -Inf
  }

  return(logl_val)
}

logfuns = list(logp = logp, logl = logl)
```
It is recommended to use at least twice as many chains as the number of 
parameters to be estimated. Since there are two parameters to be estimated in 
this example, four MCMC chains will be used in the Ensemble Quadratic MCMC 
algorithm.

Next, initial guesses are proposed for each of the four MCMC chains and the 
following while loops ensure that the initial guesses for each of the four MCMC 
chains produces a log unnormalized posterior density value that is not NA or 
infinite.

```{r eval=TRUE}
num_par = 2
num_chains = 2*num_par

theta0 = matrix(0, nrow = num_par, ncol = num_chains)

temp_val = 0
j = 0

while(j < num_chains)
{
  initial = c(runif(1, 1e-2, 1e2), runif(1, 1, 1e4))
  temp_val = logl(initial) + logp(initial)

  while(is.na(temp_val) || is.infinite(temp_val))
  {
    initial = c(runif(1, 1e-2, 1e2), runif(1, 1, 1e4))
    temp_val = logl(initial) + logp(initial)
  }

  j = j + 1

  message(paste('j:', j))
  
  theta0[1,j] = initial[1]
  theta0[2,j] = initial[2]

}
```
The Ensemble Quadratic MCMC algorithm will be used with 1e5 iterations for each 
chain in the Ensemble MCMC algorithm and each 10th iteration is returned.

The total number of samples returned by the algorithm is given by
floor(num_chain_iterations/thin_val_par)*num_chains

```{r eval=TRUE}
num_chain_iterations = 1e5
thin_val_par = 10

floor(num_chain_iterations/thin_val_par)*num_chains
```
Next, the Ensemble Quadratic MCMC algorithm is run.

```{r eval=TRUE}

Weibull_Surv_Quad_result = ensemblealg(theta0, logfuns, T_iter = num_chain_iterations, Thin_val = thin_val_par, ShowProgress = TRUE, ReturnCODA = TRUE)

```
After the Ensemble Quadratic MCMC algorithm is run, the matrix of parameter 
samples, matrix of log likelihood samples, matrix of log prior samples, and 
'coda' package 'mcmc.list' object are returned from the algorithm.

```{r eval=TRUE}
my_samples = Weibull_Surv_Quad_result$theta_sample
my_log_prior = Weibull_Surv_Quad_result$log_prior_sample
my_log_like = Weibull_Surv_Quad_result$log_like_sample
my_mcmc_list_coda = Weibull_Surv_Quad_result$mcmc_list_coda
```
Use the 'plot' function on the 'mcmc.list' object to view the convergence of the 
chains over the iterations.

```{r eval=TRUE}
varnames(my_mcmc_list_coda) = c("a_shape", "sigma_scale")

plot(my_mcmc_list_coda)
```
Now, we remove (also called burn-in) 25% of each MCMC chain to ensure that we 
use the samples that converged to the estimated unnormalized posterior 
distribution.

```{r eval=TRUE}
my_samples_burn_in = my_samples[,,-c(1:floor((num_chain_iterations/thin_val_par)*0.25))]

my_log_prior_burn_in = my_log_prior[,-c(1:floor((num_chain_iterations/thin_val_par)*0.25))]

my_log_like_burn_in = my_log_like[,-c(1:floor((num_chain_iterations/thin_val_par)*0.25))]
```
Then we calculate the potential scale reduction factors, which provide a formal 
test of the convergence of the MCMC sampling to the estimated posterior distribution. The potential scale reduction factor are close to 1 for 
all the estimated parameters, this indicates that the MCMC sampling converged to 
the estimated posterior distribution for each parameter.

```{r eval=TRUE}
diagnostic_result = psrfdiagnostic(my_samples_burn_in, 0.05)

diagnostic_result$p_s_r_f_vec
```
We collapse the sample matrices into sample vectors to pool all of the MCMC 
chain samples together. 

```{r eval=TRUE}
log_un_post_vec = as.vector(my_log_prior_burn_in + my_log_like_burn_in)
k1 = as.vector(my_samples_burn_in[1,,])
k2 = as.vector(my_samples_burn_in[2,,])
```
Next, we calculate the posterior median, 95% credible intervals, and maximum 
posterior for the parameters.

```{r eval=TRUE}
#a_shape posterior median and 95% credible interval
median(k1)
hpdparameter(k1, 0.05)
#(The 95% CI is narrower than the prior distribution and it contains the true 
#value of a_shape, 8)

#sigma_scale posterior median and 95% credible interval
median(k2)
hpdparameter(k2, 0.05)
#(The 95% CI is narrower than the prior distribution and it contains the true 
#value of sigma_scale, 1100)

#a_shape, sigma_scale, and stand_dev maximum posterior
k1[which.max(log_un_post_vec)]
k2[which.max(log_un_post_vec)]
```
The following plots display the silhouette of the unnormalized posterior surface 
from the chosen parameter's perspective.

```{r eval=TRUE}
plot(k1, exp(log_un_post_vec), xlab="a_shape", ylab="unnormalized posterior density")
```

```{r eval=TRUE}
plot(k2, exp(log_un_post_vec), xlab="sigma_scale", ylab="unnormalized posterior density")
```
Now, ten thousand MCMC samples are used to generate the posterior predictive 
distribution. (Also, the Bayesian p-value is calculated in this for loop, which 
assesses the goodness of fit of the Weibull survival function with truncated 
normal error to the data).

```{r eval=TRUE}
num_samples = 10000

w_predict = matrix(NA, nrow = num_samples, ncol = num_ran_samples)
s_predict = matrix(NA, nrow = num_samples, ncol = length(time_use))

discrepancy_s = matrix(NA, nrow = num_samples, ncol = 1)
discrepancy_s_pred = matrix(NA, nrow = num_samples, ncol = 1)
ind_pred_exceeds_s = matrix(NA, nrow = num_samples, ncol = 1)

for (i in (length(log_un_post_vec) - num_samples + 1):length(log_un_post_vec))
{
  l = i - (length(log_un_post_vec) - num_samples)
  
  w_predict[l,] = rweibull(num_ran_samples, shape = k1[i], scale = k2[i])
  
  data_weibull_temp = w_predict[l,]
  
  info_use_predict = create_life_table_info(time_use, num_ran_samples, data_weibull_temp)
  
  s_predict[l,] = t(info_use_predict$life_table_df[[6]])
  
  s_data = t(info_use$life_table_df[[6]])
  
  my_s_mean = exp(-1*((time_use/k2[i])^k1[i]))
  
  discrepancy_s[l,1] = sum(((s_data - my_s_mean)^2))

    discrepancy_s_pred[l,1] = sum(((s_predict[l,] - my_s_mean)^2))

    if(discrepancy_s_pred[l,1] > discrepancy_s[l,1])
    {
      ind_pred_exceeds_s[l,1] = 1
    }
    else
    {
      ind_pred_exceeds_s[l,1] = 0
    }

  svMisc::progress(l,num_samples,progress.bar = TRUE,init = (l == 1))
}
```
The Bayesian p-value uses the sum of squares discrepancy function and the
Bayesian p-value is 0.6038, which indicates that there is no evidence against 
the null hypothesis that the model predictions fit the data.

This Bayesian p-value measures how extreme is the realized value of sum of 
squares discrepancy among all possible values that could have been realized
under this model with the same parameters that generates the current data.

Note: If this Bayesian p-value was close to 0 or 1, then there would be
evidence against the null hypothesis that the model predictions fit the data.
Extreme tail-area probabilities, less than 0.01 or more than 0.99, indicate a
major failure of the model predictions to fit the data.

```{r eval=TRUE}
Bayes_p_value = sum(ind_pred_exceeds_s)/length(ind_pred_exceeds_s)

Bayes_p_value
```
Next, the lower and upper 95% prediction interval is calculated.

```{r eval=TRUE}
s_predict_lower = matrix(NA, nrow = 1, ncol = length(time_use))
s_predict_upper = matrix(NA, nrow = 1, ncol = length(time_use))

for(j in 1:length(time_use))
{
  s_predict_lower[j] = quantile(s_predict[,j], probs = 0.025)
  s_predict_upper[j] = quantile(s_predict[,j], probs = 0.975)
}
```
Lastly, the data, posterior predictive mean, and 95% prediction interval is 
plotted, and this plot illustrates the good fit of the Weibull survival 
function with truncated normal error to the generated data.

```{r eval=TRUE}
plot(time_use, info_use$life_table_df[[6]], main="Weibull survival curve (a = 8, sigma = 1100) Example", cex.main = 1, xlab="Months (t)", ylab="Survival probability (S)", type = "s", ylim = c(0, 1))
lines(time_use, exp(-1*((time_use/sigma_scale)^a_shape)), type = "l", lty = 1, col = "red")
lines(time_use, colMeans(s_predict), type = "l", lty = 1, col = "blue")
lines(time_use, s_predict_lower, type = "l", lty = 2, col = "blue")
lines(time_use, s_predict_upper, type = "l", lty = 2, col = "blue")

legend("bottomleft", legend = c("Data", "True model", "Posterior predictive mean", "95% prediction interval"), lty=c(1,1,1,2), col = c("black", "red", "blue", "blue"), pch=c(NA,NA,NA,NA))
```