medicalrisk: Advanced comorbidity analysis

Patrick McCormick patrick.mccormick@alum.mit.edu

2020-02-28

Introduction

To demonstrate how the medicalrisk package can be useful, this vignette shows the kinds of descriptive statistics and inferences that can be made from a simple administrative dataset.

This vignette assumes you have read the introductory vignette for medicalrisk.

Calculating Mortality Risk

First, use the medicalrisk package to create a single dataframe with information on each patient:

library(medicalrisk)
library(plyr)
data(vt_inp_sample)
cm_df <- generate_comorbidity_df(vt_inp_sample, icd9mapfn=icd9cm_charlson_quan)
cci_df <- generate_charlson_index_df(cm_df)
rsi_df <- ddply(vt_inp_sample, .(id), function(x) { icd9cm_sessler_rsi(x$icd9cm) } )
num_icd9_df <- count(vt_inp_sample, c('id'))
num_icd9_df <- rename(num_icd9_df, c("freq" = "num_icd9"))
wide_df <- merge(merge(merge(merge(
  rsi_df, cci_df), 
    cm_df), 
      unique(vt_inp_sample[,c('id','scu_days','drg','mdc')])),
        num_icd9_df)
id rsi_1yrpod rsi_30dlos rsi_30dpod rsi_inhosp index mi chf perivasc cvd dementia chrnlung rheum
1 -2.019 0.156 -1.699 -1.848 2 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
2 -4.142 0.893 -3.802 -3.543 0 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
3 -2.627 0.831 -2.911 -2.861 0 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 -0.798 0.336 -1.551 -0.267 4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 2.580 -1.790 2.455 1.762 2 FALSE FALSE FALSE FALSE FALSE TRUE FALSE
id ulcer liver dm dmcx para renal tumor modliver mets aids scu_days drg mdc num_icd9
1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 0 470 8 12
2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 0 470 8 15
3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 0 462 8 7
4 FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 0 470 8 10
5 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 0 689 11 24

Let’s explore the data here with some graphs. First, a histogram:

library(reshape2)
library(ggplot2)
# generate a 100 pt x 17 comorbidity table (1700 rows)
cm_melted <- melt(cm_df, id.vars=c('id'), variable.name='cm')
# get rid of all the false entries
cm_melted <- cm_melted[cm_melted$value,]
## count only flags that are true
ggplot(cm_melted, aes(cm, fill=cm)) + 
  geom_bar() + 
  scale_fill_discrete()

The chrnlung comorbidity seems well represented. Let’s create a histogram breaking down which ICD-9-CM codes are mapping to chrnlung in this dataset:

# make a histogram dataframe for all the icd-9 codes
icd9cm_df <- count(vt_inp_sample, vars='icd9cm')
# create a charlson comorbidity map for all icd-9 codes
icd9cm_charlson_df <- icd9cm_charlson_quan(icd9cm_df$icd9cm)
# isolate just the chrnlung icd_9_cm codes
icd9cm_chrnlung <- row.names(icd9cm_charlson_df[icd9cm_charlson_df$chrnlung,])
# create a hist df
icd9cm_chrnlung_hist <- icd9cm_df[icd9cm_df$icd9cm %in% icd9cm_chrnlung,]
# plot it
ggplot(icd9cm_chrnlung_hist, aes(icd9cm, freq)) + 
  geom_bar(stat="identity")

Let’s see how often ICD-9-CM codes used for chrnlung coincide within patients:

# create base dataset
pairs <- unique(
  vt_inp_sample[vt_inp_sample$icd9cm %in% icd9cm_chrnlung, c('id','icd9cm')])

# create coincidence matrix
t <- table(
    ddply(pairs, c('id'), function(x) { if (length(x$icd9cm) > 1) {
      data.frame(t(combn(as.character(x$icd9cm),2))) } })[c('X1','X2')])
D4168 D49390
D49122 1 0
D4168 0 1

How often do comorbidities coincide?

# create coincidence matrix
t <- table(
    ddply(cm_melted, c('id'), function(x) { if (length(x$cm) > 1) {
      data.frame(t(combn(as.character(x$cm),2))) } })[c('X1','X2')])
# sort it
t <- t[order(rownames(t)),order(colnames(t))]
chf chrnlung cvd dementia dm liver mets para perivasc renal rheum tumor ulcer
chf 0 8 0 0 3 0 1 0 4 3 3 1 1
chrnlung 0 0 0 0 7 1 0 1 0 5 3 1 0
cvd 0 1 0 2 1 0 0 1 0 1 1 0 0
dementia 0 2 0 0 0 0 0 0 0 1 1 0 0
dm 0 0 0 0 0 0 0 1 0 5 0 2 0
liver 0 0 0 0 1 0 0 1 0 0 0 0 0
mi 4 6 1 1 4 0 0 0 0 1 3 1 1
perivasc 0 3 1 1 2 0 0 0 0 3 1 1 0
rheum 0 0 0 0 1 0 1 0 0 0 0 0 0
ulcer 0 0 0 0 1 0 0 0 0 1 0 0 0

Plot the above table:

m <- melt(t)
ggplot(m[m$value>0,], aes(X1,X2)) + stat_sum(aes(group=value))

This is a scatterplot of the Charlson Comorbidity Index versus each RSI mortality estimate. A linear regression line is superimposed:

library(grid)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.2
p.inhosp <- ggplot(wide_df, aes(rsi_inhosp, index)) + geom_point() + geom_smooth(method=lm) +
  scale_y_continuous(limits=c(-3,10))
p.30dpod <- ggplot(wide_df, aes(rsi_30dpod, index)) + geom_point() + geom_smooth(method=lm) +
  scale_y_continuous(limits=c(-3,10))
p.1yrpod <- ggplot(wide_df, aes(rsi_1yrpod, index)) + geom_point() + geom_smooth(method=lm) +
  scale_y_continuous(limits=c(-3,10))
grid.arrange(p.inhosp, p.30dpod, p.1yrpod, nrow=1)

As expected, the Risk Stratification Index is correlated with an increased Charlson Comorbidity Index.