C-reactive protein

Published

December 13, 2024

Introduction

Code
set.seed(123)
if (file.exists("/.dockerenv")) { # Check if running in Docker
  # Assume igmm/cvallejo-predicct/libdr/ is passed to the data volume
  prefix <- "data/"
} else {
  # Assume running outside of a Docker container and the IGC(/IGMM) datastore is
  # mounted at /Volumes
  prefix <- "/Volumes/igmm/cvallejo-predicct/libdr/"
}


##########################
#--     Packages       --#
##########################

library(tidyverse)
# Support package (source found in libdr/)
library(libdr)
## Modelling ##
library(lcmm)
library(kml) # K-means
## Presentation ##
library(patchwork)
library(ggdist)
library(ggalluvial)
library(pander)
library(qqplotr)


##########################
#--     Data read      --#
#########################

dict <- readRDS(paste0(prefix, "processed/dict.RDS"))
crp_median <- readRDS(paste0(prefix, "processed/median-crp.RDS"))
crp <- readRDS(paste0(prefix, "processed/crp.RDS"))

dk.crp <- read.csv(paste0(prefix, "Denmark/2024-11-29/crp_8_models.csv"),
                    sep = ";")

Model selection

Code
# set the number of groups
G.crp <- numeric()
models.crp.ma <- list()
G.cands <- seq(2, 10)
for (G.cand in G.cands) {
  file.name <- paste0(prefix, "/cache/crp-ma/crp-", G.cand, ".RDS")
  if (file.exists(file.name)) {
    G.crp <- c(G.crp, G.cand)
    models.crp.ma[[G.cand]] <- readRDS(file.name)
  }
}
rm(G.cand)
Code
alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)

alluvial.df[alluvial.df[, "G"] == 3, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 3, "class"],
    from = c(seq(1, 3)),
    to = c(1, 2, 3)
  )

alluvial.df[alluvial.df[, "G"] == 4, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 4, "class"],
    from = c(seq(1, 4)),
    to = c(3, 1, 2, 4)
  )

alluvial.df[alluvial.df[, "G"] == 5, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 5, "class"],
    from = c(seq(1, 5)),
    to = c(3, 5, 2, 1, 4)
  )

alluvial.df[alluvial.df[, "G"] == 6, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 6, "class"],
    from = c(seq(1, 6)),
    to = c(2, 1, 3, 6, 4, 5)
  )

alluvial.df[alluvial.df[, "G"] == 7, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 7, "class"],
    from = c(seq(1, 7)),
    to = c(2, 1, 7, 3, 5, 4, 6)
  )

alluvial.df[alluvial.df[, "G"] == 8, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 8, "class"],
    from = c(seq(1, 8)),
    to = c(2, 6, 1, 7, 5, 4, 3, 8)
  )


alluvial.df[alluvial.df[, "G"] == 9, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 9, "class"],
    from = c(seq(1, 9)),
    to = c(8, 2, 4, 1, 5, 9, 3, 6, 7)
  )

alluvial.df[alluvial.df[, "G"] == 10, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 10, "class"],
    from = c(seq(1, 10)),
    to = c(9, 7, 1, 5, 3, 4, 2, 10, 6, 8)
  )

# eliminate label switching

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = rainbow(10)) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)


for (format in c("png", "pdf")) {
  ggsave(paste0("plots/ma-nocor/alluvial.", format),
    p,
    width = 12,
    height = 6.75,
    units = "in"
  )
}
Figure 1: Alluvial plot of cluster membership across G for CRP
Code
alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = rainbow(10)) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)
Figure 2: Alluvial plot of cluster membership across G for CRP without label switching

Posterior classifications

From the below data, we can see how posterior probabilities change as the number of assumed clusters increase

  class1 class2
N 1453 385
% 79.05 20.95
  prob1 prob2
class1 0.9027 0.0973
class2 0.1982 0.8018
  class1 class2
prob>0.7 90.78 69.61
prob>0.8 82.66 54.55
prob>0.9 68.27 37.14
  class1 class2 class3
N 893 359 586
% 48.59 19.53 31.88
  prob1 prob2 prob3
class1 0.8537 0.0778 0.0685
class2 0.1336 0.7989 0.0676
class3 0.1016 0.0572 0.8412
  class1 class2 class3
prob>0.7 79.73 68.25 77.47
prob>0.8 70.66 55.71 68.26
prob>0.9 56.66 38.72 52.56
  class1 class2 class3 class4
N 499 886 341 112
% 27.15 48.2 18.55 6.09
  prob1 prob2 prob3 prob4
class1 0.8014 0.0807 0.057 0.0609
class2 0.0562 0.8502 0.0702 0.0233
class3 0.0684 0.1304 0.7961 0.005
class4 0.1601 0.086 0.0038 0.7501
  class1 class2 class3 class4
prob>0.7 68.54 78.56 66.86 60.71
prob>0.8 58.72 70.2 58.06 45.54
prob>0.9 42.48 55.3 39.3 26.79
  class1 class2 class3 class4 class5
N 507 98 274 856 103
% 27.58 5.33 14.91 46.57 5.6
  prob1 prob2 prob3 prob4 prob5
class1 0.7923 0.0304 0.0384 0.0842 0.0547
class2 0.0892 0.7494 0.1004 0.0608 2e-04
class3 0.0417 0.0662 0.7612 0.1265 0.0044
class4 0.0531 0.0137 0.0649 0.8494 0.0189
class5 0.1526 5e-04 0.0029 0.0896 0.7545
  class1 class2 class3 class4 class5
prob>0.7 67.26 63.27 63.14 78.39 59.22
prob>0.8 56.02 44.9 46.35 69.39 49.51
prob>0.9 41.22 27.55 29.93 55.37 26.21
  class1 class2 class3 class4 class5 class6
N 280 823 477 54 104 100
% 15.23 44.78 25.95 2.94 5.66 5.44
  prob1 prob2 prob3 prob4 prob5 prob6
class1 0.7611 0.1161 0.0397 0.0106 0.005 0.0675
class2 0.0649 0.8252 0.0515 0.0277 0.0181 0.0126
class3 0.0374 0.0808 0.7844 0.0179 0.0528 0.0267
class4 0.0299 0.1145 0.0655 0.7707 0.0194 1e-04
class5 0.0046 0.0669 0.1413 0.0188 0.7682 3e-04
class6 0.0939 0.0584 0.0883 0.0019 2e-04 0.7573
  class1 class2 class3 class4 class5 class6
prob>0.7 63.93 74.85 67.51 62.96 62.5 63
prob>0.8 47.86 65.98 54.51 53.7 51.92 47
prob>0.9 31.79 47.75 40.88 38.89 28.85 31
  class1 class2 class3 class4 class5 class6 class7
N 281 783 57 457 106 97 57
% 15.29 42.6 3.1 24.86 5.77 5.28 3.1
  prob1 prob2 prob3 prob4 prob5 prob6 prob7
class1 0.7627 0.1136 9e-04 0.0368 0.0675 0.009 0.0094
class2 0.0625 0.8184 0.018 0.0441 0.0107 0.0139 0.0324
class3 0.0069 0.1064 0.7244 0.0964 0.0146 0.0462 0.0051
class4 0.0345 0.0744 0.0157 0.7807 0.0277 0.0497 0.0173
class5 0.0917 0.0647 0.002 0.0747 0.765 4e-04 0.0015
class6 0.0112 0.0595 0.023 0.1333 5e-04 0.7533 0.0193
class7 0.0203 0.1127 0.0011 0.0643 0 0.0314 0.7701
  class1 class2 class3 class4 class5 class6 class7
prob>0.7 63.7 74.71 57.89 66.74 62.26 62.89 63.16
prob>0.8 47.69 64.5 42.11 53.83 50 48.45 52.63
prob>0.9 32.74 45.34 22.81 39.82 33.02 30.93 43.86
  class1 class2 class3 class4 class5 class6 class7 class8
N 225 51 702 60 110 84 434 172
% 12.24 2.77 38.19 3.26 5.98 4.57 23.61 9.36
  prob1 prob2 prob3 prob4 prob5 prob6 prob7 prob8
class1 0.7803 0.0068 0.0921 0.0013 0.0633 0.0026 0.0534 3e-04
class2 0.013 0.7735 0.0897 0.0017 2e-04 0.0278 0.0845 0.0096
class3 0.0512 0.029 0.8324 0.0208 0.0096 0.0113 0.0451 6e-04
class4 0.0036 0.0032 0.0684 0.7425 0.0162 0.0314 0.0883 0.0463
class5 0.0735 0.002 0.0397 0.0037 0.7708 2e-04 0.0822 0.0279
class6 7e-04 0.0237 0.0384 0.0278 1e-04 0.7563 0.0997 0.0534
class7 0.0407 0.0195 0.0731 0.0222 0.0292 0.0406 0.6896 0.0851
class8 6e-04 0.0015 4e-04 0.007 0.0208 0.0231 0.1606 0.786
  class1 class2 class3 class4 class5 class6 class7 class8
prob>0.7 68.44 64.71 78.49 55 64.55 61.9 48.85 71.51
prob>0.8 51.56 52.94 67.52 46.67 53.64 48.81 31.34 52.33
prob>0.9 34.22 39.22 49.72 23.33 37.27 33.33 11.06 31.98
Table continues below
  class1 class2 class3 class4 class5 class6 class7 class8
N 172 223 86 601 113 102 432 50
% 9.36 12.13 4.68 32.7 6.15 5.55 23.5 2.72
  class9
N 59
% 3.21
Table continues below
  prob1 prob2 prob3 prob4 prob5 prob6 prob7
class1 0.7829 6e-04 0.0229 2e-04 0.0212 1e-04 0.1635
class2 3e-04 0.7827 0.0025 0.0856 0.0618 0.0073 0.0515
class3 0.0531 7e-04 0.7501 0.0368 1e-04 8e-04 0.1077
class4 3e-04 0.0532 0.0113 0.8317 0.0083 6e-04 0.0471
class5 0.027 0.0752 2e-04 0.0293 0.7653 0.011 0.0823
class6 0.0015 0.0374 0.0107 5e-04 0.0152 0.8437 0.028
class7 0.0843 0.04 0.0387 0.0726 0.0298 0.0031 0.6891
class8 0.0098 0.0128 0.0286 0.0725 2e-04 0.0106 0.0853
class9 0.0468 0.0036 0.0318 0.0647 0.0091 0.0028 0.088
  prob8 prob9
class1 0.0015 0.0071
class2 0.007 0.0013
class3 0.0232 0.0276
class4 0.0263 0.0213
class5 0.0022 0.0075
class6 0.0473 0.0156
class7 0.0196 0.0229
class8 0.7786 0.0017
class9 0.0031 0.7501
Table continues below
  class1 class2 class3 class4 class5 class6 class7
prob>0.7 71.51 69.51 60.47 78.37 63.72 83.33 48.38
prob>0.8 52.33 52.02 47.67 68.05 52.21 69.61 31.25
prob>0.9 31.4 33.63 33.72 48.75 36.28 53.92 10.65
  class8 class9
prob>0.7 66 57.63
prob>0.8 54 47.46
prob>0.9 40 27.12
Table continues below
  class1 class2 class3 class4 class5 class6 class7 class8
N 102 59 417 112 434 85 222 183
% 5.55 3.21 22.69 6.09 23.61 4.62 12.08 9.96
  class9 class10
N 52 172
% 2.83 9.36
Table continues below
  prob1 prob2 prob3 prob4 prob5 prob6 prob7
class1 0.845 0.0157 0 0.0152 0.0283 0.0103 0.0362
class2 0.0026 0.7509 0.0443 0.0091 0.0876 0.0322 0.0035
class3 0 0.0248 0.8528 0.0072 0.0415 0.0131 0.0341
class4 0.0111 0.0077 0.0108 0.7673 0.0825 2e-04 0.0746
class5 0.003 0.0231 0.0419 0.0297 0.6881 0.0396 0.0398
class6 8e-04 0.0276 0.0357 1e-04 0.1034 0.7512 6e-04
class7 0.0074 0.0014 0.0183 0.0616 0.0523 0.0025 0.7769
class8 4e-04 0.0138 0.0239 0.0127 0.0583 0.0061 0.0921
class9 0.0101 0.0017 0.0555 2e-04 0.0896 0.0276 0.012
class10 1e-04 0.0071 2e-04 0.0212 0.1603 0.0232 6e-04
  prob8 prob9 prob10
class1 1e-04 0.0477 0.0015
class2 0.0202 0.0031 0.0464
class3 3e-04 0.0259 4e-04
class4 0.0161 0.0023 0.0275
class5 0.0305 0.0196 0.0847
class6 0.0029 0.0243 0.0535
class7 0.0726 0.0069 3e-04
class8 0.7699 0.0228 1e-04
class9 0.0292 0.7646 0.0094
class10 0 0.0015 0.7857
Table continues below
  class1 class2 class3 class4 class5 class6 class7
prob>0.7 83.33 57.63 82.49 64.29 48.85 61.18 68.47
prob>0.8 70.59 45.76 72.9 52.68 30.88 48.24 50.9
prob>0.9 53.92 25.42 53.24 36.61 11.29 32.94 32.88
  class8 class9 class10
prob>0.7 68.85 63.46 71.51
prob>0.8 52.46 51.92 52.33
prob>0.9 32.24 38.46 31.98
Code
rm(output)
Code
pprobs.2 <- c()
groups.2 <- models.crp.ma[[2]]
pprobs.3 <- c()
groups.3 <- models.crp.ma[[3]]
pprobs.4 <- c()
groups.4 <- models.crp.ma[[4]]
pprobs.5 <- c()
groups.5 <- models.crp.ma[[5]]
pprobs.6 <- c()
groups.6 <- models.crp.ma[[6]]
pprobs.7 <- c()
groups.7 <- models.crp.ma[[7]]
pprobs.8 <- c()
groups.8 <- models.crp.ma[[8]]
pprobs.9 <- c()
groups.9 <- models.crp.ma[[9]]
pprobs.10 <- c()
groups.10 <- models.crp.ma[[10]]


for (i in 1:nrow(models.crp.ma[[2]]$pprob)) {
  class.2 <- groups.2$pprob[i, 2]
  pprobs.2 <- c(pprobs.2, groups.2$pprob[i, class.2 + 2])
  class.3 <- groups.3$pprob[i, 2]
  pprobs.3 <- c(pprobs.3, groups.3$pprob[i, class.3 + 2])
  class.4 <- groups.4$pprob[i, 2]
  pprobs.4 <- c(pprobs.4, groups.4$pprob[i, class.4 + 2])
  class.5 <- groups.5$pprob[i, 2]
  pprobs.5 <- c(pprobs.5, groups.5$pprob[i, class.5 + 2])
  class.6 <- groups.6$pprob[i, 2]
  pprobs.6 <- c(pprobs.6, groups.6$pprob[i, class.6 + 2])
  class.7 <- groups.7$pprob[i, 2]
  pprobs.7 <- c(pprobs.7, groups.7$pprob[i, class.7 + 2])
  class.8 <- groups.8$pprob[i, 2]
  pprobs.8 <- c(pprobs.8, groups.8$pprob[i, class.8 + 2])
  class.9 <- groups.9$pprob[i, 2]
  pprobs.9 <- c(pprobs.9, groups.9$pprob[i, class.9 + 2])
  class.10 <- groups.10$pprob[i, 2]
  pprobs.10 <- c(pprobs.10, groups.10$pprob[i, class.10 + 2])
}
pprobs.2 <- tibble(prob = pprobs.2)
pprobs.3 <- tibble(prob = pprobs.3)
pprobs.4 <- tibble(prob = pprobs.4)
pprobs.5 <- tibble(prob = pprobs.5)
pprobs.6 <- tibble(prob = pprobs.6)
pprobs.7 <- tibble(prob = pprobs.7)
pprobs.8 <- tibble(prob = pprobs.8)
pprobs.9 <- tibble(prob = pprobs.9)
pprobs.10 <- tibble(prob = pprobs.10)

pprobs.2$Model <- as.factor(rep("Two clusters", nrow(pprobs.2)))
pprobs.3$Model <- as.factor(rep("Three clusters", nrow(pprobs.3)))
pprobs.4$Model <- as.factor(rep("Four clusters", nrow(pprobs.4)))
pprobs.5$Model <- as.factor(rep("Five clusters", nrow(pprobs.5)))
pprobs.6$Model <- as.factor(rep("Six clusters", nrow(pprobs.6)))
pprobs.7$Model <- as.factor(rep("Seven clusters", nrow(pprobs.7)))
pprobs.8$Model <- as.factor(rep("Eight clusters", nrow(pprobs.8)))
pprobs.9$Model <- as.factor(rep("Nine clusters", nrow(pprobs.9)))
pprobs.10$Model <- as.factor(rep("Ten clusters", nrow(pprobs.10)))
pprobs <- rbind(
  pprobs.2,
  pprobs.3,
  pprobs.4,
  pprobs.5,
  pprobs.6,
  pprobs.7,
  pprobs.8,
  pprobs.9,
  pprobs.10
)

p <- pprobs %>%
  ggplot(aes(x = prob, y = Model)) +
  # geom_histogram(bins = 40, fill = NA, position="identity")
  stat_slab(
    fill = "#5D5179", color = "gray",
    size = 0.8,
    alpha = 0.7
  ) +
  geom_dots(color = "#4F759B", dotsize = 1) +
  xlab("Posterior probability for cluster membership") +
  ylab("") +
  ggtitle(
    "Distribution of Posterior Probabilities Across Models",
    "Subject-specific posterior probabilities for assigned cluster"
  ) +
  theme_minimal() +
  scale_y_discrete(limits = rev)
ggsave("plots/pprob-distribution-crp-ma.png",
  p,
  width = 8.5,
  height = 8.5,
  units = "in"
)
ggsave("plots/pprob-distribution-crp-ma.pdf",
  p,
  width = 8.5,
  height = 8.5,
  units = "in"
)
print(p)

Residual plots

##### G = 3 ##### G = 4 ##### G = 5 ##### G = 6 ##### G = 7 ##### G = 8 ##### G = 9 ##### G = 10

Model metrics

Code
summaryplot(models.crp.ma[[4]],
  models.crp.ma[[5]],
  models.crp.ma[[6]],
  models.crp.ma[[7]],
  models.crp.ma[[8]],
  models.crp.ma[[9]],
  models.crp.ma[[10]],
  which = c("loglik", "AIC", "BIC", "entropy", "ICL"),
  mfrow = c(2, 3),
  axis = "Class"
)
Figure 3: Model metrics for CRP MA for G = 4-10
Code
crp.metrics <- makeMetrics(G.crp, models.crp.ma)
buildDT(crp.metrics)
Clusters Maximum log-likelihood AIC BIC
2 -15451.41 30932.82 31015.57
3 -15314.76 30675.52 30802.40
4 -15240.15 30542.29 30713.30
5 -15173.04 30424.09 30639.23
6 -15106.25 30306.50 30565.78
7 -15051.43 30212.86 30516.27
8 -15016.96 30159.91 30507.45
9 -15014.76 30171.53 30563.20
10 -15014.40 30186.81 30622.61

Spaghetti plots per cluster

Code
for (G in G.crp) {
  # Data frame to hold processed data
  new.crp <- data.frame(
    ids = numeric(),
    crp_result = numeric(),
    crp_time = numeric(),
    class = numeric()
  )

  for (clust in 1:G) {
    ids.clust <- subset(models.crp.ma[[G]]$pprob, class == clust)$ids
    n.clust <- length(ids.clust)
    rand <- sample(n.clust, n.clust) # Randomise the order of the ids
    iters <- floor(n.clust / 6) # How many groups of six are there?

    # Matrix to hold the smoothed data
    crp.ma <- matrix(NA, nrow = iters, ncol = 7)
    for (i in 0:(iters - 1)) {
      # Find ids for group of five
      ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
      crp.subset <- subset(crp_median, ids %in% ids.select)
      # Median process as per CRP preprocessing
      for (j in seq(0, 6)) {
        if (j == 6) {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time <= j + 1
          )
        } else {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time < j + 0.5
          )
        }
        if (nrow(sub.obs) > 0) {
          crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
        }
      }
    }

    rownames(crp.ma) <- 1:iters
    crp.ma <- reshape2::melt(t(crp.ma),
      id.vars = row.names(crp.ma),
      na.rm = TRUE
    )
    colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
    crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
    crp.ma$crp_time <- crp.ma$crp_time - 1
    # Take into account uneven spacing at start and end
    crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
      from = c(0, 6),
      to = c(0.25, 6.25)
    )
    crp.ma$class <- clust # Identify cluster assignment
    new.crp <- rbind(new.crp, crp.ma)
  }

  cairo_pdf(paste0("plots/spaghetti/crp-", G, ".pdf"),
    width = 10,
    height = 17.5
  )
  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
  invisible(dev.off())

  png(paste0("plots/spaghetti/crp-", G, ".png"),
    width = 10,
    height = 17.5,
    units = "in",
    res = 300
  )
  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
  invisible(dev.off())

  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
}

Chosen model

Code
new.crp <- data.frame(
  ids = numeric(),
  crp_result = numeric(),
  crp_time = numeric(),
  class = numeric()
)

for (clust in 1:8) {
  ids.clust <- subset(models.crp.ma[[8]]$pprob, class == clust)$ids
  n.clust <- length(ids.clust)
  rand <- sample(n.clust, n.clust) # Randomise the order of the ids
  iters <- floor(n.clust / 6) # How many groups of six are there?

  # Matrix to hold the smoothed data
  crp.ma <- matrix(NA, nrow = iters, ncol = 7)
  for (i in 0:(iters - 1)) {
    # Find ids for group of five
    ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
    crp.subset <- subset(crp_median, ids %in% ids.select)
    # Median process as per CRP preprocessing
    for (j in seq(0, 6)) {
      if (j == 6) {
        sub.obs <- subset(
          crp.subset,
          crp_time >= j - 0.5 & crp_time <= j + 1
        )
      } else {
        sub.obs <- subset(
          crp.subset,
          crp_time >= j - 0.5 & crp_time < j + 0.5
        )
      }
      if (nrow(sub.obs) > 0) {
        crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
      }
    }
  } 

  rownames(crp.ma) <- 1:iters
  crp.ma <- reshape2::melt(t(crp.ma),
    id.vars = row.names(crp.ma),
    na.rm = TRUE
  )
  colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
  crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
  crp.ma$crp_time <- crp.ma$crp_time - 1
  # Take into account uneven spacing at start and end
  crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
    from = c(0, 6),
    to = c(0.25, 6.25)
  )
  crp.ma$class <- clust # Identify cluster assignment
  new.crp <- rbind(new.crp, crp.ma)
}

Labelling

The labels/order of the clusters generated above are random. To improve the readability and assist with interpretation of associations, it would be beneficial to order clusters by some analogue for disease severity. This section explores possible solutions for this task.

Cumulative inflammation across follow-up

One possible solution is to order clusters by cumulative inflammation which is given by the area under the curve for each mean cluster trajectory.

Code
rank.full <- rankCumulative(models.crp.ma[[8]],
  tmax = 6.25,
  var.time = "crp_time"
)
rank.full %>%
  ggplot(aes(x = paste0("CRP", New), y = Area)) +
  geom_bar(stat = "identity", fill = "#745C97", color = "#39375B") +
  ylab("Cumulative inflammation (Area)") +
  xlab("Cluster") +
  theme_minimal()

Code
png(paste0("plots/spaghetti/crp-ncs-reordered.png"),
  width = 10,
  height = 16,
  units = "in",
  res = 300
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)
invisible(dev.off())

cairo_pdf(paste0("plots/spaghetti/crp-ncs-reordered.pdf"),
  width = 10,
  height = 16
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)
invisible(dev.off())

spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)

With Danish predictions

Code
dk.crp <- dk.crp %>%
  mutate(cluster = str_replace_all(cluster, "Ypred_class", "")) %>%
  filter(time <= 4.25)
Code
dk.crp$cluster <- plyr::mapvalues(dk.crp$cluster,
                                   from = seq(1, 8),
                                   to = c(1, 2, 8, 3, 5, 7, 6, 4))
Code
cairo_pdf(paste0("plots/spaghetti/crp-scot-dk.pdf"),
  width = 10,
  height = 16
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  external = dk.crp,
  mapping = rank.full$Original
)
invisible(dev.off())

png(paste0("plots/spaghetti/crp-scot-dk.png"),
  width = 10,
  height = 16,
  units = "in",
  res = 300
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  external = dk.crp,
  mapping = rank.full$Original
)
invisible(dev.off())

grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  external = dk.crp,
  mapping = rank.full$Original
)

Code
for (G in c(7, 9)) {
  new.crp <- data.frame(
    ids = numeric(),
    crp_result = numeric(),
    crp_time = numeric(),
    class = numeric()
  )

  for (clust in 1:G) {
    ids.clust <- subset(models.crp.ma[[G]]$pprob, class == clust)$ids
    n.clust <- length(ids.clust)
    rand <- sample(n.clust, n.clust) # Randomise the order of the ids
    iters <- floor(n.clust / 6) # How many groups of six are there?

    # Matrix to hold the smoothed data
    crp.ma <- matrix(NA, nrow = iters, ncol = 7)
    for (i in 0:(iters - 1)) {
      # Find ids for group of five
      ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
      crp.subset <- subset(crp_median, ids %in% ids.select)
      # Median process as per CRP preprocessing
      for (j in seq(0, 6)) {
        if (j == 6) {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time <= j + 1
          )
        } else {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time < j + 0.5
          )
        }
        if (nrow(sub.obs) > 0) {
          crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
        }
      }
    }

    rownames(crp.ma) <- 1:iters
    crp.ma <- reshape2::melt(t(crp.ma),
      id.vars = row.names(crp.ma),
      na.rm = TRUE
    )
    colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
    crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
    crp.ma$crp_time <- crp.ma$crp_time - 1
    # Take into account uneven spacing at start and end
    crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
      from = c(0, 6),
      to = c(0.25, 6.25)
    )
    crp.ma$class <- clust # Identify cluster assignment
    new.crp <- rbind(new.crp, crp.ma)
  }


  rank.full <- rankCumulative(models.crp.ma[[G]],
    tmax = 6.25,
    var.time = "crp_time"
  )
  png(paste0("plots/spaghetti/crp-ncs-reordered-", G, ".png"),
    width = 10,
    height = 16,
    units = "in",
    res = 300
  )

  grid::grid.newpage()
  spaghettiPlot(
    new.crp,
    models.crp.ma,
    G = G,
    log = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    knots = FALSE,
    var.time = "crp_time",
    clusters = TRUE,
    ylim = "data",
    mapping = rank.full$Original
  )
  invisible(dev.off())

  cairo_pdf(paste0("plots/spaghetti/crp-ncs-reordered-", G, ".pdf"),
    width = 10,
    height = 16
  )
  grid::grid.newpage()
  spaghettiPlot(
    new.crp,
    models.crp.ma,
    G = G,
    log = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    knots = FALSE,
    var.time = "crp_time",
    clusters = TRUE,
    ylim = "data",
    mapping = rank.full$Original
  )
  invisible(dev.off())
}

Alluvial anchored plot

Code
alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)


alluvial.df[alluvial.df[, "G"] == 2, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 2, "class"],
    from = c(1, 2),
    to = c(1, 2)
  )

alluvial.df[alluvial.df[, "G"] == 3, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 3, "class"],
    from = c(seq(1, 3)),
    to = c(1, 2, 6)
  )

alluvial.df[alluvial.df[, "G"] == 4, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 4, "class"],
    from = c(seq(1, 4)),
    to = c(6, 1, 2, 7)
  )

alluvial.df[alluvial.df[, "G"] == 5, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 5, "class"],
    from = c(seq(1, 5)),
    to = c(6, 5, 2, 1, 7)
  )

alluvial.df[alluvial.df[, "G"] == 6, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 6, "class"],
    from = c(seq(1, 6)),
    to = c(2, 1, 6, 3, 7, 5)
  )

alluvial.df[alluvial.df[, "G"] == 7, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 7, "class"],
    from = c(seq(1, 7)),
    to = c(2, 1, 4, 6, 5, 7, 3)
  )

alluvial.df[alluvial.df[, "G"] == 8, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 8, "class"],
    from = rank.full$Original,
    to = rank.full$New
  )


alluvial.df[alluvial.df[, "G"] == 9, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 9, "class"],
    from = c(seq(1, 9)),
    to = c(8, 2, 7, 1, 5, 9, 6, 3, 4)
  )


alluvial.df[alluvial.df[, "G"] == 10, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 10, "class"],
    from = c(seq(1, 10)),
    to = c(9, 4, 1, 5, 6, 7, 2, 10, 3, 8)
  )


# eliminate label switching

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = c(viridis::viridis(8), "#FFF282", "#F8F3C3")) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)

Code
for (format in c("png", "pdf")) {
  ggsave(paste0("plots/ma-nocor/anchored-alluvial.", format),
    p,
    width = 12,
    height = 6.75,
    units = "in"
  )
}

saveRDS(p, paste0(prefix, "processed/plots/crp-alluvial.RDS"))

Acknowledgments

This work is funded by the Medical Research Council & University of Edinburgh via a Precision Medicine PhD studentship (MR/N013166/1, to NC-C).

This work has made use of the resources provided by the Edinburgh Compute and Data Facility (ECDF).

Author contributions

References

Reuse

Licensed by CC BY unless otherwise stated.

MRC Human Genetics Unit logoCentre for Genomic & Experimental Medicine logo


Session info

Code

R version 4.4.2 (2024-10-31)

Platform: aarch64-apple-darwin20

locale: en_GB.UTF-8||en_GB.UTF-8||en_GB.UTF-8||C||en_GB.UTF-8||en_GB.UTF-8

attached base packages: splines, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: qqplotr(v.0.0.6), pander(v.0.6.5), ggalluvial(v.0.12.5), ggdist(v.3.3.2), kml(v.2.5.0), longitudinalData(v.2.4.7), misc3d(v.0.9-1), rgl(v.1.3.16), clv(v.0.3-2.4), class(v.7.3-22), cluster(v.2.1.6), lcmm(v.2.1.0), libdr(v.1.0.1), magrittr(v.2.0.3), patchwork(v.1.3.0), lubridate(v.1.9.4), forcats(v.1.0.0), stringr(v.1.5.1), dplyr(v.1.1.4), purrr(v.1.0.2), readr(v.2.1.5), tidyr(v.1.3.1), tibble(v.3.2.1), ggplot2(v.3.5.1) and tidyverse(v.2.0.0)

loaded via a namespace (and not attached): tidyselect(v.1.2.1), viridisLite(v.0.4.2), farver(v.2.1.2), viridis(v.0.6.5), bitops(v.1.0-9), fastmap(v.1.2.0), pracma(v.2.4.4), digest(v.0.6.37), timechange(v.0.3.0), lifecycle(v.1.0.4), survival(v.3.7-0), compiler(v.4.4.2), rlang(v.1.1.4), tools(v.4.4.2), yaml(v.2.3.10), knitr(v.1.49), labeling(v.0.4.3), htmlwidgets(v.1.6.4), plyr(v.1.8.9), withr(v.3.0.2), numDeriv(v.2016.8-1.1), grid(v.4.4.2), opdisDownsampling(v.1.0.1), caTools(v.1.18.3), colorspace(v.2.1-1), extrafontdb(v.1.0), scales(v.1.3.0), iterators(v.1.0.14), MASS(v.7.3-61), cli(v.3.6.3), mvtnorm(v.1.3-2), rmarkdown(v.2.29), ragg(v.1.3.3), marqLevAlg(v.2.0.8), generics(v.0.1.3), twosamples(v.2.0.1), robustbase(v.0.99-4-1), reshape2(v.1.4.4), tzdb(v.0.4.0), parallel(v.4.4.2), base64enc(v.0.1-3), vctrs(v.0.6.5), Matrix(v.1.7-1), jsonlite(v.1.8.9), hms(v.1.1.3), pbmcapply(v.1.5.1), systemfonts(v.1.1.0), foreach(v.1.5.2), glue(v.1.8.0), DEoptimR(v.1.1-3-1), codetools(v.0.2-20), rngWELL(v.0.10-10), distributional(v.0.5.0), stringi(v.1.8.4), gtable(v.0.3.6), quadprog(v.1.5-8), extrafont(v.0.19), randtoolbox(v.2.0.5), munsell(v.0.5.1), pillar(v.1.10.1), htmltools(v.0.5.8.1), R6(v.2.5.1), textshaping(v.0.4.1), tcltk(v.4.4.2), doParallel(v.1.0.17), evaluate(v.1.0.1), lattice(v.0.22-6), qqconf(v.1.3.2), Rcpp(v.1.0.13-1), gridExtra(v.2.3), nlme(v.3.1-166), Rttf2pt1(v.1.3.12), xfun(v.0.50) and pkgconfig(v.2.0.3)