Model selection

Nathan Constantine-Cooke; Karla Monterrubio-Gómez; Catalina A. Vallejos

Model selection

C-reactive protein

Authors

Affiliations

Nathan Constantine-Cooke

Centre for Genomic and Experimental Medicine

MRC Human Genetics Unit

Karla Monterrubio-Gómez

MRC Human Genetics Unit

Catalina A. Vallejos

MRC Human Genetics Unit

Published

December 13, 2024

Introduction

Code

set.seed(123)
if (file.exists("/.dockerenv")) { # Check if running in Docker
  # Assume igmm/cvallejo-predicct/libdr/ is passed to the data volume
  prefix <- "data/"
} else {
  # Assume running outside of a Docker container and the IGC(/IGMM) datastore is
  # mounted at /Volumes
  prefix <- "/Volumes/igmm/cvallejo-predicct/libdr/"
}


##########################
#--     Packages       --#
##########################

library(tidyverse)
# Support package (source found in libdr/)
library(libdr)
## Modelling ##
library(lcmm)
library(kml) # K-means
## Presentation ##
library(patchwork)
library(ggdist)
library(ggalluvial)
library(pander)
library(qqplotr)


##########################
#--     Data read      --#
#########################

dict <- readRDS(paste0(prefix, "processed/dict.RDS"))
crp_median <- readRDS(paste0(prefix, "processed/median-crp.RDS"))
crp <- readRDS(paste0(prefix, "processed/crp.RDS"))

Model selection

Code

# set the number of groups
G.crp <- numeric()
models.crp.ma <- list()
G.cands <- seq(2, 10)
for (G.cand in G.cands) {
  file.name <- paste0(prefix, "/cache/crp-ma/crp-", G.cand, ".RDS")
  if (file.exists(file.name)) {
    G.crp <- c(G.crp, G.cand)
    models.crp.ma[[G.cand]] <- readRDS(file.name)
  }
}
rm(G.cand)

Code

alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)

alluvial.df[alluvial.df[, "G"] == 3, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 3, "class"],
    from = c(seq(1, 3)),
    to = c(1, 2, 3)
  )

alluvial.df[alluvial.df[, "G"] == 4, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 4, "class"],
    from = c(seq(1, 4)),
    to = c(3, 1, 2, 4)
  )

alluvial.df[alluvial.df[, "G"] == 5, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 5, "class"],
    from = c(seq(1, 5)),
    to = c(3, 5, 2, 1, 4)
  )

alluvial.df[alluvial.df[, "G"] == 6, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 6, "class"],
    from = c(seq(1, 6)),
    to = c(2, 1, 3, 6, 4, 5)
  )

alluvial.df[alluvial.df[, "G"] == 7, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 7, "class"],
    from = c(seq(1, 7)),
    to = c(2, 1, 7, 3, 5, 4, 6)
  )

alluvial.df[alluvial.df[, "G"] == 8, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 8, "class"],
    from = c(seq(1, 8)),
    to = c(2, 6, 1, 7, 5, 4, 3, 8)
  )


alluvial.df[alluvial.df[, "G"] == 9, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 9, "class"],
    from = c(seq(1, 9)),
    to = c(8, 2, 4, 1, 5, 9, 3, 6, 7)
  )

alluvial.df[alluvial.df[, "G"] == 10, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 10, "class"],
    from = c(seq(1, 10)),
    to = c(9, 7, 1, 5, 3, 4, 2, 10, 6, 8)
  )

# eliminate label switching

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = rainbow(10)) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)


for (format in c("png", "pdf")) {
  ggsave(paste0("plots/ma-nocor/alluvial.", format),
    p,
    width = 12,
    height = 6.75,
    units = "in"
  )
}

Figure 1: Alluvial plot of cluster membership across G for CRP

Code

alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = rainbow(10)) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)

Figure 2: Alluvial plot of cluster membership across G for CRP without label switching

Posterior classifications

From the below data, we can see how posterior probabilities change as the number of assumed clusters increase

	class1	class2
N	1453	385
%	79.05	20.95

	prob1	prob2
class1	0.9027	0.0973
class2	0.1982	0.8018

	class1	class2
prob>0.7	90.78	69.61
prob>0.8	82.66	54.55
prob>0.9	68.27	37.14

	class1	class2	class3
N	893	359	586
%	48.59	19.53	31.88

	prob1	prob2	prob3
class1	0.8537	0.0778	0.0685
class2	0.1336	0.7989	0.0676
class3	0.1016	0.0572	0.8412

	class1	class2	class3
prob>0.7	79.73	68.25	77.47
prob>0.8	70.66	55.71	68.26
prob>0.9	56.66	38.72	52.56

	class1	class2	class3	class4
N	499	886	341	112
%	27.15	48.2	18.55	6.09

	prob1	prob2	prob3	prob4
class1	0.8014	0.0807	0.057	0.0609
class2	0.0562	0.8502	0.0702	0.0233
class3	0.0684	0.1304	0.7961	0.005
class4	0.1601	0.086	0.0038	0.7501

	class1	class2	class3	class4
prob>0.7	68.54	78.56	66.86	60.71
prob>0.8	58.72	70.2	58.06	45.54
prob>0.9	42.48	55.3	39.3	26.79

	class1	class2	class3	class4	class5
N	507	98	274	856	103
%	27.58	5.33	14.91	46.57	5.6

	prob1	prob2	prob3	prob4	prob5
class1	0.7923	0.0304	0.0384	0.0842	0.0547
class2	0.0892	0.7494	0.1004	0.0608	2e-04
class3	0.0417	0.0662	0.7612	0.1265	0.0044
class4	0.0531	0.0137	0.0649	0.8494	0.0189
class5	0.1526	5e-04	0.0029	0.0896	0.7545

	class1	class2	class3	class4	class5
prob>0.7	67.26	63.27	63.14	78.39	59.22
prob>0.8	56.02	44.9	46.35	69.39	49.51
prob>0.9	41.22	27.55	29.93	55.37	26.21

	class1	class2	class3	class4	class5	class6
N	280	823	477	54	104	100
%	15.23	44.78	25.95	2.94	5.66	5.44

	prob1	prob2	prob3	prob4	prob5	prob6
class1	0.7611	0.1161	0.0397	0.0106	0.005	0.0675
class2	0.0649	0.8252	0.0515	0.0277	0.0181	0.0126
class3	0.0374	0.0808	0.7844	0.0179	0.0528	0.0267
class4	0.0299	0.1145	0.0655	0.7707	0.0194	1e-04
class5	0.0046	0.0669	0.1413	0.0188	0.7682	3e-04
class6	0.0939	0.0584	0.0883	0.0019	2e-04	0.7573

	class1	class2	class3	class4	class5	class6
prob>0.7	63.93	74.85	67.51	62.96	62.5	63
prob>0.8	47.86	65.98	54.51	53.7	51.92	47
prob>0.9	31.79	47.75	40.88	38.89	28.85	31

	class1	class2	class3	class4	class5	class6	class7
N	281	783	57	457	106	97	57
%	15.29	42.6	3.1	24.86	5.77	5.28	3.1

	prob1	prob2	prob3	prob4	prob5	prob6	prob7
class1	0.7627	0.1136	9e-04	0.0368	0.0675	0.009	0.0094
class2	0.0625	0.8184	0.018	0.0441	0.0107	0.0139	0.0324
class3	0.0069	0.1064	0.7244	0.0964	0.0146	0.0462	0.0051
class4	0.0345	0.0744	0.0157	0.7807	0.0277	0.0497	0.0173
class5	0.0917	0.0647	0.002	0.0747	0.765	4e-04	0.0015
class6	0.0112	0.0595	0.023	0.1333	5e-04	0.7533	0.0193
class7	0.0203	0.1127	0.0011	0.0643	0	0.0314	0.7701

	class1	class2	class3	class4	class5	class6	class7
prob>0.7	63.7	74.71	57.89	66.74	62.26	62.89	63.16
prob>0.8	47.69	64.5	42.11	53.83	50	48.45	52.63
prob>0.9	32.74	45.34	22.81	39.82	33.02	30.93	43.86

	class1	class2	class3	class4	class5	class6	class7	class8
N	225	51	702	60	110	84	434	172
%	12.24	2.77	38.19	3.26	5.98	4.57	23.61	9.36

	prob1	prob2	prob3	prob4	prob5	prob6	prob7	prob8
class1	0.7803	0.0068	0.0921	0.0013	0.0633	0.0026	0.0534	3e-04
class2	0.013	0.7735	0.0897	0.0017	2e-04	0.0278	0.0845	0.0096
class3	0.0512	0.029	0.8324	0.0208	0.0096	0.0113	0.0451	6e-04
class4	0.0036	0.0032	0.0684	0.7425	0.0162	0.0314	0.0883	0.0463
class5	0.0735	0.002	0.0397	0.0037	0.7708	2e-04	0.0822	0.0279
class6	7e-04	0.0237	0.0384	0.0278	1e-04	0.7563	0.0997	0.0534
class7	0.0407	0.0195	0.0731	0.0222	0.0292	0.0406	0.6896	0.0851
class8	6e-04	0.0015	4e-04	0.007	0.0208	0.0231	0.1606	0.786

	class1	class2	class3	class4	class5	class6	class7	class8
prob>0.7	68.44	64.71	78.49	55	64.55	61.9	48.85	71.51
prob>0.8	51.56	52.94	67.52	46.67	53.64	48.81	31.34	52.33
prob>0.9	34.22	39.22	49.72	23.33	37.27	33.33	11.06	31.98

Table continues below
	class1	class2	class3	class4	class5	class6	class7	class8
N	172	223	86	601	113	102	432	50
%	9.36	12.13	4.68	32.7	6.15	5.55	23.5	2.72

	class9
N	59
%	3.21

Table continues below
	prob1	prob2	prob3	prob4	prob5	prob6	prob7
class1	0.7829	6e-04	0.0229	2e-04	0.0212	1e-04	0.1635
class2	3e-04	0.7827	0.0025	0.0856	0.0618	0.0073	0.0515
class3	0.0531	7e-04	0.7501	0.0368	1e-04	8e-04	0.1077
class4	3e-04	0.0532	0.0113	0.8317	0.0083	6e-04	0.0471
class5	0.027	0.0752	2e-04	0.0293	0.7653	0.011	0.0823
class6	0.0015	0.0374	0.0107	5e-04	0.0152	0.8437	0.028
class7	0.0843	0.04	0.0387	0.0726	0.0298	0.0031	0.6891
class8	0.0098	0.0128	0.0286	0.0725	2e-04	0.0106	0.0853
class9	0.0468	0.0036	0.0318	0.0647	0.0091	0.0028	0.088

	prob8	prob9
class1	0.0015	0.0071
class2	0.007	0.0013
class3	0.0232	0.0276
class4	0.0263	0.0213
class5	0.0022	0.0075
class6	0.0473	0.0156
class7	0.0196	0.0229
class8	0.7786	0.0017
class9	0.0031	0.7501

Table continues below
	class1	class2	class3	class4	class5	class6	class7
prob>0.7	71.51	69.51	60.47	78.37	63.72	83.33	48.38
prob>0.8	52.33	52.02	47.67	68.05	52.21	69.61	31.25
prob>0.9	31.4	33.63	33.72	48.75	36.28	53.92	10.65

	class8	class9
prob>0.7	66	57.63
prob>0.8	54	47.46
prob>0.9	40	27.12

Table continues below
	class1	class2	class3	class4	class5	class6	class7	class8
N	102	59	417	112	434	85	222	183
%	5.55	3.21	22.69	6.09	23.61	4.62	12.08	9.96

	class9	class10
N	52	172
%	2.83	9.36

Table continues below
	prob1	prob2	prob3	prob4	prob5	prob6	prob7
class1	0.845	0.0157	0	0.0152	0.0283	0.0103	0.0362
class2	0.0026	0.7509	0.0443	0.0091	0.0876	0.0322	0.0035
class3	0	0.0248	0.8528	0.0072	0.0415	0.0131	0.0341
class4	0.0111	0.0077	0.0108	0.7673	0.0825	2e-04	0.0746
class5	0.003	0.0231	0.0419	0.0297	0.6881	0.0396	0.0398
class6	8e-04	0.0276	0.0357	1e-04	0.1034	0.7512	6e-04
class7	0.0074	0.0014	0.0183	0.0616	0.0523	0.0025	0.7769
class8	4e-04	0.0138	0.0239	0.0127	0.0583	0.0061	0.0921
class9	0.0101	0.0017	0.0555	2e-04	0.0896	0.0276	0.012
class10	1e-04	0.0071	2e-04	0.0212	0.1603	0.0232	6e-04

	prob8	prob9	prob10
class1	1e-04	0.0477	0.0015
class2	0.0202	0.0031	0.0464
class3	3e-04	0.0259	4e-04
class4	0.0161	0.0023	0.0275
class5	0.0305	0.0196	0.0847
class6	0.0029	0.0243	0.0535
class7	0.0726	0.0069	3e-04
class8	0.7699	0.0228	1e-04
class9	0.0292	0.7646	0.0094
class10	0	0.0015	0.7857

Table continues below
	class1	class2	class3	class4	class5	class6	class7
prob>0.7	83.33	57.63	82.49	64.29	48.85	61.18	68.47
prob>0.8	70.59	45.76	72.9	52.68	30.88	48.24	50.9
prob>0.9	53.92	25.42	53.24	36.61	11.29	32.94	32.88

	class8	class9	class10
prob>0.7	68.85	63.46	71.51
prob>0.8	52.46	51.92	52.33
prob>0.9	32.24	38.46	31.98

Code

rm(output)

Code

pprobs.2 <- c()
groups.2 <- models.crp.ma[[2]]
pprobs.3 <- c()
groups.3 <- models.crp.ma[[3]]
pprobs.4 <- c()
groups.4 <- models.crp.ma[[4]]
pprobs.5 <- c()
groups.5 <- models.crp.ma[[5]]
pprobs.6 <- c()
groups.6 <- models.crp.ma[[6]]
pprobs.7 <- c()
groups.7 <- models.crp.ma[[7]]
pprobs.8 <- c()
groups.8 <- models.crp.ma[[8]]
pprobs.9 <- c()
groups.9 <- models.crp.ma[[9]]
pprobs.10 <- c()
groups.10 <- models.crp.ma[[10]]


for (i in 1:nrow(models.crp.ma[[2]]$pprob)) {
  class.2 <- groups.2$pprob[i, 2]
  pprobs.2 <- c(pprobs.2, groups.2$pprob[i, class.2 + 2])
  class.3 <- groups.3$pprob[i, 2]
  pprobs.3 <- c(pprobs.3, groups.3$pprob[i, class.3 + 2])
  class.4 <- groups.4$pprob[i, 2]
  pprobs.4 <- c(pprobs.4, groups.4$pprob[i, class.4 + 2])
  class.5 <- groups.5$pprob[i, 2]
  pprobs.5 <- c(pprobs.5, groups.5$pprob[i, class.5 + 2])
  class.6 <- groups.6$pprob[i, 2]
  pprobs.6 <- c(pprobs.6, groups.6$pprob[i, class.6 + 2])
  class.7 <- groups.7$pprob[i, 2]
  pprobs.7 <- c(pprobs.7, groups.7$pprob[i, class.7 + 2])
  class.8 <- groups.8$pprob[i, 2]
  pprobs.8 <- c(pprobs.8, groups.8$pprob[i, class.8 + 2])
  class.9 <- groups.9$pprob[i, 2]
  pprobs.9 <- c(pprobs.9, groups.9$pprob[i, class.9 + 2])
  class.10 <- groups.10$pprob[i, 2]
  pprobs.10 <- c(pprobs.10, groups.10$pprob[i, class.10 + 2])
}
pprobs.2 <- tibble(prob = pprobs.2)
pprobs.3 <- tibble(prob = pprobs.3)
pprobs.4 <- tibble(prob = pprobs.4)
pprobs.5 <- tibble(prob = pprobs.5)
pprobs.6 <- tibble(prob = pprobs.6)
pprobs.7 <- tibble(prob = pprobs.7)
pprobs.8 <- tibble(prob = pprobs.8)
pprobs.9 <- tibble(prob = pprobs.9)
pprobs.10 <- tibble(prob = pprobs.10)

pprobs.2$Model <- as.factor(rep("Two clusters", nrow(pprobs.2)))
pprobs.3$Model <- as.factor(rep("Three clusters", nrow(pprobs.3)))
pprobs.4$Model <- as.factor(rep("Four clusters", nrow(pprobs.4)))
pprobs.5$Model <- as.factor(rep("Five clusters", nrow(pprobs.5)))
pprobs.6$Model <- as.factor(rep("Six clusters", nrow(pprobs.6)))
pprobs.7$Model <- as.factor(rep("Seven clusters", nrow(pprobs.7)))
pprobs.8$Model <- as.factor(rep("Eight clusters", nrow(pprobs.8)))
pprobs.9$Model <- as.factor(rep("Nine clusters", nrow(pprobs.9)))
pprobs.10$Model <- as.factor(rep("Ten clusters", nrow(pprobs.10)))
pprobs <- rbind(
  pprobs.2,
  pprobs.3,
  pprobs.4,
  pprobs.5,
  pprobs.6,
  pprobs.7,
  pprobs.8,
  pprobs.9,
  pprobs.10
)

p <- pprobs %>%
  ggplot(aes(x = prob, y = Model)) +
  # geom_histogram(bins = 40, fill = NA, position="identity")
  stat_slab(
    fill = "#5D5179", color = "gray",
    size = 0.8,
    alpha = 0.7
  ) +
  geom_dots(color = "#4F759B", dotsize = 1) +
  xlab("Posterior probability for cluster membership") +
  ylab("") +
  ggtitle(
    "Distribution of Posterior Probabilities Across Models",
    "Subject-specific posterior probabilities for assigned cluster"
  ) +
  theme_minimal() +
  scale_y_discrete(limits = rev)
ggsave("plots/pprob-distribution-crp-ma.png",
  p,
  width = 8.5,
  height = 8.5,
  units = "in"
)
ggsave("plots/pprob-distribution-crp-ma.pdf",
  p,
  width = 8.5,
  height = 8.5,
  units = "in"
)
print(p)

Residual plots

G = 2

##### G = 3 ##### G = 4 ##### G = 5 ##### G = 6 ##### G = 7 ##### G = 8 ##### G = 9 ##### G = 10

Model metrics

Code

summaryplot(models.crp.ma[[4]],
  models.crp.ma[[5]],
  models.crp.ma[[6]],
  models.crp.ma[[7]],
  models.crp.ma[[8]],
  models.crp.ma[[9]],
  models.crp.ma[[10]],
  which = c("loglik", "AIC", "BIC", "entropy", "ICL"),
  mfrow = c(2, 3),
  axis = "Class"
)

Figure 3: Model metrics for CRP MA for G = 4-10

Code

crp.metrics <- makeMetrics(G.crp, models.crp.ma)
buildDT(crp.metrics)

Clusters	Maximum log-likelihood	AIC	BIC
2	-15451.41	30932.82	31015.57
3	-15314.76	30675.52	30802.40
4	-15240.15	30542.29	30713.30
5	-15173.04	30424.09	30639.23
6	-15106.25	30306.50	30565.78
7	-15051.43	30212.86	30516.27
8	-15016.96	30159.91	30507.45
9	-15014.76	30171.53	30563.20
10	-15014.40	30186.81	30622.61

Spaghetti plots per cluster

Code

for (G in G.crp) {
  # Data frame to hold processed data
  new.crp <- data.frame(
    ids = numeric(),
    crp_result = numeric(),
    crp_time = numeric(),
    class = numeric()
  )

  for (clust in 1:G) {
    ids.clust <- subset(models.crp.ma[[G]]$pprob, class == clust)$ids
    n.clust <- length(ids.clust)
    rand <- sample(n.clust, n.clust) # Randomise the order of the ids
    iters <- floor(n.clust / 6) # How many groups of six are there?

    # Matrix to hold the smoothed data
    crp.ma <- matrix(NA, nrow = iters, ncol = 7)
    for (i in 0:(iters - 1)) {
      # Find ids for group of five
      ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
      crp.subset <- subset(crp_median, ids %in% ids.select)
      # Median process as per CRP preprocessing
      for (j in seq(0, 6)) {
        if (j == 6) {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time <= j + 1
          )
        } else {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time < j + 0.5
          )
        }
        if (nrow(sub.obs) > 0) {
          crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
        }
      }
    }

    rownames(crp.ma) <- 1:iters
    crp.ma <- reshape2::melt(t(crp.ma),
      id.vars = row.names(crp.ma),
      na.rm = TRUE
    )
    colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
    crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
    crp.ma$crp_time <- crp.ma$crp_time - 1
    # Take into account uneven spacing at start and end
    crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
      from = c(0, 6),
      to = c(0.25, 6.25)
    )
    crp.ma$class <- clust # Identify cluster assignment
    new.crp <- rbind(new.crp, crp.ma)
  }

  cairo_pdf(paste0("plots/spaghetti/crp-", G, ".pdf"),
    width = 10,
    height = 17.5
  )
  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
  invisible(dev.off())

  png(paste0("plots/spaghetti/crp-", G, ".png"),
    width = 10,
    height = 17.5,
    units = "in",
    res = 300
  )
  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
  invisible(dev.off())

  grid::grid.newpage()
  spaghettiPlot(new.crp,
    models.crp.ma,
    G,
    clusters = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    var.time = "crp_time",
    ylim = "data"
  )
}

Chosen model

Code

new.crp <- data.frame(
  ids = numeric(),
  crp_result = numeric(),
  crp_time = numeric(),
  class = numeric()
)

for (clust in 1:8) {
  ids.clust <- subset(models.crp.ma[[8]]$pprob, class == clust)$ids
  n.clust <- length(ids.clust)
  rand <- sample(n.clust, n.clust) # Randomise the order of the ids
  iters <- floor(n.clust / 6) # How many groups of six are there?

  # Matrix to hold the smoothed data
  crp.ma <- matrix(NA, nrow = iters, ncol = 7)
  for (i in 0:(iters - 1)) {
    # Find ids for group of five
    ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
    crp.subset <- subset(crp_median, ids %in% ids.select)
    # Median process as per CRP preprocessing
    for (j in seq(0, 6)) {
      if (j == 6) {
        sub.obs <- subset(
          crp.subset,
          crp_time >= j - 0.5 & crp_time <= j + 1
        )
      } else {
        sub.obs <- subset(
          crp.subset,
          crp_time >= j - 0.5 & crp_time < j + 0.5
        )
      }
      if (nrow(sub.obs) > 0) {
        crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
      }
    }
  }

  rownames(crp.ma) <- 1:iters
  crp.ma <- reshape2::melt(t(crp.ma),
    id.vars = row.names(crp.ma),
    na.rm = TRUE
  )
  colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
  crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
  crp.ma$crp_time <- crp.ma$crp_time - 1
  # Take into account uneven spacing at start and end
  crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
    from = c(0, 6),
    to = c(0.25, 6.25)
  )
  crp.ma$class <- clust # Identify cluster assignment
  new.crp <- rbind(new.crp, crp.ma)
}

Labelling

The labels/order of the clusters generated above are random. To improve the readability and assist with interpretation of associations, it would be beneficial to order clusters by some analogue for disease severity. This section explores possible solutions for this task.

Cumulative inflammation across follow-up

One possible solution is to order clusters by cumulative inflammation which is given by the area under the curve for each mean cluster trajectory.

Code

rank.full <- rankCumulative(models.crp.ma[[8]],
  tmax = 6.25,
  var.time = "crp_time"
)
rank.full %>%
  ggplot(aes(x = paste0("CRP", New), y = Area)) +
  geom_bar(stat = "identity", fill = "#745C97", color = "#39375B") +
  ylab("Cumulative inflammation (Area)") +
  xlab("Cluster") +
  theme_minimal()

Code

png(paste0("plots/spaghetti/crp-ncs-reordered.png"),
  width = 10,
  height = 16,
  units = "in",
  res = 300
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)
invisible(dev.off())

cairo_pdf(paste0("plots/spaghetti/crp-ncs-reordered.pdf"),
  width = 10,
  height = 16
)
grid::grid.newpage()
spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)
invisible(dev.off())

spaghettiPlot(new.crp,
  models.crp.ma,
  G = 8,
  log = TRUE,
  tmax = 6.25,
  sizes = TRUE,
  knots = FALSE,
  var.time = "crp_time",
  clusters = TRUE,
  ylim = "data",
  mapping = rank.full$Original
)

Code

for (G in c(7, 9)) {
  new.crp <- data.frame(
    ids = numeric(),
    crp_result = numeric(),
    crp_time = numeric(),
    class = numeric()
  )

  for (clust in 1:G) {
    ids.clust <- subset(models.crp.ma[[G]]$pprob, class == clust)$ids
    n.clust <- length(ids.clust)
    rand <- sample(n.clust, n.clust) # Randomise the order of the ids
    iters <- floor(n.clust / 6) # How many groups of six are there?

    # Matrix to hold the smoothed data
    crp.ma <- matrix(NA, nrow = iters, ncol = 7)
    for (i in 0:(iters - 1)) {
      # Find ids for group of five
      ids.select <- ids.clust[rand[((i * 6) + 1):((i * 6) + 6)]]
      crp.subset <- subset(crp_median, ids %in% ids.select)
      # Median process as per CRP preprocessing
      for (j in seq(0, 6)) {
        if (j == 6) {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time <= j + 1
          )
        } else {
          sub.obs <- subset(
            crp.subset,
            crp_time >= j - 0.5 & crp_time < j + 0.5
          )
        }
        if (nrow(sub.obs) > 0) {
          crp.ma[i + 1, j + 1] <- median(sub.obs$crp_result)
        }
      }
    }

    rownames(crp.ma) <- 1:iters
    crp.ma <- reshape2::melt(t(crp.ma),
      id.vars = row.names(crp.ma),
      na.rm = TRUE
    )
    colnames(crp.ma) <- c("crp_time", "ids", "crp_result")
    crp.ma <- crp.ma[, c(2, 3, 1)] # Make ids first column
    crp.ma$crp_time <- crp.ma$crp_time - 1
    # Take into account uneven spacing at start and end
    crp.ma$crp_time <- plyr::mapvalues(crp.ma$crp_time,
      from = c(0, 6),
      to = c(0.25, 6.25)
    )
    crp.ma$class <- clust # Identify cluster assignment
    new.crp <- rbind(new.crp, crp.ma)
  }


  rank.full <- rankCumulative(models.crp.ma[[G]],
    tmax = 6.25,
    var.time = "crp_time"
  )
  png(paste0("plots/spaghetti/crp-ncs-reordered-", G, ".png"),
    width = 10,
    height = 16,
    units = "in",
    res = 300
  )

  grid::grid.newpage()
  spaghettiPlot(
    new.crp,
    models.crp.ma,
    G = G,
    log = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    knots = FALSE,
    var.time = "crp_time",
    clusters = TRUE,
    ylim = "data",
    mapping = rank.full$Original
  )
  invisible(dev.off())

  cairo_pdf(paste0("plots/spaghetti/crp-ncs-reordered-", G, ".pdf"),
    width = 10,
    height = 16
  )
  grid::grid.newpage()
  spaghettiPlot(
    new.crp,
    models.crp.ma,
    G = G,
    log = TRUE,
    tmax = 6.25,
    sizes = TRUE,
    knots = FALSE,
    var.time = "crp_time",
    clusters = TRUE,
    ylim = "data",
    mapping = rank.full$Original
  )
  invisible(dev.off())
}

Alluvial anchored plot

Code

alluvial.df <- matrix(nrow = 0, ncol = 3)
colnames(alluvial.df) <- c("ids", "class", "G")

for (G in G.crp) {
  alluvial.df <- rbind(alluvial.df, cbind(models.crp.ma[[G]]$pprob[, 1:2],
    G = G
  ))
}
alluvial.df <- as.data.frame(alluvial.df)

alluvial.df$ids <- as.character(alluvial.df$ids)
alluvial.df$class <- as.factor(alluvial.df$class)


alluvial.df[alluvial.df[, "G"] == 2, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 2, "class"],
    from = c(1, 2),
    to = c(1, 2)
  )

alluvial.df[alluvial.df[, "G"] == 3, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 3, "class"],
    from = c(seq(1, 3)),
    to = c(1, 2, 6)
  )

alluvial.df[alluvial.df[, "G"] == 4, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 4, "class"],
    from = c(seq(1, 4)),
    to = c(6, 1, 2, 7)
  )

alluvial.df[alluvial.df[, "G"] == 5, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 5, "class"],
    from = c(seq(1, 5)),
    to = c(6, 5, 2, 1, 7)
  )

alluvial.df[alluvial.df[, "G"] == 6, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 6, "class"],
    from = c(seq(1, 6)),
    to = c(2, 1, 6, 3, 7, 5)
  )

alluvial.df[alluvial.df[, "G"] == 7, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 7, "class"],
    from = c(seq(1, 7)),
    to = c(2, 1, 4, 6, 5, 7, 3)
  )

alluvial.df[alluvial.df[, "G"] == 8, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 8, "class"],
    from = rank.full$Original,
    to = rank.full$New
  )


alluvial.df[alluvial.df[, "G"] == 9, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 9, "class"],
    from = c(seq(1, 9)),
    to = c(8, 2, 7, 1, 5, 9, 6, 3, 4)
  )


alluvial.df[alluvial.df[, "G"] == 10, "class"] <-
  plyr::mapvalues(alluvial.df[alluvial.df[, "G"] == 10, "class"],
    from = c(seq(1, 10)),
    to = c(9, 4, 1, 5, 6, 7, 2, 10, 3, 8)
  )


# eliminate label switching

p <- ggplot(
  alluvial.df,
  aes(
    x = G,
    stratum = class,
    alluvium = ids,
    fill = class,
    label = class
  )
) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = 0.5) +
  geom_text(stat = "stratum", size = 3) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = c(viridis::viridis(8), "#FFF282", "#F8F3C3")) +
  xlab("Assumed number of clusters") +
  ylab("Frequency")
print(p)

Code

for (format in c("png", "pdf")) {
  ggsave(paste0("plots/ma-nocor/anchored-alluvial.", format),
    p,
    width = 12,
    height = 6.75,
    units = "in"
  )
}

saveRDS(p, paste0(prefix, "processed/plots/crp-alluvial.RDS"))

Acknowledgments

This work is funded by the Medical Research Council & University of Edinburgh via a Precision Medicine PhD studentship (MR/N013166/1, to NC-C).

This work has made use of the resources provided by the Edinburgh Compute and Data Facility (ECDF).

Author contributions

References

Reuse

Licensed by CC BY unless otherwise stated.

MRC Human Genetics Unit logo

Session info

Code

pander(sessionInfo())

R version 4.4.2 (2024-10-31)

Platform: aarch64-apple-darwin20

locale: en_GB.UTF-8||en_GB.UTF-8||en_GB.UTF-8||C||en_GB.UTF-8||en_GB.UTF-8

attached base packages: splines, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: qqplotr(v.0.0.6), pander(v.0.6.5), ggalluvial(v.0.12.5), ggdist(v.3.3.2), kml(v.2.5.0), longitudinalData(v.2.4.7), misc3d(v.0.9-1), rgl(v.1.3.14), clv(v.0.3-2.4), class(v.7.3-22), cluster(v.2.1.6), lcmm(v.2.1.0), libdr(v.1.0.0), magrittr(v.2.0.3), patchwork(v.1.3.0), lubridate(v.1.9.3), forcats(v.1.0.0), stringr(v.1.5.1), dplyr(v.1.1.4), purrr(v.1.0.2), readr(v.2.1.5), tidyr(v.1.3.1), tibble(v.3.2.1), ggplot2(v.3.5.1) and tidyverse(v.2.0.0)

loaded via a namespace (and not attached): tidyselect(v.1.2.1), viridisLite(v.0.4.2), farver(v.2.1.2), viridis(v.0.6.5), bitops(v.1.0-9), fastmap(v.1.2.0), pracma(v.2.4.4), digest(v.0.6.37), timechange(v.0.3.0), lifecycle(v.1.0.4), survival(v.3.7-0), compiler(v.4.4.2), rlang(v.1.1.4), tools(v.4.4.2), utf8(v.1.2.4), yaml(v.2.3.10), knitr(v.1.49), labeling(v.0.4.3), htmlwidgets(v.1.6.4), plyr(v.1.8.9), withr(v.3.0.2), numDeriv(v.2016.8-1.1), grid(v.4.4.2), fansi(v.1.0.6), opdisDownsampling(v.1.0.1), caTools(v.1.18.3), colorspace(v.2.1-1), extrafontdb(v.1.0), scales(v.1.3.0), iterators(v.1.0.14), MASS(v.7.3-61), cli(v.3.6.3), mvtnorm(v.1.3-2), rmarkdown(v.2.29), ragg(v.1.3.3), marqLevAlg(v.2.0.8), generics(v.0.1.3), twosamples(v.2.0.1), robustbase(v.0.99-4-1), reshape2(v.1.4.4), tzdb(v.0.4.0), parallel(v.4.4.2), base64enc(v.0.1-3), vctrs(v.0.6.5), Matrix(v.1.7-1), jsonlite(v.1.8.9), hms(v.1.1.3), pbmcapply(v.1.5.1), systemfonts(v.1.1.0), foreach(v.1.5.2), glue(v.1.8.0), DEoptimR(v.1.1-3-1), codetools(v.0.2-20), rngWELL(v.0.10-10), distributional(v.0.5.0), stringi(v.1.8.4), gtable(v.0.3.6), quadprog(v.1.5-8), extrafont(v.0.19), randtoolbox(v.2.0.5), munsell(v.0.5.1), pillar(v.1.9.0), htmltools(v.0.5.8.1), R6(v.2.5.1), textshaping(v.0.4.0), tcltk(v.4.4.2), doParallel(v.1.0.17), evaluate(v.1.0.1), lattice(v.0.22-6), qqconf(v.1.3.2), Rcpp(v.1.0.13-1), gridExtra(v.2.3), nlme(v.3.1-166), Rttf2pt1(v.1.3.12), xfun(v.0.49) and pkgconfig(v.2.0.3)