Code
# TODO: add devel/tidy_rmsfit stuff to rcookbook
library(ggplot2)
library(gridExtra)
library(reshape2)
library(swt)
library(readxl)
Statistical report
This cookbook introduces the Swisstransplant R package swt
to make high-quality reports and publication-ready graphics in our in-house style—much more.
The source code for swt
is maintained on GitHub and can be accessed here. Additionally, a package manual is available for reference.
# TODO: add devel/tidy_rmsfit stuff to rcookbook
library(ggplot2)
library(gridExtra)
library(reshape2)
library(swt)
library(readxl)
The swt
package contains the Swisstransplant statistical report as a Quarto project template. Quarto documents enable dynamic reporting and unite written text, statistical programming code, results, tables, and figures into a reproducible document. All the steps for obtaining the results from the data are precisely specified. This enables transparency, reproducibility, and trust in statistical analyses.
A new project in RStudio can be created using the Swisstransplant project template: Click New Project… then choose New Directory, and select the Swisstransplant Project at the bottom of the list.
The Swisstransplant Quarto template is written in markdown. It is possible to add callout blocks to highlight important information.
::: {.callout-note appearance="simple"}
:
Note that there are five types of callouts, including`note`, `warning`, `important`, `tip`, and `caution`.
:::
Note that there are five types of callouts: note
, warning
, important
, tip
, and caution
.
Something important.
The new project will suggest a structure for an analysis pipeline. However, every project is different; therefore, only a rough structure is suggested.
flowchart TB O(Objectives) --> I(Data import) I --> P(Data processing) P --> Q(Quality control) Q --> D(Descriptive statistics) D --> A(Primary analysis) A --> S(Secondary analyses) S --> C(Computing information) O --- OC[Study objectives<br>Primary outcome<br>Secondary outcomes<br>Methods] I --- IC[Custom R functions<br>Load libraries<br>Two-pass reading when necessary<br>Data class conversion<br>Data cleaning<br>Reshape<br>Merge<br>Aggregate] P --- PC[Data overview<br>Definition of outcomes<br>Consent status<br>Inclusion, exclusion and subsets<br>Missing data and imputations<br>Transform<br>Matching<br>Create analysis data set] Q --- QC[Data checks<br>Assert logical expressions<br>Variable distributions] D --- DC[Sample size<br>Table 1<br>Consort Diagram] A --- AC[Results<br>Tables<br>Figures] S --- SC[Results<br>Tables<br>Figures] C --- CC[Runtime<br>Environment<br>Package names and versions]
The swt_colors()
function creates an object for user-friendly access to the Swisstransplant color scheme:
= swt::swt_colors()
col $blue.dark col
The following colors are available:
= swt_colors()
col par(mfrow=c(1,1), mai=c(0.5,0.1,0.2,0.1))
barplot(rep(1,12), axes=FALSE, col=c(col$blue.dark,
$blue.alt,
col$turkis.tpx,
col$yellow.donation,
col$strongred.akzent,
col$pink.heart,
col$red.liver,
col$darkyellow.kidney,
col$green.pancreas,
col$lightblue.lungs,
col$beige.intestine,
col$purple.alt
col
),names.arg = c("blue\ndark", "blue\nalt", "turkis\ntpx", "yellow\ndonat",
"strongr\nakzent", "pink\nheart", "red\nliver", "darkylw\nkidney",
"green\npancr", "lightb\nlungs", "beige\nintest", "purple\nalt"),
cex.names = 0.8
)
The swt_color
object also includes single hue palettes with three additional color strengths: 75%, 50%, and 25%.
par(mfrow=c(12,1), mai=c(0.1,0.1,0.2,0.1))
barplot(rep(1,4), axes=FALSE, col=c(col$pal.blue.swt[1],
$pal.blue.swt[2],
col$pal.blue.swt[3],
col$pal.blue.swt[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.blue.alt[1],
$pal.blue.alt[2],
col$pal.blue.alt[3],
col$pal.blue.alt[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.turkis.tpx[1],
$pal.turkis.tpx[2],
col$pal.turkis.tpx[3],
col$pal.turkis.tpx[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.yellow.donation[1],
$pal.yellow.donation[2],
col$pal.yellow.donation[3],
col$pal.yellow.donation[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.strongred.akzent[1],
$pal.strongred.akzent[2],
col$pal.strongred.akzent[3],
col$pal.strongred.akzent[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.pink.heart[1],
$pal.pink.heart[2],
col$pal.pink.heart[3],
col$pal.pink.heart[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.red.liver[1],
$pal.red.liver[2],
col$pal.red.liver[3],
col$pal.red.liver[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.darkyellow.kidney[1],
$pal.darkyellow.kidney[2],
col$pal.darkyellow.kidney[3],
col$pal.darkyellow.kidney[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.green.pancreas[1],
$pal.green.pancreas[2],
col$pal.green.pancreas[3],
col$pal.green.pancreas[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.lightblue.lungs[1],
$pal.lightblue.lungs[2],
col$pal.lightblue.lungs[3],
col$pal.lightblue.lungs[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.beige.intestine[1],
$pal.beige.intestine[2],
col$pal.beige.intestine[3],
col$pal.beige.intestine[4])
col
)
barplot(rep(1,4), axes=FALSE, col=c(col$pal.purple.alt[1],
$pal.purple.alt[2],
col$pal.purple.alt[3],
col$pal.purple.alt[4]),
colnames.arg = c("100%", "75%", "50%", "25%"),
cex.names = 0.8
)
ggplot2
Below are a few examples of creating various types of data plots in the SWT color scheme. The function swt_style()
adds the correct styling to the plot.
set.seed(1980)
= 100
n = c(rnorm(n/2, mean=0), rnorm(n/2, mean=3) )
var1 = data.frame(var1 = var1,
d var2 = var1 + rnorm(n, sd = 0.4),
group = as.factor(rep(c("abc", "mno" ), each=n/2))
)
= ggplot(d, aes(x=group, y=var1, group=group, col=group)) +
p1 geom_boxplot(fill=col$grey.bg) +
geom_point(size=2, shape=1, position = position_jitter(height = 0, width = 0.25),
col=c(rep(col$pal.blue.swt[1], n/2),
rep(col$pal.red.liver[1], n/2))) +
labs(title = "Title", tag = "A") +
scale_color_manual(values = c(col$blue.swt,
$red.liver)) +
colswt_style(legend_position = "none", grey_theme = TRUE, font_size = 14, title_size = 16) +
theme(plot.tag = element_text(size = 16, face = "bold"))
= ggplot(d, aes(x=group, y=var1, group=group, col=group)) +
p2 geom_boxplot() +
geom_point(size=2, position = position_jitter(height = 0, width = 0.25),
col=c(rep(col$pal.turkis.tpx[3], n/2),
rep(col$pal.darkyellow.kidney[3], n/2))) +
scale_color_manual(values = c(col$turkis.tpx,
$darkyellow.kidney)) +
colswt_style(legend_position = "none")
= ggplot(d, aes(x=var2, fill=group)) +
p3 geom_histogram(position = "identity", bins=20, alpha=0.5) +
scale_fill_manual(values = c(col$lightblue.lungs,
$beige.intestine)) +
colswt_style(grey_theme = FALSE)
= ggplot(d, aes(x=var2, y=var1, group=group, col=group)) +
p4 geom_point(size=2, alpha=0.5) +
scale_color_manual(values = c(col$blue.swt,
$yellow.donation)) +
collabs(title = "Title", tag = "D") +
swt_style(legend_position = "bottom", grey_theme = TRUE)
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
# Data taken from Annual Report 2020 (p. 32)
.2 = t(array(c(
table313.3,17.2,18.6,18.4,17.0,
11.5,12.6,14.9,11.7,11.2,
1.8,4.6,3.8,6.7,5.8), dim = c(5,3)))
colnames(table3.2) = 2016:2020
rownames(table3.2) = c("Total", "DBD", "DCD")
.2 = melt(table3.2, varnames = c("Gruppe", "Jahr"), value.name = "Anzahl") data3
= -0.8
number_height
ggplot(data3.2, aes(x=Jahr, y=Anzahl, col=Gruppe, group=Gruppe)) +
# plot line with numbers
geom_line(data = data3.2, linewidth=1) +
geom_text(data = data3.2, aes(label=Anzahl), vjust=number_height,
col="black", size=4) +
# some adjustments (colors, axes, etc)
scale_color_manual(values=c(col$strongred.akzent,
$yellow.donation,
col$blue.swt)) +
colscale_y_continuous(breaks = seq(0,22,4), limits = c(0,22)) +
ylab("pmp") +
labs(title="Title",
subtitle = "Subtitle"
+
) swt_style(grey_theme = TRUE) +
theme(legend.position = "top")
# Data taken from Annual Report 2020 (p. 31)
.1 = t(array(c(
table396,106,126,100,96,
15,39,32,57,50), dim = c(5,2)))
1.totals = array(colSums(table3.1), dim = c(1,5))
table3.colnames(table3.1) = 2016:2020
colnames(table3.1.totals) = 2016:2020
rownames(table3.1) = c("DBD", "DCD")
rownames(table3.1.totals) = c("Total")
.1 = melt(table3.1, varnames = c("Gruppe", "Jahr"), value.name = "Anzahl")
data31.totals = melt(table3.1.totals, varnames = c("Gruppe", "Jahr"),
data3.value.name = "Anzahl")
= -0.8
number_height = 0.5
bar_with
ggplot(data3.1, aes(x=Jahr, y=Anzahl, fill=Gruppe, group=Gruppe)) +
# plot bars with numbers
geom_bar(data = data3.1, stat="identity", position="dodge", width=bar_with) +
geom_text(data = data3.1, aes(label=Anzahl), vjust=number_height,
position = position_dodge(width=bar_with)) +
# plot line with numbers
geom_line(data = data3.1.totals, col = col$strongred.akzent, linewidth=1) +
geom_text(data = data3.1.totals, aes(label=Anzahl), vjust=number_height,
position = position_dodge(width=bar_with), col=col$strongred.akzent) +
# some adjustments (colors, axes, etc)
scale_fill_manual(values=c(col$yellow.donation,
$blue.swt,
col$strongred.akzent)) +
colscale_y_continuous(breaks = seq(0,180,20), limits = c(0,180)) +
ylab("Personen") +
swt_style(grey_theme = TRUE)
Several helper functions are implemented to support statistical computing.
mean_sd
for the mean and standard deviationmedian_irq
for the median and interquartile rangefreq_perc
for the frequency count and percentmiss_perc
for the frequency count and percent for missing data (NA
)tidy_pvalues
for formatting p-valuesThe helper functions above are applied to single variables (vectors).
tidy_missing
displays missing data of all the variables in a data frame.set.seed(1980)
= data.frame(age = rnorm(n = 200, mean = 50, sd = 10))
data $hypertension = rbinom(n = 200, size = 1, prob = 0.20)
data
= data.frame(age = mean_sd(data$age),
tab hypertension = freq_perc(data$hypertension)
)= t(tab)
tab colnames(tab) = "Descriptives"
= as.data.frame(tab)
tab rownames(tab) = c("age, mean (SD)", "hypertension, count (%)")
tab
Descriptives | |
---|---|
age, mean (SD) | 50.2 (10.7) |
hypertension, count (%) | 57 (28.5) |
$age[1:5] = NA
datatidy_missing(data)
Missing | |
---|---|
age | 5 (2.5%) |
hypertension | 0 (0.0%) |
TOTAL | 5 (2.5) |
Dates are sometimes entered inconsistently. If we force the data type to be a date using the option col_types
, these values are discarded and show up as NA
.
We also get a warning; see below.
= as.data.frame(read_xlsx(path = "../data/dates.xlsx", col_types = "date")) data
Warning: Expecting date in A4 / R4C1: got '11.03.1980 11:11:11'
data
mydate |
---|
2021-08-22 16:43:01 |
2023-02-17 22:12:02 |
NA |
An alternative is to import them as data type text. The dates show up as numbers and, if not recognized, as characters. The numbers are the number of days since 1899-12-30; this is an Excel convention.
= as.data.frame(read_xlsx(path = "../data/dates.xlsx", col_types = "text"))
data data
mydate |
---|
44430.696539351855 |
44974.925023148149 |
11.03.1980 11:11:11 |
We can use the function num2date()
to convert numbers into dates. There is also a built-in filter that recognizes the alternative format, which can also be modified via the option; see ?num2date
.
$mydate = num2date(data$mydate, format = "%d.%m.%Y %H:%M:%OS", round = FALSE)
data data
mydate |
---|
2021-08-22 16:43:01 |
2023-02-17 22:12:02 |
1980-03-11 11:11:10 |
One disadvantage of handling dates as numbers is when performing data cleaning. For example, manually correcting dates or adding missing data may be necessary. This has to be done in numbers before the conversion into the POSIXct
data type. However, the function date2num()
can be used for this purpose.
= as.data.frame(read_xlsx(path = "../data/dates.xlsx", col_types = "text"))
data $mydate[3] = date2num("1980-03-11 02:10:00")
data$mydate = num2date(data$mydate)
data data
mydate |
---|
2021-08-22 16:43:01 |
2023-02-17 22:12:02 |
1980-03-11 02:10:00 |
Using num2date()
for all date variables will also handle the time zone consistently using CET
time (and CEST
during summer). Note that having mixed time zones, UTC
and CET
, can cause offsets of 1 hour.
sessionInfo()
R version 4.4.3 (2025-02-28 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 10 x64 (build 19045)
Matrix products: default
locale:
[1] LC_COLLATE=English_Switzerland.utf8 LC_CTYPE=English_Switzerland.utf8
[3] LC_MONETARY=English_Switzerland.utf8 LC_NUMERIC=C
[5] LC_TIME=English_Switzerland.utf8
time zone: Europe/Zurich
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] readxl_1.4.5 swt_0.3 reshape2_1.4.4 gridExtra_2.3 ggplot2_3.5.1
loaded via a namespace (and not attached):
[1] gtable_0.3.6 jsonlite_1.9.1 dplyr_1.1.4 compiler_4.4.3
[5] tidyselect_1.2.1 Rcpp_1.0.14 stringr_1.5.1 splines_4.4.3
[9] scales_1.3.0 yaml_2.3.10 fastmap_1.2.0 lattice_0.22-6
[13] R6_2.6.1 plyr_1.8.9 labeling_0.4.3 generics_0.1.3
[17] knitr_1.50 MASS_7.3-65 htmlwidgets_1.6.4 tibble_3.2.1
[21] munsell_0.5.1 lubridate_1.9.4 pillar_1.10.1 rlang_1.1.5
[25] stringi_1.8.4 xfun_0.51 segmented_2.1-4 timechange_0.3.0
[29] cli_3.6.4 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
[33] grid_4.4.3 nlme_3.1-167 lifecycle_1.0.4 vctrs_0.6.5
[37] evaluate_1.0.3 glue_1.8.0 farver_2.1.2 cellranger_1.1.0
[41] colorspace_2.1-1 rmarkdown_2.29 tools_4.4.3 pkgconfig_2.0.3
[45] htmltools_0.5.8.1