## Options
options(tidyverse.quiet = TRUE) # quietly
options(dplyr.summarise.inform = FALSE) # quietly
options(mc.cores = min(parallel::detectCores(), 8)) # cpu cores for MCMC
## Packages
# Utilities
library(readxl) # I/O
library(lubridate) # date formatting
library(broman) # number formatting
# Models
library(brms) # fit Bayesian regression models in stan
library(cmdstanr) # R interface to CmdStan
library(tidybayes) # extract posterior samples
# Graphs
library(patchwork) # combine plots
library(ragg) # more consistent graphics output
# Tidyverse
library(knitr) # rendering
library(quarto) # rendering
library(tidyverse) # load last to avoid conflicts (particularly with data.table)
## Load custom functions
source("bgcomp_funcs.R")
Estimating Household Fecal Contamination Associations with Enteric Pathogens in Child Stool
Version 1.1
Description
This report provides R code to conduct the analysis of household fecal marker associations with enteric pathogens and child diarrhea described in:
Holcomb DA, Knee J, Adriano Z, Capone D, Cumming O, Kowalsky E, Nalá R, Viegas E, Stewart JR, Brown J. (2025). Associations between fecal contamination of the household environment and enteric pathogen detection in children living in Maputo, Mozambique. Environment & Health. https://doi.org/10.1021/envhealth.4c00283
A companion R script, bgcomp_funcs.R
, abstracts the functions developed in this analysis to simultaneously estimate individual and pooled associations of a single predictor with multiple binary outcomes, using marginal standardization to obtain prevalence ratio and prevalence difference estimates. It can be sourced into an R session to make the functions available in the global environment for use. The accompanying dataset, data_mst_gpp.xlsx
, is provided to enable replicating this analysis and includes a variable codebook. All files are available in the manuscript repository.
Setup
Load packages and set options.
Data
Read
The data have been prepared in “long” format, meaning that each combination of pathogen, environmental sample type, and fecal marker assessed is presented on its own row. The data file has two sheets: “data” containing the data and “codebook” with a description of each variable. Read in the dataset, extract summary information such as pathogen labels and detections, and save in RDS format for downstream use.
## read in data
<- read_xlsx("../data/data_mst_gpp.xlsx", sheet = "data")
df_raw
## extract pathogen labels
<- df_raw %>%
path_labels select(path, class,
label,%>%
name) distinct() %>%
bind_rows(tibble(path = c("diarrhea",
"bacteria", "viruses", "protozoa", "sth",
"pooled"),
class = c("diarrhea",
"bacteria", "viruses", "protozoa", "sth",
"all"),
label = c("Diarrhea",
"Any bacteria", "Any viruses", "Any protozoa", "Any STH",
"Typical pathogen")))
saveRDS(path_labels, "../output/path_labels.rds")
## extract pathogen detection counts
<- df_raw %>%
path_detects select(path, class,
%>%
n_obs, n_pos, pct_pos) distinct()
saveRDS(path_detects, "../output/path_detects.rds")
## refine data and save
<- df_raw %>%
df_cln filter(include_n5 == "yes",
!(type == "hw" & marker == "Mnif")) %>%
select(comp, hh, child, time,
indv, stool,
female,
age_years_scl,
careprimary,
wealth_scl,
diarrhea,
bacteria, viruses, protozoa, sth,
class, path, infect,
type, marker, envr_id,
detect, quant, quant_cnt)saveRDS(df_cln, "../output/data_mst_gpp.rds")
Summarize
Additional data summaries for downstream use.
## diarrhea data
<- readRDS("../output/data_mst_gpp.rds") %>%
obs_diarr select(indv, time, stool, type,
envr_id, marker, detect, quant,
diarrhea,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
mutate(infect = diarrhea,
path = "diarrhea",
source = "survey")
## class data
<- readRDS("../output/data_mst_gpp.rds") %>%
obs_class select(indv, time, stool, type,
envr_id, marker, detect, quant,
bacteria, viruses, protozoa, sth,
diarrhea,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
pivot_longer(c(bacteria, viruses, protozoa, sth),
names_to = "path",
values_to = "infect") %>%
mutate(source = ifelse(path == "sth", "kk", "gpp"))
## path data
<- readRDS("../output/data_mst_gpp.rds") %>%
obs_path select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,%>%
female, age_years_scl, careprimary, wealth_scl) distinct()
## microscopy data
<- readRDS("../output/data_mst_gpp.rds") %>%
obs_kk filter(class %in% c("sth")) %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
mutate(source = "kk")
## gpp data
<- readRDS("../output/data_mst_gpp.rds") %>%
obs_gpp filter(class %in% c("bacteria", "viruses", "protozoa")) %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
mutate(source = "gpp")
## stacked
<- bind_rows(obs_gpp,
obs_source
obs_kk,
obs_class,%>%
obs_diarr) arrange(source, stool, path)
## Summarise outcomes
<- obs_source %>%
tab_outcome filter(!is.na(infect)) %>%
select(indv, time, stool,
path, infect,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
drop_na() %>%
left_join(readRDS("../output/path_labels.rds"), by = "path") %>%
group_by(path, label, class) %>%
summarise(n_obs = n(),
infect_n = sum(infect),
infect_pct = round((infect_n / n_obs) * 100, 0)) %>%
ungroup() %>%
mutate(class = fct_relevel(class, c("bacteria", "viruses", "protozoa", "sth", "diarrhea")),
path = fct_relevel(path,
c("bacteria", "cdif", "campy", "etec", "stec", "shig",
"viruses", "noro",
"protozoa", "crypto", "giardia",
"sth", "asc", "trich",
"diarrhea"))) %>%
arrange(class, path)
saveRDS(tab_outcome, "../output/tab_outcome.rds")
Model Functions
We fit Bayesian hierarchical models with random slopes for the exposure variable (i.e., fecal marker detection) and all other covariates by pathogen to estimate the individual association of the exposure with each pathogen as well as a pooled association across all the pathogens. We then generate posterior predictions for all-exposed and none-exposed scenarios to obtain marginal estimates of the prevalence ratio and prevalence difference for each pathogen and pooled across pathogens, an approach known as marginal standardization or, alternatively, the Bayesian parametric g-formula (“bgcomp”). The models are specified and sampled using the brms
package with cmdstanr
backend and posterior predictions are generated with the tidybayes
package. Custom functions are defined to wrap the functionality from these other packages and implement the bgcomp procedure.
Utility function
The bgcomp procedure is implemented somewhat differently for binary and continuous exposures, so define a utility function to determine if a given variable is binary or not.
## determine if variable is binary:
<- function(x) { all(na.omit(x) %in% 0:1) } is.binary
Binary exposure
Marginal standardization via the Bayesian g-formula is accomplished for binary exposures by sampling from the posterior predictive distribution of a fitted model under two counterfactual scenarios, the first assuming all participants were exposed and the second assuming none were exposed, while all other covariates retain their original, observed values. Define a function that takes a fitted brms
model object, samples the posterior predictive distributions under both scenarios, and calculates the contrasts of interest (ratio and difference). An additional “pathogen” is added in the process to obtain marginal estimates of the pooled exposure effect across pathogens.
### BINARY EXPOSURE posterior predictions
<- function(fit, data,
bgcomp.bin
exposure, outcome, group){# row-wise posterior predictive expectations from fitted brms model object
# fit: brms model object
# data: dataframe to which the model was fit
# exposure: string with the name of the exposure variable in "fit"
# outcome: string with the name of the outcome variable in "fit"
# group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
if(is.null(group)){
<- data
df_pred
else {
}
## add pooled outcome data (new group)
<- data %>%
df_pred mutate(!!group := "pooled",
!!outcome := NA) %>%
distinct() %>%
bind_rows(data)
}
## prediction data frame
<- bind_rows(df_pred %>%
dat_pred mutate(expose_obs = .data[[exposure]],
:= expose_obs,
{{exposure}} contrast = "p_observed"),
%>%
df_pred mutate(expose_obs = .data[[exposure]],
:= 0,
{{exposure}} contrast = "p_unexposed"),
%>%
df_pred mutate(expose_obs = .data[[exposure]],
:= 1,
{{exposure}} contrast = "p_exposed")
)
## posterior predictive expectation
<- epred_draws(object = fit,
ppe newdata = dat_pred,
allow_new_levels = TRUE,
sample_new_levels = "uncertainty") %>%
ungroup() %>%
select(-c(.row, .chain, .iteration, {{exposure}})) %>%
pivot_wider(names_from = contrast,
values_from = .epred) %>%
mutate(RD = p_exposed - p_unexposed, # marginal risk difference
RR = p_exposed / p_unexposed, # marginal risk ratio
OR = (p_exposed / (1 - p_exposed)) / # marginal odds ratio
/ (1 - p_unexposed)),
(p_unexposed support = "binary")
return(ppe)
}
Continuous exposure
For continuous exposures, “all-exposed” and “none-exposed” are not really meaningful. Instead, we want to estimate the change in the probability of the outcome for a one-unit increased in the value of the exposure variable. Because probability is non-linear and constrained between 0 and 1, the rate of change in the probability of the outcome for changing exposure values depends on the starting value of the exposure. Therefore, we approximate the instantaneous rate of change in the outcome probability at the observed exposure value by sampling posterior predictions for values a tiny bit larger and a tiny bit smaller than the observed exposure value and calculate the difference in the predicted values normalized by the difference in the exposure values.
### CONTINUOUS EXPOSURE posterior predictions
<- function(fit, data,
bgcomp.cont
exposure, outcome, group,eps = 1e-4){
# row-wise posterior predictive expectations from fitted brms model object
# fit: brms model object
# data: dataframe to which the model was fit
# exposure: string with the name of the exposure variable in "fit"
# outcome: string with the name of the outcome variable in "fit"
# group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
# eps: value to increment/decrement continuous exposure by for posterior prediction
if(is.null(group)){
<- data
df_pred
else {
}
## add pooled outcome data (new group)
<- data %>%
df_pred mutate(!!group := "pooled",
!!outcome := NA) %>%
distinct() %>%
bind_rows(data)
}
## prediction data frame
<- bind_rows(df_pred %>%
dat_pred mutate(expose_obs = .data[[exposure]],
:= expose_obs,
{{exposure}} contrast = "p_observed"),
%>%
df_pred mutate(expose_obs = .data[[exposure]],
:= expose_obs - eps/2,
{{exposure}} contrast = "p_unexposed"),
%>%
df_pred mutate(expose_obs = .data[[exposure]],
:= expose_obs + eps/2,
{{exposure}} contrast = "p_exposed")
)
## posterior predictive expectation
<- epred_draws(object = fit,
ppe newdata = dat_pred,
allow_new_levels = TRUE,
sample_new_levels = "uncertainty") %>%
ungroup() %>%
select(-c(.row, .chain, .iteration, {{exposure}})) %>%
pivot_wider(names_from = contrast,
values_from = .epred) %>%
mutate(RD = (p_exposed - p_unexposed) / eps, # marginal slope (dY/dX)--risk difference for 1 unit increase
RR = (p_observed + RD) / p_observed, # marginal risk ratio (1 unit increase)
OR = ((p_observed + RD) / # marginal odds ratio (1 unit increase)
1 - (p_observed + RD))) /
(/ (1 - p_observed)),
(p_observed support = "continuous")
return(ppe)
}
Wrapper function
Define a wrapper function to fit the model in brms
, perform marginal standardization using bgcomp.bin()
or bgcomp.cont()
as appropriate, and summarize the results. Note, the dataset must include a unique exposure id column if there are multiple exposures per subject (e.g., a child stool paired with both a household soil sample and a compound latrine soil sample). This ID column is not used directly in the model and is therefor easy to overlook, but is used indirectly when summarizing the predictive distributions to avoid indexing errors.
### Overall function to fit model and perform g-computation
<- function(data, # input data frame
bgcomp # regression model formula (brms style)
formula, summary = TRUE, # summarize predictions across observations?
# exposure variable name (string)
exposure, outcome = "infect", # outcome variable name (string)
group = "path", # grouping variable name (string)
eps = 1e-4, # epsilon size for finite difference (scalar)
prior = NULL, # brms model priors (default: brms improper flat priors)
sample_prior = "no", # set to "yes" to sample prior predictive distribution (requires specifying proper priors)
file = NULL, # provide file path & name to save model object as RDS
file_refit = "always", # if model is saved, overwrite ("always", default), reuse ("never"),
# or only overwrite if model changes ("on_change")
chains = 4, # independent MCMC chains
iter = 2000, # total MCMC iterations per chain (including warmup)
warmup = floor(iter/2),# warmup iterations to discard
cores = getOption("mc.cores", 1)){
# data should already be filtered for outcome & complete cases
# currently only accepts binary outcomes
# must include a unique exposure id column if multiple exposures per subject
### fit the regression model
## QC formula
<- as.formula(formula)
form
## Determine exposure type
<- data %>%
expose_bin select({{exposure}}) %>%
pull() %>%
is.binary()
if(expose_bin){
<- data %>%
data mutate({{exposure}} := as.integer(.data[[exposure]]))
}
## brms bayesian hierarchical model
<- brm(data = data,
fit formula = form,
family = bernoulli(), # currently only set up for binary outcomes
prior = prior,
sample_prior = sample_prior,
backend = "cmdstanr",
file = file,
file_refit = file_refit,
chains = chains,
iter = iter,
warmup = warmup,
cores = cores)
### posterior predictions
## Make posterior predictions
if(expose_bin){
# binary exposure
<- bgcomp.bin(fit = fit, data = data,
ppe exposure = exposure, outcome = outcome, group = group)
else {
} # continuous exposure
<- bgcomp.cont(fit = fit, data = data,
ppe exposure = exposure, outcome = outcome, group = group,
eps = eps)
}
### Summarise and return
if(summary){
if(is.null(group)){
<- "path"
group
<- ppe %>%
ppe mutate(path = "single")
}
<- ppe %>%
ppe select({{group}},
p_observed, p_unexposed, p_exposed,%>%
RD, RR, OR) pivot_longer(c(p_observed, p_unexposed, p_exposed,
RD, RR, OR),names_to = "param",
values_to = "pred") %>%
group_by(across({{group}}), param) %>%
summarise(avg = mean(pred),
sd = sd(pred),
iqr = IQR(pred),
mad = mad(pred),
p2.5 = quantile(pred, prob = 0.025),
p5 = quantile(pred, prob = 0.05),
p10 = quantile(pred, prob = 0.1),
p25 = quantile(pred, prob = 0.25),
p50 = quantile(pred, prob = 0.5),
p75 = quantile(pred, prob = 0.75),
p90 = quantile(pred, prob = 0.9),
p95 = quantile(pred, prob = 0.95),
p97.5 = quantile(pred, prob = 0.975)) %>%
ungroup()
}
return(ppe)
}
Analysis
Fit separate models for each combination of environmental sample matrix (water, soil) and fecal marker (cEC, EC23S, Hf183, Mnif) for three groups of outcomes: individual pathogens, pathogen classes, and reported diarrhea symptoms. The bgcomp()
function is applied across each of these combinations by nesting the long-format dataset and using purrr::map()
approaches. The envr_id
variable provides the unique exposure ID needed to avoid indexing errors.
Pathogens
Fit the models for individual pathogen outcomes. All pathogens are included in the same model, but separate models are fit for each environmental sample type and fecal marker. The slope and intercept for the exposure variable and all cosvariates are allowed to vary by pathogen.
Note: depending on platform and Stan installation, a large number of arcane warning messages may be printed to the console. These are generally safe to ignore. Do pay attention to warnings about divergent transitions—these indicate the sampler is not exploring the parameter space well and the resulting estimates may be unreliable.
source("bgcomp_funcs.R")
### Set up for all
## data
<- readRDS("../output/data_mst_gpp.rds") %>%
dat_path filter(!is.na(detect),
!is.na(infect)) %>%
select(type, marker,
stool, indv, comp,
path, infect,
envr_id, detect, quant_cnt,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
%in% c("cEC", "EC23S") ~ quant_cnt))
marker
## priors
<- c(prior(normal(-1,2), class = "Intercept"),
prior_path prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"),
prior(lkj(2), class = "cor"))
## formula
<- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
form_path 1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
(1 | stool) + (1 | comp))
(
### fit for all environmental samples and markers
<- dat_path %>%
est_path group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_path,
summary = TRUE,
exposure = "exposure",
outcome = "infect",
group = "path",
prior = prior_path,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(source = "gpp")
saveRDS(est_path, "../est/bgcomp_path.rds")
Class
Models are fit by pathogen class rather than individual pathogen, but otherwise model structure is the same.
### Set up for all
## data
<- readRDS("../output/data_mst_gpp.rds") %>%
dat_class select(type, marker,
stool, indv, comp,
bacteria, viruses, protozoa, sth,
envr_id, detect, quant_cnt,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
pivot_longer(bacteria:sth,
names_to = "path",
values_to = "infect") %>%
filter(!is.na(detect),
!is.na(infect)) %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
%in% c("cEC", "EC23S") ~ quant_cnt))
marker
## priors
<- c(prior(normal(-1,2), class = "Intercept"),
prior_class prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"),
prior(lkj(2), class = "cor"))
## formula
<- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
form_class 1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
(1 | stool) + (1 | comp))
(
### fit for all
<- dat_class %>%
est_class group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report_class", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_class,
summary = TRUE,
exposure = "exposure",
outcome = "infect",
group = "path",
prior = prior_class,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(source = "class")
saveRDS(est_class, "../est/bgcomp_class.rds")
Diarrhea
Since diarrhea is evaluated as a singlet outcome, the model formula is simplified to remove the pathogen and stool-varying terms. Otherwise, the same modeling and marginal standardization approach applies, without the pooled estimate.
### Set up for all
## data
<- readRDS("../output/data_mst_gpp.rds") %>%
dat_diarr select(type, marker,
stool, indv, comp,
diarrhea,
envr_id, detect, quant_cnt,%>%
female, age_years_scl, careprimary, wealth_scl) distinct() %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
%in% c("cEC", "EC23S") ~ quant_cnt))
marker
## priors
<- c(prior(normal(-1,2), class = "Intercept"),
prior_diarr prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"))
## formula
<- formula(diarrhea ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 | comp))
form_diarr
### fit for all
<- dat_diarr %>%
est_diarr group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report_diarr", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_diarr,
summary = TRUE,
exposure = "exposure",
outcome = "diarrhea",
group = NULL,
prior = prior_diarr,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(path = "diarrhea",
source = "diarrhea")
saveRDS(est_diarr, "../est/bgcomp_diarr.rds")
Plot
The marginal prevalence ratio and difference estimates are presented as forest plots and color-coded by class.
Inputs
Prepare elements used for plotting.
## MST summary
<- readRDS("../output/data_mst_gpp.rds") %>%
mst_stat filter(path == "shig",
!is.na(infect),
!is.na(detect)) %>%
select(path, type, envr_id, marker, detect) %>%
distinct() %>%
group_by(path, type, marker) %>%
summarise(n_envr_obs = n_distinct(envr_id),
n_envr_pos = sum(detect),
pct_pos_envr = 100 * (n_envr_pos / n_envr_obs)) %>%
ungroup() %>%
select(type, marker, pct_pos_envr)
## Stack estimates
<- bind_rows(readRDS("../est/bgcomp_path.rds"),
est_all readRDS("../est/bgcomp_diarr.rds")) %>%
ungroup() %>%
filter(!(source %in% c("class", "diarrhea") & path == "pooled")) %>%
add_case(type = "hw", marker = "Mnif",
path = "shig", param = "RR") %>%
add_case(type = "hw", marker = "Mnif",
path = "shig", param = "RD") %>%
left_join(readRDS("../output/path_labels.rds"),
by = c("path")) %>%
left_join(readRDS("../output/tab_outcome.rds") %>%
select(path, pct_pos = infect_pct),
by = "path") %>%
mutate(label = ifelse(path == "pooled", "Typical\npathogen", label),
class = ifelse(path == "pooled", "all", class) %>%
fct_recode("STH" = "sth",
"pathogen" = "all",
"symptom" = "diarrhea") %>%
fct_relevel(c("symptom", "bacteria", "viruses", "protozoa", "STH", "pathogen"))) %>%
mutate(pct_pos = ifelse(path == "pooled", 0, pct_pos),
manual = ifelse(path == "diarrhea", 100, pct_pos)) %>%
arrange(manual, type, marker, class, label) %>%
mutate(label = ifelse(path == "pooled", label, paste0(label, "\n(", round(pct_pos),"%)")),
label = fct_inorder(label))
## define color palette
<- RColorBrewer::brewer.pal(6, "Set2")
class_color_hex
<- c("symptom" = "#FFD92F",
class_color "bacteria" = "#66C2A5",
"viruses" = "#FC8D62",
"protozoa" = "#8DA0CB",
"STH" = "#E78AC3",
"pathogen" = "#A6D854")
Ratios
Plot marginal prevalence ratio estimates.
### Ratios
## Soil
<- est_all %>%
plot_est_pr_soil filter(type == "soil") %>%
mutate(type = fct_recode(type,
"domestic soil" = "soil"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(95%)" = "cEC",
"E. coli gene copies\n(98%)" = "EC23S",
"Human HF183 detection\n(39%)" = "HF183",
"Human Mnif detection\n(38%)" = "Mnif")) %>%
filter(param == "RR") %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
ggstancey = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
scale_x_log10() +
coord_cartesian(xlim = c(0.4, 1/0.4)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = "prevalence ratio",
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## water
<- est_all %>%
plot_est_pr_water filter(type == "hw") %>%
mutate(type = fct_recode(type,
"household stored water" = "hw"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(85%)" = "cEC",
"E. coli gene copies\n(90%)" = "EC23S",
"Human HF183 detection\n(15%)" = "HF183",
"Human Mnif detection\n(<1%)" = "Mnif")) %>%
filter(param == "RR") %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
ggstancey = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
scale_x_log10() +
coord_cartesian(xlim = c(0.4, 1/0.4)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = NULL,
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## Combine
<- plot_est_pr_water + plot_est_pr_soil +
plot_est_pr plot_layout(nrow = 2,
guides = "collect")
ggsave(filename = "../fig/mst_gpp_est_pr.png",
plot = plot_est_pr,
device = agg_png, dpi = 300,
width = 8.5, height = 11, units = "in")
Differences
Plot marginal prevalence difference estimates.
### Differences
## Soil
<- est_all %>%
plot_est_pd_soil filter(type == "soil") %>%
mutate(type = fct_recode(type,
"domestic soil" = "soil"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(95%)" = "cEC",
"E. coli gene copies\n(98%)" = "EC23S",
"Human HF183 detection\n(39%)" = "HF183",
"Human Mnif detection\n(38%)" = "Mnif")) %>%
filter(param == "RD") %>%
mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 0), linetype = "dashed") +
::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
ggstancey = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
coord_cartesian(xlim = c(-15, 15)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = "prevalence difference (percentage points)",
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## water
<- est_all %>%
plot_est_pd_water filter(type == "hw") %>%
mutate(type = fct_recode(type,
"household stored water" = "hw"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(85%)" = "cEC",
"E. coli gene copies\n(90%)" = "EC23S",
"Human HF183 detection\n(15%)" = "HF183",
"Human Mnif detection\n(<1%)" = "Mnif")) %>%
filter(param == "RD") %>%
mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 0), linetype = "dashed") +
::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
ggstancey = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
coord_cartesian(xlim = c(-15, 15)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = NULL,
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## Combine
<- plot_est_pd_water + plot_est_pd_soil +
plot_est_pd plot_layout(nrow = 2,
guides = "collect")
ggsave(filename = "../fig/mst_gpp_est_pd.png",
plot = plot_est_pd,
device = agg_png, dpi = 300,
width = 8.5, height = 11, units = "in")