## Options
options(tidyverse.quiet = TRUE) # quietly
options(dplyr.summarise.inform = FALSE) # quietly
options(mc.cores = min(parallel::detectCores(), 8)) # cpu cores for MCMC
## Packages
# Utilities
library(readxl) # I/O
library(lubridate) # date formatting
library(broman) # number formatting
# Models
library(brms) # fit Bayesian regression models in stan
library(cmdstanr) # R interface to CmdStan
library(tidybayes) # extract posterior samples
# Graphs
library(patchwork) # combine plots
library(ragg) # more consistent graphics output
# Tidyverse
library(knitr) # rendering
library(quarto) # rendering
library(tidyverse) # load last to avoid conflicts (particularly with data.table)
## Load custom functions
source("bgcomp_funcs.R")Estimating Household Fecal Contamination Associations with Enteric Pathogens in Child Stool
Version 1.1
Description
This report provides R code to conduct the analysis of household fecal marker associations with enteric pathogens and child diarrhea described in:
Holcomb DA, Knee J, Adriano Z, Capone D, Cumming O, Kowalsky E, Nalá R, Viegas E, Stewart JR, Brown J. (2025). Associations between fecal contamination of the household environment and enteric pathogen detection in children living in Maputo, Mozambique. Environment & Health. https://doi.org/10.1021/envhealth.4c00283
A companion R script, bgcomp_funcs.R, abstracts the functions developed in this analysis to simultaneously estimate individual and pooled associations of a single predictor with multiple binary outcomes, using marginal standardization to obtain prevalence ratio and prevalence difference estimates. It can be sourced into an R session to make the functions available in the global environment for use. The accompanying dataset, data_mst_gpp.xlsx, is provided to enable replicating this analysis and includes a variable codebook. All files are available in the manuscript repository.
Setup
Load packages and set options.
Data
Read
The data have been prepared in “long” format, meaning that each combination of pathogen, environmental sample type, and fecal marker assessed is presented on its own row. The data file has two sheets: “data” containing the data and “codebook” with a description of each variable. Read in the dataset, extract summary information such as pathogen labels and detections, and save in RDS format for downstream use.
## read in data
df_raw <- read_xlsx("../data/data_mst_gpp.xlsx", sheet = "data")
## extract pathogen labels
path_labels <- df_raw %>%
select(path, class,
label,
name) %>%
distinct() %>%
bind_rows(tibble(path = c("diarrhea",
"bacteria", "viruses", "protozoa", "sth",
"pooled"),
class = c("diarrhea",
"bacteria", "viruses", "protozoa", "sth",
"all"),
label = c("Diarrhea",
"Any bacteria", "Any viruses", "Any protozoa", "Any STH",
"Typical pathogen")))
saveRDS(path_labels, "../output/path_labels.rds")
## extract pathogen detection counts
path_detects <- df_raw %>%
select(path, class,
n_obs, n_pos, pct_pos) %>%
distinct()
saveRDS(path_detects, "../output/path_detects.rds")
## refine data and save
df_cln <- df_raw %>%
filter(include_n5 == "yes",
!(type == "hw" & marker == "Mnif")) %>%
select(comp, hh, child, time,
indv, stool,
female,
age_years_scl,
careprimary,
wealth_scl,
diarrhea,
bacteria, viruses, protozoa, sth,
class, path, infect,
type, marker, envr_id,
detect, quant, quant_cnt)
saveRDS(df_cln, "../output/data_mst_gpp.rds")Summarize
Additional data summaries for downstream use.
## diarrhea data
obs_diarr <- readRDS("../output/data_mst_gpp.rds") %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
diarrhea,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
mutate(infect = diarrhea,
path = "diarrhea",
source = "survey")
## class data
obs_class <- readRDS("../output/data_mst_gpp.rds") %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
bacteria, viruses, protozoa, sth,
diarrhea,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
pivot_longer(c(bacteria, viruses, protozoa, sth),
names_to = "path",
values_to = "infect") %>%
mutate(source = ifelse(path == "sth", "kk", "gpp"))
## path data
obs_path <- readRDS("../output/data_mst_gpp.rds") %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct()
## microscopy data
obs_kk <- readRDS("../output/data_mst_gpp.rds") %>%
filter(class %in% c("sth")) %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
mutate(source = "kk")
## gpp data
obs_gpp <- readRDS("../output/data_mst_gpp.rds") %>%
filter(class %in% c("bacteria", "viruses", "protozoa")) %>%
select(indv, time, stool, type,
envr_id, marker, detect, quant,
path, infect, diarrhea,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
mutate(source = "gpp")
## stacked
obs_source <- bind_rows(obs_gpp,
obs_kk,
obs_class,
obs_diarr) %>%
arrange(source, stool, path)
## Summarise outcomes
tab_outcome <- obs_source %>%
filter(!is.na(infect)) %>%
select(indv, time, stool,
path, infect,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
drop_na() %>%
left_join(readRDS("../output/path_labels.rds"), by = "path") %>%
group_by(path, label, class) %>%
summarise(n_obs = n(),
infect_n = sum(infect),
infect_pct = round((infect_n / n_obs) * 100, 0)) %>%
ungroup() %>%
mutate(class = fct_relevel(class, c("bacteria", "viruses", "protozoa", "sth", "diarrhea")),
path = fct_relevel(path,
c("bacteria", "cdif", "campy", "etec", "stec", "shig",
"viruses", "noro",
"protozoa", "crypto", "giardia",
"sth", "asc", "trich",
"diarrhea"))) %>%
arrange(class, path)
saveRDS(tab_outcome, "../output/tab_outcome.rds")Model Functions
We fit Bayesian hierarchical models with random slopes for the exposure variable (i.e., fecal marker detection) and all other covariates by pathogen to estimate the individual association of the exposure with each pathogen as well as a pooled association across all the pathogens. We then generate posterior predictions for all-exposed and none-exposed scenarios to obtain marginal estimates of the prevalence ratio and prevalence difference for each pathogen and pooled across pathogens, an approach known as marginal standardization or, alternatively, the Bayesian parametric g-formula (“bgcomp”). The models are specified and sampled using the brms package with cmdstanr backend and posterior predictions are generated with the tidybayes package. Custom functions are defined to wrap the functionality from these other packages and implement the bgcomp procedure.
Utility function
The bgcomp procedure is implemented somewhat differently for binary and continuous exposures, so define a utility function to determine if a given variable is binary or not.
## determine if variable is binary:
is.binary <- function(x) { all(na.omit(x) %in% 0:1) }Binary exposure
Marginal standardization via the Bayesian g-formula is accomplished for binary exposures by sampling from the posterior predictive distribution of a fitted model under two counterfactual scenarios, the first assuming all participants were exposed and the second assuming none were exposed, while all other covariates retain their original, observed values. Define a function that takes a fitted brms model object, samples the posterior predictive distributions under both scenarios, and calculates the contrasts of interest (ratio and difference). An additional “pathogen” is added in the process to obtain marginal estimates of the pooled exposure effect across pathogens.
### BINARY EXPOSURE posterior predictions
bgcomp.bin <- function(fit, data,
exposure, outcome, group){
# row-wise posterior predictive expectations from fitted brms model object
# fit: brms model object
# data: dataframe to which the model was fit
# exposure: string with the name of the exposure variable in "fit"
# outcome: string with the name of the outcome variable in "fit"
# group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
if(is.null(group)){
df_pred <- data
} else {
## add pooled outcome data (new group)
df_pred <- data %>%
mutate(!!group := "pooled",
!!outcome := NA) %>%
distinct() %>%
bind_rows(data)
}
## prediction data frame
dat_pred <- bind_rows(df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := expose_obs,
contrast = "p_observed"),
df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := 0,
contrast = "p_unexposed"),
df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := 1,
contrast = "p_exposed")
)
## posterior predictive expectation
ppe <- epred_draws(object = fit,
newdata = dat_pred,
allow_new_levels = TRUE,
sample_new_levels = "uncertainty") %>%
ungroup() %>%
select(-c(.row, .chain, .iteration, {{exposure}})) %>%
pivot_wider(names_from = contrast,
values_from = .epred) %>%
mutate(RD = p_exposed - p_unexposed, # marginal risk difference
RR = p_exposed / p_unexposed, # marginal risk ratio
OR = (p_exposed / (1 - p_exposed)) / # marginal odds ratio
(p_unexposed / (1 - p_unexposed)),
support = "binary")
return(ppe)
}Continuous exposure
For continuous exposures, “all-exposed” and “none-exposed” are not really meaningful. Instead, we want to estimate the change in the probability of the outcome for a one-unit increased in the value of the exposure variable. Because probability is non-linear and constrained between 0 and 1, the rate of change in the probability of the outcome for changing exposure values depends on the starting value of the exposure. Therefore, we approximate the instantaneous rate of change in the outcome probability at the observed exposure value by sampling posterior predictions for values a tiny bit larger and a tiny bit smaller than the observed exposure value and calculate the difference in the predicted values normalized by the difference in the exposure values.
### CONTINUOUS EXPOSURE posterior predictions
bgcomp.cont <- function(fit, data,
exposure, outcome, group,
eps = 1e-4){
# row-wise posterior predictive expectations from fitted brms model object
# fit: brms model object
# data: dataframe to which the model was fit
# exposure: string with the name of the exposure variable in "fit"
# outcome: string with the name of the outcome variable in "fit"
# group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
# eps: value to increment/decrement continuous exposure by for posterior prediction
if(is.null(group)){
df_pred <- data
} else {
## add pooled outcome data (new group)
df_pred <- data %>%
mutate(!!group := "pooled",
!!outcome := NA) %>%
distinct() %>%
bind_rows(data)
}
## prediction data frame
dat_pred <- bind_rows(df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := expose_obs,
contrast = "p_observed"),
df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := expose_obs - eps/2,
contrast = "p_unexposed"),
df_pred %>%
mutate(expose_obs = .data[[exposure]],
{{exposure}} := expose_obs + eps/2,
contrast = "p_exposed")
)
## posterior predictive expectation
ppe <- epred_draws(object = fit,
newdata = dat_pred,
allow_new_levels = TRUE,
sample_new_levels = "uncertainty") %>%
ungroup() %>%
select(-c(.row, .chain, .iteration, {{exposure}})) %>%
pivot_wider(names_from = contrast,
values_from = .epred) %>%
mutate(RD = (p_exposed - p_unexposed) / eps, # marginal slope (dY/dX)--risk difference for 1 unit increase
RR = (p_observed + RD) / p_observed, # marginal risk ratio (1 unit increase)
OR = ((p_observed + RD) / # marginal odds ratio (1 unit increase)
(1 - (p_observed + RD))) /
(p_observed / (1 - p_observed)),
support = "continuous")
return(ppe)
}Wrapper function
Define a wrapper function to fit the model in brms, perform marginal standardization using bgcomp.bin() or bgcomp.cont() as appropriate, and summarize the results. Note, the dataset must include a unique exposure id column if there are multiple exposures per subject (e.g., a child stool paired with both a household soil sample and a compound latrine soil sample). This ID column is not used directly in the model and is therefor easy to overlook, but is used indirectly when summarizing the predictive distributions to avoid indexing errors.
### Overall function to fit model and perform g-computation
bgcomp <- function(data, # input data frame
formula, # regression model formula (brms style)
summary = TRUE, # summarize predictions across observations?
exposure, # exposure variable name (string)
outcome = "infect", # outcome variable name (string)
group = "path", # grouping variable name (string)
eps = 1e-4, # epsilon size for finite difference (scalar)
prior = NULL, # brms model priors (default: brms improper flat priors)
sample_prior = "no", # set to "yes" to sample prior predictive distribution (requires specifying proper priors)
file = NULL, # provide file path & name to save model object as RDS
file_refit = "always", # if model is saved, overwrite ("always", default), reuse ("never"),
# or only overwrite if model changes ("on_change")
chains = 4, # independent MCMC chains
iter = 2000, # total MCMC iterations per chain (including warmup)
warmup = floor(iter/2),# warmup iterations to discard
cores = getOption("mc.cores", 1)){
# data should already be filtered for outcome & complete cases
# currently only accepts binary outcomes
# must include a unique exposure id column if multiple exposures per subject
### fit the regression model
## QC formula
form <- as.formula(formula)
## Determine exposure type
expose_bin <- data %>%
select({{exposure}}) %>%
pull() %>%
is.binary()
if(expose_bin){
data <- data %>%
mutate({{exposure}} := as.integer(.data[[exposure]]))
}
## brms bayesian hierarchical model
fit <- brm(data = data,
formula = form,
family = bernoulli(), # currently only set up for binary outcomes
prior = prior,
sample_prior = sample_prior,
backend = "cmdstanr",
file = file,
file_refit = file_refit,
chains = chains,
iter = iter,
warmup = warmup,
cores = cores)
### posterior predictions
## Make posterior predictions
if(expose_bin){
# binary exposure
ppe <- bgcomp.bin(fit = fit, data = data,
exposure = exposure, outcome = outcome, group = group)
} else {
# continuous exposure
ppe <- bgcomp.cont(fit = fit, data = data,
exposure = exposure, outcome = outcome, group = group,
eps = eps)
}
### Summarise and return
if(summary){
if(is.null(group)){
group <- "path"
ppe <- ppe %>%
mutate(path = "single")
}
ppe <- ppe %>%
select({{group}},
p_observed, p_unexposed, p_exposed,
RD, RR, OR) %>%
pivot_longer(c(p_observed, p_unexposed, p_exposed,
RD, RR, OR),
names_to = "param",
values_to = "pred") %>%
group_by(across({{group}}), param) %>%
summarise(avg = mean(pred),
sd = sd(pred),
iqr = IQR(pred),
mad = mad(pred),
p2.5 = quantile(pred, prob = 0.025),
p5 = quantile(pred, prob = 0.05),
p10 = quantile(pred, prob = 0.1),
p25 = quantile(pred, prob = 0.25),
p50 = quantile(pred, prob = 0.5),
p75 = quantile(pred, prob = 0.75),
p90 = quantile(pred, prob = 0.9),
p95 = quantile(pred, prob = 0.95),
p97.5 = quantile(pred, prob = 0.975)) %>%
ungroup()
}
return(ppe)
}Analysis
Fit separate models for each combination of environmental sample matrix (water, soil) and fecal marker (cEC, EC23S, Hf183, Mnif) for three groups of outcomes: individual pathogens, pathogen classes, and reported diarrhea symptoms. The bgcomp() function is applied across each of these combinations by nesting the long-format dataset and using purrr::map() approaches. The envr_id variable provides the unique exposure ID needed to avoid indexing errors.
Pathogens
Fit the models for individual pathogen outcomes. All pathogens are included in the same model, but separate models are fit for each environmental sample type and fecal marker. The slope and intercept for the exposure variable and all cosvariates are allowed to vary by pathogen.
Note: depending on platform and Stan installation, a large number of arcane warning messages may be printed to the console. These are generally safe to ignore. Do pay attention to warnings about divergent transitions—these indicate the sampler is not exploring the parameter space well and the resulting estimates may be unreliable.
source("bgcomp_funcs.R")
### Set up for all
## data
dat_path <- readRDS("../output/data_mst_gpp.rds") %>%
filter(!is.na(detect),
!is.na(infect)) %>%
select(type, marker,
stool, indv, comp,
path, infect,
envr_id, detect, quant_cnt,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
marker %in% c("cEC", "EC23S") ~ quant_cnt))
## priors
prior_path <- c(prior(normal(-1,2), class = "Intercept"),
prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"),
prior(lkj(2), class = "cor"))
## formula
form_path <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
(1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
(1 | stool) + (1 | comp))
### fit for all environmental samples and markers
est_path <- dat_path %>%
group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_path,
summary = TRUE,
exposure = "exposure",
outcome = "infect",
group = "path",
prior = prior_path,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(source = "gpp")
saveRDS(est_path, "../est/bgcomp_path.rds")Class
Models are fit by pathogen class rather than individual pathogen, but otherwise model structure is the same.
### Set up for all
## data
dat_class <- readRDS("../output/data_mst_gpp.rds") %>%
select(type, marker,
stool, indv, comp,
bacteria, viruses, protozoa, sth,
envr_id, detect, quant_cnt,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
pivot_longer(bacteria:sth,
names_to = "path",
values_to = "infect") %>%
filter(!is.na(detect),
!is.na(infect)) %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
marker %in% c("cEC", "EC23S") ~ quant_cnt))
## priors
prior_class <- c(prior(normal(-1,2), class = "Intercept"),
prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"),
prior(lkj(2), class = "cor"))
## formula
form_class <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
(1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
(1 | stool) + (1 | comp))
### fit for all
est_class <- dat_class %>%
group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report_class", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_class,
summary = TRUE,
exposure = "exposure",
outcome = "infect",
group = "path",
prior = prior_class,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(source = "class")
saveRDS(est_class, "../est/bgcomp_class.rds")Diarrhea
Since diarrhea is evaluated as a singlet outcome, the model formula is simplified to remove the pathogen and stool-varying terms. Otherwise, the same modeling and marginal standardization approach applies, without the pooled estimate.
### Set up for all
## data
dat_diarr <- readRDS("../output/data_mst_gpp.rds") %>%
select(type, marker,
stool, indv, comp,
diarrhea,
envr_id, detect, quant_cnt,
female, age_years_scl, careprimary, wealth_scl) %>%
distinct() %>%
drop_na() %>%
mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
marker %in% c("cEC", "EC23S") ~ quant_cnt))
## priors
prior_diarr <- c(prior(normal(-1,2), class = "Intercept"),
prior(normal(0,1), class = "b"),
prior(normal(0,1), class = "sd"))
## formula
form_diarr <- formula(diarrhea ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 | comp))
### fit for all
est_diarr <- dat_diarr %>%
group_by(type, marker) %>%
nest() %>%
mutate(name = str_c("../model/report_diarr", type, "_", marker),
est = map2(.x = data, .y = name,
~bgcomp(data = .x,
formula = form_diarr,
summary = TRUE,
exposure = "exposure",
outcome = "diarrhea",
group = NULL,
prior = prior_diarr,
file = .y))) %>%
ungroup() %>%
select(-c(name, data)) %>%
unnest(est) %>%
mutate(path = "diarrhea",
source = "diarrhea")
saveRDS(est_diarr, "../est/bgcomp_diarr.rds")Plot
The marginal prevalence ratio and difference estimates are presented as forest plots and color-coded by class.
Inputs
Prepare elements used for plotting.
## MST summary
mst_stat <- readRDS("../output/data_mst_gpp.rds") %>%
filter(path == "shig",
!is.na(infect),
!is.na(detect)) %>%
select(path, type, envr_id, marker, detect) %>%
distinct() %>%
group_by(path, type, marker) %>%
summarise(n_envr_obs = n_distinct(envr_id),
n_envr_pos = sum(detect),
pct_pos_envr = 100 * (n_envr_pos / n_envr_obs)) %>%
ungroup() %>%
select(type, marker, pct_pos_envr)
## Stack estimates
est_all <- bind_rows(readRDS("../est/bgcomp_path.rds"),
readRDS("../est/bgcomp_diarr.rds")) %>%
ungroup() %>%
filter(!(source %in% c("class", "diarrhea") & path == "pooled")) %>%
add_case(type = "hw", marker = "Mnif",
path = "shig", param = "RR") %>%
add_case(type = "hw", marker = "Mnif",
path = "shig", param = "RD") %>%
left_join(readRDS("../output/path_labels.rds"),
by = c("path")) %>%
left_join(readRDS("../output/tab_outcome.rds") %>%
select(path, pct_pos = infect_pct),
by = "path") %>%
mutate(label = ifelse(path == "pooled", "Typical\npathogen", label),
class = ifelse(path == "pooled", "all", class) %>%
fct_recode("STH" = "sth",
"pathogen" = "all",
"symptom" = "diarrhea") %>%
fct_relevel(c("symptom", "bacteria", "viruses", "protozoa", "STH", "pathogen"))) %>%
mutate(pct_pos = ifelse(path == "pooled", 0, pct_pos),
manual = ifelse(path == "diarrhea", 100, pct_pos)) %>%
arrange(manual, type, marker, class, label) %>%
mutate(label = ifelse(path == "pooled", label, paste0(label, "\n(", round(pct_pos),"%)")),
label = fct_inorder(label))
## define color palette
class_color_hex <- RColorBrewer::brewer.pal(6, "Set2")
class_color <- c("symptom" = "#FFD92F",
"bacteria" = "#66C2A5",
"viruses" = "#FC8D62",
"protozoa" = "#8DA0CB",
"STH" = "#E78AC3",
"pathogen" = "#A6D854")Ratios
Plot marginal prevalence ratio estimates.
### Ratios
## Soil
plot_est_pr_soil <- est_all %>%
filter(type == "soil") %>%
mutate(type = fct_recode(type,
"domestic soil" = "soil"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(95%)" = "cEC",
"E. coli gene copies\n(98%)" = "EC23S",
"Human HF183 detection\n(39%)" = "HF183",
"Human Mnif detection\n(38%)" = "Mnif")) %>%
filter(param == "RR") %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
y = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
scale_x_log10() +
coord_cartesian(xlim = c(0.4, 1/0.4)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = "prevalence ratio",
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## water
plot_est_pr_water <- est_all %>%
filter(type == "hw") %>%
mutate(type = fct_recode(type,
"household stored water" = "hw"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(85%)" = "cEC",
"E. coli gene copies\n(90%)" = "EC23S",
"Human HF183 detection\n(15%)" = "HF183",
"Human Mnif detection\n(<1%)" = "Mnif")) %>%
filter(param == "RR") %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
y = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
scale_x_log10() +
coord_cartesian(xlim = c(0.4, 1/0.4)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = NULL,
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## Combine
plot_est_pr <- plot_est_pr_water + plot_est_pr_soil +
plot_layout(nrow = 2,
guides = "collect")
ggsave(filename = "../fig/mst_gpp_est_pr.png",
plot = plot_est_pr,
device = agg_png, dpi = 300,
width = 8.5, height = 11, units = "in")Differences
Plot marginal prevalence difference estimates.
### Differences
## Soil
plot_est_pd_soil <- est_all %>%
filter(type == "soil") %>%
mutate(type = fct_recode(type,
"domestic soil" = "soil"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(95%)" = "cEC",
"E. coli gene copies\n(98%)" = "EC23S",
"Human HF183 detection\n(39%)" = "HF183",
"Human Mnif detection\n(38%)" = "Mnif")) %>%
filter(param == "RD") %>%
mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 0), linetype = "dashed") +
ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
y = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
coord_cartesian(xlim = c(-15, 15)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = "prevalence difference (percentage points)",
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## water
plot_est_pd_water <- est_all %>%
filter(type == "hw") %>%
mutate(type = fct_recode(type,
"household stored water" = "hw"),
marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
fct_recode("E. coli colony counts\n(85%)" = "cEC",
"E. coli gene copies\n(90%)" = "EC23S",
"Human HF183 detection\n(15%)" = "HF183",
"Human Mnif detection\n(<1%)" = "Mnif")) %>%
filter(param == "RD") %>%
mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
ggplot(data = .) +
geom_vline(aes(xintercept = 0), linetype = "dashed") +
ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
y = label,
color = class),
position = position_dodge2(width = 0.75,
preserve = "single"),
size = 1, fatten = 2.5) +
facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
coord_cartesian(xlim = c(-15, 15)) +
scale_color_manual(values = class_color,
name = "outcome\nclass") +
labs(x = NULL,
y = NULL) +
theme(legend.key.size = unit(1.5, "lines"),
legend.key = element_rect(fill = "white", colour = NA),
panel.border = element_rect(colour = "black", fill=NA),
panel.spacing = unit(0.25, "lines"),
plot.title = element_text(face="bold"),
plot.margin = margin(c(2, 1, 1, 1)),
legend.margin = margin(c(5, .5, 5, .5)))
## Combine
plot_est_pd <- plot_est_pd_water + plot_est_pd_soil +
plot_layout(nrow = 2,
guides = "collect")
ggsave(filename = "../fig/mst_gpp_est_pd.png",
plot = plot_est_pd,
device = agg_png, dpi = 300,
width = 8.5, height = 11, units = "in")