Estimating Household Fecal Contamination Associations with Enteric Pathogens in Child Stool

Version 1.1

Author

David A Holcomb

Published

April 15, 2025

Description

This report provides R code to conduct the analysis of household fecal marker associations with enteric pathogens and child diarrhea described in:

Holcomb DA, Knee J, Adriano Z, Capone D, Cumming O, Kowalsky E, Nalá R, Viegas E, Stewart JR, Brown J. (2025). Associations between fecal contamination of the household environment and enteric pathogen detection in children living in Maputo, Mozambique. Environment & Health. https://doi.org/10.1021/envhealth.4c00283

A companion R script, bgcomp_funcs.R, abstracts the functions developed in this analysis to simultaneously estimate individual and pooled associations of a single predictor with multiple binary outcomes, using marginal standardization to obtain prevalence ratio and prevalence difference estimates. It can be sourced into an R session to make the functions available in the global environment for use. The accompanying dataset, data_mst_gpp.xlsx, is provided to enable replicating this analysis and includes a variable codebook. All files are available in the manuscript repository.

Setup

Load packages and set options.

## Options
options(tidyverse.quiet = TRUE)                       # quietly
options(dplyr.summarise.inform = FALSE)               # quietly
options(mc.cores = min(parallel::detectCores(), 8))   # cpu cores for MCMC


## Packages
# Utilities
library(readxl)       # I/O
library(lubridate)    # date formatting
library(broman)       # number formatting

# Models
library(brms)         # fit Bayesian regression models in stan
library(cmdstanr)     # R interface to CmdStan
library(tidybayes)    # extract posterior samples

# Graphs
library(patchwork)    # combine plots
library(ragg)         # more consistent graphics output

# Tidyverse
library(knitr)        # rendering
library(quarto)       # rendering
library(tidyverse)    # load last to avoid conflicts (particularly with data.table)


## Load custom functions
source("bgcomp_funcs.R")

Data

Read

The data have been prepared in “long” format, meaning that each combination of pathogen, environmental sample type, and fecal marker assessed is presented on its own row. The data file has two sheets: “data” containing the data and “codebook” with a description of each variable. Read in the dataset, extract summary information such as pathogen labels and detections, and save in RDS format for downstream use.

## read in data
df_raw <- read_xlsx("../data/data_mst_gpp.xlsx", sheet = "data")

## extract pathogen labels
path_labels <- df_raw %>%
  select(path, class,
         label,
         name) %>%
  distinct() %>%
  bind_rows(tibble(path = c("diarrhea",
                            "bacteria", "viruses", "protozoa", "sth", 
                            "pooled"),
                   class = c("diarrhea",
                             "bacteria", "viruses", "protozoa", "sth", 
                             "all"),
                   label = c("Diarrhea",
                             "Any bacteria", "Any viruses", "Any protozoa", "Any STH", 
                             "Typical pathogen")))
saveRDS(path_labels, "../output/path_labels.rds")

## extract pathogen detection counts
path_detects <- df_raw %>%
  select(path, class,
         n_obs, n_pos, pct_pos) %>%
  distinct()
saveRDS(path_detects, "../output/path_detects.rds")


## refine data and save
df_cln <- df_raw %>%
  filter(include_n5 == "yes",
         !(type == "hw" & marker == "Mnif")) %>%
  select(comp, hh, child, time,
         indv, stool,
         female,
         age_years_scl,
         careprimary,
         wealth_scl,
         diarrhea,
         bacteria, viruses, protozoa, sth,
         class, path, infect,
         type, marker, envr_id,
         detect, quant, quant_cnt)
saveRDS(df_cln, "../output/data_mst_gpp.rds")

Summarize

Additional data summaries for downstream use.

## diarrhea data
obs_diarr <- readRDS("../output/data_mst_gpp.rds") %>%
  select(indv, time, stool, type, 
         envr_id, marker, detect, quant,
         diarrhea,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  mutate(infect = diarrhea,
         path = "diarrhea",
         source = "survey")

## class data
obs_class <- readRDS("../output/data_mst_gpp.rds") %>%
  select(indv, time, stool, type, 
         envr_id, marker, detect, quant,
         bacteria, viruses, protozoa, sth,
         diarrhea,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  pivot_longer(c(bacteria, viruses, protozoa, sth),
               names_to = "path",
               values_to = "infect") %>%
  mutate(source = ifelse(path == "sth", "kk", "gpp"))

## path data
obs_path <- readRDS("../output/data_mst_gpp.rds") %>%
  select(indv, time, stool, type, 
         envr_id, marker, detect, quant,
         path, infect, diarrhea,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct()


## microscopy data
obs_kk <- readRDS("../output/data_mst_gpp.rds") %>%
  filter(class %in% c("sth")) %>%
  select(indv, time, stool, type, 
         envr_id, marker, detect, quant,
         path, infect, diarrhea,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  mutate(source = "kk")

## gpp data
obs_gpp <- readRDS("../output/data_mst_gpp.rds") %>%
  filter(class %in% c("bacteria", "viruses", "protozoa")) %>%
  select(indv, time, stool, type, 
         envr_id, marker, detect, quant,
         path, infect, diarrhea,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  mutate(source = "gpp")


## stacked
obs_source <- bind_rows(obs_gpp,
                        obs_kk,
                        obs_class,
                        obs_diarr) %>%
  arrange(source, stool, path)


## Summarise outcomes
tab_outcome <- obs_source %>%
  filter(!is.na(infect)) %>%
  select(indv, time, stool,
         path, infect,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  drop_na() %>%
  left_join(readRDS("../output/path_labels.rds"), by = "path") %>%
  group_by(path, label, class) %>%
  summarise(n_obs = n(),
            infect_n = sum(infect),
            infect_pct = round((infect_n / n_obs) * 100, 0)) %>%
  ungroup() %>%
  mutate(class = fct_relevel(class, c("bacteria", "viruses", "protozoa", "sth", "diarrhea")),
         path = fct_relevel(path,
                            c("bacteria", "cdif", "campy", "etec", "stec", "shig",
                              "viruses", "noro", 
                              "protozoa", "crypto", "giardia", 
                              "sth", "asc", "trich",
                              "diarrhea"))) %>%
  arrange(class, path)

saveRDS(tab_outcome, "../output/tab_outcome.rds")

Model Functions

We fit Bayesian hierarchical models with random slopes for the exposure variable (i.e., fecal marker detection) and all other covariates by pathogen to estimate the individual association of the exposure with each pathogen as well as a pooled association across all the pathogens. We then generate posterior predictions for all-exposed and none-exposed scenarios to obtain marginal estimates of the prevalence ratio and prevalence difference for each pathogen and pooled across pathogens, an approach known as marginal standardization or, alternatively, the Bayesian parametric g-formula (“bgcomp”). The models are specified and sampled using the brms package with cmdstanr backend and posterior predictions are generated with the tidybayes package. Custom functions are defined to wrap the functionality from these other packages and implement the bgcomp procedure.

Utility function

The bgcomp procedure is implemented somewhat differently for binary and continuous exposures, so define a utility function to determine if a given variable is binary or not.

## determine if variable is binary:
is.binary <- function(x) { all(na.omit(x) %in% 0:1) }

Binary exposure

Marginal standardization via the Bayesian g-formula is accomplished for binary exposures by sampling from the posterior predictive distribution of a fitted model under two counterfactual scenarios, the first assuming all participants were exposed and the second assuming none were exposed, while all other covariates retain their original, observed values. Define a function that takes a fitted brms model object, samples the posterior predictive distributions under both scenarios, and calculates the contrasts of interest (ratio and difference). An additional “pathogen” is added in the process to obtain marginal estimates of the pooled exposure effect across pathogens.

### BINARY EXPOSURE posterior predictions
bgcomp.bin <- function(fit, data,
                       exposure, outcome, group){
  # row-wise posterior predictive expectations from fitted brms model object
  # fit:      brms model object
  # data:     dataframe to which the model was fit
  # exposure: string with the name of the exposure variable in "fit"
  # outcome:  string with the name of the outcome variable in "fit"
  # group:    string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
  
  if(is.null(group)){
    
    df_pred <- data
    
  } else {
    
    ## add pooled outcome data (new group)
    df_pred <- data %>%
      mutate(!!group := "pooled",
             !!outcome := NA) %>%
      distinct() %>%
      bind_rows(data)
    
  }
  
  
  ## prediction data frame
  dat_pred <- bind_rows(df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := expose_obs,
                                 contrast = "p_observed"),
                        df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := 0,
                                 contrast = "p_unexposed"),
                        df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := 1,
                                 contrast = "p_exposed")
  )
  
  
  ## posterior predictive expectation
  ppe <- epred_draws(object = fit,
                     newdata = dat_pred,
                     allow_new_levels = TRUE,
                     sample_new_levels = "uncertainty") %>%
    ungroup() %>%
    select(-c(.row, .chain, .iteration, {{exposure}})) %>%
    pivot_wider(names_from = contrast,
                values_from = .epred) %>%
    mutate(RD = p_exposed - p_unexposed,          # marginal risk difference
           RR = p_exposed / p_unexposed,          # marginal risk ratio
           OR = (p_exposed / (1 - p_exposed)) /   # marginal odds ratio
             (p_unexposed / (1 - p_unexposed)),
           support = "binary")
  
  
  return(ppe)
  
}

Continuous exposure

For continuous exposures, “all-exposed” and “none-exposed” are not really meaningful. Instead, we want to estimate the change in the probability of the outcome for a one-unit increased in the value of the exposure variable. Because probability is non-linear and constrained between 0 and 1, the rate of change in the probability of the outcome for changing exposure values depends on the starting value of the exposure. Therefore, we approximate the instantaneous rate of change in the outcome probability at the observed exposure value by sampling posterior predictions for values a tiny bit larger and a tiny bit smaller than the observed exposure value and calculate the difference in the predicted values normalized by the difference in the exposure values.

### CONTINUOUS EXPOSURE posterior predictions
bgcomp.cont <- function(fit, data,
                        exposure, outcome, group,
                        eps = 1e-4){
  
  # row-wise posterior predictive expectations from fitted brms model object
  # fit:      brms model object
  # data:     dataframe to which the model was fit
  # exposure: string with the name of the exposure variable in "fit"
  # outcome:  string with the name of the outcome variable in "fit"
  # group:    string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names)
  # eps:      value to increment/decrement continuous exposure by for posterior prediction
  
  if(is.null(group)){
    
    df_pred <- data
    
  } else {
    
    ## add pooled outcome data (new group)
    df_pred <- data %>%
      mutate(!!group := "pooled",
             !!outcome := NA) %>%
      distinct() %>%
      bind_rows(data)
    
  }
  
  ## prediction data frame
  dat_pred <- bind_rows(df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := expose_obs,
                                 contrast = "p_observed"),
                        df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := expose_obs - eps/2,
                                 contrast = "p_unexposed"),
                        df_pred %>%
                          mutate(expose_obs = .data[[exposure]],
                                 {{exposure}} := expose_obs + eps/2,
                                 contrast = "p_exposed")
  )
  
  
  ## posterior predictive expectation
  ppe <- epred_draws(object = fit,
                     newdata = dat_pred,
                     allow_new_levels = TRUE,
                     sample_new_levels = "uncertainty") %>%
    ungroup() %>%
    select(-c(.row, .chain, .iteration, {{exposure}})) %>%
    pivot_wider(names_from = contrast,
                values_from = .epred) %>%
    mutate(RD = (p_exposed - p_unexposed) / eps,  # marginal slope (dY/dX)--risk difference for 1 unit increase
           RR = (p_observed + RD) / p_observed,   # marginal risk ratio (1 unit increase)
           OR = ((p_observed + RD) /              # marginal odds ratio (1 unit increase)
                   (1 - (p_observed + RD))) / 
             (p_observed / (1 - p_observed)),
           support = "continuous")
  
  
  return(ppe)
  
}

Wrapper function

Define a wrapper function to fit the model in brms, perform marginal standardization using bgcomp.bin() or bgcomp.cont() as appropriate, and summarize the results. Note, the dataset must include a unique exposure id column if there are multiple exposures per subject (e.g., a child stool paired with both a household soil sample and a compound latrine soil sample). This ID column is not used directly in the model and is therefor easy to overlook, but is used indirectly when summarizing the predictive distributions to avoid indexing errors.

### Overall function to fit model and perform g-computation
bgcomp <- function(data,                  # input data frame
                   formula,               # regression model formula (brms style)
                   summary = TRUE,        # summarize predictions across observations?
                   exposure,              # exposure variable name (string)
                   outcome = "infect",    # outcome variable name (string) 
                   group = "path",        # grouping variable name (string)
                   eps = 1e-4,            # epsilon size for finite difference (scalar)
                   prior = NULL,          # brms model priors (default: brms improper flat priors)
                   sample_prior = "no",   # set to "yes" to sample prior predictive distribution (requires specifying proper priors)
                   file = NULL,           # provide file path & name to save model object as RDS
                   file_refit = "always", # if model is saved, overwrite ("always", default), reuse ("never"), 
                                          #    or only overwrite if model changes ("on_change")
                   chains = 4,            # independent MCMC chains
                   iter = 2000,           # total MCMC iterations per chain (including warmup)
                   warmup = floor(iter/2),# warmup iterations to discard
                   cores = getOption("mc.cores", 1)){
  # data should already be filtered for outcome & complete cases
  # currently only accepts binary outcomes
  # must include a unique exposure id column if multiple exposures per subject
  
  
  ### fit the regression model
  
  ## QC formula
  form <- as.formula(formula)
  
  ## Determine exposure type
  expose_bin <- data %>%
    select({{exposure}}) %>%
    pull() %>%
    is.binary()
  
  if(expose_bin){
    data <- data %>%
      mutate({{exposure}} := as.integer(.data[[exposure]]))
  }
  
  ## brms bayesian hierarchical model
  fit <- brm(data = data,
             formula = form,
             family = bernoulli(),  # currently only set up for binary outcomes
             prior = prior,
             sample_prior = sample_prior,
             backend = "cmdstanr",
             file = file,
             file_refit = file_refit,
             chains = chains,
             iter = iter,
             warmup = warmup,
             cores = cores)
  
  
  
  ### posterior predictions
  
  ## Make posterior predictions
  
  if(expose_bin){
    # binary exposure
    ppe <- bgcomp.bin(fit = fit, data = data,
                      exposure = exposure, outcome = outcome, group = group)
    
  } else {
    # continuous exposure
    ppe <- bgcomp.cont(fit = fit, data = data,
                       exposure = exposure, outcome = outcome, group = group,
                       eps = eps)
  }
  
  
  
  ### Summarise and return 
  if(summary){
    
    if(is.null(group)){
      
      group <- "path"
      
      ppe <- ppe %>%
        mutate(path = "single")
      
    }
        
    ppe <- ppe %>%
      select({{group}},
             p_observed, p_unexposed, p_exposed,
             RD, RR, OR) %>%
      pivot_longer(c(p_observed, p_unexposed, p_exposed,
                     RD, RR, OR),
                   names_to = "param",
                   values_to = "pred") %>%
      group_by(across({{group}}), param) %>%
      summarise(avg = mean(pred),
                sd = sd(pred),
                iqr = IQR(pred),
                mad = mad(pred),
                p2.5 = quantile(pred, prob = 0.025),
                p5 = quantile(pred, prob = 0.05),
                p10 = quantile(pred, prob = 0.1),
                p25 = quantile(pred, prob = 0.25),
                p50 = quantile(pred, prob = 0.5),
                p75 = quantile(pred, prob = 0.75),
                p90 = quantile(pred, prob = 0.9),
                p95 = quantile(pred, prob = 0.95),
                p97.5 = quantile(pred, prob = 0.975)) %>%
      ungroup()
    
  }
  
  return(ppe)
  
}

Analysis

Fit separate models for each combination of environmental sample matrix (water, soil) and fecal marker (cEC, EC23S, Hf183, Mnif) for three groups of outcomes: individual pathogens, pathogen classes, and reported diarrhea symptoms. The bgcomp() function is applied across each of these combinations by nesting the long-format dataset and using purrr::map() approaches. The envr_id variable provides the unique exposure ID needed to avoid indexing errors.

Pathogens

Fit the models for individual pathogen outcomes. All pathogens are included in the same model, but separate models are fit for each environmental sample type and fecal marker. The slope and intercept for the exposure variable and all cosvariates are allowed to vary by pathogen.

Note: depending on platform and Stan installation, a large number of arcane warning messages may be printed to the console. These are generally safe to ignore. Do pay attention to warnings about divergent transitions—these indicate the sampler is not exploring the parameter space well and the resulting estimates may be unreliable.

source("bgcomp_funcs.R")

### Set up for all

## data
dat_path <- readRDS("../output/data_mst_gpp.rds") %>%
  filter(!is.na(detect),
         !is.na(infect)) %>%
  select(type, marker,
         stool, indv, comp,
         path, infect,
         envr_id, detect, quant_cnt,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  drop_na() %>%
  mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
                              marker %in% c("cEC", "EC23S") ~ quant_cnt))


## priors
prior_path <- c(prior(normal(-1,2), class = "Intercept"),
                prior(normal(0,1), class = "b"),
                prior(normal(0,1), class = "sd"),
                prior(lkj(2), class = "cor"))

## formula
form_path <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
                       (1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
                       (1 | stool) + (1 | comp))


### fit for all environmental samples and markers
est_path <- dat_path %>%
  group_by(type, marker) %>%
  nest() %>%
  mutate(name = str_c("../model/report", type, "_", marker),
         est = map2(.x = data, .y = name,
                    ~bgcomp(data = .x,
                            formula = form_path,
                            summary = TRUE,
                            exposure = "exposure",
                            outcome = "infect",
                            group = "path",
                            prior = prior_path,
                            file = .y))) %>%
  ungroup() %>%
  select(-c(name, data)) %>%
  unnest(est) %>%
  mutate(source = "gpp")

saveRDS(est_path, "../est/bgcomp_path.rds")

Class

Models are fit by pathogen class rather than individual pathogen, but otherwise model structure is the same.

### Set up for all

## data
dat_class <- readRDS("../output/data_mst_gpp.rds") %>%
  select(type, marker,
         stool, indv, comp,
         bacteria, viruses, protozoa, sth,
         envr_id, detect, quant_cnt,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  pivot_longer(bacteria:sth,
               names_to = "path",
               values_to = "infect") %>%
  filter(!is.na(detect),
         !is.na(infect)) %>%
  drop_na() %>%
  mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
                              marker %in% c("cEC", "EC23S") ~ quant_cnt))


## priors
prior_class <- c(prior(normal(-1,2), class = "Intercept"),
                 prior(normal(0,1), class = "b"),
                 prior(normal(0,1), class = "sd"),
                 prior(lkj(2), class = "cor"))

## formula
form_class <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl +
                        (1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) +
                        (1 | stool) + (1 | comp))


### fit for all

est_class <- dat_class %>%
  group_by(type, marker) %>%
  nest() %>%
  mutate(name = str_c("../model/report_class", type, "_", marker),
         est = map2(.x = data, .y = name,
                    ~bgcomp(data = .x,
                            formula = form_class,
                            summary = TRUE,
                            exposure = "exposure",
                            outcome = "infect",
                            group = "path",
                            prior = prior_class,
                            file = .y))) %>%
  ungroup() %>%
  select(-c(name, data)) %>%
  unnest(est) %>%
  mutate(source = "class")

saveRDS(est_class, "../est/bgcomp_class.rds")

Diarrhea

Since diarrhea is evaluated as a singlet outcome, the model formula is simplified to remove the pathogen and stool-varying terms. Otherwise, the same modeling and marginal standardization approach applies, without the pooled estimate.

### Set up for all

## data
dat_diarr <- readRDS("../output/data_mst_gpp.rds") %>%
  select(type, marker,
         stool, indv, comp,
         diarrhea,
         envr_id, detect, quant_cnt,
         female, age_years_scl, careprimary, wealth_scl) %>%
  distinct() %>%
  drop_na() %>%
  mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect,
                              marker %in% c("cEC", "EC23S") ~ quant_cnt))


## priors
prior_diarr <- c(prior(normal(-1,2), class = "Intercept"),
                 prior(normal(0,1), class = "b"),
                 prior(normal(0,1), class = "sd"))

## formula
form_diarr <- formula(diarrhea ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 | comp))


### fit for all
est_diarr <- dat_diarr %>%
  group_by(type, marker) %>%
  nest() %>%
  mutate(name = str_c("../model/report_diarr", type, "_", marker),
         est = map2(.x = data, .y = name,
                    ~bgcomp(data = .x,
                            formula = form_diarr,
                            summary = TRUE,
                            exposure = "exposure",
                            outcome = "diarrhea",
                            group = NULL,
                            prior = prior_diarr,
                            file = .y))) %>%
  ungroup() %>%
  select(-c(name, data)) %>%
  unnest(est) %>%
  mutate(path = "diarrhea",
         source = "diarrhea")

saveRDS(est_diarr, "../est/bgcomp_diarr.rds")

Plot

The marginal prevalence ratio and difference estimates are presented as forest plots and color-coded by class.

Inputs

Prepare elements used for plotting.

## MST summary
mst_stat <- readRDS("../output/data_mst_gpp.rds") %>%
  filter(path == "shig",
         !is.na(infect),
         !is.na(detect)) %>%
  select(path, type, envr_id, marker, detect) %>%
  distinct() %>%
  group_by(path, type, marker) %>%
  summarise(n_envr_obs = n_distinct(envr_id),
            n_envr_pos = sum(detect),
            pct_pos_envr = 100 * (n_envr_pos / n_envr_obs)) %>%
  ungroup() %>%
  select(type, marker, pct_pos_envr)


## Stack estimates
est_all <- bind_rows(readRDS("../est/bgcomp_path.rds"),
                     readRDS("../est/bgcomp_diarr.rds")) %>%
  ungroup() %>%
  filter(!(source %in% c("class", "diarrhea") & path == "pooled")) %>%
  add_case(type = "hw", marker = "Mnif", 
           path = "shig", param = "RR") %>%
  add_case(type = "hw", marker = "Mnif", 
           path = "shig", param = "RD") %>%
  left_join(readRDS("../output/path_labels.rds"),
            by = c("path")) %>%
  left_join(readRDS("../output/tab_outcome.rds") %>%
              select(path, pct_pos = infect_pct),
            by = "path") %>%
  mutate(label = ifelse(path == "pooled", "Typical\npathogen", label),
         class = ifelse(path == "pooled", "all", class) %>%
           fct_recode("STH" = "sth",
                      "pathogen" = "all",
                      "symptom" = "diarrhea") %>%
           fct_relevel(c("symptom", "bacteria", "viruses", "protozoa", "STH", "pathogen"))) %>%
  mutate(pct_pos = ifelse(path == "pooled", 0, pct_pos),
         manual = ifelse(path == "diarrhea", 100, pct_pos)) %>%
  arrange(manual, type, marker, class, label) %>%
  mutate(label = ifelse(path == "pooled", label, paste0(label, "\n(", round(pct_pos),"%)")),
         label = fct_inorder(label))

## define color palette
class_color_hex <- RColorBrewer::brewer.pal(6, "Set2")

class_color <- c("symptom" = "#FFD92F",
                 "bacteria" = "#66C2A5",
                 "viruses" = "#FC8D62",
                 "protozoa" = "#8DA0CB",
                 "STH" = "#E78AC3",
                 "pathogen" = "#A6D854")

Ratios

Plot marginal prevalence ratio estimates.

### Ratios

## Soil
plot_est_pr_soil <- est_all %>%
  filter(type == "soil") %>%
  mutate(type = fct_recode(type,
                           "domestic soil" = "soil"),
         marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
           fct_recode("E. coli colony counts\n(95%)" = "cEC",
                      "E. coli gene copies\n(98%)" = "EC23S",
                      "Human HF183 detection\n(39%)" = "HF183",
                      "Human Mnif detection\n(38%)" = "Mnif")) %>%
  filter(param == "RR") %>%
  ggplot(data = .) +
  geom_vline(aes(xintercept = 1), linetype = "dashed") +
  ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
                                 y = label,
                                 color = class),
                             position = position_dodge2(width = 0.75,
                                                       preserve = "single"),
                             size = 1, fatten = 2.5) +
  facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
  scale_x_log10() + 
  coord_cartesian(xlim = c(0.4, 1/0.4)) +
  scale_color_manual(values = class_color,
                     name = "outcome\nclass") +
  labs(x = "prevalence ratio",
       y = NULL) +
  theme(legend.key.size = unit(1.5, "lines"),
        legend.key = element_rect(fill = "white", colour = NA),
        panel.border = element_rect(colour = "black", fill=NA),
        panel.spacing = unit(0.25, "lines"),
        plot.title = element_text(face="bold"),
        plot.margin = margin(c(2, 1, 1, 1)),
        legend.margin = margin(c(5, .5, 5, .5)))
  

## water
plot_est_pr_water <- est_all %>%
  filter(type == "hw") %>%
  mutate(type = fct_recode(type,
                           "household stored water" = "hw"),
         marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
           fct_recode("E. coli colony counts\n(85%)" = "cEC",
                      "E. coli gene copies\n(90%)" = "EC23S",
                      "Human HF183 detection\n(15%)" = "HF183",
                      "Human Mnif detection\n(<1%)" = "Mnif")) %>%
  filter(param == "RR") %>%
  ggplot(data = .) +
  geom_vline(aes(xintercept = 1), linetype = "dashed") +
  ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
                                 y = label,
                                 color = class),
                             position = position_dodge2(width = 0.75,
                                                       preserve = "single"),
                             size = 1, fatten = 2.5) +
  facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
  scale_x_log10() + 
  coord_cartesian(xlim = c(0.4, 1/0.4)) +
  scale_color_manual(values = class_color,
                     name = "outcome\nclass") +
  labs(x = NULL,
       y = NULL) +
  theme(legend.key.size = unit(1.5, "lines"),
        legend.key = element_rect(fill = "white", colour = NA),
        panel.border = element_rect(colour = "black", fill=NA),
        panel.spacing = unit(0.25, "lines"),
        plot.title = element_text(face="bold"),
        plot.margin = margin(c(2, 1, 1, 1)),
        legend.margin = margin(c(5, .5, 5, .5)))

## Combine
plot_est_pr <- plot_est_pr_water + plot_est_pr_soil +
  plot_layout(nrow = 2,
              guides = "collect")


ggsave(filename = "../fig/mst_gpp_est_pr.png",
       plot = plot_est_pr,
       device = agg_png, dpi = 300,
       width = 8.5, height = 11, units = "in")

Marginal prevalence ratio estimates of diarrhea and of enteric pathogens in stool for a ten-fold increase in *E. coli* concentration or detection of a human fecal marker in household stored water and domestic soil.

Differences

Plot marginal prevalence difference estimates.

### Differences

## Soil
plot_est_pd_soil <- est_all %>%
  filter(type == "soil") %>%
  mutate(type = fct_recode(type,
                           "domestic soil" = "soil"),
         marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
           fct_recode("E. coli colony counts\n(95%)" = "cEC",
                      "E. coli gene copies\n(98%)" = "EC23S",
                      "Human HF183 detection\n(39%)" = "HF183",
                      "Human Mnif detection\n(38%)" = "Mnif")) %>%
  filter(param == "RD") %>%
  mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
  ggplot(data = .) +
  geom_vline(aes(xintercept = 0), linetype = "dashed") +
  ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
                                 y = label,
                                 color = class),
                             position = position_dodge2(width = 0.75,
                                                       preserve = "single"),
                             size = 1, fatten = 2.5) +
  facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
  coord_cartesian(xlim = c(-15, 15)) +
  scale_color_manual(values = class_color,
                     name = "outcome\nclass") +
  labs(x = "prevalence difference (percentage points)",
       y = NULL) +
  theme(legend.key.size = unit(1.5, "lines"),
        legend.key = element_rect(fill = "white", colour = NA),
        panel.border = element_rect(colour = "black", fill=NA),
        panel.spacing = unit(0.25, "lines"),
        plot.title = element_text(face="bold"),
        plot.margin = margin(c(2, 1, 1, 1)),
        legend.margin = margin(c(5, .5, 5, .5)))
  

## water
plot_est_pd_water <- est_all %>%
  filter(type == "hw") %>%
  mutate(type = fct_recode(type,
                           "household stored water" = "hw"),
         marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>%
           fct_recode("E. coli colony counts\n(85%)" = "cEC",
                      "E. coli gene copies\n(90%)" = "EC23S",
                      "Human HF183 detection\n(15%)" = "HF183",
                      "Human Mnif detection\n(<1%)" = "Mnif")) %>%
  filter(param == "RD") %>%
  mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>%
  ggplot(data = .) +
  geom_vline(aes(xintercept = 0), linetype = "dashed") +
  ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5,
                                 y = label,
                                 color = class),
                             position = position_dodge2(width = 0.75,
                                                       preserve = "single"),
                             size = 1, fatten = 2.5) +
  facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) +
  coord_cartesian(xlim = c(-15, 15)) +
  scale_color_manual(values = class_color,
                     name = "outcome\nclass") +
  labs(x = NULL,
       y = NULL) +
  theme(legend.key.size = unit(1.5, "lines"),
        legend.key = element_rect(fill = "white", colour = NA),
        panel.border = element_rect(colour = "black", fill=NA),
        panel.spacing = unit(0.25, "lines"),
        plot.title = element_text(face="bold"),
        plot.margin = margin(c(2, 1, 1, 1)),
        legend.margin = margin(c(5, .5, 5, .5)))

## Combine
plot_est_pd <- plot_est_pd_water + plot_est_pd_soil +
  plot_layout(nrow = 2,
              guides = "collect")


ggsave(filename = "../fig/mst_gpp_est_pd.png",
       plot = plot_est_pd,
       device = agg_png, dpi = 300,
       width = 8.5, height = 11, units = "in")

Marginal prevalence difference estimates of diarrhea and of enteric pathogens in stool for a ten-fold increase in *E. coli* concentration or detection of a human fecal marker in household stored water and domestic soil.

--- title: "Estimating Household Fecal Contamination Associations with Enteric Pathogens in Child Stool" date: 2025-04-15 author: - name: David A Holcomb orcid: 0000-0003-4055-7164 subtitle: Version 1.1 csl: numbered-bracket.csl format: html: theme: light: yeti fontsize: 0.9em page-layout: article toc: true toc-location: left toc_float: collapsed: false smooth_scroll: true self-contained: true fig-align: center fig-dpi: 300 fig-format: svg code-fold: false code-tools: true cache: true --- # Description This report provides **R** code to conduct the analysis of household fecal marker associations with enteric pathogens and child diarrhea described in: > Holcomb DA, Knee J, Adriano Z, Capone D, Cumming O, Kowalsky E, Nalá R, Viegas E, Stewart JR, Brown J. (2025). Associations between fecal contamination of the household environment and enteric pathogen detection in children living in Maputo, Mozambique. *Environment & Health*. [https://doi.org/10.1021/envhealth.4c00283](https://doi.org/10.1021/envhealth.4c00283) A companion **R** script, [`bgcomp_funcs.R`](https://daholcomb.github.io/manuscripts/mapsan_mst/bgcomp_funcs.R), abstracts the functions developed in this analysis to simultaneously estimate individual and pooled associations of a single predictor with multiple binary outcomes, using marginal standardization to obtain prevalence ratio and prevalence difference estimates. It can be sourced into an **R** session to make the functions available in the global environment for use. The accompanying dataset, [`data_mst_gpp.xlsx`](https://daholcomb.github.io/manuscripts/mapsan_mst/data_mst_gpp.xlsx), is provided to enable replicating this analysis and includes a variable codebook. All files are available in the manuscript [repository](https://daholcomb.github.io/manuscripts/mapsan_mst/). **Setup** Load packages and set options. ```{r} #| label: setup #| message: false #| warning: false #| results: hide #| cache: false ## Options options(tidyverse.quiet = TRUE) # quietly options(dplyr.summarise.inform = FALSE) # quietly options(mc.cores = min(parallel::detectCores(), 8)) # cpu cores for MCMC ## Packages # Utilities library(readxl) # I/O library(lubridate) # date formatting library(broman) # number formatting # Models library(brms) # fit Bayesian regression models in stan library(cmdstanr) # R interface to CmdStan library(tidybayes) # extract posterior samples # Graphs library(patchwork) # combine plots library(ragg) # more consistent graphics output # Tidyverse library(knitr) # rendering library(quarto) # rendering library(tidyverse) # load last to avoid conflicts (particularly with data.table) ## Load custom functions source("bgcomp_funcs.R") ``` # Data ## Read The data have been prepared in "long" format, meaning that each combination of pathogen, environmental sample type, and fecal marker assessed is presented on its own row. The data file has two sheets: "data" containing the data and "codebook" with a description of each variable. Read in the dataset, extract summary information such as pathogen labels and detections, and save in RDS format for downstream use. ```{r} #| label: data_in ## read in data df_raw <- read_xlsx("../data/data_mst_gpp.xlsx", sheet = "data") ## extract pathogen labels path_labels <- df_raw %>% select(path, class, label, name) %>% distinct() %>% bind_rows(tibble(path = c("diarrhea", "bacteria", "viruses", "protozoa", "sth", "pooled"), class = c("diarrhea", "bacteria", "viruses", "protozoa", "sth", "all"), label = c("Diarrhea", "Any bacteria", "Any viruses", "Any protozoa", "Any STH", "Typical pathogen"))) saveRDS(path_labels, "../output/path_labels.rds") ## extract pathogen detection counts path_detects <- df_raw %>% select(path, class, n_obs, n_pos, pct_pos) %>% distinct() saveRDS(path_detects, "../output/path_detects.rds") ## refine data and save df_cln <- df_raw %>% filter(include_n5 == "yes", !(type == "hw" & marker == "Mnif")) %>% select(comp, hh, child, time, indv, stool, female, age_years_scl, careprimary, wealth_scl, diarrhea, bacteria, viruses, protozoa, sth, class, path, infect, type, marker, envr_id, detect, quant, quant_cnt) saveRDS(df_cln, "../output/data_mst_gpp.rds") ``` ## Summarize Additional data summaries for downstream use. ```{r} #| label: data_summ ## diarrhea data obs_diarr <- readRDS("../output/data_mst_gpp.rds") %>% select(indv, time, stool, type, envr_id, marker, detect, quant, diarrhea, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% mutate(infect = diarrhea, path = "diarrhea", source = "survey") ## class data obs_class <- readRDS("../output/data_mst_gpp.rds") %>% select(indv, time, stool, type, envr_id, marker, detect, quant, bacteria, viruses, protozoa, sth, diarrhea, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% pivot_longer(c(bacteria, viruses, protozoa, sth), names_to = "path", values_to = "infect") %>% mutate(source = ifelse(path == "sth", "kk", "gpp")) ## path data obs_path <- readRDS("../output/data_mst_gpp.rds") %>% select(indv, time, stool, type, envr_id, marker, detect, quant, path, infect, diarrhea, female, age_years_scl, careprimary, wealth_scl) %>% distinct() ## microscopy data obs_kk <- readRDS("../output/data_mst_gpp.rds") %>% filter(class %in% c("sth")) %>% select(indv, time, stool, type, envr_id, marker, detect, quant, path, infect, diarrhea, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% mutate(source = "kk") ## gpp data obs_gpp <- readRDS("../output/data_mst_gpp.rds") %>% filter(class %in% c("bacteria", "viruses", "protozoa")) %>% select(indv, time, stool, type, envr_id, marker, detect, quant, path, infect, diarrhea, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% mutate(source = "gpp") ## stacked obs_source <- bind_rows(obs_gpp, obs_kk, obs_class, obs_diarr) %>% arrange(source, stool, path) ## Summarise outcomes tab_outcome <- obs_source %>% filter(!is.na(infect)) %>% select(indv, time, stool, path, infect, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% drop_na() %>% left_join(readRDS("../output/path_labels.rds"), by = "path") %>% group_by(path, label, class) %>% summarise(n_obs = n(), infect_n = sum(infect), infect_pct = round((infect_n / n_obs) * 100, 0)) %>% ungroup() %>% mutate(class = fct_relevel(class, c("bacteria", "viruses", "protozoa", "sth", "diarrhea")), path = fct_relevel(path, c("bacteria", "cdif", "campy", "etec", "stec", "shig", "viruses", "noro", "protozoa", "crypto", "giardia", "sth", "asc", "trich", "diarrhea"))) %>% arrange(class, path) saveRDS(tab_outcome, "../output/tab_outcome.rds") ``` # Model Functions We fit Bayesian hierarchical models with random slopes for the exposure variable (i.e., fecal marker detection) and all other covariates by pathogen to estimate the individual association of the exposure with each pathogen as well as a pooled association across all the pathogens. We then generate posterior predictions for all-exposed and none-exposed scenarios to obtain marginal estimates of the prevalence ratio and prevalence difference for each pathogen and pooled across pathogens, an approach known as *marginal standardization* or, alternatively, the *Bayesian parametric g-formula* ("bgcomp"). The models are specified and sampled using the `brms` package with `cmdstanr` backend and posterior predictions are generated with the `tidybayes` package. Custom functions are defined to wrap the functionality from these other packages and implement the bgcomp procedure. ## Utility function The bgcomp procedure is implemented somewhat differently for binary and continuous exposures, so define a utility function to determine if a given variable is binary or not. ```{r} #| label: f_binary ## determine if variable is binary: is.binary <- function(x) { all(na.omit(x) %in% 0:1) } ``` ## Binary exposure Marginal standardization via the Bayesian g-formula is accomplished for binary exposures by sampling from the posterior predictive distribution of a fitted model under two counterfactual scenarios, the first assuming all participants were exposed and the second assuming none were exposed, while all other covariates retain their original, observed values. Define a function that takes a fitted `brms` model object, samples the posterior predictive distributions under both scenarios, and calculates the contrasts of interest (ratio and difference). An additional "pathogen" is added in the process to obtain marginal estimates of the pooled exposure effect across pathogens. ```{r} #| label: f_bgcomp.bin ### BINARY EXPOSURE posterior predictions bgcomp.bin <- function(fit, data, exposure, outcome, group){ # row-wise posterior predictive expectations from fitted brms model object # fit: brms model object # data: dataframe to which the model was fit # exposure: string with the name of the exposure variable in "fit" # outcome: string with the name of the outcome variable in "fit" # group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names) if(is.null(group)){ df_pred <- data } else { ## add pooled outcome data (new group) df_pred <- data %>% mutate(!!group := "pooled", !!outcome := NA) %>% distinct() %>% bind_rows(data) } ## prediction data frame dat_pred <- bind_rows(df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := expose_obs, contrast = "p_observed"), df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := 0, contrast = "p_unexposed"), df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := 1, contrast = "p_exposed") ) ## posterior predictive expectation ppe <- epred_draws(object = fit, newdata = dat_pred, allow_new_levels = TRUE, sample_new_levels = "uncertainty") %>% ungroup() %>% select(-c(.row, .chain, .iteration, {{exposure}})) %>% pivot_wider(names_from = contrast, values_from = .epred) %>% mutate(RD = p_exposed - p_unexposed, # marginal risk difference RR = p_exposed / p_unexposed, # marginal risk ratio OR = (p_exposed / (1 - p_exposed)) / # marginal odds ratio (p_unexposed / (1 - p_unexposed)), support = "binary") return(ppe) } ``` ## Continuous exposure For continuous exposures, "all-exposed" and "none-exposed" are not really meaningful. Instead, we want to estimate the change in the probability of the outcome for a one-unit increased in the value of the exposure variable. Because probability is non-linear and constrained between 0 and 1, the rate of change in the probability of the outcome for changing exposure values depends on the starting value of the exposure. Therefore, we approximate the instantaneous rate of change in the outcome probability at the observed exposure value by sampling posterior predictions for values a tiny bit larger and a tiny bit smaller than the observed exposure value and calculate the difference in the predicted values normalized by the difference in the exposure values. ```{r} #| label: f_bgcomp.cont ### CONTINUOUS EXPOSURE posterior predictions bgcomp.cont <- function(fit, data, exposure, outcome, group, eps = 1e-4){ # row-wise posterior predictive expectations from fitted brms model object # fit: brms model object # data: dataframe to which the model was fit # exposure: string with the name of the exposure variable in "fit" # outcome: string with the name of the outcome variable in "fit" # group: string with the name of the multiple outcome grouping variable (e.g. the variable containing pathogen names) # eps: value to increment/decrement continuous exposure by for posterior prediction if(is.null(group)){ df_pred <- data } else { ## add pooled outcome data (new group) df_pred <- data %>% mutate(!!group := "pooled", !!outcome := NA) %>% distinct() %>% bind_rows(data) } ## prediction data frame dat_pred <- bind_rows(df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := expose_obs, contrast = "p_observed"), df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := expose_obs - eps/2, contrast = "p_unexposed"), df_pred %>% mutate(expose_obs = .data[[exposure]], {{exposure}} := expose_obs + eps/2, contrast = "p_exposed") ) ## posterior predictive expectation ppe <- epred_draws(object = fit, newdata = dat_pred, allow_new_levels = TRUE, sample_new_levels = "uncertainty") %>% ungroup() %>% select(-c(.row, .chain, .iteration, {{exposure}})) %>% pivot_wider(names_from = contrast, values_from = .epred) %>% mutate(RD = (p_exposed - p_unexposed) / eps, # marginal slope (dY/dX)--risk difference for 1 unit increase RR = (p_observed + RD) / p_observed, # marginal risk ratio (1 unit increase) OR = ((p_observed + RD) / # marginal odds ratio (1 unit increase) (1 - (p_observed + RD))) / (p_observed / (1 - p_observed)), support = "continuous") return(ppe) } ``` ## Wrapper function Define a wrapper function to fit the model in `brms`, perform marginal standardization using `bgcomp.bin()` or `bgcomp.cont()` as appropriate, and summarize the results. Note, the dataset must include a unique exposure id column if there are multiple exposures per subject (e.g., a child stool paired with both a household soil sample and a compound latrine soil sample). This ID column is not used directly in the model and is therefor easy to overlook, but is used indirectly when summarizing the predictive distributions to avoid indexing errors. ```{r} #| label: f_bgcomp ### Overall function to fit model and perform g-computation bgcomp <- function(data, # input data frame formula, # regression model formula (brms style) summary = TRUE, # summarize predictions across observations? exposure, # exposure variable name (string) outcome = "infect", # outcome variable name (string) group = "path", # grouping variable name (string) eps = 1e-4, # epsilon size for finite difference (scalar) prior = NULL, # brms model priors (default: brms improper flat priors) sample_prior = "no", # set to "yes" to sample prior predictive distribution (requires specifying proper priors) file = NULL, # provide file path & name to save model object as RDS file_refit = "always", # if model is saved, overwrite ("always", default), reuse ("never"), # or only overwrite if model changes ("on_change") chains = 4, # independent MCMC chains iter = 2000, # total MCMC iterations per chain (including warmup) warmup = floor(iter/2),# warmup iterations to discard cores = getOption("mc.cores", 1)){ # data should already be filtered for outcome & complete cases # currently only accepts binary outcomes # must include a unique exposure id column if multiple exposures per subject ### fit the regression model ## QC formula form <- as.formula(formula) ## Determine exposure type expose_bin <- data %>% select({{exposure}}) %>% pull() %>% is.binary() if(expose_bin){ data <- data %>% mutate({{exposure}} := as.integer(.data[[exposure]])) } ## brms bayesian hierarchical model fit <- brm(data = data, formula = form, family = bernoulli(), # currently only set up for binary outcomes prior = prior, sample_prior = sample_prior, backend = "cmdstanr", file = file, file_refit = file_refit, chains = chains, iter = iter, warmup = warmup, cores = cores) ### posterior predictions ## Make posterior predictions if(expose_bin){ # binary exposure ppe <- bgcomp.bin(fit = fit, data = data, exposure = exposure, outcome = outcome, group = group) } else { # continuous exposure ppe <- bgcomp.cont(fit = fit, data = data, exposure = exposure, outcome = outcome, group = group, eps = eps) } ### Summarise and return if(summary){ if(is.null(group)){ group <- "path" ppe <- ppe %>% mutate(path = "single") } ppe <- ppe %>% select({{group}}, p_observed, p_unexposed, p_exposed, RD, RR, OR) %>% pivot_longer(c(p_observed, p_unexposed, p_exposed, RD, RR, OR), names_to = "param", values_to = "pred") %>% group_by(across({{group}}), param) %>% summarise(avg = mean(pred), sd = sd(pred), iqr = IQR(pred), mad = mad(pred), p2.5 = quantile(pred, prob = 0.025), p5 = quantile(pred, prob = 0.05), p10 = quantile(pred, prob = 0.1), p25 = quantile(pred, prob = 0.25), p50 = quantile(pred, prob = 0.5), p75 = quantile(pred, prob = 0.75), p90 = quantile(pred, prob = 0.9), p95 = quantile(pred, prob = 0.95), p97.5 = quantile(pred, prob = 0.975)) %>% ungroup() } return(ppe) } ``` # Analysis Fit separate models for each combination of environmental sample matrix (water, soil) and fecal marker (cEC, EC23S, Hf183, Mnif) for three groups of outcomes: individual pathogens, pathogen classes, and reported diarrhea symptoms. The `bgcomp()` function is applied across each of these combinations by nesting the long-format dataset and using `purrr::map()` approaches. The `envr_id` variable provides the unique exposure ID needed to avoid indexing errors. ## Pathogens Fit the models for individual pathogen outcomes. All pathogens are included in the same model, but separate models are fit for each environmental sample type and fecal marker. The slope and intercept for the exposure variable and all cosvariates are allowed to vary by pathogen. Note: depending on platform and Stan installation, a large number of arcane warning messages may be printed to the console. These are generally safe to ignore. Do pay attention to warnings about divergent transitions---these indicate the sampler is not exploring the parameter space well and the resulting estimates may be unreliable. ```{r} #| label: fit_path #| eval: false source("bgcomp_funcs.R") ### Set up for all ## data dat_path <- readRDS("../output/data_mst_gpp.rds") %>% filter(!is.na(detect), !is.na(infect)) %>% select(type, marker, stool, indv, comp, path, infect, envr_id, detect, quant_cnt, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% drop_na() %>% mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect, marker %in% c("cEC", "EC23S") ~ quant_cnt)) ## priors prior_path <- c(prior(normal(-1,2), class = "Intercept"), prior(normal(0,1), class = "b"), prior(normal(0,1), class = "sd"), prior(lkj(2), class = "cor")) ## formula form_path <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) + (1 | stool) + (1 | comp)) ### fit for all environmental samples and markers est_path <- dat_path %>% group_by(type, marker) %>% nest() %>% mutate(name = str_c("../model/report", type, "_", marker), est = map2(.x = data, .y = name, ~bgcomp(data = .x, formula = form_path, summary = TRUE, exposure = "exposure", outcome = "infect", group = "path", prior = prior_path, file = .y))) %>% ungroup() %>% select(-c(name, data)) %>% unnest(est) %>% mutate(source = "gpp") saveRDS(est_path, "../est/bgcomp_path.rds") ``` ## Class Models are fit by pathogen class rather than individual pathogen, but otherwise model structure is the same. ```{r} #| label: fit_class #| eval: false ### Set up for all ## data dat_class <- readRDS("../output/data_mst_gpp.rds") %>% select(type, marker, stool, indv, comp, bacteria, viruses, protozoa, sth, envr_id, detect, quant_cnt, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% pivot_longer(bacteria:sth, names_to = "path", values_to = "infect") %>% filter(!is.na(detect), !is.na(infect)) %>% drop_na() %>% mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect, marker %in% c("cEC", "EC23S") ~ quant_cnt)) ## priors prior_class <- c(prior(normal(-1,2), class = "Intercept"), prior(normal(0,1), class = "b"), prior(normal(0,1), class = "sd"), prior(lkj(2), class = "cor")) ## formula form_class <- formula(infect ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 + exposure + female + age_years_scl + careprimary + wealth_scl | path) + (1 | stool) + (1 | comp)) ### fit for all est_class <- dat_class %>% group_by(type, marker) %>% nest() %>% mutate(name = str_c("../model/report_class", type, "_", marker), est = map2(.x = data, .y = name, ~bgcomp(data = .x, formula = form_class, summary = TRUE, exposure = "exposure", outcome = "infect", group = "path", prior = prior_class, file = .y))) %>% ungroup() %>% select(-c(name, data)) %>% unnest(est) %>% mutate(source = "class") saveRDS(est_class, "../est/bgcomp_class.rds") ``` ## Diarrhea Since diarrhea is evaluated as a singlet outcome, the model formula is simplified to remove the pathogen and stool-varying terms. Otherwise, the same modeling and marginal standardization approach applies, without the pooled estimate. ```{r} #| label: fit_diarrhea #| eval: false ### Set up for all ## data dat_diarr <- readRDS("../output/data_mst_gpp.rds") %>% select(type, marker, stool, indv, comp, diarrhea, envr_id, detect, quant_cnt, female, age_years_scl, careprimary, wealth_scl) %>% distinct() %>% drop_na() %>% mutate(exposure = case_when(marker %in% c("HF183", "Mnif") ~ detect, marker %in% c("cEC", "EC23S") ~ quant_cnt)) ## priors prior_diarr <- c(prior(normal(-1,2), class = "Intercept"), prior(normal(0,1), class = "b"), prior(normal(0,1), class = "sd")) ## formula form_diarr <- formula(diarrhea ~ exposure + female + age_years_scl + careprimary + wealth_scl + (1 | comp)) ### fit for all est_diarr <- dat_diarr %>% group_by(type, marker) %>% nest() %>% mutate(name = str_c("../model/report_diarr", type, "_", marker), est = map2(.x = data, .y = name, ~bgcomp(data = .x, formula = form_diarr, summary = TRUE, exposure = "exposure", outcome = "diarrhea", group = NULL, prior = prior_diarr, file = .y))) %>% ungroup() %>% select(-c(name, data)) %>% unnest(est) %>% mutate(path = "diarrhea", source = "diarrhea") saveRDS(est_diarr, "../est/bgcomp_diarr.rds") ``` # Plot The marginal prevalence ratio and difference estimates are presented as forest plots and color-coded by class. ## Inputs Prepare elements used for plotting. ```{r} #| label: plot_setup ## MST summary mst_stat <- readRDS("../output/data_mst_gpp.rds") %>% filter(path == "shig", !is.na(infect), !is.na(detect)) %>% select(path, type, envr_id, marker, detect) %>% distinct() %>% group_by(path, type, marker) %>% summarise(n_envr_obs = n_distinct(envr_id), n_envr_pos = sum(detect), pct_pos_envr = 100 * (n_envr_pos / n_envr_obs)) %>% ungroup() %>% select(type, marker, pct_pos_envr) ## Stack estimates est_all <- bind_rows(readRDS("../est/bgcomp_path.rds"), readRDS("../est/bgcomp_diarr.rds")) %>% ungroup() %>% filter(!(source %in% c("class", "diarrhea") & path == "pooled")) %>% add_case(type = "hw", marker = "Mnif", path = "shig", param = "RR") %>% add_case(type = "hw", marker = "Mnif", path = "shig", param = "RD") %>% left_join(readRDS("../output/path_labels.rds"), by = c("path")) %>% left_join(readRDS("../output/tab_outcome.rds") %>% select(path, pct_pos = infect_pct), by = "path") %>% mutate(label = ifelse(path == "pooled", "Typical\npathogen", label), class = ifelse(path == "pooled", "all", class) %>% fct_recode("STH" = "sth", "pathogen" = "all", "symptom" = "diarrhea") %>% fct_relevel(c("symptom", "bacteria", "viruses", "protozoa", "STH", "pathogen"))) %>% mutate(pct_pos = ifelse(path == "pooled", 0, pct_pos), manual = ifelse(path == "diarrhea", 100, pct_pos)) %>% arrange(manual, type, marker, class, label) %>% mutate(label = ifelse(path == "pooled", label, paste0(label, "\n(", round(pct_pos),"%)")), label = fct_inorder(label)) ## define color palette class_color_hex <- RColorBrewer::brewer.pal(6, "Set2") class_color <- c("symptom" = "#FFD92F", "bacteria" = "#66C2A5", "viruses" = "#FC8D62", "protozoa" = "#8DA0CB", "STH" = "#E78AC3", "pathogen" = "#A6D854") ``` ## Ratios Plot marginal prevalence ratio estimates. ```{r} #| label: plot_pr #| warning: false #| message: false ### Ratios ## Soil plot_est_pr_soil <- est_all %>% filter(type == "soil") %>% mutate(type = fct_recode(type, "domestic soil" = "soil"), marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>% fct_recode("E. coli colony counts\n(95%)" = "cEC", "E. coli gene copies\n(98%)" = "EC23S", "Human HF183 detection\n(39%)" = "HF183", "Human Mnif detection\n(38%)" = "Mnif")) %>% filter(param == "RR") %>% ggplot(data = .) + geom_vline(aes(xintercept = 1), linetype = "dashed") + ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5, y = label, color = class), position = position_dodge2(width = 0.75, preserve = "single"), size = 1, fatten = 2.5) + facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) + scale_x_log10() + coord_cartesian(xlim = c(0.4, 1/0.4)) + scale_color_manual(values = class_color, name = "outcome\nclass") + labs(x = "prevalence ratio", y = NULL) + theme(legend.key.size = unit(1.5, "lines"), legend.key = element_rect(fill = "white", colour = NA), panel.border = element_rect(colour = "black", fill=NA), panel.spacing = unit(0.25, "lines"), plot.title = element_text(face="bold"), plot.margin = margin(c(2, 1, 1, 1)), legend.margin = margin(c(5, .5, 5, .5))) ## water plot_est_pr_water <- est_all %>% filter(type == "hw") %>% mutate(type = fct_recode(type, "household stored water" = "hw"), marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>% fct_recode("E. coli colony counts\n(85%)" = "cEC", "E. coli gene copies\n(90%)" = "EC23S", "Human HF183 detection\n(15%)" = "HF183", "Human Mnif detection\n(<1%)" = "Mnif")) %>% filter(param == "RR") %>% ggplot(data = .) + geom_vline(aes(xintercept = 1), linetype = "dashed") + ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5, y = label, color = class), position = position_dodge2(width = 0.75, preserve = "single"), size = 1, fatten = 2.5) + facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) + scale_x_log10() + coord_cartesian(xlim = c(0.4, 1/0.4)) + scale_color_manual(values = class_color, name = "outcome\nclass") + labs(x = NULL, y = NULL) + theme(legend.key.size = unit(1.5, "lines"), legend.key = element_rect(fill = "white", colour = NA), panel.border = element_rect(colour = "black", fill=NA), panel.spacing = unit(0.25, "lines"), plot.title = element_text(face="bold"), plot.margin = margin(c(2, 1, 1, 1)), legend.margin = margin(c(5, .5, 5, .5))) ## Combine plot_est_pr <- plot_est_pr_water + plot_est_pr_soil + plot_layout(nrow = 2, guides = "collect") ggsave(filename = "../fig/mst_gpp_est_pr.png", plot = plot_est_pr, device = agg_png, dpi = 300, width = 8.5, height = 11, units = "in") ``` ![Marginal prevalence ratio estimates of diarrhea and of enteric pathogens in stool for a ten-fold increase in *E. coli* concentration or detection of a human fecal marker in household stored water and domestic soil.](../fig/mst_gpp_est_pr.png) ## Differences Plot marginal prevalence difference estimates. ```{r} #| label: plot_pd #| warning: false #| message: false ### Differences ## Soil plot_est_pd_soil <- est_all %>% filter(type == "soil") %>% mutate(type = fct_recode(type, "domestic soil" = "soil"), marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>% fct_recode("E. coli colony counts\n(95%)" = "cEC", "E. coli gene copies\n(98%)" = "EC23S", "Human HF183 detection\n(39%)" = "HF183", "Human Mnif detection\n(38%)" = "Mnif")) %>% filter(param == "RD") %>% mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>% ggplot(data = .) + geom_vline(aes(xintercept = 0), linetype = "dashed") + ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5, y = label, color = class), position = position_dodge2(width = 0.75, preserve = "single"), size = 1, fatten = 2.5) + facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) + coord_cartesian(xlim = c(-15, 15)) + scale_color_manual(values = class_color, name = "outcome\nclass") + labs(x = "prevalence difference (percentage points)", y = NULL) + theme(legend.key.size = unit(1.5, "lines"), legend.key = element_rect(fill = "white", colour = NA), panel.border = element_rect(colour = "black", fill=NA), panel.spacing = unit(0.25, "lines"), plot.title = element_text(face="bold"), plot.margin = margin(c(2, 1, 1, 1)), legend.margin = margin(c(5, .5, 5, .5))) ## water plot_est_pd_water <- est_all %>% filter(type == "hw") %>% mutate(type = fct_recode(type, "household stored water" = "hw"), marker = fct_relevel(marker, c("cEC", "EC23S", "HF183", "Mnif")) %>% fct_recode("E. coli colony counts\n(85%)" = "cEC", "E. coli gene copies\n(90%)" = "EC23S", "Human HF183 detection\n(15%)" = "HF183", "Human Mnif detection\n(<1%)" = "Mnif")) %>% filter(param == "RD") %>% mutate(across(c(avg, p2.5, p97.5), ~.x*100)) %>% ggplot(data = .) + geom_vline(aes(xintercept = 0), linetype = "dashed") + ggstance::geom_pointrangeh(aes(x = avg, xmin = p2.5, xmax = p97.5, y = label, color = class), position = position_dodge2(width = 0.75, preserve = "single"), size = 1, fatten = 2.5) + facet_grid(rows = vars(type), cols = vars(marker), drop = FALSE) + coord_cartesian(xlim = c(-15, 15)) + scale_color_manual(values = class_color, name = "outcome\nclass") + labs(x = NULL, y = NULL) + theme(legend.key.size = unit(1.5, "lines"), legend.key = element_rect(fill = "white", colour = NA), panel.border = element_rect(colour = "black", fill=NA), panel.spacing = unit(0.25, "lines"), plot.title = element_text(face="bold"), plot.margin = margin(c(2, 1, 1, 1)), legend.margin = margin(c(5, .5, 5, .5))) ## Combine plot_est_pd <- plot_est_pd_water + plot_est_pd_soil + plot_layout(nrow = 2, guides = "collect") ggsave(filename = "../fig/mst_gpp_est_pd.png", plot = plot_est_pd, device = agg_png, dpi = 300, width = 8.5, height = 11, units = "in") ``` ![Marginal prevalence difference estimates of diarrhea and of enteric pathogens in stool for a ten-fold increase in *E. coli* concentration or detection of a human fecal marker in household stored water and domestic soil.](../fig/mst_gpp_est_pd.png) ------------------------------------------------------------------------