## How to make epidemiological training infectious. ## Bellan et al. 2012 PLoS Biology ## ## NOTE: This file can be converted to an R script by replacing the .txt suffix with a .R ## suffix. Individual R files, additional exercises, and future code updates are available ## for download from the MMED website at: ## http://lalashan.mcmaster.ca/theobio/mmed/index.php/MMF ## ## ##################################################################### ## This code gives all the necessary scripts for the below ## described exercises. Please feel free to modify and distribute ## this script as long as you retain the above citation ## in the header. ## ## 1 - Computer simulations to determine R0 of the outbreak given the ## number of course participants (to be performed by the organizers) ## 2 - Case Control Study ## 3 - Estimation of latent and infectious periods ## 4 - Stochastic simulation of outbreaks using fitted parameters ###################################################################### ###################################################################### ## 1 - Computer simulations to determine R0 of the outbreak given the ## number of course participants (to be performed by the organizers) ###################################################################### ## By manipulating N, R0, and infectious period quantities, organizers ## can experiment with how they think their outbreak may proceed. We ## assume a latent period of 0 as participants can infect individuals ## as soon as they are infected as we performed the exercise but we ## leave it up to others to add latent periods. ## Note that the following stochastic simulation uses a chain binomial ## algorithm to simulate the epidemics while the model for Figure 3 is ## produced using the Gillespie algorithm. These both are tenable ## ways to model an outbreak and differ primarily in that the former ## considers time in discrete units and the latter considers time to ## be continuous. par(mfrow=c(1,2)) num.sims <- 10 N <- 40 # number participants vacc.prop <- .35 # immune proportion at the beginning init.immune <- round(vacc.prop*N) # initial immune individuals init.infect <- 4 R0 <- 3.5 # R0 infper <- 1 # infectious period in days ## latper <- 0 # latent period in days ### R0 = beta*N/(1/infper) = beta*N*infper ### beta = R0/(N*infper) beta <- R0/(N*infper) # transmission coefficient step <- .1 # time steps (in days) timeseq <- seq(0, 14, by = step) # discrete time intervals to simulate plot(0,0, type ="n", xlim = c(min(timeseq), max(timeseq)), ylim = c(0, N+10), bty = "n", xlab = "time (days)", ylab = "# hosts") cum.incidence <- rep(init.infect,num.sims) for(ss in 1:num.sims) # do num.sims outbreaks { sim <- data.frame(S = N - init.infect - init.immune, I = init.infect, R = init.immune) for(ii in 2:length(timeseq)) # run through time series { p.trans <- 1 - exp(-beta*step*sim$I[ii-1]) # probability of S -> I per unit S new.inf <- rbinom(1, sim$S[ii-1], p.trans) # number new infections p.recov <- 1 - exp(-1/infper*step) # probability of I -> R new.recov <- rbinom(1, sim$I[ii-1], p.recov) temp.S <- sim$S[ii-1] - new.inf temp.I <- sim$I[ii-1] + new.inf - new.recov temp.R <- sim$R[ii-1] + new.recov sim <- rbind(sim, c(temp.S,temp.I,temp.R)) cum.incidence[ss] <- cum.incidence[ss]+new.inf } ## lines(timeseq,sim[,"S"],col = "black") lines(timeseq,sim[,"I"],col = "red") lines(timeseq,sim[,"R"],col = "blue") } legend("topright", names(sim), col=c("black","red","blue"), lwd = 1) hist(cum.incidence, xlim = c(0,N), breaks = seq(0,N, by=1), col = "red", main = "cumulative incidence", ylim = c(-2,20)) ## abline(v = 23, lty = 3, col = "gray", lwd = 5) arrows(23, -5, 23, 0, code = 2, angle = 45, length = .1, lwd = 2) ###################################################################### ## 3 - Case Control Study ###################################################################### ###################################################################### ## In case control studies, the population is selected by picking ## known cases and appropriate controls. Thus the disease status is ## known from the outset of the study and we cannot calculate ## proportion of diseased or non-diseased individuals since those ## proportions are determined by how many controls we select for each ## case. For that reason, we use odds of exposure to a specific risk ## factor instead, where odds is defined by (# exposed)/(# not ## exposed). And the odds ratio is the ratio between the odds of ## exposure in diseased individual to that in non-diseased individuals. ## Change below to the directory where you saved Dataset S6: setwd("~/Documents/R files/MMF/MMFsupinfo/") ## 2010 riskdat <- read.csv("Dataset_S6_MMFRF2010.csv") ## writing your own odds ratio & confidence interval calculator odds.ratio <- function(tab) { or.out <- tab[1,1]*tab[2,2] / (tab[1,2] * tab[2,1]) ## se(logOR) = sqrt(1/n1 +1/n2 + 1/n3 + 1/n4) se <- sqrt(sum(1/gend.tab)) upper <- exp(log(or.out) + 1.96*se) lower <- exp(log(or.out) - 1.96*se) out <- data.frame(or = or.out, lower95 = lower, upper95 = upper, ref=paste(rownames(tab)[1],":",rownames(tab)[2],sep = "")) return(out) } names(riskdat) ## What is the OR for being infected based on gender? gend.tab <- xtabs(~gender + reported.case, data = riskdat) odds.ratio(gend.tab) # odds of females being infected is .75 that of males, but this is not # significant because the confidence intervals contain 1. ## What is the OR for being infected based on accomodation? xtabs(~where.stay + reported.case, data = riskdat) ## To many different places people stayed, why don't we compare AIMS ## to everywhere else by creating a logical vector stating whether participants were ## housed at aims or not. aims <- grepl("aims", riskdat$where.stay, ignore.case = TRUE) odds.ratio(xtabs(~aims + reported.case, data = riskdat)) ## odds of being infected if you weren't at AIMS was 2x bigger than if ## you were at AIMS, but this is not significant because the ## confidence intervals contain 1 ## What is the OR by age? riskdat$age.cat ## though it is called age category, it is represented by a number ## which corresponds to increasing age. We can treat this as a ## continuous variable (which makes the assumptioin that ## OR(1:2)=OR(2:3)=OR(3:4)=OR(4:5) and then fit a logistic model: logit.mod <- glm(reported.case ~ age.cat, data = riskdat, family=binomial) summary(logit.mod) coef(logit.mod) # intercept and logOR confint(logit.mod) # gives the confidence intervals for the logOR exp(coef(logit.mod)["age.cat"]) # OR exp(confint(logit.mod)["age.cat",]) # gives the confidence intervals for OR ## So there's a 0.64 OR by age meaning that with each increase in age ## category there is a .64 lower odds of being infected, but since the ## confidence interval overlaps 1 this is not significant. ## We could also have done this with the package epicalc, but its good ## to see the calculations done above. library(epicalc) cc(riskdat$reported.case, riskdat$gender, graph = FALSE) odds.ratio(gend.tab) exp(confint(glm(reported.case ~ gender, family = binomial, data = riskdat))) ## Note that the confidence intervals are slightly different, this is ## because they use different approximation methods for calculating ## the confidence interval. confint() uses profile likelihood ## confidence intervals which are the best option. ###################################################################### ###################################################################### ## Retrospective Cohort Study ###################################################################### library(epicalc) ## In a cohort study, the population is chosen without knowing their ## disease status. Thus it is possible to use the population as the ## denominator and calculate incidence ratios and not just odds ## ratios. We do not provide a tutorial here, but encourage you to ## read epicalc's help or learn how to do Poisson regression with ## person-time offsets. ###################################################################### ###################################################################### ## 3 - Estimation of latent and infectious periods ###################################################################### lat.pers <- rgamma(50, shape = 2, rate = 5) # fake latent period data hist(lat.pers, freq = T, col = "gray", xlab = "latent period", main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers))) ## create a function that calculates the negative log-likelihood of a ## pair of gamma distribution parameters. We optimize over the log of ## the shape and rate parameters because then optimize can search ## through all real numbers. gamnll <- function(pers = lat.pers, params = c(log.shape = NULL, log.rate = NULL)) { nll <- sum(-dgamma(pers, shape = exp(params["shape"]), rate = exp(params["rate"]), log = TRUE)) return(nll) } ## Try out a few to get an idea of how they compare hist(lat.pers, freq = F, col = "gray", xlab = "latent period", main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers)), ylim = c(0,3)) shape.examples <- c(2,1,5) # example shape parameters to calculate NLL rate.examples <- c(5,3,3) # example rate parameters to calculate NLL cols <- rainbow(length(shape.examples)) for(ii in 1:length(shape.examples)) # for each pair of shape & rate parameters { curve(dgamma(x,shape.examples[ii],rate.examples[ii]), # draw gamma pdf add = T, col = cols[ii]) ## calculate negative log likelihood and write it on the plot tempnll <- gamnll(params=c(shape =log(shape.examples)[ii], rate=log(rate.examples)[ii])) mtext(signif(tempnll,2), side = 3, adj = .55, col = cols[ii], line = -3-ii) } ## title the negative log likelihoods on the plot, the smallest one ## means the best fit mtext("-log likelihoods:", side = 3, adj = .55, col = "black", line = -3) ## Now let's optimize gamnll to give the lowest NLL init.params <- log(c(shape = 1, rate = 1)) opt <- optim(init.params, gamnll, pers = lat.pers) opt$par hist(lat.pers, freq = F, col = "gray", xlab = "latent period", main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers)), ylim = c(0,3)) curve(dgamma(x,exp(opt$par["shape"]),exp(opt$par["rate"])), # draw gamma pdf add = T, col = "black") ###################################################################### ## Now do it with the real infectious period data from MMF 2011 ###################################################################### ## Change below to the directory where you saved Dataset S7: setwd("~/Documents/R files/MMF/MMFsupinfo/") ## 2011 epidat <- read.csv("Dataset_S7_infper.csv") ## Look at the distribution first hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours", main = "infectious period distribution of MMF outbreak, 2011") ## There is a clear outlier from someone who never recovered, so we ## remove it. epidat$Duration[epidat$Duration>100] <- NA ## Redo the histogram hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours", main = "infectious period distribution of MMF outbreak, 2011") ## Get rid of NAs to feed our optim function durs <- as.vector(na.omit(epidat$Duration)) opt <- optim(init.params, gamnll, pers = durs) ## Plot resulting distribution on the data hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours", freq = F, main = "infectious period distribution of MMF outbreak, 2011") curve(dgamma(x,exp(opt$par["shape"]),exp(opt$par["rate"])), # draw gamma pdf add = T, col = "black") ## Get the average infections period from fitted gamma and from data exp(opt$par)[1]/exp(opt$par)[2] #mean of the gamma distribution is the shape/rate mean(epidat$Duration, na.rm=T) ###################################################################### ###################################################################### ## 4 - Stochastic simulation of outbreaks using fitted parameters ###################################################################### ## To do this participants simply need to repeat what the organizers ## do in (1) above with the estimated parameters. It ## should be an interesting exercise for students to compare the ## fitted parameters with to their true values. There are many ## reasons they may differ (individuals incorrectly following ## directions, newly infected individuals overly excited to infect new ## people and doing so right away, etc...) ## Other things that could be examined include the effectiveness of ## vaccination vs quarantine vs treatment. ###################################################################### ## This code shows how to look at the distribution of final epidemic ## size vs initial proportion vaccinated. This uses a chain binomial ## algorithm for simulation. For an alternative, continuous time ## algorithm (that is computationally faster for smaller populations) ## see the script to produce Figure 3 in supporting information file ## Text_S3_Rcode_figs.txt ### vacc.prop.seq <- seq(0,1,by = .05) init.infect <- 4 num.sims <- 100 ## Set up matrix to fill in final epidemic size for each simulation ## for each vaccination proportion cum.inc.matrix <- matrix(init.infect, num.sims, length(vacc.prop.seq)) N <- 40 # number participants R0 <- 3.5 # R0 infper <- 1 # infectious period in days beta <- R0/(N*infper) # force of infection step <- .1 # time steps (in days) timeseq <- seq(0, 14, by = step) # discrete time intervals to simulate for(vv in 1:length(vacc.prop.seq)) { vacc.prop <- vacc.prop.seq[vv] # immune proportion at the beginning print(paste("Starting simulations for a vaccinated proportion of",vacc.prop)) init.immune <- round(vacc.prop*N) # initial immune individuals for(ss in 1:num.sims) # do num.sims outbreaks { sim <- data.frame(S = N - init.infect - init.immune, I = init.infect, R = init.immune) for(ii in 2:length(timeseq)) # run through time series { p.trans <- 1 - exp(-beta*step*sim$I[ii-1]) # probability of S -> I per unit S new.inf <- rbinom(1, sim$S[ii-1], p.trans) # number new infections p.recov <- 1 - exp(-1/infper*step) # probability of I -> R new.recov <- rbinom(1, sim$I[ii-1], p.recov) temp.S <- sim$S[ii-1] - new.inf temp.I <- sim$I[ii-1] + new.inf - new.recov temp.R <- sim$R[ii-1] + new.recov sim <- rbind(sim, c(temp.S,temp.I,temp.R)) cum.inc.matrix[ss,vv] <- cum.inc.matrix[ss,vv]+new.inf } # end time loop } # end sim loop } # end vacc loop par(mar=c(5,4,2,2)) boxplot(cum.inc.matrix, xlab = "initial vaccinated proportion", ylab = "final size of epidemic", ylim = c(0,N), xaxt="n") axis(1, at = 1:length(vacc.prop.seq), labels = vacc.prop.seq)