## How to make epidemiological training infectious. ## Bellan et al. 2012 PLoS Biology ## ## NOTE: This file can be converted to an R script by replacing the .txt suffix with a .R ## suffix. Individual R files, additional exercises, and future code updates are available ## for download from the MMED website at: ## http://lalashan.mcmaster.ca/theobio/mmed/index.php/MMF ## ## ###################################################################### ## Figure 1 - Epidemic curve showing S, I, R_effective over time ###################################################################### ## Set working directory ## Change below to the directory where you saved Datasets S1-S4: setwd("~/Documents/MMF/MMFsupinfo/") ## Load epidemic data epidat <- read.csv("Dataset_S1_MMFlinelist2010.csv") ## Make sure that data is in POSIXct format for R to understand time epidat$begin.inf <- as.POSIXct(epidat$begin.inf) epidat$end.inf <- as.POSIXct(epidat$end.inf) ## For individuals who did not return their forms we do not know when ## they recovered. We assume that they recovered 2 days after ## infection (as this was the instructed recovery time) epidat$end.inf[is.na(epidat$end.inf)] <- epidat$begin.inf[is.na(epidat$end.inf)]+3600*48 ## Create vector of times at which any event occurred (someone became ## infected or recovered) event.times <- c(epidat$begin.inf, epidat$end.inf) event.times <- na.omit(unique(event.times)) # remove duplicates and NAs event.times <- event.times[order(event.times)] ## Create a vector of time ticks from the first to the last day for ## plotting the time series time.ticks <- seq(as.POSIXct("2010-05-24 12:00"),as.POSIXct("2010-06-01 12:00"), by = 3600*24) event.times <- c(min(time.ticks), event.times) # add a time 12 hours earlier than the epidemic ## Initialize a data frame to create an epidemic time series where ## rows relate to specific events (infections or recoveries). This ## allows calculation of how many susceptible, infected & recovered ## individuals were in the population at any given time since we know ## the true population size (all participants) dat <- data.frame(tt = event.times, sus = NA, inf = NA, rec = NA) ## All 46 people at MMED 2010 started out as susceptible dat[1,2:4] <- c(46, 0 , 0) ## For each time at which an event happened for(ii in 2:nrow(dat)) { tt <- dat$tt[ii] ## Figure out how many new infections were during that event new.inf <- sum(epidat$begin.inf == tt, na.rm = T) ## Figure out how many new recoveries were during that event new.rec <- sum(epidat$end.inf == tt, na.rm = T) ## Calculate new state variables dat$sus[ii] <- dat$sus[ii-1]-new.inf dat$inf[ii] <- dat$inf[ii-1]+new.inf-new.rec dat$rec[ii] <- dat$rec[ii-1]+new.rec } # add a last point at the end of the time.ticks to show the flatline # at the end of the epidemic dat <- rbind(dat, dat[nrow(dat),]) dat$tt[nrow(dat)] <- max(time.ticks) ## Find R0 from the empirical infectious contact distribution. The ## below file has a row for every infectious contact showing the ## individual id of the source and recipient individuals as well as if ## the infectious contact resulted in an infection (i.e. if the ## recipient was susceptible) cont <- read.csv("Dataset_S2_MMFcontacts2010.csv") ## Tabulate how many times each ID came up as a source of infection ## from the contact tracing data cont.dist <- table(factor(cont$source, levels = epidat$id)) ## Average the number of infectious contacts made by each infected ## individual to estimate R0. Note this is more representative of the ## actual R0 than the instructed value in the outbreak because it ## takes into the account that not everyone followed directions. r0est <- mean(cont.dist) ## Calculate susceptible proportion and Reffective at each time dat$susprop <- dat$sus / rowSums(dat[,2:4]) dat$reff <- r0est*dat$susprop ## Initialize pdf pdf("Bellan et al - Figure 1.pdf", width = 5.5, height = 4) layout(matrix(c(1:4),2,2)) ### Layout plot panels par(mar=c(1,4,1,2)) ### Set margins ## Plot infecteds over time plot(dat$tt, dat$inf, type="s", col="red", ylim = c(0,60), lwd = 1, bty="n", lty = 1, xlab = "", ylab = "# people", axes = F) axis(2, at = seq(0,60, by = 20)) axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = FALSE) lines(dat$tt, dat$sus, type = "s", col = "black", lwd = 1, lty = 2) #add S lines(dat$tt, dat$rec, type = "s", col = "blue", lwd = 1, lty = 3) #add R par(mar=c(4,4,1,2)) ## Change margins ## Plot Reff over time plot(dat$tt, dat$reff, type = "s", col = "purple", lty = 1, lwd = 1, xlab = "", ylab = expression(R[eff]), yaxt="n", xaxt = "n", ylim = c(.5,2), bty="n") axis.POSIXct(1, at = time.ticks, las = 2, format = "%a") axis(2, seq(.5,2,by=.5)) ## Draw a dashed line at the threshold Reffective=1 abline(h = 1, lty = 2) ## Draw an arrow pointing out where R0 is arrows(time.ticks[4], r0est, time.ticks[1], r0est, length = .2) text(time.ticks[4], r0est, pos = 4, expression(R[0])) ###################################################################### ## Repeat the above for the 2011 outbreak epidat <- read.csv("Dataset_S3_MMFlinelist2011.csv") epidat$begin.inf <- as.POSIXct(epidat$begin.inf) epidat$end.inf <- as.POSIXct(epidat$end.inf) epidat$end.inf[is.na(epidat$end.inf)] <- epidat$begin.inf[is.na(epidat$end.inf)]+3600*48 event.times <- c(epidat$begin.inf, epidat$end.inf) event.times <- na.omit(unique(event.times)) event.times <- event.times[order(event.times)] time.ticks <- seq(as.POSIXct("2011-05-29 12:00"),as.POSIXct("2011-06-07 12:00"), by = 3600*24) dat <- data.frame(tt = event.times, sus = NA, inf = NA, rec = NA) ## 4 people were exposed by the instructor but at two different ## times so we only have 2 infecteds at the first event time dat[1,2:4] <- c(57-14-2, 2 , 14) for(ii in 2:nrow(dat)) { tt <- dat$tt[ii] new.inf <- sum(epidat$begin.inf == tt, na.rm = T) new.rec <- sum(epidat$end.inf == tt, na.rm = T) dat$sus[ii] <- dat$sus[ii-1]-new.inf dat$inf[ii] <- dat$inf[ii-1]+new.inf-new.rec dat$rec[ii] <- dat$rec[ii-1]+new.rec } dat <- na.omit(dat) dat <- rbind(dat, dat[nrow(dat),]) dat$tt[nrow(dat)] <- max(time.ticks) ## R_effective panel cont11 <- read.csv("Dataset_S4_MMFcontacts2011.csv") num.inf <- as.vector(na.omit(cont11$source[!is.na(cont11$recipient)])) num.inf <- factor(num.inf, levels = as.vector(na.omit(unique(cont11[,1])))) tab <- table(num.inf) r0est <- mean(tab) dat$susprop <- dat$sus / rowSums(dat[,2:4]) dat$reff <- r0est*dat$susprop par(mar=c(1,4,1,2)) plot(dat$tt, dat$inf, type="s", col="red", ylim = c(0,60), lwd = 1, bty="n", xlab = "time (hours)", ylab = "", axes = F) axis(2, at = seq(0,60, by = 20), labels = FALSE) lines(dat$tt, dat$sus, type = "s", col = "black", lwd = 1, lty = 2) lines(dat$tt, dat$rec, type = "s", col = "blue", lwd = 1, lty = 3) axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = FALSE) legend("topright", c("Susceptible","Infected","Recovered/Immune"), lty = c(2,1,3), col = c("black","red","blue"), lwd = 1, bty = "n", cex = .8) par(mar=c(4,4,1,2)) plot(dat$tt, dat$reff, type = "s", col = "purple", lty = 1, lwd = 1, xlab = "", ylab = "", axes = F, ylim = c(0.5,2), bty="n") axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = TRUE) axis(2, at = seq(.5,2, by = .5), labels = FALSE) abline(h = 1, lty = 2) text(time.ticks[5], r0est, pos = 4, expression(R[0])) arrows(time.ticks[5], r0est, time.ticks[2], r0est, length = .2) dev.off() ## Note: Minor formatting changes were made to the figure in Adobe ## Illustrator after production in R. ###################################################################### ## Figure 2 - Odds Ratios & CIs for MMF 2011 outbreak, illustrating ## confounding bias and information bias. ###################################################################### ## Change below to the directory where you saved Dataset S5: setwd("~/Documents/R files/MMF/MMFsupinfo/") ## load data dat <- read.csv("Dataset_S5_MMFRF2011.csv") ## examine the first few rows of the data head(dat) ## The below is a function which will do a multiple logistic ## regression on a data set given the outcome (dependent variable) and ## the exposure(s) (independent variable) names which refer to columns ## in the data frame. The odds ratios and profile likelihood ## confidence intervals are then given in table or data frame format ## to be used in the figure-making code below. The subset option ## allows us to only use a particular subset of the data frame rows ## (for example, to look at how selection bias can change the results) conf.string <- function(outcome = "sero", exposure = "lastyr", subset = NULL, format.tab=F) { ## create a character string of the form "output ~ exposure1 + exposure2 mod <- paste(outcome, "~", paste(exposure, collapse = " + ")) ## do a logistic regression out <- glm(mod, dat, family = binomial, subset = subset) ## exponentiate regression coefficients to get maximum likelihood ## estimates of odds ratios or <- exp(out$coeff) ## use confint() to get profile likelihood confidence intervals ## around coefficients & exponentiate to make them on odds ratio ## scale cis <- exp(confint(out)) ## create a data frame with results frame <- data.frame(or, lower = cis[,1], upper = cis[,2])[exposure,] ## option to format the data as OR (CI) to produce a CSV type ## table. Leave False if making a figure instead if(format.tab==T){ ## only show two significant figures in CSV frame <- signif(frame, 2) frame <- data.frame(or = frame[,1], ci = paste("(",frame[,2], ", ", frame[,3],")", sep="")) } return(frame) } ## Initialize pdf pdf("Bellan et al - Figure 2.pdf", width = 4.75, height = 4) ## first, setting graphical parameters: par(mar=c(1,4,0,1)) # margins lwd <- 2 # line width pch <- 15 # point type (square) cex <- 1.5 # point size offset <- .4 # offset for arrows showing bias len <- .1 # length of arrow tips ## Initialize the plot, (not actually plotting any points due to ## type="n"). Note that y-axis is on a log scale since we are showing ## odds ratios. plot(0,.1,type="n", bty="n", log="y", ylim =c(.002,500), yaxt="n", xlim=c(0,12), xaxt="n",xlab="", ylab="Odds Ratio") ticks <- c(.01,.1, 1,10, 100) # y axis ticks axis(2, at = c(1/seq(10,100, by=10),1/1:10,2:10, seq(10,100,by=10)), labels = NA) axis(2, at = ticks, las = 2, labels = as.character(ticks)) # make y-axis abline(h=1, col ="black", lty=3, lwd=lwd) # put dotted line at null hypothesis (OR=1) xes <- c(1,5,7,11) # locations of points to plot on x-axis ## Use the function conf.string() created above to do a multiple ## logistic regression with lastyr, gender, and movie in the analysis ## even though only the former actually affects the outcome. The ## outcome (case definition) we use here is "sero" - short for serological, ## referring to the fact that we know everyone's true infection status even ## if they were asymptomatic. This type of information could be achieved in ## practice if a serological assay could identify recent infections (e.g., ## through detection of IgM antibodies). serall <- conf.string("sero",c("lastyr","gend","mov")) points(xes[1],serall$or[1], pch =pch, col ="black", cex=cex) arrows(xes[1],serall$lower[1],xes[1],serall$upper[1], col="black",angle=90, code=3, length=len, lwd=lwd) ## Information (Nondifferential misclassification) bias - Panel A ## Same as above but now the outcome is "rep" short for reported, ## indicating our case definition now only includes symptomatic ## (reported) cases. repall <- conf.string("rep",c("lastyr","gend","mov")) points(xes[2],repall$or[1], pch =pch, col ="dark gray", cex=cex) arrows(xes[2],repall$lower[1],xes[2],repall$upper[1], col="dark gray",angle=90, code=3, length=len, lwd=lwd) arrows(xes[1] + offset, serall$or[1], xes[2]-offset, repall$or[1], length =.07, col = "blue", lwd=lwd) ## Confounding - Panel B ## Now we focus on whether participants who arrived a day or ## more early to the site of the workshop were at a greater risk of ## infection. We first plot the analysis with lastyr (and other ## variables) included and find no significant effect of arriving ## early. We then do the same analysis but as a univariate regression ## and show that arriving early does significantly increase your risk ## of arriving early. This is because individuals who had attended ## the clinic last year (and were therefore immune) were less likely ## to arrive early. This is an example of confounding bias, with early ## not really having an effect but appearing to when not adjusted for, ## due to the relationship between lastyr and early as well as lastyr ## and sero. ser.arrive.multi <- conf.string("sero",c("early","lastyr","gend","mov")) points(xes[3],ser.arrive.multi$or[1], pch =pch, col ="black", cex=cex) arrows(xes[3],ser.arrive.multi$lower[1],xes[3],ser.arrive.multi$upper[1], col="black",angle=90, code=3, length=len, lwd=lwd) ser.arrive <- conf.string("sero",c("early")) points(xes[4],ser.arrive$or[1], pch =pch, col ="dark gray", cex=cex) arrows(xes[4],ser.arrive$lower[1],xes[4],ser.arrive$upper[1], col="dark gray",angle=90, code=3, length=len, lwd=lwd) ## Arrows showing bias arrows(xes[3] + offset, ser.arrive.multi$or[1], xes[4] - offset, ser.arrive$or[1], length =.07, col = "blue", lwd=lwd) dev.off() ## Note: Minor formatting changes were made to the figure in Adobe ## Illustrator after production in R. ###################################################################### ## Figure 3 - Stochastic simulation of MMF outbreaks for varying ## proportions of individuals with immunity (e.g., vaccinated) at the ## start of the outbreak. ###################################################################### ## Set local working directory ## You may want to change to the directory below: setwd("~/Documents/R files/MMF/MMFsupinfo/") pdf("Bellan et al - Figure 3.pdf", width = 5.5, height = 3) par(mar=c(4,4,1,1), mfrow=c(1,2)) ## This for() loop loops through the two panels of the plot. We write it this way since the code for each panel is basically identical except panel A only does 5 time series and B does 1000 for each of 20 vaccination proportions. When panelA is TRUE (the first time around) we get panel A, and when FALSE we get panel B. for(panelA in c(T,F)) { ## Make a sequence of vaccinated proportions to simulate outbreaks at vacc.prop.seq <- seq(0,1,by = .05) num.sims <- 1000 # number of simulations at each vaccinated proportion if(panelA) { # if panel A just do 5 sims at one vaccinated proportion vacc.prop.seq <- 0.25 num.sims <- 5 } init.infect <- 1 # initial number of infected people in the population ## Set up matrix to fill in final epidemic size for each simulation ## for each vaccination proportion N <- 57 # number participants R0 <- 1.82 # R0 for 2011 outbreak as calculated in Figure 1 code time.ticks <- seq(as.POSIXct("2011-05-29 12:00"),as.POSIXct("2011-06-07 12:00"), by = 3600*24) infper <- 1 # infectious period in days beta <- R0/(N*infper) # force of infection ## cumulative incidence matrix, fill in for each simulation. Each ## row is a simulation, each column corresponds to a different ## vaccinated proportion cum.inc.matrix <- matrix(0, num.sims, length(vacc.prop.seq)) ## for each vaccinated proportion for(vv in 1:length(vacc.prop.seq)) { vacc.prop <- vacc.prop.seq[vv] # set immune proportion at the beginning ## Give us an update on progress so we know about how long its taking. print(paste("Starting simulations for a vaccinated proportion of",vacc.prop)) # initial immune individuals, must be rounded since individuals are discrete init.immune <- round(vacc.prop*(N-init.infect)) ## Panel A if(panelA) # if plotting time series { ## initialize empty plot plot(0,0, xlim = c(min(time.ticks), max(time.ticks)), ylim = c(0,N/2), type = "n", xaxt="n", ylab="# infected", xlab = "", bty = "n") ## make time axis, label by weekday axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = TRUE) } for(ss in 1:num.sims) # do num.sims number of outbreaks at that vacc.prop { ## initialize dataframe to add rows to sim <- data.frame(tt = min(time.ticks), S = N - init.infect - init.immune, I = init.infect, R = init.immune) while(sim$I[nrow(sim)] > 0) # while there are still infected individuals { lr <- nrow(sim) #last row ## This simulation is done using the Gillespie algorithm - for more ## details on how this algorithm works, visit the MMED website's ## Tutorials page ## Total event rate event.rate <- beta*sim$I[lr]*sim$S[lr] + 1/infper*sim$I[lr] ## new time new.tt <- sim$tt[lr] + 3600*24*rexp(1, event.rate) ## if the event is a new recovery if(runif(1) < ( (1/infper*sim$I[lr]) / event.rate)) { new.ss <- sim$S[lr] new.ii <- sim$I[lr] - 1 new.rr <- sim$R[lr] + 1 cum.inc.matrix[ss,vv] <- cum.inc.matrix[ss,vv] + 1 }else{ # the event is a new infection new.ss <- sim$S[lr] - 1 new.ii <- sim$I[lr] + 1 new.rr <- sim$R[lr] } ## add new empty row and fill it in with the new time and new state variables sim <- rbind(sim, rep(NA,4)) sim$tt[lr+1] <- new.tt sim[lr+1,2:4] <- c(new.ss, new.ii, new.rr) } # end while loop ## if plotting time series, add it ot the plot if(panelA) lines(sim$tt, sim$I, type = "s", col = rainbow(num.sims)[ss]) } # end simulation loop } # vaccine prop loop } ## Panel B ## Plot results of the simulations par(mar=c(4,4,1,1)) plot(0,0, xlab = "initial proportion immune", ylab = "outbreak size", xlim = c(0,1), ylim = c(0,60), axes = F, type="n", bty = "n") axis(2, at = seq(0,60, by = 20)) axis(1, at = seq(0,1, by = .5)) ## calculate medians meds <- apply(cum.inc.matrix,2, quantile, prob = .5) ## calculate quantiles frome 0-100% by 10% quants <- apply(cum.inc.matrix,2, quantile, prob = seq(0,1, by = .1)) # for each pair of symmetric quantiles (0,100), (10,90), (20,80), (30,70), (40,60) ## create a polygon, with darker shades for simulation results closer to the median for(ii in 0:4) { polygon(c(vacc.prop.seq, rev(vacc.prop.seq)), c(quants[ii+1,], rev(quants[11-ii,])), col = gray(.7-ii/10), lty = 0) } ## Create a black line for the median lines(vacc.prop.seq, meds, lwd = 1) dev.off() ## Note that the additional labeling of the quantiles and other formatting was performed in ## Adobe Illustrator outside of R.