# Written using Rversion 2.15.3 - security blanket # Program by Rod Walker, Biostatistician, Group Health Research Institute # Email: walker.rl@ghc.org # Required R packages (not written by Walker) library(osDesign) library(ggplot2) library(gridExtra) # See osDesign documentation for detail on that package and two-phase analysis methods. # Further, the osDesign package is geared for broader functionality in analysis of two-phase designs # than this simulation tool. As such, it is recommended that the user review the osDesign package # documentation to determine whether its suite of functions would better suit the user's needs. # Also, it is recommended that the user of this simulation tool be familiar with simulations, R, and # two-phase methodology, so as to understand the functionality, interpretation, and limitations of the code. # Finally, feel free to alter or repurpose the code as needed. # This simulation assumes a cohort study with binary exposure (X), outcome (Y), and 2 confounders (C1,C2). # Assumes true relationship dictated by logistic regression model: logit(p) = b0 + bX*X + bC1*C1 + bC2*C2 # Y is assumed to be known at phase 1 and used in the formation of phase 1 strata. # Options allow for knowledge of X, C1, C2, or surrogates (Xs, C1s, C2s) at phase 1. # Phase 2 is asssumed to collect information on X, C1, and C2, using a balanced design. # The goal of the simulation is to evaluate performance of a two-phase study design. ### BEGINNING OF INPUT SPECIFICATION myseed <- 5743 # choose seed value to be able to reproduce simulation, if needed set.seed(myseed) tolFail <- log(1000) # exclusion tolerance # NOTE: Some simulated data might yield unusally extreme model parameter estimates. # If you want to exclude these extremes, you can set this tolFail value to a large number. # For instance, setting tolFail=log(1000) tells the simulation to exclude any simulation trials # that produce an estimate for bX, bC1, or bC2 that is outside the interval [-log(1000),log(1000)], # i.e., it excludes if an odds ratio > 1000 or an odds ratio < 1/1000. Ntrials <- 10000 # number of simulation trials to run Ntot <- 150000 # Phase 1 sample size pIIsampleN <- 250 # Phase 2 sample size pY <- 0.01 # Outcome (Y) incidence pX <- 0.20 # Exposure (X) prevalence pC1 <- 0.10 # Confounder (C1) prevalence pC2 <- 0.40 # Confounder (C2) prevalence bX <- 1.00 # odds ratio for association between Y and X bC1 <- 4.00 # odds ratio for association between Y and C1 bC2 <- 2.00 # odds ratio for association between Y and C2 aC1 <- 3.00 # odds ratio for association between X and C1 aC2 <- 2.00 # odds ratio for association between X and C2 sensX <- 1.00 # sensitivity of phase 1 proxy measure of X, if exists specX <- 1.00 # specificity of phase 1 proxy measure of X, if exists sensC1 <- 0.40 # sensitivity of phase 1 proxy measure of C1, if exists specC1 <- 0.99 # specificity of phase 1 proxy measure of C1, if exists sensC2 <- 1.00 # sensitivity of phase 1 proxy measure of C2, if exists specC2 <- 1.00 # specificity of phase 1 proxy measure of C2, if exists ph2method <- c("PL") # analysis method: "PL" for profile-likelihood or "WL" for weighted-likelihood ph1svars <- c("X","C1s") # planned phase 1 stratification, in addition to Y # NOTE: Do not include Y in ph1svars above, as this program already assumes # it will be used in formation of the phase 1 strata. The possible variables # that can be included are: "X" or "Xs", "C1" or "C1s", and "C2" or "C2s". # The "s" versions of the variables are used to denote a phase 1 proxy measure. # So, for instance, if phase 1 strata were going to be formed on the basis of # known exposure X (in addition to outcome Y), then set ph1svars <- c("X"). # If phase 1 strata were going to be formed on the basis of known exposure X and # known confounder C1 (in addition to outcome Y), then set ph1svars <- c("X","C1"). # If phase 1 strata were going to be formed on the basis of known exposure X and # a proxy for confounder C1 (in addition to outcome Y), then set ph1svars <- c("X","C1s"). mypdf <- c("//HOME/walkrl3/MiniSentinel/DocumentSimulation.pdf") # path and name of output pdf report # NOTE: This is the end of the required simulation inputs; however, you may need or want to adjust # formatting, axes, etc. of the output tables and figures. Those can be adjusted in the # last section of this program, the SUMMARIZING RESULTS section. ### END OF INPUT SPECIFICATION ### BEGINNING OF SIMULATION # Setup phase 1 stratafication variables for proper formula syntax strataForm <- as.formula(paste(c("Nobs", paste(c(ph1svars, "Y"), collapse="+")), collapse="~")) NoYstrataForm <- as.formula(paste(c("Nobs", paste(ph1svars, collapse="+")), collapse="~")) # Setup initial matrices and vectors to store simulation results simsCoefSE <- matrix(NA,Ntrials,2) colnames(simsCoefSE) <- c("logbX","se") simsCIsW <- matrix(NA,Ntrials,2) colnames(simsCIsW) <- c("ci_025","ci_975") simsDatList <- as.list(rep(NA,Ntrials)) simsFail <- rep(0,Ntrials) simsMaxCoef <- rep(NA,Ntrials) # Setup alpha and beta parameters (log odds ratios) based on above inputs alpha1 <- log(c(aC1, aC2)) beta1 <- log(c(bX, bC1, bC2)) # Setup initial design matrices used by program based on above inputs var_grid <- expand.grid(X=c(0,1), C1=c(0,1), C2=c(0,1), Xs=c(0,1), C1s=c(0,1), C2s=c(0,1)) var_desmat <- as.matrix(cbind(Int=1,var_grid[,colnames(var_grid) %in% c('X','C1','C2')])) var_desmatC <- as.matrix(cbind(Int=1,var_grid[,colnames(var_grid) %in% c('C1','C2')])) var_grid$prev <- NA var_grid <- within(var_grid, { prev[C1 == 1] <- pC1 prev[C1 == 0] <- (1 - pC1) prev[C1 == 1 & C1s == 1] <- prev[C1 == 1 & C1s == 1] * sensC1 prev[C1 == 1 & C1s == 0] <- prev[C1 == 1 & C1s == 0] * (1-sensC1) prev[C1 == 0 & C1s == 0] <- prev[C1 == 0 & C1s == 0] * specC1 prev[C1 == 0 & C1s == 1] <- prev[C1 == 0 & C1s == 1] * (1-specC1) prev[C2 == 1] <- prev[C2 == 1] * pC2 prev[C2 == 0] <- prev[C2 == 0] * (1 - pC2) prev[C2 == 1 & C2s == 1] <- prev[C2 == 1 & C2s == 1] * sensC2 prev[C2 == 1 & C2s == 0] <- prev[C2 == 1 & C2s == 0] * (1-sensC2) prev[C2 == 0 & C2s == 0] <- prev[C2 == 0 & C2s == 0] * specC2 prev[C2 == 0 & C2s == 1] <- prev[C2 == 0 & C2s == 1] * (1-specC2) }) alpha1all <- c(beta0(alpha1, X=var_desmatC, N=var_grid$prev, rhoY=pX), alpha1) prevX <- (exp(var_desmatC %*% alpha1all)/(1 + exp(var_desmatC %*% alpha1all))) var_grid <- within(var_grid, { prev[X == 1] <- prev[X == 1] * prevX[X == 1] prev[X == 0] <- prev[X == 0] * (1 - prevX[X == 0]) prev[X == 1 & Xs == 1] <- prev[X == 1 & Xs == 1] * sensX prev[X == 1 & Xs == 0] <- prev[X == 1 & Xs == 0] * (1-sensX) prev[X == 0 & Xs == 0] <- prev[X == 0 & Xs == 0] * specX prev[X == 0 & Xs == 1] <- prev[X == 0 & Xs == 1] * (1-specX) }) beta1all <- c(beta0(beta1, X=var_desmat, N=var_grid$prev, rhoY=pY), beta1) var_grid$probY <- (exp(var_desmat %*% beta1all)/(1 + exp(var_desmat %*% beta1all))) var_grid$Nobs <- round(var_grid$prev * Ntot) rm(prevX,var_desmatC) # Trials start here for(j in 0:Ntrials){ # Initialize data for simulation trial Mdat <- var_grid Mdat$NY1 <- 0 Mdat$NY0 <- 0 # In first iteration, compute expected number of cases and controls # Otherwise, simulate number of cases and controls if(j==0){ Mdat$NY1 <- round(Mdat$Nobs*Mdat$probY) Mdat$NY0 <- Mdat$Nobs - Mdat$NY1 } else{ for(b in 1:nrow(Mdat)){ Mdat$NY1[b] <- rbinom(1,Mdat$Nobs[b],Mdat$probY[b]) Mdat$NY0[b] <- Mdat$Nobs[b] - Mdat$NY1[b] } } Mdat <- rbind(cbind(Mdat,Y=0),cbind(Mdat,Y=1)) Mdat$Nobs <- NULL Mdat$Nobs <- ifelse(Mdat$Y==1, Mdat$NY1, Mdat$NY0) Mdat <- Mdat[,colnames(Mdat) %in% c('X','C1','C2','Xs','C1s','C2s','Y','Nobs')] # Compute number of observations in phase 1 strata aggMdat <- aggregate(formula=strataForm, data=Mdat, FUN=sum) numstrata <- nrow(aggMdat) aggMdat$strata <- 1:numstrata # Compute the number of observations in each stratum that will be sampled for phase 2 data collection. # We assume a balanced phase 2 design (i.e., attempt to select equal numbers from each stratum). # If a stratum has less observations than were proposed to be sampled from that stratum, # then the program will redistribute the surplus of the phase 2 sample to the other strata. aggMdat$Ssize <- floor(pmin(pIIsampleN/nrow(aggMdat),aggMdat$Nobs)) extraN <- (pIIsampleN - sum(aggMdat$Ssize)) extraS <- sum((aggMdat$Nobs - aggMdat$Ssize)!=0) while(extraN>=extraS){ aggMdat$Ssize <- aggMdat$Ssize + floor(pmin(extraN/extraS,aggMdat$Nobs - aggMdat$Ssize)) extraN <- (pIIsampleN - sum(aggMdat$Ssize)) extraS <- sum((aggMdat$Nobs - aggMdat$Ssize)!=0) } # In first iteration, stop here and store the expected phase 1 and 2 counts to display in table format for report # Otherwise, continue with simulating the phase 2 data and analysis if(j==0){ mystext <- tableGrob(aggMdat, show.rownames=FALSE) } else{ # Put strata information into format in preparation for use of the tps function NoYstrata <- aggregate(formula=NoYstrataForm, data=aggMdat, FUN=sum) NoYstrata$tps_strata <- 1:nrow(NoYstrata) NoYstrata$Nobs <- NULL aggMdat <- merge(x=aggMdat, y=NoYstrata) Controls <- aggMdat[aggMdat$Y==0,] Cases <- aggMdat[aggMdat$Y==1,] nn0_tps_strata <- Controls$Nobs[with(Controls, order(tps_strata))] nn1_tps_strata <- Cases$Nobs[with(Cases, order(tps_strata))] nII0_tps_strata <- Controls$Ssize[with(Controls, order(tps_strata))] nII1_tps_strata <- Cases$Ssize[with(Cases, order(tps_strata))] aggMdat$Nobs <- NULL Mdat <- merge(x=Mdat, y=aggMdat) # Simulate selection of the phase 2 sample for(i in 1:numstrata){ Mdat$Sdrawn[Mdat$strata==i] <- rmvhyper(Mdat$Nobs[Mdat$strata==i],aggMdat$Ssize[aggMdat$strata==i]) } # Store the sample that was generated for this trial of the simulation. simsDatList[[j]] <- Mdat[Mdat$Nobs!=0,] # Expand dataset to create one record per phase 2 observation Mdat <- Mdat[rep(seq_len(nrow(Mdat)), Mdat$Sdrawn), 1:ncol(Mdat)] # Perform phase 2 analysis using weighted likelihood approach ModFit <- suppressWarnings(tps(formula=Y ~ X + C1 + C2, data=Mdat, nn0=nn0_tps_strata, nn1=nn1_tps_strata, group=Mdat$tps_strata, method=ph2method, cohort=TRUE)) # Store estimated regression coefficient of X, i.e., the log odds ratio (bX) # for the adjusted association between Y and X, along with the standard error. # Also, store the Wald-based 95% confidence interval for this log odds ratio. if(ph2method=='WL'){ tempCoefSE <- cbind(ModFit$coef,sqrt(diag(ModFit$cove)))[2,] } else{ tempCoefSE <- cbind(ModFit$coef,sqrt(diag(ModFit$covm)))[2,] } tempCIsW <- c(tempCoefSE[1]-1.96*tempCoefSE[2],tempCoefSE[1]+1.96*tempCoefSE[2]) tempMaxCoef <- max(abs(ModFit$coef[-1])) if(tempMaxCoef>tolFail){ simsFail[j] <- 1 simsMaxCoef[j] <- tempMaxCoef } else{ simsCoefSE[j,] <- tempCoefSE simsCIsW[j,] <- tempCIsW } rm(tempCoefSE,tempCIsW,tempMaxCoef) } print(j) } ### END OF SIMULATION ### BEGINNING OF SUMMARIZING RESULTS # Compute bias and MSE (on logOR and OR scales), as well as coverage probability Bias_logbX <- mean(simsCoefSE[,1]-beta1all[2],na.rm=TRUE) MSE_logbX <- mean((simsCoefSE[,1]-beta1all[2])^2,na.rm=TRUE) Bias_bX <- mean(exp(simsCoefSE[,1])-exp(beta1all[2]),na.rm=TRUE) MSE_bX <- mean((exp(simsCoefSE[,1])-exp(beta1all[2]))^2,na.rm=TRUE) CoverW <- mean((simsCIsW[,1] < beta1all[2]) & (beta1all[2] < simsCIsW[,2]),na.rm=TRUE) # Create histogram (i.e., sampling distribution) of the point estimates (for the odds ratio # of the association between Y and X) generated across all simulated trials, # along with the average (on the odds ratio scale). # You may need or want to adjust axes for display in the figure below. hplotdata <- as.data.frame(exp(simsCoefSE[,1])) colnames(hplotdata) <- c("OR") myhplot <- ggplot(hplotdata, aes(x = OR)) + geom_histogram() + xlim(0.0,2.0) + xlab("Estimated OR (between Y and X)") + ylab("Frequency") + ggtitle(paste("ORs from all simulated trials, Avg =", round(mean(exp(simsCoefSE[,1]),na.rm=TRUE),2))) # Create figure showing a sample (e.g., 25) of the 95% CIs (for the odds ratio of the association # between Y and X) generated from the simulated trials, along with the average width # of the CIs across all simulated trials (on the odds ratio scale). # You may need or want to adjust axes for display in the figure below. avgCIlength <- mean(exp(simsCIsW[,2])-exp(simsCIsW[,1]),na.rm=TRUE) Ndisp <- 25 fplotdata <- as.data.frame(exp(cbind(simsCoefSE[1:Ndisp,1], simsCIsW[1:Ndisp,1], simsCIsW[1:Ndisp,2]))) colnames(fplotdata) <- c("y","ylo","yhi") fplotdata$x <- seq(1:Ndisp) forestplotCUST <- function(d, avelen, xlab="Estimated OR (between Y and X)", ylab="Simulation trial"){ require(ggplot2) p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) + geom_pointrange() + ylim(0,2.5) + coord_flip() + geom_hline(aes(yintercept=1), lty=2) + ylab(xlab) + xlab(ylab) + ggtitle(paste("Sample CIs, Avg Width =",round(avelen,2))) return(p)} myfplot <- forestplotCUST(fplotdata,avgCIlength) # Create table showing the proportion of the 95% CIs (for the odds ratio of the association # between Y and X) across simulated trial that exclude certain values (e.g., 1.0 to 1.5). # You may need or want to change the values shown in the table below. counter <- 1 vals <- seq(1,1.5,by=0.1) coveragevals <- matrix(NA,length(vals),2) colnames(coveragevals) <- c("OR","Prop_CIs_Excluding") for(val in vals){ coverageX <- ((log(val) > simsCIsW[,1]) & (log(val) < simsCIsW[,2])) excludeX <- (1 - mean(coverageX,na.rm=TRUE)) coveragevals[counter,] <- c(val,round(excludeX,2)) counter <- counter + 1 } coveragevals myctext <- tableGrob(coveragevals, show.rownames=FALSE) # Display summary of bias and mean squared error for the estimate of the confounder adjusted # outcome/exposure association on the log odds (log_bX) and odds ratio (bX) scales. # Display 95% Wald confidence interval coverage probability for this parameter. # Display proportion of simulated trials for which extreme estimates were excluded due to exceeding tolFail. SimResults <- as.vector(round(c(log(bX), Bias_logbX, MSE_logbX, bX, Bias_bX, MSE_bX, CoverW, sum(simsFail)/Ntrials),4)) names(SimResults) <- c("True_logbX", "Bias_logbX", "MSE_logbX", "True_bX", "Bias_bX", "MSE_bX", "Wald95Coverage", "pctFailed") print(SimResults) # Generate pdf containing the tables and figures summarizing simulation results. pdf(mypdf,width=8,height=12) grid.arrange(mystext,myhplot,myfplot,myctext, nrow=2, ncol=2) dev.off() ### END OF SUMMARIZING RESULTS