#This file contains functions to assess the behavior of
#logistic regression models for case-control GWAS when a covariate is available.

#The file 'log_regression_covariate_examples.R' explains how to use these functions.

#For theoretical details, see
#"Including known covariates can reduce power to detect genetic effects in case-control studies."
#By Matti Pirinen, Peter Donnelly and Chris Spencer
#Nat Genet (2012)

#Written by
#Matti Pirinen, February 2012
#matti.pirinen@iki.fi

#
##
###
######## BINARY COVARIATE #########
###
##
#

model.1.loglkhood<-function(par,p,q,phi){
#negative log-likelihood function for logistic regression with
#base-line parameter (par[1]) and
#additive effect for genotype (par[2])
#the log-likelihood is evaluated at the expected counts
#in cases defined by 'p' and in controls defined by 'q'
#and with the case proportion 'phi'

#INPUT
#par, vector of length 2 giving the argument for the log-likelihood
#p, genotype frequencies in cases (vector of length 3)
#q, genotype frequencies in controls (vector of length 3)
#phi, proportion of cases in the sample

#OUTPUT
#minus the log-likelihood at the given argument

  N=100000 #assuming sample size N=100000 as only the shape matters here, KEEP SAME AS IN gradient function below

  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9)
  stopifnot(phi>0 & phi<1)
  stopifnot(length(par)==2)

  A=par[1]+c(0,1,2)*par[2] #log-odds for the three genotypes
  -N*(phi*sum(p*log(1/(1+exp(-A))))+(1-phi)*sum(q*log(1/(1+exp(A)))))
}

model.1.gradient<-function(par,p,q,phi){
#gradient of the negative log-likelihood function

#INPUT see model.1.loglkhood
#OUTPUT two dimensional gradient vector

  N=100000 #KEEP SAME AS IN the loglkhood function above

  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9)
  stopifnot(phi>0 & phi<1)
  stopifnot(length(par)==2)

  A=par[1]+c(0,1,2)*par[2] #log-odds for the three genotypes
  -N*c(sum((phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
       sum(c(0,1,2)*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))))
}

model.2.loglkhood<-function(par,p,q,phi){
#negative log-likelihood function for logistic regression with
#base-line parameter (par[1]) and
#additive effect for genotype (par[2]) and
#additive effect for binary covariate (par[3])
#the log-likelihood is evaluated at the expected counts
#in cases defined by 2x3 matrix 'p' and in controls defined by 2x3 matrix 'q'
#and with the case proportion 'phi'

#INPUT
#par, vector of length 3 giving the argument for the log-likelihood
#p, genotype x covariate frequencies in cases (matrix of dimension 2x3)
#q, genotype x covariate frequencies in controls (matrix of dimension 2x3)
#phi, proportion of cases in the sample

#OUTPUT
#minus the log-likelihood at the given argument

  N=100000 #assuming sample size N=100000 as only the shape matters here, KEEP SAME AS IN gradient function below

  stopifnot(sum(dim(p)==c(2,3))==2)
  stopifnot(sum(dim(q)==c(2,3))==2)
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9)
  stopifnot(phi>0 & phi<1)

  A=par[1]+matrix(c(0,par[2],2*par[2],par[3],par[2]+par[3],2*par[2]+par[3]),byrow=TRUE,nrow=2) #log-odds for the six covariate x genotype classes
  -N*(phi*sum(p*log(1/(1+exp(-A))))+(1-phi)*sum(q*log(1/(1+exp(A)))))
}

model.2.gradient<-function(par,p,q,phi){
#gradient of the negative log-likelihood function

#INPUT see model.2.loglkhood
#OUTPUT three dimensional gradient vector

  N=100000 #keep same as in the loglkhood function above

  stopifnot(sum(dim(p)==c(2,3))==2)
  stopifnot(sum(dim(q)==c(2,3))==2)
  stopifnot(abs(1-sum(p))<1e-9 & abs(1-sum(q))<1e-9)
  stopifnot(phi>0 & phi<1)

  A=par[1]+matrix(c(0,par[2],2*par[2],par[3],par[2]+par[3],2*par[2]+par[3]),byrow=TRUE,nrow=2) #log-odds for the six covariate x genotype classes
  G=matrix(c(0,1,2,0,1,2),byrow=TRUE,nrow=2) #genotype value in each class
  X=matrix(c(0,0,0,1,1,1),byrow=TRUE,nrow=2) #covariate value in each class
  -N*c(sum((phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
       sum(G*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))),
       sum(X*(phi*p-(1-phi)*q*exp(A))/(1+exp(A))))
}
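#A minimal usage sketch (hypothetical frequencies, not from 'log_regression_covariate_examples.R'):
#the log-likelihood and gradient functions above are meant to be passed to optim(), e.g.
# p=c(0.49,0.42,0.09) #expected genotype frequencies in cases (must sum to 1)
# q=c(0.64,0.32,0.04) #expected genotype frequencies in controls (must sum to 1)
# optim(c(0,0),fn=model.1.loglkhood,gr=model.1.gradient,p,q,0.5,method='BFGS')$par
#returns the maximum likelihood estimates of the base-line and per-allele log-odds at these
#expected counts with case proportion 0.5; binary.covariate() below uses the same pattern.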
binary.covariate<-function(K,freq.G,or.G,freq.X,or.X,ncases,ncontrols,population.controls=FALSE){
#Matti Pirinen, February 2012
#Finds the population parameters for a given set of disease parameters.

#INPUT
#K, the (target) prevalence of the disease
#freq.G, frequency of the risk allele in the general population
#or.G, odds-ratio for each copy of the risk allele
#freq.X, frequency of the risk variant of the binary exposure
#or.X, odds-ratio of X
#ncases, number of cases in the case-control sample (needed for variance computation)
#ncontrols, number of controls in the case-control sample (needed for variance computation)
#population.controls, boolean, if TRUE then controls have general population frequencies, otherwise
#                     controls have proper control frequencies

#OUTPUT
#K, the prevalence of the disease (will be close to the target prevalence given as input)
#ncases, number of cases for which the standard error was computed
#ncontrols, number of controls for which the standard error was computed
#pop.a, logistic regression base-line parameter in the population (NOT in the case-control sample!)
#case.freq, 2x3 matrix of case frequencies, element i,j is for X=i-1, G=j-1
#control.freq, 2x3 matrix of control frequencies, element i,j is for X=i-1, G=j-1
#population.controls, if TRUE, then control.freq are population frequencies
#marg.or.G, marginal odds-ratio for each copy of the risk allele (i.e. effect of G omitting X from the model)
#joint.or.G, odds-ratio for each copy of the risk allele when adjusted for covariate X
#marg.or.X, marginal odds-ratio for the risk exposure X=1 (i.e. effect of X omitting G from the model)
#joint.or.X, odds-ratio for the risk exposure X=1 when adjusted for genotype G
#marg.se.G, standard error of marg.or.G
#joint.se.G, standard error of joint.or.G
#NCP.GX.G, ratio of the non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper)

  stopifnot(K>0 & K<1)
  stopifnot(freq.G>0 & freq.G<1)
  stopifnot(freq.X>0 & freq.X<1)
  stopifnot(or.G>0 & or.X>0)
  stopifnot(ncases>0 & ncontrols>0)
  stopifnot(is.logical(population.controls))

  fx=freq.X
  fg=freq.G

  #there are six classes of individuals (X=0,1 x G=0,1,2)
  #the frequencies of the classes (0,0),(0,1),(0,2),(1,0),(1,1),(1,2) in the general population are
  pop.freq=c((1-fx)*(1-fg)^2,(1-fx)*2*fg*(1-fg),(1-fx)*fg^2,fx*(1-fg)^2,fx*2*fg*(1-fg),fx*fg^2)

  #the additive contributions to the log-odds of having the disease for these classes are
  a=c(0,log(or.G),2*log(or.G),log(or.X),log(or.X)+log(or.G),log(or.X)+2*log(or.G))

  #choose the base-line log-odds alpha in such a way that the population prevalence of the disease is approximately K
  alpha.found=0;iter=1;
  alpha=seq(-30,10,by=0.001)
  while(!alpha.found & iter<100){
    K.table=rep(NA,length(alpha))
    for(i in 1:length(alpha)){K.table[i]=sum(1/(1+exp(-(alpha[i]+a)))*pop.freq)} #this is the prevalence for alpha[i]
    i=which.min(abs(K-K.table))
    if(abs(K-K.table[i])<0.01*K){alpha.found=1;K=K.table[i];alpha=alpha[i];} #you can choose the relative error tolerance, here 0.01
    if(!alpha.found){
      if(i==1) alpha=seq(-30-iter*40,10-iter*40,by=0.001) #shift the grid downwards
      if(i==length(alpha)) alpha=seq(-30+iter*40,10+iter*40,by=0.001) #shift the grid upwards
      if(i>1 && i<length(alpha)) alpha=seq(alpha[i-1],alpha[i+1],by=0.001/10^iter) #refine the grid around the current best value
      iter=iter+1
    }
  }
  stopifnot(alpha.found)

  #probability of the disease in each of the six classes
  risk=1/(1+exp(-(alpha+a)))

  #expected covariate x genotype frequencies in cases and in controls (element i,j is for X=i-1, G=j-1)
  p=matrix(pop.freq*risk/K,byrow=TRUE,nrow=2)
  if(population.controls){
    q=matrix(pop.freq,byrow=TRUE,nrow=2) #controls are from the general population
  }else{
    q=matrix(pop.freq*(1-risk)/(1-K),byrow=TRUE,nrow=2) #proper control frequencies
  }

  N=ncases+ncontrols
  phi=ncases/N #proportion of cases in the case-control sample

  #odds-ratios for G and X, adjusted for each other, as maximum likelihood estimates from the corresponding logistic regression model:
  optim(c(alpha,log(or.G),log(or.X)),fn=model.2.loglkhood,gr=model.2.gradient,p,q,phi,method='BFGS')->optim.res
  stopifnot(optim.res$convergence==0)
  joint.or.G=exp(optim.res$par[2])
  joint.or.X=exp(optim.res$par[3])

  #marginal odds-ratio for X
  marg.or.X=sum(p[2,])/sum(q[2,])*sum(q[1,])/sum(p[1,])

  #marginal odds-ratio for G as the maximum likelihood estimate from the corresponding logistic regression model:
  optim(c(alpha,log(or.G)),fn=model.1.loglkhood,gr=model.1.gradient,colSums(p),colSums(q),phi,method='BFGS')->optim.res
  stopifnot(optim.res$convergence==0)
  marg.or.G=exp(optim.res$par[2])

  #phi.0 = proportion of cases given X=0
  phi.0=sum(p[1,])*phi/(sum(q[1,])*(1-phi)+sum(p[1,])*phi)
  #phi.1 = proportion of cases given X=1
  phi.1=sum(p[2,])*phi/(sum(q[2,])*(1-phi)+sum(p[2,])*phi)

  #assuming small genetic effects (see Genet Epidemiol 35:278-290),
  #the asymptotic variance is 1/(N*var(genotype)*phi*(1-phi)) when omitting X
  t=phi*p+(1-phi)*q #frequencies in the case-control sample
  var.geno=sum(t[,2])+4*sum(t[,3])-(sum(t[,2])+2*sum(t[,3]))^2
  var.G=1/(N*var.geno*phi*(1-phi))

  #and 0.5*harmonic mean of the subgroup variances when stratifying by X
  N.0=ncases*sum(p[1,])+ncontrols*sum(q[1,])
  N.1=N-N.0
  var.geno.0=(t[1,2]+4*t[1,3])/sum(t[1,])-((t[1,2]+2*t[1,3])/sum(t[1,]))^2
  var.geno.1=(t[2,2]+4*t[2,3])/sum(t[2,])-((t[2,2]+2*t[2,3])/sum(t[2,]))^2
  var.GX=1/(N.0*phi.0*(1-phi.0)*var.geno.0+N.1*phi.1*(1-phi.1)*var.geno.1)

  NCP.ratio=(log(joint.or.G)^2/var.GX)/(log(marg.or.G)^2/var.G)

  return(list(K=K,ncases=ncases,ncontrols=ncontrols,pop.a=alpha,case.freq=p,control.freq=q,
              population.controls=population.controls,
              marg.or.G=marg.or.G,joint.or.G=joint.or.G,marg.or.X=marg.or.X,joint.or.X=joint.or.X,
              marg.se.G=sqrt(var.G),joint.se.G=sqrt(var.GX),NCP.GX.G=NCP.ratio))
}
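#A minimal usage sketch of binary.covariate (hypothetical parameter values):
# res=binary.covariate(K=0.01,freq.G=0.3,or.G=1.2,freq.X=0.5,or.X=2.0,ncases=2000,ncontrols=2000)
# res$NCP.GX.G #SSM: values below 1 mean that adjusting for X reduces the power to detect G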
#
##
###
######### CONTINUOUS COVARIATE
###
##
#

K.integrand<-function(x,a,b,c,fg){
#evaluates the following function at point x:
#sum_{g=0,1,2} {exp(a+b*g+c*x)/(1+exp(a+b*g+c*x))}*pi(g)*dnorm(x),
#where pi=((1-fg)^2,2*fg*(1-fg),fg^2) is the vector of genotype frequencies in H-W equilibrium

#INPUT
#x, point at which the function is evaluated
#a,b,c, parameters of the logistic regression model
#fg, frequency of allele 1

  stopifnot(fg>0 & fg<1)
  return(((1-fg)^2/(1+exp(-a-c*x))+2*fg*(1-fg)/(1+exp(-a-b-c*x))+fg^2/(1+exp(-a-2*b-c*x)))*dnorm(x))
}

G.integrand<-function(x,a,b,c,g,case){
#evaluates
#dnorm(x)*exp(a+b*g+c*x)/(1+exp(a+b*g+c*x)), if case==1
#dnorm(x)/(1+exp(a+b*g+c*x)), if case==0

  stopifnot(case==0 || case==1)
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x) )
}

Gx.integrand<-function(x,a,b,c,g,case){
#as G.integrand but multiplied by x
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x)*x )
}

Gx2.integrand<-function(x,a,b,c,g,case){
#as G.integrand but multiplied by x^2
  return( ( 1-case+(-1)^(1-case)/(1+exp(-a-b*g-c*x)) )*dnorm(x)*x^2 )
}

gaussian.covariate<-function(K,freq.G,or.G,or.X,ncases,ncontrols,nsimulations=1000,population.controls=FALSE,x.lower=-6,x.upper=6,print.plots=FALSE){
#Matti Pirinen, February 2012

#INPUT
#K, the (target) prevalence of the disease
#freq.G, frequency of the risk allele in the general population
#or.G, odds-ratio for each copy of the risk allele
#or.X, odds-ratio per each std deviation of the continuous covariate X
#ncases, number of cases in the case-control sample
#ncontrols, number of controls in the case-control sample
#nsimulations, number of data sets simulated to get the parameter estimates
#population.controls, boolean, if TRUE then controls have general population frequencies, otherwise
#                     controls have proper control frequencies
#x.lower, the lower limit of the integration interval for X, on log-odds scale
#x.upper, the upper limit of the integration interval for X, on log-odds scale
#print.plots, boolean, if TRUE prints the densities of X in each of the six phenotype x genotype groups,
#             and compares the cdfs of those distributions to gaussians with the same mean and std

#OUTPUT
#K, the prevalence of the disease (will be close to the target prevalence given as input)
#ncases, number of cases for which the standard error was computed
#ncontrols, number of controls for which the standard error was computed
#sample.a, the value of the base-line parameter 'a' in the case-control sample
#case.G.freq, the genotype frequencies in cases (0,1,2)
#case.X.means, the means of X in cases for each genotype group (0,1,2)
#case.X.ses, the standard deviations of X in cases for each genotype group (0,1,2)
#control.G.freq, the genotype frequencies in controls (0,1,2)
#control.X.means, the means of X in controls for each genotype group (0,1,2)
#control.X.ses, the standard deviations of X in controls for each genotype group (0,1,2)
#population.controls, if TRUE, then controls are from the general population
#marg.or.X, marginal odds-ratio of X per each stdev (i.e. the effect of X omitting G from the model)
#joint.or.X, odds-ratio of X per each stdev when adjusted for genotype G
#marg.or.G, marginal odds-ratio for each copy of the risk allele (i.e. the effect of G omitting X from the model)
#joint.or.G, odds-ratio for each copy of the risk allele when adjusted for covariate X
#marg.se.G, standard error of marg.or.G
#joint.se.G, standard error of joint.or.G
#NCP.GX.G, ratio of the non-centrality parameters for X-adjusted G to marginal G (called SSM in the paper)

  stopifnot(K>0 & K<1)
  stopifnot(freq.G>0 & freq.G<1)
  stopifnot(or.G>0 & or.X>0)
  stopifnot(ncases>0 & ncontrols>0)
  stopifnot(nsimulations>0)
  stopifnot(x.lower<x.upper)

  res=matrix(NA,nrow=nsimulations,ncol=7) #glm estimates from the simulated data sets
  for(iter in 1:nsimulations){

    glm(y~g,family="binomial")->glm.fit
    summary(glm.fit)$coefficients[2,c(1,2)]->res[iter,1:2]
    glm(y~g+x,family="binomial")->glm.fit
    summary(glm.fit)$coefficients[3,1]->res[iter,3]
    summary(glm.fit)$coefficients[2,c(1,2)]->res[iter,4:5]
    glm(y~x,family="binomial")->glm.fit
    summary(glm.fit)$coefficients[2,1]->res[iter,6]
    summary(glm.fit)$coefficients[1,1]->res[iter,7] #a parameter in the case-control sample
  }

  return(list(K=K,ncases=ncases,ncontrols=ncontrols,sample.a=median(res[,7]),
              case.G.freq=p,case.X.means=means[1,],case.X.ses=ses[1,],
              control.G.freq=q,control.X.means=means[2,],control.X.ses=ses[2,],
              population.controls=population.controls,
              marg.or.X=exp(mean(res[,6])),joint.or.X=exp(mean(res[,3])),
              marg.or.G=exp(mean(res[,1])),joint.or.G=exp(mean(res[,4])),
              marg.se.G=median(res[,2]),joint.se.G=median(res[,5]),
              NCP.GX.G=(mean(res[,4])/median(res[,5])*median(res[,2])/mean(res[,1]))^2))
}
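#A minimal usage sketch of gaussian.covariate (hypothetical parameter values):
# res=gaussian.covariate(K=0.01,freq.G=0.3,or.G=1.2,or.X=2.0,ncases=2000,ncontrols=2000,nsimulations=100)
# c(marg=res$marg.or.G,joint=res$joint.or.G,SSM=res$NCP.GX.G)
#the returned means and standard deviations of X are in the format expected by analytic.var.ratio.normal() below.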
phi.1.minus.phi.integrand<-function(x,a,c,mean,sd){
#evaluates phi(x)*(1-phi(x))*theta(x), where phi(x)=exp(a+c*x)/(1+exp(a+c*x))
#and theta is the density of a normal distribution with the given mean and sd,
#see subsection "General case" in SoM

  return( exp(a+c*x)/(1+exp(a+c*x))^2*dnorm(x,mean=mean,sd=sd) )
}

analytic.var.ratio.normal<-function(a,c,phi,case.G.freq,case.X.means,case.X.ses,control.G.freq,control.X.means,control.X.ses,x.lower=-6,x.upper=6){
#evaluates formula (10) in Online Methods for a single continuous covariate which has a normal distribution
#in all six groups of individuals

  stopifnot(length(case.G.freq)==3)
  stopifnot(length(case.X.means)==3)
  stopifnot(length(case.X.ses)==3)
  stopifnot(abs(sum(case.G.freq)-1)<1e-4)
  stopifnot(length(control.G.freq)==3)
  stopifnot(length(control.X.means)==3)
  stopifnot(length(control.X.ses)==3)
  stopifnot(abs(sum(control.G.freq)-1)<1e-4)
  stopifnot(x.lower<x.upper)
  stopifnot(phi>0 & phi<1)

  v=0.0
  for(g in 1:3){
    v=v+(1-phi)*control.G.freq[g]*integrate(phi.1.minus.phi.integrand,lower=x.lower,upper=x.upper,a=a,c=c,mean=control.X.means[g],sd=control.X.ses[g])$value
  }
  for(g in 1:3){
    v=v+phi*case.G.freq[g]*integrate(phi.1.minus.phi.integrand,lower=x.lower,upper=x.upper,a=a,c=c,mean=case.X.means[g],sd=case.X.ses[g])$value
  }
  return(v/phi/(1-phi))
}
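#A minimal usage sketch of analytic.var.ratio.normal, fed with the output of gaussian.covariate
#(hypothetical parameter values; with ncases=ncontrols the case proportion phi is 0.5):
# g.res=gaussian.covariate(K=0.01,freq.G=0.3,or.G=1.2,or.X=2.0,ncases=2000,ncontrols=2000)
# analytic.var.ratio.normal(a=g.res$sample.a,c=log(g.res$joint.or.X),phi=0.5,
#                           g.res$case.G.freq,g.res$case.X.means,g.res$case.X.ses,
#                           g.res$control.G.freq,g.res$control.X.means,g.res$control.X.ses)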