#The code is for the implementation of methods described in paper: #Gene Set Enrichment Analysis for non-monotone association and multiple experimental categories #By Rongheng Lin, Shuangshuang Dai, Richard D Irwin, Alexandra N. Heinloth, Gary Boorman and Leping Li #The research was supported by the Intramural Research Program of the NIH, National Institute of Environmental Health Sciences. #Maintained by Rongheng Lin #Contact: Rongheng Lin #Division of Biostatistics and Epidemiology #University of Massachusetts #715 North Pleasant Street #Amherst, MA 01003 #Email address: rlin **AT** schoolph.umass.edu #Last update: August 9, 2007 #The file includes following functions: #1. rsq() :calculate R-squared function #2. gseascore() :calculate GSEA score with R-squared from rsq() function #3. comb() :combine p-values of separate analyses and select sets in the way described in Lin.et.al(2007). #=================================The beginning of R-squared calculation function=========================================================== #rsqhy2(): The function to get R-square, internal use only rsqhy2<-function(hat,y){#Warning: require y to be standardized before usage. f<-hat%*%y sum(f^2)/(length(y)-1) } #stdf(): The function to standardize a vector, internal use only stdf<-function(x){(x-mean(x))/sd(x)} rsq<-function(expr,pheno, permuteMatrix=NULL, degree=5, ...){ require(splines) x<-stdf(expr) y<-stdf(pheno) if (is.null(permuteMatrix)) permuteMatrix<-matrix(1:length(y), nrow=1) rsq.v<-vector(length=nrow(permuteMatrix)) tryx<-try(ns(x,df=degree), TRUE) if (attr(tryx, "class")[1] == "try-error"){stop("ns() error; expr might not have enough unique values for setting spline knots")} else { xk<-cbind(1,tryx) qx<-qr(xk) Qt<-t(qr.Q(qx)) hat<-crossprod(Qt,Qt) for (i.permute in 1:nrow(permuteMatrix)){ rsq.v[i.permute]<-rsqhy2(hat,y[permuteMatrix[i.permute,]])} return(rsq.v) } } #expr :gene expression levels of a specific gene, length(expr) is the number of samples #pheno : phenotypic endpoints, assumed to be continuous #permuteMatrix : Each row of the matrix is a permutation of 1:length(expr). It should be given when permutation method is used to estimate the null distribution of R-squared. The default value is NULL, where no permutation will be done. #degree : degree of freedom to be used in ns() function for setting up the splines #... : Other parameters to be passed to natual cubic splines setting function ns() in package ``splines''. See help for ns() #Other details :The routine recycling the hat matrix between permutations to save computation time. #Returned value :R-squared values #Examples: #onegene<-rnorm(30)#generate expression levels of one gene, assuming N(0,1) #onepheno<-3*onegene^2+rnorm(30)#generate phenotypic endpoint #onepm<-matrix(nrow=51,ncol=30)#declare permutation matrix #onepm[1,]<-1:30#let first permutation be itself. #for (i.pm in 2:51){onepm[i.pm,]<-sample(1:30,30)}#each row is a permutation of 1:30 #rsq(expr=onegene, pheno=onepheno)#no permutation #rsq(expr=onegene, pheno=onepheno,permuteMatrix=onepm)#With permutation #================================= The end of R-squared function=========================================================== #=================================The beginning of score function=========================================================== #gseascore function gseascore<-function(rsqvec,genesetindex, logi.abs=F, logi.miss=T){ pcomb<-vector(length=length(rsqvec))#pcomb is either the value of phit or pmiss depending on whether a gene is in the geneset if (logi.miss){ missing.in.geneset<-is.na(rsqvec[genesetindex]) missing.not.in.geneset<-is.na(rsqvec[-genesetindex]) #if the rsq value for a gene is missing, we set phit or pmiss 0. rsqvec[genesetindex][missing.in.geneset]<-0#the missing in geneset is set to zero phit<-abs(rsqvec[genesetindex]) phit<-phit/sum(phit) pcomb[genesetindex]<-phit#the missing in geneset was set to zero above rm(phit) gc() pcomb[-genesetindex]<- -1/(length(rsqvec)-length(genesetindex)-sum(missing.not.in.geneset)) pcomb[-genesetindex][missing.not.in.geneset]<-0 } else { phit<-(abs(rsqvec[genesetindex])) phit<-phit/sum(phit) pcomb[genesetindex]<-phit rm(phit) gc() pcomb[-genesetindex]<- - 1/(length(rsqvec)-length(genesetindex)) } if (logi.abs) {pcomb <- pcomb[order( abs(rsqvec), decreasing = TRUE)]} else {pcomb <- pcomb[order(rsqvec,decreasing=TRUE)] } score<-cumsum(pcomb) return(score[which.max(abs(score))]) } #rsqvec : the association measurements between genes and the endpoint, each element stands for one gene. This is different from the returned vector of function rsq(). #genesetindex : An integer vector indicating the coordinates of those genes in the set #logi.abs :a logic variable indicating whether absolute value of rsqvec should be used to rank genes. These could be desirable when association measurement, e.g., Pearson correlation, can be either positive or negative. #logi.miss : a logic variable indicating whehter missing values are expected in rsqvec. When it is TRUE, the weight of those genes with missing rsqvec values are set to zero. #Examples: #rsqvec<-runif(1000)#assuming 1000 genes totally, association measurements in (0,1) #genesetindex<-sample(1:1000,50)#50 genes assigned to the geneset #gseascore(rsqvec,genesetindex) #gseascore(rsqvec[1000:1],1001-genesetindex)#The score is same after reversing the order of genes #================================= The end of score function=========================================================== #=================================The beginning of combv() function=========================================================== combv<-function( ptable1, pt, nOutOfAll, ptable2=matrix(0,nrow=nrow(ptable1),ncol=ncol(ptable1)), columns=NULL) { if ((nrow(ptable1) != nrow(ptable2))|(ncol(ptable1) != ncol(ptable2) )) stop("ptable1 and ptable2 must match in dimension."); if ((nOutOfAll > ncol(ptable1))|(nOutOfAll <1)) stop("invalid nOutOfAll value"); # if (is.null(columns) ) columns<-1:ncol(ptable1); comb.table<-matrix(nrow=nrow(ptable1), ncol=length(columns)); combv.result<-vector(length=nrow(ptable1)); for (i.row in 1:nrow(ptable1)){ for (i.col in 1:length(columns)){ comb.table[i.row,i.col]<-max(ptable1[i.row,i.col], ptable2[i.row, i.col]) } combv.result[i.row] <- sum(comb.table[i.row,] < pt) } return(combv.result >= nOutOfAll) } #ptable1, ptable2: the tables of nominal p-values. Each element is for one gene set in one category. # nrow(ptable1) and nrow(ptable2) are the number of gene sets; # ncol(ptable1) and ncol(ptable2) are the number of categories/compounds. # ptable2 needs input only if there are two classes of categories of experimental conditions. The default value is zero matrix. #pt: the threshold of nominal p-values to be used. This is $p_0$ in Lin et.al (2007) # nOutOfAll: The number of categories among all in which we would like to see that p-values are smaller than pt. #columns: A vector indicating which columns of ptables to be included. When NULL, all columns will be used. #Returned values: A logic vector with length equal to nrow(ptable1), i.e., the number of gene sets. #Examples: # np.liver.temp<-matrix(runif(1000),nrow=125,ncol=8)#generate liver p-values matrix for illustration, assuming 125 sets, 8 compounds # np.blood.temp<-matrix(runif(1000),nrow=125,ncol=8)#generate blood p-values matrix for illustration, assuming 125 sets, 8 compounds # both.sig<-combv(ptable1=np.liver.temp, pt=0.05, nOutOfAll=6, ptable2=np.blood.temp); ## requires p<0.05 in both liver and blood for at least 6 out of ncol(np.liver.temp) compounds # liver.sig<-combv(ptable1=np.liver.temp, pt=.1, nOutOfAll=6); ## requires p<0.1 in liver for at least 6 out of ncol(np.liv.temp) compounds #================================= The end of combv() function===========================================================