# Functions for two-step mixed model approach to # analyzing differential alternative RNA splicing # Updated on 5/18/2020 # Author: Huining Kang & Li Luo library(nlme) library(aod) library(gtools) library(genefilter) library(parallel) # Plot isoforms of one gene plot.gene<-function(data, gene="MYB", xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, logged2=T, ylab="RPKM in log scale", isoform.id.opt=1) # Phenotype = "OUTCOME" or "Tissue TPYE" or NULL # isoform.id.opt = Options for isoform ids: 1 = Ensembl ids, 2 = Gene symbol ids { p<-which(data$gene==gene) if(length(p)<1) { cat("Cannot find", gene, "\n") return(data) } if(!logged2) x <- log2(data$x[p, ]+1) else x <- data$x[p, ] y <- data$y if(isoform.id.opt==1) isoform.id <- as.numeric(gsub(pattern = "ENST", "", data$isoform[p])) else { tmp <- strsplit(data$isoform_name[p[1]], split = "[-]") nTmp <- length(tmp[[1]]) tmp <- strsplit(data$isoform_name[p], split = "[-]") isoform.id <- matrix(unlist(tmp), ncol=nTmp, byrow=T)[, nTmp] } num<-dim(x) p1<-which(y==1) p2<-which(y==2) mat1<-x[, p1] mat2<-x[, p2] minV<-min(c(rowMeans(mat1), rowMeans(mat2))) maxV<-max(c(rowMeans(mat1), rowMeans(mat2))) plot(c(0,1), c(minV, maxV), type="n", xlim=xlim, ylab=ylab, xaxt="n", ann=F) for( k in 1:num[1]) lines(c(0, 1), c(mean(x[k, p1]), mean(x[k, p2])), lty=k, col=k, lwd=lwd) axis(side=1, at=c(0, 1), labels = data$y.lab) title(ylab=ylab, main=paste("Isoform abundances of ", gene)) if(is.null(Phenotype))title(xlab=Phenotype) legend("topright", inset=0.02, title="Isoform", legend=isoform.id, lty=1:num[1], col=1:num[1], lwd=lwd) return(list(x=x, y=y, isoform=isoform.id, y.lab=data$y.lab)) } plot.gene.bat <- function(data=NULL, gene, xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, file="figure.pdf", pPanel=c(2, 2), width=7, height=9, isoform.id.opt=1) { nG <- length(gene) pdf(file=file, width=width, height=height) nF <- pPanel[1]*pPanel[2] for( k in 1:nG) { if((k-1) %% nF==0) par(mfrow=pPanel) plot.gene(data=data, gene=gene[k], xlim=xlim, lwd=lwd, Phenotype=Phenotype, isoform.id.opt=isoform.id.opt) } dev.off() } # Perform LRT for one gene (to be used by lapply) lrt.test.1G.4.par <- function(geneSymbol, data, VarCov.type="uneqvar", screen.type=1) { # geneSymbol <- gene[idx] tim.start <- Sys.time() cat("Working on gene", geneSymbol, "...\n") x <- data$x[data$gene == geneSymbol, ] num <- dim(x) # num[1] = number of isoforms, num[2] = total sample size nIso<-num[1] work.dat <- data.frame(Y = as.vector(t(x)), j = factor(rep(data$y, num[1])), k = factor(rep(1:num[2], num[1])), l = factor(rep(1:num[1], each=num[2]))) nC <- nlevels(work.dat$j) if(screen.type==1) df <- (nC-1)*nIso else df <- (nC-1)*(nIso-1) if(VarCov.type=="cs") { fit.full<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, method="ML"), TRUE) if(screen.type==1) fit.red<-try(lme(Y ~ l, data=work.dat, random = ~1|k, method="ML"), TRUE) else fit.red<-try(lme(Y ~ j + l, data=work.dat, random = ~1|k, method="ML"), TRUE) } else if(VarCov.type=="uneqvar") { fit.full<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE) if(screen.type==1) fit.red<- try(lme(Y ~ l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE) else fit.red<- try(lme(Y ~ j + l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE) } else # unstr { fit.full<-try(gls(Y ~ j*l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE) if(screen.type==1) fit.red<- try(gls(Y ~ l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE) else fit.red<- try(gls(Y ~ j + l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE) } tim.end <- Sys.time() tim.usd <- as.numeric(tim.end)-as.numeric(tim.start) if(!inherits(fit.full, "try-error") & !inherits(fit.red, "try-error")) rtf <- c(2*(fit.full$logLik-fit.red$logLik), df, nIso, tim.usd) else rtf <- c(NA, df, nIso, tim.usd) return(rtf) } # Perform LRT using mclapply. (using multiple cores under Linux) lrt.test.mclapply <- function(data, VarCov.type="uneqvar", nCore=NULL, outfile=NULL, screen.type=1) # data = original data in the form of list(x, y, gene, isoform, y.lab) # VarCov.type = type of variance and covariance structure. should be one of "cs", "uneqvar" or "unstr" # nCore = number of cores to be used. # outfile = filename for the output # screen.type = 1 or 2 for screening test type I and 2 # output values: lrt=lrt test statistics, nI=# of isoforms, which is also degree of freedom of LRT test. { if(is.null(nCore)) nCore <- detectCores()-1 if(Sys.info()[1]=="Windows") { nCore <- 1 cat("Parallel computing option is disabled under Windows.\n") cat("The task will take more time to complete.\n") } tim.start = as.numeric(Sys.time()) # cl <- makeCluster(nCore) gene <- unique(data$gene) nG <- length(gene) res <- mclapply(gene, lrt.test.1G.4.par, data=data, VarCov.type=VarCov.type, screen.type=screen.type, mc.cores=nCore) res <- matrix(unlist(res), ncol=4, byrow = T) rownames(res) <- gene colnames(res) <- c("lrt", "df", "nIso", "Time") num <- dim(res) p <- fdr <- rep(NA, num[1]) idx <- which(!is.na(res[, 1])) cat("\n\np-value adjustment for", length(idx), "genes\n") p[idx] <- pchisq(q=res[idx, 1], df = res[idx, 2], lower.tail = F) fdr[idx] <- p.adjust(p[idx], method = "BH") res <- cbind(res, matrix(c(p, fdr), ncol=2)) colnames(res)[5:6] <- c("p.value", "FDR") res <- res[, c(1:3, 5:6, 4)] tim.end =as.numeric(Sys.time()) cat("# genes =", nG, "\tnCore =", nCore, "\tTotal time =", tim.end-tim.start, "\n\n") if(!is.null(outfile)) save(res, file=outfile) return(res) } # This program adds p-values and FDR of LRT to the output of the above program add.p.2.lrt <- function(res=NULL, infile="data/uneqvar.RData", outfile="data/uneqvar_w_p.RData") # nC = number of conductions { if(is.null(res)) load(file=infile) num <- dim(res) p <- fdr <- rep(NA, num[1]) idx <- which(!is.na(res[, 1])) cat("Will work on", length(idx), "genes\n") p[idx] <- pchisq(q=res[idx, 1], df = res[idx, 2], lower.tail = F) fdr[idx] <- p.adjust(p[idx], method = "BH") cat("Done!\n") res <- cbind(res, matrix(c(p, fdr), ncol=2)) colnames(res)[5:6] <- c("p.value", "FDR") res <- res[, c(1:3, 5:6, 4)] if(!is.null(outfile)) save(res, file=outfile) return(res) } # Find the number of isoform for every gene of the given data set find.nL <-function(data, nCore=NULL) { if(is.null(nCore)) nCore <- detectCores()-1 if(Sys.info()[1]=="Windows") { nCore <- 1 cat("Parallel computing option is disabled under Windows.\n") cat("The task will take more time to complete.\n") } gene <- unique(data$gene) nL <- mclapply(gene, function(geneS, data) {sum(data$gene==geneS)}, data=data, mc.cores=nCore) nL <- unlist(nL) names(nL) <- gene return(nL) } # t-test/anova for stage II confirmatory test step2.Ftest <- function(data=NULL, file.data, file.step1, alpha=0.05, method="holm") # method = c("holm", "hochberg", "bonferroni") for FWER # file.data = file of the original data, e.g. "data/TYPE_data.RData" # file.step1 = name of file for the result from step 1 screening test, e.g. "output/uneqvar_type1.RData" # { if(is.null(data)) load(file=file.data) load(file=file.step1) # remove genes with missing step 1 results p <- which(!is.na(res[, 1])) res <- res[p, ] # p <- which(data$gene %in% rownames(res)) # cat("Debug", length(p), "\n") # data$x <- data$x[p, ] # data$gene <- data$gene[p] # data$isoform <- data$isoform[p] FDR <- res[, "FDR"] # print(colnames(res)) sig.gene <- rownames(res)[FDR <= alpha] R <- length(sig.gene) # number of significant genes if( R < 1) { cat("No significant genes\n") return(NULL) } else cat("Number of significant genes:", R, "\n") # M <- length(unique(data$gene)) M <- dim(res)[1] p <- which(data$gene %in% sig.gene) # cat("Debug: Total Number of genes:", M, "\n") # cat("Debug: Number of significant genes:", length(unique(data$gene[p])), "\n") # t.res <- rowFtests(x=data$x[p, ], fac=factor(data$y)) t.res <- rowttests(x=data$x[p, ], fac=factor(data$y)) FC <- logratio2foldchange(- t.res$dm) # cat("Debug: Pass\n") final.res <- data.frame(gene=data$gene[p], isoform=data$isoform[p], FC=FC, step2.p.value=t.res$p.value, adj.p=rep(NA, length(p)), threshold=rep(NA, length(p)), stringsAsFactors = F) for( k in 1:R) { p <- which(final.res$gene==sig.gene[k]) final.res$adj.p[p] <- p.adjust(final.res$step2.p.value[p], method=method) final.res$threshold[p] <- R*alpha/M * final.res$step2.p.value[p] / final.res$adj.p[p] } final.res$decision <- (final.res$adj.p <= R*alpha/M) cat("Cutoff for significant isoform is", R*alpha/M, "\n") return(final.res) } #Step 2 test for isoform difference after using lme Iso.test.lme<-function(fit, nIso) { L<-matrix(0, 1, 2*nIso) L[1, 2]<-1 pv.wald<-rep(NA, nIso) pv.wald[1]<- (wald.test(Sigma=fit$varFix,b=fit$coefficients$fixed,L=L))$result$chi2["P"] for(k in 2:nIso) { L.new<-L L.new[1, nIso+k]<-1 pv.wald[k]<- (wald.test(Sigma=fit$varFix,b=fit$coefficients$fixed,L=L.new))$result$chi2["P"] } c.effect<-fit$coefficient$fixed[2] + c(0, fit$coefficient$fixed[nIso+(2:nIso)]) return(list(pv.wald=pv.wald, effect=c.effect)) } #Step 2 test for isoform difference after using gls Iso.test.gls<-function(fit, nIso) { L<-matrix(0, 1, 2*nIso) L[1, 2]<-1 pv.wald<-rep(NA, nIso) pv.wald[1]<- (wald.test(Sigma=fit$varBeta, b=fit$coefficients, L=L))$result$chi2["P"] for(k in 2:nIso) { L.new<-L L.new[1, nIso+k]<-1 pv.wald[k]<- (wald.test(Sigma=fit$varBeta, b=fit$coefficients, L=L.new))$result$chi2["P"] } c.effect<-fit$coefficient[2] + c(0, fit$coefficient[nIso+(2:nIso)]) return(list(pv.wald=pv.wald, effect=c.effect)) } # Perform LRT for Wald-test for one gene (to be used by lapply) do.wald.test <- function(geneSymbol, data, VarCov.type="uneqvar") # lrt.test.1G.4.wald <- function(geneSymbol, data, VarCov.type="uneqvar") { # geneSymbol <- gene[idx] cat("Working on gene", geneSymbol, "...\n") x <- data$x[data$gene == geneSymbol, ] num <- dim(x) # num[1] = number of isoforms, num[2] = total sample size nIso<-num[1] work.dat <- data.frame(Y = as.vector(t(x)), j = factor(rep(data$y, num[1])), k = factor(rep(1:num[2], num[1])), l = factor(rep(1:num[1], each=num[2]))) # nC <- nlevels(work.dat$j) # if(screen.type==1) df <- (nC-1)*nIso # else df <- (nC-1)*(nIso-1) if(VarCov.type=="cs") { fit<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, method="ML"), TRUE) } else if(VarCov.type=="uneqvar") { fit<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE) } else # unstr { fit<-try(gls(Y ~ j*l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE) } # if(!inherits(fit, "try-error")) # rtf <- list(fit=fit, nIso=nIso) # else rtf <- NULL if(!inherits(fit, "try-error")) { if(VarCov.type=="unstr") r.step2<-Iso.test.gls(fit=fit, nIso=nIso) else r.step2<-Iso.test.lme(fit=fit, nIso=nIso) rtf<-list(gene=geneSymbol, r.step2=r.step2) } else rtf <- NULL rm(list="fit") return(rtf) } # Wald test for stage II confirmatory test (using multiple cores under Linux) mc.step2.Waldtest <- function(data=NULL, file.data, file.step1, alpha=0.05, VarCov.type="uneqvar", method="holm", nCore) # method = c("holm", "hochberg", "bonferroni") for FWER # file.data = file of the original data, e.g. "data/TYPE_data.RData" # file.step1 = name of file for the result from step 1 screening test, e.g. "output/uneqvar_type1.RData" { if(is.null(nCore)) nCore <- detectCores()-1 if(Sys.info()[1]=="Windows") { nCore <- 1 cat("Parallel computing option is disabled under Windows.\n") cat("The task will take more time to complete.\n") } if(is.null(data)) load(file=file.data) load(file=file.step1) # remove genes with missing step 1 results p <- which(!is.na(res[, 1])) res <- res[p, ] FDR <- res[, "FDR"] sig.gene <- rownames(res)[FDR <= alpha] R <- length(sig.gene) # number of significant genes if( R < 1) { cat("No significant genes\n") return(NULL) } else cat("Number of significant genes:", R, "\n") # M <- length(unique(data$gene)) M <- dim(res)[1] p <- which(data$gene %in% sig.gene) final.res <- data.frame(gene=data$gene[p], isoform=data$isoform[p], step2.p.value=rep(NA, length(p)), adj.p=rep(NA, length(p)), threshold=rep(NA, length(p)), stringsAsFactors = F) cat("Fitting mixed effects model. This may take a while...\n") r.step1 <- mclapply(sig.gene, do.wald.test, data=data, VarCov.type=VarCov.type, mc.cores=nCore) cat("Model fitting is completed\n") # save(r.step1, file="temp.RData") for( k in 1:R) { p <- which(final.res$gene==r.step1[[k]]$gene) final.res$step2.p.value[p] <- r.step1[[k]]$r.step2$pv.wald final.res$adj.p[p] <- p.adjust(r.step1[[k]]$r.step2$pv.wald, method=method) final.res$threshold[p] <- R*alpha/M * final.res$step2.p.value[p] / final.res$adj.p[p] } final.res$decision <- (final.res$adj.p <= R*alpha/M) cat("# Number of total genes tested:", M, "\n") cat("# Number of significant genes:", R, "\n") cat("# Cutoff for significant isoform is", R*alpha/M, "\n") cat("# Number of significant isoform:", sum(final.res$decision, na.rm = T), "\n") return(final.res) } step2.brief <- function(file.step1.type1, file.step1.type2, file.step2, alpha=0.05) # file.step1.type1 = name of file for the result from step 1 type 1 screening test, e.g. "output/uneqvar_type1.RData" # file.step1.type2 = name of file for the result from step 1 type 2 screening test, e.g. "output/uneqvar_type1.RData" # file.step2 = name of file for the result from step 2 confirmation test, e.g. "output/uneqvar_type1.RData" { load(file=file.step1.type1) load(file=file.step2) # remove genes with missing step 1 results p <- which(!is.na(res[, 1])) res <- res[p, ] M <- dim(res)[1] sig.gene <- unique(final.res1$gene) R <- length(sig.gene) # number of significant genes cat("#######################################\n") cat("# Number of total genes tested:", M, "\n") cat("# Number of significant genes:", R, "\n") cat("# Cutoff for significant isoform is", R*alpha/M, "\n") cat("# Number of significant isoform:", sum(final.res1$decision), "\n") load(file=file.step1.type2) # remove genes with missing step 1 results p <- which(!is.na(res[, 1])) res <- res[p, ] M <- dim(res)[1] sig.gene <- unique(final.res2$gene) R <- length(sig.gene) # number of significant genes cat("#######################################\n") cat("# Number of total genes tested:", M, "\n") cat("# Number of significant genes:", R, "\n") cat("# Cutoff for significant isoform is", R*alpha/M, "\n") cat("# Number of significant isoform:", sum(final.res2$decision), "\n") } # Plot isoforms of one gene plot.gene<-function(data, gene="MYB", xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, logged2=T, ylab="RPKM in log scale", isoform.id.opt=1, y.lab=NULL) # Phenotype = "OUTCOME" or "Tissue TPYE" or NULL # isoform.id.opt = Options for isoform ids: 1 = Ensembl ids, 2 = Gene symbol ids { p<-which(data$gene==gene) if(length(p)<1) { cat("Cannot find", gene, "\n") return(data) } if(!logged2) x <- log2(data$x[p, ]+1) else x <- data$x[p, ] y <- data$y if(isoform.id.opt==1) isoform.id <- as.numeric(gsub(pattern = "ENST", "", data$isoform[p])) else { tmp <- strsplit(data$isoform_name[p[1]], split = "[-]") nTmp <- length(tmp[[1]]) tmp <- strsplit(data$isoform_name[p], split = "[-]") isoform.id <- matrix(unlist(tmp), ncol=nTmp, byrow=T)[, nTmp] } num<-dim(x) p1<-which(y==1) p2<-which(y==2) mat1<-x[, p1] mat2<-x[, p2] minV<-min(c(rowMeans(mat1), rowMeans(mat2))) maxV<-max(c(rowMeans(mat1), rowMeans(mat2))) plot(c(0,1), c(minV, maxV), type="n", xlim=xlim, ylab=ylab, xaxt="n", ann=F) for( k in 1:num[1]) lines(c(0, 1), c(mean(x[k, p1]), mean(x[k, p2])), lty=k, col=k, lwd=lwd) if(is.null(y.lab)) axis(side=1, at=c(0, 1), labels = data$y.lab) else axis(side=1, at=c(0, 1), labels = y.lab) title(ylab=ylab, main=paste("Isoform abundances of ", gene)) if(!is.null(Phenotype)) title(xlab=Phenotype) legend("topright", inset=0.02, title="Isoform", legend=isoform.id, lty=1:num[1], col=1:num[1], lwd=lwd) return(list(x=x, y=y, isoform=isoform.id, y.lab=data$y.lab)) } plot.gene.bat <- function(data=NULL, gene, xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, file="figure.pdf", pPanel=c(2, 2), width=7, height=9, isoform.id.opt=1, y.lab=NULL) { nG <- length(gene) pdf(file=file, width=width, height=height) nF <- pPanel[1]*pPanel[2] for( k in 1:nG) { if((k-1) %% nF==0) par(mfrow=pPanel) plot.gene(data=data, gene=gene[k], xlim=xlim, lwd=lwd, Phenotype=Phenotype, isoform.id.opt=isoform.id.opt, y.lab=y.lab) } dev.off() }