# Functions for two-step mixed model approach to 
# analyzing differential alternative RNA splicing
# Updated on 5/18/2020
# Author: Huining Kang & Li Luo

library(nlme)
library(aod)
library(gtools)
library(genefilter)
library(parallel)
# Plot isoforms of one gene
plot.gene<-function(data, gene="MYB", xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, logged2=T,
                    ylab="RPKM in log scale", isoform.id.opt=1)
  # Phenotype = "OUTCOME" or "Tissue TPYE" or NULL
  # isoform.id.opt = Options for isoform ids: 1 = Ensembl ids, 2 = Gene symbol ids
  
{
  p<-which(data$gene==gene)
  if(length(p)<1)
  {
    cat("Cannot find", gene, "\n")
    return(data)
  }
  if(!logged2) x <- log2(data$x[p, ]+1)
  else x <- data$x[p, ]
  y <- data$y
  if(isoform.id.opt==1) isoform.id <- as.numeric(gsub(pattern = "ENST", "", data$isoform[p]))
  else
  {
    tmp <- strsplit(data$isoform_name[p[1]], split = "[-]")
    nTmp <- length(tmp[[1]])
    tmp <- strsplit(data$isoform_name[p], split = "[-]")
    isoform.id <- matrix(unlist(tmp), ncol=nTmp, byrow=T)[, nTmp]
  }
  num<-dim(x)
  p1<-which(y==1)
  p2<-which(y==2)
  mat1<-x[, p1]
  mat2<-x[, p2]
  minV<-min(c(rowMeans(mat1), rowMeans(mat2)))
  maxV<-max(c(rowMeans(mat1), rowMeans(mat2)))
  plot(c(0,1), c(minV, maxV), type="n", xlim=xlim, ylab=ylab, xaxt="n", ann=F)
  for( k in 1:num[1]) lines(c(0, 1), c(mean(x[k, p1]), mean(x[k, p2])), lty=k, col=k, lwd=lwd)
  axis(side=1, at=c(0, 1), labels = data$y.lab)
  title(ylab=ylab, main=paste("Isoform abundances of ", gene))
  if(is.null(Phenotype))title(xlab=Phenotype)
  legend("topright", inset=0.02, title="Isoform", legend=isoform.id, lty=1:num[1], col=1:num[1], lwd=lwd)
  
  return(list(x=x, y=y, isoform=isoform.id, y.lab=data$y.lab))
}

plot.gene.bat <- function(data=NULL, gene, xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, file="figure.pdf",
                          pPanel=c(2, 2), width=7, height=9, isoform.id.opt=1)
{
  nG <- length(gene)
  pdf(file=file, width=width, height=height)
  nF <- pPanel[1]*pPanel[2]
  for( k in 1:nG)
  {
    if((k-1) %% nF==0) par(mfrow=pPanel)
    plot.gene(data=data, gene=gene[k], xlim=xlim, lwd=lwd, Phenotype=Phenotype, isoform.id.opt=isoform.id.opt)
  }
  dev.off()
}

# Perform LRT for one gene (to be used by lapply)
lrt.test.1G.4.par <- function(geneSymbol, data, VarCov.type="uneqvar", screen.type=1)
{
  #    geneSymbol <- gene[idx]
  tim.start <- Sys.time()
  cat("Working on gene", geneSymbol, "...\n")
  x <- data$x[data$gene == geneSymbol, ]
  num <- dim(x) # num[1] = number of isoforms, num[2] = total sample size
  nIso<-num[1]
  work.dat <- data.frame(Y = as.vector(t(x)), j = factor(rep(data$y, num[1])),
                         k = factor(rep(1:num[2], num[1])), 
                         l = factor(rep(1:num[1], each=num[2])))
  nC <- nlevels(work.dat$j)
  if(screen.type==1) df <- (nC-1)*nIso
  else df <- (nC-1)*(nIso-1)
  if(VarCov.type=="cs")
  {
    fit.full<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, method="ML"), TRUE)
    if(screen.type==1) fit.red<-try(lme(Y ~ l, data=work.dat, random = ~1|k, method="ML"), TRUE)
    else fit.red<-try(lme(Y ~ j + l, data=work.dat, random = ~1|k, method="ML"), TRUE)
  }
  else if(VarCov.type=="uneqvar")
  {
    fit.full<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE)
    if(screen.type==1) fit.red<- try(lme(Y ~ l,   data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE)
    else fit.red<- try(lme(Y ~ j + l,   data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE)
  }
  else # unstr
  {
    fit.full<-try(gls(Y ~ j*l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE)
    if(screen.type==1) fit.red<- try(gls(Y ~ l,   data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE)
    else fit.red<- try(gls(Y ~ j + l,   data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE)
  }
  tim.end <- Sys.time()
  tim.usd <- as.numeric(tim.end)-as.numeric(tim.start)
  if(!inherits(fit.full, "try-error") & !inherits(fit.red, "try-error"))
    rtf <- c(2*(fit.full$logLik-fit.red$logLik), df, nIso, tim.usd)
  else rtf <- c(NA, df, nIso, tim.usd)
  
  return(rtf)
}

# Perform LRT using mclapply. (using multiple cores under Linux)
lrt.test.mclapply <- function(data, VarCov.type="uneqvar", nCore=NULL, outfile=NULL, screen.type=1)
  # data = original data in the form of list(x, y, gene, isoform, y.lab)
  # VarCov.type = type of variance and covariance structure. should be one of "cs", "uneqvar" or "unstr"
  # nCore = number of cores to be used.
  # outfile = filename for the output
  # screen.type = 1 or 2 for screening test type I and 2
  # output values: lrt=lrt test statistics, nI=# of isoforms, which is also degree of freedom of LRT test.
{
  if(is.null(nCore)) nCore <- detectCores()-1
  if(Sys.info()[1]=="Windows")
  {
    nCore <- 1
    cat("Parallel computing option is disabled under Windows.\n")
    cat("The task will take more time to complete.\n")
  }
  tim.start = as.numeric(Sys.time())
  # cl <- makeCluster(nCore)
  gene <- unique(data$gene)
  nG <- length(gene)
  res <- mclapply(gene, lrt.test.1G.4.par, data=data, VarCov.type=VarCov.type, screen.type=screen.type, mc.cores=nCore)
  res <- matrix(unlist(res), ncol=4, byrow = T)
  rownames(res) <- gene
  colnames(res) <- c("lrt", "df", "nIso", "Time")
  num <- dim(res)
  p <- fdr <- rep(NA, num[1])
  idx <- which(!is.na(res[, 1]))
  cat("\n\np-value adjustment for", length(idx), "genes\n")
  p[idx] <- pchisq(q=res[idx, 1], df = res[idx, 2], lower.tail = F)
  fdr[idx] <- p.adjust(p[idx], method = "BH")
  res <- cbind(res, matrix(c(p, fdr), ncol=2))
  colnames(res)[5:6] <- c("p.value", "FDR")
  res <- res[, c(1:3, 5:6, 4)]
  tim.end =as.numeric(Sys.time())
  cat("# genes =", nG, "\tnCore =", nCore, "\tTotal time =", tim.end-tim.start, "\n\n")
  if(!is.null(outfile)) save(res, file=outfile)
  return(res)
}

# This program adds p-values and FDR of LRT to the output of the above program
add.p.2.lrt <- function(res=NULL, infile="data/uneqvar.RData", outfile="data/uneqvar_w_p.RData")
  # nC = number of conductions
{
  if(is.null(res)) load(file=infile)
  num <- dim(res)
  p <- fdr <- rep(NA, num[1])
  idx <- which(!is.na(res[, 1]))
  cat("Will work on", length(idx), "genes\n")
  p[idx] <- pchisq(q=res[idx, 1], df = res[idx, 2], lower.tail = F)
  fdr[idx] <- p.adjust(p[idx], method = "BH")
  cat("Done!\n")
  res <- cbind(res, matrix(c(p, fdr), ncol=2))
  colnames(res)[5:6] <- c("p.value", "FDR")
  res <- res[, c(1:3, 5:6, 4)]
  if(!is.null(outfile)) save(res, file=outfile)
  return(res)
}

# Find the number of isoform for every gene of the given data set
find.nL <-function(data, nCore=NULL)
{
  if(is.null(nCore)) nCore <- detectCores()-1
  if(Sys.info()[1]=="Windows")
  {
    nCore <- 1
    cat("Parallel computing option is disabled under Windows.\n")
    cat("The task will take more time to complete.\n")
  }
  
  gene <- unique(data$gene)
  nL <- mclapply(gene, function(geneS, data) {sum(data$gene==geneS)}, data=data, mc.cores=nCore)
  nL <- unlist(nL)
  names(nL) <- gene
  return(nL)
}

# t-test/anova for stage II confirmatory test
step2.Ftest <- function(data=NULL, file.data, file.step1, alpha=0.05, method="holm")
  # method = c("holm", "hochberg", "bonferroni") for FWER
  # file.data = file of the original data, e.g. "data/TYPE_data.RData"
  # file.step1 = name of file for the result from step 1 screening test, e.g. "output/uneqvar_type1.RData"
  # 
{
  if(is.null(data)) load(file=file.data)
  load(file=file.step1)
  # remove genes with missing step 1 results
  p <- which(!is.na(res[, 1]))
  res <- res[p, ]
  #    p <- which(data$gene %in% rownames(res))
  # cat("Debug", length(p), "\n")
  #    data$x <- data$x[p, ]
  #    data$gene <- data$gene[p]
  #    data$isoform <- data$isoform[p]
  FDR <- res[, "FDR"]
  # print(colnames(res))
  sig.gene <- rownames(res)[FDR <= alpha]    
  R <- length(sig.gene) # number of significant genes
  if( R < 1)
  {
    cat("No significant genes\n")
    return(NULL)
  }
  else cat("Number of significant genes:", R, "\n")
  # M <- length(unique(data$gene)) 
  M <- dim(res)[1]
  p <- which(data$gene %in% sig.gene)
  # cat("Debug: Total Number of genes:", M, "\n")
  # cat("Debug: Number of significant genes:", length(unique(data$gene[p])), "\n")
  #  t.res <- rowFtests(x=data$x[p, ], fac=factor(data$y))
  t.res <- rowttests(x=data$x[p, ], fac=factor(data$y))
  FC <- logratio2foldchange(- t.res$dm)
  # cat("Debug: Pass\n")
  final.res <- data.frame(gene=data$gene[p], isoform=data$isoform[p], FC=FC, 
                          step2.p.value=t.res$p.value, adj.p=rep(NA, length(p)), 
                          threshold=rep(NA, length(p)), stringsAsFactors = F)
  for( k in 1:R)
  {
    p <- which(final.res$gene==sig.gene[k])
    final.res$adj.p[p] <- p.adjust(final.res$step2.p.value[p], method=method)
    final.res$threshold[p] <- R*alpha/M * final.res$step2.p.value[p] / final.res$adj.p[p]
  }
  final.res$decision <- (final.res$adj.p <= R*alpha/M)
  cat("Cutoff for significant isoform is", R*alpha/M, "\n")
  return(final.res)
}

#Step 2 test for isoform difference after using lme
Iso.test.lme<-function(fit, nIso)
{
  L<-matrix(0, 1, 2*nIso)
  L[1, 2]<-1
  pv.wald<-rep(NA, nIso)
  pv.wald[1]<- (wald.test(Sigma=fit$varFix,b=fit$coefficients$fixed,L=L))$result$chi2["P"]
  for(k in 2:nIso) 
  {
    L.new<-L
    L.new[1, nIso+k]<-1
    pv.wald[k]<- (wald.test(Sigma=fit$varFix,b=fit$coefficients$fixed,L=L.new))$result$chi2["P"]
  }
  c.effect<-fit$coefficient$fixed[2] + c(0, fit$coefficient$fixed[nIso+(2:nIso)])
  return(list(pv.wald=pv.wald, effect=c.effect))
}

#Step 2 test for isoform difference after using gls
Iso.test.gls<-function(fit, nIso)
{
  L<-matrix(0, 1, 2*nIso)
  L[1, 2]<-1
  pv.wald<-rep(NA, nIso)
  pv.wald[1]<- (wald.test(Sigma=fit$varBeta, b=fit$coefficients, L=L))$result$chi2["P"]
  for(k in 2:nIso) 
  {
    L.new<-L
    L.new[1, nIso+k]<-1
    pv.wald[k]<- (wald.test(Sigma=fit$varBeta, b=fit$coefficients, L=L.new))$result$chi2["P"]
  }
  c.effect<-fit$coefficient[2] + c(0, fit$coefficient[nIso+(2:nIso)])
  return(list(pv.wald=pv.wald, effect=c.effect))
}

# Perform LRT for Wald-test for one gene (to be used by lapply)
do.wald.test <- function(geneSymbol, data, VarCov.type="uneqvar")
  # lrt.test.1G.4.wald <- function(geneSymbol, data, VarCov.type="uneqvar")
{
  #    geneSymbol <- gene[idx]
  cat("Working on gene", geneSymbol, "...\n")
  x <- data$x[data$gene == geneSymbol, ]
  num <- dim(x) # num[1] = number of isoforms, num[2] = total sample size
  nIso<-num[1]
  work.dat <- data.frame(Y = as.vector(t(x)), j = factor(rep(data$y, num[1])),
                         k = factor(rep(1:num[2], num[1])), 
                         l = factor(rep(1:num[1], each=num[2])))
  # nC <- nlevels(work.dat$j)
  # if(screen.type==1) df <- (nC-1)*nIso
  # else df <- (nC-1)*(nIso-1)
  if(VarCov.type=="cs")
  {
    fit<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, method="ML"), TRUE)
  }
  else if(VarCov.type=="uneqvar")
  {
    fit<-try(lme(Y ~ j*l, data=work.dat, random = ~1|k, weights = varIdent(form = ~1 | l), method="ML"), TRUE)
  }
  else # unstr
  {
    fit<-try(gls(Y ~ j*l, data=work.dat, corr=corSymm(form = ~ 1 | k), weights = varIdent(form = ~1 | l), method="ML"), TRUE)
  }
  # if(!inherits(fit, "try-error"))
  #  rtf <- list(fit=fit, nIso=nIso)
  # else rtf <- NULL
  if(!inherits(fit, "try-error"))
  {
    if(VarCov.type=="unstr") r.step2<-Iso.test.gls(fit=fit, nIso=nIso)
    else r.step2<-Iso.test.lme(fit=fit, nIso=nIso)
    rtf<-list(gene=geneSymbol, r.step2=r.step2)
  }
  else rtf <- NULL
  rm(list="fit")
  return(rtf)
}
  
# Wald test for stage II confirmatory test (using multiple cores under Linux)
mc.step2.Waldtest <- function(data=NULL, file.data, file.step1, alpha=0.05, VarCov.type="uneqvar", method="holm", nCore)
  # method = c("holm", "hochberg", "bonferroni") for FWER
  # file.data = file of the original data, e.g. "data/TYPE_data.RData"
  # file.step1 = name of file for the result from step 1 screening test, e.g. "output/uneqvar_type1.RData"
{
  if(is.null(nCore)) nCore <- detectCores()-1
  if(Sys.info()[1]=="Windows")
  {
    nCore <- 1
    cat("Parallel computing option is disabled under Windows.\n")
    cat("The task will take more time to complete.\n")
  }
  
  if(is.null(data)) load(file=file.data)
  load(file=file.step1)
  # remove genes with missing step 1 results
  p <- which(!is.na(res[, 1]))
  res <- res[p, ]
  FDR <- res[, "FDR"]
  sig.gene <- rownames(res)[FDR <= alpha]    
  R <- length(sig.gene) # number of significant genes
  if( R < 1)
  {
    cat("No significant genes\n")
    return(NULL)
  }
  else cat("Number of significant genes:", R, "\n")
  # M <- length(unique(data$gene)) 
  M <- dim(res)[1]
  p <- which(data$gene %in% sig.gene)
  
  final.res <- data.frame(gene=data$gene[p], isoform=data$isoform[p], step2.p.value=rep(NA, length(p)),
                          adj.p=rep(NA, length(p)), threshold=rep(NA, length(p)), stringsAsFactors = F)
  cat("Fitting mixed effects model. This may take a while...\n")
  r.step1 <- mclapply(sig.gene, do.wald.test, data=data, VarCov.type=VarCov.type, mc.cores=nCore)
  cat("Model fitting is completed\n")
  # save(r.step1, file="temp.RData")
  for( k in 1:R)
  {
    p <- which(final.res$gene==r.step1[[k]]$gene)
    final.res$step2.p.value[p] <- r.step1[[k]]$r.step2$pv.wald
    final.res$adj.p[p] <- p.adjust(r.step1[[k]]$r.step2$pv.wald, method=method)
    final.res$threshold[p] <- R*alpha/M * final.res$step2.p.value[p] / final.res$adj.p[p]
  }
  final.res$decision <- (final.res$adj.p <= R*alpha/M)
  cat("# Number of total genes tested:", M, "\n")
  cat("# Number of significant genes:", R, "\n")
  cat("# Cutoff for significant isoform is", R*alpha/M, "\n")
  cat("# Number of significant isoform:", sum(final.res$decision, na.rm = T), "\n") 
  
  return(final.res)
}

step2.brief <- function(file.step1.type1, file.step1.type2, file.step2, alpha=0.05)
  # file.step1.type1 = name of file for the result from step 1 type 1 screening test, e.g. "output/uneqvar_type1.RData"
  # file.step1.type2 = name of file for the result from step 1 type 2 screening test, e.g. "output/uneqvar_type1.RData"
  # file.step2 = name of file for the result from step 2 confirmation test, e.g. "output/uneqvar_type1.RData"
{
  load(file=file.step1.type1)
  load(file=file.step2)
  # remove genes with missing step 1 results
  p <- which(!is.na(res[, 1]))
  res <- res[p, ]
  M <- dim(res)[1]
  sig.gene <- unique(final.res1$gene)    
  R <- length(sig.gene) # number of significant genes
  cat("#######################################\n")
  cat("# Number of total genes tested:", M, "\n")
  cat("# Number of significant genes:", R, "\n")
  cat("# Cutoff for significant isoform is",  R*alpha/M, "\n")
  cat("# Number of significant isoform:", sum(final.res1$decision), "\n")

  load(file=file.step1.type2)
  # remove genes with missing step 1 results
  p <- which(!is.na(res[, 1]))
  res <- res[p, ]
  M <- dim(res)[1]
  sig.gene <- unique(final.res2$gene)    
  R <- length(sig.gene) # number of significant genes
  cat("#######################################\n")
  cat("# Number of total genes tested:", M, "\n")
  cat("# Number of significant genes:", R, "\n")
  cat("# Cutoff for significant isoform is",  R*alpha/M, "\n")
  cat("# Number of significant isoform:", sum(final.res2$decision), "\n")
}

# Plot isoforms of one gene
plot.gene<-function(data, gene="MYB", xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, logged2=T,
                    ylab="RPKM in log scale", isoform.id.opt=1, y.lab=NULL)
  # Phenotype = "OUTCOME" or "Tissue TPYE" or NULL
  # isoform.id.opt = Options for isoform ids: 1 = Ensembl ids, 2 = Gene symbol ids
  
{
  p<-which(data$gene==gene)
  if(length(p)<1)
  {
    cat("Cannot find", gene, "\n")
    return(data)
  }
  if(!logged2) x <- log2(data$x[p, ]+1)
  else x <- data$x[p, ]
  y <- data$y
  if(isoform.id.opt==1) isoform.id <- as.numeric(gsub(pattern = "ENST", "", data$isoform[p]))
  else
  {
    tmp <- strsplit(data$isoform_name[p[1]], split = "[-]")
    nTmp <- length(tmp[[1]])
    tmp <- strsplit(data$isoform_name[p], split = "[-]")
    isoform.id <- matrix(unlist(tmp), ncol=nTmp, byrow=T)[, nTmp]
  }
  num<-dim(x)
  p1<-which(y==1)
  p2<-which(y==2)
  mat1<-x[, p1]
  mat2<-x[, p2]
  minV<-min(c(rowMeans(mat1), rowMeans(mat2)))
  maxV<-max(c(rowMeans(mat1), rowMeans(mat2)))
  plot(c(0,1), c(minV, maxV), type="n", xlim=xlim, ylab=ylab, xaxt="n", ann=F)
  for( k in 1:num[1]) lines(c(0, 1), c(mean(x[k, p1]), mean(x[k, p2])), lty=k, col=k, lwd=lwd)
  if(is.null(y.lab)) axis(side=1, at=c(0, 1), labels = data$y.lab)
  else axis(side=1, at=c(0, 1), labels = y.lab)
  title(ylab=ylab, main=paste("Isoform abundances of ", gene))
  if(!is.null(Phenotype)) title(xlab=Phenotype)
  legend("topright", inset=0.02, title="Isoform", legend=isoform.id, lty=1:num[1], col=1:num[1], lwd=lwd)
  
  return(list(x=x, y=y, isoform=isoform.id, y.lab=data$y.lab))
}

plot.gene.bat <- function(data=NULL, gene, xlim=c(-0.2, 1.6), lwd=3, Phenotype=NULL, file="figure.pdf",
                          pPanel=c(2, 2), width=7, height=9, isoform.id.opt=1, y.lab=NULL)
{
  nG <- length(gene)
  pdf(file=file, width=width, height=height)
  nF <- pPanel[1]*pPanel[2]
  for( k in 1:nG)
  {
    if((k-1) %% nF==0) par(mfrow=pPanel)
    plot.gene(data=data, gene=gene[k], xlim=xlim, lwd=lwd, Phenotype=Phenotype,
              isoform.id.opt=isoform.id.opt, y.lab=y.lab)
  }
  dev.off()
}