#---
#title: "RNAseq_clusters_script"
#author: Dr. Antonio Scialdone
#created: 10/2017
# application: PhD thesis of Paulina Mrozek-Gorska; time-course analysis of uninfected cells and of cells infected with EBV, on a daily basis
# input: non-normalized, mapped and counted data from Thomas Schwarzmayr
#output: 
  #  fig_width: 5
  #  fig_height: 5
  #  fig_caption: yes
   # code_folding: hide
  #  toc: true
  #    collapsed: false

  # chunk_output_type: console
#---



#```{r load_libraries}
require(DESeq2)
require(data.table)
require(ggplot2)
require(grid)
require(statmod)
require(WGCNA)
require(dynamicTreeCut)
require(Rtsne)
require(gplots)
require(edgeR)
require(zoo)
require(scran)
require(vioplot)
require(limSolve)
require(xlsx)
require(topGO)
require(org.Mm.eg.db)
require(zoo)


#```{r define_functions}


#PLOTTING ########



# Standard scatter plot with points coloured by a discrete variable.
#   df     : data frame with columns x, y (coordinates) and z (mapped to colour)
#   xname  : label of the x axis
#   yname  : label of the y axis
#   legend : title of the colour legend
#   title  : plot title
# Returns the ggplot object (it is drawn when printed).
plot.std <- function(df, xname, yname,legend, title){

  # large fonts and wide margins
  text.theme <- theme(axis.title.x = element_text(face="bold", size=30, vjust=-2),
                      axis.text.x  = element_text( size=25),
                      axis.title.y = element_text(face="bold", size=30,vjust=2),
                      axis.text.y  = element_text( size=30),
                      plot.margin=unit(c(1,1,1.5,1.2),"cm"),
                      legend.text=element_text(size=15),
                      legend.title=element_text(size=15),
                      plot.title = element_text(lineheight=.8, face="bold",size=30))

  scatter <- ggplot(df, aes(x=x, y=y, color=z)) + geom_point(size=5)
  labelled <- scatter + xlab(xname) + ylab(yname) + ggtitle(title)

  # discrete colour scale for z; the legend title comes from the caller
  labelled + text.theme + scale_color_discrete(name=legend)
}





# Scatter plot with points coloured by cluster, using caller-supplied colours.
#   df     : data frame with columns x, y (coordinates) and z (mapped to colour)
#   xname  : label of the x axis
#   yname  : label of the y axis
#   legend : title of the colour legend
#   title  : plot title
#   col    : colour values handed to scale_color_manual()
# Returns the ggplot object.
plot.std.cluster <- function(df, xname, yname,legend, title,col){

  text.theme <- theme(axis.title.x = element_text(face="bold", size=15, vjust=-2),
                      axis.text.x  = element_text( size=10),
                      axis.title.y = element_text(face="bold", size=15,vjust=2),
                      axis.text.y  = element_text( size=10),
                      plot.margin=unit(c(1,1,1.5,1.2),"cm"),
                      legend.text=element_text(size=10),
                      legend.title=element_text(size=15),
                      plot.title = element_text(lineheight=.8, face="bold",size=30))

  scatter <- ggplot(df, aes(x=x, y=y, color=z)) + geom_point(size=2)
  labelled <- scatter + xlab(xname) + ylab(yname) + ggtitle(title)

  # manual colour scale: one entry of 'col' per value of z
  labelled + text.theme + scale_color_manual(name=legend, values=col)
}



# Simple boxplot with the median of each group overlaid as a point.
#   df    : data frame with columns x (group) and y (value)
#   xname : label of the x axis
#   yname : label of the y axis
#   title : plot title
#   logy  : TRUE for a log10 y axis; anything else gives a linear axis
# Returns the ggplot object.
plot.box<-function(df,xname,yname,title,logy=FALSE){

  use.log<-isTRUE(logy)

  # the log-scale variant has always used a larger y-axis title; kept as is
  y.title.size<-if(use.log) 25 else 15

  # NOTE: 'fun.y' is kept for compatibility with older ggplot2 releases;
  # recent releases accept it but emit a deprecation warning (new name: 'fun').
  p<-ggplot(df, aes(x=x, y=y))+geom_boxplot()+
    stat_summary(fun.y="median", geom="point")+
    xlab(xname)+
    ylab(yname)+
    ggtitle(title)+
    theme(axis.title.x = element_text(face="bold", size=15, vjust=-2),
          axis.text.x  = element_text( size=10, angle=90),
          axis.title.y = element_text(face="bold", size=y.title.size,vjust=2),
          axis.text.y  = element_text( size=10),
          plot.title = element_text(size=15, face="bold")) +
    theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))

  if(use.log){
    p<-p+scale_y_log10()
  }

  p
}





# Scatter plot coloured by a continuous expression value.
#   df    : data frame with columns x, y (coordinates) and log10.exp (mapped to colour)
#   xname : label of the x axis
#   yname : label of the y axis
#   title : plot title
# Returns the ggplot object.
plot.std.col<-function(df, xname, yname, title){

  text.theme <- theme(axis.title.x = element_text(face="bold", size=15, vjust=-2),
                      axis.text.x  = element_text( size=10),
                      axis.title.y = element_text(face="bold", size=15,vjust=2),
                      axis.text.y  = element_text( size=10),
                      plot.margin=unit(c(1,1,1.5,1.2),"cm"),
                      legend.text=element_text(size=10),
                      legend.title=element_text(size=10),
                      plot.title = element_text(lineheight=.8, face="bold",size=15))

  scatter <- ggplot(df, aes(x=x, y=y, color=log10.exp)) + geom_point(size=1.5)
  labelled <- scatter + xlab(xname) + ylab(yname) + ggtitle(title)

  # continuous gradient from green (low) to red (high)
  labelled + text.theme + scale_color_gradient(low="green4", high="red")
}


# Dodged bar plot with black outlines, filled by a discrete variable.
#   df           : data frame with columns x (bar position), y (bar height)
#                  and z (mapped to fill)
#   xname, yname : axis labels
#   title        : plot title (NULL for none)
#   names        : accepted but not used by the function body
#   legend.title : title of the fill legend
#   x.leg, y.leg : legend position passed to theme(legend.position=)
# Returns the ggplot object.
plot.bar<-function(df, xname, yname, title=NULL, names=NULL, legend.title=NULL,
                   x.leg=0.65, y.leg=0.65){

  # white background without grid or panel border, black axis lines
  frame.theme <- theme_bw() +
    theme(panel.border = element_blank(), panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),  axis.line = element_line(colour = "black"))

  # fonts, margins and a boxed legend anchored by its lower-left corner
  text.theme <- theme(axis.title.x = element_text(face="bold", size=20, vjust=-2),
                      axis.text.x  = element_text( size=20, angle=90),
                      axis.title.y = element_text(face="bold", size=20,vjust=2),
                      axis.text.y  = element_text( size=20),
                      plot.margin=unit(c(1,1,1.5,1.2),"cm"),
                      legend.text=element_text(size=15),
                      legend.title=element_text(size=15),
                      legend.key = element_blank(),
                      legend.background=element_rect( size=0.3, color="black"),
                      legend.justification=c(0,0),
                      legend.position=c(x.leg, y.leg))

  bars <- ggplot(data=df) +
    geom_bar(aes(x=x, y=y, fill=z),size=1, stat="identity", colour="black",  position="dodge" )
  labelled <- bars + xlab(xname) + ylab(yname) + ggtitle(title)

  labelled + frame.theme + text.theme + scale_fill_discrete(name=legend.title)

  # alternative fill scales that were tried earlier:
  # scale_fill_manual(values=c("red","black"), guide=FALSE)
  # scale_fill_discrete(name="Number of tubes",
  #                     labels=c("408", expression(10^3), expression(4%.% 10^3),
  #                              expression(10^4)))
}


# Draw several plots on one page, arranged on a grid.
#   ...      : plot objects
#   plotlist : optional list of further plot objects
#   file     : accepted but not used by the function body
#   cols     : number of columns of the grid (used only when layout is NULL)
#   layout   : optional matrix giving the cell(s) of each plot; entry i marks
#              the cells occupied by plot i
# Each plot is drawn with print(); with more than one plot, print() receives
# a grid viewport through its 'vp' argument. With no plots nothing is drawn.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # nothing to draw: return instead of failing on plots[[1]]
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout:
  # 'cols' columns and as many rows as needed to hold all plots
  if (is.null(layout)) {
    rows <- ceiling(numPlots/cols)
    layout <- matrix(seq(1, cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid::grid.newpage()
    grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the row/column positions of the cells that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = grid::viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
    }
  }
}

#resave(plot.box,file = file.to.load )




#HVG ######


#"dm" method 
# Select highly variable genes with the "dm" method: for every gene, the
# distance between its log10(CV^2) and a rolling median of log10(CV^2)
# computed over genes ordered by mean expression.
#   data  : matrix with normalized data (genes in rows, samples in columns)
#   n.hvg : the n.hvg genes with the highest dm are selected as hvg
#   plot  : if TRUE draw three diagnostic panels (CV^2 vs mean, dm vs mean,
#           rolling medians)
#   title : title of the first panel
# Returns a list with
#   dm             : dm value of every gene
#   genes.high.var : row indices of the selected genes, highest dm first
#                    (at most n.hvg; genes whose dm is NA/NaN are never selected)
find.high.var.dm<-function(data,
                           n.hvg,
                           plot=TRUE,
                           title=""){

  roll.width<-100 #genes per rolling window
  roll.step<-50   #shift between consecutive windows

  # with fewer genes than one window there are no rolling medians to compare to
  if(nrow(data)<roll.width){
    stop("find.high.var.dm needs at least ", roll.width, " genes, got ", nrow(data),
         call.=FALSE)
  }

  df<-data.frame(log.avg=log10(apply(data, 1, function(x) mean(x))),
                 log.cv2=log10(apply(data, 1, function(x) (sd(x)/mean(x))^2))
  )

  # rolling medians of mean and CV^2 over genes sorted by mean expression
  by.expr<-order(df[,"log.avg"],decreasing=FALSE)
  df.roll<-data.frame(roll.avg=rollapply(df[by.expr,"log.avg"], width=roll.width, FUN=median, by=roll.step),
                      roll.cv2=rollapply(df[by.expr,"log.cv2"], width=roll.width, FUN=median, by=roll.step))

  dm<-apply(df, 1,function(x){

    # first window whose median expression exceeds the gene's expression;
    # if there is none, use the last window
    above<-which(df.roll[,"roll.avg"]>x["log.avg"])
    window<-if(length(above)==0) nrow(df.roll) else min(above)

    x["log.cv2"]-df.roll[window,"roll.cv2"]
  })

  #select highly variable
  # na.last=NA drops genes without a dm value; head() avoids NA padding when
  # n.hvg is larger than the number of genes available
  high.var<-head(order(dm, decreasing=TRUE, na.last=NA), n.hvg)

  if(isTRUE(plot)){
    # restore the caller's panel layout when the function exits
    old.par<-par(mfrow=c(1,3))
    on.exit(par(old.par), add=TRUE)

    plot(df[,"log.avg"],
         df[,"log.cv2"], xlab = "Avg. expression",
         ylab="CV^2", cex.lab=1.7, cex.axis=1.5,main=title)
    points(df[high.var,"log.avg"], df[high.var,"log.cv2"], col="red")

    plot(df[,"log.avg"], dm, xlab="Avg. expression",
         ylab="DM", cex.lab=1.7, cex.axis=1.5)
    points(df[high.var,"log.avg"], dm[high.var], col="red")

    plot(df.roll[,1], df.roll[,2])
  }

  return(list(dm=dm, genes.high.var=high.var))
}




#DESEQ

# Differential expression between two groups of samples with DESeq2
# (one factor, design ~ Type). Also draws an MA plot as a side effect.
#   counts.merged : count table, genes in rows and samples in columns
#   cells.1       : columns of the first group (put the control condition here)
#   name.1        : label of the first group (first level of the factor)
#   cells.2       : columns of the second group
#   name.2        : label of the second group
#   genes         : rows (genes) to test
#   cooks.dist    : passed to results() as cooksCutoff
# Returns the DESeq2 results table ordered by adjusted p-value.
deseq.1<-function(counts.merged,cells.1, name.1, cells.2, name.2,genes, cooks.dist){

  # drop=FALSE keeps a table even when a group holds a single sample,
  # so that ncol() below still works
  counts.1<-counts.merged[genes,cells.1,drop=FALSE]
  counts.2<-counts.merged[genes,cells.2,drop=FALSE]

  countData<-cbind(counts.1, counts.2)#collect data
  colnames(countData)<-c(cells.1,cells.2)
  row.names(countData)<-row.names(counts.1)

  # group labels; name.1 is the first level and therefore the reference,
  # so no further re-levelling is needed after building the data set
  colData<-data.frame(Type=factor(c(rep(name.1, ncol(counts.1)),
                                    rep(name.2, ncol(counts.2))),
                                  levels=c(name.1,name.2)))
  row.names(colData)<-colnames(countData)

  dds<-DESeqDataSetFromMatrix(countData=countData,
                              colData=colData,
                              design= ~ Type)

  dds <- DESeq(dds)
  DESeq2::plotMA(dds, main=paste0(name.1, " vs ", name.2))

  res <- results(dds,cooksCutoff = cooks.dist)
  resOrdered<- res[order(res$padj),]

  return(resOrdered)
}



#Analysis of bulk RNA-seq data from B cells infected with EBV. 


#```{r load_data_basic_qc}
#load data
raw.counts<-read.csv("/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/Data_sent_to_Antonio_from_Alex/2Alex_for_Antonio/Non_norm.PolyA_NamedByAlex.csv",
                     header=TRUE)
#the first column holds the gene identifiers: use them as row names, then drop it
row.names(raw.counts)<-as.character(raw.counts[,1])
raw.counts<-raw.counts[,-1]

#meta data: sample names are split at "_" into day (first part) and batch (second part)
name.parts<-strsplit(colnames(raw.counts), "_")
meta.data<-data.frame(day=vapply(name.parts, function(x) x[[1]], character(1)),
                      batch=vapply(name.parts, function(x) x[[2]], character(1)))
row.names(meta.data)<-colnames(raw.counts)

#order the samples by the number that follows "Day" in the day label
day.number<-as.numeric(sub("Day", "", as.character(meta.data[,"day"]), fixed=TRUE))
meta.data<-meta.data[order(day.number),]

raw.counts<-raw.counts[,row.names(meta.data)]

#row classes, recognised from the gene identifier
#fixed=TRUE: ".t01" is matched literally (as a regex the dot would match any character)
is.viral<-grepl(pattern=".t01", x=row.names(raw.counts), fixed=TRUE)
is.ercc<-grepl(pattern="ERCC-", row.names(raw.counts))
is.endog<-(!is.ercc)&(!is.viral)

#per-sample totals of each class
spike.counts<-colSums(raw.counts[is.ercc,])
viral.counts<-colSums(raw.counts[is.viral,])
endog.counts<-colSums(raw.counts[is.endog,])

#Number of detected endogenous genes (threshold: > 10RPM)
rpm.endog<-apply(raw.counts[is.endog,],2,function(x) 1e6*(x/sum(x)))

qc<-data.frame(
  #library size
  tot.counts=colSums(raw.counts),
  #endogenous genes (incl. viral) w.r.t. spikes
  endog.viral.genes=colSums(raw.counts[!is.ercc,])/spike.counts,
  #endogenous genes (no viral) w.r.t. spikes
  endog.genes=endog.counts/spike.counts,
  #viral/endogenous genes
  viral.endog=viral.counts/endog.counts,
  #abs. viral genes (w.r.t. spikes)
  viral.genes=viral.counts/spike.counts,
  #number of endogenous genes above 10 RPM
  n.det.endog=apply(rpm.endog,2,function(x) sum(x>10, na.rm=TRUE)),
  row.names=colnames(raw.counts))




#Quality Control

#QC plots.
#```{r plot_qc}


#Build the data frame that plot.bar() expects for one column of 'qc':
#x = sample (in the row order of qc), y = the QC metric, z = day of the sample
qc.metric.df<-function(metric){
  samples<-row.names(qc)
  days<-meta.data[samples,"day"]
  data.frame(x=factor(samples, levels=samples),
             y=qc[,metric],
             z=factor(days, levels=unique(days)))
}

#library size
df<-qc.metric.df("tot.counts")
plot.bar(df, xname = "Sample", yname="Library size", legend.title = "Day")


#fract. endogenous+viral / spikes
df<-qc.metric.df("endog.viral.genes")
plot.bar(df, xname = "Sample", yname="(Endog+Viral)/Spikes", legend.title = "Day")


#fract. endogenous / spikes
df<-qc.metric.df("endog.genes")
plot.bar(df, xname = "Sample", yname="Endogenous/Spikes", legend.title = "Day")


#viral/endog
df<-qc.metric.df("viral.endog")
plot.bar(df, xname = "Sample", yname="Viral/endog genes", legend.title = "Day",x.leg=0.3)

#viral/spikes
df<-qc.metric.df("viral.genes")
plot.bar(df, xname = "Sample", yname="Viral/spikes genes", legend.title = "Day")


#N. of detected genes (>10RPM)
df<-qc.metric.df("n.det.endog")
plot.bar(df, xname = "Sample", yname="N. det. genes", legend.title = "Day",y.leg=0.2)




#Data normalization
#```{r normalization}

#number of endogenous genes (neither viral nor spike) with a non-zero count in every sample
test<-apply(raw.counts[(!is.viral)&(!is.ercc),], 1, function(gene.counts) !any(gene.counts==0))
length(test[test])

#size factors estimated on the endogenous genes only
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.viral)&(!is.ercc),])
barplot(sf, las=2)

#divide every sample (column) by its size factor
norm.endog<-t(t(raw.counts[(!is.viral)&(!is.ercc),])/sf)


#same normalization on all genes except the spikes (endogenous + viral)
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),])
#barplot(sf, las=2, ylab="")

norm.all<-t(t(raw.counts[(!is.ercc),])/sf)


#Principal component analysis only endogenous genes
#```{r pca_all}

#PCA of the samples on log10(1 + normalized counts)
pca.endog<-prcomp(t(log10(norm.endog+1)))
df<-data.frame(x=pca.endog$x[,1],
               y=pca.endog$x[,2],
               z=factor(meta.data$day, levels=unique(meta.data$day)))

#axis labels with the percentage of variance explained by each component
xname<-paste0("PC1 (", round(100*pca.endog$sdev[1]^2/sum(pca.endog$sdev^2),0), "%)"  )
yname<-paste0("PC2 (", round(100*pca.endog$sdev[2]^2/sum(pca.endog$sdev^2),0), "%)"  )


 #pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/pca_all.pdf", 
 #    useDingbats = FALSE, width = 9, height=7)
plot.std(df, xname=xname, yname=yname, title="", legend="Day")
#dev.off()

#genes contributing the most to PC1

#ten most negative PC1 loadings; their gene names are kept in 'test'
pc1.loadings<-pca.endog$rotation[,1]
test<-sort(pc1.loadings)[seq_len(10)]
test
test<-names(test)
#ten most positive PC1 loadings
sort(pc1.loadings, decreasing = TRUE)[seq_len(10)]


#genes plotting: one bar plot of normalized counts per sample for each of the
#first four genes stored in 'test'

top.gene.plots<-lapply(test[1:4], function(gene){
  samples<-colnames(norm.endog)
  days<-meta.data[samples,"day"]
  gene.df<-data.frame(x=factor(samples, levels=samples),
                      y=norm.endog[gene,],
                      z=factor(days, levels=unique(days)))
  plot.bar(gene.df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
           y.leg=0.3, title =gene )
})

p1<-top.gene.plots[[1]]
p2<-top.gene.plots[[2]]
p3<-top.gene.plots[[3]]
p4<-top.gene.plots[[4]]


#all four plots on one page (2 columns)
pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/high_genes.pdf", 
    useDingbats = FALSE, width = 18, height=14)
#'finally' closes the pdf device even if drawing fails, so that later plots
#are not sent to this file by accident
tryCatch(multiplot(p1,p2,p3,p4, cols=2),
         finally = dev.off())


#Remove sample 8_1, pca including viral genes.
#```{r pca_exc_1sample_with_viral}
#all genes except the spikes (endogenous + viral), sample Day8_1 left out;
#the result is still stored under the name norm.endog
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
#barplot(sf, las=2)

norm.endog<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)


#PCA of the samples on log10(1 + normalized counts)
pca.endog<-prcomp(t(log10(norm.endog+1)))
df<-data.frame(x=pca.endog$x[,1],
               y=pca.endog$x[,2],
               z=factor(meta.data[colnames(norm.endog),"day"], levels=unique(meta.data[colnames(norm.endog),"day"])))

#axis labels with the percentage of variance explained by each component
xname<-paste0("PC1 (", round(100*pca.endog$sdev[1]^2/sum(pca.endog$sdev^2),0), "%)"  )
yname<-paste0("PC2 (", round(100*pca.endog$sdev[2]^2/sum(pca.endog$sdev^2),0), "%)"  )

plot.std(df, xname=xname, yname=yname, title="", legend="Day")

#genes contributing the most to PC1: ten most negative, then ten most positive loadings
sort(pca.endog$rotation[,1])[seq_len(10)]
sort(pca.endog$rotation[,1], decreasing = TRUE)[seq_len(10)]

#gene plotting
#Bar plot of the normalized counts of one gene across the samples of norm.endog,
#filled by day. Stops with a clear message if the gene is not a row of norm.endog.
plot.gene.counts<-function(gene){
  if(!(gene %in% row.names(norm.endog))){
    stop("gene '", gene, "' is not a row of norm.endog", call.=FALSE)
  }
  samples<-colnames(norm.endog)
  days<-meta.data[samples,"day"]
  gene.df<-data.frame(x=factor(samples, levels=samples),
                      y=norm.endog[gene,],
                      z=factor(days, levels=unique(days)))
  plot.bar(gene.df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
           y.leg=0.3, title =gene )
}

plot.gene.counts("ZBTB16")

plot.gene.counts("VAV3-AS1")

plot.gene.counts("CXCR4")

plot.gene.counts("LOC643401")

plot.gene.counts("TIMD4")

plot.gene.counts("HRASLS2")


#viral genes that change the most: PC1 loadings of the viral genes, sorted
temp<-sort(pca.endog$rotation[row.names(raw.counts)[is.viral],1])
temp[temp!=0]

write.table(temp, file="/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/polyA_viral_changed.tsv", row.names=TRUE, col.names=FALSE, quote=FALSE)

#bar plot of one viral gene; replace the placeholder below with a row name of norm.endog
gene<-"gene_name.t01"
if(gene %in% row.names(norm.endog)){
  df<-data.frame(x=factor(colnames(norm.endog), levels=colnames(norm.endog)),
                 y=norm.endog[gene,],
                 z=factor(meta.data[colnames(norm.endog),"day"], levels=unique(meta.data[colnames(norm.endog),"day"])))
  plot.bar(df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
           y.leg=0.3, title =gene )
} else {
  #the placeholder (or a mistyped name) would otherwise stop the script with
  #"subscript out of bounds"
  warning("gene '", gene, "' is not a row of norm.endog; plot skipped", call.=FALSE)
}


#Principal component analysis only highly variable genes
#```{r hvg_find}
#all genes except the spikes (endogenous + viral), sample Day8_1 left out
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
barplot(sf, las=2)
norm.endog<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)

#3000 genes with the highest dm (diagnostic panels are drawn)
hvg<-find.high.var.dm(data=norm.endog, n.hvg = 3e3, plot = TRUE, title = "")


#PCA of the samples restricted to the highly variable genes
pca.endog<-prcomp(t(log10(norm.endog[hvg$genes.high.var,]+1)))
df<-data.frame(x=pca.endog$x[,1],
               y=pca.endog$x[,2],
               z=factor(meta.data[colnames(norm.endog),"day"], levels=unique(meta.data[colnames(norm.endog),"day"])))

#axis labels with the percentage of variance explained by each component
xname<-paste0("PC1 (", round(100*pca.endog$sdev[1]^2/sum(pca.endog$sdev^2),0), "%)"  )
yname<-paste0("PC2 (", round(100*pca.endog$sdev[2]^2/sum(pca.endog$sdev^2),0), "%)"  )


#pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/pca_hvg.pdf", 
   # useDingbats = FALSE, width = 9, height=7)
plot.std(df, xname=xname, yname=yname, title="", legend="Day")
#dev.off()

#names of the selected genes, one per line
write.table(x=row.names(norm.endog)[hvg$genes.high.var], file="/Users/scialdone/Google Drive/EBV/Analysis/hvg_top3000.tsv", quote=FALSE, row.names=FALSE, col.names=FALSE)


#Gene analysis

#Gene filtering.
#Select the genes that:
#- have an average expression >50 norm. counts 
#- are diff. expressed in at least one pairwise comparison
#```{r gene_selection}
#all genes except the spikes (endogenous + viral), sample Day8_1 left out
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
#barplot(sf, las=2)
norm.data<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)

#find avg genes: per-day mean and standard deviation of every gene
#labels: day of every sample, named by sample
labels<-as.character(meta.data[colnames(norm.data), "day"])
names(labels)<-colnames(norm.data)

avg<-list()
sd<-list()
for(d in unique(labels)){
  x<-names(labels)[labels==d]

  #drop=FALSE keeps a matrix even for a day with a single sample
  avg[[d]]<-rowMeans(norm.data[,x,drop=FALSE])
  sd[[d]]<-sqrt(rowVars(norm.data[,x,drop=FALSE]))
}
names(avg)<-unique(labels)
names(sd)<-unique(labels)

#genes x days matrices
avg<-do.call(cbind, avg)
sd<-do.call(cbind, sd)
row.names(avg)<-row.names(norm.data)
row.names(sd)<-row.names(norm.data)

#find value of max: highest per-day average of every gene
#(computed on 'avg'; 'res' does not exist yet at this point of the script)
max.genes<-apply(avg, 1, max)

#find diff. exp. genes between pairs of days
pairs<-t(combn(x = unique(labels), m = 2))

diff<-list()
for(i in seq_len(nrow(pairs))){
  p<-pairs[i,]
  sample.1<-names(labels)[labels==p[1]]
  sample.2<-names(labels)[labels==p[2]]

  #test only genes with an average of at least 50 norm. counts over the two days
  temp<-norm.data[,c(sample.1,sample.2)]
  test<-rowMeans(temp)
  genes<-names(test)[test>=50]

  res<-deseq.1(raw.counts[genes,c(sample.1,sample.2)],
               sample.1, p[1],
               sample.2, p[2],
               genes, cooks.dist = TRUE)

  diff[[ paste0(p[1],"_",p[2]) ]]<-res
}
setwd("/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/2_polyA_analysis/")
save(list = "diff", file="diff_polyA.RData")

#reload the saved results (restores the object 'diff')
load("diff_polyA.RData")

#significant genes: adjusted p-value < 0.1 and at least a 2-fold change
#in at least one pairwise comparison
sig.genes<-lapply(diff, function(x) row.names(x[x$padj<0.1 & !is.na(x$padj) &
                                                  abs(x$log2FoldChange)>log2(2),]))
sig.genes<-unique(unlist(sig.genes))

write.table(file="/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/2_polyA_analysis/polyA__sig.tsv", x=sig.genes, quote=FALSE, row.names=FALSE, col.names = FALSE)

#background: every gene that received an adjusted p-value in at least one comparison
tested.genes<-lapply(diff, function(x) row.names(x[!is.na(x$padj) ,]))
tested.genes<-unique(unlist(tested.genes))
write.table(file="/Users/scialdone/Google Drive/EBV/Analysis/gene_lists/background.tsv", x=tested.genes, quote=FALSE, row.names=FALSE, col.names = FALSE)

#plotting genes after DESeq2; replace the placeholder below with a row name of norm.data
gene<-"gene_name"
if(gene %in% row.names(norm.data)){
  df<-data.frame(x=factor(colnames(norm.data), levels=colnames(norm.data)),
                 y=norm.data[gene,],
                 z=factor(meta.data[colnames(norm.data),"day"], levels=unique(meta.data[colnames(norm.data),"day"])))
  plot.bar(df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
           y.leg=0.3, title =gene )
} else {
  warning("gene '", gene, "' is not a row of norm.data; plot skipped", call.=FALSE)
}


# mean of days

#```{r test_naming}


#average the normalized counts of the samples of each day; the samples are
#grouped through the meta data instead of hard-coded column positions
day.of.sample<-as.character(meta.data[colnames(norm.data),"day"])
days<-unique(day.of.sample)

norm.data_mean<-vapply(days,
                       function(d) rowMeans(norm.data[,day.of.sample==d,drop=FALSE]),
                       numeric(nrow(norm.data)))
row.names(norm.data_mean)<-row.names(norm.data)
colnames(norm.data_mean)<-days

#replace the placeholder below with a row name of norm.data_mean
gene<-"gene_name"

if(gene %in% row.names(norm.data_mean)){
  #x and z both carry the day names, in column order
  #(a stray "day" argument used to be taken as the factor labels, which
  # renamed the levels to day1, day2, ...)
  df<-data.frame(x=factor(colnames(norm.data_mean), levels=colnames(norm.data_mean)),
                 y=norm.data_mean[gene,],
                 z=factor(colnames(norm.data_mean), levels=colnames(norm.data_mean)))

  plot.bar(df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
           y.leg=0.3, title =gene )
} else {
  warning("gene '", gene, "' is not a row of norm.data_mean; plot skipped", call.=FALSE)
}


#norm_data intersections for UpSet

sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
#barplot(sf, las=2)
norm.data<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)
#exclude genes (rows) with 0 reads in every sample
norm.data_0removed <- norm.data[which(rowSums(norm.data) > 0),] 

head(norm.data_0removed)
class(norm.data_0removed)

#write.csv always writes the column names, so no col.names argument is given
write.csv(norm.data_0removed,file="/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/Days intersections/norm_data_all_0removed_2.csv", row.names=TRUE, quote=FALSE)
#cut out one sample (column) from the norm data
norm_day0_1 <- norm.data_0removed[,"Day0_1"] 
head(norm_day0_1)
length(norm_day0_1)
class(norm_day0_1)

#keep only the genes with a non-zero value in this sample
wo0_norm_day0_1 <- norm_day0_1[norm_day0_1 != 0]
head(wo0_norm_day0_1)
length(wo0_norm_day0_1)
df_norm_day0_1 <- data.frame(wo0_norm_day0_1)
head(df_norm_day0_1)
class(wo0_norm_day0_1)

#comma-separated output; col.names=NA adds an empty header cell above the row names
write.table(df_norm_day0_1,file="/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/Days intersections/df_Day0_1_norm_data.csv",sep = ",", row.names=TRUE, col.names=NA, quote=FALSE)


#Gene clustering
#```{r gene_clustering}
#all genes except the spikes (endogenous + viral), sample Day8_1 left out
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
#barplot(sf, las=2)
norm.data<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)
head(norm.data)

#log-transformed expression of the significant genes
data<-log10(1+norm.data[sig.genes,])


#norm. by max
#data<-t(apply(data, 1, function(x) x/max(x)))
#gene-gene dissimilarity from the Spearman correlation: sqrt((1-cor)/2)
cor.genes<-cor(t(data), method="spearman")
test.dissim<-sqrt(0.5*((1-cor.genes)))
test.dist<-as.dist(test.dissim)
test.clust<-hclust(test.dist, method="average")#method="ward.D2")
#cut the tree into clusters (cutreeDynamic) and name each cluster by a colour
cut2<-cutreeDynamic(test.clust,distM=as.matrix(test.dist), 
                    minClusterSize=500, method="hybrid",deepSplit = 1 )
clust.colour<-labels2colors(cut2) 

names(clust.colour)<-row.names(data)
length(unique(clust.colour))
table(clust.colour)

clust.selected<-"turquoise"
g<-names(clust.colour)[clust.colour==clust.selected]
# write.table(file=paste0("/Users/scialdone/Google Drive/EBV/Analysis/gene_lists/", clust.selected,".tsv"), x=g, quote=F, row.names=F, col.names = F)


#x positions of the time course: the number that follows "Day" in the column names of avg
days<-as.numeric(sub("Day", "", colnames(avg), fixed=TRUE))

pdf(file="/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/R_scripts/analysis/2_polyA_analysis/polyA_clusters_2.pdf", 
    useDingbats = FALSE, width = 27, height=21)
#'finally' closes the pdf device even if a panel fails to draw
tryCatch({
  par(mfrow=c(3,3))
  par(cex=2)

  #one panel per cluster; the cluster named "pink" is left out
  for(clust in unique(clust.colour[clust.colour!="pink"])){
    #genes of THIS cluster (the name was previously fixed to "turquoise",
    #so every panel showed the same cluster)
    g<-names(clust.colour)[clust.colour==clust]

    #per-day averages of each gene scaled by the gene's maximum (days x genes)
    scaled<-apply(avg[g,,drop=FALSE], 1, function(x) x/max(x))
    clust.mean<-rowMeans(scaled)
    clust.sd<-sqrt(rowVars(scaled))

    #mean profile (solid line and points) with mean +/- sd (dashed lines)
    plot(days,clust.mean, ylim=c(min(clust.mean-clust.sd), max(clust.mean+clust.sd)), 
         main=paste(clust, "(N=", length(g) ,")"), type="l",
         xlab="Day", ylab="Mean Norm. Expression")
    points(days,clust.mean)
    lines(days,clust.mean-clust.sd, lty=2)
    lines(days,clust.mean+clust.sd, lty=2)
  }
}, finally = dev.off())



#Figures
#```{r export_figures}

#Build the data frame that plot.bar() expects for one column of 'qc':
#x = sample (in the row order of qc), y = the QC metric, z = day of the sample
qc.fig.df<-function(metric){
  samples<-row.names(qc)
  days<-meta.data[samples,"day"]
  data.frame(x=factor(samples, levels=samples),
             y=qc[,metric],
             z=factor(days, levels=unique(days)))
}

#viral/endog
df<-qc.fig.df("viral.endog")

# pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/viral_endog.pdf", 
#     useDingbats = FALSE, width = 9, height=7)
p1<-plot.bar(df, xname = "Sample", yname="Viral/endog genes", legend.title = "Day",x.leg=0.4,y.leg=0.51)
# dev.off()


#library size
df<-qc.fig.df("tot.counts")

# pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/library_size.pdf", 
#     useDingbats = FALSE, width = 9, height=7)
p2<-plot.bar(df, xname = "Sample", yname="Library size", legend.title = "Day", x.leg=0.3,y.leg=0.57)
# dev.off()


#N. of detected genes (>10RPM)
df<-qc.fig.df("n.det.endog")

# pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/det_genes.pdf", 
#     useDingbats = FALSE, width = 9, height=7)
p3<-plot.bar(df, xname = "Sample", yname="N. det. genes", legend.title = "Day",y.leg=0.2)
# dev.off()

#the three QC plots on one page (2 columns)
pdf(file="/Users/scialdone/Google Drive/EBV/Analysis/Figures/QC.pdf", 
    useDingbats = FALSE, width = 18, height=14)
#'finally' closes the pdf device even if drawing fails
tryCatch(multiplot(p1,p2,p3, cols=2),
         finally = dev.off())
