require(DESeq2)
require(data.table)
require(ggplot2)
require(grid)
require(statmod)
require(WGCNA)
require(dynamicTreeCut)
require(Rtsne)
require(gplots)
require(edgeR)
require(zoo)
require(scran)
require(vioplot)
require(limSolve)
require(xlsx)
require(topGO)
require(org.Mm.eg.db)
require(zoo)
#PLOTTING ########
# Standard scatter plot: columns x and y of df are drawn as points (size 5)
# and coloured by column z on a discrete colour scale.
#   df     : data frame providing the x, y and z columns used by the aesthetics
#   xname  : x-axis label
#   yname  : y-axis label
#   legend : title of the colour legend
#   title  : plot title
# Returns the ggplot object.
plot.std <- function(df, xname, yname, legend, title) {
  # Font sizes, plot margins (in cm) and title styling gathered in one theme.
  std.theme <- theme(
    axis.title.x = element_text(face="bold", size=30, vjust=-2),
    axis.title.y = element_text(face="bold", size=30, vjust=2),
    axis.text.x  = element_text(size=25),
    axis.text.y  = element_text(size=30),
    legend.text  = element_text(size=15),
    legend.title = element_text(size=15),
    plot.title   = element_text(lineheight=.8, face="bold", size=30),
    plot.margin  = unit(c(1,1,1.5,1.2), "cm")
  )

  scatter <- ggplot(df, aes(x=x, y=y, color=z)) + geom_point(size=5)

  # The legend title is attached to the colour scale because z is mapped to colour.
  scatter +
    scale_color_discrete(name=legend) +
    ggtitle(title) +
    xlab(xname) +
    ylab(yname) +
    std.theme
}
# Scatter plot with caller-supplied colours: columns x and y of df are drawn as
# points (size 2) and coloured by column z through a manual colour scale.
#   df     : data frame providing the x, y and z columns used by the aesthetics
#   xname  : x-axis label
#   yname  : y-axis label
#   legend : title of the colour legend
#   title  : plot title
#   col    : colour values passed to scale_color_manual(values=)
# Returns the ggplot object.
plot.std.cluster <- function(df, xname, yname, legend, title, col) {
  cluster.theme <- theme(
    axis.title.x = element_text(face="bold", size=15, vjust=-2),
    axis.title.y = element_text(face="bold", size=15, vjust=2),
    axis.text.x  = element_text(size=10),
    axis.text.y  = element_text(size=10),
    legend.text  = element_text(size=10),
    legend.title = element_text(size=15),
    plot.title   = element_text(lineheight=.8, face="bold", size=30),
    plot.margin  = unit(c(1,1,1.5,1.2), "cm")
  )

  scatter <- ggplot(df, aes(x=x, y=y, color=z)) + geom_point(size=2)

  # z is mapped to colour, so the manual colour scale carries the legend title.
  scatter +
    scale_color_manual(name=legend, values=col) +
    ggtitle(title) +
    xlab(xname) +
    ylab(yname) +
    cluster.theme
}
#simple boxplot
# Box plot of column y of df for every value of column x, with the median of
# each group overlaid as a point.
#   df    : data frame providing the x and y columns used by the aesthetics
#   xname : x-axis label
#   yname : y-axis label
#   title : plot title
#   logy  : TRUE puts the y axis on a log10 scale, FALSE (default) leaves it linear.
#           Any single value that compares equal to TRUE or FALSE is accepted.
# Returns the ggplot object. Stops with an error when logy is neither TRUE nor
# FALSE; the previous implementation silently returned NULL in that case.
plot.box <- function(df, xname, yname, title, logy=FALSE) {
  use.log <- isTRUE(logy == TRUE)
  no.log  <- isTRUE(logy == FALSE)
  if (!use.log && !no.log) {
    stop("'logy' must be TRUE or FALSE", call. = FALSE)
  }

  # stat_summary() renamed its 'fun.y' argument to 'fun' in ggplot2 3.3.0.
  # Pick the name the installed version expects so that old installations keep
  # working and new ones do not raise a deprecation warning.
  fun.arg <- if (utils::packageVersion("ggplot2") >= "3.3.0") "fun" else "fun.y"
  median.point <- do.call(
    stat_summary,
    c(stats::setNames(list("median"), fun.arg), list(geom="point"))
  )

  # The log and linear variants historically used different y-title sizes
  # (25 vs 15); both values are kept so existing figures do not change.
  y.title.size <- if (use.log) 25 else 15

  p <- ggplot(df, aes(x=x, y=y)) +
    geom_boxplot() +
    median.point +
    xlab(xname) +
    ylab(yname) +
    ggtitle(title) +
    theme(axis.title.x = element_text(face="bold", size=15, vjust=-2),
          axis.text.x  = element_text(size=10, angle=90),
          axis.title.y = element_text(face="bold", size=y.title.size, vjust=2),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=15, face="bold")) +
    theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))

  if (use.log) {
    p <- p + scale_y_log10()
  }
  p
}
# Scatter plot coloured by expression: columns x and y of df are drawn as
# points (size 1.5) and coloured by column log10.exp on a continuous gradient
# running from "green4" (low) to "red" (high).
#   df    : data frame providing the x, y and log10.exp columns used by the aesthetics
#   xname : x-axis label
#   yname : y-axis label
#   title : plot title
# Returns the ggplot object.
plot.std.col <- function(df, xname, yname, title) {
  expr.theme <- theme(
    axis.title.x = element_text(face="bold", size=15, vjust=-2),
    axis.title.y = element_text(face="bold", size=15, vjust=2),
    axis.text.x  = element_text(size=10),
    axis.text.y  = element_text(size=10),
    legend.text  = element_text(size=10),
    legend.title = element_text(size=10),
    plot.title   = element_text(lineheight=.8, face="bold", size=15),
    plot.margin  = unit(c(1,1,1.5,1.2), "cm")
  )

  scatter <- ggplot(df, aes(x=x, y=y, color=log10.exp)) + geom_point(size=1.5)

  scatter +
    scale_color_gradient(low="green4", high="red") +
    ggtitle(title) +
    xlab(xname) +
    ylab(yname) +
    expr.theme
}
# Bar plot: one bar per row of df with height y at position x, filled by z.
# Bars are drawn with stat="identity", dodged side by side and outlined in
# black, on a theme_bw() base with the panel border and grid removed. The
# legend sits inside the plotting area.
#   df           : data frame providing the x, y and z columns used by the aesthetics
#   xname        : x-axis label
#   yname        : y-axis label
#   title        : plot title (default NULL)
#   names        : accepted but not used anywhere in the body
#   legend.title : title of the fill legend (default NULL)
#   x.leg, y.leg : legend position handed to theme(legend.position=); the legend
#                  is anchored by its bottom-left corner (justification c(0,0))
# Returns the ggplot object.
plot.bar <- function(df, xname, yname, title=NULL, names=NULL, legend.title=NULL,
                     x.leg=0.65, y.leg=0.65) {
  bars <- geom_bar(aes(x=x, y=y, fill=z), size=1, stat="identity", colour="black", position="dodge")

  # A single theme() call holding both the panel clean-up and the text/legend
  # settings; it is added after theme_bw() so these tweaks are not overwritten.
  tweaks <- theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.title.x = element_text(face="bold", size=20, vjust=-2),
    axis.text.x  = element_text(size=20, angle=90),
    axis.title.y = element_text(face="bold", size=20, vjust=2),
    axis.text.y  = element_text(size=20),
    plot.margin = unit(c(1,1,1.5,1.2), "cm"),
    legend.text = element_text(size=15),
    legend.title = element_text(size=15),
    legend.key = element_blank(),
    legend.background = element_rect(size=0.3, color="black"),
    legend.justification = c(0,0),
    legend.position = c(x.leg, y.leg)
  )

  ggplot(data=df) +
    bars +
    xlab(xname) +
    ylab(yname) +
    ggtitle(title) +
    theme_bw() +
    tweaks +
    scale_fill_discrete(name=legend.title)
}
#multiplot
# Draw several plots on one page using a grid layout.
#   ...      : plot objects
#   plotlist : optional list of further plot objects, appended after those in ...
#   file     : accepted for backward compatibility; never used in the body
#   cols     : number of columns of the automatically generated layout
#   layout   : optional matrix of plot indices; cell value i marks where plot i
#              is drawn. When NULL a layout with 'cols' columns is generated.
# With no plots the function returns invisible NULL without touching the
# graphics device (previously the zero-plot case failed). With exactly one
# plot it is simply printed; otherwise a new grid page is started and every
# plot is printed into the viewport(s) matching its index in 'layout'.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw: bail out before building a layout or opening a page.
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Number of rows needed, calculated from the number of columns
    n.rows <- ceiling(numPlots/cols)
    layout <- matrix(seq_len(cols * n.rows), ncol = cols, nrow = n.rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page; grid functions are namespace-qualified so the function
    # does not need to attach the package itself.
    grid::grid.newpage()
    grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the row/column positions of the layout cells that contain plot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = grid::viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
    }
  }
}
#resave(plot.box,file = file.to.load )
#HVG ######
#"dm" method
# Select highly variable genes by their distance ("dm") from a rolling median
# of log10(CV^2) along log10(mean expression).
#   data         : matrix with normalized data, one row per gene
#   n.hvg        : number of genes with the highest dm to report as highly
#                  variable; capped at the number of genes, 0 gives none
#   plot         : if TRUE draw three diagnostic panels (CV^2 vs mean, dm vs
#                  mean, and the rolling medians)
#   title        : main title of the first panel (default NULL = no title)
#   window.width : number of genes per rolling-median window (default 100)
#   window.by    : step, in genes, between successive windows (default 50)
# Returns a list with
#   dm             : per-gene log10(CV^2) minus the rolling-median log10(CV^2)
#                    of the first window whose median mean-expression exceeds
#                    the gene's own (the last window if there is none)
#   genes.high.var : row indices of the selected genes, highest dm first
find.high.var.dm <- function(data, n.hvg, plot=TRUE, title=NULL,
                             window.width=100, window.by=50) {
  if (!is.numeric(n.hvg) || length(n.hvg) != 1L || is.na(n.hvg) || n.hvg < 0) {
    stop("'n.hvg' must be a single non-negative number", call. = FALSE)
  }
  # With fewer genes than one window the rolling medians are empty and every
  # dm value would be undefined, so fail early with a clear message.
  if (nrow(data) < window.width) {
    stop(sprintf("'data' has %s rows but at least window.width = %s are needed",
                 nrow(data), window.width), call. = FALSE)
  }

  gene.mean <- apply(data, 1, mean)
  gene.sd   <- apply(data, 1, sd)
  df <- data.frame(log.avg=log10(gene.mean),
                   log.cv2=log10((gene.sd/gene.mean)^2))

  # Rolling medians over the genes sorted by increasing mean expression.
  by.avg <- order(df[,"log.avg"], decreasing=FALSE)
  df.roll <- data.frame(
    roll.avg=zoo::rollapply(df[by.avg,"log.avg"], width=window.width, FUN=median, by=window.by),
    roll.cv2=zoo::rollapply(df[by.avg,"log.cv2"], width=window.width, FUN=median, by=window.by)
  )
  n.windows <- nrow(df.roll)

  dm <- apply(as.matrix(df), 1, function(gene) {
    above <- which(df.roll[,"roll.avg"] > gene["log.avg"])
    window <- if (length(above) == 0) n.windows else min(above)
    gene["log.cv2"] - df.roll[window,"roll.cv2"]
  })

  #select highly variable
  # head() instead of [1:n.hvg]: no NA padding when n.hvg exceeds the number
  # of genes, and an empty result (not index 1) when n.hvg is 0.
  high.var <- head(order(dm, decreasing=TRUE), n.hvg)

  if (plot == TRUE) {
    # Restore the caller's panel layout when the function exits.
    old.par <- par(mfrow=c(1,3))
    on.exit(par(old.par), add = TRUE)
    # 'plot' below resolves to the graphics function: R skips the logical
    # argument of the same name when looking up a function to call.
    plot(df[,"log.avg"],
         df[,"log.cv2"], xlab = "Avg. expression",
         ylab="CV^2", cex.lab=1.7, cex.axis=1.5, main=title)
    points(df[high.var,"log.avg"], df[high.var,"log.cv2"], col="red")
    plot(df[,"log.avg"], dm, xlab="Avg. expression",
         ylab="DM", cex.lab=1.7, cex.axis=1.5)
    points(df[high.var,"log.avg"], dm[high.var], col="red")
    plot(df.roll[,1], df.roll[,2])
  }

  return(list(dm=dm, genes.high.var=high.var))
}
#DESEQ
# Differential expression between two groups of samples (one factor) with DESeq2.
#   counts.merged : count table indexed as [genes, samples]
#   cells.1       : columns of counts.merged forming the first group
#   name.1        : label of the first group; it is the first factor level, so
#                   it should be the control condition
#   cells.2       : columns of counts.merged forming the second group
#   name.2        : label of the second group (must differ from name.1)
#   genes         : rows of counts.merged to test
#   cooks.dist    : value passed on to results(cooksCutoff=)
# Side effect: draws an MA plot titled "<name.1> vs <name.2>".
# Returns the results table ordered by increasing adjusted p-value (padj).
deseq.1 <- function(counts.merged, cells.1, name.1, cells.2, name.2, genes, cooks.dist) {
  if (identical(as.character(name.1), as.character(name.2))) {
    stop("'name.1' and 'name.2' must be two different group labels", call. = FALSE)
  }

  # drop=FALSE keeps the two-dimensional shape even when a group has a single
  # sample or a single gene is requested; without it ncol() and cbind() below
  # would operate on a plain vector.
  counts.1 <- counts.merged[genes, cells.1, drop=FALSE]
  counts.2 <- counts.merged[genes, cells.2, drop=FALSE]

  countData <- cbind(counts.1, counts.2) #collect data
  colnames(countData) <- c(cells.1, cells.2)
  row.names(countData) <- row.names(counts.1)

  #group names (needs to have levels specified)
  group <- rep(c(name.1, name.2), times=c(ncol(counts.1), ncol(counts.2)))
  colData <- data.frame(Type=factor(group, levels=c(name.1, name.2)))
  row.names(colData) <- colnames(countData)

  dds <- DESeqDataSetFromMatrix(countData=countData,
                                colData=colData,
                                design= ~ Type)
  #it's important to put the control condition as first element here
  dds$Type <- factor(dds$Type, levels=c(name.1, name.2))
  dds <- DESeq(dds)
  DESeq2::plotMA(dds, main=paste0(name.1, " vs ", name.2))

  res <- results(dds, cooksCutoff = cooks.dist)
  resOrdered <- res[order(res$padj),]
  return(resOrdered)
}
#load data
# Raw count table: the first CSV column holds the feature names and becomes the
# row names; every remaining column is one sample.
raw.counts<-read.csv("/Users/PaulinaMrozek/Desktop/RNA-seq_procedure_AS/Antonio_analysis/171206/Data_sent_to_Antonio_from_Alex/2Alex_for_Antonio/Non_norm.PolyA_NamedByAlex.csv",
                     header=TRUE)
row.names(raw.counts)<-as.character(raw.counts[[1]])
raw.counts<-raw.counts[,-1]
#meta data
# Sample names are split on "_": the first piece is stored as the day, the
# second as the batch.
meta.data<-local({
  name.parts<-strsplit(colnames(raw.counts), "_")
  data.frame(day=unlist(lapply(name.parts, `[[`, 1)),
             batch=unlist(lapply(name.parts, `[[`, 2)))
})
row.names(meta.data)<-colnames(raw.counts)
# Sort the samples by the number that follows "Day" in the day label, then put
# the count columns in the same order.
meta.data<-local({
  day.pieces<-strsplit(as.character(meta.data[,"day"]), split = "Day")
  day.number<-as.numeric(unlist(lapply(day.pieces, `[[`, 2)))
  meta.data[order(day.number),]
})
raw.counts<-raw.counts[,row.names(meta.data)]
# Feature classes, identified from the row names.
# NOTE(review): grepl() treats ".t01" as a regular expression, so the "." matches
# any character, not only a literal dot - confirm this is what is intended.
is.viral<-grepl(pattern=".t01", x=row.names(raw.counts))
is.ercc<-grepl(pattern="ERCC-", row.names(raw.counts))
#Number of detected endogenous genes (threshold: > 10RPM)
# temp: per-sample reads-per-million of the features that are neither ERCC nor viral
temp<-apply(raw.counts[(!is.ercc)&(!is.viral),],2,function(counts) 1e6*(counts/sum(counts)))
test<-apply(temp,2,function(rpm) length(which(rpm>10)))
# Per-sample QC table (one row per sample):
#   tot.counts        library size
#   endog.viral.genes non-ERCC counts (incl. viral) relative to ERCC counts
#   endog.genes       counts that are neither ERCC nor viral, relative to ERCC counts
#   viral.endog       viral counts relative to the non-ERCC, non-viral counts
#   viral.genes       viral counts relative to ERCC counts
#   n.det.endog       number of non-ERCC, non-viral features above 10 RPM
qc<-local({
  spike.total<-colSums(raw.counts[is.ercc,])
  viral.total<-colSums(raw.counts[is.viral,])
  endog.total<-colSums(raw.counts[(!is.ercc)&(!is.viral),])
  data.frame(tot.counts=colSums(raw.counts),
             endog.viral.genes=colSums(raw.counts[!is.ercc,])/spike.total,
             endog.genes=endog.total/spike.total,
             viral.endog=viral.total/endog.total,
             viral.genes=viral.total/spike.total,
             n.det.endog=test,
             row.names=colnames(raw.counts))
})
#library size
# One bar per sample (kept in the row order of qc), filled by the sample's day.
df<-data.frame(x=factor(row.names(qc), levels=row.names(qc)),
               y=qc[["tot.counts"]],
               z=factor(meta.data[row.names(qc),"day"],
                        levels=unique(meta.data[row.names(qc),"day"])))
plot.bar(df, xname="Sample", yname="Library size", legend.title="Day")
#all non-ERCC features (the mask removes spike-ins only, so viral features stay in);
#sample "Day8_1" is left out of the normalisation
sf<-estimateSizeFactorsForMatrix(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])
#barplot(sf, las=2)
# Divide every sample (column) by its size factor.
norm.data<-t(t(raw.counts[(!is.ercc),colnames(raw.counts)!="Day8_1"])/sf)
#
#find avg genes
# Day label of every normalised sample, named by sample.
labels<-meta.data[colnames(norm.data), "day"]
names(labels)<-colnames(norm.data)
# Mean normalised count per gene and day. Samples are grouped by their day
# label rather than by fixed column positions, so the result stays correct when
# the number of replicates per day changes or a sample is removed. Columns are
# named after the day labels, in their order of first appearance in norm.data.
norm.data_mean<-local({
  day.of.sample<-as.character(labels)
  days<-unique(day.of.sample)
  per.day<-lapply(days, function(d) rowMeans(norm.data[, day.of.sample==d, drop=FALSE]))
  names(per.day)<-days
  do.call(cbind, per.day)
})
# Bar plots of the per-day mean normalised counts, one plot per gene symbol.
# - The fill factor is built with levels= only. Passing "day" as a second
#   positional argument to factor() made it the 'labels' argument, which
#   relabelled the legend as day1, day2, ... instead of the real day names.
# - Symbols missing from norm.data_mean are skipped with a message instead of
#   stopping the script with a subscript error.
# - Each symbol is plotted once (NT5E used to be plotted twice). The alias
#   "CD73" only ever had its data frame built and was never plotted, so it is
#   not listed.
# - print() is explicit because plots are not auto-printed inside a loop.
genes.to.plot<-c("NT5E", "CALJA", "E5NT", "NTE", "KLF4")
for(gene in genes.to.plot){
  if(!(gene %in% row.names(norm.data_mean))){
    message("Skipping '", gene, "': not found in the row names of norm.data_mean")
    next
  }
  df<-data.frame(x=factor(colnames(norm.data_mean), levels=colnames(norm.data_mean)),
                 y=norm.data_mean[gene,],
                 z=factor(colnames(norm.data_mean), levels=unique(colnames(norm.data_mean))))
  print(plot.bar(df, xname = "Sample", yname="Norm. counts", legend.title = "Day", x.leg = 0.4,
                 y.leg=0.3, title =gene ))
}
