#title: "MNaseseq_plots_script"
# author: Dr. Alexander Buschle
# application: PhD thesis of Paulina Mrozek-Gorska; visualizes the fragment length distribution of MNase-seq data

# visualize the average fragment length


library(data.table)

# Clear the global environment so the run starts from an empty workspace.
rm(list = ls())

# Switch to the directory holding the per-sample tables; the statements below
# and in later sections resolve file names relative to the working directory.
setwd("/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run/0_Sam")

# Names of all directory entries matching the regular expression "sam.txt"
# (the "." is a regex wildcard, so it matches any single character).
txt_files <- list.files(pattern = "sam.txt")



# ---- Per-sample fragment-length histograms ----
#
# For every file in `txt_files` this section reads the table, keeps the rows
# with a positive V2 (plotted below as "Fragment length"), splits them by
# whether V1 contains "chr" (labelled "cellular" in the figures; all other rows
# are labelled "EBV"), writes one PDF with both log10-count histograms
# overlaid, and records mean / SD of both groups for the summary figure that
# follows the loop.

# NOTE(review): the file names were listed in .../New_run/0_Sam, but the tables
# are read from the parent directory .../New_run. Confirm that both
# directories hold identically named tables.
run_dir <- "/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run"
out_dir <- "/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run/0_Sam/0_Out"

# Draw "|" ticks at centre - spread, centre and centre + spread at height `y`
# and join the two outer ticks with a horizontal bar.
draw_mean_sd_marker <- function(centre, spread, y, size, col_centre, col_spread) {
  text(centre, y, labels = "|", cex = size, col = col_centre)
  text(centre - spread, y, labels = "|", cex = size, col = col_spread)
  text(centre + spread, y, labels = "|", cex = size, col = col_spread)
  arrows(centre - spread, y, centre + spread, y, code = 0, lwd = size, col = col_spread)
}

# Write the overlaid log10 histograms of cellular and EBV fragment lengths to
# `pdf_path`. The device is closed on exit even if a plotting call fails.
write_fragment_histogram <- function(cellular_len, ebv_len, plot_title, pdf_path) {
  pdf(file = pdf_path, width = 10, height = 10, family = "Helvetica",
      pointsize = 12, useDingbats = FALSE)
  on.exit(dev.off(), add = TRUE)

  # Cellular histogram; counts are log10-transformed before drawing, so empty
  # bins become -Inf and are not visible inside the fixed y range.
  # NOTE(review): the y label reads "occurance" (sic); left untouched here.
  cellular_mean <- mean(cellular_len)
  cellular_sd <- sd(cellular_len)
  cellular_hist <- hist(cellular_len, breaks = 50, plot = FALSE)
  cellular_hist$counts <- log10(cellular_hist$counts)
  plot(cellular_hist, col = "lightgrey", main = plot_title,
       xlab = "Fragment length", ylab = "log10 occurance", ylim = c(0, 6.8))
  draw_mean_sd_marker(cellular_mean, cellular_sd, max(cellular_hist$counts) - 1,
                      size = 3, col_centre = "grey45", col_spread = "grey45")
  text(65, 6.0, paste("Mean cellular genome:", round(cellular_mean, 2)))
  text(65, 5.7, paste("    SD cellular genome:  ", round(cellular_sd, 2)))

  # EBV histogram on top of the cellular one. hist() cannot bin an empty
  # vector, so the overlay is skipped when a sample has no EBV rows.
  if (length(ebv_len) > 0) {
    ebv_mean <- mean(ebv_len)
    ebv_sd <- sd(ebv_len)
    ebv_hist <- hist(ebv_len, breaks = 50, plot = FALSE)
    ebv_hist$counts <- log10(ebv_hist$counts)
    plot(ebv_hist, col = "darkgrey", xlab = "Fragment length", add = TRUE)
    draw_mean_sd_marker(ebv_mean, ebv_sd, max(ebv_hist$counts) - 0.5,
                        size = 2, col_centre = "grey20", col_spread = "grey30")
    text(65, 5.4, paste("    Mean EBV genome:", round(ebv_mean, 2)))
    text(65, 5.1, paste("        SD EBV genome:  ", round(ebv_sd, 2)))
  }

  # Vertical reference line at a fragment length of 147.
  abline(v = 147, col = "black", lwd = 3)
  text(152, 6.9, "147")

  # Horizontal reference lines at raw counts of 300, 600, 900 and 3,000,000;
  # each label is placed at a hand-tuned offset next to its line.
  text(c(253, 253, 253, 243),
       log10(c(245, 500, 1100, 3750000)),
       c("300", "600", "900", "3.000.000"))
  abline(h = log10(c(300, 600, 900, 3000000)), col = "black", lwd = 1)

  legend(205, 5.9, c("Cellular", "EBV_2089"), col = c("lightgrey", "darkgrey"), pch = c(15, 15))

  invisible(pdf_path)
}

# One slot per sample file. Mean_vector / SD_vector hold the cellular
# statistics and Mean_EBV_vector / SD_EBV_vector the EBV statistics, matching
# the "Cellular" (blue) / "EBV" (red) legend of the summary figure.
n_samples <- length(txt_files)
Mean_vector <- numeric(n_samples)
SD_vector <- numeric(n_samples)
Mean_EBV_vector <- numeric(n_samples)
SD_EBV_vector <- numeric(n_samples)

# Concatenated boxplot statistics (stats, n, conf, out, group, names) of every
# sample; kept for inspection, not used by the figures in this script.
Boxplot_vector <- vector()
Boxplot_EBV <- vector()

# seq_along() yields no iterations for an empty file listing (1:0 would not).
for (i in seq_along(txt_files)) {
  sample_file <- txt_files[i]

  # skip = 90 drops the first 90 lines of the table before parsing.
  fragments <- fread(file.path(run_dir, sample_file), skip = 90)
  fragments <- fragments[fragments$V2 > 0, ]

  # grepl() gives one TRUE/FALSE per row, so both groups are plain logical
  # subsets of the same length vector.
  is_cellular <- grepl("chr", fragments$V1)
  cellular_len <- fragments$V2[is_cellular]
  ebv_len <- fragments$V2[!is_cellular]

  plot_title <- gsub("col_3_9_cutTo_53960846_", "", sample_file)
  write_fragment_histogram(cellular_len, ebv_len, plot_title,
                           file.path(out_dir, paste0(sample_file, "with_SD.pdf")))

  Mean_vector[i] <- mean(cellular_len)
  SD_vector[i] <- sd(cellular_len)
  Mean_EBV_vector[i] <- mean(ebv_len)
  SD_EBV_vector[i] <- sd(ebv_len)

  # plot = FALSE: only the statistics are wanted; drawing here would open a
  # stray default graphics device because the PDF is already closed.
  Boxplot_vector <- c(Boxplot_vector, boxplot(cellular_len, horizontal = TRUE, plot = FALSE))
  Boxplot_EBV <- c(Boxplot_EBV, boxplot(ebv_len, horizontal = TRUE, plot = FALSE))
}

# The figures that follow are written relative to the working directory, so
# leave it pointing at the output directory.
setwd(out_dir)





# ---- Summary figure: mean +/- SD of fragment length per sample ----
# Written as "Mean_SD_.pdf" into the current working directory. The blue series
# is taken from Mean_vector / SD_vector (legend "Cellular"), the red series
# from Mean_EBV_vector / SD_EBV_vector (legend "EBV").
pdf(file = "Mean_SD_.pdf", width = 10, height = 10, family = "Helvetica", pointsize = 12, useDingbats = FALSE)

# Time-course samples: txt_files[1:7] are drawn under the tick labels 0-5 and 8.
day_idx <- 1:7
day_pos <- c(1, 2, 3, 4, 5, 6, 9)

# LCL samples: the tick labels at x = 11, 12, 13 read LCL32, LCL55, LCL110.
# The boxplot section of this script loads those samples from txt_files[9],
# txt_files[10] and txt_files[8] respectively, so the same index order is used
# here to keep data and tick labels aligned.
lcl_idx <- c(9, 10, 8)
lcl_pos <- c(11, 12, 13)

all_idx <- c(day_idx, lcl_idx)
all_pos <- c(day_pos, lcl_pos)

# Blue series: markers, connecting line over the time course, LCL markers and
# +/- SD error bars (flat-headed arrows).
plot(day_pos, Mean_vector[day_idx], xlim = c(1, 13), ylim = c(100, 200),
     pch = 8, xlab = "Days", ylab = "Fragment length", col = "blue", lwd = 3,
     axes = FALSE, main = "Mean and Standard deviation")

axis(1, labels = c(0,1,2,3,4,5,NA, NA, 8,NA, "LCL32", "LCL55", "LCL110"), at = c(1:13))
axis(2)

lines(day_pos, Mean_vector[day_idx], col = "blue", lwd = 3)
points(lcl_pos, Mean_vector[lcl_idx], pch = 8, col = "blue", lwd = 3)
arrows(all_pos, Mean_vector[all_idx] - SD_vector[all_idx],
       all_pos, Mean_vector[all_idx] + SD_vector[all_idx],
       code = 3, angle = 90, lwd = 1, col = "blue")

# Horizontal reference line at a fragment length of 147.
abline(h = 147, lwd = 2)

# Red series: same layout, but the first sample (x = 1) is left out.
ebv_day_idx <- day_idx[-1]
ebv_day_pos <- day_pos[-1]
ebv_all_idx <- c(ebv_day_idx, lcl_idx)
ebv_all_pos <- c(ebv_day_pos, lcl_pos)

points(ebv_day_pos, Mean_EBV_vector[ebv_day_idx], pch = 4, col = "red", lwd = 3)
lines(ebv_day_pos, Mean_EBV_vector[ebv_day_idx], col = "red", lwd = 3)
points(lcl_pos, Mean_EBV_vector[lcl_idx], pch = 4, col = "red", lwd = 3)
arrows(ebv_all_pos, Mean_EBV_vector[ebv_all_idx] - SD_EBV_vector[ebv_all_idx],
       ebv_all_pos, Mean_EBV_vector[ebv_all_idx] + SD_EBV_vector[ebv_all_idx],
       code = 3, angle = 90, col = "red", lwd = 1)

legend(1, 195, c("Cellular", "EBV"), col = c("blue", "red"), lwd = 3, pch = c(8, 4))

dev.off()







# ---- Load every sample once more for the boxplot figures ----
# Each sample yields three tables: all rows with V2 > 0 (<name>), the rows
# whose V1 does not contain "chr" (<name>_EBV2089) and the rows whose V1 does
# (<name>_NONebv2089).
setwd("/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run/0_Sam")

# Read one table (skipping its first 90 lines) and keep rows with V2 > 0.
read_positive_fragments <- function(sample_file) {
  reads <- fread(sample_file, skip = 90)
  reads[reads$V2 > 0, ]
}

# Rows whose V1 does NOT contain "chr". A leading `!` in the row index of a
# data.table means "all rows except these", so integer positions from grep()
# are negated correctly here.
ebv_rows <- function(reads) {
  reads[!grep("chr", reads$V1), ]
}

# Rows whose V1 contains "chr".
cellular_rows <- function(reads) {
  reads[grep("chr", reads$V1), ]
}

Day0 <- read_positive_fragments(txt_files[1])
Day0_EBV2089 <- ebv_rows(Day0)
Day0_NONebv2089 <- cellular_rows(Day0)

Day1 <- read_positive_fragments(txt_files[2])
Day1_EBV2089 <- ebv_rows(Day1)
Day1_NONebv2089 <- cellular_rows(Day1)

Day2 <- read_positive_fragments(txt_files[3])
Day2_EBV2089 <- ebv_rows(Day2)
Day2_NONebv2089 <- cellular_rows(Day2)

Day3 <- read_positive_fragments(txt_files[4])
Day3_EBV2089 <- ebv_rows(Day3)
Day3_NONebv2089 <- cellular_rows(Day3)

Day4 <- read_positive_fragments(txt_files[5])
Day4_EBV2089 <- ebv_rows(Day4)
Day4_NONebv2089 <- cellular_rows(Day4)

Day5 <- read_positive_fragments(txt_files[6])
Day5_EBV2089 <- ebv_rows(Day5)
Day5_NONebv2089 <- cellular_rows(Day5)

Day8 <- read_positive_fragments(txt_files[7])
Day8_EBV2089 <- ebv_rows(Day8)
Day8_NONebv2089 <- cellular_rows(Day8)

# The LCL samples are not in listing order: LCL32 is file 9, LCL55 is file 10
# and LCL110 is file 8.
LCL32 <- read_positive_fragments(txt_files[9])
LCL32_EBV2089 <- ebv_rows(LCL32)
LCL32_NONebv2089 <- cellular_rows(LCL32)

LCL55 <- read_positive_fragments(txt_files[10])
LCL55_EBV2089 <- ebv_rows(LCL55)
LCL55_NONebv2089 <- cellular_rows(LCL55)

LCL110 <- read_positive_fragments(txt_files[8])
LCL110_EBV2089 <- ebv_rows(LCL110)
LCL110_NONebv2089 <- cellular_rows(LCL110)

# The boxplot figures below are written relative to the working directory.
setwd("/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run/0_Sam/0_Out")



# Boxplot of the EBV fragment lengths (V2) per sample, written to
# "Boxplot_EBV.pdf" in the current working directory. Outliers are not drawn.
pdf(file = "Boxplot_EBV.pdf", width = 10, height = 10, family = "Helvetica", pointsize = 12, useDingbats = FALSE)

# Ten groups in axis order. The first slot is an NA placeholder, so nothing is
# drawn above the "0" tick and the remaining boxes line up with their labels.
ebv_groups <- list(
  NA,
  Day1_EBV2089$V2, Day2_EBV2089$V2, Day3_EBV2089$V2,
  Day4_EBV2089$V2, Day5_EBV2089$V2, Day8_EBV2089$V2,
  LCL32_EBV2089$V2, LCL55_EBV2089$V2, LCL110_EBV2089$V2
)

# `axes` is passed exactly once; the axes are drawn by hand below so the
# ticks can carry the sample labels.
boxplot(ebv_groups,
        ylab = "DNA fragment length [bp]", xlab = "Days post infection",
        axes = FALSE, col = c(rep("grey", 7), rep("red", 3)),
        main = "DNA fragment size distribution in EBV genome",
        cex.lab = 1, cex.main = 1, outline = FALSE)

axis(1, labels = c(0:5,8,"LCL32", "LCL55", "LCL110"), at = 1:10, cex.axis = 1)
axis(2, cex.axis = 1)

# Horizontal reference line at a fragment length of 147 bp.
abline(h = 147)

dev.off()







# Boxplot of the cellular fragment lengths (V2) per sample, written to
# "Boxplot_Cellular.pdf" in the current working directory. Outliers are not
# drawn.
pdf(file = "Boxplot_Cellular.pdf", width = 10, height = 10, family = "Helvetica", pointsize = 12, useDingbats = FALSE)

# Ten groups in axis order: seven time-course samples, then the three LCLs.
cellular_groups <- list(
  Day0_NONebv2089$V2, Day1_NONebv2089$V2, Day2_NONebv2089$V2,
  Day3_NONebv2089$V2, Day4_NONebv2089$V2, Day5_NONebv2089$V2,
  Day8_NONebv2089$V2,
  LCL32_NONebv2089$V2, LCL55_NONebv2089$V2, LCL110_NONebv2089$V2
)

# Axes are suppressed here and drawn by hand below so the ticks can carry the
# sample labels.
boxplot(cellular_groups,
        ylab = "DNA fragment length [bp]", xlab = "Days post infection",
        axes = FALSE, col = rep(c("lightgrey", "red"), times = c(7, 3)),
        main = "DNA fragment size distribution in cellular genome",
        cex.lab = 1, cex.main = 1, outline = FALSE)

axis(side = 1, at = 1:10, labels = c(0:5,8,"LCL32", "LCL55", "LCL110"), cex.axis = 1, cex.lab = 1)
axis(side = 2, cex.axis = 1, cex.lab = 1)

# Horizontal reference line at a fragment length of 147 bp.
abline(h = 147)

dev.off()





# PNG version of the EBV boxplot (960 x 960 px, doubled text sizes), written to
# "Boxplot_EBV.png" in the current working directory. Outliers are not drawn.
png(filename = "Boxplot_EBV.png", width = 960, height = 960, family = "Calibri")

# Ten groups in axis order. The first slot is an NA placeholder, so nothing is
# drawn above the "0" tick.
ebv_groups_png <- list(
  NA,
  Day1_EBV2089$V2, Day2_EBV2089$V2, Day3_EBV2089$V2,
  Day4_EBV2089$V2, Day5_EBV2089$V2, Day8_EBV2089$V2,
  LCL32_EBV2089$V2, LCL55_EBV2089$V2, LCL110_EBV2089$V2
)

# The y label is left empty here and added with title() below, which allows
# its distance from the axis (line = 3) to be set.
boxplot(ebv_groups_png,
        ylab = "", xlab = "Days post infection",
        axes = FALSE, col = rep(c("grey", "red"), times = c(7, 3)),
        main = "DNA fragment size distribution in EBV genome",
        cex.main = 2, cex.lab = 2, outline = FALSE)

title(ylab = "DNA fragment length [bp]", line = 3, cex.lab = 2, family = "Calibri")

axis(side = 1, at = 1:10, labels = c(0:5,8,"LCL32", "LCL55", "LCL110"), cex.axis = 2, cex.lab = 2)
axis(side = 2, cex.axis = 2, cex.lab = 2)

# Horizontal reference line at a fragment length of 147 bp.
abline(h = 147)

dev.off()






# PNG version of the cellular boxplot (960 x 960 px, doubled text sizes),
# written to "Boxplot_Cellular.png" in the current working directory.
png(filename = "Boxplot_Cellular.png", width = 960, height = 960, family = "Helvetica")

# Ten groups in axis order: seven time-course samples, then the three LCLs.
cellular_groups_png <- list(
  Day0_NONebv2089$V2, Day1_NONebv2089$V2, Day2_NONebv2089$V2,
  Day3_NONebv2089$V2, Day4_NONebv2089$V2, Day5_NONebv2089$V2,
  Day8_NONebv2089$V2,
  LCL32_NONebv2089$V2, LCL55_NONebv2089$V2, LCL110_NONebv2089$V2
)

# NOTE(review): unlike the other three boxplot figures, `outline` is not set
# to FALSE here, so outlier points are drawn. Confirm that this is intended.
boxplot(cellular_groups_png,
        ylab = "DNA fragment length [bp]", xlab = "Days post infection",
        axes = FALSE, col = rep(c("lightgrey", "red"), times = c(7, 3)),
        main = "DNA fragment size distribution in cellular genome",
        cex.lab = 2, cex.main = 2)

# pos = 0.5 draws the x axis along the line y = 0.5 rather than at the plot edge.
axis(side = 1, at = 1:10, labels = c(0:5,8, "LCL32", "LCL55", "LCL110"), cex.axis = 2, cex.lab = 2, pos = 0.5)
axis(side = 2, cex.axis = 2, cex.lab = 2)

# Horizontal reference line at a fragment length of 147 bp.
abline(h = 147)

dev.off()

# Show the directory the figures were written to.
getwd()




#

# points(c(2:7), Mean_EBV_vector[2:7], pch = 4, col = "red", lwd = 3)

# points(c(2:7), Mean_EBV_vector[2:7], type = "l", xlim = c(1,8), ylim =c(100,200), col = "red", lwd = 3)

# points(c(2:7), Mean_EBV_vector[2:7] - SD_EBV_vector[2:7], pch = 4, col = "red", lwd = 3)

# points(c(2:7), Mean_EBV_vector[2:7] + SD_EBV_vector[2:7], pch = 4, col = "red", lwd = 3)

# points(8, Mean_EBV_vector[8], pch = 4, col = "red", lwd = 3)

# points(8, Mean_EBV_vector[8] - SD_EBV_vector[8], col = "red", lwd = 3, pch = 4)

# points(8, Mean_EBV_vector[8] + SD_EBV_vector[8], col = "red", lwd = 3, pch = 4)

# arrows(c(2,3,4,5,6,7,8), Mean_EBV_vector[2:8] - SD_EBV_vector[2:8], c(2,3,4,5,6,7,8), Mean_EBV_vector[2:8] + SD_EBV_vector[2:8], code = 0, col = "red", lwd = 3)

#



# testround_sub <- abs(testround[seq(2,nrow(testround), 2), "V2"]) - abs(testround[seq(1,nrow(testround), 2), "V2"])

# testround_sub_noZero <- testround_sub[testround_sub$V2 != 0,]

#







# check percentage of EBV reads



library(data.table)

# Second, independent part of the script: clear the global environment first.
rm(list = ls())

setwd("/Volumes/My_Passport_for_Mac/10.0_MNase_ReSeq/Demultiplexed_data/1_Sams_Cut/1_Distribution_cut_off/New_run/0_Sam/0_Out")

# Names of all directory entries matching the regular expression "sam.txt".
txt_files <- list.files(pattern = "sam.txt")

# The per-sample histogram PDFs written to this directory by the first part
# are named "<sample file>with_SD.pdf" and therefore match the pattern too;
# they are figures, not tables, so they are dropped from the list.
txt_files <- txt_files[!grepl("\\.pdf$", txt_files)]



#i <- 1

# ---- Share of EBV rows per sample ----
# For every file in `txt_files`: read the table, keep rows with V2 > 0 and
# compute which percentage of them has a V1 that does not contain "chr".
#   vector_plot        numeric percentages (rounded to 2 digits), one per file
#   vector_percentage  one-column character matrix with one sentence per file

# NOTE(review): the file names were listed in a directory on another volume
# than the one they are read from here; confirm that both hold identically
# named tables.
setwd("/Volumes/Seagate Backup Plus Drive/PAULA/MNase-seq/1_Distribution_cut_off/New_run")

n_samples <- length(txt_files)
vector_plot <- numeric(n_samples)
percentage_sentences <- character(n_samples)

# seq_along() yields no iterations for an empty file listing (1:0 would not).
for (i in seq_along(txt_files)) {
  # skip = 90 drops the first 90 lines of the table before parsing.
  fragments <- fread(txt_files[i], skip = 90)
  fragments <- fragments[fragments$V2 > 0, ]

  # One TRUE/FALSE per remaining row: does V1 contain "chr"?
  is_cellular <- grepl("chr", fragments$V1)
  n_ebv <- sum(!is_cellular)
  n_total <- length(is_cellular)

  # 0 / 0 gives NaN for a table without any remaining rows.
  ebv_percent <- round((n_ebv / n_total) * 100, 2)

  vector_plot[i] <- ebv_percent
  percentage_sentences[i] <- paste("In sample", txt_files[i], ebv_percent,
                                   "% of all reads were mapped to the 2089 genome")
}

vector_percentage <- matrix(percentage_sentences, ncol = 1)



# ---- Write the percentage table and the percentage-over-days figure ----
# Both files are created in the directory set here.
setwd("/Volumes/Seagate Backup Plus Drive/PAULA/MNase-seq/1_Distribution_cut_off/New_run")

# One sentence per line, without quotes or row / column names.
write.table(vector_percentage, file = "Percentage_of_EBV2089_reads_on_total_reads.txt",
            quote = FALSE, col.names = FALSE, row.names = FALSE)

pdf(file = "Distribution_of_EBV_nonEnriched_percentage_over_days.pdf")

# Samples 2-7 are drawn as a line at x = 1..6. The x range 0-7 leaves room for
# the two single points added below; axis ticks beyond 7 fall outside it.
plot(seq_len(6), vector_plot[2:7],
     type = "l", col = "blue", lwd = 4, axes = FALSE,
     xlim = c(0, 7), ylim = c(0, 0.08),
     xlab = "Days post infection",
     ylab = "Percentage of EBV reads on total reads",
     main = "Distribution of EBV percentage on total reads over days")

axis(side = 1, at = 0:9, labels = c(0:5,8, "LCL32", "LCL55", "LCL110"))
axis(side = 2, at = seq(0, 0.08, 0.01))

# First sample as a green dot at x = 0.
points(0, vector_plot[1], pch = 16, col = "green")

# NOTE(review): sample 8 is drawn at x = 7, under the tick labelled "LCL32".
# Confirm that the eighth listed file really is the LCL32 sample.
points(7, vector_plot[8], pch = 16, col = "red")

dev.off()



