analysis

# The following pipeline is written by Dr. Ildem Akerman (IMSR, University of Birmingham) for the analysis of Lexogen 5' Quantseq RNA-seq data generated by the University of Birmingham Genomics Facility 
# Quantseq Library kits information : https://www.lexogen.com/quantseq-3mrna-sequencing/ 
# Pipeline modified from PMID: 28041957  PMCID: PMC5300904  DOI: 10.1016/j.cmet.2016.11.016 Akerman Et al 2017 

# please note that the true directory names have been removed and replaced with the words "folder or directory" - please insert your own directory names. 
# Any special parameters used for specific pipelines are indicated in the article's materials and methods section. 
# Please reach me from University of Birmingham e-mail address for questions or to report a fault; https://www.birmingham.ac.uk/staff/profiles/metabolism-systems/akerman-ildem.aspx
 
#############################################################################################
# TRIMMING adapters, low quality and polyA tails READS and fastQC to test if it makes a difference 
#############################################################################################
# Quality controls were performed using fastQC and reads are trimmed using Trimmomatic and/or bbduk 
#!/bin/bash
for file in *.fastq
do
java -jar ${TMATIC_ROOT}/trimmomatic-0.32.jar SE -trimlog Triming_LOG_"${file}" ${file} tt_"${file}" ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
done
echo "trimmomatic done"
###### ######################
#Alternatively bbduk script from BBMAP suite 
#cd /bbmap/
#for i in *.fastq
#do 
#cat $i | bbduk.sh in=${i} out=/trimmo_bbdukpA/"${i}" ref=sourcefolderhere/polyA.fa.gz k=13 ktrim=r useshortkmers=t mink=5 qtrim=r trimq=10 minlength=20
#done
########################


#############################################################################################
# Align to human or mouse genome (example mouse) 
#############################################################################################
#!/bin/bash
# Remember to create an index directory for STAR (/genomeDirectory/STARmm10index)
STAR --runThreadN 6 --runMode genomeGenerate --genomeDir genomeDirectory/STARmm10index --genomeFastaFiles /folder/mm10.fa --sjdbGTFfile folder/gencode.vM20.annotation.gtf --sjdbOverhang 74

# Now align to the mouse (or human) genome
for file in *.fastq
do
STAR --runThreadN 8 --runMode alignReads --genomeDir /genomeDirectory/STARmm10index --readFilesIn ${file} --outFileNamePrefix mm10_"${file}" --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3 --outFilterMultimapNmax 20 --outReadsUnmapped Fastx_failed --outSAMtype BAM SortedByCoordinate  
done

#############################################################################################
# HT-seq count 
#############################################################################################
# Count the number of reads taht fall on each gene (gene annotation indicated below for mouse)
#!/bin/bash
for file in *.bam
do
htseq-count -m intersection-nonempty -s yes -f bam -r pos ${file} directory/gencode.vM20.annotation.gtf > "${file}"_HTseq
done

# Please note that -s is used for stranded libraries ONLY. 


#############################################################################################
# DeSeq2 in R (not bash)  Example analysis for Nasteska et al (David Hodson Lab) in R Studio / R 
#############################################################################################

# Get the gene name conversion from source - for MOUSE (similar for human). This makes life easier as one can see gene symbols, rather than gencode names. 
#source("https://bioconductor.org/biocLite.R")
#biocLite("EnsDb.Hsapiens.v92")
source("https://bioconductor.org/biocLite.R")
biocLite("ensembldb")
library('biomaRt')
#### convert genes 
#get the gene names from gencode mouse file
x <- read.table("U:AkermanFolder/HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
x <- x[,1]
foo <- data.frame(do.call('rbind', strsplit(as.character(x),'.',fixed=TRUE)))
genes <- foo[,1]
############
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","mgi_symbol","entrezgene", "description"),values=genes,mart= mart)
###
x <- read.table("U:AkermanFolder/DH_ttbb_HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
x <- x[,1]
merged <- cbind(x,foo)
colnames(merged) <- c("ENSG.version","ensembl_gene_id","version") 
x <- merge(merged,G_list, by = "ensembl_gene_id")
setwd("U:AkermanFolder")
write.table(x, file = "GencodeMousev20_conversion_table.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)

############################## deseq2 ###############################
library(DESeq2)

sampleTable <- read.table("U:AkermanFolder/sample_table.txt", header = TRUE)
sampleTable <- data.frame(sampleTable)
directory<-'U:AkermanFolder/HT-seq'

### create the deseq object  
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design= ~ individual + condition)
colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c('M3C','WT')) # CTL vs treatment first 


#run deseq2
############## cut off low expression (at least 3 samples 10 or above) 
dds<-DESeq(ddsHTSeq)
dds <- estimateSizeFactors(dds)
idx <- rowSums( counts(dds, normalized=TRUE) >= 10 ) >= 3
#This would say, e.g. filter out genes where there are less than 3 samples with normalized counts greater than or equal to 5.
dds <- dds[idx,]
dds <- DESeq(dds)
res  <- results(dds, contrast=c("condition","WT","M3C"))

#write.table(as.data.frame(res),file="HodsonMelton3_WTvM3C_paired_DEseq2.txt")
## get baseMean for each condition 
BM <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) )
BM <- as.data.frame(BM)
BM$ENSG.version <- row.names(BM)
############ with gene name 

res <- as.data.frame(res)
rn <- row.names(res)
res$ENSG.version <- rn
DH <- merge(res, x, by = "ENSG.version", all.x = TRUE)
DH <- merge(DH, BM, by = "ENSG.version", all.x = TRUE)
DH <- DH[,c(1,14,13,2:12)]
write.table(DH, file = "DH4b_WTvM3_1-3-2019_DESeq2cutoff10.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)


# Please find an example sample table below (text file for DeSeq2) 
sampleName	fileName	condition	individual
A1_M3C-3	mm10_4c_50pttpe_A1-Exp-3-L16-M2-M3C_S3.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp3
B2_WT-1	mm10_4c_50pttpe_B2-Exp1-L14-WT-3_S2.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp1
B2_WT-6	mm10_4c_50pttpe_B2-Exp-6-L24-M3-WT_S9.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp6
C1_WT-3	mm10_4c_50pttpe_C1-Exp-3-L16-M5-WT_S4.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp3
C1_M3C-5	mm10_4c_50pttpe_C1-Exp-5-L22-M1-M3C_S7.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp5
C2_M3C-6	mm10_4c_50pttpe_C2-Exp-6-L24-M4-M3C_S1.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp6
D1_WT-5	mm10_4c_50pttpe_D1-Exp-5-L22-M2-WT_S8.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp5
E1_WT-3	mm10_4c_50pttpe_E1-Exp-3-L16-M7-WT_S5.fastqAligned.sortedByCoord.out.bam_HTseq	WT	exp3
F1_M3C-1	mm10_4c_50pttpe_F1-Exp-1-L14-M3C-1_S1.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp1
F1_M3C-3	mm10_4c_50pttpe_F1-Exp-3-L16-M8-M3C_S6.fastqAligned.sortedByCoord.out.bam_HTseq	M3C	exp3