-
Notifications
You must be signed in to change notification settings - Fork 2
/
analysis
137 lines (113 loc) · 7.2 KB
/
analysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# The following pipeline is written by Dr. Ildem Akerman (IMSR, University of Birmingham) for the analysis of Lexogen 5' Quantseq RNA-seq data generated by the University of Birmingham Genomics Facility
# Quantseq Library kits information : https://www.lexogen.com/quantseq-3mrna-sequencing/
# Pipeline modified from PMID: 28041957 PMCID: PMC5300904 DOI: 10.1016/j.cmet.2016.11.016 Akerman Et al 2017
# please note that the true directory names have been removed and replaced with the words "folder or directory" - please insert your own directory names.
# Any special parameters used for specific pipelines are indicated in the article's materials and methods section.
# Please reach me from University of Birmingham e-mail address for questions or to report a fault; https://www.birmingham.ac.uk/staff/profiles/metabolism-systems/akerman-ildem.aspx
#############################################################################################
# TRIMMING adapters, low quality and polyA tails READS and fastQC to test if it makes a difference
#############################################################################################
# Quality controls were performed using fastQC and reads are trimmed using Trimmomatic and/or bbduk
#!/bin/bash
for file in *.fastq
do
java -jar ${TMATIC_ROOT}/trimmomatic-0.32.jar SE -trimlog Triming_LOG_"${file}" ${file} tt_"${file}" ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
done
echo "trimmomatic done"
###### ######################
#Alternatively bbduk script from BBMAP suite
#cd /bbmap/
#for i in *.fastq
#do
#cat $i | bbduk.sh in=${i} out=/trimmo_bbdukpA/"${i}" ref=sourcefolderhere/polyA.fa.gz k=13 ktrim=r useshortkmers=t mink=5 qtrim=r trimq=10 minlength=20
#done
########################
#############################################################################################
# Align to human or mouse genome (example mouse)
#############################################################################################
#!/bin/bash
# Remember to create an index directory for STAR (/genomeDirectory/STARmm10index)
STAR --runThreadN 6 --runMode genomeGenerate --genomeDir genomeDirectory/STARmm10index --genomeFastaFiles /folder/mm10.fa --sjdbGTFfile folder/gencode.vM20.annotation.gtf --sjdbOverhang 74
# Now align to the mouse (or human) genome
for file in *.fastq
do
STAR --runThreadN 8 --runMode alignReads --genomeDir /genomeDirectory/STARmm10index --readFilesIn ${file} --outFileNamePrefix mm10_"${file}" --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3 --outFilterMultimapNmax 20 --outReadsUnmapped Fastx_failed --outSAMtype BAM SortedByCoordinate
done
#############################################################################################
# HT-seq count
#############################################################################################
# Count the number of reads taht fall on each gene (gene annotation indicated below for mouse)
#!/bin/bash
for file in *.bam
do
htseq-count -m intersection-nonempty -s yes -f bam -r pos ${file} directory/gencode.vM20.annotation.gtf > "${file}"_HTseq
done
# Please note that -s is used for stranded libraries ONLY.
#############################################################################################
# DeSeq2 in R (not bash) Example analysis for Nasteska et al (David Hodson Lab) in R Studio / R
#############################################################################################
# Get the gene name conversion from source - for MOUSE (similar for human). This makes life easier as one can see gene symbols, rather than gencode names.
#source("https://bioconductor.org/biocLite.R")
#biocLite("EnsDb.Hsapiens.v92")
source("https://bioconductor.org/biocLite.R")
biocLite("ensembldb")
library('biomaRt')
#### convert genes
#get the gene names from gencode mouse file
x <- read.table("U:AkermanFolder/HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
x <- x[,1]
foo <- data.frame(do.call('rbind', strsplit(as.character(x),'.',fixed=TRUE)))
genes <- foo[,1]
############
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","mgi_symbol","entrezgene", "description"),values=genes,mart= mart)
###
x <- read.table("U:AkermanFolder/DH_ttbb_HTseq_COUNTS_GencodevM20_mm10.txt", header = FALSE)
x <- x[,1]
merged <- cbind(x,foo)
colnames(merged) <- c("ENSG.version","ensembl_gene_id","version")
x <- merge(merged,G_list, by = "ensembl_gene_id")
setwd("U:AkermanFolder")
write.table(x, file = "GencodeMousev20_conversion_table.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)
############################## deseq2 ###############################
library(DESeq2)
sampleTable <- read.table("U:AkermanFolder/sample_table.txt", header = TRUE)
sampleTable <- data.frame(sampleTable)
directory<-'U:AkermanFolder/HT-seq'
### create the deseq object
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design= ~ individual + condition)
colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c('M3C','WT')) # CTL vs treatment first
#run deseq2
############## cut off low expression (at least 3 samples 10 or above)
dds<-DESeq(ddsHTSeq)
dds <- estimateSizeFactors(dds)
idx <- rowSums( counts(dds, normalized=TRUE) >= 10 ) >= 3
#This would say, e.g. filter out genes where there are less than 3 samples with normalized counts greater than or equal to 5.
dds <- dds[idx,]
dds <- DESeq(dds)
res <- results(dds, contrast=c("condition","WT","M3C"))
#write.table(as.data.frame(res),file="HodsonMelton3_WTvM3C_paired_DEseq2.txt")
## get baseMean for each condition
BM <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) )
BM <- as.data.frame(BM)
BM$ENSG.version <- row.names(BM)
############ with gene name
res <- as.data.frame(res)
rn <- row.names(res)
res$ENSG.version <- rn
DH <- merge(res, x, by = "ENSG.version", all.x = TRUE)
DH <- merge(DH, BM, by = "ENSG.version", all.x = TRUE)
DH <- DH[,c(1,14,13,2:12)]
write.table(DH, file = "DH4b_WTvM3_1-3-2019_DESeq2cutoff10.txt", sep = "\t", col.names = T, row.names = F, qmethod = "double", quote=FALSE)
# Please find an example sample table below (text file for DeSeq2)
sampleName fileName condition individual
A1_M3C-3 mm10_4c_50pttpe_A1-Exp-3-L16-M2-M3C_S3.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp3
B2_WT-1 mm10_4c_50pttpe_B2-Exp1-L14-WT-3_S2.fastqAligned.sortedByCoord.out.bam_HTseq WT exp1
B2_WT-6 mm10_4c_50pttpe_B2-Exp-6-L24-M3-WT_S9.fastqAligned.sortedByCoord.out.bam_HTseq WT exp6
C1_WT-3 mm10_4c_50pttpe_C1-Exp-3-L16-M5-WT_S4.fastqAligned.sortedByCoord.out.bam_HTseq WT exp3
C1_M3C-5 mm10_4c_50pttpe_C1-Exp-5-L22-M1-M3C_S7.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp5
C2_M3C-6 mm10_4c_50pttpe_C2-Exp-6-L24-M4-M3C_S1.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp6
D1_WT-5 mm10_4c_50pttpe_D1-Exp-5-L22-M2-WT_S8.fastqAligned.sortedByCoord.out.bam_HTseq WT exp5
E1_WT-3 mm10_4c_50pttpe_E1-Exp-3-L16-M7-WT_S5.fastqAligned.sortedByCoord.out.bam_HTseq WT exp3
F1_M3C-1 mm10_4c_50pttpe_F1-Exp-1-L14-M3C-1_S1.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp1
F1_M3C-3 mm10_4c_50pttpe_F1-Exp-3-L16-M8-M3C_S6.fastqAligned.sortedByCoord.out.bam_HTseq M3C exp3