#!/usr/bin/env bash
#$ -S /bin/bash
#$ -q bioinfo.q
#$ -V
#$ -cwd
#$ -N classify_centrifuge
#$ -pe shared 72
#$ -M [email protected]
set -e
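# Activate the conda environment for this pipeline (centrifuge, Krona and GNU parallel are expected to come from here)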
source /gpfs0/bioinfo/apps/Miniconda2/Miniconda_v4.3.21/bin/activate /gpfs0/bioinfo/apps/Miniconda2/Miniconda_v4.3.21/envs/Metagenomics
export PERL5LIB='/gpfs0/bioinfo/apps/Miniconda2/Miniconda_v4.3.21/envs/Metagenomics/lib/site_perl/5.26.2/x86_64-linux-thread-multi'
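# Krona's import scripts are Perl-based; this presumably points them at the environment's Perl modules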
PROJECT_DIR='/gpfs0/bioinfo/projects/Amit_Gross/03.Metagenomics_stormwater_biofiltration/'
CENTRIFUGE_DB='/gpfs0/bioinfo/databases/Centrifuge/db_arch_bact_vir_2018/'
CENTRIFUGE_CLASSIFY="${PROJECT_DIR}/04.read_based_analysis/09.centrifuge-classify/"
METADATA="${PROJECT_DIR}/metadata.tsv"
FASTQ_SUFFIX='_host_removed.fastq.gz'
# This assumes that the first column contains your sample names
SAMPLES=($(awk 'NR>1{print $1}' "${METADATA}"))
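# Hypothetical metadata.tsv layout (tab-separated, header row skipped, column 1 = sample name):
#   sample_id   treatment   date
#   SW01        biofilter   2019-05-01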
RAW_DATA_DIR="${PROJECT_DIR}/03.remove_host/fastq_files/"
KRONA_TAXONOMY='/gpfs0/bioinfo/apps/Miniconda2/Miniconda_v4.3.21/envs/Metagenomics/databases/krona/taxonomy'
function centrifuge_classify(){
    local SAMPLE=$1
    local OUT_DIR=$2
    local FORWARD=$3
    local REVERSE=$4
    local DATABASE=$5
    local THREADS=$6
    # Classify single-end or paired-end reads using centrifuge
    if [ "${REVERSE}" == 'NULL' ]; then
        # Single-end
        centrifuge \
            -x ${DATABASE} \
            --seed 4 \
            -q \
            --threads ${THREADS} \
            -S ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out \
            -U ${FORWARD}
    else
        # Paired-end
        centrifuge \
            -x ${DATABASE} \
            --seed 4 \
            -q \
            --threads ${THREADS} \
            -S ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out \
            -1 ${FORWARD} \
            -2 ${REVERSE}
    fi
    # Generate a Kraken-style report from the per-read classifications
    centrifuge-kreport \
        -x ${DATABASE} \
        ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out \
        > ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out.report
    # Create text file for Krona (column 1 = read ID, column 3 = taxonomy ID)
    cut -f 1,3 ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out \
        > ${OUT_DIR}/${SAMPLE}/${SAMPLE}.centrifuge.out.4krona
}
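# Export the function so the GNU parallel sub-shells below can call it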
export -f centrifuge_classify
# Create directory for each sample
parallel --jobs 0 "[ -d ${CENTRIFUGE_CLASSIFY}/{} ] || mkdir ${CENTRIFUGE_CLASSIFY}/{}" ::: ${SAMPLES[*]}
# Run centrifuge
# For single-end reads
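# 5 concurrent jobs x 10 centrifuge threads each = 50 threads, which fits within the 72 slots requested via -pe shared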
parallel --jobs 5 "centrifuge_classify {} ${CENTRIFUGE_CLASSIFY} ${RAW_DATA_DIR}/{}${FASTQ_SUFFIX} NULL ${CENTRIFUGE_DB}/abv 10" ::: ${SAMPLES[*]}
# For paired-end reads (uncomment and define FORWARD_SUFFIX and REVERSE_SUFFIX above first)
#parallel --jobs 5 "centrifuge_classify {} ${CENTRIFUGE_CLASSIFY} ${RAW_DATA_DIR}/{}${FORWARD_SUFFIX} ${RAW_DATA_DIR}/{}${REVERSE_SUFFIX} ${CENTRIFUGE_DB}/abv 10" ::: ${SAMPLES[*]}
# Collect the per-sample Krona input files and derive the matching sample names
find ${CENTRIFUGE_CLASSIFY}/ -type f -name "*.4krona" | sort > ${CENTRIFUGE_CLASSIFY}/krona_files.txt
FILES=($(find ${CENTRIFUGE_CLASSIFY}/ -type f -name "*.4krona"))
basename -a -s '.4krona' ${FILES[*]} | sort | sed -E 's/\.centrifuge\.out//g' > ${CENTRIFUGE_CLASSIFY}/sample_names.txt
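# Pair each Krona input file with its sample name as "file,name", the per-file labelling syntax ktImportTaxonomy accepts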
KTAXONOMY_FILES=($(paste -d',' "${CENTRIFUGE_CLASSIFY}/krona_files.txt" "${CENTRIFUGE_CLASSIFY}/sample_names.txt"))
# Run ktImportTaxonomy to create a combined Krona chart for all samples
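# -q 1: read (query) IDs are in column 1 of each .4krona file; -t 2: taxonomy IDs are in column 2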
ktImportTaxonomy \
-tax ${KRONA_TAXONOMY} \
-u http://krona.sourceforge.net \
-o ${CENTRIFUGE_CLASSIFY}/centrifuge_krona_report.html \
-q 1 \
-t 2 \
${KTAXONOMY_FILES[*]}
# Make a directory for Export
[ -d ${CENTRIFUGE_CLASSIFY}/Export ] || mkdir ${CENTRIFUGE_CLASSIFY}/Export
find ${CENTRIFUGE_CLASSIFY}/ -type f -name "*.report" -exec cp {} ${CENTRIFUGE_CLASSIFY}/Export \;