forked from tallulandrews/scRNASeqPipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
00_LIST_OF_BSUB_COMMANDS.sh
135 lines (111 loc) · 9.72 KB
/
00_LIST_OF_BSUB_COMMANDS.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# Catalogue of the bsub submissions and clean-up commands used by the
# RNA-seq pipeline, grouped by pipeline stage.
# ORGANISM   - organism code handed to the pipeline scripts.
# RUNNINGDIR - directory this script was started from.
ORGANISM="Mmus"
RUNNINGDIR=$(pwd)
#### Create GTF & FA for construct
# Both tools are invoked without input files here; gffread is only asked for its help text.
/nfs/users/nfs_t/ta6/RNASeqPipeline/0_GBK2FASTA.pl
/nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/gffread --help
#-----------------------------------------------------------------
# Initial QC
# Not generalized: lanes, read ends and directories are hard-coded.
# Open question: when, and on which files, should this be run?
QCDIR=/lustre/scratch108/compgen/team218/TA/QualityControl
mkdir -p "$QCDIR"
# Earlier single-file variant, kept for reference:
#cd $QCDIR
#bsub -R"select[mem>150] rusage[mem=150]" -M150 -o fastqcout.%J -e fastqcout.%J -q normal /nfs/users/nfs_t/ta6/RNASeqPipeline/0_FASTQC.sh C6H8GANXX_C1_IB_8TFs_exp1_15s009116-2-1_Bergiers_lane615s009116E10_1_sequence.txt.gz
#cd $RUNNINGDIR
# These don't like running in parallel for some reason...
# One 0_FASTQC_Streaming.sh job per lane (5, 6) and read end (1, 2), submitted
# in the order lane5/end1, lane5/end2, lane6/end1, lane6/end2.
# Arguments: directory of files, file-name pattern, output tag, QC output directory.
for lane in 5 6; do
  for read_end in 1 2; do
    bsub -R"select[mem>10000] rusage[mem=10000]" -M10000 -q normal \
      -o output.%J -e error.%J \
      ../0_FASTQC_Streaming.sh \
      /lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap \
      *lane${lane}*${read_end}.1.fq.gz \
      "Bergiers_lane${lane}_end${read_end}" \
      "$QCDIR/"
  done
done
#-----------------------------------------------------------------
#Set up genome for mapping
# WORKINGDIR - scratch directory the genome build works in (archived below).
# GENOMEDIR  - destination directory handed to 0_BuildGenome.sh.
WORKINGDIR=/lustre/scratch108/compgen/team218/TA/genomebuilding
GENOMEDIR=/lustre/scratch108/compgen/team218/TA/STRIPED_GENOMES
# The archive name carries the organism code. The variable set at the top of
# this script is ORGANISM; "$ORG" is never set, so it would expand to nothing
# and silently drop the organism from the file name.
LOG="GenomeBuildingLog${ORGANISM}.tar.gz"
bsub -R"select[mem>35000] rusage[mem=35000]" -M35000 -q normal \
  -o "$WORKINGDIR/FarmJobOutput.%J" -e "$WORKINGDIR/FarmJobErrors.%J" \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/0_BuildGenome.sh "$WORKINGDIR" "$GENOMEDIR" 1 125 "$ORGANISM" /nfs/users/nfs_t/ta6/Collaborations/Bergiers_Italy
# NOTE(review): bsub normally returns as soon as the job is queued, so this tar
# likely runs before the build has finished if the file is executed top to
# bottom -- confirm, and run it only once the job is done.
tar -zcf "$LOG" "$WORKINGDIR" # Or should I just delete it?
#-----------------------------------------------------------------
#Breakdown files for mapping
#Note these are not particularly well designed (e.g. everything is currently hard coded rather than passed as an argument);
# in particular the latter one tends to crash if my computer goes to sleep or something,
# and the former one should really submit the jobs as a job array rather than in packs of 40 followed by a 5 minute wait.
#perl 1_breakdown_all_files_parallel_not_general.pl #Calls 1_Breakdown_PairedEnds.pl
#perl 1_Compress_Broken_Down_FASTQ_and_Cell_Specific_Files_forQC.pl
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
mkdir -p "$OUTPUTDIR"
INPUTDIR=/nfs/team218/MH/2015-03-02-C6H8GANXX/2015-03-02-C6H8GANXX/
PATTERN="*exp2*_sequence.txt.gz"
# Expand the pattern locally only to count the inputs: one array element per
# matching file. $PATTERN must stay unquoted here so that it globs. If nothing
# matches, bash keeps the literal pattern and the count is 1.
INPUTFILES=("$INPUTDIR"/$PATTERN)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$((NUMFILES))
# Wrapper arguments, in order: 100000000, input directory, file-name pattern,
# output directory. The pattern is passed quoted (unexpanded), next to its
# directory. It must be $PATTERN: $PATTERN2 is never defined, so the wrapper
# would receive an empty pattern.
BREAKDOWN_ARGS=(100000000 "$INPUTDIR" "$PATTERN" "$OUTPUTDIR")
# One array element per input file, at most 40 running at once (%40).
bsub -J"mappingwithstararrayjob[1-$MAXJOBS]%40" -R"select[mem>4000] rusage[mem=4000]" -M4000 -q normal -o output.%J.%I \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/1_BreakDown_Files_wrapper.sh "${BREAKDOWN_ARGS[@]}"
#-----------------------------------------------------------------
#Map all FastQ files using a job array
# Note: job-array indices must start at 1, whereas bash array indices start at 0.
# 2_MapReadsFile.sh arguments: NThreads, directory of files to map, directory for output, STAR parameter file, output file prefix
# Still to verify for this step: (1) memory allocation is sufficient, (2) the indices include all the files in the directory.
# This set found in 2_DO_MapReadsFile.sh
TAG="Bergiers_Exp1"
INPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesToMap/
OUTPUTDIR=/lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/
mkdir -p "$OUTPUTDIR"
# Count the entries of the input directory; the array size is half that count
# (presumably two files per job, i.e. paired ends -- TODO confirm).
INPUTFILES=("$INPUTDIR"/*)
NUMFILES=${#INPUTFILES[@]}
MAXJOBS=$(( NUMFILES / 2 ))
bsub -J"mappingwithstararrayjob[1-${MAXJOBS}]%40" \
  -R"select[mem>30000] rusage[mem=30000]" -M30000 \
  -q normal -o output.%J.%I \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/2_MapReadsFile.sh 1 "$INPUTDIR" "$OUTPUTDIR" \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/2_STAR_Parameters.txt "$TAG"
# %40 restricts the array to 40 elements running at the same time (e.g. out of a 100-element job array).
# All output files share the same job id (%J) but have different indexes (%I).
# STAR output files:
# Log.out = parameter definitions & genome loading; should be identical for all jobs except for the input files
# Log.progress.out = speed & proportion mapped/unmapped at various points while running (not particularly useful once the job has finished)
# Log.final.out = summary statistics after mapping has been attempted for all reads
# SJ.out.tab = table of identified splice junctions
#-----------------------------------------------------------------
#Summarize mapping output: %mapped, %uniquely mapped, etc...
# This set found in 3_CLEANUP_MapReadFiles.sh
# Uses OUTPUTDIR (directory of mapped output) and TAG (output prefix) as set
# for the mapping step earlier in this script.
# The statistics script must be given $OUTPUTDIR; the misspelt $OUPUTDIR is
# never set, so it would be called without its directory argument.
perl /nfs/users/nfs_t/ta6/RNASeqPipeline/3_Compile_Mapping_Statistics.pl "$OUTPUTDIR" > "/nfs/users/nfs_t/ta6/RNASeqPipeline/$TAG.mapped_summary.out"
# Progress logs are deleted without being archived.
rm "$OUTPUTDIR"/*.Log.progress.out
# Archive each family of STAR output files, then delete the originals. Each rm
# is chained to its tar with && so files are removed only when their archive
# was written successfully.
tar -cvzf "$OUTPUTDIR/ParameterLogfiles.tar.gz" "$OUTPUTDIR"/*Log.out \
  && rm "$OUTPUTDIR"/*Log.out
tar -cvzf "$OUTPUTDIR/SpliceJunctionfiles.tar.gz" "$OUTPUTDIR"/*SJ.out.tab \
  && rm "$OUTPUTDIR"/*SJ.out.tab
tar -cvzf "$OUTPUTDIR/FinalLogfiles.tar.gz" "$OUTPUTDIR"/*Log.final.out \
  && rm "$OUTPUTDIR"/*Log.final.out
#-----------------------------------------------------------------
#Combine & sort using samtools -> on the largest bam this used 2000Mb & took 105 sec (2 Apr 2014)
# Not necessary now that the STAR parameters have been changed to do the sorting.
# NOTE(review): the command below is still active despite the note above -- confirm whether it should be commented out.
perl 3_SortBAMs.pl # Everything is hard coded at the moment (not flexible). Runs each job on the cluster.
# Example of a single manual sort submission, kept for reference:
#bsub -R"select[mem>3000] rusage[mem=3000]" -M3000 -q normal -o testsamtoolssort.%J samtools sort -m 3000000000 Bergiers_lane515s009116E9_exp1_1:N:0:AAGAGGCAAAGGAGTA_1.1Aligned.out.bam Bergiers_lane515s009116E9_exp1_1:N:0:AAGAGGCAAAGGAGTA_1.1Aligned.out.sorted
#-----------------------------------------------------------------
# Merge, de-duplicate and reorganize (this set found in 3_merge_dedup_MappedReads.sh)
# Uses OUTPUTDIR and TAG as set for the mapping step earlier in this script.
# Step 1: submit the merge & de-dup job.
bsub -q normal -o MergeBAMs_job.output \
  perl 4_MergeBAMS.pl $OUTPUTDIR $TAG
# Step 2: move the results into one sub-directory per kind:
#   *dedup*      -> Deduplicated
#   *sorted.bam  -> WithDuplicates
MappedDedupDIR="${OUTPUTDIR}/Deduplicated"
MappedWdupDIR="${OUTPUTDIR}/WithDuplicates"
mkdir -p "$MappedDedupDIR" "$MappedWdupDIR"
mv "${OUTPUTDIR}"/*dedup* "$MappedDedupDIR"
mv "${OUTPUTDIR}"/*sorted.bam "$MappedWdupDIR"
# Step 3: archive the remaining exp1 mapping output into the current
# directory, then remove the per-lane files.
tar -cvzf Bergiers_exp1_mapping_output.tar.gz "${OUTPUTDIR}"/Bergiers*exp1*
rm "${OUTPUTDIR}"/Bergiers_lane*
# ----------------------------------------------------------------
#QC with RSeQC
# Sample command to test that RSeQC is working: one script on a single BAM.
python /nfs/users/nfs_t/ta6/RNASeqPipeline/software/RSeQC-2.6.1/scripts/clipping_profile.py \
  -i Bergiers_exp1_TAGGCATGCTCTCTAT.sorted.dedupped.bam \
  -o test
# rRNA/mt-mRNA content
# As further tests are added, make this parallelized.
bsub -q normal -o RSeQC.output perl 4_RSeQC.pl
# Single-file trial of 4_RSeQC_Multiple.sh, writing under the TEST directory.
bsub -R"select[mem>1000] rusage[mem=1000]" -M1000 -q normal \
  -o /lustre/scratch108/compgen/team218/TA/TEST/RSEQC_test.output \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/4_RSeQC_Multiple.sh Mmus \
  /lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated/Bergiers_exp1_TCCTGAGCTCTCTCCG.sorted.dedupped.bam \
  /lustre/scratch108/compgen/team218/TA/TEST/RSEQC_test 1
# Now used 4_DO_RSeQC_Multiple.sh
#-----------------------------------------------------------------
# de novo transcript assembly using all data:
# Step 1: merge every BAM in the Deduplicated directory into All.dedupped.bam
# (took ~2h to complete).
bsub -q normal -o mergingallbams.out \
  samtools merge \
  /lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated/All.dedupped.bam \
  /lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated/*.bam
# Step 2: quantify transcripts using cufflinks & feature count
# Cufflinks on everything on 10 processors used just over 10GB, and on the
# normal queue (12h limit) got only about halfway done; this submission uses
# the long queue.
bsub -R"select[mem>20000] rusage[mem=20000] span[ptile=10]" -M20000 -n 10 \
  -q long -o cufflinkseverything.output -e cufflinkseverything.error \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cufflinks \
  --GTF-guide /lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf \
  --frag-bias-correct /lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --mask-file /lustre/scratch108/compgen/team218/TA/TemporaryFileDir/Mmus-rRNAtRNAmtmRNAs-mask.gtf \
  --multi-read-correct \
  --max-intron-length 1000000 \
  --min-intron-length 20 \
  -o /lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified \
  --quiet \
  --no-update-check \
  --seed 100 \
  --num-threads 10 \
  --no-faux-reads \
  /lustre/scratch108/compgen/team218/TA/RNASeqFilesMapped/Deduplicated/All.dedupped.bam
#-----------------------------------------------------------------
# Expression Level Quantification
# Templates: $Somecombinedannotationfile, $somerandomseed and $Cellbasedbamfile
# are not assigned in the visible part of this script and must be set first.
# Cufflinks on one BAM, single-threaded:
bsub -R"select[mem>1500] rusage[mem=1500]" -M1500 -q normal \
  -o Cuffoutput.%J -e Cufferror.%J \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/software/cufflinks-2.2.1.Linux_x86_64/cufflinks \
  --GTF-guide $Somecombinedannotationfile \
  --frag-bias-correct /lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.dna.primary_assembly.fa \
  --mask-file /lustre/scratch108/compgen/team218/TA/TemporaryFileDir/Mmus-rRNAtRNAmtmRNAs-mask.gtf \
  --multi-read-correct \
  --max-intron-length 1000000 \
  --min-intron-length 20 \
  -o /lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified \
  --quiet \
  --no-update-check \
  --seed $somerandomseed \
  --num-threads 1 \
  $Cellbasedbamfile
# featureCounts on the same annotation and BAM:
bsub 5_featureCounts_warpper.sh $Somecombinedannotationfile 1 $Cellbasedbamfile
# RSEM
bsub -R"select[mem>10000] rusage[mem=10000]" -M10000 -q normal \
  -o RSEMbuildref.%J.out -e RSEMbuildref.%J.err \
  /nfs/users/nfs_t/ta6/RNASeqPipeline/5_RSEM_build_refrence.sh