diff --git a/Snakefile b/Snakefile
index 757d5d9..c3be5c7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -17,7 +17,8 @@ include: "workflow/rules/06_SingleGenomeBins.smk"
 
 rule all:
     input:
-        get_rules
+        get_rules,
+        expand("results/prokka_out/{sample}/{sample}.tsv", sample=samples["sample"])
         #"results/vcontact2_data/vcontact2_output/genome_by_genome_overview.csv"
 
 # Make report for snakemake.
diff --git a/config/sample_sheet.tsv b/config/sample_sheet.tsv
index 9e0aa71..b8fb9a4 100644
--- a/config/sample_sheet.tsv
+++ b/config/sample_sheet.tsv
@@ -1,4 +1,3 @@
 sample	dataset	forward_read	reverse_read
 B22_RePMA	MGX_DATA	/projects/p31648/bmo_shotgun_fastq/ds.82d3fd5e1df54b8a825201cee0cbe2ec/B22_RePMA_S58_L001_R1_001.fastq.gz	/projects/p31648/bmo_shotgun_fastq/ds.82d3fd5e1df54b8a825201cee0cbe2ec/B22_RePMA_S58_L001_R2_001.fastq.gz
 B16_LyPMA	MGX_DATA	/projects/p31648/bmo_shotgun_fastq/ds.26de31cab7fa4bed90e4ef42a5acc773/B16_LyPMA_S8_L001_R1_001.fastq.gz	/projects/p31648/bmo_shotgun_fastq/ds.26de31cab7fa4bed90e4ef42a5acc773/B16_LyPMA_S8_L001_R2_001.fastq.gz
-20221117_Zymo_10ng	MGX_DATA	/projects/p31648/bmo_shotgun_fastq/ds.fdfd243b09b143689e333b02d6a47c78/20221117_Zymo_10ng_S81_L001_R1_001.fastq.gz	/projects/p31648/bmo_shotgun_fastq/ds.fdfd243b09b143689e333b02d6a47c78/20221117_Zymo_10ng_S81_L001_R2_001.fastq.gz
diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml
new file mode 100644
index 0000000..42eb201
--- /dev/null
+++ b/workflow/envs/prokka.yml
@@ -0,0 +1,9 @@
+# Conda environment referenced by the `conda:` directive of rule annotate_prokka.
+# Prokka is pinned so that every run resolves the same annotation tool version.
+name: prokka
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - prokka=1.14.6
diff --git a/workflow/rules/00_TrimReads.smk b/workflow/rules/00_TrimReads.smk
index ffbafd5..013a551 100644
--- a/workflow/rules/00_TrimReads.smk
+++ b/workflow/rules/00_TrimReads.smk
@@ -320,6 +320,6 @@ rule fastqc_multiqc:
     shell:
         """
         module load multiqc
-        multiqc --outdir {params.out_dir} --dirs --dirs-depth 2 results/fastqc_out/
+        multiqc --outdir {params.out_dir} --dirs --dirs-depth 2 results/fastqc_out/ -f
         """
 
diff --git a/workflow/rules/05_AssemblyAnalysis.smk b/workflow/rules/05_AssemblyAnalysis.smk
index cb1258e..4874962 100644
--- a/workflow/rules/05_AssemblyAnalysis.smk
+++ b/workflow/rules/05_AssemblyAnalysis.smk
@@ -48,7 +48,7 @@ rule spades:
 # -k 21,33,55,77,99,127 --only-assembler
 
 ############################
-### PART 2: FILTER ###
+# PART 2A: QC FILTER #
 ############################
 
 rule drop_short_contigs_megahit:
@@ -81,7 +81,7 @@ rule drop_short_contigs_spades:
         """
 
 ############################
-### PART 3: QUAST ###
+# PART 2B: QC QUAST #
 ############################
 
 rule quast_megahit:
@@ -125,5 +125,59 @@ rule multiqc_quast:
     shell:
         """
         module load multiqc
-        multiqc --outdir {params.out_dir} --dirs --dirs-depth 2 results/quast_out/
+        multiqc --outdir {params.out_dir} --dirs --dirs-depth 2 results/quast_out/ -f
         """
+
+############################
+# PART 3A: ANNOTATION #
+############################
+
+rule prep_annotation:
+    """
+    Strip the "_length_<N>_cov_<X>" suffix from every SPAdes scaffold header,
+    leaving ">NODE_<id>" only; sequence lines are copied through unchanged.
+    """
+    input:
+        scaffolds="results/spades_out/{sample}/scaffolds.fasta"
+    output:
+        scaffolds_clean=temp("results/prokka_out/tmp_scaffolds/{sample}_scaffolds_clean.fasta")
+    threads: 1
+    resources:
+        mem="3g",
+        time="00:05:00"
+    shell:
+        """
+        awk '/^>/ {{ sub(/_length_[0-9]+_cov_[0-9.]+/, "", $0) }} 1' {input.scaffolds} > {output.scaffolds_clean}
+        """
+
+rule annotate_prokka:
+    """
+    Annotate the cleaned SPAdes scaffolds of one sample with Prokka in
+    --metagenome mode, writing all results to results/prokka_out/<sample>/.
+    --force lets Prokka write into an output directory that already exists.
+    """
+    input:
+        scaffolds_clean="results/prokka_out/tmp_scaffolds/{sample}_scaffolds_clean.fasta"
+    output:
+        annotation="results/prokka_out/{sample}/{sample}.tsv",
+        out_dir=directory("results/prokka_out/{sample}/")
+    threads: 20
+    resources:
+        mem="25g",
+        time="01:00:00"
+    # NOTE(review): this rule both declares a conda env and runs
+    # `module load prokka`; confirm which prokka is meant to take precedence.
+    conda:
+        "../envs/prokka.yml"
+    shell:
+        """
+        module load prokka
+        prokka {input.scaffolds_clean} --cpus {threads} \
+            --metagenome \
+            --locustag {wildcards.sample} \
+            --prefix {wildcards.sample} \
+            --outdir {output.out_dir}/ \
+            --addgenes \
+            --mincontiglen 200 \
+            --force
+        """