Commit
mpa4
jtsumner and others committed Jul 13, 2023
1 parent 520f6b4 commit 37228a0
Showing 17 changed files with 671 additions and 419 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -154,7 +154,15 @@ cd resources/bowtie_human
wget https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip
unzip GRCh38_noalt_as.zip
```
Or use the T2T reference:
```
mkdir resources/bowtie_human
cd resources/bowtie_human
wget https://genome-idx.s3.amazonaws.com/bt/chm13.draft_v1.0_plusY.zip
unzip chm13.draft_v1.0_plusY.zip
```

See https://benlangmead.github.io/aws-indexes/bowtie for more options, including indexes for other organisms.
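For example, a mouse index can be fetched the same way (the GRCm39 URL below is a best guess from that listing — verify it before downloading); this matches the `resources/bowtie_mouse/GRCm39/GRCm39` path referenced in `00_TrimReads.smk`:
```
mkdir resources/bowtie_mouse
cd resources/bowtie_mouse
wget https://genome-idx.s3.amazonaws.com/bt/GRCm39.zip
unzip GRCm39.zip
```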
## Execution

1. (Optional) Check that snakemake is correctly interpreting your sample spreadsheet by executing a dry run or one of the commands in the notes above
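For instance, a minimal dry run might look like this (flags are illustrative; adjust the conda and core options to your setup):
```
snakemake -n --use-conda --cores 1
```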
10 changes: 2 additions & 8 deletions Snakefile
@@ -19,14 +19,8 @@ include: "workflow/rules/04_ViralAnalysis.smk"

rule all:
input:
get_rules,
"results/vcontact2_data/vcontact2_output/genome_by_genome_overview.csv" #"results/vibrant_output/VIBRANT_parsed_scaffolds/VIBRANT_phages_parsed_scaffolds/parsed_scaffolds.phages_combined.faa",
#"results/vibrant_output/VIBRANT_parsed_scaffolds/VIBRANT_phages_parsed_scaffolds/parsed_scaffolds.phages_combined.simple.faa"

#expand("results/vibrant_output/{sample}/VIBRANT_{sample}/VIBRANT_phages_{sample}/{sample}.phages_combined.faa", sample=samples["sample"]),
#"results/spades_parsed/DNA_B03_27/DNA_B03_27.fa",
#"results/vibrant_output/DNA_B03_27/VIBRANT_DNA_B03_27/VIBRANT_phages_DNA_B03_27/DNA_B03_27.phages_combined.faa"
#expand("results/AMP_trimmed/{sample}_fastp-merged.fq.gz", sample=samples["sample"]) #g
get_rules
#"results/vcontact2_data/vcontact2_output/genome_by_genome_overview.csv"

# Make report for snakemake.
report: "workflow/report/workflow.rst"
7 changes: 4 additions & 3 deletions config/config.yaml
@@ -7,23 +7,24 @@ samples: "config/sample_sheet.tsv"

#samples: "config/samples_minimal.tsv"

metaphlan_idx: "mpa_v30_CHOCOPhlAn_201901"
metaphlan_idx: "mpa_vOct22_CHOCOPhlAnSGB_202212"
metaphlan_nwk: "https://github.com/biobakery/MetaPhlAn/blob/master/metaphlan/utils/mpa_vJan21_CHOCOPhlAnSGB_202103.nwk"
human_genome: "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.gz"
genome_name: "hg38.fa.gz"


# Configuration for which rules to execute is listed below
MODULE_READ_QC: True
TRIM_READS: True
COMPLEXITY_FILTER: True
COMPLEXITY_FILTER: False
DECONVOLUTE: True
BOWTIE2: True
MERGE_READS: False
NONPAREIL: True
FASTQC: True

METAPHLAN: False
KRAKEN2: True
KRAKEN2: False
METAXA2: False
ASSEMBLE: True
MEGAHIT: False
File renamed without changes.
249 changes: 1 addition & 248 deletions config/sample_sheet.tsv

Large diffs are not rendered by default.

249 changes: 249 additions & 0 deletions config/sample_sheet_mgx_bas.tsv

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions notes/20230713_notes.md
@@ -0,0 +1,41 @@
# Notes 20230713

Working on several projects: BMO, Hauser, and BAS/multiomics

**Goals**
- Add functional analysis using a gene catalog (see the shell sketch after this list)
- IDEA: https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-022-01259-2
- GET Gene catalog
- RULE prokka annotation - get genes
- RULES coassembly - get more genes (megahit)
- RULE concat all genes (bash)
- RULE cluster/deduplicate all genes (nonredundant; CD-HIT?, orthogroups?)
- RULE functional annotation (eggNOG mapper)
- GET functional table
- RULE align samples to all genes (bowtie2)
- RULE get feature counts (featureCounts)
- GET database QC
- Add functional analysis with HUMAnN
- Fix metaphlan database setup issue
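
A rough shell sketch of the gene-catalog plan above, using the tools named in the list (all file names, paths, and thresholds are placeholders, not tested pipeline settings):

```
# 1. Predict genes on each assembly/coassembly (prodigal, metagenome mode)
prodigal -i scaffolds.fasta -p meta -d genes.fna -a genes.faa

# 2. Concatenate genes from all samples into one file
cat results/*/genes.fna > all_genes.fna

# 3. Cluster/deduplicate into a nonredundant catalog (CD-HIT-EST at 95% identity)
cd-hit-est -i all_genes.fna -o gene_catalog.fna -c 0.95 -n 10 -T 8 -M 16000

# 4. Functionally annotate the catalog (eggNOG-mapper)
#    gene_catalog.faa = proteins for the representative genes, e.g. a subset of genes.faa
emapper.py -i gene_catalog.faa --itype proteins -o catalog_eggnog --cpu 8

# 5. Align samples back to the catalog and count reads per gene
bowtie2-build gene_catalog.fna gene_catalog
bowtie2 -x gene_catalog -1 sample_R1.fq.gz -2 sample_R2.fq.gz -p 8 \
    | samtools sort -o sample.catalog.bam
featureCounts -a gene_catalog.saf -F SAF -o gene_counts.tsv sample.catalog.bam  # SAF built from prodigal coordinates
```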

**Debugging**
- Working on snakemake 7.20.0.
- Still having an issue where snakemake can't activate a conda env when using the "script" directive.
- Made a clone of the current snakemake env: `mamba create --name snakemake_7.20.0 --clone snakemake`
- Updating the snakemake env to 7.30.1 (bug fixed in this version): `mamba install -n snakemake snakemake=7.30.1`

**Prokka and Prodigal**
- Testing in an interactive job: `prokka scaffolds.fasta --cpus 23 --metagenome`
- Using module
- Version 1.14.6
- NOTE: prokka is a wrapper around prodigal with function assignment integrated
- prokka needs shorter contig names (<= 37 characters)
  - awk line from ChatGPT: `awk '/^>/ { sub(/_length_[0-9]+_cov_[0-9.]+/, "", $0) } 1' scaffolds.fa`
- Prodigal `prodigal -i scaffolds.fasta -p meta -o sample.gbk`
- prodigal works and gives the best match in gene calls
- NOTES on function
  - prokka includes functional annotation but has limited default databases
  - can specify that you want to include KEGG and GO
  - headers can be at most 37 characters long
  - changing headers is needed as a step; can be done with the awk line above (see the sketch below)
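
A possible two-step version of that header fix plus annotation (output names are hypothetical):

```
# Strip the _length_..._cov_... suffix so headers fit prokka's 37-character limit
awk '/^>/ { sub(/_length_[0-9]+_cov_[0-9.]+/, "", $0) } 1' scaffolds.fasta > scaffolds.short.fasta

# Annotate the renamed contigs in metagenome mode
prokka scaffolds.short.fasta --outdir prokka_out --prefix sample --metagenome --cpus 23
```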
6 changes: 4 additions & 2 deletions workflow/envs/biopython.yml
@@ -4,5 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- python=3.7
- biopython=1.78
- python>=3.7
- pip
- pip:
- biopython==1.78
2 changes: 1 addition & 1 deletion workflow/envs/metaphlan.yml
@@ -6,5 +6,5 @@ channels:
- biobakery
dependencies:
- python=3.7
- metaphlan=3.0.13
- metaphlan=4
- bowtie2=2.4.4
2 changes: 1 addition & 1 deletion workflow/envs/seq_processing.yml
@@ -4,7 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- fastp=0.22.0
- fastp=0.23
- biopython=1.78
- fastqc=0.11.9
- quast=5.0.2
Expand Down
3 changes: 2 additions & 1 deletion workflow/notebooks/00_NonPareilAnalysis.Rmd
@@ -16,6 +16,7 @@ library(Nonpareil)
# Get list of NonPareil outputs
```{r}
np_list <- list.files(path = "../../results/nonpareil_out", pattern='*\\.npo', recursive=TRUE, full.names=TRUE)
np_list<-np_list[grep("B1|B2", np_list)]
np_length <- length(np_list)
```

@@ -28,7 +29,7 @@ Nonpareil.legend(Nonpareil.curve.batch(np_list,
plot.diversity = F,
main = NA)), cex = 0.90)
Nonpareil.curve.batch(np_list, plot.opts = list(legend = NA,
plot.model = F,
plot.model = T,
plot.diversity = F,
main = NA))
392 changes: 254 additions & 138 deletions workflow/notebooks/00_NonPareilAnalysis.nb.html

Large diffs are not rendered by default.

17 changes: 8 additions & 9 deletions workflow/rules/00_TrimReads.smk
@@ -28,10 +28,8 @@ rule fastp_pe:
--out1 {output.r1_filtered} \
--out2 {output.r2_filtered} \
--detect_adapter_for_pe \
--trim_poly_g \
--dedup \
--thread {threads} \
--length_required 60 \
--length_required 50 \
-j {output.json} \
-h {output.html} \
-V
@@ -100,7 +98,7 @@ rule complexity_filter:
r1 = "results/bbduk_out/{sample}/{sample}.bbduk.r1.fastq.gz",
r2 = "results/bbduk_out/{sample}/{sample}.bbduk.r2.fastq.gz"
params:
entropy = 0.5
entropy = 0.8
threads: 5
resources:
mem="10G"
@@ -113,8 +111,8 @@ rule complexity_filter:
out={output.r1} \
out2={output.r2} \
entropy={params.entropy} \
entropywindow=50 \
entropyk=5 \
entropywindow=100 \
entropyk=10 \
threads={threads}
"""

@@ -135,18 +133,19 @@ rule host_decontamination:
sorted_bam = "results/bowtie_out/{sample}/{sample}.mapped.sorted.bam",
flagstat = "results/bowtie_out/{sample}/{sample}.flagstat.tsv"
params:
filter_db = "resources/bowtie_human/GRCh38_noalt_as/GRCh38_noalt_as",
filter_db = "resources/bowtie_human/chm13.draft_v1.0_plusY/chm13.draft_v1.0_plusY"
#filter_db = "resources/bowtie_human/GRCh38_noalt_as/GRCh38_noalt_as",
#filter_db = "/projects/b1042/HartmannLab/jack/mlm/resources/bowtie_mouse/GRCm39/GRCm39",
threads: 15
resources:
mem="25G",
time="02:00:00"
shell:
"""
module load bowtie2
module load bowtie2/2.4.5
module load samtools/1.10.1
bowtie2 -p {threads} -x {params.filter_db} --very-sensitive -1 {input.r1} -2 {input.r2}| \
bowtie2 -p {threads} -x {params.filter_db} -1 {input.r1} -2 {input.r2}| \
samtools view -bS -@ {threads}| \
samtools sort -@ {threads} -n -o {output.sorted_bam}
13 changes: 8 additions & 5 deletions workflow/rules/02_TaxonomicAnalysis.smk
@@ -10,10 +10,13 @@ def metaphlan_merge_inputs(wildcards):
sample=samples["sample"])
return files

# metaphlan_db_file="resources/metaphlan_db/{}.rev.1.bt2".format(config["metaphlan_idx"])

rule metaphlan_setup:
output:
metaphlan_db=directory("resources/metaphlan_db"),
metaphlan_db_file="resources/metaphlan_db/{}.rev.1.bt2".format(config["metaphlan_idx"])
metaphlan_db_file="resources/metaphlan_db/{}.rev.2.bt2l".format(config["metaphlan_idx"])

conda:
"../envs/metaphlan.yml"
params:
@@ -39,9 +42,9 @@ rule metaphlan:
"../envs/metaphlan.yml"
params:
metaphlan_idx = config["metaphlan_idx"] # Index for metaphlan
threads: 20
threads: 10
resources:
mem="10g",
mem="20g",
time="04:00:00"
shell:
"""
@@ -51,7 +54,7 @@ rule metaphlan:
--bowtie2db {input.metaphlan_db} \
--nproc {threads} \
--input_type fastq \
--unknown_estimation \
--unclassified_estimation \
-t rel_ab_w_read_stats \
-o {output.profile}
"""
@@ -65,7 +68,7 @@ rule metaphlan_merge:
"../envs/metaphlan.yml"
shell:
"""
merge_metaphlan_tables.py {input} > {output}
workflow/scripts/merge_metaphlan_tables_abs.py {input} > {output}
"""

rule metaphlan_species_abundance:
4 changes: 2 additions & 2 deletions workflow/rules/assembly_qc.smk
@@ -50,7 +50,7 @@ rule drop_short_contigs_megahit:
output:
"results/megahit_parsed/{sample}/{sample}.parsed_assembly.fa"
conda:
"../envs/seq_processing.yml"
"../envs/biopython.yml"
script:
"../scripts/parse_contigs.py"

@@ -61,7 +61,7 @@ rule drop_short_contigs_spades:
output:
"results/spades_parsed/{sample}/{sample}.fa"
conda:
"../envs/seq_processing.yml"
"../envs/biopython.yml"
script:
"../scripts/parse_contigs.py"

2 changes: 1 addition & 1 deletion workflow/rules/bin_metabat2.smk
@@ -95,7 +95,7 @@ rule metabat_bin:
"""
metabat2 -t {threads} \
--inFile {input.contigs} \
--outFile {output.bin_dir}/bin \
--outFile {output.bin_dir}/{wildcards.sample}_bin \
--minContig 1500 \
--abdFile {input.depth_fi} \
--seed=100 \
85 changes: 85 additions & 0 deletions workflow/scripts/merge_metaphlan_tables_abs.py
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# Credit to timyerg https://forum.biobakery.org/t/merge-metaphlan-tables-py-with-absolute-abundance/1839

import argparse
import os
import sys
import re
import pandas as pd
from itertools import takewhile

def merge( aaastrIn, ostm ):
    """
    Outputs the table join of the given pre-split string collection.
    :param aaastrIn: One or more split lines from which data are read.
    :type aaastrIn: collection of collections of string collections
    :param iCol: Data column in which IDs are matched (zero-indexed).
    :type iCol: int
    :param ostm: Output stream to which matched rows are written.
    :type ostm: output stream
    """

    listmpaVersion = set()
    merged_tables = pd.DataFrame()

    for f in aaastrIn:
        with open(f) as fin:
            headers = [x.strip() for x in takewhile(lambda x: x.startswith('#'), fin)]
        if len(headers) == 1:
            names = ['clade_name', 'estimated_number_of_reads_from_the_clade']
            index_col = 0
        if len(headers) >= 4:
            names = headers[-1].split('#')[1].strip().split('\t')
            index_col = [0,1]

        mpaVersion = list(filter(re.compile('#mpa_v[0-9]{2,}_CHOCOPhlAn_[0-9]{0,}').match, headers))
        if len(mpaVersion):
            listmpaVersion.add(mpaVersion[0])

        if len(listmpaVersion) > 1:
            print('merge_metaphlan_tables found tables made with different versions of the MetaPhlAn2 database.\nPlease re-run MetaPhlAn2 with the same database.\n')
            return

        iIn = pd.read_csv(f,
                          sep='\t',
                          skiprows=len(headers),
                          names = names,
                          usecols=range(5),
                          ).fillna('')
        iIn.drop(['relative_abundance','coverage'],axis=1,inplace=True)
        iIn = iIn.set_index(iIn.columns[index_col].to_list())
        if merged_tables.empty:
            merged_tables = iIn.iloc[:,0].rename(os.path.splitext(os.path.basename(f))[0]).to_frame()
        else:
            merged_tables = pd.merge(iIn.iloc[:,0].rename(os.path.splitext(os.path.basename(f))[0]).to_frame(),
                                     merged_tables,
                                     how='outer',
                                     left_index=True,
                                     right_index=True
                                     )
    if listmpaVersion:
        ostm.write(list(listmpaVersion)[0]+'\n')
    merged_tables.fillna('0').reset_index().to_csv(ostm, index=False, sep = '\t')

argp = argparse.ArgumentParser( prog = "merge_metaphlan_tables_abs.py",
    description = """Performs a table join on one or more metaphlan output files.""")
argp.add_argument( "aistms", metavar = "input.txt", nargs = "+",
    help = "One or more tab-delimited text tables to join" )
argp.add_argument( '-o', metavar = "output.txt", nargs = 1,
    help = "Name of output file in which joined tables are saved" )

__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" )

argp.usage = argp.format_usage()[7:]+"\n\n\tPlease make sure to supply file paths to the files to combine. If combining 3 files (Table1.txt, Table2.txt, and Table3.txt) the call should be:\n\n\t\tpython merge_metaphlan_tables_abs.py Table1.txt Table2.txt Table3.txt > output.txt\n\n\tA wildcard to indicate all .txt files that start with Table can be used as follows:\n\n\t\tpython merge_metaphlan_tables_abs.py Table*.txt > output.txt"


def main( ):
    args = argp.parse_args( )
    if args.o is None:
        merge(args.aistms, sys.stdout)
    else:
        with open(args.o[0], 'w') as fout:
            merge(args.aistms, fout)

if __name__ == '__main__':
    main()
