Merge pull request #411 from Ecogenomics/write_doc

Write doc
Ecogenomics · Jul 4, 2022 · 81abbe6 · 81abbe6
2 parents 12f4de7 + 4d9dcf4
commit 81abbe6
Show file tree

Hide file tree

Showing 12 changed files with 627 additions and 138 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -4,3 +4,6 @@ sphinx-rtd-theme ~= 0.5.0
 recommonmark ~= 0.7.0
 sphinx-sitemap ~= 2.2.0
 nbsphinx ~= 0.8.0
+matplotlib ~= 3.5.2
+linuxdoc == 20211220
+jupyter ~= 1.0.0
diff --git a/docs/src/announcements.rst b/docs/src/announcements.rst
@@ -3,7 +3,7 @@ Announcements
 
 
 GTDB-Tk 2.1.0 available
--------------------
+-----------------------
 
 *May 11, 2022*
 

diff --git a/docs/src/commands/convert_to_itol.rst b/docs/src/commands/convert_to_itol.rst
@@ -0,0 +1,41 @@
+.. _commands/convert_to_itol:
+
+convert_to_itol
+===============
+
+The ``convert_to_itol`` command removes internal labels from a Newick tree, making it suitable for visualization in `iTOL <http://itol.embl.de/>`_.
+
+Arguments
+---------
+
+.. argparse::
+   :module: gtdbtk.cli
+   :func: get_main_parser
+   :prog: gtdbtk
+   :path: convert_to_itol
+   :nodefaultconst:
+
+Example
+-------
+
+Input
+^^^^^
+
+
+.. code-block:: bash
+
+    gtdbtk convert_to_itol --input /tmp/decorated.tree --output new.tree
+
+
+Output
+^^^^^^
+
+
+.. code-block:: text
+
+    [2022-06-30 18:44:54] INFO: GTDB-Tk v2.1.0
+    [2022-06-30 18:44:54] INFO: gtdbtk convert_to_itol --input /tmp/decorated.tree --output new.tree
+    [2022-06-30 18:44:54] INFO: Using GTDB-Tk reference data version r207: /gtdbtk-data
+    [2022-06-30 18:44:54] INFO: Convert GTDB-Tk tree to iTOL format
+    [2022-06-30 18:44:54] INFO: Done.
+
diff --git a/docs/src/commands/index.rst b/docs/src/commands/index.rst
@@ -14,6 +14,7 @@ Below is a list of all GTDB-Tk command line options:
    check_install
    classify
    classify_wf
+   convert_to_itol
    de_novo_wf
    decorate
    export_msa

diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -28,8 +28,8 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['sphinxarg.ext', 'sphinx.ext.napoleon', 'sphinx.ext.autodoc',
-              'recommonmark', 'sphinx_sitemap', 'nbsphinx']
+extensions = ['sphinxarg.ext', 'sphinx.ext.napoleon', 'sphinx.ext.autodoc', 'linuxdoc.rstFlatTable',
+              'recommonmark', 'sphinx_sitemap', 'nbsphinx','matplotlib.sphinxext.plot_directive']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

diff --git a/docs/src/index.rst b/docs/src/index.rst
@@ -46,6 +46,7 @@ We encourage you to cite GTDB-Tk and the third-party dependencies as described i
    :caption: Running GTDB-Tk
    :maxdepth: 1
 
+   performance/index
    commands/index
    files/index
    examples/classify_wf

diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
@@ -38,7 +38,7 @@ Hardware requirements
      - ~65 GB
      - ~1 hour / 1,000 genomes @ 64 CPUs
    * - Bacteria
-     - ~320 GB ( 55GB for divide-and-conquer)
+     - ~55 GB (~320 GB when using --full_tree)
      - ~65 GB
      - ~1 hour / 1,000 genomes @ 64 CPUs
 

diff --git a/docs/src/performance/accuracy.rst b/docs/src/performance/accuracy.rst
@@ -0,0 +1,58 @@
+.. _performance/Accuracy:
+
+
+Accuracy
+========
+
+| The similarity of GTDB-Tk v1 and v2 classifications was first assessed using 16,710 bacterial genomes from the GEMs dataset (Nayfach et al., 2021) that represent novel taxa relative to GTDB R07-RS207.
+| Only 12 genomes (0.07%) did not have identical classifications between GTDB-Tk v1 and the divide-and-conquer approach used in GTDB-Tk v2.
+| Most of the incongruence was due to genomes being over-classified (6 genomes) or under-classified (4 genomes) by a single taxonomic rank. Only 2 genomes had conflicting taxonomic assignments, and both were relatively poor-quality genomes assigned to new classes in alternative phyla.
+
+.. flat-table:: Table 1. Novelty of GEM genomes relative to GTDB R07-RS207 based on GTDB-Tk v1 classifications.
+   :header-rows: 2
+
+   * -
+     -
+     - :cspan:`3` GTDB-Tk v2 classifications relative to GTDB-Tk v1 classifications
+   * - Taxon novelty
+     - No. genomes
+     - Congruent
+     - Conflict
+     - Underclassified
+     - Overclassified
+   * - Novel phylum
+     - 3
+     - 2
+     - 0
+     - 0
+     - 1
+   * - Novel class
+     - 42
+     - 35
+     - 2
+     - 2
+     - 2
+   * - Novel order
+     - 144
+     - 143
+     - 0
+     - 0
+     - 1
+   * - Novel family
+     - 543
+     - 540
+     - 0
+     - 1
+     - 2
+   * - Novel genus
+     - 3,222
+     - 3,219
+     - 0
+     - 1
+     - 0
+   * - Novel species
+     - 12,756
+     - 12,756
+     - 0
+     - 0
+     - 0
diff --git a/docs/src/performance/index.rst b/docs/src/performance/index.rst
@@ -0,0 +1,12 @@
+.. _performance:
+
+############################
+  Performance and Accuracy
+############################
+
+
+.. toctree::
+   :maxdepth: 1
+
+   performance
+   accuracy
diff --git a/docs/src/performance/performance.rst b/docs/src/performance/performance.rst
@@ -0,0 +1,77 @@
+.. _performance/Performance:
+
+Performance
+===========
+
+
+| GTDB-Tk v2 runs 22% to 35% faster than GTDB-Tk v1 when processing 1,000 genomes with 1 to 64 CPUs (Fig. 1) and is >40% faster when processing 5,000 genomes using 32 CPUs (Fig. 2).
+| The tests below were run on a machine with four AMD EPYC 7402 24-core processors and 512 GB of RAM.
+
+.. plot::
+
+      import matplotlib.pyplot as plt
+      from matplotlib.ticker import ScalarFormatter
+      import numpy as np
+
+      color_setup = ['#5a6855','red','color']
+      setup = color_setup
+      cpus_list = [1, 8, 16, 32, 64]
+      split_list = [2571, 457, 309, 230, 216]
+      nosplit_list= [3980, 638, 398, 295, 279]
+      values = ['1', '8', '16', '32','64']
+
+      fig, ax = plt.subplots()
+
+      plt.ylim(200,4500)
+      plt.xticks(cpus_list, values)
+
+      ax.scatter(cpus_list, split_list, label="GTDB-Tk v2", marker="s", s=30, color=setup[0])
+      ax.plot(cpus_list, split_list,linestyle='dashed', color=setup[0])
+      ax.scatter(cpus_list, nosplit_list, label="GTDB-Tk v1",color=setup[1])
+      ax.plot(cpus_list, nosplit_list,linestyle=':', color=setup[1])
+
+      ax.set_yscale('log')
+      ax.set_yticks([200,500,1000,2000,4000])
+      ax.yaxis.set_major_formatter(ScalarFormatter())
+
+      plt.ylabel('Runtime (min)')
+      plt.xlabel('No. CPUs')
+      plt.title('Fig.1: GTDB-Tk runtime for 1000 genomes')
+
+      # show a legend on the plot
+      plt.legend(loc=1, prop={'size': 12},frameon=False)
+      plt.show()
+
+Fig. 1: Processing time for 1,000 randomly selected GEM MAGs for increasing numbers of CPUs.
+
+.. plot::
+
+      color_setup = ['#5a6855','red','color']
+      setup = color_setup
+      pool_size_list=[10, 50, 100, 200, 500, 1000, 2000, 5000]
+      split_list=[88, 150, 160, 169, 195, 235, 312, 558]
+      nosplit_list=[137, 151, 163, 180, 219, 280, 416, 934]
+      values = [10,500,1000,1500,2000,2500,3000,3500,4000,4500,5000]
+
+      plt.scatter(pool_size_list, split_list, label="GTDB-Tk v2" , marker="s", s=30, color=setup[0])
+      plt.plot(pool_size_list, split_list,linestyle='dashed', color=setup[0])
+      plt.scatter(pool_size_list, nosplit_list, label="GTDB-Tk v1",color=setup[1])
+      plt.plot(pool_size_list, nosplit_list,linestyle=':', color=setup[1])
+
+      # naming the x axis
+
+      plt.xticks(values)
+      plt.yticks([100,300,500,700,900])
+
+      # naming the axis
+      plt.ylabel('Runtime (min)')
+      plt.xlabel('No. genomes')
+      # giving a title to my graph
+      plt.title('Fig.2: GTDB-Tk runtime with 32CPUs')
+
+      # show a legend on the plot
+      plt.legend(loc=2, prop={'size': 12},frameon=False)
+
+      # function to show the plot
+      plt.show()
+
+Fig. 2: Processing time with 32 CPUs on increasing numbers of randomly selected GEM MAGs.
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
@@ -218,7 +218,7 @@
 PPLACER_AR53_REF_PKG = f"gtdb_{VERSION_DATA}_ar53.refpkg"
 PPLACER_RPS23_REF_PKG = f"gtdb_{VERSION_DATA}_rps23.refpkg"
 PPLACER_MIN_RAM_BAC_FULL = 320
-PPLACER_MIN_RAM_BAC_SPLIT = 50
+PPLACER_MIN_RAM_BAC_SPLIT = 55
 PPLACER_MIN_RAM_ARC = 40
 
 # Fastani configuration
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,7 +3,7 @@ Announcements @@
     GTDB-Tk 2.1.0 available
-    -------------------
+    -----------------------
     *May 11, 2022*
@@ Expand Down @@