prepare for release

wbyu · Sep 10, 2018 · 3fec17a · 3fec17a
1 parent 0352606
commit 3fec17a
Show file tree

Hide file tree

Showing 8 changed files with 123,700 additions and 114,180 deletions.
diff --git a/doc/source/index.rst b/doc/source/index.rst
@@ -79,6 +79,53 @@ wgd CLI:
 
 .. image:: flowchart.png
 
+Citation
+========
+
+A publication of the ``wgd`` package is in preparation. Until it is published,
+please cite the GitHub repository.
+
+Further, for the specific tools in ``wgd``, please cite the following.
+
+If you use ``wgd mcl``, please cite::
+
+   - Altschul, S. F., Madden, T. L., Schäffer, A. A., Zhang, J., Zhang, Z., Miller, W.,
+   and Lipman, D. J. (1997). Gapped BLAST and PSI-BLAST: a new generation of
+   protein database search programs. Nucleic Acids Research, 25(17), 3389–3402.
+
+   - van Dongen, S. (2000). Graph Clustering by Flow Simulation. Ph.D. thesis,
+   University of Utrecht, Utrecht.
+
+For ``wgd ksd``, please cite::
+
+   - Yang, Z. (2007). PAML 4: Phylogenetic Analysis by Maximum Likelihood.
+   Molecular Biology and Evolution, 24(8), 1586–1591.
+
+   - [if using MUSCLE] Edgar, R. C. (2004). MUSCLE: multiple sequence alignment with high accuracy and
+   high throughput. Nucleic Acids Research, 32(5), 1792–1797.
+
+   - [if using MAFFT] Katoh, K. and Standley, D. M. (2013). MAFFT multiple sequence alignment software
+   version 7: improvements in performance and usability. Molecular Biology and
+   Evolution, 30(4), 772–780.
+
+   - [if using PRANK] Löytynoja, A. and Goldman, N. (2008). Phylogeny-Aware Gap Placement Prevents
+   Errors in Sequence Alignment and Evolutionary Analysis. Science, 320(5883),
+   1632–1635.
+
+   - [if using FastTree] Price, M. N., Dehal, P. S., and Arkin, A. P. (2010).
+   FastTree 2 - Approximately Maximum-Likelihood Trees for Large Alignments.
+   PLOS ONE, 5(3), e9490.
+
+   - [if using PhyML] Guindon, S., Dufayard, J.-F., Lefort, V., Anisimova, M., Hordijk, W., and
+   Gascuel, O. (2010). New algorithms and methods to estimate maximum-likelihood
+   phylogenies: assessing the performance of PhyML 3.0. Systematic Biology, 59(3),
+   307–321.
+
+For ``wgd syn``, please cite::
+
+   - Proost, S., Fostier, J., De Witte, D., Dhoedt, B., Demeester, P., Van de Peer, Y., and
+   Vandepoele, K. (2012). i-ADHoRe 3.0: fast and sensitive detection of genomic
+   homology in extremely large data sets. Nucleic Acids Research, 40(2), e11.
+
 
 Python package
 ==============

diff --git a/example/data/ath.ks.tsv b/example/data/ath.ks.tsv
diff --git a/example/data/ath.ks_anchors.tsv b/example/data/ath.ks_anchors.tsv
diff --git a/example/data/ath.mcl b/example/data/ath.mcl
diff --git a/example/data/ath_anchors.ks.tsv b/example/data/ath_anchors.ks.tsv
diff --git a/setup.py b/setup.py
@@ -2,10 +2,7 @@
 """
 Arthur Zwaenepoel
 
-Version 0.1:
-    - Rewrite of Ks distribution construction
-    - Implementation of one-vs-one ortholog Ks distributions
-    - Phylogenetic trees for weighting (FastTree & PhyML)
+Version 1.0
 
 Copyright (C) 2018 Arthur Zwaenepoel
 
@@ -27,15 +24,18 @@
 
 from setuptools import setup
 
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
 setup(
     name='wgd',
-    version='0.1.1',
+    version='1.0',
     packages=['wgd'],
     url='http://github.com/arzwa/wgd',
     license='GPL',
     author='Arthur Zwanepoel',
     author_email='[email protected]',
-    description='MORPH bulk CLI',
+    description='wgd',
     py_modules=['wgd_cli'],
     include_package_data=True,
     install_requires=[

diff --git a/wgd/viz.py b/wgd/viz.py
@@ -28,6 +28,7 @@
 from .modeling import filter_group_data
 import plumbum as pb
 import matplotlib
+
 if not 'DISPLAY' in pb.local.env:
     matplotlib.use('Agg')  # use this backend when no X server
 import matplotlib.pyplot as plt
@@ -39,7 +40,8 @@
 import pandas as pd
 
 
-def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40, **kwargs):
+def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40,
+               **kwargs):
     """
     Plot a bunch of histograms stacked on each other.
 
@@ -54,7 +56,7 @@ def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40, **kwargs)
     :param kwargs: other args for plt.hist
     :return: ax
     """
-    xlabs = {"Ks" : "K_{\mathrm{S}}", "Ka": "K_{\mathrm{A}}", "Omega": "\omega"}
+    xlabs = {"Ks": "K_{\mathrm{S}}", "Ka": "K_{\mathrm{A}}", "Omega": "\omega"}
     data = [dist[var] for dist in dists]
     xlab = xlabs[var]
     if scale == "log10":
@@ -107,7 +109,8 @@ def plot_selection(
 
     # filtering and node-weighting
     for i in range(len(dists)):
-        dists[i] = filter_group_data(dists[i], filters[0], filters[1], filters[2],
+        dists[i] = filter_group_data(dists[i], filters[0], filters[1],
+                                     filters[2],
                                      ks_range[0], ks_range[1])
 
     # assemble the figure
@@ -116,7 +119,7 @@ def plot_selection(
     ax.set_xlim(0, ks_range[1])
     ax = fig.add_subplot(2, 2, 2)
     plot_dists(dists, "Ks", "log10", ax, alphas, colors, labels, bins, **kwargs)
-    ax.set_xlim(np.log10(ks_range[0]+1e-5), np.log10(ks_range[1]))
+    ax.set_xlim(np.log10(ks_range[0] + 1e-5), np.log10(ks_range[1]))
     ax = fig.add_subplot(2, 2, 3)
     plot_dists(dists, "Ka", "log10", ax, alphas, colors, labels, bins, **kwargs)
     ax = fig.add_subplot(2, 2, 4)
@@ -169,15 +172,14 @@ def syntenic_dotplot(df, min_length=250, output_file=None):
         genomic_elements[kv[0]] = previous
         previous += kv[1]
 
-    x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
+    x = [genomic_elements[key] for key in sorted(genomic_elements.keys())] + \
+        [previous]
     x = sorted(list(set(x)))  # FIXME hack
     if len(x) == 0:
         logging.warning("No multiplicons found!")
         return
 
     # plot layout stuff!
-    x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
-    x = sorted(list(set(x)))
     ax.vlines(ymin=0, ymax=previous, x=x, linestyles='dotted', alpha=0.2)
     ax.hlines(xmin=0, xmax=previous, y=x, linestyles='dotted', alpha=0.2)
     ax.plot(x, x, color='k', alpha=0.2)
@@ -222,7 +224,8 @@ def syntenic_dotplot(df, min_length=250, output_file=None):
 
 
 def syntenic_dotplot_ks_colored(
-        df, an, ks, min_length=250, color_map='Spectral', output_file=None
+        df, an, ks, min_length=50, color_map='Spectral', min_ks=0.05, max_ks=5,
+        output_file=None
 ):
     """
     Syntenic dotplot with segment colored by mean Ks value
@@ -232,6 +235,8 @@ def syntenic_dotplot_ks_colored(
     :param ks: Ks distribution data frame
     :param min_length: minimum length of a genomic element
     :param color_map: color map string
+    :param min_ks: minimum median Ks value
+    :param max_ks: maximum median Ks value
     :param output_file: output file name
     :return: figure
     """
@@ -253,8 +258,7 @@ def syntenic_dotplot_ks_colored(
         pairs = an[an['multiplicon'] == row['id']]['pair']
         med_ks = np.median(ks.loc[ks.index.intersection(pairs)]['Ks'])
         ks_multiplicons[row['id']] = med_ks
-        if med_ks < 5:
-            all_ks.append(med_ks)
+        all_ks.append(med_ks)
 
     z = [[0, 0], [0, 0]]
     levels = range(0, 101, 1)
@@ -281,7 +285,8 @@ def syntenic_dotplot_ks_colored(
         previous += kv[1]
 
     # plot layout
-    x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
+    x = [genomic_elements[key] for key in sorted(genomic_elements.keys())] + \
+        [previous]
     x = sorted(list(set(x)))
     ax.vlines(ymin=0, ymax=previous, x=x, linestyles='dotted', alpha=0.2)
     ax.hlines(xmin=0, xmax=previous, y=x, linestyles='dotted', alpha=0.2)
@@ -315,13 +320,15 @@ def syntenic_dotplot_ks_colored(
              [row['begin_x'], row['end_x']]]
         y = [genomic_elements[list_y] + x for x in
              [row['begin_y'], row['end_y']]]
-        ax.plot(x, y, alpha=0.9, linewidth=3,
-                color=cmap(ks_multiplicons[row['id']] / 5)),
-                # path_effects=[pe.Stroke(linewidth=4, foreground='k'), pe.Normal()])
-        ax.plot(y, x, alpha=0.9, linewidth=3,
-                color=cmap(ks_multiplicons[row['id']] / 5))
-                # path_effects=[pe.Stroke(linewidth=4, foreground='k'),
-                              #pe.Normal()])
+        med_ks = ks_multiplicons[row['id']]
+        if min_ks < med_ks <= max_ks:
+            ax.plot(x, y, alpha=0.9, linewidth=3,
+                    color=cmap(ks_multiplicons[row['id']] / 5)),
+            # path_effects=[pe.Stroke(linewidth=4, foreground='k'), pe.Normal()])
+            ax.plot(y, x, alpha=0.9, linewidth=3,
+                    color=cmap(ks_multiplicons[row['id']] / 5))
+            # path_effects=[pe.Stroke(linewidth=4, foreground='k'),
+            # pe.Normal()])
 
     # colorbar
     cbar = plt.colorbar(tmp, fraction=0.02, pad=0.01)
@@ -427,7 +434,7 @@ def get_data(df, var, scale, r1, r2, outliers_included):
             label="Don't adapt weights when filtering", active=False)
 
     # set up figure
-    p1 = figure(plot_width=1000, plot_height=700, # output_backend="svg",
+    p1 = figure(plot_width=1000, plot_height=700,  # output_backend="svg",
                 tools='pan,wheel_zoom,xwheel_zoom,ywheel_zoom,save')
     p1.xgrid.grid_line_color = None
     p1.ygrid.grid_line_color = None

diff --git a/wgd_cli.py b/wgd_cli.py
@@ -159,7 +159,8 @@
         '--logfile', '-l', default=None,
         help="File to write logs to (optional)"
 )
-def cli(verbosity, logfile):
+@click.option('--version', is_flag=True, help="Print version number")
+def cli(verbosity, logfile, version):
     """
     Welcome to the wgd command line interface!
 
@@ -205,6 +206,9 @@ def cli(verbosity, logfile):
         with open(os.devnull, "w") as f:
             subprocess.call("taskset -p 0xffffffffffff %d" % os.getpid(),
                             shell=True, stdout=f)
+
+    if version:
+        logging.info("This is wgd v1.0")
     pass
 
 
@@ -715,9 +719,13 @@ def ksd_(
         help="minimum length of a genomic element (in numbers of genes) to be "
              "included in dotplot."
 )
+@click.option(
+        '--ks_range', '-r', nargs=2, default=(0.05, 5), show_default=True,
+        type=float, help='Ks range to use for colored dotplot'
+)
 def syn(
         gff_file, gene_families, ks_distribution, output_dir, feature,
-        gene_attribute, min_length
+        gene_attribute, min_length, ks_range
 ):
     """
     Co-linearity analyses.
@@ -735,13 +743,13 @@ def syn(
     """
     syn_(
             gff_file, gene_families, output_dir, ks_distribution, feature,
-            gene_attribute, min_length
+            gene_attribute, min_length, ks_range
     )
 
 
 def syn_(
         gff_file, families, output_dir, ks_distribution, feature='mRNA',
-        gene_attribute='Parent', min_length=250
+        gene_attribute='Parent', min_length=250, ks_range=(0.05, 5)
 ):
     """
     Co-linearity analysis with I-ADHoRe 3.0. For usage in the ``wgd`` CLI.
@@ -849,8 +857,9 @@ def syn_(
 
         logging.info("Generating Ks colored (median Ks) dotplot")
         syntenic_dotplot_ks_colored(
-                multiplicons, anchor_points, anchors,
-                output_file=dotplot_out, min_length=min_length
+                multiplicons, anchor_points, anchors, min_ks=ks_range[0],
+                max_ks=ks_range[1], output_file=dotplot_out,
+                min_length=min_length
         )
 
         logging.info("Generating histogram")