Skip to content

Commit

Permalink
prepare for release
Browse files Browse the repository at this point in the history
  • Loading branch information
Arthur ZWAENEPOEL committed Sep 10, 2018
1 parent 0352606 commit 3fec17a
Show file tree
Hide file tree
Showing 8 changed files with 123,700 additions and 114,180 deletions.
47 changes: 47 additions & 0 deletions doc/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,53 @@ wgd CLI:

.. image:: flowchart.png

Citation
========

A publication describing the ``wgd`` package is in preparation. Until it is
published, please cite the GitHub repository.

Further, for the specific tools used in ``wgd``, please cite the following.

If you use ``wgd mcl``, please cite::

- Altschul, S. F., Madden, T. L., Schäffer, A. A., Zhang, J., Zhang, Z., Miller, W.,
and Lipman, D. J. (1997). Gapped BLAST and PSI-BLAST: a new generation of
protein database search programs. Nucleic Acids Research, 25(17), 3389–3402.

- van Dongen, S. (2000). Graph Clustering by Flow Simulation. Ph.D. thesis,
University of Utrecht, Utrecht.

For ``wgd ksd``, please cite::

- Yang, Z. (2007). PAML 4: Phylogenetic Analysis by Maximum Likelihood.
Molecular Biology and Evolution, 24(8), 1586–1591.

- [if using MUSCLE] Edgar, R. C. (2004). MUSCLE: multiple sequence alignment with high accuracy and
high throughput. Nucleic Acids Research, 32(5), 1792–1797.

- [if using MAFFT] Katoh, K. and Standley, D. M. (2013). MAFFT multiple sequence alignment software
version 7: improvements in performance and usability. Molecular Biology and
Evolution, 30(4), 772–780.

- [if using PRANK] Löytynoja, A. and Goldman, N. (2008). Phylogeny-Aware Gap Placement Prevents
Errors in Sequence Alignment and Evolutionary Analysis. Science, 320(5883),
1632–1635.

- [if using FastTree] Price, M. N., Dehal, P. S., and Arkin, A. P. (2010). FastTree 2 - Approximately
Maximum-Likelihood Trees for Large Alignments. PLOS ONE, 5(3), e9490.

- [if using PhyML] Guindon, S., Dufayard, J.-F., Lefort, V., Anisimova, M., Hordijk, W., and
Gascuel, O. (2010). New algorithms and methods to estimate maximum-likelihood
phylogenies: assessing the performance of PhyML 3.0. Systematic Biology, 59(3),
307–321.

For ``wgd syn``, please cite::

- Proost, S., Fostier, J., De Witte, D., Dhoedt, B., Demeester, P., Van de Peer, Y., and
Vandepoele, K. (2012). i-ADHoRe 3.0: fast and sensitive detection of genomic
homology in extremely large data sets. Nucleic Acids Research, 40(2), e11.


Python package
==============
Expand Down
216,573 changes: 105,984 additions & 110,589 deletions example/data/ath.ks.tsv

Large diffs are not rendered by default.

5,433 changes: 5,433 additions & 0 deletions example/data/ath.ks_anchors.tsv

Large diffs are not rendered by default.

12,189 changes: 12,189 additions & 0 deletions example/data/ath.mcl

Large diffs are not rendered by default.

3,560 changes: 0 additions & 3,560 deletions example/data/ath_anchors.ks.tsv

This file was deleted.

12 changes: 6 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
"""
Arthur Zwaenepoel
Version 0.1:
- Rewrite of Ks distribution construction
- Implementation of one-vs-one ortholog Ks distributions
- Phylogenetic trees for weighting (FastTree & PhyML)
Version 1.0
Copyright (C) 2018 Arthur Zwaenepoel
Expand All @@ -27,15 +24,18 @@

from setuptools import setup

with open("README.md", "r") as fh:
long_description = fh.read()

setup(
name='wgd',
version='0.1.1',
version='1.0',
packages=['wgd'],
url='http://github.com/arzwa/wgd',
license='GPL',
author='Arthur Zwanepoel',
author_email='[email protected]',
description='MORPH bulk CLI',
description='wgd',
py_modules=['wgd_cli'],
include_package_data=True,
install_requires=[
Expand Down
45 changes: 26 additions & 19 deletions wgd/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .modeling import filter_group_data
import plumbum as pb
import matplotlib

if not 'DISPLAY' in pb.local.env:
matplotlib.use('Agg') # use this backend when no X server
import matplotlib.pyplot as plt
Expand All @@ -39,7 +40,8 @@
import pandas as pd


def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40, **kwargs):
def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40,
**kwargs):
"""
Plot a bunch of histograms stacked on each other.
Expand All @@ -54,7 +56,7 @@ def plot_dists(dists, var, scale, ax, alphas, colors, labels, bins=40, **kwargs)
:param kwargs: other args for plt.hist
:return: ax
"""
xlabs = {"Ks" : "K_{\mathrm{S}}", "Ka": "K_{\mathrm{A}}", "Omega": "\omega"}
xlabs = {"Ks": "K_{\mathrm{S}}", "Ka": "K_{\mathrm{A}}", "Omega": "\omega"}
data = [dist[var] for dist in dists]
xlab = xlabs[var]
if scale == "log10":
Expand Down Expand Up @@ -107,7 +109,8 @@ def plot_selection(

# filtering and node-weighting
for i in range(len(dists)):
dists[i] = filter_group_data(dists[i], filters[0], filters[1], filters[2],
dists[i] = filter_group_data(dists[i], filters[0], filters[1],
filters[2],
ks_range[0], ks_range[1])

# assemble the figure
Expand All @@ -116,7 +119,7 @@ def plot_selection(
ax.set_xlim(0, ks_range[1])
ax = fig.add_subplot(2, 2, 2)
plot_dists(dists, "Ks", "log10", ax, alphas, colors, labels, bins, **kwargs)
ax.set_xlim(np.log10(ks_range[0]+1e-5), np.log10(ks_range[1]))
ax.set_xlim(np.log10(ks_range[0] + 1e-5), np.log10(ks_range[1]))
ax = fig.add_subplot(2, 2, 3)
plot_dists(dists, "Ka", "log10", ax, alphas, colors, labels, bins, **kwargs)
ax = fig.add_subplot(2, 2, 4)
Expand Down Expand Up @@ -169,15 +172,14 @@ def syntenic_dotplot(df, min_length=250, output_file=None):
genomic_elements[kv[0]] = previous
previous += kv[1]

x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
x = [genomic_elements[key] for key in sorted(genomic_elements.keys())] + \
[previous]
x = sorted(list(set(x))) # FIXME hack
if len(x) == 0:
logging.warning("No multiplicons found!")
return

# plot layout stuff!
x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
x = sorted(list(set(x)))
ax.vlines(ymin=0, ymax=previous, x=x, linestyles='dotted', alpha=0.2)
ax.hlines(xmin=0, xmax=previous, y=x, linestyles='dotted', alpha=0.2)
ax.plot(x, x, color='k', alpha=0.2)
Expand Down Expand Up @@ -222,7 +224,8 @@ def syntenic_dotplot(df, min_length=250, output_file=None):


def syntenic_dotplot_ks_colored(
df, an, ks, min_length=250, color_map='Spectral', output_file=None
df, an, ks, min_length=50, color_map='Spectral', min_ks=0.05, max_ks=5,
output_file=None
):
"""
Syntenic dotplot with segment colored by mean Ks value
Expand All @@ -232,6 +235,8 @@ def syntenic_dotplot_ks_colored(
:param ks: Ks distribution data frame
:param min_length: minimum length of a genomic element
:param color_map: color map string
:param min_ks: minimum median Ks value
:param max_ks: maximum median Ks value
:param output_file: output file name
:return: figure
"""
Expand All @@ -253,8 +258,7 @@ def syntenic_dotplot_ks_colored(
pairs = an[an['multiplicon'] == row['id']]['pair']
med_ks = np.median(ks.loc[ks.index.intersection(pairs)]['Ks'])
ks_multiplicons[row['id']] = med_ks
if med_ks < 5:
all_ks.append(med_ks)
all_ks.append(med_ks)

z = [[0, 0], [0, 0]]
levels = range(0, 101, 1)
Expand All @@ -281,7 +285,8 @@ def syntenic_dotplot_ks_colored(
previous += kv[1]

# plot layout
x = [genomic_elements[key] for key in sorted(genomic_elements.keys())]
x = [genomic_elements[key] for key in sorted(genomic_elements.keys())] + \
[previous]
x = sorted(list(set(x)))
ax.vlines(ymin=0, ymax=previous, x=x, linestyles='dotted', alpha=0.2)
ax.hlines(xmin=0, xmax=previous, y=x, linestyles='dotted', alpha=0.2)
Expand Down Expand Up @@ -315,13 +320,15 @@ def syntenic_dotplot_ks_colored(
[row['begin_x'], row['end_x']]]
y = [genomic_elements[list_y] + x for x in
[row['begin_y'], row['end_y']]]
ax.plot(x, y, alpha=0.9, linewidth=3,
color=cmap(ks_multiplicons[row['id']] / 5)),
# path_effects=[pe.Stroke(linewidth=4, foreground='k'), pe.Normal()])
ax.plot(y, x, alpha=0.9, linewidth=3,
color=cmap(ks_multiplicons[row['id']] / 5))
# path_effects=[pe.Stroke(linewidth=4, foreground='k'),
#pe.Normal()])
med_ks = ks_multiplicons[row['id']]
if min_ks < med_ks <= max_ks:
ax.plot(x, y, alpha=0.9, linewidth=3,
color=cmap(ks_multiplicons[row['id']] / 5)),
# path_effects=[pe.Stroke(linewidth=4, foreground='k'), pe.Normal()])
ax.plot(y, x, alpha=0.9, linewidth=3,
color=cmap(ks_multiplicons[row['id']] / 5))
# path_effects=[pe.Stroke(linewidth=4, foreground='k'),
# pe.Normal()])

# colorbar
cbar = plt.colorbar(tmp, fraction=0.02, pad=0.01)
Expand Down Expand Up @@ -427,7 +434,7 @@ def get_data(df, var, scale, r1, r2, outliers_included):
label="Don't adapt weights when filtering", active=False)

# set up figure
p1 = figure(plot_width=1000, plot_height=700, # output_backend="svg",
p1 = figure(plot_width=1000, plot_height=700, # output_backend="svg",
tools='pan,wheel_zoom,xwheel_zoom,ywheel_zoom,save')
p1.xgrid.grid_line_color = None
p1.ygrid.grid_line_color = None
Expand Down
21 changes: 15 additions & 6 deletions wgd_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@
'--logfile', '-l', default=None,
help="File to write logs to (optional)"
)
def cli(verbosity, logfile):
@click.option('--version', is_flag=True, help="Print version number")
def cli(verbosity, logfile, version):
"""
Welcome to the wgd command line interface!
Expand Down Expand Up @@ -205,6 +206,9 @@ def cli(verbosity, logfile):
with open(os.devnull, "w") as f:
subprocess.call("taskset -p 0xffffffffffff %d" % os.getpid(),
shell=True, stdout=f)

if version:
logging.info("This is wgd v1.0")
pass


Expand Down Expand Up @@ -715,9 +719,13 @@ def ksd_(
help="minimum length of a genomic element (in numbers of genes) to be "
"included in dotplot."
)
@click.option(
'--ks_range', '-r', nargs=2, default=(0.05, 5), show_default=True,
type=float, help='Ks range to use for colored dotplot'
)
def syn(
gff_file, gene_families, ks_distribution, output_dir, feature,
gene_attribute, min_length
gene_attribute, min_length, ks_range
):
"""
Co-linearity analyses.
Expand All @@ -735,13 +743,13 @@ def syn(
"""
syn_(
gff_file, gene_families, output_dir, ks_distribution, feature,
gene_attribute, min_length
gene_attribute, min_length, ks_range
)


def syn_(
gff_file, families, output_dir, ks_distribution, feature='mRNA',
gene_attribute='Parent', min_length=250
gene_attribute='Parent', min_length=250, ks_range=(0.05, 5)
):
"""
Co-linearity analysis with I-ADHoRe 3.0. For usage in the ``wgd`` CLI.
Expand Down Expand Up @@ -849,8 +857,9 @@ def syn_(

logging.info("Generating Ks colored (median Ks) dotplot")
syntenic_dotplot_ks_colored(
multiplicons, anchor_points, anchors,
output_file=dotplot_out, min_length=min_length
multiplicons, anchor_points, anchors, min_ks=ks_range[0],
max_ks=ks_range[1], output_file=dotplot_out,
min_length=min_length
)

logging.info("Generating histogram")
Expand Down

0 comments on commit 3fec17a

Please sign in to comment.