Skip to content

Commit

Permalink
Merge branch 'master' into working
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben Tully committed Jan 8, 2020
2 parents 3839895 + 10ffa99 commit 701e912
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 27 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified KEGGDecoder/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions KEGGDecoder/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea/*
4 changes: 2 additions & 2 deletions KEGGDecoder/KEGG_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def hClust_correlation(genome_df):

def hClust_most_least(genome_df):
sort_dex = genome_df.sum(axis=1).sort_values(ascending=True).index
genome_df = genome_df.ix[sort_dex]
genome_df = genome_df.loc[sort_dex]

return genome_df

def hClust_least_most(genome_df):
sort_dex = genome_df.sum(axis=1).sort_values(ascending=False).index
genome_df = genome_df.ix[sort_dex]
genome_df = genome_df.loc[sort_dex]

return genome_df

Expand Down
42 changes: 32 additions & 10 deletions KEGGDecoder/KEGG_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
V.1.0.6
Add the biosynthesis of the 20 amino acids - represented as the last
step in the pathway
KEGG-decoder.py V.1.0.8.2
V.1.0.8
Several recent updates have improved all three visualization outputs; these
are described further in the ReadMe. Additionally, corrected how the
completeness of ubiquinol-cytochrome c reductase is determined. Previously,
the script only checked for the presence of K00411 and K00410. K00410 is a
fusion of K00412 and K00413 that is only present in a subset of
Proteobacteria. Identified by Grayson Chadwick
V.1.0.5
Added tanglegram correction for minimizing euclidean distance
V.1.0.4
Expand Down Expand Up @@ -387,7 +395,7 @@ def c_degradation(ko_match):
if ('K01183' in ko_match):
out_data['chitinase'] += 1
if ('K13381' in ko_match):
out_data['bifunctional chitinase/lyase'] += 1
out_data['bifunctional chitinase/lysozyme'] += 1
if ('K20547' in ko_match):
out_data['basic endochitinase B'] += 1
if ('K03478' in ko_match or 'K18454' in ko_match):
Expand Down Expand Up @@ -821,6 +829,8 @@ def oxidative_phoshorylation(ko_match):
for i in nuo_ko:
if i in ko_match:
out_data['NADH-quinone oxidoreductase'] += 0.07
value = out_data['NADH-quinone oxidoreductase']
out_data['NADH-quinone oxidoreductase'] = float("%.2f" % (value))
#ndcABCDEFGHIJKLMN
ndc_ko = ['K05574', 'K05582', 'K05581', 'K05579',
'K05572', 'K05580', 'K05578', 'K05576',
Expand Down Expand Up @@ -855,12 +865,15 @@ def oxidative_phoshorylation(ko_match):
if i in ko_match:
out_data['Cytochrome aa3-600 menaquinol oxidase'] += 0.25
#petA,fbcH; ubiquinol-cytochrome c reductase
ubiquinol_ko = ['K00411', 'K00410']
for i in ubiquinol_ko:
if i in ko_match:
out_data['Ubiquinol-cytochrome c reductase'] += 0.5
value = out_data['NADH-quinone oxidoreductase']
out_data['NADH-quinone oxidoreductase'] = float("%.2f" % (value))
#petA,petB,petC; ubiquinol-cytochrome c reductase
if ('K00411' in ko_match) and ('K00410' in ko_match):
out_data['Ubiquinol-cytochrome c reductase'] = 1
else:
ubiquinol_ko = ['K00411', 'K00412', 'K00413']
for i in ubiquinol_ko:
if i in ko_match:
out_data['Ubiquinol-cytochrome c reductase'] += 0.33

#nqrABCDEF; Na+-transporting NADH:ubiquinone oxidoreductase
na_ubiquinone_ko = ['K00346', 'K00347', 'K00348', 'K00349',
'K00350', 'K00351']
Expand All @@ -869,7 +882,7 @@ def oxidative_phoshorylation(ko_match):
out_data['Na-NADH-ubiquinone oxidoreductase'] += 0.167
value = out_data['Na-NADH-ubiquinone oxidoreductase']
out_data['Na-NADH-ubiquinone oxidoreductase'] = float("%.2f" % (value))

return out_data

def photosynthesis(ko_match):
Expand Down Expand Up @@ -1348,7 +1361,7 @@ def default_viz(genome_df, outfile_name):
#xLen = len(genome_df.columns.values.tolist())*20
#yLen = len(genome_df.index.tolist())*20
fig.set_size_inches(100, 100)
fig.savefig(outfile_name)
fig.savefig(outfile_name, bbox_inches='tight', pad_inches=0.1)

def main():
import os
Expand All @@ -1370,6 +1383,7 @@ def main():
map figure")
parser.add_argument('-v', '--vizoption', help="Options: static, interactive, tanglegram")
parser.add_argument('--newick', help="Required input for tanglegram visualization")
parser.add_argument("-m", "--myorder", help ="Orders output as specified by user.", default="None")
args = parser.parse_args()
arg_dict = vars(args)

Expand Down Expand Up @@ -1523,10 +1537,18 @@ def main():

file_in = open(filehandle, "r")
genome = pd.read_csv(file_in, index_col=0, sep='\t')
rearrange = False
if arg_dict["myorder"] != 'None' and os.path.exists(arg_dict["myorder"]):
rearrange = True
leaf_order = []
for line in open(str(arg_dict["myorder"]), "r"):
line = line.rstrip("\r\n")
leaf_order.append(line)
genome = genome.reindex(leaf_order)

if arg_dict['vizoption'] == 'static':
from .KEGG_clustering import hClust_euclidean
if len(genome.index) >= 2:
if len(genome.index) >= 2 and not rearrange:
genome = hClust_euclidean(genome)
default_viz(genome, os.path.splitext(filehandle)[0] + ".svg")
if arg_dict['vizoption'] == 'interactive':
Expand Down
4 changes: 2 additions & 2 deletions KEGGDecoder/Plotly_viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def plotly_viz(genome_df, output_file):
len_genomes = len(genome_df.index.tolist())
if len_genomes >= 200:
yLen = len_genomes * 40
menL = 1.0
menL = 1.05
elif len_genomes >= 100:
yLen = len_genomes * 30
menL = 1.2
Expand Down Expand Up @@ -125,7 +125,7 @@ def plotly_viz(genome_df, output_file):


fig = go.Figure(data=data, layout=layout)
py.plot(fig, filename=output_file)
py.plot(fig, filename=output_file, auto_open=False)
# py.iplot(data, filename='pandas.heatmap')


Expand Down
24 changes: 19 additions & 5 deletions KEGGDecoder/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ Designed to parse through a KEGG-Koala outputs (including blastKOALA, ghostKOALA

* if you are interested in certain pathway and the genes are listed in KEGG it is possible to add it to file (with some Python scripting)

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/rotheconrad/KEGGDecoder-binder/master)

### Please Cite ###
If you find that using KEGG Decoder to process your data has been useful, please cite this manuscript. If you are using KEGG Decoder to make figures then definitely cite this manuscript!

Expand All @@ -26,10 +24,19 @@ If you find that using KEGG Decoder to process your data has been useful, please
* [tanglegram](https://github.com/schlegelp/tanglegram)

## Installation ##
<strong>We recommend installing KEGG-Decoder in its own virtual environment (conda or python). The current pip install will set the various dependencies (matplotlib, seaborn, pandas, etc.) to versions that actively work with this version of the script. This will likely revert several dependencies on your system to older versions. </strong>

This is partly to avoid a bug in matplotlib=3.0.4 that would cut off the top and bottom lines of the `static` image output.

```
python3 -m pip install KEGGDecoder
```

## Upgrade ##
```
pip install --upgrade KEGGDecoder
```

## Procedure ##
* Start with a protein FASTA file (INPUT_PROTEIN.fasta). This file can contain multiple genomes combined. Be sure your submitted FASTA file has headers that group genomes together; KEGG-decoder.py groups sequences based on the name provided in the FASTA header before the first underscore (_)
```
Expand Down Expand Up @@ -58,7 +65,7 @@ KEGG-decoder --input (-i) <KOALA_OUTPUT.txt> --output (-o) <FUNCTION_OUT.list> -

* 'static' figure output is an SVG file function_heatmap.svg. Each distinct identifier before the underscore in the FASTA file will have a row

* 'interactive' figure output is an HTML file function_heatmap.html. Each distinct identifier before the underscore in the FASTA file will have a row, but can be loaded into a browser and value will be displayed by hovering over a cell with the mouse. Draw a box to zoom in on specific regions. Designed to allow easier parsing of larger sets of genomes. Currently, not available for smaller sets of genomes.
* 'interactive' figure output is an HTML file function_heatmap.html. Each distinct identifier before the underscore in the FASTA file will have a row. The file can be loaded into a browser, and the value of a cell will be displayed by hovering over it with the mouse. Draw a box to zoom in on specific regions. Designed to allow easier parsing of larger sets of genomes.

* 'tanglegram' -- For a little more advanced analysis, KEGGDecoder can generate a tanglegram to compare the order of two trees, one generated by the clustered KEGG metabolic outputs and a Newick format (presumably phylogenetic) tree provided by the user. At least 3 input genomes are required, but more is recommended. Genome names must match.

Expand All @@ -77,17 +84,18 @@ HMM models are predominantly from the PFam database, but when necessary are pull

### Additional Information ###
* Details as to which HMM models and genes are in each described pathway or process can be found in the supporting document, Pfam_definitions.txt
* In version 0.7, KEGG-Expander targets several transporter subunits to link with metal transporter columns in KEGG-Decoder. Removed the peptidase entries due to ineffective interpretation.
* In version 0.6, KEGG-Expander targets: phototrophy via proteorhodopsin, (some) peptidases, alternative nitrogenases, ammonia transport, DMSP lyase, and DMSP synthase, and ferrioxamine biosynthesis
* Unfortunately, accuracy depends on the model used; a bit score cutoff of 75 (approximately an E-value <10E-20) does not always capture the best matches. For example, the rhodopsin model does not distinguish between proteorhodopsin and other light-driven rhodopsins (we use a tree to determine the proteorhodopsins). Similarly, several of the DMSP lyases will match metalloproteases at low bit scores; in this instance the script has been modified to look for a more stringent bit score (>500). Finally, the TIGRfam models for the Fe-only and Vanadium nitrogenases generally match the same protein.

## Procedure ##
* Using a protein FASTA file with the same gene name set-up as described above - GENOMEID_Number - run a search against the custom HMM database
```
hmmsearch --tblout <NAME>_expanderv0.3.tbl -T 75 /path/to/BioData/KEGGDecoder/HMM_Models/expander_dbv0.3.hmm <INPUT_PROTEIN.fasta>
hmmsearch --tblout <NAME>_expanderv0.7.tbl -T 75 /path/to/BioData/KEGGDecoder/HMM_Models/expander_dbv0.7.hmm <INPUT_PROTEIN.fasta>
```
* The HMM results table is used to construct the heatmap by running KEGG-expander.py
```
python KEGG-expander.py <NAME>_expanderv0.3.tbl <HMM_OUT.list>
python KEGG-expander.py <NAME>_expanderv0.7.tbl <HMM_OUT.list>
```
* The OUTPUT LIST generates a text version of the heat map. The first row contains pathway/process names, subsequent rows contain submitted groups/genomes and fractional percentage of pathway/process

Expand Down Expand Up @@ -115,6 +123,12 @@ valine & isoleucine
phenylalanine & tyrosine
aspartate & glutamate

## V1.0.6-1.0.8 ##
Updates made as part of the Speeding Up Science Part 2 hackathon. Updates were made by Chris Neely, Jason Fell, and Marisa Lim.
Changes include reduction of white space in the `static` output, removal of a minimum requirement for the `interactive` output, and increased functioning of `tanglegram` output. Specifically, `tanglegram` now uses complete-linkage Euclidean distance to determine the clusters on the KEGG-Decoder tree. This provides the best resolution for visualizing possible groups with similar functional capacity.
In V1.0.8.2, corrected how the completeness of ubiquinol-cytochrome c reductase is determined. Previously, the script only checked for the presence of K00411 and K00410. K00410 is a fusion of K00412 and K00413 that is only present in a subset of Proteobacteria. Identified by Grayson Chadwick.
In V1.0.8.1, a mismatch in the terms used to identify `bifunctional chitinase/lysozyme` would result in a `0` no matter whether K13381 was present. This has been corrected. Identified by Chris Neely.

## V1.0.5 ##
Various upgrades to the tanglegram visualization and enhanced naming efficiency.

Expand Down
2 changes: 2 additions & 0 deletions KEGGDecoder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@
the completeness of various KEGG pathways
"""


__version__ = "1.0.6.1"

Binary file added KEGGDecoder/images/interactive.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added KEGGDecoder/images/static.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added KEGGDecoder/images/tanglegram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 8 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ author-email = "[email protected]"
home-page = "https://github.com/bjtully/BioData/KEGGDecoder"
classifiers = ["License :: OSI Approved :: MIT License"]
requires = [
"matplotlib",
"seaborn",
"pandas",
"numpy",
"plotly",
"tanglegram",
"biopython",
"scipy"
"matplotlib==3.0.3",
"seaborn==0.9.0",
"pandas==0.25.2",
"numpy==1.17.3",
"plotly==4.2.1",
"tanglegram==0.1.0",
"biopython==1.74",
"scipy==1.3.1",
]

[tool.flit.scripts]
Expand Down

0 comments on commit 701e912

Please sign in to comment.