Merge branch 'master' into working

zheminzhou · Jan 8, 2020 · 701e912 · 701e912
2 parents 3839895 + 10ffa99
commit 701e912
Show file tree

Hide file tree

Showing 12 changed files with 66 additions and 27 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/KEGGDecoder/.DS_Store b/KEGGDecoder/.DS_Store
diff --git a/KEGGDecoder/.gitignore b/KEGGDecoder/.gitignore
@@ -0,0 +1 @@
+.idea/*
diff --git a/KEGGDecoder/KEGG_clustering.py b/KEGGDecoder/KEGG_clustering.py
@@ -33,13 +33,13 @@ def hClust_correlation(genome_df):
 
 def hClust_most_least(genome_df):
 	sort_dex = genome_df.sum(axis=1).sort_values(ascending=True).index
-	genome_df = genome_df.ix[sort_dex]
+	genome_df = genome_df.loc[sort_dex]
 
 	return genome_df
 
 def hClust_least_most(genome_df):
 	sort_dex = genome_df.sum(axis=1).sort_values(ascending=False).index
-	genome_df = genome_df.ix[sort_dex]
+	genome_df = genome_df.loc[sort_dex]
 
 	return genome_df
 

diff --git a/KEGGDecoder/KEGG_decoder.py b/KEGGDecoder/KEGG_decoder.py
@@ -5,6 +5,14 @@
 V.1.0.6
 Add the biosynthesis of the 20 amino acids - represented as the last
 step in the pathway
+KEGG-decoder.py V.1.0.8.2
+V.1.0.8
+Several recent updates have improved all three visualization outputs, as
+described further in the ReadMe. Additionally, this version corrects how the
+completeness of ubiquinol-cytochrome c reductase is determined. Previously,
+the script only checked for the presence of K00411 and K00410. K00410 is a
+fusion of K00412 and K00413 that is only present in a subset of
+Proteobacteria. Identified by Grayson Chadwick
 V.1.0.5
 Added tanglegram correction for minimizing euclidean distance
 V.1.0.4
@@ -387,7 +395,7 @@ def c_degradation(ko_match):
 	if ('K01183' in ko_match):
 		out_data['chitinase'] += 1
 	if ('K13381' in ko_match):
-		out_data['bifunctional chitinase/lyase'] += 1
+		out_data['bifunctional chitinase/lysozyme'] += 1
 	if ('K20547' in ko_match):
 		out_data['basic endochitinase B'] += 1
 	if ('K03478' in ko_match or 'K18454' in ko_match):
@@ -821,6 +829,8 @@ def oxidative_phoshorylation(ko_match):
 	for i in nuo_ko:
 		if i in ko_match:
 			out_data['NADH-quinone oxidoreductase'] += 0.07
+	value = out_data['NADH-quinone oxidoreductase']
+	out_data['NADH-quinone oxidoreductase'] = float("%.2f" % (value))
 #ndcABCDEFGHIJKLMN
 	ndc_ko = ['K05574', 'K05582', 'K05581', 'K05579',
 	'K05572', 'K05580', 'K05578', 'K05576',
@@ -855,12 +865,15 @@ def oxidative_phoshorylation(ko_match):
 		if i in ko_match:
 			out_data['Cytochrome aa3-600 menaquinol oxidase'] += 0.25
 #petA,fbcH; ubiquinol-cytochrome c reductase 
-	ubiquinol_ko = ['K00411', 'K00410']
-	for i in ubiquinol_ko:
-		if i in ko_match:
-			out_data['Ubiquinol-cytochrome c reductase'] += 0.5
-	value = out_data['NADH-quinone oxidoreductase']
-	out_data['NADH-quinone oxidoreductase'] = float("%.2f" % (value))
+#petA,petB,petC; ubiquinol-cytochrome c reductase
+	if ('K00411' in ko_match) and ('K00410' in ko_match):
+		out_data['Ubiquinol-cytochrome c reductase'] = 1
+	else:
+		ubiquinol_ko = ['K00411', 'K00412', 'K00413']
+		for i in ubiquinol_ko:
+			if i in ko_match:
+				out_data['Ubiquinol-cytochrome c reductase'] += 0.33
+
 #nqrABCDEF; Na+-transporting NADH:ubiquinone oxidoreductase
 	na_ubiquinone_ko = ['K00346', 'K00347', 'K00348', 'K00349',
 	'K00350', 'K00351']
@@ -869,7 +882,7 @@ def oxidative_phoshorylation(ko_match):
 			out_data['Na-NADH-ubiquinone oxidoreductase'] += 0.167
 	value = out_data['Na-NADH-ubiquinone oxidoreductase']
 	out_data['Na-NADH-ubiquinone oxidoreductase'] = float("%.2f" % (value))
-
+	
 	return out_data
 
 def photosynthesis(ko_match):
@@ -1348,7 +1361,7 @@ def default_viz(genome_df, outfile_name):
 	#xLen = len(genome_df.columns.values.tolist())*20
 	#yLen = len(genome_df.index.tolist())*20
 	fig.set_size_inches(100, 100)
-	fig.savefig(outfile_name)
+	fig.savefig(outfile_name, bbox_inches='tight', pad_inches=0.1)
 
 def main():
 	import os
@@ -1370,6 +1383,7 @@ def main():
 						map figure")
 	parser.add_argument('-v', '--vizoption', help="Options: static, interactive, tanglegram")
 	parser.add_argument('--newick', help="Required input for tanglegram visualization")
+	parser.add_argument("-m", "--myorder", help ="Orders output as specified by	user.", default="None")
 	args = parser.parse_args()
 	arg_dict = vars(args)
 
@@ -1523,10 +1537,18 @@ def main():
 
 	file_in = open(filehandle, "r")
 	genome = pd.read_csv(file_in, index_col=0, sep='\t')
+	rearrange = False
+	if arg_dict["myorder"] != 'None' and os.path.exists(arg_dict["myorder"]):
+		rearrange = True
+		leaf_order = []
+		for line in open(str(arg_dict["myorder"]), "r"):
+			line = line.rstrip("\r\n")
+			leaf_order.append(line)
+		genome = genome.reindex(leaf_order)
 
 	if arg_dict['vizoption'] == 'static':
 		from .KEGG_clustering import hClust_euclidean
-		if len(genome.index) >= 2:
+		if len(genome.index) >= 2 and not rearrange:
 			genome = hClust_euclidean(genome)
 		default_viz(genome, os.path.splitext(filehandle)[0] + ".svg")
 	if arg_dict['vizoption'] == 'interactive':

diff --git a/KEGGDecoder/Plotly_viz.py b/KEGGDecoder/Plotly_viz.py
@@ -28,7 +28,7 @@ def plotly_viz(genome_df, output_file):
 	len_genomes = len(genome_df.index.tolist())
 	if len_genomes >= 200:
 		yLen = len_genomes * 40
-		menL = 1.0
+		menL = 1.05
 	elif len_genomes >= 100:
 		yLen = len_genomes * 30
 		menL = 1.2
@@ -125,7 +125,7 @@ def plotly_viz(genome_df, output_file):
 
 
 	fig = go.Figure(data=data, layout=layout)
-	py.plot(fig, filename=output_file)
+	py.plot(fig, filename=output_file, auto_open=False)
 	# py.iplot(data, filename='pandas.heatmap')
 
 

diff --git a/KEGGDecoder/README.md b/KEGGDecoder/README.md
@@ -7,8 +7,6 @@ Designed to parse through a KEGG-Koala outputs (including blastKOALA, ghostKOALA
 
 * if you are interested in certain pathway and the genes are listed in KEGG it is possible to add it to file (with some Python scripting)
 
-[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/rotheconrad/KEGGDecoder-binder/master)
-
 ### Please Cite ###
 If you find that using KEGG Decoder to process your data has been useful, please cite this manuscript. If you are using KEGG Decoder to make figures then definitely cite this manuscript!
 
@@ -26,10 +24,19 @@ If you find that using KEGG Decoder to process your data has been useful, please
 * [tanglegram](https://github.com/schlegelp/tanglegram)
 
 ## Installation ##
+<strong>We recommend installing KEGG-Decoder in its own virtual environment (conda or python). The current pip install will set the various dependencies (matplotlib, seaborn, pandas, etc.) to versions that are known to work with this version of the script. This will likely revert several dependencies on your system to older versions. </strong>
+
+These version constraints are partly intended to avoid a bug in matplotlib=3.0.4 that would cut off the top and bottom lines of the `static` image output.
+
 ```
 python3 -m pip install KEGGDecoder
 ```
 
+## Upgrade ##
+```
+pip install --upgrade KEGGDecoder
+```
+
 ## Procedure ##
 * Start with protein FASTA file (INPUT_PROTEIN.fasta). This file can be multiple genomes combined. Be sure your submitted FASTA file has headers that group genomes together, KEGG-decoder.py groups based on the name provided in FASTA header before the first underscore (_) 
 ```
@@ -58,7 +65,7 @@ KEGG-decoder --input (-i) <KOALA_OUTPUT.txt> --output (-o) <FUNCTION_OUT.list> -
 
 * 'static' figure output is an SVG file function_heatmap.svg. Each distinct identifier before the underscore in the FASTA file will have a row
 
-* 'interactive' figure output is an HTML file function_heatmap.html. Each distinct identifier before the underscore in the FASTA file will have a row, but can be loaded into a browser and value will be displayed by hovering over a cell with the mouse. Draw a box to zoom in on specific regions. Designed to allow easier parsing of larger sets of genomes. Currently, not available for smaller sets of genomes.
+* 'interactive' figure output is an HTML file function_heatmap.html. Each distinct identifier before the underscore in the FASTA file will have a row, but can be loaded into a browser and value will be displayed by hovering over a cell with the mouse. Draw a box to zoom in on specific regions. Designed to allow easier parsing of larger sets of genomes.
 
 * 'tanglegram' -- For a little more advanced analysis, KEGGDecoder can generate a tanglegram to compare the order of two trees, one generated by the clustered KEGG metabolic outputs and a Newick format (presumably phylogenetic) tree provided by the user. At least 3 input genomes are required, but more is recommended. Genome names must match. 
 
@@ -77,17 +84,18 @@ HMM models are predominantly from the PFam database, but when necessary are pull
 
 ### Additional Information ###
 * Details as to which HMM models and genes are in each described pathway or process can be found in the supporting document, Pfam_definitions.txt
+* In version 0.7, KEGG-Expander targets several transporter subunits to link with metal transporter columns in KEGG-Decoder. Removed the peptidase entries due to ineffective interpretation.
 * In version 0.6, KEGG-Expander targets: phototrophy via proteorhodopsin, (some) peptidases, alternative nitrogenases, ammonia transport, DMSP lyase, and DMSP synthase, and ferrioxamine biosynthesis
 * Unfortunately, accuracy depends on the model used, using a bit score cutoff of 75 (approximately an E-value <10E-20) does not always capture the best matches. For example the rhodopsin model does not distinguish between proteorhodopsin and other light driven rhodopsins (we use a tree to determine the proteorhodopsins). Or several of the DMSP lyases at low bit scores will match metalloproteases; in this instance the script has been modified to look for a more stringent bit score (>500). Or the TIGRfam models for the Fe-only and Vanadium nitrogenases generally match the same protein. 
 
 ## Procedure ##
 * Using a protein FASTA file with the same gene name set-up as described above - GENOMEID_Number - run a search against the custom HMM database
 ```
-hmmsearch --tblout <NAME>_expanderv0.3.tbl -T 75 /path/to/BioData/KEGGDecoder/HMM_Models/expander_dbv0.3.hmm <INPUT_PROTEIN.fasta>
+hmmsearch --tblout <NAME>_expanderv0.7.tbl -T 75 /path/to/BioData/KEGGDecoder/HMM_Models/expander_dbv0.7.hmm <INPUT_PROTEIN.fasta>
 ```
 * The HMM results table is used to construct the heatmap by running KEGG-expander.py
 ```
-python KEGG-expander.py <NAME>_expanderv0.3.tbl <HMM_OUT.list>
+python KEGG-expander.py <NAME>_expanderv0.7.tbl <HMM_OUT.list>
 ```
 * The OUTPUT LIST generates a text version of the heat map. The first row contains pathway/process names, subsequent rows contain submitted groups/genomes and fractional percentage of pathway/process
 
@@ -115,6 +123,12 @@ valine & isoleucine
 phenylalanine & tyrosine
 aspartate & glutamate
 
+## V1.0.6-1.0.8 ##
+Updates made as part of the Speeding Up Science Part 2 hackathon. Updates were made by Chris Neely, Jason Fell, and Marisa Lim.
+Changes include reduction of white space in the `static` output, removal of a minimum requirement for the `interactive` output, and increased functioning of `tanglegram` output. Specifically, `tanglegram` now uses complete-linkage Euclidean distance to determine the clusters on the KEGG-Decoder tree. This provides the best resolution for visualizing possible groups with similar functional capacity.
+In V1.0.8.2, the check for the completeness of ubiquinol-cytochrome c reductase was corrected. Previously, the script only checked for the presence of K00411 and K00410. K00410 is a fusion of K00412 and K00413 that is only present in a subset of Proteobacteria. Identified by Grayson Chadwick.
+In V1.0.8.1, a mismatch in the terms used to identify `bifunctional chitinase/lysozyme` resulted in a value of `0` regardless of whether K13381 was present. This has been corrected. Identified by Chris Neely.
+
 ## V1.0.5 ##
 Various upgrades to the tanglegram visualization and enhanced naming efficiency.
 

diff --git a/KEGGDecoder/__init__.py b/KEGGDecoder/__init__.py
@@ -3,4 +3,6 @@
 the completeness of various KEGG pathways
 """
 
+
 __version__ = "1.0.6.1"
+
diff --git a/KEGGDecoder/images/interactive.png b/KEGGDecoder/images/interactive.png
diff --git a/KEGGDecoder/images/static.png b/KEGGDecoder/images/static.png
diff --git a/KEGGDecoder/images/tanglegram.png b/KEGGDecoder/images/tanglegram.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,14 +9,14 @@ author-email = "[email protected]"
 home-page = "https://github.com/bjtully/BioData/KEGGDecoder"
 classifiers = ["License :: OSI Approved :: MIT License"]
 requires = [
-    "matplotlib",
-    "seaborn",
-    "pandas",
-    "numpy",
-    "plotly",
-    "tanglegram",
-    "biopython",
-    "scipy"
+    "matplotlib==3.0.3",
+    "seaborn==0.9.0",
+    "pandas==0.25.2",
+    "numpy==1.17.3",
+    "plotly==4.2.1",
+    "tanglegram==0.1.0",
+    "biopython==1.74",
+    "scipy==1.3.1",
 ]
 
 [tool.flit.scripts]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,4 +3,6 @@
		the completeness of various KEGG pathways
		"""


		__version__ = "1.0.6.1"