From 706d1d8dd56be10033e84575945b16f7180727e2 Mon Sep 17 00:00:00 2001
From: George Rosenberger <gr2578@cumc.columbia.edu>
Date: Sat, 18 Jan 2020 17:29:18 -0500
Subject: [PATCH] [FEATURE] 1.0.5

---
 README.md      | 34 ++++++++++++++++++++++------------
 secat/learn.py |  5 +++--
 secat/main.py  |  5 +++--
 setup.py       |  2 +-
 4 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 358e4d2..584bf64 100644
--- a/README.md
+++ b/README.md
@@ -63,18 +63,28 @@ The data set includes the expected output as SQLite-files. Note: Since the ``PyP
 
 **1. Data preprocessing**
 
+The primary inputs for SECAT are quantitative, proteotypic/unique peptide-level profiles, e.g. acquired by SEC-SWATH-MS. The input can be supplied either as a matrix (protein, peptide and run-wise peptide intensity columns) or as a transposed long list. Protein identifiers need to be provided in UniProtKB/Swiss-Prot format. The column names can be freely specified (``secat preprocess --columns``; see help for a complete description).
+
+The second required input file represents the experimental design and molecular weight calibration of the experiment. The primary column covers the run identifiers (matching the quantitative profiles above), with additional columns for SEC fraction identifiers (integer value), SEC molecular weight (float value), a group condition identifier (free-text value) and a replicate identifier (free-text value). The column names can be freely specified (``secat preprocess --columns``; see help for a complete description).
+
+The third required file covers UniProtKB/Swiss-Prot metadata in XML format, matching the proteome, and can be obtained from [UniProt](https://www.uniprot.org/downloads).
+
+Optionally, reference PPI networks can be specified to support semi-supervised learning and to restrict the peptide query space. SECAT can accept three files: a positive reference network and a negative reference network for the learning steps, and a separate reference network to restrict the query space. SECAT natively supports HUPO-PSI MITAB (2.5-2.7), STRING-DB, BioPlex and PrePPI formats and provides filtering options to exclude lower-confidence PPIs. The inverted CORUM reference PPI network was generated by using the inverted set of PPIs (i.e. all possible PPIs that are not covered by CORUM) and removing all PPIs in this set covered by STRING, IID, PrePPI or BioPlex.
+
+The Zenodo archives linked above contain example files and parameter sets for all described analyses and can be used to test the algorithm and reproduce the results.
+
 First, the input quantitative proteomics matrix and parameters are preprocessed to a single file:
 
 ````
 secat preprocess
---out=hela_string_negative.secat \ # Output filename
+--out=hela_string.secat \ # Output filename
 --sec=input/hela_sec_mw.csv \ # SEC annotation file
---net=../common/9606.protein.links.v11.0.txt.gz \ # Reference PPI network
---posnet=../common/corum_targets.txt.gz \ # Reference positive interaction network for learning
---negnet=../common/corum_decoys.txt.gz \ # Reference negative interaction network for learning
---uniprot=../common/uniprot_9606_20190402.xml.gz \ # Uniprot reference XML file
+--net=common/9606.protein.links.v11.0.txt.gz \ # Reference PPI network
+--posnet=common/corum_targets.txt.gz \ # Reference positive interaction network for learning
+--negnet=common/corum_decoys.txt.gz \ # Reference negative interaction network for learning
+--uniprot=common/uniprot_9606_20190402.xml.gz \ # Uniprot reference XML file
 --min_interaction_confidence=0 # Minimum interaction confidence
-input/hela_normsw.tsv \ # Input data files
+input/pep*.tsv \ # Input data files
 ````
 
 **2. Signal processing**
@@ -82,7 +92,7 @@ input/hela_normsw.tsv \ # Input data files
 Next, the signal processing is conducted in a parallelized fashion:
 
 ````
-secat score --in=hela_string_negative.secat --threads=8
+secat score --in=hela_string.secat --threads=8
 ````
 
 **3. PPI detection**
@@ -90,7 +100,7 @@ secat score --in=hela_string_negative.secat --threads=8
 The statistical confidence of the PPI is evaluated by machine learning:
 
 ````
-secat learn --in=hela_string_negative.secat --threads=5
+secat learn --in=hela_string.secat --threads=5
 ````
 
 **4. PPI quantification**
@@ -98,7 +108,7 @@ secat learn --in=hela_string_negative.secat --threads=5
 Quantitative features are generated for all PPIs and proteins:
 
 ````
-secat quantify --in=hela_string_negative.secat --control_condition=inter
+secat quantify --in=hela_string.secat --control_condition=inter
 ````
 
 **5. Export of results**
@@ -106,7 +116,7 @@ secat quantify --in=hela_string_negative.secat --control_condition=inter
 CSV tables can be exported for import in downstream tools, e.g. Cytoscape:
 
 ````
-secat export --in=hela_string_negative.secat
+secat export --in=hela_string.secat
 ````
 
 **6. Plotting of chromatograms**
@@ -114,7 +124,7 @@ secat export --in=hela_string_negative.secat
 PDF reports can be generated for the top (or selected) results:
 
 ````
-secat plot --in=hela_string_negative.secat
+secat plot --in=hela_string.secat
 ````
 
 **7. Report of statistics**
@@ -122,7 +132,7 @@ secat plot --in=hela_string_negative.secat
 Statistics reports can be generated for the top (or selected) results:
 
 ````
-secat statistics --in=hela_string_negative.secat
+secat statistics --in=hela_string.secat
 ````
 
 **Further options and default parameters**
diff --git a/secat/learn.py b/secat/learn.py
index ad56e2e..56c2f80 100644
--- a/secat/learn.py
+++ b/secat/learn.py
@@ -25,13 +25,14 @@
 from hyperopt import hp
 
 class pyprophet:
-    def __init__(self, outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test):
+    def __init__(self, outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, xgb_autotune, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test):
 
         self.outfile = outfile
         self.apply_model = apply_model
         self.classifier = 'XGBoost'
+        self.xgb_autotune = xgb_autotune
 
-        self.xgb_hyperparams = {'autotune': False, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33}
+        self.xgb_hyperparams = {'autotune': self.xgb_autotune, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33}
 
         self.xgb_params = {'eta': 1.0, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'silent': 1, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
 
diff --git a/secat/main.py b/secat/main.py
index 613776a..0917531 100644
--- a/secat/main.py
+++ b/secat/main.py
@@ -211,6 +211,7 @@ def score(infile, outfile, monomer_threshold_factor, minimum_peptides, maximum_p
 @click.option('--ss_initial_fdr', default=0.1, show_default=True, type=float, help='Initial FDR cutoff for best scoring targets.')
 @click.option('--ss_iteration_fdr', default=0.05, show_default=True, type=float, help='Iteration FDR cutoff for best scoring targets.')
 @click.option('--ss_num_iter', default=10, show_default=True, type=int, help='Number of iterations for semi-supervised learning step.')
+@click.option('--xgb_autotune/--no-xgb_autotune', default=False, show_default=True, help='Autotune hyperparameters after semi-supervised learning.')
 # Statistics
 @click.option('--parametric/--no-parametric', default=False, show_default=True, help='Do parametric estimation of p-values.')
 @click.option('--pfdr/--no-pfdr', default=False, show_default=True, help='Compute positive false discovery rate (pFDR) instead of FDR.')
@@ -226,7 +227,7 @@ def score(infile, outfile, monomer_threshold_factor, minimum_peptides, maximum_p
 @click.option('--plot_reports/--no-plot_reports', default=False, show_default=True, help='Plot reports for all confidence bins.')
 @click.option('--threads', default=1, show_default=True, type=int, help='Number of threads used for parallel processing. -1 means all available CPUs.', callback=transform_threads)
 @click.option('--test/--no-test', default=False, show_default=True, help='Run in test mode with fixed seed to ensure reproducibility.')
-def learn(infile, outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test):
+def learn(infile, outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, xgb_autotune, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test):
     """
     Learn true/false interaction features in SEC data.
     """
@@ -247,7 +248,7 @@ def learn(infile, outfile, apply_model, minimum_abundance_ratio, maximum_sec_shi
     c.execute('DROP TABLE IF EXISTS FEATURE_SCORED;')
     con.close()
 
-    pyprophet(outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test)
+    pyprophet(outfile, apply_model, minimum_abundance_ratio, maximum_sec_shift, cb_decoys, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, xgb_autotune, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, plot_reports, threads, test)
 
     # Combine all replicates
     click.echo("Info: Combine evidence across replicate runs.")
diff --git a/setup.py b/setup.py
index 52a8011..493a722 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='secat',
-    version='1.0.4',
+    version='1.0.5',
     description='Size-Exclusion Chromatography Algorithmic Toolkit',
     long_description=long_description,
     long_description_content_type='text/markdown',