Added mokapot.to_txt(). Now time for tests.

danielgeiszler · Mar 17, 2021 · c3626d4 · c3626d4
1 parent fd3d97b
commit c3626d4
Show file tree

Hide file tree

Showing 4 changed files with 153 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,10 +4,24 @@
 ### Added
 - Support for downstream quantitation with
   [FlashLFQ](https://github.com/smith-chem-wisc/FlashLFQ). This is accomplished
-  through the `to_flashlfq()` method of `LinearConfidence` objects. Note that
-  to support the FlashLFQ format, you'll need to specify additional columns in
-  `read_pin()` or use a PepXML input file (`read_pepxml()`).
-- Tests accompanying the support for the above format.
+  through the `mokapot.to_flashlfq()` function or the `to_flashlfq()` method of
+  `LinearConfidence` objects. Note that to support the FlashLFQ format, you'll
+  need to specify additional columns in `read_pin()` or use a PepXML input file
+  (`read_pepxml()`). 
+- Added a top-level function for exporting confident PSMs, peptides, and
+  proteins from one or more `LinearConfidence` objects as a tab-delimited file:
+  `mokapot.to_txt()`.
+- Tests accompanying the support for the features above.
+
+### Changed
+- Corresponding with support for new formats, the `mokapot.read_pin()` function
+  and the `LinearPsmDataset` constructor now have many new optional parameters.
+  These specify the columns containing the metadata needed to write the added
+  formats.
+- Starting mokapot should be slightly faster for Python >= 3.8. We were able to
+  eliminate the runtime call to setuptools, because of the recent addition of
+  `importlib.metadata` to the standard library, saving a few hundred
+  milliseconds.
 
 ## [0.6.1] - 2021-03-11  
 ### Fixed  

diff --git a/mokapot/confidence.py b/mokapot/confidence.py
@@ -26,7 +26,7 @@
 from . import qvalues
 from . import utils
 from .picked_protein import picked_protein
-from .writers import to_flashlfq
+from .writers import to_flashlfq, to_txt
 
 LOGGER = logging.getLogger(__name__)
 
@@ -100,29 +100,52 @@ def groups(self):
         """The groups for confidence estimation"""
         return list(self._group_confidence_estimates.keys())
 
-    def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
-        """
-        Save confidence estimates to delimited text files.
+    def to_txt(
+        self,
+        dest_dir=None,
+        file_root=None,
+        sep="\t",
+        decoys=False,
+        combine=False,
+    ):
+        """Save confidence estimates to delimited text files.
 
         Parameters
         ----------
         dest_dir : str or None, optional
             The directory in which to save the files. `None` will use the
             current working directory.
         file_root : str or None, optional
-            An optional prefix for the confidence estimate files. The
-            suffix will always be `mokapot.psms.txt` and
-            `mokapot.peptides.txt`.
+            An optional prefix for the confidence estimate files. The suffix
+            will be "mokapot.{level}.txt", where "{level}" indicates the level
+            at which confidence estimation was performed (i.e. PSMs, peptides,
+            proteins) if :code:`combine=True`. If :code:`combine=False` (the
+            default), additionally the group value is prepended, yielding a
+            suffix "{group}.mokapot.{level}.txt".
         sep : str, optional
             The delimiter to use.
         decoys : bool, optional
             Save decoys confidence estimates as well?
+        combine : bool, optional
+            Should groups be combined into a single file?
 
         Returns
         -------
         list of str
             The paths to the saved files.
+
         """
+        if combine:
+            res = self.group_confidence_estimates.values()
+            ret_files = to_txt(
+                res,
+                dest_dir=dest_dir,
+                file_root=file_root,
+                sep=sep,
+                decoys=decoys,
+            )
+            return ret_files
+
         ret_files = []
         for group, res in self.group_confidence_estimates.items():
             prefix = file_root + f".{group}"
@@ -205,18 +228,18 @@ def levels(self):
         return list(self.confidence_estimates.keys())
 
     def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
-        """
-        Save confidence estimates to delimited text files.
+        """Save confidence estimates to delimited text files.
 
         Parameters
         ----------
         dest_dir : str or None, optional
             The directory in which to save the files. `None` will use the
             current working directory.
         file_root : str or None, optional
-            An optional prefix for the confidence estimate files. The
-            suffix will always be `mokapot.psms.txt` and
-            `mokapot.peptides.txt`.
+            An optional prefix for the confidence estimate files. The suffix
+            will always be "mokapot.{level}.txt", where "{level}" indicates the
+            level at which confidence estimation was performed (i.e. PSMs,
+            peptides, proteins).
         sep : str, optional
             The delimiter to use.
         decoys : bool, optional
@@ -226,32 +249,15 @@ def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
         -------
         list of str
             The paths to the saved files.
+
         """
-        file_base = "mokapot"
-        if file_root is not None:
-            file_base = file_root + "." + file_base
-        if dest_dir is not None:
-            file_base = Path(dest_dir, file_base)
-
-        out_files = []
-        for level, qvals in self.confidence_estimates.items():
-            if qvals is None:
-                continue
-
-            out_file = str(file_base) + f".{level}.txt"
-            qvals.to_csv(out_file, sep=sep, index=False)
-            out_files.append(out_file)
-
-        if decoys:
-            for level, qvals in self.decoy_confidence_estimates.items():
-                if qvals is None:
-                    continue
-
-                out_file = str(file_base) + f".decoys.{level}.txt"
-                qvals.to_csv(out_file, sep=sep, index=False)
-                out_files.append(out_file)
-
-        return out_files
+        return to_txt(
+            self,
+            dest_dir=dest_dir,
+            file_root=file_root,
+            sep=sep,
+            decoys=decoys,
+        )
 
     def _perform_tdc(self, psm_columns):
         """Perform target-decoy competition.

diff --git a/mokapot/writers/__init__.py b/mokapot/writers/__init__.py
@@ -1,2 +1,3 @@
 """Define the public functions for the writers"""
+from .txt import to_txt
 from .flashlfq import to_flashlfq
diff --git a/mokapot/writers/txt.py b/mokapot/writers/txt.py
@@ -0,0 +1,91 @@
+"""Writer to save results in a tab-delimited format"""
+from pathlib import Path
+from collections import defaultdict
+
+import pandas as pd
+
+
+def to_txt(conf, dest_dir=None, file_root=None, sep="\t", decoys=False):
+    """Save confidence estimates to delimited text files.
+
+    The results from every supplied object are grouped by level and
+    concatenated, so a single file is written per level no matter how many
+    objects are provided.
+
+    Parameters
+    ----------
+    conf : Confidence object or iterable of Confidence objects
+        One or more :py:class:`~mokapot.confidence.LinearConfidence` objects.
+    dest_dir : str or None, optional
+        The directory in which to save the files. `None` will use the current
+        working directory.
+    file_root : str or None, optional
+        An optional prefix for the confidence estimate files. The suffix will
+        always be "mokapot.{level}.txt" where "{level}" indicates the level at
+        which confidence estimation was performed (i.e. PSMs, peptides,
+        proteins).
+    sep : str, optional
+        The delimiter to use.
+    decoys : bool, optional
+        Save decoys confidence estimates as well?
+
+    Returns
+    -------
+    list of str
+        The paths to the saved files.
+
+    Raises
+    ------
+    ValueError
+        If `conf` is a string.
+
+    """
+    # Strings are iterable, so they must be rejected before the iterability
+    # probe below. An explicit check (rather than `assert`) still runs when
+    # Python is started with -O.
+    if isinstance(conf, str):
+        raise ValueError("'conf' should be a Confidence object, not a string.")
+
+    # Anything that is not iterable is treated as a single Confidence object.
+    try:
+        iter(conf)
+    except TypeError:
+        conf = [conf]
+
+    file_base = "mokapot"
+    if file_root is not None:
+        file_base = file_root + "." + file_base
+    if dest_dir is not None:
+        file_base = Path(dest_dir, file_base)
+
+    # Collect the dataframes of all objects, keyed by level.
+    results = defaultdict(list)
+    for res in conf:
+        for level, qval_list in _get_level_data(res, decoys).items():
+            results[level] += qval_list
+
+    # Write one concatenated table per level.
+    out_files = []
+    for level, qval_list in results.items():
+        out_file = str(file_base) + f".{level}.txt"
+        pd.concat(qval_list).to_csv(out_file, sep=sep, index=False)
+        out_files.append(out_file)
+
+    return out_files
+
+
+def _get_level_data(conf, decoys):
+    """Return the dataframes for each level.
+
+    Parameters
+    ----------
+    conf : a Confidence object
+        A LinearConfidence object.
+    decoys : bool
+        Should decoys be included?
+
+    Returns
+    -------
+    Dict
+        Maps each level to a list holding its dataframe. Levels whose value
+        is `None` are omitted. Decoy results, when requested, are stored
+        under the key "decoys.{level}".
+    """
+    results = defaultdict(list)
+    for level, qvals in conf.confidence_estimates.items():
+        if qvals is None:
+            continue
+
+        results[level].append(qvals)
+
+    if decoys:
+        for level, qvals in conf.decoy_confidence_estimates.items():
+            if qvals is None:
+                continue
+
+            # The key becomes part of the output file name; "decoys." keeps
+            # the ".decoys.{level}.txt" naming of the previous writer.
+            results[f"decoys.{level}"].append(qvals)
+
+    return results