Commit: Additional tests, docs, critical bug fix in IndexedQuery

lazear committed Jul 21, 2022
1 parent 74d94d7 commit 1c7a65f
Showing 9 changed files with 151 additions and 66 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -6,7 +6,9 @@ I wanted to see how far I could take a proteomics search engine in ~1000 lines of code

I was inspired by the elegant data structure discussed in the [MSFragger paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5409104/), and decided to implement an (open source) version of it in Rust - with great results.

Carina has excellent performance characteristics (>2x faster and >2x less memory usage than MSFragger), but does not sacrifice code quality or size to do so!
Carina has excellent performance characteristics (>2x faster and >2x less memory usage than MSFragger for narrow searches), but does not sacrifice code quality or size to do so!

- MSFragger still blows the pants off this tool on Open Searches (> 50 Da precursor_tol) ... for now!

## Features

@@ -46,7 +48,7 @@ Data repository: [PXD016766](http://proteomecentral.proteomexchange.org/cgi/GetD
Performance results: (Intel i7-12700KF + 32GB RAM)

- ~30 seconds to process 12 files, using less than 4GB of RAM
- Active scanning: ~45,000 scans/s for narrow window (can be tuned to use more ram and go 5x faster!)
- Active scanning: ~35,000 scans/s for narrow window (can be tuned to use more ram and go 5x faster!)


### Search methods
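For orientation before the diffs below: the MSFragger-style structure mentioned in the README is a two-level, bucketed ("paged") fragment index. Here is a minimal sketch of that layout; the field names `fragments`, `min_value`, and `bucket_size` come from src/database.rs in this commit, but this `build` function is an illustrative assumption, not the crate's actual constructor:

```rust
/// Sketch of the two-level fragment index (assumed construction).
pub struct Theoretical {
    pub fragment_mz: f32,
    pub precursor_mz: f32,
}

pub struct IndexedDatabase {
    pub fragments: Vec<Theoretical>,
    pub min_value: Vec<f32>, // lowest fragment m/z in each bucket ("page")
    pub bucket_size: usize,
}

pub fn build(mut fragments: Vec<Theoretical>, bucket_size: usize) -> IndexedDatabase {
    // Level 1: sort globally by fragment m/z so each fixed-size bucket
    // covers a disjoint fragment m/z range, summarized in `min_value`
    fragments.sort_by(|a, b| a.fragment_mz.total_cmp(&b.fragment_mz));
    let min_value: Vec<f32> = fragments
        .chunks(bucket_size)
        .map(|bucket| bucket[0].fragment_mz)
        .collect();
    // Level 2: re-sort each bucket by precursor m/z, enabling a second
    // binary search *within* a bucket, and only within one, which is
    // exactly the IndexedQuery invariant this commit fixes
    for bucket in fragments.chunks_mut(bucket_size) {
        bucket.sort_by(|a, b| a.precursor_mz.total_cmp(&b.precursor_mz));
    }
    IndexedDatabase { fragments, min_value, bucket_size }
}
```

Level 1 answers "which pages can contain this fragment m/z?"; level 2 answers "which fragments on a page match the precursor window?".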
Binary file modified figures/TMT_IDs.png
Binary file modified figures/TMT_Panel.png
50 changes: 29 additions & 21 deletions src/bin/carina.rs
@@ -135,24 +135,24 @@ impl<'db> Scorer<'db> {
// * Calculate q-value

scores.sort_by(|b, a| a.hyperlog.total_cmp(&b.hyperlog));
// let mut q_values = vec![0.0; scores.len()];
// let mut decoy = 1;
// let mut target = 0;

// for (idx, score) in scores.iter().enumerate() {
// match self.db[score.peptide] {
// TargetDecoy::Target(_) => target += 1,
// TargetDecoy::Decoy(_) => decoy += 1,
// }
// q_values[idx] = decoy as f32 / target as f32;
// }

// // Reverse array, and calculate the cumulative minimum
// let mut q_min = 1.0f32;
// for idx in (0..q_values.len()).rev() {
// q_min = q_min.min(q_values[idx]);
// q_values[idx] = q_min;
// }
let mut q_values = vec![0.0; scores.len()];
let mut decoy = 1;
let mut target = 0;

for (idx, score) in scores.iter().enumerate() {
match self.db[score.peptide] {
TargetDecoy::Target(_) => target += 1,
TargetDecoy::Decoy(_) => decoy += 1,
}
q_values[idx] = decoy as f32 / target as f32;
}

// Reverse array, and calculate the cumulative minimum
let mut q_min = 1.0f32;
for idx in (0..q_values.len()).rev() {
q_min = q_min.min(q_values[idx]);
q_values[idx] = q_min;
}

let mut reporting = Vec::new();
if scores.is_empty() {
@@ -182,8 +182,7 @@ impl<'db> Scorer<'db> {
matched_peaks: better.matched_b + better.matched_y,
summed_intensity: better.summed_b + better.summed_y,
total_candidates: scores.len(),
// q_value: q_values[idx],
q_value: 0.0,
q_value: q_values[idx],
})
}
reporting
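The block re-enabled above is a standard target-decoy FDR estimate. As a self-contained sketch (the function name and `is_decoy` input shape are hypothetical; the decoy pseudocount of 1 and the reverse cumulative minimum mirror the diff):

```rust
/// q-value sketch: `is_decoy[i]` marks whether the i-th PSM, sorted
/// best-score-first, is a decoy (hypothetical standalone form of the
/// loop above).
fn q_values(is_decoy: &[bool]) -> Vec<f32> {
    let mut q = vec![0.0f32; is_decoy.len()];
    // Running decoy/target counts estimate the FDR at each rank;
    // starting decoys at 1 mirrors the pseudocount in the diff
    let (mut decoys, mut targets) = (1u32, 0u32);
    for (idx, &decoy) in is_decoy.iter().enumerate() {
        if decoy { decoys += 1 } else { targets += 1 }
        q[idx] = decoys as f32 / targets as f32;
    }
    // Reverse pass: the cumulative minimum makes q monotonic in rank,
    // so a 1% FDR cutoff selects a single prefix of the sorted PSMs
    let mut q_min = 1.0f32;
    for idx in (0..q.len()).rev() {
        q_min = q_min.min(q[idx]);
        q[idx] = q_min;
    }
    q
}
```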
@@ -251,6 +250,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {

let db = search.database.clone().build()?;

let buckets = db.buckets();
let mut avg_delta = 0.0;

for i in 1..buckets.len() {
let delta = buckets[i] - buckets[i - 1];
avg_delta += delta;
}
dbg!(avg_delta / buckets.len() as f32);

info!(
"generated {} fragments in {}ms",
db.size(),
@@ -274,7 +282,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let start = Instant::now();
let scores: Vec<Percolator> = spectra
.par_iter()
.progress()
// .progress()
.flat_map(|spectra| scorer.score(spectra))
.collect();
let duration = Instant::now() - start;
91 changes: 60 additions & 31 deletions src/database.rs
@@ -86,6 +86,7 @@ impl Parameters {

let trypsin = Trypsin::new(
self.decoy,
// false,
self.missed_cleavages,
self.peptide_min_len,
self.peptide_max_len,
@@ -106,6 +107,21 @@ impl Parameters {
let mut target_decoys = peptides
.par_iter()
.filter_map(|f| Peptide::try_from(f).ok().map(|pep| (f, pep)))
// .flat_map(|(digest, mut peptide)| {
// let mut reversed = peptide.clone();
// reversed.sequence.reverse();
// // First modification we apply takes priority
// if let Some(m) = self.n_term_mod {
// peptide.set_nterm_mod(m);
// reversed.set_nterm_mod(m)
// }
// // Apply any relevant static modifications
// for (resi, mass) in &self.static_mods {
// peptide.static_mod(*resi, *mass);
// reversed.static_mod(*resi, *mass);
// };
// [TargetDecoy::Decoy(reversed), TargetDecoy::Target(peptide)]
// }).collect::<Vec<_>>();
.map(|(digest, mut peptide)| {
// First modification we apply takes priority
if let Some(m) = self.n_term_mod {
@@ -253,6 +269,10 @@ impl IndexedDatabase {
pub fn size(&self) -> usize {
self.fragments.len()
}

pub fn buckets(&self) -> &[f32] {
&self.min_value
}
}

impl std::ops::Index<PeptideIx> for IndexedDatabase {
@@ -281,24 +301,28 @@ impl<'d, 'q> IndexedQuery<'d, 'q> {
// indices that can be used with `self.db.fragments`
let (left_idx, right_idx) =
binary_search_slice(&self.db.min_value, |m| *m, fragment_lo, fragment_hi);
let left_idx = left_idx * self.db.bucket_size;

// Last chunk not guaranteed to be modulo bucket size, make sure we don't
// accidentally go out of bounds!
let right_idx = (right_idx * self.db.bucket_size).min(self.db.fragments.len());

// Narrow down into our region of interest, then perform another binary
// search to further refine down to the slice of matching precursor mzs
let slice = &&self.db.fragments[left_idx..right_idx];
let (inner_left, inner_right) =
binary_search_slice(slice, |frag| frag.precursor_mz, precursor_lo, precursor_hi);

// Finally, filter down our slice into exact matches only
slice[inner_left..inner_right].iter().filter(move |frag| {
frag.precursor_mz >= precursor_lo
&& frag.precursor_mz <= precursor_hi
&& frag.fragment_mz >= fragment_lo
&& frag.fragment_mz <= fragment_hi

// It is absolutely critical that we do not cross page boundaries!
// If we do, we can no longer rely on total ordering of precursor m/z
(left_idx..right_idx).flat_map(move |page| {
let left_idx = page * self.db.bucket_size;
// Last chunk not guaranteed to be modulo bucket size, make sure we don't
// accidentally go out of bounds!
let right_idx = ((page + 1) * self.db.bucket_size).min(self.db.fragments.len());

// Narrow down into our region of interest, then perform another binary
// search to further refine down to the slice of matching precursor mzs
let slice = &&self.db.fragments[left_idx..right_idx];
let (inner_left, inner_right) =
binary_search_slice(slice, |frag| frag.precursor_mz, precursor_lo, precursor_hi);

// Finally, filter down our slice into exact matches only
slice[inner_left..inner_right].iter().filter(move |frag| {
frag.precursor_mz >= precursor_lo
&& frag.precursor_mz <= precursor_hi
&& frag.fragment_mz >= fragment_lo
&& frag.fragment_mz <= fragment_hi
})
})
}
}
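The "page boundaries" comment above is the critical fix from the commit title. Because each bucket is sorted by precursor m/z independently, precursor m/z is not totally ordered across bucket boundaries, so the old single search over `left_idx..right_idx` could return wrong bounds and silently miss matches. A toy demonstration with invented values:

```rust
fn main() {
    let bucket_size = 3;
    // (fragment_mz, precursor_mz), laid out like the index: buckets are
    // ordered by fragment m/z, each sorted internally by precursor m/z
    let fragments = [
        // page 0: fragment m/z in [100, 110)
        (100.1_f32, 400.0_f32),
        (109.9, 500.0),
        (105.0, 600.0),
        // page 1: fragment m/z in [110, 120); precursor m/z restarts low
        (110.2, 401.0),
        (119.0, 402.0),
        (115.0, 700.0),
    ];
    // Precursor m/z order holds within every page...
    for page in fragments.chunks(bucket_size) {
        assert!(page.windows(2).all(|w| w[0].1 <= w[1].1));
    }
    // ...but not across the page boundary (600.0 is followed by 401.0),
    // so one binary search spanning both pages would give wrong bounds
    assert!(!fragments.windows(2).all(|w| w[0].1 <= w[1].1));
    println!("precursor m/z is ordered per page only");
}
```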
@@ -309,9 +333,9 @@ impl<'d, 'q> IndexedQuery<'d, 'q> {
///
/// # Invariants
///
/// * `slice[left] < low || left == 0`
/// * `slice[right] > high || right == slice.len() - 1`
/// * `0 <= left <= right < slice.len()`
/// * `slice[left] <= low || left == 0`
/// * `right == slice.len() || slice[right] > high`
/// * `0 <= left <= right <= slice.len()`
#[inline]
pub fn binary_search_slice<T, F>(slice: &[T], key: F, low: f32, high: f32) -> (usize, usize)
where
@@ -320,15 +344,21 @@
let left_idx = match slice.binary_search_by(|a| key(a).total_cmp(&low)) {
Ok(idx) | Err(idx) => {
let mut idx = idx.saturating_sub(1);
while idx > 0 && key(&slice[idx]) >= low {
while idx > 0 && key(&slice[idx]) > low {
idx -= 1;
}
idx
}
};

let right_idx = match slice[left_idx..].binary_search_by(|a| key(a).total_cmp(&high)) {
Ok(idx) | Err(idx) => (left_idx + idx).min(slice.len().saturating_sub(1)),
let right_idx = match slice.binary_search_by(|a| key(a).total_cmp(&high)) {
Ok(idx) | Err(idx) => {
let mut idx = idx;
while idx < slice.len() && key(&slice[idx]) <= high {
idx = idx.saturating_add(1);
}
idx.min(slice.len())
}
};
(left_idx, right_idx)
}
@@ -347,26 +377,25 @@ mod test {
fn binary_search_slice_smoke() {
// Make sure that our query returns the maximal set of indices
let data = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0];
let bounds = binary_search_slice(&data, |x| *x, 1.75, 3.25);
assert_eq!(bounds, (1, 5));
let bounds = binary_search_slice(&data, |x| *x, 1.75, 3.5);
assert_eq!(bounds, (1, 6));
assert!(data[bounds.0] <= 1.75);
assert!(data[bounds.1] > 3.25);
assert_eq!(&data[bounds.0..bounds.1], &[1.5, 2.0, 2.5, 3.0]);
assert_eq!(&data[bounds.0..bounds.1], &[1.5, 2.0, 2.5, 3.0, 3.5]);

let bounds = binary_search_slice(&data, |x| *x, 0.0, 5.0);
assert_eq!(bounds, (0, data.len() - 1));
assert_eq!(bounds, (0, data.len()));
}

#[test]
fn binary_search_slice_run() {
// Make sure that our query returns the maximal set of indices
let data = [1.0, 1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0, 3.5, 4.0];
let (left, right) = binary_search_slice(&data, |x| *x, 1.5, 3.25);
assert!(data[left] < 1.5);
assert!(data[left] <= 1.5);
assert!(data[right] > 3.25);
assert_eq!(
&data[left..right],
&[1.0, 1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0]
&[1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0]
);
}
}
39 changes: 36 additions & 3 deletions src/fasta.rs
@@ -63,7 +63,7 @@ pub struct Digest<'s> {

impl<'s> PartialEq for Digest<'s> {
fn eq(&self, other: &Self) -> bool {
self.sequence == other.sequence && self.reversed == other.reversed
self.sequence == other.sequence
}
}

Expand All @@ -74,7 +74,7 @@ impl<'s> Eq for Digest<'s> {
impl<'s> std::hash::Hash for Digest<'s> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.sequence.hash(state);
self.reversed.hash(state);
// self.reversed.hash(state);
}
}

@@ -165,7 +165,10 @@ mod tests {
observed.iter().map(|d| &d.sequence).collect::<Vec<_>>()
);

observed[1].protein = "B";
assert_eq!(observed[0].sequence, observed[1].sequence);

observed[1].protein = "A";
observed[1].reversed = true;
// Make sure hashing a digest works
let set = observed.drain(..).collect::<HashSet<_>>();
assert_eq!(set.len(), 1);
@@ -187,6 +190,36 @@
);
}

#[test]
fn reverse() {
let trypsin = Trypsin::new(true, 0, 2, 50);
let sequence = "MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGN";
let expected = vec![
"MADEEK",
"LPPGWEK",
"MSR",
"SSGR",
"VYYFNHITNASQWERPSGN",
"NGSPR",
"EWQSANTIHNFYYVR",
"GSSR",
"SMR",
"EWGPPLK",
"EEDAM",
];

trypsin.digest("".into(), sequence.into());

assert_eq!(
trypsin
.digest("".into(), sequence.into())
.into_iter()
.map(|d| d.sequence)
.collect::<Vec<_>>(),
expected
);
}

#[test]
fn digest_missed_cleavage() {
let trypsin = Trypsin::new(false, 1, 0, 50);
10 changes: 9 additions & 1 deletion src/mass.rs
@@ -90,12 +90,20 @@ impl std::fmt::Display for Residue {

#[cfg(test)]
mod test {
use super::{Mass, VALID_AA};
use super::{Mass, Tolerance, VALID_AA};

#[test]
fn smoke() {
for ch in VALID_AA {
assert!(ch.monoisotopic() > 0.0);
}
}

#[test]
fn tolerances() {
assert_eq!(Tolerance::Ppm(10.0).bounds(1000.0), (999.99, 1000.01));
assert_eq!(Tolerance::Ppm(10.0).bounds(487.0), (486.99513, 487.00487));

assert_eq!(Tolerance::Ppm(50.0).bounds(1000.0), (999.95, 1000.05));
}
}
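The expected values in `tolerances` follow from the usual symmetric ppm window. A short sketch (an assumption about the arithmetic, not necessarily the crate's exact `Tolerance::bounds` code):

```rust
// bounds = center -/+ delta, where delta = center * ppm * 1e-6
fn ppm_bounds(center: f32, ppm: f32) -> (f32, f32) {
    let delta = center * ppm * 1.0e-6;
    (center - delta, center + delta)
}

fn main() {
    // 10 ppm of 1000.0 m/z is +/- 0.01
    println!("{:?}", ppm_bounds(1000.0, 10.0)); // ~(999.99, 1000.01)
    println!("{:?}", ppm_bounds(487.0, 10.0)); // ~(486.99513, 487.00487)
}
```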
4 changes: 3 additions & 1 deletion tests/analyze.py
@@ -83,6 +83,7 @@ def plot(self, specific_scan: Optional[int]):
row = self.results.loc[
sn,
]
print(row)
peptide = ''.join([c for c in row['peptide'] if c.isalpha()])
pylab_aux.annotate_spectrum(
scan,
@@ -101,4 +102,5 @@

_ = Pipeline()
p = PostAnalysis()
p.plot(30069)
# p.plot(30069)
p.plot(38525)
17 changes: 10 additions & 7 deletions tests/integration.rs
@@ -115,11 +115,17 @@ pub fn confirm_charge_state_simulation() -> Result<(), Box<dyn std::error::Error>> {
fragments.sort_by(|a, b| a.fragment_mz.total_cmp(&b.fragment_mz));
assert_eq!(fragments.len(), 36);

let (matched_b, matched_y) = match_peaks(&fragments, &processed.peaks, 10.0);
assert_eq!(matched_b + matched_y, 8);

Ok(())
}

fn match_peaks(fragments: &[Theoretical], peaks: &[(f32, f32)], ppm: f32) -> (usize, usize) {
let mut matched_b = 0;
let mut matched_y = 0;

for (mz, int) in processed.peaks {
let (low, high) = Tolerance::Ppm(10.0).bounds(mz);
for (mz, int) in peaks {
let (low, high) = Tolerance::Ppm(ppm).bounds(*mz);
let (i, j) = binary_search_slice(&fragments, |f| f.fragment_mz, low, high);
for fragment in fragments[i..j]
.iter()
@@ -138,8 +144,5 @@ pub fn confirm_charge_state_simulation() -> Result<(), Box<dyn std::error::Error>> {
}
}
}

assert_eq!(matched_b + matched_y, 8);

Ok(())
(matched_b, matched_y)
}
