Commit: Additional tests, docs, critical bug fix in IndexedQuery

lazear committed Jul 21, 2022
1 parent 74d94d7 commit 1c7a65f
Showing 9 changed files with 151 additions and 66 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -6,7 +6,9 @@ I wanted to see how far I could take a proteomics search engine in ~1000 lines of code

I was inspired by the elegant data structure discussed in the [MSFragger paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5409104/), and decided to implement an (open source) version of it in Rust - with great results.

Carina has excellent performance characteristics (>2x faster and >2x less memory usage than MSFragger), but does not sacrifice code quality or size to do so!
Carina has excellent performance characteristics (>2x faster and >2x less memory usage than MSFragger for narrow searches), but does not sacrifice code quality or size to do so!

- MSFragger still blows the pants off this tool on Open Searches (> 50 Da precursor_tol) ... for now!

## Features

@@ -46,7 +48,7 @@ Data repository: [PXD016766](http://proteomecentral.proteomexchange.org/cgi/GetD
Performance results: (Intel i7-12700KF + 32GB RAM)

- ~30 seconds to process 12 files, using less than 4GB of RAM
- Active scanning: ~45,000 scans/s for narrow window (can be tuned to use more ram and go 5x faster!)
- Active scanning: ~35,000 scans/s for narrow window (can be tuned to use more ram and go 5x faster!)


### Search methods
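For orientation before the diffs below: the MSFragger-style structure mentioned in the README is a two-level, bucketed ("paged") fragment index. Here is a minimal sketch of that layout; the field names `fragments`, `min_value`, and `bucket_size` come from src/database.rs in this commit, but this `build` function is an illustrative assumption, not the crate's actual constructor:

```rust
/// Sketch of the two-level fragment index (assumed construction).
pub struct Theoretical {
    pub fragment_mz: f32,
    pub precursor_mz: f32,
}

pub struct IndexedDatabase {
    pub fragments: Vec<Theoretical>,
    pub min_value: Vec<f32>, // lowest fragment m/z in each bucket ("page")
    pub bucket_size: usize,
}

pub fn build(mut fragments: Vec<Theoretical>, bucket_size: usize) -> IndexedDatabase {
    // Level 1: sort globally by fragment m/z so each fixed-size bucket
    // covers a disjoint fragment m/z range, summarized in `min_value`
    fragments.sort_by(|a, b| a.fragment_mz.total_cmp(&b.fragment_mz));
    let min_value: Vec<f32> = fragments
        .chunks(bucket_size)
        .map(|bucket| bucket[0].fragment_mz)
        .collect();
    // Level 2: re-sort each bucket by precursor m/z, enabling a second
    // binary search *within* a bucket, and only within one, which is
    // exactly the IndexedQuery invariant this commit fixes
    for bucket in fragments.chunks_mut(bucket_size) {
        bucket.sort_by(|a, b| a.precursor_mz.total_cmp(&b.precursor_mz));
    }
    IndexedDatabase { fragments, min_value, bucket_size }
}
```

Level 1 answers "which pages can contain this fragment m/z?"; level 2 answers "which fragments on a page match the precursor window?".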
Binary file modified figures/TMT_IDs.png
Binary file modified figures/TMT_Panel.png
50 changes: 29 additions & 21 deletions src/bin/carina.rs
@@ -135,24 +135,24 @@ impl<'db> Scorer<'db> {
// * Calculate q-value

scores.sort_by(|b, a| a.hyperlog.total_cmp(&b.hyperlog));
// let mut q_values = vec![0.0; scores.len()];
// let mut decoy = 1;
// let mut target = 0;

// for (idx, score) in scores.iter().enumerate() {
// match self.db[score.peptide] {
// TargetDecoy::Target(_) => target += 1,
// TargetDecoy::Decoy(_) => decoy += 1,
// }
// q_values[idx] = decoy as f32 / target as f32;
// }

// // Reverse array, and calculate the cumulative minimum
// let mut q_min = 1.0f32;
// for idx in (0..q_values.len()).rev() {
// q_min = q_min.min(q_values[idx]);
// q_values[idx] = q_min;
// }
let mut q_values = vec![0.0; scores.len()];
let mut decoy = 1;
let mut target = 0;

for (idx, score) in scores.iter().enumerate() {
match self.db[score.peptide] {
TargetDecoy::Target(_) => target += 1,
TargetDecoy::Decoy(_) => decoy += 1,
}
q_values[idx] = decoy as f32 / target as f32;
}

// Reverse array, and calculate the cumulative minimum
let mut q_min = 1.0f32;
for idx in (0..q_values.len()).rev() {
q_min = q_min.min(q_values[idx]);
q_values[idx] = q_min;
}

let mut reporting = Vec::new();
if scores.is_empty() {
@@ -182,8 +182,7 @@ impl<'db> Scorer<'db> {
matched_peaks: better.matched_b + better.matched_y,
summed_intensity: better.summed_b + better.summed_y,
total_candidates: scores.len(),
// q_value: q_values[idx],
q_value: 0.0,
q_value: q_values[idx],
})
}
reporting
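The block re-enabled above is a standard target-decoy FDR estimate. As a self-contained sketch (the function name and `is_decoy` input shape are hypothetical; the decoy pseudocount of 1 and the reverse cumulative minimum mirror the diff):

```rust
/// q-value sketch: `is_decoy[i]` marks whether the i-th PSM, sorted
/// best-score-first, is a decoy (hypothetical standalone form of the
/// loop above).
fn q_values(is_decoy: &[bool]) -> Vec<f32> {
    let mut q = vec![0.0f32; is_decoy.len()];
    // Running decoy/target counts estimate the FDR at each rank;
    // starting decoys at 1 mirrors the pseudocount in the diff
    let (mut decoys, mut targets) = (1u32, 0u32);
    for (idx, &decoy) in is_decoy.iter().enumerate() {
        if decoy { decoys += 1 } else { targets += 1 }
        q[idx] = decoys as f32 / targets as f32;
    }
    // Reverse pass: the cumulative minimum makes q monotonic in rank,
    // so a 1% FDR cutoff selects a single prefix of the sorted PSMs
    let mut q_min = 1.0f32;
    for idx in (0..q.len()).rev() {
        q_min = q_min.min(q[idx]);
        q[idx] = q_min;
    }
    q
}
```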
@@ -251,6 +250,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {

let db = search.database.clone().build()?;

let buckets = db.buckets();
let mut avg_delta = 0.0;

for i in 1..buckets.len() {
let delta = buckets[i] - buckets[i - 1];
avg_delta += delta;
}
dbg!(avg_delta / buckets.len() as f32);

info!(
"generated {} fragments in {}ms",
db.size(),
@@ -274,7 +282,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let start = Instant::now();
let scores: Vec<Percolator> = spectra
.par_iter()
.progress()
// .progress()
.flat_map(|spectra| scorer.score(spectra))
.collect();
let duration = Instant::now() - start;
91 changes: 60 additions & 31 deletions src/database.rs
@@ -86,6 +86,7 @@ impl Parameters {

let trypsin = Trypsin::new(
self.decoy,
// false,
self.missed_cleavages,
self.peptide_min_len,
self.peptide_max_len,
@@ -106,6 +107,21 @@ impl Parameters {
let mut target_decoys = peptides
.par_iter()
.filter_map(|f| Peptide::try_from(f).ok().map(|pep| (f, pep)))
// .flat_map(|(digest, mut peptide)| {
// let mut reversed = peptide.clone();
// reversed.sequence.reverse();
// // First modification we apply takes priority
// if let Some(m) = self.n_term_mod {
// peptide.set_nterm_mod(m);
// reversed.set_nterm_mod(m)
// }
// // Apply any relevant static modifications
// for (resi, mass) in &self.static_mods {
// peptide.static_mod(*resi, *mass);
// reversed.static_mod(*resi, *mass);
// };
// [TargetDecoy::Decoy(reversed), TargetDecoy::Target(peptide)]
// }).collect::<Vec<_>>();
.map(|(digest, mut peptide)| {
// First modification we apply takes priority
if let Some(m) = self.n_term_mod {
@@ -253,6 +269,10 @@ impl IndexedDatabase {
pub fn size(&self) -> usize {
self.fragments.len()
}

pub fn buckets(&self) -> &[f32] {
&self.min_value
}
}

impl std::ops::Index<PeptideIx> for IndexedDatabase {
@@ -281,24 +301,28 @@ impl<'d, 'q> IndexedQuery<'d, 'q> {
// indices that can be used with `self.db.fragments`
let (left_idx, right_idx) =
binary_search_slice(&self.db.min_value, |m| *m, fragment_lo, fragment_hi);
let left_idx = left_idx * self.db.bucket_size;

// Last chunk not guaranteed to be modulo bucket size, make sure we don't
// accidentally go out of bounds!
let right_idx = (right_idx * self.db.bucket_size).min(self.db.fragments.len());

// Narrow down into our region of interest, then perform another binary
// search to further refine down to the slice of matching precursor mzs
let slice = &&self.db.fragments[left_idx..right_idx];
let (inner_left, inner_right) =
binary_search_slice(slice, |frag| frag.precursor_mz, precursor_lo, precursor_hi);

// Finally, filter down our slice into exact matches only
slice[inner_left..inner_right].iter().filter(move |frag| {
frag.precursor_mz >= precursor_lo
&& frag.precursor_mz <= precursor_hi
&& frag.fragment_mz >= fragment_lo
&& frag.fragment_mz <= fragment_hi

// It is absolutely critical that we do not cross page boundaries!
// If we do, we can no longer rely on total ordering of precursor m/z
(left_idx..right_idx).flat_map(move |page| {
let left_idx = page * self.db.bucket_size;
// Last chunk not guaranteed to be modulo bucket size, make sure we don't
// accidentally go out of bounds!
let right_idx = ((page + 1) * self.db.bucket_size).min(self.db.fragments.len());

// Narrow down into our region of interest, then perform another binary
// search to further refine down to the slice of matching precursor mzs
let slice = &&self.db.fragments[left_idx..right_idx];
let (inner_left, inner_right) =
binary_search_slice(slice, |frag| frag.precursor_mz, precursor_lo, precursor_hi);

// Finally, filter down our slice into exact matches only
slice[inner_left..inner_right].iter().filter(move |frag| {
frag.precursor_mz >= precursor_lo
&& frag.precursor_mz <= precursor_hi
&& frag.fragment_mz >= fragment_lo
&& frag.fragment_mz <= fragment_hi
})
})
}
}
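The "page boundaries" comment above is the critical fix from the commit title. Because each bucket is sorted by precursor m/z independently, precursor m/z is not totally ordered across bucket boundaries, so the old single search over `left_idx..right_idx` could return wrong bounds and silently miss matches. A toy demonstration with invented values:

```rust
fn main() {
    let bucket_size = 3;
    // (fragment_mz, precursor_mz), laid out like the index: buckets are
    // ordered by fragment m/z, each sorted internally by precursor m/z
    let fragments = [
        // page 0: fragment m/z in [100, 110)
        (100.1_f32, 400.0_f32),
        (109.9, 500.0),
        (105.0, 600.0),
        // page 1: fragment m/z in [110, 120); precursor m/z restarts low
        (110.2, 401.0),
        (119.0, 402.0),
        (115.0, 700.0),
    ];
    // Precursor m/z order holds within every page...
    for page in fragments.chunks(bucket_size) {
        assert!(page.windows(2).all(|w| w[0].1 <= w[1].1));
    }
    // ...but not across the page boundary (600.0 is followed by 401.0),
    // so one binary search spanning both pages would give wrong bounds
    assert!(!fragments.windows(2).all(|w| w[0].1 <= w[1].1));
    println!("precursor m/z is ordered per page only");
}
```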
@@ -309,9 +333,9 @@ impl<'d, 'q> IndexedQuery<'d, 'q> {
///
/// # Invariants
///
/// * `slice[left] < low || left == 0`
/// * `slice[right] > high || right == slice.len() - 1`
/// * `0 <= left <= right < slice.len()`
/// * `slice[left] <= low || left == 0`
/// * `right == slice.len() || slice[right] > high`
/// * `0 <= left <= right <= slice.len()`
#[inline]
pub fn binary_search_slice<T, F>(slice: &[T], key: F, low: f32, high: f32) -> (usize, usize)
where
@@ -320,15 +344,21 @@
let left_idx = match slice.binary_search_by(|a| key(a).total_cmp(&low)) {
Ok(idx) | Err(idx) => {
let mut idx = idx.saturating_sub(1);
while idx > 0 && key(&slice[idx]) >= low {
while idx > 0 && key(&slice[idx]) > low {
idx -= 1;
}
idx
}
};

let right_idx = match slice[left_idx..].binary_search_by(|a| key(a).total_cmp(&high)) {
Ok(idx) | Err(idx) => (left_idx + idx).min(slice.len().saturating_sub(1)),
let right_idx = match slice.binary_search_by(|a| key(a).total_cmp(&high)) {
Ok(idx) | Err(idx) => {
let mut idx = idx;
while idx < slice.len() && key(&slice[idx]) <= high {
idx = idx.saturating_add(1);
}
idx.min(slice.len())
}
};
(left_idx, right_idx)
}
@@ -347,26 +377,25 @@ mod test {
fn binary_search_slice_smoke() {
// Make sure that our query returns the maximal set of indices
let data = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0];
let bounds = binary_search_slice(&data, |x| *x, 1.75, 3.25);
assert_eq!(bounds, (1, 5));
let bounds = binary_search_slice(&data, |x| *x, 1.75, 3.5);
assert_eq!(bounds, (1, 6));
assert!(data[bounds.0] <= 1.75);
assert!(data[bounds.1] > 3.25);
assert_eq!(&data[bounds.0..bounds.1], &[1.5, 2.0, 2.5, 3.0]);
assert_eq!(&data[bounds.0..bounds.1], &[1.5, 2.0, 2.5, 3.0, 3.5]);

let bounds = binary_search_slice(&data, |x| *x, 0.0, 5.0);
assert_eq!(bounds, (0, data.len() - 1));
assert_eq!(bounds, (0, data.len()));
}

#[test]
fn binary_search_slice_run() {
// Make sure that our query returns the maximal set of indices
let data = [1.0, 1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0, 3.5, 4.0];
let (left, right) = binary_search_slice(&data, |x| *x, 1.5, 3.25);
assert!(data[left] < 1.5);
assert!(data[left] <= 1.5);
assert!(data[right] > 3.25);
assert_eq!(
&data[left..right],
&[1.0, 1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0]
&[1.5, 1.5, 1.5, 1.5, 2.0, 2.5, 3.0, 3.0]
);
}
}
39 changes: 36 additions & 3 deletions src/fasta.rs
@@ -63,7 +63,7 @@ pub struct Digest<'s> {

impl<'s> PartialEq for Digest<'s> {
fn eq(&self, other: &Self) -> bool {
self.sequence == other.sequence && self.reversed == other.reversed
self.sequence == other.sequence
}
}

Expand All @@ -74,7 +74,7 @@ impl<'s> Eq for Digest<'s> {
impl<'s> std::hash::Hash for Digest<'s> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.sequence.hash(state);
self.reversed.hash(state);
// self.reversed.hash(state);
}
}

@@ -165,7 +165,10 @@ mod tests {
observed.iter().map(|d| &d.sequence).collect::<Vec<_>>()
);

observed[1].protein = "B";
assert_eq!(observed[0].sequence, observed[1].sequence);

observed[1].protein = "A";
observed[1].reversed = true;
// Make sure hashing a digest works
let set = observed.drain(..).collect::<HashSet<_>>();
assert_eq!(set.len(), 1);
@@ -187,6 +190,36 @@
);
}

#[test]
fn reverse() {
let trypsin = Trypsin::new(true, 0, 2, 50);
let sequence = "MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGN";
let expected = vec![
"MADEEK",
"LPPGWEK",
"MSR",
"SSGR",
"VYYFNHITNASQWERPSGN",
"NGSPR",
"EWQSANTIHNFYYVR",
"GSSR",
"SMR",
"EWGPPLK",
"EEDAM",
];

trypsin.digest("".into(), sequence.into());

assert_eq!(
trypsin
.digest("".into(), sequence.into())
.into_iter()
.map(|d| d.sequence)
.collect::<Vec<_>>(),
expected
);
}

#[test]
fn digest_missed_cleavage() {
let trypsin = Trypsin::new(false, 1, 0, 50);
10 changes: 9 additions & 1 deletion src/mass.rs
@@ -90,12 +90,20 @@ impl std::fmt::Display for Residue {

#[cfg(test)]
mod test {
use super::{Mass, VALID_AA};
use super::{Mass, Tolerance, VALID_AA};

#[test]
fn smoke() {
for ch in VALID_AA {
assert!(ch.monoisotopic() > 0.0);
}
}

#[test]
fn tolerances() {
assert_eq!(Tolerance::Ppm(10.0).bounds(1000.0), (999.99, 1000.01));
assert_eq!(Tolerance::Ppm(10.0).bounds(487.0), (486.99513, 487.00487));

assert_eq!(Tolerance::Ppm(50.0).bounds(1000.0), (999.95, 1000.05));
}
}
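The expected values in `tolerances` follow from the usual symmetric ppm window. A short sketch (an assumption about the arithmetic, not necessarily the crate's exact `Tolerance::bounds` code):

```rust
// bounds = center -/+ delta, where delta = center * ppm * 1e-6
fn ppm_bounds(center: f32, ppm: f32) -> (f32, f32) {
    let delta = center * ppm * 1.0e-6;
    (center - delta, center + delta)
}

fn main() {
    // 10 ppm of 1000.0 m/z is +/- 0.01
    println!("{:?}", ppm_bounds(1000.0, 10.0)); // ~(999.99, 1000.01)
    println!("{:?}", ppm_bounds(487.0, 10.0)); // ~(486.99513, 487.00487)
}
```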
4 changes: 3 additions & 1 deletion tests/analyze.py
@@ -83,6 +83,7 @@ def plot(self, specific_scan: Optional[int]):
row = self.results.loc[
sn,
]
print(row)
peptide = ''.join([c for c in row['peptide'] if c.isalpha()])
pylab_aux.annotate_spectrum(
scan,
@@ -101,4 +102,5 @@

_ = Pipeline()
p = PostAnalysis()
p.plot(30069)
# p.plot(30069)
p.plot(38525)
17 changes: 10 additions & 7 deletions tests/integration.rs
@@ -115,11 +115,17 @@ pub fn confirm_charge_state_simulation() -> Result<(), Box<dyn std::error::Error>> {
fragments.sort_by(|a, b| a.fragment_mz.total_cmp(&b.fragment_mz));
assert_eq!(fragments.len(), 36);

let (matched_b, matched_y) = match_peaks(&fragments, &processed.peaks, 10.0);
assert_eq!(matched_b + matched_y, 8);

Ok(())
}

fn match_peaks(fragments: &[Theoretical], peaks: &[(f32, f32)], ppm: f32) -> (usize, usize) {
let mut matched_b = 0;
let mut matched_y = 0;

for (mz, int) in processed.peaks {
let (low, high) = Tolerance::Ppm(10.0).bounds(mz);
for (mz, int) in peaks {
let (low, high) = Tolerance::Ppm(ppm).bounds(*mz);
let (i, j) = binary_search_slice(&fragments, |f| f.fragment_mz, low, high);
for fragment in fragments[i..j]
.iter()
@@ -138,8 +144,5 @@ pub fn confirm_charge_state_simulation() -> Result<(), Box<dyn std::error::Error>> {
}
}
}

assert_eq!(matched_b + matched_y, 8);

Ok(())
(matched_b, matched_y)
}
