
Commit

Change integration tests to use ms2 files
lazear committed Jul 20, 2022
1 parent eb11fc2 commit 74d94d7
Showing 12 changed files with 364 additions and 1,235 deletions.
11 changes: 5 additions & 6 deletions README.md
@@ -45,15 +45,14 @@ Data repository: [PXD016766](http://proteomecentral.proteomexchange.org/cgi/GetD

Performance results: (Intel i7-12700KF + 32GB RAM)

- ~40 seconds to process 12 files, using less than 4GB of RAM
- Active scanning: ~25,000 scans/s (can be tuned to use more ram and go 5x faster!)
- Amortized scanning: ~7,000 scans/s (most time used on fragment indexing, IO)
- ~30 seconds to process 12 files, using less than 4GB of RAM
- Active scanning: ~45,000 scans/s for narrow window (can be tuned to use more ram and go 5x faster!)


### Search methods

- MS2 files generated using the [ProteoWizard MSConvert tool](http://www.proteowizard.org/download.html)
- MSFragger and Comet were configured with analogous parameters (50ppm precursor tolerance, 10ppm fragment tolerance - or for Comet setting `fragment_bin_tol` to 0.02 Da).
- MSFragger and Comet were configured with analogous parameters (+/- 25 ppm precursor tolerance, +/- 10 ppm fragment tolerance - or for Comet setting `fragment_bin_tol` to 0.02 Da).
- [Mokapot](https://github.com/wfondrie/mokapot) was then used to refine FDR for all search results

Carina search settings file:
@@ -62,7 +61,7 @@ Carina search settings file:
"database": {
"bucket_size": 8192,
"fragment_min_mz": 75.0,
"fragment_max_mz": 4000.0,
"fragment_max_mz": 2000.0,
"peptide_min_len": 5,
"peptide_max_len": 50,
"decoy": true,
@@ -75,7 +74,7 @@ Carina search settings file:
"fasta": "UP000005640_9606.fasta"
},
"precursor_tol": {
"ppm": 50.0
"ppm": 25.0
},
"fragment_tol": {
"ppm": 10.0
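
The `precursor_tol` and `fragment_tol` settings above are symmetric ppm windows around an observed mass (the README quotes them as +/- 25 ppm and +/- 10 ppm). A minimal sketch of how a ppm tolerance maps to an absolute m/z window — standalone code, not the crate's `carina::mass::Tolerance` API; the function name and values are illustrative:

```rust
/// Convert a symmetric ppm tolerance into an absolute m/z window.
fn ppm_window(mz: f32, ppm: f32) -> (f32, f32) {
    let delta = mz * ppm * 1e-6;
    (mz - delta, mz + delta)
}

fn main() {
    // 25 ppm around m/z 1500.0 spans roughly +/- 0.0375 Th
    let (lo, hi) = ppm_window(1500.0, 25.0);
    println!("{lo:.4} .. {hi:.4}");
}
```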
Binary file modified figures/TMT_IDs.png
Binary file modified figures/TMT_Panel.png
45 changes: 23 additions & 22 deletions src/bin/carina.rs
@@ -2,7 +2,7 @@ use carina::database::{IndexedDatabase, PeptideIx, Theoretical};
use carina::ion_series::Kind;
use carina::mass::{Tolerance, PROTON};
use carina::peptide::TargetDecoy;
use carina::spectrum::{read_spectrum, ProcessedSpectrum, SpectrumProcessor};
use carina::spectrum::{read_ms2, ProcessedSpectrum, SpectrumProcessor};
use clap::{Arg, Command};
use indicatif::ParallelProgressIterator;
use log::info;
@@ -135,24 +135,24 @@ impl<'db> Scorer<'db> {
// * Calculate q-value

scores.sort_by(|b, a| a.hyperlog.total_cmp(&b.hyperlog));
let mut q_values = vec![0.0; scores.len()];
let mut decoy = 1;
let mut target = 0;

for (idx, score) in scores.iter().enumerate() {
match self.db[score.peptide] {
TargetDecoy::Target(_) => target += 1,
TargetDecoy::Decoy(_) => decoy += 1,
}
q_values[idx] = decoy as f32 / target as f32;
}

// Reverse array, and calculate the cumulative minimum
let mut q_min = 1.0f32;
for idx in (0..q_values.len()).rev() {
q_min = q_min.min(q_values[idx]);
q_values[idx] = q_min;
}
// let mut q_values = vec![0.0; scores.len()];
// let mut decoy = 1;
// let mut target = 0;

// for (idx, score) in scores.iter().enumerate() {
// match self.db[score.peptide] {
// TargetDecoy::Target(_) => target += 1,
// TargetDecoy::Decoy(_) => decoy += 1,
// }
// q_values[idx] = decoy as f32 / target as f32;
// }

// // Reverse array, and calculate the cumulative minimum
// let mut q_min = 1.0f32;
// for idx in (0..q_values.len()).rev() {
// q_min = q_min.min(q_values[idx]);
// q_values[idx] = q_min;
// }

let mut reporting = Vec::new();
if scores.is_empty() {
@@ -182,7 +182,8 @@ impl<'db> Scorer<'db> {
matched_peaks: better.matched_b + better.matched_y,
summed_intensity: better.summed_b + better.summed_y,
total_candidates: scores.len(),
q_value: q_values[idx],
// q_value: q_values[idx],
q_value: 0.0,
})
}
reporting
@@ -265,15 +266,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {

let mut pin_paths = Vec::with_capacity(search.ms2_paths.len());
for ms2_path in &search.ms2_paths {
let spectra = read_spectrum(ms2_path)?
let spectra = read_ms2(ms2_path)?
.into_iter()
.map(|spectra| sp.process(spectra))
.collect::<Vec<_>>();

let start = Instant::now();
let scores: Vec<Percolator> = spectra
.par_iter()
// .progress()
.progress()
.flat_map(|spectra| scorer.score(spectra))
.collect();
let duration = Instant::now() - start;
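
The q-value block commented out in `Scorer::score` above is the standard target-decoy estimate: hits are ranked by descending score, the running decoy/target ratio is recorded at each rank, and a cumulative minimum taken from the worst hit upward makes the values monotone. A standalone sketch of that logic — illustrative only, mirroring the commented-out lines rather than any replacement API:

```rust
/// Rank-based target-decoy q-values; `is_decoy` must already be sorted by
/// descending search score (best hit first).
fn q_values(is_decoy: &[bool]) -> Vec<f32> {
    let mut q = vec![0.0f32; is_decoy.len()];
    // Start the decoy count at 1, matching the +1 pseudo-count in the
    // commented-out code above
    let (mut decoy, mut target) = (1usize, 0usize);
    for (idx, &d) in is_decoy.iter().enumerate() {
        if d { decoy += 1; } else { target += 1; }
        q[idx] = decoy as f32 / target as f32;
    }
    // Walk from the worst score back to the best, taking the cumulative minimum
    let mut q_min = 1.0f32;
    for idx in (0..q.len()).rev() {
        q_min = q_min.min(q[idx]);
        q[idx] = q_min;
    }
    q
}
```

With Mokapot handling FDR refinement downstream (per the README), the in-crate `q_value` field is simply reported as 0.0 in this commit.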
13 changes: 4 additions & 9 deletions src/database.rs
@@ -1,18 +1,13 @@
use crate::fasta::Trypsin;
use crate::fasta::{Fasta, Trypsin};
use crate::ion_series::{IonSeries, Kind};
use crate::mass::Tolerance;
use crate::peptide::{Peptide, TargetDecoy};
use crate::spectrum::ProcessedSpectrum;
use crate::{
fasta::Fasta,
ion_series::{IonSeries, Kind},
mass::{Tolerance, PROTON},
};

use log::error;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::hash::Hash;

use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::path::PathBuf;

#[derive(Deserialize)]
36 changes: 6 additions & 30 deletions src/spectrum.rs
@@ -1,7 +1,3 @@
use std::collections::BTreeMap;

use serde::Deserialize;

use crate::mass::PROTON;

pub struct SpectrumProcessor {
@@ -17,8 +13,6 @@ pub struct ProcessedSpectrum {
pub charge: u8,
pub rt: f32,
pub peaks: Vec<(f32, f32)>,
// pub mz: Vec<f32>,
// pub int: Vec<f32>,
}

impl SpectrumProcessor {
@@ -32,8 +26,6 @@ impl SpectrumProcessor {

pub fn process(&self, mut s: Spectrum) -> ProcessedSpectrum {
let charge = self.max_fragment_charge.min(s.charge);
// let mut mz = Vec::with_capacity(charge as usize * self.take_top_n);
// let mut int = Vec::with_capacity(charge as usize * self.take_top_n);
let mut peaks = Vec::with_capacity(charge as usize * self.take_top_n);

s.peaks.sort_by(|(_, a), (_, b)| b.total_cmp(&a));
@@ -45,9 +37,9 @@ impl SpectrumProcessor {
// OK, this bit is kinda weird - to save memory, instead of calculating theoretical
// m/z's for different charge states, we instead resample the experimental spectra
//
// We assume that the m/z we are observing are all at charge state 1, and we
// want to convert them to higher charge states in order to simulate a calculated
// theoretical fragment with this m/z
// We assume that the m/z we are observing are potentially at charge state >1, and we
// want to convert them to charge state = 1 (well, really neutral monoisotopic mass)
// in order to simulate a calculated theoretical fragment with a higher charge
let fragment_mz = (fragment_mz - PROTON) * charge as f32;
if fragment_mz < self.max_fragment_mz {
peaks.push((fragment_mz, fragment_int.sqrt()));
@@ -58,39 +50,23 @@ impl SpectrumProcessor {
// Sort by m/z
peaks.sort_by(|a, b| a.0.total_cmp(&b.0));

// for (Intensity(fragment_int), fragment_mz) in s.peaks.iter().rev().take(self.take_top_n) {
// for charge in 1..=charge {
// let fragment_mz = (fragment_mz * charge as f32) - (charge as f32 * PROTON);
// if fragment_mz < self.max_fragment_mz {
// mz.push(fragment_mz);
// int.push(fragment_int.sqrt());
// }
// }
// }

// dbg!(mz.len());

ProcessedSpectrum {
scan: s.scan,
monoisotopic_mass: s.precursor_mass - PROTON,
charge: s.charge,
rt: s.rt,
peaks,
// mz,
// int,
}
}
}

/// An observed MS2 spectrum
#[derive(Clone, Default, Debug, Deserialize)]
// #[cfg_attr(test, derive(Deserialize))]
#[derive(Clone, Default, Debug)]
pub struct Spectrum {
scan: u32,
rt: f32,
precursor_mass: f32,
charge: u8,
// peaks: BTreeMap<Intensity, f32>,
peaks: Vec<(f32, f32)>,
}

@@ -108,7 +84,7 @@ impl std::cmp::Ord for Intensity {
}

/// Read a `.ms2` file
pub fn read_spectrum<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Vec<Spectrum>> {
pub fn read_ms2<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Vec<Spectrum>> {
let buf = std::fs::read_to_string(path)?;
let mut spectra = Vec::new();
let mut current = Spectrum::default();
@@ -141,10 +117,10 @@ pub fn read_spectrum<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Vec<
let mut ws = line.split_whitespace().map(|s| s.parse::<f32>());
let mz = ws.next().unwrap().unwrap();
let abundance = ws.next().unwrap().unwrap();
// current.peaks.insert(Intensity(abundance), mz);
current.peaks.push((mz, abundance));
}
}
spectra.push(current);

Ok(spectra)
}
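
The processing path in `spectrum.rs` above keeps the most intense peaks, converts each observed m/z to a neutral-style mass with `(mz - PROTON) * charge`, square-roots the intensity, and re-sorts by mass. A compact standalone sketch of those steps, assuming peaks are resampled at every charge state up to the precursor charge — the function below is illustrative, not the crate's `SpectrumProcessor`:

```rust
// Proton mass in Da, analogous to carina::mass::PROTON
const PROTON: f32 = 1.007_276_5;

/// Keep the `take_top_n` most intense peaks and resample each one at charge
/// states 1..=charge, returning (mass, sqrt(intensity)) pairs sorted by mass.
fn process_peaks(
    mut peaks: Vec<(f32, f32)>, // (m/z, intensity)
    charge: u8,
    take_top_n: usize,
    max_fragment_mz: f32,
) -> Vec<(f32, f32)> {
    // Most intense peaks first
    peaks.sort_by(|(_, a), (_, b)| b.total_cmp(a));
    let mut out = Vec::with_capacity(charge as usize * take_top_n);
    for (mz, int) in peaks.into_iter().take(take_top_n) {
        for z in 1..=charge {
            // e.g. an observed peak at m/z 500.50 treated as z = 2 maps to ~998.99
            let mass = (mz - PROTON) * z as f32;
            if mass < max_fragment_mz {
                out.push((mass, int.sqrt()));
            }
        }
    }
    out.sort_by(|a, b| a.0.total_cmp(&b.0));
    out
}
```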
5 changes: 3 additions & 2 deletions tests/.gitignore
@@ -1,2 +1,3 @@
!LQSRPAAPPAPGPGQLTLR.json
pxd/
pxd/
fragpipe/
!LQSRPAAPPAPGPGQLTLR.ms2