Skip to content

Commit

Permalink
Heapselect algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
lazear committed Jul 30, 2023
1 parent 75391bb commit d214225
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 49 deletions.
57 changes: 55 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion crates/sage-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ impl Runner {
let features: Vec<_> = spectra
.par_iter()
.filter(|spec| spec.peaks.len() >= self.parameters.min_peaks && spec.level == 2)
.flat_map(|spec| scorer.score(spec, self.parameters.report_psms))
.flat_map(|spec| scorer.score(spec))
.collect();

let quant = self
Expand Down Expand Up @@ -227,6 +227,7 @@ impl Runner {
min_fragment_mass: self.parameters.database.fragment_min_mz,
max_fragment_mass: self.parameters.database.fragment_max_mz,
chimera: self.parameters.chimera,
report_psms: self.parameters.report_psms,
wide_window: self.parameters.wide_window,
};

Expand Down
3 changes: 2 additions & 1 deletion crates/sage-cli/tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ fn integration() -> anyhow::Result<()> {
min_fragment_mass: 0.0,
max_fragment_mass: 1500.0,
chimera: false,
report_psms: 1,
wide_window: false,
};

let psm = scorer.score(&processed, 1);
let psm = scorer.score(&processed);
assert_eq!(psm.len(), 1);
assert_eq!(psm[0].matched_peaks, 21);

Expand Down
6 changes: 5 additions & 1 deletion crates/sage/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,8 @@ serde = { version="1.0", features = ["derive"] }
serde_json = "1.0"
quick-xml = { version = "0.25", features = ["async-tokio"] }

sage-cloudpath = { path = "../sage-cloudpath" }
sage-cloudpath = { path = "../sage-cloudpath" }

[dev-dependencies]
quickcheck = "1"
quickcheck_macros = "1"
101 changes: 101 additions & 0 deletions crates/sage/src/heap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/// Perform a max k-selection (e.g. select the 50 largest items) on an
/// array, in place. This algorithm works by building a bounded min heap
/// in the array. The `k` highest elements in the slice will be stored in
/// first `k` elements in the slice, but they will be stored in min-heap order,
/// not in sorted order. Otherwise, this is equivalent to performing a partial
/// sort
pub fn bounded_min_heapify<T: Ord>(slice: &mut [T], k: usize) {
if slice.len() <= k {
return;
}

// Build a heap in place for the first `k` elements
for i in (0..k / 2).rev() {
sift_down(&mut slice[..k], i);
}

debug_assert!(check_heap(&slice[..k]));

// Scan the vector, "inserting" items into the heap if they are larger
// than the smallest item in the heap. This performs the `k` select operation
for i in k..slice.len() {
if slice[i] > slice[0] {
slice.swap(i, 0);
sift_down(&mut slice[..k], 0);
debug_assert!(check_heap(&slice[..k]));
}
}
}

fn check_heap<T: Ord>(slice: &[T]) -> bool {
for i in 1..slice.len() {
let parent = (i - 1) / 2;
if slice[parent] > slice[i] {
return false;
}
}
true
}

fn sift_down<T: Ord>(slice: &mut [T], mut index: usize) {
while let Some(left) = slice.get(index * 2 + 1) {
let mut smallest = index;
if left < &slice[smallest] {
smallest = index * 2 + 1;
}

if let Some(right) = slice.get(index * 2 + 2) {
if right < &slice[smallest] {
smallest = index * 2 + 2;
}
}

if smallest != index {
slice.swap(smallest, index);
index = smallest;
} else {
break;
}
}
}

#[cfg(test)]
mod tests {
use std::fmt::Debug;

use quickcheck_macros::quickcheck;

use super::bounded_min_heapify;
use super::check_heap;

fn check<T: Ord + Clone + Debug>(mut data: Vec<T>, k: usize) {
let k = k.min(data.len());
let mut cloned = data.clone();
// Stable sort the data
cloned.sort_by(|a, b| b.cmp(&a));

bounded_min_heapify(&mut data, k);

// Take the heap part, and sort it
let top_k = &mut data[..k];

// Check that heap property is maintained, or that k == length of the data
assert!(check_heap(top_k) || k == cloned.len());

top_k.sort_by(|a, b| b.cmp(&a));
assert_eq!(top_k, &mut cloned[..k]);
}

#[quickcheck]
fn run_quickcheck(data: Vec<i32>, k: usize) {
check(data, k);
}

#[test]
fn smoke() {
let asc = (0..500).collect::<Vec<_>>();
let desc = (0..500).rev().collect::<Vec<_>>();
check(asc, 50);
check(desc, 50);
}
}
1 change: 1 addition & 0 deletions crates/sage/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ pub mod database;
pub mod enzyme;
pub mod fasta;
pub mod fdr;
pub mod heap;
pub mod ion_series;
pub mod isotopes;
pub mod lfq;
Expand Down
Loading

0 comments on commit d214225

Please sign in to comment.