-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 761e82c
Showing
11 changed files
with
597 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.ipynb filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
/target | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
.pytest_cache/ | ||
*.py[cod] | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
.venv/ | ||
env/ | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
include/ | ||
man/ | ||
venv/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
pip-selfcheck.json | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
|
||
# Translations | ||
*.mo | ||
|
||
# Mr Developer | ||
.mr.developer.cfg | ||
.project | ||
.pydevproject | ||
|
||
# Rope | ||
.ropeproject | ||
|
||
# Django stuff: | ||
*.log | ||
*.pot | ||
|
||
.DS_Store | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyCharm | ||
.idea/ | ||
|
||
# VSCode | ||
.vscode/ | ||
|
||
# Pyenv | ||
.python-version |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[package] | ||
name = "precellar" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[dependencies] | ||
bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "1eb8967f4952337c9ba4955242db2d485843f7c5" } | ||
noodles = { version = "0.80", features = ["core", "fastq", "bam", "sam", "async"] } | ||
yaml-rust = "0.4" | ||
anyhow = "1.0" | ||
flate2 = "1.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
use crate::seqspec::SeqSpec; | ||
|
||
use noodles::{sam, fastq}; | ||
use noodles::sam::alignment::{ | ||
Record, record_buf::RecordBuf, record::data::field::tag::Tag, | ||
}; | ||
use noodles::sam::alignment::record_buf::data::field::value::Value; | ||
|
||
pub trait Alinger { | ||
fn align_reads<'a, I>(&'a self, records: I) -> impl Iterator<Item = sam::Record> + '_ | ||
where I: Iterator<Item = fastq::Record> + 'a; | ||
|
||
fn align_read_pairs<'a, I>(&'a self, records: I) -> impl Iterator<Item = (sam::Record, sam::Record)> + '_ | ||
where I: Iterator<Item = (fastq::Record, fastq::Record)> + 'a; | ||
} | ||
|
||
pub struct FastqProcessor<A> { | ||
seqspec: SeqSpec, | ||
aligner: A, | ||
read1: Option<String>, | ||
read2: Option<String>, | ||
} | ||
|
||
impl<A: Alinger> FastqProcessor<A> { | ||
pub fn new(seqspec: SeqSpec, aligner: A) -> Self { | ||
Self { seqspec, aligner, read1: None, read2: None } | ||
} | ||
} | ||
|
||
pub fn add_cell_barcode<R: Record>( | ||
header: &sam::Header, | ||
record: &R, | ||
ori_barcode: &str, | ||
ori_qual: &[u8], | ||
correct_barcode: Option<&str>, | ||
) -> std::io::Result<RecordBuf> { | ||
let mut record_buf = RecordBuf::try_from_alignment_record(header, record)?; | ||
let data = record_buf.data_mut(); | ||
data.insert(Tag::CELL_BARCODE_SEQUENCE, Value::String(ori_barcode.into())); | ||
data.insert(Tag::CELL_BARCODE_QUALITY_SCORES, Value::String(ori_qual.into())); | ||
if let Some(barcode) = correct_barcode { | ||
data.insert(Tag::CELL_BARCODE_ID, Value::String(barcode.into())); | ||
} | ||
Ok(record_buf) | ||
} | ||
|
||
fn slice_fastq_record(mut record: fastq::Record, start: usize, end: usize) -> fastq::Record { | ||
*record.sequence_mut() = record.sequence()[start..end].to_vec(); | ||
*record.quality_scores_mut() = record.quality_scores()[start..end].to_vec(); | ||
record | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
use core::f64; | ||
use std::collections::HashMap; | ||
|
||
const BC_MAX_QV: u8 = 66; // This is the illumina quality value | ||
pub(crate) const BASE_OPTS: [u8; 4] = [b'A', b'C', b'G', b'T']; | ||
|
||
pub struct Whitelist { | ||
whitelist_exists: bool, | ||
barcode_counts: HashMap<String, usize>, | ||
mismatch_count: usize, | ||
total_count: usize, | ||
total_base_count: u64, | ||
q30_base_count: u64, | ||
base_qual_sum: i64, | ||
} | ||
|
||
impl Whitelist { | ||
pub fn empty() -> Self { | ||
Self { | ||
whitelist_exists: false, | ||
barcode_counts: HashMap::new(), | ||
mismatch_count: 0, | ||
total_count: 0, | ||
total_base_count: 0, | ||
q30_base_count: 0, | ||
base_qual_sum: 0, | ||
} | ||
} | ||
|
||
pub fn new<I: IntoIterator<Item = S>, S: ToString>(iter: I) -> Self { | ||
let mut whitelist = Self::empty(); | ||
whitelist.whitelist_exists = true; | ||
whitelist.barcode_counts = iter.into_iter().map(|x| (x.to_string(), 0)).collect(); | ||
whitelist | ||
} | ||
|
||
/// Update the barcode counter with a barcode and its quality scores. | ||
pub fn count_barcode(&mut self, barcode: &str, barcode_qual: &[u8]) { | ||
if self.whitelist_exists { | ||
if let Some(count) = self.barcode_counts.get_mut(barcode) { | ||
*count += 1; | ||
} else { | ||
self.mismatch_count += 1; | ||
} | ||
} else if barcode.len() > 1 { | ||
*self.barcode_counts.entry(barcode.to_string()).or_insert(0) += 1; | ||
} else { | ||
self.mismatch_count += 1; | ||
} | ||
|
||
self.total_count += 1; | ||
|
||
for &qual in barcode_qual { | ||
let qual_int = (qual as u32) - 33; | ||
self.base_qual_sum += qual_int as i64; | ||
if qual_int >= 30 { | ||
self.q30_base_count += 1; | ||
} | ||
self.total_base_count += 1; | ||
} | ||
} | ||
|
||
pub fn get_barcode_counts(&self) -> &HashMap<String, usize> { | ||
&self.barcode_counts | ||
} | ||
|
||
pub fn get_mean_base_quality_score(&self) -> f64 { | ||
if self.total_base_count <= 0 { | ||
0.0 | ||
} else { | ||
self.base_qual_sum as f64 / self.total_base_count as f64 | ||
} | ||
} | ||
} | ||
|
||
/// A barcode validator that uses a barcode counter to validate barcodes. | ||
pub struct BarcodeCorrector { | ||
/// threshold for sum of probability of error on barcode QVs. Barcodes exceeding | ||
/// this threshold will be marked as not valid. | ||
max_expected_errors: f64, | ||
/// if the posterior probability of a correction | ||
/// exceeds this threshold, the barcode will be corrected. | ||
bc_confidence_threshold: f64, | ||
} | ||
|
||
impl Default for BarcodeCorrector { | ||
fn default() -> Self { | ||
Self { | ||
max_expected_errors: f64::MAX, | ||
bc_confidence_threshold: 0.975, | ||
} | ||
} | ||
} | ||
|
||
#[derive(Copy, Clone, Debug)] | ||
pub enum BarcodeError { | ||
ExceedExpectedError(f64), | ||
LowConfidence(f64), | ||
NoMatch, | ||
} | ||
|
||
impl BarcodeCorrector { | ||
/// Determine if a barcode is valid. A barcode is valid if any of the following conditions are met: | ||
/// 1) It is in the whitelist and the number of expected errors is less than the max_expected_errors. | ||
/// 2) It is not in the whitelist, but the number of expected errors is less than the max_expected_errors and the corrected barcode is in the whitelist. | ||
/// 3) If the whitelist does not exist, the barcode is always valid. | ||
/// | ||
/// Return the corrected barcode | ||
pub fn correct(&self, barcode_counts: &HashMap<String, usize>, barcode: &str, qual: &[u8]) -> Result<String, BarcodeError> { | ||
let expected_errors: f64 = qual.iter().map(|&q| probability(q)).sum(); | ||
if expected_errors >= self.max_expected_errors { | ||
return Err(BarcodeError::ExceedExpectedError(expected_errors)); | ||
} | ||
if barcode_counts.contains_key(barcode) { | ||
return Ok(barcode.to_string()); | ||
} | ||
|
||
let mut best_option = None; | ||
let mut total_likelihood = 0.0; | ||
let mut barcode_bytes = barcode.as_bytes().to_vec(); | ||
for (pos, &qv) in qual.iter().enumerate() { | ||
let qv = qv.min(BC_MAX_QV); | ||
let existing = barcode_bytes[pos]; | ||
for val in BASE_OPTS { | ||
if val != existing { | ||
barcode_bytes[pos] = val; | ||
let bc = std::str::from_utf8(&barcode_bytes).unwrap(); | ||
if let Some(raw_count) = barcode_counts.get(bc) { | ||
let bc_count = 1 + raw_count; | ||
let prob_edit = probability(qv); | ||
let likelihood = bc_count as f64 * prob_edit; | ||
match best_option { | ||
None => best_option = Some((likelihood, bc.to_string())), | ||
Some(ref old_best) => { | ||
if old_best.0 < likelihood { | ||
best_option = Some((likelihood, bc.to_string())); | ||
} | ||
}, | ||
} | ||
total_likelihood += likelihood; | ||
} | ||
} | ||
} | ||
barcode_bytes[pos] = existing; | ||
} | ||
|
||
if let Some((best_like, best_bc)) = best_option { | ||
if best_like / total_likelihood >= self.bc_confidence_threshold { | ||
return Ok(best_bc) | ||
} | ||
} | ||
Err(BarcodeError::NoMatch) | ||
} | ||
} | ||
|
||
/// Convert quality scores to base-calling error probabilities. | ||
fn probability(qual: u8) -> f64 { | ||
let offset = 33.0; | ||
10f64.powf(-((qual as f64 - offset) / 10.0)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pub mod barcode; | ||
pub mod align; | ||
pub mod seqspec; |
Oops, something went wrong.