Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Sep 7, 2024
0 parents commit 761e82c
Show file tree
Hide file tree
Showing 11 changed files with 597 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.ipynb filter=lfs diff=lfs merge=lfs -text
72 changes: 72 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/target

# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
include/
man/
venv/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-selfcheck.json

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

.DS_Store

# Sphinx documentation
docs/_build/

# PyCharm
.idea/

# VSCode
.vscode/

# Pyenv
.python-version
Empty file added README.md
Empty file.
11 changes: 11 additions & 0 deletions precellar/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "precellar"
version = "0.1.0"
edition = "2021"

[dependencies]
bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "1eb8967f4952337c9ba4955242db2d485843f7c5" }
noodles = { version = "0.80", features = ["core", "fastq", "bam", "sam", "async"] }
yaml-rust = "0.4"
anyhow = "1.0"
flate2 = "1.0"
51 changes: 51 additions & 0 deletions precellar/src/align.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use crate::seqspec::SeqSpec;

use noodles::{sam, fastq};
use noodles::sam::alignment::{
Record, record_buf::RecordBuf, record::data::field::tag::Tag,
};
use noodles::sam::alignment::record_buf::data::field::value::Value;

pub trait Alinger {
fn align_reads<'a, I>(&'a self, records: I) -> impl Iterator<Item = sam::Record> + '_
where I: Iterator<Item = fastq::Record> + 'a;

fn align_read_pairs<'a, I>(&'a self, records: I) -> impl Iterator<Item = (sam::Record, sam::Record)> + '_
where I: Iterator<Item = (fastq::Record, fastq::Record)> + 'a;
}

pub struct FastqProcessor<A> {
seqspec: SeqSpec,
aligner: A,
read1: Option<String>,
read2: Option<String>,
}

impl<A: Alinger> FastqProcessor<A> {
pub fn new(seqspec: SeqSpec, aligner: A) -> Self {
Self { seqspec, aligner, read1: None, read2: None }
}
}

pub fn add_cell_barcode<R: Record>(
header: &sam::Header,
record: &R,
ori_barcode: &str,
ori_qual: &[u8],
correct_barcode: Option<&str>,
) -> std::io::Result<RecordBuf> {
let mut record_buf = RecordBuf::try_from_alignment_record(header, record)?;
let data = record_buf.data_mut();
data.insert(Tag::CELL_BARCODE_SEQUENCE, Value::String(ori_barcode.into()));
data.insert(Tag::CELL_BARCODE_QUALITY_SCORES, Value::String(ori_qual.into()));
if let Some(barcode) = correct_barcode {
data.insert(Tag::CELL_BARCODE_ID, Value::String(barcode.into()));
}
Ok(record_buf)
}

fn slice_fastq_record(mut record: fastq::Record, start: usize, end: usize) -> fastq::Record {
*record.sequence_mut() = record.sequence()[start..end].to_vec();
*record.quality_scores_mut() = record.quality_scores()[start..end].to_vec();
record
}
160 changes: 160 additions & 0 deletions precellar/src/barcode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
use core::f64;
use std::collections::HashMap;

const BC_MAX_QV: u8 = 66; // This is the illumina quality value
pub(crate) const BASE_OPTS: [u8; 4] = [b'A', b'C', b'G', b'T'];

pub struct Whitelist {
whitelist_exists: bool,
barcode_counts: HashMap<String, usize>,
mismatch_count: usize,
total_count: usize,
total_base_count: u64,
q30_base_count: u64,
base_qual_sum: i64,
}

impl Whitelist {
pub fn empty() -> Self {
Self {
whitelist_exists: false,
barcode_counts: HashMap::new(),
mismatch_count: 0,
total_count: 0,
total_base_count: 0,
q30_base_count: 0,
base_qual_sum: 0,
}
}

pub fn new<I: IntoIterator<Item = S>, S: ToString>(iter: I) -> Self {
let mut whitelist = Self::empty();
whitelist.whitelist_exists = true;
whitelist.barcode_counts = iter.into_iter().map(|x| (x.to_string(), 0)).collect();
whitelist
}

/// Update the barcode counter with a barcode and its quality scores.
pub fn count_barcode(&mut self, barcode: &str, barcode_qual: &[u8]) {
if self.whitelist_exists {
if let Some(count) = self.barcode_counts.get_mut(barcode) {
*count += 1;
} else {
self.mismatch_count += 1;
}
} else if barcode.len() > 1 {
*self.barcode_counts.entry(barcode.to_string()).or_insert(0) += 1;
} else {
self.mismatch_count += 1;
}

self.total_count += 1;

for &qual in barcode_qual {
let qual_int = (qual as u32) - 33;
self.base_qual_sum += qual_int as i64;
if qual_int >= 30 {
self.q30_base_count += 1;
}
self.total_base_count += 1;
}
}

pub fn get_barcode_counts(&self) -> &HashMap<String, usize> {
&self.barcode_counts
}

pub fn get_mean_base_quality_score(&self) -> f64 {
if self.total_base_count <= 0 {
0.0
} else {
self.base_qual_sum as f64 / self.total_base_count as f64
}
}
}

/// A barcode validator that uses a barcode counter to validate barcodes.
pub struct BarcodeCorrector {
/// threshold for sum of probability of error on barcode QVs. Barcodes exceeding
/// this threshold will be marked as not valid.
max_expected_errors: f64,
/// if the posterior probability of a correction
/// exceeds this threshold, the barcode will be corrected.
bc_confidence_threshold: f64,
}

impl Default for BarcodeCorrector {
fn default() -> Self {
Self {
max_expected_errors: f64::MAX,
bc_confidence_threshold: 0.975,
}
}
}

#[derive(Copy, Clone, Debug)]
pub enum BarcodeError {
ExceedExpectedError(f64),
LowConfidence(f64),
NoMatch,
}

impl BarcodeCorrector {
/// Determine if a barcode is valid. A barcode is valid if any of the following conditions are met:
/// 1) It is in the whitelist and the number of expected errors is less than the max_expected_errors.
/// 2) It is not in the whitelist, but the number of expected errors is less than the max_expected_errors and the corrected barcode is in the whitelist.
/// 3) If the whitelist does not exist, the barcode is always valid.
///
/// Return the corrected barcode
pub fn correct(&self, barcode_counts: &HashMap<String, usize>, barcode: &str, qual: &[u8]) -> Result<String, BarcodeError> {
let expected_errors: f64 = qual.iter().map(|&q| probability(q)).sum();
if expected_errors >= self.max_expected_errors {
return Err(BarcodeError::ExceedExpectedError(expected_errors));
}
if barcode_counts.contains_key(barcode) {
return Ok(barcode.to_string());
}

let mut best_option = None;
let mut total_likelihood = 0.0;
let mut barcode_bytes = barcode.as_bytes().to_vec();
for (pos, &qv) in qual.iter().enumerate() {
let qv = qv.min(BC_MAX_QV);
let existing = barcode_bytes[pos];
for val in BASE_OPTS {
if val != existing {
barcode_bytes[pos] = val;
let bc = std::str::from_utf8(&barcode_bytes).unwrap();
if let Some(raw_count) = barcode_counts.get(bc) {
let bc_count = 1 + raw_count;
let prob_edit = probability(qv);
let likelihood = bc_count as f64 * prob_edit;
match best_option {
None => best_option = Some((likelihood, bc.to_string())),
Some(ref old_best) => {
if old_best.0 < likelihood {
best_option = Some((likelihood, bc.to_string()));
}
},
}
total_likelihood += likelihood;
}
}
}
barcode_bytes[pos] = existing;
}

if let Some((best_like, best_bc)) = best_option {
if best_like / total_likelihood >= self.bc_confidence_threshold {
return Ok(best_bc)
}
}
Err(BarcodeError::NoMatch)
}
}

/// Convert quality scores to base-calling error probabilities.
fn probability(qual: u8) -> f64 {
let offset = 33.0;
10f64.powf(-((qual as f64 - offset) / 10.0))
}
3 changes: 3 additions & 0 deletions precellar/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pub mod barcode;
pub mod align;
pub mod seqspec;
Loading

0 comments on commit 761e82c

Please sign in to comment.