
Commit

Merge branch 'hotfix/1.4.4'
Hoohm committed Dec 10, 2020
2 parents 0567d2e + d5bbeb8 commit ff60a07
Showing 6 changed files with 260 additions and 189 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@ dist/
build/
CITE_seq_Count.egg-info/
__pycache__
*.pyc
*.pyc
.vscode
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [1.4.4] - 10.12.2020
### Changed
- Hotfix for issues like #125, where Python 3.8 and above crash on a modified loop.
- Fixed a bug that checked the filename length instead of the number of files.


## [1.4.3] - 05.10.2019
### Added
- Support for multiple files as input. This allows you to not merge different lanes before
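
A note on the second 1.4.4 entry above: the fix is visible in `get_read_paths` in the `preprocessing.py` diff below. The old code compared `len()` of the raw comma-separated argument strings (their character counts), while the new code splits on commas first and compares the number of files. A minimal sketch of the difference, using made-up file names:

```
# Hypothetical inputs, passed the way CITE-seq-Count accepts them: comma-separated strings.
read1_path = "sample_L001_R1.fastq.gz,sample_L002_R1.fastq.gz"
read2_path = "s_L001_R2.fq.gz,s_L002_R2.fq.gz"

# Pre-1.4.4 check: compares character counts of the strings (47 vs 31),
# so it would exit even though both arguments name exactly two files.
print(len(read1_path), len(read2_path))

# 1.4.4 check: split first, then compare the number of files (2 vs 2).
_read1_path = read1_path.split(",")
_read2_path = read2_path.split(",")
print(len(_read1_path), len(_read2_path))
```
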
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ Installation
-------------------------------------------

```
pip install CITE-seq-Count==1.4.3
pip install CITE-seq-Count==1.4.4
```


160 changes: 93 additions & 67 deletions cite_seq_count/preprocessing.py
@@ -7,7 +7,8 @@
from math import floor
from collections import OrderedDict
from itertools import combinations
from itertools import islice
from itertools import islice


def get_indexes(start_index, chunk_size, nth):
"""
@@ -23,7 +24,7 @@ def get_indexes(start_index, chunk_size, nth):
"""
start_index = nth * chunk_size
stop_index = chunk_size + nth * chunk_size
return([start_index,stop_index])
return [start_index, stop_index]


def chunk_reads(n_reads, n):
@@ -36,18 +37,18 @@ def chunk_reads(n_reads, n):
Returns:
indexes (list(list)): Each entry contains the first and the last index for a read.
"""
indexes=list()
indexes = list()
if n_reads % n == 0:
chunk_size = int(n_reads/n)
chunk_size = int(n_reads / n)
rest = 0
else:
chunk_size = floor(n_reads/n)
rest = n_reads - (n*chunk_size)
for i in range(0,n):
chunk_size = floor(n_reads / n)
rest = n_reads - (n * chunk_size)
for i in range(0, n):
indexes.append(get_indexes(i, chunk_size, i))
indexes[-1][1] += rest
return(indexes)
return indexes


def parse_whitelist_csv(filename, barcode_length, collapsing_threshold):
"""Reads white-listed barcodes from a CSV file.
@@ -66,18 +67,27 @@ def parse_whitelist_csv(filename, barcode_length, collapsing_threshold):
"""
STRIP_CHARS = '"0123456789- \t\n'
with open(filename, mode='r') as csv_file:
with open(filename, mode="r") as csv_file:
csv_reader = csv.reader(csv_file)
cell_pattern = regex.compile(r'[ATGC]{{{}}}'.format(barcode_length))
whitelist = [row[0].strip(STRIP_CHARS) for row in csv_reader
if (len(row[0].strip(STRIP_CHARS)) == barcode_length)]
cell_pattern = regex.compile(r"[ATGC]{{{}}}".format(barcode_length))
whitelist = [
row[0].strip(STRIP_CHARS)
for row in csv_reader
if (len(row[0].strip(STRIP_CHARS)) == barcode_length)
]
for cell_barcode in whitelist:
if not cell_pattern.match(cell_barcode):
sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode))
#collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold)
sys.exit(
"This barcode {} is not only composed of ATGC bases.".format(
cell_barcode
)
)
# collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold)
if len(whitelist) == 0:
sys.exit('Please check cell barcode indexes -cbs, -cbl because none of the given whitelist is valid.')
return(set(whitelist), collapsing_threshold)
sys.exit(
"Please check cell barcode indexes -cbs, -cbl because none of the given whitelist is valid."
)
return (set(whitelist), collapsing_threshold)


def test_cell_distances(whitelist, collapsing_threshold):
@@ -95,17 +105,21 @@ def test_cell_distances(whitelist, collapsing_threshold):
"""
ok = False
while not ok:
print('Testing cell barcode collapsing threshold of {}'.format(collapsing_threshold))
print(
"Testing cell barcode collapsing threshold of {}".format(
collapsing_threshold
)
)
all_comb = combinations(whitelist, 2)
for comb in all_comb:
if Levenshtein.hamming(comb[0], comb[1]) <= collapsing_threshold:
collapsing_threshold -= 1
print('Value is too high, reducing it by 1')
print("Value is too high, reducing it by 1")
break
else:
ok = True
print('Using {} for cell barcode collapsing threshold'.format(collapsing_threshold))
return(collapsing_threshold)
print("Using {} for cell barcode collapsing threshold".format(collapsing_threshold))
return collapsing_threshold


def parse_tags_csv(filename):
@@ -124,12 +138,12 @@ def parse_tags_csv(filename):
dict: A dictionary containing the TAGs and their names.
"""
with open(filename, mode='r') as csv_file:
with open(filename, mode="r") as csv_file:
csv_reader = csv.reader(csv_file)
tags = {}
for row in csv_reader:
tags[row[0].strip()] = row[1].strip()
return(tags)
return tags


def check_tags(tags, maximum_distance):
@@ -154,36 +168,41 @@ def check_tags(tags, maximum_distance):
"""
ordered_tags = OrderedDict()
for tag in sorted(tags, key=len, reverse=True):
ordered_tags[tag] = tags[tag] + '-' + tag
ordered_tags[tag] = tags[tag] + "-" + tag
# If only one TAG is provided, then no distances to compare.
if (len(tags) == 1):
return(ordered_tags)
if len(tags) == 1:
return ordered_tags

offending_pairs = []
for a, b in combinations(ordered_tags.keys(), 2):
distance = Levenshtein.distance(a, b)
if (distance <= (maximum_distance - 1)):
if distance <= (maximum_distance - 1):
offending_pairs.append([a, b, distance])
DNA_pattern = regex.compile('^[ATGC]*$')
DNA_pattern = regex.compile("^[ATGC]*$")
for tag in tags:
if not DNA_pattern.match(tag):
print('This tag {} is not only composed of ATGC bases.\nPlease check your tags file'.format(tag))
sys.exit('Exiting the application.\n')
print(
"This tag {} is not only composed of ATGC bases.\nPlease check your tags file".format(
tag
)
)
sys.exit("Exiting the application.\n")
# If offending pairs are found, print them all.
if offending_pairs:
print(
'[ERROR] Minimum Levenshtein distance of TAGs barcode is less '
'than given threshold.\n'
'Please use a smaller distance.\n\n'
'Offending case(s):\n'
"[ERROR] Minimum Levenshtein distance of TAGs barcode is less "
"than given threshold.\n"
"Please use a smaller distance.\n\n"
"Offending case(s):\n"
)
for pair in offending_pairs:
print(
'\t{tag1}\n\t{tag2}\n\tDistance = {distance}\n'
.format(tag1=pair[0], tag2=pair[1], distance=pair[2])
"\t{tag1}\n\t{tag2}\n\tDistance = {distance}\n".format(
tag1=pair[0], tag2=pair[1], distance=pair[2]
)
)
sys.exit('Exiting the application.\n')
return(ordered_tags)
sys.exit("Exiting the application.\n")
return ordered_tags


def get_read_length(filename):
@@ -197,18 +216,18 @@ def get_read_length(filename):
int: The file's SEQUENCE length.
"""
with gzip.open(filename, 'r') as fastq_file:
with gzip.open(filename, "r") as fastq_file:
secondlines = islice(fastq_file, 1, 1000, 4)
temp_length = len(next(secondlines).rstrip())
for sequence in secondlines:
read_length = len(sequence.rstrip())
if (temp_length != read_length):
if temp_length != read_length:
sys.exit(
'[ERROR] Sequence length in {} is not consistent. Please, trim all '
'sequences at the same length.\n'
'Exiting the application.\n'.format(filename)
"[ERROR] Sequence length in {} is not consistent. Please, trim all "
"sequences at the same length.\n"
"Exiting the application.\n".format(filename)
)
return(read_length)
return read_length


def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last):
@@ -235,19 +254,20 @@ def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last)

if barcode_umi_length > read1_length:
sys.exit(
'[ERROR] Read1 length is shorter than the option you are using for '
'Cell and UMI barcodes length. Please, check your options and rerun.\n\n'
'Exiting the application.\n'
"[ERROR] Read1 length is shorter than the option you are using for "
"Cell and UMI barcodes length. Please, check your options and rerun.\n\n"
"Exiting the application.\n"
)
elif barcode_umi_length < read1_length:
print(
'[WARNING] Read1 length is {}bp but you are using {}bp for Cell '
'and UMI barcodes combined.\nThis might lead to wrong cell '
'attribution and skewed umi counts.\n'
.format(read1_length, barcode_umi_length)
"[WARNING] Read1 length is {}bp but you are using {}bp for Cell "
"and UMI barcodes combined.\nThis might lead to wrong cell "
"attribution and skewed umi counts.\n".format(
read1_length, barcode_umi_length
)
)
return(barcode_slice, umi_slice, barcode_umi_length)

return (barcode_slice, umi_slice, barcode_umi_length)


def blocks(files, size=65536):
@@ -264,7 +284,8 @@ def blocks(files, size=65536):
"""
while True:
b = files.read(size)
if not b: break
if not b:
break
yield b


@@ -280,13 +301,15 @@ def get_n_lines(file_path):
Returns:
n_lines (int): Number of lines in the file
"""
print('Counting number of reads')
with gzip.open(file_path, "rt",encoding="utf-8",errors='ignore') as f:
print("Counting number of reads")
with gzip.open(file_path, "rt", encoding="utf-8", errors="ignore") as f:
n_lines = sum(bl.count("\n") for bl in blocks(f))
if n_lines %4 !=0:
sys.exit('{}\'s number of lines is not a multiple of 4. The file '
'might be corrupted.\n Exiting'.format(file_path))
return(n_lines)
if n_lines % 4 != 0:
sys.exit(
"{}'s number of lines is not a multiple of 4. The file "
"might be corrupted.\n Exiting".format(file_path)
)
return n_lines


def get_read_paths(read1_path, read2_path):
@@ -301,9 +324,12 @@ def get_read_paths(read1_path, read2_path):
_read1_path (list(string)): list of paths to read1.fq
_read2_path (list(string)): list of paths to read2.fq
"""
_read1_path = read1_path.split(',')
_read2_path = read2_path.split(',')
if len(read1_path) != len(read2_path):
sys.exit('Unequal number of read1 ({}) and read2({}) files provided'
'\n Exiting'.format(len(read1_path),len(read2_path)))
return(_read1_path, _read2_path)
_read1_path = read1_path.split(",")
_read2_path = read2_path.split(",")
if len(_read1_path) != len(_read2_path):
sys.exit(
"Unequal number of read1 ({}) and read2({}) files provided"
"\n Exiting".format(len(_read1_path), len(_read2_path))
)
return (_read1_path, _read2_path)
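
For orientation when reading the chunking hunks above: `chunk_reads` splits the total read count into per-worker index ranges, with the last chunk absorbing any remainder. A small usage sketch, assuming the package from this commit is installed so the module shown in the diff can be imported:

```
# Assumes `pip install CITE-seq-Count==1.4.4` has been run.
from cite_seq_count.preprocessing import chunk_reads

# 10 reads over 4 chunks: floor(10 / 4) = 2 reads per chunk,
# and the last chunk picks up the 2 leftover reads.
print(chunk_reads(10, 4))  # [[0, 2], [2, 4], [4, 6], [6, 10]]
```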
