
Commit

Merge branch 'hotfix/1.4.4'
Hoohm committed Dec 10, 2020
2 parents 0567d2e + d5bbeb8 commit ff60a07
Showing 6 changed files with 260 additions and 189 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@ dist/
build/
CITE_seq_Count.egg-info/
__pycache__
*.pyc
*.pyc
.vscode
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [1.4.4] - 10.12.2020
### Changed
- Hotfix for issues like #125, where Python 3.8 and above crash on a modified loop.
- Fixed a bug that checked the filename length instead of the number of files.


## [1.4.3] - 05.10.2019
### Added
- Support for multiple files as input. This allows you to not merge different lanes before
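
A note on the second 1.4.4 entry above: the fix is visible in `get_read_paths` in the `preprocessing.py` diff below. The old code compared `len()` of the raw comma-separated argument strings (their character counts), while the new code splits on commas first and compares the number of files. A minimal sketch of the difference, using made-up file names:

```
# Hypothetical inputs, passed the way CITE-seq-Count accepts them: comma-separated strings.
read1_path = "sample_L001_R1.fastq.gz,sample_L002_R1.fastq.gz"
read2_path = "s_L001_R2.fq.gz,s_L002_R2.fq.gz"

# Pre-1.4.4 check: compares character counts of the strings (47 vs 31),
# so it would exit even though both arguments name exactly two files.
print(len(read1_path), len(read2_path))

# 1.4.4 check: split first, then compare the number of files (2 vs 2).
_read1_path = read1_path.split(",")
_read2_path = read2_path.split(",")
print(len(_read1_path), len(_read2_path))
```
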
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ Installation
-------------------------------------------

```
pip install CITE-seq-Count==1.4.3
pip install CITE-seq-Count==1.4.4
```


160 changes: 93 additions & 67 deletions cite_seq_count/preprocessing.py
@@ -7,7 +7,8 @@
from math import floor
from collections import OrderedDict
from itertools import combinations
from itertools import islice
from itertools import islice


def get_indexes(start_index, chunk_size, nth):
"""
@@ -23,7 +24,7 @@ def get_indexes(start_index, chunk_size, nth):
"""
start_index = nth * chunk_size
stop_index = chunk_size + nth * chunk_size
return([start_index,stop_index])
return [start_index, stop_index]


def chunk_reads(n_reads, n):
@@ -36,18 +37,18 @@ def chunk_reads(n_reads, n):
Returns:
indexes (list(list)): Each entry contains the first and the last index for a read.
"""
indexes=list()
indexes = list()
if n_reads % n == 0:
chunk_size = int(n_reads/n)
chunk_size = int(n_reads / n)
rest = 0
else:
chunk_size = floor(n_reads/n)
rest = n_reads - (n*chunk_size)
for i in range(0,n):
chunk_size = floor(n_reads / n)
rest = n_reads - (n * chunk_size)
for i in range(0, n):
indexes.append(get_indexes(i, chunk_size, i))
indexes[-1][1] += rest
return(indexes)
return indexes


def parse_whitelist_csv(filename, barcode_length, collapsing_threshold):
"""Reads white-listed barcodes from a CSV file.
@@ -66,18 +67,27 @@ def parse_whitelist_csv(filename, barcode_length, collapsing_threshold):
"""
STRIP_CHARS = '"0123456789- \t\n'
with open(filename, mode='r') as csv_file:
with open(filename, mode="r") as csv_file:
csv_reader = csv.reader(csv_file)
cell_pattern = regex.compile(r'[ATGC]{{{}}}'.format(barcode_length))
whitelist = [row[0].strip(STRIP_CHARS) for row in csv_reader
if (len(row[0].strip(STRIP_CHARS)) == barcode_length)]
cell_pattern = regex.compile(r"[ATGC]{{{}}}".format(barcode_length))
whitelist = [
row[0].strip(STRIP_CHARS)
for row in csv_reader
if (len(row[0].strip(STRIP_CHARS)) == barcode_length)
]
for cell_barcode in whitelist:
if not cell_pattern.match(cell_barcode):
sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode))
#collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold)
sys.exit(
"This barcode {} is not only composed of ATGC bases.".format(
cell_barcode
)
)
# collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold)
if len(whitelist) == 0:
sys.exit('Please check cell barcode indexes -cbs, -cbl because none of the given whitelist is valid.')
return(set(whitelist), collapsing_threshold)
sys.exit(
"Please check cell barcode indexes -cbs, -cbl because none of the given whitelist is valid."
)
return (set(whitelist), collapsing_threshold)


def test_cell_distances(whitelist, collapsing_threshold):
@@ -95,17 +105,21 @@ def test_cell_distances(whitelist, collapsing_threshold):
"""
ok = False
while not ok:
print('Testing cell barcode collapsing threshold of {}'.format(collapsing_threshold))
print(
"Testing cell barcode collapsing threshold of {}".format(
collapsing_threshold
)
)
all_comb = combinations(whitelist, 2)
for comb in all_comb:
if Levenshtein.hamming(comb[0], comb[1]) <= collapsing_threshold:
collapsing_threshold -= 1
print('Value is too high, reducing it by 1')
print("Value is too high, reducing it by 1")
break
else:
ok = True
print('Using {} for cell barcode collapsing threshold'.format(collapsing_threshold))
return(collapsing_threshold)
print("Using {} for cell barcode collapsing threshold".format(collapsing_threshold))
return collapsing_threshold


def parse_tags_csv(filename):
@@ -124,12 +138,12 @@ def parse_tags_csv(filename):
dict: A dictionary containing the TAGs and their names.
"""
with open(filename, mode='r') as csv_file:
with open(filename, mode="r") as csv_file:
csv_reader = csv.reader(csv_file)
tags = {}
for row in csv_reader:
tags[row[0].strip()] = row[1].strip()
return(tags)
return tags


def check_tags(tags, maximum_distance):
@@ -154,36 +168,41 @@ def check_tags(tags, maximum_distance):
"""
ordered_tags = OrderedDict()
for tag in sorted(tags, key=len, reverse=True):
ordered_tags[tag] = tags[tag] + '-' + tag
ordered_tags[tag] = tags[tag] + "-" + tag
# If only one TAG is provided, then no distances to compare.
if (len(tags) == 1):
return(ordered_tags)
if len(tags) == 1:
return ordered_tags

offending_pairs = []
for a, b in combinations(ordered_tags.keys(), 2):
distance = Levenshtein.distance(a, b)
if (distance <= (maximum_distance - 1)):
if distance <= (maximum_distance - 1):
offending_pairs.append([a, b, distance])
DNA_pattern = regex.compile('^[ATGC]*$')
DNA_pattern = regex.compile("^[ATGC]*$")
for tag in tags:
if not DNA_pattern.match(tag):
print('This tag {} is not only composed of ATGC bases.\nPlease check your tags file'.format(tag))
sys.exit('Exiting the application.\n')
print(
"This tag {} is not only composed of ATGC bases.\nPlease check your tags file".format(
tag
)
)
sys.exit("Exiting the application.\n")
# If offending pairs are found, print them all.
if offending_pairs:
print(
'[ERROR] Minimum Levenshtein distance of TAGs barcode is less '
'than given threshold.\n'
'Please use a smaller distance.\n\n'
'Offending case(s):\n'
"[ERROR] Minimum Levenshtein distance of TAGs barcode is less "
"than given threshold.\n"
"Please use a smaller distance.\n\n"
"Offending case(s):\n"
)
for pair in offending_pairs:
print(
'\t{tag1}\n\t{tag2}\n\tDistance = {distance}\n'
.format(tag1=pair[0], tag2=pair[1], distance=pair[2])
"\t{tag1}\n\t{tag2}\n\tDistance = {distance}\n".format(
tag1=pair[0], tag2=pair[1], distance=pair[2]
)
)
sys.exit('Exiting the application.\n')
return(ordered_tags)
sys.exit("Exiting the application.\n")
return ordered_tags


def get_read_length(filename):
@@ -197,18 +216,18 @@ def get_read_length(filename):
int: The file's SEQUENCE length.
"""
with gzip.open(filename, 'r') as fastq_file:
with gzip.open(filename, "r") as fastq_file:
secondlines = islice(fastq_file, 1, 1000, 4)
temp_length = len(next(secondlines).rstrip())
for sequence in secondlines:
read_length = len(sequence.rstrip())
if (temp_length != read_length):
if temp_length != read_length:
sys.exit(
'[ERROR] Sequence length in {} is not consistent. Please, trim all '
'sequences at the same length.\n'
'Exiting the application.\n'.format(filename)
"[ERROR] Sequence length in {} is not consistent. Please, trim all "
"sequences at the same length.\n"
"Exiting the application.\n".format(filename)
)
return(read_length)
return read_length


def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last):
@@ -235,19 +254,20 @@ def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last)

if barcode_umi_length > read1_length:
sys.exit(
'[ERROR] Read1 length is shorter than the option you are using for '
'Cell and UMI barcodes length. Please, check your options and rerun.\n\n'
'Exiting the application.\n'
"[ERROR] Read1 length is shorter than the option you are using for "
"Cell and UMI barcodes length. Please, check your options and rerun.\n\n"
"Exiting the application.\n"
)
elif barcode_umi_length < read1_length:
print(
'[WARNING] Read1 length is {}bp but you are using {}bp for Cell '
'and UMI barcodes combined.\nThis might lead to wrong cell '
'attribution and skewed umi counts.\n'
.format(read1_length, barcode_umi_length)
"[WARNING] Read1 length is {}bp but you are using {}bp for Cell "
"and UMI barcodes combined.\nThis might lead to wrong cell "
"attribution and skewed umi counts.\n".format(
read1_length, barcode_umi_length
)
)
return(barcode_slice, umi_slice, barcode_umi_length)

return (barcode_slice, umi_slice, barcode_umi_length)


def blocks(files, size=65536):
@@ -264,7 +284,8 @@ def blocks(files, size=65536):
"""
while True:
b = files.read(size)
if not b: break
if not b:
break
yield b


@@ -280,13 +301,15 @@ def get_n_lines(file_path):
Returns:
n_lines (int): Number of lines in the file
"""
print('Counting number of reads')
with gzip.open(file_path, "rt",encoding="utf-8",errors='ignore') as f:
print("Counting number of reads")
with gzip.open(file_path, "rt", encoding="utf-8", errors="ignore") as f:
n_lines = sum(bl.count("\n") for bl in blocks(f))
if n_lines %4 !=0:
sys.exit('{}\'s number of lines is not a multiple of 4. The file '
'might be corrupted.\n Exiting'.format(file_path))
return(n_lines)
if n_lines % 4 != 0:
sys.exit(
"{}'s number of lines is not a multiple of 4. The file "
"might be corrupted.\n Exiting".format(file_path)
)
return n_lines


def get_read_paths(read1_path, read2_path):
@@ -301,9 +324,12 @@ def get_read_paths(read1_path, read2_path):
_read1_path (list(string)): list of paths to read1.fq
_read2_path (list(string)): list of paths to read2.fq
"""
_read1_path = read1_path.split(',')
_read2_path = read2_path.split(',')
if len(read1_path) != len(read2_path):
sys.exit('Unequal number of read1 ({}) and read2({}) files provided'
'\n Exiting'.format(len(read1_path),len(read2_path)))
return(_read1_path, _read2_path)
_read1_path = read1_path.split(",")
_read2_path = read2_path.split(",")
if len(_read1_path) != len(_read2_path):
sys.exit(
"Unequal number of read1 ({}) and read2({}) files provided"
"\n Exiting".format(len(_read1_path), len(_read2_path))
)
return (_read1_path, _read2_path)
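
For orientation when reading the chunking hunks above: `chunk_reads` splits the total read count into per-worker index ranges, with the last chunk absorbing any remainder. A small usage sketch, assuming the package from this commit is installed so the module shown in the diff can be imported:

```
# Assumes `pip install CITE-seq-Count==1.4.4` has been run.
from cite_seq_count.preprocessing import chunk_reads

# 10 reads over 4 chunks: floor(10 / 4) = 2 reads per chunk,
# and the last chunk picks up the 2 leftover reads.
print(chunk_reads(10, 4))  # [[0, 2], [2, 4], [4, 6], [6, 10]]
```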
