Skip to content

Commit

Permalink
fix some lint
Browse files Browse the repository at this point in the history
  • Loading branch information
Yenaled committed Oct 8, 2024
1 parent f318603 commit 6095416
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 29 deletions.
6 changes: 4 additions & 2 deletions kb_python/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ def get_provided_kallisto_path() -> Optional[str]:
bin_name = 'kallisto'
if '_KALLISTO_OPTOFF' in globals():
if _KALLISTO_OPTOFF:
bin_name=f'{bin_name}_optoff'
bin_name = f'{bin_name}_optoff'
if '_KALLISTO_KMER_64' in globals():
if _KALLISTO_KMER_64:
bin_name=f'{bin_name}_k64'
bin_name = f'{bin_name}_k64'
bin_filename = f'{bin_name}.exe' if PLATFORM == 'windows' else bin_name
path = os.path.join(BINS_DIR, PLATFORM, CPU, 'kallisto', bin_filename)
if not os.path.isfile(path):
Expand All @@ -60,12 +60,14 @@ def get_provided_bustools_path() -> Optional[str]:
return None
return path


def set_special_kallisto_binary(k64: bool, optoff: bool):
    """Record which special kallisto binary variant should be used.

    Both flags are stored in module-level globals; they are not returned.

    Args:
        k64: Value stored in the module global `_KALLISTO_KMER_64`
        optoff: Value stored in the module global `_KALLISTO_OPTOFF`
    """
    global _KALLISTO_KMER_64, _KALLISTO_OPTOFF
    # Plain rebinding of the two module globals; no validation is performed.
    _KALLISTO_KMER_64, _KALLISTO_OPTOFF = k64, optoff


def get_compiled_kallisto_path(alias: str = COMPILED_DIR) -> Optional[str]:
"""Finds platform-dependent kallisto binary compiled with `compile`.
Expand Down
56 changes: 38 additions & 18 deletions kb_python/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,15 @@ def bustools_extract(
run_executable(command)
return {"bus": out_path}


def is_gzipped(file_path):
    """
    Tells whether a file starts with the two-byte gzip magic number.

    Only the first two bytes are read; the file is opened in binary mode
    and the rest of its content is never inspected.

    Args:
        file_path: Path to the file to check

    Returns:
        `True` if the first two bytes are `0x1f 0x8b`, `False` otherwise
        (including files shorter than two bytes)
    """
    gzip_magic = b'\x1f\x8b'
    with open(file_path, 'rb') as handle:
        leading_bytes = handle.read(2)
    return leading_bytes == gzip_magic


def read_headers_from_fastq(fastq_file):
"""
Reads headers from a FASTQ file and returns a set of headers.
Expand Down Expand Up @@ -188,22 +190,22 @@ def remove_mm_from_bus(t2g_path, txnames, temp_dir, bus_in):
ecs_mm, _ = get_mm_ecs(t2g_path, txnames, temp_dir)

if len(ecs_mm) > 0:
## Remove mm ecs from bus file
# Remove mm ecs from bus file
bus_txt = os.path.join(temp_dir, "output.bus.txt")
bus_txt_no_mm = os.path.join(temp_dir, "output_no_mm.bus.txt")
bus_no_mm = os.path.join(temp_dir, "output_no_mm.bus")

# Convert bus to txt file
bustools_text(bus_path=bus_in, out_path=bus_txt, flags=True)

# Remove mm ecs
bus_df = pd.read_csv(bus_txt, sep="\t", header=None)
new_bus_df = bus_df[~bus_df[2].isin(ecs_mm)]
new_bus_df.to_csv(bus_txt_no_mm, sep="\t", index=False, header=None)

# Convert back to bus format
bustools_fromtext(txt_path=bus_txt_no_mm, out_path=bus_no_mm)

logger.debug(
f"BUS file without equivalence classes that map to multiple genes saved at {bus_no_mm}"
)
Expand All @@ -223,7 +225,8 @@ def remove_mm_from_mc(t2g_path, txnames, temp_dir):
ecmap_no_mm = os.path.join(temp_dir, "matrix_no_mm.ec")

logger.debug(
f"Replacing transcript entries with -1 for equivalence classes that map to multiple genes from {os.path.join(temp_dir, 'matrix.ec')}"
"Replacing transcript entries with -1 for equivalence classes "
f"that map to multiple genes from {os.path.join(temp_dir, 'matrix.ec')}"
)

# Get multimapped equivalence classes
Expand All @@ -233,9 +236,10 @@ def remove_mm_from_mc(t2g_path, txnames, temp_dir):
# Replace transcript entries for multimapped equivalence classes with -1
ec_df.loc[ec_df[0].isin(ecs_mm), 1] = -1
ec_df.to_csv(ecmap_no_mm, sep="\t", index=False, header=None)

logger.debug(
f"matrix.ec file where transcript entries were replaced with -1 for equivalence classes that map to multiple genes saved at {ecmap_no_mm}"
"matrix.ec file where transcript entries were replaced with -1 for "
f"equivalence classes that map to multiple genes saved at {ecmap_no_mm}"
)

return ecmap_no_mm
Expand Down Expand Up @@ -271,11 +275,19 @@ def extract(
targets: Gene or transcript names for which to extract the raw reads that align to the index
out_dir: Path to output directory
target_type: 'gene' (default) or 'transcript' -> Defines whether targets are gene or transcript names
extract_all: Extracts reads for all genes or transcripts (as defined in target_type), defaults to `False`. Might take a long time to run when the reference index contains a large number of genes. Set targets = None when using extract_all
extract_all_fast: Extracts all pseudo-aligned reads, defaults to `False`. Does not break down output by gene/transcript. Set targets = None when using extract_all_fast
extract_all_unmapped: Extracts all unmapped reads, defaults to `False`. Set targets = None when using extract_all_unmapped
extract_all: Extracts reads for all genes or transcripts (as defined in target_type), defaults to `False`.
Might take a long time to run when the reference index contains a large number of genes.
Set targets = None when using extract_all
extract_all_fast: Extracts all pseudo-aligned reads, defaults to `False`.
Does not break down output by gene/transcript.
Set targets = None when using extract_all_fast
extract_all_unmapped: Extracts all unmapped reads, defaults to `False`.
Set targets = None when using extract_all_unmapped
mm: Also extract reads that multi-mapped to several genes, defaults to `False`
t2g_path: Path to transcript-to-gene mapping file (required when mm = False, target_type = 'gene' (and extract_all_fast and extract_all_unmapped = False), OR extract_all = True)
t2g_path: Path to transcript-to-gene mapping file
(required when mm = False, target_type = 'gene'
(and extract_all_fast and extract_all_unmapped = False),
OR extract_all = True)
temp_dir: Path to temporary directory, defaults to `tmp`
threads: Number of threads to use, defaults to `8`
aa: Align to index generated from a FASTA-file containing amino acid sequences, defaults to `False`
Expand All @@ -287,19 +299,21 @@ def extract(
"""
if sum([extract_all, extract_all_fast, extract_all_unmapped]) > 1:
raise ValueError(
f"extract_all, extract_all_fast, and/or extract_all_unmapped cannot be used simultaneously"
"extract_all, extract_all_fast, and/or extract_all_unmapped cannot be used simultaneously"
)

if targets is None and not (
extract_all or extract_all_fast or extract_all_unmapped
):
raise ValueError(
f"targets must be provided (unless extract_all, extract_all_fast, or extract_all_unmapped are used to extract all reads)"
"targets must be provided "
"(unless extract_all, extract_all_fast, or extract_all_unmapped are used to extract all reads)"
)

if targets and (extract_all or extract_all_fast or extract_all_unmapped):
logger.warning(
f"targets will be ignored since extract_all, extract_all_fast, or extract_all_unmapped is activated which will extract all reads"
"targets will be ignored since extract_all, extract_all_fast, or extract_all_unmapped "
"is activated which will extract all reads"
)

if target_type not in ["gene", "transcript"]:
Expand All @@ -313,14 +327,16 @@ def extract(
or extract_all
) and (t2g_path is None):
raise ValueError(
"t2g_path must be provided if mm flag is not provided, target_type is 'gene' (and extract_all_fast and extract_all_unmapped are False), OR extract_all is True"
"t2g_path must be provided if mm flag is not provided, target_type is 'gene' "
"(and extract_all_fast and extract_all_unmapped are False), OR extract_all is True"
)

# extract_all_unmapped requires bustools version > 0.43.2 since previous versions have a bug in the output fastq format that changes the sequence headers
bustools_version_tuple = get_bustools_version()
if extract_all_unmapped and not (0, 43, 2) < bustools_version_tuple:
raise ValueError(
f"extract_all_unmapped requires bustools version > 0.43.2. You are currently using bustools version {'.'.join(str(i) for i in bustools_version_tuple)}."
"extract_all_unmapped requires bustools version > 0.43.2. "
f"You are currently using bustools version {'.'.join(str(i) for i in bustools_version_tuple)}."
)

make_directory(out_dir)
Expand Down Expand Up @@ -379,7 +395,11 @@ def extract(
# Save unmapped reads in a separate fastq file
unmapped_fastq = os.path.join(out_dir, "all_unmapped/1.fastq.gz")
mapped_fastq = os.path.join(extract_out_folder, "1.fastq.gz")
extract_matching_reads_by_header(mapped_fastq, fastq[0] if isinstance(fastq, list) else fastq, unmapped_fastq)
extract_matching_reads_by_header(
mapped_fastq,
fastq[0] if isinstance(fastq, list) else fastq,
unmapped_fastq
)

else:
if not mm:
Expand Down
28 changes: 20 additions & 8 deletions kb_python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1597,8 +1597,14 @@ def setup_extract_args(

parser_extract = parser.add_parser(
'extract',
description='Extract sequencing reads that were pseudoaligned to specific genes/transcripts (or extract all reads that were / were not pseudoaligned).',
help='Extract sequencing reads that were pseudoaligned to specific genes/transcripts (or extract all reads that were / were not pseudoaligned)',
description=(
'Extract sequencing reads that were pseudoaligned to specific genes/transcripts '
'(or extract all reads that were / were not pseudoaligned).'
),
help=(
'Extract sequencing reads that were pseudoaligned to specific genes/transcripts '
'(or extract all reads that were / were not pseudoaligned)'
),
parents=[parent]
)
parser_extract._actions[0].help = parser_extract._actions[
Expand All @@ -1611,7 +1617,8 @@ def setup_extract_args(
type=str,
help=(
'Single fastq file containing the sequencing reads (e.g. in case of 10x data, provide the R2 file).'
' Sequencing technology will be treated as bulk here since barcode and UMI tracking is not necessary to extract reads.'
' Sequencing technology will be treated as bulk here since barcode and UMI tracking '
'is not necessary to extract reads.'
)
)
required_extract.add_argument(
Expand Down Expand Up @@ -1643,16 +1650,19 @@ def setup_extract_args(
parser_extract.add_argument(
'--extract_all',
help=(
'Extracts all reads that pseudo-aligned to any gene or transcript (as defined by target_type) (breaks down output by gene/transcript). '
'Using extract_all might take a long time to run when there are a large number of genes/transcripts in the index.'
'Extracts all reads that pseudo-aligned to any gene or transcript (as defined by target_type) '
'(breaks down output by gene/transcript). '
'Using extract_all might take a long time to run when there are a large number of '
'genes/transcripts in the index.'
),
action='store_true',
default=False
)
parser_extract.add_argument(
'--extract_all_fast',
help=(
'Extracts all reads that pseudo-aligned (does not break down output by gene/transcript; output saved in the "all" folder).'
'Extracts all reads that pseudo-aligned (does not break down output by gene/transcript; '
'output saved in the "all" folder).'
),
action='store_true',
default=False
Expand All @@ -1677,7 +1687,9 @@ def setup_extract_args(
'-g',
metavar='T2G',
help=(
'Path to transcript-to-gene mapping file (required when mm = False, target_type = "gene" (and extract_all_fast and extract_all_unmapped = False), OR extract_all = True).'
'Path to transcript-to-gene mapping file '
'(required when mm = False, target_type = "gene" '
'(and extract_all_fast and extract_all_unmapped = False), OR extract_all = True).'
),
type=str,
)
Expand Down Expand Up @@ -1837,7 +1849,7 @@ def main():
# Set binary paths
if args.command in ('ref', 'count', 'extract') and ('dry_run' not in args
or not args.dry_run):

use_kmer64 = False
opt_off = False
if args.k and args.k > 32:
Expand Down
2 changes: 1 addition & 1 deletion kb_python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def overlay_anndatas(
ambiguous_intersection = adata_ambiguous[obs_idx][:, var_idx]
a_layers.update({'ambiguous': ambiguous_intersection.X})
sum_X = sum_X + ambiguous_intersection.X

df_obs = unspliced_intersection.obs
df_var = unspliced_intersection.var
return anndata.AnnData(
Expand Down

0 comments on commit 6095416

Please sign in to comment.