Skip to content

Commit

Permalink
refactoring: python project tree reorganization
Browse files (browse the repository at this point in the history)
  • Loading branch information
mikolmogorov committed Aug 22, 2018
1 parent 3a4134f commit 57d6e11
Show file tree
Hide file tree
Showing 27 changed files with 82 additions and 84 deletions.
Empty file added flye/assembly/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion flye/assemble.py → flye/assembly/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import logging
import os

from flye.utils import which
from flye.utils.utils import which

ASSEMBLE_BIN = "flye-assemble"
logger = logging.getLogger()
Expand Down
2 changes: 1 addition & 1 deletion flye/repeat_graph.py → flye/assembly/repeat_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import logging
import os

from flye.utils import which
from flye.utils.utils import which

REPEAT_BIN = "flye-repeat"
logger = logging.getLogger()
Expand Down
8 changes: 4 additions & 4 deletions flye/scaffolder.py → flye/assembly/scaffolder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import sys
import logging

import flye.fasta_parser as fp
import flye.config as config
import flye.utils.fasta_parser as fp
import flye.config.py_cfg as cfg

logger = logging.getLogger()

Expand Down Expand Up @@ -68,7 +68,7 @@ def generate_scaffolds(contigs_file, links_file, out_scaffolds):
else:
scf_seq.append(fp.reverse_complement(
contigs_fasta[unsigned(scf_ctg)]))
gap = "N" * config.vals["scaffold_gap"]
gap = "N" * cfg.vals["scaffold_gap"]
scaffolds_fasta[scf_name] = gap.join(scf_seq)

fp.write_fasta_dict(scaffolds_fasta, out_scaffolds)
Expand Down Expand Up @@ -124,7 +124,7 @@ def generate_stats(repeat_file, polished_file, scaffolds, out_stats):
scaffolds_stats[scf] = SeqStats(scf)
scf_length = sum(map(lambda c: int(contigs_stats[unsigned(c)].length),
scf_seq))
scf_length += (len(scf_seq) - 1) * config.vals["scaffold_gap"]
scf_length += (len(scf_seq) - 1) * cfg.vals["scaffold_gap"]
scaffolds_stats[scf].length = str(scf_length)

scf_cov = _mean(map(lambda c: int(contigs_stats[unsigned(c)].coverage),
Expand Down
Empty file added flye/config/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added flye/config/configurator.py
Empty file.
29 changes: 17 additions & 12 deletions flye/config.py → flye/config/py_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,52 @@
#Released under the BSD license (see LICENSE file)

"""
File with configurations
Configuration file for the Python part of the pipeline
"""

import os

vals = {
"raw_cfg" : "asm_raw_reads.cfg",
"corrected_cfg" : "asm_corrected_reads.cfg",
"subasm_cfg" : "asm_subasm.cfg",
"pkg_root" : os.path.dirname(os.path.dirname(os.path.abspath(__file__))),

"raw_cfg" : "config/bin_cfg/asm_raw_reads.cfg",
"corrected_cfg" : "config/bin_cfg/asm_corrected_reads.cfg",
"subasm_cfg" : "config/bin_cfg/asm_subasm.cfg",

"simple_kmer_length" : 4,
"solid_kmer_length" : 10,
"max_bubble_length" : 500,
"max_bubble_branches" : 50,
"min_aln_rate" : 0.50,
"read_aln_overhang" : 100,

"scaffold_gap" : 100,

"err_modes" : {
"pacbio" : {
"subs_matrix" : "pacbio_substitutions.mat",
"hopo_matrix" : "pacbio_homopolymers.mat",
"subs_matrix" : "config/bin_cfg/pacbio_substitutions.mat",
"hopo_matrix" : "config/bin_cfg/pacbio_homopolymers.mat",
"solid_missmatch" : 0.2,
"solid_indel" : 0.2,
"max_aln_error" : 0.25
},
"nano" : {
"subs_matrix" : "nano_substitutions.mat",
"hopo_matrix" : "nano_homopolymers.mat",
"subs_matrix" : "config/bin_cfg/nano_substitutions.mat",
"hopo_matrix" : "config/bin_cfg/nano_homopolymers.mat",
"solid_missmatch" : 0.3,
"solid_indel" : 0.3,
"max_aln_error" : 0.3
},
"pacbio_hi_err" : {
"subs_matrix" : "p6c4_substitutions.mat",
"hopo_matrix" : "p6c4_homopolymers.mat",
"subs_matrix" : "config/bin_cfg/p6c4_substitutions.mat",
"hopo_matrix" : "config/bin_cfg/p6c4_homopolymers.mat",
"solid_missmatch" : 0.25,
"solid_indel" : 0.25,
"max_aln_error" : 0.3
},
"subasm" : {
"subs_matrix" : "pacbio_substitutions.mat",
"hopo_matrix" : "pacbio_homopolymers.mat",
"subs_matrix" : "config/bin_cfg/pacbio_substitutions.mat",
"hopo_matrix" : "config/bin_cfg/pacbio_homopolymers.mat",
"solid_missmatch" : 0.2,
"solid_indel" : 0.2,
"max_aln_error" : 0.25
Expand Down
56 changes: 25 additions & 31 deletions flye/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@
import shutil
import subprocess

import flye.alignment as aln
import flye.bubbles as bbl
import flye.polish as pol
import flye.fasta_parser as fp
import flye.assemble as asm
import flye.repeat_graph as repeat
import flye.consensus as cons
import flye.scaffolder as scf
import flye.polishing.alignment as aln
import flye.polishing.bubbles as bbl
import flye.polishing.polish as pol
import flye.polishing.consensus as cons
import flye.assembly.assemble as asm
import flye.assembly.repeat_graph as repeat
import flye.assembly.scaffolder as scf
from flye.__version__ import __version__
import flye.config as config
from flye.bytes2human import human2bytes
import flye.config.py_cfg as cfg
from flye.utils.bytes2human import human2bytes
import flye.utils.fasta_parser as fp

logger = logging.getLogger()

Expand Down Expand Up @@ -180,7 +180,7 @@ def run(self):
consensus_fasta = cons.get_consensus(out_alignment, self.in_contigs,
contigs_info, self.args.threads,
self.args.platform,
config.vals["min_aln_rate"])
cfg.vals["min_aln_rate"])
fp.write_fasta_dict(consensus_fasta, self.out_consensus)


Expand Down Expand Up @@ -224,7 +224,7 @@ def run(self):
coverage_stats = \
bbl.make_bubbles(alignment_file, contigs_info, prev_assembly,
self.args.platform, self.args.threads,
config.vals["min_aln_rate"], bubbles_file)
cfg.vals["min_aln_rate"], bubbles_file)

logger.info("Correcting bubbles")
polished_file = os.path.join(self.polishing_dir,
Expand Down Expand Up @@ -293,14 +293,12 @@ def _set_kmer_size(args):


def _set_read_attributes(args):
root = os.path.dirname(__file__)
if args.read_type == "raw":
args.asm_config = os.path.join(root, "resource", config.vals["raw_cfg"])
args.asm_config = os.path.join(cfg.vals["pkg_root"], cfg.vals["raw_cfg"])
elif args.read_type == "corrected":
args.asm_config = os.path.join(root, "resource",
config.vals["corrected_cfg"])
args.asm_config = os.path.join(cfg.vals["pkg_root"], cfg.vals["corrected_cfg"])
elif args.read_type == "subasm":
args.asm_config = os.path.join(root, "resource", config.vals["subasm_cfg"])
args.asm_config = os.path.join(cfg.vals["pkg_root"], cfg.vals["subasm_cfg"])


def _run(args):
Expand Down Expand Up @@ -374,9 +372,9 @@ def _enable_logging(log_file, debug, overwrite):
def _usage():
return ("flye (--pacbio-raw | --pacbio-corr | --nano-raw |\n"
"\t --nano-corr | --subassemblies) file1 [file_2 ...]\n"
"\t --genome-size size --out-dir dir_path [--threads int]\n"
"\t [--iterations int] [--min-overlap int] [--resume]\n"
"\t [--debug] [--version] [--help]")
"\t --genome-size SIZE --out-dir PATH\n"
"\t [--threads int] [--iterations int] [--min-overlap int]\n"
"\t [--debug] [--version] [--help] [--resume]")


def _epilog():
Expand All @@ -389,10 +387,8 @@ def _epilog():
"files with reads (separated by spaces). Mixing different read\n"
"types is not yet supported.\n\n"
"You must provide an estimate of the genome size as input,\n"
"which is used for solid k-mers selection. The estimate could\n"
"be rough (e.g. withing 0.5x-2x range) and does not affect\n"
"the other assembly stages. Standard size modificators are\n"
"supported (e.g. 5m or 2.6g)")
"which is used for solid k-mers selection. Standard size\n"
"modificators are supported (e.g. 5m or 2.6g)")


def _version():
Expand Down Expand Up @@ -437,7 +433,7 @@ def check_int_range(value, min_val, max_val, require_odd=False):
help="ONT corrected reads")
read_group.add_argument("--subassemblies", dest="subassemblies", nargs="+",
default=None, metavar="path",
help="high-quality contig-like input")
help="high-quality contigs input")

parser.add_argument("-g", "--genome-size", dest="genome_size",
metavar="size", required=True,
Expand All @@ -448,16 +444,14 @@ def check_int_range(value, min_val, max_val, require_odd=False):

parser.add_argument("-t", "--threads", dest="threads",
type=lambda v: check_int_range(v, 1, 128),
default=1, metavar="int", help="number of parallel threads "
"(default: 1)")
default=1, metavar="int", help="number of parallel threads [1]")
parser.add_argument("-i", "--iterations", dest="num_iters",
type=lambda v: check_int_range(v, 0, 10),
default=1, help="number of polishing iterations "
"(default: 1)", metavar="int")
default=1, help="number of polishing iterations [1]",
metavar="int")
parser.add_argument("-m", "--min-overlap", dest="min_overlap", metavar="int",
type=lambda v: check_int_range(v, 1000, 10000),
default=None, help="minimum overlap between reads "
"(default: auto)")
default=None, help="minimum overlap between reads [auto]")

parser.add_argument("--resume", action="store_true",
dest="resume", default=False,
Expand Down
Empty file added flye/polishing/__init__.py
Empty file.
10 changes: 5 additions & 5 deletions flye/alignment.py → flye/polishing/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
import multiprocessing
import ctypes

import flye.fasta_parser as fp
from flye.utils import which
import flye.config as config
import flye.utils.fasta_parser as fp
from flye.utils.utils import which
import flye.config.py_cfg as cfg


logger = logging.getLogger()
Expand Down Expand Up @@ -195,7 +195,7 @@ def get_chunk(self):
qry_start, qry_end, qry_len, qry_seq, err_rate) = \
self.parse_cigar(cigar_str, read_str, read_contig, ctg_pos)

OVERHANG = 100
OVERHANG = cfg.vals["read_aln_overhang"]
if (float(qry_end - qry_start) / qry_len > self.min_aln_rate or
trg_start < OVERHANG or trg_len - trg_end < OVERHANG):
aln = Alignment(read_id, read_contig, qry_start,
Expand All @@ -217,7 +217,7 @@ def check_binaries():
def make_alignment(reference_file, reads_file, num_proc,
work_dir, platform, out_alignment):
"""
Runs minimap2 and sort its output
Runs minimap2 and sorts its output
"""
_run_minimap(reference_file, reads_file, num_proc, platform, out_alignment)
logger.debug("Sorting alignment file")
Expand Down
28 changes: 14 additions & 14 deletions flye/bubbles.py → flye/polishing/bubbles.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
import multiprocessing
import signal

import flye.fasta_parser as fp
import flye.config as config
from flye.alignment import shift_gaps, SynchronizedSamReader
import flye.utils.fasta_parser as fp
import flye.config.py_cfg as cfg
from flye.polishing.alignment import shift_gaps, SynchronizedSamReader


logger = logging.getLogger()
Expand Down Expand Up @@ -162,8 +162,8 @@ def _output_bubbles(bubbles, out_stream):


def _postprocess_bubbles(bubbles):
MAX_BUBBLE = config.vals["max_bubble_length"]
MAX_BRANCHES = config.vals["max_bubble_branches"]
MAX_BUBBLE = cfg.vals["max_bubble_length"]
MAX_BRANCHES = cfg.vals["max_bubble_branches"]

new_bubbles = []
long_branches = 0
Expand Down Expand Up @@ -213,9 +213,9 @@ def _is_solid_kmer(profile, position, err_mode):
"""
Checks if the kmer at given position is solid
"""
MISSMATCH_RATE = config.vals["err_modes"][err_mode]["solid_missmatch"]
INS_RATE = config.vals["err_modes"][err_mode]["solid_indel"]
SOLID_LEN = config.vals["solid_kmer_length"]
MISSMATCH_RATE = cfg.vals["err_modes"][err_mode]["solid_missmatch"]
INS_RATE = cfg.vals["err_modes"][err_mode]["solid_indel"]
SOLID_LEN = cfg.vals["solid_kmer_length"]

for i in xrange(position, position + SOLID_LEN):
if profile[i].coverage == 0:
Expand All @@ -232,7 +232,7 @@ def _is_simple_kmer(profile, position):
"""
Checks if the kmer with center at the given position is simple
"""
SIMPLE_LEN = config.vals["simple_kmer_length"]
SIMPLE_LEN = cfg.vals["simple_kmer_length"]

extended_len = SIMPLE_LEN * 2
nucl_str = map(lambda p: p.nucl, profile[position - extended_len / 2 :
Expand Down Expand Up @@ -268,7 +268,7 @@ def _compute_profile(alignment, platform, genome_len):
"""
Computes alignment profile
"""
max_aln_err = config.vals["err_modes"][platform]["max_aln_error"]
max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
aln_errors = []
filtered = 0
profile = [ProfileInfo() for _ in xrange(genome_len)]
Expand Down Expand Up @@ -311,9 +311,9 @@ def _get_partition(profile, err_mode):
Partitions genome into sub-alignments at solid regions / simple kmers
"""
#logger.debug("Partitioning genome")
SOLID_LEN = config.vals["solid_kmer_length"]
SIMPLE_LEN = config.vals["simple_kmer_length"]
MAX_BUBBLE = config.vals["max_bubble_length"]
SOLID_LEN = cfg.vals["solid_kmer_length"]
SIMPLE_LEN = cfg.vals["simple_kmer_length"]
MAX_BUBBLE = cfg.vals["max_bubble_length"]

solid_flags = [False for _ in xrange(len(profile))]
prof_pos = 0
Expand Down Expand Up @@ -358,7 +358,7 @@ def _get_bubble_seqs(alignment, platform, profile, partition, contig_info):
if not partition:
return []

max_aln_err = config.vals["err_modes"][platform]["max_aln_error"]
max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
bubbles = []
ext_partition = [0] + partition + [contig_info.length]
for p_left, p_right in zip(ext_partition[:-1], ext_partition[1:]):
Expand Down
8 changes: 4 additions & 4 deletions flye/consensus.py → flye/polishing/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import multiprocessing
import signal

from flye.alignment import shift_gaps, SynchronizedSamReader
import flye.config as config
import flye.fasta_parser as fp
from flye.polishing.alignment import shift_gaps, SynchronizedSamReader
import flye.config.py_cfg as cfg
import flye.utils.fasta_parser as fp

logger = logging.getLogger()

Expand Down Expand Up @@ -97,7 +97,7 @@ def _contig_profile(alignment, platform, genome_len):
"""
Computes alignment profile
"""
max_aln_err = config.vals["err_modes"][platform]["max_aln_error"]
max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
aln_errors = []
profile = [Profile() for _ in xrange(genome_len)]
for aln in alignment:
Expand Down
18 changes: 8 additions & 10 deletions flye/polish.py → flye/polishing/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from collections import defaultdict
from threading import Thread

import flye.bubbles as bbl
import flye.fasta_parser as fp
from flye.utils import which
import flye.config as config
import flye.polishing.bubbles as bbl
import flye.utils.fasta_parser as fp
from flye.utils.utils import which
import flye.config.py_cfg as cfg


POLISH_BIN = "flye-polish"
Expand Down Expand Up @@ -44,12 +44,10 @@ def check_binaries():


def polish(bubbles_file, num_proc, err_mode, work_dir, iter_id, out_polished):
_ROOT = os.path.dirname(__file__)

subs_matrix = os.path.join(_ROOT, "resource",
config.vals["err_modes"][err_mode]["subs_matrix"])
hopo_matrix = os.path.join(_ROOT, "resource",
config.vals["err_modes"][err_mode]["hopo_matrix"])
subs_matrix = os.path.join(cfg.vals["pkg_root"],
cfg.vals["err_modes"][err_mode]["subs_matrix"])
hopo_matrix = os.path.join(cfg.vals["pkg_root"],
cfg.vals["err_modes"][err_mode]["hopo_matrix"])

consensus_out = os.path.join(work_dir, "consensus_{0}.fasta"
.format(iter_id))
Expand Down
Empty file added flye/utils/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 57d6e11

Please sign in to comment.