From c46e7d94b6d970661c58f61d7f32f8bb6f39ba35 Mon Sep 17 00:00:00 2001 From: pwwang Date: Fri, 17 Jan 2020 12:23:26 -0600 Subject: [PATCH] Use coding styles by google --- .pre-commit-config.yaml | 8 + .pylintrc | 401 ++++++++++++-------- tests/test_formula.py | 6 +- vcfstats/__init__.py | 384 +++++++++++--------- vcfstats/formula.py | 788 +++++++++++++++++++++------------------- vcfstats/macros.py | 202 +++++----- vcfstats/one.py | 379 ++++++++++--------- 7 files changed, 1188 insertions(+), 980 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 24026b0..9972106 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,14 @@ repos: - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files +- repo: local + hooks: + - id: masterpylintrc + name: Overwrite local .pylintrc by master one + entry: cp ../.pylintrc ./.pylintrc + pass_filenames: false + always_run: true + language: system - repo: https://github.com/pre-commit/mirrors-pylint rev: v2.4.4 hooks: diff --git a/.pylintrc b/.pylintrc index ef5bd1f..9869a1f 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,7 +7,7 @@ extension-pkg-whitelist= # Add files or directories to the blacklist. They should be base names, not # paths. -ignore=CVS,tests +ignore=CVS # Add files or directories matching the regex patterns to the blacklist. The # regex matches against base names, not paths. @@ -15,7 +15,7 @@ ignore-patterns= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). -#init-hook=test*.py +#init-hook= # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the # number of processors available to use. @@ -26,7 +26,7 @@ jobs=1 # complex, nested conditions. limit-inference-results=100 -# List of plugins (as comma separated values of python modules names) to load, +# List of plugins (as comma separated values of python module names) to load, # usually to register additional checkers. 
load-plugins= @@ -60,15 +60,92 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". -disable=bad-whitespace, +disable=print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape, + import-error, protected-access, - bad-continuation, 
import-outside-toplevel, - import-error, - cyclic-import, + no-name-in-module, no-member, broad-except, - inconsistent-return-statements + cyclic-import # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option @@ -79,11 +156,11 @@ enable=c-extension-no-member [REPORTS] -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) # Template used to display messages. This is a python new-style format string @@ -114,12 +191,30 @@ max-nested-blocks=5 never-returning-functions=sys.exit -[STRING] +[LOGGING] -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no +# Format style used to check logging format string. `old` means using % +# formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. 
+ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 [SPELLING] @@ -127,73 +222,21 @@ check-str-concat-over-line-jumps=no # Limits count of emitted suggestions for spelling mistakes. max-spelling-suggestions=4 -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package.. +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the python-enchant package. spelling-dict= # List of comma separated words that should not be checked. spelling-ignore-words= -# A path to a file that contains private dictionary; one word per line. +# A path to a file that contains the private dictionary; one word per line. spelling-private-dict-file= -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. spelling-store-unknown-words=no -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. 
-ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - [BASIC] # Naming style matching correct argument names. @@ -204,7 +247,7 @@ argument-naming-style=snake_case #argument-rgx= # Naming style matching correct attribute names. -attr-naming-style=camelCase +attr-naming-style=snake_case # Regular expression matching correct attribute names. Overrides attr-naming- # style. 
@@ -254,17 +297,10 @@ function-naming-style=snake_case good-names=i, j, k, - f, ex, rc, + eq, Run, - X, - Y, - N, - x, - y, - r, - kw, _ # Include a hint for the correct naming format with invalid-name. @@ -278,7 +314,7 @@ inlinevar-naming-style=any #inlinevar-rgx= # Naming style matching correct method names. -method-naming-style=camelCase +method-naming-style=snake_case # Regular expression matching correct method names. Overrides method-naming- # style. @@ -312,19 +348,59 @@ variable-naming-style=snake_case #variable-rgx= -[SIMILARITIES] +[TYPECHECK] -# Ignore comments when computing similarities. -ignore-comments=yes +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager -# Ignore docstrings when computing similarities. -ignore-docstrings=yes +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= -# Ignore imports when computing similarities. -ignore-imports=no +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes -# Minimum lines number of a similarity. -min-similarity-lines=4 +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. 
+ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# List of decorators that change the signature of a decorated function. +signature-mutators= [MISCELLANEOUS] @@ -335,15 +411,49 @@ notes=FIXME, TODO -[LOGGING] +[FORMAT] -# Format style used to check logging format string. `old` means using % -# formatting, while `new` is for `{}` formatting. -logging-format-style=old +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. 
+max-line-length=80 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma, + dict-separator + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[STRING] + +# This flag controls whether the implicit-str-concat-in-sequence should +# generate a warning on implicit string concatenation in sequences defined over +# several lines. +check-str-concat-over-line-jumps=no [VARIABLES] @@ -376,45 +486,45 @@ init-import=no redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io -[FORMAT] +[DESIGN] -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= +# Maximum number of arguments for function / method. +max-args=5 -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ +# Maximum number of attributes for a class (see R0902). +max-attributes=7 -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=1 +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string='\t' +# Maximum number of branch for function / method body. +max-branches=12 -# Maximum number of characters on a single line. -max-line-length=100 +# Maximum number of locals for function / method body. +max-locals=15 -# Maximum number of lines in a module. 
-max-module-lines=1500 +# Maximum number of parents for a class (see R0901). +max-parents=7 -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no +# Maximum number of return / yield for function / method body. +max-returns=6 -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 [IMPORTS] +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + # Allow wildcard imports from modules that define __all__. allow-wildcard-with-all=no @@ -445,38 +555,8 @@ known-standard-library= # Force import order to recognize a module as part of a third party library. known-third-party=enchant - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=10 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement. -max-bool-expr=8 - -# Maximum number of branch for function / method body. -max-branches=15 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. 
-max-returns=10 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= [CLASSES] @@ -484,7 +564,8 @@ min-public-methods=2 # List of method names used to declare (i.e. assign) instance attributes. defining-attr-methods=__init__, __new__, - setUp + setUp, + __post_init__ # List of member names, which should be excluded from the protected access # warning. diff --git a/tests/test_formula.py b/tests/test_formula.py index 9a9fb6e..2ec76fd 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -124,7 +124,7 @@ def test_aggr_init(): assert aggr.term == Term('1', None) assert aggr.filter is None assert aggr.group is None - assert not aggr.hasFILTER() + assert not aggr.has_filter() aggr = Aggr('COUNT(1, FILTER[PASS])', {'1': Term('1', None), 'FILTER[PASS]': Term('FILTER[PASS]', None)}) assert aggr.term == Term('1', None) @@ -135,7 +135,7 @@ def test_aggr_init(): assert aggr.term == Term('1', None) assert aggr.filter == Term('FILTER[PASS]', None) assert aggr.group is None - assert aggr.hasFILTER() + assert aggr.has_filter() aggr.setxgroup(Term('VARTYPE', None)) assert aggr.group == Term('VARTYPE', None) assert aggr.xgroup is None @@ -148,7 +148,7 @@ def test_aggr_init(): aggr.setxgroup(Term('GTTYPEs{0}', ['A'])) assert aggr.xgroup == Term('GTTYPEs{0}', ['A']) assert repr(aggr) == ", filter=, group=)>" - assert aggr.hasFILTER() + assert aggr.has_filter() aggr = Aggr('COUNT(1, FILTER, VARTYPE)', {'1': Term('1', None), 'FILTER': Term('FILTER', None), 'VARTYPE': Term('VARTYPE', None)}) assert aggr.term == Term('1', None) diff --git a/vcfstats/__init__.py b/vcfstats/__init__.py index fe9030a..8400bd8 100644 --- a/vcfstats/__init__.py +++ b/vcfstats/__init__.py @@ -5,203 +5,241 @@ from cyvcf2 import VCF from pyparam import params, Params MACROS = {} -logging.basicConfig( - 
level = logging.DEBUG, - format = '[%(asctime)-15s %(levelname)5s] %(message)s') +logging.basicConfig(level=logging.DEBUG, + format='[%(asctime)-15s %(levelname)5s] %(message)s') LOGGER = logging.getLogger(__name__) -from . import macros # pylint:disable=wrong-import-position -from .one import One # pylint:disable=wrong-import-position +from . import macros # pylint:disable=wrong-import-position +from .one import One # pylint:disable=wrong-import-position __version__ = "0.0.3" params._desc = 'vcfstats v{}: Powerful VCF statistics.'.format(__version__) -params.vcf.required = True -params.vcf.desc = 'The VCF file' -params.v = params.vcf -params.loglevel = 'INFO' -params.loglevel.desc = 'The logging level.' +params.vcf.required = True +params.vcf.desc = 'The VCF file' +params.v = params.vcf +params.loglevel = 'INFO' +params.loglevel.desc = 'The logging level.' params.outdir.required = True -params.outdir.desc = 'The output directory.' -params.o = params.outdir -params.Rscript = 'Rscript' -params.Rscript.desc = 'Path to Rscript to run R code for plotting.' -params.figtype = [] -params.figtype.desc = 'Your preferences for types of plots for each formula.' -params.r = [] -params.r.desc = 'Regions in format of [CHR] or [CHR]:[START]-[END]' -params.region = params.r -params.R.desc = 'Regions in a BED file\nIf both -r/R are provided, regions will be merged.' -params.Region = params.R -params.p = False -params.p.desc = [ 'Only analyze variants that pass all filters.', - 'This does not work if FILTER entry is in the analysis.'] -params.passed = params.p -params.l = False -params.l.desc = 'List all available macros.' +params.outdir.desc = 'The output directory.' +params.o = params.outdir +params.Rscript = 'Rscript' +params.Rscript.desc = 'Path to Rscript to run R code for plotting.' +params.figtype = [] +params.figtype.desc = 'Your preferences for types of plots for each formula.' 
+params.r = [] +params.r.desc = 'Regions in format of [CHR] or [CHR]:[START]-[END]' +params.region = params.r +params.R.desc = ['Regions in a BED file', + 'If both -r/R are provided, regions will be merged.'] +params.Region = params.R +params.p = False +params.p.desc = [ + 'Only analyze variants that pass all filters.', + 'This does not work if FILTER entry is in the analysis.' +] +params.passed = params.p +params.l = False +params.l.desc = 'List all available macros.' params.macro.desc = 'User-defined macro file.' -params.list = params.l -params.f.type = list +params.list = params.l +params.f.type = list params.f.required = True -params.f.desc = ['The formulas for plotting in format of [Y] ~ [X],', - 'where [Y] or [X] should be an entry or an aggregation'] -params.formula = params.f -params.t.type = list -params.t.required = True -params.t.desc = 'The title of each figure, will be used to name the output files.' -params.title = params.t -params.ggs = [] -params.ggs.desc = 'Extra ggplot2 expression for each plot' -params.devpars = dict(width = 2000, height = 2000, res = 300) -params.devpars.desc = [ 'The device parameters for plots.', - 'To specify devpars for each plot, use a configuration file.'] -params.c.desc = ['A configuration file defining how to plot in TOML format.', - 'If this is provided, CLI arguments will be overwritten if defined in this file.'] +params.f.desc = [ + 'The formulas for plotting in format of [Y] ~ [X],', + 'where [Y] or [X] should be an entry or an aggregation' +] +params.formula = params.f +params.t.type = list +params.t.required = True +params.t.desc = ('The title of each figure, ' + 'will be used to name the output files.') +params.title = params.t +params.ggs = [] +params.ggs.desc = 'Extra ggplot2 expression for each plot' +params.devpars = dict(width=2000, height=2000, res=300) +params.devpars.desc = [ + 'The device parameters for plots.', + 'To specify devpars for each plot, use a configuration file.' 
+] +params.c.desc = [ + 'A configuration file defining how to plot in TOML format.', + 'If this is provided, CLI arguments will be overwritten ' + 'if defined in this file.' +] params.config = params.c -params.l.callback = lambda opt, pms: pms.vcf.setValue('vcf') \ - and pms.outdir.setValue('outdir') \ - and pms.f.setValue(['f']) \ - and pms.t.setValue(['t']) if opt.value else None +params.l.callback = lambda opt, pms: (pms.vcf.set_value('vcf') + and pms.outdir.set_value('outdir') + and pms.f.set_value(['f']) + and pms.t.set_value(['t']) + if opt.value + else None) -params.t.callback = lambda opt, pms: \ - 'Wrong length of title (expect {}, got {})'.format(len(pms.f.value or []), len(opt.value or [])) \ - if len(opt.value or []) != len(pms.f.value or []) else None +params.t.callback = lambda opt, pms: ( + 'Wrong length of title (expect {}, got {})'.format(len(pms.f.value or []), + len(opt.value or [])) + if len(opt.value or []) != len(pms.f.value or []) + else None +) + +params.ggs.callback = lambda opt, pms: ( + 'Wrong length of ggs' + if (len(opt.value or []) > 1 + and len(opt.value or []) != len(pms.f.value or [])) + else None +) -params.ggs.callback = lambda opt, pms: 'Wrong length of ggs' \ - if len(opt.value or []) > 1 and len(opt.value or []) != len(pms.f.value or []) else None def get_vcf_by_regions(vcffile, regions): - """Compile all the regions provided by use together, and return a chained iterator.""" - LOGGER.info("Getting vcf handler by given regions ...") - vcf = VCF(str(vcffile), gts012=True) - samples = vcf.samples - if regions: - if len(regions) == 1: - vcf = vcf(regions[0]) - else: - vcf2 = chain(vcf(regions[0]), vcf(regions[1])) - for region in regions[2:]: - vcf2 = chain(vcf2, vcf(region)) - vcf = vcf2 - return vcf, samples + """Compile all the regions provided by use together, + and return a chained iterator.""" + LOGGER.info("Getting vcf handler by given regions ...") + vcf = VCF(str(vcffile), gts012=True) + samples = vcf.samples + if 
regions: + if len(regions) == 1: + vcf = vcf(regions[0]) + else: + vcf2 = chain(vcf(regions[0]), vcf(regions[1])) + for region in regions[2:]: + vcf2 = chain(vcf2, vcf(region)) + vcf = vcf2 + return vcf, samples + def combine_regions(regions, regfile): - """Combine all the regions. Users have to make sure there is no overlapping between regions""" - LOGGER.info("Combining regions, remind that regions should not be overlapping ...") - # make sure regions have no overlaps - ret = regions[:] if regions else [] - if regfile: - with open(regfile, 'r') as f: - for line in f: - if line.startswith('#'): - continue - parts = line.strip().split('\t')[:3] - ret.append('{}:{}-{}'.format(*parts)) - return ret + """Combine all the regions. + Users have to make sure there is no overlapping between regions""" + LOGGER.info( + "Combining regions, remind that regions should not be overlapping ...") + # make sure regions have no overlaps + ret = regions[:] if regions else [] + if regfile: + with open(regfile, 'r') as freg: + for line in freg: + if line.startswith('#'): + continue + parts = line.strip().split('\t')[:3] + ret.append('{}:{}-{}'.format(*parts)) + return ret + def get_ones(opts, samples): - """Get instances/formulas. This will determine how many figures we are plotting""" - LOGGER.info("Getting instances ...") - ret = [] - devpars = opts['devpars'] - if not isinstance(devpars, list): - devpars = [devpars] * len(opts['formula']) - for i, formula in enumerate(opts['formula']): - ggs = opts['ggs'][i] if i < len(opts['ggs']) else None - figtype = opts['figtype'][i] if i < len(opts['figtype']) else None - ret.append(One( formula, opts['title'][i], ggs, devpars[i], opts['outdir'], - samples, figtype, opts['passed'])) - return ret + """Get instances/formulas. 
This will determine h + ow many figures we are plotting""" + LOGGER.info("Getting instances ...") + ret = [] + devpars = opts['devpars'] + if not isinstance(devpars, list): + devpars = [devpars] * len(opts['formula']) + for i, formula in enumerate(opts['formula']): + ggs = opts['ggs'][i] if i < len(opts['ggs']) else None + figtype = opts['figtype'][i] if i < len(opts['figtype']) else None + ret.append( + One(formula, opts['title'][i], ggs, devpars[i], opts['outdir'], + samples, figtype, opts['passed'])) + return ret + def list_macros(): - """List the available macros, including user-provided ones""" - macropage = Params() - def helpx(helps): - helps.remove('Usage') - helps.remove('Optional options') - helps.add('Continuous terms', sectype = 'option') - helps.add('Categorical terms', sectype = 'option') - helps.add('Aggregations', sectype = 'option') - for name, macro in MACROS.items(): - if name == '_ONE': - name = '1' - if macro.get('aggr'): - helps.select('Aggregations').add((name, '', macro['func'].__doc__ or '')) - elif macro['type'] == 'continuous': - helps.select('Continuous').add((name, '', macro['func'].__doc__ or '')) - else: - helps.select('Categorical').add((name, '', macro['func'].__doc__ or '')) - macropage._helpx = helpx - macropage._help(print_and_exit = True) + """List the available macros, including user-provided ones""" + macropage = Params() + + def helpx(helps): + helps.remove('Usage') + helps.remove('Optional options') + helps.add('Continuous terms', sectype='option') + helps.add('Categorical terms', sectype='option') + helps.add('Aggregations', sectype='option') + for name, macro in MACROS.items(): + if name == '_ONE': + name = '1' + if macro.get('aggr'): + helps.select('Aggregations').add( + (name, '', macro['func'].__doc__ or '')) + elif macro['type'] == 'continuous': + helps.select('Continuous').add((name, '', macro['func'].__doc__ + or '')) + else: + helps.select('Categorical').add( + (name, '', macro['func'].__doc__ or '')) + + 
macropage._helpx = helpx + macropage._help(print_and_exit=True) + def load_macrofile(macrofile): - """Load the macros from a python file""" - macrofile = str(macrofile) - if not macrofile.endswith('.py'): - macrofile = macrofile + '.py' - if not path.isfile(macrofile): - raise OSError("Macro file does not exist: {}".format(macrofile)) - import importlib.util - spec = importlib.util.spec_from_file_location("mymacros", macrofile) - spec.loader.exec_module(importlib.util.module_from_spec(spec)) + """Load the macros from a python file""" + macrofile = str(macrofile) + if not macrofile.endswith('.py'): + macrofile = macrofile + '.py' + if not path.isfile(macrofile): + raise OSError("Macro file does not exist: {}".format(macrofile)) + import importlib.util + spec = importlib.util.spec_from_file_location("mymacros", macrofile) + spec.loader.exec_module(importlib.util.module_from_spec(spec)) + def load_config(config, opts): - """Load the configurations from file""" - if not path.isfile(config): - raise OSError("Config file does not exist: {}".format(config)) - configs = Params() - configs._loadFile(config) - configs = configs._asDict() - ones = [] - if 'one' in configs: - ones = configs['one'] - del configs['one'] - opts.update(configs) - # padding figtype and ggs, and devpars - N = len(opts['formula']) - opts['figtype'].extend([None] * (N - len(opts['figtype']))) - opts['ggs'].extend([None] * (N - len(opts['ggs']))) - if isinstance(opts['devpars'], list): - default_devpars = opts['devpars'][0] - opts['devpars'].extend([default_devpars] * (N - len(opts['devpars']))) - else: - default_devpars = opts['devpars'] - opts['devpars'] = [opts['devpars']] * N - for one in ones: - if 'formula' not in one: - raise ValueError("Formula not found in instance: {}".format(one)) - if 'title' not in one: - raise ValueError("Title not found in instance: {}".format(one)) - opts['formula'].append(one['formula']) - opts['title'].append(one['title']) - opts['figtype'].append(one.get('figtype')) 
- opts['ggs'].append(one.get('ggs')) - def_devpars = default_devpars.copy() - def_devpars.update(one.get('devpars', {})) - opts['devpars'].append(def_devpars) + """Load the configurations from file""" + if not path.isfile(config): + raise OSError("Config file does not exist: {}".format(config)) + configs = Params() + configs._load_file(config) + configs = configs._as_dict() + ones = [] + if 'one' in configs: + ones = configs['one'] + del configs['one'] + opts.update(configs) + # padding figtype and ggs, and devpars + len_fml = len(opts['formula']) + opts['figtype'].extend([None] * (len_fml - len(opts['figtype']))) + opts['ggs'].extend([None] * (len_fml - len(opts['ggs']))) + if isinstance(opts['devpars'], list): + default_devpars = opts['devpars'][0] + opts['devpars'].extend([default_devpars] * + (len_fml - len(opts['devpars']))) + else: + default_devpars = opts['devpars'] + opts['devpars'] = [opts['devpars']] * len_fml + for one in ones: + if 'formula' not in one: + raise ValueError("Formula not found in instance: {}".format(one)) + if 'title' not in one: + raise ValueError("Title not found in instance: {}".format(one)) + opts['formula'].append(one['formula']) + opts['title'].append(one['title']) + opts['figtype'].append(one.get('figtype')) + opts['ggs'].append(one.get('ggs')) + def_devpars = default_devpars.copy() + def_devpars.update(one.get('devpars', {})) + opts['devpars'].append(def_devpars) + def main(): - """Main entrance of the program""" - opts = params._parse() - LOGGER.setLevel(getattr(logging, opts['loglevel'].upper())) - if opts['config']: - load_config(opts['config'], opts) - if opts['macro']: - load_macrofile(opts['macro']) - if opts['l']: - list_macros() - vcf, samples = get_vcf_by_regions(opts['vcf'], combine_regions(opts['region'], opts['Region'])) - ones = get_ones(opts, samples) - LOGGER.info('Start reading variants ...') - for i, variant in enumerate(vcf): - for one in ones: - # save entries, cache aggr - one.iterate(variant) - if i % 10000 == 
0: # pragma: no cover - LOGGER.debug("- %s variants read.", i) - LOGGER.info('%s variants read.', i) # pylint: disable=undefined-loop-variable - for i, one in enumerate(ones): - # save aggr - one.summarize() - one.plot(opts['Rscript']) + """Main entrance of the program""" + opts = params._parse() + LOGGER.setLevel(getattr(logging, opts['loglevel'].upper())) + if opts['config']: + load_config(opts['config'], opts) + if opts['macro']: + load_macrofile(opts['macro']) + if opts['l']: + list_macros() + vcf, samples = get_vcf_by_regions( + opts['vcf'], combine_regions(opts['region'], opts['Region'])) + ones = get_ones(opts, samples) + LOGGER.info('Start reading variants ...') + for i, variant in enumerate(vcf): + for one in ones: + # save entries, cache aggr + one.iterate(variant) + if i % 10000 == 0: # pragma: no cover + LOGGER.debug("- %s variants read.", i) + LOGGER.info('%s variants read.', i) # pylint: disable=undefined-loop-variable + for i, one in enumerate(ones): + # save aggr + one.summarize() + one.plot(opts['Rscript']) diff --git a/vcfstats/formula.py b/vcfstats/formula.py index a047dc4..3055199 100644 --- a/vcfstats/formula.py +++ b/vcfstats/formula.py @@ -3,386 +3,422 @@ from collections import OrderedDict from . 
import MACROS, LOGGER + def parse_subsets(subsets): - """Parse subsets written in short format""" - ret = [] - for subset in subsets.split(','): - subset = subset.strip() - if subset.count('-') == 1: - start, end = subset.split('-') - compref = commonprefix([start, end]) - if compref and compref[-1].isdigit(): - compref = compref[:-1] - start = start[len(compref):] - end = end[len(compref):] - if start.isdigit() and end.isdigit() and int(start) < int(end): - ret.extend([compref + str(i) for i in range(int(start), int(end)+1)]) - else: - ret.append(subset) - else: - ret.append(subset) - return ret - -def safe_split (string, delimter, trim = True): - """ - Split a string using a single-character delimter - @params: - `string`: the string - `delimter`: the single-character delimter - `trim`: whether to trim each part. Default: True - @examples: - ```python - ret = split("'a,b',c", ",") - # ret == ["'a,b'", "c"] - # ',' inside quotes will be recognized. - ``` - @returns: - The list of substrings - """ - ret = [] - special1 = ['(', ')', '[', ']', '{', '}'] - special2 = ['\'', '"', '`'] - special3 = '\\' - flags1 = [0, 0, 0] - flags2 = [False, False, False] - flags3 = False - start = 0 - for i, char in enumerate(string): - if char == special3: - flags3 = not flags3 - elif not flags3: - if char in special1: - index = special1.index(char) - if index % 2 == 0: - flags1[int(index/2)] += 1 - else: - flags1[int(index/2)] -= 1 - elif char in special2: - index = special2.index(char) - flags2[index] = not flags2[index] - elif char == delimter and not any(flags1) and not any(flags2): - rest = string[start:i] - if trim: - rest = rest.strip() - ret.append(rest) - start = i + 1 - else: - flags3 = False - rest = string[start:] - if trim: - rest = rest.strip() - ret.append(rest) - return ret + """Parse subsets written in short format""" + ret = [] + for subset in subsets.split(','): + subset = subset.strip() + if subset.count('-') == 1: + start, end = subset.split('-') + compref = 
commonprefix([start, end]) + if compref and compref[-1].isdigit(): + compref = compref[:-1] + start = start[len(compref):] + end = end[len(compref):] + if start.isdigit() and end.isdigit() and int(start) < int(end): + ret.extend([ + compref + str(i) for i in range(int(start), + int(end) + 1) + ]) + else: + ret.append(subset) + else: + ret.append(subset) + return ret + + +def safe_split(string, delimter, trim=True): + """ + Split a string using a single-character delimter + @params: + `string`: the string + `delimter`: the single-character delimter + `trim`: whether to trim each part. Default: True + @examples: + ```python + ret = split("'a,b',c", ",") + # ret == ["'a,b'", "c"] + # ',' inside quotes will be recognized. + ``` + @returns: + The list of substrings + """ + ret = [] + special1 = ['(', ')', '[', ']', '{', '}'] + special2 = ['\'', '"', '`'] + special3 = '\\' + flags1 = [0, 0, 0] + flags2 = [False, False, False] + flags3 = False + start = 0 + for i, char in enumerate(string): + if char == special3: + flags3 = not flags3 + elif not flags3: + if char in special1: + index = special1.index(char) + if index % 2 == 0: + flags1[int(index / 2)] += 1 + else: + flags1[int(index / 2)] -= 1 + elif char in special2: + index = special2.index(char) + flags2[index] = not flags2[index] + elif char == delimter and not any(flags1) and not any(flags2): + rest = string[start:i] + if trim: + rest = rest.strip() + ret.append(rest) + start = i + 1 + else: + flags3 = False + rest = string[start:] + if trim: + rest = rest.strip() + ret.append(rest) + return ret + class Term: - """The term in the formula""" - def __init__(self, term, samples): # pylint: disable=too-many-branches,too-many-statements - token = ' \t[{' - pos = [term.find(c) for c in token] - if max(pos) == -1: - remaining = '' - else: - pos = min(p for p in pos if p >= 0) - term, remaining = term[:pos], term[pos:] - - term = '_ONE' if term == '1' else term - if term not in MACROS: - raise ValueError("Term {!r} has not 
been registered.".format(term)) - self.name = term if term != '_ONE' else '1' - self.term = MACROS[term] - if not self.term.get('type'): - raise TypeError("No type specified for Term: {}".format(self.term)) - remaining = remaining.strip() - - errmsg = ('{}{}: Malformated decorations for an Term. ' - 'Expect {{SAMPLE}}, [SUBSETS] or a combination of both.'.format(term, remaining)) - self.samples = self.subsets = None - - if not remaining: - pass - elif remaining[0] == '[' and remaining[-1] == ']': - self.subsets = parse_subsets(remaining[1:-1]) - elif remaining[0] == '{' and remaining[-1] == '}': - self.samples = parse_subsets(remaining[1:-1]) - elif remaining[0] == '{' and remaining[-1] == ']': - if not '}[' in remaining: - raise ValueError(errmsg) - specified_samples, subsets = remaining[1:-1].split('}[', 1) - self.samples = parse_subsets(specified_samples) - self.subsets = parse_subsets(subsets) - elif remaining[0] == '[' and remaining[-1] == '}': - if not ']{' in remaining: - raise ValueError(errmsg) - subsets, specified_samples = remaining[1:-1].split(']{', 1) - self.samples = parse_subsets(specified_samples) - self.subsets = parse_subsets(subsets) - else: - raise ValueError(errmsg) - - if self.samples: - for i, sample in enumerate(self.samples): - if sample.isdigit(): - self.samples[i] = int(sample) - elif sample not in samples: - raise ValueError('Sample {!r} does not exist.'.format(sample)) - else: - self.samples[i] = samples.index(sample) - - if self.term['type'] == 'continuous' and self.subsets: - if len(self.subsets) != 2: - raise KeyError('Expect a subset of length 2 for continuous Term: {}'.format(self.term)) - if self.subsets[0]: - self.subsets[0] = float(self.subsets[0]) # try to raise - if self.subsets[1]: - self.subsets[1] = float(self.subsets[1]) - - def __repr__(self): - return ''.format(self.name, self.subsets, self.samples) - - def __eq__(self, other): - if not isinstance(other, Term): - return False - return self.term == other.term and 
self.subsets == other.subsets and self.samples == other.samples - - def __ne__(self, other): - return not self.__eq__(other) - - def run(self, variant, passed): - """Run the variant""" - if passed and variant.FILTER: - return False - value = self.term['func'](variant) - if value is False or value is None: - return False - # numpy.array - if not hasattr(value, 'T') and not isinstance(value, (tuple,list)): - value = [value] - if self.samples: - value = [value[sidx] for sidx in self.samples] - - if self.term['type'] == 'continuous' and self.subsets: - if self.subsets[0] != '' and any(val < self.subsets[0] for val in value): - return False - if self.subsets[1] != '' and any(val > self.subsets[1] for val in value): - return False - if self.term['type'] == 'categorical' and self.subsets: - if any(val not in self.subsets for val in value): - return False - return value + """The term in the formula""" + def __init__(self, term, samples): # pylint: disable=too-many-branches,too-many-statements + token = ' \t[{' + pos = [term.find(c) for c in token] + if max(pos) == -1: + remaining = '' + else: + pos = min(p for p in pos if p >= 0) + term, remaining = term[:pos], term[pos:] + + term = '_ONE' if term == '1' else term + if term not in MACROS: + raise ValueError("Term {!r} has not been registered.".format(term)) + self.name = term if term != '_ONE' else '1' + self.term = MACROS[term] + if not self.term.get('type'): + raise TypeError("No type specified for Term: {}".format(self.term)) + remaining = remaining.strip() + + errmsg = ( + '{}{}: Malformated decorations for an Term. 
' + 'Expect {{SAMPLE}}, [SUBSETS] or a combination of both.'.format( + term, remaining)) + self.samples = self.subsets = None + + if not remaining: + pass + elif remaining[0] == '[' and remaining[-1] == ']': + self.subsets = parse_subsets(remaining[1:-1]) + elif remaining[0] == '{' and remaining[-1] == '}': + self.samples = parse_subsets(remaining[1:-1]) + elif remaining[0] == '{' and remaining[-1] == ']': + if not '}[' in remaining: + raise ValueError(errmsg) + specified_samples, subsets = remaining[1:-1].split('}[', 1) + self.samples = parse_subsets(specified_samples) + self.subsets = parse_subsets(subsets) + elif remaining[0] == '[' and remaining[-1] == '}': + if not ']{' in remaining: + raise ValueError(errmsg) + subsets, specified_samples = remaining[1:-1].split(']{', 1) + self.samples = parse_subsets(specified_samples) + self.subsets = parse_subsets(subsets) + else: + raise ValueError(errmsg) + + if self.samples: + for i, sample in enumerate(self.samples): + if sample.isdigit(): + self.samples[i] = int(sample) + elif sample not in samples: + raise ValueError( + 'Sample {!r} does not exist.'.format(sample)) + else: + self.samples[i] = samples.index(sample) + + if self.term['type'] == 'continuous' and self.subsets: + if len(self.subsets) != 2: + raise KeyError( + 'Expect a subset of length 2 for continuous Term: {}'. 
+ format(self.term)) + if self.subsets[0]: + self.subsets[0] = float(self.subsets[0]) # try to raise + if self.subsets[1]: + self.subsets[1] = float(self.subsets[1]) + + def __repr__(self): + return ''.format( + self.name, self.subsets, self.samples) + + def __eq__(self, other): + if not isinstance(other, Term): + return False + return (self.term == other.term + and self.subsets == other.subsets + and self.samples == other.samples) + + def __ne__(self, other): + return not self.__eq__(other) + + def run(self, variant, passed): + """Run the variant""" + if passed and variant.FILTER: + return False + value = self.term['func'](variant) + if value is False or value is None: + return False + # numpy.array + if not hasattr(value, 'T') and not isinstance(value, (tuple, list)): + value = [value] + if self.samples: + value = [value[sidx] for sidx in self.samples] + + if self.term['type'] == 'continuous' and self.subsets: + if self.subsets[0] != '' and any(val < self.subsets[0] + for val in value): + return False + if self.subsets[1] != '' and any(val > self.subsets[1] + for val in value): + return False + if self.term['type'] == 'categorical' and self.subsets: + if any(val not in self.subsets for val in value): + return False + return value + class Aggr: - """The aggregation""" - def __init__(self, aggr, terms): - self.cache = OrderedDict() # cache data for aggregation - if '(' not in aggr: - raise ValueError("Expect an Aggregation in format of 'AGGR(...)'") - aggr, remaining = aggr.split('(', 1) - aggr = aggr.strip() - if aggr not in MACROS or not MACROS[aggr].get('aggr'): - raise ValueError("Aggregation {!r} has not been registered.".format(aggr)) - self.aggr = MACROS[aggr] - - remaining = remaining.strip() - if not remaining.endswith(')'): - raise ValueError("Expect an Aggregation in format of 'AGGR(...)'") - remaining = remaining[:-1] - if ',' not in remaining: - term, remaining = remaining, '' - else: - parts = safe_split(remaining, ',') - term, remaining = parts[0], 
','.join(parts[1:]) - - term = term.strip() - remaining = remaining.strip() - self.term = terms[term] - self.filter = None - self.group = None - for term in safe_split(remaining, ','): - term = term.strip() - if not term: - continue - if '=' not in term: - kw, name = 'filter' if not self.filter else 'group', term - else: - kw, name = term.split('=') - kw = kw.strip() - name = name.strip() - if kw == 'filter': - self.filter = terms[name] - else: - self.group = terms[name] - - self.name = '{}({})'.format(aggr, self.term.name) - if self.term.term['type'] != 'continuous': - raise TypeError("Cannot aggregate on categorical data.") - - if self.group and self.group.term['type'] != 'categorical': - raise TypeError("Cannot aggregate on continuous groups.") - - self.xgroup = None - - def __repr__(self): - return ''.format( - self.aggr['func'].__name__, self.term, self.filter, self.group) - - def hasFILTER(self): - """Tell if I have filter""" - return self.term.name == 'FILTER' or (self.filter and self.filter.name == 'FILTER') or \ - (self.group and self.group.name == 'FILTER') - - def setxgroup(self, x): - """Set the group of X""" - if not self.group: - self.group = x - else: - self.xgroup = x - - def run(self, variant, passed): - """Run each variant""" - if self.filter and self.filter.run(variant, passed) is False: - return - if not self.group: - raise RuntimeError("No group specified, don't know how to aggregate.") - group = self.group.run(variant, passed) - if group is False: - return - if len(group) > 1: - raise ValueError("Cannot aggregate on more than one group, " + \ - "make sure you specified sample for sample data.") - group = group[0] - - xgroup = False - if self.xgroup: - xgroup = self.xgroup.run(variant, passed) - if xgroup is False: - return - if len(xgroup) > 1: - raise ValueError("Cannot aggregate on more than one level of xgroup.") - xgroup = xgroup[0] - - value = self.term.run(variant, passed) - - if value is False: - return - if xgroup: - 
self.cache.setdefault(xgroup, {}).setdefault(group, []).extend(value) - else: - self.cache.setdefault(group, []).extend(value) - - def dump(self): - """Dump and calculate the aggregations""" - ret = OrderedDict() - for key, value in self.cache.items(): - if isinstance(value, dict): - ret[key] = [(self.aggr['func'](val), grup) for grup, val in value.items()] - else: - ret[key] = self.aggr['func'](value) - self.cache.clear() - return ret + """The aggregation""" + def __init__(self, aggr, terms): + # pylint: disable=too-many-branches + self.cache = OrderedDict() # cache data for aggregation + if '(' not in aggr: + raise ValueError("Expect an Aggregation in format of 'AGGR(...)'") + aggr, remaining = aggr.split('(', 1) + aggr = aggr.strip() + if aggr not in MACROS or not MACROS[aggr].get('aggr'): + raise ValueError( + "Aggregation {!r} has not been registered.".format(aggr)) + self.aggr = MACROS[aggr] + + remaining = remaining.strip() + if not remaining.endswith(')'): + raise ValueError("Expect an Aggregation in format of 'AGGR(...)'") + remaining = remaining[:-1] + if ',' not in remaining: + term, remaining = remaining, '' + else: + parts = safe_split(remaining, ',') + term, remaining = parts[0], ','.join(parts[1:]) + + term = term.strip() + remaining = remaining.strip() + self.term = terms[term] + self.filter = None + self.group = None + for term in safe_split(remaining, ','): + term = term.strip() + if not term: + continue + if '=' not in term: + kword, name = 'filter' if not self.filter else 'group', term + else: + kword, name = term.split('=') + kword = kword.strip() + name = name.strip() + if kword == 'filter': + self.filter = terms[name] + else: + self.group = terms[name] + + self.name = '{}({})'.format(aggr, self.term.name) + if self.term.term['type'] != 'continuous': + raise TypeError("Cannot aggregate on categorical data.") + + if self.group and self.group.term['type'] != 'categorical': + raise TypeError("Cannot aggregate on continuous groups.") + + 
self.xgroup = None + + def __repr__(self): + return ''.format( + self.aggr['func'].__name__, self.term, self.filter, self.group) + + def has_filter(self): + """Tell if I have filter""" + return (self.term.name == 'FILTER' + or (self.filter and self.filter.name == 'FILTER') + or (self.group and self.group.name == 'FILTER')) + + def setxgroup(self, xvar): + """Set the group of X""" + if not self.group: + self.group = xvar + else: + self.xgroup = xvar + + def run(self, variant, passed): + """Run each variant""" + if self.filter and self.filter.run(variant, passed) is False: + return + if not self.group: + raise RuntimeError( + "No group specified, don't know how to aggregate.") + group = self.group.run(variant, passed) + if group is False: + return + if len(group) > 1: + raise ValueError("Cannot aggregate on more than one group, " + \ + "make sure you specified sample for sample data.") + group = group[0] + + xgroup = False + if self.xgroup: + xgroup = self.xgroup.run(variant, passed) + if xgroup is False: + return + if len(xgroup) > 1: + raise ValueError( + "Cannot aggregate on more than one level of xgroup.") + xgroup = xgroup[0] + + value = self.term.run(variant, passed) + + if value is False: + return + if xgroup: + self.cache.setdefault(xgroup, {}).setdefault(group, + []).extend(value) + else: + self.cache.setdefault(group, []).extend(value) + + def dump(self): + """Dump and calculate the aggregations""" + ret = OrderedDict() + for key, value in self.cache.items(): + if isinstance(value, dict): + ret[key] = [(self.aggr['func'](val), grup) + for grup, val in value.items()] + else: + ret[key] = self.aggr['func'](value) + self.cache.clear() + return ret + class Formula: - """Handling the formulas""" - def __init__(self, formula, samples, passed, title): - LOGGER.info("[%s] Parsing formulas ...", title) - self._terms = {} - if '~' not in formula: - formula = formula + '~1' - parts = formula.split('~', 1) - if not parts[1].strip(): - parts[1] = '1' - 
LOGGER.debug('[%s] - Y:%r, X:%r', title, parts[0], parts[1]) - self.Y = self._parsePart(parts[0].strip(), samples) - self.X = self._parsePart(parts[1].strip(), samples) - - if isinstance(self.Y, Aggr) and isinstance(self.X, Term): - self.Y.setxgroup(self.X) - elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr): - if not self.Y.group: - self.Y.group = self.X.group - if not self.X.group: - self.X.group = self.Y.group - if self.Y.group != self.X.group: - raise ValueError("Two aggregations have to group by the same entry.") - - self.passed = passed - if (isinstance(self.Y, Term) and self.Y.name == 'FILTER') or \ - (isinstance(self.Y, Aggr) and self.Y.hasFILTER()) or \ - (isinstance(self.X, Term) and self.X.name == 'FILTER') or \ - (isinstance(self.X, Aggr) and self.X.hasFILTER()): - self.passed = False - - def _parsePart(self, part, samples): - """Parse each part of the formula""" - aggr = None - if part.endswith(')') and '(' in part: - aggr, term_fms = part[:-1].split('(') - else: - term_fms = part - - parts = safe_split(term_fms, ',') - if not aggr and len(parts) == 1: - return Term(parts[0], samples) - if aggr and len(parts) == 1: - name = 'TERM' + str(len(self._terms)) - self._terms[name] = Term(term_fms, samples) - return Aggr('{}({})'.format(aggr, name), self._terms) - - if len(parts) > 3: - raise ValueError('Wrong number of arguments (at most 3) for Aggregation: {}.'.format(aggr)) - - name1 = 'TERM' + str(len(self._terms)) - self._terms[name1] = self._parsePart(parts[0], samples) - args = [name1] - for i, termstr in enumerate(parts[1:]): - kw = None - if '=' in termstr: - kw, termstr = termstr.split('=', 1) - kw, termstr = kw.strip(), termstr.strip() - kw = kw or ('filter' if i == 0 else 'group') - if kw not in ('filter', 'group'): - raise ValueError('Expect filter/group as keyword argument name, but got {}.'.format(kw)) - - name2 = 'TERM' + str(len(self._terms)) - self._terms[name2] = self._parsePart(termstr, samples) - args.append('{}={}'.format(kw, 
name2)) - return Aggr('{}({})'.format(aggr, ', '.join(args)), self._terms) - - def run(self, variant, datafile): - """Run each variant""" - if isinstance(self.Y, Term) and isinstance(self.X, Term): - y, x = self.Y.run(variant, self.passed), self.X.run(variant, self.passed) - if y is False or x is False: - return - lenx = len(x) - leny = len(y) - if leny != lenx and leny != 1 and lenx != 1: - raise RuntimeError('Unmatched length of MACRO results: Y({}), X({})'.format(leny, lenx)) - if lenx == 1: - x = x * leny - if leny == 1: - y = y * lenx - for i, r in enumerate(x): - datafile.write('{}\t{}\n'.format(y[i], r)) - elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr): - self.Y.run(variant, self.passed) - self.X.run(variant, self.passed) - elif isinstance(self.Y, Aggr) and isinstance(self.X, Term): - self.Y.run(variant, self.passed) - else: - raise TypeError("Cannot do 'TERM ~ AGGREGATION'. " + \ - "If you want to do that, transpose 'AGGREGATION ~ TERM'") - - def done(self, datafile): - """Done iteration, start summarizing""" - if isinstance(self.Y, Aggr): - if isinstance(self.X, Term): - for key, value in self.Y.dump().items(): - if isinstance(value, list): - for val, grup in value: - datafile.write("{}\t{}\t{}\n".format(val, key, grup)) - else: - datafile.write("{}\t{}\n".format(value, key)) - else: - xdump = self.X.dump() - for key, value in self.Y.dump().items(): - datafile.write("{}\t{}\t{}\n".format(value, xdump.get(key, 'NA'), key)) + """Handling the formulas""" + def __init__(self, formula, samples, passed, title): + LOGGER.info("[%s] Parsing formulas ...", title) + self._terms = {} + if '~' not in formula: + formula = formula + '~1' + parts = formula.split('~', 1) + if not parts[1].strip(): + parts[1] = '1' + LOGGER.debug('[%s] - Y:%rvar, X:%rvar', title, parts[0], parts[1]) + self.Y = self._parse_part(parts[0].strip(), samples) # pylint: disable=invalid-name + self.X = self._parse_part(parts[1].strip(), samples) # pylint: disable=invalid-name + + if 
isinstance(self.Y, Aggr) and isinstance(self.X, Term): + self.Y.setxgroup(self.X) + elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr): + if not self.Y.group: + self.Y.group = self.X.group + if not self.X.group: + self.X.group = self.Y.group + if self.Y.group != self.X.group: + raise ValueError( + "Two aggregations have to group by the same entry.") + + self.passed = passed + # pylint: disable=too-many-boolean-expressions + if ((isinstance(self.Y, Term) and self.Y.name == 'FILTER') + or (isinstance(self.Y, Aggr) and self.Y.has_filter()) + or (isinstance(self.X, Term) and self.X.name == 'FILTER') + or (isinstance(self.X, Aggr) and self.X.has_filter())): + self.passed = False + + def _parse_part(self, part, samples): + """Parse each part of the formula""" + aggr = None + if part.endswith(')') and '(' in part: + aggr, term_fms = part[:-1].split('(') + else: + term_fms = part + + parts = safe_split(term_fms, ',') + if not aggr and len(parts) == 1: + return Term(parts[0], samples) + if aggr and len(parts) == 1: + name = 'TERM' + str(len(self._terms)) + self._terms[name] = Term(term_fms, samples) + return Aggr('{}({})'.format(aggr, name), self._terms) + + if len(parts) > 3: + raise ValueError( + 'Wrong number of arguments (at most 3) for Aggregation: {}.'. + format(aggr)) + + name1 = 'TERM' + str(len(self._terms)) + self._terms[name1] = self._parse_part(parts[0], samples) + args = [name1] + for i, termstr in enumerate(parts[1:]): + kword = None + if '=' in termstr: + kword, termstr = termstr.split('=', 1) + kword, termstr = kword.strip(), termstr.strip() + kword = kword or ('filter' if i == 0 else 'group') + if kword not in ('filter', 'group'): + raise ValueError( + 'Expect filter/group as keyword argument name, but got {}.' 
+ .format(kword)) + + name2 = 'TERM' + str(len(self._terms)) + self._terms[name2] = self._parse_part(termstr, samples) + args.append('{}={}'.format(kword, name2)) + return Aggr('{}({})'.format(aggr, ', '.join(args)), self._terms) + + def run(self, variant, datafile): + """Run each variant""" + if isinstance(self.Y, Term) and isinstance(self.X, Term): + yvar, xvar = (self.Y.run(variant, self.passed), + self.X.run(variant, self.passed)) + if yvar is False or xvar is False: + return + lenx = len(xvar) + leny = len(yvar) + if leny != lenx and leny != 1 and lenx != 1: + raise RuntimeError( + 'Unmatched length of MACRO results: Y({}), X({})'.format( + leny, lenx)) + if lenx == 1: + xvar = xvar * leny + if leny == 1: + yvar = yvar * lenx + for i, rvar in enumerate(xvar): + datafile.write('{}\t{}\n'.format(yvar[i], rvar)) + elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr): + self.Y.run(variant, self.passed) + self.X.run(variant, self.passed) + elif isinstance(self.Y, Aggr) and isinstance(self.X, Term): + self.Y.run(variant, self.passed) + else: + raise TypeError("Cannot do 'TERM ~ AGGREGATION'. 
" + \ + "If you want to do that, transpose 'AGGREGATION ~ TERM'") + + def done(self, datafile): + """Done iteration, start summarizing""" + if isinstance(self.Y, Aggr): + if isinstance(self.X, Term): + for key, value in self.Y.dump().items(): + if isinstance(value, list): + for val, grup in value: + datafile.write("{}\t{}\t{}\n".format( + val, key, grup)) + else: + datafile.write("{}\t{}\n".format(value, key)) + else: + xdump = self.X.dump() + for key, value in self.Y.dump().items(): + datafile.write("{}\t{}\t{}\n".format( + value, xdump.get(key, 'NA'), key)) diff --git a/vcfstats/macros.py b/vcfstats/macros.py index 0e3a068..29c3a31 100644 --- a/vcfstats/macros.py +++ b/vcfstats/macros.py @@ -5,137 +5,161 @@ # pylint: disable=invalid-name -def categorical(func = None, alias = None, _name = None): - """Categorical decorator""" - if alias: - return partial(categorical, _name = alias) - funcname = func.__name__ - if funcname not in MACROS: - MACROS[funcname] = {} - MACROS[funcname]['func'] = MACROS[funcname].get('func', func) - MACROS[funcname]['type'] = 'categorical' - if _name: - MACROS[_name] = MACROS[funcname] - return MACROS[funcname]['func'] - -def continuous(func = None, alias = None, _name = None): - """Continuous decorator""" - if alias: - return partial(continuous, _name = alias) - funcname = func.__name__ - if funcname not in MACROS: - MACROS[funcname] = {} - MACROS[funcname]['func'] = MACROS[funcname].get('func', func) - MACROS[funcname]['type'] = 'continuous' - if _name: - MACROS[_name] = MACROS[funcname] - return MACROS[funcname]['func'] - -def aggregation(func = None, alias = None, _name = None): - """Aggregation decorator""" - if alias: - return partial(aggregation, _name = alias) - funcname = func.__name__ - if funcname not in MACROS: - MACROS[funcname] = {} - MACROS[funcname]['func'] = MACROS[funcname].get('func', func) - MACROS[funcname]['aggr'] = True - if _name: - MACROS[_name] = MACROS[funcname] - return MACROS[funcname]['func'] - -cat = 
categorical + +def categorical(func=None, alias=None, _name=None): + """Categorical decorator""" + if alias: + return partial(categorical, _name=alias) + funcname = func.__name__ + if funcname not in MACROS: + MACROS[funcname] = {} + MACROS[funcname]['func'] = MACROS[funcname].get('func', func) + MACROS[funcname]['type'] = 'categorical' + if _name: + MACROS[_name] = MACROS[funcname] + return MACROS[funcname]['func'] + + +def continuous(func=None, alias=None, _name=None): + """Continuous decorator""" + if alias: + return partial(continuous, _name=alias) + funcname = func.__name__ + if funcname not in MACROS: + MACROS[funcname] = {} + MACROS[funcname]['func'] = MACROS[funcname].get('func', func) + MACROS[funcname]['type'] = 'continuous' + if _name: + MACROS[_name] = MACROS[funcname] + return MACROS[funcname]['func'] + + +def aggregation(func=None, alias=None, _name=None): + """Aggregation decorator""" + if alias: + return partial(aggregation, _name=alias) + funcname = func.__name__ + if funcname not in MACROS: + MACROS[funcname] = {} + MACROS[funcname]['func'] = MACROS[funcname].get('func', func) + MACROS[funcname]['aggr'] = True + if _name: + MACROS[_name] = MACROS[funcname] + return MACROS[funcname]['func'] + + +cat = categorical cont = continuous aggr = aggregation + @categorical def VARTYPE(variant): - """Variant type, one of deletion, indel, snp or sv""" - return variant.var_type + """Variant type, one of deletion, indel, snp or sv""" + return variant.var_type + @categorical def TITV(variant): - """Tell if a variant is a transition or transversion. The variant has to be an snp first.""" - if not variant.is_snp: - return False - return 'transition' if variant.is_transition else 'transversion' + """Tell if a variant is a transition or transversion. 
+ The variant has to be an snp first.""" + if not variant.is_snp: + return False + return 'transition' if variant.is_transition else 'transversion' -@categorical(alias = 'CHROM') + +@categorical(alias='CHROM') def CONTIG(variant): - """Get the config/chromosome of a variant. Alias: CHROM""" - return variant.CHROM + """Get the config/chromosome of a variant. Alias: CHROM""" + return variant.CHROM + -@categorical(alias = 'GT_TYPEs') +@categorical(alias='GT_TYPEs') def GTTYPEs(variant): - """Get the genotypes(HOM_REF,HET,HOM_ALT,UNKNOWN) of a variant for each sample""" - gttypes = variant.gt_types - return ['HOM_REF'if gttype == 0 else \ - 'HET' if gttype == 1 else \ - 'HOM_ALT' if gttype == 2 else 'UNKNOWN' for gttype in gttypes] + """Get the genotypes(HOM_REF,HET,HOM_ALT,UNKNOWN) + of a variant for each sample""" + gttypes = variant.gt_types + return ['HOM_REF'if gttype == 0 else \ + 'HET' if gttype == 1 else \ + 'HOM_ALT' if gttype == 2 else 'UNKNOWN' for gttype in gttypes] + @categorical def FILTER(variant): - """Get the FILTER of a variant.""" - return variant.FILTER or 'PASS' + """Get the FILTER of a variant.""" + return variant.FILTER or 'PASS' + @categorical def SUBST(variant): - """Substitution of the variant, including all types of varinat""" - return '{}>{}'.format(variant.REF, ','.join(variant.ALT)) + """Substitution of the variant, including all types of varinat""" + return '{}>{}'.format(variant.REF, ','.join(variant.ALT)) + @continuous def NALT(variant): - """Number of alternative alleles""" - return len(variant.ALT) + """Number of alternative alleles""" + return len(variant.ALT) + @continuous def GQs(variant): - """get the GQ for each sample as a numpy array.""" - return variant.gt_quals + """get the GQ for each sample as a numpy array.""" + return variant.gt_quals + @continuous def QUAL(variant): - """Variant quality from QUAL field.""" - return variant.QUAL + """Variant quality from QUAL field.""" + return variant.QUAL + -@continuous(alias = 'DPs') 
+@continuous(alias='DPs') def DEPTHs(variant): - """Get the read-depth for each sample as a numpy array.""" - try: - return [sum(dp) for dp in variant.format('DP')] - except (TypeError, ValueError): - warnings.warn('Failed to fetch sample depth for variant: {}'.format(variant).rstrip("\n"), - stacklevel = 0) - return None + """Get the read-depth for each sample as a numpy array.""" + try: + return [sum(dp) for dp in variant.format('DP')] + except (TypeError, ValueError): + warnings.warn('Failed to fetch sample depth for variant: {}'.format( + variant).rstrip("\n"), + stacklevel=0) + return None + @continuous def AAF(variant): - """Alternate allele frequency across samples in this VCF.""" - return variant.aaf + """Alternate allele frequency across samples in this VCF.""" + return variant.aaf + @continuous def AFs(variant): - """get the freq of alternate reads as a numpy array.""" - return variant.gt_alt_freqs + """get the freq of alternate reads as a numpy array.""" + return variant.gt_alt_freqs + @continuous -def _ONE(variant): # pylint: disable=unused-argument - """Return 1 for a variant, usually used in aggregation, or indication of a distribution plot""" - return 1 +def _ONE(variant): # pylint: disable=unused-argument + """Return 1 for a variant, usually used in aggregation, + or indication of a distribution plot""" + return 1 + @aggregation def COUNT(entries): - """Count the variants in groups""" - return len(entries) + """Count the variants in groups""" + return len(entries) + @aggregation def SUM(entries): - """Sum up the values in groups""" - return sum(entries) + """Sum up the values in groups""" + return sum(entries) + -@aggregation(alias = 'AVG') +@aggregation(alias='AVG') def MEAN(entries): - """Get the mean of the values""" - if not entries: - return 0.0 - return sum(entries) / len(entries) + """Get the mean of the values""" + if not entries: + return 0.0 + return sum(entries) / len(entries) diff --git a/vcfstats/one.py b/vcfstats/one.py index 
def title_to_valid_path(title,
                        allowed='_-.()' + string.ascii_letters +
                        string.digits):
    """Sanitize a plot title so it can be used as a file name.

    Every character outside `allowed` is replaced with an underscore.
    """
    sanitized = []
    for char in title:
        sanitized.append(char if char in allowed else '_')
    return ''.join(sanitized)


def get_plot_type(formula, figtype):
    """Resolve the figure type to use for a formula.

    Falls back to a sensible default when `figtype` is empty/None and
    raises TypeError when the requested type cannot represent the
    Y ~ X combination.
    """
    # pylint: disable=too-many-branches,too-many-return-statements
    y_is_aggr = isinstance(formula.Y, Aggr)
    if y_is_aggr and isinstance(formula.X, Aggr):
        if figtype not in ('', None, 'scatter'):
            raise TypeError("Don't know how to plot AGGREGATION ~ AGGREGATION "
                            "using plots other than scatter")
        return figtype or 'scatter'
    if y_is_aggr and isinstance(formula.X, Term):
        if figtype not in ('', None, 'col', 'bar', 'pie'):
            raise TypeError("Don't know how to plot AGGREGATION ~ CATEGORICAL "
                            "using plots other than col/pie")
        if figtype == 'bar':
            figtype = 'col'
        # X being the constant 1 defaults to a pie chart, otherwise columns
        default = 'pie' if formula.X.name == '1' else 'col'
        return figtype or default
    # Both sides are Terms here; Term ~ Aggr is not allowed by Formula
    y_type = formula.Y.term['type']
    x_type = formula.X.term['type']
    if y_type == 'categorical' and x_type == 'categorical':
        if figtype not in ('', None, 'bar', 'pie'):
            raise TypeError("Don't know how to plot CATEGORICAL ~ CATEGORICAL "
                            "using plots other than bar/pie")
        return figtype or 'bar'
    if y_type == 'continuous' and x_type == 'categorical':
        if figtype not in ('', None, 'violin', 'boxplot', 'histogram',
                           'density', 'freqpoly'):
            raise TypeError(
                "Don't know how to plot CONTINUOUS ~ CATEGORICAL "
                "using plots other than violin/boxplot/histogram/density/freqpoly"
            )
        return figtype or 'violin'
    if y_type == 'categorical' and x_type == 'continuous':
        # Only the constant-1 pseudo-term is allowed on the X side here
        if (formula.X.term['func'].__name__ == '_ONE'
                and figtype in ('', None, 'bar', 'pie')):
            return figtype or 'pie'
        raise TypeError("If you want to plot CATEGORICAL ~ CONTINUOUS, "
                        "where CONTINUOUS is not 1, "
                        "transpose CONTINUOUS ~ CATEGORICAL")
    if y_type == 'continuous' and x_type == 'continuous':
        if formula.X.term['func'].__name__ == '_ONE':
            # Distribution of Y
            if figtype in ('', None, 'histogram', 'freqpoly', 'density'):
                return figtype or 'histogram'
            raise TypeError("Don't know how to plot distribution "
                            "using plots other than histogram/freqpoly/density")
        if figtype in ('', None, 'scatter'):
            return figtype or 'scatter'
        raise TypeError("Don't know how to plot CONTINUOUS ~ CONTINUOUS "
                        "using plots other than scatter")
    return None
class One:
    """One plotting instance: collects data for a formula and plots it."""

    def __init__(self,  # pylint: disable=too-many-arguments
                 formula,
                 title,
                 ggs,
                 devpars,
                 outdir,
                 samples,
                 figtype,
                 passed):
        """Set up the formula, the output data file and the plot type.

        Args:
            formula: The formula string (Y ~ X) to evaluate per variant.
            title: Title of the instance; also used to derive file names.
            ggs: Extra ggplot2 expressions appended to the plot.
            devpars: Dict with 'height', 'width' and 'res' for the device.
            outdir: Output directory for data, R code and the image.
            samples: Samples to restrict the formula to.
            figtype: Requested figure type ('' / None means auto).
            passed: Whether to only count variants that passed filters.
        """
        LOGGER.info("INSTANCE: %r", title)
        self.title = title
        self.formula = Formula(formula, samples, passed, title)
        self.outprefix = path.join(outdir, title_to_valid_path(title))
        self.devpars = devpars
        self.ggs = ggs
        self.datafile = open(self.outprefix + '.txt', 'w')
        # Aggregations grouped on X (or Aggr ~ Aggr) get an extra
        # "Group" column in the data file
        if isinstance(self.formula.Y, Aggr) and \
                ((isinstance(self.formula.X, Term) and self.formula.Y.xgroup) or \
                isinstance(self.formula.X, Aggr)):
            self.datafile.write("{}\t{}\tGroup\n".format(
                self.formula.Y.name, self.formula.X.name))
        else:
            self.datafile.write("{}\t{}\n".format(self.formula.Y.name,
                                                  self.formula.X.name))
        self.figtype = get_plot_type(self.formula, figtype)
        LOGGER.info("[%s] plot type: %s", self.title, self.figtype)
        LOGGER.debug("[%s] ggs: %s", self.title, self.ggs)
        LOGGER.debug("[%s] devpars: %s", self.title, self.devpars)

    def __del__(self):
        """Best-effort close of the data file on garbage collection."""
        try:
            if self.datafile:
                self.datafile.close()
        except Exception:  # deliberate: never raise from a destructor
            pass

    def iterate(self, variant):
        """Evaluate the formula on one variant, writing rows to the data file."""
        self.formula.run(variant, self.datafile)

    def summarize(self):
        """Finalize aggregations and flush/close the data file."""
        LOGGER.info("[%s] Summarizing aggregations ...", self.title)
        self.formula.done(self.datafile)
        self.datafile.close()

    def plot(self, Rscript):  # pylint: disable=invalid-name
        """Generate the R script for this instance and run it.

        Args:
            Rscript: Path to the Rscript executable.
        """
        LOGGER.info("[%s] Composing R code ...", self.title)
        # NOTE: the freqpoly branch below initializes `params` before use;
        # previously it referenced `params$mapping` without defining
        # `params`, which made every freqpoly plot fail in R with
        # "object 'params' not found".
        rcode = """
        require('ggplot2')
        set.seed(8525)
        figtype = {figtype!r}

        plotdata = read.table( paste0({outprefix!r}, '.txt'),
            header = TRUE, row.names = NULL, check.names = FALSE, sep = "\t")
        cnames = make.unique(colnames(plotdata))
        colnames(plotdata) = cnames

        bQuote = function(s) paste0('`', s, '`')

        png(paste0({outprefix!r}, '.', figtype, '.png'),
            height = {devpars[height]}, width = {devpars[width]}, res = {devpars[res]})
        if (length(cnames) > 2) {{
            aes_for_geom = aes_string(fill = bQuote(cnames[3]))
            aes_for_geom_color = aes_string(color = bQuote(cnames[3]))
            plotdata[,3] = factor(plotdata[,3], levels = rev(unique(as.character(plotdata[,3]))))
        }} else {{
            aes_for_geom = NULL
            aes_for_geom_color = NULL
        }}
        p = ggplot(plotdata, aes_string(y = bQuote(cnames[1]), x = bQuote(cnames[2])))
        xticks = theme(axis.text.x = element_text(angle = 60, hjust = 1))
        if (figtype == 'scatter') {{
            p = p + geom_point(aes_for_geom_color)
        # }} else if (figtype == 'line') {{
        #   p = p + geom_line(aes_for_geom)
        }} else if (figtype == 'bar') {{
            p = ggplot(plotdata, aes_string(x = bQuote(cnames[2])))
            p = p + geom_bar(aes_string(fill = bQuote(cnames[1]))) + xticks
        }} else if (figtype == 'col') {{
            p = p + geom_col(aes_for_geom) + xticks
        }} else if (figtype == 'pie') {{
            library(ggrepel)
            if (length(cnames) > 2) {{
                p = p + geom_col(aes_for_geom) + coord_polar("y", start=0) +
                    geom_label_repel(
                        aes_for_geom,
                        y = cumsum(plotdata[,1]) - plotdata[,1]/2,
                        label = paste0(unlist(round(plotdata[,1]/sum(plotdata[,1])*100,1)), '%'),
                        show.legend = FALSE)
            }} else {{
                plotdata[,1] = factor(plotdata[,1], levels = rev(unique(as.character(plotdata[,1]))))
                fills = rev(levels(plotdata[,1]))
                sums = sapply(fills, function(f) sum(plotdata[,1] == f))
                p = ggplot(plotdata, aes_string(x = bQuote(cnames[2]))) +
                    geom_bar(aes_string(fill = bQuote(cnames[1]))) + coord_polar("y", start=0) +
                    geom_label_repel(
                        inherit.aes = FALSE,
                        data = data.frame(sums, fills),
                        x = 1,
                        y = cumsum(sums) - sums/2,
                        label = paste0(unlist(round(sums/sum(sums)*100,1)), '%'),
                        show.legend = FALSE)
            }}
            p = p + theme_minimal() + theme(axis.title.x = element_blank(),
                axis.title.y = element_blank(),
                axis.text.y =element_blank())
        }} else if (figtype == 'violin') {{
            p = p + geom_violin(aes_for_geom) + xticks
        }} else if (figtype == 'boxplot') {{
            p = p + geom_boxplot(aes_for_geom) + xticks
        }} else if (figtype == 'histogram' || figtype == 'density') {{
            plotdata[,2] = as.factor(plotdata[,2])
            p = ggplot(plotdata, aes_string(x = bQuote(cnames[1])))
            params = list(alpha = .6)
            if (cnames[2] != '1') {{
                params$mapping = aes_string(fill = bQuote(cnames[2]))
            }}
            p = p + do.call(paste0("geom_", figtype), params)
        }} else if (figtype == 'freqpoly') {{
            plotdata[,2] = as.factor(plotdata[,2])
            p = ggplot(plotdata, aes_string(x = bQuote(cnames[1])))
            params = list(alpha = .6)
            if (cnames[2] != '1') {{
                params$mapping = aes_string(color = bQuote(cnames[2]))
            }}
            p = p + do.call(paste0("geom_", figtype), params)
        }} else {{
            stop(paste('Unknown plot type:', figtype))
        }}
        {extrggs}
        print(p)
        dev.off()
        """.format(figtype=self.figtype,
                   outprefix=self.outprefix,
                   devpars=self.devpars,
                   extrggs=('p = p + ' + self.ggs) if self.ggs else '')
        with open(self.outprefix + '.plot.R', 'w') as fout:
            fout.write(rcode)
        LOGGER.info("[%s] Running R code to plot ...", self.title)
        LOGGER.info("[%s] Data will be saved to: %s", self.title,
                    self.outprefix + '.txt')
        LOGGER.info("[%s] Plot will be saved to: %s", self.title,
                    self.outprefix + '.' + self.figtype + '.png')
        cmd = cmdy.Rscript(self.outprefix + '.plot.R',
                           _exe=Rscript,
                           _raise=False)
        if cmd.rc != 0:
            for line in cmd.stderr.splitlines():
                LOGGER.error("[%s] %s", self.title, line)