import os
import time
from datetime import datetime
import logging
import json

from hedwig.core import ExperimentKB, Rule
from hedwig.learners import HeuristicLearner, OptimalLearner
from hedwig.stats import scorefunctions, adjustment, significance, Validate
from hedwig.core.load import load_graph
from hedwig.core.settings import VERSION, DESCRIPTION, logger


def _parameters_report(args, start, time_taken):
    '''
    Builds a human-readable run header: description, version, start time,
    duration and one line per parameter, framed by separator lines.

    :param args: dict of run parameters.
    :param start: ISO start timestamp string.
    :param time_taken: run duration in seconds.
    :return: the report string.
    '''
    sep = '-'*40 + '\n'
    rep = DESCRIPTION + '\n' +\
        'Version: %s' % VERSION + '\n' +\
        'Start: %s' % start + '\n' +\
        'Time taken: %.2f seconds' % time_taken + '\n' +\
        'Parameters:' + '\n'

    for arg, val in args.items():
        rep += '\t%s=%s\n' % (arg, str(val))
    rep = sep + rep + sep

    return rep


def generate_rules_report(kwargs, rules_per_target,
                          human=lambda label, rule: label):
    '''
    Renders the textual rules report, one ruleset section per target.

    :param kwargs: run parameters (only 'uris' is read here).
    :param rules_per_target: list of (target, rules) pairs.
    :param human: callable mapping (label, rule) to a display label.
    :return: the report string, or a placeholder when no rules were found.
    '''
    rules_report = ''
    for _, rules in rules_per_target:
        if rules:
            rules_report += Rule.ruleset_report(rules, show_uris=kwargs['uris'],
                                                human=human)
            rules_report += '\n'
    if not rules_report:
        rules_report = 'No significant rules found'
    return rules_report


def run(kwargs, cli=False):
    '''
    Main entry point: loads the graph, builds the knowledge base, runs the
    selected learner per target, validates the rules and writes the output.

    :param kwargs: dict of run parameters (see __main__ for the full list).
    :param cli: True when invoked from the command line; enables logging and
                printing to stdout when no output file is given.
    :return: list of (target, rules) pairs.
    '''
    if cli:
        logger.setLevel(logging.DEBUG if kwargs['verbose'] else logging.INFO)
    else:
        logger.setLevel(logging.NOTSET)

    logger.info('Starting Hedwig')
    start = time.time()
    start_date = datetime.now().isoformat()

    graph = build_graph(kwargs)

    logger.info('Building the knowledge base')
    score_func = getattr(scorefunctions, kwargs['score'])
    kb = ExperimentKB(graph, score_func, instances_as_leaves=kwargs['leaves'])

    validator = Validate(kb, significance_test=significance.apply_fisher,
                         adjustment=getattr(adjustment, kwargs['adjust']))

    rules_per_target = run_learner(kwargs, kb, validator)
    rules_report = generate_rules_report(kwargs, rules_per_target)

    end = time.time()
    time_taken = end - start
    logger.info('Finished in %d seconds' % time_taken)

    logger.info('Outputing results')

    if kwargs['covered']:
        with open(kwargs['covered'], 'w') as f:
            examples = Rule.ruleset_examples_json(rules_per_target)
            f.write(json.dumps(examples, indent=2))

    parameters_report = _parameters_report(kwargs, start_date, time_taken)
    rules_out_file = kwargs['output']
    if rules_out_file:
        with open(rules_out_file, 'w') as f:
            # A .json suffix selects machine-readable output; anything else
            # gets the textual report preceded by the parameters header.
            if rules_out_file.endswith('json'):
                f.write(Rule.to_json(rules_per_target, show_uris=kwargs['uris']))
            else:
                f.write(parameters_report)
                f.write(rules_report)
    elif cli:
        # Parenthesized print behaves identically on Python 2 for a single
        # argument and is forward-compatible with Python 3.
        print(parameters_report)
        print(rules_report)

    return rules_per_target


def build_graph(kwargs):
    '''
    Loads all background-knowledge files found under kwargs['bk_dir'] plus
    the dataset into a single rdflib graph (optionally cached on disk).
    '''
    data = kwargs['data']

    # Walk the dir to find BK files
    ontology_list = []
    for root, sub_folders, files in os.walk(kwargs['bk_dir']):
        ontology_list.extend(map(lambda f: os.path.join(root, f), files))

    graph = load_graph(
        ontology_list,
        data,
        def_format=kwargs['format'],
        cache=not kwargs['nocache']
    )
    return graph


def run_learner(kwargs, kb, validator):
    '''
    Runs the chosen learner once per target value (or once for ranked data)
    and validates the induced rules when the target is discrete.

    :return: list of (target, rules) pairs.
    '''
    if kb.is_discrete_target():
        targets = kb.class_values if not kwargs['target'] else [kwargs['target']]
    else:
        # Ranked (continuous) targets: a single learner run, no target label.
        targets = [None]

    rules_per_target = []

    for target in targets:
        if target:
            logger.info('Starting learner for target \'%s\'' % target)
        else:
            logger.info('Ranks detected - starting learner.')

        learner_cls = {
            'heuristic': HeuristicLearner,
            'optimal': OptimalLearner
        }[kwargs['learner']]
        learner = learner_cls(kb,
                              n=kwargs['beam'],
                              min_sup=int(kwargs['support']*kb.n_examples()),
                              target=target,
                              depth=kwargs['depth'],
                              sim=0.9,
                              use_negations=kwargs['negations'],
                              optimal_subclass=kwargs['optimalsubclass'])
        rules = learner.induce()

        if kb.is_discrete_target():
            if kwargs['adjust'] == 'fdr':
                logger.info('Validating rules, FDR = %.3f' % kwargs['FDR'])
            elif kwargs['adjust'] == 'fwer':
                logger.info('Validating rules, alpha = %.3f' % kwargs['alpha'])
            rules = validator.test(rules, alpha=kwargs['alpha'], q=kwargs['FDR'])

        rules_per_target.append((target, rules))

    return rules_per_target
'''
Command-line front-end for Hedwig: defines all CLI options and delegates
to hedwig.run().

@author: anze.vavpetic@ijs.si
'''
import argparse

import hedwig
from hedwig.stats import scorefunctions, adjustment
from hedwig.core.settings import VERSION, DESCRIPTION, INPUT_FORMATS, Defaults

__version__ = VERSION

parser = argparse.ArgumentParser(description=DESCRIPTION)

# Valid score functions and adjustment methods are discovered by
# introspection: every public name of the respective module is a choice.
functions = [name for name in dir(scorefunctions) if not name.startswith('_')]
adjustments = [name for name in dir(adjustment) if not name.startswith('_')]

parser.add_argument('bk_dir', metavar='BKDIR',
                    help='Background knowledge directory. The program attempts \
                          to load all RDF-type files from this directory.')

parser.add_argument('data', metavar='DATASET',
                    help='File containing the learning examples. \
                          Can be in RDF or JSON.')

parser.add_argument('-f', '--format', choices=INPUT_FORMATS,
                    help="Input file format.", default=Defaults.FORMAT)

parser.add_argument('-o', '--output',
                    help='Output file. If none is specified, \
                          the results are written to stdout. \
                          Use .json suffix to write the results \
                          in json.')

parser.add_argument('-c', '--covered',
                    help='File to write IDs of covered examples.')

parser.add_argument('-m', '--mode', choices=['features', 'subgroups'],
                    default=Defaults.MODE,
                    help='Running mode.')

parser.add_argument('-t', '--target',
                    help='Target class label. If it is not specified, rules \
                          produced for each class label.')

parser.add_argument('-s', '--score', choices=functions, default=Defaults.SCORE,
                    help='Score function.')

parser.add_argument('-n', '--negations', action='store_true',
                    help='Use negations.')

parser.add_argument('-A', '--alpha', default=Defaults.ALPHA, type=float,
                    help='P-value threshold; applies if "--adjust fwer" \
                          is used.')

parser.add_argument('-a', '--adjust', default=Defaults.ADJUST,
                    choices=adjustments,
                    help='Adjustment method for the multiple-testing problem.')

parser.add_argument('-q', '--FDR', default=Defaults.FDR_Q, type=float,
                    help='Max false discovery rate; applies only if \
                          "--adjust fdr" is used.')

parser.add_argument('-l', '--leaves', action='store_true',
                    help='Use instance names in rule conjunctions.')

parser.add_argument('-L', '--learner', choices=['heuristic', 'optimal'],
                    default=Defaults.LEARNER,
                    help='Type of learner to use.')

parser.add_argument('-O', '--optimalsubclass', action='store_true',
                    help='In each step the full hierarchy under a particular \
                          concept is searched')

parser.add_argument('-u', '--uris', action='store_true',
                    help='Show URIs in rule conjunctions.')

parser.add_argument('-b', '--beam', default=Defaults.BEAM_SIZE, type=int,
                    help='Beam size.')

parser.add_argument('-S', '--support', default=Defaults.SUPPORT, type=float,
                    help='Minimum support.')

parser.add_argument('-d', '--depth', default=Defaults.DEPTH, type=int,
                    help='Maximum number of conjunctions.')

parser.add_argument('-C', '--nocache', action='store_true',
                    help='Don\'t cache background knowledge graph files.')

parser.add_argument("-v", "--verbose", help="Increase output verbosity.",
                    action="store_true")

args = parser.parse_args()
hedwig.run(args.__dict__, cli=True)
# --- hedwig/core/__init__.py ---
from hedwig.core.example import Example
from hedwig.core.predicate import UnaryPredicate, BinaryPredicate
from hedwig.core.rule import Rule
from hedwig.core.kb import ExperimentKB
from hedwig.core import settings
from hedwig.core import load


__all__ = ['Example', 'UnaryPredicate', 'BinaryPredicate', 'Rule',
           'ExperimentKB', 'settings', 'load']


# --- hedwig/core/example.py ---
'''
Example-related classes.

@author: anze.vavpetic@ijs.si
'''


class Example:
    '''
    Represents an example with its score, label, id and annotations.
    '''
    ClassLabeled = 'class'
    Ranked = 'ranked'

    def __init__(self, id, label, score, annotations=None, weights=None):
        '''
        :param id: integer example index.
        :param label: display label (typically the example URI).
        :param score: a string class label, or a numeric rank score.
        :param annotations: list of annotation URIs (defaults to empty).
        :param weights: dict of annotation -> weight (defaults to empty).
        '''
        self.id = id
        self.label = label
        self.score = score
        # A string score denotes a class label; any other type is a rank.
        if not type(score) in [str, unicode]:
            self.target_type = Example.Ranked
        else:
            self.target_type = Example.ClassLabeled
        # BUGFIX: use fresh containers instead of the original mutable
        # defaults ([] / {}), which would be shared across all instances.
        self.annotations = annotations if annotations is not None else []
        self.weights = weights if weights is not None else {}

    def __str__(self):
        # NOTE(review): the original format strings were enclosed in angle
        # brackets and lost during patch extraction; reconstructed below —
        # confirm against upstream hedwig.
        if self.target_type == Example.Ranked:
            return '<Example id=%s, score=%s, label=%s>' % (self.id,
                                                            self.score,
                                                            self.label)
        else:
            return '<Example id=%s, class=%s, label=%s>' % (self.id,
                                                            self.score,
                                                            self.label)


# --- hedwig/core/helpers.py ---
'''
Helper functions.

@author: anze.vavpetic@ijs.si
'''
from math import sqrt

from hedwig.core.settings import W3C, HEDWIG


def avg(x):
    '''
    Arithmetic mean of x; 0 for an empty sequence.
    '''
    n = float(len(x))
    if n:
        return sum(x)/n
    else:
        return 0


def std(x):
    '''
    Population standard deviation of x; 0 for an empty sequence.
    '''
    n = float(len(x))
    if n:
        # sqrt(E[x^2] - E[x]^2), computed in a single expression.
        return sqrt((sum(i*i for i in x) - sum(x)**2/n)/n)
    else:
        return 0


def user_defined(uri):
    '''
    Is this resource user defined, i.e. not in the W3C or Hedwig
    namespaces and not an anonymous node?
    '''
    return not uri.startswith(W3C) and not uri.startswith(HEDWIG) and \
        not anonymous_uri(uri)


def anonymous_uri(uri):
    '''
    Anything that is not an http(s) URI is treated as anonymous.
    '''
    return not uri.startswith('http')


# --- hedwig/core/kb.py (module head; class ExperimentKB follows) ---
'''
Knowledge-base class.

@author: anze.vavpetic@ijs.si
'''
from collections import defaultdict
from bitarray import bitarray
from rdflib import RDF, RDFS, URIRef

from hedwig.core.example import Example
from hedwig.core.predicate import UnaryPredicate
from hedwig.core.helpers import avg, std, user_defined
from hedwig.core.settings import EXAMPLE_SCHEMA, logger, W3C, HEDWIG
class ExperimentKB:
    '''
    The knowledge base for one specific experiment.
    '''
    def __init__(self, triplets, score_fun, instances_as_leaves=True):
        '''
        Initialize the knowledge base with the given triplet graph.

        :param triplets: rdflib graph with examples and background knowledge.
        :param score_fun: rule-scoring function.
        :param instances_as_leaves: treat example annotations as leaves of
            the class hierarchy instead of resolving their parents.
        '''
        self.instances_as_leaves = instances_as_leaves
        self.score_fun = score_fun
        self.sub_class_of = defaultdict(list)
        self.super_class_of = defaultdict(list)
        self.predicates = set()
        self.binary_predicates = set()
        self.class_values = set()
        self.annotation_name = defaultdict(list)

        self.examples, all_annotations = self._build_examples(triplets)

        # Ranked or class-labeled data
        self.target_type = self.examples[0].target_type

        self._build_subclassof(triplets)
        self._calc_predicate_members(triplets)
        self._find_roots(all_annotations)
        self._calc_members_closure()
        # BUGFIX: the original called this method with no arguments while its
        # body referenced an undefined name 'g' (NameError whenever binary
        # predicates exist); the graph is now passed in explicitly.
        self._calc_binary_members(triplets)
        self._propagate_annotation_names(triplets)

        # Statistics
        if self.target_type == Example.Ranked:
            self.mean = avg([ex.score for ex in self.examples])
            self.sd = std([ex.score for ex in self.examples])
        else:
            self.distribution = defaultdict(int)
            for ex in self.examples:
                self.distribution[ex.score] += 1
            logger.debug('Class distribution: %s' % str(self.distribution))

    def _build_examples(self, g):
        '''
        Extracts examples with their annotations, weights and scores/labels
        from the graph. Returns (examples, all_annotations).
        '''
        g.parse(EXAMPLE_SCHEMA, format='n3')

        # Extract the available examples from the graph
        ex_subjects = g.subjects(predicate=RDF.type, object=HEDWIG.Example)
        self.examples_uris = [ex for ex in ex_subjects]
        self.uri_to_idx = {}

        all_annotations = set()
        examples = []
        for i, ex_uri in enumerate(self.examples_uris):

            # Query for annotation link objects
            annot_objects = g.objects(subject=ex_uri,
                                      predicate=HEDWIG.annotated_with)

            annotation_links = [annot for annot in annot_objects]
            annotations = []
            weights = {}
            to_uni = lambda s: unicode(s).encode('ascii', 'ignore')

            for link in annotation_links:

                # Query for annotation objects via this link; each link is
                # expected to carry exactly one annotation.
                annot_objects = g.objects(subject=link,
                                          predicate=HEDWIG.annotation)
                annotation = [to_uni(one) for one in annot_objects][0]

                # Query for weights on this link
                weight_objects = g.objects(subject=link,
                                           predicate=HEDWIG.weight)
                weights_list = [one for one in weight_objects]

                if weights_list:
                    weights[annotation] = float(weights_list[0])

                annotations.append(annotation)

            all_annotations.update(annotations)

            # Numeric scores take precedence; otherwise fall back to labels.
            score_list = list(g.objects(subject=ex_uri,
                                        predicate=HEDWIG.score))
            if score_list:
                score = float(score_list[0])
            else:
                # Classes
                score_list = list(g.objects(subject=ex_uri,
                                            predicate=HEDWIG.class_label))

                # If no scores or labels found at this stage
                if not score_list:
                    raise Exception("No example labels or scores found! Examples should be " +
                                    "instances of %s, with %s or %s provided." % (HEDWIG.Example, HEDWIG.score, HEDWIG.class_label))

                score = str(score_list[0])
                self.class_values.add(score)

            self.uri_to_idx[ex_uri] = i

            examples.append(Example(i, str(ex_uri), score,
                                    annotations=annotations,
                                    weights=weights))

        if not examples:
            raise Exception("No examples provided! Examples should be " +
                            "instances of %s." % HEDWIG.Example)

        return examples, all_annotations

    def _build_subclassof(self, g):
        '''
        Collects the subclass hierarchy from generalization/specialization
        predicates and discovers binary predicates between examples.
        '''
        for predicate in g.subjects(predicate=RDF.type,
                                    object=HEDWIG.GeneralizationPredicate):
            for sub, obj in g.subject_objects(predicate=predicate):
                if user_defined(sub) and user_defined(obj):
                    self.add_sub_class(sub, obj)

        for predicate in g.subjects(predicate=RDF.type,
                                    object=HEDWIG.SpecializationPredicate):
            for sub, obj in g.subject_objects(predicate=predicate):
                if user_defined(sub) and user_defined(obj):
                    # The subclass relation is reversed for predicates
                    # that specialize
                    self.add_sub_class(obj, sub)

        # Include the instances as predicates as well
        if self.instances_as_leaves:
            for sub, obj in g.subject_objects(predicate=RDF.type):
                if user_defined(sub) and user_defined(obj):
                    self.add_sub_class(sub, obj)

        # Find the user-defined object predicates defined between examples
        examples_as_domain = set(g.subjects(object=HEDWIG.Example,
                                            predicate=RDFS.domain))

        examples_as_range = set(g.subjects(object=HEDWIG.Example,
                                           predicate=RDFS.range))

        for pred in examples_as_domain.intersection(examples_as_range):
            if user_defined(pred):
                self.binary_predicates.add(str(pred))

    def _calc_predicate_members(self, g):
        '''
        Maps each predicate to the set of example ids it covers.
        '''
        self.members = defaultdict(set)
        for ex in self.examples:
            for inst in ex.annotations:
                if self.instances_as_leaves:
                    self.members[inst].add(ex.id)
                else:
                    # Query for 'parents' of a given instance
                    inst_parents = list(g.objects(subject=URIRef(inst),
                                                  predicate=RDF.type))
                    inst_parents += list(g.objects(subject=URIRef(inst),
                                                   predicate=RDFS.subClassOf))
                    for obj in inst_parents:
                        self.members[str(obj)].add(ex.id)

    def _find_roots(self, all_annotations):
        '''
        Identifies root predicates and attaches them under a dummy root.
        '''
        # list(...) keeps this working regardless of filter() semantics.
        roots = list(filter(lambda pred: not self.sub_class_of[pred],
                            self.super_class_of.keys()))

        # Check for annotations not in the ontology to add them as roots
        for annotation in all_annotations:
            if annotation not in self.predicates:
                roots.append(annotation)
                logger.debug('Adding leaf %s as root, as it is not specified in the ontology' % annotation)

        logger.debug('Detected root nodes: %s' % str(roots))

        # Add a dummy root
        self.dummy_root = 'root'
        self.predicates.add(self.dummy_root)
        for root in roots:
            self.add_sub_class(root, self.dummy_root)

    def _calc_members_closure(self):
        '''
        Computes the transitive closure of the subClassOf hierarchy,
        propagates members bottom-up, and records each predicate's level.
        Raises on cycles.
        '''
        self.sub_class_of_closure = defaultdict(set)
        for pred in self.super_class_of.keys():
            self.sub_class_of_closure[pred].update(self.sub_class_of[pred])

        # Calc the closure to get the members of the subClassOf hierarchy
        def closure(pred, lvl, visited=None):
            # BUGFIX: avoid the shared mutable default list ('visited=[]').
            if visited is None:
                visited = []

            if pred in visited:
                raise Exception('Cycle detected in the hierarchy at predicate %s!' % pred)

            children = self.super_class_of[pred]
            self.levels[lvl].add(pred)

            if children:
                mems = set()
                visited.append(pred)
                for child in children:
                    parent_closure = self.sub_class_of_closure[pred]
                    self.sub_class_of_closure[child].update(parent_closure)
                    mems.update(closure(child, lvl + 1, visited=visited))
                self.members[pred].update(mems)
                visited.remove(pred)

                return self.members[pred]
            else:
                return self.members[pred]

        # Level-wise predicates
        self.levels = defaultdict(set)

        # Run the closure from root
        closure(self.dummy_root, 0)

    def _calc_binary_members(self, g):
        '''
        Builds (reverse) member maps and bitsets for binary predicates.

        :param g: the rdflib graph (previously an undefined global).
        '''
        self.binary_members = defaultdict(dict)
        self.reverse_binary_members = defaultdict(dict)

        for pred in self.binary_predicates:
            pairs = g.subject_objects(predicate=URIRef(pred))

            for pair in pairs:
                el1, el2 = self.uri_to_idx[pair[0]], self.uri_to_idx[pair[1]]
                # 'in' instead of the deprecated dict.has_key().
                if el1 in self.binary_members[pred]:
                    self.binary_members[pred][el1].append(el2)
                else:
                    self.binary_members[pred][el1] = [el2]

                # Add the reverse as well
                if el2 in self.reverse_binary_members[pred]:
                    self.reverse_binary_members[pred][el2].append(el1)
                else:
                    self.reverse_binary_members[pred][el2] = [el1]

        # Bitset of examples for input and output
        self.binary_domains = {}
        for pred in self.binary_predicates:
            self.binary_domains[pred] = (
                self.indices_to_bits(self.binary_members[pred].keys()),
                self.indices_to_bits(self.reverse_binary_members[pred].keys())
            )

        # Calc the corresponding bitsets
        self.bit_members = {}
        for pred in self.members.keys():
            self.bit_members[pred] = self.indices_to_bits(self.members[pred])

        self.bit_binary_members = defaultdict(dict)
        self.reverse_bit_binary_members = defaultdict(dict)

        for pred in self.binary_members.keys():

            for el in self.binary_members[pred].keys():
                indices = self.indices_to_bits(self.binary_members[pred][el])
                self.bit_binary_members[pred][el] = indices

            for el in self.reverse_binary_members[pred].keys():
                reverse_members = self.reverse_binary_members[pred][el]
                indices = self.indices_to_bits(reverse_members)
                self.reverse_bit_binary_members[pred][el] = indices

    def _propagate_annotation_names(self, g):
        '''
        Reads annotation-name roots from the graph and propagates the names
        to all their sub-predicates.
        '''
        to_uni = lambda s: unicode(s).encode('ascii', 'ignore')

        # Query for annotation names
        for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
            sub, obj = to_uni(sub), to_uni(obj)
            self.annotation_name[sub].append(obj)
            logger.debug('Annotation name root: %s, %s' % (sub, obj))

        # Propagate the annotation names to children
        annotation_name_roots = self.annotation_name.keys()
        for pred in self.predicates:
            for annotation_root in annotation_name_roots:
                if annotation_root in self.super_classes(pred):
                    name = self.annotation_name[annotation_root]
                    self.annotation_name[pred] = name

    def add_sub_class(self, sub, obj):
        '''
        Adds the resource 'sub' as a subclass of 'obj'.
        '''
        to_uni = lambda s: unicode(s).encode('ascii', 'ignore')
        sub, obj = to_uni(sub), to_uni(obj)

        self.predicates.update([sub, obj])
        if obj not in self.sub_class_of[sub]:
            self.sub_class_of[sub].append(obj)
        if sub not in self.super_class_of[obj]:
            self.super_class_of[obj].append(sub)

    def super_classes(self, pred):
        '''
        Returns all super classes of pred (with transitivity).
        '''
        return self.sub_class_of_closure[pred]

    def get_root(self):
        '''
        Root predicate, which covers all examples.
        '''
        return UnaryPredicate(self.dummy_root, self.get_full_domain(), self,
                              custom_var_name='X')

    def get_subclasses(self, predicate, producer_pred=None):
        '''
        Returns a list of subclasses (as predicate objects) for 'predicate'.
        '''
        if isinstance(predicate, UnaryPredicate):
            return self.super_class_of[predicate.label]
        else:
            return self.super_class_of[predicate]

    def get_members(self, predicate, bit=True):
        '''
        Returns the examples for this predicate,
        either as a bitset or a set of ids.
        '''
        members = None
        if predicate in self.predicates:
            if bit:
                members = self.bit_members[predicate]
            else:
                members = self.members[predicate]
        else:
            if bit:
                members = self.bit_binary_members[predicate]
            else:
                members = self.binary_members[predicate]

        return members

    def get_reverse_members(self, predicate, bit=True):
        '''
        Returns the reverse members of a binary predicate,
        either as a bitset or a set of ids.
        '''
        if bit:
            reverse_members = self.reverse_bit_binary_members[predicate]
        else:
            reverse_members = self.reverse_binary_members[predicate]

        return reverse_members

    def n_members(self, predicate):
        '''
        Number of examples covered by 'predicate'.
        '''
        return self.get_members(predicate, bit=True).count()

    def get_domains(self, predicate):
        '''
        Returns the bitsets for input and output examples
        of the binary predicate 'predicate'.
        '''
        return self.binary_domains[predicate]

    def get_examples(self):
        '''
        Returns all examples for this experiment.
        '''
        return self.examples

    def n_examples(self):
        '''
        Returns the number of examples.
        '''
        return len(self.examples)

    def get_full_domain(self):
        '''
        Returns a bitset covering all examples.
        '''
        return bitarray([True] * self.n_examples())

    def get_empty_domain(self):
        '''
        Returns a bitset covering no examples.
        '''
        return bitarray([False] * self.n_examples())

    def get_score(self, ex_idx):
        '''
        Returns the score for example id 'ex_idx'.
        '''
        return self.examples[ex_idx].score

    def bits_to_indices(self, bits):
        '''
        Converts the bitset to a set of indices.
        '''
        return bits.search(bitarray([1]))

    def indices_to_bits(self, indices):
        '''
        Converts the indices to a bitset.
        '''
        bits = self.get_empty_domain()
        for idx in indices:
            bits[idx] = True
        return bits

    def is_discrete_target(self):
        '''
        True when the examples carry class labels (vs. numeric ranks).
        '''
        return bool(self.class_values)
def rdf(paths, def_format='n3'):
    '''
    Loads the ontology files into an rdf graph.

    :param paths: list of candidate file paths; only those ending with
        'def_format' are parsed.
    :raises Exception: aggregating all per-file parse errors, if any.
    '''
    g = rdflib.graph.Graph()
    errorMsg = ''
    errorCount = 0
    for path in paths:
        if path.endswith(def_format):
            try:
                g.parse(path, format=def_format)
            except Exception as e:
                # 'as e' replaces the legacy 'except Exception, e' syntax
                # (works on Python 2.6+ and Python 3).
                errorMsg = errorMsg + 'Error parsing file: ' + path + '.\n' + str(e) + '\n\n'
                errorCount += 1
    if errorCount > 0:
        raise Exception(str(errorCount) + " errors loading files:\n" + errorMsg)
    return g


def build_uri(class_string):
    '''
    Checks if the string is a proper URI, if not it builds an URI
    with the generic namespace.
    '''
    class_string = class_string.strip()
    if class_string.startswith('http://'):
        class_uri = rdflib.term.URIRef(class_string)
    else:
        class_uri = rdflib.term.URIRef('%s%s' % (str(GENERIC_NAMESPACE), class_string))

    return class_uri


def csv_parse_hierarchy(g, path):
    '''
    Parses a tab-separated hierarchy file into subClassOf triples.
    Each line: a class, a TAB, then ';'-separated superclasses:

        class_1<TAB>superclass_1_1; superclass_1_2; ...; superclass_1_n
    '''
    with open(path) as f:
        lines = f.read().splitlines()
        for line in lines:
            class_ = line.split('\t')[0]
            superclasses = line.split('\t')[1].split(';')
            for superclass in superclasses:
                class_uri = build_uri(class_)
                superclass_uri = build_uri(superclass)
                g.add((class_uri, rdflib.RDFS.subClassOf, superclass_uri))


def csv_parse_data(g, data_file):
    '''
    Parses a ';'-separated examples file into example/annotation triples.
    The header lists the example-id column, the attributes, and the class
    label as the last column; each data row carries 0/1 flags or attribute
    URIs, e.g.:

        example_uri_or_label; attr_uri_1; ...; attr_uri_n; class
        http://example.org/uri_1; 0; 1; ...; label_a

    Attribute values may alternatively be URIs themselves.
    '''
    attributes = []
    class_labels = []
    examples = []

    with open(data_file) as f:
        data_lines = f.readlines()
        domain = [a.strip() for a in data_lines[0].split(';')]
        # The last header column is the class label, not an attribute.
        attributes = domain[:-1]

        logger.debug('Attributes: %s' % str(attributes))
        logger.debug('# Examples: %d' % (len(data_lines) - 1))

        for ex_i, example_line in enumerate(data_lines[1:]):
            values = [v.strip() for v in example_line.split(';')]
            if len(values) != len(attributes) + 1:
                raise Exception('Whoa! The number of values %d != the number of attributes (%d) on line %d.' % (len(values), len(attributes) + 1, ex_i + 2))

            examples.append(values)

    for example in examples:
        # Write to rdf graph
        u = build_uri(example[0])
        g.add((u, rdflib.RDF.type, HEDWIG.Example))
        g.add((u, HEDWIG.class_label, rdflib.Literal(example[-1])))

        for att_idx, att in enumerate(attributes):

            # Skip the label
            if att_idx == 0:
                continue

            attribute_value = example[att_idx]
            value_is_uri = attribute_value.startswith('http://')
            if not (value_is_uri or attribute_value == '1'):
                continue
            annotation_uri = build_uri(attribute_value) if value_is_uri else build_uri(att)
            blank = rdflib.BNode()
            g.add((u, HEDWIG.annotated_with, blank))
            g.add((blank, HEDWIG.annotation, annotation_uri))


def csv(hierarchy_files, data):
    '''
    Loads a simple hierarchy of features (tsv) and data (csv) into a graph.

    :raises Exception: aggregating all per-file parse errors, if any.
    '''
    g = rdflib.graph.Graph()
    errorMsg = ''
    errorCount = 0
    for path in hierarchy_files + [data]:
        try:
            if path.endswith('tsv'):
                csv_parse_hierarchy(g, path)
            elif path.endswith('csv'):
                # BUGFIX: parse the file being visited; the original always
                # re-parsed 'data' here for every csv path encountered.
                csv_parse_data(g, path)
        except Exception as e:
            errorMsg = errorMsg + 'Error parsing file: ' + path + '.\n' + str(e) + '\n\n'
            errorCount += 1
    if errorCount > 0:
        raise Exception(str(errorCount) + " errors loading files:\n" + errorMsg)
    return g


def load_graph(ontology_list, data, def_format='n3', cache=True):
    '''
    Loads the background knowledge and data into a graph, optionally
    caching the parsed graph on disk keyed by an md5 of the inputs.
    '''
    def filter_valid_files(paths):
        if def_format == 'csv':
            filter_fn = lambda p: p.endswith('.csv') or p.endswith('.tsv')
        else:
            filter_fn = lambda p: p.endswith(def_format)
        return filter(filter_fn, paths)

    logger.info('Calculating data checksum')
    paths = ontology_list + [data]
    md5 = _md5_checksum(filter_valid_files(paths))

    cached_fn = '.%s' % md5
    g = None
    if os.path.exists(cached_fn) and cache:
        logger.info('Loading cached graph structure')
        g = _load_cached_graph(cached_fn)
    else:
        logger.info('Building graph structure')
        if def_format == 'n3':
            g = rdf(paths, def_format=def_format)
        elif def_format == 'csv':
            g = csv(ontology_list, data)
        if cache:
            _save_graph_to_cache(g, cached_fn)
    return g


def _md5_checksum(paths):
    '''
    md5 hex digest over the concatenated contents of 'paths'.
    '''
    md5 = hashlib.md5()
    for path in paths:
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(2**20), b''):
                md5.update(chunk)
    return md5.hexdigest()


def _load_cached_graph(fn):
    '''
    Unpickles a cached graph. Binary mode + 'with' so the handle is closed
    (the original leaked an open text-mode file handle).
    '''
    with open(fn, 'rb') as f:
        g = cPickle.load(f)
    return g


def _save_graph_to_cache(g, fn):
    '''
    Pickles the graph to 'fn' (binary mode, matching _load_cached_graph).
    '''
    with open(fn, 'wb') as f:
        cPickle.dump(g, f)
class Predicate:
    '''
    Represents a predicate as a member of a certain rule.
    '''
    # Class-wide counter used to generate fresh anonymous variable names.
    i = -1

    def __init__(self, label, kb, producer_pred):
        self.label = label
        self.kb = kb
        # The predicate whose output variable this predicate consumes.
        self.producer_predicate = producer_pred
        # The predicate that consumes this predicate's output variable.
        self.consumer_predicate = None
        if producer_pred:
            producer_pred.consumer_predicate = self

    @staticmethod
    def _avar():
        '''
        Anonymous var name generator.
        '''
        Predicate.i += 1
        return 'X%d' % Predicate.i


class UnaryPredicate(Predicate):
    '''
    A unary predicate.
    '''
    def __init__(self, label, members, kb,
                 producer_pred=None,
                 custom_var_name=None,
                 negated=False):
        Predicate.__init__(self, label, kb, producer_pred)

        # Reuse the producer's output variable when chained; otherwise take
        # the custom name, falling back to a fresh anonymous variable.
        if producer_pred:
            var = producer_pred.output_var
        else:
            var = custom_var_name if custom_var_name else Predicate._avar()

        self.input_var = var
        self.output_var = var
        self.negated = negated
        self.domain = {var: members}


class BinaryPredicate(Predicate):
    '''
    A binary predicate.
    '''
    def __init__(self, label, pairs, kb, producer_pred=None):
        '''
        The predicate's name and the tuples satisfying it.
        '''
        Predicate.__init__(self, label, kb, producer_pred)

        # The input var should match with the producer's output var.
        self.input_var = (producer_pred.output_var if producer_pred
                          else Predicate._avar())
        self.output_var = Predicate._avar()

        if producer_pred:
            prod_out_var = producer_pred.output_var
            candidates = producer_pred.domain[prod_out_var]

            # Keep only the candidate inputs that occur in some pair, then
            # union the outputs reachable from each surviving input.
            inputs = candidates & kb.get_domains(label)[0]
            outputs = kb.get_empty_domain()
            for idx in kb.bits_to_indices(inputs):
                outputs |= pairs[idx]
        else:
            # No producer predicate: take the relation's full domains.
            inputs, outputs = kb.get_domains(label)

        self.domain = {self.input_var: inputs, self.output_var: outputs}
+ inputs, outputs = kb.get_domains(label) + + self.domain = {self.input_var: inputs, self.output_var: outputs} diff --git a/workflows/ilp/hedwig/core/rule.py b/workflows/ilp/hedwig/core/rule.py new file mode 100644 index 0000000..73a0824 --- /dev/null +++ b/workflows/ilp/hedwig/core/rule.py @@ -0,0 +1,417 @@ +''' +The rule class. + +@author: anze.vavpetic@ijs.si +''' +import json +from collections import defaultdict + +from hedwig.core.predicate import UnaryPredicate, BinaryPredicate +from hedwig.core.example import Example +from hedwig.core.helpers import avg, std +from hedwig.core.settings import DEFAULT_ANNOTATION_NAME + + +class Rule: + ''' + Represents a rule, along with its description, examples and statistics. + ''' + def __init__(self, kb, predicates=[], target=None): + self.predicates = predicates + self.kb = kb + self.covered_examples = kb.get_full_domain() + self.target_type = kb.target_type + self.target = target + + # Allow only unary predicates + for pred in predicates: + if isinstance(pred, UnaryPredicate): + self.covered_examples &= pred.domain[pred.input_var] + + self.head_var = None + if self.predicates: + self.head_var = self.predicates[0].input_var + + # Dictionary of predicates that share a certain variable + self.shared_var = {self.head_var: self.predicates} + + # Predicates that currently can be specialized + self.latest_var = self.head_var + + # Statistics + self.score = -1 + self.coverage = -1 + self.mean = -1 + self.sd = -1 + self.distribution = {} + self.__refresh_coverage() + self.__refresh_statistics() + + # Validation + self.pval = -1 + + def clone(self): + ''' + Returns a clone of this rule. The predicates themselves are NOT cloned. 
        '''
        # Shallow clone: predicate objects are shared, but the containers
        # (predicate list, shared_var lists) are copied so they can diverge.
        new_rule = Rule(self.kb, target=self.target)
        new_rule.predicates = self.predicates[:]
        new_rule.covered_examples = self.covered_examples
        new_rule.latest_var = self.latest_var
        new_rule.head_var = self.head_var
        new_rule.shared_var = {}
        for var in self.shared_var:
            new_rule.shared_var[var] = self.shared_var[var][:]
        return new_rule

    def clone_negate(self, target_pred):
        '''
        Returns a copy of this rule where 'target_pred' is negated.
        '''
        new_rule = self.clone()

        # Create the instance of the child pred
        producer_pred = target_pred.producer_predicate
        var_name = target_pred.input_var
        # Bitset complement of the members = negated membership.
        members = target_pred.domain[target_pred.input_var].copy()
        members.invert()
        neg_pred = UnaryPredicate(target_pred.label,
                                  members,
                                  self.kb,
                                  producer_pred=producer_pred,
                                  custom_var_name=var_name,
                                  negated=True)

        new_rule._replace_predicate(target_pred, neg_pred)
        return new_rule

    def clone_swap_with_subclass(self, target_pred, child_pred_label):
        '''
        Returns a copy of this rule where
        'target_pred' is swapped for 'child_pred_label'.
        '''
        new_rule = self.clone()

        # Create the instance of the child pred
        producer_pred = target_pred.producer_predicate
        var_name = target_pred.input_var
        child_pred = UnaryPredicate(child_pred_label,
                                    self.kb.get_members(child_pred_label),
                                    self.kb,
                                    producer_pred=producer_pred,
                                    custom_var_name=var_name)

        new_rule._replace_predicate(target_pred, child_pred)
        return new_rule

    def clone_append(self, predicate_label, producer_pred, bin=False):
        '''
        Returns a copy of this rule where 'predicate_label'
        is appended to the rule.
        '''
        # 'bin' selects between appending a unary or a binary predicate.
        if not bin:
            new_rule = self.clone()
            predicate = UnaryPredicate(predicate_label,
                                       self.kb.get_members(predicate_label),
                                       self.kb,
                                       producer_pred=producer_pred)
            new_rule.predicates.append(predicate)
            new_rule.shared_var[producer_pred.output_var].append(predicate)
        else:
            new_rule = self.clone()
            predicate = BinaryPredicate(predicate_label,
                                        self.kb.get_members(predicate_label),
                                        self.kb,
                                        producer_pred=producer_pred)
            new_rule.predicates.append(predicate)

            # Introduce new variable
            new_rule.shared_var[predicate.output_var] = [predicate]
            new_rule.shared_var[predicate.input_var].append(predicate)
            new_rule.latest_var = predicate.output_var

        new_rule.__refresh_coverage()
        new_rule.__refresh_statistics()
        return new_rule

    def _replace_predicate(self, target, replacement):
        '''
        Replaces 'target' with 'replacement' in the rule.
        '''
        Rule.__replace(self.predicates, target, replacement)
        self.covered_examples = self.covered_examples & \
            replacement.domain[replacement.input_var]

        # Reference possible consumers
        replacement.consumer_predicate = target.consumer_predicate

        # Update the backlinks
        if replacement.producer_predicate:
            replacement.producer_predicate.consumer_predicate = replacement
        if replacement.consumer_predicate:
            replacement.consumer_predicate.producer_predicate = replacement

        # Update the shared var list
        shared_list = self.shared_var[target.input_var]
        Rule.__replace(shared_list, target, replacement)

        # Recalc the covered examples and statistics
        self.__refresh_coverage()
        self.__refresh_statistics()

    @staticmethod
    def __replace(l, target, replacement):
        # In-place replacement of 'target' with 'replacement' in list 'l'.
        idx = l.index(target)
        l[idx] = replacement

    def __refresh_coverage(self):
        '''
        Recalculates the covered examples.
+ ''' + var = self.shared_var[self.head_var] + self.covered_examples = self.__covered_examples(var) + + def __covered_examples(self, predicates): + ''' + Recursively calculates the covered examples for a given set of + predicates that share a variable. + ''' + covered_examples = self.kb.get_full_domain() + for pred in predicates: + if isinstance(pred, BinaryPredicate): + + # Predicates that share the new variable, without 'pred' + shared = self.shared_var[pred.output_var][:] + shared.remove(pred) + existential_cov_examples = self.__covered_examples(shared) + reverse_members = self.kb.get_reverse_members(pred.label) + tmp_covered = self.kb.get_empty_domain() + + # Calculate all examples that have a pair for this relation + for idx in self.kb.bits_to_indices(existential_cov_examples): + if reverse_members.has_key(idx): + tmp_covered |= reverse_members[idx] + covered_examples &= tmp_covered + else: + covered_examples &= pred.domain[pred.input_var] + return covered_examples + + def __refresh_statistics(self): + ''' + Recalculates the statistics for this rule. + ''' + self.coverage = self.covered_examples.count() + + indices = self.kb.bits_to_indices(self.covered_examples) + ex_scores = [self.kb.get_score(idx) for idx in indices] + + if self.target_type == Example.Ranked: + self.mean = avg(ex_scores) + self.sd = std(ex_scores) + self.score = self.kb.score_fun(self) + else: + self.distribution = defaultdict(int) + for score in ex_scores: + self.distribution[score] += 1 + self.score = self.kb.score_fun(self) + + def similarity(self, rule): + ''' + Calculates the similarity between this rule and 'rule'. + ''' + intersection = (self.covered_examples & rule.covered_examples).count() + union = (self.covered_examples | rule.covered_examples).count() + if union == 0: + return 1 + else: + return intersection/float(union) + + def size(self): + ''' + Returns the number of conjunts. 
        '''
        return len(self.predicates)

    def examples(self, positive_only=False):
        '''
        Returns the covered examples.
        '''
        indices = self.kb.bits_to_indices(self.covered_examples)
        all_examples = [self.kb.examples[idx] for idx in indices]

        if positive_only:
            return filter(lambda ex: ex.score == self.target, all_examples)
        else:
            return all_examples

    @property
    def positives(self):
        # Number of covered examples labeled with the target class.
        return self.distribution[self.target]

    def precision(self):
        # Fraction of covered examples that are positive; 0 for empty coverage.
        if self.coverage:
            return self.positives / float(self.coverage)
        else:
            return 0

    def rule_report(self, show_uris=False, latex=False):
        '''
        Rule as string with some statistics.
        '''
        if latex:
            return self._latex_report()
        else:
            return self._plain_report(show_uris=show_uris)

    def _plain_report(self, show_uris=False, human=lambda label, rule: label):
        '''
        Plain text rule report
        '''
        s = self._plain_conjunctions(show_uris=show_uris, human=human) + ' ' + \
            self._plain_statistics()
        return s

    def _plain_conjunctions(self, show_uris=False,
                            human=lambda label, rule: label):
        # Renders the conjuncts; 'human' maps a label to a readable name.
        conjuncts = []
        for pred in self.predicates:

            label = pred.label
            # Strip the URI prefix unless full URIs were requested.
            if '#' in label and not show_uris:
                label = pred.label.split('#')[-1]
            label = human(label, self)

            if isinstance(pred, UnaryPredicate):
                anno_names = self.kb.annotation_name.get(pred.label, [DEFAULT_ANNOTATION_NAME])
                predicate_label = '_and_'.join(anno_names)

                if pred.negated:
                    predicate_label = '~' + predicate_label

                conj = '%s(%s, %s)' % (predicate_label, pred.input_var, label)
            else:
                conj = '%s(%s, %s)' % (label,
                                       pred.input_var,
                                       pred.output_var)
            conjuncts.append(conj)

        s = ', '.join(conjuncts)
        return s

    def _plain_statistics(self):
        if self.target_type == Example.ClassLabeled:
            stats = (self.coverage,
                     self.positives,
                     self.precision(),
                     self.kb.score_fun.__name__,
                     self.score,
                     self.pval)
            return '[cov=%d, pos=%d, prec=%.3f, %s=%.3f, pval=%.3f]' % stats

        else:
            # NOTE(review): prints coverage under the label 'size' -- confirm
            # this is intended (self.size() would be the conjunct count).
            return '[size=%d, score=%.3f]' % (self.coverage, self.score)

    def _latex_report(self):
        '''
        Latex rule report
        '''
        conjuncts = []
        for pred in self.predicates:

            label = pred.label
            if '#' in label:
                label = pred.label.split('#')[-1]

            if isinstance(pred, UnaryPredicate):
                if pred.negated:
                    label = r'$\neg$' + label
                conj = '%s(%s)' % (label, pred.input_var)
            else:
                conj = '%s(%s, %s)' % (label,
                                       pred.input_var,
                                       pred.output_var)
            conjuncts.append(conj)

        s = r' $\wedge$ '.join(conjuncts)

        return s

    def __str__(self):
        return self.rule_report(show_uris=False)

    @staticmethod
    def ruleset_report(rules, show_uris=False, latex=False,
                       human=lambda label, rule: label):
        if latex:
            return Rule._latex_ruleset_report(rules)
        else:
            return Rule._plain_ruleset_report(rules, show_uris=show_uris,
                                              human=human)

    @staticmethod
    def _latex_ruleset_report(rules):
        target, var = rules[0].target, rules[0].head_var
        if target:
            # NOTE(review): non-raw string containing '\l' -- works today,
            # but a raw string would be safer for the latex markup.
            head = '%s(%s) $\leftarrow$ ' % (target, var)
        else:
            head = ''

        _tex_report = \
            r'\begin{tabular}{clccccc}\hline' + '\n' \
            r'\textbf{\#} & \textbf{Rule} & \textbf{TP} & \textbf{FP} & \textbf{Precision} & \textbf{Lift} & \textbf{p-value}\\\hline' + '\n'

        # Rows are emitted best-score-first.
        for i, rule in enumerate(sorted(rules, key=lambda r: r.score, reverse=True)):
            rule_report = rule._latex_report()
            stats = (i+1,
                     head + rule_report,
                     rule.distribution[rule.target],
                     rule.coverage - rule.distribution[rule.target],
                     rule.distribution[rule.target]/float(rule.coverage),
                     rule.score,
                     rule.pval)
            _tex_report += r'%d & \texttt{%s} & %d & %d & %.2f & %.2f & %.3f\\' % stats
            _tex_report += '\n'

        _tex_report += \
            r'\hline' + '\n' \
            r'\end{tabular}' + '\n'

        return _tex_report

    @staticmethod
    def _plain_ruleset_report(rules, show_uris=False,
                              human=lambda label, rule: label):

        target, var = rules[0].target, rules[0].head_var
        if target:
            head = '\'%s\'(%s) <--\n\t' % (target, var)
        else:
            head = ''

        ruleset = []
        for rule in sorted(rules, key=lambda r: r.score, reverse=True):
            rule = rule._plain_report(show_uris=show_uris, human=human)
            ruleset.append(rule)

        return head + '\n\t'.join(ruleset)

    @staticmethod
    def ruleset_examples_json(rules_per_target, show_uris=False):
        # Maps each target class to (conjunction string, example labels) pairs.
        examples_output = []
        for target_class, rules in rules_per_target:
            class_examples = []
            for _, rule in enumerate(sorted(rules, key=lambda r: r.score,
                                            reverse=True)):
                examples = rule.examples()
                class_examples.append((rule._plain_conjunctions(),
                                       [ex.label for ex in examples]))
            examples_output.append((target_class, class_examples))
        return examples_output

    @staticmethod
    def to_json(rules_per_target, show_uris=False):
        # Serializes {target: [rule strings]} as pretty-printed JSON.
        results = {}
        for target, rules in rules_per_target:
            results[target] = [str(rule) for rule in rules]

        return json.dumps(results, indent=2)
diff --git a/workflows/ilp/hedwig/core/settings.py b/workflows/ilp/hedwig/core/settings.py
new file mode 100644
index 0000000..716be2c
--- /dev/null
+++ b/workflows/ilp/hedwig/core/settings.py
@@ -0,0 +1,53 @@
'''
Global settings file.

@author: anze.vavpetic@ijs.si
'''
import os
import logging
from rdflib import Namespace

VERSION = '0.3.0'
DESCRIPTION = '''Hedwig semantic pattern mining (anze.vavpetic@ijs.si)'''

# Logging setup
# NOTE(review): the handler is attached at import time; if the root logger
# is configured as well, each message will be emitted twice -- confirm.
logger = logging.getLogger("Hedwig")
ch = logging.StreamHandler()
formatter = logging.Formatter("%(name)s %(levelname)s: %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)

# Pre-defined assets path
PAR_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
ASSETS_DIR = os.path.abspath(os.path.join(PAR_DIR, 'assets'))
EXAMPLE_SCHEMA = os.path.join(ASSETS_DIR, 'builtin.n3')

# Built-in namespaces
W3C = Namespace('http://www.w3.org/')
HEDWIG = Namespace('http://kt.ijs.si/hedwig#')
DEFAULT_ANNOTATION_NAME = 'annotated_with'
GENERIC_NAMESPACE = Namespace('http://kt.ijs.si/ontology/generic#')

INPUT_FORMATS = ['n3', 'xml', 'ntriples', 'trix', 'csv']

# Defaults
class Defaults:
    # Default values for all command-line / widget parameters.
    FORMAT = INPUT_FORMATS[0]
    OUTPUT = None
    COVERED = None
    MODE = 'subgroups'
    TARGET = None
    SCORE = 'lift'
    NEGATIONS = False
    ALPHA = 0.05
    ADJUST = 'fwer'
    FDR_Q = 0.05
    LEAVES = False
    LEARNER = 'heuristic'
    OPTIMAL_SUBCLASS = False
    URIS = False
    BEAM_SIZE = 20
    SUPPORT = 0.1
    DEPTH = 5
    NO_CACHE = False
    VERBOSE = False
diff --git a/workflows/ilp/hedwig/learners/__init__.py b/workflows/ilp/hedwig/learners/__init__.py
new file mode 100644
index 0000000..f610d4b
--- /dev/null
+++ b/workflows/ilp/hedwig/learners/__init__.py
@@ -0,0 +1,4 @@
from hedwig.learners.learner import Learner as HeuristicLearner
from hedwig.learners.optimal import OptimalLearner

__all__ = ["HeuristicLearner", "OptimalLearner"]
diff --git a/workflows/ilp/hedwig/learners/bottomup.py b/workflows/ilp/hedwig/learners/bottomup.py
new file mode 100644
index 0000000..4cd7711
--- /dev/null
+++ b/workflows/ilp/hedwig/learners/bottomup.py
@@ -0,0 +1,84 @@
'''
Main learner class.
+ +@author: anze.vavpetic@ijs.si +''' +from collections import defaultdict + +from hedwig.core import UnaryPredicate, Rule, Example +from hedwig.core.settings import logger +from hedwig.stats.significance import is_redundant +from hedwig.stats.scorefunctions import interesting + + +class BottomUpLearner: + ''' + Bottom-up learner. + ''' + Similarity = 'similarity' + Improvement = 'improvement' + Default = 'default' + + def __init__(self, kb, n=None, min_sup=1, sim=1, depth=4, target=None, + use_negations=False): + self.kb = kb + self.n = n # Beam length + self.min_sup = min_sup + self.sim = sim + self.extending = Learner.Improvement + self.depth = depth # Max number of conjunctions + self.use_negations = use_negations + + if kb.is_discrete_target(): + self.target = list(self.kb.class_values)[0] if not target else target + else: + self.target = None + + self.pruned_subclasses = self._pruned_subclasses() + self.pruned_superclasses_closure = self._pruned_superclasses() + self.implicit_roots = self._implicit_roots() + + def _pruned_subclasses(self): + min_sup = lambda pred: self.kb.n_members(pred) >= self.min_sup + pruned_subclasses = {} + for pred in self.kb.predicates: + subclasses = self.kb.get_subclasses(pred) + pruned_subclasses[pred] = filter(min_sup, subclasses) + + return pruned_subclasses + + def _pruned_superclasses(self): + min_sup = lambda pred: self.kb.n_members(pred) >= self.min_sup + pruned_superclasses = {} + for pred in self.kb.predicates: + superclasses = self.kb.super_classes(pred) + pruned_superclasses[pred] = filter(min_sup, superclasses) + + return pruned_superclasses + + def _implicit_roots(self): + implicit_roots = set() + n_examples = self.kb.n_examples() + for pred in self.kb.predicates: + if self.kb.n_members(pred) == n_examples: + implicit_roots.add(pred) + + return implicit_roots + + def get_subclasses(self, pred): + return self.pruned_subclasses[pred.label] + + def get_superclasses(self, pred): + return 
self.pruned_superclasses_closure[pred] + + def is_implicit_root(self, pred): + return pred in self.implicit_roots + + def induce(self): + ''' + Induces rules for the given knowledge base. + ''' + pass + + def bottom_clause(self): + pass diff --git a/workflows/ilp/hedwig/learners/learner.py b/workflows/ilp/hedwig/learners/learner.py new file mode 100755 index 0000000..e72c5e7 --- /dev/null +++ b/workflows/ilp/hedwig/learners/learner.py @@ -0,0 +1,288 @@ +''' +Main learner class. + +@author: anze.vavpetic@ijs.si +''' +from collections import defaultdict + +from hedwig.core import UnaryPredicate, Rule, Example +from hedwig.core.settings import logger +from hedwig.stats.significance import is_redundant +from hedwig.stats.scorefunctions import interesting + + +class Learner: + ''' + Learner class, supporting various types of induction + from the knowledge base. + + TODO: + - bottom clause approach + - feature construction + ''' + Similarity = 'similarity' + Improvement = 'improvement' + Default = 'default' + + def __init__(self, kb, n=None, min_sup=1, sim=1, depth=4, target=None, + use_negations=False, optimal_subclass=False): + self.kb = kb + self.n = n # Beam length + self.min_sup = min_sup + self.sim = sim + self.extending = Learner.Improvement + self.depth = depth # Max number of conjunctions + self.use_negations = use_negations + self.optimal_subclass = optimal_subclass + + if kb.is_discrete_target(): + self.target = list(self.kb.class_values)[0] if not target else target + else: + self.target = None + + self.pruned_subclasses = self._pruned_subclasses() + self.pruned_superclasses_closure = self._pruned_superclasses() + self.implicit_roots = self._implicit_roots() + + def _pruned_subclasses(self): + min_sup = lambda pred: self.kb.n_members(pred) >= self.min_sup + pruned_subclasses = {} + for pred in self.kb.predicates: + subclasses = self.kb.get_subclasses(pred) + pruned_subclasses[pred] = filter(min_sup, subclasses) + + return pruned_subclasses + + def 
_pruned_superclasses(self):
        # Keep only superclasses with at least 'min_sup' members.
        min_sup = lambda pred: self.kb.n_members(pred) >= self.min_sup
        pruned_superclasses = {}
        for pred in self.kb.predicates:
            superclasses = self.kb.super_classes(pred)
            pruned_superclasses[pred] = filter(min_sup, superclasses)

        return pruned_superclasses

    def _implicit_roots(self):
        # Predicates covering every example act as implicit hierarchy roots.
        implicit_roots = set()
        n_examples = self.kb.n_examples()
        for pred in self.kb.predicates:
            if self.kb.n_members(pred) == n_examples:
                implicit_roots.add(pred)

        return implicit_roots

    def get_subclasses(self, pred):
        return self.pruned_subclasses[pred.label]

    def get_superclasses(self, pred):
        return self.pruned_superclasses_closure[pred]

    def is_implicit_root(self, pred):
        return pred in self.implicit_roots

    def induce(self):
        '''
        Induces rules for the given knowledge base.
        '''
        # Start the beam from the hierarchy root and keep only rules whose
        # score lies in the score function's interesting interval.
        root_pred = self.kb.get_root()
        rules = [Rule(self.kb, predicates=[root_pred], target=self.target)]
        rules = self.__induce_level(rules)
        return filter(interesting, rules)

    def __induce_level(self, rules):
        '''
        Specializes the rules for the last level with unary predicates.
        '''
        while True:
            old_score = self.group_score(rules)
            new_rules = rules[:]
            for i, rule in enumerate(rules):
                specializations = self.specialize(rule)
                self.extend(new_rules, specializations)

            # Take the first N rules
            rules = sorted(new_rules,
                           key=lambda rule: rule.score,
                           reverse=True)[:self.n]

            new_score = self.group_score(rules)

            logger.debug("Old score: %.3f, New score: %.3f" % (old_score, new_score))

            # Stop once the aggregate beam score stops improving (within ~1%).
            if 1 - abs(old_score/(new_score+0.0001)) < 0.01:
                break

        return rules

    def extend(self, rules, specializations):
        '''
        Extends the ruleset in the given way.
        '''
        # Dispatch on the configured beam-extension strategy.
        if self.extending == Learner.Default:
            return rules.extend(specializations)
        elif self.extending == Learner.Improvement:
            return self.extend_replace_worst(rules, specializations)
        elif self.extending == Learner.Similarity:
            return self.extend_with_similarity(rules, specializations)

    def extend_with_similarity(self, rules, specializations):
        '''
        Extends the list based on how similar is 'new_rule'
        to the rules contained in 'rules'.
        '''
        for new_rule in specializations:
            tmp_rules = rules[:]
            for rule in tmp_rules:
                sim = rule.similarity(new_rule)
                # Skip candidates too similar to the beam once it is half full.
                if sim >= self.sim and len(rules) > 0.5*self.n:
                    break
            else:
                rules.append(new_rule)

    def extend_replace_worst(self, rules, specializations):
        '''
        Extends the list by replacing the worst rules.
        '''
        def is_similar(new_rule):
            # True if a rule with identical coverage is already in the beam.
            for rule in rules[:]:
                if rule.similarity(new_rule) == 1:
                    return True
            return False

        improved = False
        for new_rule in sorted(specializations, key=lambda rule: rule.score):
            # NOTE(review): indexing sorted(rules)[0] assumes a non-empty
            # beam -- confirm callers never pass an empty list.
            worst = sorted(rules, key=lambda rule: rule.score)[0]
            if len(rules) < self.n:
                rules.append(new_rule)
                improved = True
            elif new_rule.score > worst.score and not is_similar(new_rule):
                self._replace(rules, worst, new_rule)
                improved = True
        return improved

    def _replace(self, rules, worst, new_rule):
        # Swap 'worst' for 'new_rule' in place.
        idx = rules.index(worst)
        rules[idx] = new_rule

    def specialize(self, rule):
        '''
        Returns a list of all specializations of 'rule'.
        '''
        is_unary = lambda p: isinstance(p, UnaryPredicate)

        def specialize_optimal_subclass(rule):
            # Recursively swap predicates with all (transitive) subclasses.
            rules = []
            eligible_preds = rule.shared_var[rule.latest_var]
            for pred in filter(is_unary, eligible_preds):
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        rules.append(new_rule)
                        rules.extend(specialize_optimal_subclass(new_rule))
            return rules

        logger.debug('Specializing rule: %s' % rule)
        specializations = []
        eligible_preds = rule.shared_var[rule.latest_var]

        # Swapping unary predicates with subclasses, swap only
        # the predicates with the latest variable
        if not self.optimal_subclass:
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to swap: %s' % pred.label)
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        specializations.append(new_rule)
        else:
            specializations.extend(specialize_optimal_subclass(rule))

        if self.use_negations:
            # Negate the last predicate
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to negate: %s' % pred.label)
                new_rule = rule.clone_negate(pred)
                if self.can_specialize(new_rule):
                    specializations.append(new_rule)

        # This makes sure we are not specializing a default rule by appending,
        # this rule should instead be reached by the specialization step above.
        if not (len(eligible_preds) == 1 and
                (eligible_preds[0].label == self.kb.get_root().label or
                 self.is_implicit_root(eligible_preds[0].label))):

            # Calculate the union of superclasses of each predicate
            supers = set()
            for pred in eligible_preds:
                supers.update(self.get_superclasses(pred.label))
                supers.add(pred)

            # Calculate the top-most left-most non-ancestor
            for lvl in sorted(self.kb.levels.keys()):

                level = self.kb.levels[lvl]
                diff = level.difference(supers)
                if diff:

                    # The next predicate to specialize with is the left-most
                    for pred in sorted(list(diff)):

                        # Appending a new predicate, the last predicate
                        # is always the producer
                        last_pred = rule.predicates[-1]
                        new_rule = rule.clone_append(pred,
                                                     producer_pred=last_pred)
                        if self.can_specialize(new_rule) and \
                           self.non_redundant(rule, new_rule):
                            specializations.append(new_rule)
                        break

        # Introduce new binary relation
        if isinstance(rule.predicates[-1], UnaryPredicate):
            specializations.extend(self.specialize_add_relation(rule))

        logger.debug('All specializations %s'
                     % [str(rule) for rule in specializations])

        return specializations

    def specialize_add_relation(self, rule):
        '''
        Specialize with new binary relation.
        '''
        specializations = []
        for pred in self.kb.binary_predicates:

            last_pred = rule.predicates[-1]
            new_rule = rule.clone_append(pred, producer_pred=last_pred,
                                         bin=True)

            if self.can_specialize(new_rule):
                specializations.append(new_rule)
        return specializations

    def can_specialize(self, rule):
        '''
        Is the rule good enough to be further refined?
        '''
        return rule.coverage >= self.min_sup and rule.size() <= self.depth

    def non_redundant(self, rule, new_rule):
        '''
        Is the rule non-redundant compared to its immediate generalization?
+ ''' + if new_rule.score < rule.score: + return False + + if rule.target_type == Example.Ranked: + return True + else: + return not is_redundant(rule, new_rule) + + def group_score(self, rules): + ''' + Calculates the score of the whole list of rules. + ''' + return sum([rule.score for rule in rules]) diff --git a/workflows/ilp/hedwig/learners/optimal.py b/workflows/ilp/hedwig/learners/optimal.py new file mode 100755 index 0000000..e979e37 --- /dev/null +++ b/workflows/ilp/hedwig/learners/optimal.py @@ -0,0 +1,51 @@ +''' +Main learner class. + +@author: anze.vavpetic@ijs.si +''' +from collections import defaultdict +from itertools import combinations + +from hedwig.core import UnaryPredicate, Rule, Example +from hedwig.core.settings import logger +from hedwig.stats.significance import is_redundant +from hedwig.stats.scorefunctions import interesting + +from learner import Learner + + +class OptimalLearner(Learner): + ''' + Finds the optimal top-k rules. + ''' + def __init__(self, kb, n=None, min_sup=1, sim=1, depth=4, target=None, + use_negations=False, optimal_subclass=True): + Learner.__init__(self, kb, n=n, min_sup=min_sup, sim=sim, depth=depth, + target=target, use_negations=use_negations) + + def induce(self): + ''' + Induces rules for the given knowledge base. 
        '''
        # Enumerate every combination of supported predicates up to 'depth'
        # conjuncts, score them all, and keep the top n.
        kb = self.kb
        has_min_sup = lambda pred: kb.get_members(pred).count() >= self.min_sup
        all_predicates = filter(has_min_sup, kb.predicates)
        rules = []
        for depth in range(1, self.depth+1):
            for attrs in combinations(all_predicates, depth):
                rule = Rule(kb, predicates=self._labels_to_predicates(attrs),
                            target=self.target)
                rules.append(rule)
        rules = sorted(rules, key=lambda r: r.score, reverse=True)
        return rules[:self.n]

    def _labels_to_predicates(self, labels):
        # Chain the labels into unary predicates sharing the variable 'X'.
        predicates = []
        producer_pred = None
        for label in labels:
            members = self.kb.get_members(label)
            predicates.append(UnaryPredicate(label, members, self.kb,
                                             producer_pred=producer_pred,
                                             custom_var_name='X'))
            producer_pred = predicates[-1]
        return predicates
diff --git a/workflows/ilp/hedwig/stats/__init__.py b/workflows/ilp/hedwig/stats/__init__.py
new file mode 100644
index 0000000..e6098f9
--- /dev/null
+++ b/workflows/ilp/hedwig/stats/__init__.py
@@ -0,0 +1,6 @@
from hedwig.stats import scorefunctions
from hedwig.stats import adjustment
from hedwig.stats import significance
from hedwig.stats.validate import Validate

__all__ = ["scorefunctions", "adjustment", "significance", "Validate"]
diff --git a/workflows/ilp/hedwig/stats/adjustment.py b/workflows/ilp/hedwig/stats/adjustment.py
new file mode 100644
index 0000000..0a23fb3
--- /dev/null
+++ b/workflows/ilp/hedwig/stats/adjustment.py
@@ -0,0 +1,44 @@
'''
Multiple-testing adjustment methods.

@author: anze.vavpetic@ijs.si
'''


def _holdout(ruleset):
    '''
    TODO: The holdout approach.
    '''
    return ruleset


def fwer(ruleset, alpha=0.05):
    '''
    The Holm-Bonferroni direct adjustment method to control the FWER.
    '''
    # Keep the longest prefix of p-value-sorted rules satisfying
    # p_k <= alpha / (m - k) (with k zero-based).
    m = float(len(ruleset))
    ruleset = sorted(ruleset, key=lambda r: r.pval)
    for k, rule in enumerate(ruleset):
        if rule.pval > alpha/(m + 1 - (k + 1)):
            ruleset = ruleset[:k]
            break

    return ruleset


def fdr(ruleset, q=0.05):
    '''
    The Benjamini-Hochberg-Yekutieli direct adjustment
    method to control the FDR.
    '''
    # NOTE(review): this is the plain Benjamini-Hochberg step-up rule;
    # the BHY variant divides by an extra harmonic-sum factor -- confirm
    # which method is intended.
    m = float(len(ruleset))
    ruleset = sorted(ruleset, key=lambda r: r.pval)
    for k, rule in enumerate(ruleset):
        if rule.pval > ((k + 1)*q)/m:
            ruleset = ruleset[:k]
            break

    return ruleset

def none(ruleset):
    # No adjustment.
    return ruleset
diff --git a/workflows/ilp/hedwig/stats/scorefunctions.py b/workflows/ilp/hedwig/stats/scorefunctions.py
new file mode 100644
index 0000000..c097040
--- /dev/null
+++ b/workflows/ilp/hedwig/stats/scorefunctions.py
@@ -0,0 +1,147 @@
'''
Score function definitions.

@author: anze.vavpetic@ijs.si
'''
from math import sqrt


def z_score(rule):
    # z-statistic of the rule's mean vs. the population mean.
    return sqrt(rule.coverage) * (rule.mean - rule.kb.mean) / rule.kb.sd


def t_score(rule):
    # t-statistic: like z_score but uses the rule's own sd.
    return sqrt(rule.coverage) * (rule.mean - rule.kb.mean) / rule.sd


def enrichment_score(rule):
    # The enrichment score of a rule covering all examples is 1
    if rule.coverage == rule.kb.n_examples():
        return 1.0
    if rule.coverage == 0:
        return - 1 / float(rule.kb.n_examples())
    increment = {}
    incr1 = 1 / float(rule.coverage)
    incr2 = 1 / float(rule.kb.n_examples() - rule.coverage)
    max_diff = 0
    # All examples
    for ex in rule.kb.examples:
        increment[ex] = -incr2
    for ex in rule.examples():
        increment[ex] = incr1
    # Running-sum maximum over the (ranked) example list.
    partial = 0
    for ex in rule.kb.examples:
        partial += increment[ex]
        if partial > max_diff:
            max_diff = partial
    return max_diff


def wracc(rule):
    # Weighted relative accuracy: coverage * (precision - prior).
    nX = rule.coverage
    N = len(rule.kb.examples)
    nXY = rule.distribution[rule.target]
    nY = rule.kb.distribution[rule.target]
    if nX:
        return nX / float(N) * (nXY/float(nX) - nY/float(N))
    else:
        return 0


def precision(rule):
    nX = rule.coverage
    nXY =
rule.distribution[rule.target] + if nX: + return nXY/float(nX) + else: + return 0 + + +def chisq(rule): + N = len(rule.kb.examples) + z = rule.distribution[rule.target]/float(N) + x = rule.coverage/float(N) + y = rule.kb.distribution[rule.target]/float(N) + if x not in [0, 1] and y not in [0, 1]: + return N*(z - x*y)**2 / float(x*y*(1 - x)*(1 - y)) + else: + return 0 + + +def lift(rule): + nX = float(rule.coverage) + N = float(len(rule.kb.examples)) + nXY = rule.distribution[rule.target] + nY = rule.kb.distribution[rule.target] + + if nX != 0 and N != 0: + return (nXY/nX)/(nY/N) + else: + return 0 + + +def leverage(rule): + nX = float(rule.coverage) + N = float(len(rule.kb.examples)) + nXY = rule.distribution[rule.target] + nY = rule.kb.distribution[rule.target] + + if N != 0: + return nXY/N - (nX/N)*(nY/N) + else: + return 0 + + +def kaplan_meier_AUC(rule): + examples = rule.kb.bits_to_indices(rule.covered_examples) + n_examples = float(len(examples)) + + if n_examples == 0: + return 0.0 + + def n_alive(examples, day): + + def is_alive(ex): + return rule.kb.get_score(ex) > day + + return len(filter(is_alive, examples)) + + auc, day = 0, 0 + prev = -1 + while True: + alive = n_alive(examples, day) + day += 14 + + curr = alive/n_examples + if prev != -1: + auc += (prev + curr)/2.0 + prev = curr + + if alive == 0: + break + + #print auc + return auc + + +# Bounds of interest for each score function +# A rule is interesting if its score is in (A, B] as defined below. 
+_bounds = { + z_score: (0, float('inf')), + t_score: (0, float('inf')), + kaplan_meier_AUC: (0, float('inf')), + enrichment_score: (-float('inf'), 1), + wracc: (0, 1), + precision: (0, 1), + chisq: (0, float('inf')), + lift: (1, float('inf')), + leverage: (0, 1) +} + + +def interesting(rule): + ''' + Checks if a given rule is interesting for the given score function + ''' + score_fun = rule.kb.score_fun + return _bounds[score_fun][0] < rule.score <= _bounds[score_fun][1] diff --git a/workflows/ilp/hedwig/stats/significance.py b/workflows/ilp/hedwig/stats/significance.py new file mode 100644 index 0000000..8ede3bc --- /dev/null +++ b/workflows/ilp/hedwig/stats/significance.py @@ -0,0 +1,46 @@ +''' +Significance testing methods. + +@author: anze.vavpetic@ijs.si +''' +import scipy.stats as st + + +def is_redundant(rule, new_rule): + ''' + Computes the redundancy coefficient of a new rule compared to its + immediate generalization. + + Rules with a coeff > 1 are deemed non-redundant. + ''' + return _fisher(new_rule, 'greater') > _fisher(rule, 'greater') + + +def fisher(rule): + ''' + Fisher's p-value for one rule. + ''' + return _fisher(rule, 'two-sided') + +def _fisher(rule, alternative): + ''' + Fisher's p-value for one rule. + fisher.two_tail ==> alternative = 'two-sided' + fisher.left_tail ==> alternative = 'less' + fisher.right_tail ==> alternative = 'greater' + ''' + N = float(len(rule.kb.examples)) + nX = float(rule.coverage) + nY = rule.kb.distribution[rule.target] + nXY = rule.distribution[rule.target] + nXnotY = nX - nXY + nnotXY = nY - nXY + nnotXnotY = N - nXnotY - nnotXY + return st.fisher_exact([[nXY, nXnotY], [nnotXY, nnotXnotY]], alternative=alternative)[1] + +def apply_fisher(ruleset): + ''' + Fisher's exact test to test rule significance. 
+ ''' + for rule in ruleset: + rule.pval = fisher(rule) diff --git a/workflows/ilp/hedwig/stats/validate.py b/workflows/ilp/hedwig/stats/validate.py new file mode 100644 index 0000000..e192600 --- /dev/null +++ b/workflows/ilp/hedwig/stats/validate.py @@ -0,0 +1,28 @@ +''' +Module for ruleset validation. + +@author: anze.vavpetic@ijs.si +''' +from adjustment import fdr +from significance import apply_fisher + + +class Validate: + + def __init__(self, kb, significance_test=apply_fisher, adjustment=fdr): + self.kb = kb + self.significance_test = significance_test + self.adjustment = adjustment + + def test(self, ruleset, alpha=0.05, q=0.01): + ''' + Tests the given ruleset and returns the significant rules. + ''' + self.significance_test(ruleset) + + if self.adjustment.__name__ == 'fdr': + ruleset = self.adjustment(ruleset, q=q) + elif self.adjustment.__name__ == 'fwer': + ruleset = self.adjustment(ruleset, alpha=alpha) + + return ruleset diff --git a/workflows/ilp/library.py b/workflows/ilp/library.py index 5d558b5..06bf73c 100644 --- a/workflows/ilp/library.py +++ b/workflows/ilp/library.py @@ -12,6 +12,10 @@ from services.webservice import WebService +import sys +import os +sys.path.append(os.path.join(os.path.dirname(__file__))) + def ilp_aleph(input_dict): aleph = Aleph() settings = input_dict['settings'] @@ -124,3 +128,52 @@ def ilp_treeliker(input_dict): treeliker = TreeLiker(dataset, template, settings=settings) arff_train, arff_test = treeliker.run() return {'arff': arff_train, 'treeliker': treeliker} + + +def ilp_hedwig(input_dict): + import hedwig + + format = input_dict['format'] + suffix = '.' 
+ format
+    bk_suffix = suffix
+    if format == 'csv':
+        bk_suffix = '.tsv'
+    # Writes examples file
+    data_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    data_file.write(input_dict['examples'])
+    data_file.close()
+
+    # Write BK files to BK dir
+    bk_dir = tempfile.mkdtemp()
+    if format == 'csv':
+        suffix = 'tsv'
+    for bk_file in input_dict['bk_file']:
+        tmp_bk_file = tempfile.NamedTemporaryFile(delete=False, dir=bk_dir, suffix=bk_suffix)
+        tmp_bk_file.write(bk_file)
+        tmp_bk_file.close()
+
+    output_file = tempfile.NamedTemporaryFile(delete=False)
+    hedwig.run({
+        'bk_dir': bk_dir,
+        'data': data_file.name,
+        'format': format,
+        'output': output_file.name,
+        'mode': 'subgroups',
+        'target': input_dict['target'] if 'target' in input_dict else None,
+        'score': input_dict['score'],
+        'negations': input_dict['negations'] == 'true',
+        'alpha': float(input_dict['alpha']),
+        'adjust': input_dict['adjust'],
+        'FDR': float(input_dict['fdr']),
+        'leaves': input_dict['leaves'] == 'true',
+        'learner': 'heuristic',
+        'optimalsubclass': input_dict['optimalsubclass'] == 'true',
+        'uris': input_dict['uris'] == 'true',
+        'beam': int(input_dict['beam']),
+        'support': float(input_dict['support']),
+        'depth': int(input_dict['depth']),
+        'nocache': True,
+        'covered': None
+    })
+    rules = open(output_file.name).read()
+    return {'rules': rules}
diff --git a/workflows/ilp/package_data/widgets/cfe8f700-0883-4386-9e2b-2fb61b9771c2.json b/workflows/ilp/package_data/widgets/cfe8f700-0883-4386-9e2b-2fb61b9771c2.json
new file mode 100644
index 0000000..a3ee002
--- /dev/null
+++ b/workflows/ilp/package_data/widgets/cfe8f700-0883-4386-9e2b-2fb61b9771c2.json
@@ -0,0 +1,375 @@
+[
+    {
+        "model": "workflows.abstractwidget",
+        "fields": {
+            "category": "ed859be5-cc13-46b9-b249-c8f026732c1c",
+            "treeview_image": "",
+            "uid": "cfe8f700-0883-4386-9e2b-2fb61b9771c2",
+            "windows_queue": false,
+            "package": "ilp",
+            "interaction_view": "",
+            "has_progress_bar": false,
+            "image": "",
"description": "A subgroup discovery tool that can use ontological domain knowledge (RDF graphs) in the learning process. Subgroup descriptions contain terms from the given domain knowledge and enable potentially better generalizations.", + "static_image": "ilp.png", + "action": "ilp_hedwig", + "visualization_view": "", + "streaming_visualization_view": "", + "post_interact_action": "", + "wsdl_method": "", + "wsdl": "", + "interactive": false, + "is_streaming": false, + "order": 1, + "name": "Hedwig" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "depth", + "short_name": "dep", + "default": "5", + "description": "Maximum number of conjunctions in a rule", + "required": false, + "multi": false, + "parameter_type": "text", + "variable": "depth", + "parameter": true, + "order": 1, + "uid": "0f214668-dc30-4d2b-9f26-f0b004616dae" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "format", + "short_name": "fmt", + "default": "n3", + "description": "Input file format", + "required": false, + "multi": false, + "parameter_type": "select", + "variable": "format", + "parameter": true, + "order": 3, + "uid": "1b0ced89-a266-49b7-9694-9f2253a745c3" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "leaves", + "short_name": "lvs", + "default": "true", + "description": "Instance names in rule conjunctions", + "required": false, + "multi": false, + "parameter_type": "checkbox", + "variable": "leaves", + "parameter": true, + "order": 10, + "uid": "200f7435-9efd-4a49-bb39-e3ffab23cdfa" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "uris", + "short_name": "uri", + "default": "false", + "description": "Show URIs in conjunctions", + "required": false, + "multi": false, + 
"parameter_type": "checkbox", + "variable": "uris", + "parameter": true, + "order": 1, + "uid": "2a2c0dfd-6f24-406a-8fba-6846d5f7713f" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "FDR", + "short_name": "fdr", + "default": "0.05", + "description": "Max false discovery rate (only for adjust=fdr)", + "required": false, + "multi": false, + "parameter_type": "text", + "variable": "fdr", + "parameter": true, + "order": 9, + "uid": "2f4f6d37-ea63-478f-8174-0baa96340784" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "score", + "short_name": "scr", + "default": "lift", + "description": "Rule score function", + "required": false, + "multi": false, + "parameter_type": "select", + "variable": "score", + "parameter": true, + "order": 5, + "uid": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "bk_file", + "short_name": "bk", + "default": "", + "description": "Background knowledge file", + "required": true, + "multi": true, + "parameter_type": null, + "variable": "bk_file", + "parameter": false, + "order": 2, + "uid": "382028da-2afa-4316-8f07-5d61ab092b9d" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "optimalsubclass", + "short_name": "osc", + "default": "false", + "description": "Optimal subclass", + "required": false, + "multi": false, + "parameter_type": "checkbox", + "variable": "optimalsubclass", + "parameter": true, + "order": 1, + "uid": "39cf2072-e286-47f5-9703-e9ce9187e735" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "examples", + "short_name": "ex", + "default": "", + "description": "Examples", + "required": true, + "multi": 
false, + "parameter_type": null, + "variable": "examples", + "parameter": false, + "order": 1, + "uid": "40c101b9-d6e2-4952-9a7f-a78de0918916" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "adjust", + "short_name": "adj", + "default": "fwer", + "description": "Adjustment method for multiple-testing problem", + "required": false, + "multi": false, + "parameter_type": "select", + "variable": "adjust", + "parameter": true, + "order": 8, + "uid": "43a6fca9-6ec1-4565-946b-8cd9505f7ab8" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "support", + "short_name": "sup", + "default": "0.1", + "description": "Minimum rule support", + "required": false, + "multi": false, + "parameter_type": "text", + "variable": "support", + "parameter": true, + "order": 1, + "uid": "6aecfd1e-e502-4532-92d5-6753891a9e76" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "beam", + "short_name": "bm", + "default": "20", + "description": "Beam size", + "required": false, + "multi": false, + "parameter_type": "text", + "variable": "beam", + "parameter": true, + "order": 1, + "uid": "73524367-577d-438d-8f36-2df4165b9c50" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "alpha", + "short_name": "alp", + "default": "0.05", + "description": "P-value threshold", + "required": false, + "multi": false, + "parameter_type": "text", + "variable": "alpha", + "parameter": true, + "order": 7, + "uid": "84f4348f-7aa8-4b17-896b-2b0725f46dfc" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "target", + "short_name": "tgt", + "default": "", + "description": "Target class label", + "required": false, + "multi": false, + 
"parameter_type": "text", + "variable": "target", + "parameter": true, + "order": 4, + "uid": "ba809eb6-fb85-4866-9922-4cb2fd5a56bf" + } + }, + { + "model": "workflows.abstractinput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "negations", + "short_name": "neg", + "default": "", + "description": "Use negations", + "required": false, + "multi": false, + "parameter_type": "checkbox", + "variable": "negations", + "parameter": true, + "order": 6, + "uid": "f97c9900-5d30-4dde-9908-9f64d142e6e9" + } + }, + { + "model": "workflows.abstractoutput", + "fields": { + "widget": "cfe8f700-0883-4386-9e2b-2fb61b9771c2", + "name": "rules", + "short_name": "rls", + "description": "Output rules", + "variable": "rules", + "order": 1, + "uid": "cde31e50-c87a-412e-b28c-4cbfbc7ce92d" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "Precision", + "uid": "4e84e71b-fd5d-4197-8f67-4c7ae92d12cc", + "value": "precision", + "abstract_input": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "WRAcc", + "uid": "4fa86270-633b-4201-bc31-3b937164c88d", + "value": "wracc", + "abstract_input": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "csv", + "uid": "52d86a1d-e931-4785-be60-6303b7e22713", + "value": "csv", + "abstract_input": "1b0ced89-a266-49b7-9694-9f2253a745c3" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "n3", + "uid": "6b1711b0-ffac-48f6-b01a-a70c3f2754b2", + "value": "n3", + "abstract_input": "1b0ced89-a266-49b7-9694-9f2253a745c3" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "Lift", + "uid": "75c2e615-bfe7-407c-80a6-97058aa0281e", + "value": "lift", + "abstract_input": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "Leverage", + "uid": 
"7d34a79f-bba7-4b7d-919f-8490e7221adf", + "value": "leverage", + "abstract_input": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "ChiSq", + "uid": "9973f1ff-fe21-46d9-9cf3-6c95d30a5a86", + "value": "chisq", + "abstract_input": "30cbcd41-aa1f-44fd-a89f-be43d24e7bc0" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "Familywise Error Rate", + "uid": "cf0ae2bd-e009-46ed-a38f-06a6aad8c13f", + "value": "fwer", + "abstract_input": "43a6fca9-6ec1-4565-946b-8cd9505f7ab8" + } + }, + { + "model": "workflows.abstractoption", + "fields": { + "name": "False Discovery Rate", + "uid": "fff8fec9-7051-4a78-b46a-60379d73b82e", + "value": "fdr", + "abstract_input": "43a6fca9-6ec1-4565-946b-8cd9505f7ab8" + } + } +] \ No newline at end of file