Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds Hedwig widget #10

Merged
merged 1 commit into from
May 3, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Adds Hedwig widget
  • Loading branch information
Anze Vavpetic committed Apr 21, 2016
commit 60072a6a26f232c5338c214533edd551041f8c62
152 changes: 152 additions & 0 deletions workflows/ilp/hedwig/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
import time
from datetime import datetime
import logging
import json

from hedwig.core import ExperimentKB, Rule
from hedwig.learners import HeuristicLearner, OptimalLearner
from hedwig.stats import scorefunctions, adjustment, significance, Validate
from hedwig.core.load import load_graph
from hedwig.core.settings import VERSION, DESCRIPTION, logger


def _parameters_report(args, start, time_taken):
    '''
    Builds the header of the textual report: the program description,
    version, start time, running time and one line per run parameter,
    enclosed between two dashed separator lines.

    @param args: mapping of parameter names to their values.
    @param start: start time of the run (formatted with %s).
    @param time_taken: running time in seconds.
    @return: the report header as a single string.
    '''
    separator = '-' * 40 + '\n'

    lines = [
        DESCRIPTION,
        'Version: %s' % VERSION,
        'Start: %s' % start,
        'Time taken: %.2f seconds' % time_taken,
        'Parameters:',
    ]
    # One tab-indented "name=value" line per parameter.
    lines.extend('\t%s=%s' % (name, str(value))
                 for name, value in args.items())

    body = ''.join(line + '\n' for line in lines)
    return separator + body + separator


def generate_rules_report(kwargs, rules_per_target,
                          human=lambda label, rule: label):
    '''
    Renders the rule sets of all targets as one text report.

    @param kwargs: run parameters; only kwargs['uris'] is read, and only
        when at least one target has rules.
    @param rules_per_target: iterable of (target, rules) pairs.
    @param human: callable forwarded to Rule.ruleset_report.
    @return: the concatenated rule set reports, each followed by a
        newline, or 'No significant rules found' if nothing was reported.
    '''
    chunks = []
    for _target, ruleset in rules_per_target:
        # Targets without rules contribute nothing to the report.
        if not ruleset:
            continue
        chunks.append(Rule.ruleset_report(ruleset,
                                          show_uris=kwargs['uris'],
                                          human=human))
        chunks.append('\n')

    report = ''.join(chunks)
    if report:
        return report
    return 'No significant rules found'


def run(kwargs, cli=False):
    '''
    Runs a complete experiment: loads the graph, builds the knowledge
    base, induces (and validates) the rules and outputs the results.

    @param kwargs: dict of run parameters (the keys read here are
        'verbose' (cli only), 'score', 'leaves', 'adjust', 'covered',
        'output' and 'uris'; the rest is consumed by build_graph and
        run_learner).
    @param cli: True when invoked from the command line; enables the
        verbosity-dependent log level and printing to stdout when no
        output file is given.
    @return: list of (target, rules) pairs as returned by run_learner.
    '''
    if not cli:
        log_level = logging.NOTSET
    elif kwargs['verbose']:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logger.setLevel(log_level)

    logger.info('Starting Hedwig')
    started_at = time.time()
    start_date = datetime.now().isoformat()

    graph = build_graph(kwargs)

    logger.info('Building the knowledge base')
    kb = ExperimentKB(graph,
                      getattr(scorefunctions, kwargs['score']),
                      instances_as_leaves=kwargs['leaves'])

    adjust_func = getattr(adjustment, kwargs['adjust'])
    validator = Validate(kb,
                         significance_test=significance.apply_fisher,
                         adjustment=adjust_func)

    rules_per_target = run_learner(kwargs, kb, validator)
    rules_report = generate_rules_report(kwargs, rules_per_target)

    # The measured time includes rendering the rules report.
    time_taken = time.time() - started_at
    logger.info('Finished in %d seconds' % time_taken)

    logger.info('Outputing results')

    covered_path = kwargs['covered']
    if covered_path:
        with open(covered_path, 'w') as covered_file:
            covered = Rule.ruleset_examples_json(rules_per_target)
            covered_file.write(json.dumps(covered, indent=2))

    parameters_report = _parameters_report(kwargs, start_date, time_taken)
    out_path = kwargs['output']
    if out_path:
        with open(out_path, 'w') as out_file:
            if out_path.endswith('json'):
                out_file.write(Rule.to_json(rules_per_target,
                                            show_uris=kwargs['uris']))
            else:
                out_file.write(parameters_report)
                out_file.write(rules_report)
    elif cli:
        # Without an output file the reports go to stdout (cli only).
        print(parameters_report)
        print(rules_report)

    return rules_per_target


def build_graph(kwargs):
    '''
    Loads the graph from the data file and all background knowledge files.

    Every file found under kwargs['bk_dir'] (searched recursively) is
    passed to load_graph together with the data file.

    @param kwargs: run parameters; reads 'data', 'bk_dir', 'format' and
        'nocache'.
    @return: whatever load_graph returns for the collected files.
    '''
    data = kwargs['data']

    # Walk the dir to find BK files
    ontology_list = [
        os.path.join(root, name)
        for root, _sub_folders, names in os.walk(kwargs['bk_dir'])
        for name in names
    ]

    return load_graph(
        ontology_list,
        data,
        def_format=kwargs['format'],
        cache=not kwargs['nocache']
    )


def run_learner(kwargs, kb, validator):
    '''
    Induces rules for every target of the knowledge base.

    For a discrete target one learner is run per class value (or only for
    kwargs['target'] if it is given) and the induced rules are passed
    through validator.test; otherwise a single learner is run with
    target=None and the rules are returned unvalidated.

    @param kwargs: run parameters; reads 'target', 'learner', 'beam',
        'support', 'depth', 'negations', 'optimalsubclass', 'adjust',
        'alpha' and 'FDR'.
    @param kb: the knowledge base handed to the learner.
    @param validator: object whose test(rules, alpha=..., q=...) method
        filters the induced rules.
    @return: list of (target, rules) pairs, one per target.
    '''
    if kb.is_discrete_target():
        targets = kb.class_values if not kwargs['target'] else [kwargs['target']]
    else:
        targets = [None]

    # Built once; the lookup itself stays inside the loop so that an
    # unknown learner name is only reported when a target is processed.
    learner_classes = {
        'heuristic': HeuristicLearner,
        'optimal': OptimalLearner
    }

    rules_per_target = []

    for target in targets:
        # None marks the ranked case; compare by identity so that a falsy
        # class label (e.g. an empty string) is still logged as a target.
        if target is not None:
            logger.info('Starting learner for target \'%s\'' % target)
        else:
            logger.info('Ranks detected - starting learner.')

        learner_cls = learner_classes[kwargs['learner']]
        learner = learner_cls(kb,
                              n=kwargs['beam'],
                              min_sup=int(kwargs['support']*kb.n_examples()),
                              target=target,
                              depth=kwargs['depth'],
                              sim=0.9,
                              use_negations=kwargs['negations'],
                              optimal_subclass=kwargs['optimalsubclass'])
        rules = learner.induce()

        if kb.is_discrete_target():
            if kwargs['adjust'] == 'fdr':
                logger.info('Validating rules, FDR = %.3f' % kwargs['FDR'])
            elif kwargs['adjust'] == 'fwer':
                logger.info('Validating rules, alpha = %.3f' % kwargs['alpha'])
            # Validation runs for every adjustment method; only the log
            # message above depends on which one was chosen.
            rules = validator.test(rules, alpha=kwargs['alpha'], q=kwargs['FDR'])

        rules_per_target.append((target, rules))

    return rules_per_target


87 changes: 87 additions & 0 deletions workflows/ilp/hedwig/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import argparse

import hedwig
from hedwig.stats import scorefunctions, adjustment
from hedwig.core.settings import VERSION, DESCRIPTION, INPUT_FORMATS, Defaults

# Expose the library version as this module's version.
__version__ = VERSION

# Command-line interface. The parsed options are handed to hedwig.run() as a
# plain dict, so every argument's destination name below is a key of that dict.
parser = argparse.ArgumentParser(description=DESCRIPTION)
# Offer every name in the module that does not start with an underscore as a
# valid choice.
# NOTE(review): dir() also lists names imported into those modules, not only
# the functions defined there -- confirm this is intended. Under Python 3
# filter() returns a one-shot iterator; wrap in list() if this is ever ported.
functions = filter(lambda s: not s.startswith('_'), dir(scorefunctions))
adjustments = filter(lambda s: not s.startswith('_'), dir(adjustment))

# Positional arguments: background knowledge directory and the data set.
parser.add_argument('bk_dir', metavar='BKDIR',
help='Background knowledge directory. The program attempts\
to load all RDF-type files from this directory.')

parser.add_argument('data', metavar='DATASET',
help='File containing the learning examples. \
Can be in RDF or JSON.')

# Input/output options.
parser.add_argument('-f', '--format', choices=INPUT_FORMATS,
help="Input file format.", default=Defaults.FORMAT)

parser.add_argument('-o', '--output', help='Output file. If none is specified, \
the results are written to stdout. \
Use .json suffix to write the results \
in json.')

parser.add_argument('-c', '--covered', help='File to write IDs of covered \
examples.')

# NOTE(review): 'mode' is not read by run(), build_graph() or run_learner()
# in hedwig/__init__.py -- confirm whether this option is still used.
parser.add_argument('-m', '--mode', choices=['features', 'subgroups'],
default=Defaults.MODE,
help='Running mode.')

# Learning and validation options.
parser.add_argument('-t', '--target',
help='Target class label. If it is not specified, rules \
produced for each class label.')

parser.add_argument('-s', '--score', choices=functions, default=Defaults.SCORE,
help='Score function.')

parser.add_argument('-n', '--negations', action='store_true',
help='Use negations.')

parser.add_argument('-A', '--alpha', default=Defaults.ALPHA, type=float,
help='P-value threshold; applies if "--adjust fwer" \
is used.')

parser.add_argument('-a', '--adjust', default=Defaults.ADJUST, choices=adjustments,
help='Adjustment method for the multiple-testing problem.')

parser.add_argument('-q', '--FDR', default=Defaults.FDR_Q, type=float,
help='Max false discovery rate; applies only if \
"--adjust fdr" is used.')

parser.add_argument('-l', '--leaves', action='store_true',
help='Use instance names in rule conjunctions.')

parser.add_argument('-L', '--learner', choices=['heuristic', 'optimal'],
default=Defaults.LEARNER,
help='Type of learner to use.')

parser.add_argument('-O', '--optimalsubclass', action='store_true',
help='In each step the full hierarchy under a particular \
concept is searched')

parser.add_argument('-u', '--uris', action='store_true',
help='Show URIs in rule conjunctions.')

parser.add_argument('-b', '--beam', default=Defaults.BEAM_SIZE, type=int,
help='Beam size.')

parser.add_argument('-S', '--support', default=Defaults.SUPPORT, type=float,
help='Minimum support.')

parser.add_argument('-d', '--depth', default=Defaults.DEPTH, type=int,
help='Maximum number of conjunctions.')

parser.add_argument('-C', '--nocache', action='store_true',
help='Don\'t cache background knowledge graph files.')

parser.add_argument("-v", "--verbose", help="Increase output verbosity.",
action="store_true")

# Parse the command line and run with cli=True, which lets run() honour
# --verbose and print the reports to stdout when no output file is given.
args = parser.parse_args()
hedwig.run(args.__dict__, cli=True)
30 changes: 30 additions & 0 deletions workflows/ilp/hedwig/assets/builtin.n3
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Built-in vocabulary of the http://kt.ijs.si/hedwig# namespace: the Example
# class, its 'score' and 'annotated_with' properties, and the predicates
# marked as generalization predicates.
@prefix : <http://kt.ijs.si/hedwig#>.
@prefix OWL: <http://www.w3.org/2002/07/owl#>.
@prefix RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix RDFS: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix XSD: <http://www.w3.org/2001/XMLSchema#>.

# A learning example.
:Example
a OWL:Class ;
RDFS:label "Example".

# Float-valued score of an example.
:score
a OWL:DatatypeProperty ;
RDFS:label "score" ;
RDFS:domain :Example ;
RDFS:range XSD:float .

# Links an example to a resource it is annotated with.
# NOTE(review): the range is written RDF:Resource; the standard term is
# rdfs:Resource (RDF Schema namespace) -- confirm whether this is intended.
:annotated_with
a OWL:ObjectProperty ;
RDFS:label "annotated with" ;
RDFS:domain :Example ;
RDFS:range RDF:Resource .

# Class of predicates declared as generalization predicates; presumably the
# ones followed when building the concept hierarchy -- verify against the
# graph loader.
:GeneralizationPredicate
a OWL:Class.

RDFS:subClassOf
a :GeneralizationPredicate.

:partOf
a :GeneralizationPredicate.
10 changes: 10 additions & 0 deletions workflows/ilp/hedwig/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from hedwig.core.example import Example
from hedwig.core.predicate import UnaryPredicate, BinaryPredicate
from hedwig.core.rule import Rule
from hedwig.core.kb import ExperimentKB
from hedwig.core import settings
from hedwig.core import load


__all__ = ['Example', 'UnaryPredicate', 'BinaryPredicate', 'Rule',
'ExperimentKB', 'settings', 'load']
34 changes: 34 additions & 0 deletions workflows/ilp/hedwig/core/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
'''
Example-related classes.

@author: [email protected]
'''


try:
    # Python 2: both byte strings and unicode strings are class labels.
    _STRING_TYPES = (str, unicode)
except NameError:
    # Python 3: there is no separate unicode type.
    _STRING_TYPES = (str,)


class Example:
    '''
    Represents an example with its score, label, id and annotations.

    The target type is derived from the score: a string score makes the
    example class-labeled, any other score makes it ranked.
    '''
    ClassLabeled = 'class'
    Ranked = 'ranked'

    def __init__(self, id, label, score, annotations=None, weights=None):
        '''
        @param id: identifier of the example (formatted with %d in __str__).
        @param label: label of the example.
        @param score: numeric score (ranked example) or a string holding
            the class value (class-labeled example).
        @param annotations: annotations of the example; a new empty list
            if omitted.
        @param weights: weights of the example; a new empty dict if omitted.
        '''
        self.id = id
        self.label = label
        self.score = score
        if isinstance(score, _STRING_TYPES):
            self.target_type = Example.ClassLabeled
        else:
            self.target_type = Example.Ranked
        # Fresh containers per instance: mutable default arguments would be
        # shared by every example created without explicit values.
        self.annotations = [] if annotations is None else annotations
        self.weights = {} if weights is None else weights

    def __str__(self):
        if self.target_type == Example.Ranked:
            return '<id=%d, score=%.5f, label=%s>' % (self.id,
                                                      self.score,
                                                      self.label)
        else:
            return '<id=%d, class=%s, label=%s>' % (self.id,
                                                    self.score,
                                                    self.label)
35 changes: 35 additions & 0 deletions workflows/ilp/hedwig/core/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
'''
Helper functions.

@author: [email protected]
'''
from math import sqrt

from hedwig.core.settings import W3C, HEDWIG

def avg(x):
    '''
    Arithmetic mean of the values in x; 0 if x is empty.
    '''
    count = len(x)
    # Divide by a float so that integer inputs do not truncate.
    return sum(x)/float(count) if count else 0


def std(x):
    '''
    Population standard deviation of the values in x; 0 if x is empty.

    Computed as sqrt(E[x^2] - E[x]^2) in a single pass over the sums.
    '''
    n = float(len(x))
    if not n:
        return 0
    variance = (sum(i*i for i in x) - sum(x)**2/n)/n
    # For (nearly) constant inputs floating-point cancellation can leave a
    # tiny negative variance, on which sqrt would raise ValueError; clamp
    # it to zero.
    return sqrt(max(variance, 0.0))


def user_defined(uri):
    '''
    Is this resource user defined?

    A resource is user defined when its URI starts with neither the W3C
    nor the HEDWIG prefix and it is not anonymous.
    '''
    if uri.startswith(W3C) or uri.startswith(HEDWIG):
        return False
    return not anonymous_uri(uri)


def anonymous_uri(uri):
    '''
    Is this an anonymous resource, i.e. one whose URI does not start
    with 'http'?
    '''
    has_http_prefix = uri.startswith('http')
    return not has_http_prefix
Loading