# load_parse.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


# Function to extract knowledge from medical text

import json
# import os
# import py2neo
# import csv
import shlex
import subprocess
import sys
import urllib.request, urllib.error, urllib.parse
# import requests
# import unicodecsv as csv2
# import pandas as pd
from nltk.tokenize import sent_tokenize
from .config import settings


def mmap_extract(text):
    """
    Wrapper around the MetaMap binary: splits the text into sentences,
    sends them to MetaMap with word sense disambiguation enabled and
    returns the concepts that were found.

    !!!! THE METAMAP TAGGER AND THE WordSense DISAMBIGUATION SERVER
        MUST ALREADY BE RUNNING !!!!

    NOTE(review): `MetaMap` is not imported in the visible part of this
    file -- presumably the pymetamap MetaMap class; confirm the import.

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - concepts: list,
        list of metamap concepts extracted. Errors reported by MetaMap
        are only printed, they are not returned.
    """

    sentences = sent_tokenize(text)
    tagger = MetaMap.get_instance(settings['load']['path']['metamap'])
    # Each sentence is identified by its position in the text
    sentence_ids = list(range(len(sentences)))
    concepts, errors = tagger.extract_concepts(
        sentences, sentence_ids, word_sense_disambiguation=True)
    if errors:
        print('Errors with extracting concepts!')
        print(errors)
    return concepts


def runProcess(exe, working_dir):
    """
    Function that runs a command through the shell and captures
    everything it prints (stderr is merged into stdout).
    Input:
        - exe: str,
        string of the command to be run. ! REMEMBER TO ESCAPE CHARS!
        The string is handed to the shell as-is (shell=True).
        - working_dir: str,
        directory where the cmd should be executed
    Output:
        - lines: list,
        list of text (str) lines generated from the command, each one
        still carrying its trailing newline
    """

    # The context manager closes the pipe and waits for the child, so no
    # file descriptor or zombie process is left behind. Passing an
    # encoding makes the pipe yield str instead of bytes, which is what
    # the callers in this module split/compare against; undecodable
    # bytes are replaced instead of raising.
    with subprocess.Popen(exe, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT, cwd=working_dir,
                          shell=True, encoding='utf-8',
                          errors='replace') as proc:
        lines = proc.stdout.readlines()
    return lines


def stopw_removal(inp, stop):
    """
    Drop stopwords from a line of text.
    Input:
        - inp: str,
        string of the text input
        - stop: list,
        list of stop-words to be removed (compared against the
        lower-cased words of the input)
    Output:
        - str,
        the lower-cased input without the stopwords, words separated
        by single spaces
    """

    # The text is lower-cased and split on whitespace before filtering
    kept = [token for token in inp.lower().split() if token not in stop]
    return ' '.join(kept)


def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]. Sentences for which nothing was
        extracted are left out.
    """
    reverb_dir = settings['load']['path']['reverb']
    total = []
    for sent in sent_tokenize(text):
        # shlex.quote keeps quotes, `$`, backticks etc. inside the sentence
        # from being interpreted by the shell (or breaking the command).
        # tr puts every tab-separated ReVerb field on its own line and
        # cat -n prefixes each of those lines with "<number>\t".
        cmd = 'echo ' + shlex.quote(sent) + " | ./reverb -q | tr '\t' '\n' | cat -n"
        output = runProcess(cmd, reverb_dir)
        # Only the last three numbered lines are used as the relation
        rows = [row.decode('utf-8', 'replace') if isinstance(row, bytes) else row
                for row in output[-3:]]
        # Drop the line number added by cat -n and the trailing newline
        relation = [row.split('\t')[1].strip('\n') for row in rows]
        # Remove common stopwords from relations
        if stop:
            relation = [stopw_removal(part, stop) for part in relation]
        total.append(relation)
    # Remove empty relations
    return [relation for relation in total if relation]


def cui_to_uri(api_key, cui):
    """
    Function to map from cui to uri if possible. Uses the bioontology
    portal search endpoint and takes the '@id' of the first hit.
    Input:
        - api_key: str,
        api usage key change it in setting.yaml
        - cui: str,
        cui of the entity we wish to map the uri
    Output:
        - the uri found in string format or None when the response
        has no usable first result
    """

    REST_URL = "http://data.bioontology.org"
    query_url = REST_URL + "/search?include_properties=true&q=" + urllib.parse.quote(cui)
    # Errors raised by the request itself are not caught here
    annotations = get_json_with_api(api_key, query_url)
    try:
        return annotations['collection'][0]['@id']
    except (LookupError, TypeError) as err:
        # Missing key, empty result list or an unexpected response shape:
        # report what actually went wrong and fall back to "no uri".
        print(type(err).__name__)
        print(err)
        return None

def get_json_with_api(api_key, url):
    """
    Helper function to retrieve a json from a url through urllib
    Input:
        - api_key: str,
        api usage key change it in setting.yaml
        - url: str,
        url to fetch
    Output:
        - json-style dictionary with the decoded response body
    """

    opener = urllib.request.build_opener()
    # The key is sent in the Authorization header of the request
    opener.addheaders = [('Authorization', 'apikey token=' + api_key)]
    # Close the response as soon as the body has been read
    with opener.open(url) as response:
        body = response.read()
    return json.loads(body)


def threshold_concepts(concepts, hard_num=3, score=None):
    """
    Keep only the most probable metamap concepts, either the first-N
    (hard_num) ones or the ones above a concept score.
    The score is only looked at when hard_num is falsy (None or 0).
    Input:
        - concepts: list,
        list of Metamap Class concepts
        - hard_num: int,
        the first-N concepts to keep, if this thresholding is selected
        - score: float,
        lowest accepted concept score (exclusive), if this thresholding
        is selected
    Output:
        - list,
        the concepts kept. When nothing has to be dropped the very same
        list object is returned.
    """

    if hard_num:
        # Nothing to cut when fewer concepts than the limit are available
        return concepts if hard_num >= len(concepts) else concepts[:hard_num]
    if score:
        return [concept for concept in concepts if concept.score > score]
    return concepts
        

def get_name_concept(concept):
    """
    Pick a name for a metamap concept. The attributes are tried in order:
    preferred_name, then long_form and short_form joined with '|',
    then long_form alone, then short_form alone.
    Input:
        - concept: Metamap class concept, as generated from mmap_extract
        for example
    Output:
        - name: str,
        the name found for this concept, or 'NO NAME IN CONCEPT' when
        none of the attributes exists
    """

    if hasattr(concept, 'preferred_name'):
        return concept.preferred_name

    has_long = hasattr(concept, 'long_form')
    has_short = hasattr(concept, 'short_form')
    if has_long and has_short:
        return concept.long_form + '|' + concept.short_form
    if has_long:
        return concept.long_form
    if has_short:
        return concept.short_form
    return 'NO NAME IN CONCEPT'


def metamap_ents(x):
    """
    Turn the metamap concepts of a sentence into plain dictionaries.
    The concepts are extracted with mmap_extract, cut down with
    threshold_concepts (default settings) and then, for every concept
    kept, a name and - when the concept has a cui - a uri are looked up.
    Input:
        - x: str,
        sentence to extract entities
    Output:
        - ents: list,
        list of entities found. Each entity is a dictionary with
        fields ent_id (position among the kept concepts), name,
        cui (None if the concept has none) and uri (None if the
        concept has no cui or no uri was found)
    """

    # API KEY used for the cui -> uri mapping
    biont_key = settings['apis']['biont']
    kept = threshold_concepts(mmap_extract(x))
    entities = []
    for position, concept in enumerate(kept):
        name = get_name_concept(concept)
        if hasattr(concept, 'cui'):
            cui = concept.cui
            uri = cui_to_uri(biont_key, cui)
        else:
            cui = uri = None
        entities.append({'ent_id': position, 'name': name, 'cui': cui, 'uri': uri})
    return entities


def extract_entities(text, json_=None):
    """
    Extract entities from a given text using metamap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both concepts and the entities related to them
    Input:
        - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched
        (it is modified in place). Defaults to a new empty dictionary
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containing the final results. entities maps every sent_id to
        the list of entities found in that sentence
    """
    # A fresh dict per call: a `{}` default would be shared (and keep
    # growing) across calls
    if json_ is None:
        json_ = {}
    json_['text'] = text
    # Tokenize the text
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    # mmap_extract returns just the list of concepts
    json_['concepts'] = mmap_extract(text)
    # metamap_ents works on the sentence text, not on the sentence record
    json_['entities'] = {
        sent['sent_id']: metamap_ents(sent['sent_text'])
        for sent in json_['sents']
    }
    return json_


def enrich_with_triples(results, subject, pred='MENTIONED_IN'):
    """
    Add rdf-like triples to a json dictionary, one per entity, in the
    form of: entity-URI -- MENTIONED_IN -- 'Text Title'. Entities whose
    uri is empty or None are skipped.
    Input:
        - results: dic,
        json-style dictionary generated from the extract_entities function
        - subject: str,
        the name of the text document in which the entities are mentioned
        - pred: str,
        the predicate to be used as a link between the uri and the title
    Output:
        - results: dic,
        the same dictionary (modified in place) with one more field,
        triples, holding a list of {'subj', 'pred', 'obj'} dictionaries
    """
    # The sentence ids are not needed, only the entities of each sentence
    results['triples'] = [
        {'subj': entity['uri'], 'pred': pred, 'obj': subject}
        for sentence_entities in results['entities'].values()
        for entity in sentence_entities
        if entity['uri']
    ]
    return results
        

def _semrep_fields(elements, mapping):
    """
    Pick the mapped columns out of one split SemRep line. Columns whose
    key contains 'sem_types' are split on ',' into a list.
    """
    fields = {}
    for key, ind in mapping.items():
        value = elements[ind]
        fields[key] = value.split(',') if 'sem_types' in key else value
    return fields


def semrep_wrapper(text):
    """
    Function wrapper for SemRep binary. It is called with flags
    -F only and changing this will cause this parsing to fail, cause
    the resulting lines won't have the same structure.
    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - results: dic,
        json-style dictionary with fields text and sents. Each
        sentence has entities and relations found in it. Each entity and
        each relation has attributes denoted in the corresponding
        mappings dictionary. sem_types attributes are lists, every
        other attribute is the raw string of its column.
    """
    # Exec the binary. The text is quoted so that the shell treats it as
    # plain data instead of interpreting quotes, `$`, `;`, `|` etc.
    cmd = "echo " + shlex.quote(text) + " | ./semrep.v1.7 -L 2015 -Z 2015AA -F"
    semrep_dir = settings['load']['path']['semrep']
    lines = runProcess(cmd, semrep_dir)
    # mapping of line elements to fields ('|'-separated column indexes)
    mappings = {
        "text": {
            "sent_id": 4,
            "sent_text": 6
        },
        "entity": {
            'cuid': 6,
            'label': 7,
            'sem_types': 8,
            'score': 15
        },
        "relation": {
            'subject__cui': 8,
            'subject__label': 9,
            'subject__sem_types': 10,
            'subject__sem_type': 11,
            'subject__score': 18,
            'predicate__type': 21,
            'predicate': 22,
            'negation': 23,
            'object__cui': 28,
            'object__label': 29,
            'object__sem_types': 30,
            'object__sem_type': 31,
            'object__score': 38,
        }
    }
    results = {'sents': [], 'text': text}
    for line in lines:
        # The process output may come as bytes or as text
        if isinstance(line, bytes):
            line = line.decode('utf-8', 'replace')
        # Only lines starting with 'SE' are parsed
        if not line.startswith('SE'):
            continue
        # Strip the line ending so it does not end up in the last column
        elements = line.rstrip('\r\n').split('|')
        # Too short to carry a record type in column 5: not a record line
        if len(elements) < 6:
            continue
        record_type = elements[5]
        if record_type == 'text':
            # New sentence that was processed
            sentence = {"entities": [], "relations": []}
            sentence.update(_semrep_fields(elements, mappings['text']))
            results['sents'].append(sentence)
        elif record_type == 'entity':
            # Entity info, attached to the most recent sentence
            results['sents'][-1]['entities'].append(
                _semrep_fields(elements, mappings['entity']))
        elif record_type == 'relation':
            # Relation info, attached to the most recent sentence
            results['sents'][-1]['relations'].append(
                _semrep_fields(elements, mappings['relation']))
    return results


if __name__ == '__main__':
    # Example run: read the text to analyse from standard input. Guarded so
    # that importing this module neither fails on the undefined `text` nor
    # launches the whole extraction pipeline as a side effect.
    # NOTE(review): because of the relative `.config` import this presumably
    # has to be started with `python -m <package>.<module>` -- confirm.
    text = sys.stdin.read()
    results = extract_entities(text)
    results = enrich_with_triples(results, subject='Text Title')