Skip to content

Commit

Permalink
Added method processText to interface EntityLookup
Browse files Browse the repository at this point in the history
Implemented the processText method in EntityLookup4 and EntityLookup5.
  • Loading branch information
willjrogers committed Mar 30, 2022
1 parent 77a68ba commit aa271ff
Show file tree
Hide file tree
Showing 4 changed files with 261 additions and 27 deletions.
51 changes: 46 additions & 5 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import gov.nih.nlm.nls.metamap.prefix.ERToken;

/**
*
* Entity Lookup Signature
*/

public interface EntityLookup {
Expand All @@ -20,26 +20,67 @@ public interface EntityLookup {
* Term is automatically assigned noun as part of speech.
*
* @param term term containing one or more words to be looked up in the dictionary.
* @param semTypeRestrictSet retained concepts must have this set of semantic types; if empty then all concepts are retained.
* @param sourceRestrictSet retained concepts must be from this set of sources; if empty then all concepts are retained.
* @param semTypeRestrictSet retained concepts must have this set of
* semantic types; if empty then all concepts are retained.
* @param sourceRestrictSet retained concepts must be from this set
* of sources; if empty then all concepts are retained.
* @return entityList list of entities found.
*/
List<Entity> lookupTerm(String term,
Set<String> semTypeRestrictSet,
Set<String> sourceRestrictSet);

/**
 * Process input text: find entities in the text and return them.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processText(String docid,
			 String fieldid,
			 String text,
			 boolean useNegationDetection,
			 Set<String> semTypeRestrictSet,
			 Set<String> sourceRestrictSet);

/**
 * Process input text using a default document id and field id.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processText(String text,
			 boolean useNegationDetection,
			 Set<String> semTypeRestrictSet,
			 Set<String> sourceRestrictSet);

/**
 * Process one BioC passage: find entities in the passage content and
 * return them.
 *
 * @param docid document identifier for passage
 * @param passage BioCPassage instance containing the content to be processed.
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processPassage(String docid, BioCPassage passage,
			    boolean useNegationDetection,
			    Set<String> semTypeRestrictSet,
			    Set<String> sourceRestrictSet);

/**
 * Generate a set of BioC annotations from a token list.
 * NOTE(review): semantics inferred from the signature only — presumably
 * annotations for entities recognized in tokenList; confirm against an
 * implementing class.
 *
 * @param docid document identifier for the text the tokens came from
 * @param tokenList list of tokens to generate annotations from
 * @return set of BioC annotations
 */
Set<BioCAnnotation> generateBioCEntitySet(String docid, List<ERToken> tokenList);
}
21 changes: 21 additions & 0 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup3.java
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,27 @@ public List<Entity> lookupTerm(String term,
return entityList;
}


/**
 * Process input text (NOT implemented in EntityLookup3).
 *
 * The previous stub silently returned {@code null}, which violates the
 * {@code EntityLookup} interface contract ("@return list of entities
 * found") and defers the failure to a NullPointerException at the
 * caller's first use of the result.  Fail fast with an explicit
 * exception instead so the misuse is reported at the call site.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return never returns normally
 * @throws UnsupportedOperationException always; this implementation is a stub.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  throw new UnsupportedOperationException
    ("processText is not implemented in EntityLookup3; use EntityLookup4 or EntityLookup5.");
}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}


/** Process passage
*
* @param docid document id
Expand Down
124 changes: 107 additions & 17 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup4.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ public class EntityLookup4 implements EntityLookup {
Integer.parseInt(System.getProperty("metamaplite.entitylookup4.maxtokensize","15"));
SpecialTerms excludedTerms = new SpecialTerms();
SentenceAnnotator sentenceAnnotator;
SentenceExtractor sentenceExtractor;
NegationDetector negationDetector;
boolean addPartOfSpeechTagsFlag =
Boolean.parseBoolean(System.getProperty("metamaplite.enable.postagging","true"));
Properties properties;

/** Part of speech tags used for term lookup, can be set using
* property: metamaplite.postaglist; the tag list is a set of Penn
Expand Down Expand Up @@ -116,6 +118,7 @@ public void defaultAllowedPartOfSpeech() {
public EntityLookup4(Properties properties)
throws IOException, FileNotFoundException
{
this.properties = properties;
MMLDictionaryLookupRegistry registry = new MMLDictionaryLookupRegistry();
registry.put("ivf", new IVFLookup());
registry.put("mapdb", new MapDbLookup());
Expand Down Expand Up @@ -149,15 +152,16 @@ public EntityLookup4(Properties properties)
Boolean.toString(addPartOfSpeechTagsFlag)));

if (this.addPartOfSpeechTagsFlag) {

this.sentenceAnnotator = new OpenNLPPoSTagger(properties);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
if (this.sentenceAnnotator == null) {
this.sentenceAnnotator = new OpenNLPPoSTagger(properties);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
}
}
} else {
this.allowedPartOfSpeechSet.add(""); // empty if not part-of-speech tagged (accept everything)
Expand Down Expand Up @@ -209,14 +213,16 @@ public EntityLookup4(Properties properties)
public void setPoSTagger(Properties properties, InputStream instream) {
if (this.addPartOfSpeechTagsFlag) {
// skip this if pos tag is false
this.sentenceAnnotator = new OpenNLPPoSTagger(instream);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
if (this.sentenceAnnotator == null) {
this.sentenceAnnotator = new OpenNLPPoSTagger(instream);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
}
}
} else {
this.allowedPartOfSpeechSet.add(""); // empty if not part-of-speech tagged (accept everything)
Expand Down Expand Up @@ -567,6 +573,90 @@ public List<Entity> lookupTerm(String term,
return entityList;
}

/**
 * Process input text: split it into sentences, tokenize each sentence,
 * look up entities per sentence, mark abbreviations, optionally detect
 * negations, filter the entities by semantic type and source, remove
 * subsumed entities, and return the survivors sorted by start position.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, run the negation detector over each sentence
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found, sorted by entity start position.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  EntityStartComparator entityComparator = new EntityStartComparator();
  try {
    Set<Entity> entitySet0 = new HashSet<Entity>();
    // i is the sentence index, recorded on each entity as its location position
    int i = 0;
    // lazily create the sentence extractor on first use
    if (this.sentenceExtractor == null) {
      this.sentenceExtractor = new OpenNLPSentenceExtractor(this.properties);
    }
    List<Sentence> sentenceList = this.sentenceExtractor.createSentenceList(text);
    // pass 1: per-sentence tokenization, optional POS tagging, and entity lookup
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);
      if (this.addPartOfSpeechTagsFlag) {
	sentenceAnnotator.addPartOfSpeech(tokenList);
      }
      Set<Entity> sentenceEntitySet =
	this.processSentenceTokenList(docid, fieldid, tokenList,
				      semTypeRestrictSet,
				      sourceRestrictSet);
      // add entities generated from user-defined acronyms (udaMap)
      sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
			       (docid, this.udaMap, tokenList));
      for (Entity entity: sentenceEntitySet) {
	entity.setLocationPosition(i);
      }
      entitySet0.addAll(sentenceEntitySet);
      i++;
    }
    // pass 2: look for negation and other relations using Context.
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);

      // mark abbreviations that are entities and add them to entity set.
      // NOTE(review): markAbbreviations is called against the whole text and
      // the whole accumulated entity set on every sentence iteration —
      // presumably idempotent, but confirm; it looks like per-passage work
      // executed once per sentence.
      Set<Entity> abbrevEntitySet =
	new HashSet(MarkAbbreviations.markAbbreviations
		    (text, this.uaMap,
		     new ArrayList(entitySet0)));
      // dbg
      // for (Entity entity: abbrevEntitySet) {
      //   logger.debug("abbrevEntitySet.entity: " + entity);
      // }
      // end of dbg
      entitySet0.addAll(abbrevEntitySet);
      if (detectNegationsFlag) {
	detectNegations(entitySet0, sentence.getText(), tokenList);
      }
    }

    // keep only entities that still have evidence after filtering by
    // semantic type and source
    Set<Entity> entitySet1 = new HashSet<Entity>();
    for (Entity entity: entitySet0) {
      ConceptInfoUtils.filterEntityEvListBySemanticType(entity, semTypeRestrictSet);
      ConceptInfoUtils.filterEntityEvListBySource(entity, sourceRestrictSet);
      if (entity.getEvList().size() > 0) {
	entitySet1.add(entity);
      }
    }
    // drop entities wholly subsumed by another entity
    Set<Entity> entitySet = removeSubsumedEntities(entitySet1);

    List<Entity> resultList = new ArrayList<Entity>(entitySet);
    Collections.sort(resultList, entityComparator);
    return resultList;
  } catch (FileNotFoundException fnfe) {
    throw new RuntimeException(fnfe);
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}

/** Process passage */
public List<Entity> processPassage(String docid, BioCPassage passage,
boolean detectNegationsFlag,
Expand Down
92 changes: 87 additions & 5 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup5.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import gov.nih.nlm.nls.metamap.lite.metamap.MetaMapIvfIndexes;
import gov.nih.nlm.nls.metamap.lite.ChunkerMethod;
import gov.nih.nlm.nls.metamap.lite.OpenNLPChunker;
import gov.nih.nlm.nls.metamap.lite.SentenceExtractor;

import gov.nih.nlm.nls.metamap.prefix.CharUtils;

Expand Down Expand Up @@ -83,6 +84,7 @@ public class EntityLookup5 implements EntityLookup {
Integer.parseInt(System.getProperty("metamaplite.entitylookup4.maxtokensize","15"));
SpecialTerms excludedTerms = new SpecialTerms();
SentenceAnnotator sentenceAnnotator;
SentenceExtractor sentenceExtractor;
NegationDetector negationDetector;
boolean addPartOfSpeechTagsFlag =
Boolean.parseBoolean(System.getProperty("metamaplite.enable.postagging","true"));
Expand Down Expand Up @@ -877,6 +879,81 @@ public List<Entity> lookupTerm(String term,
return entityList;
}

/**
 * Process input text: split it into sentences, tokenize each sentence,
 * look up entities per sentence, mark abbreviations, optionally detect
 * negations, remove subsumed entities, filter by semantic type and
 * source, and return the survivors sorted by start position.
 *
 * NOTE(review): unlike EntityLookup4.processText, this version (1) never
 * initializes this.sentenceExtractor before using it — confirm it is
 * assigned elsewhere (constructor/setter), otherwise the first call throws
 * NullPointerException; and (2) performs no part-of-speech tagging on the
 * token list — confirm that is intentional for EntityLookup5.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, run the negation detector over each sentence
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found, sorted by entity start position.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  EntityStartComparator entityComparator = new EntityStartComparator();
  try {
    Set<Entity> entitySet0 = new HashSet<Entity>();
    // i is the sentence index, recorded on each entity as its location position
    int i = 0;
    List<Sentence> sentenceList = this.sentenceExtractor.createSentenceList(text);
    // pass 1: per-sentence tokenization and entity lookup
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);
      Set<Entity> sentenceEntitySet =
	this.processSentenceTokenList(docid, fieldid, tokenList,
				      semTypeRestrictSet,
				      sourceRestrictSet);
      // add entities generated from user-defined acronyms (udaMap)
      sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
			       (docid, this.udaMap, tokenList));
      for (Entity entity: sentenceEntitySet) {
	entity.setLocationPosition(i);
      }
      entitySet0.addAll(sentenceEntitySet);
      i++;
    }
    // pass 2: look for negation and other relations using Context.
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);

      // mark abbreviations that are entities and add them to sentence entity set.
      // NOTE(review): called against the whole text and accumulated entity
      // set once per sentence — presumably idempotent; confirm.
      Set<Entity> abbrevEntitySet =
	new HashSet(MarkAbbreviations.markAbbreviations
		    (text, this.uaMap,
		     new ArrayList(entitySet0)));
      entitySet0.addAll(abbrevEntitySet);
      if (detectNegationsFlag) {
	detectNegations(entitySet0, sentence.getText(), tokenList);
      }
    }

    // remove any entities subsumed by another entity
    Set<Entity> entitySet1 = removeSubsumedEntities(entitySet0);
    // filter entities by semantic type and source sets, keeping only
    // entities that still have evidence afterwards.
    Set<Entity> entitySet = new HashSet<Entity>();
    for (Entity entity: entitySet1) {
      ConceptInfoUtils.filterEntityEvListBySemanticType(entity, semTypeRestrictSet);
      ConceptInfoUtils.filterEntityEvListBySource(entity, sourceRestrictSet);
      if (entity.getEvList().size() > 0) {
	entitySet.add(entity);
      }
    }
    List<Entity> resultList = new ArrayList<Entity>(entitySet);
    Collections.sort(resultList, entityComparator);
    return resultList;
  } catch (FileNotFoundException fnfe) {
    throw new RuntimeException(fnfe);
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}


/** Process passage */
public List<Entity> processPassage(String docid, BioCPassage passage,
boolean detectNegationsFlag,
Expand All @@ -893,10 +970,12 @@ public List<Entity> processPassage(String docid, BioCPassage passage,
int i = 0;
for (BioCSentence sentence: passage.getSentences()) {
List<ERToken> tokenList = Scanner.analyzeText(sentence);
Set<Entity> sentenceEntitySet = this.processSentenceTokenList(docid, fieldid, tokenList,
semTypeRestrictSet,
sourceRestrictSet);
sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities(docid, this.udaMap, tokenList));
Set<Entity> sentenceEntitySet =
this.processSentenceTokenList(docid, fieldid, tokenList,
semTypeRestrictSet,
sourceRestrictSet);
sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
(docid, this.udaMap, tokenList));
for (Entity entity: sentenceEntitySet) {
entity.setLocationPosition(i);
}
Expand All @@ -909,7 +988,10 @@ public List<Entity> processPassage(String docid, BioCPassage passage,
List<ERToken> tokenList = Scanner.analyzeText(sentence);

// mark abbreviations that are entities and add them to sentence entity set.
Set<Entity> abbrevEntitySet = new HashSet(MarkAbbreviations.markAbbreviations(passage, new ArrayList(entitySet0)));
Set<Entity> abbrevEntitySet =
new HashSet(MarkAbbreviations.markAbbreviations
(passage, this.uaMap,
new ArrayList(entitySet0)));
entitySet0.addAll(abbrevEntitySet);
if (detectNegationsFlag) {
detectNegations(entitySet0, sentence.getText(), tokenList);
Expand Down

0 comments on commit aa271ff

Please sign in to comment.