Skip to content

Commit

Permalink
Added method processText to interface EntityLookup
Browse files Browse the repository at this point in the history
Implemented the processText method in EntityLookup4 and EntityLookup5.
  • Loading branch information
willjrogers committed Mar 30, 2022
1 parent 77a68ba commit aa271ff
Show file tree
Hide file tree
Showing 4 changed files with 261 additions and 27 deletions.
51 changes: 46 additions & 5 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import gov.nih.nlm.nls.metamap.prefix.ERToken;

/**
*
* Entity Lookup Signature
*/

public interface EntityLookup {
Expand All @@ -20,26 +20,67 @@ public interface EntityLookup {
* Term is automatically assigned noun as part of speech.
*
* @param term term containing one or more words to be looked up in the dictionary.
* @param semTypeRestrictSet retained concepts must have this set of semantic types; if empty then all concepts are retained.
* @param sourceRestrictSet retained concepts must be from this set of sources; if empty then all concepts are retained.
* @param semTypeRestrictSet retained concepts must have this set of
* semantic types; if empty then all concepts are retained.
* @param sourceRestrictSet retained concepts must be from this set
* of sources; if empty then all concepts are retained.
* @return entityList list of entities found.
*/
List<Entity> lookupTerm(String term,
Set<String> semTypeRestrictSet,
Set<String> sourceRestrictSet);

/**
 * Process input text: find entities in the text and return them.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processText(String docid,
			 String fieldid,
			 String text,
			 boolean useNegationDetection,
			 Set<String> semTypeRestrictSet,
			 Set<String> sourceRestrictSet);

/**
 * Process input text using a default document id and field id.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processText(String text,
			 boolean useNegationDetection,
			 Set<String> semTypeRestrictSet,
			 Set<String> sourceRestrictSet);

/**
 * Process one BioC passage: find entities in the passage content and
 * return them.
 *
 * @param docid document identifier for passage
 * @param passage BioCPassage instance containing the content to be processed.
 * @param useNegationDetection if true, apply ConText or another negation detector
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
List<Entity> processPassage(String docid, BioCPassage passage,
			    boolean useNegationDetection,
			    Set<String> semTypeRestrictSet,
			    Set<String> sourceRestrictSet);

/**
 * Generate a set of BioC annotations from a token list.
 * NOTE(review): semantics inferred from the signature only — presumably
 * annotations for entities recognized in tokenList; confirm against an
 * implementing class.
 *
 * @param docid document identifier for the text the tokens came from
 * @param tokenList list of tokens to generate annotations from
 * @return set of BioC annotations
 */
Set<BioCAnnotation> generateBioCEntitySet(String docid, List<ERToken> tokenList);
}
21 changes: 21 additions & 0 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup3.java
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,27 @@ public List<Entity> lookupTerm(String term,
return entityList;
}


/**
 * Process input text (NOT implemented in EntityLookup3).
 *
 * The previous stub silently returned {@code null}, which violates the
 * {@code EntityLookup} interface contract ("@return list of entities
 * found") and defers the failure to a NullPointerException at the
 * caller's first use of the result.  Fail fast with an explicit
 * exception instead so the misuse is reported at the call site.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return never returns normally
 * @throws UnsupportedOperationException always; this implementation is a stub.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  throw new UnsupportedOperationException
    ("processText is not implemented in EntityLookup3; use EntityLookup4 or EntityLookup5.");
}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}


/** Process passage
*
* @param docid document id
Expand Down
124 changes: 107 additions & 17 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup4.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ public class EntityLookup4 implements EntityLookup {
Integer.parseInt(System.getProperty("metamaplite.entitylookup4.maxtokensize","15"));
SpecialTerms excludedTerms = new SpecialTerms();
SentenceAnnotator sentenceAnnotator;
SentenceExtractor sentenceExtractor;
NegationDetector negationDetector;
boolean addPartOfSpeechTagsFlag =
Boolean.parseBoolean(System.getProperty("metamaplite.enable.postagging","true"));
Properties properties;

/** Part of speech tags used for term lookup, can be set using
* property: metamaplite.postaglist; the tag list is a set of Penn
Expand Down Expand Up @@ -116,6 +118,7 @@ public void defaultAllowedPartOfSpeech() {
public EntityLookup4(Properties properties)
throws IOException, FileNotFoundException
{
this.properties = properties;
MMLDictionaryLookupRegistry registry = new MMLDictionaryLookupRegistry();
registry.put("ivf", new IVFLookup());
registry.put("mapdb", new MapDbLookup());
Expand Down Expand Up @@ -149,15 +152,16 @@ public EntityLookup4(Properties properties)
Boolean.toString(addPartOfSpeechTagsFlag)));

if (this.addPartOfSpeechTagsFlag) {

this.sentenceAnnotator = new OpenNLPPoSTagger(properties);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
if (this.sentenceAnnotator == null) {
this.sentenceAnnotator = new OpenNLPPoSTagger(properties);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
}
}
} else {
this.allowedPartOfSpeechSet.add(""); // empty if not part-of-speech tagged (accept everything)
Expand Down Expand Up @@ -209,14 +213,16 @@ public EntityLookup4(Properties properties)
public void setPoSTagger(Properties properties, InputStream instream) {
if (this.addPartOfSpeechTagsFlag) {
// skip this if pos tag is false
this.sentenceAnnotator = new OpenNLPPoSTagger(instream);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
if (this.sentenceAnnotator == null) {
this.sentenceAnnotator = new OpenNLPPoSTagger(instream);
String allowedPartOfSpeechTaglist = properties.getProperty("metamaplite.postaglist");
if (allowedPartOfSpeechTaglist != null) {
for (String pos: allowedPartOfSpeechTaglist.split(",")) {
this.allowedPartOfSpeechSet.add(pos);
}
} else {
this.defaultAllowedPartOfSpeech();
}
}
} else {
this.allowedPartOfSpeechSet.add(""); // empty if not part-of-speech tagged (accept everything)
Expand Down Expand Up @@ -567,6 +573,90 @@ public List<Entity> lookupTerm(String term,
return entityList;
}

/**
 * Process input text: split it into sentences, tokenize each sentence,
 * look up entities per sentence, mark abbreviations, optionally detect
 * negations, filter the entities by semantic type and source, remove
 * subsumed entities, and return the survivors sorted by start position.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, run the negation detector over each sentence
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found, sorted by entity start position.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  EntityStartComparator entityComparator = new EntityStartComparator();
  try {
    Set<Entity> entitySet0 = new HashSet<Entity>();
    // i is the sentence index, recorded on each entity as its location position
    int i = 0;
    // lazily create the sentence extractor on first use
    if (this.sentenceExtractor == null) {
      this.sentenceExtractor = new OpenNLPSentenceExtractor(this.properties);
    }
    List<Sentence> sentenceList = this.sentenceExtractor.createSentenceList(text);
    // pass 1: per-sentence tokenization, optional POS tagging, and entity lookup
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);
      if (this.addPartOfSpeechTagsFlag) {
	sentenceAnnotator.addPartOfSpeech(tokenList);
      }
      Set<Entity> sentenceEntitySet =
	this.processSentenceTokenList(docid, fieldid, tokenList,
				      semTypeRestrictSet,
				      sourceRestrictSet);
      // add entities generated from user-defined acronyms (udaMap)
      sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
			       (docid, this.udaMap, tokenList));
      for (Entity entity: sentenceEntitySet) {
	entity.setLocationPosition(i);
      }
      entitySet0.addAll(sentenceEntitySet);
      i++;
    }
    // pass 2: look for negation and other relations using Context.
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);

      // mark abbreviations that are entities and add them to entity set.
      // NOTE(review): markAbbreviations is called against the whole text and
      // the whole accumulated entity set on every sentence iteration —
      // presumably idempotent, but confirm; it looks like per-passage work
      // executed once per sentence.
      Set<Entity> abbrevEntitySet =
	new HashSet(MarkAbbreviations.markAbbreviations
		    (text, this.uaMap,
		     new ArrayList(entitySet0)));
      // dbg
      // for (Entity entity: abbrevEntitySet) {
      //   logger.debug("abbrevEntitySet.entity: " + entity);
      // }
      // end of dbg
      entitySet0.addAll(abbrevEntitySet);
      if (detectNegationsFlag) {
	detectNegations(entitySet0, sentence.getText(), tokenList);
      }
    }

    // keep only entities that still have evidence after filtering by
    // semantic type and source
    Set<Entity> entitySet1 = new HashSet<Entity>();
    for (Entity entity: entitySet0) {
      ConceptInfoUtils.filterEntityEvListBySemanticType(entity, semTypeRestrictSet);
      ConceptInfoUtils.filterEntityEvListBySource(entity, sourceRestrictSet);
      if (entity.getEvList().size() > 0) {
	entitySet1.add(entity);
      }
    }
    // drop entities wholly subsumed by another entity
    Set<Entity> entitySet = removeSubsumedEntities(entitySet1);

    List<Entity> resultList = new ArrayList<Entity>(entitySet);
    Collections.sort(resultList, entityComparator);
    return resultList;
  } catch (FileNotFoundException fnfe) {
    throw new RuntimeException(fnfe);
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}

/** Process passage */
public List<Entity> processPassage(String docid, BioCPassage passage,
boolean detectNegationsFlag,
Expand Down
92 changes: 87 additions & 5 deletions src/main/java/gov/nih/nlm/nls/metamap/lite/EntityLookup5.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import gov.nih.nlm.nls.metamap.lite.metamap.MetaMapIvfIndexes;
import gov.nih.nlm.nls.metamap.lite.ChunkerMethod;
import gov.nih.nlm.nls.metamap.lite.OpenNLPChunker;
import gov.nih.nlm.nls.metamap.lite.SentenceExtractor;

import gov.nih.nlm.nls.metamap.prefix.CharUtils;

Expand Down Expand Up @@ -83,6 +84,7 @@ public class EntityLookup5 implements EntityLookup {
Integer.parseInt(System.getProperty("metamaplite.entitylookup4.maxtokensize","15"));
SpecialTerms excludedTerms = new SpecialTerms();
SentenceAnnotator sentenceAnnotator;
SentenceExtractor sentenceExtractor;
NegationDetector negationDetector;
boolean addPartOfSpeechTagsFlag =
Boolean.parseBoolean(System.getProperty("metamaplite.enable.postagging","true"));
Expand Down Expand Up @@ -877,6 +879,81 @@ public List<Entity> lookupTerm(String term,
return entityList;
}

/**
 * Process input text: split it into sentences, tokenize each sentence,
 * look up entities per sentence, mark abbreviations, optionally detect
 * negations, remove subsumed entities, filter by semantic type and
 * source, and return the survivors sorted by start position.
 *
 * NOTE(review): unlike EntityLookup4.processText, this version (1) never
 * initializes this.sentenceExtractor before using it — confirm it is
 * assigned elsewhere (constructor/setter), otherwise the first call throws
 * NullPointerException; and (2) performs no part-of-speech tagging on the
 * token list — confirm that is intentional for EntityLookup5.
 *
 * @param docid document identifier for input text
 * @param fieldid field identifier for input text
 * @param text string containing input text
 * @param detectNegationsFlag if true, run the negation detector over each sentence
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found, sorted by entity start position.
 */
public List<Entity> processText(String docid,
				String fieldid,
				String text,
				boolean detectNegationsFlag,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  EntityStartComparator entityComparator = new EntityStartComparator();
  try {
    Set<Entity> entitySet0 = new HashSet<Entity>();
    // i is the sentence index, recorded on each entity as its location position
    int i = 0;
    List<Sentence> sentenceList = this.sentenceExtractor.createSentenceList(text);
    // pass 1: per-sentence tokenization and entity lookup
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);
      Set<Entity> sentenceEntitySet =
	this.processSentenceTokenList(docid, fieldid, tokenList,
				      semTypeRestrictSet,
				      sourceRestrictSet);
      // add entities generated from user-defined acronyms (udaMap)
      sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
			       (docid, this.udaMap, tokenList));
      for (Entity entity: sentenceEntitySet) {
	entity.setLocationPosition(i);
      }
      entitySet0.addAll(sentenceEntitySet);
      i++;
    }
    // pass 2: look for negation and other relations using Context.
    for (Sentence sentence: sentenceList) {
      List<ERToken> tokenList = Scanner.analyzeText(sentence);

      // mark abbreviations that are entities and add them to sentence entity set.
      // NOTE(review): called against the whole text and accumulated entity
      // set once per sentence — presumably idempotent; confirm.
      Set<Entity> abbrevEntitySet =
	new HashSet(MarkAbbreviations.markAbbreviations
		    (text, this.uaMap,
		     new ArrayList(entitySet0)));
      entitySet0.addAll(abbrevEntitySet);
      if (detectNegationsFlag) {
	detectNegations(entitySet0, sentence.getText(), tokenList);
      }
    }

    // remove any entities subsumed by another entity
    Set<Entity> entitySet1 = removeSubsumedEntities(entitySet0);
    // filter entities by semantic type and source sets, keeping only
    // entities that still have evidence afterwards.
    Set<Entity> entitySet = new HashSet<Entity>();
    for (Entity entity: entitySet1) {
      ConceptInfoUtils.filterEntityEvListBySemanticType(entity, semTypeRestrictSet);
      ConceptInfoUtils.filterEntityEvListBySource(entity, sourceRestrictSet);
      if (entity.getEvList().size() > 0) {
	entitySet.add(entity);
      }
    }
    List<Entity> resultList = new ArrayList<Entity>(entitySet);
    Collections.sort(resultList, entityComparator);
    return resultList;
  } catch (FileNotFoundException fnfe) {
    throw new RuntimeException(fnfe);
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

}

/**
 * Convenience overload: process a text string with placeholder
 * document and field identifiers.
 *
 * @param text string containing input text
 * @param useNegationDetection if true, apply negation detection
 * @param semTypeRestrictSet retained concepts must have a semantic type
 * in this set; if empty then all concepts are retained.
 * @param sourceRestrictSet retained concepts must come from a source
 * in this set; if empty then all concepts are retained.
 * @return list of entities found.
 */
public List<Entity> processText(String text,
				boolean useNegationDetection,
				Set<String> semTypeRestrictSet,
				Set<String> sourceRestrictSet) {
  // placeholder identifiers used when the caller supplies bare text
  String placeholderDocId = "000000";
  String placeholderFieldId = "text";
  return this.processText(placeholderDocId, placeholderFieldId, text,
			  useNegationDetection,
			  semTypeRestrictSet, sourceRestrictSet);
}


/** Process passage */
public List<Entity> processPassage(String docid, BioCPassage passage,
boolean detectNegationsFlag,
Expand All @@ -893,10 +970,12 @@ public List<Entity> processPassage(String docid, BioCPassage passage,
int i = 0;
for (BioCSentence sentence: passage.getSentences()) {
List<ERToken> tokenList = Scanner.analyzeText(sentence);
Set<Entity> sentenceEntitySet = this.processSentenceTokenList(docid, fieldid, tokenList,
semTypeRestrictSet,
sourceRestrictSet);
sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities(docid, this.udaMap, tokenList));
Set<Entity> sentenceEntitySet =
this.processSentenceTokenList(docid, fieldid, tokenList,
semTypeRestrictSet,
sourceRestrictSet);
sentenceEntitySet.addAll(UserDefinedAcronym.generateEntities
(docid, this.udaMap, tokenList));
for (Entity entity: sentenceEntitySet) {
entity.setLocationPosition(i);
}
Expand All @@ -909,7 +988,10 @@ public List<Entity> processPassage(String docid, BioCPassage passage,
List<ERToken> tokenList = Scanner.analyzeText(sentence);

// mark abbreviations that are entities and add them to sentence entity set.
Set<Entity> abbrevEntitySet = new HashSet(MarkAbbreviations.markAbbreviations(passage, new ArrayList(entitySet0)));
Set<Entity> abbrevEntitySet =
new HashSet(MarkAbbreviations.markAbbreviations
(passage, this.uaMap,
new ArrayList(entitySet0)));
entitySet0.addAll(abbrevEntitySet);
if (detectNegationsFlag) {
detectNegations(entitySet0, sentence.getText(), tokenList);
Expand Down

0 comments on commit aa271ff

Please sign in to comment.