Skip to content

Commit

Permalink
Intern strings for distsim in NERFeatureFactory. Saves quite a bit of memory downfield
Browse files: browse the repository at this point in the history
  • Loading branch information
AngledLuffa authored and Stanford NLP committed Mar 2, 2020
1 parent b58a48f commit e98687c
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 221 deletions.
8 changes: 7 additions & 1 deletion src/edu/stanford/nlp/ie/NERFeatureFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Interner;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;
Expand Down Expand Up @@ -576,6 +577,11 @@ private void initLexicon(SeqClassifierFlags flags) {
return;
}
Timing timing = new Timing();
// A local Interner should work better than String.intern() here.
// Interning the distsim class strings this way means equal values share
// one String instance, so they should also be serialized in an interned
// manner, saving disk space as well as memory when loading them back in.
Interner<String> interner = new Interner<>();
lexicon = Generics.newHashMap(10000);
boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
Pattern p = Pattern.compile(terryKoo ? "\\t" : "\\s+");
Expand All @@ -602,7 +608,7 @@ private void initLexicon(SeqClassifierFlags flags) {
if (flags.numberEquivalenceDistSim) {
word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
}
lexicon.put(word, wordClass);
lexicon.put(word, interner.intern(wordClass));
}
timing.done(log, "Loading distsim lexicon from " + flags.distSimLexicon);
}
Expand Down

This file was deleted.

0 comments on commit e98687c

Please sign in to comment.