Skip to content

Commit

Permalink
CoreStopWordDictionary支持自定义过滤逻辑
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Nov 13, 2015
1 parent 504a9eb commit 1f29a35
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 34 deletions.
10 changes: 10 additions & 0 deletions src/main/java/com/hankcs/hanlp/corpus/tag/Nature.java
Original file line number Diff line number Diff line change
Expand Up @@ -783,4 +783,14 @@ public boolean startsWith(char prefix)
{
return toString().charAt(0) == prefix;
}

/**
* Returns the first letter of this part-of-speech tag.<br>
* The leading letters of a tag indicate the broad category it belongs to.
*
* @return the character at index 0 of {@code toString()}
*/
public char firstChar()
{
final String tag = toString();
return tag.charAt(0);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,56 +61,56 @@ public static boolean contains(String key)
}

/**
* Core filter of the core stop-word dictionary.
* <p>
* Per the original description: a term is kept only when its part of speech is a noun, verb,
* adverb or adjective, and the word is not in the stop-word list.
* <p>
* The field is not final, so it can be replaced with a custom {@code Filter}.
*/
public static Filter FILTER = new Filter()
{
@Override
public boolean shouldInclude(Term term)
{
// NOTE(review): the statements after this return are unreachable as written; this looks like
// a diff rendered inline (removed line followed by the added body) - confirm against the real file.
return CoreStopWordDictionary.shouldInclude(term);
// Remove stop words
// Fall back to a placeholder tag when the term has no nature, so charAt(0) is always safe.
String nature = term.nature != null ? term.nature.toString() : "空";
char firstChar = nature.charAt(0);
switch (firstChar)
{
// Tags whose first letter is one of these are always excluded.
case 'm':
case 'b':
case 'c':
case 'e':
case 'o':
case 'p':
case 'q':
case 'u':
case 'y':
case 'z':
case 'r':
case 'w':
{
return false;
}
default:
{
// Any other tag: keep the term only if it is longer than one character
// and is not listed in the stop-word dictionary.
if (term.word.length() > 1 && !CoreStopWordDictionary.contains(term.word))
{
return true;
}
}
break;
}

// Reached when the default branch did not accept the term.
return false;
}
};

/**
* Decides whether the given term should be included in the computation.
* <p>
* Per the original description: the part of speech must be a noun, verb, adverb or adjective,
* and the word must not be in the stop-word list.
*
* @param term the term to check
* @return whether the term should be included
*/
public static boolean shouldInclude(Term term)
{
// Remove stop words
// Fall back to a placeholder tag when the term has no nature, so charAt(0) is always safe.
String nature = term.nature != null ? term.nature.toString() : "空";
char firstChar = nature.charAt(0);
switch (firstChar)
{
// Tags whose first letter is one of these are always excluded.
case 'm':
case 'b':
case 'c':
case 'e':
case 'o':
case 'p':
case 'q':
case 'u':
case 'y':
case 'z':
case 'r':
case 'w':
{
return false;
}
default:
{
// Any other tag: keep the term only if it is longer than one character
// and is not listed in the stop-word dictionary.
if (term.word.length() > 1 && !CoreStopWordDictionary.contains(term.word))
{
return true;
}
}
break;
}

return false;
// NOTE(review): the return below is unreachable as written; this looks like a diff rendered
// inline (removed body followed by the added delegating line) - confirm against the real file.
return FILTER.shouldInclude(term);
}

/**
Expand Down
16 changes: 16 additions & 0 deletions src/test/java/com/hankcs/demo/DemoStopWord.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.dictionary.stopword.Filter;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.BasicTokenizer;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
Expand All @@ -38,5 +39,20 @@ public static void main(String[] args)
System.out.println(termList);
CoreStopWordDictionary.apply(termList);
System.out.println(termList);
// 还可以自定义过滤逻辑
CoreStopWordDictionary.FILTER = new Filter()
{
/**
* Custom filtering logic for the demo: only terms tagged {@code nz} whose word is not
* in the core stop-word dictionary are kept; everything else is dropped.
*
* @param term the term to check
* @return {@code true} if the term is tagged {@code nz} and is not a stop word
*/
@Override
public boolean shouldInclude(Term term)
{
// Switching on a null enum reference throws NullPointerException,
// so terms without a nature are rejected up front.
if (term.nature == null)
{
return false;
}
switch (term.nature)
{
case nz:
return !CoreStopWordDictionary.contains(term.word);
default:
return false;
}
}
};
System.out.println(NotionalTokenizer.segment(text));
}
}

0 comments on commit 1f29a35

Please sign in to comment.