forked from hankcs/HanLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
《自然语言处理入门》新书携v1.7.5发布🔥:http://nlp.hankcs.com/book.php
- Loading branch information
Showing
56 changed files
with
3,224 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* | ||
* <author>Han He</author> | ||
* <email>[email protected]</email> | ||
* <create-date>2018-05-18 下午5:38</create-date> | ||
* | ||
* <copyright file="HelloWord.java"> | ||
* Copyright (c) 2018, Han He. All Rights Reserved, http://www.hankcs.com/ | ||
* This source is subject to Han He. Please contact Han He for more information. | ||
* </copyright> | ||
*/ | ||
package com.hankcs.book.ch01; | ||
|
||
import com.hankcs.hanlp.HanLP; | ||
|
||
/**
 * "Introduction to Natural Language Processing" (《自然语言处理入门》), section 1.6: open-source tools.
 * Minimal smoke test that HanLP is installed correctly: segments one sample sentence.
 * Companion book: http://nlp.hankcs.com/book.php
 * Q&amp;A forum: https://bbs.hankcs.com/
 *
 * @author hankcs
 * @see <a href="http://nlp.hankcs.com/book.php">《自然语言处理入门》</a>
 * @see <a href="https://bbs.hankcs.com/">Q&amp;A forum</a>
 */
public class HelloWord
{
    public static void main(String[] args)
    {
        // The first run builds the model cache automatically; enabling debug
        // mode prints progress so the wait is less boring :-)
        HanLP.Config.enableDebug();
        // Segment a deliberately ambiguous sentence ("王国维和服务员").
        System.out.println(HanLP.segment("王国维和服务员"));
    }
}
152 changes: 152 additions & 0 deletions
152
src/test/java/com/hankcs/book/ch02/AhoCorasickDoubleArrayTrieSegmentation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
/* | ||
* <author>Han He</author> | ||
* <email>[email protected]</email> | ||
* <create-date>2018-05-28 下午5:59</create-date> | ||
* | ||
* <copyright file="AhoCorasickDoubleArrayTrieSegmentation.java"> | ||
* Copyright (c) 2018, Han He. All Rights Reserved, http://www.hankcs.com/ | ||
* This source is subject to Han He. Please contact Han He for more information. | ||
* </copyright> | ||
*/ | ||
package com.hankcs.book.ch02; | ||
|
||
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie; | ||
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie; | ||
import com.hankcs.hanlp.corpus.io.IOUtil; | ||
import com.hankcs.hanlp.dictionary.CoreDictionary; | ||
|
||
import java.io.IOException; | ||
import java.util.Iterator; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.TreeMap; | ||
|
||
/** | ||
* 《自然语言处理入门》2.7 基于双数组字典树的AC自动机 | ||
* 配套书籍:http://nlp.hankcs.com/book.php | ||
* 讨论答疑:https://bbs.hankcs.com/ | ||
* | ||
* @author hankcs | ||
* @see <a href="http://nlp.hankcs.com/book.php">《自然语言处理入门》</a> | ||
* @see <a href="https://bbs.hankcs.com/">讨论答疑</a> | ||
*/ | ||
public class AhoCorasickDoubleArrayTrieSegmentation | ||
{ | ||
public static void main(String[] args) throws IOException | ||
{ | ||
classicDemo(); | ||
for (int i = 1; i <= 10; ++i) | ||
{ | ||
evaluateSpeed(i); | ||
System.gc(); | ||
} | ||
} | ||
|
||
private static void classicDemo() throws IOException | ||
{ | ||
String[] keyArray = new String[]{"hers", "his", "she", "he"}; | ||
TreeMap<String, String> map = new TreeMap<String, String>(); | ||
for (String key : keyArray) | ||
map.put(key, key.toUpperCase()); | ||
AhoCorasickDoubleArrayTrie<String> acdat = new AhoCorasickDoubleArrayTrie<String>(map); | ||
for (AhoCorasickDoubleArrayTrie<String>.Hit<String> hit : acdat.parseText("ushers")) // 一下子获取全部结果 | ||
{ | ||
System.out.printf("[%d:%d]=%s\n", hit.begin, hit.end, hit.value); | ||
} | ||
System.out.println(); | ||
acdat.parseText("ushers", new AhoCorasickDoubleArrayTrie.IHit<String>() // 及时处理查询结果 | ||
{ | ||
@Override | ||
public void hit(int begin, int end, String value) | ||
{ | ||
System.out.printf("[%d:%d]=%s\n", begin, end, value); | ||
} | ||
}); | ||
} | ||
|
||
private static void evaluateSpeed(int wordLength) throws IOException | ||
{ | ||
TreeMap<String, CoreDictionary.Attribute> dictionary = loadDictionary(wordLength); | ||
|
||
AhoCorasickDoubleArrayTrie<CoreDictionary.Attribute> acdat = new AhoCorasickDoubleArrayTrie<CoreDictionary.Attribute>(dictionary); | ||
DoubleArrayTrie<CoreDictionary.Attribute> dat = new DoubleArrayTrie<CoreDictionary.Attribute>(dictionary); | ||
|
||
String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原"; | ||
long start; | ||
double costTime; | ||
final int pressure = 1000000; | ||
System.out.printf("长度%d:\n", wordLength); | ||
|
||
start = System.currentTimeMillis(); | ||
for (int i = 0; i < pressure; ++i) | ||
{ | ||
acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() | ||
{ | ||
@Override | ||
public void hit(int begin, int end, CoreDictionary.Attribute value) | ||
{ | ||
|
||
} | ||
}); | ||
} | ||
costTime = (System.currentTimeMillis() - start) / (double) 1000; | ||
System.out.printf("ACDAT: %.2f万字/秒\n", text.length() * pressure / 10000 / costTime); | ||
|
||
start = System.currentTimeMillis(); | ||
for (int i = 0; i < pressure; ++i) | ||
{ | ||
dat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() | ||
{ | ||
@Override | ||
public void hit(int begin, int end, CoreDictionary.Attribute value) | ||
{ | ||
|
||
} | ||
}); | ||
} | ||
costTime = (System.currentTimeMillis() - start) / (double) 1000; | ||
System.out.printf("DAT: %.2f万字/秒\n", text.length() * pressure / 10000 / costTime); | ||
} | ||
|
||
/** | ||
* 加载词典,并限制词语长度 | ||
* | ||
* @param minLength 最低长度 | ||
* @return TreeMap形式的词典 | ||
* @throws IOException | ||
*/ | ||
public static TreeMap<String, CoreDictionary.Attribute> loadDictionary(int minLength) throws IOException | ||
{ | ||
TreeMap<String, CoreDictionary.Attribute> dictionary = | ||
IOUtil.loadDictionary("data/dictionary/CoreNatureDictionary.mini.txt"); | ||
|
||
Iterator<String> iterator = dictionary.keySet().iterator(); | ||
while (iterator.hasNext()) | ||
{ | ||
if (iterator.next().length() < minLength) | ||
iterator.remove(); | ||
} | ||
return dictionary; | ||
} | ||
|
||
/** | ||
* 基于ACDAT的完全切分式的中文分词算法 | ||
* | ||
* @param text 待分词的文本 | ||
* @param acdat 词典 | ||
* @return 单词列表 | ||
*/ | ||
public static List<String> segmentFully(final String text, AhoCorasickDoubleArrayTrie<CoreDictionary.Attribute> acdat) | ||
{ | ||
final List<String> wordList = new LinkedList<String>(); | ||
acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() | ||
{ | ||
@Override | ||
public void hit(int begin, int end, CoreDictionary.Attribute value) | ||
{ | ||
wordList.add(text.substring(begin, end)); | ||
} | ||
}); | ||
return wordList; | ||
} | ||
} |
89 changes: 89 additions & 0 deletions
89
src/test/java/com/hankcs/book/ch02/AhoCorasickSegmentation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
/* | ||
* <author>Han He</author> | ||
* <email>[email protected]</email> | ||
* <create-date>2018-05-28 上午11:00</create-date> | ||
* | ||
* <copyright file="AhoCorasickSegmentation.java"> | ||
* Copyright (c) 2018, Han He. All Rights Reserved, http://www.hankcs.com/ | ||
* This source is subject to Han He. Please contact Han He for more information. | ||
* </copyright> | ||
*/ | ||
package com.hankcs.book.ch02; | ||
|
||
import com.hankcs.hanlp.algorithm.ahocorasick.trie.Emit; | ||
import com.hankcs.hanlp.algorithm.ahocorasick.trie.Trie; | ||
import com.hankcs.hanlp.corpus.io.IOUtil; | ||
import com.hankcs.hanlp.dictionary.CoreDictionary; | ||
|
||
import java.io.IOException; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.TreeMap; | ||
|
||
/** | ||
* 《自然语言处理入门》2.6 AC 自动机 | ||
* 配套书籍:http://nlp.hankcs.com/book.php | ||
* 讨论答疑:https://bbs.hankcs.com/ | ||
* | ||
* @author hankcs | ||
* @see <a href="http://nlp.hankcs.com/book.php">《自然语言处理入门》</a> | ||
* @see <a href="https://bbs.hankcs.com/">讨论答疑</a> | ||
*/ | ||
public class AhoCorasickSegmentation | ||
{ | ||
public static void main(String[] args) throws IOException | ||
{ | ||
classicDemo(); | ||
evaluateSpeed(); | ||
} | ||
|
||
private static void classicDemo() | ||
{ | ||
String[] keyArray = new String[]{"hers", "his", "she", "he"}; | ||
Trie trie = new Trie(); | ||
for (String key : keyArray) | ||
trie.addKeyword(key); | ||
for (Emit emit : trie.parseText("ushers")) | ||
System.out.printf("[%d:%d]=%s\n", emit.getStart(), emit.getEnd(), emit.getKeyword()); | ||
} | ||
|
||
private static void evaluateSpeed() throws IOException | ||
{ | ||
// 加载词典 | ||
TreeMap<String, CoreDictionary.Attribute> dictionary = | ||
IOUtil.loadDictionary("data/dictionary/CoreNatureDictionary.mini.txt"); | ||
Trie trie = new Trie(dictionary.keySet()); | ||
|
||
String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原"; | ||
long start; | ||
double costTime; | ||
final int pressure = 1000000; | ||
|
||
System.out.println("===AC自动机接口==="); | ||
System.out.println("完全切分"); | ||
start = System.currentTimeMillis(); | ||
for (int i = 0; i < pressure; ++i) | ||
{ | ||
segmentFully(text, trie); | ||
} | ||
costTime = (System.currentTimeMillis() - start) / (double) 1000; | ||
System.out.printf("%.2f万字/秒\n", text.length() * pressure / 10000 / costTime); | ||
} | ||
|
||
/** | ||
* 基于AC自动机的完全切分式的中文分词算法 | ||
* | ||
* @param text 待分词的文本 | ||
* @param dictionary 词典 | ||
* @return 单词列表 | ||
*/ | ||
public static List<String> segmentFully(final String text, Trie dictionary) | ||
{ | ||
final List<String> wordList = new LinkedList<String>(); | ||
for (Emit emit : dictionary.parseText(text)) | ||
{ | ||
wordList.add(emit.getKeyword()); | ||
} | ||
return wordList; | ||
} | ||
} |
Oops, something went wrong.