From 6182ca48a7c8c42cb256960d5391221a3d460e26 Mon Sep 17 00:00:00 2001 From: hankcs Date: Wed, 24 Dec 2014 19:21:56 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96BiGram=E8=AF=8D=E5=85=B8?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../AhoCorasickDoubleArrayTrie.java | 29 +++++++-- .../collection/trie/DoubleArrayTrie.java | 6 ++ .../dictionary/CoreBiGramMixDictionary.java | 54 ++++++++-------- .../dictionary/CoreBiGramTableDictionary.java | 62 ++++++++++++------- .../hanlp/dictionary/CoreDictionary.java | 2 +- .../hanlp/seg/HiddenMarkovModelSegment.java | 4 +- .../com/hankcs/hanlp/utility/MathTools.java | 1 - .../hankcs/test/seg/testBiGramDictionary.java | 48 +++++++++++++- 8 files changed, 145 insertions(+), 61 deletions(-) diff --git a/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java b/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java index 961fa17f8..1a3e8442a 100644 --- a/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java +++ b/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java @@ -13,9 +13,11 @@ import com.hankcs.hanlp.corpus.io.ByteArray; -import com.hankcs.hanlp.corpus.io.ICacheAble; import java.io.DataOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.util.*; import java.util.concurrent.LinkedBlockingDeque; @@ -256,7 +258,7 @@ public void parseText(char[] text, IHitFull processor) { for (int hit : hitArray) { - processor.hit(position - l[hit], position, v, hit); + processor.hit(position - l[hit], position, v[hit], hit); } } ++position; @@ -292,6 +294,25 @@ public void save(DataOutputStream out) throws Exception } } + public void save(ObjectOutputStream out) throws IOException + { + out.writeObject(base); + out.writeObject(check); + out.writeObject(fail); + out.writeObject(output); + out.writeObject(l); + } + + public void load(ObjectInputStream in, V[] value) throws IOException, ClassNotFoundException + { + base = (int[]) in.readObject(); + check = (int[]) in.readObject(); + fail = (int[]) in.readObject(); + output = (int[][]) in.readObject(); + l = (int[]) in.readObject(); + v = value; + } + public boolean load(ByteArray byteArray, V[] value) { if (byteArray == null) return false; @@ -362,10 +383,10 @@ public interface IHitFull * 命中一个模式串 * @param begin 模式串在母文本中的起始位置 * @param end 模式串在母文本中的终止位置 - * @param value 模式串对应的值的数组 + * @param value 模式串对应的值 * @param index 模式串对应的值的下标 */ - void hit(int begin, int end, V[] value, int index); + void hit(int begin, int end, V value, int index); } /** diff --git a/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java b/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java index 74d4b560e..bb4751152 100644 --- a/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java +++ b/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java @@ -480,6 +480,12 @@ public boolean save(DataOutputStream out) return true; } + public void save(ObjectOutputStream out) throws IOException + { + out.writeObject(base); + out.writeObject(check); + } + /** * 从磁盘加载,需要额外提供值 * diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramMixDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramMixDictionary.java index e34e7709e..da6592139 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramMixDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramMixDictionary.java @@ -170,33 +170,33 @@ else if (midVal > key) - public static int getBiFrequency(Vertex from, Vertex to) - { - StringBuilder key = new StringBuilder(); - int idA = from.wordID; - if (idA == -1) - { - key.append(from.word); - } - else - { - key.append(ByteUtil.convertIntToTwoChar(idA)); - } - key.append('@'); - int idB = to.wordID; - if (idB == -1) - { - key.append(to.word); - } - else - { - key.append(ByteUtil.convertIntToTwoChar(idB)); - } - - Integer freq = trie.get(key.toString()); - if (freq == null) return 0; - return freq; - } +// public static int getBiFrequency(Vertex from, Vertex to) +// { +// StringBuilder key = new StringBuilder(); +// int idA = from.wordID; +// if (idA == -1) +// { +// key.append(from.word); +// } +// else +// { +// key.append(ByteUtil.convertIntToTwoChar(idA)); +// } +// key.append('@'); +// int idB = to.wordID; +// if (idB == -1) +// { +// key.append(to.word); +// } +// else +// { +// key.append(ByteUtil.convertIntToTwoChar(idB)); +// } +// +// Integer freq = trie.get(key.toString()); +// if (freq == null) return 0; +// return freq; +// } static void buildID(String word, StringBuilder sbStorage) { diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramTableDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramTableDictionary.java index cf14e419a..f626b033f 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramTableDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CoreBiGramTableDictionary.java @@ -138,17 +138,21 @@ static boolean saveDat(String path) { try { - DataOutputStream out = new DataOutputStream(new FileOutputStream(path)); - out.writeInt(start.length); - for (int i : start) - { - out.writeInt(i); - } - out.writeInt(pair.length); - for (int i : pair) - { - out.writeInt(i); - } +// DataOutputStream out = new DataOutputStream(new FileOutputStream(path)); +// out.writeInt(start.length); +// for (int i : start) +// { +// out.writeInt(i); +// } +// out.writeInt(pair.length); +// for (int i : pair) +// { +// out.writeInt(i); +// } +// out.close(); + ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(path)); + out.writeObject(start); + out.writeObject(pair); out.close(); } catch (Exception e) @@ -162,23 +166,35 @@ static boolean saveDat(String path) static boolean loadDat(String path) { - ByteArray byteArray = ByteArray.createByteArray(path); - if (byteArray == null) return false; +// ByteArray byteArray = ByteArray.createByteArray(path); +// if (byteArray == null) return false; +// +// int size = byteArray.nextInt(); // 这两个数组从byte转为int竟然要花4秒钟 +// start = new int[size]; +// for (int i = 0; i < size; ++i) +// { +// start[i] = byteArray.nextInt(); +// } +// +// size = byteArray.nextInt(); +// pair = new int[size]; +// for (int i = 0; i < size; ++i) +// { +// pair[i] = byteArray.nextInt(); +// } - int size = byteArray.nextInt(); - start = new int[size]; - for (int i = 0; i < size; ++i) + try { - start[i] = byteArray.nextInt(); + ObjectInputStream in = new ObjectInputStream(new FileInputStream(path)); + start = (int[]) in.readObject(); + pair = (int[]) in.readObject(); + in.close(); } - - size = byteArray.nextInt(); - pair = new int[size]; - for (int i = 0; i < size; ++i) + catch (Exception e) { - pair[i] = byteArray.nextInt(); + logger.log(Level.WARNING, "尝试载入缓存文件" + path + "发生异常", e); + return false; } - return true; } diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java index fdf385d9c..9d30e1d83 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java @@ -77,7 +77,7 @@ public static boolean load(String path) logger.info("核心词典读入词条" + map.size() + " 全部频次" + MAX_FREQUENCY + ",耗时" + (System.currentTimeMillis() - start) + "ms"); br.close(); trie.build(map); - logger.info("核心词典加载成功:" + trie.size() + "个词条"); + logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……"); try { DataOutputStream out = new DataOutputStream(new FileOutputStream(path + Predefine.BIN_EXT)); diff --git a/src/main/java/com/hankcs/hanlp/seg/HiddenMarkovModelSegment.java b/src/main/java/com/hankcs/hanlp/seg/HiddenMarkovModelSegment.java index 118e4df9c..9fe28446f 100644 --- a/src/main/java/com/hankcs/hanlp/seg/HiddenMarkovModelSegment.java +++ b/src/main/java/com/hankcs/hanlp/seg/HiddenMarkovModelSegment.java @@ -431,9 +431,9 @@ protected WordNet GenerateWordNet(final String sSentence, final WordNet wordNetS CoreDictionary.trie.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHitFull() { @Override - public void hit(int begin, int end, CoreDictionary.Attribute[] value, int index) + public void hit(int begin, int end, CoreDictionary.Attribute value, int index) { - wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value[index], index)); + wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value, index)); } }); // 用户词典查询 diff --git a/src/main/java/com/hankcs/hanlp/utility/MathTools.java b/src/main/java/com/hankcs/hanlp/utility/MathTools.java index bbb37e9e7..4f10f5caf 100644 --- a/src/main/java/com/hankcs/hanlp/utility/MathTools.java +++ b/src/main/java/com/hankcs/hanlp/utility/MathTools.java @@ -36,7 +36,6 @@ public static double calculateWeight(Vertex from, Vertex to) { frequency = 1; // 防止发生除零错误 } -// int nTwoWordsFreq = CoreBiGramDictionaryEx.getBiFrequency(from, to); // int nTwoWordsFreq = BiGramDictionary.getBiFrequency(from.word, to.word); int nTwoWordsFreq = CoreBiGramTableDictionary.getBiFrequency(from.wordID, to.wordID); double value = -Math.log(dSmoothingPara * frequency / (MAX_FREQUENCY) + (1 - dSmoothingPara) * ((1 - dTemp) * nTwoWordsFreq / frequency + dTemp)); diff --git a/src/test/java/com/hankcs/test/seg/testBiGramDictionary.java b/src/test/java/com/hankcs/test/seg/testBiGramDictionary.java index a40f004bb..d5e5ff5cb 100644 --- a/src/test/java/com/hankcs/test/seg/testBiGramDictionary.java +++ b/src/test/java/com/hankcs/test/seg/testBiGramDictionary.java @@ -17,9 +17,10 @@ import com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary; import junit.framework.TestCase; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.InputStreamReader; +import java.io.*; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; /** * @author hankcs @@ -104,6 +105,47 @@ public void testFastBiGram() throws Exception public void testSingle() throws Exception { + HanLP.Config.enableDebug(); System.out.println(CoreBiGramTableDictionary.getBiFrequency("团结", "奋斗")); } + + public void testBenchmark() throws Exception + { + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(HanLP.Config.BiGramDictionaryPath))); + String line; + List twoWordList = new LinkedList<>(); + while ((line = br.readLine()) != null) + { + String[] params = line.split("\\s"); + String[] twoWord = params[0].split("@", 2); + twoWordList.add(twoWord); + } + br.close(); + long start = System.currentTimeMillis(); + for (String[] twoWord : twoWordList) + { + } + } + + public void testObjectOutPut() throws Exception + { + int size = 5563418; + int[] array = new int[size]; + for (int i = 0; i < array.length; i++) + { + array[i] = i; + } + ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("data/test/out.bin")); + out.writeObject(array); + out.close(); + + long start = System.currentTimeMillis(); + ObjectInputStream in = new ObjectInputStream(new FileInputStream("data/test/out.bin")); + int[] inArray = (int[]) in.readObject(); + System.out.println(System.currentTimeMillis() - start); + for (int i = 0; i < inArray.length; i++) + { + assertEquals(i, inArray[i]); + } + } }