Skip to content

Commit

Permalink
优化BiGram词典加载速度
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Dec 24, 2014
1 parent 88315d0 commit 6182ca4
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 61 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@


import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.ICacheAble;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.*;
import java.util.concurrent.LinkedBlockingDeque;

Expand Down Expand Up @@ -256,7 +258,7 @@ public void parseText(char[] text, IHitFull<V> processor)
{
for (int hit : hitArray)
{
processor.hit(position - l[hit], position, v, hit);
processor.hit(position - l[hit], position, v[hit], hit);
}
}
++position;
Expand Down Expand Up @@ -292,6 +294,25 @@ public void save(DataOutputStream out) throws Exception
}
}

/**
 * Writes the structural arrays of this trie to {@code out} using Java object serialization.
 * <p>
 * The arrays are written in the fixed order {@code base}, {@code check}, {@code fail},
 * {@code output}, {@code l}. The value array {@code v} is not written by this method.
 *
 * @param out the object stream to write to; this method does not close it
 * @throws IOException if writing any of the arrays fails
 */
public void save(ObjectOutputStream out) throws IOException
{
    // The iteration order below is the on-disk order.
    final Object[] sections = {base, check, fail, output, l};
    for (Object section : sections)
    {
        out.writeObject(section);
    }
}

/**
 * Restores the structural arrays of this trie from {@code in} and attaches the supplied values.
 * <p>
 * Five objects are read in the order {@code base}, {@code check}, {@code fail}, {@code output},
 * {@code l}. The value array is not read from the stream; {@code value} is stored as {@code v}.
 * <p>
 * The load is all-or-nothing: every object is read into a local first and the fields are only
 * replaced once all reads and casts have succeeded, so a truncated or mismatched stream cannot
 * leave this trie with a mixture of new and old arrays.
 * <p>
 * NOTE(review): this uses Java native deserialization; presumably the stream is a locally
 * generated cache file. Confirm it is never fed untrusted input.
 *
 * @param in    the object stream to read from; this method does not close it
 * @param value the value array to use as {@code v}
 * @throws IOException            if reading from the stream fails
 * @throws ClassNotFoundException if the class of a serialized object cannot be found
 */
public void load(ObjectInputStream in, V[] value) throws IOException, ClassNotFoundException
{
    // Stage 1: read and type-check everything without touching the current state.
    final int[] loadedBase = (int[]) in.readObject();
    final int[] loadedCheck = (int[]) in.readObject();
    final int[] loadedFail = (int[]) in.readObject();
    final int[][] loadedOutput = (int[][]) in.readObject();
    final int[] loadedLength = (int[]) in.readObject();

    // Stage 2: commit. Nothing below can throw, so the fields change together.
    base = loadedBase;
    check = loadedCheck;
    fail = loadedFail;
    output = loadedOutput;
    l = loadedLength;
    v = value;
}

public boolean load(ByteArray byteArray, V[] value)
{
if (byteArray == null) return false;
Expand Down Expand Up @@ -362,10 +383,10 @@ public interface IHitFull<V>
* 命中一个模式串
* @param begin 模式串在母文本中的起始位置
* @param end 模式串在母文本中的终止位置
* @param value 模式串对应的值的数组
* @param value 模式串对应的值
* @param index 模式串对应的值的下标
*/
void hit(int begin, int end, V[] value, int index);
void hit(int begin, int end, V value, int index);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,12 @@ public boolean save(DataOutputStream out)
return true;
}

/**
 * Writes the {@code base} array and then the {@code check} array to {@code out} using Java
 * object serialization. Nothing else is written by this method.
 *
 * @param out the object stream to write to; this method does not close it
 * @throws IOException if writing either array fails
 */
public void save(ObjectOutputStream out) throws IOException
{
    for (Object section : new Object[]{base, check})
    {
        out.writeObject(section);
    }
}

/**
* 从磁盘加载,需要额外提供值
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,33 +170,33 @@ else if (midVal > key)



/**
 * Looks up the frequency stored in {@code trie} for the transition {@code from -> to}.
 * <p>
 * The lookup key is {@code <from-part>@<to-part>}. For each vertex the part is the result of
 * {@code ByteUtil.convertIntToTwoChar(wordID)}, or the vertex's {@code word} when its
 * {@code wordID} is {@code -1}.
 *
 * @param from the first vertex of the pair
 * @param to   the second vertex of the pair
 * @return the value found in {@code trie} for the key, or {@code 0} when there is no entry
 */
public static int getBiFrequency(Vertex from, Vertex to)
{
    final StringBuilder keyBuilder = new StringBuilder();

    // Left-hand side of the key.
    final int fromId = from.wordID;
    if (fromId != -1)
    {
        keyBuilder.append(ByteUtil.convertIntToTwoChar(fromId));
    }
    else
    {
        keyBuilder.append(from.word);
    }

    keyBuilder.append('@');

    // Right-hand side of the key.
    final int toId = to.wordID;
    if (toId != -1)
    {
        keyBuilder.append(ByteUtil.convertIntToTwoChar(toId));
    }
    else
    {
        keyBuilder.append(to.word);
    }

    final Integer frequency = trie.get(keyBuilder.toString());
    return frequency == null ? 0 : frequency;
}
// public static int getBiFrequency(Vertex from, Vertex to)
// {
// StringBuilder key = new StringBuilder();
// int idA = from.wordID;
// if (idA == -1)
// {
// key.append(from.word);
// }
// else
// {
// key.append(ByteUtil.convertIntToTwoChar(idA));
// }
// key.append('@');
// int idB = to.wordID;
// if (idB == -1)
// {
// key.append(to.word);
// }
// else
// {
// key.append(ByteUtil.convertIntToTwoChar(idB));
// }
//
// Integer freq = trie.get(key.toString());
// if (freq == null) return 0;
// return freq;
// }

static void buildID(String word, StringBuilder sbStorage)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,17 +138,21 @@ static boolean saveDat(String path)
{
try
{
DataOutputStream out = new DataOutputStream(new FileOutputStream(path));
out.writeInt(start.length);
for (int i : start)
{
out.writeInt(i);
}
out.writeInt(pair.length);
for (int i : pair)
{
out.writeInt(i);
}
// DataOutputStream out = new DataOutputStream(new FileOutputStream(path));
// out.writeInt(start.length);
// for (int i : start)
// {
// out.writeInt(i);
// }
// out.writeInt(pair.length);
// for (int i : pair)
// {
// out.writeInt(i);
// }
// out.close();
ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(path));
out.writeObject(start);
out.writeObject(pair);
out.close();
}
catch (Exception e)
Expand All @@ -162,23 +166,35 @@ static boolean saveDat(String path)

static boolean loadDat(String path)
{
ByteArray byteArray = ByteArray.createByteArray(path);
if (byteArray == null) return false;
// ByteArray byteArray = ByteArray.createByteArray(path);
// if (byteArray == null) return false;
//
// int size = byteArray.nextInt(); // 这两个数组从byte转为int竟然要花4秒钟
// start = new int[size];
// for (int i = 0; i < size; ++i)
// {
// start[i] = byteArray.nextInt();
// }
//
// size = byteArray.nextInt();
// pair = new int[size];
// for (int i = 0; i < size; ++i)
// {
// pair[i] = byteArray.nextInt();
// }

int size = byteArray.nextInt();
start = new int[size];
for (int i = 0; i < size; ++i)
try
{
start[i] = byteArray.nextInt();
ObjectInputStream in = new ObjectInputStream(new FileInputStream(path));
start = (int[]) in.readObject();
pair = (int[]) in.readObject();
in.close();
}

size = byteArray.nextInt();
pair = new int[size];
for (int i = 0; i < size; ++i)
catch (Exception e)
{
pair[i] = byteArray.nextInt();
logger.log(Level.WARNING, "尝试载入缓存文件" + path + "发生异常", e);
return false;
}

return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public static boolean load(String path)
logger.info("核心词典读入词条" + map.size() + " 全部频次" + MAX_FREQUENCY + ",耗时" + (System.currentTimeMillis() - start) + "ms");
br.close();
trie.build(map);
logger.info("核心词典加载成功:" + trie.size() + "个词条");
logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
try
{
DataOutputStream out = new DataOutputStream(new FileOutputStream(path + Predefine.BIN_EXT));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,9 @@ protected WordNet GenerateWordNet(final String sSentence, final WordNet wordNetS
CoreDictionary.trie.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHitFull<CoreDictionary.Attribute>()
{
@Override
public void hit(int begin, int end, CoreDictionary.Attribute[] value, int index)
public void hit(int begin, int end, CoreDictionary.Attribute value, int index)
{
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value[index], index));
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value, index));
}
});
// 用户词典查询
Expand Down
1 change: 0 additions & 1 deletion src/main/java/com/hankcs/hanlp/utility/MathTools.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ public static double calculateWeight(Vertex from, Vertex to)
{
frequency = 1; // 防止发生除零错误
}
// int nTwoWordsFreq = CoreBiGramDictionaryEx.getBiFrequency(from, to);
// int nTwoWordsFreq = BiGramDictionary.getBiFrequency(from.word, to.word);
int nTwoWordsFreq = CoreBiGramTableDictionary.getBiFrequency(from.wordID, to.wordID);
double value = -Math.log(dSmoothingPara * frequency / (MAX_FREQUENCY) + (1 - dSmoothingPara) * ((1 - dTemp) * nTwoWordsFreq / frequency + dTemp));
Expand Down
48 changes: 45 additions & 3 deletions src/test/java/com/hankcs/test/seg/testBiGramDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
import com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary;
import junit.framework.TestCase;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.*;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
* @author hankcs
Expand Down Expand Up @@ -104,6 +105,47 @@ public void testFastBiGram() throws Exception

/**
 * Calls {@code HanLP.Config.enableDebug()} and prints the result of a single
 * {@code CoreBiGramTableDictionary.getBiFrequency} lookup for one fixed word pair.
 * Nothing is asserted; the output is meant for manual inspection.
 */
public void testSingle() throws Exception
{
    HanLP.Config.enableDebug();
    final String frequencyText = String.valueOf(CoreBiGramTableDictionary.getBiFrequency("团结", "奋斗"));
    System.out.println(frequencyText);
}

/**
 * Benchmark scaffold: reads the file at {@code HanLP.Config.BiGramDictionaryPath} and collects
 * the word pair from every line, then iterates over the pairs.
 * <p>
 * For each line, the text before the first whitespace character is taken and split at the
 * first {@code '@'} into at most two parts. The timed loop at the end is still an empty
 * placeholder, so nothing is measured or asserted yet.
 *
 * @throws Exception if the dictionary file cannot be opened or read
 */
public void testBenchmark() throws Exception
{
    List<String[]> twoWordList = new LinkedList<>();
    // try-with-resources guarantees the reader is closed even if readLine() throws.
    // NOTE(review): the reader uses the platform default charset, as before — confirm that
    // matches the dictionary file's encoding.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(HanLP.Config.BiGramDictionaryPath))))
    {
        String line;
        while ((line = br.readLine()) != null)
        {
            String[] params = line.split("\\s");
            String[] twoWord = params[0].split("@", 2);
            twoWordList.add(twoWord);
        }
    }
    // Timing anchor for the lookup loop below; not consumed until the loop gets a body.
    long start = System.currentTimeMillis();
    for (String[] twoWord : twoWordList)
    {
        // Placeholder: the lookup to be benchmarked has not been written yet.
    }
}

/**
 * Round-trips a large {@code int[]} through Java object serialization and prints how many
 * milliseconds opening the file and reading the array back took.
 * <p>
 * The array holds {@code array[i] == i}; it is written to {@code data/test/out.bin}, read
 * back, and checked for both length and content.
 *
 * @throws Exception if the file cannot be written or read, or deserialization fails
 */
public void testObjectOutPut() throws Exception
{
    int size = 5563418;
    int[] array = new int[size];
    for (int i = 0; i < array.length; i++)
    {
        array[i] = i;
    }
    // try-with-resources closes the stream even if writeObject throws.
    try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("data/test/out.bin")))
    {
        out.writeObject(array);
    }

    long start = System.currentTimeMillis();
    int[] inArray;
    try (ObjectInputStream in = new ObjectInputStream(new FileInputStream("data/test/out.bin")))
    {
        inArray = (int[]) in.readObject();
        // Print before the stream is closed so the measured span is open + read, as before.
        System.out.println(System.currentTimeMillis() - start);
    }
    // Without this check a truncated array would pass, because the loop below is bounded
    // by the length of the array that was read back.
    assertEquals(size, inArray.length);
    for (int i = 0; i < inArray.length; i++)
    {
        assertEquals(i, inArray[i]);
    }
}
}

0 comments on commit 6182ca4

Please sign in to comment.