Skip to content

Commit

Permalink
支持拼音推荐器
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Nov 6, 2014
1 parent e46da03 commit dccb5f3
Show file tree
Hide file tree
Showing 38 changed files with 2,479 additions and 1,670 deletions.
90 changes: 90 additions & 0 deletions main/java/com/hankcs/hanlp/algoritm/EditDistance.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,48 @@ public static long compute(long[] arrayA, long[] arrayB)
return d[m][n];
}

/**
 * Computes a weighted edit distance between two int sequences.
 * <p>
 * Equal elements cost nothing. Replacing one element by another costs the absolute
 * difference of the two values, inserting an element costs that element's value and
 * deleting an element costs the deleted element's value. The first row and column of
 * the table are seeded with plain indices (unit cost per element).
 *
 * @param arrayA the first sequence
 * @param arrayB the second sequence
 * @return the accumulated cost, or {@code Integer.MAX_VALUE / 3} when either sequence is empty
 */
public static int compute(int[] arrayA, int[] arrayB)
{
    final int rows = arrayA.length;
    final int cols = arrayB.length;
    // An empty side yields a large sentinel instead of a real distance.
    if (rows == 0 || cols == 0)
    {
        return Integer.MAX_VALUE / 3;
    }

    final int[][] cost = new int[rows + 1][cols + 1];
    // cost[0][0] is already 0; seed the borders with their index.
    for (int r = 1; r <= rows; ++r)
    {
        cost[r][0] = r;
    }
    for (int c = 1; c <= cols; ++c)
    {
        cost[0][c] = c;
    }

    for (int r = 1; r <= rows; ++r)
    {
        final int left = arrayA[r - 1];
        final int[] previous = cost[r - 1];
        final int[] current = cost[r];
        for (int c = 1; c <= cols; ++c)
        {
            final int right = arrayB[c - 1];
            if (left == right)
            {
                // Matching elements: carry the diagonal value over unchanged.
                current[c] = previous[c - 1];
                continue;
            }
            // Candidates: turn left into right, insert right, delete left.
            final int substitute = previous[c - 1] + Math.abs(left - right);
            final int insert = current[c - 1] + right;
            final int delete = previous[c] + left;
            current[c] = Math.min(substitute, Math.min(insert, delete));
        }
    }

    return cost[rows][cols];
}

/**
* 编辑距离
*
Expand Down Expand Up @@ -127,4 +169,52 @@ else if (i > 1 && j > 1 && ci == rightWord.charAt(j - 2) && cj == wrongWord.char

return d[m][n];
}

/**
 * Edit distance between two character arrays, where swapping two adjacent
 * characters is recognised as a dedicated case.
 *
 * @param wrongWord string A (the original note says the two arguments may be swapped
 *                  without changing the result)
 * @param rightWord string B
 * @return the distance between the two arrays
 */
public static int compute(char[] wrongWord, char[] rightWord)
{
    final int wrongLength = wrongWord.length;
    final int rightLength = rightWord.length;

    final int[][] dist = new int[wrongLength + 1][rightLength + 1];
    // Borders: distance to or from the empty prefix equals the prefix length.
    for (int r = 1; r <= wrongLength; ++r)
    {
        dist[r][0] = r;
    }
    for (int c = 1; c <= rightLength; ++c)
    {
        dist[0][c] = c;
    }

    for (int r = 1; r <= wrongLength; ++r)
    {
        final char wrongChar = wrongWord[r - 1];
        for (int c = 1; c <= rightLength; ++c)
        {
            final char rightChar = rightWord[c - 1];
            if (wrongChar == rightChar)
            {
                dist[r][c] = dist[r - 1][c - 1];
                continue;
            }

            // Crossed equality: the last two characters of both prefixes are swapped.
            final boolean crossed = r > 1 && c > 1
                    && wrongChar == rightWord[c - 2]
                    && rightChar == wrongWord[r - 2];
            // A swap looks two steps back on the diagonal, a substitution one step.
            final int diagonal = crossed ? dist[r - 2][c - 2] : dist[r - 1][c - 1];
            // The remaining candidates are: insert rightChar, delete wrongChar.
            final int sideways = Math.min(dist[r][c - 1], dist[r - 1][c]);
            dist[r][c] = 1 + Math.min(diagonal, sideways);
        }
    }

    return dist[wrongLength][rightLength];
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,9 @@ public String getFragment()

public abstract Emit getEmit();

/**
 * Renders this object as its fragment, a slash, and the result of {@code isMatch()}.
 */
@Override
public String toString()
{
    final StringBuilder text = new StringBuilder();
    text.append(fragment);
    text.append("/");
    text.append(isMatch());
    return text.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
*/
package com.hankcs.hanlp.collection.trie;

import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -523,15 +522,15 @@ private boolean loadBaseAndCheckByFileChannel(String path)
fis.close();

int index = 0;
size = CharUtility.bytesHighFirstToInt(bytes, index);
size = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
base = new int[size + 65535]; // 多留一些,防止越界
check = new int[size + 65535];
for (int i = 0; i < size; i++)
{
base[i] = CharUtility.bytesHighFirstToInt(bytes, index);
base[i] = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
check[i] = CharUtility.bytesHighFirstToInt(bytes, index);
check[i] = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@

import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.DataOutputStream;
import java.io.FileOutputStream;
Expand Down Expand Up @@ -265,7 +264,7 @@ public boolean save(String path)
}
catch (Exception e)
{
logger.warning("保存到" + path + "失败" + CharUtility.exceptionToString(e));
logger.warning("保存到" + path + "失败" + TextUtility.exceptionToString(e));
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.corpus.util.Precompiler;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.CharUtility;

import java.util.LinkedList;
import java.util.List;
Expand Down Expand Up @@ -118,7 +117,7 @@ boolean shouldInclude(Word word)
case "mq":
case "w":
case "t":
if (!CharUtility.isAllChinese(word.value)) return false;
if (!TextUtility.isAllChinese(word.value)) return false;
case "nr":
return false;
}
Expand Down
6 changes: 3 additions & 3 deletions main/java/com/hankcs/hanlp/corpus/io/ByteArray.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
*/
package com.hankcs.hanlp.corpus.io;

import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

/**
* 对字节数组进行封装,提供方便的读取操作
Expand Down Expand Up @@ -46,7 +46,7 @@ public static ByteArray createByteArray(String path)
*/
public int nextInt()
{
int result = CharUtility.bytesHighFirstToInt(bytes, offset);
int result = TextUtility.bytesHighFirstToInt(bytes, offset);
offset += 4;
return result;
}
Expand All @@ -57,7 +57,7 @@ public int nextInt()
*/
public char nextChar()
{
char result = CharUtility.bytesHighFirstToChar(bytes, offset);
char result = TextUtility.bytesHighFirstToChar(bytes, offset);
offset += 2;
return result;
}
Expand Down
1 change: 0 additions & 1 deletion main/java/com/hankcs/hanlp/corpus/util/Precompiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.CharUtility;

/**
* 预编译与反编译一些词语
Expand Down
7 changes: 3 additions & 4 deletions main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -130,12 +129,12 @@ private static boolean loadDat(String path)
fis.close();

int index = 0;
int size = CharUtility.bytesHighFirstToInt(bytes, index);
int size = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
Integer[] value = new Integer[size];
for (int i = 0; i < size; i++)
{
value[i] = CharUtility.bytesHighFirstToInt(bytes, index);
value[i] = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
}
logger.info("加载值" + path + ".value.dat成功,耗时" + (System.currentTimeMillis() - start) + "ms");
Expand Down
13 changes: 6 additions & 7 deletions main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.util.*;
Expand Down Expand Up @@ -136,24 +135,24 @@ static boolean loadDat(String path)
byte[] bytes = IOUtil.readBytes(path + ".value.dat");
if (bytes == null) return false;
int index = 0;
int size = CharUtility.bytesHighFirstToInt(bytes, index);
int size = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
Attribute[] attributes = new Attribute[size];
final Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i)
{
// 第一个是全部频次,第二个是词性个数
int currentTotalFrequency = CharUtility.bytesHighFirstToInt(bytes, index);
int currentTotalFrequency = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
int length = CharUtility.bytesHighFirstToInt(bytes, index);
int length = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
attributes[i] = new Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j)
{
attributes[i].nature[j] = natureIndexArray[CharUtility.bytesHighFirstToInt(bytes, index)];
attributes[i].nature[j] = natureIndexArray[TextUtility.bytesHighFirstToInt(bytes, index)];
index += 4;
attributes[i].frequency[j] = CharUtility.bytesHighFirstToInt(bytes, index);
attributes[i].frequency[j] = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
import com.hankcs.hanlp.algoritm.EditDistance;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.FileInputStream;
import java.util.ArrayList;
Expand Down Expand Up @@ -78,7 +77,7 @@ public static List<Long[]> convert(List<Term> sentence, boolean withUndefinedIte
{
case 'm':
{
if (!CharUtility.isAllChinese(term.word)) continue;
if (!TextUtility.isAllChinese(term.word)) continue;
}break;
case 'w':
{
Expand Down
13 changes: 6 additions & 7 deletions main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.util.*;
Expand Down Expand Up @@ -121,24 +120,24 @@ static boolean loadDat(String path)
byte[] bytes = IOUtil.readBytes(path + ".value.dat");
if (bytes == null) return false;
int index = 0;
int size = CharUtility.bytesHighFirstToInt(bytes, index);
int size = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size];
final Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i)
{
// 第一个是全部频次,第二个是词性个数
int currentTotalFrequency = CharUtility.bytesHighFirstToInt(bytes, index);
int currentTotalFrequency = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
int length = CharUtility.bytesHighFirstToInt(bytes, index);
int length = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
attributes[i] = new CoreDictionary.Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j)
{
attributes[i].nature[j] = natureIndexArray[CharUtility.bytesHighFirstToInt(bytes, index)];
attributes[i].nature[j] = natureIndexArray[TextUtility.bytesHighFirstToInt(bytes, index)];
index += 4;
attributes[i].frequency[j] = CharUtility.bytesHighFirstToInt(bytes, index);
attributes[i].frequency[j] = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
}
}
Expand Down
12 changes: 5 additions & 7 deletions main/java/com/hankcs/hanlp/dictionary/nr/NRDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,9 @@
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.NR;
import com.hankcs.hanlp.dictionary.CommonDictionary;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.CharUtility;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.util.AbstractMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -79,19 +77,19 @@ private EnumItem<NR>[] loadDat(String path)
if (bytes == null) return null;
NR[] nrArray = NR.values();
int index = 0;
int size = CharUtility.bytesHighFirstToInt(bytes, index);
int size = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
EnumItem<NR>[] valueArray = new EnumItem[size];
for (int i = 0; i < size; ++i)
{
int currentSize = CharUtility.bytesHighFirstToInt(bytes, index);
int currentSize = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
EnumItem<NR> item = new EnumItem<>();
for (int j = 0; j < currentSize; ++j)
{
NR nr = nrArray[CharUtility.bytesHighFirstToInt(bytes, index)];
NR nr = nrArray[TextUtility.bytesHighFirstToInt(bytes, index)];
index += 4;
int frequency = CharUtility.bytesHighFirstToInt(bytes, index);
int frequency = TextUtility.bytesHighFirstToInt(bytes, index);
index += 4;
item.labelMap.put(nr, frequency);
}
Expand Down
Loading

0 comments on commit dccb5f3

Please sign in to comment.