Skip to content

Commit

Permalink
CharType的二进制由程序自动生成
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Jan 10, 2017
1 parent d86c545 commit 8d17cf5
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 72 deletions.
Binary file removed data/dictionary/other/CharType.dat.yes
Binary file not shown.
2 changes: 1 addition & 1 deletion src/main/java/com/hankcs/hanlp/HanLP.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public static final class Config
/**
* 字符类型对应表
*/
public static String CharTypePath = "data/dictionary/other/CharType.dat.yes";
public static String CharTypePath = "data/dictionary/other/CharType.bin";

/**
* 字符正规化表(全角转半角,繁体转简体)
Expand Down
85 changes: 71 additions & 14 deletions src/main/java/com/hankcs/hanlp/dictionary/other/CharType.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,22 @@
* </copyright>
*/
package com.hankcs.hanlp.dictionary.other;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
* 字符类型
*
* @author hankcs
*/
public class CharType
Expand Down Expand Up @@ -55,7 +64,7 @@ public class CharType
* 其他
*/
public static final byte CT_OTHER = CT_SINGLE + 12;

public static byte[] type;

static
Expand All @@ -66,32 +75,80 @@ public class CharType
ByteArray byteArray = ByteArray.createByteArray(HanLP.Config.CharTypePath);
if (byteArray == null)
{
logger.severe("字符类型对应表加载失败:" + HanLP.Config.CharTypePath);
System.exit(-1);
try
{
byteArray = generate();
}
catch (IOException e)
{
e.printStackTrace();
logger.severe("字符类型对应表 " + HanLP.Config.CharTypePath + " 加载失败: " + TextUtility.exceptionToString(e));
System.exit(-1);
}
}
while (byteArray.hasMore())
{
int b = byteArray.nextChar();
int e = byteArray.nextChar();
byte t = byteArray.nextByte();
for (int i = b; i <= e; ++i)
{
type[i] = t;
}
}
else
logger.info("字符类型对应表加载成功,耗时" + (System.currentTimeMillis() - start) + " ms");
}

/**
 * Builds the character type table, writes it to {@code HanLP.Config.CharTypePath}
 * and loads the freshly written file.
 *
 * Every char value from 0 to {@code Character.MAX_VALUE} is classified with
 * {@code TextUtility.charType}; consecutive chars sharing a type are collapsed into
 * one {@code [firstChar, lastChar, type]} record. Each record is written as
 * char, char, byte.
 *
 * NOTE(review): several lines inside this method reference names that are not in
 * scope here ({@code byteArray} before its declaration, {@code start}) or redeclare
 * {@code i}; they look like removed lines left over from a diff view -- confirm
 * against the real file.
 *
 * @return the result of {@code ByteArray.createByteArray} on the written file
 *         (NOTE(review): the static initializer null-checks that call, so this may be null)
 * @throws IOException if the output file cannot be opened or written
 */
private static ByteArray generate() throws IOException
{
// NOTE(review): assumes charType((char) 0) == 5; otherwise the first record emitted
// would be the empty range [0, -1] -- TODO confirm.
int preType = 5;
// First char of the run currently being accumulated.
int preChar = 0;
List<int[]> typeList = new LinkedList<int[]>();
for (int i = 0; i <= Character.MAX_VALUE; ++i)
{
// NOTE(review): next line appears to be diff residue (byteArray is not declared yet).
while (byteArray.hasMore())
int type = TextUtility.charType((char) i);
// System.out.printf("%d %d\n", i, TextUtility.charType((char) i));
// A type change closes the current run at i - 1 and starts a new run at i.
if (type != preType)
{
// NOTE(review): the next seven lines appear to be diff residue (old table-loading loop).
int b = byteArray.nextChar();
int e = byteArray.nextChar();
byte t = byteArray.nextByte();
for (int i = b; i <= e; ++i)
{
type[i] = t;
}
int[] array = new int[3];
array[0] = preChar;
array[1] = i - 1;
array[2] = preType;
typeList.add(array);
// System.out.printf("%d %d %d\n", array[0], array[1], array[2]);
preChar = i;
}
// NOTE(review): next line appears to be diff residue (start is not declared in this method).
logger.info("字符类型对应表加载成功,耗时" + (System.currentTimeMillis() - start) + " ms");
preType = type;
}
// Flush the final run, which always ends at Character.MAX_VALUE.
{
int[] array = new int[3];
array[0] = preChar;
array[1] = (int) Character.MAX_VALUE;
array[2] = preType;
typeList.add(array);
}
// System.out.print("int[" + typeList.size() + "][3] array = \n");
// NOTE(review): out is not closed if one of the writes below throws; consider try/finally.
DataOutputStream out = new DataOutputStream(new FileOutputStream(HanLP.Config.CharTypePath));
for (int[] array : typeList)
{
// System.out.printf("%d %d %d\n", array[0], array[1], array[2]);
out.writeChar(array[0]);
out.writeChar(array[1]);
out.writeByte(array[2]);
}
out.close();
// Re-read the file just written so the caller gets it in the same form as a normal load.
ByteArray byteArray = ByteArray.createByteArray(HanLP.Config.CharTypePath);
return byteArray;
}

/**
* Gets the type of a character.
*
* @param c the character to look up
* @return the byte stored in the {@code type} table at index {@code c}
*/
public static byte get(char c)
{
// NOTE(review): the two return statements differ only in whitespace and look like the
// before/after lines of a diff view -- only one should exist in the real file.
return type[(int)c];
return type[(int) c];
}
}
57 changes: 0 additions & 57 deletions src/test/java/com/hankcs/test/seg/TestCharType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,63 +28,6 @@
*/
public class TestCharType extends TestCase
{
/**
 * Builds the character type table, writes it to {@code HanLP.Config.CharTypePath},
 * then reads the file back and checks it against the in-memory records.
 *
 * Every char value from 0 to {@code Character.MAX_VALUE} is classified with
 * {@code TextUtility.charType}; consecutive chars sharing a type are collapsed into
 * one {@code [firstChar, lastChar, type]} record, written as char, char, byte.
 *
 * @throws Exception if the file cannot be written; an {@link AssertionError} is thrown
 *                   when the file cannot be reloaded or its content differs from the
 *                   generated records
 */
public void testMakeCharType() throws Exception
{
    int preType = 5;
    int preChar = 0;
    List<int[]> typeList = new LinkedList<int[]>();
    for (int i = 0; i <= Character.MAX_VALUE; ++i)
    {
        int type = TextUtility.charType((char) i);
        // A type change closes the current run at i - 1 and starts a new run at i.
        if (type != preType)
        {
            int[] array = new int[3];
            array[0] = preChar;
            array[1] = i - 1;
            array[2] = preType;
            typeList.add(array);
            preChar = i;
        }
        preType = type;
    }
    // Flush the final run, which always ends at Character.MAX_VALUE.
    {
        int[] array = new int[3];
        array[0] = preChar;
        array[1] = (int) Character.MAX_VALUE;
        array[2] = preType;
        typeList.add(array);
    }
    System.out.print("int[" + typeList.size() + "][3] array = \n");
    DataOutputStream out = new DataOutputStream(new FileOutputStream(HanLP.Config.CharTypePath));
    try
    {
        for (int[] array : typeList)
        {
            System.out.printf("%d %d %d\n", array[0], array[1], array[2]);
            out.writeChar(array[0]);
            out.writeChar(array[1]);
            out.writeByte(array[2]);
        }
    }
    finally
    {
        // Release the file handle even when a write fails.
        out.close();
    }

    ByteArray byteArray = ByteArray.createByteArray(HanLP.Config.CharTypePath);
    if (byteArray == null)
    {
        throw new AssertionError("Failed to reload " + HanLP.Config.CharTypePath);
    }
    Iterator<int[]> iterator = typeList.iterator();
    int mismatches = 0;
    while (byteArray.hasMore())
    {
        if (!iterator.hasNext())
        {
            throw new AssertionError("File contains more records than were generated");
        }
        int b = byteArray.nextChar();
        int e = byteArray.nextChar();
        byte t = byteArray.nextByte();
        int[] array = iterator.next();
        if (b != array[0] || e != array[1] || t != array[2])
        {
            System.out.printf("%d %d %d\n", b, e, t);
            ++mismatches;
        }
    }
    // Printing alone never fails the test; turn any discrepancy into a failure.
    if (iterator.hasNext())
    {
        throw new AssertionError("File contains fewer records than were generated");
    }
    if (mismatches > 0)
    {
        throw new AssertionError(mismatches + " record(s) differ between file and generated table");
    }
}

/**
* 测试字符类型表
* @throws Exception
Expand Down

0 comments on commit 8d17cf5

Please sign in to comment.