Skip to content

Commit

Permalink
新增一个有趣的“同义改写”功能:DemoRewriteText
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Nov 13, 2015
1 parent 1f29a35 commit 2a8388b
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 25 deletions.
3 changes: 2 additions & 1 deletion src/main/java/com/hankcs/hanlp/algoritm/VectorDistance.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
*/
package com.hankcs.hanlp.algoritm;

import com.hankcs.hanlp.corpus.synonym.Synonym;
import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;

import java.util.List;
Expand Down Expand Up @@ -50,7 +51,7 @@ public static double compute(List<CommonSynonymDictionary.SynonymItem> synonymIt
for (CommonSynonymDictionary.SynonymItem itemB : synonymItemListB)
{
long distance;
if (itemA.type != CommonSynonymDictionary.SynonymItem.Type.UNDEFINED && itemB.type != CommonSynonymDictionary.SynonymItem.Type.UNDEFINED)
if (itemA.type != Synonym.Type.UNDEFINED && itemB.type != Synonym.Type.UNDEFINED)
{
distance = Math.abs(itemA.entry.id - itemB.entry.id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,26 @@ public int transition(String path, int from)
return p;
}

/**
 * Performs a single-character transition.
 *
 * @param c    the character to consume
 * @param from the value to transition from; it is used as the offset of the lookup
 * @return {@code base[from + c + 1]} when {@code check[from + c + 1]} equals {@code from},
 *         otherwise -1
 */
public int transition(char c, int from)
{
    final int slot = from + (int) (c) + 1;
    return (from == check[slot]) ? base[slot] : -1;
}

/**
* 检查状态是否对应输出
*
Expand Down Expand Up @@ -1099,7 +1119,8 @@ public class Searcher

/**
* 构造一个双数组搜索工具
* @param offset 搜索的起始位置
*
* @param offset 搜索的起始位置
* @param charArray 搜索的目标字符数组
*/
public Searcher(int offset, char[] charArray)
Expand All @@ -1117,6 +1138,7 @@ public Searcher(int offset, char[] charArray)

/**
* 取出下一个命中输出
*
* @return 是否命中,当返回false表示搜索结束,否则使用公开的成员读取命中的详细信息
*/
public boolean next()
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/hankcs/hanlp/corpus/synonym/Synonym.java
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ public long distance(Synonym other)
return Math.abs(id - other.id);
}

public static enum Type
public enum Type
{
/**
* 完全同义词,对应词典中的=号
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,21 @@ public static CommonSynonymDictionary.SynonymItem get(String key)
return dictionary.get(key);
}

/**
 * Rewrites the text directly, without segmenting it into words first.
 * Delegates to the shared {@code dictionary} instance.
 *
 * @param text the text to rewrite
 * @return the rewritten text produced by {@code dictionary.rewriteQuickly(text)}
 */
public static String rewriteQuickly(String text)
{
    final String rewritten = dictionary.rewriteQuickly(text);
    return rewritten;
}

/**
 * Rewrites the text by delegating to the shared {@code dictionary} instance.
 *
 * @param text the text to rewrite
 * @return the rewritten text produced by {@code dictionary.rewrite(text)}
 */
public static String rewrite(String text)
{
    final String rewritten = dictionary.rewrite(text);
    return rewritten;
}

/**
* 语义距离
* @param itemA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
package com.hankcs.hanlp.dictionary.common;

import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.dependency.CoNll.PosTagCompiler;
import com.hankcs.hanlp.corpus.synonym.Synonym;
import com.hankcs.hanlp.corpus.synonym.SynonymHelper;

Expand All @@ -20,7 +21,14 @@
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.TreeMap;
import com.hankcs.hanlp.corpus.synonym.Synonym.Type;
import com.hankcs.hanlp.corpus.util.Precompiler;
import com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import com.hankcs.hanlp.utility.Predefine;

import static com.hankcs.hanlp.utility.Predefine.logger;

Expand Down Expand Up @@ -126,6 +134,82 @@ public long distance(String a, String b)
return itemA.distance(itemB);
}

/**
 * Rewrites the text without word segmentation by scanning it against the trie.
 * <p>
 * From every position the trie is walked character by character. Outputs are only
 * inspected from the second character onwards, so a match is always at least two
 * characters long, and the last output seen (the longest match) wins. A matched span is
 * replaced by a random {@code Type.EQUAL} synonym chosen with the previously emitted word
 * as context; if no synonym is available the span is copied unchanged. Positions without
 * a match are copied one character at a time.
 *
 * @param text the text to rewrite, must not be null
 * @return the rewritten text
 */
public String rewriteQuickly(String text)
{
    assert text != null;
    StringBuilder rewritten = new StringBuilder((int) (text.length() * 1.2));
    String previous = Predefine.SENTENCE_BEGIN;
    int cursor = 0;
    while (cursor < text.length())
    {
        SynonymItem longest = null;
        int matchEnd = -1;

        // Every scan starts from state 1.
        int state = trie.transition(text.charAt(cursor), 1);
        if (state > 0)
        {
            for (int probe = cursor + 1; probe < text.length(); ++probe)
            {
                state = trie.transition(text.charAt(probe), state);
                if (state < 0) break;
                SynonymItem hit = trie.output(state);
                if (hit != null)
                {
                    // Keep extending: a later hit is a longer match.
                    longest = hit;
                    matchEnd = probe + 1;
                }
            }
        }

        if (longest == null)
        {
            // No dictionary word starts here: copy the single character through.
            char current = text.charAt(cursor);
            previous = String.valueOf(current);
            rewritten.append(current);
            ++cursor;
            continue;
        }

        Synonym replacement = longest.randomSynonym(Type.EQUAL, previous);
        previous = (replacement != null) ? replacement.realWord : text.substring(cursor, matchEnd);
        rewritten.append(previous);
        // Resume scanning right after the matched span.
        cursor = matchEnd;
    }

    return rewritten.toString();
}

/**
 * Rewrites the text word by word after segmenting it with {@code StandardTokenizer}.
 * <p>
 * Each term that has a dictionary entry and an eligible {@code Type.EQUAL} synonym is
 * replaced by that synonym; every other term is copied unchanged. The context passed to
 * the synonym selection is the compiled form (nature plus word) of the preceding
 * original term, starting with {@code Predefine.SENTENCE_BEGIN}.
 *
 * @param text the text to rewrite
 * @return the rewritten text
 */
public String rewrite(String text)
{
    List<Term> terms = StandardTokenizer.segment(text.toCharArray());
    StringBuilder rewritten = new StringBuilder((int) (text.length() * 1.2));
    String previous = Predefine.SENTENCE_BEGIN;
    for (Term term : terms)
    {
        SynonymItem item = get(term.word);
        Synonym replacement = (item == null) ? null : item.randomSynonym(Type.EQUAL, previous);
        if (replacement == null)
        {
            rewritten.append(term.word);
        }
        else
        {
            rewritten.append(replacement.realWord);
        }
        // The context for the next term is built from the original term, not the replacement.
        previous = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return rewritten.toString();
}

/**
* 词典中的一个条目
*/
Expand All @@ -140,27 +224,6 @@ public static class SynonymItem
*/
public List<Synonym> synonymList;

/**
 * Kind of relation a dictionary entry represents.
 */
public static enum Type
{
/**
 * Exact synonym; corresponds to the '=' marker in the dictionary
 */
EQUAL,
/**
 * Same-category word; corresponds to '#'
 */
LIKE,
/**
 * Closed word: has neither synonyms nor same-category words
 */
SINGLE,

/**
 * Undefined; usually a word that is not in the dictionary
 */
UNDEFINED,
}

/**
* 这个条目的类型,同义词或同类词或封闭词
*/
Expand Down Expand Up @@ -191,6 +254,29 @@ public SynonymItem(Synonym entry, List<Synonym> synonymList, char type)
}
}

/**
 * Picks one synonym of this entry at random.
 * <p>
 * When {@code type} is non-null, only synonyms of that type are candidates; if
 * {@code preWord} is also non-null, candidates whose bigram frequency after
 * {@code preWord} is zero are dropped as well. When {@code type} is null no filtering
 * is applied at all, including the {@code preWord} filter.
 *
 * @param type    required synonym type, or null to accept every synonym unfiltered
 * @param preWord preceding word used for the bigram-frequency filter, or null to skip it
 * @return a randomly chosen candidate, or null when there is no candidate
 */
public Synonym randomSynonym(Type type, String preWord)
{
    // An item may carry no synonym list at all (createUndefined appears to build
    // items that way); report "no candidate" instead of throwing.
    if (synonymList == null) return null;

    List<Synonym> candidates;
    if (type == null)
    {
        candidates = synonymList;
    }
    else
    {
        // Collect the survivors directly rather than copying and removing.
        candidates = new ArrayList<Synonym>(synonymList.size());
        for (Synonym synonym : synonymList)
        {
            if (synonym.type != type) continue;
            if (preWord != null && CoreBiGramTableDictionary.getBiFrequency(preWord, synonym.realWord) == 0) continue;
            candidates.add(synonym);
        }
    }
    if (candidates.isEmpty()) return null;

    // Math.random() is in [0, 1), so the index is always in range. The clock-based
    // pick used before returned the same index for every call within one millisecond.
    return candidates.get((int) (Math.random() * candidates.size()));
}

/**
 * Picks a random synonym with neither a type restriction nor a preceding-word context.
 *
 * @return whatever {@code randomSynonym(null, null)} returns
 */
public Synonym randomSynonym()
{
    final Type anyType = null;
    final String noContext = null;
    return randomSynonym(anyType, noContext);
}

@Override
public String toString()
{
Expand Down Expand Up @@ -225,6 +311,5 @@ public static SynonymItem createUndefined(String word)
SynonymItem item = new SynonymItem(new Synonym(word, word.hashCode() * 1000000 + Long.MAX_VALUE / 3), null, Type.UNDEFINED);
return item;
}

}
}
25 changes: 25 additions & 0 deletions src/test/java/com/hankcs/demo/DemoRewriteText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* <summary></summary>
* <author>hankcs</author>
* <email>[email protected]</email>
* <create-date>2015/11/13 18:36</create-date>
*
* <copyright file="DemoRewriteText.java">
* Copyright (c) 2003-2015, hankcs. All Rights Reserved, http://www.hankcs.com/
* </copyright>
*/
package com.hankcs.demo;

import com.hankcs.hanlp.dictionary.CoreSynonymDictionary;

/**
 * Demo: rewrites a sample sentence with {@code CoreSynonymDictionary.rewrite} and prints
 * the result to standard output.
 *
 * @author hankcs
 */
public class DemoRewriteText
{
    public static void main(String[] args)
    {
        final String original = "这个方法可以利用同义词词典将一段文本改写成意思相似的另一段文本,而且差不多符合语法";
        final String rewritten = CoreSynonymDictionary.rewrite(original);
        System.out.println(rewritten);
    }
}

0 comments on commit 2a8388b

Please sign in to comment.