Skip to content

Commit

Permalink
短语提取、语义距离、文本推荐示例
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Dec 9, 2014
1 parent fa18a33 commit 9dec9b5
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 0 deletions.
50 changes: 50 additions & 0 deletions test/java/com/hankcs/demo/DemoPhraseExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>[email protected]</email>
* <create-date>2014/12/9 13:55</create-date>
*
* <copyright file="DemoPhraseExtractor.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Rights Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.demo;

import com.hankcs.hanlp.phrase.IPhraseExtractor;
import com.hankcs.hanlp.phrase.MutualInformationEntropyPhraseExtractor;

import java.util.List;

/**
 * Phrase extraction demo: runs a phrase extractor over a sample Chinese
 * article and prints the resulting phrase list to standard output.
 *
 * @author hankcs
 */
public class DemoPhraseExtractor
{
    /** Second argument handed to {@code extractPhrase}; presumably the number of phrases wanted (confirm against IPhraseExtractor). */
    private static final int PHRASE_COUNT = 10;

    /** Sample document (a job description for "algorithm engineer") fed to the extractor. */
    private static final String SAMPLE_TEXT =
        "算法工程师\n" +
        "算法(Algorithm)是一系列解决问题的清晰指令,也就是说,能够对一定规范的输入,在有限时间内获得所要求的输出。如果一个算法有缺陷,或不适合于某个问题,执行这个算法将不会解决这个问题。不同的算法可能用不同的时间、空间或效率来完成同样的任务。一个算法的优劣可以用空间复杂度与时间复杂度来衡量。算法工程师就是利用算法处理事物的人。\n" +
        "\n" +
        "1职位简介\n" +
        "算法工程师是一个非常高端的职位;\n" +
        "专业要求:计算机、电子、通信、数学等相关专业;\n" +
        "学历要求:本科及其以上的学历,大多数是硕士学历及其以上;\n" +
        "语言要求:英语要求是熟练,基本上能阅读国外专业书刊;\n" +
        "必须掌握计算机相关知识,熟练使用仿真工具MATLAB等,必须会一门编程语言。\n" +
        "\n" +
        "2研究方向\n" +
        "视频算法工程师、图像处理算法工程师、音频算法工程师 通信基带算法工程师\n" +
        "\n" +
        "3目前国内外状况\n" +
        "目前国内从事算法研究的工程师不少,但是高级算法工程师却很少,是一个非常紧缺的专业工程师。算法工程师根据研究领域来分主要有音频/视频算法处理、图像技术方面的二维信息算法处理和通信物理层、雷达信号处理、生物医学信号处理等领域的一维信息算法处理。\n" +
        "在计算机音视频和图形图像技术等二维信息算法处理方面目前比较先进的视频处理算法:机器视觉成为此类算法研究的核心;另外还有2D转3D算法(2D-to-3D conversion),去隔行算法(de-interlacing),运动估计运动补偿算法(Motion estimation/Motion Compensation),去噪算法(Noise Reduction),缩放算法(scaling),锐化处理算法(Sharpness),超分辨率算法(Super Resolution),手势识别(gesture recognition),人脸识别(face recognition)。\n" +
        "在通信物理层等一维信息领域目前常用的算法:无线领域的RRM、RTT,传送领域的调制解调、信道均衡、信号检测、网络优化、信号分解等。\n" +
        "另外数据挖掘、互联网搜索算法也成为当今的热门方向。\n" +
        "算法工程师逐渐往人工智能方向发展。";

    /**
     * Extracts phrases from the sample text and prints them.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        IPhraseExtractor phraseExtractor = new MutualInformationEntropyPhraseExtractor();
        List<String> phrases = phraseExtractor.extractPhrase(SAMPLE_TEXT, PHRASE_COUNT);
        System.out.println(phrases);
    }
}
42 changes: 42 additions & 0 deletions test/java/com/hankcs/demo/DemoSuggester.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>[email protected]</email>
* <create-date>2014/12/9 13:27</create-date>
*
* <copyright file="DemoSuggester.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Rights Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.demo;

import com.hankcs.hanlp.suggest.Suggester;

/**
 * Text suggestion demo (sentence level: from a set of sentences, pick the one
 * most similar to the input sentence).
 *
 * @author hankcs
 */
public class DemoSuggester
{
    /** News headlines registered with the suggester as candidate sentences. */
    private static final String[] TITLES =
    {
        "威廉王子发表演说 呼吁保护野生动物",
        "《时代》年度人物最终入围名单出炉 普京马云入选",
        "“黑格比”横扫菲:菲吸取“海燕”经验及早疏散",
        "日本保密法将正式生效 日媒指其损害国民知情权",
        "英报告说空气污染带来“公共健康危机”"
    };

    /** Queries issued in order: a semantic match, a character match, and a pinyin match. */
    private static final String[] QUERIES =
    {
        "发言",     // semantic
        "公共危机", // characters
        "mayun"     // pinyin
    };

    /**
     * Registers every headline with a fresh suggester, then prints the
     * suggestion returned for each query.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        Suggester suggester = new Suggester();
        for (int i = 0; i < TITLES.length; i++)
        {
            suggester.addSentence(TITLES[i]);
        }

        for (String query : QUERIES)
        {
            // Second argument is presumably the number of suggestions requested (confirm against Suggester).
            System.out.println(suggester.suggest(query, 1));
        }
    }
}
34 changes: 34 additions & 0 deletions test/java/com/hankcs/demo/DemoWordDistance.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>[email protected]</email>
* <create-date>2014/12/9 13:49</create-date>
*
* <copyright file="DemoWordDistance.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.demo;

import com.hankcs.hanlp.dictionary.CoreSynonymDictionary;
import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;

/**
 * Semantic distance demo: looks up three words in the core synonym dictionary
 * and prints the distance from the first word to each of the other two.
 *
 * @author hankcs
 */
public class DemoWordDistance
{
    /**
     * Looks up "苹果", "香蕉" and "自行车", then prints the apple-banana and
     * apple-bike distances.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        String fruitA = "苹果";
        String fruitB = "香蕉";
        String vehicle = "自行车";

        // All three lookups are performed up front, before anything is printed.
        CommonSynonymDictionary.SynonymItem itemFruitA = CoreSynonymDictionary.get(fruitA);
        CommonSynonymDictionary.SynonymItem itemFruitB = CoreSynonymDictionary.get(fruitB);
        CommonSynonymDictionary.SynonymItem itemVehicle = CoreSynonymDictionary.get(vehicle);

        System.out.println(describeDistance(fruitA, fruitB, itemFruitA, itemFruitB));
        System.out.println(describeDistance(fruitA, vehicle, itemFruitA, itemVehicle));
    }

    /** Builds the "word1 word2 之间的距离是 distance" line for one pair of words. */
    private static String describeDistance(String wordFrom, String wordTo,
                                           CommonSynonymDictionary.SynonymItem itemFrom,
                                           CommonSynonymDictionary.SynonymItem itemTo)
    {
        return wordFrom + " " + wordTo + " 之间的距离是 " + itemFrom.distance(itemTo);
    }
}

0 comments on commit 9dec9b5

Please sign in to comment.