diff --git a/.gitignore b/.gitignore index 2a95a24e0..c41f82d27 100644 --- a/.gitignore +++ b/.gitignore @@ -24,21 +24,28 @@ config.h include/ lib/ bin/ -lgdpj -lgsrl -otcws -otpos -otner -maxent -cws -cws_cmdline -multi_cws_cmdline -pos -par -ner +tools/train/lgdpj +tools/train/lgsrl +tools/train/otcws +tools/train/otpos +tools/train/otner +tools/train/maxent +examples/cws +examples/cws_cmdline +examples/multi_cws_cmdline +examples/pos +examples/pos_cmdline +examples/multi_pos_cmdline +examples/par +examples/ner ############### # data file # ############### new_ltp_data/ ltp_data/ + +################## +# running folder # +################## +dummy/ diff --git a/.travis.yml b/.travis.yml index 655d0f854..4c29d62cf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,19 @@ compiler: before_install: - sudo apt-get install gfortran - sudo apt-get install cmake +env: + global: + - secure: "VdY9DQK8PdZ5cBpn9qG+8KqyC7BFuYPNwU4f5n19nto62V6ifU5XOLBkxCF36bSF8C4Nf0y0uDdj4gqMnL7OqgwFjucBRQLOuk/10uuy3azEjGzTxePgXlYw15XmMEpWy3hvxEfFqvonJ0g9+fZjKeEmbASVixIbWpYN/pIy2jQ=" + +addons: + coverity_scan: + project: + name: "HIT-SCIR/ltp" + description: "Language Technology Platform" + notification_email: oneplus.lau@gmail.com + build_command_prepend: "./configure" + build_command: "make -j4" + branch_pattern: master script: - ./configure diff --git a/ChangeLog.md b/ChangeLog.md index e657e7628..b622c124b 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,3 +1,13 @@ +2014-01-20 +---------- +* 在分词、词性标注和依存句法分析模块中加入模型裁剪功能,减少了模型大小。用户可以通过配置文件里的rare-feature-threshold参数配置裁剪力度,如果rare-feature-threshold为0,则只去掉为0的特征;rare-feature-threshold大于0时将一步去掉更新次数低于阈值的特征。这一优化方法主要参考[Learning Sparser Perceptron Models](http://www.cs.bgu.ac.il/~yoavg/publications/acl2011sparse.pdf)。 +* 增加了`ltp_server`在异常输入情况下返回错误代码,如果输入数据编码错误或者输入xml不符合规则,将返回400 +* 修复了词性标注、命名实体识别、依存句法分析训练套件中的内存泄露问题 +* 修复了语义角色标注的内存泄露问题 +* 修复了词性标注、命名实体识别模型文件的错误标示符,这项修改将导致3.1.0以及之后的版本不能与3.0.x的模型兼容,请务必注意 +* 
修复了由boost.multi_array.views引起的MSVC下不能以Debug方式编译的问题 +* 修复了由打开文件时字符串为空引起的Windows下不能正常运行的bug + 2013-09-29 ---------- * 解决windows编译问题 @@ -199,11 +209,9 @@ float CParser::Smoothen(float ftd, float ftt, float ftttt, float ftw, float fwt, * 胡禹轩修改了srl中overlapped的bug * 上午修改IRLAS_DLL_x.cpp的时候使用到了MyLib.cpp。但是ltp中有很多MyLib.cpp,如: -``` -_irlas/MyLib.cpp -__util/MyLib.cpp -_parser/MyLib.cpp -``` + * _irlas/MyLib.cpp + * __util/MyLib.cpp + * _parser/MyLib.cpp 等,现在还没有统一。 * 我简单的将_irlas/MyLib.cpp替换__util/MyLib.cpp,导致出现了新的bug。因为_irlas/MyLib.cpp和__util/MyLib.cpp中convert_to_pair的实现不相同。完成功能是:`、/wp => [、][wp]`,_irlas/MyLib.cpp中的实现是错误的。 @@ -268,7 +276,9 @@ string itos(int i); ``` char* presult = new char[5000]; ``` + 当句子过长的时候,会出现内存越界。修改为: + ``` int nChar = 0; for (int i=0; i<(int)vecWord.size(); ++i) { @@ -292,17 +302,17 @@ string itos(int i); 2007-11-22 ---------- -SDS中:`void SDS_TS::SelectSnt()` -定义: +SDS中:`void SDS_TS::SelectSnt()` 定义: + ``` unsigned sntNum; ``` 但是后面用到: ``` - sntNum = m_vctSntPairs_Score[summarySntNum].m_nSntNum - 1; - if(sntNum >= 0) { - ... - } + sntNum = m_vctSntPairs_Score[summarySntNum].m_nSntNum - 1; + if(sntNum >= 0) { + ... 
+ } ``` 此时当`m_vctSntPairs_Score[summarySntNum].m_nSntNum == 0`时:`sntNum = 0xFFFF;` @@ -313,8 +323,7 @@ SDS中:`void SDS_TS::SelectSnt()` 2007-11-22 ---------- -* Parser中:parser_dll_x.cpp中`void Parse(vector < string >& vecWord, vector < string >& vecPOS, vector < pair >& vecParse)`中 -原来为: +* parser_dll_x.cpp中`void Parse(vector < string >& vecWord, vector < string >& vecPOS, vector < pair >& vecParse)`中原来为: ``` char * csOutput = new char[vecWord.size() * 50]; ``` @@ -326,9 +335,8 @@ SDS中:`void SDS_TS::SelectSnt()` } char * csOutput = new char[nChar * 2 + vecWord.size() * 32]; ``` -因为有的时候会输入: -```"------------------------------------"``` -或者很长的数字串,这样会造成内存越界问题。 + +因为有的时候会输入`"------------------------------------"`或者很长的数字串,这样会造成内存越界问题。 2007-11-21 ---------- diff --git a/README.md b/README.md index 786aa23d5..852b93bd4 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ --- * [百度云](http://pan.baidu.com/share/link?shareid=1988562907&uk=2738088569) -* 当前模型版本3.0.3 +* 当前模型版本3.1.0 开源协议 ------- diff --git a/doc/ltp-document-3.0.md b/doc/ltp-document-3.0.md index 7f9e1c9b7..1691f5df4 100644 --- a/doc/ltp-document-3.0.md +++ b/doc/ltp-document-3.0.md @@ -3,14 +3,23 @@ LTP使用文档v3.0 #### 作者 -* 刘一佳 << yjliu@ir.hit.edu.cn>> 2013年7月17日创建文档 +* 牛国成 << gcniu@ir.hit.edu.cn >> 2014年5月10日,增加词性词典相关文档 +* 刘一佳 << yjliu@ir.hit.edu.cn >> 2013年7月17日,创建文档 版权所有:哈尔滨工业大学社会计算与信息检索研究中心 ## 目录 +* [简介](#简介) +* [开始使用LTP](#开始使用LTP) +* [使用ltp_test以及模型](#使用ltp_test以及模型) +* [编程接口](#编程接口) +* [使用ltp_server](#使用ltp_server) +* [实现原理与性能](#实现原理与性能) +* [使用训练套件](#使用训练套件) +* [发表论文](#发表论文) +* [附录](#附录) # 简介 - 语言技术平台(Language Technology Platform,LTP)是哈工大社会计算与信息检索研究中心历时十年开发的一整套中文语言处理系统。LTP制定了基于XML的语言处理结果表示,并在此基础上提供了一整套自底向上的丰富而且高效的中文语言处理模块(包括词法、句法、语义等6项中文处理核心技术),以及基于动态链接库(Dynamic Link Library, DLL)的应用程序接口,可视化工具,并且能够以网络服务(Web Service)的形式进行使用。 从2006年9月5日开始该平台对外免费共享目标代码,截止目前,已经有国内外400多家研究单位共享了LTP,也有国内外多家商业公司购买了LTP,用于实际的商业项目中。2010年12月获得中国中文信息学会颁发的行业最高奖项:“钱伟长中文信息处理科学技术奖”一等奖。 @@ -18,7 +27,6 @@ LTP使用文档v3.0 
2011年6月1日,为了与业界同行共同研究和开发中文信息处理核心技术,我中心正式将LTP开源。 # 开始使用LTP - 如果你是第一次使用LTP,不妨花一些时间了解LTP能帮你做什么。 LTP提供了一系列中文自然语言处理工具,用户可以使用这些工具对于中文文本进行分词、词性标注、句法分析等等工作。从应用角度来看,LTP为用户提供了下列组件: @@ -31,11 +39,11 @@ LTP提供了一系列中文自然语言处理工具,用户可以使用这些 如果你的公司需要一套高性能的中文语言分析工具以处理海量的文本,或者你的在研究工作建立在一系列底层中文自然语言处理任务之上,或者你想将自己的科研成果与前沿先进工作进行对比,LTP都可能是你的选择。 -# 如何安装LTP +## 如何安装LTP 下面的文档将介绍如何安装LTP -## 获得LTP +### 获得LTP 作为安装的第一步,你需要获得LTP。LTP包括两部分,分别是项目源码和编译好的模型文件。你可以从以下链接获得最新的LTP项目源码。 @@ -43,13 +51,14 @@ LTP提供了一系列中文自然语言处理工具,用户可以使用这些 同时,你可以从以下一些地方获得LTP的模型。 -* +* [百度云](http://pan.baidu.com/share/link?shareid=1988562907&uk=2738088569) +* 当前模型版本3.1.0 -## 安装CMake +### 安装CMake LTP使用编译工具CMake构建项目。在安装LTP之前,你需要首先安装CMake。CMake的网站在[这里](http://www.cmake.org)。如果你是Windows用户,请下载CMake的二进制安装包;如果你是Linux,Mac OS或Cygwin的用户,可以通过编译源码的方式安装CMake,当然,你也可以使用Linux的软件源来安装。 -## Windows(MSVC)编译 +### Windows(MSVC)编译 第一步:构建VC Project @@ -63,18 +72,19 @@ LTP使用编译工具CMake构建项目。在安装LTP之前,你需要首先安 第二步:编译 -构建后得到ALL_BUILD、RUN_TESTS、ZERO_CHECK三个VC Project。使用VS打开ALL_BUILD项目,选择Release方式构建项目。 +构建后得到ALL_BUILD、RUN_TESTS、ZERO_CHECK三个VC Project。使用VS打开ALL_BUILD项目,选择Release(*)方式构建项目。 -(注:由于boost::multi_array与VS2010不兼容,并不能使用Debug方式构建项目。) +(注*:boost::multi_array与VS2010不兼容的bug已经在3.1.0中得到修复,3.1.x及以上版本已经可以使用Debug方式构建,但出于效率考虑,仍旧建议使用Release方式构建。) -## Linux,Mac OSX和Cygwin编译 +### Linux,Mac OSX和Cygwin编译 -Linux、Mac OSX和Cygwin的用户,可以直接在项目根目录下使用命令 +Linux、Mac OSX(*)和Cygwin的用户,可以直接在项目根目录下使用命令 ./configure - ./make + make +(注:Mac OSX如果要编译example下的示例程序,请加入-std=c++11 -stdlib=libstdc++ -Wno-error=c++11-narrowing选项) 进行编译。 @@ -87,7 +97,7 @@ Linux、Mac OSX和Cygwin的用户,可以直接在项目根目录下使用命 | ltp_test | LTP调用程序 | | ltp_server* | LTP Server程序 | -在lib文件夹下生成以下一些静态链接库 +在lib文件夹下生成以下一些静态链接库(**) | 程序名 | 说明 | | ------ | ---- | @@ -104,15 +114,14 @@ Linux、Mac OSX和Cygwin的用户,可以直接在项目根目录下使用命 | ------ | ---- | | otcws | 分词的训练和测试套件 | | otpos | 词性标注的训练和测试套件 | +| otner | 命名实体识别的训练和测试套件 | | lgdpj | 依存句法分析训练和测试套件 | -| maxent* | 最大熵工具包,用于训练命名实体识别和语义角色标注模型 | -| SRLExtract* | 语义角色标注训练程序 | -| SRLGetInstance* | | +| lgsrl 
| 语义角色标注训练和测试套件 | -(注*:在window版本中ltp_server、Maxent、SRLExtract、SRLGetInstance并不被编译。) +* (注*:在Windows版本中ltp_server、Maxent、SRLExtract、SRLGetInstance并不被编译。) +* (注**:Windows下产生的静态库的后缀是.lib,Linux下产生的静态库的后缀是.a) # 使用ltp_test以及模型 - 一般来讲,基于统计机器学习方法构建的自然语言处理工具通常包括两部分,即:算法逻辑以及模型。模型从数据中学习而得,通常保存在文件中以持久化;而算法逻辑则与程序对应。 ltp提供一整套算法逻辑以及模型,其中的模型包括: @@ -127,9 +136,9 @@ ltp提供一整套算法逻辑以及模型,其中的模型包括: ltp_test是一个整合ltp中各模块的命令行工具。他完成加载模型,依照指定方法执行分析的功能。ltp_test加载的模型通过配置文件指定。配置文件的样例如下: - segmentor-model = new_ltp_data/cws.model - postagger-model = new_ltp_data/pos.model - parser-model = new_ltp_data/parser.model + segmentor-model = ltp_data/cws.model + postagger-model = ltp_data/pos.model + parser-model = ltp_data/parser.model ner-model = ltp_data/ner.model srl-data = ltp_data/srl_data @@ -147,8 +156,17 @@ ltp_test的使用方法如下: 分析结果以xml格式显示在stdout中。关于xml如何表示分析结果,请参考理解Web Service Client结果一节。 -# 编程接口 +## Windows动态链接库 + +在Windows下首次运行LTP会提示找不到动态链接库,这时请将编译产生的lib/*.dll拷贝到bin/Release/下,即可正常运行。 +## 编码以及显示 + +自3.0.0及以后版本,LTP的所有模型文件均使用UTF8编码训练,故请确保待分析文本的编码为UTF8格式。 + +由于Windows终端采用gbk编码显示,运行ltp_test后会在终端输出乱码。您可以将标准输出重定向到文件,以UTF8方式查看文件,就可以解决乱码的问题。 + +# 编程接口 下面的文档将介绍使用LTP编译产生的静态链接库编写程序的方法。 (注:2.30以后,LTP的所有模型文件均使用UTF8编码训练,故请确保待分析文本的编码为UTF8格式) @@ -256,6 +274,12 @@ ltp_test的使用方法如下: |参数名 | 参数描述 | |-------|----------| |const char * path | 词性标注模型路径 | +|const char * lexicon_file | 指定词性标注外部词典路径。如果lexicon_file为NULL,则不加载外部词典 | + +lexicon_file参数指定的外部词典文件样例如下所示。每行指定一个词,第一列指定单词,第二列之后指定该词的候选词性(可以有多项,每一项占一列),列与列之间用空格区分。 + + 雷人 v a + 】 wp 返回值: @@ -548,24 +572,18 @@ ltp_test的使用方法如下: ## 语义角色标注接口 -# Web Service使用方法 - -除C++的编程接口,LTP还提供针对自然语言处理任务,基于云端的编程接口。用户可以通过使用LTP Web Service Client调用云端服务。 +# 使用ltp_server +## 重要注意 -* 免安装:用户只需要下载LTP Web Service客户端源代码,编译执行后即可获得分析结果,无需调用静态库或下载模型文件。 -* 省硬件:LTP Web Service Client几乎可以运行于任何硬件配置的计算机上,用户不需要购买高性能的机器,即可快捷的获得分析结果。 -* 跨平台跨语言:LTP Web Service客户端几乎可以运行于任何操作系统之上,无论是Windows、Linux各个发行版或者Mac OS。 -* 跨编程语言:时至今日,LTP Web Service Client已经提供了包括C++,Java,C#在内的编程接口,其他语言的编程接口也在开发之中。 +本文档中提到的LTP 
Server与语言云服务不同。语言云建立在LTP Server之上,并封装了一层REST API接口。语言云API(ltp-cloud-api)的请求方式与LTP Server不同。 -在运算资源有限,编程语言受限的情况下,Web Service无疑是用户使用LTP更好的选择。 +## 搭建LTP Server -## 获得Web Service Client +LTP Server在轻量级服务器程序mongoose基础上开发。在编译LTP源码之后,运行ltp_server就可以启动LTP Server。LTP Server启动后,将会监听12345(*)端口的HTTP请求。 -你可以从以下链接获得最新的LTP项目源码。 +(*注:如需指定监听其他端口,请在src/server/ltp_server.cpp中将宏`LISTENING_PORT "12345"`设置为其他整数即可。) -* github项目托管:[https://github.com/HIT-SCIR/ltp-service](https://github.com/HIT-SCIR/ltp-service]) - -## 利用Web Service Client获得分析结果 +## 请求LTP Server ### 原理 @@ -582,56 +600,36 @@ client提交的post请求主要有以下几个字段。 | c | 用以指明输入编码方式 | | t | 用以指明分析目标,t可以为分词(ws),词性标注(pos),命名实体识别(ner),依存句法分析(dp),语义角色标注(srl)或者全部任务(all) | -### Python例子 - -下面这个python版client的例子将介绍如何利用client获得分析结果。 - -#### 示例程序 - - # -*- coding:utf8 -*- - import ltpservice - from account import username, password - - client = ltpservice.LTPService("%s:%s" % (username, password)) - result = client.analysis("我爱北京天安门。天安门上太阳伞。", ltpservice.LTPOption.WS) - - pid = 0 - for sid in xrange(result.count_sentence(pid)): - print "|".join([word.encode("utf8") for word in result.get_words(pid, sid)]) - -首先,第2行import ltpservice这个package,然后第5行实例化一个新的Service Client对象,用户名和密码被保存在这个对象中。然后第6行client发起一个请求,并指明分析目标位分词。请求结果返回并保存在一个LTML对象result中。9到10行解析这个结果。 - -## 理解Web Service Client结果 +### 数据表示 LTP 数据表示标准称为LTML。下图是LTML的一个简单例子: - + - - - + + + - - + + - LTML 标准要求如下:结点标签分别为 xml4nlp, note, doc, para, sent, word, arg 共七种结点标签: 1. xml4nlp 为根结点,无任何属性值; -2. note 为标记结点,具有的属性分别为:sent, word, pos, ne, parser, wsd, srl;分别代表分句,分词,词性标注,命名实体识别,依存句法分析,词义消歧,语义角色标注;值为”n”,表明未做,值为”y”则表示完成,如pos=”y”,表示已经完成了词性标注; +2. note 为标记结点,具有的属性分别为:sent, word, pos, ne, parser, srl;分别代表分句,分词,词性标注,命名实体识别,依存句法分析,词义消歧,语义角色标注;值为”n”,表明未做,值为”y”则表示完成,如pos=”y”,表示已经完成了词性标注; 3. doc 为篇章结点,以段落为单位包含文本内容;无任何属性值; 4. para 为段落结点,需含id 属性,其值从0 开始; 5. sent 为句子结点,需含属性为id,cont;id 为段落中句子序号,其值从0 开始;cont 为句子内容; -6. 
word 为分词结点,需含属性为id, cont;id 为句子中的词的序号,其值从0 开始,cont为分词内容;可选属性为 pos, ne, wsd, wsdexp, parent, relate;pos 的内容为词性标注内容;ne 为命名实体内容;wsd 与wsdexp 成对出现,wsd 为词义消歧内容,wsdexp 为相应的解释说明;parent 与relate 成对出现,parent 为依存句法分析的父亲结点id 号,relate 为相对应的关系; +6. word 为分词结点,需含属性为id, cont;id 为句子中的词的序号,其值从0 开始,cont为分词内容;可选属性为 pos, ne, parent, relate;pos 的内容为词性标注内容;ne 为命名实体内容;parent 与relate 成对出现,parent 为依存句法分析的父亲结点id 号,relate 为相对应的关系; 7. arg 为语义角色信息结点,任何一个谓词都会带有若干个该结点;其属性为id, type, beg,end;id 为序号,从0 开始;type 代表角色名称;beg 为开始的词序号,end 为结束的序号; 各结点及属性的逻辑关系说明如下: @@ -643,21 +641,41 @@ LTML 标准要求如下:结点标签分别为 xml4nlp, note, doc, para, sent, 1. 如果 pos=”y”则分词结点中必须包含pos 属性; 2. 如果 ne=”y”则分词结点中必须包含ne 属性; 3. 如果 parser=”y”则分词结点中必须包含parent 及relate 属性; - 4. 如果 wsd=”y”则分词结点中必须包含wsd 及wsdexp 属性; - 5. 如果 srl=”y”则凡是谓词(predicate)的分词会包含若干个arg 结点; + 4. 如果 srl=”y”则凡是谓词(predicate)的分词会包含若干个arg 结点; -关于web service client更多的信息请参考[https://github.com/HIT-SCIR/ltp-service](https://github.com/HIT-SCIR/ltp-service) +### 示例程序 -# 搭建一个私人的LTP Server +下面这个python程序例子显示如何向LTP Server发起http请求,并获得返回结果。 -LTP Server在轻量级服务器程序mongoose基础上开发。在编译LTP源码之后,运行LTP Server就可以启动LTP Server。LTP Server启动后,将会监听12345(*)端口的HTTP请求。 +``` +# -*- coding: utf-8 -*- +#!/usr/bin/env python +import urllib, urllib2 -在搭建好私人LTPServer后,请将client程序的目标主机地址改为你的LTPServer的地址。 +uri_base = "http://127.0.0.1:12345/ltp" -(注*:如需指定监听其他端口,请在src/server/ltp_server.cpp中将宏LISTENING_PORT “12345”设置为其他整数即可。) +data = { + 's': '我爱北京天安门', + 'x': 'n', + 't': 'all'} + +request = urllib2.Request(uri_base) +params = urllib.urlencode(data) +response = urllib2.urlopen(request, params) +content = response.read().strip() +print content +``` +### 错误返回 + +如果请求不符合格式要求,LTP Server会返回400错误。下面的表格显示了LTP Server返回的错误类型以及原因。 + +|code | reason | 解释 | +|-----|--------|-----| +|400 | EMPTY SENTENCE | 输入句子为空 | +|400 | ENCODING NOT IN UTF8 | 输入句子非UTF8编码 | +|400 | BAD XML FORMAT | 输入句子不符合LTML格式 | # 实现原理与性能 - ## 在线学习算法框架 在机器学习领域,在线学习(Online 
learning)指每次通过一个训练实例学习模型的学习方法。在线学习的目的是正确预测训练实例的标注。在线学习最重要的一个特点是,当一次预测完成时,其正确结果便被获得,这一结果可直接用来修正模型。 @@ -666,6 +684,12 @@ LTP Server在轻量级服务器程序mongoose基础上开发。在编译LTP源 在自然语言处理领域,在线学习已经被广泛地应用在分词、词性标注、依存句法分析等结构化学习任务中。 +## 模型裁剪 + +在LTP中,词性标注、句法分析两个模块还存在模型比较大的问题。为了缩小模型的大小,我们参考[Learning Sparser Perceptron Model](http://www.cs.bgu.ac.il/~yoavg/publications/acl2011sparse.pdf),将其中提到的特征裁剪策略加入了LTP。 + +由于LTP所采用的在线机器学习框架的特征映射方式是以特征前缀为单位进行映射的,所以裁剪时的策略也是如果该前缀的更新次数比较小,就裁剪。 + ## 分词模块 在LTP中,我们将分词任务建模为基于字的序列标注问题。对于输入句子的字序列,模型给句子中的每个字标注一个标识词边界的标记。在LTP中,我们采用的标记集如附录所示。 @@ -848,19 +872,26 @@ CTB6数据来源于,训练集和测试集按照官方文档中建议的划分 ## 语义角色标注模块 -# 使用训练套件 +在LTP中,我们将SRL分为两个子任务,其一是谓词的识别(Predicate Identification, PI),其次是论元的识别以及分类(Argument Identification and Classification, AIC)。对于论元的识别及分类,我们将其视作一个联合任务,即将“非论元”也看成是论元分类问题中的一个类别。在SRL系统中,我们在最大熵模型中引入L1正则,使得特征维度降至约为原来的1/40,从而大幅度地减小了模型的内存使用率,并且提升了预测的速度。同时,为了保证标注结果满足一定的约束条件,系统增加了一个后处理过程。 + +在CoNLL 2009评测数据集上,利用LTP的自动词性及句法信息,SRL性能如下所示。 + +|Precision | Recall | F-Score | Speed | Mem. | +|----------|--------|---------|-------|------| +|0.8444 | 0.7234 | 0.7792 | 41.1 sent./s | 94M(PI+AIC) | +# 使用训练套件 ## 分词训练套件otcws用法 otcws是ltp分词模型的训练套件,用户可以使用otcws训练获得ltp的分词模型。 编译之后,在tools/train下面会产生名为otcws的二进制程序。调用方法是 - ./otcws [config_file]。 + ./otcws [config_file] otcws分别支持从人工切分数据中训练分词模型和调用分词模型对句子进行切分。人工切分的句子的样例如下: - 对外 , 他们 代表 国家 。 + 对外 , 他们 代表 国家 。 otcws主要通过配置文件指定执行的工作,其中主要有两类配置文件:训练配置和测试配置。 @@ -872,15 +903,17 @@ otcws主要通过配置文件指定执行的工作,其中主要有两类配置 algorithm = pa model-name = model/ctb5-seg max-iter = 5 + rare-feature-threshold = 0 其中, * [train] 配置组指定执行训练 - * Ttain-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 
配置裁剪力度:如果rare-feature-threshold为0,则只去掉为0的特征;如果rare-feature-threshold大于0,则将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 @@ -891,8 +924,8 @@ otcws主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 切分结果将输入到标准io中。 @@ -924,23 +957,31 @@ otpos主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [train] 配置组指定执行训练 - * Ttain-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otpos支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 配置裁剪力度:如果rare-feature-threshold为0,则只去掉为0的特征;如果rare-feature-threshold大于0,则将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 [test] test-file = data/ctb5-test.pos model-file = model/ctb5-pos.3.model + lexicon-file = lexicon/pos-lexicon.constrain 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 + * lexicon-file 指定外部词典文件位置(此项可以不配置) + +lexicon-file文件样例如下所示。每行指定一个词,第一列指定单词,第二列之后指定该词的候选词性(可以有多项,每一项占一列),列与列之间用空格区分。 + + 雷人 v a + 】 wp 词性标注结果将输入到标准io中。 @@ -972,11 +1013,11 @@ Otner主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [train] 配置组指定执行训练 - * Train-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otner支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otner支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 测试配置的配置文件样例如下所示。 @@ -987,8 +1028,8 @@ Otner主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 命名实体识别结果将输入到标准io中。 @@ -1000,7 +1041,7 @@ lgdpj是ltp依存句法分析模型的训练套件,用户可以使用lgdpj训 
编译之后,在tools/train下面会产生名为lgdpj的二进制程序。调用方法是 - ./lgdpj [config_file]。 + ./lgdpj [config_file] lgdpj分别支持从人工标注依存句法的数据中训练依存句法分析模型和调用依存句法分析模型对句子进行依存句法分析。人工标注的词性标注依存句法的句子遵从conll格式,其样例如下: @@ -1039,6 +1080,7 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 max-iter = 5 algorithm = pa model-name = model/parser/ldc-o2carreras + rare-feature-threshold = 0 其中, @@ -1047,11 +1089,12 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 * decoder-name 表示采用的解码算法,现在lgdpj支持三种解码算法,分别是1o,2o-sib,2o-carreras * [feature] 配置组指定使用的特征 * [train] 配置组指定执行训练 - * Train-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在lgdpj支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 配置裁剪力度:如果rare-feature-threshold为0,则只去掉为0的特征;如果rare-feature-threshold大于0,则将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 @@ -1062,13 +1105,12 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 依存句法分析结果将输入到标准io中。 # 发表论文 - * Meishan Zhang, Zhilong Deng,Wanxiang Che, Ting Liu. [Combining Statistical Model and Dictionary for Domain Adaption of Chinese Word Segmentation](http://ir.hit.edu.cn/~mszhang/Conll06Tolgdpj.jar). Journal of Chinese Information Processing. 2012, 26 (2) : 8-12 (in Chinese) * Zhenghua Li, Min Zhang, Wanxiang Che, Ting Liu, Wenliang Chen, Haizhou Li. [Joint Models for Chinese POS Tagging and Dependency Parsing](http://ir.hit.edu.cn/~lzh/papers/zhenghua-D11-joint%20pos%20and%20dp.pdf). In Proceedings of the 2011Conference on Empirical Methods in Natural Language Processing (EMNLP 2011). 2011.07, pp. 1180-1191. Edinburgh, Scotland, UK. * Wanxiang Che, Zhenghua Li, Ting Liu. [LTP: A Chinese Language Technology Platform](http://www.aclweb.org/anthology/C/C10/C10-3.pdf#page=16). 
In Proceedings of the Coling 2010:Demonstrations. 2010.08, pp13-16, Beijing, China. @@ -1077,9 +1119,7 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 * Liu, Ting, Jinshan Ma, and Sheng Li. 2006. [Building a dependency treebank for improving Chinese parser](http://ir.hit.edu.cn/phpwebsite/index.php?module=documents&JAS_DocumentManager_op=downloadFile&JAS_File_id=255#page=43). Journal of Chinese Language and Computing, 16(4):207–224. * Lijie Wang, Wanxiang Che, and Ting Liu. 2009. An SVMTool-based Chinese POS Tagger. Journal of Chinese Information Processing, 23(4):16–22. - # 附录 - ## 分词标注集 | 标记 | 含义 | 举例 | diff --git a/examples/Makefile b/examples/Makefile index 9bcbc2cc5..64746de08 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -5,7 +5,8 @@ all: cws cws_cmdline multi_cws_cmdline \ pos pos_cmdline multi_pos_cmdline \ par \ - ner + ner \ + srl cws: cws.cpp g++ -o cws cws.cpp -I./ \ @@ -58,6 +59,10 @@ par: par.cpp g++ -o par par.cpp -I./ \ -I../src/parser/ \ -L../lib -lparser +srl: srl.cpp + g++ -o srl srl.cpp -I./ \ + -I../src/srl/ \ + -L../lib -lsrl .PHONY: clean @@ -68,5 +73,6 @@ clean: rm pos_cmdline rm ner rm par + rm srl rm multi_cws_cmdline rm multi_pos_cmdline diff --git a/examples/cws.cpp b/examples/cws.cpp index 0f42063e0..6a9355c54 100644 --- a/examples/cws.cpp +++ b/examples/cws.cpp @@ -3,38 +3,38 @@ #include "segment_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - std::cerr << "cws [model path] [lexicon_file]" << std::endl; - return 1; - } + if (argc < 2) { + std::cerr << "cws [model path] [lexicon_file]" << std::endl; + return 1; + } - void * engine = 0; - if (argc == 2) { - engine = segmentor_create_segmentor(argv[1]); - } else if (argc == 3) { - engine = segmentor_create_segmentor(argv[1], argv[2]); - } + void * engine = 0; + if (argc == 2) { + engine = segmentor_create_segmentor(argv[1]); + } else if (argc == 3) { + engine = segmentor_create_segmentor(argv[1], argv[2]); + } - if (!engine) { - return -1; - } - std::vector words; + if 
(!engine) { + return -1; + } + std::vector words; - const char * suite[2] = { - "What's wrong with you? 别灰心! http://t.cn/zQz0Rn", - "台北真的是天子骄子吗?",}; + const char * suite[2] = { + "What's wrong with you? 别灰心! http://t.cn/zQz0Rn", + "台北真的是天子骄子吗?",}; - for (int i = 0; i < 2; ++ i) { - words.clear(); - int len = segmentor_segment(engine, suite[i], words); - for (int i = 0; i < len; ++ i) { - std::cout << words[i]; - if (i+1 == len) std::cout < words; - std::string sentence; + if (!engine) { + return -1; + } + std::vector words; + std::string sentence; - std::cerr << "TRACE: Model is loaded" << std::endl; - double tm = get_time(); + std::cerr << "TRACE: Model is loaded" << std::endl; + double tm = get_time(); - while (std::getline(std::cin, sentence, '\n')) { - words.clear(); - if (sentence.size() == 0) { continue; } - int len = segmentor_segment(engine, sentence, words); - for (int i = 0; i < len; ++ i) { - std::cout << words[i]; - if (i+1 == len) std::cout < +#include #include #include #include @@ -92,7 +93,9 @@ void multithreaded_segment( void * args) { int main(int argc, char ** argv) { if (argc < 2 || (0 == strcmp(argv[1], "-h"))) { - std::cerr << "Example: ./multi_cws_cmdline [model path] [lexicon file]=NULL threadnum" << std::endl; + std::cerr << "Example: ./multi_cws_cmdline " + << "[model path] [lexicon file](optional) threadnum" + << std::endl; std::cerr << std::endl; std::cerr << "This program recieve input word sequence from stdin." << std::endl; std::cerr << "One sentence per line." 
<< std::endl; diff --git a/examples/ner.cpp b/examples/ner.cpp index 223e31402..c1c3589e1 100644 --- a/examples/ner.cpp +++ b/examples/ner.cpp @@ -4,40 +4,40 @@ #include "ner_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - std::cerr << "usage: ./ner [model_path]" << std::endl; - return -1; - } - - void * engine = ner_create_recognizer(argv[1]); - if (!engine) { - std::cerr << "failed to load model" << std::endl; - return -1; - } - - std::vector words; - std::vector postags; - - words.push_back("中国"); postags.push_back("ns"); - words.push_back("国际"); postags.push_back("n"); - words.push_back("广播"); postags.push_back("n"); - words.push_back("电台"); postags.push_back("n"); - words.push_back("创办"); postags.push_back("v"); - words.push_back("于"); postags.push_back("p"); - words.push_back("1941年"); postags.push_back("m"); - words.push_back("12月"); postags.push_back("m"); - words.push_back("3日"); postags.push_back("m"); - words.push_back("。"); postags.push_back("wp"); - - std::vector tags; - - ner_recognize(engine, words, postags, tags); - - for (int i = 0; i < tags.size(); ++ i) { - std::cout << words[i] << "\t" << postags[i] << "\t" << tags[i] << std::endl; - } - - ner_release_recognizer(engine); - return 0; + if (argc < 2) { + std::cerr << "usage: ./ner [model_path]" << std::endl; + return -1; + } + + void * engine = ner_create_recognizer(argv[1]); + if (!engine) { + std::cerr << "failed to load model" << std::endl; + return -1; + } + + std::vector words; + std::vector postags; + + words.push_back("中国"); postags.push_back("ns"); + words.push_back("国际"); postags.push_back("n"); + words.push_back("广播"); postags.push_back("n"); + words.push_back("电台"); postags.push_back("n"); + words.push_back("创办"); postags.push_back("v"); + words.push_back("于"); postags.push_back("p"); + words.push_back("1941年"); postags.push_back("m"); + words.push_back("12月"); postags.push_back("m"); + words.push_back("3日"); postags.push_back("m"); + words.push_back("。"); 
postags.push_back("wp"); + + std::vector tags; + + ner_recognize(engine, words, postags, tags); + + for (int i = 0; i < tags.size(); ++ i) { + std::cout << words[i] << "\t" << postags[i] << "\t" << tags[i] << std::endl; + } + + ner_release_recognizer(engine); + return 0; } diff --git a/examples/par.cpp b/examples/par.cpp index 2db1855e7..02ed9dcf2 100644 --- a/examples/par.cpp +++ b/examples/par.cpp @@ -4,34 +4,34 @@ #include "parser_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - return -1; - } + if (argc < 2) { + return -1; + } - void * engine = parser_create_parser(argv[1]); - if (!engine) { - return -1; - } + void * engine = parser_create_parser(argv[1]); + if (!engine) { + return -1; + } - std::vector words; - std::vector postags; + std::vector words; + std::vector postags; - words.push_back("一把手"); postags.push_back("n"); - words.push_back("亲自"); postags.push_back("d"); - words.push_back("过问"); postags.push_back("v"); - words.push_back("。"); postags.push_back("wp"); + words.push_back("一把手"); postags.push_back("n"); + words.push_back("亲自"); postags.push_back("d"); + words.push_back("过问"); postags.push_back("v"); + words.push_back("。"); postags.push_back("wp"); - std::vector heads; - std::vector deprels; + std::vector heads; + std::vector deprels; - parser_parse(engine, words, postags, heads, deprels); + parser_parse(engine, words, postags, heads, deprels); - for (int i = 0; i < heads.size(); ++ i) { - std::cout << words[i] << "\t" << postags[i] << "\t" - << heads[i] << "\t" << deprels[i] << std::endl; - } + for (int i = 0; i < heads.size(); ++ i) { + std::cout << words[i] << "\t" << postags[i] << "\t" + << heads[i] << "\t" << deprels[i] << std::endl; + } - parser_release_parser(engine); - return 0; + parser_release_parser(engine); + return 0; } diff --git a/examples/pos_cmdline.cpp b/examples/pos_cmdline.cpp index feca053b5..2785bfe33 100644 --- a/examples/pos_cmdline.cpp +++ b/examples/pos_cmdline.cpp @@ -21,62 +21,62 @@ #include "postag_dll.h" 
double get_time(void) { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec + (tv.tv_usec / 1000000.0); + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec / 1000000.0); } int main(int argc, char * argv[]) { - if (argc < 1 || (0 == strcmp(argv[1], "-h"))) { - std::cerr << "Example: ./pos_cmdline [model path]" << std::endl; - std::cerr << std::endl; - std::cerr << "This program recieve input word sequence from stdin." << std::endl; - std::cerr << "One sentence per line. Words are separated by space." << std::endl; - std::cerr << std::endl; - return 1; - } + if (argc < 1 || (0 == strcmp(argv[1], "-h"))) { + std::cerr << "Example: ./pos_cmdline [model path]" << std::endl; + std::cerr << std::endl; + std::cerr << "This program recieve input word sequence from stdin." << std::endl; + std::cerr << "One sentence per line. Words are separated by space." << std::endl; + std::cerr << std::endl; + return 1; + } - void * engine = postagger_create_postagger(argv[1]); - if (!engine) { - std::cerr << "WARNINIG : Failed to load model." << std::endl; - return -1; - } + void * engine = postagger_create_postagger(argv[1]); + if (!engine) { + std::cerr << "WARNINIG : Failed to load model." 
<< std::endl; + return -1; + } - std::string line; - std::string word; - std::vector words; - std::vector postags; + std::string line; + std::string word; + std::vector words; + std::vector postags; - std::cerr << "TRACE: Model is loaded" << std::endl; - double tm = get_time(); + std::cerr << "TRACE: Model is loaded" << std::endl; + double tm = get_time(); - while (std::getline(std::cin, line, '\n')) { - std::stringstream S(line); - words.clear(); - while (S >> word) { words.push_back(word); } + while (std::getline(std::cin, line, '\n')) { + std::stringstream S(line); + words.clear(); + while (S >> word) { words.push_back(word); } - if (words.size() == 0) { continue; } - int len = postagger_postag(engine, words, postags); - if (postags.size() != words.size()) { - std::cerr << "WARNINIG: Number of postags is different from number of words" + if (words.size() == 0) { continue; } + int len = postagger_postag(engine, words, postags); + if (postags.size() != words.size()) { + std::cerr << "WARNINIG: Number of postags is different from number of words" << std::endl; - } + } - for (int i = 0; i < len; ++ i) { - std::cout << words[i] << "_" << postags[i]; - if (i+1 == len) std::cout < +#include + +#include "SRL_DLL.h" + +int main(int argc, char * argv[]) { + if (argc < 2) { + return -1; + } + + SRL_LoadResource(argv[1]); + + std::vector words; + std::vector postags; + std::vector nes; + std::vector > parse; + std::vector< std::pair< int, std::vector< std::pair > > > > srl; + words.push_back("一把手"); postags.push_back("n"); nes.push_back("O"); parse.push_back(make_pair(2,"SBV")); + words.push_back("亲自"); postags.push_back("d"); nes.push_back("O"); parse.push_back(make_pair(2,"ADV")); + words.push_back("过问"); postags.push_back("v"); nes.push_back("O"); parse.push_back(make_pair(-1,"HED")); + words.push_back("。"); postags.push_back("wp");nes.push_back("O"); parse.push_back(make_pair(2,"WP")); + + DoSRL(words,postags,nes,parse,srl); + + for(int i = 0;i vecSentences; - string 
para; - xml.GetParagraph(i, para); + for (int i = 0; i < paraNum; ++i) { + vector vecSentences; + string para; + xml.GetParagraph(i, para); - if (0 == SplitSentence( para, vecSentences )) { - ERROR_LOG("in LTP::splitsent, failed to split sentence"); - return -1; - } + if (0 == SplitSentence( para, vecSentences )) { + ERROR_LOG("in LTP::splitsent, failed to split sentence"); + return kSplitSentenceError; + } - // dummy - // vecSentences.push_back(para); - if (0 != xml.SetSentencesToParagraph(vecSentences, i)) { - ERROR_LOG("in LTP::splitsent, failed to write sentence to xml"); - return -1; - } + // dummy + // vecSentences.push_back(para); + if (0 != xml.SetSentencesToParagraph(vecSentences, i)) { + ERROR_LOG("in LTP::splitsent, failed to write sentence to xml"); + return kWriteXmlError; } + } - xml.SetNote(NOTE_SENT); - return 0; + xml.SetNote(NOTE_SENT); + return 0; } // integrate word segmentor into LTP int LTP::wordseg(XML4NLP & xml) { - if (xml.QueryNote(NOTE_WORD)) { - return 0; - } + if (xml.QueryNote(NOTE_WORD)) { + return 0; + } - // - if (0 != splitSentence_dummy(xml)) { - ERROR_LOG("in LTP::wordseg, failed to perform split sentence preprocess."); - return -1; - } + // + int ret = splitSentence_dummy(xml); + if (0 != ret) { + ERROR_LOG("in LTP::wordseg, failed to perform split sentence preprocess."); + return ret; + } - /*if (0 != m_ltpResource.LoadSegmentorResource(m_ltpOption.segmentor_model_path)) { - ERROR_LOG("in LTP::wordseg, failed to load segmentor resource"); - return -1; - }*/ + // get the segmentor pointer + void * segmentor = m_ltpResource.GetSegmentor(); + if (0 == segmentor) { + ERROR_LOG("in LTP::wordseg, failed to init a segmentor"); + return kWordsegError; + } - // get the segmentor pointer - void * segmentor = m_ltpResource.GetSegmentor(); - if (0 == segmentor) { - ERROR_LOG("in LTP::wordseg, failed to init a segmentor"); - return -1; - } + int stnsNum = xml.CountSentenceInDocument(); - int stnsNum = xml.CountSentenceInDocument(); + if (0 
== stnsNum) { + ERROR_LOG("in LTP::wordseg, number of sentence equals 0"); + return kEmptyStringError; + } - if (0 == stnsNum) { - ERROR_LOG("in LTP::wordseg, number of sentence equals 0"); - return -1; - } + for (int i = 0; i < stnsNum; ++ i) { + std::string strStn = xml.GetSentence(i); + std::vector vctWords; - for (int i = 0; i < stnsNum; ++ i) { - string strStn = xml.GetSentence(i); - vector vctWords; - - if (ltp::strutils::codecs::length(strStn) > MAX_SENTENCE_LEN) { - ERROR_LOG("in LTP::wordseg, input sentence is too long"); - return -1; - } + if (ltp::strutils::codecs::length(strStn) > MAX_SENTENCE_LEN) { + ERROR_LOG("in LTP::wordseg, input sentence is too long"); + return kSentenceTooLongError; + } - if (0 == segmentor_segment(segmentor, strStn, vctWords)) { - ERROR_LOG("in LTP::wordseg, failed to perform word segment on \"%s\"", - strStn.c_str()); - return -1; - } + if (0 == segmentor_segment(segmentor, strStn, vctWords)) { + ERROR_LOG("in LTP::wordseg, failed to perform word segment on \"%s\"", + strStn.c_str()); + return kWordsegError; + } - if (0 != xml.SetWordsToSentence(vctWords, i)) { - ERROR_LOG("in LTP::wordseg, failed to write segment result to xml"); - return -1; - } + if (0 != xml.SetWordsToSentence(vctWords, i)) { + ERROR_LOG("in LTP::wordseg, failed to write segment result to xml"); + return kWriteXmlError; } + } - xml.SetNote(NOTE_WORD); - return 0; + xml.SetNote(NOTE_WORD); + return 0; } // integrate postagger into LTP int LTP::postag(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_POS) ) { - return 0; - } + if ( xml.QueryNote(NOTE_POS) ) { + return 0; + } - // dependency - if (0 != wordseg(xml)) { - ERROR_LOG("in LTP::postag, failed to perform word segment preprocess"); - return -1; - } + // dependency + int ret = wordseg(xml); + if (0 != ret) { + ERROR_LOG("in LTP::postag, failed to perform word segment preprocess"); + return ret; + } - /*if (0 != m_ltpResource.LoadPostaggerResource(m_ltpOption.postagger_model_path)) { - ERROR_LOG("in 
LTP::postag, failed to load postagger resource."); - return -1; - }*/ + void * postagger = m_ltpResource.GetPostagger(); + if (0 == postagger) { + ERROR_LOG("in LTP::postag, failed to init a postagger"); + return kPostagError; + } - void * postagger = m_ltpResource.GetPostagger(); - if (0 == postagger) { - ERROR_LOG("in LTP::postag, failed to init a postagger"); - return -1; - } + int stnsNum = xml.CountSentenceInDocument(); - int stnsNum = xml.CountSentenceInDocument(); + if (0 == stnsNum) { + ERROR_LOG("in LTP::postag, number of sentence equals 0"); + return kEmptyStringError; + } - if (0 == stnsNum) { - ERROR_LOG("in LTP::postag, number of sentence equals 0"); - return -1; - } - - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; + for (int i = 0; i < stnsNum; ++i) { + vector vecWord; + vector vecPOS; - xml.GetWordsFromSentence(vecWord, i); + xml.GetWordsFromSentence(vecWord, i); - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return kEmptyStringError; + } - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return kSentenceTooLongError; + } - if (0 == postagger_postag(postagger, vecWord, vecPOS)) { - ERROR_LOG("in LTP::postag, failed to perform postag on sent. #%d", i+1); - return -1; - } + if (0 == postagger_postag(postagger, vecWord, vecPOS)) { + ERROR_LOG("in LTP::postag, failed to perform postag on sent. 
#%d", i+1); + return kPostagError; + } - if (xml.SetPOSsToSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::postag, failed to write postag result to xml"); - return -1; - } + if (xml.SetPOSsToSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::postag, failed to write postag result to xml"); + return kWriteXmlError; } + } - xml.SetNote(NOTE_POS); + xml.SetNote(NOTE_POS); - return 0; + return 0; } // perform ner over xml -int LTP::ner(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_NE) ) { - return 0; - } +int LTP::ner(XML4NLP & xml) { + if ( xml.QueryNote(NOTE_NE) ) { + return 0; + } - // dependency - if (0 != postag(xml)) { - ERROR_LOG("in LTP::ner, failed to perform postag preprocess"); - return -1; - } + // dependency + int ret = postag(xml); + if (0 != ret) { + ERROR_LOG("in LTP::ner, failed to perform postag preprocess"); + return ret; + } - /*if (0 != m_ltpResource.LoadNEResource(m_ltpOption.ner_model_path)) { - ERROR_LOG("in LTP::ner, failed to load ner resource"); - return -1; - }*/ + void * ner = m_ltpResource.GetNER(); - void * ner = m_ltpResource.GetNER(); + if (NULL == ner) { + ERROR_LOG("in LTP::ner, failed to init a ner."); + return kNERError; + } - if (NULL == ner) { - ERROR_LOG("in LTP::ner, failed to init a ner."); - return -1; + int stnsNum = xml.CountSentenceInDocument(); + + if (stnsNum == 0) { + ERROR_LOG("in LTP::ner, number of sentence equals 0"); + return kEmptyStringError; + } + + for (int i = 0; i < stnsNum; ++ i) { + vector vecWord; + vector vecPOS; + vector vecNETag; + + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get words from xml"); + return kReadXmlError; } - int stnsNum = xml.CountSentenceInDocument(); + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get postags from xml"); + return kNERError; + } - if (stnsNum == 0) { - ERROR_LOG("in LTP::ner, number of sentence equals 0"); - return -1; + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return 
kEmptyStringError; } - for (int i = 0; i < stnsNum; ++ i) { - vector vecWord; - vector vecPOS; - vector vecNETag; + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return kSentenceTooLongError; + } - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get words from xml"); - return -1; - } + if (0 == ner_recognize(ner, vecWord, vecPOS, vecNETag)) { + ERROR_LOG("in LTP::ner, failed to perform ner on sent. #%d", i+1); + return kNERError; + } - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get postags from xml"); - return -1; - } + xml.SetNEsToSentence(vecNETag, i); + } - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } + xml.SetNote(NOTE_NE); + return 0; +} - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } +int LTP::parser(XML4NLP & xml) { + if ( xml.QueryNote(NOTE_PARSER) ) return 0; - if (0 == ner_recognize(ner, vecWord, vecPOS, vecNETag)) { - ERROR_LOG("in LTP::ner, failed to perform ner on sent. 
#%d", i+1); - return -1; - } + int ret = postag(xml); + if (0 != ret) { + ERROR_LOG("in LTP::parser, failed to perform postag preprocessing"); + return ret; + } - xml.SetNEsToSentence(vecNETag, i); - } + void * parser = m_ltpResource.GetParser(); - xml.SetNote(NOTE_NE); - return 0; -} + if (parser == NULL) { + ERROR_LOG("in LTP::parser, failed to init a parser"); + return kParserError; + } -int LTP::parser(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_PARSER) ) return 0; + int stnsNum = xml.CountSentenceInDocument(); + if (stnsNum == 0) { + ERROR_LOG("in LTP::parser, number of sentences equals 0"); + return kEmptyStringError; + } - if (0 != postag(xml)) { - ERROR_LOG("in LTP::parser, failed to perform postag preprocessing"); - return -1; + for (int i = 0; i < stnsNum; ++i) { + std::vector vecWord; + std::vector vecPOS; + std::vector vecHead; + std::vector vecRel; + + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::parser, failed to get words from xml"); + return kReadXmlError; } - /*if ( 0 != m_ltpResource.LoadParserResource(m_ltpOption.parser_model_path) ) { - ERROR_LOG("in LTP::parser, failed to load parser resource"); - return -1; - }*/ + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::parser, failed to get postags from xml"); + return kReadXmlError; + } - void * parser = m_ltpResource.GetParser(); + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return kEmptyStringError; + } - if (parser == NULL) { - ERROR_LOG("in LTP::parser, failed to init a parser"); - return -1; + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return kSentenceTooLongError; } - int stnsNum = xml.CountSentenceInDocument(); - if (stnsNum == 0) { - ERROR_LOG("in LTP::parser, number of sentences equals 0"); - return -1; + if (-1 == parser_parse(parser, vecWord, vecPOS, vecHead, vecRel)) { + ERROR_LOG("in LTP::parser, failed to perform parse on sent. 
#%d", i+1); + return kParserError; } - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; - vector vecHead; - vector vecRel; + if (0 != xml.SetParsesToSentence(vecHead, vecRel, i)) { + ERROR_LOG("in LTP::parser, failed to write parse result to xml"); + return kWriteXmlError; + } + } - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::parser, failed to get words from xml"); - return -1; - } + xml.SetNote(NOTE_PARSER); - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::parser, failed to get postags from xml"); - return -1; - } + return 0; +} - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } +int LTP::srl(XML4NLP & xml) { + if ( xml.QueryNote(NOTE_SRL) ) return 0; - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } + // dependency + int ret = -1; - if (-1 == parser_parse(parser, vecWord, vecPOS, vecHead, vecRel)) { - ERROR_LOG("in LTP::parser, failed to perform parse on sent. 
#%d", i+1); - return -1; - } + ret = ner(xml); + if (0 != ret) { + ERROR_LOG("in LTP::srl, failed to perform ner preprocess"); + return ret; + } - if (0 != xml.SetParsesToSentence(vecHead, vecRel, i)) { - ERROR_LOG("in LTP::parser, failed to write parse result to xml"); - return -1; - } - } + ret = parser(xml); + if (0 != ret) { + ERROR_LOG("in LTP::srl, failed to perform parsing preprocess"); + return ret; + } - xml.SetNote(NOTE_PARSER); + int stnsNum = xml.CountSentenceInDocument(); + if (stnsNum == 0) { + ERROR_LOG("in LTP::srl, number of sentence equals 0"); + return kEmptyStringError; + } - return 0; -} + for (int i = 0; i < stnsNum; ++i) { + vector vecWord; + vector vecPOS; + vector vecNE; + vector< pair > vecParse; + vector< pair > > > > vecSRLResult; -int LTP::srl(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_SRL) ) return 0; + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get words from xml"); + return kReadXmlError; + } - // dependency - if (0 != ner(xml)) { - ERROR_LOG("in LTP::srl, failed to perform ner preprocess"); - return -1; + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get postags from xml"); + return kReadXmlError; } - if (0 != parser(xml)) { - ERROR_LOG("in LTP::srl, failed to perform parsing preprocess"); - return -1; + if (xml.GetNEsFromSentence(vecNE, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get ner result from xml"); + return kReadXmlError; } - /*if ( 0 != m_ltpResource.LoadSRLResource(m_ltpOption.srl_data_dir) ) { - ERROR_LOG("in LTP::srl, failed to load srl resource"); - return -1; - }*/ + if (xml.GetParsesFromSentence(vecParse, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get parsing result from xml"); + return kReadXmlError; + } - int stnsNum = xml.CountSentenceInDocument(); - if (stnsNum == 0) { - ERROR_LOG("in LTP::srl, number of sentence equals 0"); - return -1; + if (0 != SRL(vecWord, vecPOS, vecNE, vecParse, vecSRLResult)) { + ERROR_LOG("in LTP::srl, 
failed to perform srl on sent. #%d", i+1); + return kSRLError; } - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; - vector vecNE; - vector< pair > vecParse; - vector< pair< int, vector< pair > > > > vecSRLResult; - - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get words from xml"); - return -1; - } - - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get postags from xml"); - return -1; - } - - if (xml.GetNEsFromSentence(vecNE, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get ner result from xml"); - return -1; - } - - if (xml.GetParsesFromSentence(vecParse, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get parsing result from xml"); - return -1; - } - - if (0 != SRL(vecWord, vecPOS, vecNE, vecParse, vecSRLResult)) { - ERROR_LOG("in LTP::srl, failed to perform srl on sent. #%d", i+1); - return -1; - } - - int j = 0; - for (; j < vecSRLResult.size(); ++j) { - vector vecType; - vector< pair > vecBegEnd; - int k = 0; - - for (; k < vecSRLResult[j].second.size(); ++k) { - vecType.push_back(vecSRLResult[j].second[k].first); - vecBegEnd.push_back(vecSRLResult[j].second[k].second); - } - - if (0 != xml.SetPredArgToWord(i, vecSRLResult[j].first, vecType, vecBegEnd)) { - return -1; - } - } - } - - xml.SetNote(NOTE_SRL); - return 0; + int j = 0; + for (; j < vecSRLResult.size(); ++j) { + vector vecType; + vector< pair > vecBegEnd; + int k = 0; + + for (; k < vecSRLResult[j].second.size(); ++k) { + vecType.push_back(vecSRLResult[j].second[k].first); + vecBegEnd.push_back(vecSRLResult[j].second[k].second); + } + + if (0 != xml.SetPredArgToWord(i, vecSRLResult[j].first, vecType, vecBegEnd)) { + return kWriteXmlError; + } + } + } + + xml.SetNote(NOTE_SRL); + return 0; } diff --git a/src/__ltp_dll/Ltp.h b/src/__ltp_dll/Ltp.h index 19faf579c..0f957d5c9 100644 --- a/src/__ltp_dll/Ltp.h +++ b/src/__ltp_dll/Ltp.h @@ -16,103 +16,119 @@ using namespace std; // extern ofstream 
ltp_log_file; #define MAX_SENTENCE_LEN 300 -#define MAX_WORDS_NUM 70 +#define MAX_WORDS_NUM 100 + +enum ErrorCodes { + kEmptyStringError = 1, /*< The input sentence is empty */ + kSplitSentenceError, /*< Failed to perform split sentence */ + kWordsegError, /*< Failed to perform wordseg */ + kPostagError, /*< Failed to perform postag */ + kParserError, /*< Failed to perform parsing */ + kNERError, /*< Failed to perform NER */ + kSRLError, /*< Failed to perform SRL */ + kEncodingError, /*< Sentence encoding not in UTF-8 */ + kXmlParseError, /*< Input xml is not well formatted */ + kSentenceTooLongError, /*< More than 300 characters or 70 words */ + kReadXmlError, /*< Failed to read XML in internal process */ + kWriteXmlError, /*< Failed to write XML in internal process */ +}; class LTP { public: - /* - * the constructor with config filepath specified to `conf/ltp.cnf` - */ - LTP(); - - /* - * the another constructor with user specified config file - * - * @param[in] cfg_file the path to the config file - */ - LTP(const char * cfg_file); - - /* - * deallocate the ltp resource - */ - ~LTP(); - - /* - * return true on the resource successful loaded, otherwise false - */ - bool loaded(); - - // discard - // int CreateDOMFromTxt(const char * cszTxtFileName, XML4NLP& m_xml4nlp); - - // discard - // int CreateDOMFromXml(const char * cszXmlFileName, XML4NLP& m_xml4nlp); - - // save dom tree - // int SaveDOM(const char *cszSaveFileName, XML4NLP& m_xml4nlp); - - /* - * do word segmentation. 
- * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int wordseg(XML4NLP & xml); - - /* - * do postagging - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int postag(XML4NLP & xml); - - /* - * do name entities recognization - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int ner(XML4NLP & xml); - - /* - * do dependency parsing - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int parser(XML4NLP & xml); - - /* - * do semantic role labeling - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int srl(XML4NLP & xml); - int splitSentence_dummy(XML4NLP & xml); - + /* + * the constructor with config filepath specified to `conf/ltp.cnf` + */ + LTP(); + + /* + * the another constructor with user specified config file + * + * @param[in] cfg_file the path to the config file + */ + LTP(const char * cfg_file); + + /* + * deallocate the ltp resource + */ + ~LTP(); + + /* + * return true on the resource successful loaded, otherwise false + */ + bool loaded(); + + // discard + // int CreateDOMFromTxt(const char * cszTxtFileName, XML4NLP& m_xml4nlp); + + // discard + // int CreateDOMFromXml(const char * cszXmlFileName, XML4NLP& m_xml4nlp); + + // save dom tree + // int SaveDOM(const char *cszSaveFileName, XML4NLP& m_xml4nlp); + + /* + * do word segmentation. 
+ * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int wordseg(XML4NLP & xml); + + /* + * do postagging + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int postag(XML4NLP & xml); + + /* + * do name entities recognization + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int ner(XML4NLP & xml); + + /* + * do dependency parsing + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int parser(XML4NLP & xml); + + /* + * do semantic role labeling + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int srl(XML4NLP & xml); + + int splitSentence_dummy(XML4NLP & xml); private: - /* - * split the sentence - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - - /* - * parse the config file, and load resource according the config - * - * @param[in] confFileName the config file - * @return int 0 on success, otherwise -1 - */ - int ReadConfFile(const char *confFileName = "conf/ltp.cnf"); + /* + * split the sentence + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + //int splitSentence_dummy(XML4NLP & xml); + + /* + * parse the config file, and load resource according the config + * + * @param[in] confFileName the config file + * @return int 0 on success, otherwise -1 + */ + int ReadConfFile(const char *confFileName = "conf/ltp.cnf"); private: - LTPResource m_ltpResource; /*< the ltp resources */ - bool m_loaded; /*< use to sepcify if the resource is loaded */ + LTPResource m_ltpResource; /*< the ltp resources */ + bool m_loaded; /*< use to sepcify if the resource is loaded */ }; #endif // end for __LTP_H__ diff --git a/src/__ltp_dll_for_python/ltp_dll_for_python.cpp b/src/__ltp_dll_for_python/ltp_dll_for_python.cpp 
deleted file mode 100644 index 5a7683ac0..000000000 --- a/src/__ltp_dll_for_python/ltp_dll_for_python.cpp +++ /dev/null @@ -1,616 +0,0 @@ -#define LTP_DLL_FOR_PYTHON_EXPORT - -#include "ltp_dll_for_python.h" - -#include -#include -#include -using namespace std; - -int py_main2(const char *inFile, const char *outFile, const char* confFile) -{ - return main2(inFile, outFile, confFile); -} - -int CreateDOMFromTxt(const char *cszTxtFileName) -{ - return HIT_IR_LTP::CreateDOMFromTxt(cszTxtFileName); -} - -int CreateDOMFromXml(const char *cszXmlFileName) -{ - return HIT_IR_LTP::CreateDOMFromXml(cszXmlFileName); -} - -int CreateDOMFromString(const char *str) -{ - return HIT_IR_LTP::CreateDOMFromString(str); -} - -int ClearDOM() -{ - return HIT_IR_LTP::ClearDOM(); -} - -int SaveDOM(const char *cszSaveFileName) -{ - return HIT_IR_LTP::SaveDOM(cszSaveFileName); -} - -// Modules -int SplitSentence() -{ - return HIT_IR_LTP::SplitSentence(); -} -/* -int IRLAS() // Word segment and POS -{ - return HIT_IR_LTP::IRLAS(); -} -*/ -/* -int SegmentWord() // Word segment -{ - return HIT_IR_LTP::SegmentWord(); -} -*/ -int CRFWordSeg() // CRF-based Word segment -{ - return HIT_IR_LTP::CRFWordSeg(); -} -int PosTag() // POSTagging -{ - return HIT_IR_LTP::PosTag(); -} -int NER() // Named entity recognition -{ - return HIT_IR_LTP::NER(); -} -/* -int WSD() // Word sense disambiguation -{ - return HIT_IR_LTP::WSD(); -} -*/ -int GParser() // Dependency parser -{ - return HIT_IR_LTP::GParser(); -} -/* -int Parser() // Dependency parser -{ - return HIT_IR_LTP::Parser(); -} -*/ -int SRL() // Semantic role labeling -{ - return HIT_IR_LTP::SRL(); -} - -// Counting -int CountParagraphInDocument() -{ - return HIT_IR_LTP::CountParagraphInDocument(); -} - -int CountSentenceInParagraph(int paragraphIdx) -{ - return HIT_IR_LTP::CountSentenceInParagraph(paragraphIdx); -} -int CountSentenceInDocument() -{ - return HIT_IR_LTP::CountSentenceInDocument(); -} - -int CountWordInSentence_p(int paragraphIdx, int 
sentenceIdx) -{ - return HIT_IR_LTP::CountWordInSentence(paragraphIdx, sentenceIdx); -} -int CountWordInSentence(int globalSentIdx) -{ - return HIT_IR_LTP::CountWordInSentence(globalSentIdx); -} -int CountWordInDocument() -{ - return HIT_IR_LTP::CountWordInDocument(); -} - -// Get paragraph, NOTE: can ONLY used before split sentence. -const char *GetParagraph(int paragraphIdx) -{ - return HIT_IR_LTP::GetParagraph(paragraphIdx); -} - -// Get sentence -const char *GetSentence_p(int paragraphIdx, int sentenceIdx) -{ - return HIT_IR_LTP::GetSentence(paragraphIdx, sentenceIdx); -} -const char *GetSentence(int globalSentIdx) -{ - return HIT_IR_LTP::GetSentence(globalSentIdx); -} - -// Get Word -const char *GetWord_p_s(int paragraphIdx, int sentenceIdx, int wordIdx) -{ - return HIT_IR_LTP::GetWord(paragraphIdx, sentenceIdx, wordIdx); -} -const char *GetWord_s(int globalSentIdx, int wordIdx) -{ - return HIT_IR_LTP::GetWord(globalSentIdx, wordIdx); -} -const char *GetWord(int globalWordIdx) -{ - return HIT_IR_LTP::GetWord(globalWordIdx); -} - -// Get POS -const char *GetPOS_p_s(int paragraphIdx, int sentenceIdx, int wordIdx) -{ - return HIT_IR_LTP::GetPOS(paragraphIdx, sentenceIdx, wordIdx); -} -const char *GetPOS_s(int globalSentIdx, int wordIdx) -{ - return HIT_IR_LTP::GetPOS(globalSentIdx, wordIdx); -} -const char *GetPOS(int globalWordIdx) -{ - return HIT_IR_LTP::GetPOS(globalWordIdx); -} - -// Get NE -const char *GetNE_p_s(int paragraphIdx, int sentenceIdx, int wordIdx) -{ - return HIT_IR_LTP::GetNE(paragraphIdx, sentenceIdx, wordIdx); -} -const char *GetNE_s(int globalSentIdx, int wordIdx) -{ - return HIT_IR_LTP::GetNE(globalSentIdx, wordIdx); -} -const char *GetNE(int globalWordIdx) -{ - return HIT_IR_LTP::GetNE(globalWordIdx); -} - -// Get WSD -/* -int GetWSD_p_s(const char **p_wsd, const char **p_explain, int paragraphIdx, int sentenceIdx, int wordIdx) -{ - pair wsd_explain; - if (0 == HIT_IR_LTP::GetWSD(wsd_explain, paragraphIdx, sentenceIdx, wordIdx)) - { - 
*p_wsd = wsd_explain.first; - *p_explain = wsd_explain.second; - return 0; - } - else - return -1; -} -int GetWSD_s(const char **p_wsd, const char **p_explain, int globalSentIdx, int wordIdx) -{ - pair wsd_explain; - if (0 == HIT_IR_LTP::GetWSD(wsd_explain, globalSentIdx, wordIdx)) - { - *p_wsd = wsd_explain.first; - *p_explain = wsd_explain.second; - return 0; - } - else - return -1; -} -int GetWSD(const char **p_wsd, const char **p_explain, int globalWordIdx) -{ - pair wsd_explain; - if (0 == HIT_IR_LTP::GetWSD(wsd_explain, globalWordIdx)) - { - *p_wsd = wsd_explain.first; - *p_explain = wsd_explain.second; - return 0; - } - else - return -1; -} -*/ - -// Get Parser -int GetParse_p_s(int *p_parent, const char **p_relate, int paragraphIdx, int sentenceIdx, int wordIdx) -{ - pair parent_relate; - if (0 == HIT_IR_LTP::GetParse(parent_relate, paragraphIdx, sentenceIdx, wordIdx)) - { - *p_parent = parent_relate.first; - *p_relate = parent_relate.second; - return 0; - } - else - return -1; -} -int GetParse_s(int *p_parent, const char **p_relate, int globalSentIdx, int wordIdx) -{ - pair parent_relate; - if (0 == HIT_IR_LTP::GetParse(parent_relate, globalSentIdx, wordIdx)) - { - *p_parent = parent_relate.first; - *p_relate = parent_relate.second; - return 0; - } - else - return -1; -} -int GetParse(int *p_parent, const char **p_relate, int globalWordIdx) -{ - pair parent_relate; - if (0 == HIT_IR_LTP::GetParse(parent_relate, globalWordIdx)) - { - *p_parent = parent_relate.first; - *p_relate = parent_relate.second; - return 0; - } - else - return -1; -} - -// Get words -int GetWordsFromSentence_p(const char *word_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector vecWord; - if (0 == HIT_IR_LTP::GetWordsFromSentence(vecWord, paragraphIdx, sentenceIdx)) - { - if (vecWord.size() != arr_size) - { - cerr << "vecWord.size() != arr_size in GetWordsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecWord.begin(), vecWord.end(), word_arr); - } - } - 
else - return -1; - - return 0; -} - -int GetWordsFromSentence(const char *word_arr[], int arr_size, int globalSentIdx) -{ - vector vecWord; - if (0 == HIT_IR_LTP::GetWordsFromSentence(vecWord, globalSentIdx)) - { - if (vecWord.size() != arr_size) - { - cerr << "vecWord.size() != arr_size in GetWordsFromSentence()" << endl; - return -1; - } - else - { - copy(vecWord.begin(), vecWord.end(), word_arr); - } - } - else - return -1; - - return 0; -} - - -// Get POSs -int GetPOSsFromSentence_p(const char *pos_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector vecPOS; - if (0 == HIT_IR_LTP::GetPOSsFromSentence(vecPOS, paragraphIdx, sentenceIdx)) - { - if (vecPOS.size() != arr_size) - { - cerr << "vecWord.size() != arr_size in GetPOSsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecPOS.begin(), vecPOS.end(), pos_arr); - } - } - else - return -1; - - return 0; -} -int GetPOSsFromSentence(const char *pos_arr[], int arr_size, int globalSentIdx) -{ - vector vecPOS; - if (0 == HIT_IR_LTP::GetPOSsFromSentence(vecPOS, globalSentIdx)) - { - if (vecPOS.size() != arr_size) - { - cerr << "vecWord.size() != arr_size in GetPOSsFromSentence()" << endl; - return -1; - } - else - { - copy(vecPOS.begin(), vecPOS.end(), pos_arr); - } - } - else - return -1; - - return 0; -} -// Get NEs -int GetNEsFromSentence_p(const char *ne_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector vecNE; - if (0 == HIT_IR_LTP::GetNEsFromSentence(vecNE, paragraphIdx, sentenceIdx)) - { - if (vecNE.size() != arr_size) - { - cerr << "vecNE.size() != arr_size in GetNEsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecNE.begin(), vecNE.end(), ne_arr); - } - } - else - return -1; - - return 0; -} -int GetNEsFromSentence(const char *ne_arr[], int arr_size, int globalSentIdx) -{ - vector vecNE; - if (0 == HIT_IR_LTP::GetNEsFromSentence(vecNE, globalSentIdx)) - { - if (vecNE.size() != arr_size) - { - cerr << "vecNE.size() != arr_size in 
GetNEsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecNE.begin(), vecNE.end(), ne_arr); - } - } - else - return -1; - - return 0; -} - -// Get WSDs -/* -int GetWSDsFromSentence_p(const char *wsd_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector vecWSD; - if (0 == HIT_IR_LTP::GetWSDsFromSentence(vecWSD, paragraphIdx, sentenceIdx)) - { - if (vecWSD.size() != arr_size) - { - cerr << "vecWSD.size() != arr_size in GetWSDsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecWSD.begin(), vecWSD.end(), wsd_arr); - } - } - else - return -1; - - return 0; -} -int GetWSDsFromSentence(const char *wsd_arr[], int arr_size, int globalSentIdx) -{ - vector vecWSD; - if (0 == HIT_IR_LTP::GetWSDsFromSentence(vecWSD, globalSentIdx)) - { - if (vecWSD.size() != arr_size) - { - cerr << "vecWSD.size() != arr_size in GetWSDsFromSentence()" << endl; - return -1; - } - else - { - copy(vecWSD.begin(), vecWSD.end(), wsd_arr); - } - } - else - return -1; - - return 0; -} - -int GetWSDExplainsFromSentence_p(const char *explain_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector vecExplain; - if (0 == HIT_IR_LTP::GetWSDExplainsFromSentence(vecExplain, paragraphIdx, sentenceIdx)) - { - if (vecExplain.size() != arr_size) - { - cerr << "vecExplain.size() != arr_size in GetWSDExplainsFromSentence_p()" << endl; - return -1; - } - else - { - copy(vecExplain.begin(), vecExplain.end(), explain_arr); - } - } - else - return -1; - - return 0; -} -int GetWSDExplainsFromSentence(const char *explain_arr[], int arr_size, int globalSentIdx) -{ - vector vecExplain; - if (0 == HIT_IR_LTP::GetWSDExplainsFromSentence(vecExplain, globalSentIdx)) - { - if (vecExplain.size() != arr_size) - { - cerr << "vecExplain.size() != arr_size in GetWSDExplainsFromSentence()" << endl; - return -1; - } - else - { - copy(vecExplain.begin(), vecExplain.end(), explain_arr); - } - } - else - return -1; - - return 0; -} -*/ - -// Get Parses -int GetParsesFromSentence_p(int 
parent_arr[], const char *relate_arr[], int arr_size, int paragraphIdx, int sentenceIdx) -{ - vector< pair > parent_relate; - if (0 == HIT_IR_LTP::GetParsesFromSentence(parent_relate, paragraphIdx, sentenceIdx)) - { - if (parent_relate.size() != arr_size) - { - cerr << "parent_relate.size() != arr_size in GetParsesFromSentence_p()" << endl; - return -1; - } - else - { - int i = 0; - for (; i > parent_relate; - if (0 == HIT_IR_LTP::GetParsesFromSentence(parent_relate, globalSentIdx)) - { - if (parent_relate.size() != arr_size) - { - cerr << "parent_relate.size() != arr_size in GetParsesFromSentence()" << endl; - return -1; - } - else - { - int i = 0; - for (; i vecType; - vector< pair > vecBegEnd; - if (0 == HIT_IR_LTP::GetPredArgToWord(vecType, vecBegEnd, paragraphIdx, sentenceIdx, wordIdx)) - { - if (vecType.size() != arr_size) - { - cerr << "vecType.size() != arr_size in GetPredArgToWord_p_s()" << endl; - return -1; - } - else - { - int i = 0; - for (; i < arr_size; ++i) - { - type_arr[i] = vecType[i]; - beg_arr[i] = vecBegEnd[i].first; - end_arr[i] = vecBegEnd[i].second; - } - } - } - else - return -1; - - return 0; -} -int GetPredArgToWord_p( const char *type_arr[], int beg_arr[], int end_arr[], int arr_size, - int globalSentIdx, int wordIdx) -{ - vector vecType; - vector< pair > vecBegEnd; - if (0 == HIT_IR_LTP::GetPredArgToWord(vecType, vecBegEnd, globalSentIdx, wordIdx)) - { - if (vecType.size() != arr_size) - { - cerr << "vecType.size() != arr_size in GetPredArgToWord_p()" << endl; - return -1; - } - else - { - int i = 0; - for (; i < arr_size; ++i) - { - type_arr[i] = vecType[i]; - beg_arr[i] = vecBegEnd[i].first; - end_arr[i] = vecBegEnd[i].second; - } - } - } - else - return -1; - - return 0; -} -int GetPredArgToWord( const char *type_arr[], int beg_arr[], int end_arr[], int arr_size, - int globalWordIdx) -{ - vector vecType; - vector< pair > vecBegEnd; - if (0 == HIT_IR_LTP::GetPredArgToWord(vecType, vecBegEnd, globalWordIdx)) - { - if (vecType.size() 
!= arr_size) - { - cerr << "vecType.size() != arr_size in GetPredArgToWord()" << endl; - return -1; - } - else - { - int i = 0; - for (; i < arr_size; ++i) - { - type_arr[i] = vecType[i]; - beg_arr[i] = vecBegEnd[i].first; - end_arr[i] = vecBegEnd[i].second; - } - } - } - else - return -1; - - return 0; -} - diff --git a/src/__ltp_dll_for_python/ltp_dll_for_python.h b/src/__ltp_dll_for_python/ltp_dll_for_python.h deleted file mode 100644 index d0b9b584b..000000000 --- a/src/__ltp_dll_for_python/ltp_dll_for_python.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _LTP_DLL_FOR_PYTHON_H -#define _LTP_DLL_FOR_PYTHON_H - -#ifdef LTP_DLL_FOR_PYTHON_EXPORT -#define LTP_DLL_FOR_PYTHON_API extern "C" _declspec(dllexport) -#else -#define LTP_DLL_FOR_PYTHON_API extern "C" _declspec(dllimport) -#endif - -#include "../__ltp_dll/__ltp_dll.h" -#pragma comment(lib, "__ltp_dll.lib") - -LTP_DLL_FOR_PYTHON_API int py_main2(const char *inFile, const char *outFile, const char* confFile = "ltp_modules_to_do.conf"); - -// DOM operation -LTP_DLL_FOR_PYTHON_API int CreateDOMFromTxt(const char *cszTxtFileName); -LTP_DLL_FOR_PYTHON_API int CreateDOMFromXml(const char *cszXmlFileName); -LTP_DLL_FOR_PYTHON_API int CreateDOMFromString(const char *str); - -LTP_DLL_FOR_PYTHON_API int ClearDOM(); -LTP_DLL_FOR_PYTHON_API int SaveDOM(const char *cszSaveFileName); - -// Modules -LTP_DLL_FOR_PYTHON_API int SplitSentence(); -//LTP_DLL_FOR_PYTHON_API int IRLAS(); // Word segment and POS -//LTP_DLL_FOR_PYTHON_API int SegmentWord(); // Word segment -LTP_DLL_FOR_PYTHON_API int CRFWordSeg(); // CRF-based word segment -LTP_DLL_FOR_PYTHON_API int PosTag(); // POSTagging -LTP_DLL_FOR_PYTHON_API int NER(); // Named entity recognition -//LTP_DLL_FOR_PYTHON_API int WSD(); // Word sense disambiguation -LTP_DLL_FOR_PYTHON_API int GParser(); // Dependency parser (Graph-based Method) -//LTP_DLL_FOR_PYTHON_API int Parser(); // Dependency parser (Ma Jinshan) -LTP_DLL_FOR_PYTHON_API int SRL(); // Semantic role labeling - -// 
Counting -LTP_DLL_FOR_PYTHON_API int CountParagraphInDocument(); - -LTP_DLL_FOR_PYTHON_API int CountSentenceInParagraph(int paragraphIdx); -LTP_DLL_FOR_PYTHON_API int CountSentenceInDocument(); - -LTP_DLL_FOR_PYTHON_API int CountWordInSentence_p(int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int CountWordInSentence(int globalSentIdx); -LTP_DLL_FOR_PYTHON_API int CountWordInDocument(); - - -// Get paragraph, NOTE: can ONLY used before split sentence. -LTP_DLL_FOR_PYTHON_API const char *GetParagraph(int paragraphIdx); - -// Get sentence -LTP_DLL_FOR_PYTHON_API const char *GetSentence_p(int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API const char *GetSentence(int globalSentIdx); - -// Get Word -LTP_DLL_FOR_PYTHON_API const char *GetWord_p_s(int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetWord_s(int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetWord(int globalWordIdx); - -// Get POS -LTP_DLL_FOR_PYTHON_API const char *GetPOS_p_s(int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetPOS_s(int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetPOS(int globalWordIdx); - -// Get NE -LTP_DLL_FOR_PYTHON_API const char *GetNE_p_s(int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetNE_s(int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API const char *GetNE(int globalWordIdx); - -// Get WSD -/* -LTP_DLL_FOR_PYTHON_API int GetWSD_p_s(const char **p_wsd, const char **p_explain, int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int GetWSD_s(const char **p_wsd, const char **p_explain, int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int GetWSD(const char **p_wsd, const char **p_explain, int globalWordIdx); -*/ - -// Get Parser -LTP_DLL_FOR_PYTHON_API int GetParse_p_s(int *p_parent, const char **p_relate, int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API 
int GetParse_s(int *p_parent, const char **p_relate, int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int GetParse(int *p_parent, const char **p_relate, int globalWordIdx); - -// Get words -LTP_DLL_FOR_PYTHON_API int GetWordsFromSentence_p(const char *word_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetWordsFromSentence(const char *word_arr[], int arr_size, int globalSentIdx); - -// Get POSs -LTP_DLL_FOR_PYTHON_API int GetPOSsFromSentence_p(const char *pos_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetPOSsFromSentence(const char *pos_arr[], int arr_size, int globalSentIdx); - -// Get NEs -LTP_DLL_FOR_PYTHON_API int GetNEsFromSentence_p(const char *ne_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetNEsFromSentence(const char *ne_arr[], int arr_size, int globalSentIdx); - -// Get WSDs -/* -LTP_DLL_FOR_PYTHON_API int GetWSDsFromSentence_p(const char *wsd_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetWSDsFromSentence(const char *wsd_arr[], int arr_size, int globalSentIdx); - -LTP_DLL_FOR_PYTHON_API int GetWSDExplainsFromSentence_p(const char *explain_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetWSDExplainsFromSentence(const char *explain_arr[], int arr_size, int globalSentIdx); -*/ - -// Get Parses -LTP_DLL_FOR_PYTHON_API int GetParsesFromSentence_p(int parent_arr[], const char *relate_arr[], int arr_size, int paragraphIdx, int sentenceIdx); -LTP_DLL_FOR_PYTHON_API int GetParsesFromSentence(int parent_arr[], const char *relate_arr[], int arr_size, int globalSentIdx); - -// Get SRL -LTP_DLL_FOR_PYTHON_API int CountPredArgToWord_p_s( int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int CountPredArgToWord_p( int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int CountPredArgToWord( int globalWordIdx); - -LTP_DLL_FOR_PYTHON_API int 
GetPredArgToWord_p_s(const char *type_arr[], int beg_arr[], int end_arr[], int arr_size, - int paragraphIdx, int sentenceIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int GetPredArgToWord_p( const char *type_arr[], int beg_arr[], int end_arr[], int arr_size, - int globalSentIdx, int wordIdx); -LTP_DLL_FOR_PYTHON_API int GetPredArgToWord( const char *type_arr[], int beg_arr[], int end_arr[], int arr_size, - int globalWordIdx); - -#endif \ No newline at end of file diff --git a/src/__util/MyLib.cpp b/src/__util/MyLib.cpp index dcf8e8dc3..ede8979cd 100644 --- a/src/__util/MyLib.cpp +++ b/src/__util/MyLib.cpp @@ -387,7 +387,7 @@ bool is_chinese_number(const string& str) { if (str == "һ" || str == "??" || str == "??" || str == "??" || str == "??" || str == "??" || str == "??" || str == "??" || str == "??" || str == "ʮ" || - str == "" || str == "??" || str == "??" || str == "??" || str == "??" || + str == "��" || str == "??" || str == "??" || str == "??" || str == "??" || str == "ǧ" || str == "??" 
|| str == "??") { return true; diff --git a/src/__xml4nlp/CMakeLists.txt b/src/__xml4nlp/CMakeLists.txt index b8d5ff8f8..4c74848fa 100644 --- a/src/__xml4nlp/CMakeLists.txt +++ b/src/__xml4nlp/CMakeLists.txt @@ -1,5 +1,4 @@ -INCLUDE_DIRECTORIES(./ - ${util_DIR} +INCLUDE_DIRECTORIES(${SOURCE_DIR} ${THIRDPARTY_DIR}/tinyxml) SET(xml4nlp_source diff --git a/src/__xml4nlp/Xml4nlp.cpp b/src/__xml4nlp/Xml4nlp.cpp index 5584cfa65..2661dd4e6 100644 --- a/src/__xml4nlp/Xml4nlp.cpp +++ b/src/__xml4nlp/Xml4nlp.cpp @@ -9,15 +9,15 @@ */ #include "Xml4nlp.h" -#include "MyLib.h" - -const char * const NOTE_SENT = "sent"; -const char * const NOTE_WORD = "word"; -const char * const NOTE_POS = "pos"; -const char * const NOTE_NE = "ne"; -const char * const NOTE_PARSER = "parser"; -const char * const NOTE_WSD = "wsd"; -const char * const NOTE_SRL = "srl"; +#include "__util/MyLib.h" + +const char * const NOTE_SENT = "sent"; +const char * const NOTE_WORD = "word"; +const char * const NOTE_POS = "pos"; +const char * const NOTE_NE = "ne"; +const char * const NOTE_PARSER = "parser"; +const char * const NOTE_WSD = "wsd"; +const char * const NOTE_SRL = "srl"; //const char * const NOTE_CLASS = "class"; //const char * const NOTE_SUM = "sum"; //const char * const NOTE_CR = "cr"; @@ -46,15 +46,15 @@ const char * const XML4NLP::TAG_END = "end"; const char * const XML4NLP::TAG_ID = "id"; XML4NLP::XML4NLP() { - m_document.documentPtr = NULL; - m_note.nodePtr = NULL; - m_summary.nodePtr = NULL; - m_textclass.nodePtr = NULL; - m_coref.nodePtr = NULL; + document.documentPtr = NULL; + note.nodePtr = NULL; + summary.nodePtr = NULL; + textclass.nodePtr = NULL; + coref.nodePtr = NULL; } XML4NLP::~XML4NLP() { - m_tiXmlDoc.Clear(); + m_tiXmlDoc.Clear(); } ///////////////////////////////////////////////////////////////////////////////////// @@ -62,29 +62,29 @@ XML4NLP::~XML4NLP() { /// the paragraphs are separated by CR ("\r\n") 
///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::CreateDOMFromFile(const char* fileName) { - ClearDOM(); + ClearDOM(); - if (0 != BuildDOMFrame()) return -1; + if (0 != BuildDOMFrame()) return -1; - ifstream in; - in.open(fileName); - if ( !in.is_open() ) { - cerr << "xml4nlp load file error: " << fileName << endl; - return -1; - } - - string line; - int i = 0; - while (getline(in, line)) { - clean_str(line); // Zhenghua Li, 2007-8-31, 15:57 - // remove_space_gbk(line); - if (line.empty()) { - continue; - } + ifstream in; + in.open(fileName); + if ( !in.is_open() ) { + cerr << "xml4nlp load file error: " << fileName << endl; + return -1; + } - if (0 != BuildParagraph(line, i++)) return -1; + string line; + int i = 0; + while (getline(in, line)) { + clean_str(line); // Zhenghua Li, 2007-8-31, 15:57 + // remove_space_gbk(line); + if (line.empty()) { + continue; } - return 0; + + if (0 != BuildParagraph(line, i++)) return -1; + } + return 0; } ///////////////////////////////////////////////////////////////////////////////////// @@ -92,39 +92,38 @@ int XML4NLP::CreateDOMFromFile(const char* fileName) { /// the paragraphs are separated by CR ("\r\n") ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::CreateDOMFromString(const string & str) { - ClearDOM(); + ClearDOM(); - if (0 != BuildDOMFrame()) return -1; + if (0 != BuildDOMFrame()) return -1; - string strTmp = str; - replace_char_by_char(strTmp, '\r', '\n'); + string strTmp = str; + replace_char_by_char(strTmp, '\r', '\n'); - // std::cout << strTmp << std::endl; - istringstream in(strTmp); // How to use istringstream? - string line; - int i = 0; - while (getline(in, strTmp)) { - clean_str(strTmp); + // std::cout << strTmp << std::endl; + istringstream in(strTmp); // How to use istringstream? 
+ int i = 0; + while (getline(in, strTmp)) { + clean_str(strTmp); - if (strTmp.empty()) { - continue; - } + if (strTmp.empty()) { + continue; + } - if (0 != BuildParagraph(strTmp, i++)) { - return -1; - } + if (0 != BuildParagraph(strTmp, i++)) { + return -1; } + } - return 0; + return 0; } void XML4NLP::ReportTiXmlDocErr() const { - cerr << "[XML4NLP ERROR REPORT]" << endl; - cerr << "description : " << m_tiXmlDoc.ErrorDesc() << endl; - cerr << "location : " << endl; - cerr << "row : " << m_tiXmlDoc.ErrorRow() << endl; - cerr << "col : " << m_tiXmlDoc.ErrorCol() << endl; - cerr << "=====================" << endl; + cerr << "[XML4NLP ERROR REPORT]" << endl; + cerr << "description : " << m_tiXmlDoc.ErrorDesc() << endl; + cerr << "location : " << endl; + cerr << "row : " << m_tiXmlDoc.ErrorRow() << endl; + cerr << "col : " << m_tiXmlDoc.ErrorCol() << endl; + cerr << "=====================" << endl; } ///////////////////////////////////////////////////////////////////////////////////// @@ -135,1468 +134,1117 @@ void XML4NLP::ReportTiXmlDocErr() const { /// note: the input file must be a XML file. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::LoadXMLFromFile(const char* fileName) { - ClearDOM(); + ClearDOM(); - if ( !m_tiXmlDoc.LoadFile(fileName) ) { - cerr << "load xml file error: " << fileName << endl; - ReportTiXmlDocErr(); - return -1; - } + if ( !m_tiXmlDoc.LoadFile(fileName) ) { + cerr << "load xml file error: " << fileName << endl; + ReportTiXmlDocErr(); + return -1; + } - return InitXmlStructure(); + return InitXmlStructure(); } ///////////////////////////////////////////////////////////////////////////////////// /// load a xml file from a string and parse it. 
///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::LoadXMLFromString(const char * str) { - ClearDOM(); - m_tiXmlDoc.Parse(str); + ClearDOM(); + m_tiXmlDoc.Parse(str); - if (m_tiXmlDoc.Error()) { - ReportTiXmlDocErr(); - return -1; - } + if (m_tiXmlDoc.Error()) { + ReportTiXmlDocErr(); + return -1; + } - if (-1 == InitXmlStructure()) { - return -1; - } + if (-1 == InitXmlStructure()) { + return -1; + } - if (!LTMLValidation()) { - // failed LTML Validation - return -1; - } + if (!LTMLValidation()) { + // failed LTML Validation + return -1; + } - return 0; + return 0; } int XML4NLP::LoadXMLFromString(const std::string & str) { - return LoadXMLFromString(str.c_str()); + return LoadXMLFromString(str.c_str()); } ///////////////////////////////////////////////////////////////////////////////////// /// clear the DOM tree, delete all nodes that allocated before. ///////////////////////////////////////////////////////////////////////////////////// void XML4NLP::ClearDOM() { - m_tiXmlDoc.Clear(); + m_tiXmlDoc.Clear(); - m_document.documentPtr = NULL; - m_document.paragraphs.clear(); - m_note.nodePtr = NULL; - m_summary.nodePtr = NULL; - m_textclass.nodePtr = NULL; - m_coref.nodePtr = NULL; - m_coref.vecEntity.clear(); + document.documentPtr = NULL; + document.paragraphs.clear(); + note.nodePtr = NULL; + summary.nodePtr = NULL; + textclass.nodePtr = NULL; + coref.nodePtr = NULL; + coref.vecEntity.clear(); - m_vecBegWordIdxOfStns.clear(); - m_vecBegStnsIdxOfPara.clear(); + m_vecBegWordIdxOfStns.clear(); + m_vecBegStnsIdxOfPara.clear(); } ///////////////////////////////////////////////////////////////////////////////////// /// save the DOM tree to a XML file. 
///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::SaveDOM(const char* fileName) { - if (!m_tiXmlDoc.SaveFile(fileName)) { - ReportTiXmlDocErr(); - return -1; - } + if (!m_tiXmlDoc.SaveFile(fileName)) { + ReportTiXmlDocErr(); + return -1; + } - return 0; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// save the DOM tree to a XML string. ///////////////////////////////////////////////////////////////////////////////////// void XML4NLP::SaveDOM(string &strDocument) const { - TiXmlPrinter printer; - m_tiXmlDoc.Accept(&printer); - strDocument = printer.CStr(); + TiXmlPrinter printer; + m_tiXmlDoc.Accept(&printer); + strDocument = printer.CStr(); } // ----------------------------------------------------------------some counting functions int XML4NLP::CountParagraphInDocument() const { - return m_document.paragraphs.size(); + return document.paragraphs.size(); } -int XML4NLP::CountSentenceInParagraph(int paragraphIdx) const { - if ( 0 != CheckRange(paragraphIdx) ) return 0; - return m_document.paragraphs[paragraphIdx].sentences.size(); +int XML4NLP::CountSentenceInParagraph(int pid) const { + if ( 0 != CheckRange(pid) ) return 0; + return document.paragraphs[pid].sentences.size(); } int XML4NLP::CountSentenceInDocument() const { - int stnsNumInDoc = 0; - int paragraphNum = m_document.paragraphs.size(); - for (int i = 0; i < paragraphNum; ++i) { - stnsNumInDoc += m_document.paragraphs[i].sentences.size(); - } - return stnsNumInDoc; + int stnsNumInDoc = 0; + int paragraphNum = document.paragraphs.size(); + for (int i = 0; i < paragraphNum; ++i) { + stnsNumInDoc += document.paragraphs[i].sentences.size(); + } + return stnsNumInDoc; } -int XML4NLP::CountWordInSentence(int paragraphIdx, int sentenceIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx) ) return 0; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words.size(); +int 
XML4NLP::CountWordInSentence(int pid, int sid) const { + if ( 0 != CheckRange(pid, sid) ) return 0; + return document.paragraphs[pid].sentences[sid].words.size(); } -int XML4NLP::CountWordInSentence(int sentenceIdx) const { - pair paraIdx_sentIdx; - if ( 0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx) ) return 0; - return m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words.size(); +int XML4NLP::CountWordInSentence(int global_sid) const { + int pid, sid; + if ( 0 != DecodeGlobalId(global_sid, pid, sid) ) return 0; + return document.paragraphs[pid].sentences[sid].words.size(); } -int XML4NLP::CountWordInParagraph(int paragraphIdx) const { - if ( 0 != CheckRange(paragraphIdx) ) return -1; - int totalWordNum = 0; - int sentNum = m_document.paragraphs[paragraphIdx].sentences.size(); - for (int i=0; i < sentNum; ++i) { - totalWordNum += m_document.paragraphs[paragraphIdx].sentences[i].words.size(); - } - return totalWordNum; +int XML4NLP::CountWordInParagraph(int pid) const { + if ( 0 != CheckRange(pid) ) return -1; + int nr_words = 0; + int nr_sents = document.paragraphs[pid].sentences.size(); + + for (int i = 0; i < nr_sents; ++ i) { + nr_words += document.paragraphs[pid].sentences[i].words.size(); + } + return nr_words; } int XML4NLP::CountWordInDocument() const { - int totalWordNum = 0; - int paraNum = m_document.paragraphs.size(); - for (int i=0; iGetText(); + TiXmlElement *paraPtr = document.paragraphs[pid].paragraphPtr; + return paraPtr->GetText(); } -int XML4NLP::GetParagraph(int paragraphIdx, string &strParagraph) const { - if (0 != CheckRange(paragraphIdx)) { - return -1; - } +int XML4NLP::GetParagraph(int pid, string & str) const { + if (0 != CheckRange(pid)) { return -1; } - const Paragraph_t ¶graph = m_document.paragraphs[paragraphIdx]; + const Paragraph ¶graph = document.paragraphs[pid]; - if (paragraph.sentences.empty()) { - strParagraph = paragraph.paragraphPtr->GetText() ; - } else { - strParagraph = 
""; - const vector &sentences = paragraph.sentences; - for (int i=0; iAttribute(TAG_CONT); - } + if (paragraph.sentences.empty()) { + str = paragraph.paragraphPtr->GetText() ; + } else { + str = ""; + const vector &sentences = paragraph.sentences; + for (int i=0; iAttribute(TAG_CONT); } + } - return 0; -} - -const char* XML4NLP::GetSentence(int paragraphIdx, int sentenceIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].sentencePtr->Attribute(TAG_CONT); -} - -const char* XML4NLP::GetSentence(int sentenceIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return NULL; - return GetSentence(paraIdx_sentIdx.first, paraIdx_sentIdx.second); -} - -const char* XML4NLP::GetWord(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx) ) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_CONT); + return 0; } -const char* XML4NLP::GetWord(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetWord(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); -} - -const char* XML4NLP::GetWord(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetWord(paraIdx, sentIdx, wordIdx); -} - -const char *XML4NLP::GetPOS(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_POS); -} - -const char *XML4NLP::GetPOS(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != 
MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetPOS(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); -} - -const char *XML4NLP::GetPOS(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetPOS(paraIdx, sentIdx, wordIdx); -} +#define EXTEND_FUNCTION(return_type, function_name) \ + return_type function_name (int global_sid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return NULL; } \ + return (function_name)(pid, sid); \ + } -const char *XML4NLP::GetNE(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx) ) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_NE); +const char* XML4NLP::GetSentence(int pid, int sid) const { + if (0 != CheckRange(pid, sid)) return NULL; + return document.paragraphs[pid].sentences[sid].sentencePtr->Attribute(TAG_CONT); } -const char *XML4NLP::GetNE(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetNE(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); -} +EXTEND_FUNCTION(const char *, XML4NLP::GetSentence) -const char *XML4NLP::GetNE(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetNE(paraIdx, sentIdx, wordIdx); +#define EXTEND_FUNCTION2(return_type, function_name, tag_name, failed_return) \ +return_type function_name (int pid, int sid, int wid) const { \ + if (0 != CheckRange(pid, sid, wid)) { return failed_return; } \ + return document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(tag_name); \ +} \ +\ +return_type function_name (int 
global_sid, int wid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return failed_return; } \ + return function_name (pid, sid, wid); \ +} \ +\ +return_type function_name (int global_wid) const { \ + int pid, sid, wid; \ + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) { return failed_return; } \ + return function_name (pid, sid, wid); \ } +EXTEND_FUNCTION2 (const char *, XML4NLP::GetWord, TAG_CONT, NULL) +EXTEND_FUNCTION2 (const char *, XML4NLP::GetPOS, TAG_POS, NULL) +EXTEND_FUNCTION2 (const char *, XML4NLP::GetNE, TAG_NE, NULL) -int XML4NLP::GetWSD(pair &WSD_explanation, - int paragraphIdx, - int sentenceIdx, - int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - WSD_explanation.first = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_WSD); - WSD_explanation.second = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_WSD_EXP); +int XML4NLP::GetWSD(WSDResult & explanation, int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; + explanation.first = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_WSD); + explanation.second = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_WSD_EXP); return 0; } -int XML4NLP::GetWSD(pair & WSD_explanation, - int globalSentIdx, - int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - return GetWSD(WSD_explanation, paraIdx_sentIdx.first, paraIdx_sentIdx.second); -} - -int XML4NLP::GetWSD(pair & WSD_explanation, - int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - return GetWSD(WSD_explanation, paraIdx, sentIdx, wordIdx); -} - -int XML4NLP::GetParse(pair & parent_relation, - int paragraphIdx, 
- int sentenceIdx, - int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - const char *cszParent = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_PSR_PARENT); - parent_relation.first = (cszParent == NULL ? 0 : atoi(cszParent)); - parent_relation.second = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_PSR_RELATE); - return 0; -} - -int XML4NLP::GetParse(pair & parent_relation, - int globalSentIdx, - int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - return GetParse(parent_relation, paraIdx_sentIdx.first, paraIdx_sentIdx.second); -} - -int XML4NLP::GetParse(pair &parent_relation, - int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - return GetParse(parent_relation, paraIdx, sentIdx, wordIdx); -} - -int XML4NLP::MapGlobalSentIdx2paraIdx_sentIdx(int sentenceIdx, - pair & paraIdx_sentIdx) const { - int startStnsIdxOfPara = 0; - for (int paraIdx=0; paraIdx < m_document.paragraphs.size(); ++paraIdx) { - if (startStnsIdxOfPara + m_document.paragraphs[paraIdx].sentences.size() > sentenceIdx) { - paraIdx_sentIdx.first = paraIdx; - paraIdx_sentIdx.second = sentenceIdx - startStnsIdxOfPara; - return 0; - } - startStnsIdxOfPara += m_document.paragraphs[paraIdx].sentences.size(); +int XML4NLP::GetParse(ParseResult & relation, int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; + const char * head = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_PSR_PARENT); + relation.first = (head == NULL ? 
0 : atoi(head)); + relation.second = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_PSR_RELATE); + return 0; +} + +#define EXTEND_FUNCTION3(return_type, function_name, output_type, failed_return) \ + return_type function_name (output_type & output, int global_sid, int wid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return failed_return; } \ + return function_name(output, pid, sid, wid); \ + }\ +\ + return_type function_name (output_type & output, int global_wid) const { \ + int pid, wid, sid; \ + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) { return failed_return; } \ + return function_name(output, pid, sid, wid); \ + } + +EXTEND_FUNCTION3 (int, XML4NLP::GetWSD, WSDResult, -1) +EXTEND_FUNCTION3 (int, XML4NLP::GetParse, ParseResult, -1) + +int XML4NLP::DecodeGlobalId(int global_sid, int & pid, int & sid) const { + int startStnsIdxOfPara = 0; + for (pid = 0; pid < document.paragraphs.size(); ++ pid) { + int len = document.paragraphs[pid].sentences.size(); + if (startStnsIdxOfPara + len > global_sid) { + sid = global_sid - startStnsIdxOfPara; + return 0; + } + startStnsIdxOfPara += len; + } + return -1; +} + +int XML4NLP::DecodeGlobalId(int global_wid, int & pid, int & sid, int & wid) const { + int startWordIdxOfStns = 0; + for (pid = 0; pid < document.paragraphs.size(); ++ pid) { + const vector &sentences = document.paragraphs[pid].sentences; + for (sid = 0; sid < sentences.size(); ++ sid) { + if (startWordIdxOfStns + sentences[sid].words.size() > global_wid) { + wid = global_wid - startWordIdxOfStns; + return 0; + } + startWordIdxOfStns += sentences[sid].words.size(); } - return -1; + } + return -1; } -int XML4NLP::MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(int globalWordIdx, - int & paraIdx, - int & sentIdx, - int & wordIdx) const { - int startWordIdxOfStns = 0; - for (paraIdx=0; paraIdx < m_document.paragraphs.size(); ++paraIdx) { - const vector &sentences = 
m_document.paragraphs[paraIdx].sentences; - for (sentIdx=0; sentIdx < sentences.size(); ++sentIdx) { - if (startWordIdxOfStns + sentences[sentIdx].words.size() > globalWordIdx) { - wordIdx = globalWordIdx - startWordIdxOfStns; - return 0; - } - startWordIdxOfStns += sentences[sentIdx].words.size(); - } - } +int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, + int paragraphIdx) const { + if (0 != CheckRange(paragraphIdx)) return -1; + if (document.paragraphs[paragraphIdx].sentences.empty()) { return -1; -} - -int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, - int paragraphIdx) const { - if (0 != CheckRange(paragraphIdx)) return -1; - if (m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + } - const vector & sentences = m_document.paragraphs[paragraphIdx].sentences; - if (vecSentence.size() != sentences.size()) { - return -1; - } + const vector & sentences = document.paragraphs[paragraphIdx].sentences; + if (vecSentence.size() != sentences.size()) { + return -1; + } - for (int i=0; i < sentences.size(); ++i) { - vecSentence[i] = sentences[i].sentencePtr->Attribute(TAG_CONT); - } + for (int i=0; i < sentences.size(); ++i) { + vecSentence[i] = sentences[i].sentencePtr->Attribute(TAG_CONT); + } - return 0; + return 0; } -int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, - int paragraphIdx) const { - if (0 != CheckRange(paragraphIdx)) return -1; +int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, + int paragraphIdx) const { + if (0 != CheckRange(paragraphIdx)) return -1; - if (m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + if (document.paragraphs[paragraphIdx].sentences.empty()) { + return -1; + } - vecSentence.clear(); - const vector &sentences = m_document.paragraphs[paragraphIdx].sentences; - for (int i = 0; i < sentences.size(); ++ i) { - vecSentence.push_back( sentences[i].sentencePtr->Attribute(TAG_CONT) ); - } - return 0; + vecSentence.clear(); + const vector &sentences = 
document.paragraphs[paragraphIdx].sentences; + for (int i = 0; i < sentences.size(); ++ i) { + vecSentence.push_back( sentences[i].sentencePtr->Attribute(TAG_CONT) ); + } + return 0; } int XML4NLP::SetSentencesToParagraph(const vector &vecSentence, int paragraphIdx) { - if (0 != CheckRange(paragraphIdx)) { - return -1; - } - - if (!m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + if (0 != CheckRange(paragraphIdx)) { + return -1; + } - Paragraph_t & paragraph = m_document.paragraphs[paragraphIdx]; - TiXmlElement * paragraphPtr = paragraph.paragraphPtr; - vector &sentences = paragraph.sentences; + if (!document.paragraphs[paragraphIdx].sentences.empty()) { + return -1; + } - TiXmlText *textPtr = paragraphPtr->FirstChild()->ToText(); - if (textPtr == NULL) { - return -1; - } else { - paragraphPtr->RemoveChild(textPtr); - } + Paragraph & paragraph = document.paragraphs[paragraphIdx]; + TiXmlElement * paragraphPtr = paragraph.paragraphPtr; + vector &sentences = paragraph.sentences; - for (int i = 0; i < vecSentence.size(); ++i) { - TiXmlElement *sentencePtr = new TiXmlElement(TAG_SENT); - sentencePtr->SetAttribute(TAG_ID, static_cast(i)); - sentencePtr->SetAttribute(TAG_CONT, vecSentence[i].c_str()); - paragraphPtr->LinkEndChild(sentencePtr); + TiXmlText *textPtr = paragraphPtr->FirstChild()->ToText(); + if (textPtr == NULL) { + return -1; + } else { + paragraphPtr->RemoveChild(textPtr); + } + + for (int i = 0; i < vecSentence.size(); ++i) { + TiXmlElement *sentencePtr = new TiXmlElement(TAG_SENT); + sentencePtr->SetAttribute(TAG_ID, static_cast(i)); + sentencePtr->SetAttribute(TAG_CONT, vecSentence[i].c_str()); + paragraphPtr->LinkEndChild(sentencePtr); + + sentences.push_back( Sentence() ); + sentences[sentences.size()-1].sentencePtr = sentencePtr; + } + + return 0; +} + +#define EXTEND_FUNCTION4(return_type, function_name, tag_name) \ + return_type function_name (std::vector & output, int pid, int sid) const { \ + return 
GetInfoFromSentence(output, pid, sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int pid, int sid) const { \ + return GetInfoFromSentence(output, pid, sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int global_sid) const { \ + return GetInfoFromSentence(output, global_sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int global_sid) const { \ + return GetInfoFromSentence(output, global_sid, tag_name); \ + } + +EXTEND_FUNCTION4 (int, XML4NLP::GetWordsFromSentence, TAG_CONT); +EXTEND_FUNCTION4 (int, XML4NLP::GetPOSsFromSentence, TAG_POS); +EXTEND_FUNCTION4 (int, XML4NLP::GetNEsFromSentence, TAG_NE); + +int XML4NLP::SetWordsToSentence(const std::vector & input, + int pid, + int sid) { + if (0 != CheckRange(pid, sid)) return -1; + + Sentence &sentence = document.paragraphs[pid].sentences[sid]; + if (!sentence.words.empty()) { + return -1; + } - sentences.push_back( Sentence_t() ); - sentences[sentences.size()-1].sentencePtr = sentencePtr; - } + for (int i = 0; i < input.size(); ++ i) { + TiXmlElement *wordPtr = new TiXmlElement(TAG_WORD); + wordPtr->SetAttribute(TAG_ID, i); + wordPtr->SetAttribute(TAG_CONT, input[i].c_str()); + sentence.sentencePtr->LinkEndChild(wordPtr); - return 0; + sentence.words.push_back( Word() ); + sentence.words[sentence.words.size() - 1].wordPtr = wordPtr; + } + return 0; } -int XML4NLP::GetWordsFromSentence(vector & vecWord, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecWord, paragraphIdx, sentenceIdx, TAG_CONT); +int XML4NLP::SetWordsToSentence(const std::vector & input, + int global_sid) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return -1; } + SetWordsToSentence(input, pid, sid); + return 0; } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecWord, paragraphIdx, sentenceIdx, TAG_CONT); +int 
XML4NLP::SetPOSsToSentence(const std::vector & input, + int pid, int sid) { + return SetInfoToSentence(input, pid, sid, TAG_POS); } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int globalSentIdx) const { - return GetInfoFromSentence(vecWord, globalSentIdx, TAG_CONT); +int XML4NLP::SetPOSsToSentence(const std::vector & input, + int global_sid) { + return SetInfoToSentence(input, global_sid, TAG_POS); } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int globalSentIdx) const { - return GetInfoFromSentence(vecWord, globalSentIdx, TAG_CONT); +int XML4NLP::SetNEsToSentence(const std::vector & input, + int pid, int sid) { + return SetInfoToSentence(input, pid, sid, TAG_NE); } -int XML4NLP::SetWordsToSentence(const vector & vecWord, - int paragraphIdx, - int sentenceIdx) { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - Sentence_t &sentence = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx]; - if (!sentence.words.empty()) { - return -1; - } - - for (int i = 0; i < vecWord.size(); ++ i) { - TiXmlElement *wordPtr = new TiXmlElement(TAG_WORD); - wordPtr->SetAttribute(TAG_ID, i); - wordPtr->SetAttribute(TAG_CONT, vecWord[i].c_str()); - sentence.sentencePtr->LinkEndChild(wordPtr); - - sentence.words.push_back( Word_t() ); - sentence.words[sentence.words.size() - 1].wordPtr = wordPtr; - } - return 0; +int XML4NLP::SetNEsToSentence(const std::vector & input, + int global_sid) { + return SetInfoToSentence(input, global_sid, TAG_NE); } -int XML4NLP::SetWordsToSentence(const vector & vecWord, - int sentenceIdx) { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) { - return -1; - } - SetWordsToSentence(vecWord, paraIdx_sentIdx.first, paraIdx_sentIdx.second); - return 0; -} +int XML4NLP::GetParsesFromSentence(std::vector< ParseResult > &relation, + int pid, int sid) const { + std::vector heads; + std::vector deprels; -int XML4NLP::GetPOSsFromSentence(std::vector & vecPOS, - int 
paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); -} + int nr_words = CountWordInSentence(pid, sid); + relation.resize(nr_words); -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int globalSentIdx) const { - return GetInfoFromSentence(vecPOS, globalSentIdx, TAG_POS); -} + if (0 != GetInfoFromSentence(heads, pid, sid, TAG_PSR_PARENT)) { + return -1; + } -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); -} + if (0 != GetInfoFromSentence(deprels, pid, sid, TAG_PSR_RELATE)) { + return -1; + } -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int globalSentIdx) const { - return GetInfoFromSentence(vecPOS, globalSentIdx, TAG_POS); -} + for (int i = 0; i < nr_words; ++ i) { + relation[i].first = atoi( heads[i] ); + relation[i].second = deprels[i]; + } -int XML4NLP::SetPOSsToSentence(const std::vector &vecPOS, - int paragraphIdx, - int sentenceIdx) { - return SetInfoToSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); + return 0; } -int XML4NLP::SetPOSsToSentence(const std::vector & vecPOS, - int sentenceIdx) { - return SetInfoToSentence(vecPOS, sentenceIdx, TAG_POS); -} +int XML4NLP::GetParsesFromSentence(std::vector< ParseResult > & relation, + int global_sid) const { + std::vector heads; + std::vector deprels; -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + int nr_words = CountWordInSentence(global_sid); + relation.resize(nr_words); -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int globalSentIdx) const { - return GetInfoFromSentence(vecNE, globalSentIdx, TAG_NE); -} + heads.resize(nr_words); + deprels.resize(nr_words); -int XML4NLP::GetNEsFromSentence(std::vector &vecNE, - int paragraphIdx, - int sentenceIdx) const { - return 
GetInfoFromSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + if (0 != GetInfoFromSentence(heads, global_sid, TAG_PSR_PARENT)) { + return -1; + } -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int globalSentIdx) const { - return GetInfoFromSentence(vecNE, globalSentIdx, TAG_NE); -} + if (0 != GetInfoFromSentence(deprels, global_sid, TAG_PSR_RELATE)) { + return -1; + } -int XML4NLP::SetNEsToSentence(const std::vector & vecNE, - int paragraphIdx, - int sentenceIdx) { - return SetInfoToSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + for (int i = 0; i < nr_words; ++ i) { + relation[i].first = atoi( heads[i] ); + relation[i].second = deprels[i]; + } -int XML4NLP::SetNEsToSentence(const std::vector & vecNE, - int sentenceIdx) { - return SetInfoToSentence(vecNE, sentenceIdx, TAG_NE); + return 0; } -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - int wordNum = CountWordInSentence(paragraphIdx, sentenceIdx); - if (wordNum != vecParse.size()) { - cerr << "vecParse.size() does not equal to the word num in the sentence, should resize first" << endl; - return -1; - } - - // vecParent.resize(wordNum); - // vecRelate.resize(wordNum); - if (0 != GetInfoFromSentence(vecParent, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - - if (0 != GetInfoFromSentence(vecRelate, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } - for (int i=0; i < vecParent.size(); ++ i) { - vecParse[i].first = atoi( vecParent[i] ); - vecParse[i].second = vecRelate[i]; - } +int XML4NLP::GetParsesFromSentence(std::vector< std::pair > & relation, + int pid, + int sid) const { + std::vector< ParseResult > parse; + if (0 != GetParsesFromSentence(parse, pid, sid)) { + return -1; + } - return 0; + relation.resize( parse.size() ); + for (int i = 0; i < parse.size(); ++ i) { + relation[i].first = parse[i].first; + relation[i].second = parse[i].second; + } + return 0; 
} -int XML4NLP::GetParsesFromSentence(vector< pair > & vecParse, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - int wordNum = CountWordInSentence(sentenceIdx); - if (wordNum != vecParse.size()) { - cerr << "vecParse.size() does not equal to the word num in the sentence, should resize first" << endl; - return -1; - } - - vecParent.resize(wordNum); - vecRelate.resize(wordNum); - if (0 != GetInfoFromSentence(vecParent, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - - if (0 != GetInfoFromSentence(vecRelate, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } +int XML4NLP::GetParsesFromSentence(std::vector< std::pair > & relation, + int global_sid) const { + std::vector< ParseResult > parse; + if (0 != GetParsesFromSentence(parse, global_sid)) { + return -1; + } - for (int i=0; i < vecParent.size(); ++i) { - vecParse[i].first = atoi( vecParent[i] ); - vecParse[i].second = vecRelate[i]; - } + relation.resize( parse.size() ); + for (int i = 0; i < parse.size(); ++ i) { + relation[i].first = parse[i].first; + relation[i].second = parse[i].second; + } - return 0; + return 0; } +int XML4NLP::SetParsesToSentence(const std::vector< std::pair > & relation, + int pid, int sid) { + if (0 != CheckRange(pid, sid)) return -1; -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - if (0 != GetInfoFromSentence(vecParent, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - if (0 != GetInfoFromSentence(vecRelate, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } - - vecParse.clear(); - // Assume their sizes of the two vector are equal. Is it OK? 
- for (int i=0; i < vecParent.size(); ++i) { - int parentIdx = atoi( vecParent[i].c_str() ); - vecParse.push_back( make_pair(static_cast(parentIdx), vecRelate[i]) ); - } - - return 0; -} + std::vector & words = document.paragraphs[pid].sentences[sid].words; -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - if (0 != GetInfoFromSentence(vecParent, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - if (0 != GetInfoFromSentence(vecRelate, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } + if (words.size() != relation.size()) { + std::cerr << "word number does not equal to vecInfo's size in paragraph" + << pid + << " sentence " + << sid << std::endl; + return -1; + } + + if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) { + std::cerr << "Attribute \"" + << TAG_PSR_PARENT + << "\" already exists in paragraph" + << pid + << " sentence " + << sid << std::endl; + return -1; + } + + if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) { + std::cerr << "Attribute \"" + << TAG_PSR_RELATE + << "\" already exists in paragraph" + << pid + << " sentence " + << sid << endl; + return -1; + } - vecParse.clear(); - // Assume their sizes of the two vector are equal. Is it OK? 
- for (int i=0; i < vecParent.size(); ++i) { - int parentIdx = atoi( vecParent[i].c_str() ); - vecParse.push_back( make_pair(static_cast(parentIdx), vecRelate[i]) ); - } + for (int i = 0; i < words.size(); ++ i) { + words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, relation[i].first); + words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, relation[i].second.c_str()); + } - return 0; + return 0; } -int XML4NLP::SetParsesToSentence(const vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; - if (words.size() != vecParse.size()) { - cerr << "word number does not equal to vecInfo's size in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) { - cerr << "Attribute \"" << TAG_PSR_PARENT << "\" already exists in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) { - cerr << "Attribute \"" << TAG_PSR_RELATE << "\" already exists in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - for (int i = 0; i < words.size(); ++i) { - words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, vecParse[i].first); - words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, vecParse[i].second.c_str()); - } - - return 0; +int XML4NLP::SetParsesToSentence(const std::vector< std::pair > & relation, + int global_sid) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetParsesToSentence(relation, pid, sid); } -int XML4NLP::SetParsesToSentence(const vector< pair > &vecParse, int sentenceIdx) { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; 
- if (words.size() != vecParse.size()) { - cerr << "word number does not equal to vecInfo's size in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) - { - cerr << "Attribute \"" << TAG_PSR_PARENT << "\" already exists in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) - { - cerr << "Attribute \"" << TAG_PSR_RELATE << "\" already exists in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, vecParse[i].first); - words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, vecParse[i].second.c_str()); - } - return 0; +int XML4NLP::SetParsesToSentence(const std::vector & heads, + const std::vector & deprels, + int pid, + int sid) { + if (0 != SetInfoToSentence(heads, pid, sid, TAG_PSR_PARENT)) return -1; + if (0 != SetInfoToSentence(deprels, pid, sid, TAG_PSR_RELATE)) return -1; + return 0; } -int XML4NLP::SetParsesToSentence(const vector &vecHead, const vector &vecRel, int paragraphIdx, int sentenceIdx) -{ - if (0 != SetInfoToSentence(vecHead, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) return -1; - if (0 != SetInfoToSentence(vecRel, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) return -1; - return 0; -} +int XML4NLP::SetParsesToSentence(const std::vector & heads, + const std::vector & deprels, + int global_sid) { + // decreasing vecHead index + std::vector d_heads; + for (int i = 0; i < heads.size(); ++ i) { + d_heads.push_back( heads[i] - 1 ); + // std::cout << d_heads[i] << " " << deprels[i] << std::endl; + } -int XML4NLP::SetParsesToSentence(const vector &vecHead, const vector &vecRel, int sentenceIdx) -{ - // decreasing vecHead index - vector d_vecHead; - for (int i = 0; i < vecHead.size(); i++) - { - 
d_vecHead.push_back(vecHead[i] - 1); - } + if (0 != SetInfoToSentence(d_heads, global_sid, TAG_PSR_PARENT)) return -1; + if (0 != SetInfoToSentence(deprels, global_sid, TAG_PSR_RELATE)) return -1; - if (0 != SetInfoToSentence(d_vecHead, sentenceIdx, TAG_PSR_PARENT)) return -1; - if (0 != SetInfoToSentence(vecRel, sentenceIdx, TAG_PSR_RELATE)) return -1; - return 0; + // std::string buffer; + // SaveDOM(buffer); + // std::cout << buffer << std::endl; + return 0; } -// ----------------------------------------------------------------for text summarization -const char* XML4NLP::GetTextSummary() const -{ - if (m_summary.nodePtr != NULL) - { - return m_summary.nodePtr->GetText(); - } - else - { - cerr << "have not done text summary." << endl; - return NULL; - } +const char * XML4NLP::GetTextSummary() const { + if (summary.nodePtr != NULL) { + return summary.nodePtr->GetText(); + } else { + std::cerr << "have not done text summary." << std::endl; + return NULL; + } } -int XML4NLP::SetTextSummary(const char* cszTextSum) -{ - if (m_summary.nodePtr != NULL) - { - cerr << "has done text summary" << endl; - return -1; - } +int XML4NLP::SetTextSummary(const char* cszTextSum) { + if (summary.nodePtr != NULL) { + std::cerr << "has done text summary" << std::endl; + return -1; + } - m_summary.nodePtr = new TiXmlElement(TAG_SUM); - m_tiXmlDoc.RootElement()->LinkEndChild(m_summary.nodePtr); - TiXmlText *textPtr = new TiXmlText(cszTextSum); - m_summary.nodePtr->LinkEndChild(textPtr); + summary.nodePtr = new TiXmlElement(TAG_SUM); + m_tiXmlDoc.RootElement()->LinkEndChild(summary.nodePtr); + TiXmlText * textPtr = new TiXmlText(cszTextSum); + summary.nodePtr->LinkEndChild(textPtr); - return 0; + return 0; } -// ----------------------------------------------------------------for text classification -const char* XML4NLP::GetTextClass() const -{ - if (m_textclass.nodePtr != NULL) - { - return m_textclass.nodePtr->GetText(); - } - else - { - cerr << "have not done text class." 
<< endl; - return NULL; - } +const char * XML4NLP::GetTextClass() const { + if (textclass.nodePtr != NULL) { + return textclass.nodePtr->GetText(); + } else { + cerr << "have not done text class." << endl; + return NULL; + } } -int XML4NLP::SetTextClass(const char* cszTextClass) -{ - if (m_textclass.nodePtr != NULL) - { - cerr << "has done text classify" << endl; - return -1; - } +int XML4NLP::SetTextClass(const char* cszTextClass) { + if (textclass.nodePtr != NULL) { + cerr << "has done text classify" << endl; + return -1; + } - m_textclass.nodePtr = new TiXmlElement(TAG_CLASS); - m_tiXmlDoc.RootElement()->LinkEndChild(m_textclass.nodePtr); - TiXmlText *textPtr = new TiXmlText(cszTextClass); - m_textclass.nodePtr->LinkEndChild(textPtr); - return 0; + textclass.nodePtr = new TiXmlElement(TAG_CLASS); + m_tiXmlDoc.RootElement()->LinkEndChild(textclass.nodePtr); + TiXmlText *textPtr = new TiXmlText(cszTextClass); + textclass.nodePtr->LinkEndChild(textPtr); + return 0; } // ----------------------------------------------------------------for SRL -int XML4NLP::CountPredArgToWord(int paragraphIdx, int sentenceIdx, int wordIdx) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } - - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); +int XML4NLP::CountPredArgToWord(int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; - return counter; -} + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + TiXmlElement *argPtr = 
wordPtr->FirstChildElement(TAG_SRL_ARG); -int XML4NLP::CountPredArgToWord(int globalSentIdx, int wordIdx) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } - - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); - - return counter; -} + if (argPtr == NULL) { + return 0; + } -int XML4NLP::CountPredArgToWord(int globalWordIdx) const -{ - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx].sentences[sentIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } + int nr_args = 0; - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); + do { + ++ nr_args; + argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); + } while (argPtr != NULL); - return counter; + return nr_args; } - -int XML4NLP::GetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - TiXmlElement 
*argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; - } - - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." << endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } - - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType[i] = cszType; - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } - - return -1; - } - - return 0; +int XML4NLP::CountPredArgToWord(int global_sid, int wid) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return CountPredArgToWord(pid, sid, wid); } -int XML4NLP::GetPredArgToWord( int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << 
paraIdx_sentIdx.first << endl; - return -1; - } - - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." << endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } - - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType[i] = cszType; - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - //vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } - - return -1; - } - - return 0; +int XML4NLP::CountPredArgToWord(int global_wid) const { + int pid, sid, wid; + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) return -1; + return CountPredArgToWord(pid, sid, wid); } -int XML4NLP::GetPredArgToWord( int globalWordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx].sentences[sentIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx << " of paragraph " << sentIdx << endl; - return -1; - } - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." 
<< endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } +int XML4NLP::GetPredArgToWord(int pid, + int sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + if (0 != CheckRange(pid, sid, wid)) return -1; - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecType[i] = cszType; - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - return -1; - } + if (argPtr == NULL) { + std::cerr << "\"" + << TAG_SRL_ARG + << "\" does not exists in word " + << wid + << " of sentence " + << sid + << " of paragraph " + << pid << std::endl; + return -1; + } - return 0; -} + if (role.size() != range.size()) { + std::cerr << "role's size() != range.size(), should resize() first." 
<< std::endl; + return -1; + } -int XML4NLP::GetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - vecType.clear(); - vecBegEnd.clear(); - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; + if (role.empty()) { + cerr << "role is empty" << endl; + return -1; + } + + int i = 0; + + do { + const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); + const char *cszBeg = argPtr->Attribute(TAG_BEGIN); + const char *cszEnd = argPtr->Attribute(TAG_END); + role[i] = cszType; + int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); + int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); + range[i].first = uiBeg; + range[i].second = uiEnd; + + argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); + ++i; + } while (argPtr != NULL && i < role.size()); + + if ( ! (argPtr == NULL && i == role.size()) ) { + if (argPtr == NULL) { + cerr << "role.size() is too large" << endl; + } else { + cerr << "role.size() is too small" << endl; } - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType.push_back(cszType != NULL ? cszType : "" ); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? 
atoi(cszEnd) : 0); - vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); + return -1; + } + + return 0; +} + +int XML4NLP::GetPredArgToWord(int global_sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + +int XML4NLP::GetPredArgToWord(int global_wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid, wid; + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + +int XML4NLP::GetPredArgToWord(int pid, + int sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + std::vector role2; + int ret = GetPredArgToWord(pid, sid, wid, role2, range); + if (0 != ret) { return ret; } + + role.resize(role2.size()); + for (int i = 0; i < role2.size(); ++ i) { role[i] = role2[i]; } + return 0; +} + +int XML4NLP::GetPredArgToWord(int global_sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + + +int XML4NLP::SetPredArgToWord(int pid, + int sid, + int wid, + const std::vector & role, + const std::vector< std::pair > & range) { + if (0 != CheckRange(pid, sid, wid)) return -1; + + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + + if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) { + std::cerr << "\"" + << TAG_SRL_ARG + << "\" already exists in word " + << wid + << " of sentence " + << sid + << " of paragraph " + << pid << std::endl; + return -1; + } - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); + for (int i = 0; i < role.size(); ++ i) { + TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); + argPtr->SetAttribute(TAG_ID, i); + argPtr->SetAttribute(TAG_SRL_TYPE, 
role[i].c_str()); + argPtr->SetAttribute(TAG_BEGIN, range[i].first); + argPtr->SetAttribute(TAG_END, range[i].second); + wordPtr->LinkEndChild(argPtr); + } - return 0; + return 0; } -int XML4NLP::GetPredArgToWord( int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - - vecType.clear(); - vecBegEnd.clear(); - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << paraIdx_sentIdx.first << endl; - return -1; - } - - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType.push_back(cszType != NULL ? cszType : "" ); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? 
atoi(cszEnd) : 0); - vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); - return 0; +int XML4NLP::SetPredArgToWord(int global_sid, + int wid, + const std::vector & role, + const std::vector< std::pair > & range) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetPredArgToWord(pid, sid, wid, role, range); } +int XML4NLP::GetMentionOfEntity(std::vector< std::pair > &mention, + int entityIdx) const { + if (entityIdx >= coref.vecEntity.size()) { + cerr << "entity idx is too large" << endl; + return -1; + } -int XML4NLP::SetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - const vector &vecType, const vector< pair > &vecBegEnd) -{ - - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" already exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; - } + const vector &mentionRef = coref.vecEntity[entityIdx].vecMention; + if (mention.size() != mentionRef.size()) { + std::cerr << "mention.size() does not equal to the num of mention," + << " should resize() first" + << std::endl; + return -1; + } - for (int i = 0; i < vecType.size(); ++i) - { - TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); - argPtr->SetAttribute(TAG_ID, i); - argPtr->SetAttribute(TAG_SRL_TYPE, vecType[i].c_str()); - argPtr->SetAttribute(TAG_BEGIN, vecBegEnd[i].first); - argPtr->SetAttribute(TAG_END, vecBegEnd[i].second); - wordPtr->LinkEndChild(argPtr); + for (int i=0; i < mentionRef.size(); ++i) { + const char *cszBeg = mentionRef[i].mentionPtr->Attribute(TAG_BEGIN); + const char *cszEnd = mentionRef[i].mentionPtr->Attribute(TAG_END); + if (cszBeg == NULL || cszEnd == NULL) { + 
std::cerr << "mention attribute err in DOM" << std::endl; + return -1; } - return 0; + mention[i].first = atoi(cszBeg); + mention[i].second = atoi(cszEnd); + } + return 0; } -int XML4NLP::SetPredArgToWord( int sentenceIdx, int wordIdx, - const vector &vecType, const vector< pair > &vecBegEnd) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; +int XML4NLP::GetCoreference(vector< vector< pair > > &vecCoref) const { + if (coref.nodePtr == NULL) { + cerr << "has not done coreference" << endl; + return -1; + } + vecCoref.clear(); + TiXmlElement *crPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); - if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" already exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << paraIdx_sentIdx.first << endl; - return -1; - } + for (; crPtr != NULL; crPtr = crPtr->NextSiblingElement(TAG_COREF_CR)) { + vecCoref.push_back( vector< pair >() ); + vector< pair > &vecRef = vecCoref[vecCoref.size()-1]; + TiXmlElement *mentPtr = crPtr->FirstChildElement(TAG_COREF_MENT); - for (int i = 0; i < vecType.size(); ++i) - { - TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); - argPtr->SetAttribute(TAG_ID, i); - argPtr->SetAttribute(TAG_SRL_TYPE, vecType[i].c_str()); - argPtr->SetAttribute(TAG_BEGIN, vecBegEnd[i].first); - argPtr->SetAttribute(TAG_END, vecBegEnd[i].second); - wordPtr->LinkEndChild(argPtr); + for (; mentPtr != NULL; mentPtr = mentPtr->NextSiblingElement(TAG_COREF_MENT)) { + const char *cszBeg = mentPtr->Attribute(TAG_BEGIN); + const char *cszEnd = mentPtr->Attribute(TAG_END); + int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); + int uiEnd = static_cast(cszEnd != NULL ? 
atoi(cszEnd) : 0); + vecRef.push_back( make_pair(uiBeg, uiEnd) ); } - return 0; + } + return 0; } -// ----------------------------------------------------------------for coreference resolution -int XML4NLP::GetMentionOfEntity(vector< pair > &vecMention, int entityIdx) const -{ - if (entityIdx >= m_coref.vecEntity.size()) - { - cerr << "entity idx is too large" << endl; - return -1; - } +int XML4NLP::SetCoreference(const vector< vector< pair > > &vecCoref) { + if (coref.nodePtr != NULL) { + cerr << "has already done coreference" << endl; + return -1; + } - const vector &vecMentionRef = m_coref.vecEntity[entityIdx].vecMention; - if (vecMention.size() != vecMentionRef.size()) - { - cerr << "vecMention.size() does not equal to the num of mention, should resize() first" << endl; - return -1; - } + coref.nodePtr = new TiXmlElement(TAG_COREF); + for (int i = 0; i < vecCoref.size(); ++i) { + TiXmlElement *crPtr = new TiXmlElement(TAG_COREF_CR); + crPtr->SetAttribute(TAG_ID, i); - for (int i=0; i < vecMentionRef.size(); ++i) - { - const char *cszBeg = vecMentionRef[i].mentionPtr->Attribute(TAG_BEGIN); - const char *cszEnd = vecMentionRef[i].mentionPtr->Attribute(TAG_END); - if (cszBeg == NULL || cszEnd == NULL) - { - cerr << "mention attribute err in DOM" << endl; - return -1; - } - vecMention[i].first = atoi(cszBeg); - vecMention[i].second = atoi(cszEnd); - } - return 0; -} + coref.vecEntity.push_back( Entity() ); + Entity &entity = coref.vecEntity[coref.vecEntity.size() - 1]; + entity.entityPtr = crPtr; -int XML4NLP::GetCoreference(vector< vector< pair > > &vecCoref) const -{ - if (m_coref.nodePtr == NULL) - { - cerr << "has not done coreference" << endl; - return -1; - } - vecCoref.clear(); - TiXmlElement *crPtr = m_coref.nodePtr->FirstChildElement(TAG_COREF_CR); - for (; crPtr != NULL; crPtr = crPtr->NextSiblingElement(TAG_COREF_CR)) - { - vecCoref.push_back( vector< pair >() ); - vector< pair > &vecRef = vecCoref[vecCoref.size()-1]; - TiXmlElement *mentPtr = 
crPtr->FirstChildElement(TAG_COREF_MENT); - for (; mentPtr != NULL; mentPtr = mentPtr->NextSiblingElement(TAG_COREF_MENT)) - { - const char *cszBeg = mentPtr->Attribute(TAG_BEGIN); - const char *cszEnd = mentPtr->Attribute(TAG_END); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecRef.push_back( make_pair(uiBeg, uiEnd) ); - } - } - return 0; -} + for (int j = 0; j < vecCoref[i].size(); ++j) { + TiXmlElement *mentPtr = new TiXmlElement(TAG_COREF_MENT); + mentPtr->SetAttribute(TAG_ID, j); + mentPtr->SetAttribute(TAG_BEGIN, vecCoref[i][j].first); + mentPtr->SetAttribute(TAG_END, vecCoref[i][j].second); + crPtr->LinkEndChild(mentPtr); -int XML4NLP::SetCoreference(const vector< vector< pair > > &vecCoref) -{ - if (m_coref.nodePtr != NULL) - { - cerr << "has already done coreference" << endl; - return -1; + entity.vecMention.push_back( Mention() ); + Mention &mention = entity.vecMention[entity.vecMention.size() - 1]; + mention.mentionPtr = mentPtr; } - m_coref.nodePtr = new TiXmlElement(TAG_COREF); - for (int i = 0; i < vecCoref.size(); ++i) - { - TiXmlElement *crPtr = new TiXmlElement(TAG_COREF_CR); - crPtr->SetAttribute(TAG_ID, i); - - m_coref.vecEntity.push_back( Entity() ); - Entity &entity = m_coref.vecEntity[m_coref.vecEntity.size() - 1]; - entity.entityPtr = crPtr; - - for (int j = 0; j < vecCoref[i].size(); ++j) - { - TiXmlElement *mentPtr = new TiXmlElement(TAG_COREF_MENT); - mentPtr->SetAttribute(TAG_ID, j); - mentPtr->SetAttribute(TAG_BEGIN, vecCoref[i][j].first); - mentPtr->SetAttribute(TAG_END, vecCoref[i][j].second); - crPtr->LinkEndChild(mentPtr); - - entity.vecMention.push_back( Mention() ); - Mention &mention = entity.vecMention[entity.vecMention.size() - 1]; - mention.mentionPtr = mentPtr; - } - - m_coref.nodePtr->LinkEndChild(crPtr); - } - m_tiXmlDoc.RootElement()->LinkEndChild(m_coref.nodePtr); + coref.nodePtr->LinkEndChild(crPtr); + } + 
m_tiXmlDoc.RootElement()->LinkEndChild(coref.nodePtr); - return 0; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// initialize the XML DOM tree. /// after the process LoadFile(), the DOM tree has been set up -/// but it is not fully conform to our need, +/// but it is not fully conform to our need, /// for example, the member "paragraphss" of the class Document has not been initialized, /// this function just do this. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::InitXmlStructure() { - TiXmlElement *xml4nlp = m_tiXmlDoc.RootElement(); - m_document.documentPtr = xml4nlp->FirstChildElement(TAG_DOC); - m_note.nodePtr = xml4nlp->FirstChildElement(TAG_NOTE); - // document summary, text classification and coreference is not - // provided in current version (v3.0.0) - // m_summary.nodePtr = xml4nlp->FirstChildElement(TAG_SUM); - // m_textclass.nodePtr = xml4nlp->FirstChildElement(TAG_CLASS); - // m_coref.nodePtr = xml4nlp->FirstChildElement(TAG_COREF); - - if (m_document.documentPtr == NULL) { // consider it as wrong for now. - cerr << "there is no \"" << TAG_DOC << "\" tag in xml file." << endl; - return -1; - } + TiXmlElement *xml4nlp = m_tiXmlDoc.RootElement(); + document.documentPtr = xml4nlp->FirstChildElement(TAG_DOC); + note.nodePtr = xml4nlp->FirstChildElement(TAG_NOTE); + // document summary, text classification and coreference is not + // provided in current version (v3.0.0) + // summary.nodePtr = xml4nlp->FirstChildElement(TAG_SUM); + // textclass.nodePtr = xml4nlp->FirstChildElement(TAG_CLASS); + // coref.nodePtr = xml4nlp->FirstChildElement(TAG_COREF); + + if (document.documentPtr == NULL) { // consider it as wrong for now. + cerr << "there is no \"" << TAG_DOC << "\" tag in xml file." 
<< endl; + return -1; + } - if (0 != InitXmlDocument(m_document)) { - return -1; - } + if (0 != InitXmlDocument(document)) { + return -1; + } - if (m_coref.nodePtr != NULL) { - if (0 != InitXmlCoref(m_coref)) { - return -1; - } + if (coref.nodePtr != NULL) { + if (0 != InitXmlCoref(coref)) { + return -1; } + } - return 0; + return 0; } +int XML4NLP::InitXmlCoref(Coref &coref) { + TiXmlElement *entityPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); -void XML4NLP::CheckNoteForOldLtml() -{ - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - ClearAllNote(); - - // if (m_coref.nodePtr != NULL) SetNote(NOTE_CR); - // if (m_summary.nodePtr != NULL) SetNote(NOTE_SUM); - // if (m_textclass.nodePtr != NULL) SetNote(NOTE_CLASS); - - if ( m_document.paragraphs.empty() ) return; - if ( m_document.paragraphs[0].sentences.empty() ) return; - SetNote(NOTE_SENT); - if ( m_document.paragraphs[0].sentences[0].words.empty() ) return; - SetNote(NOTE_WORD); - TiXmlElement *wordPtr = m_document.paragraphs[0].sentences[0].words[0].wordPtr; - if ( wordPtr->Attribute(TAG_POS) != NULL ) SetNote(NOTE_POS); - if ( wordPtr->Attribute(TAG_NE) != NULL ) SetNote(NOTE_NE); - if ( wordPtr->Attribute(TAG_WSD) != NULL ) SetNote(NOTE_WSD); // consider only one attribute, excluding TAG_WSD_EXP - if ( wordPtr->Attribute(TAG_PSR_PARENT) != NULL ) SetNote(NOTE_PARSER); // excluding TAG_PSR_RELATE - if ( wordPtr->Attribute(TAG_SRL_ARG) != NULL ) SetNote(NOTE_SRL); // excluding TAG_SRL_TYPE -} - -int XML4NLP::InitXmlCoref(Coref &coref) -{ - TiXmlElement *entityPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); - if (entityPtr == NULL) - { - return 0; - } - - do - { - if (0 != InitXmlEntity(coref.vecEntity, entityPtr)) return -1; - entityPtr = entityPtr->NextSiblingElement(TAG_COREF_CR); - } while (entityPtr != NULL); + if (entityPtr == NULL) { return 0; + } + + do { + if (0 != InitXmlEntity(coref.vecEntity, entityPtr)) return -1; + entityPtr = 
entityPtr->NextSiblingElement(TAG_COREF_CR); + } while (entityPtr != NULL); + return 0; } -int XML4NLP::InitXmlEntity(vector &vecEntity, TiXmlElement *entityPtr) -{ - vecEntity.push_back( Entity() ); - Entity &entity = vecEntity[vecEntity.size()-1]; - entity.entityPtr = entityPtr; - - TiXmlElement *mentionPtr = entityPtr->FirstChildElement(TAG_COREF_MENT); - if (mentionPtr == NULL) return 0; - - do - { - if (0 != InitXmlMention(entity.vecMention, mentionPtr)) return -1; - mentionPtr = mentionPtr->NextSiblingElement(TAG_COREF_MENT); - } while(mentionPtr != NULL); - return 0; +int XML4NLP::InitXmlEntity(vector &vecEntity, TiXmlElement *entityPtr) { + vecEntity.push_back( Entity() ); + Entity &entity = vecEntity[vecEntity.size()-1]; + entity.entityPtr = entityPtr; + + TiXmlElement *mentionPtr = entityPtr->FirstChildElement(TAG_COREF_MENT); + if (mentionPtr == NULL) return 0; + + do { + if (0 != InitXmlMention(entity.vecMention, mentionPtr)) return -1; + mentionPtr = mentionPtr->NextSiblingElement(TAG_COREF_MENT); + } while(mentionPtr != NULL); + return 0; } -int XML4NLP::InitXmlMention(vector &vecMention, TiXmlElement *mentionPtr) -{ - vecMention.push_back( Mention() ); - vecMention[vecMention.size() -1].mentionPtr = mentionPtr; - return 0; +int XML4NLP::InitXmlMention(vector &vecMention, TiXmlElement *mentionPtr) { + vecMention.push_back( Mention() ); + vecMention[vecMention.size() -1].mentionPtr = mentionPtr; + return 0; } -int XML4NLP::InitXmlDocument(Document_t &document) -{ - TiXmlElement *paragraphPtr = document.documentPtr->FirstChildElement(TAG_PARA); - if (paragraphPtr == NULL) // consider it as wrong for now. - { - cerr << "there is no \"" << TAG_PARA << "\" tag in xml file." << endl; - return -1; - } +int XML4NLP::InitXmlDocument(Document &document) { + TiXmlElement *paragraphPtr = document.documentPtr->FirstChildElement(TAG_PARA); + if (paragraphPtr == NULL) { + // consider it as wrong for now. 
+ cerr << "there is no \"" << TAG_PARA << "\" tag in xml file." << endl; + return -1; + } - do - { - if (0 != InitXmlParagraph(document.paragraphs, paragraphPtr)) return -1; - paragraphPtr = paragraphPtr->NextSiblingElement(TAG_PARA); - } while (paragraphPtr != NULL); - return 0; + do { + if (0 != InitXmlParagraph(document.paragraphs, paragraphPtr)) return -1; + paragraphPtr = paragraphPtr->NextSiblingElement(TAG_PARA); + } while (paragraphPtr != NULL); + return 0; } -int XML4NLP::InitXmlParagraph(vector ¶graphs, TiXmlElement *paragraphPtr) +int XML4NLP::InitXmlParagraph(vector ¶graphs, TiXmlElement *paragraphPtr) { - paragraphs.push_back( Paragraph_t() ); - Paragraph_t ¶graph = paragraphs[paragraphs.size()-1]; - paragraph.paragraphPtr = paragraphPtr; + paragraphs.push_back( Paragraph() ); + Paragraph ¶graph = paragraphs[paragraphs.size()-1]; + paragraph.paragraphPtr = paragraphPtr; - TiXmlElement *stnsPtr = paragraphPtr->FirstChildElement(TAG_SENT); - if (stnsPtr == NULL) return 0; // have not split sentence + TiXmlElement *stnsPtr = paragraphPtr->FirstChildElement(TAG_SENT); + if (stnsPtr == NULL) return 0; // have not split sentence - // record the sentence info - do { - if (0 != InitXmlSentence(paragraph.sentences, stnsPtr)) return -1; - stnsPtr = stnsPtr->NextSiblingElement(TAG_SENT); - } while(stnsPtr != NULL); + // record the sentence info + do { + if (0 != InitXmlSentence(paragraph.sentences, stnsPtr)) return -1; + stnsPtr = stnsPtr->NextSiblingElement(TAG_SENT); + } while(stnsPtr != NULL); - return 0; + return 0; } -int XML4NLP::InitXmlSentence(vector &sentences, TiXmlElement *stnsPtr) +int XML4NLP::InitXmlSentence(vector &sentences, TiXmlElement *stnsPtr) { - sentences.push_back( Sentence_t() ); - Sentence_t &sentence = sentences[sentences.size()-1]; - sentence.sentencePtr = stnsPtr; + sentences.push_back( Sentence() ); + Sentence &sentence = sentences[sentences.size()-1]; + sentence.sentencePtr = stnsPtr; - TiXmlElement *wordPtr = 
stnsPtr->FirstChildElement(TAG_WORD); - if (wordPtr == NULL) return 0; // have not done word segment + TiXmlElement *wordPtr = stnsPtr->FirstChildElement(TAG_WORD); + if (wordPtr == NULL) return 0; // have not done word segment - do - { - if (0 != InitXmlWord(sentence.words, wordPtr)) return -1; - wordPtr = wordPtr->NextSiblingElement(TAG_WORD); - } while(wordPtr != NULL); + do + { + if (0 != InitXmlWord(sentence.words, wordPtr)) return -1; + wordPtr = wordPtr->NextSiblingElement(TAG_WORD); + } while(wordPtr != NULL); - return 0; + return 0; } -int XML4NLP::InitXmlWord(vector &words, TiXmlElement *wordPtr) -{ - words.push_back( Word_t() ); - words[words.size()-1].wordPtr = wordPtr; - return 0; +int XML4NLP::InitXmlWord(vector &words, TiXmlElement *wordPtr) { + words.push_back( Word() ); + words[words.size()-1].wordPtr = wordPtr; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// build the initial DOM tree frame. -/// it creates the XML declaration and the XSL declaration instructions and creates +/// it creates the XML declaration and the XSL declaration instructions and creates /// a root element "xml4nlp" and a child node "doc". 
///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::BuildDOMFrame() { - TiXmlDeclaration * xmlDeclaration = new TiXmlDeclaration("1.0", "utf-8", ""); - TiXmlElement * xml4nlp = new TiXmlElement("xml4nlp"); - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_document.documentPtr = new TiXmlElement(TAG_DOC); + TiXmlDeclaration * xmlDeclaration = new TiXmlDeclaration("1.0", "utf-8", ""); + TiXmlElement * xml4nlp = new TiXmlElement("xml4nlp"); + note.nodePtr = new TiXmlElement(TAG_NOTE); + document.documentPtr = new TiXmlElement(TAG_DOC); - m_tiXmlDoc.LinkEndChild(xmlDeclaration); - m_tiXmlDoc.LinkEndChild(xml4nlp); + m_tiXmlDoc.LinkEndChild(xmlDeclaration); + m_tiXmlDoc.LinkEndChild(xml4nlp); - xml4nlp->LinkEndChild(m_note.nodePtr); - ClearAllNote(); - xml4nlp->LinkEndChild(m_document.documentPtr); + xml4nlp->LinkEndChild(note.nodePtr); + ClearAllNote(); + xml4nlp->LinkEndChild(document.documentPtr); - return 0; + return 0; } bool XML4NLP::LTMLValidation() { - // there should not be any attributes in `` - // but it wont matter - - // is the attributes in `note` legal - int state = 0; - state |= QueryNote(NOTE_SRL); state <<= 1; - state |= QueryNote(NOTE_NE); state <<= 1; - state |= QueryNote(NOTE_PARSER); state <<= 1; - state |= QueryNote(NOTE_POS); state <<= 1; - state |= QueryNote(NOTE_WORD); state <<= 1; - state |= QueryNote(NOTE_SENT); - - if (0 == state || // 0 - 0x01 == state || // 1 - 0x03 == state || // 11 - 0x07 == state || // 111 - 0x0f == state || // 1111 - 0x17 == state || // 10111 - 0x1f == state || // 11111 - 0x3f == state) { - } else { - return false; - } - - // if sent attribute in note is `y`, there should be an `cont` - // attribute in para node. 
- // travel through all the `para` node, query if there is a `cont` - // attribute - if (!(state & 0x01)) { - for (unsigned i = 0; i < m_document.paragraphs.size(); ++ i) { - const Paragraph_t & paragraph = m_document.paragraphs[i]; - if (!paragraph.sentences.size()) { - if (!paragraph.paragraphPtr->GetText()) { return false; } - } else { - for (unsigned j = 0; j < paragraph.sentences.size(); ++ j) { - const Sentence_t & sentence = paragraph.sentences[j]; - if (!sentence.sentencePtr->Attribute(TAG_CONT)) { return false; } - } - } + // there should not be any attributes in `` + // but it wont matter + if (!note.nodePtr->Attribute(NOTE_SENT) + || !note.nodePtr->Attribute(NOTE_WORD) + || !note.nodePtr->Attribute(NOTE_POS) + || !note.nodePtr->Attribute(NOTE_PARSER) + || !note.nodePtr->Attribute(NOTE_NE) + || !note.nodePtr->Attribute(NOTE_SRL)) { + return false; + } + + // is the attributes in `note` legal + int state = 0; + state |= QueryNote(NOTE_SRL); state <<= 1; + state |= QueryNote(NOTE_NE); state <<= 1; + state |= QueryNote(NOTE_PARSER); state <<= 1; + state |= QueryNote(NOTE_POS); state <<= 1; + state |= QueryNote(NOTE_WORD); state <<= 1; + state |= QueryNote(NOTE_SENT); + + if (0 == state || // 0 + 0x01 == state || // 1 + 0x03 == state || // 11 + 0x07 == state || // 111 + 0x0f == state || // 1111 + 0x17 == state || // 10111 + 0x1f == state || // 11111 + 0x3f == state) { + } else { + return false; + } + + // if sent attribute in note is `y`, there should be an `cont` + // attribute in para node. 
+ // travel through all the `para` node, query if there is a `cont` + // attribute + if (!(state & 0x01)) { + for (unsigned i = 0; i < document.paragraphs.size(); ++ i) { + const Paragraph & paragraph = document.paragraphs[i]; + if (!paragraph.sentences.size()) { + if (!paragraph.paragraphPtr->GetText()) { return false; } + } else { + for (unsigned j = 0; j < paragraph.sentences.size(); ++ j) { + const Sentence & sentence = paragraph.sentences[j]; + if (!sentence.sentencePtr->Attribute(TAG_CONT)) { return false; } } + } } + } #define FOREACH(p, s, w) \ - for (unsigned i = 0; i < m_document.paragraphs.size(); ++ i) { \ - const Paragraph_t & p = m_document.paragraphs[i]; \ - for (unsigned j = 0; j < p.sentences.size(); ++ j) { \ - const Sentence_t & s = p.sentences[j]; \ - for (unsigned k = 0; k < s.words.size(); ++ k) { \ - const Word_t & w = s.words[k]; + for (unsigned i = 0; i < document.paragraphs.size(); ++ i) { \ + const Paragraph & p = document.paragraphs[i]; \ + for (unsigned j = 0; j < p.sentences.size(); ++ j) { \ + const Sentence & s = p.sentences[j]; \ + for (unsigned k = 0; k < s.words.size(); ++ k) { \ + const Word & w = s.words[k]; #define END }}} - FOREACH(p, s, w) - // segment check - if ((state & 0x02) && (!w.wordPtr->Attribute(TAG_CONT))) { return false; } - if ((state & 0x04) && (!w.wordPtr->Attribute(TAG_POS))) { return false; } - if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_PARENT))) { return false; } - if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_RELATE))) { return false; } - if ((state & 0x10) && (!w.wordPtr->Attribute(TAG_NE))) { return false; } - END + FOREACH(p, s, w) + // segment check + const char * buffer = NULL; + buffer = w.wordPtr->Attribute(TAG_CONT); + if ((state & 0x02) + && (!buffer || !strnlen(buffer, 1024))) { return false; } + + buffer = w.wordPtr->Attribute(TAG_POS); + if ((state & 0x04) + && (!buffer || !strnlen(buffer, 1024))) { return false; } + + buffer = w.wordPtr->Attribute(TAG_PSR_PARENT); + if ((state & 
0x08) + && (!buffer || !strnlen(buffer, 1024))) { return false; } + + buffer = w.wordPtr->Attribute(TAG_PSR_RELATE); + if ((state & 0x08) + && (!buffer || !strnlen(buffer, 1024))) { return false; } + + buffer = w.wordPtr->Attribute(TAG_NE); + if ((state & 0x10) + && (!buffer || !strnlen(buffer, 1024))) { return false; } + END #undef END #undef FOREACH - return true; + return true; } void XML4NLP::ClearAllNote() { - ClearNote(NOTE_SENT); - ClearNote(NOTE_WORD); - ClearNote(NOTE_POS); - ClearNote(NOTE_NE); - ClearNote(NOTE_PARSER); - ClearNote(NOTE_WSD); - ClearNote(NOTE_SRL); - // ClearNote(NOTE_CLASS); - // ClearNote(NOTE_SUM); - // ClearNote(NOTE_CR); + ClearNote(NOTE_SENT); + ClearNote(NOTE_WORD); + ClearNote(NOTE_POS); + ClearNote(NOTE_NE); + ClearNote(NOTE_PARSER); + ClearNote(NOTE_WSD); + ClearNote(NOTE_SRL); + // ClearNote(NOTE_CLASS); + // ClearNote(NOTE_SUM); + // ClearNote(NOTE_CR); } ///////////////////////////////////////////////////////////////////////////////////// @@ -1604,313 +1252,198 @@ void XML4NLP::ClearAllNote() { /// in the initial, a paragraph has only one sentence. 
///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::BuildParagraph(string& strParagraph, int paragraphIdx) { - if (strParagraph == "עٰȨ" - || strParagraph == "аͱ" - || strParagraph == "עǧȨ") { - strParagraph = "ӭʹùҵѧϢоԼƽ̨"; - } else { - } - TiXmlElement * documentPtr = m_document.documentPtr; - vector ¶graphs = m_document.paragraphs; + TiXmlElement * documentPtr = document.documentPtr; + vector ¶graphs = document.paragraphs; - paragraphs.push_back( Paragraph_t() ); - Paragraph_t ¶graph = paragraphs[paragraphs.size() - 1]; + paragraphs.push_back( Paragraph() ); + Paragraph ¶graph = paragraphs[paragraphs.size() - 1]; - paragraph.paragraphPtr = new TiXmlElement(TAG_PARA); - paragraph.paragraphPtr->SetAttribute(TAG_ID, paragraphIdx); - documentPtr->LinkEndChild(paragraph.paragraphPtr); + paragraph.paragraphPtr = new TiXmlElement(TAG_PARA); + paragraph.paragraphPtr->SetAttribute(TAG_ID, paragraphIdx); + documentPtr->LinkEndChild(paragraph.paragraphPtr); - TiXmlText *textPtr = new TiXmlText(strParagraph.c_str()); - paragraph.paragraphPtr->LinkEndChild( textPtr ); + TiXmlText *textPtr = new TiXmlText(strParagraph.c_str()); + paragraph.paragraphPtr->LinkEndChild( textPtr ); - return 0; + return 0; } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, - int paragraphIdx, - int sentenceIdx, - const char *attrName) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - const vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int pid, + int sid, + const char *attribute_name) const { + if (0 != CheckRange(pid, sid)) return -1; - /* - if (vecInfo.size() != words.size()) - { - // cerr << "vecInfo's size does not equal to word num in the sentence, should resize() first" << endl; + const vector & words = document.paragraphs[pid].sentences[sid].words; + if (words[0].wordPtr->Attribute(attribute_name) == NULL) { return -1; - } - */ + 
} - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + info.resize(words.size()); + for (int i = 0; i < words.size(); ++ i) { + info[i] = words[i].wordPtr->Attribute(attribute_name); + } + return 0; +} - for (int i = 0; i < words.size(); ++i) - { - vecInfo.push_back(words[i].wordPtr->Attribute(attrName)); - //vecInfo[i] = words[i].wordPtr->Attribute(attrName); - } - return 0; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int global_sid, + const char *attribute_name) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetInfoFromSentence(info, pid, sid, attribute_name); } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int sentenceIdx, const char *attrName) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - const vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; +int XML4NLP::GetInfoFromSentence(std::vector &info, + int pid, + int sid, + const char* attribute_name) const { + if (0 != CheckRange(pid, sid)) return -1; + + const vector & words = document.paragraphs[pid].sentences[sid].words; - /* - if (vecInfo.size() != words.size()) - { - // cerr << "vecInfo's size does not equal to word num in the sentence, should resize() first" << endl; + if (words[0].wordPtr->Attribute(attribute_name) == NULL) { return -1; - } - */ + } - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + info.clear(); + for (int i = 0; i < words.size(); ++ i) { + const char * cszAttrValue = words[i].wordPtr->Attribute(attribute_name); + info.push_back(cszAttrValue != NULL ? 
cszAttrValue : ""); + } + return 0; +} - for (int i = 0; i < words.size(); ++i) - { - vecInfo.push_back(words[i].wordPtr->Attribute(attrName)); - //vecInfo[i] = words[i].wordPtr->Attribute(attrName); - } - return 0; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int global_sid, + const char* attribute_name) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetInfoFromSentence(info, pid, sid, attribute_name); } +int XML4NLP::SetInfoToSentence(const std::vector & info, + int pid, + int sid, + const char* attribute_name) { + if (0 != CheckRange(pid, sid)) return -1; -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; + std::vector & words = document.paragraphs[pid].sentences[sid].words; - const vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; + if (words.size() != info.size()) { + return -1; + } - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + if (words[0].wordPtr->Attribute(attribute_name) != NULL) { + return -1; + } - vecInfo.clear(); - for (int i = 0; i < words.size(); ++i) - { - const char *cszAttrValue = words[i].wordPtr->Attribute(attrName); - vecInfo.push_back(cszAttrValue != NULL ? 
cszAttrValue : ""); - } - return 0; + for (int i = 0; i < words.size(); ++ i) { + // std::cout << attribute_name << " " << info[i] << std::endl; + words[i].wordPtr->SetAttribute(attribute_name, info[i].c_str()); + } + return 0; } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int sentenceIdx, const char* attrName) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - const vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } +int XML4NLP::SetInfoToSentence(const std::vector & info, + int global_sid, + const char * attribute_name) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; - vecInfo.clear(); - for (int i = 0; i < words.size(); ++i) - { - const char *cszAttrValue = words[i].wordPtr->Attribute(attrName); - vecInfo.push_back(cszAttrValue != NULL ? 
cszAttrValue : ""); - } - return 0; + return SetInfoToSentence(info, pid, sid, attribute_name); } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::SetInfoToSentence(const std::vector & info, + int pid, + int sid, + const char * attribute_name) { + if (0 != CheckRange(pid, sid)) return -1; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + std::vector & words = document.paragraphs[pid].sentences[sid].words; - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i].c_str()); - } - return 0; -} + if (words.size() != info.size()) { + return -1; + } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int sentenceIdx, const char* attrName) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + if 
(words[0].wordPtr->Attribute(attribute_name) != NULL) { + return -1; + } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i].c_str()); - } - return 0; + for (int i = 0; i < words.size(); ++ i) { + // std::cout << attribute_name << " " << info[i] << std::endl; + words[i].wordPtr->SetAttribute(attribute_name, info[i]); + } + return 0; } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::SetInfoToSentence(const std::vector & info, + int global_sid, + const char * attribute_name) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetInfoToSentence(info, pid, sid, attribute_name); +} - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i]); - } - return 0; -} +int XML4NLP::CheckRange(int pid, int sid, int wid) const { + if (pid >= document.paragraphs.size()) { + return -1; + } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int sentenceIdx, const char* attrName) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num 
in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + if (sid >= document.paragraphs[pid].sentences.size()) { + return -1; + } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i]); - } - return 0; + if (wid >= document.paragraphs[pid].sentences[sid].words.size()) { + return -1; + } + return 0; } +int XML4NLP::CheckRange(int pid, int sid) const { + if (pid >= document.paragraphs.size()) { + return -1; + } -int XML4NLP::CheckRange(int paragraphIdx, int sentenceIdx, int wordIdx) const -{ - if (paragraphIdx >= m_document.paragraphs.size()) - { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - if (sentenceIdx >= m_document.paragraphs[paragraphIdx].sentences.size()) - { - // cerr << "sentenceIdx is too large: " << sentenceIdx << " in paragraph : " << paragraphIdx << endl; - return -1; - } - if (wordIdx >= m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words.size()) - { - // cerr << "wordIdx is too large: " << wordIdx << " in sentence : " << sentenceIdx - // << " of paragraph : " << paragraphIdx << endl; - return -1; - } - return 0; -} + if (sid >= document.paragraphs[pid].sentences.size()) { + return -1; + } -int XML4NLP::CheckRange(int paragraphIdx, int sentenceIdx) const -{ - if (paragraphIdx >= m_document.paragraphs.size()) - { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - if (sentenceIdx >= m_document.paragraphs[paragraphIdx].sentences.size()) - { - // cerr << "sentenceIdx is too large: " << sentenceIdx << " in paragraph : " << paragraphIdx << endl; - return -1; - } - return 0; + return 0; } int XML4NLP::CheckRange(int paragraphIdx) const { - if 
(paragraphIdx >= m_document.paragraphs.size()) { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - return 0; + if (paragraphIdx >= document.paragraphs.size()) { + return -1; + } + return 0; } bool XML4NLP::QueryNote(const char *cszNoteName) const { - if (m_note.nodePtr == NULL) return false; // OK? + if (note.nodePtr == NULL) return false; // OK? - return (strcmp(m_note.nodePtr->Attribute(cszNoteName), "y") == 0) ? true : false; + return (strcmp(note.nodePtr->Attribute(cszNoteName), "y") == 0) ? true : false; } int XML4NLP::SetNote(const char *cszNoteName) { - if (m_note.nodePtr == NULL) { - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - } - m_note.nodePtr->SetAttribute(cszNoteName, "y"); - return 0; + if (note.nodePtr == NULL) { + note.nodePtr = new TiXmlElement(TAG_NOTE); + m_tiXmlDoc.RootElement()->LinkEndChild( note.nodePtr ); + } + note.nodePtr->SetAttribute(cszNoteName, "y"); + return 0; } int XML4NLP::ClearNote(const char *cszNoteName) { - if (m_note.nodePtr == NULL) { - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - } + if (note.nodePtr == NULL) { + note.nodePtr = new TiXmlElement(TAG_NOTE); + m_tiXmlDoc.RootElement()->LinkEndChild( note.nodePtr ); + } - m_note.nodePtr->SetAttribute(cszNoteName, "n"); - return 0; + note.nodePtr->SetAttribute(cszNoteName, "n"); + return 0; } - - diff --git a/src/__xml4nlp/Xml4nlp.h b/src/__xml4nlp/Xml4nlp.h index 5aa0b6683..417cf1b08 100644 --- a/src/__xml4nlp/Xml4nlp.h +++ b/src/__xml4nlp/Xml4nlp.h @@ -41,788 +41,786 @@ extern const char * const NOTE_CR; ///////////////////////////////////////////////////////////////////////////////////// class XML4NLP { public: - XML4NLP(); - virtual ~XML4NLP(); - - // -------------------------------------------------------------- - // Functions for DOM Tree Creation - // -------------------------------------------------------------- - 
/* - * Create DOM from file, read in each line of the file and store - * them in the xml tree. - * - * @param[in] filename the filename - */ - int CreateDOMFromFile(const char * filename); - - /* - * Create DOM from raw string text. - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int CreateDOMFromString(const std::string & str); - - /* - * A wrapper of CreateDOMFromString(const std::string & str); - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int CreateDOMFromString(const char * str); - - /* - * Load XML DOM from file - * - * @param[in] filename the file name - * @return int 0 on success, otherwise -1 - */ - int LoadXMLFromFile(const char * fileName); - - /* - * Load XML DOM from string - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int LoadXMLFromString(const char * str); - - /* - * Load XML DOM from string - * - * @param[in] str the string - */ - int LoadXMLFromString(const std::string & str); - - /* - * Clear the DOM tree - */ - void ClearDOM(); - - /* - * Save the DOM tree to file - * - * @param[in] filename the filename - * @return int 0 on success, otherwise -1 - */ - int SaveDOM(const char * fileName); - - /* - * Save the DOM tree to strin - * - * @param[out] strDocument the str - */ - void SaveDOM(string &strDocument) const; - - /* - * Get attributes value in `` - * - * @param[in] cszNoteName the note name - * @return bool return true on `` exists and - * attributes value equals "y", otherwise - * false. 
- */ - bool QueryNote(const char * cszNoteName) const; - - /* - * Set attributes value in `` to "y" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - int SetNote(const char * cszNoteName); - - /* - * Set attributes value in `` to "n" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - int ClearNote(const char * cszNoteName); - - /* - * Set all nlp attributes value in `` to "n" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - void ClearAllNote(); - - // counting operation - /* - * count number of paragraph in document - * - * @return int the number of paragraph - */ - int CountParagraphInDocument() const; - - /* - * conut number of sentence in paragraph - * - * @param[in] pid the index number of paragraph - * @return int the number of paragraph - */ - int CountSentenceInParagraph(int pid) const; - - /* - * count number of all sentences in document - * - * @return int the number of all sentences in document - */ - int CountSentenceInDocument() const; - - /* - * Count number of words in sentence, given the index of paragraph - * and index of sentence. 
- * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @return int - */ - int CountWordInSentence(int pid, int sid) const; - - /* - * Count number of words in sentence, given the global index - * of the sentence - * - * @param[in] global_sid the global index of a sentence - * @return int number of sentence - */ - int CountWordInSentence(int global_sid) const; - - /* - * Count number of words in paragraph - * - * @param[in] pid the index of paragraph - * @return int number of words in paragraph if legal - * pid is given, otherwise -1 - */ - int CountWordInParagraph(int pid) const; - - /* - * Count total number of words in paragraph - * - * @return int number of words - */ - int CountWordInDocument() const; - - /* - * Get content of paragraph and store it in string - * - * @param[in] pid the index of paragraph - * @param[out] strParagraph the output string - * @return int 0 on success, otherwise -1 - */ - int GetParagraph(int pid, string & strParagraph) const; - - /* - * Get content of paragraph - * - * @param[in] pid the index of paragraph - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetParagraph(int pid) const; - - /* - * Get content of sentence - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetSentence(int pid, int sid) const; - - /* - * Get content of sentence, given the sentence's global index - * - * @param[in] global_sid the global index of the sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetSentence(int global_sid) const; - - /* - * Get word content - * - * @param[in] pid the index of paragraph in document - * @param[in] sid the index of sentence in paragraph - * @param[in] wid the index of word in sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int 
pid, int sid, int wid) const; - - /* - * Get word content, given the global sentence index - * - * @param[in] global_sid the global index of the sentence - * @param[in] wid the index of word in sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int global_sid, int wid) const; - - /* - * Get word content, given the global index of word - * - * @param[in] global_wid the global index of the sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int glabal_wid) const; - - /* - * Get word's postag - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int pid, int sid, int wid) const; - - /* - * Get word's postag - * - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int global_sid, int wid) const; - - /* - * Get word's postag, given the global index of the word in the document. - * - * @param[in] global_wid the global index of the word. - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int global_wid) const; - - /* - * Get word's NER tag - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return const char * the pointer to the tag, NULL on failure. - */ - const char * GetNE(int pid, int sid, int wid) const; - - /* - * Get word's NER tag, given the global index of sentence in the document. - * - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return const char * the pointer to the tag, NULL on failure. 
- */ - const char * GetNE(int global_sid, int wid) const; - - /* - * Get word's NER, given the global index of the word in the document. - * - * @param[in] global_wid the global index of the word. - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetNE(int glabalWordIdx) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return int 0 on success, otherwise -1 - */ - int GetWSD(pair & WSD_explanation, - int pid, - int sid, - int wid) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return int 0 on success, -1 on illegal index - */ - int GetWSD(pair & WSD_explanation, - int global_sid, - int wid) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] global_wid the global index of sentence - * @return int 0 on success, -1 on illegal index - */ - int GetWSD(pair & WSD_explanation, - int global_wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair & parent_relation, - int pid, - int sid, - int wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair & parent_relation, - 
int global_sid, - int wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] global_wid the global index of sentence - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair &parent_relation, - int glabal_wid) const; - - /* - * Get sentences from paragraph - * - * @param[out] vecSentence the output vector - * @param[in] paragraphIdx the index to the paragraph - */ - int GetSentencesFromParagraph(vector & vecSentence, - int paragraphIdx) const; - - /* - * Get sentences from paragraph - * - * @param[out] vectSentence the output vector - * @param[in] paragraphIdx the index to the paragraph - */ - int GetSentencesFromParagraph(vector &vecSents, - int paragraphIdx) const; - - int SetSentencesToParagraph(const vector &vecSents, - int paragraphIdx); - - /* - * Get words from sentence - * - * @param[out] vecWord the word vector - * @param[in] paragraphIdx the index of paragraph - * @param[in] sentenceIdx the index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int paragraphIdx, - int sentenceIdx) const; - - /* - * Get words from sentence - * - * @param[out] vecWord the word vector - * @param[in] globalSentIdx the global index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int globalSentIdx) const; - - /* - * Get words from sentence, std::string interface - * - * @param[out] vecWord the word vector - * @param[in] paragraphIdx the index of paragraph - * @param[in] sentenceIdx the index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int paragraphIdx, - int sentenceIdx) const; - - /* - * Get words from sentence, std::string interface - * - * @param[out] vecWord the word vector - * @param[in] globalSentIdx the global index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int globalSentIdx) const; - - /* - * Set word to sentence - * - * @param[in] vecWord the words - * @param[in] paragraphIdx the index of paragraph - * 
@param[in] sentenceIdx the index of sentence - */ - int SetWordsToSentence(const vector &vecWord, - int paragraphIdx, - int sentenceIdx); - - /* - * Set word to sentence - * - * @param[in] vecWord the words - * @param[in] sentenceIdx the global index of sentence - */ - int SetWordsToSentence(const vector &vecWord, - int sentenceIdx); - - // for POS tagging - int GetPOSsFromSentence(vector & vecPOS, - int paragraphIdx, - int sentenceIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int globalSentIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int paragraphIdx, - int sentenceIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int globalSentIdx) const; - - int SetPOSsToSentence(const vector & vecPOS, - int paragraphIdx, - int sentenceIdx); - - int SetPOSsToSentence(const vector & vecPOS, - int sentenceIdx); - - // for NE - int GetNEsFromSentence(vector &vecNE, - int paragraphIdx, - int sentenceIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int globalSentIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int paragraphIdx, - int sentenceIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int globalSentIdx) const; - - int SetNEsToSentence(const vector &vecNE, - int paragraphIdx, - int sentenceIdx); - - int SetNEsToSentence(const vector &vecNE, - int sentenceIdx); - - int GetWSDsFromSentence(vector &vecWSD, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int sentenceIdx) const; - - int SetWSDsToSentence(const vector &vecWSD, - int paragraphIdx, - int sentenceIdx); - - int SetWSDsToSentence(const vector & vecWSD, - int sentenceIdx); - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int sentenceIdx) const; - - int 
GetWSDExplainsFromSentence(vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int sentenceIdx) const; - - int SetWSDExplainsToSentence(const vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx); + XML4NLP(); + virtual ~XML4NLP(); + + // -------------------------------------------------------------- + // Functions for DOM Tree Creation + // -------------------------------------------------------------- + /* + * Create DOM from file, read in each line of the file and store + * them in the xml tree. + * + * @param[in] filename the filename + */ + int CreateDOMFromFile(const char * filename); + + /* + * Create DOM from raw string text. + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int CreateDOMFromString(const std::string & str); + + /* + * A wrapper of CreateDOMFromString(const std::string & str); + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int CreateDOMFromString(const char * str); + + /* + * Load XML DOM from file + * + * @param[in] filename the file name + * @return int 0 on success, otherwise -1 + */ + int LoadXMLFromFile(const char * fileName); + + /* + * Load XML DOM from string + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int LoadXMLFromString(const char * str); + + /* + * Load XML DOM from string + * + * @param[in] str the string + */ + int LoadXMLFromString(const std::string & str); + + /* + * Clear the DOM tree + */ + void ClearDOM(); + + /* + * Save the DOM tree to file + * + * @param[in] filename the filename + * @return int 0 on success, otherwise -1 + */ + int SaveDOM(const char * fileName); + + /* + * Save the DOM tree to strin + * + * @param[out] strDocument the str + */ + void SaveDOM(string &strDocument) const; + + /* + * Get attributes value in `` + * + * @param[in] note_name the name of the attribute in note + * @return bool return true on `` 
exists and attributes + * value equals "y", otherwise false. + */ + bool QueryNote(const char * note_name) const; + + /* + * Set attributes value in `` to "y" + * + * @param[in] note_name the name of the attribute in note + * @return int return 0 + */ + int SetNote(const char * note_name); + + /* + * Set attributes value in `` to "n" + * + * @param[in] cszNoteName the note name + * @return int return 0 + */ + int ClearNote(const char * note_name); + + /* + * Set all nlp attributes value in `` to "n" + * + * @param[in] cszNoteName the note name + * @return int return 0 + */ + void ClearAllNote(); + + // counting operation + /* + * count number of paragraph in document + * + * @return int the number of paragraph + */ + int CountParagraphInDocument() const; + + /* + * conut number of sentence in paragraph + * + * @param[in] pid the index number of paragraph + * @return int the number of paragraph + */ + int CountSentenceInParagraph(int pid) const; + + /* + * count number of all sentences in document + * + * @return int the number of all sentences in document + */ + int CountSentenceInDocument() const; + + /* + * Count number of words in sentence, given the index of paragraph + * and index of sentence. 
+ * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @return int + */ + int CountWordInSentence(int pid, int sid) const; + + /* + * Count number of words in sentence, given the global index + * of the sentence + * + * @param[in] global_sid the global index of a sentence + * @return int number of sentence + */ + int CountWordInSentence(int global_sid) const; + + /* + * Count number of words in paragraph + * + * @param[in] pid the index of paragraph + * @return int number of words in paragraph if legal + * pid is given, otherwise -1 + */ + int CountWordInParagraph(int pid) const; + + /* + * Count total number of words in paragraph + * + * @return int number of words + */ + int CountWordInDocument() const; + + /* + * Get content of paragraph and store it in string + * + * @param[in] pid the index of paragraph + * @param[out] strParagraph the output string + * @return int 0 on success, otherwise -1 + */ + int GetParagraph(int pid, string & strParagraph) const; + + /* + * Get content of paragraph + * + * @param[in] pid the index of paragraph + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetParagraph(int pid) const; + + /* + * Get content of sentence + * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetSentence(int pid, int sid) const; + + /* + * Get content of sentence, given the sentence's global index + * + * @param[in] global_sid the global index of the sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetSentence(int global_sid) const; + + /* + * Get word content + * + * @param[in] pid the index of paragraph in document + * @param[in] sid the index of sentence in paragraph + * @param[in] wid the index of word in sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int 
pid, int sid, int wid) const; + + /* + * Get word content, given the global sentence index + * + * @param[in] global_sid the global index of the sentence + * @param[in] wid the index of word in sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int global_sid, int wid) const; + + /* + * Get word content, given the global index of word + * + * @param[in] global_wid the global index of the sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int glabal_wid) const; + + /* + * Get word's postag + * + * @param[in] pid the index of the paragraph + * @param[in] sid the index of the sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int pid, int sid, int wid) const; + + /* + * Get word's postag + * + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int global_sid, int wid) const; + + /* + * Get word's postag, given the global index of the word in the document. + * + * @param[in] global_wid the global index of the word. + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int global_wid) const; + + /* + * Get word's NER tag + * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return const char * the pointer to the tag, NULL on failure. + */ + const char * GetNE(int pid, int sid, int wid) const; + + /* + * Get word's NER tag, given the global index of sentence in the document. + * + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the tag, NULL on failure. 
+ */ + const char * GetNE(int global_sid, int wid) const; + + /* + * Get word's NER, given the global index of the word in the document. + * + * @param[in] global_wid the global index of the word. + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetNE(int glabalWordIdx) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return int 0 on success, otherwise -1 + */ + int GetWSD(pair & WSD_explanation, + int pid, + int sid, + int wid) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return int 0 on success, -1 on illegal index + */ + int GetWSD(pair & WSD_explanation, + int global_sid, + int wid) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] global_wid the global index of sentence + * @return int 0 on success, -1 on illegal index + */ + int GetWSD(pair & WSD_explanation, + int global_wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair & parent_relation, + int pid, + int sid, + int wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair & parent_relation, + 
int global_sid, + int wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] global_wid the global index of sentence + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair &parent_relation, + int glabal_wid) const; + + /* + * Get sentences from paragraph + * + * @param[out] vecSentence the output vector + * @param[in] paragraphIdx the index to the paragraph + */ + int GetSentencesFromParagraph(vector & vecSentence, + int paragraphIdx) const; + + /* + * Get sentences from paragraph + * + * @param[out] vectSentence the output vector + * @param[in] paragraphIdx the index to the paragraph + */ + int GetSentencesFromParagraph(vector &vecSents, + int paragraphIdx) const; + + int SetSentencesToParagraph(const vector &vecSents, + int paragraphIdx); + + /* + * Get words from sentence + * + * @param[out] vecWord the word vector + * @param[in] paragraphIdx the index of paragraph + * @param[in] sentenceIdx the index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int paragraphIdx, + int sentenceIdx) const; + + /* + * Get words from sentence + * + * @param[out] vecWord the word vector + * @param[in] globalSentIdx the global index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int globalSentIdx) const; + + /* + * Get words from sentence, std::string interface + * + * @param[out] vecWord the word vector + * @param[in] paragraphIdx the index of paragraph + * @param[in] sentenceIdx the index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int paragraphIdx, + int sentenceIdx) const; + + /* + * Get words from sentence, std::string interface + * + * @param[out] vecWord the word vector + * @param[in] globalSentIdx the global index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int globalSentIdx) const; + + /* + * Set word to sentence + * + * @param[in] vecWord the words + * @param[in] paragraphIdx the index of paragraph + * 
@param[in] sentenceIdx the index of sentence + */ + int SetWordsToSentence(const vector &vecWord, + int paragraphIdx, + int sentenceIdx); + + /* + * Set word to sentence + * + * @param[in] vecWord the words + * @param[in] sentenceIdx the global index of sentence + */ + int SetWordsToSentence(const vector &vecWord, + int sentenceIdx); + + // for POS tagging + int GetPOSsFromSentence(vector & vecPOS, + int paragraphIdx, + int sentenceIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int globalSentIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int paragraphIdx, + int sentenceIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int globalSentIdx) const; + + int SetPOSsToSentence(const vector & vecPOS, + int paragraphIdx, + int sentenceIdx); + + int SetPOSsToSentence(const vector & vecPOS, + int sentenceIdx); + + // for NE + int GetNEsFromSentence(vector &vecNE, + int paragraphIdx, + int sentenceIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int globalSentIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int paragraphIdx, + int sentenceIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int globalSentIdx) const; + + int SetNEsToSentence(const vector &vecNE, + int paragraphIdx, + int sentenceIdx); + + int SetNEsToSentence(const vector &vecNE, + int sentenceIdx); + + int GetWSDsFromSentence(vector &vecWSD, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int sentenceIdx) const; + + int SetWSDsToSentence(const vector &vecWSD, + int paragraphIdx, + int sentenceIdx); + + int SetWSDsToSentence(const vector & vecWSD, + int sentenceIdx); + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int sentenceIdx) const; + + int 
GetWSDExplainsFromSentence(vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int sentenceIdx) const; + + int SetWSDExplainsToSentence(const vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx); - int SetWSDExplainsToSentence(const vector &vecWSDExplain, - int sentenceIdx); + int SetWSDExplainsToSentence(const vector &vecWSDExplain, + int sentenceIdx); - // for Parser - int GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const; + // for Parser + int GetParsesFromSentence(vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int sentenceIdx) const; - int SetParsesToSentence(const vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx); + int SetParsesToSentence(const vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx); - int SetParsesToSentence(const vector< pair > &vecParse, - int sentenceIdx); + int SetParsesToSentence(const vector< pair > &vecParse, + int sentenceIdx); - int SetParsesToSentence(const vector &vecHead, - const vector &vecRel, - int paragraphIdx, - int sentenceIdx); + int SetParsesToSentence(const vector &vecHead, + const vector &vecRel, + int paragraphIdx, + int sentenceIdx); - int SetParsesToSentence(const vector &vecHead, - const vector &vecRel, - int sentenceIdx); + int SetParsesToSentence(const vector &vecHead, + const vector &vecRel, + int sentenceIdx); - // for text summarization - const char* 
GetTextSummary() const; - int SetTextSummary(const char* textSum); + // for text summarization + const char* GetTextSummary() const; + int SetTextSummary(const char* textSum); - // for text classification - const char* GetTextClass() const; - int SetTextClass(const char* textClass); + // for text classification + const char* GetTextClass() const; + int SetTextClass(const char* textClass); - // for SRL - int CountPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx) const; + // for SRL + int CountPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx) const; - int CountPredArgToWord(int globalSentIdx, - int wordIdx) const; + int CountPredArgToWord(int globalSentIdx, + int wordIdx) const; - int CountPredArgToWord(int globalWordIdx) const; + int CountPredArgToWord(int globalWordIdx) const; - int GetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int globalSentIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int globalSentIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int globalWordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int globalWordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int 
SetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - const vector &vecType, - const vector< pair > &vecBegEnd); + int SetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + const vector &vecType, + const vector< pair > &vecBegEnd); - int SetPredArgToWord(int sentenceIdx, - int wordIdx, - const vector &vecType, - const vector< pair > &vecBegEnd); + int SetPredArgToWord(int sentenceIdx, + int wordIdx, + const vector &vecType, + const vector< pair > &vecBegEnd); - // for coreference resolution - int CountEntity() const; + // for coreference resolution + int CountEntity() const; - int CountMentionInEntity(int entityIdx); + int CountMentionInEntity(int entityIdx); - int GetMentionOfEntity(vector< pair > &vecMention, - int entityIdx) const; + int GetMentionOfEntity(vector< pair > &vecMention, + int entityIdx) const; - int GetCoreference(vector< vector< pair > >& vecCoref) const; + int GetCoreference(vector< vector< pair > >& vecCoref) const; - int SetCoreference(const vector< vector< pair > >& vecCoref); + int SetCoreference(const vector< vector< pair > >& vecCoref); public: - int MapGlobalSentIdx2paraIdx_sentIdx(int sentenceIdx, - pair ¶Idx_sentIdx) const; + int DecodeGlobalId(int global_sid, int & pid, int & sid) const; - int MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(int globalWordIdx, - int ¶Idx, - int &sentIdx, - int &wordIdx) const; + int DecodeGlobalId(int globalWordIdx, + int ¶Idx, + int &sentIdx, + int &wordIdx) const; - int CheckRange(int paragraphIdx, - int sentenceIdx, - int wordIdx) const; + int CheckRange(int paragraphIdx, + int sentenceIdx, + int wordIdx) const; - int CheckRange(int paragraphIdx, - int sentenceIdx) const; + int CheckRange(int paragraphIdx, + int sentenceIdx) const; - int CheckRange(int paragraphIdx) const; + int CheckRange(int paragraphIdx) const; - void ReportTiXmlDocErr() const; + void ReportTiXmlDocErr() const; - int BuildParagraph(string &strParagraph, - int paragraphIdx); + int BuildParagraph(string 
&strParagraph, + int paragraphIdx); private: - typedef struct { - TiXmlElement *wordPtr; - } Word_t; - - typedef struct { - vector words; - TiXmlElement * sentencePtr; - } Sentence_t; - - typedef struct { - vector sentences; - TiXmlElement * paragraphPtr; - } Paragraph_t; - - typedef struct { - vector paragraphs; - TiXmlElement * documentPtr; - } Document_t; - - typedef struct { - TiXmlElement *nodePtr; - } Note, Summary, TextClass; - - typedef struct { - TiXmlElement *mentionPtr; - } Mention; - - typedef struct { - vector vecMention; - TiXmlElement *entityPtr; - } Entity; - - typedef struct { - vector vecEntity; - TiXmlElement *nodePtr; - } Coref; - + typedef struct { + TiXmlElement *wordPtr; + } Word; + + typedef struct { + vector words; + TiXmlElement * sentencePtr; + } Sentence; + + typedef struct { + vector sentences; + TiXmlElement * paragraphPtr; + } Paragraph; + + typedef struct { + vector paragraphs; + TiXmlElement * documentPtr; + } Document; + + typedef struct { + TiXmlElement *nodePtr; + } Note, Summary, TextClass; + + typedef struct { + TiXmlElement *mentionPtr; + } Mention; + + typedef struct { + vector vecMention; + TiXmlElement *entityPtr; + } Entity; + + typedef struct { + vector vecEntity; + TiXmlElement *nodePtr; + } Coref; + + typedef std::pair WSDResult; + typedef std::pair ParseResult; private: - // initialization during loading txt - int BuildDOMFrame(); - - // initialization during loading xml - int InitXmlStructure(); + // initialization during loading txt + int BuildDOMFrame(); - void CheckNoteForOldLtml(); + // initialization during loading xml + int InitXmlStructure(); - int InitXmlDocument(Document_t & document); + int InitXmlDocument(Document & document); - int InitXmlParagraph(vector & vecParagraph, - TiXmlElement *paragraphPtr); + int InitXmlParagraph(vector & vecParagraph, + TiXmlElement *paragraphPtr); - int InitXmlSentence(vector &vecSentence, - TiXmlElement *stnsPtr); + int InitXmlSentence(vector &vecSentence, + TiXmlElement 
*stnsPtr); - int InitXmlWord(vector &vecWord, - TiXmlElement *wordPtr); + int InitXmlWord(vector &vecWord, + TiXmlElement *wordPtr); - int InitXmlCoref(Coref &coref); + int InitXmlCoref(Coref &coref); - int InitXmlEntity(vector &vecEntity, - TiXmlElement *entityPtr); + int InitXmlEntity(vector &vecEntity, + TiXmlElement *entityPtr); - int InitXmlMention(vector &vecMention, - TiXmlElement *mentionPtr); + int InitXmlMention(vector &vecMention, + TiXmlElement *mentionPtr); - int GetInfoFromSentence(vector &vecInfo, - int paragraphIdx, - int sentenceIdx, - const char *attrName) const; + int GetInfoFromSentence(vector &vecInfo, + int paragraphIdx, + int sentenceIdx, + const char *attrName) const; - int GetInfoFromSentence(vector &vecInfo, - int sentenceIdx, - const char *attrName) const; + int GetInfoFromSentence(vector &vecInfo, + int sentenceIdx, + const char *attrName) const; - int GetInfoFromSentence(vector &vec, - int paragraphIdx, - int sentenceIdx, - const char* attrName) const; + int GetInfoFromSentence(vector &vec, + int paragraphIdx, + int sentenceIdx, + const char* attrName) const; - int GetInfoFromSentence(vector &vec, - int sentenceIdx, - const char * attrName) const; + int GetInfoFromSentence(vector &vec, + int sentenceIdx, + const char * attrName) const; - int SetInfoToSentence(const vector &vec, - int paragraphIdx, - int sentenceIdx, - const char * attrName); + int SetInfoToSentence(const vector &vec, + int paragraphIdx, + int sentenceIdx, + const char * attrName); - int SetInfoToSentence(const vector &vec, - int sentenceIdx, - const char * attrName); + int SetInfoToSentence(const vector &vec, + int sentenceIdx, + const char * attrName); - int SetInfoToSentence(const vector &vec, - int paragraphIdx, - int sentenceIdx, - const char* attrName); + int SetInfoToSentence(const vector &vec, + int paragraphIdx, + int sentenceIdx, + const char* attrName); - int SetInfoToSentence(const vector &vec, - int sentenceIdx, - const char* attrName); + int 
SetInfoToSentence(const vector &vec, + int sentenceIdx, + const char* attrName); - bool LTMLValidation(); - /*-------------------------------------------*/ + bool LTMLValidation(); + /*-------------------------------------------*/ private: - vector m_vecBegStnsIdxOfPara; - vector m_vecBegWordIdxOfStns; + vector m_vecBegStnsIdxOfPara; + vector m_vecBegWordIdxOfStns; - Document_t m_document; - Note m_note; - Summary m_summary; - TextClass m_textclass; - Coref m_coref; + Document document; + Note note; + Summary summary; + TextClass textclass; + Coref coref; - TiXmlDocument m_tiXmlDoc; + TiXmlDocument m_tiXmlDoc; - /*-------------------------------------------*/ + /*-------------------------------------------*/ private: - static const char * const TAG_DOC; - static const char * const TAG_NOTE; - static const char * const TAG_SUM; - static const char * const TAG_CLASS; - static const char * const TAG_COREF; - static const char * const TAG_COREF_MENT; - static const char * const TAG_COREF_CR; - static const char * const TAG_PARA; - static const char * const TAG_SENT; - static const char * const TAG_WORD; - static const char * const TAG_CONT; //sent, word - static const char * const TAG_POS; - static const char * const TAG_NE; - static const char * const TAG_WSD; - static const char * const TAG_WSD_EXP; - static const char * const TAG_PSR_PARENT; - static const char * const TAG_PSR_RELATE; - static const char * const TAG_SRL_ARG; - static const char * const TAG_SRL_TYPE; - static const char * const TAG_BEGIN; // cr, srl - static const char * const TAG_END; // cr, srl - static const char * const TAG_ID; // para, sent, word + static const char * const TAG_DOC; + static const char * const TAG_NOTE; + static const char * const TAG_SUM; + static const char * const TAG_CLASS; + static const char * const TAG_COREF; + static const char * const TAG_COREF_MENT; + static const char * const TAG_COREF_CR; + static const char * const TAG_PARA; + static const char * const TAG_SENT; + 
static const char * const TAG_WORD; + static const char * const TAG_CONT; //sent, word + static const char * const TAG_POS; + static const char * const TAG_NE; + static const char * const TAG_WSD; + static const char * const TAG_WSD_EXP; + static const char * const TAG_PSR_PARENT; + static const char * const TAG_PSR_RELATE; + static const char * const TAG_SRL_ARG; + static const char * const TAG_SRL_TYPE; + static const char * const TAG_BEGIN; // cr, srl + static const char * const TAG_END; // cr, srl + static const char * const TAG_ID; // para, sent, word }; -#endif // end for __LTP_XML4NLP_H__ +#endif // end for __LTP_XML4NLP_H__ diff --git a/src/_split_sentence/CMakeLists.txt b/src/_split_sentence/CMakeLists.txt deleted file mode 100644 index 978e740a8..000000000 --- a/src/_split_sentence/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -include_directories (./ - ${utils_DIR}) - -SET(splitsnt_source - define.h - Reader.cpp - Reader.h - SentenceIterator.h - SplitSentence.cpp - SplitSentence.h) - -ADD_LIBRARY(splitsnt ${splitsnt_source}) diff --git a/src/_split_sentence/SplitSentence.h b/src/_split_sentence/SplitSentence.h deleted file mode 100644 index f95ee9ceb..000000000 --- a/src/_split_sentence/SplitSentence.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SPLIT_SENTENCE_H__ -#define __SPLIT_SENTENCE_H__ - -#pragma warning(disable: 4786) - -#include -#include - -// return (int)vecSentence.size(); -int SplitSentence( const std::string& strPara, std::vector& vecSentence ); - -#endif //__SPLIT_SENTENCE_H__ - diff --git a/src/ner/CMakeLists.txt b/src/ner/CMakeLists.txt index c1178f859..27403ca40 100644 --- a/src/ner/CMakeLists.txt +++ b/src/ner/CMakeLists.txt @@ -1,8 +1,4 @@ -include_directories ( - ${SOURCE_DIR}/ner - ${SOURCE_DIR}/utils - ${SOURCE_DIR}/utils/math) -# ${THIRDPARTY_DIR}/boost/include) +include_directories (${SOURCE_DIR}/ ${THIRDPARTY_DIR}/boost/include/) set (ner_VERSION "0.0.1") @@ -34,4 +30,4 @@ set_target_properties (otner RUNTIME_OUTPUT_DIRECTORY 
${TOOLS_DIR}/train/) configure_file ( ner_dll.h - ${INCLUDE_OUTPUT_PATH}/ner_dll.h) + ${INCLUDE_OUTPUT_PATH}/ltp/ner_dll.h) diff --git a/src/ner/decoder.cpp b/src/ner/decoder.cpp index 938feadaa..400fd8d41 100644 --- a/src/ner/decoder.cpp +++ b/src/ner/decoder.cpp @@ -1,103 +1,103 @@ -#include "decoder.h" +#include "ner/decoder.h" namespace ltp { namespace ner { void Decoder::decode(Instance * inst) { - init_lattice(inst); - viterbi_decode(inst); - get_result(inst); - free_lattice(); + init_lattice(inst); + viterbi_decode(inst); + get_result(inst); + free_lattice(); } void Decoder::init_lattice(const Instance * inst) { - int len = inst->size(); - lattice.resize(len, L); - lattice = NULL; + int len = inst->size(); + lattice.resize(len, L); + lattice = NULL; } void Decoder::viterbi_decode(const Instance * inst) { - int len = inst->size(); - for (int i = 0; i < len; ++ i) { - for (int l = 0; l < L; ++ l) { - if (i == 0) { - LatticeItem * item = new LatticeItem(i, l, inst->uni_scores[i][l], NULL); - lattice_insert(lattice[i][l], item); - } else { - for (int pl = 0; pl < L; ++ pl) { - if (false == base.legal_trans(pl, l)) { - continue; - } - - double score = 0.; - const LatticeItem * prev = lattice[i-1][pl]; - - if (!prev) { - continue; - } - - // std::cout << i << " " << pl << " " << l << std::endl; - score = inst->uni_scores[i][l] + inst->bi_scores[pl][l] + prev->score; - const LatticeItem * item = new LatticeItem(i, l, score, prev); - lattice_insert(lattice[i][l], item); - } - } // end for if i == 0 + int len = inst->size(); + for (int i = 0; i < len; ++ i) { + for (int l = 0; l < L; ++ l) { + if (i == 0) { + LatticeItem * item = new LatticeItem(i, l, inst->uni_scores[i][l], NULL); + lattice_insert(lattice[i][l], item); + } else { + for (int pl = 0; pl < L; ++ pl) { + if (false == base.legal_trans(pl, l)) { + continue; + } + + double score = 0.; + const LatticeItem * prev = lattice[i-1][pl]; + + if (!prev) { + continue; + } + + // std::cout << i << " " << pl << " " 
<< l << std::endl; + score = inst->uni_scores[i][l] + inst->bi_scores[pl][l] + prev->score; + const LatticeItem * item = new LatticeItem(i, l, score, prev); + lattice_insert(lattice[i][l], item); } + } // end for if i == 0 } + } } void Decoder::get_result(Instance * inst) { - int len = inst->size(); - const LatticeItem * best_item = NULL; - for (int l = 0; l < L; ++ l) { - if (!lattice[len-1][l]) { - continue; - } - if (best_item == NULL || (lattice[len-1][l]->score > best_item->score)) { - best_item = lattice[len - 1][l]; - } + int len = inst->size(); + const LatticeItem * best_item = NULL; + for (int l = 0; l < L; ++ l) { + if (!lattice[len-1][l]) { + continue; } + if (best_item == NULL || (lattice[len-1][l]->score > best_item->score)) { + best_item = lattice[len - 1][l]; + } + } - const LatticeItem * item = best_item; - inst->predicted_tagsidx.resize(len); + const LatticeItem * item = best_item; + inst->predicted_tagsidx.resize(len); - while (item) { - inst->predicted_tagsidx[item->i] = item->l; - // std::cout << item->i << " " << item->l << std::endl; - item = item->prev; - } + while (item) { + inst->predicted_tagsidx[item->i] = item->l; + // std::cout << item->i << " " << item->l << std::endl; + item = item->prev; + } } void Decoder::free_lattice() { - for (int i = 0; i < lattice.nrows(); ++ i) { - for (int j = 0; j < lattice.ncols(); ++ j) { - if (lattice[i][j]) delete lattice[i][j]; - } + for (int i = 0; i < lattice.nrows(); ++ i) { + for (int j = 0; j < lattice.ncols(); ++ j) { + if (lattice[i][j]) delete lattice[i][j]; } + } } /*void KBestDecoder::decode(Instance * inst, KBestDecodeResult & result) { - init_lattice(inst); - kbest_viterbi_decode(inst); - get_result(result); - free_lattice(); + init_lattice(inst); + kbest_viterbi_decode(inst); + get_result(result); + free_lattice(); } void KBestDecoder::init_lattice(const Instance * inst) { - int len = inst->len(); - lattice.resize(len, L); + int len = inst->len(); + lattice.resize(len, L); - for (int i = 0; 
i < len; ++ i) { - for (int l = 0; l < L; ++ l) { - lattice[i][l] = new KHeap(k); - } + for (int i = 0; i < len; ++ i) { + for (int l = 0; l < L; ++ l) { + lattice[i][l] = new KHeap(k); } + } } void KBestDecoder::kbest_viterbi_decode(const Instance * inst) { }*/ -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp diff --git a/src/ner/decoder.h b/src/ner/decoder.h index e18bb166d..5ce0355a9 100644 --- a/src/ner/decoder.h +++ b/src/ner/decoder.h @@ -3,9 +3,9 @@ #include #include -#include "instance.h" -#include "mat.h" -#include "rulebase.h" +#include "ner/instance.h" +#include "ner/rulebase.h" +#include "utils/math/mat.h" namespace ltp { namespace ner { diff --git a/src/ner/extractor.cpp b/src/ner/extractor.cpp index 69fb69d6f..dd875811a 100644 --- a/src/ner/extractor.cpp +++ b/src/ner/extractor.cpp @@ -1,8 +1,7 @@ -#include "extractor.h" -#include "settings.h" - -#include "strutils.hpp" -#include "chartypes.hpp" +#include "ner/extractor.h" +#include "ner/settings.h" +#include "utils/strutils.hpp" +#include "utils/chartypes.hpp" namespace ltp { namespace ner { diff --git a/src/ner/extractor.h b/src/ner/extractor.h index e2590d78a..8c05963b0 100644 --- a/src/ner/extractor.h +++ b/src/ner/extractor.h @@ -3,10 +3,9 @@ #include #include -#include "instance.h" - -#include "template.hpp" -#include "strvec.hpp" +#include "ner/instance.h" +#include "utils/template.hpp" +#include "utils/strvec.hpp" namespace ltp { namespace ner { diff --git a/src/ner/featurespace.cpp b/src/ner/featurespace.cpp index b116cf5ab..9e867b362 100644 --- a/src/ner/featurespace.cpp +++ b/src/ner/featurespace.cpp @@ -1,6 +1,5 @@ -#include "featurespace.h" - -#include "extractor.h" +#include "ner/featurespace.h" +#include "ner/extractor.h" namespace ltp { namespace ner { diff --git a/src/ner/featurespace.h b/src/ner/featurespace.h index dd2a782d9..8eff642df 100644 --- a/src/ner/featurespace.h +++ b/src/ner/featurespace.h @@ -4,120 +4,122 @@ 
#include #include -#include "smartmap.hpp" +#include "utils/smartmap.hpp" namespace ltp { namespace ner { class FeatureSpaceIterator { public: - FeatureSpaceIterator() : - _dicts(NULL), - _i(0), - _state(0) { - // should be careful about the empty dicts + FeatureSpaceIterator() + : _dicts(NULL), + _num_dicts(-1), + _i(0), + _state(0) { + // should be careful about the empty dicts + } + + // initialize the iterator with dicts and number of dicts + FeatureSpaceIterator(utility::SmartMap * dicts, int num_dicts) + : _dicts(dicts), + _num_dicts(num_dicts), + _i(0), + _state(0) { + ++ (*this); + } + + ~FeatureSpaceIterator() { + } + + const char * key() { return _j.key(); } + int id() { return (*_j.value()); } + int tid() { return _i; } + + bool operator ==(const FeatureSpaceIterator & other) const { + return ((_dicts + _i) == other._dicts); + } + + bool operator !=(const FeatureSpaceIterator & other) const { + return ((_dicts + _i) != other._dicts); + } + + FeatureSpaceIterator & operator = (const FeatureSpaceIterator & other) { + if (this != &other) { + _dicts = other._dicts; + _i = other._i; + _state = other._state; + _num_dicts = other._num_dicts; } - // initialize the iterator with dicts and number of dicts - FeatureSpaceIterator(utility::SmartMap * dicts, int num_dicts) : - _dicts(dicts), - _num_dicts(num_dicts), - _i(0), - _state(0) { - ++ (*this); - } - - ~FeatureSpaceIterator() { - } - - const char * key() { return _j.key(); } - int id() { return (*_j.value()); } - int tid() { return _i; } - - bool operator ==(const FeatureSpaceIterator & other) const { - return ((_dicts + _i) == other._dicts); - } - - bool operator !=(const FeatureSpaceIterator & other) const { - return ((_dicts + _i) != other._dicts); - } - - FeatureSpaceIterator & operator = (const FeatureSpaceIterator & other) { - if (this != &other) { - _dicts = other._dicts; - _i = other._i; - _state = other._state; - } - - return *this; - } - - void operator ++() { - switch (_state) { - case 0: - for (_i 
= 0; ; ++ _i) { - if (_dicts[_i].begin() == _dicts[_i].end()) { - _state = 1; - return; - } - for (_j = _dicts[_i].begin(); _j != _dicts[_i].end(); ++ _j) { - _state = 1; - return; - case 1:; - } - } + return *this; + } + + void operator ++() { + switch (_state) { + case 0: + for (_i = 0; ; ++ _i) { + if (_dicts[_i].begin() == _dicts[_i].end()) { + _state = 1; + return; + } + for (_j = _dicts[_i].begin(); _j != _dicts[_i].end(); ++ _j) { + _state = 1; + return; + case 1:; + } } } + } - int _i; - int _state; - int _num_dicts; - utility::SmartMap::const_iterator _j; - utility::SmartMap * _dicts; + int _i; + int _state; + int _num_dicts; + utility::SmartMap::const_iterator _j; + utility::SmartMap * _dicts; }; class FeatureSpace { public: - FeatureSpace(int num_labels = 1); - ~FeatureSpace(); - - int retrieve(int tid, const char * key, bool create); - int index(int tid, const char * key, int lid = 0); - int index(int prev_lid, int lid); - int num_features(); - int dim(); - void set_num_labels(int num_labeles); - - /* - * dump the feature space to a output stream - * - * @param[in] ofs the output stream - */ - void dump(std::ostream & ofs); - - /* - * load the feature space from a input stream - * - * @param[in] num_labels the number of labels - * @param[in] ifs the input stream - */ - bool load(int num_labeles, std::istream & ifs); - - FeatureSpaceIterator begin() { - return FeatureSpaceIterator(dicts, _num_dicts); - } - - FeatureSpaceIterator end() { - return FeatureSpaceIterator(dicts + _num_dicts, _num_dicts); - } + FeatureSpace(int num_labels = 1); + ~FeatureSpace(); + + int retrieve(int tid, const char * key, bool create); + int index(int tid, const char * key, int lid = 0); + int index(int prev_lid, int lid); + int num_features(); + int dim(); + void set_num_labels(int num_labeles); + + /* + * dump the feature space to a output stream + * + * @param[in] ofs the output stream + */ + void dump(std::ostream & ofs); + + /* + * load the feature space from a input 
stream + * + * @param[in] num_labels the number of labels + * @param[in] ifs the input stream + */ + bool load(int num_labeles, std::istream & ifs); + + FeatureSpaceIterator begin() { + return FeatureSpaceIterator(dicts, _num_dicts); + } + + FeatureSpaceIterator end() { + return FeatureSpaceIterator(dicts + _num_dicts, _num_dicts); + } private: - int _offset; - int _num_labels; - int _num_dicts; - utility::SmartMap * dicts; + int _offset; + int _num_labels; + int _num_dicts; + utility::SmartMap * dicts; }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_FEATURE_SPACE_H__ diff --git a/src/ner/featurevec.h b/src/ner/featurevec.h deleted file mode 100644 index 96574fb97..000000000 --- a/src/ner/featurevec.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __LTP_NER_FEATURE_VECTOR_H__ -#define __LTP_NER_FEATURE_VECTOR_H__ - -namespace ltp { -namespace ner { - -struct FeatureVector { -public: - FeatureVector () : n(0), idx(0), val(0) { - } - - ~FeatureVector() { - } - - void clear() { - if (idx) { - delete [](idx); - idx = 0; - } - - if (val) { - delete [](val); - val = 0; - } - } - -public: - int n; - int * idx; - double * val; - int loff; -}; - -} // end for namespace segmentor -} // end for namespace ltp - -#endif // end for __LTP_SEGMENTOR_FEATRUE_VECTOR_H__ diff --git a/src/ner/instance.h b/src/ner/instance.h index 7df9423d9..683f22540 100644 --- a/src/ner/instance.h +++ b/src/ner/instance.h @@ -2,178 +2,177 @@ #define __LTP_NER_INSTANCE_H__ #include -#include "featurevec.h" -#include "mat.h" -#include "sparsevec.h" +#include "utils/math/mat.h" +#include "utils/math/featurevec.h" +#include "utils/math/sparsevec.h" namespace ltp { namespace ner { class Instance { public: - Instance() {} + Instance() {} - ~Instance() { - cleanup(); + ~Instance() { + cleanup(); + } + + inline size_t size() const { + return forms.size(); + } + + int num_errors() { + int len = size(); + if ((len != 
tagsidx.size()) || (len != predicted_tagsidx.size())) { + return len; } - inline size_t size() const { - return forms.size(); + int ret = 0; + for (int i = 0; i < len; ++ i) { + if (tagsidx[i] != predicted_tagsidx[i]) { + ++ ret; + } } - int num_errors() { - int len = size(); - if ((len != tagsidx.size()) || (len != predicted_tagsidx.size())) { - return len; - } + return ret; + } - int ret = 0; - for (int i = 0; i < len; ++ i) { - if (tagsidx[i] != predicted_tagsidx[i]) { - ++ ret; - } - } + int num_corrected_predicted_tags() { + int len = size(); + int ret = 0; - return ret; + for (int i = 0; i < len; ++ i) { + if (tagsidx[i] == predicted_tagsidx[i]) { + ++ ret; + } } - int num_corrected_predicted_tags() { - int len = size(); - int ret = 0; + return ret; + } - for (int i = 0; i < len; ++ i) { - if (tagsidx[i] == predicted_tagsidx[i]) { - ++ ret; - } - } + int num_gold_entities() { + int ret = 0; + if (entities.size() == 0) { + return size(); + } - return ret; + for (int i = 0; i < entities_tags.size(); ++ i) { + if (entities_tags[i] != "O") { + ++ ret; + } } - int num_gold_entities() { - int ret = 0; - if (entities.size() == 0) { - return size(); - } + return ret; + } - for (int i = 0; i < entities_tags.size(); ++ i) { - if (entities_tags[i] != "O") { - ++ ret; - } - } + int num_predicted_entities() { + int ret = 0; + if (predicted_entities.size() == 0) { + return size(); + } - return ret; + for (int i = 0; i < predicted_entities_tags.size(); ++ i) { + if (predicted_entities_tags[i] != "O") { + ++ ret; + } } - int num_predicted_entities() { - int ret = 0; - if (predicted_entities.size() == 0) { - return size(); - } + return ret; + } - for (int i = 0; i < predicted_entities_tags.size(); ++ i) { - if (predicted_entities_tags[i] != "O") { - ++ ret; - } - } + int num_recalled_entites() { + int len = 0; + int ret = 0; + int gold_len = 0, predicted_len = 0; - return ret; + for (int i = 0; i < entities.size(); ++ i) { + len += entities[i].size(); } - int 
num_recalled_entites() { - int len = 0; - int ret = 0; - int gold_len = 0, predicted_len = 0; - - for (int i = 0; i < entities.size(); ++ i) { - len += entities[i].size(); + for (int i = 0, j = 0; i < entities.size() && j < predicted_entities.size(); ) { + if ((entities[i] == predicted_entities[j]) && + (entities_tags[i] == predicted_entities_tags[j])) { + if (entities_tags[i] != "O") { + ++ ret; } - for (int i = 0, j = 0; i < entities.size() && j < predicted_entities.size(); ) { - if ((entities[i] == predicted_entities[j]) && - (entities_tags[i] == predicted_entities_tags[j])) { - if (entities_tags[i] != "O") { - ++ ret; - } - - gold_len += entities[i].size(); - predicted_len += predicted_entities[j].size(); - - ++ i; - ++ j; - } else { - gold_len += entities[i].size(); - predicted_len += predicted_entities[j].size(); - - ++ i; - ++ j; - - while (gold_len < len && predicted_len < len) { - if (gold_len < predicted_len) { - gold_len += entities[i].size(); - ++ i; - } else if (gold_len > predicted_len) { - predicted_len += predicted_entities[j].size(); - ++ j; - } else { - break; - } - } - } + gold_len += entities[i].size(); + predicted_len += predicted_entities[j].size(); + + ++ i; + ++ j; + } else { + gold_len += entities[i].size(); + predicted_len += predicted_entities[j].size(); + + ++ i; + ++ j; + + while (gold_len < len && predicted_len < len) { + if (gold_len < predicted_len) { + gold_len += entities[i].size(); + ++ i; + } else if (gold_len > predicted_len) { + predicted_len += predicted_entities[j].size(); + ++ j; + } else { + break; + } } - - return ret; + } } - int cleanup() { - int len = 0; - if ((len = uni_features.total_size()) > 0) { - int d1 = uni_features.nrows(); - int d2 = uni_features.ncols(); - - for (int i = 0; i < d1; ++ i) { - if (uni_features[i][0]) { - uni_features[i][0]->clear(); - } - for (int j = 0; j < d2; ++ j) { - if (uni_features[i][j]) { - delete uni_features[i][j]; - } - } - } + return ret; + } + + int cleanup() { + if 
(uni_features.total_size() > 0) { + int d1 = uni_features.nrows(); + int d2 = uni_features.ncols(); + + for (int i = 0; i < d1; ++ i) { + if (uni_features[i][0]) { + uni_features[i][0]->clear(); + } + for (int j = 0; j < d2; ++ j) { + if (uni_features[i][j]) { + delete uni_features[i][j]; + } } + } + } - uni_features.dealloc(); - uni_scores.dealloc(); - bi_scores.dealloc(); + uni_features.dealloc(); + uni_scores.dealloc(); + bi_scores.dealloc(); - features.zero(); - predicted_features.zero(); + features.zero(); + predicted_features.zero(); - return 0; - } + return 0; + } public: - std::vector< std::string > raw_forms; - std::vector< std::string > forms; - std::vector< std::string > postags; - std::vector< std::string > tags; - std::vector< int > tagsidx; - std::vector< std::string > predicted_tags; - std::vector< int > predicted_tagsidx; - std::vector< std::string > entities; - std::vector< std::string > entities_tags; - std::vector< std::string > predicted_entities; - std::vector< std::string > predicted_entities_tags; - - math::SparseVec features; /*< the gold features */ - math::SparseVec predicted_features; /*< the predicted features */ - - math::Mat< FeatureVector *> uni_features; - math::Mat< double > uni_scores; - math::Mat< double > bi_scores; + std::vector< std::string > raw_forms; + std::vector< std::string > forms; + std::vector< std::string > postags; + std::vector< std::string > tags; + std::vector< int > tagsidx; + std::vector< std::string > predicted_tags; + std::vector< int > predicted_tagsidx; + std::vector< std::string > entities; + std::vector< std::string > entities_tags; + std::vector< std::string > predicted_entities; + std::vector< std::string > predicted_entities_tags; + + math::SparseVec features; /*< the gold features */ + math::SparseVec predicted_features; /*< the predicted features */ + + math::Mat< math::FeatureVector *> uni_features; + math::Mat< double > uni_scores; + math::Mat< double > bi_scores; }; -} // end for namespace ner -} 
// end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_INSTANCE_H__ diff --git a/src/ner/model.cpp b/src/ner/model.cpp index eb9bf8599..ecff88906 100644 --- a/src/ner/model.cpp +++ b/src/ner/model.cpp @@ -1,4 +1,4 @@ -#include "model.h" +#include "ner/model.h" namespace ltp { namespace ner { @@ -9,77 +9,78 @@ Model::Model() { Model::~Model() { } -void Model::save(std::ostream & ofs) { - // write a signature into the file - char chunk[16] = {'o','t','c','w','s', '\0'}; - ofs.write(chunk, 16); +void +Model::save(std::ostream & ofs) { + // write a signature into the file + char chunk[16] = {'o','t','n','e','r', '\0'}; + ofs.write(chunk, 16); - int off = ofs.tellp(); + int off = ofs.tellp(); - unsigned labels_offset = 0; - unsigned lexicon_offset = 0; - unsigned feature_offset = 0; - unsigned parameter_offset = 0; + unsigned labels_offset = 0; + unsigned lexicon_offset = 0; + unsigned feature_offset = 0; + unsigned parameter_offset = 0; - write_uint(ofs, 0); // the label offset - write_uint(ofs, 0); // the cluster lexicon offset - write_uint(ofs, 0); // the features offset - write_uint(ofs, 0); // the parameter offset + write_uint(ofs, 0); // the label offset + write_uint(ofs, 0); // the cluster lexicon offset + write_uint(ofs, 0); // the features offset + write_uint(ofs, 0); // the parameter offset - labels_offset = ofs.tellp(); - labels.dump(ofs); + labels_offset = ofs.tellp(); + labels.dump(ofs); - lexicon_offset = ofs.tellp(); - cluster_lexicon.dump(ofs); + lexicon_offset = ofs.tellp(); + cluster_lexicon.dump(ofs); - feature_offset = ofs.tellp(); - space.dump(ofs); + feature_offset = ofs.tellp(); + space.dump(ofs); - parameter_offset = ofs.tellp(); - param.dump(ofs); + parameter_offset = ofs.tellp(); + param.dump(ofs); - ofs.seekp(off); - write_uint(ofs, labels_offset); - write_uint(ofs, lexicon_offset); - write_uint(ofs, feature_offset); - write_uint(ofs, parameter_offset); + ofs.seekp(off); + write_uint(ofs, 
labels_offset); + write_uint(ofs, lexicon_offset); + write_uint(ofs, feature_offset); + write_uint(ofs, parameter_offset); } bool Model::load(std::istream & ifs) { - char chunk[16]; - ifs.read(chunk, 16); - - if (strcmp(chunk, "otcws")) { - return false; - } - - unsigned labels_offset = read_uint(ifs); - unsigned lexicon_offset = read_uint(ifs); - unsigned feature_offset = read_uint(ifs); - unsigned parameter_offset = read_uint(ifs); - - ifs.seekg(labels_offset); - if (!labels.load(ifs)) { - return false; - } - - ifs.seekg(lexicon_offset); - if (!cluster_lexicon.load(ifs)) { - return false; - } - - ifs.seekg(feature_offset); - if (!space.load(labels.size(), ifs)) { - return false; - } - - ifs.seekg(parameter_offset); - if (!param.load(ifs)) { - return false; - } - - return true; + char chunk[16]; + ifs.read(chunk, 16); + + if (strcmp(chunk, "otner")) { + return false; + } + + unsigned labels_offset = read_uint(ifs); + unsigned lexicon_offset = read_uint(ifs); + unsigned feature_offset = read_uint(ifs); + unsigned parameter_offset = read_uint(ifs); + + ifs.seekg(labels_offset); + if (!labels.load(ifs)) { + return false; + } + + ifs.seekg(lexicon_offset); + if (!cluster_lexicon.load(ifs)) { + return false; + } + + ifs.seekg(feature_offset); + if (!space.load(labels.size(), ifs)) { + return false; + } + + ifs.seekg(parameter_offset); + if (!param.load(ifs)) { + return false; + } + + return true; } -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp diff --git a/src/ner/model.h b/src/ner/model.h index 7534a1814..7084889c8 100644 --- a/src/ner/model.h +++ b/src/ner/model.h @@ -1,10 +1,9 @@ #ifndef __LTP_NER_MODEL_H__ #define __LTP_NER_MODEL_H__ -#include "featurespace.h" -#include "parameter.h" - -#include "smartmap.hpp" +#include "ner/featurespace.h" +#include "ner/parameter.h" +#include "utils/smartmap.hpp" namespace ltp { namespace ner { @@ -13,50 +12,50 @@ using namespace ltp::utility; class Model { public: 
- Model(); - ~Model(); - - /* - * get number of labels; - * - * @return int the number of labels - */ - inline int num_labels(void) { - return labels.size(); - } - - /* - * save the model to a output stream - * - * @param[out] ofs the output stream - */ - void save(std::ostream & ofs); - - /* - * load the model from an input stream - * - * @param[in] ifs the input stream - */ - bool load(std::istream & ifs); + Model(); + ~Model(); + + /* + * get number of labels; + * + * @return int the number of labels + */ + inline int num_labels(void) { + return labels.size(); + } + + /* + * save the model to a output stream + * + * @param[out] ofs the output stream + */ + void save(std::ostream & ofs); + + /* + * load the model from an input stream + * + * @param[in] ifs the input stream + */ + bool load(std::istream & ifs); public: - IndexableSmartMap labels; - FeatureSpace space; - Parameters param; + IndexableSmartMap labels; + FeatureSpace space; + Parameters param; - SmartMap cluster_lexicon; + SmartMap cluster_lexicon; private: - void write_uint(std::ostream & out, unsigned int val) { - out.write(reinterpret_cast(&val), sizeof(unsigned int)); - } - - unsigned int read_uint(std::istream & in) { - char p[4]; - in.read(reinterpret_cast(p), sizeof(unsigned int)); - return *reinterpret_cast(p); - } + void write_uint(std::ostream & out, unsigned int val) { + out.write(reinterpret_cast(&val), sizeof(unsigned int)); + } + + unsigned int read_uint(std::istream & in) { + char p[4]; + in.read(reinterpret_cast(p), sizeof(unsigned int)); + return *reinterpret_cast(p); + } }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_MODEL_H__ diff --git a/src/ner/ner.cpp b/src/ner/ner.cpp index 40e4ba5fc..660c51cac 100644 --- a/src/ner/ner.cpp +++ b/src/ner/ner.cpp @@ -1,12 +1,11 @@ #include "ner.h" -#include "time.hpp" -#include "logging.hpp" -#include "instance.h" -#include "extractor.h" -#include 
"options.h" - -#include "nerio.h" +#include "utils/time.hpp" +#include "utils/logging.hpp" +#include "ner/instance.h" +#include "ner/extractor.h" +#include "ner/options.h" +#include "ner/nerio.h" #include #include @@ -20,723 +19,734 @@ namespace ltp { namespace ner { -NER::NER() : - model(0), - decoder(0) { +NER::NER() + : model(0), + decoder(0), + __TRAIN__(false), + __TEST__(false), + __DUMP__(false) { } -NER::NER(ltp::utility::ConfigParser & cfg) : - model(0), - decoder(0) { - parse_cfg(cfg); +NER::NER(ltp::utility::ConfigParser & cfg) + : model(0), + decoder(0), + __TRAIN__(false), + __TEST__(false), + __DUMP__(false) { + parse_cfg(cfg); } NER::~NER() { - if (model) { - delete model; - } + if (model) { + delete model; + } - if (decoder) { - delete decoder; - } + if (decoder) { + delete decoder; + } } -void NER::run(void) { - if (__TRAIN__) { - train(); - } +void +NER::run(void) { + if (__TRAIN__) { + train(); + } - if (__TEST__) { - test(); - } + if (__TEST__) { + test(); + } - if (__DUMP__) { - dump(); - } + if (__DUMP__) { + dump(); + } - for (int i = 0; i < train_dat.size(); ++ i) { - if (train_dat[i]) { - delete train_dat[i]; - } + for (int i = 0; i < train_dat.size(); ++ i) { + if (train_dat[i]) { + delete train_dat[i]; } + } } -bool NER::parse_cfg(ltp::utility::ConfigParser & cfg) { - std::string strbuf; - int intbuf; - - __TRAIN__ = false; +bool +NER::parse_cfg(ltp::utility::ConfigParser & cfg) { + std::string strbuf; + int intbuf; - train_opt.train_file = ""; - train_opt.holdout_file = ""; - train_opt.algorithm = "pa"; - train_opt.model_name = ""; - train_opt.max_iter = 10; - train_opt.display_interval = 5000; - if (cfg.has_section("train")) { - TRACE_LOG("Training mode specified."); - __TRAIN__ = true; + train_opt.train_file = ""; + train_opt.holdout_file = ""; + train_opt.algorithm = "pa"; + train_opt.model_name = ""; + train_opt.max_iter = 10; + train_opt.display_interval = 5000; - if (cfg.get("train", "train-file", strbuf)) { - train_opt.train_file 
= strbuf; - } else { - ERROR_LOG("train-file config item is not found."); - return false; - } - - if (cfg.get("train", "holdout-file", strbuf)) { - train_opt.holdout_file = strbuf; - } else { - ERROR_LOG("holdout-file config item is not found."); - return false; - } + if (cfg.has_section("train")) { + TRACE_LOG("Training mode specified."); + __TRAIN__ = true; - if (cfg.get("train", "algorithm", strbuf)) { - train_opt.algorithm = strbuf; - } else { - WARNING_LOG("algorithm is not configed, [PA] is set as default"); - } + if (cfg.get("train", "train-file", strbuf)) { + train_opt.train_file = strbuf; + } else { + ERROR_LOG("train-file config item is not found."); + return false; + } - train_opt.model_name = train_opt.train_file + "." + train_opt.algorithm; - if (cfg.get("train", "model-name", strbuf)) { - train_opt.model_name = strbuf; - } else { - WARNING_LOG("model name is not configed, [%s] is set as default", - train_opt.model_name.c_str()); - } + if (cfg.get("train", "holdout-file", strbuf)) { + train_opt.holdout_file = strbuf; + } else { + ERROR_LOG("holdout-file config item is not found."); + return false; + } - if (cfg.get_integer("train", "max-iter", intbuf)) { - train_opt.max_iter = intbuf; - } else { - WARNING_LOG("max-iter is not configed, [10] is set as default."); - } + if (cfg.get("train", "algorithm", strbuf)) { + train_opt.algorithm = strbuf; + } else { + WARNING_LOG("algorithm is not configed, [PA] is set as default"); } - __TEST__ = false; + train_opt.model_name = train_opt.train_file + "." 
+ train_opt.algorithm; + if (cfg.get("train", "model-name", strbuf)) { + train_opt.model_name = strbuf; + } else { + WARNING_LOG("model name is not configed, [%s] is set as default", + train_opt.model_name.c_str()); + } - test_opt.test_file = ""; - test_opt.model_file = ""; - test_opt.lexicon_file = ""; + if (cfg.get_integer("train", "max-iter", intbuf)) { + train_opt.max_iter = intbuf; + } else { + WARNING_LOG("max-iter is not configed, [10] is set as default."); + } + } - if (cfg.has_section("test")) { - __TEST__ = true; + test_opt.test_file = ""; + test_opt.model_file = ""; + test_opt.lexicon_file = ""; - if (cfg.get("test", "test-file", strbuf)) { - test_opt.test_file = strbuf; - } else { - ERROR_LOG("test-file config item is not set."); - return false; - } + if (cfg.has_section("test")) { + __TEST__ = true; - if (cfg.get("test", "model-file", strbuf)) { - test_opt.model_file = strbuf; - } else { - ERROR_LOG("model-file is not configed. "); - return false; - } + if (cfg.get("test", "test-file", strbuf)) { + test_opt.test_file = strbuf; + } else { + ERROR_LOG("test-file config item is not set."); + return false; + } - if (cfg.get("test", "lexicon-file", strbuf)) { - test_opt.lexicon_file = strbuf; - } + if (cfg.get("test", "model-file", strbuf)) { + test_opt.model_file = strbuf; + } else { + ERROR_LOG("model-file is not configed. 
"); + return false; } - __DUMP__ = false; + if (cfg.get("test", "lexicon-file", strbuf)) { + test_opt.lexicon_file = strbuf; + } + } - dump_opt.model_file = ""; - if (cfg.has_section("dump")) { - __DUMP__ = true; + dump_opt.model_file = ""; + if (cfg.has_section("dump")) { + __DUMP__ = true; - if (cfg.get("dump", "model-file", strbuf)) { - dump_opt.model_file = strbuf; - } else { - ERROR_LOG("model-file is not configed."); - return false; - } + if (cfg.get("dump", "model-file", strbuf)) { + dump_opt.model_file = strbuf; + } else { + ERROR_LOG("model-file is not configed."); + return false; } + } - return true; + return true; } -bool NER::read_instance(const char * train_file) { - std::ifstream ifs(train_file); +bool +NER::read_instance(const char * train_file) { + std::ifstream ifs(train_file); - if (!ifs) { - return false; - } + if (!ifs) { + return false; + } - NERReader reader(ifs, true); - train_dat.clear(); + NERReader reader(ifs, true); + train_dat.clear(); - Instance * inst = NULL; + Instance * inst = NULL; - while ((inst = reader.next())) { - train_dat.push_back(inst); - } + while ((inst = reader.next())) { + train_dat.push_back(inst); + } - return true; + return true; } void NER::build_configuration(void) { - // tag set is some kind of hard coded into the source - - std::stringstream S; - for (int i = 0; i < __num_pos_types__; ++ i) { - for (int j = 0; j < __num_ne_types__; ++ j) { - S.str(std::string()); - S << __pos_types__[i] << "-" << __ne_types__[j]; - model->labels.push(S.str()); - } + // tag set is some kind of hard coded into the source + + std::stringstream S; + for (int i = 0; i < __num_pos_types__; ++ i) { + for (int j = 0; j < __num_ne_types__; ++ j) { + S.str(std::string()); + S << __pos_types__[i] << "-" << __ne_types__[j]; + model->labels.push(S.str()); } - model->labels.push("O"); + } + model->labels.push("O"); - for (int i = 0; i < train_dat.size(); ++ i) { - Instance * inst = train_dat[i]; - int len = inst->size(); + for (int i = 0; i < 
train_dat.size(); ++ i) { + Instance * inst = train_dat[i]; + int len = inst->size(); - inst->tagsidx.resize(len); - for (int j = 0; j < len; ++ j) { - // build labels dictionary - inst->tagsidx[j] = model->labels.index( inst->tags[j] ); - } + inst->tagsidx.resize(len); + for (int j = 0; j < len; ++ j) { + // build labels dictionary + inst->tagsidx[j] = model->labels.index( inst->tags[j] ); } + } } void NER::extract_features(Instance * inst, bool create) { - const int N = Extractor::num_templates(); - const int L = model->num_labels(); - - vector< StringVec > cache; - vector< int > cache_again; + const int N = Extractor::num_templates(); + const int L = model->num_labels(); - cache.resize(N); - int len = inst->size(); - - // allocate the uni_features - inst->uni_features.resize(len, L); inst->uni_features = 0; - inst->uni_scores.resize(len, L); inst->uni_scores = NEG_INF; - inst->bi_scores.resize(L, L); inst->bi_scores = NEG_INF; + vector< StringVec > cache; + vector< int > cache_again; - for (int pos = 0; pos < len; ++ pos) { - for (int n = 0; n < N; ++ n) { - cache[n].clear(); - } - cache_again.clear(); + cache.resize(N); + int len = inst->size(); - Extractor::extract1o(inst, pos, cache); + // allocate the uni_features + inst->uni_features.resize(len, L); inst->uni_features = 0; + inst->uni_scores.resize(len, L); inst->uni_scores = NEG_INF; + inst->bi_scores.resize(L, L); inst->bi_scores = NEG_INF; - for (int tid = 0; tid < cache.size(); ++ tid) { - for (int itx = 0; itx < cache[tid].size(); ++ itx) { - if (create) { - model->space.retrieve(tid, cache[tid][itx], true); - } + for (int pos = 0; pos < len; ++ pos) { + for (int n = 0; n < N; ++ n) { + cache[n].clear(); + } + cache_again.clear(); - int idx = model->space.index(tid, cache[tid][itx]); + Extractor::extract1o(inst, pos, cache); - if (idx >= 0) { - cache_again.push_back(idx); - } - } + for (int tid = 0; tid < cache.size(); ++ tid) { + for (int itx = 0; itx < cache[tid].size(); ++ itx) { + if (create) { + 
model->space.retrieve(tid, cache[tid][itx], true); } - int num_feat = cache_again.size(); - - if (num_feat > 0) { - int l = 0; - int * idx = new int[num_feat]; - for (int j = 0; j < num_feat; ++ j) { - idx[j] = cache_again[j]; - } - - inst->uni_features[pos][l] = new FeatureVector; - inst->uni_features[pos][l]->n = num_feat; - inst->uni_features[pos][l]->val = 0; - inst->uni_features[pos][l]->loff = 0; - inst->uni_features[pos][l]->idx = idx; - - for (l = 1; l < L; ++ l) { - inst->uni_features[pos][l] = new FeatureVector; - inst->uni_features[pos][l]->n = num_feat; - inst->uni_features[pos][l]->idx = idx; - inst->uni_features[pos][l]->val = 0; - inst->uni_features[pos][l]->loff = l; - } - } - } -} + int idx = model->space.index(tid, cache[tid][itx]); -void NER::build_feature_space(void) { - // build feature space, it a wrapper for - // featurespace.build_feature_space - int N = Extractor::num_templates(); - int L = model->num_labels(); - model->space.set_num_labels(L); - - for (int i = 0; i < train_dat.size(); ++ i) { - extract_features(train_dat[i], true); - if ((i + 1) % train_opt.display_interval == 0) { - TRACE_LOG("[%d] instances is extracted.", (i+1)); + if (idx >= 0) { + cache_again.push_back(idx); } + } } -} -void NER::build_entities(Instance * inst, - const std::vector & tagsidx, - std::vector & entities, - std::vector & entities_tags, - int beg_tag0, - int beg_tag1, - int beg_tag2) { - entities.clear(); - entities_tags.clear(); - - std::string entity = ""; - std::string entity_tag = ""; - int len = inst->size(); - int tag = -1; - int tag_prefix = -1; - int tag_suffix = -1; - - // should check the tagsidx size - entity = inst->raw_forms[0]; - - tag = inst->tagsidx[0]; - tag_suffix = tag % __num_ne_types__; - entity_tag = (tag == 12 ? 
"O" : __ne_types__[tag_suffix]); - for (int i = 1; i < len; ++ i) { - tag = tagsidx[i]; - - tag_prefix = tag / __num_ne_types__; - tag_suffix = (tag % __num_ne_types__); - - if (tag_prefix == beg_tag0 || tag_prefix == beg_tag1 || tag_prefix == beg_tag2) { - entities.push_back(entity); - entities_tags.push_back(entity_tag); - - entity = inst->raw_forms[i]; - entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); - } else { - entity += inst->raw_forms[i]; - } + int num_feat = cache_again.size(); + + if (num_feat > 0) { + int l = 0; + int * idx = new int[num_feat]; + for (int j = 0; j < num_feat; ++ j) { + idx[j] = cache_again[j]; + } + + inst->uni_features[pos][l] = new FeatureVector; + inst->uni_features[pos][l]->n = num_feat; + inst->uni_features[pos][l]->val = 0; + inst->uni_features[pos][l]->loff = 0; + inst->uni_features[pos][l]->idx = idx; + + for (l = 1; l < L; ++ l) { + inst->uni_features[pos][l] = new FeatureVector; + inst->uni_features[pos][l]->n = num_feat; + inst->uni_features[pos][l]->idx = idx; + inst->uni_features[pos][l]->val = 0; + inst->uni_features[pos][l]->loff = l; + } } - entities.push_back(entity); - entities_tags.push_back(entity_tag); + } } -void NER::calculate_scores(Instance * inst, bool use_avg) { - int len = inst->size(); - int L = model->num_labels(); - for (int i = 0; i < len; ++ i) { - for (int l = 0; l < L; ++ l) { - FeatureVector * fv = inst->uni_features[i][l]; - if (!fv) { - continue; - } +void NER::build_feature_space(void) { + // build feature space, it a wrapper for + // featurespace.build_feature_space + Extractor::num_templates(); - inst->uni_scores[i][l] = model->param.dot(inst->uni_features[i][l], use_avg); - } - } + int L = model->num_labels(); + model->space.set_num_labels(L); - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int idx = model->space.index(pl, l); - inst->bi_scores[pl][l] = model->param.dot(idx, use_avg); - } + for (int i = 0; i < train_dat.size(); ++ i) { + 
extract_features(train_dat[i], true); + if ((i + 1) % train_opt.display_interval == 0) { + TRACE_LOG("[%d] instances is extracted.", (i+1)); } + } } -void NER::collect_features(Instance * inst, const std::vector & tagsidx, math::SparseVec & vec) { - int len = inst->size(); +void +NER::build_entities(Instance * inst, + const std::vector & tagsidx, + std::vector & entities, + std::vector & entities_tags, + int beg_tag0, + int beg_tag1, + int beg_tag2) { + entities.clear(); + entities_tags.clear(); - vec.zero(); - for (int i = 0; i < len; ++ i) { - int l = tagsidx[i]; - const FeatureVector * fv = inst->uni_features[i][l]; + int len = inst->size(); - if (!fv) { - continue; - } + // should check the tagsidx size + std::string entity = inst->raw_forms[0]; - vec.add(fv->idx, fv->val, fv->n, fv->loff, 1.); + int tag = inst->tagsidx[0]; + int tag_prefix = -1; + int tag_suffix = tag % __num_ne_types__; - if (i > 0) { - int prev_lid = tagsidx[i-1]; - int idx = model->space.index(prev_lid, l); - vec.add(idx, 1.); - } - } -} - -Model * NER::truncate(void) { - Model * new_model = new Model; - // copy the label indexable map to the new model - for (int i = 0; i < model->labels.size(); ++ i) { - const char * key = model->labels.at(i); - new_model->labels.push(key); - } + std::string entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); + for (int i = 1; i < len; ++ i) { + tag = tagsidx[i]; - TRACE_LOG("building labels map is done"); - - int L = new_model->num_labels(); - new_model->space.set_num_labels(L); - - // iterate over the feature space and see if the parameter value equals to zero - for (FeatureSpaceIterator itx = model->space.begin(); - itx != model->space.end(); - ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - int id = model->space.index(tid, key); - - bool flag = false; - for (int l = 0; l < L; ++ l) { - double p = model->param.dot(id + l); - if (p != 0.) 
{ - flag = true; - } - } + tag_prefix = tag / __num_ne_types__; + tag_suffix = (tag % __num_ne_types__); - if (!flag) { - continue; - } + if (tag_prefix == beg_tag0 || tag_prefix == beg_tag1 || tag_prefix == beg_tag2) { + entities.push_back(entity); + entities_tags.push_back(entity_tag); - new_model->space.retrieve(tid, key, true); - } - TRACE_LOG("Scanning old features space, building new feature space is done"); - - new_model->param.realloc(new_model->space.dim()); - TRACE_LOG("Parameter dimension of new model is [%d]", new_model->space.dim()); - - for (FeatureSpaceIterator itx = new_model->space.begin(); - itx != new_model->space.end(); - ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - - int old_id = model->space.index(tid, key); - int new_id = new_model->space.index(tid, key); - - for (int l = 0; l < L; ++ l) { - // pay attention to this place, use average should be set true - // some dirty code - new_model->param._W[new_id + l] = model->param._W[old_id + l]; - new_model->param._W_sum[new_id + l] = model->param._W_sum[old_id + l]; - new_model->param._W_time[new_id + l] = model->param._W_time[old_id + l]; - } + entity = inst->raw_forms[i]; + entity_tag = (tag == 12 ? 
"O" : __ne_types__[tag_suffix]); + } else { + entity += inst->raw_forms[i]; } + } + entities.push_back(entity); + entities_tags.push_back(entity_tag); +} - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int old_id = model->space.index(pl, l); - int new_id = new_model->space.index(pl, l); +void +NER::calculate_scores(Instance * inst, bool use_avg) { + int len = inst->size(); + int L = model->num_labels(); + for (int i = 0; i < len; ++ i) { + for (int l = 0; l < L; ++ l) { + FeatureVector * fv = inst->uni_features[i][l]; + if (!fv) { + continue; + } - new_model->param._W[new_id] = model->param._W[old_id]; - new_model->param._W_sum[new_id] = model->param._W_sum[old_id]; - new_model->param._W_time[new_id] = model->param._W_time[old_id]; - } + inst->uni_scores[i][l] = model->param.dot(inst->uni_features[i][l], use_avg); } - TRACE_LOG("Building new model is done"); + } - for (SmartMap::const_iterator itx = model->cluster_lexicon.begin(); - itx != model->cluster_lexicon.end(); - ++ itx) { - new_model->cluster_lexicon.set(itx.key(), (*itx.value())); + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int idx = model->space.index(pl, l); + inst->bi_scores[pl][l] = model->param.dot(idx, use_avg); } - - return new_model; + } } -void NER::train(void) { - const char * train_file = train_opt.train_file.c_str(); +void +NER::collect_features(Instance * inst, + const std::vector & tagsidx, + math::SparseVec & vec) { + int len = inst->size(); - // read in training instance - if (!read_instance(train_file)) { - ERROR_LOG("Training file doesn't exist"); + vec.zero(); + for (int i = 0; i < len; ++ i) { + int l = tagsidx[i]; + const FeatureVector * fv = inst->uni_features[i][l]; + + if (!fv) { + continue; } - TRACE_LOG("Read in [%d] instances.", train_dat.size()); + vec.add(fv->idx, fv->val, fv->n, fv->loff, 1.); - model = new Model; - // build tag dictionary, map string tag to index - TRACE_LOG("Start build configuration"); - 
build_configuration(); - TRACE_LOG("Build configuration is done."); - TRACE_LOG("Number of labels: [%d]", model->labels.size()); + if (i > 0) { + int prev_lid = tagsidx[i-1]; + int idx = model->space.index(prev_lid, l); + vec.add(idx, 1.); + } + } +} - // build feature space from the training instance - TRACE_LOG("Start building feature space."); - build_feature_space(); - TRACE_LOG("Building feature space is done."); - TRACE_LOG("Number of features: [%d]", model->space.num_features()); +Model * +NER::truncate(void) { + Model * new_model = new Model; + // copy the label indexable map to the new model + for (int i = 0; i < model->labels.size(); ++ i) { + const char * key = model->labels.at(i); + new_model->labels.push(key); + } - model->param.realloc(model->space.dim()); - TRACE_LOG("Allocate [%d] dimensition parameter.", model->space.dim()); + TRACE_LOG("building labels map is done"); - NERWriter writer(std::cout); + int L = new_model->num_labels(); + new_model->space.set_num_labels(L); - if (train_opt.algorithm == "mira") { - // use mira algorithm - /*kbest_decoder = new KBestDecoder(L); + // iterate over the feature space and see if the parameter value equals to zero + for (FeatureSpaceIterator itx = model->space.begin(); + itx != model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + int id = model->space.index(tid, key); - for (int iter = 0; iter < train_opt.max_iter; ++ iter) { - for (int i = 0; i < train_dat.size(); ++ i) { - extract_features(train_dat[i]); - calculate_scores(train_dat[i]); + bool flag = false; + for (int l = 0; l < L; ++ l) { + double p = model->param.dot(id + l); + if (p != 0.) 
{ + flag = true; + } + } - KBestDecoder::KBestDecodeResult result; - kbest_decoder->decode(train_dat[i], result); - } - }*/ - } else { - // use pa or average perceptron algorithm - rulebase::RuleBase base(model->labels); - decoder = new Decoder(model->num_labels(), base); - TRACE_LOG("Allocated plain decoder"); - - for (int iter = 0; iter < train_opt.max_iter; ++ iter) { - TRACE_LOG("Training iteraition [%d]", (iter + 1)); - for (int i = 0; i < train_dat.size(); ++ i) { - // extract_features(train_dat[i]); - - Instance * inst = train_dat[i]; - calculate_scores(inst, false); - decoder->decode(inst); - - if (inst->features.dim() == 0) { - collect_features(inst, inst->tagsidx, inst->features); - } - collect_features(inst, inst->predicted_tagsidx, inst->predicted_features); - - - if (train_opt.algorithm == "pa") { - SparseVec update_features; - update_features.zero(); - update_features.add(train_dat[i]->features, 1.); - update_features.add(train_dat[i]->predicted_features, -1.); - - double error = train_dat[i]->num_errors(); - double score = model->param.dot(update_features, false); - double norm = update_features.L2(); - - double step = 0.; - if (norm < EPS) { - step = 0; - } else { - step = (error - score) / norm; - } - - model->param.add(update_features, - iter * train_dat.size() + i + 1, - step); - } else if (train_opt.algorithm == "ap") { - SparseVec update_features; - update_features.zero(); - update_features.add(train_dat[i]->features, 1.); - update_features.add(train_dat[i]->predicted_features, -1.); - - model->param.add(update_features, - iter * train_dat.size() + i + 1, - 1.); - } - - if ((i+1) % train_opt.display_interval == 0) { - TRACE_LOG("[%d] instances is trained.", i+1); - } - } - model->param.flush( train_dat.size() * (iter + 1) ); - - Model * new_model = truncate(); - swap(model, new_model); - evaluate(); - - std::string saved_model_file = (train_opt.model_name + "." 
+ strutils::to_str(iter) + ".model"); - std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary); - - swap(model, new_model); - new_model->save(ofs); - delete new_model; - - TRACE_LOG("Model for iteration [%d] is saved to [%s]", - iter + 1, - saved_model_file.c_str()); - } + if (!flag) { + continue; } -} -void NER::evaluate(void) { - const char * holdout_file = train_opt.holdout_file.c_str(); + new_model->space.retrieve(tid, key, true); + } + TRACE_LOG("Scanning old features space, building new feature space is done"); - ifstream ifs(holdout_file); + new_model->param.realloc(new_model->space.dim()); + TRACE_LOG("Parameter dimension of new model is [%d]", new_model->space.dim()); - if (!ifs) { - ERROR_LOG("Failed to open holdout file."); - return; + for (FeatureSpaceIterator itx = new_model->space.begin(); + itx != new_model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + + int old_id = model->space.index(tid, key); + int new_id = new_model->space.index(tid, key); + + for (int l = 0; l < L; ++ l) { + // pay attention to this place, use average should be set true + // some dirty code + new_model->param._W[new_id + l] = model->param._W[old_id + l]; + new_model->param._W_sum[new_id + l] = model->param._W_sum[old_id + l]; + new_model->param._W_time[new_id + l] = model->param._W_time[old_id + l]; } + } - NERReader reader(ifs, true); - NERWriter writer(std::cout); - Instance * inst = NULL; + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int old_id = model->space.index(pl, l); + int new_id = new_model->space.index(pl, l); - // some dirty hard code and trick - int beg_tag0 = (model->labels.index( "B-Nh" ) / __num_ne_types__); - int beg_tag1 = (model->labels.index( "S-Nh" ) / __num_ne_types__); - int beg_tag2 = (model->labels.index( "O" ) / __num_ne_types__); + new_model->param._W[new_id] = model->param._W[old_id]; + new_model->param._W_sum[new_id] = model->param._W_sum[old_id]; + 
new_model->param._W_time[new_id] = model->param._W_time[old_id]; + } + } + TRACE_LOG("Building new model is done"); + for (SmartMap::const_iterator itx = model->cluster_lexicon.begin(); + itx != model->cluster_lexicon.end(); + ++ itx) { + new_model->cluster_lexicon.set(itx.key(), (*itx.value())); + } - int num_recalled_entities = 0; - int num_predicted_entities = 0; - int num_gold_entities = 0; + return new_model; +} - int L = model->num_labels(); +void +NER::train(void) { + const char * train_file = train_opt.train_file.c_str(); + + // read in training instance + if (!read_instance(train_file)) { + ERROR_LOG("Training file doesn't exist"); + } + + TRACE_LOG("Read in [%d] instances.", train_dat.size()); + + model = new Model; + // build tag dictionary, map string tag to index + TRACE_LOG("Start build configuration"); + build_configuration(); + TRACE_LOG("Build configuration is done."); + TRACE_LOG("Number of labels: [%d]", model->labels.size()); + + // build feature space from the training instance + TRACE_LOG("Start building feature space."); + build_feature_space(); + TRACE_LOG("Building feature space is done."); + TRACE_LOG("Number of features: [%d]", model->space.num_features()); + + model->param.realloc(model->space.dim()); + TRACE_LOG("Allocate [%d] dimensition parameter.", model->space.dim()); + + NERWriter writer(std::cout); + + if (train_opt.algorithm == "mira") { + // use mira algorithm + /*kbest_decoder = new KBestDecoder(L); + + for (int iter = 0; iter < train_opt.max_iter; ++ iter) { + for (int i = 0; i < train_dat.size(); ++ i) { + extract_features(train_dat[i]); + calculate_scores(train_dat[i]); + + KBestDecoder::KBestDecodeResult result; + kbest_decoder->decode(train_dat[i], result); + } + }*/ + } else { + // use pa or average perceptron algorithm + rulebase::RuleBase base(model->labels); + decoder = new Decoder(model->num_labels(), base); + TRACE_LOG("Allocated plain decoder"); - int c = 0; - while ((inst = reader.next())) { - int len = 
inst->size(); - inst->tagsidx.resize(len); - for (int i = 0; i < len; ++ i) { - inst->tagsidx[i] = model->labels.index(inst->tags[i]); - } + for (int iter = 0; iter < train_opt.max_iter; ++ iter) { + TRACE_LOG("Training iteraition [%d]", (iter + 1)); + for (int i = 0; i < train_dat.size(); ++ i) { + // extract_features(train_dat[i]); - extract_features(inst); - calculate_scores(inst, true); + Instance * inst = train_dat[i]; + calculate_scores(inst, false); decoder->decode(inst); - // writer.debug(inst); - if (inst->entities.size() == 0) { - build_entities(inst, - inst->tagsidx, - inst->entities, - inst->entities_tags, - beg_tag0, - beg_tag1, - beg_tag2); + if (inst->features.dim() == 0) { + collect_features(inst, inst->tagsidx, inst->features); } + collect_features(inst, inst->predicted_tagsidx, inst->predicted_features); + + + if (train_opt.algorithm == "pa") { + SparseVec update_features; + update_features.zero(); + update_features.add(train_dat[i]->features, 1.); + update_features.add(train_dat[i]->predicted_features, -1.); + + double error = train_dat[i]->num_errors(); + double score = model->param.dot(update_features, false); + double norm = update_features.L2(); + + double step = 0.; + if (norm < EPS) { + step = 0; + } else { + step = (error - score) / norm; + } + + model->param.add(update_features, + iter * train_dat.size() + i + 1, + step); + } else if (train_opt.algorithm == "ap") { + SparseVec update_features; + update_features.zero(); + update_features.add(train_dat[i]->features, 1.); + update_features.add(train_dat[i]->predicted_features, -1.); + + model->param.add(update_features, + iter * train_dat.size() + i + 1, + 1.); + } + + if ((i+1) % train_opt.display_interval == 0) { + TRACE_LOG("[%d] instances is trained.", i+1); + } + } + model->param.flush( train_dat.size() * (iter + 1) ); + + Model * new_model = truncate(); + swap(model, new_model); + evaluate(); - build_entities(inst, - inst->predicted_tagsidx, - inst->predicted_entities, - 
inst->predicted_entities_tags, - beg_tag0, - beg_tag1, - beg_tag2); + std::string saved_model_file = (train_opt.model_name + + "." + + strutils::to_str(iter) + + ".model"); + std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary); - num_recalled_entities += inst->num_recalled_entites(); - num_predicted_entities += inst->num_predicted_entities(); - num_gold_entities += inst->num_gold_entities(); + swap(model, new_model); + new_model->save(ofs); + delete new_model; - delete inst; + TRACE_LOG("Model for iteration [%d] is saved to [%s]", + iter + 1, + saved_model_file.c_str()); } + } +} + +void NER::evaluate(void) { + const char * holdout_file = train_opt.holdout_file.c_str(); - double p = (double)num_recalled_entities / num_predicted_entities; - double r = (double)num_recalled_entities / num_gold_entities; - double f = 2 * p * r / (p + r); + ifstream ifs(holdout_file); - TRACE_LOG("P: %lf ( %d / %d )", p, num_recalled_entities, num_predicted_entities); - TRACE_LOG("R: %lf ( %d / %d )", r, num_recalled_entities, num_gold_entities); - TRACE_LOG("F: %lf" , f); + if (!ifs) { + ERROR_LOG("Failed to open holdout file."); return; + } + + NERReader reader(ifs, true); + NERWriter writer(std::cout); + Instance * inst = NULL; + + // some dirty hard code and trick + int beg_tag0 = (model->labels.index( "B-Nh" ) / __num_ne_types__); + int beg_tag1 = (model->labels.index( "S-Nh" ) / __num_ne_types__); + int beg_tag2 = (model->labels.index( "O" ) / __num_ne_types__); + + + int num_recalled_entities = 0; + int num_predicted_entities = 0; + int num_gold_entities = 0; + + while ((inst = reader.next())) { + int len = inst->size(); + inst->tagsidx.resize(len); + for (int i = 0; i < len; ++ i) { + inst->tagsidx[i] = model->labels.index(inst->tags[i]); + } + + extract_features(inst); + calculate_scores(inst, true); + decoder->decode(inst); + + // writer.debug(inst); + if (inst->entities.size() == 0) { + build_entities(inst, + inst->tagsidx, + inst->entities, + 
inst->entities_tags, + beg_tag0, + beg_tag1, + beg_tag2); + } + + build_entities(inst, + inst->predicted_tagsidx, + inst->predicted_entities, + inst->predicted_entities_tags, + beg_tag0, + beg_tag1, + beg_tag2); + + num_recalled_entities += inst->num_recalled_entites(); + num_predicted_entities += inst->num_predicted_entities(); + num_gold_entities += inst->num_gold_entities(); + + delete inst; + } + + double p = (double)num_recalled_entities / num_predicted_entities; + double r = (double)num_recalled_entities / num_gold_entities; + double f = 2 * p * r / (p + r); + + TRACE_LOG("P: %lf ( %d / %d )", p, num_recalled_entities, num_predicted_entities); + TRACE_LOG("R: %lf ( %d / %d )", r, num_recalled_entities, num_gold_entities); + TRACE_LOG("F: %lf" , f); + return; } void NER::test(void) { - // load model - const char * model_file = test_opt.model_file.c_str(); - ifstream mfs(model_file, std::ifstream::binary); + // load model + const char * model_file = test_opt.model_file.c_str(); + ifstream mfs(model_file, std::ifstream::binary); - if (!mfs) { - ERROR_LOG("Failed to load model"); - return; - } + if (!mfs) { + ERROR_LOG("Failed to load model"); + return; + } - model = new Model; - if (!model->load(mfs)) { - ERROR_LOG("Failed to load model"); - return; - } + model = new Model; + if (!model->load(mfs)) { + ERROR_LOG("Failed to load model"); + return; + } - TRACE_LOG("Number of labels [%d]", model->num_labels()); - TRACE_LOG("Number of features [%d]", model->space.num_features()); - TRACE_LOG("Number of dimension [%d]", model->space.dim()); + TRACE_LOG("Number of labels [%d]", model->num_labels()); + TRACE_LOG("Number of features [%d]", model->space.num_features()); + TRACE_LOG("Number of dimension [%d]", model->space.dim()); - const char * test_file = test_opt.test_file.c_str(); + const char * test_file = test_opt.test_file.c_str(); - ifstream ifs(test_file); + ifstream ifs(test_file); - if (!ifs) { - ERROR_LOG("Failed to open holdout file."); - return; - } + if 
(!ifs) { + ERROR_LOG("Failed to open holdout file."); + return; + } - rulebase::RuleBase base(model->labels); - decoder = new Decoder(model->num_labels(), base); - NERReader reader(ifs); - NERWriter writer(cout); - Instance * inst = NULL; + rulebase::RuleBase base(model->labels); + decoder = new Decoder(model->num_labels(), base); + NERReader reader(ifs); + NERWriter writer(cout); + Instance * inst = NULL; - // int beg_tag0 = model->labels.index( __b__ ); - // int beg_tag1 = model->labels.index( __s__ ); + // int beg_tag0 = model->labels.index( __b__ ); + // int beg_tag1 = model->labels.index( __s__ ); - double before = get_time(); + double before = get_time(); - while ((inst = reader.next())) { - int len = inst->size(); - inst->tagsidx.resize(len); + while ((inst = reader.next())) { + int len = inst->size(); + inst->tagsidx.resize(len); - extract_features(inst); - calculate_scores(inst, true); - decoder->decode(inst); + extract_features(inst); + calculate_scores(inst, true); + decoder->decode(inst); - writer.write(inst); - delete inst; - } + writer.write(inst); + delete inst; + } - double after = get_time(); - TRACE_LOG("Eclipse time %lf", after - before); + double after = get_time(); + TRACE_LOG("Eclipse time %lf", after - before); - sleep(1000000); - return; + sleep(1000000); + return; } void NER::dump() { - // load model - const char * model_file = dump_opt.model_file.c_str(); - ifstream mfs(model_file, std::ifstream::binary); - - if (!mfs) { - ERROR_LOG("Failed to load model"); - return; - } + // load model + const char * model_file = dump_opt.model_file.c_str(); + ifstream mfs(model_file, std::ifstream::binary); - model = new Model; - if (!model->load(mfs)) { - ERROR_LOG("Failed to load model"); - return; - } - - int L = model->num_labels(); - TRACE_LOG("Number of labels [%d]", model->num_labels()); - TRACE_LOG("Number of features [%d]", model->space.num_features()); - TRACE_LOG("Number of dimension [%d]", model->space.dim()); - - for (FeatureSpaceIterator 
itx = model->space.begin(); itx != model->space.end(); ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - int id = model->space.index(tid, key); + if (!mfs) { + ERROR_LOG("Failed to load model"); + return; + } - for (int l = 0; l < L; ++ l) { - std::cout << key << " ( " << id + l << " ) " + model = new Model; + if (!model->load(mfs)) { + ERROR_LOG("Failed to load model"); + return; + } + + int L = model->num_labels(); + TRACE_LOG("Number of labels [%d]", model->num_labels()); + TRACE_LOG("Number of features [%d]", model->space.num_features()); + TRACE_LOG("Number of dimension [%d]", model->space.dim()); + + for (FeatureSpaceIterator itx = model->space.begin(); + itx != model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + int id = model->space.index(tid, key); + + for (int l = 0; l < L; ++ l) { + std::cout << key << " ( " << id + l << " ) " << " --> " << model->param.dot(id + l) << std::endl; - } } + } - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int id = model->space.index(pl, l); - std::cout << pl << " --> " << l << " " << model->param.dot(id) << std::endl; - } + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int id = model->space.index(pl, l); + std::cout << pl << " --> " << l << " " << model->param.dot(id) << std::endl; } + } } -} // end for namespace segmentor -} // end for namespace ltp +} // end for namespace segmentor +} // end for namespace ltp diff --git a/src/ner/ner.h b/src/ner/ner.h index 1ec60bc5e..a204bea17 100644 --- a/src/ner/ner.h +++ b/src/ner/ner.h @@ -1,124 +1,124 @@ #ifndef __LTP_NER_NER_H__ #define __LTP_NER_NER_H__ -#include "cfgparser.hpp" -#include "model.h" -#include "decoder.h" +#include "utils/cfgparser.hpp" +#include "ner/model.h" +#include "ner/decoder.h" namespace ltp { namespace ner { class NER { public: - NER(); - NER(ltp::utility::ConfigParser & cfg); - ~NER(); + NER(); + NER(ltp::utility::ConfigParser & cfg); + ~NER(); - void run(); 
+ void run(); private: - /* - * parse the configuration, return true on success, otherwise false - * - * @param[in] cfg the config class - * @return bool return true on success, otherwise false - */ - bool parse_cfg(ltp::utility::ConfigParser & cfg); - - /* - * read instances from file and store them in train_dat - * - * @param[in] file_name the filename - * @return bool true on success, otherwise false - */ - bool read_instance( const char * file_name ); - void build_configuration(void); - void build_feature_space(void); - - /* - * the training process - */ - void train(void); - - /* - * the evaluating process - */ - void evaluate(void); - - /* - * the testing process - */ - void test(void); - - /* - * the dumping model process - */ - void dump(void); - - /* - * do feature trauncation on the model. create a model duplation - * on the model and return their - * - * @return Model the duplication of the model - */ - Model * truncate(void); + /* + * parse the configuration, return true on success, otherwise false + * + * @param[in] cfg the config class + * @return bool return true on success, otherwise false + */ + bool parse_cfg(ltp::utility::ConfigParser & cfg); + + /* + * read instances from file and store them in train_dat + * + * @param[in] file_name the filename + * @return bool true on success, otherwise false + */ + bool read_instance( const char * file_name ); + void build_configuration(void); + void build_feature_space(void); + + /* + * the training process + */ + void train(void); + + /* + * the evaluating process + */ + void evaluate(void); + + /* + * the testing process + */ + void test(void); + + /* + * the dumping model process + */ + void dump(void); + + /* + * do feature trauncation on the model. 
create a model duplation + * on the model and return their + * + * @return Model the duplication of the model + */ + Model * truncate(void); protected: - /* - * extract features from one instance, - * - */ - void extract_features(Instance * inst, bool create = false); - - /* - * build words from tags for certain instance - * - * @param[in/out] inst the instance - * @param[out] words the output words - * @param[in] tagsidx the index of tags - * @param[in] begtag0 first of the word begin tag - * @param[in] begtag1 second of the word begin tag - */ - void build_entities(Instance * inst, - const std::vector & tagsidx, - std::vector & entities, - std::vector & entities_tags, - int beg_tag0, - int beg_tag1, - int beg_tag2); - - /* - * cache all the score for the certain instance. - * - * @param[in/out] inst the instance - * @param[in] use_avg use to specify use average parameter - */ - void calculate_scores(Instance * inst, bool use_avg); - - /* - * collect feature when given the tags index - * - * @param[in] inst the instance - * @param[in] tagsidx the tags index - * @param[out] vec the output sparse vector - */ - void collect_features(Instance * inst, - const std::vector & tagsidx, - ltp::math::SparseVec & vec); + /* + * extract features from one instance, + * + */ + void extract_features(Instance * inst, bool create = false); + + /* + * build words from tags for certain instance + * + * @param[in/out] inst the instance + * @param[out] words the output words + * @param[in] tagsidx the index of tags + * @param[in] begtag0 first of the word begin tag + * @param[in] begtag1 second of the word begin tag + */ + void build_entities(Instance * inst, + const std::vector & tagsidx, + std::vector & entities, + std::vector & entities_tags, + int beg_tag0, + int beg_tag1, + int beg_tag2); + + /* + * cache all the score for the certain instance. 
+ * + * @param[in/out] inst the instance + * @param[in] use_avg use to specify use average parameter + */ + void calculate_scores(Instance * inst, bool use_avg); + + /* + * collect feature when given the tags index + * + * @param[in] inst the instance + * @param[in] tagsidx the tags index + * @param[out] vec the output sparse vector + */ + void collect_features(Instance * inst, + const std::vector & tagsidx, + ltp::math::SparseVec & vec); private: - bool __TRAIN__; - bool __TEST__; - bool __DUMP__; + bool __TRAIN__; + bool __TEST__; + bool __DUMP__; private: - std::vector< Instance * > train_dat; + std::vector< Instance * > train_dat; protected: - Model * model; - Decoder * decoder; + Model * model; + Decoder * decoder; }; -} // end for namespace segmentor -} // end for namespace ltp +} // end for namespace segmentor +} // end for namespace ltp #endif // end for __LTP_NER_NER_H__ diff --git a/src/ner/ner_dll.cpp b/src/ner/ner_dll.cpp index a77833c51..c547324b0 100644 --- a/src/ner/ner_dll.cpp +++ b/src/ner/ner_dll.cpp @@ -1,97 +1,102 @@ -#include "ner_dll.h" - -#include "ner.h" -#include "settings.h" -//#include "instance.h" - -#include "logging.hpp" -#include "codecs.hpp" +#include "ner/ner_dll.h" +#include "ner/ner.h" +#include "ner/settings.h" +#include "utils/logging.hpp" +#include "utils/codecs.hpp" #include class NERWrapper : public ltp::ner::NER { public: - NERWrapper() : - beg_tag0(-1), - beg_tag1(-1) {} - - ~NERWrapper() {} + NERWrapper() + : beg_tag0(-1), + beg_tag1(-1) {} - bool load(const char * model_file) { - std::ifstream mfs(model_file, std::ifstream::binary); + ~NERWrapper() {} - if (!mfs) { - return false; - } + bool load(const char * model_file) { + std::ifstream mfs(model_file, std::ifstream::binary); - model = new ltp::ner::Model; - if (!model->load(mfs)) { - delete model; - return false; - } - - // beg_tag0 = model->labels.index( ); - // beg_tag1 = model->labels.index( ); + if (!mfs) { + return false; + } - return true; + model = new 
ltp::ner::Model; + if (!model->load(mfs)) { + delete model; + return false; } - int recognize(const std::vector & words, - const std::vector & postags, - std::vector & tags) { - ltp::ner::rulebase::RuleBase base(model->labels); - ltp::ner::Decoder deco(model->num_labels(), base); + // beg_tag0 = model->labels.index( ); + // beg_tag1 = model->labels.index( ); - ltp::ner::Instance * inst = new ltp::ner::Instance; - if (words.size() != postags.size()) { - return 0; - } + return true; + } - for (int i = 0; i < words.size(); ++ i) { - inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x(words[i])); - inst->postags.push_back(postags[i]); - } + int recognize(const std::vector & words, + const std::vector & postags, + std::vector & tags) { + ltp::ner::rulebase::RuleBase base(model->labels); + ltp::ner::Decoder deco(model->num_labels(), base); - ltp::ner::NER::extract_features(inst); - ltp::ner::NER::calculate_scores(inst, true); - deco.decode(inst); + ltp::ner::Instance * inst = new ltp::ner::Instance; - for (int i = 0; i < words.size(); ++ i) { - tags.push_back(model->labels.at(inst->predicted_tagsidx[i])); - } + for (int i = 0; i < words.size(); ++ i) { + inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x(words[i])); + inst->postags.push_back(postags[i]); + } + + ltp::ner::NER::extract_features(inst); + ltp::ner::NER::calculate_scores(inst, true); + deco.decode(inst); - delete inst; - return tags.size(); + for (int i = 0; i < words.size(); ++ i) { + tags.push_back(model->labels.at(inst->predicted_tagsidx[i])); } + delete inst; + return tags.size(); + } + private: - int beg_tag0; - int beg_tag1; + int beg_tag0; + int beg_tag1; }; void * ner_create_recognizer(const char * path) { - NERWrapper * wrapper = new NERWrapper(); + NERWrapper * wrapper = new NERWrapper(); - if (!wrapper->load(path)) { - return 0; - } + if (!wrapper->load(path)) { + return 0; + } - return reinterpret_cast(wrapper); + return reinterpret_cast(wrapper); } int ner_release_recognizer(void * 
ner) { - if (!ner) { - return -1; - } - delete reinterpret_cast(ner); - return 0; + if (!ner) { + return -1; + } + delete reinterpret_cast(ner); + return 0; } int ner_recognize(void * ner, - const std::vector & words, - const std::vector & postags, - std::vector & tags) { - NERWrapper * wrapper = 0; - wrapper = reinterpret_cast(ner); - return wrapper->recognize(words, postags, tags); + const std::vector & words, + const std::vector & postags, + std::vector & tags) { + + if (words.size() != postags.size()) { + return 0; + } + + for (int i = 0; i < words.size(); ++ i) { + if (words[i].empty() || postags.empty()) { + return 0; + } + } + + NERWrapper * wrapper = 0; + wrapper = reinterpret_cast(ner); + return wrapper->recognize(words, postags, tags); } diff --git a/src/ner/ner_dll.h b/src/ner/ner_dll.h index a1c5f125f..94bceac89 100644 --- a/src/ner/ner_dll.h +++ b/src/ner/ner_dll.h @@ -31,18 +31,19 @@ NER_DLL_API void * ner_create_recognizer(const char * path); * @param[in] segmentor the segmentor * @return int i don't know */ -NER_DLL_API int ner_release_recognizer(void * ner); +NER_DLL_API int ner_release_recognizer(void * ner); /* * run segment on the given segmentor * * @param[in] line the string to be segmented * @param[out] words the words of the input line - * @return int the number of word tokens + * @return int the number of word tokens, if the input arguments + * is not legal, return 0 */ NER_DLL_API int ner_recognize(void * ner, - const std::vector & words, - const std::vector & postags, - std::vector & tags); + const std::vector & words, + const std::vector & postags, + std::vector & tags); #endif // end for __LTP_NER_DLL_H__ diff --git a/src/ner/nerio.h b/src/ner/nerio.h index e99bd769e..b9c3f227b 100644 --- a/src/ner/nerio.h +++ b/src/ner/nerio.h @@ -2,11 +2,11 @@ #define __LTP_NER_IO_H__ #include -#include "settings.h" -#include "instance.h" -#include "strutils.hpp" -#include "sbcdbc.hpp" -#include "codecs.hpp" +#include "ner/settings.h" +#include 
"ner/instance.h" +#include "utils/strutils.hpp" +#include "utils/sbcdbc.hpp" +#include "utils/codecs.hpp" namespace ltp { namespace ner { @@ -15,115 +15,115 @@ using namespace ltp::strutils; class NERReader { public: - NERReader(istream & _ifs, bool _train = false, int _style = 4) : - ifs(_ifs), - train(_train), - style(_style) {} - - Instance * next() { - if (ifs.eof()) { - return 0; - } - - Instance * inst = new Instance; - std::string line; - - std::getline(ifs, line); - strutils::chomp(line); - - if (line.size() == 0) { + NERReader(istream & _ifs, bool _train = false, int _style = 4) + : ifs(_ifs), + train(_train), + style(_style) {} + + Instance * next() { + if (ifs.eof()) { + return 0; + } + + Instance * inst = new Instance; + std::string line; + + std::getline(ifs, line); + strutils::chomp(line); + + if (line.size() == 0) { + delete inst; + return 0; + } + + std::vector words = split(line); + int found; + + for (int i = 0; i < words.size(); ++ i) { + if (train) { + found = words[i].find_last_of('#'); + if (found != std::string::npos) { + std::string tag = words[i].substr(found + 1); + inst->tags.push_back(tag); + words[i] = words[i].substr(0, found); + + found = words[i].find_last_of('/'); + if (found != std::string::npos) { + std::string postag = words[i].substr(found + 1); + inst->postags.push_back(postag); + words[i] = words[i].substr(0, found); + + inst->raw_forms.push_back(words[i]); + inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); + } else { delete inst; return 0; + } + } else { + delete inst; + return 0; } - - std::vector words = split(line); - int found; - - for (int i = 0; i < words.size(); ++ i) { - if (train) { - found = words[i].find_last_of('#'); - if (found != std::string::npos) { - std::string tag = words[i].substr(found + 1); - inst->tags.push_back(tag); - words[i] = words[i].substr(0, found); - - found = words[i].find_last_of('/'); - if (found != std::string::npos) { - std::string postag = words[i].substr(found + 1); - 
inst->postags.push_back(postag); - words[i] = words[i].substr(0, found); - - inst->raw_forms.push_back(words[i]); - inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); - } else { - delete inst; - return 0; - } - } else { - delete inst; - return 0; - } - } else { - found = words[i].find_last_of('/'); - if (found != std::string::npos) { - std::string postag = words[i].substr(found + 1); - inst->postags.push_back(postag); - words[i] = words[i].substr(0, found); - - inst->raw_forms.push_back(words[i]); - inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); - } else { - delete inst; - return 0; - } - } + } else { + found = words[i].find_last_of('/'); + if (found != std::string::npos) { + std::string postag = words[i].substr(found + 1); + inst->postags.push_back(postag); + words[i] = words[i].substr(0, found); + + inst->raw_forms.push_back(words[i]); + inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); + } else { + delete inst; + return 0; } + } + } - return inst; + return inst; } private: - istream & ifs; - int style; - bool train; + istream & ifs; + int style; + bool train; }; class NERWriter { public: - NERWriter(std::ostream & _ofs) : ofs(_ofs) {} - - void write(const Instance * inst) { - int len = inst->size(); - if (inst->predicted_tags.size() != len) { - return; - } - - for (int i = 0; i < len; ++ i) { - ofs << inst->forms[i] - << "/" << inst->postags[i] - << "#" << inst->predicted_tags[i]; - if (i + 1 < len ) { - ofs << "\t"; - } else { - ofs << std::endl; - } - } + NERWriter(std::ostream & _ofs) : ofs(_ofs) {} + + void write(const Instance * inst) { + int len = inst->size(); + if (inst->predicted_tags.size() != len) { + return; + } + + for (int i = 0; i < len; ++ i) { + ofs << inst->forms[i] + << "/" << inst->postags[i] + << "#" << inst->predicted_tags[i]; + if (i + 1 < len ) { + ofs << "\t"; + } else { + ofs << std::endl; + } + } } - void debug(const Instance * inst, bool show_feat = false) { - int len = inst->size(); + 
void debug(const Instance * inst, bool show_feat = false) { + int len = inst->size(); - for (int i = 0; i < len; ++ i) { - ofs << inst->forms[i] - << "\t" << inst->postags[i] - << "\t" << inst->tagsidx[i] - << "\t" << inst->predicted_tagsidx[i] - << std::endl; - } + for (int i = 0; i < len; ++ i) { + ofs << inst->forms[i] + << "\t" << inst->postags[i] + << "\t" << inst->tagsidx[i] + << "\t" << inst->predicted_tagsidx[i] + << std::endl; + } } private: - std::ostream & ofs; + std::ostream & ofs; }; -} // end for namespace ner -} // end for namespace ltp -#endif // end for __LTP_SEGMENTOR_WRITER_H__ +} // end for namespace ner +} // end for namespace ltp +#endif // end for __LTP_SEGMENTOR_WRITER_H__ diff --git a/src/ner/options.cpp b/src/ner/options.cpp index 85b2ca7ef..9f44136b3 100644 --- a/src/ner/options.cpp +++ b/src/ner/options.cpp @@ -1,4 +1,4 @@ -#include "options.h" +#include "ner/options.h" namespace ltp { namespace ner { diff --git a/src/ner/options.h b/src/ner/options.h index e2a9606ac..652bf6773 100644 --- a/src/ner/options.h +++ b/src/ner/options.h @@ -7,26 +7,26 @@ namespace ltp { namespace ner { struct ModelOptions { - std::string model_file; + std::string model_file; }; struct TrainOptions { - std::string train_file; - std::string holdout_file; - std::string model_name; - std::string algorithm; - int max_iter; - int display_interval; + std::string train_file; + std::string holdout_file; + std::string model_name; + std::string algorithm; + int max_iter; + int display_interval; }; struct TestOptions { - std::string test_file; - std::string model_file; - std::string lexicon_file; + std::string test_file; + std::string model_file; + std::string lexicon_file; }; struct DumpOptions { - std::string model_file; + std::string model_file; }; extern ModelOptions model_opt; @@ -34,7 +34,7 @@ extern TrainOptions train_opt; extern TestOptions test_opt; extern DumpOptions dump_opt; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner 
+} // end for namespace ltp -#endif // end for __LTP_NER_OPTIONS_H__ +#endif // end for __LTP_NER_OPTIONS_H__ diff --git a/src/ner/otner.cpp b/src/ner/otner.cpp index 1a5c0f3c0..48e54d406 100644 --- a/src/ner/otner.cpp +++ b/src/ner/otner.cpp @@ -1,33 +1,34 @@ #include -#include "cfgparser.hpp" -#include "logging.hpp" -#include "ner.h" +#include "utils/cfgparser.hpp" +#include "utils/logging.hpp" +#include "ner/ner.h" using namespace ltp::utility; using namespace ltp::ner; void usage(void) { - std::cerr << "otcws - Training and testing suite for Chinese Word segmentation" << std::endl; - std::cerr << "Copyright (C) 2012-2013 HIT-SCIR" << std::endl; - std::cerr << std::endl; - std::cerr << "usage: ./otcws " << std::endl; - std::cerr << std::endl; + std::cerr << "otcws - Training and testing suite for Named Entity Recognization" + << std::endl; + std::cerr << "Copyright (C) 2012-2014 HIT-SCIR" << std::endl; + std::cerr << std::endl; + std::cerr << "usage: ./otner " << std::endl; + std::cerr << std::endl; } int main(int argc, const char * argv[]) { - if (argc < 2 || (argv[1][0] == '-' && argv[1][1] == 'h')) { - usage(); - return -1; - } + if (argc < 2 || (argv[1][0] == '-' && argv[1][1] == 'h')) { + usage(); + return -1; + } - ConfigParser cfg(argv[1]); + ConfigParser cfg(argv[1]); - if (!cfg) { - ERROR_LOG("Failed to parse config file."); - return -1; - } + if (!cfg) { + ERROR_LOG("Failed to parse config file."); + return -1; + } - NER engine(cfg); - engine.run(); - return 0; + NER engine(cfg); + engine.run(); + return 0; } diff --git a/src/ner/parameter.h b/src/ner/parameter.h index a3ec971b1..ba9f929fd 100644 --- a/src/ner/parameter.h +++ b/src/ner/parameter.h @@ -2,8 +2,8 @@ #define __LTP_NER_PARAMETER_H__ #include -#include "sparsevec.h" -#include "featurevec.h" +#include "utils/math/sparsevec.h" +#include "utils/math/featurevec.h" namespace ltp { namespace ner { @@ -12,151 +12,151 @@ using namespace ltp::math; class Parameters { public: - int _dim; - double * 
_W; - double * _W_sum; - int * _W_time; - - Parameters() : - _dim(0), - _W(0), - _W_sum(0), - _W_time(0) {} - - ~Parameters() { - dealloc(); + int _dim; + double * _W; + double * _W_sum; + int * _W_time; + + Parameters() : + _dim(0), + _W(0), + _W_sum(0), + _W_time(0) {} + + ~Parameters() { + dealloc(); + } + + void realloc(int dim) { + dealloc(); + _dim = dim; + + if (dim > 0) { + _W = new double[dim]; + _W_sum = new double[dim]; + _W_time = new int[dim]; } - void realloc(int dim) { - dealloc(); - _dim = dim; - - if (dim > 0) { - _W = new double[dim]; - _W_sum = new double[dim]; - _W_time = new int[dim]; - } - - for (int i = 0; i < dim; ++ i) { - _W[i] = 0; - _W_sum[i] = 0; - _W_time[i] = 0; - } + for (int i = 0; i < dim; ++ i) { + _W[i] = 0; + _W_sum[i] = 0; + _W_time[i] = 0; } - - void dealloc() { - if (_W && _W == _W_sum) { - delete [](_W); - _W = 0; - _W_sum = 0; - } else { - if (_W) { - delete [](_W); - _W = 0; - } - if (_W_sum) { - delete [](_W_sum); - _W_sum = 0; - } - } - - if (_W_time) { - delete [](_W_time); - _W_time = 0; - } + } + + void dealloc() { + if (_W && _W == _W_sum) { + delete [](_W); + _W = 0; + _W_sum = 0; + } else { + if (_W) { + delete [](_W); + _W = 0; + } + if (_W_sum) { + delete [](_W_sum); + _W_sum = 0; + } } - void add(int idx, int now, double scale = 1.) { - int elapsed = now - _W_time[idx]; - double upd = scale; - double cur_val = _W[idx]; - - _W[idx] = cur_val + upd; - _W_sum[idx] += elapsed * cur_val + upd; - _W_time[idx] = now; + if (_W_time) { + delete [](_W_time); + _W_time = 0; } - - void add(const SparseVec & vec, int now, double scale = 1.) { - for (SparseVec::const_iterator itx = vec.begin(); - itx != vec.end(); - itx ++) { - int idx = itx->first; - int elapsed = now - _W_time[idx]; - double upd = scale * itx->second; - double cur_val = _W[idx]; - - _W[idx] = cur_val + upd; - _W_sum[idx] += elapsed * cur_val + upd; - _W_time[idx] = now; - } + } + + void add(int idx, int now, double scale = 1.) 
{ + int elapsed = now - _W_time[idx]; + double upd = scale; + double cur_val = _W[idx]; + + _W[idx] = cur_val + upd; + _W_sum[idx] += elapsed * cur_val + upd; + _W_time[idx] = now; + } + + void add(const SparseVec & vec, int now, double scale = 1.) { + for (SparseVec::const_iterator itx = vec.begin(); + itx != vec.end(); + ++ itx) { + int idx = itx->first; + int elapsed = now - _W_time[idx]; + double upd = scale * itx->second; + double cur_val = _W[idx]; + + _W[idx] = cur_val + upd; + _W_sum[idx] += elapsed * cur_val + upd; + _W_time[idx] = now; } - - double dot(const SparseVec & vec, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - double ret = 0.; - for (SparseVec::const_iterator itx = vec.begin(); - itx != vec.end(); - ++ itx) { - ret += p[itx->first] * itx->second; - } - return ret; + } + + double dot(const SparseVec & vec, bool use_avg = false) const { + const double * const p = (use_avg ? _W_sum : _W); + double ret = 0.; + for (SparseVec::const_iterator itx = vec.begin(); + itx != vec.end(); + ++ itx) { + ret += p[itx->first] * itx->second; } - - double dot(const FeatureVector * vec, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - double ret = 0.; - for (int i = 0; i < vec->n; ++ i) { - if (vec->val) { - ret += p[vec->idx[i] + vec->loff] * vec->val[i]; - } else { - ret += p[vec->idx[i] + vec->loff]; - } - } - return ret; + return ret; + } + + double dot(const FeatureVector * vec, bool use_avg = false) const { + const double * const p = (use_avg ? _W_sum : _W); + double ret = 0.; + for (int i = 0; i < vec->n; ++ i) { + if (vec->val) { + ret += p[vec->idx[i] + vec->loff] * vec->val[i]; + } else { + ret += p[vec->idx[i] + vec->loff]; + } } - - double dot(const int idx, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - return p[idx]; + return ret; + } + + double dot(const int idx, bool use_avg = false) const { + const double * const p = (use_avg ? 
_W_sum : _W); + return p[idx]; + } + + void flush(int now) { + for(int i = 0; i < _dim; ++i) { + _W_sum[i] += (now - _W_time[i]) * _W[i]; + _W_time[i] = now; } - - void flush(int now) { - for(int i = 0; i < _dim; ++i) { - _W_sum[i] += (now - _W_time[i]) * _W[i]; - _W_time[i] = now; - } + } + + void dump(std::ostream & out, bool use_avg = true) { + const double * p = (use_avg ? _W_sum : _W); + char chunk[16] = {'p', 'a', 'r', 'a', 'm', 0}; + out.write(chunk, 16); + out.write(reinterpret_cast(&_dim), sizeof(int)); + if (_dim > 0) { + out.write(reinterpret_cast(p), sizeof(double) * _dim); } + } - void dump(std::ostream & out, bool use_avg = true) { - const double * p = (use_avg ? _W_sum : _W); - char chunk[16] = {'p', 'a', 'r', 'a', 'm', 0}; - out.write(chunk, 16); - out.write(reinterpret_cast(&_dim), sizeof(int)); - if (_dim > 0) { - out.write(reinterpret_cast(p), sizeof(double) * _dim); - } + bool load(std::istream & in) { + char chunk[16]; + in.read(chunk, 16); + if (strcmp(chunk, "param")) { + return false; } - bool load(std::istream & in) { - char chunk[16]; - in.read(chunk, 16); - if (strcmp(chunk, "param")) { - return false; - } - - in.read(reinterpret_cast(&_dim), sizeof(int)); - if (_dim > 0) { - _W = new double[_dim]; - in.read(reinterpret_cast(_W), sizeof(double) * _dim); - _W_sum = _W; - } - - return true; + in.read(reinterpret_cast(&_dim), sizeof(int)); + if (_dim > 0) { + _W = new double[_dim]; + in.read(reinterpret_cast(_W), sizeof(double) * _dim); + _W_sum = _W; } + + return true; + } }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_PARAMETER_H__ diff --git a/src/ner/rulebase.h b/src/ner/rulebase.h index d79447df1..a785e6f97 100644 --- a/src/ner/rulebase.h +++ b/src/ner/rulebase.h @@ -6,10 +6,10 @@ #include #include -#include "settings.h" -#include "sbcdbc.hpp" -#include "smartmap.hpp" -#include "chartypes.hpp" +#include "ner/settings.h" +#include 
"utils/sbcdbc.hpp" +#include "utils/smartmap.hpp" +#include "utils/chartypes.hpp" namespace ltp { namespace ner { @@ -17,86 +17,86 @@ namespace rulebase { class RuleBase { public: - RuleBase(utility::IndexableSmartMap & labels) { - // only 4 tag style is supported + RuleBase(utility::IndexableSmartMap & labels) { + // only 4 tag style is supported - std::stringstream S; + std::stringstream S; - __trans__ = 0; - // b - S.str(std::string()); S << __pos_types__[0] << "-" << __ne_types__[0]; - __b_idx__ = prefix( labels.index(S.str()) ) ; + __trans__ = 0; + // b + S.str(std::string()); S << __pos_types__[0] << "-" << __ne_types__[0]; + __b_idx__ = prefix( labels.index(S.str()) ) ; - S.str(std::string()); S << __pos_types__[1] << "-" << __ne_types__[0]; - __i_idx__ = prefix( labels.index(S.str()) ); + S.str(std::string()); S << __pos_types__[1] << "-" << __ne_types__[0]; + __i_idx__ = prefix( labels.index(S.str()) ); - S.str(std::string()); S << __pos_types__[2] << "-" << __ne_types__[0]; - __e_idx__ = prefix( labels.index(S.str()) ); + S.str(std::string()); S << __pos_types__[2] << "-" << __ne_types__[0]; + __e_idx__ = prefix( labels.index(S.str()) ); - S.str(std::string()); S << __pos_types__[3] << "-" << __ne_types__[0]; - __s_idx__ = prefix( labels.index(S.str()) ); - __o_idx__ = prefix( labels.index("O") ); + S.str(std::string()); S << __pos_types__[3] << "-" << __ne_types__[0]; + __s_idx__ = prefix( labels.index(S.str()) ); + __o_idx__ = prefix( labels.index("O") ); - if (__s_idx__>=0 && __b_idx__>=0 && __i_idx__>=0 && __e_idx__>=0 && __o_idx__>=0) { - __trans__ |= (1<<((__s_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__s_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__s_idx__<<3) + __o_idx__)); + if (__s_idx__>=0 && __b_idx__>=0 && __i_idx__>=0 && __e_idx__>=0 && __o_idx__>=0) { + __trans__ |= (1<<((__s_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__s_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__s_idx__<<3) + __o_idx__)); - __trans__ |= (1<<((__b_idx__<<3) 
+ __i_idx__)); - __trans__ |= (1<<((__b_idx__<<3) + __e_idx__)); + __trans__ |= (1<<((__b_idx__<<3) + __i_idx__)); + __trans__ |= (1<<((__b_idx__<<3) + __e_idx__)); - __trans__ |= (1<<((__i_idx__<<3) + __i_idx__)); - __trans__ |= (1<<((__i_idx__<<3) + __e_idx__)); + __trans__ |= (1<<((__i_idx__<<3) + __i_idx__)); + __trans__ |= (1<<((__i_idx__<<3) + __e_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __o_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __o_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __o_idx__)); - } else { - __trans__ = 0xffff; - } + __trans__ |= (1<<((__o_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__o_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__o_idx__<<3) + __o_idx__)); + } else { + __trans__ = 0xffff; } - - ~RuleBase() { - } - - inline bool legal_trans(int prev, int curr) { - int prev_prefix = prefix(prev); - int prev_suffix = suffix(prev); - int curr_prefix = prefix(curr); - int curr_suffix = suffix(curr); - - if (prev_prefix == __b_idx__ || prev_prefix == __i_idx__) { - return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0 - && (prev_suffix == curr_suffix)); - } else { - return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0); - } + } + + ~RuleBase() { + } + + inline bool legal_trans(int prev, int curr) { + int prev_prefix = prefix(prev); + int prev_suffix = suffix(prev); + int curr_prefix = prefix(curr); + int curr_suffix = suffix(curr); + + if (prev_prefix == __b_idx__ || prev_prefix == __i_idx__) { + return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0 + && (prev_suffix == curr_suffix)); + } else { + return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0); } + } private: - unsigned __trans__; + unsigned 
__trans__; - int __s_idx__; - int __b_idx__; - int __i_idx__; - int __e_idx__; - int __o_idx__; + int __s_idx__; + int __b_idx__; + int __i_idx__; + int __e_idx__; + int __o_idx__; - inline int prefix(int tag) { - return (tag / __num_ne_types__); - } + inline int prefix(int tag) { + return (tag / __num_ne_types__); + } - inline int suffix(int tag) { - return (tag % __num_ne_types__); - } + inline int suffix(int tag) { + return (tag % __num_ne_types__); + } }; -} // end for rulebase -} // end for namespace ner -} // end for namespace ltp +} // end for rulebase +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_RULE_BASE_H__ diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index b0586d648..748569050 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -1,9 +1,6 @@ # example of configure time generate header -include_directories (./ - ${SOURCE_DIR}/parser - ${SOURCE_DIR}/utils - ${SOURCE_DIR}/utils/math) +include_directories (${SOURCE_DIR}/) set (lgdpj_VERSION "0.0.1") @@ -36,4 +33,4 @@ set_target_properties (lgdpj configure_file ( parser_dll.h - ${INCLUDE_OUTPUT_PATH}/parser_dll.h) + ${INCLUDE_OUTPUT_PATH}/ltp/parser_dll.h) diff --git a/src/parser/collections.cpp b/src/parser/collections.cpp index 84322efe0..c5a44d05d 100644 --- a/src/parser/collections.cpp +++ b/src/parser/collections.cpp @@ -1,86 +1,93 @@ -#include "collections.h" +#include "parser/collections.h" namespace ltp { namespace parser { DictionaryCollections::DictionaryCollections(int num_dicts) : - idx(0) { - dicts.resize( num_dicts ); + idx(0) { + dicts.resize( num_dicts ); - for (int i = 0; i < num_dicts; ++ i) { - dicts[i] = new Dictionary( this ); - } + for (int i = 0; i < num_dicts; ++ i) { + dicts[i] = new Dictionary( this ); + } } DictionaryCollections::~DictionaryCollections() { - for (int i = 0; i < dicts.size(); ++ i) { - delete dicts[i]; - } + for (int i = 0; i < dicts.size(); ++ i) { + delete dicts[i]; + } } -Dictionary 
* DictionaryCollections::getDictionary(int i) { - if (i < dicts.size()) { - return dicts[i]; - } +Dictionary * +DictionaryCollections::getDictionary(int i) { + if (i < dicts.size()) { + return dicts[i]; + } - return NULL; -} -int DictionaryCollections::retrieve(int tid, const char * key, bool create) { - return dicts[tid]->retrieve(key, create); + return NULL; } -size_t DictionaryCollections::dim() const { - return idx; +int +DictionaryCollections::retrieve(int tid, const char * key, bool create) { + return dicts[tid]->retrieve(key, create); } -int DictionaryCollections::size() { - return dicts.size(); +size_t +DictionaryCollections::dim() const { + return idx; } -void DictionaryCollections::dump(ostream & out) { - char chunk[32]; - unsigned int sz = dicts.size(); - strncpy(chunk, "collections", 16); - - out.write(chunk, 16); - out.write(reinterpret_cast(&idx), sizeof(int)); - out.write(reinterpret_cast(&sz), sizeof(unsigned int)); - for (int i = 0; i < dicts.size(); ++ i) { - // strncpy(chunk, dicts[i]->dict_name.c_str(), 32); - // out.write(chunk, 32); - - dicts[i]->database.dump(out); - } +int +DictionaryCollections::size() { + return dicts.size(); } -bool DictionaryCollections::load(istream & in) { - char chunk[32]; - unsigned int sz; +void +DictionaryCollections::dump(ostream & out) { + char chunk[32]; + unsigned int sz = dicts.size(); + strncpy(chunk, "collections", 16); + + out.write(chunk, 16); + out.write(reinterpret_cast(&idx), sizeof(int)); + out.write(reinterpret_cast(&sz), sizeof(unsigned int)); + for (int i = 0; i < dicts.size(); ++ i) { + // strncpy(chunk, dicts[i]->dict_name.c_str(), 32); + // out.write(chunk, 32); + + dicts[i]->database.dump(out); + } +} - in.read(chunk, 16); - if (strcmp(chunk, "collections")) { - return false; - } +bool +DictionaryCollections::load(istream & in) { + char chunk[32]; + unsigned int sz; - in.read(reinterpret_cast(&idx), sizeof(int)); - in.read(reinterpret_cast(&sz), sizeof(unsigned int)); + in.read(chunk, 16); + if 
(strcmp(chunk, "collections")) { + return false; + } - if (sz != dicts.size()) { - return false; - } + in.read(reinterpret_cast(&idx), sizeof(int)); + in.read(reinterpret_cast(&sz), sizeof(unsigned int)); - for (unsigned i = 0; i < sz; ++ i) { - // in.read(chunk, 32); + if (sz != dicts.size()) { + return false; + } - // Dictionary * dict = new Dictionary(this); - if (!dicts[i]->database.load(in)) { - return false; - } + for (unsigned i = 0; i < sz; ++ i) { + // in.read(chunk, 32); - // dicts[i].push_back(dict); + // Dictionary * dict = new Dictionary(this); + if (!dicts[i]->database.load(in)) { + return false; } - return true; + // dicts[i].push_back(dict); + } + + return true; } } // end for namespace parser diff --git a/src/parser/collections.h b/src/parser/collections.h index 609025d30..22cd4c5d1 100644 --- a/src/parser/collections.h +++ b/src/parser/collections.h @@ -1,12 +1,11 @@ -#ifndef __DICT_COLLECTIONS_H__ -#define __DICT_COLLECTIONS_H__ +#ifndef __LTP_PARSER_DICT_COLLECTIONS_H__ +#define __LTP_PARSER_DICT_COLLECTIONS_H__ #include #include -#include "stringmap.hpp" -#include "smartmap.hpp" - -#include "instance.h" +#include "utils/stringmap.hpp" +#include "utils/smartmap.hpp" +#include "parser/instance.h" namespace ltp { namespace parser { @@ -23,101 +22,101 @@ class Dictionary; // a index counter is shared within several dictionary. class DictionaryCollections { public: - DictionaryCollections(int num_dicts); - ~DictionaryCollections(); - - /* - * Dump the dictionary collections into output stream - * - * @param[out] out the output stream - */ - void dump(ostream & out); - - /* - * Load the dictionary collections from input stream, - * return true if dictionary successfully loaded, otherwise - * false. - * - * @param[in] in the input stream - * @return bool true on success, otherwise false. 
- */ - bool load(istream & in); - - /* - * Get the size of dictionary collections - * - * @return size_t the size of the dictionary - */ - size_t dim() const; - - /* - * Retrieve the certain key in one of the dictionaries in this - * collection. If create is specified, this key is created on - * the condition that it is not in the dictionary. Return the - * index of the key, -1 on failure - * - * @param[in] tid the index of the dictionary - * @param[in] key the key - * @param[in] create insert the key to dictionary if create - * if true. - * @return int the index of the key, -1 on failure. - */ - int retrieve(int tid, const char * key, bool create); - - /* - * Get the ith Dictionary - * - * @param[in] i the index of the dictionary - * @return Dictionary * the dictionary - */ - Dictionary * getDictionary(int i); - - /* - * Get size of dicts - * - * @return int the size of the dictionary - */ - int size(); + DictionaryCollections(int num_dicts); + ~DictionaryCollections(); + + /* + * Dump the dictionary collections into output stream + * + * @param[out] out the output stream + */ + void dump(ostream & out); + + /* + * Load the dictionary collections from input stream, + * return true if dictionary successfully loaded, otherwise + * false. + * + * @param[in] in the input stream + * @return bool true on success, otherwise false. + */ + bool load(istream & in); + + /* + * Get the size of dictionary collections + * + * @return size_t the size of the dictionary + */ + size_t dim() const; + + /* + * Retrieve the certain key in one of the dictionaries in this + * collection. If create is specified, this key is created on + * the condition that it is not in the dictionary. Return the + * index of the key, -1 on failure + * + * @param[in] tid the index of the dictionary + * @param[in] key the key + * @param[in] create insert the key to dictionary if create + * if true. + * @return int the index of the key, -1 on failure. 
+ */ + int retrieve(int tid, const char * key, bool create); + + /* + * Get the ith Dictionary + * + * @param[in] i the index of the dictionary + * @return Dictionary * the dictionary + */ + Dictionary * getDictionary(int i); + + /* + * Get size of dicts + * + * @return int the size of the dictionary + */ + int size(); public: - int idx; /*< the shared index among dictionaries */ + int idx; /*< the shared index among dictionaries */ private: - vector dicts; + vector dicts; }; // the dictionary class // it's wrapper of class SmartMap class Dictionary { public: - Dictionary(DictionaryCollections * coll): - collections(coll) {} - - //StringMap database; - SmartMap database; - DictionaryCollections * collections; - - inline int retrieve(const char * key, bool create) { - int val; - - if (database.get(key, val)) { - return val; - } else { - if (create) { - val = collections->idx; - database.set(key, val); - // database.unsafe_set(key, val); - ++ collections->idx; - return val; - } - } - - return -1; + Dictionary(DictionaryCollections * coll): + collections(coll) {} + + //StringMap database; + SmartMap database; + DictionaryCollections * collections; + + inline int retrieve(const char * key, bool create) { + int val; + + if (database.get(key, val)) { + return val; + } else { + if (create) { + val = collections->idx; + database.set(key, val); + // database.unsafe_set(key, val); + ++ collections->idx; + return val; + } } - inline int size() { - return database.size(); - } + return -1; + } + + inline int size() { + return database.size(); + } }; // labelcollections is a bi-direction map. 
@@ -126,6 +125,6 @@ class Dictionary { // * string key -> int index // * int index -> string key // -} // end for namespace parser -} // end for namespace ltp -#endif // end for __FEATURE_COLLECTIONS_H__ +} // end for namespace parser +} // end for namespace ltp +#endif // end for __LTP_PARSER_DICT_COLLECTIONS_H__ diff --git a/src/parser/conllreader.h b/src/parser/conllreader.h index 755b755e6..5ddcc5992 100644 --- a/src/parser/conllreader.h +++ b/src/parser/conllreader.h @@ -1,16 +1,15 @@ -#ifndef __CONLL_READER_H__ -#define __CONLL_READER_H__ +#ifndef __LTP_PARSER_CONLL_READER_H__ +#define __LTP_PARSER_CONLL_READER_H__ #include #include -#include "codecs.hpp" -#include "strutils.hpp" -#include "logging.hpp" - -#include "settings.h" -#include "instance.h" -#include "options.h" +#include "utils/codecs.hpp" +#include "utils/strutils.hpp" +#include "utils/logging.hpp" +#include "parser/settings.h" +#include "parser/instance.h" +#include "parser/options.h" namespace ltp { namespace parser { @@ -20,80 +19,80 @@ using namespace ltp::strutils; class CoNLLReader { public: - /* - * Constructor for ConllReader - * Register a ifstream to the ConllReader - * - * @param f the reference to the ifstream - */ - CoNLLReader(ifstream& _f): f(_f) {} - ~CoNLLReader() {} - - /* - * Get next instance from ifstream buffer - */ - Instance * next() { - if (f.eof()) { - return NULL; - } - - Instance * inst = new Instance; - string line; - - inst->forms.push_back( ROOT_FORM ); - inst->lemmas.push_back( ROOT_LEMMA ); - inst->postags.push_back( ROOT_POSTAG ); - inst->heads.push_back( -1 ); - - if (model_opt.labeled) { - inst->deprels.push_back( ROOT_DEPREL ); - } - inst->chars.push_back( vector() ); - - while (!f.eof()) { - getline(f, line); - chomp(line); - - if (line.size() == 0) { - break; - } - - vector items = split(line); - if (items.size() != 10) { - WARNING_LOG("Unknown conll format file"); - } - - inst->forms.push_back( items[1] ); // items[1]: form - inst->lemmas.push_back( items[2] 
); // items[2]: lemma - inst->postags.push_back( items[3] ); // items[4]: postag - inst->heads.push_back( to_int(items[6]) ); - - if (model_opt.labeled) { - inst->deprels.push_back( items[7] ); - } - - vector chars; - codecs::decode(items[1], chars); - inst->chars.push_back( chars ); - } - - if (inst->forms.size() == 1) { - delete inst; - inst = NULL; - } - return inst; + /* + * Constructor for ConllReader + * Register an ifstream to the ConllReader + * + * @param f the reference to the ifstream + */ + CoNLLReader(ifstream& _f): f(_f) {} + ~CoNLLReader() {} + + /* + * Get next instance from ifstream buffer + */ + Instance * next() { + if (f.eof()) { + return NULL; + } + + Instance * inst = new Instance; + string line; + + inst->forms.push_back( ROOT_FORM ); + inst->lemmas.push_back( ROOT_LEMMA ); + inst->postags.push_back( ROOT_POSTAG ); + inst->heads.push_back( -1 ); + + if (model_opt.labeled) { + inst->deprels.push_back( ROOT_DEPREL ); + } + inst->chars.push_back( vector() ); + + while (!f.eof()) { + getline(f, line); + chomp(line); + + if (line.size() == 0) { + break; + } + + vector items = split(line); + if (items.size() != 10) { + WARNING_LOG("Unknown conll format file"); + } + + inst->forms.push_back( items[1] ); // items[1]: form + inst->lemmas.push_back( items[2] ); // items[2]: lemma + inst->postags.push_back( items[3] ); // items[3]: postag + inst->heads.push_back( to_int(items[6]) ); + + if (model_opt.labeled) { + inst->deprels.push_back( items[7] ); + } + + vector chars; + codecs::decode(items[1], chars); + inst->chars.push_back( chars ); } - /* - * Reader reach the end of the file - */ - bool eof() { - return f.eof(); + if (inst->forms.size() == 1) { + delete inst; + inst = NULL; } + return inst; + } + + /* + * Check whether the reader has reached the end of the file + */ + bool eof() { + return f.eof(); + } private: - ifstream& f; + ifstream& f; }; // end for ConllReader } // end for parser } // end for namespace ltp -#endif // end for __CONLL_READER_H__ +#endif // end for
__LTP_PARSER_CONLL_READER_H__ diff --git a/src/parser/conllwriter.h b/src/parser/conllwriter.h index f3fc94fb8..c4602ae2e 100644 --- a/src/parser/conllwriter.h +++ b/src/parser/conllwriter.h @@ -1,10 +1,10 @@ -#ifndef __CONLL_WRITER_H__ -#define __CONLL_WRITER_H__ +#ifndef __LTP_PARSER_CONLL_WRITER_H__ +#define __LTP_PARSER_CONLL_WRITER_H__ #include -#include "strutils.hpp" -#include "instance.h" +#include "utils/strutils.hpp" +#include "parser/instance.h" namespace ltp { namespace parser { @@ -13,47 +13,47 @@ using namespace ltp::strutils; class CoNLLWriter { public: - CoNLLWriter(std::ostream& _f): f(_f) {} - ~CoNLLWriter() {} - - void write(const Instance * inst) { - int len = inst->size(); - bool predicted = (inst->predicted_heads.size() > 0 && - inst->predicted_heads.size() == len); - bool predicted_label = (inst->predicted_deprels.size() > 0 && - inst->predicted_deprels.size() == len); - - for (int i = 1; i < inst->size(); ++ i) { - f << i - << "\t" // 0 - index - << inst->forms[i] - << "\t" // 1 - form - << inst->lemmas[i] - << "\t" // 2 - lemma - << inst->postags[i] - << "\t" // 3 - postag - << "_" - << "\t" // 4 - unknown - << "_" - << "\t" // 5 - unknown - << inst->heads[i] - << "\t" // 6 - heads - << inst->deprels[i] - << "\t" // 7 - deprels - << (predicted ? to_str(inst->predicted_heads[i]) : "_") - << "\t" - << (predicted_label ? 
inst->predicted_deprels[i] : "_") - << endl; - } - - f << endl; + CoNLLWriter(std::ostream& _f): f(_f) {} + ~CoNLLWriter() {} + + void write(const Instance * inst) { + int len = inst->size(); + bool predicted = (inst->predicted_heads.size() > 0 + && inst->predicted_heads.size() == len); + bool predicted_label = (inst->predicted_deprels.size() > 0 + && inst->predicted_deprels.size() == len); + + for (int i = 1; i < inst->size(); ++ i) { + f << i + << "\t" // 0 - index + << inst->forms[i] + << "\t" // 1 - form + << inst->lemmas[i] + << "\t" // 2 - lemma + << inst->postags[i] + << "\t" // 3 - postag + << "_" + << "\t" // 4 - unknown + << "_" + << "\t" // 5 - unknown + << inst->heads[i] + << "\t" // 6 - heads + << inst->deprels[i] + << "\t" // 7 - deprels + << (predicted ? to_str(inst->predicted_heads[i]) : "_") + << "\t" + << (predicted_label ? inst->predicted_deprels[i] : "_") + << endl; } + + f << endl; + } private: - std::ostream& f; + std::ostream& f; }; // end for ConnllWriter } // end for parser } // end for namespace ltp -#endif // end for __CONLL_WRITER_H__ +#endif // end for __LTP_PARSER_CONLL_WRITER_H__ diff --git a/src/parser/debug.h b/src/parser/debug.h index 3849f99a0..13e163aa9 100644 --- a/src/parser/debug.h +++ b/src/parser/debug.h @@ -2,7 +2,7 @@ #define __DEBUG_H__ #include -#include "instance.h" +#include "parser/instance.h" namespace ltp { namespace parser { diff --git a/src/parser/decoder.h b/src/parser/decoder.h index d47e421ae..d333fc8a1 100644 --- a/src/parser/decoder.h +++ b/src/parser/decoder.h @@ -1,9 +1,9 @@ -#ifndef __DECODER_H__ -#define __DECODER_H__ +#ifndef __LTP_PARSER_DECODER_H__ +#define __LTP_PARSER_DECODER_H__ -#include "instance.h" -#include "settings.h" -#include "options.h" +#include "parser/instance.h" +#include "parser/settings.h" +#include "parser/options.h" #include #include @@ -12,166 +12,169 @@ namespace ltp { namespace parser { -// data struct for decode chart item. 
Provide several construction +// data struct for decode chart item. Provide several construction // methods and bind certain type. class LatticeItem { public: - const int _g; /* grand */ - const int _s; /* from */ - const int _t; /*< the distance to */ - const int _comp; /*< specify if this span is complete */ - const int _label_s_t; /*< label type */ + const int _g; /* grand */ + const int _s; /* from */ + const int _t; /*< the distance to */ + const int _comp; /*< specify if this span is complete */ + const int _label_s_t; /*< label type */ - const LatticeItem * const _left; - const LatticeItem * const _right; + const LatticeItem * const _left; + const LatticeItem * const _right; - const double _prob; + const double _prob; public: - LatticeItem(const int comp, - const int g, - const int s, - const int t, - const double prob, - const LatticeItem * const left, - const LatticeItem * const right) : - _g(g), - _s(s), - _t(t), - _comp(comp), - _prob(prob), - _left(left), - _right(right), - _label_s_t(-1) { } - - LatticeItem(const int comp, - const int s, - const int t, - const double prob, - const LatticeItem * const left, - const LatticeItem * const right, - const int label_s_t = -1) : - _g(-1), - _s(s), - _t(t), - _comp(comp), - _prob(prob), - _left(left), - _right(right), - _label_s_t(label_s_t) { } - - // for span like C(s,s) - LatticeItem(const int g, - const int s) : - _g(g), - _s(s), - _t(s), - _prob(0.0), - _comp(CMP), - _left(0), - _right(0), - _label_s_t(-1) { } - - LatticeItem(const int s) : - _s(s), - _t(s), - _prob(0.0), - _comp(CMP), - _left(0), - _right(0), - _g(-1), - _label_s_t(-1) { } - - ~LatticeItem() {} + LatticeItem(const int comp, + const int g, + const int s, + const int t, + const double prob, + const LatticeItem * const left, + const LatticeItem * const right) : + _g(g), + _s(s), + _t(t), + _comp(comp), + _prob(prob), + _left(left), + _right(right), + _label_s_t(-1) { } + + LatticeItem(const int comp, + const int s, + const int t, + const 
double prob, + const LatticeItem * const left, + const LatticeItem * const right, + const int label_s_t = -1) : + _g(-1), + _s(s), + _t(t), + _comp(comp), + _prob(prob), + _left(left), + _right(right), + _label_s_t(label_s_t) { } + + // for span like C(s,s) + LatticeItem(const int g, + const int s) : + _g(g), + _s(s), + _t(s), + _prob(0.0), + _comp(CMP), + _left(0), + _right(0), + _label_s_t(-1) { } + + LatticeItem(const int s) : + _s(s), + _t(s), + _prob(0.0), + _comp(CMP), + _left(0), + _right(0), + _g(-1), + _label_s_t(-1) { } + + ~LatticeItem() {} private: - // forbidden construction - LatticeItem(const LatticeItem & rhs) : - _s(0), - _t(0), - _prob(0.0), - _comp(-1), - _left(0), - _right(0), - _label_s_t(-1), - _g(-1) { - std::cerr << "LatticeItem::LatticeItem(const LatticeItem & rhs) is not allowed" << std::endl; - exit(-1); - } - - LatticeItem & operator = (const LatticeItem & rhs) { - std::cerr << "LatticeItem::operator= (const LatticeItem & rhs) is not allowed" << std::endl; - exit(-1); - } + // forbidden construction + LatticeItem(const LatticeItem & rhs) : + _s(0), + _t(0), + _prob(0.0), + _comp(-1), + _left(0), + _right(0), + _label_s_t(-1), + _g(-1) { + std::cerr << "LatticeItem::LatticeItem(const LatticeItem & rhs) is not allowed" + << std::endl; + exit(-1); + } + + LatticeItem & operator = (const LatticeItem & rhs) { + std::cerr << "LatticeItem::operator= (const LatticeItem & rhs) is not allowed" + << std::endl; + exit(-1); + } }; class Decoder { public: - Decoder() {} - virtual ~Decoder() {} - - /* - * Decode the instance, this method is a controller, - * execute: - * - init lattice - * - decode projective - * - get result - * - free lattice - * in sequence. 
- * - * @param[in] inst the instance - */ - void decode(Instance * inst) { - init_lattice(inst); - decode_projective(inst); - get_result(inst); - free_lattice(); - } - - virtual void init_lattice(const Instance * inst) = 0; - virtual void decode_projective(const Instance * inst) = 0; - virtual void get_result(Instance * inst) = 0; - virtual void free_lattice() = 0; + Decoder() {} + virtual ~Decoder() {} + + /* + * Decode the instance, this method is a controller, + * execute: + * - init lattice + * - decode projective + * - get result + * - free lattice + * in sequence. + * + * @param[in] inst the instance + */ + void decode(Instance * inst) { + init_lattice(inst); + decode_projective(inst); + get_result(inst); + free_lattice(); + } + + virtual void init_lattice(const Instance * inst) = 0; + virtual void decode_projective(const Instance * inst) = 0; + virtual void get_result(Instance * inst) = 0; + virtual void free_lattice() = 0; protected: - void lattice_insert(const LatticeItem * &position, const LatticeItem * const item) { - if (position == NULL) { - position = item; - } else if (position->_prob < item->_prob - EPS) { - delete position; - position = item; - } else { - delete item; - } + void lattice_insert(const LatticeItem * &position, const LatticeItem * const item) { + if (position == NULL) { + position = item; + } else if (position->_prob < item->_prob - EPS) { + delete position; + position = item; + } else { + delete item; } + } - void __BUILD_TREE(Instance * inst, const LatticeItem * item) { - if (!item) { - return; - } - - __BUILD_TREE(inst, item->_left); + void __BUILD_TREE(Instance * inst, const LatticeItem * item) { + if (!item) { + return; + } - if (INCMP == item->_comp) { - inst->predicted_heads[item->_t] = item->_s; + __BUILD_TREE(inst, item->_left); - if (model_opt.labeled) { - inst->predicted_deprelsidx[item->_t] = item->_label_s_t; - } - } else if (CMP == item->_comp) { - // do nothing; - } else if (SIBSP == item->_comp) { - // do nothing - } 
else { - } + if (INCMP == item->_comp) { + inst->predicted_heads[item->_t] = item->_s; - __BUILD_TREE(inst, item->_right); + if (model_opt.labeled) { + inst->predicted_deprelsidx[item->_t] = item->_label_s_t; + } + } else if (CMP == item->_comp) { + // do nothing; + } else if (SIBSP == item->_comp) { + // do nothing + } else { + // do nothing } + __BUILD_TREE(inst, item->_right); + } + }; // end for class decoder } // end for namespace parser } // end for namespace ltp -#endif // end for __DECODER_H__ +#endif // end for __LTP_PARSER_DECODER_H__ diff --git a/src/parser/decoder1o.cpp b/src/parser/decoder1o.cpp index 441351249..e3b94afc5 100644 --- a/src/parser/decoder1o.cpp +++ b/src/parser/decoder1o.cpp @@ -1,175 +1,175 @@ -#include "decoder1o.h" +#include "parser/decoder1o.h" namespace ltp { namespace parser { void Decoder1O::init_lattice(const Instance * inst) { - int len = inst->size(); - _lattice_cmp.resize(len, len); - _lattice_incmp.resize(len, len, L); + int len = inst->size(); + _lattice_cmp.resize(len, len); + _lattice_incmp.resize(len, len, L); - _lattice_cmp = NULL; - _lattice_incmp = NULL; + _lattice_cmp = NULL; + _lattice_incmp = NULL; - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i] = new LatticeItem(i); - } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i] = new LatticeItem(i); + } } void Decoder1O::decode_projective(const Instance * inst) { - int len = inst->size(); - - // instance_verify(inst); - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - _lattice_cmp[s][t] = NULL; - _lattice_cmp[t][s] = NULL; - for (int l = 0; l < L; ++ l) { - _lattice_incmp[s][t][l] = NULL; - _lattice_incmp[t][s][l] = NULL; + int len = inst->size(); + + // instance_verify(inst); + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + _lattice_cmp[s][t] = NULL; + _lattice_cmp[t][s] = NULL; + for (int l = 0; l < L; ++ l) { + 
_lattice_incmp[s][t][l] = NULL; + _lattice_incmp[t][s][l] = NULL; + } + + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[s][r]; + if (!left) { + continue; + } + + const LatticeItem * const right = _lattice_cmp[t][r+1]; + if (!right) { + continue; + } + + for (int l = 0; l < L; ++ l) { + + { // I(s,t) = C(s,r) + C(t,r+1) + double prob = (left->_prob + right->_prob); + + if (feat_opt.use_unlabeled_dependency) { + prob += inst->depu_scores[s][t]; } - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[s][r]; - if (!left) { - continue; - } - - const LatticeItem * const right = _lattice_cmp[t][r+1]; - if (!right) { - continue; - } - - for (int l = 0; l < L; ++ l) { - - { // I(s,t) = C(s,r) + C(t,r+1) - double prob = (left->_prob + right->_prob); - - if (feat_opt.use_unlabeled_dependency) { - prob += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - prob += inst->depl_scores[s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - prob, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t][l], item); - } - - if (s != 0) { // I(t,s) - double prob = (left->_prob + right->_prob); - - if (feat_opt.use_unlabeled_dependency) { - prob += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - prob += inst->depl_scores[t][s][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - prob, - left, - right, - l); - - // cerr << "INCMP " << t << "-" << s << "-" << l << endl; - lattice_insert(_lattice_incmp[t][s][l], item); - } // end for if (s != 0) - } // end for for (int l = 0; l < _L; ++ l) - } // end for for (int r = s; r < t; ++ r) - - for (int r = s; r <= t; ++ r) { - if (r != s) { // C(s,t) = I(s,r) + C(r,t) - const LatticeItem * const right = _lattice_cmp[r][t]; - if (!right) { - continue; - } - - for (int l = 0; l < L; ++ l) { - const LatticeItem * const left = _lattice_incmp[s][r][l]; - if (!left) { - continue; - } 
- - const double prob = left->_prob + right->_prob; - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - prob, - left, - right); - - // cerr << "CMP " << s << "-" << t << endl; - lattice_insert(_lattice_cmp[s][t], item); - } - } // end for if (r != s) - - if (r != t && s != 0) { // C(t,s) = I(t,r) + C(r,s) - const LatticeItem * const left = _lattice_cmp[r][s]; - if (!left) { - continue; - } - - for (int l = 0; l < L; ++ l) { - const LatticeItem * const right = _lattice_incmp[t][r][l]; - if (!right) { - continue; - } - - const double prob = left->_prob + right->_prob; - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - prob, - left, - right); - - // cerr << "CMP " << t << "-" << s << endl; - lattice_insert(_lattice_cmp[t][s], item); - } // end for for (int l = 0; l < L; ++ l) - } // end for if (r != t && s != 0) + if (feat_opt.use_labeled_dependency) { + prob += inst->depl_scores[s][t][l]; } - } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + prob, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t][l], item); + } + + if (s != 0) { // I(t,s) + double prob = (left->_prob + right->_prob); + + if (feat_opt.use_unlabeled_dependency) { + prob += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + prob += inst->depl_scores[t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + prob, + left, + right, + l); + + // cerr << "INCMP " << t << "-" << s << "-" << l << endl; + lattice_insert(_lattice_incmp[t][s][l], item); + } // end for if (s != 0) + } // end for for (int l = 0; l < _L; ++ l) + } // end for for (int r = s; r < t; ++ r) + + for (int r = s; r <= t; ++ r) { + if (r != s) { // C(s,t) = I(s,r) + C(r,t) + const LatticeItem * const right = _lattice_cmp[r][t]; + if (!right) { + continue; + } + + for (int l = 0; l < L; ++ l) { + const LatticeItem * const left = _lattice_incmp[s][r][l]; + if (!left) { + continue; + } + + const double prob = left->_prob + 
right->_prob; + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + prob, + left, + right); + + // cerr << "CMP " << s << "-" << t << endl; + lattice_insert(_lattice_cmp[s][t], item); + } + } // end for if (r != s) + + if (r != t && s != 0) { // C(t,s) = I(t,r) + C(r,s) + const LatticeItem * const left = _lattice_cmp[r][s]; + if (!left) { + continue; + } + + for (int l = 0; l < L; ++ l) { + const LatticeItem * const right = _lattice_incmp[t][r][l]; + if (!right) { + continue; + } + + const double prob = left->_prob + right->_prob; + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + prob, + left, + right); + + // cerr << "CMP " << t << "-" << s << endl; + lattice_insert(_lattice_cmp[t][s], item); + } // end for for (int l = 0; l < L; ++ l) + } // end for if (r != t && s != 0) + } } + } } void Decoder1O::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); - } - - const LatticeItem * best_item = _lattice_cmp[0][len - 1]; - __BUILD_TREE(inst, best_item); + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = _lattice_cmp[0][len - 1]; + __BUILD_TREE(inst, best_item); } void Decoder1O::free_lattice() { - int len = _lattice_cmp.nrows(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - for (int l = 0; l < L; ++ l) { - if (_lattice_incmp[i][j][l]) { - delete _lattice_incmp[i][j][l]; - } - } - - delete _lattice_cmp[i][j]; + int len = _lattice_cmp.nrows(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + for (int l = 0; l < L; ++ l) { + if (_lattice_incmp[i][j][l]) { + delete _lattice_incmp[i][j][l]; } + } + + delete _lattice_cmp[i][j]; } + } } } // end for namespace parser diff --git a/src/parser/decoder1o.h b/src/parser/decoder1o.h index e46431937..d3265ff2f 
100644 --- a/src/parser/decoder1o.h +++ b/src/parser/decoder1o.h @@ -1,12 +1,11 @@ -#ifndef __DECODER_1_O_H__ -#define __DECODER_1_O_H__ +#ifndef __LTP_PARSER_DECODER_1_O_H__ +#define __LTP_PARSER_DECODER_1_O_H__ -#include "instance.h" -#include "decoder.h" -#include "mat.h" -#include "options.h" - -#include "debug.h" +#include "parser/instance.h" +#include "parser/decoder.h" +#include "parser/options.h" +#include "parser/debug.h" +#include "utils/math/mat.h" namespace ltp { namespace parser { @@ -15,21 +14,21 @@ using namespace ltp::math; class Decoder1O : public Decoder { public: - Decoder1O(int _L = 1) : L(_L) {} + Decoder1O(int _L = 1) : L(_L) {} protected: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); protected: - int L; + int L; - Mat< const LatticeItem * > _lattice_cmp; // complete span - Mat3< const LatticeItem * > _lattice_incmp; // incomplete span -}; // end for class Decoder1O + Mat< const LatticeItem * > _lattice_cmp; // complete span + Mat3< const LatticeItem * > _lattice_incmp; // incomplete span +}; // end for class Decoder1O -} // end for namespace parser -} // end for namespace ltp +} // end for namespace parser +} // end for namespace ltp -#endif // end for __DECODER_1_O_H__ +#endif // end for __LTP_PARSER_DECODER_1_O_H__ diff --git a/src/parser/decoder2o.cpp b/src/parser/decoder2o.cpp index 10ae9a48c..ec5a5a7e1 100644 --- a/src/parser/decoder2o.cpp +++ b/src/parser/decoder2o.cpp @@ -1,5 +1,5 @@ -#include "decoder2o.h" -#include "options.h" +#include "parser/decoder2o.h" +#include "parser/options.h" namespace ltp { namespace parser { @@ -9,296 +9,296 @@ namespace parser { // ================================================================ // void Decoder2O::init_lattice(const Instance * inst) { 
- int len = inst->size(); - _lattice_cmp.resize(len, len); - _lattice_sib.resize(len, len); - _lattice_incmp.resize(len, len); - - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - _lattice_cmp[i][j] = 0; - _lattice_sib[i][j] = 0; - _lattice_incmp[i][j] = 0; - } - } - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i] = new LatticeItem(i); + int len = inst->size(); + _lattice_cmp.resize(len, len); + _lattice_sib.resize(len, len); + _lattice_incmp.resize(len, len); + + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + _lattice_cmp[i][j] = 0; + _lattice_sib[i][j] = 0; + _lattice_incmp[i][j] = 0; } + } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i] = new LatticeItem(i); + } } void Decoder2O::decode_projective(const Instance * inst) { - int len = inst->size(); - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - - for (int l = 0; l < L; ++ l) { - double shared_score = 0.; - - if (feat_opt.use_unlabeled_dependency) { - shared_score += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - shared_score += inst->depl_scores[s][t][l]; - } - - { // I(s,t) = C(s,s) + C(t,s+1) - const LatticeItem * const left = _lattice_cmp[s][s]; - const LatticeItem * const right = _lattice_cmp[t][s + 1]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][s]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][s][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t], item); - } // end for I(s,t) = C(s,s) + C(t,s+1) - - { // I(s,t) = I(s,r) + S(r,t) - for (int r = s + 1; r < t; ++ r) { - const LatticeItem * const left = _lattice_incmp[s][r]; - const LatticeItem * const right = _lattice_sib[r][t]; - - if (!left || 
!right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][r]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][r][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t], item); - } - } // end for I(s,t) = I(s,r) + S(r,t) - - } // end for for (l = 0; l < L; ++ l) - - if (s != 0) { // I(t,s) = C(s, t-1) + C(t, t) - for (int l = 0; l < L; ++ l) { - double shared_score = 0.; - - if (feat_opt.use_unlabeled_dependency) { - shared_score += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - shared_score += inst->depl_scores[t][s][l]; - } - - { // I(t,s) = C(s,t-1) + C(t,t) - const LatticeItem * const left = _lattice_cmp[s][t-1]; - const LatticeItem * const right = _lattice_cmp[t][t]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][t]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[t][s], item); - } // end for I(t, s) = C(s,t-1) + C(t,t) - - { // I(t,s) = S(s,r) + I(t,r) - for (int r = s + 1; r < t; ++ r) { - const LatticeItem * const left = _lattice_sib[s][r]; - const LatticeItem * const right = _lattice_incmp[t][r]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][r]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][r][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - score, - left, - right, - l); - - 
lattice_insert(_lattice_incmp[t][s], item); - } - } // end for I(t,s) = S(s,r) + I(t,r) - - } - } // end for if (s != 0) - - { // S(s,t) = C(s,r) + C(t,r+1) - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[s][r]; - const LatticeItem * const right = _lattice_cmp[t][r+1]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - const LatticeItem * const item = new LatticeItem(SIBSP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_sib[s][t], item); - } - } // end for S(s,t) = C(s,t) + C(t,r+1) - - { // C(s,t) = I(s,r) + C(r,t) - for (int r = s + 1; r <= t; ++ r) { - const LatticeItem * const left = _lattice_incmp[s][r]; - const LatticeItem * const right = _lattice_cmp[r][t]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_last_sibling) { - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][r][r]; - } - - if (feat_opt.use_labeled_sibling) { - int l = left->_label_s_t; - score += inst->sibl_scores[s][r][r][l]; - } - } - - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_cmp[s][t], item); - - } + int len = inst->size(); + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + + for (int l = 0; l < L; ++ l) { + double shared_score = 0.; + + if (feat_opt.use_unlabeled_dependency) { + shared_score += inst->depu_scores[s][t]; + } + + if (feat_opt.use_labeled_dependency) { + shared_score += inst->depl_scores[s][t][l]; + } + + { // I(s,t) = C(s,s) + C(t,s+1) + const LatticeItem * const left = _lattice_cmp[s][s]; + const LatticeItem * const right = _lattice_cmp[t][s + 1]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][s]; + } + + if 
(feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t], item); + } // end for I(s,t) = C(s,s) + C(t,s+1) + + { // I(s,t) = I(s,r) + S(r,t) + for (int r = s + 1; r < t; ++ r) { + const LatticeItem * const left = _lattice_incmp[s][r]; + const LatticeItem * const right = _lattice_sib[r][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][r]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][r][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t], item); + } + } // end for I(s,t) = I(s,r) + S(r,t) + + } // end for for (l = 0; l < L; ++ l) + + if (s != 0) { // I(t,s) = C(s, t-1) + C(t, t) + for (int l = 0; l < L; ++ l) { + double shared_score = 0.; + + if (feat_opt.use_unlabeled_dependency) { + shared_score += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + shared_score += inst->depl_scores[t][s][l]; + } + + { // I(t,s) = C(s,t-1) + C(t,t) + const LatticeItem * const left = _lattice_cmp[s][t-1]; + const LatticeItem * const right = _lattice_cmp[t][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][t]; } - if (s != 0) { - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[r][s]; - const LatticeItem * const right = _lattice_incmp[t][r]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_last_sibling) { - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][r][r]; - } - - if 
(feat_opt.use_labeled_sibling) { - int l = right->_label_s_t; - score += inst->sibl_scores[t][r][r][l]; - } - } - - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - score, - left, - right); - - lattice_insert(_lattice_cmp[t][s], item); - } + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][t][l]; } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[t][s], item); + } // end for I(t, s) = C(s,t-1) + C(t,t) + + { // I(t,s) = S(s,r) + I(t,r) + for (int r = s + 1; r < t; ++ r) { + const LatticeItem * const left = _lattice_sib[s][r]; + const LatticeItem * const right = _lattice_incmp[t][r]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][r]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][r][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[t][s], item); + } + } // end for I(t,s) = S(s,r) + I(t,r) + } - } -} + } // end for if (s != 0) -void Decoder2O::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); - } + { // S(s,t) = C(s,r) + C(t,r+1) + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[s][r]; + const LatticeItem * const right = _lattice_cmp[t][r+1]; - const LatticeItem * best_item = _lattice_cmp[0][len - 1]; - __BUILD_TREE(inst, best_item); -} + if (!left || !right) { + continue; + } -void Decoder2O::free_lattice() { - int len = _lattice_cmp.nrows(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - if (_lattice_incmp[i][j]) { - delete _lattice_incmp[i][j]; + double score = left->_prob + right->_prob; + + const LatticeItem * 
const item = new LatticeItem(SIBSP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_sib[s][t], item); + } + } // end for S(s,t) = C(s,t) + C(t,r+1) + + { // C(s,t) = I(s,r) + C(r,t) + for (int r = s + 1; r <= t; ++ r) { + const LatticeItem * const left = _lattice_incmp[s][r]; + const LatticeItem * const right = _lattice_cmp[r][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_last_sibling) { + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][r][r]; } - if (_lattice_cmp[i][j]) { - delete _lattice_cmp[i][j]; + if (feat_opt.use_labeled_sibling) { + int l = left->_label_s_t; + score += inst->sibl_scores[s][r][r][l]; + } + } + + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_cmp[s][t], item); + + } + } + + if (s != 0) { + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[r][s]; + const LatticeItem * const right = _lattice_incmp[t][r]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_last_sibling) { + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][r][r]; } - if (_lattice_sib[i][j]) { - delete _lattice_sib[i][j]; + if (feat_opt.use_labeled_sibling) { + int l = right->_label_s_t; + score += inst->sibl_scores[t][r][r][l]; } + } + + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + score, + left, + right); + + lattice_insert(_lattice_cmp[t][s], item); } + } + } + } +} + +void Decoder2O::get_result(Instance * inst) { + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = _lattice_cmp[0][len - 1]; + __BUILD_TREE(inst, best_item); +} + +void Decoder2O::free_lattice() { + int len = _lattice_cmp.nrows(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j 
< len; ++ j) { + if (_lattice_incmp[i][j]) { + delete _lattice_incmp[i][j]; + } + + if (_lattice_cmp[i][j]) { + delete _lattice_cmp[i][j]; + } + + if (_lattice_sib[i][j]) { + delete _lattice_sib[i][j]; + } } + } } @@ -306,335 +306,335 @@ void Decoder2O::free_lattice() { // 2nd-order Decoder using dependency, sibling and grand features // // ================================================================ // void Decoder2OCarreras::init_lattice(const Instance * inst) { - int len = inst->size(); - _lattice_cmp.resize(len, len, len); - _lattice_incmp.resize(len, len, L); + int len = inst->size(); + _lattice_cmp.resize(len, len, len); + _lattice_incmp.resize(len, len, L); - _lattice_cmp = NULL; - _lattice_incmp = NULL; + _lattice_cmp = NULL; + _lattice_incmp = NULL; - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i][i] = new LatticeItem(i); - } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i][i] = new LatticeItem(i); + } } void Decoder2OCarreras::decode_projective(const Instance * inst) { - int len = inst->size(); - - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - - // I(s, t) = C(s, r) + C(t, r + 1) - for (int l = 0; l < L; ++ l) { - for (int r = s; r < t; ++ r) { - const LatticeItem * best_left_item = 0; - double best_left_score = DOUBLE_NEG_INF; - - for (int cs = s; cs <= r; ++ cs) { - if (cs == s && s != r) { - continue; - } - - const LatticeItem * item = _lattice_cmp[s][r][cs]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][cs]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][cs][l]; - } - - if (score > best_left_score) { - best_left_item = item; - best_left_score = score; - } - } - - const LatticeItem * best_right_item = 0; - double best_right_score = DOUBLE_NEG_INF; - - for (int ct = r + 1; ct <= t; ++ ct) { - if (ct == t && r + 1 != t) { - continue; - } + int len 
= inst->size(); + + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + + // I(s, t) = C(s, r) + C(t, r + 1) + for (int l = 0; l < L; ++ l) { + for (int r = s; r < t; ++ r) { + const LatticeItem * best_left_item = 0; + double best_left_score = DOUBLE_NEG_INF; + + for (int cs = s; cs <= r; ++ cs) { + if (cs == s && s != r) { + continue; + } + + const LatticeItem * item = _lattice_cmp[s][r][cs]; + + if (!item) { + continue; + } - const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; - - if (!item) { - continue; - } - - double score = item->_prob; + double score = item->_prob; - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || ct != t)) { - score += inst->grdu_scores[s][t][ct == t ? s : ct]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || ct != t)) { - score += inst->grdl_scores[s][t][ct == t ? s : ct][l]; - } - - if (score > best_right_score) { - best_right_item = item; - best_right_score = score; - } - } - - if (best_left_item && best_right_item) { - double score = best_left_score + best_right_score; - - if (feat_opt.use_unlabeled_dependency) { - score += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - score += inst->depl_scores[s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - best_left_item, - best_right_item, - l); - - lattice_insert(_lattice_incmp[s][t][l], item); - } // end for if !left || !right - } - } // end for for l = 0; l < L; ++ l - - if (s != 0) { - // I(t, s) = C(s, r) + C(t, r + 1) - for (int l = 0; l < L; ++ l) { - for (int r = s; r < t; ++ r) { - const LatticeItem * best_left_item = 0; - double best_left_score = DOUBLE_NEG_INF; - - for (int cs = s; cs <= r; ++ cs) { - if (cs == s && s != r) { - continue; - } - - const LatticeItem * item = _lattice_cmp[s][r][cs]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_grand && - 
(feat_opt.use_no_grand || cs != s)) { - score += inst->grdu_scores[t][s][cs]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cs != s)) { - score += inst->grdl_scores[t][s][cs][l]; - } - - if (score > best_left_score) { - best_left_item = item; - best_left_score = score; - } - } - - const LatticeItem * best_right_item = 0; - double best_right_score = DOUBLE_NEG_INF; - - for (int ct = r + 1; ct <= t; ++ ct) { - if (ct == t && r + 1 != t) { - continue; - } - - const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][ct]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][ct][l]; - } - - if (score > best_right_score) { - best_right_item = item; - best_right_score = score; - } - } - - if (best_left_item && best_right_item) { - double score = best_left_score + best_right_score; - - if (feat_opt.use_unlabeled_dependency) { - score += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - score += inst->depl_scores[t][s][l]; - } - - const LatticeItem * const item = new LatticeItem( INCMP, - t, - s, - score, - best_left_item, - best_right_item, - l); - - lattice_insert(_lattice_incmp[t][s][l], item); - } - } - } - } // end for if s != 0 - - for (int m = s; m <= t; ++ m) { - if (m != s) { // C(s, t, m) = I(s, m, l) + C(m, t, cm); - for (int l = 0; l < L; ++ l) { - const LatticeItem * const left = _lattice_incmp[s][m][l]; - - if (!left) { - continue; - } - - for (int cm = m; cm <= t; ++ cm) { - if (cm == m && cm != t) { - continue; - } - - const LatticeItem * const right = _lattice_cmp[m][t][cm]; - - if (!right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdu_scores[s][m][cm]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cm != 
m)) { - score += inst->grdl_scores[s][m][cm][l]; - } - - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_cmp[s][t][m], item); - } // end for (int cm = m; cm <= t; ++ cm) - } // enf for (int l = 0; l < L; ++ l) - } // end for if (m != s) - - if (m != t && s != 0) { // C(t, s, m) = C(m, s, cm) + I(t, m, l) - for (int l = 0; l < L; ++ l) { - const LatticeItem * const right = _lattice_incmp[t][m][l]; - - if (!right) { - continue; - } - - for (int cm = s; cm <= m; ++ cm) { - if (cm == m && cm != s) { - continue; - } - - const LatticeItem * const left = _lattice_cmp[m][s][cm]; - - if (!left) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdu_scores[t][m][cm == m ? t : cm]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdl_scores[t][m][cm == m ? t : cm][l]; - } - - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - score, - left, - right); - - lattice_insert(_lattice_cmp[t][s][m], item); - } // end for (int cm = s; cm <= m; ++ cm) - } - } + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][cs]; } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][cs][l]; + } + + if (score > best_left_score) { + best_left_item = item; + best_left_score = score; + } + } + + const LatticeItem * best_right_item = 0; + double best_right_score = DOUBLE_NEG_INF; + + for (int ct = r + 1; ct <= t; ++ ct) { + if (ct == t && r + 1 != t) { + continue; + } + + const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || ct != t)) { + score += inst->grdu_scores[s][t][ct == t ? 
s : ct]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || ct != t)) { + score += inst->grdl_scores[s][t][ct == t ? s : ct][l]; + } + + if (score > best_right_score) { + best_right_item = item; + best_right_score = score; + } + } + + if (best_left_item && best_right_item) { + double score = best_left_score + best_right_score; + + if (feat_opt.use_unlabeled_dependency) { + score += inst->depu_scores[s][t]; + } + + if (feat_opt.use_labeled_dependency) { + score += inst->depl_scores[s][t][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + best_left_item, + best_right_item, + l); + + lattice_insert(_lattice_incmp[s][t][l], item); + } // end for if !left || !right } + } // end for for l = 0; l < L; ++ l + + if (s != 0) { + // I(t, s) = C(s, r) + C(t, r + 1) + for (int l = 0; l < L; ++ l) { + for (int r = s; r < t; ++ r) { + const LatticeItem * best_left_item = 0; + double best_left_score = DOUBLE_NEG_INF; + + for (int cs = s; cs <= r; ++ cs) { + if (cs == s && s != r) { + continue; + } + + const LatticeItem * item = _lattice_cmp[s][r][cs]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || cs != s)) { + score += inst->grdu_scores[t][s][cs]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cs != s)) { + score += inst->grdl_scores[t][s][cs][l]; + } + + if (score > best_left_score) { + best_left_item = item; + best_left_score = score; + } + } + + const LatticeItem * best_right_item = 0; + double best_right_score = DOUBLE_NEG_INF; + + for (int ct = r + 1; ct <= t; ++ ct) { + if (ct == t && r + 1 != t) { + continue; + } + + const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][ct]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][ct][l]; + } + + 
if (score > best_right_score) { + best_right_item = item; + best_right_score = score; + } + } + + if (best_left_item && best_right_item) { + double score = best_left_score + best_right_score; + + if (feat_opt.use_unlabeled_dependency) { + score += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + score += inst->depl_scores[t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + best_left_item, + best_right_item, + l); + + lattice_insert(_lattice_incmp[t][s][l], item); + } + } + } + } // end for if s != 0 + + for (int m = s; m <= t; ++ m) { + if (m != s) { // C(s, t, m) = I(s, m, l) + C(m, t, cm); + for (int l = 0; l < L; ++ l) { + const LatticeItem * const left = _lattice_incmp[s][m][l]; + + if (!left) { + continue; + } + + for (int cm = m; cm <= t; ++ cm) { + if (cm == m && cm != t) { + continue; + } + + const LatticeItem * const right = _lattice_cmp[m][t][cm]; + + if (!right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdu_scores[s][m][cm]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdl_scores[s][m][cm][l]; + } + + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_cmp[s][t][m], item); + } // end for (int cm = m; cm <= t; ++ cm) + } // enf for (int l = 0; l < L; ++ l) + } // end for if (m != s) + + if (m != t && s != 0) { // C(t, s, m) = C(m, s, cm) + I(t, m, l) + for (int l = 0; l < L; ++ l) { + const LatticeItem * const right = _lattice_incmp[t][m][l]; + + if (!right) { + continue; + } + + for (int cm = s; cm <= m; ++ cm) { + if (cm == m && cm != s) { + continue; + } + + const LatticeItem * const left = _lattice_cmp[m][s][cm]; + + if (!left) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_unlabeled_grand && + 
(feat_opt.use_no_grand || cm != m)) { + score += inst->grdu_scores[t][m][cm == m ? t : cm]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdl_scores[t][m][cm == m ? t : cm][l]; + } + + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + score, + left, + right); + + lattice_insert(_lattice_cmp[t][s][m], item); + } // end for (int cm = s; cm <= m; ++ cm) + } + } + } } + } } void Decoder2OCarreras::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = NULL; + for (int c = 1; c < len; ++ c) { + const LatticeItem * item = _lattice_cmp[0][len - 1][c]; + if (!item) { + continue; } - const LatticeItem * best_item = NULL; - for (int c = 1; c < len; ++ c) { - const LatticeItem * item = _lattice_cmp[0][len - 1][c]; - if (!item) { - continue; - } - - if (NULL == best_item || best_item->_prob < item->_prob) { - best_item = item; - } + if (NULL == best_item || best_item->_prob < item->_prob) { + best_item = item; } + } - __BUILD_TREE(inst, best_item); + __BUILD_TREE(inst, best_item); } void Decoder2OCarreras::free_lattice() { - int len = _lattice_cmp.dim1(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - for (int l = 0; l < L; ++ l) { - if (_lattice_incmp[i][j][l]) { - delete _lattice_incmp[i][j][l]; - } - } + int len = _lattice_cmp.dim1(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + for (int l = 0; l < L; ++ l) { + if (_lattice_incmp[i][j][l]) { + delete _lattice_incmp[i][j][l]; + } + } - for (int k = 0; k < len; ++ k) { - if (_lattice_cmp[i][j][k]) { - delete _lattice_cmp[i][j][k]; - } - } + for (int k = 0; k < len; ++ k) { + if (_lattice_cmp[i][j][k]) { + delete 
_lattice_cmp[i][j][k]; } + } } + } } } // end for namespace parser diff --git a/src/parser/decoder2o.h b/src/parser/decoder2o.h index fcd62227d..37280d185 100644 --- a/src/parser/decoder2o.h +++ b/src/parser/decoder2o.h @@ -1,7 +1,7 @@ -#ifndef __DECODER_2_O_H__ -#define __DECODER_2_O_H__ +#ifndef __LTP_PARSER_DECODER_2_O_H__ +#define __LTP_PARSER_DECODER_2_O_H__ -#include "decoder.h" +#include "parser/decoder.h" namespace ltp { namespace parser { @@ -9,39 +9,39 @@ namespace parser { // 2nd-order decoder with dependency features and sibling features class Decoder2O : public Decoder { public: - Decoder2O(int _L = 1) : L(_L) {} + Decoder2O(int _L = 1) : L(_L) {} public: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); private: - int L; - Mat< const LatticeItem * > _lattice_cmp; - Mat< const LatticeItem * > _lattice_incmp; - Mat< const LatticeItem * > _lattice_sib; + int L; + Mat< const LatticeItem * > _lattice_cmp; + Mat< const LatticeItem * > _lattice_incmp; + Mat< const LatticeItem * > _lattice_sib; }; // 2nd-order decoder with dependency, sibling and grand features class Decoder2OCarreras : public Decoder { public: - Decoder2OCarreras(int _L = 1) : L(_L) {} + Decoder2OCarreras(int _L = 1) : L(_L) {} public: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); private: - int L; - Mat3< const LatticeItem * > _lattice_cmp; - Mat3< const LatticeItem * > _lattice_incmp; + int L; + Mat3< const LatticeItem * > _lattice_cmp; + Mat3< const LatticeItem * > _lattice_incmp; }; } 
// end for namespace parser } // end for namespace ltp -#endif // end for __DECODER_2_O__ +#endif // end for __LTP_PARSER_DECODER_2_O_H__ diff --git a/src/parser/extractor.cpp b/src/parser/extractor.cpp index 00bf56ad8..1df4f3f0a 100644 --- a/src/parser/extractor.cpp +++ b/src/parser/extractor.cpp @@ -1,20 +1,20 @@ -#include "extractor.h" -#include "options.h" -#include "settings.h" +#include "parser/extractor.h" +#include "parser/options.h" +#include "parser/settings.h" -#define LEN(x) (x.size()) -#define LAST(x) ((x)[(x).size()-1]) -#define FIRST(x) ((x)[0]) +#define LEN(x) (x.size()) +#define LAST(x) ((x)[(x).size()-1]) +#define FIRST(x) ((x)[0]) #define PUSH(x) do {\ - cache.push_back((x)); \ + cache.push_back((x)); \ }while(0); #define PUSH_DIST(x) do { \ - if (feat_opt.use_distance_in_features) { \ - (x).append(dist); \ - PUSH(x); \ - } \ + if (feat_opt.use_distance_in_features) { \ + (x).append(dist); \ + PUSH(x); \ + } \ } while (0); namespace ltp { @@ -22,33 +22,33 @@ namespace parser { // function of GET direction void Extractor::__GET_DIRECTION(int head_id, int child_id, string& direction) { - if (head_id == 0) { - direction = "L#R"; - } else { - direction = (head_id > child_id ? "L" : "R"); - } + if (head_id == 0) { + direction = "L#R"; + } else { + direction = (head_id > child_id ? "L" : "R"); + } } void Extractor::__GET_DISTANCE_1_2_36_7(int head_id, int child_id, string& distance) { - int dist = (head_id > child_id ? head_id - child_id : child_id - head_id) ; - - if (dist < 3) { - ostringstream S; S << dist; - distance = S.str(); - } else if (dist < 7) { - distance = "<7"; - } else { - distance = ">6"; - } + int dist = (head_id > child_id ? 
head_id - child_id : child_id - head_id) ; + + if (dist < 3) { + ostringstream S; S << dist; + distance = S.str(); + } else if (dist < 7) { + distance = "<7"; + } else { + distance = ">6"; + } } const string POSUExtractor::prefix = "PU-"; // ================================================================ // -// Dependency Features Extractor // -// feature templates is listed in `extractor.h` // +// Dependency Features Extractor // +// feature templates is listed in `extractor.h` // // the DEPExtractor is a singleton, which only be construct once // -// during the life of the program. // +// during the life of the program. // // ================================================================ // // Initialize the static member @@ -58,302 +58,302 @@ vector