diff --git a/.gitignore b/.gitignore index 2a95a24e0..c41f82d27 100644 --- a/.gitignore +++ b/.gitignore @@ -24,21 +24,28 @@ config.h include/ lib/ bin/ -lgdpj -lgsrl -otcws -otpos -otner -maxent -cws -cws_cmdline -multi_cws_cmdline -pos -par -ner +tools/train/lgdpj +tools/train/lgsrl +tools/train/otcws +tools/train/otpos +tools/train/otner +tools/train/maxent +examples/cws +examples/cws_cmdline +examples/multi_cws_cmdline +examples/pos +examples/pos_cmdline +examples/multi_pos_cmdline +examples/par +examples/ner ############### # data file # ############### new_ltp_data/ ltp_data/ + +################## +# running folder # +################## +dummy/ diff --git a/ChangeLog.md b/ChangeLog.md index e657e7628..984652995 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,3 +1,11 @@ +2014-01-16 +---------- +* 在分词、词性标注和依存句法分析模块中加入模型裁剪功能,减少了模型大小。用户可以通过配置文件里的rare-feature-threshold参数配置裁剪力度,如果rare-feature-threshold为0,则只去掉为0的特征;rare-feature-threshold大于0时将一步去掉更新次数低于阈值的特征。这一优化方法主要参考[Learning Sparser Perceptron Models](http://www.cs.bgu.ac.il/~yoavg/publications/acl2011sparse.pdf)。 +* 修复了词性标注、命名实体识别、依存句法分析训练套件中的内存泄露问题 +* 修复了语义角色标注的内存泄露问题 +* 增加了`ltp_server`在异常输入情况下返回错误代码,如果输入数据编码错误或者输入xml不符合规则,将返回400 +* 修复了词性标注、命名实体识别模型文件的错误标示符,这项修改将导致3.1.0以及之后的版本不能与3.0.x的模型兼容,请务必注意 + 2013-09-29 ---------- * 解决windows编译问题 diff --git a/doc/ltp-document-3.0.md b/doc/ltp-document-3.0.md index 7f9e1c9b7..ab55e541c 100644 --- a/doc/ltp-document-3.0.md +++ b/doc/ltp-document-3.0.md @@ -43,7 +43,9 @@ LTP提供了一系列中文自然语言处理工具,用户可以使用这些 同时,你可以从以下一些地方获得LTP的模型。 -* +* [百度云](http://pan.baidu.com/share/link?shareid=1988562907&uk=2738088569) +* 当前模型版本3.1.0 + ## 安装CMake @@ -860,7 +862,7 @@ otcws是ltp分词模型的训练套件,用户可以使用otcws训练获得ltp otcws分别支持从人工切分数据中训练分词模型和调用分词模型对句子进行切分。人工切分的句子的样例如下: - 对外 , 他们 代表 国家 。 + 对外 , 他们 代表 国家 。 otcws主要通过配置文件指定执行的工作,其中主要有两类配置文件:训练配置和测试配置。 @@ -872,15 +874,17 @@ otcws主要通过配置文件指定执行的工作,其中主要有两类配置 algorithm = pa model-name = model/ctb5-seg max-iter = 5 + rare-feature-threshold = 0 其中, * [train] 配置组指定执行训练 - * Ttain-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * ttain-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 配置裁剪力度,如果rare-feature-threshold为0,则只去掉为0的特征;rare-feature-threshold;如果大于0时将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 @@ -891,8 +895,8 @@ otcws主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 切分结果将输入到标准io中。 @@ -924,11 +928,12 @@ otpos主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [train] 配置组指定执行训练 - * Ttain-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * ttain-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 配置裁剪力度,如果rare-feature-threshold为0,则只去掉为0的特征;rare-feature-threshold;如果大于0时将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 @@ -939,8 +944,8 @@ otpos主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 词性标注结果将输入到标准io中。 @@ -972,11 +977,11 @@ Otner主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [train] 配置组指定执行训练 - * Train-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otner支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otner支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 测试配置的配置文件样例如下所示。 @@ -987,8 +992,8 @@ Otner主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 命名实体识别结果将输入到标准io中。 @@ -1000,7 +1005,7 @@ lgdpj是ltp依存句法分析模型的训练套件,用户可以使用lgdpj训 编译之后,在tools/train下面会产生名为lgdpj的二进制程序。调用方法是 - ./lgdpj [config_file]。 + ./lgdpj [config_file] lgdpj分别支持从人工标注依存句法的数据中训练依存句法分析模型和调用依存句法分析模型对句子进行依存句法分析。人工标注的词性标注依存句法的句子遵从conll格式,其样例如下: @@ -1039,6 +1044,7 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 max-iter = 5 algorithm = pa model-name = model/parser/ldc-o2carreras + rare-feature-threshold = 0 其中, @@ -1047,11 +1053,12 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 * decoder-name 表示采用的解码算法,现在lgdpj支持三种解码算法,分别是1o,2o-sib,2o-carreras * [feature] 配置组指定使用的特征 * [train] 配置组指定执行训练 - * Train-file 配置项指定训练集文件 - * Holdout-file 配置项指定开发集文件 - * Algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 - * Model-name 指定输出模型文件名 - * Max-iter 指定最大迭代次数 + * train-file 配置项指定训练集文件 + * holdout-file 配置项指定开发集文件 + * algorithm 指定参数学习方法,现在otcws支持两种参数学习方法,分别是passive aggressive(pa)和average perceptron(ap)。 + * model-name 指定输出模型文件名 + * max-iter 指定最大迭代次数 + * rare-feature-threshold 配置裁剪力度,如果rare-feature-threshold为0,则只去掉为0的特征;rare-feature-threshold;如果大于0时将进一步去掉更新次数低于阈值的特征 测试配置的配置文件样例如下所示。 @@ -1062,8 +1069,8 @@ lgdpj主要通过配置文件指定执行的工作,其中主要有两类配置 其中, * [test] 配置组指定执行测试 - * Test-file 指定测试文件 - * Model-file 指定模型文件位置 + * test-file 指定测试文件 + * model-file 指定模型文件位置 依存句法分析结果将输入到标准io中。 diff --git a/examples/cws.cpp b/examples/cws.cpp index 0f42063e0..6a9355c54 100644 --- a/examples/cws.cpp +++ b/examples/cws.cpp @@ -3,38 +3,38 @@ #include "segment_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - std::cerr << "cws [model path] [lexicon_file]" << std::endl; - return 1; - } + if (argc < 2) { + std::cerr << "cws [model path] [lexicon_file]" << std::endl; + return 1; + } - void * engine = 0; - if (argc == 2) { - engine = segmentor_create_segmentor(argv[1]); - } else if (argc == 3) { - engine = segmentor_create_segmentor(argv[1], argv[2]); - } + void * engine = 0; + if (argc == 2) { + engine = segmentor_create_segmentor(argv[1]); + } else if (argc == 3) { + engine = segmentor_create_segmentor(argv[1], argv[2]); + } - if (!engine) { - return -1; - } - std::vector words; + if (!engine) { + return -1; + } + std::vector words; - const char * suite[2] = { - "What's wrong with you? 别灰心! http://t.cn/zQz0Rn", - "台北真的是天子骄子吗?",}; + const char * suite[2] = { + "What's wrong with you? 别灰心! http://t.cn/zQz0Rn", + "台北真的是天子骄子吗?",}; - for (int i = 0; i < 2; ++ i) { - words.clear(); - int len = segmentor_segment(engine, suite[i], words); - for (int i = 0; i < len; ++ i) { - std::cout << words[i]; - if (i+1 == len) std::cout < words; - std::string sentence; + if (!engine) { + return -1; + } + std::vector words; + std::string sentence; - std::cerr << "TRACE: Model is loaded" << std::endl; - double tm = get_time(); + std::cerr << "TRACE: Model is loaded" << std::endl; + double tm = get_time(); - while (std::getline(std::cin, sentence, '\n')) { - words.clear(); - if (sentence.size() == 0) { continue; } - int len = segmentor_segment(engine, sentence, words); - for (int i = 0; i < len; ++ i) { - std::cout << words[i]; - if (i+1 == len) std::cout < +#include #include #include #include @@ -91,7 +92,9 @@ void multithreaded_segment( void * args) { int main(int argc, char ** argv) { if (argc < 2 || (0 == strcmp(argv[1], "-h"))) { - std::cerr << "Example: ./multi_cws_cmdline [model path] [lexicon file]=NULL threadnum" << std::endl; + std::cerr << "Example: ./multi_cws_cmdline " + << "[model path] [lexicon file](optional) threadnum" + << std::endl; std::cerr << std::endl; std::cerr << "This program recieve input word sequence from stdin." << std::endl; std::cerr << "One sentence per line." << std::endl; diff --git a/examples/ner.cpp b/examples/ner.cpp index 223e31402..c1c3589e1 100644 --- a/examples/ner.cpp +++ b/examples/ner.cpp @@ -4,40 +4,40 @@ #include "ner_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - std::cerr << "usage: ./ner [model_path]" << std::endl; - return -1; - } - - void * engine = ner_create_recognizer(argv[1]); - if (!engine) { - std::cerr << "failed to load model" << std::endl; - return -1; - } - - std::vector words; - std::vector postags; - - words.push_back("中国"); postags.push_back("ns"); - words.push_back("国际"); postags.push_back("n"); - words.push_back("广播"); postags.push_back("n"); - words.push_back("电台"); postags.push_back("n"); - words.push_back("创办"); postags.push_back("v"); - words.push_back("于"); postags.push_back("p"); - words.push_back("1941年"); postags.push_back("m"); - words.push_back("12月"); postags.push_back("m"); - words.push_back("3日"); postags.push_back("m"); - words.push_back("。"); postags.push_back("wp"); - - std::vector tags; - - ner_recognize(engine, words, postags, tags); - - for (int i = 0; i < tags.size(); ++ i) { - std::cout << words[i] << "\t" << postags[i] << "\t" << tags[i] << std::endl; - } - - ner_release_recognizer(engine); - return 0; + if (argc < 2) { + std::cerr << "usage: ./ner [model_path]" << std::endl; + return -1; + } + + void * engine = ner_create_recognizer(argv[1]); + if (!engine) { + std::cerr << "failed to load model" << std::endl; + return -1; + } + + std::vector words; + std::vector postags; + + words.push_back("中国"); postags.push_back("ns"); + words.push_back("国际"); postags.push_back("n"); + words.push_back("广播"); postags.push_back("n"); + words.push_back("电台"); postags.push_back("n"); + words.push_back("创办"); postags.push_back("v"); + words.push_back("于"); postags.push_back("p"); + words.push_back("1941年"); postags.push_back("m"); + words.push_back("12月"); postags.push_back("m"); + words.push_back("3日"); postags.push_back("m"); + words.push_back("。"); postags.push_back("wp"); + + std::vector tags; + + ner_recognize(engine, words, postags, tags); + + for (int i = 0; i < tags.size(); ++ i) { + std::cout << words[i] << "\t" << postags[i] << "\t" << tags[i] << std::endl; + } + + ner_release_recognizer(engine); + return 0; } diff --git a/examples/par.cpp b/examples/par.cpp index 2db1855e7..02ed9dcf2 100644 --- a/examples/par.cpp +++ b/examples/par.cpp @@ -4,34 +4,34 @@ #include "parser_dll.h" int main(int argc, char * argv[]) { - if (argc < 2) { - return -1; - } + if (argc < 2) { + return -1; + } - void * engine = parser_create_parser(argv[1]); - if (!engine) { - return -1; - } + void * engine = parser_create_parser(argv[1]); + if (!engine) { + return -1; + } - std::vector words; - std::vector postags; + std::vector words; + std::vector postags; - words.push_back("一把手"); postags.push_back("n"); - words.push_back("亲自"); postags.push_back("d"); - words.push_back("过问"); postags.push_back("v"); - words.push_back("。"); postags.push_back("wp"); + words.push_back("一把手"); postags.push_back("n"); + words.push_back("亲自"); postags.push_back("d"); + words.push_back("过问"); postags.push_back("v"); + words.push_back("。"); postags.push_back("wp"); - std::vector heads; - std::vector deprels; + std::vector heads; + std::vector deprels; - parser_parse(engine, words, postags, heads, deprels); + parser_parse(engine, words, postags, heads, deprels); - for (int i = 0; i < heads.size(); ++ i) { - std::cout << words[i] << "\t" << postags[i] << "\t" - << heads[i] << "\t" << deprels[i] << std::endl; - } + for (int i = 0; i < heads.size(); ++ i) { + std::cout << words[i] << "\t" << postags[i] << "\t" + << heads[i] << "\t" << deprels[i] << std::endl; + } - parser_release_parser(engine); - return 0; + parser_release_parser(engine); + return 0; } diff --git a/examples/pos_cmdline.cpp b/examples/pos_cmdline.cpp index feca053b5..2785bfe33 100644 --- a/examples/pos_cmdline.cpp +++ b/examples/pos_cmdline.cpp @@ -21,62 +21,62 @@ #include "postag_dll.h" double get_time(void) { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec + (tv.tv_usec / 1000000.0); + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec / 1000000.0); } int main(int argc, char * argv[]) { - if (argc < 1 || (0 == strcmp(argv[1], "-h"))) { - std::cerr << "Example: ./pos_cmdline [model path]" << std::endl; - std::cerr << std::endl; - std::cerr << "This program recieve input word sequence from stdin." << std::endl; - std::cerr << "One sentence per line. Words are separated by space." << std::endl; - std::cerr << std::endl; - return 1; - } + if (argc < 1 || (0 == strcmp(argv[1], "-h"))) { + std::cerr << "Example: ./pos_cmdline [model path]" << std::endl; + std::cerr << std::endl; + std::cerr << "This program recieve input word sequence from stdin." << std::endl; + std::cerr << "One sentence per line. Words are separated by space." << std::endl; + std::cerr << std::endl; + return 1; + } - void * engine = postagger_create_postagger(argv[1]); - if (!engine) { - std::cerr << "WARNINIG : Failed to load model." << std::endl; - return -1; - } + void * engine = postagger_create_postagger(argv[1]); + if (!engine) { + std::cerr << "WARNINIG : Failed to load model." << std::endl; + return -1; + } - std::string line; - std::string word; - std::vector words; - std::vector postags; + std::string line; + std::string word; + std::vector words; + std::vector postags; - std::cerr << "TRACE: Model is loaded" << std::endl; - double tm = get_time(); + std::cerr << "TRACE: Model is loaded" << std::endl; + double tm = get_time(); - while (std::getline(std::cin, line, '\n')) { - std::stringstream S(line); - words.clear(); - while (S >> word) { words.push_back(word); } + while (std::getline(std::cin, line, '\n')) { + std::stringstream S(line); + words.clear(); + while (S >> word) { words.push_back(word); } - if (words.size() == 0) { continue; } - int len = postagger_postag(engine, words, postags); - if (postags.size() != words.size()) { - std::cerr << "WARNINIG: Number of postags is different from number of words" + if (words.size() == 0) { continue; } + int len = postagger_postag(engine, words, postags); + if (postags.size() != words.size()) { + std::cerr << "WARNINIG: Number of postags is different from number of words" << std::endl; - } + } - for (int i = 0; i < len; ++ i) { - std::cout << words[i] << "_" << postags[i]; - if (i+1 == len) std::cout < vecSentences; - string para; - xml.GetParagraph(i, para); + for (int i = 0; i < paraNum; ++i) { + vector vecSentences; + string para; + xml.GetParagraph(i, para); - if (0 == SplitSentence( para, vecSentences )) { - ERROR_LOG("in LTP::splitsent, failed to split sentence"); - return -1; - } + if (0 == SplitSentence( para, vecSentences )) { + ERROR_LOG("in LTP::splitsent, failed to split sentence"); + return -1; + } - // dummy - // vecSentences.push_back(para); - if (0 != xml.SetSentencesToParagraph(vecSentences, i)) { - ERROR_LOG("in LTP::splitsent, failed to write sentence to xml"); - return -1; - } + // dummy + // vecSentences.push_back(para); + if (0 != xml.SetSentencesToParagraph(vecSentences, i)) { + ERROR_LOG("in LTP::splitsent, failed to write sentence to xml"); + return -1; } + } - xml.SetNote(NOTE_SENT); - return 0; + xml.SetNote(NOTE_SENT); + return 0; } // integrate word segmentor into LTP int LTP::wordseg(XML4NLP & xml) { - if (xml.QueryNote(NOTE_WORD)) { - return 0; - } + if (xml.QueryNote(NOTE_WORD)) { + return 0; + } - // - if (0 != splitSentence_dummy(xml)) { - ERROR_LOG("in LTP::wordseg, failed to perform split sentence preprocess."); - return -1; - } + // + if (0 != splitSentence_dummy(xml)) { + ERROR_LOG("in LTP::wordseg, failed to perform split sentence preprocess."); + return -1; + } - /*if (0 != m_ltpResource.LoadSegmentorResource(m_ltpOption.segmentor_model_path)) { - ERROR_LOG("in LTP::wordseg, failed to load segmentor resource"); - return -1; - }*/ + /*if (0 != m_ltpResource.LoadSegmentorResource(m_ltpOption.segmentor_model_path)) { + ERROR_LOG("in LTP::wordseg, failed to load segmentor resource"); + return -1; + }*/ - // get the segmentor pointer - void * segmentor = m_ltpResource.GetSegmentor(); - if (0 == segmentor) { - ERROR_LOG("in LTP::wordseg, failed to init a segmentor"); - return -1; - } + // get the segmentor pointer + void * segmentor = m_ltpResource.GetSegmentor(); + if (0 == segmentor) { + ERROR_LOG("in LTP::wordseg, failed to init a segmentor"); + return -1; + } - int stnsNum = xml.CountSentenceInDocument(); + int stnsNum = xml.CountSentenceInDocument(); - if (0 == stnsNum) { - ERROR_LOG("in LTP::wordseg, number of sentence equals 0"); - return -1; - } + if (0 == stnsNum) { + ERROR_LOG("in LTP::wordseg, number of sentence equals 0"); + return -1; + } - for (int i = 0; i < stnsNum; ++ i) { - string strStn = xml.GetSentence(i); - vector vctWords; + for (int i = 0; i < stnsNum; ++ i) { + std::string strStn = xml.GetSentence(i); + std::vector vctWords; - if (ltp::strutils::codecs::length(strStn) > MAX_SENTENCE_LEN) { - ERROR_LOG("in LTP::wordseg, input sentence is too long"); - return -1; - } + if (ltp::strutils::codecs::length(strStn) > MAX_SENTENCE_LEN) { + ERROR_LOG("in LTP::wordseg, input sentence is too long"); + return -1; + } - if (0 == segmentor_segment(segmentor, strStn, vctWords)) { - ERROR_LOG("in LTP::wordseg, failed to perform word segment on \"%s\"", - strStn.c_str()); - return -1; - } + if (0 == segmentor_segment(segmentor, strStn, vctWords)) { + ERROR_LOG("in LTP::wordseg, failed to perform word segment on \"%s\"", + strStn.c_str()); + return -1; + } - if (0 != xml.SetWordsToSentence(vctWords, i)) { - ERROR_LOG("in LTP::wordseg, failed to write segment result to xml"); - return -1; - } + if (0 != xml.SetWordsToSentence(vctWords, i)) { + ERROR_LOG("in LTP::wordseg, failed to write segment result to xml"); + return -1; } + } - xml.SetNote(NOTE_WORD); - return 0; + xml.SetNote(NOTE_WORD); + return 0; } // integrate postagger into LTP int LTP::postag(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_POS) ) { - return 0; - } + if ( xml.QueryNote(NOTE_POS) ) { + return 0; + } - // dependency - if (0 != wordseg(xml)) { - ERROR_LOG("in LTP::postag, failed to perform word segment preprocess"); - return -1; - } + // dependency + if (0 != wordseg(xml)) { + ERROR_LOG("in LTP::postag, failed to perform word segment preprocess"); + return -1; + } - /*if (0 != m_ltpResource.LoadPostaggerResource(m_ltpOption.postagger_model_path)) { - ERROR_LOG("in LTP::postag, failed to load postagger resource."); - return -1; - }*/ + /*if (0 != m_ltpResource.LoadPostaggerResource(m_ltpOption.postagger_model_path)) { + ERROR_LOG("in LTP::postag, failed to load postagger resource."); + return -1; + }*/ - void * postagger = m_ltpResource.GetPostagger(); - if (0 == postagger) { - ERROR_LOG("in LTP::postag, failed to init a postagger"); - return -1; - } + void * postagger = m_ltpResource.GetPostagger(); + if (0 == postagger) { + ERROR_LOG("in LTP::postag, failed to init a postagger"); + return -1; + } - int stnsNum = xml.CountSentenceInDocument(); + int stnsNum = xml.CountSentenceInDocument(); - if (0 == stnsNum) { - ERROR_LOG("in LTP::postag, number of sentence equals 0"); - return -1; - } + if (0 == stnsNum) { + ERROR_LOG("in LTP::postag, number of sentence equals 0"); + return -1; + } - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; + for (int i = 0; i < stnsNum; ++i) { + vector vecWord; + vector vecPOS; - xml.GetWordsFromSentence(vecWord, i); + xml.GetWordsFromSentence(vecWord, i); - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return -1; + } - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return -1; + } - if (0 == postagger_postag(postagger, vecWord, vecPOS)) { - ERROR_LOG("in LTP::postag, failed to perform postag on sent. #%d", i+1); - return -1; - } + if (0 == postagger_postag(postagger, vecWord, vecPOS)) { + ERROR_LOG("in LTP::postag, failed to perform postag on sent. #%d", i+1); + return -1; + } - if (xml.SetPOSsToSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::postag, failed to write postag result to xml"); - return -1; - } + if (xml.SetPOSsToSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::postag, failed to write postag result to xml"); + return -1; } + } - xml.SetNote(NOTE_POS); + xml.SetNote(NOTE_POS); - return 0; + return 0; } // perform ner over xml -int LTP::ner(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_NE) ) { - return 0; - } +int LTP::ner(XML4NLP & xml) { + if ( xml.QueryNote(NOTE_NE) ) { + return 0; + } - // dependency - if (0 != postag(xml)) { - ERROR_LOG("in LTP::ner, failed to perform postag preprocess"); - return -1; - } + // dependency + if (0 != postag(xml)) { + ERROR_LOG("in LTP::ner, failed to perform postag preprocess"); + return -1; + } - /*if (0 != m_ltpResource.LoadNEResource(m_ltpOption.ner_model_path)) { - ERROR_LOG("in LTP::ner, failed to load ner resource"); - return -1; - }*/ + /*if (0 != m_ltpResource.LoadNEResource(m_ltpOption.ner_model_path)) { + ERROR_LOG("in LTP::ner, failed to load ner resource"); + return -1; + }*/ - void * ner = m_ltpResource.GetNER(); + void * ner = m_ltpResource.GetNER(); - if (NULL == ner) { - ERROR_LOG("in LTP::ner, failed to init a ner."); - return -1; - } + if (NULL == ner) { + ERROR_LOG("in LTP::ner, failed to init a ner."); + return -1; + } - int stnsNum = xml.CountSentenceInDocument(); + int stnsNum = xml.CountSentenceInDocument(); - if (stnsNum == 0) { - ERROR_LOG("in LTP::ner, number of sentence equals 0"); - return -1; - } - - for (int i = 0; i < stnsNum; ++ i) { - vector vecWord; - vector vecPOS; - vector vecNETag; + if (stnsNum == 0) { + ERROR_LOG("in LTP::ner, number of sentence equals 0"); + return -1; + } - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get words from xml"); - return -1; - } + for (int i = 0; i < stnsNum; ++ i) { + vector vecWord; + vector vecPOS; + vector vecNETag; - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get postags from xml"); - return -1; - } + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get words from xml"); + return -1; + } - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get postags from xml"); + return -1; + } - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return -1; + } - if (0 == ner_recognize(ner, vecWord, vecPOS, vecNETag)) { - ERROR_LOG("in LTP::ner, failed to perform ner on sent. #%d", i+1); - return -1; - } + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return -1; + } - xml.SetNEsToSentence(vecNETag, i); + if (0 == ner_recognize(ner, vecWord, vecPOS, vecNETag)) { + ERROR_LOG("in LTP::ner, failed to perform ner on sent. #%d", i+1); + return -1; } - xml.SetNote(NOTE_NE); - return 0; + xml.SetNEsToSentence(vecNETag, i); + } + + xml.SetNote(NOTE_NE); + return 0; } int LTP::parser(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_PARSER) ) return 0; + if ( xml.QueryNote(NOTE_PARSER) ) return 0; - if (0 != postag(xml)) { - ERROR_LOG("in LTP::parser, failed to perform postag preprocessing"); - return -1; - } + if (0 != postag(xml)) { + ERROR_LOG("in LTP::parser, failed to perform postag preprocessing"); + return -1; + } - /*if ( 0 != m_ltpResource.LoadParserResource(m_ltpOption.parser_model_path) ) { - ERROR_LOG("in LTP::parser, failed to load parser resource"); - return -1; - }*/ + /*if ( 0 != m_ltpResource.LoadParserResource(m_ltpOption.parser_model_path) ) { + ERROR_LOG("in LTP::parser, failed to load parser resource"); + return -1; + }*/ - void * parser = m_ltpResource.GetParser(); + void * parser = m_ltpResource.GetParser(); - if (parser == NULL) { - ERROR_LOG("in LTP::parser, failed to init a parser"); - return -1; - } + if (parser == NULL) { + ERROR_LOG("in LTP::parser, failed to init a parser"); + return -1; + } - int stnsNum = xml.CountSentenceInDocument(); - if (stnsNum == 0) { - ERROR_LOG("in LTP::parser, number of sentences equals 0"); - return -1; - } + int stnsNum = xml.CountSentenceInDocument(); + if (stnsNum == 0) { + ERROR_LOG("in LTP::parser, number of sentences equals 0"); + return -1; + } - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; - vector vecHead; - vector vecRel; + for (int i = 0; i < stnsNum; ++i) { + std::vector vecWord; + std::vector vecPOS; + std::vector vecHead; + std::vector vecRel; - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::parser, failed to get words from xml"); - return -1; - } + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::parser, failed to get words from xml"); + return -1; + } - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::parser, failed to get postags from xml"); - return -1; - } + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::parser, failed to get postags from xml"); + return -1; + } - if (0 == vecWord.size()) { - ERROR_LOG("Input sentence is empty."); - return -1; - } + if (0 == vecWord.size()) { + ERROR_LOG("Input sentence is empty."); + return -1; + } - if (vecWord.size() > MAX_WORDS_NUM) { - ERROR_LOG("Input sentence is too long."); - return -1; - } + if (vecWord.size() > MAX_WORDS_NUM) { + ERROR_LOG("Input sentence is too long."); + return -1; + } - if (-1 == parser_parse(parser, vecWord, vecPOS, vecHead, vecRel)) { - ERROR_LOG("in LTP::parser, failed to perform parse on sent. #%d", i+1); - return -1; - } + if (-1 == parser_parse(parser, vecWord, vecPOS, vecHead, vecRel)) { + ERROR_LOG("in LTP::parser, failed to perform parse on sent. #%d", i+1); + return -1; + } - if (0 != xml.SetParsesToSentence(vecHead, vecRel, i)) { - ERROR_LOG("in LTP::parser, failed to write parse result to xml"); - return -1; - } + if (0 != xml.SetParsesToSentence(vecHead, vecRel, i)) { + ERROR_LOG("in LTP::parser, failed to write parse result to xml"); + return -1; } + } - xml.SetNote(NOTE_PARSER); + xml.SetNote(NOTE_PARSER); - return 0; + return 0; } int LTP::srl(XML4NLP & xml) { - if ( xml.QueryNote(NOTE_SRL) ) return 0; + if ( xml.QueryNote(NOTE_SRL) ) return 0; - // dependency - if (0 != ner(xml)) { - ERROR_LOG("in LTP::srl, failed to perform ner preprocess"); - return -1; + // dependency + if (0 != ner(xml)) { + ERROR_LOG("in LTP::srl, failed to perform ner preprocess"); + return -1; + } + + if (0 != parser(xml)) { + ERROR_LOG("in LTP::srl, failed to perform parsing preprocess"); + return -1; + } + + /*if ( 0 != m_ltpResource.LoadSRLResource(m_ltpOption.srl_data_dir) ) { + ERROR_LOG("in LTP::srl, failed to load srl resource"); + return -1; + }*/ + + int stnsNum = xml.CountSentenceInDocument(); + if (stnsNum == 0) { + ERROR_LOG("in LTP::srl, number of sentence equals 0"); + return -1; + } + + for (int i = 0; i < stnsNum; ++i) { + vector vecWord; + vector vecPOS; + vector vecNE; + vector< pair > vecParse; + vector< pair > > > > vecSRLResult; + + if (xml.GetWordsFromSentence(vecWord, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get words from xml"); + return -1; } - if (0 != parser(xml)) { - ERROR_LOG("in LTP::srl, failed to perform parsing preprocess"); - return -1; + if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get postags from xml"); + return -1; } - /*if ( 0 != m_ltpResource.LoadSRLResource(m_ltpOption.srl_data_dir) ) { - ERROR_LOG("in LTP::srl, failed to load srl resource"); - return -1; - }*/ + if (xml.GetNEsFromSentence(vecNE, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get ner result from xml"); + return -1; + } - int stnsNum = xml.CountSentenceInDocument(); - if (stnsNum == 0) { - ERROR_LOG("in LTP::srl, number of sentence equals 0"); + if (xml.GetParsesFromSentence(vecParse, i) != 0) { + ERROR_LOG("in LTP::ner, failed to get parsing result from xml"); + return -1; + } + + if (0 != SRL(vecWord, vecPOS, vecNE, vecParse, vecSRLResult)) { + ERROR_LOG("in LTP::srl, failed to perform srl on sent. #%d", i+1); + return -1; + } + + int j = 0; + for (; j < vecSRLResult.size(); ++j) { + vector vecType; + vector< pair > vecBegEnd; + int k = 0; + + for (; k < vecSRLResult[j].second.size(); ++k) { + vecType.push_back(vecSRLResult[j].second[k].first); + vecBegEnd.push_back(vecSRLResult[j].second[k].second); + } + + if (0 != xml.SetPredArgToWord(i, vecSRLResult[j].first, vecType, vecBegEnd)) { return -1; + } } + } - for (int i = 0; i < stnsNum; ++i) { - vector vecWord; - vector vecPOS; - vector vecNE; - vector< pair > vecParse; - vector< pair< int, vector< pair > > > > vecSRLResult; - - if (xml.GetWordsFromSentence(vecWord, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get words from xml"); - return -1; - } - - if (xml.GetPOSsFromSentence(vecPOS, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get postags from xml"); - return -1; - } - - if (xml.GetNEsFromSentence(vecNE, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get ner result from xml"); - return -1; - } - - if (xml.GetParsesFromSentence(vecParse, i) != 0) { - ERROR_LOG("in LTP::ner, failed to get parsing result from xml"); - return -1; - } - - if (0 != SRL(vecWord, vecPOS, vecNE, vecParse, vecSRLResult)) { - ERROR_LOG("in LTP::srl, failed to perform srl on sent. #%d", i+1); - return -1; - } - - int j = 0; - for (; j < vecSRLResult.size(); ++j) { - vector vecType; - vector< pair > vecBegEnd; - int k = 0; - - for (; k < vecSRLResult[j].second.size(); ++k) { - vecType.push_back(vecSRLResult[j].second[k].first); - vecBegEnd.push_back(vecSRLResult[j].second[k].second); - } - - if (0 != xml.SetPredArgToWord(i, vecSRLResult[j].first, vecType, vecBegEnd)) { - return -1; - } - } - } - - xml.SetNote(NOTE_SRL); - return 0; + xml.SetNote(NOTE_SRL); + return 0; } diff --git a/src/__ltp_dll/Ltp.h b/src/__ltp_dll/Ltp.h index cb77d55d5..1dc387fc6 100644 --- a/src/__ltp_dll/Ltp.h +++ b/src/__ltp_dll/Ltp.h @@ -16,103 +16,104 @@ using namespace std; // extern ofstream ltp_log_file; #define MAX_SENTENCE_LEN 300 -#define MAX_WORDS_NUM 70 +#define MAX_WORDS_NUM 70 class LTP { public: - /* - * the constructor with config filepath specified to `conf/ltp.cnf` - */ - LTP(); - - /* - * the another constructor with user specified config file - * - * @param[in] cfg_file the path to the config file - */ - LTP(const char * cfg_file); - - /* - * deallocate the ltp resource - */ - ~LTP(); - - /* - * return true on the resource successful loaded, otherwise false - */ - bool loaded(); - - // discard - // int CreateDOMFromTxt(const char * cszTxtFileName, XML4NLP& m_xml4nlp); - - // discard - // int CreateDOMFromXml(const char * cszXmlFileName, XML4NLP& m_xml4nlp); - - // save dom tree - // int SaveDOM(const char *cszSaveFileName, XML4NLP& m_xml4nlp); - - /* - * do word segmentation. - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int wordseg(XML4NLP & xml); - - /* - * do postagging - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int postag(XML4NLP & xml); - - /* - * do name entities recognization - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int ner(XML4NLP & xml); - - /* - * do dependency parsing - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int parser(XML4NLP & xml); - - /* - * do semantic role labeling - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int srl(XML4NLP & xml); - + /* + * the constructor with config filepath specified to `conf/ltp.cnf` + */ + LTP(); + + /* + * the another constructor with user specified config file + * + * @param[in] cfg_file the path to the config file + */ + LTP(const char * cfg_file); + + /* + * deallocate the ltp resource + */ + ~LTP(); + + /* + * return true on the resource successful loaded, otherwise false + */ + bool loaded(); + + // discard + // int CreateDOMFromTxt(const char * cszTxtFileName, XML4NLP& m_xml4nlp); + + // discard + // int CreateDOMFromXml(const char * cszXmlFileName, XML4NLP& m_xml4nlp); + + // save dom tree + // int SaveDOM(const char *cszSaveFileName, XML4NLP& m_xml4nlp); + + /* + * do word segmentation. + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int wordseg(XML4NLP & xml); + + /* + * do postagging + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int postag(XML4NLP & xml); + + /* + * do name entities recognization + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int ner(XML4NLP & xml); + + /* + * do dependency parsing + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int parser(XML4NLP & xml); + + /* + * do semantic role labeling + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + int srl(XML4NLP & xml); + + int splitSentence_dummy(XML4NLP & xml); private: - /* - * split the sentence - * - * @param[in/out] xml the xml storing ltp result - * @return int 0 on success, otherwise -1 - */ - int splitSentence_dummy(XML4NLP & xml); - - /* - * parse the config file, and load resource according the config - * - * @param[in] confFileName the config file - * @return int 0 on success, otherwise -1 - */ - int ReadConfFile(const char *confFileName = "conf/ltp.cnf"); + /* + * split the sentence + * + * @param[in/out] xml the xml storing ltp result + * @return int 0 on success, otherwise -1 + */ + //int splitSentence_dummy(XML4NLP & xml); + + /* + * parse the config file, and load resource according the config + * + * @param[in] confFileName the config file + * @return int 0 on success, otherwise -1 + */ + int ReadConfFile(const char *confFileName = "conf/ltp.cnf"); private: - LTPResource m_ltpResource; /*< the ltp resources */ - bool m_loaded; /*< use to sepcify if the resource is loaded */ + LTPResource m_ltpResource; /*< the ltp resources */ + bool m_loaded; /*< use to sepcify if the resource is loaded */ }; #endif // end for __LTP_H__ diff --git a/src/__xml4nlp/Xml4nlp.cpp b/src/__xml4nlp/Xml4nlp.cpp index 5584cfa65..498fce10c 100644 --- a/src/__xml4nlp/Xml4nlp.cpp +++ b/src/__xml4nlp/Xml4nlp.cpp @@ -11,13 +11,13 @@ #include "Xml4nlp.h" #include "MyLib.h" -const char * const NOTE_SENT = "sent"; -const char * const NOTE_WORD = "word"; -const char * const NOTE_POS = "pos"; -const char * const NOTE_NE = "ne"; -const char * const NOTE_PARSER = "parser"; -const char * const NOTE_WSD = "wsd"; -const char * const NOTE_SRL = "srl"; +const char * const NOTE_SENT = "sent"; +const char * const NOTE_WORD = "word"; +const char * const NOTE_POS = "pos"; +const char * const NOTE_NE = "ne"; +const char * const NOTE_PARSER = "parser"; +const char * const NOTE_WSD = "wsd"; +const char * const NOTE_SRL = "srl"; //const char * const NOTE_CLASS = "class"; //const char * const NOTE_SUM = "sum"; //const char * const NOTE_CR = "cr"; @@ -46,15 +46,15 @@ const char * const XML4NLP::TAG_END = "end"; const char * const XML4NLP::TAG_ID = "id"; XML4NLP::XML4NLP() { - m_document.documentPtr = NULL; - m_note.nodePtr = NULL; - m_summary.nodePtr = NULL; - m_textclass.nodePtr = NULL; - m_coref.nodePtr = NULL; + document.documentPtr = NULL; + note.nodePtr = NULL; + summary.nodePtr = NULL; + textclass.nodePtr = NULL; + coref.nodePtr = NULL; } XML4NLP::~XML4NLP() { - m_tiXmlDoc.Clear(); + m_tiXmlDoc.Clear(); } ///////////////////////////////////////////////////////////////////////////////////// @@ -62,29 +62,29 @@ XML4NLP::~XML4NLP() { /// the paragraphs are separated by CR ("\r\n") ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::CreateDOMFromFile(const char* fileName) { - ClearDOM(); + ClearDOM(); - if (0 != BuildDOMFrame()) return -1; + if (0 != BuildDOMFrame()) return -1; - ifstream in; - in.open(fileName); - if ( !in.is_open() ) { - cerr << "xml4nlp load file error: " << fileName << endl; - return -1; - } - - string line; - int i = 0; - while (getline(in, line)) { - clean_str(line); // Zhenghua Li, 2007-8-31, 15:57 - // remove_space_gbk(line); - if (line.empty()) { - continue; - } + ifstream in; + in.open(fileName); + if ( !in.is_open() ) { + cerr << "xml4nlp load file error: " << fileName << endl; + return -1; + } - if (0 != BuildParagraph(line, i++)) return -1; + string line; + int i = 0; + while (getline(in, line)) { + clean_str(line); // Zhenghua Li, 2007-8-31, 15:57 + // remove_space_gbk(line); + if (line.empty()) { + continue; } - return 0; + + if (0 != BuildParagraph(line, i++)) return -1; + } + return 0; } ///////////////////////////////////////////////////////////////////////////////////// @@ -92,39 +92,38 @@ int XML4NLP::CreateDOMFromFile(const char* fileName) { /// the paragraphs are separated by CR ("\r\n") ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::CreateDOMFromString(const string & str) { - ClearDOM(); + ClearDOM(); - if (0 != BuildDOMFrame()) return -1; + if (0 != BuildDOMFrame()) return -1; - string strTmp = str; - replace_char_by_char(strTmp, '\r', '\n'); + string strTmp = str; + replace_char_by_char(strTmp, '\r', '\n'); - // std::cout << strTmp << std::endl; - istringstream in(strTmp); // How to use istringstream? - string line; - int i = 0; - while (getline(in, strTmp)) { - clean_str(strTmp); + // std::cout << strTmp << std::endl; + istringstream in(strTmp); // How to use istringstream? + int i = 0; + while (getline(in, strTmp)) { + clean_str(strTmp); - if (strTmp.empty()) { - continue; - } + if (strTmp.empty()) { + continue; + } - if (0 != BuildParagraph(strTmp, i++)) { - return -1; - } + if (0 != BuildParagraph(strTmp, i++)) { + return -1; } + } - return 0; + return 0; } void XML4NLP::ReportTiXmlDocErr() const { - cerr << "[XML4NLP ERROR REPORT]" << endl; - cerr << "description : " << m_tiXmlDoc.ErrorDesc() << endl; - cerr << "location : " << endl; - cerr << "row : " << m_tiXmlDoc.ErrorRow() << endl; - cerr << "col : " << m_tiXmlDoc.ErrorCol() << endl; - cerr << "=====================" << endl; + cerr << "[XML4NLP ERROR REPORT]" << endl; + cerr << "description : " << m_tiXmlDoc.ErrorDesc() << endl; + cerr << "location : " << endl; + cerr << "row : " << m_tiXmlDoc.ErrorRow() << endl; + cerr << "col : " << m_tiXmlDoc.ErrorCol() << endl; + cerr << "=====================" << endl; } ///////////////////////////////////////////////////////////////////////////////////// @@ -135,1468 +134,1102 @@ void XML4NLP::ReportTiXmlDocErr() const { /// note: the input file must be a XML file. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::LoadXMLFromFile(const char* fileName) { - ClearDOM(); + ClearDOM(); - if ( !m_tiXmlDoc.LoadFile(fileName) ) { - cerr << "load xml file error: " << fileName << endl; - ReportTiXmlDocErr(); - return -1; - } + if ( !m_tiXmlDoc.LoadFile(fileName) ) { + cerr << "load xml file error: " << fileName << endl; + ReportTiXmlDocErr(); + return -1; + } - return InitXmlStructure(); + return InitXmlStructure(); } ///////////////////////////////////////////////////////////////////////////////////// /// load a xml file from a string and parse it. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::LoadXMLFromString(const char * str) { - ClearDOM(); - m_tiXmlDoc.Parse(str); + ClearDOM(); + m_tiXmlDoc.Parse(str); - if (m_tiXmlDoc.Error()) { - ReportTiXmlDocErr(); - return -1; - } + if (m_tiXmlDoc.Error()) { + ReportTiXmlDocErr(); + return -1; + } - if (-1 == InitXmlStructure()) { - return -1; - } + if (-1 == InitXmlStructure()) { + return -1; + } - if (!LTMLValidation()) { - // failed LTML Validation - return -1; - } + if (!LTMLValidation()) { + // failed LTML Validation + return -1; + } - return 0; + return 0; } int XML4NLP::LoadXMLFromString(const std::string & str) { - return LoadXMLFromString(str.c_str()); + return LoadXMLFromString(str.c_str()); } ///////////////////////////////////////////////////////////////////////////////////// /// clear the DOM tree, delete all nodes that allocated before. ///////////////////////////////////////////////////////////////////////////////////// void XML4NLP::ClearDOM() { - m_tiXmlDoc.Clear(); + m_tiXmlDoc.Clear(); - m_document.documentPtr = NULL; - m_document.paragraphs.clear(); - m_note.nodePtr = NULL; - m_summary.nodePtr = NULL; - m_textclass.nodePtr = NULL; - m_coref.nodePtr = NULL; - m_coref.vecEntity.clear(); + document.documentPtr = NULL; + document.paragraphs.clear(); + note.nodePtr = NULL; + summary.nodePtr = NULL; + textclass.nodePtr = NULL; + coref.nodePtr = NULL; + coref.vecEntity.clear(); - m_vecBegWordIdxOfStns.clear(); - m_vecBegStnsIdxOfPara.clear(); + m_vecBegWordIdxOfStns.clear(); + m_vecBegStnsIdxOfPara.clear(); } ///////////////////////////////////////////////////////////////////////////////////// /// save the DOM tree to a XML file. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::SaveDOM(const char* fileName) { - if (!m_tiXmlDoc.SaveFile(fileName)) { - ReportTiXmlDocErr(); - return -1; - } + if (!m_tiXmlDoc.SaveFile(fileName)) { + ReportTiXmlDocErr(); + return -1; + } - return 0; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// save the DOM tree to a XML string. ///////////////////////////////////////////////////////////////////////////////////// void XML4NLP::SaveDOM(string &strDocument) const { - TiXmlPrinter printer; - m_tiXmlDoc.Accept(&printer); - strDocument = printer.CStr(); + TiXmlPrinter printer; + m_tiXmlDoc.Accept(&printer); + strDocument = printer.CStr(); } // ----------------------------------------------------------------some counting functions int XML4NLP::CountParagraphInDocument() const { - return m_document.paragraphs.size(); + return document.paragraphs.size(); } -int XML4NLP::CountSentenceInParagraph(int paragraphIdx) const { - if ( 0 != CheckRange(paragraphIdx) ) return 0; - return m_document.paragraphs[paragraphIdx].sentences.size(); +int XML4NLP::CountSentenceInParagraph(int pid) const { + if ( 0 != CheckRange(pid) ) return 0; + return document.paragraphs[pid].sentences.size(); } int XML4NLP::CountSentenceInDocument() const { - int stnsNumInDoc = 0; - int paragraphNum = m_document.paragraphs.size(); - for (int i = 0; i < paragraphNum; ++i) { - stnsNumInDoc += m_document.paragraphs[i].sentences.size(); - } - return stnsNumInDoc; + int stnsNumInDoc = 0; + int paragraphNum = document.paragraphs.size(); + for (int i = 0; i < paragraphNum; ++i) { + stnsNumInDoc += document.paragraphs[i].sentences.size(); + } + return stnsNumInDoc; } -int XML4NLP::CountWordInSentence(int paragraphIdx, int sentenceIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx) ) return 0; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words.size(); +int XML4NLP::CountWordInSentence(int pid, int sid) const { + if ( 0 != CheckRange(pid, sid) ) return 0; + return document.paragraphs[pid].sentences[sid].words.size(); } -int XML4NLP::CountWordInSentence(int sentenceIdx) const { - pair paraIdx_sentIdx; - if ( 0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx) ) return 0; - return m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words.size(); +int XML4NLP::CountWordInSentence(int global_sid) const { + int pid, sid; + if ( 0 != DecodeGlobalId(global_sid, pid, sid) ) return 0; + return document.paragraphs[pid].sentences[sid].words.size(); } -int XML4NLP::CountWordInParagraph(int paragraphIdx) const { - if ( 0 != CheckRange(paragraphIdx) ) return -1; - int totalWordNum = 0; - int sentNum = m_document.paragraphs[paragraphIdx].sentences.size(); - for (int i=0; i < sentNum; ++i) { - totalWordNum += m_document.paragraphs[paragraphIdx].sentences[i].words.size(); - } - return totalWordNum; +int XML4NLP::CountWordInParagraph(int pid) const { + if ( 0 != CheckRange(pid) ) return -1; + int nr_words = 0; + int nr_sents = document.paragraphs[pid].sentences.size(); + + for (int i = 0; i < nr_sents; ++ i) { + nr_words += document.paragraphs[pid].sentences[i].words.size(); + } + return nr_words; } int XML4NLP::CountWordInDocument() const { - int totalWordNum = 0; - int paraNum = m_document.paragraphs.size(); - for (int i=0; iGetText(); + TiXmlElement *paraPtr = document.paragraphs[pid].paragraphPtr; + return paraPtr->GetText(); } -int XML4NLP::GetParagraph(int paragraphIdx, string &strParagraph) const { - if (0 != CheckRange(paragraphIdx)) { - return -1; - } +int XML4NLP::GetParagraph(int pid, string & str) const { + if (0 != CheckRange(pid)) { return -1; } - const Paragraph_t ¶graph = m_document.paragraphs[paragraphIdx]; + const Paragraph ¶graph = document.paragraphs[pid]; - if (paragraph.sentences.empty()) { - strParagraph = paragraph.paragraphPtr->GetText() ; - } else { - strParagraph = ""; - const vector &sentences = paragraph.sentences; - for (int i=0; iAttribute(TAG_CONT); - } + if (paragraph.sentences.empty()) { + str = paragraph.paragraphPtr->GetText() ; + } else { + str = ""; + const vector &sentences = paragraph.sentences; + for (int i=0; iAttribute(TAG_CONT); } + } - return 0; -} - -const char* XML4NLP::GetSentence(int paragraphIdx, int sentenceIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].sentencePtr->Attribute(TAG_CONT); -} - -const char* XML4NLP::GetSentence(int sentenceIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return NULL; - return GetSentence(paraIdx_sentIdx.first, paraIdx_sentIdx.second); + return 0; } -const char* XML4NLP::GetWord(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx) ) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_CONT); -} +#define EXTEND_FUNCTION(return_type, function_name) \ + return_type function_name (int global_sid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return NULL; } \ + return (function_name)(pid, sid); \ + } -const char* XML4NLP::GetWord(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetWord(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); +const char* XML4NLP::GetSentence(int pid, int sid) const { + if (0 != CheckRange(pid, sid)) return NULL; + return document.paragraphs[pid].sentences[sid].sentencePtr->Attribute(TAG_CONT); } -const char* XML4NLP::GetWord(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetWord(paraIdx, sentIdx, wordIdx); -} +EXTEND_FUNCTION(const char *, XML4NLP::GetSentence) -const char *XML4NLP::GetPOS(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_POS); +#define EXTEND_FUNCTION2(return_type, function_name, tag_name, failed_return) \ +return_type function_name (int pid, int sid, int wid) const { \ + if (0 != CheckRange(pid, sid, wid)) { return failed_return; } \ + return document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(tag_name); \ +} \ +\ +return_type function_name (int global_sid, int wid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return failed_return; } \ + return function_name (pid, sid, wid); \ +} \ +\ +return_type function_name (int global_wid) const { \ + int pid, sid, wid; \ + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) { return failed_return; } \ + return function_name (pid, sid, wid); \ } -const char *XML4NLP::GetPOS(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetPOS(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); -} - -const char *XML4NLP::GetPOS(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetPOS(paraIdx, sentIdx, wordIdx); -} - -const char *XML4NLP::GetNE(int paragraphIdx, int sentenceIdx, int wordIdx) const { - if ( 0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx) ) return NULL; - return m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_NE); -} - -const char *XML4NLP::GetNE(int globalSentIdx, int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return NULL; - return GetNE(paraIdx_sentIdx.first, paraIdx_sentIdx.second, wordIdx); -} - -const char *XML4NLP::GetNE(int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return NULL; - return GetNE(paraIdx, sentIdx, wordIdx); -} - - -int XML4NLP::GetWSD(pair &WSD_explanation, - int paragraphIdx, - int sentenceIdx, - int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; +EXTEND_FUNCTION2 (const char *, XML4NLP::GetWord, TAG_CONT, NULL) +EXTEND_FUNCTION2 (const char *, XML4NLP::GetPOS, TAG_POS, NULL) +EXTEND_FUNCTION2 (const char *, XML4NLP::GetNE, TAG_NE, NULL) - WSD_explanation.first = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_WSD); - WSD_explanation.second = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_WSD_EXP); +int XML4NLP::GetWSD(WSDResult & explanation, int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; + explanation.first = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_WSD); + explanation.second = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_WSD_EXP); return 0; } -int XML4NLP::GetWSD(pair & WSD_explanation, - int globalSentIdx, - int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - return GetWSD(WSD_explanation, paraIdx_sentIdx.first, paraIdx_sentIdx.second); -} - -int XML4NLP::GetWSD(pair & WSD_explanation, - int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - return GetWSD(WSD_explanation, paraIdx, sentIdx, wordIdx); -} - -int XML4NLP::GetParse(pair & parent_relation, - int paragraphIdx, - int sentenceIdx, - int wordIdx) const { - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - const char *cszParent = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_PSR_PARENT); - parent_relation.first = (cszParent == NULL ? 0 : atoi(cszParent)); - parent_relation.second = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr->Attribute(TAG_PSR_RELATE); - return 0; -} - -int XML4NLP::GetParse(pair & parent_relation, - int globalSentIdx, - int wordIdx) const { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - return GetParse(parent_relation, paraIdx_sentIdx.first, paraIdx_sentIdx.second); -} - -int XML4NLP::GetParse(pair &parent_relation, - int globalWordIdx) const { - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - return GetParse(parent_relation, paraIdx, sentIdx, wordIdx); -} - -int XML4NLP::MapGlobalSentIdx2paraIdx_sentIdx(int sentenceIdx, - pair & paraIdx_sentIdx) const { - int startStnsIdxOfPara = 0; - for (int paraIdx=0; paraIdx < m_document.paragraphs.size(); ++paraIdx) { - if (startStnsIdxOfPara + m_document.paragraphs[paraIdx].sentences.size() > sentenceIdx) { - paraIdx_sentIdx.first = paraIdx; - paraIdx_sentIdx.second = sentenceIdx - startStnsIdxOfPara; - return 0; - } - startStnsIdxOfPara += m_document.paragraphs[paraIdx].sentences.size(); +int XML4NLP::GetParse(ParseResult & relation, int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; + const char * head = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_PSR_PARENT); + relation.first = (head == NULL ? 0 : atoi(head)); + relation.second = document.paragraphs[pid].sentences[sid].words[wid].wordPtr->Attribute(TAG_PSR_RELATE); + return 0; +} + +#define EXTEND_FUNCTION3(return_type, function_name, output_type, failed_return) \ + return_type function_name (output_type & output, int global_sid, int wid) const { \ + int pid, sid; \ + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return failed_return; } \ + return function_name(output, pid, sid, wid); \ + }\ +\ + return_type function_name (output_type & output, int global_wid) const { \ + int pid, wid, sid; \ + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) { return failed_return; } \ + return function_name(output, pid, sid, wid); \ + } + +EXTEND_FUNCTION3 (int, XML4NLP::GetWSD, WSDResult, -1) +EXTEND_FUNCTION3 (int, XML4NLP::GetParse, ParseResult, -1) + +int XML4NLP::DecodeGlobalId(int global_sid, int & pid, int & sid) const { + int startStnsIdxOfPara = 0; + for (pid = 0; pid < document.paragraphs.size(); ++ pid) { + int len = document.paragraphs[pid].sentences.size(); + if (startStnsIdxOfPara + len > global_sid) { + sid = global_sid - startStnsIdxOfPara; + return 0; + } + startStnsIdxOfPara += len; + } + return -1; +} + +int XML4NLP::DecodeGlobalId(int global_wid, int & pid, int & sid, int & wid) const { + int startWordIdxOfStns = 0; + for (pid = 0; pid < document.paragraphs.size(); ++ pid) { + const vector &sentences = document.paragraphs[pid].sentences; + for (sid = 0; sid < sentences.size(); ++ sid) { + if (startWordIdxOfStns + sentences[sid].words.size() > global_wid) { + wid = global_wid - startWordIdxOfStns; + return 0; + } + startWordIdxOfStns += sentences[sid].words.size(); } - return -1; + } + return -1; } -int XML4NLP::MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(int globalWordIdx, - int & paraIdx, - int & sentIdx, - int & wordIdx) const { - int startWordIdxOfStns = 0; - for (paraIdx=0; paraIdx < m_document.paragraphs.size(); ++paraIdx) { - const vector &sentences = m_document.paragraphs[paraIdx].sentences; - for (sentIdx=0; sentIdx < sentences.size(); ++sentIdx) { - if (startWordIdxOfStns + sentences[sentIdx].words.size() > globalWordIdx) { - wordIdx = globalWordIdx - startWordIdxOfStns; - return 0; - } - startWordIdxOfStns += sentences[sentIdx].words.size(); - } - } +int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, + int paragraphIdx) const { + if (0 != CheckRange(paragraphIdx)) return -1; + if (document.paragraphs[paragraphIdx].sentences.empty()) { return -1; -} - -int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, - int paragraphIdx) const { - if (0 != CheckRange(paragraphIdx)) return -1; - if (m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + } - const vector & sentences = m_document.paragraphs[paragraphIdx].sentences; - if (vecSentence.size() != sentences.size()) { - return -1; - } + const vector & sentences = document.paragraphs[paragraphIdx].sentences; + if (vecSentence.size() != sentences.size()) { + return -1; + } - for (int i=0; i < sentences.size(); ++i) { - vecSentence[i] = sentences[i].sentencePtr->Attribute(TAG_CONT); - } + for (int i=0; i < sentences.size(); ++i) { + vecSentence[i] = sentences[i].sentencePtr->Attribute(TAG_CONT); + } - return 0; + return 0; } -int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, - int paragraphIdx) const { - if (0 != CheckRange(paragraphIdx)) return -1; +int XML4NLP::GetSentencesFromParagraph(vector &vecSentence, + int paragraphIdx) const { + if (0 != CheckRange(paragraphIdx)) return -1; - if (m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + if (document.paragraphs[paragraphIdx].sentences.empty()) { + return -1; + } - vecSentence.clear(); - const vector &sentences = m_document.paragraphs[paragraphIdx].sentences; - for (int i = 0; i < sentences.size(); ++ i) { - vecSentence.push_back( sentences[i].sentencePtr->Attribute(TAG_CONT) ); - } - return 0; + vecSentence.clear(); + const vector &sentences = document.paragraphs[paragraphIdx].sentences; + for (int i = 0; i < sentences.size(); ++ i) { + vecSentence.push_back( sentences[i].sentencePtr->Attribute(TAG_CONT) ); + } + return 0; } int XML4NLP::SetSentencesToParagraph(const vector &vecSentence, int paragraphIdx) { - if (0 != CheckRange(paragraphIdx)) { - return -1; - } + if (0 != CheckRange(paragraphIdx)) { + return -1; + } - if (!m_document.paragraphs[paragraphIdx].sentences.empty()) { - return -1; - } + if (!document.paragraphs[paragraphIdx].sentences.empty()) { + return -1; + } - Paragraph_t & paragraph = m_document.paragraphs[paragraphIdx]; - TiXmlElement * paragraphPtr = paragraph.paragraphPtr; - vector &sentences = paragraph.sentences; + Paragraph & paragraph = document.paragraphs[paragraphIdx]; + TiXmlElement * paragraphPtr = paragraph.paragraphPtr; + vector &sentences = paragraph.sentences; - TiXmlText *textPtr = paragraphPtr->FirstChild()->ToText(); - if (textPtr == NULL) { - return -1; - } else { - paragraphPtr->RemoveChild(textPtr); - } + TiXmlText *textPtr = paragraphPtr->FirstChild()->ToText(); + if (textPtr == NULL) { + return -1; + } else { + paragraphPtr->RemoveChild(textPtr); + } + + for (int i = 0; i < vecSentence.size(); ++i) { + TiXmlElement *sentencePtr = new TiXmlElement(TAG_SENT); + sentencePtr->SetAttribute(TAG_ID, static_cast(i)); + sentencePtr->SetAttribute(TAG_CONT, vecSentence[i].c_str()); + paragraphPtr->LinkEndChild(sentencePtr); + + sentences.push_back( Sentence() ); + sentences[sentences.size()-1].sentencePtr = sentencePtr; + } + + return 0; +} + +#define EXTEND_FUNCTION4(return_type, function_name, tag_name) \ + return_type function_name (std::vector & output, int pid, int sid) const { \ + return GetInfoFromSentence(output, pid, sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int pid, int sid) const { \ + return GetInfoFromSentence(output, pid, sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int global_sid) const { \ + return GetInfoFromSentence(output, global_sid, tag_name); \ + } \ +\ + return_type function_name (std::vector & output, int global_sid) const { \ + return GetInfoFromSentence(output, global_sid, tag_name); \ + } + +EXTEND_FUNCTION4 (int, XML4NLP::GetWordsFromSentence, TAG_CONT); +EXTEND_FUNCTION4 (int, XML4NLP::GetPOSsFromSentence, TAG_POS); +EXTEND_FUNCTION4 (int, XML4NLP::GetNEsFromSentence, TAG_NE); + +int XML4NLP::SetWordsToSentence(const std::vector & input, + int pid, + int sid) { + if (0 != CheckRange(pid, sid)) return -1; + + Sentence &sentence = document.paragraphs[pid].sentences[sid]; + if (!sentence.words.empty()) { + return -1; + } - for (int i = 0; i < vecSentence.size(); ++i) { - TiXmlElement *sentencePtr = new TiXmlElement(TAG_SENT); - sentencePtr->SetAttribute(TAG_ID, static_cast(i)); - sentencePtr->SetAttribute(TAG_CONT, vecSentence[i].c_str()); - paragraphPtr->LinkEndChild(sentencePtr); + for (int i = 0; i < input.size(); ++ i) { + TiXmlElement *wordPtr = new TiXmlElement(TAG_WORD); + wordPtr->SetAttribute(TAG_ID, i); + wordPtr->SetAttribute(TAG_CONT, input[i].c_str()); + sentence.sentencePtr->LinkEndChild(wordPtr); - sentences.push_back( Sentence_t() ); - sentences[sentences.size()-1].sentencePtr = sentencePtr; - } - - return 0; + sentence.words.push_back( Word() ); + sentence.words[sentence.words.size() - 1].wordPtr = wordPtr; + } + return 0; } -int XML4NLP::GetWordsFromSentence(vector & vecWord, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecWord, paragraphIdx, sentenceIdx, TAG_CONT); +int XML4NLP::SetWordsToSentence(const std::vector & input, + int global_sid) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) { return -1; } + SetWordsToSentence(input, pid, sid); + return 0; } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecWord, paragraphIdx, sentenceIdx, TAG_CONT); +int XML4NLP::SetPOSsToSentence(const std::vector & input, + int pid, int sid) { + return SetInfoToSentence(input, pid, sid, TAG_POS); } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int globalSentIdx) const { - return GetInfoFromSentence(vecWord, globalSentIdx, TAG_CONT); +int XML4NLP::SetPOSsToSentence(const std::vector & input, + int global_sid) { + return SetInfoToSentence(input, global_sid, TAG_POS); } -int XML4NLP::GetWordsFromSentence(std::vector & vecWord, - int globalSentIdx) const { - return GetInfoFromSentence(vecWord, globalSentIdx, TAG_CONT); +int XML4NLP::SetNEsToSentence(const std::vector & input, + int pid, int sid) { + return SetInfoToSentence(input, pid, sid, TAG_NE); } -int XML4NLP::SetWordsToSentence(const vector & vecWord, - int paragraphIdx, - int sentenceIdx) { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - Sentence_t &sentence = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx]; - if (!sentence.words.empty()) { - return -1; - } - - for (int i = 0; i < vecWord.size(); ++ i) { - TiXmlElement *wordPtr = new TiXmlElement(TAG_WORD); - wordPtr->SetAttribute(TAG_ID, i); - wordPtr->SetAttribute(TAG_CONT, vecWord[i].c_str()); - sentence.sentencePtr->LinkEndChild(wordPtr); - - sentence.words.push_back( Word_t() ); - sentence.words[sentence.words.size() - 1].wordPtr = wordPtr; - } - return 0; +int XML4NLP::SetNEsToSentence(const std::vector & input, + int global_sid) { + return SetInfoToSentence(input, global_sid, TAG_NE); } -int XML4NLP::SetWordsToSentence(const vector & vecWord, - int sentenceIdx) { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) { - return -1; - } - SetWordsToSentence(vecWord, paraIdx_sentIdx.first, paraIdx_sentIdx.second); - return 0; -} +int XML4NLP::GetParsesFromSentence(std::vector< ParseResult > &relation, + int pid, int sid) const { + std::vector heads; + std::vector deprels; -int XML4NLP::GetPOSsFromSentence(std::vector & vecPOS, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); -} + int nr_words = CountWordInSentence(pid, sid); + relation.resize(nr_words); -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int globalSentIdx) const { - return GetInfoFromSentence(vecPOS, globalSentIdx, TAG_POS); -} + if (0 != GetInfoFromSentence(heads, pid, sid, TAG_PSR_PARENT)) { + return -1; + } -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); -} + if (0 != GetInfoFromSentence(deprels, pid, sid, TAG_PSR_RELATE)) { + return -1; + } -int XML4NLP::GetPOSsFromSentence(std::vector &vecPOS, - int globalSentIdx) const { - return GetInfoFromSentence(vecPOS, globalSentIdx, TAG_POS); -} + for (int i = 0; i < nr_words; ++ i) { + relation[i].first = atoi( heads[i] ); + relation[i].second = deprels[i]; + } -int XML4NLP::SetPOSsToSentence(const std::vector &vecPOS, - int paragraphIdx, - int sentenceIdx) { - return SetInfoToSentence(vecPOS, paragraphIdx, sentenceIdx, TAG_POS); + return 0; } -int XML4NLP::SetPOSsToSentence(const std::vector & vecPOS, - int sentenceIdx) { - return SetInfoToSentence(vecPOS, sentenceIdx, TAG_POS); -} +int XML4NLP::GetParsesFromSentence(std::vector< ParseResult > & relation, + int global_sid) const { + std::vector heads; + std::vector deprels; -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + int nr_words = CountWordInSentence(global_sid); + relation.resize(nr_words); -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int globalSentIdx) const { - return GetInfoFromSentence(vecNE, globalSentIdx, TAG_NE); -} + heads.resize(nr_words); + deprels.resize(nr_words); -int XML4NLP::GetNEsFromSentence(std::vector &vecNE, - int paragraphIdx, - int sentenceIdx) const { - return GetInfoFromSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + if (0 != GetInfoFromSentence(heads, global_sid, TAG_PSR_PARENT)) { + return -1; + } -int XML4NLP::GetNEsFromSentence(std::vector & vecNE, - int globalSentIdx) const { - return GetInfoFromSentence(vecNE, globalSentIdx, TAG_NE); -} + if (0 != GetInfoFromSentence(deprels, global_sid, TAG_PSR_RELATE)) { + return -1; + } -int XML4NLP::SetNEsToSentence(const std::vector & vecNE, - int paragraphIdx, - int sentenceIdx) { - return SetInfoToSentence(vecNE, paragraphIdx, sentenceIdx, TAG_NE); -} + for (int i = 0; i < nr_words; ++ i) { + relation[i].first = atoi( heads[i] ); + relation[i].second = deprels[i]; + } -int XML4NLP::SetNEsToSentence(const std::vector & vecNE, - int sentenceIdx) { - return SetInfoToSentence(vecNE, sentenceIdx, TAG_NE); + return 0; } -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - int wordNum = CountWordInSentence(paragraphIdx, sentenceIdx); - if (wordNum != vecParse.size()) { - cerr << "vecParse.size() does not equal to the word num in the sentence, should resize first" << endl; - return -1; - } - - // vecParent.resize(wordNum); - // vecRelate.resize(wordNum); - if (0 != GetInfoFromSentence(vecParent, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - - if (0 != GetInfoFromSentence(vecRelate, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } - for (int i=0; i < vecParent.size(); ++ i) { - vecParse[i].first = atoi( vecParent[i] ); - vecParse[i].second = vecRelate[i]; - } +int XML4NLP::GetParsesFromSentence(std::vector< std::pair > & relation, + int pid, + int sid) const { + std::vector< ParseResult > parse; + if (0 != GetParsesFromSentence(parse, pid, sid)) { + return -1; + } - return 0; + relation.resize( parse.size() ); + for (int i = 0; i < parse.size(); ++ i) { + relation[i].first = parse[i].first; + relation[i].second = parse[i].second; + } + return 0; } -int XML4NLP::GetParsesFromSentence(vector< pair > & vecParse, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - int wordNum = CountWordInSentence(sentenceIdx); - if (wordNum != vecParse.size()) { - cerr << "vecParse.size() does not equal to the word num in the sentence, should resize first" << endl; - return -1; - } - - vecParent.resize(wordNum); - vecRelate.resize(wordNum); - if (0 != GetInfoFromSentence(vecParent, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - - if (0 != GetInfoFromSentence(vecRelate, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } +int XML4NLP::GetParsesFromSentence(std::vector< std::pair > & relation, + int global_sid) const { + std::vector< ParseResult > parse; + if (0 != GetParsesFromSentence(parse, global_sid)) { + return -1; + } - for (int i=0; i < vecParent.size(); ++i) { - vecParse[i].first = atoi( vecParent[i] ); - vecParse[i].second = vecRelate[i]; - } + relation.resize( parse.size() ); + for (int i = 0; i < parse.size(); ++ i) { + relation[i].first = parse[i].first; + relation[i].second = parse[i].second; + } - return 0; + return 0; } +int XML4NLP::SetParsesToSentence(const std::vector< std::pair > & relation, + int pid, int sid) { + if (0 != CheckRange(pid, sid)) return -1; -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - if (0 != GetInfoFromSentence(vecParent, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - if (0 != GetInfoFromSentence(vecRelate, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } - - vecParse.clear(); - // Assume their sizes of the two vector are equal. Is it OK? - for (int i=0; i < vecParent.size(); ++i) { - int parentIdx = atoi( vecParent[i].c_str() ); - vecParse.push_back( make_pair(static_cast(parentIdx), vecRelate[i]) ); - } + std::vector & words = document.paragraphs[pid].sentences[sid].words; - return 0; -} + if (words.size() != relation.size()) { + std::cerr << "word number does not equal to vecInfo's size in paragraph" + << pid + << " sentence " + << sid << std::endl; + return -1; + } + + if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) { + std::cerr << "Attribute \"" + << TAG_PSR_PARENT + << "\" already exists in paragraph" + << pid + << " sentence " + << sid << std::endl; + return -1; + } + + if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) { + std::cerr << "Attribute \"" + << TAG_PSR_RELATE + << "\" already exists in paragraph" + << pid + << " sentence " + << sid << endl; + return -1; + } -int XML4NLP::GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const { - vector vecParent; - vector vecRelate; - if (0 != GetInfoFromSentence(vecParent, sentenceIdx, TAG_PSR_PARENT)) { - return -1; - } - if (0 != GetInfoFromSentence(vecRelate, sentenceIdx, TAG_PSR_RELATE)) { - return -1; - } + for (int i = 0; i < words.size(); ++ i) { + words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, relation[i].first); + words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, relation[i].second.c_str()); + } - vecParse.clear(); - // Assume their sizes of the two vector are equal. Is it OK? - for (int i=0; i < vecParent.size(); ++i) { - int parentIdx = atoi( vecParent[i].c_str() ); - vecParse.push_back( make_pair(static_cast(parentIdx), vecRelate[i]) ); - } - - return 0; + return 0; } -int XML4NLP::SetParsesToSentence(const vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) { - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; - if (words.size() != vecParse.size()) { - cerr << "word number does not equal to vecInfo's size in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) { - cerr << "Attribute \"" << TAG_PSR_PARENT << "\" already exists in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) { - cerr << "Attribute \"" << TAG_PSR_RELATE << "\" already exists in paragraph" << paragraphIdx - << " sentence " << sentenceIdx << endl; - return -1; - } - - for (int i = 0; i < words.size(); ++i) { - words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, vecParse[i].first); - words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, vecParse[i].second.c_str()); - } - - return 0; +int XML4NLP::SetParsesToSentence(const std::vector< std::pair > & relation, + int global_sid) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetParsesToSentence(relation, pid, sid); } -int XML4NLP::SetParsesToSentence(const vector< pair > &vecParse, int sentenceIdx) { - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - if (words.size() != vecParse.size()) { - cerr << "word number does not equal to vecInfo's size in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(TAG_PSR_PARENT) != NULL) - { - cerr << "Attribute \"" << TAG_PSR_PARENT << "\" already exists in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(TAG_PSR_RELATE) != NULL) - { - cerr << "Attribute \"" << TAG_PSR_RELATE << "\" already exists in paragraph" << paraIdx_sentIdx.first - << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(TAG_PSR_PARENT, vecParse[i].first); - words[i].wordPtr->SetAttribute(TAG_PSR_RELATE, vecParse[i].second.c_str()); - } - return 0; +int XML4NLP::SetParsesToSentence(const std::vector & heads, + const std::vector & deprels, + int pid, + int sid) { + if (0 != SetInfoToSentence(heads, pid, sid, TAG_PSR_PARENT)) return -1; + if (0 != SetInfoToSentence(deprels, pid, sid, TAG_PSR_RELATE)) return -1; + return 0; } -int XML4NLP::SetParsesToSentence(const vector &vecHead, const vector &vecRel, int paragraphIdx, int sentenceIdx) -{ - if (0 != SetInfoToSentence(vecHead, paragraphIdx, sentenceIdx, TAG_PSR_PARENT)) return -1; - if (0 != SetInfoToSentence(vecRel, paragraphIdx, sentenceIdx, TAG_PSR_RELATE)) return -1; - return 0; -} +int XML4NLP::SetParsesToSentence(const std::vector & heads, + const std::vector & deprels, + int global_sid) { + // decreasing vecHead index + std::vector d_heads; + for (int i = 0; i < heads.size(); ++ i) { + d_heads.push_back( heads[i] - 1 ); + // std::cout << d_heads[i] << " " << deprels[i] << std::endl; + } -int XML4NLP::SetParsesToSentence(const vector &vecHead, const vector &vecRel, int sentenceIdx) -{ - // decreasing vecHead index - vector d_vecHead; - for (int i = 0; i < vecHead.size(); i++) - { - d_vecHead.push_back(vecHead[i] - 1); - } + if (0 != SetInfoToSentence(d_heads, global_sid, TAG_PSR_PARENT)) return -1; + if (0 != SetInfoToSentence(deprels, global_sid, TAG_PSR_RELATE)) return -1; - if (0 != SetInfoToSentence(d_vecHead, sentenceIdx, TAG_PSR_PARENT)) return -1; - if (0 != SetInfoToSentence(vecRel, sentenceIdx, TAG_PSR_RELATE)) return -1; - return 0; + // std::string buffer; + // SaveDOM(buffer); + // std::cout << buffer << std::endl; + return 0; } -// ----------------------------------------------------------------for text summarization -const char* XML4NLP::GetTextSummary() const -{ - if (m_summary.nodePtr != NULL) - { - return m_summary.nodePtr->GetText(); - } - else - { - cerr << "have not done text summary." << endl; - return NULL; - } +const char * XML4NLP::GetTextSummary() const { + if (summary.nodePtr != NULL) { + return summary.nodePtr->GetText(); + } else { + std::cerr << "have not done text summary." << std::endl; + return NULL; + } } -int XML4NLP::SetTextSummary(const char* cszTextSum) -{ - if (m_summary.nodePtr != NULL) - { - cerr << "has done text summary" << endl; - return -1; - } +int XML4NLP::SetTextSummary(const char* cszTextSum) { + if (summary.nodePtr != NULL) { + std::cerr << "has done text summary" << std::endl; + return -1; + } - m_summary.nodePtr = new TiXmlElement(TAG_SUM); - m_tiXmlDoc.RootElement()->LinkEndChild(m_summary.nodePtr); - TiXmlText *textPtr = new TiXmlText(cszTextSum); - m_summary.nodePtr->LinkEndChild(textPtr); + summary.nodePtr = new TiXmlElement(TAG_SUM); + m_tiXmlDoc.RootElement()->LinkEndChild(summary.nodePtr); + TiXmlText * textPtr = new TiXmlText(cszTextSum); + summary.nodePtr->LinkEndChild(textPtr); - return 0; + return 0; } -// ----------------------------------------------------------------for text classification -const char* XML4NLP::GetTextClass() const -{ - if (m_textclass.nodePtr != NULL) - { - return m_textclass.nodePtr->GetText(); - } - else - { - cerr << "have not done text class." << endl; - return NULL; - } +const char * XML4NLP::GetTextClass() const { + if (textclass.nodePtr != NULL) { + return textclass.nodePtr->GetText(); + } else { + cerr << "have not done text class." << endl; + return NULL; + } } -int XML4NLP::SetTextClass(const char* cszTextClass) -{ - if (m_textclass.nodePtr != NULL) - { - cerr << "has done text classify" << endl; - return -1; - } +int XML4NLP::SetTextClass(const char* cszTextClass) { + if (textclass.nodePtr != NULL) { + cerr << "has done text classify" << endl; + return -1; + } - m_textclass.nodePtr = new TiXmlElement(TAG_CLASS); - m_tiXmlDoc.RootElement()->LinkEndChild(m_textclass.nodePtr); - TiXmlText *textPtr = new TiXmlText(cszTextClass); - m_textclass.nodePtr->LinkEndChild(textPtr); - return 0; + textclass.nodePtr = new TiXmlElement(TAG_CLASS); + m_tiXmlDoc.RootElement()->LinkEndChild(textclass.nodePtr); + TiXmlText *textPtr = new TiXmlText(cszTextClass); + textclass.nodePtr->LinkEndChild(textPtr); + return 0; } // ----------------------------------------------------------------for SRL -int XML4NLP::CountPredArgToWord(int paragraphIdx, int sentenceIdx, int wordIdx) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } +int XML4NLP::CountPredArgToWord(int pid, int sid, int wid) const { + if (0 != CheckRange(pid, sid, wid)) return -1; - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - return counter; -} + if (argPtr == NULL) { + return 0; + } -int XML4NLP::CountPredArgToWord(int globalSentIdx, int wordIdx) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(globalSentIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } + int nr_args = 0; - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); + do { + ++ nr_args; + argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); + } while (argPtr != NULL); - return counter; + return nr_args; } -int XML4NLP::CountPredArgToWord(int globalWordIdx) const -{ - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx].sentences[sentIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - //cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - // << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return 0; - } - - int counter = 0; - do - { - ++counter; - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); - - return counter; +int XML4NLP::CountPredArgToWord(int global_sid, int wid) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return CountPredArgToWord(pid, sid, wid); } - -int XML4NLP::GetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; - } - - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." << endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } - - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType[i] = cszType; - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } - - return -1; - } - - return 0; +int XML4NLP::CountPredArgToWord(int global_wid) const { + int pid, sid, wid; + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) return -1; + return CountPredArgToWord(pid, sid, wid); } -int XML4NLP::GetPredArgToWord( int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << paraIdx_sentIdx.first << endl; - return -1; - } - - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." << endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } - - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType[i] = cszType; - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - //vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } - return -1; - } +int XML4NLP::GetPredArgToWord(int pid, + int sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + if (0 != CheckRange(pid, sid, wid)) return -1; - return 0; -} + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); -int XML4NLP::GetPredArgToWord( int globalWordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - int paraIdx, sentIdx, wordIdx; - if (0 != MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(globalWordIdx, paraIdx, sentIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx].sentences[sentIdx].words[wordIdx].wordPtr; - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx << " of paragraph " << sentIdx << endl; - return -1; - } - - if (vecType.size() != vecBegEnd.size()) - { - cerr << "vecType's size() != vecBegEnd.size(), should resize() first." << endl; - return -1; - } - if (vecType.empty()) - { - cerr << "vecType is empty" << endl; - return -1; - } - - int i = 0; - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecType[i] = cszType; - vecBegEnd[i].first = uiBeg; - vecBegEnd[i].second = uiEnd; - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - ++i; - } while (argPtr != NULL && i < vecType.size()); - - if ( ! (argPtr == NULL && i == vecType.size()) ) - { - if (argPtr == NULL) - { - cerr << "vecType.size() is too large" << endl; - } - else - { - cerr << "vecType.size() is too small" << endl; - } - - return -1; - } + if (argPtr == NULL) { + std::cerr << "\"" + << TAG_SRL_ARG + << "\" does not exists in word " + << wid + << " of sentence " + << sid + << " of paragraph " + << pid << std::endl; + return -1; + } - return 0; -} + if (role.size() != range.size()) { + std::cerr << "role's size() != range.size(), should resize() first." << std::endl; + return -1; + } -int XML4NLP::GetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - vecType.clear(); - vecBegEnd.clear(); - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; + if (role.empty()) { + cerr << "role is empty" << endl; + return -1; + } + + int i = 0; + + do { + const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); + const char *cszBeg = argPtr->Attribute(TAG_BEGIN); + const char *cszEnd = argPtr->Attribute(TAG_END); + role[i] = cszType; + int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); + int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); + range[i].first = uiBeg; + range[i].second = uiEnd; + + argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); + ++i; + } while (argPtr != NULL && i < role.size()); + + if ( ! (argPtr == NULL && i == role.size()) ) { + if (argPtr == NULL) { + cerr << "role.size() is too large" << endl; + } else { + cerr << "role.size() is too small" << endl; } - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType.push_back(cszType != NULL ? cszType : "" ); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); + return -1; + } + + return 0; +} + +int XML4NLP::GetPredArgToWord(int global_sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + +int XML4NLP::GetPredArgToWord(int global_wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid, wid; + if (0 != DecodeGlobalId(global_wid, pid, sid, wid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + +int XML4NLP::GetPredArgToWord(int pid, + int sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + std::vector role2; + int ret = GetPredArgToWord(pid, sid, wid, role2, range); + if (0 != ret) { return ret; } + + role.resize(role2.size()); + for (int i = 0; i < role2.size(); ++ i) { role[i] = role2[i]; } + return 0; +} + +int XML4NLP::GetPredArgToWord(int global_sid, + int wid, + std::vector & role, + std::vector< std::pair > & range) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetPredArgToWord(pid, sid, wid, role, range); +} + + +int XML4NLP::SetPredArgToWord(int pid, + int sid, + int wid, + const std::vector & role, + const std::vector< std::pair > & range) { + if (0 != CheckRange(pid, sid, wid)) return -1; + + TiXmlElement *wordPtr = document.paragraphs[pid].sentences[sid].words[wid].wordPtr; + + if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) { + std::cerr << "\"" + << TAG_SRL_ARG + << "\" already exists in word " + << wid + << " of sentence " + << sid + << " of paragraph " + << pid << std::endl; + return -1; + } - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); + for (int i = 0; i < role.size(); ++ i) { + TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); + argPtr->SetAttribute(TAG_ID, i); + argPtr->SetAttribute(TAG_SRL_TYPE, role[i].c_str()); + argPtr->SetAttribute(TAG_BEGIN, range[i].first); + argPtr->SetAttribute(TAG_END, range[i].second); + wordPtr->LinkEndChild(argPtr); + } - return 0; + return 0; } -int XML4NLP::GetPredArgToWord( int sentenceIdx, int wordIdx, - vector &vecType, vector< pair > &vecBegEnd) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; - - vecType.clear(); - vecBegEnd.clear(); - TiXmlElement *argPtr = wordPtr->FirstChildElement(TAG_SRL_ARG); - if (argPtr == NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" does not exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << paraIdx_sentIdx.first << endl; - return -1; - } - - do - { - const char *cszType = argPtr->Attribute(TAG_SRL_TYPE); - const char *cszBeg = argPtr->Attribute(TAG_BEGIN); - const char *cszEnd = argPtr->Attribute(TAG_END); - vecType.push_back(cszType != NULL ? cszType : "" ); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecBegEnd.push_back( make_pair(uiBeg, uiEnd) ); - - argPtr = argPtr->NextSiblingElement(TAG_SRL_ARG); - } while (argPtr != NULL); - return 0; +int XML4NLP::SetPredArgToWord(int global_sid, + int wid, + const std::vector & role, + const std::vector< std::pair > & range) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetPredArgToWord(pid, sid, wid, role, range); } +int XML4NLP::GetMentionOfEntity(std::vector< std::pair > &mention, + int entityIdx) const { + if (entityIdx >= coref.vecEntity.size()) { + cerr << "entity idx is too large" << endl; + return -1; + } -int XML4NLP::SetPredArgToWord( int paragraphIdx, int sentenceIdx, int wordIdx, - const vector &vecType, const vector< pair > &vecBegEnd) -{ - - if (0 != CheckRange(paragraphIdx, sentenceIdx, wordIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words[wordIdx].wordPtr; - - if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" already exists in word " << wordIdx - << " of sentence " << sentenceIdx << " of paragraph " << paragraphIdx << endl; - return -1; - } + const vector &mentionRef = coref.vecEntity[entityIdx].vecMention; + if (mention.size() != mentionRef.size()) { + std::cerr << "mention.size() does not equal to the num of mention," + << " should resize() first" + << std::endl; + return -1; + } - for (int i = 0; i < vecType.size(); ++i) - { - TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); - argPtr->SetAttribute(TAG_ID, i); - argPtr->SetAttribute(TAG_SRL_TYPE, vecType[i].c_str()); - argPtr->SetAttribute(TAG_BEGIN, vecBegEnd[i].first); - argPtr->SetAttribute(TAG_END, vecBegEnd[i].second); - wordPtr->LinkEndChild(argPtr); + for (int i=0; i < mentionRef.size(); ++i) { + const char *cszBeg = mentionRef[i].mentionPtr->Attribute(TAG_BEGIN); + const char *cszEnd = mentionRef[i].mentionPtr->Attribute(TAG_END); + if (cszBeg == NULL || cszEnd == NULL) { + std::cerr << "mention attribute err in DOM" << std::endl; + return -1; } - return 0; + mention[i].first = atoi(cszBeg); + mention[i].second = atoi(cszEnd); + } + return 0; } -int XML4NLP::SetPredArgToWord( int sentenceIdx, int wordIdx, - const vector &vecType, const vector< pair > &vecBegEnd) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - TiXmlElement *wordPtr = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words[wordIdx].wordPtr; +int XML4NLP::GetCoreference(vector< vector< pair > > &vecCoref) const { + if (coref.nodePtr == NULL) { + cerr << "has not done coreference" << endl; + return -1; + } + vecCoref.clear(); + TiXmlElement *crPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); - if (wordPtr->FirstChildElement(TAG_SRL_ARG) != NULL) - { - cerr << "\"" << TAG_SRL_ARG << "\" already exists in word " << wordIdx - << " of sentence " << paraIdx_sentIdx.first << " of paragraph " << paraIdx_sentIdx.first << endl; - return -1; - } + for (; crPtr != NULL; crPtr = crPtr->NextSiblingElement(TAG_COREF_CR)) { + vecCoref.push_back( vector< pair >() ); + vector< pair > &vecRef = vecCoref[vecCoref.size()-1]; + TiXmlElement *mentPtr = crPtr->FirstChildElement(TAG_COREF_MENT); - for (int i = 0; i < vecType.size(); ++i) - { - TiXmlElement *argPtr = new TiXmlElement(TAG_SRL_ARG); - argPtr->SetAttribute(TAG_ID, i); - argPtr->SetAttribute(TAG_SRL_TYPE, vecType[i].c_str()); - argPtr->SetAttribute(TAG_BEGIN, vecBegEnd[i].first); - argPtr->SetAttribute(TAG_END, vecBegEnd[i].second); - wordPtr->LinkEndChild(argPtr); + for (; mentPtr != NULL; mentPtr = mentPtr->NextSiblingElement(TAG_COREF_MENT)) { + const char *cszBeg = mentPtr->Attribute(TAG_BEGIN); + const char *cszEnd = mentPtr->Attribute(TAG_END); + int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); + int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); + vecRef.push_back( make_pair(uiBeg, uiEnd) ); } - return 0; + } + return 0; } -// ----------------------------------------------------------------for coreference resolution -int XML4NLP::GetMentionOfEntity(vector< pair > &vecMention, int entityIdx) const -{ - if (entityIdx >= m_coref.vecEntity.size()) - { - cerr << "entity idx is too large" << endl; - return -1; - } +int XML4NLP::SetCoreference(const vector< vector< pair > > &vecCoref) { + if (coref.nodePtr != NULL) { + cerr << "has already done coreference" << endl; + return -1; + } - const vector &vecMentionRef = m_coref.vecEntity[entityIdx].vecMention; - if (vecMention.size() != vecMentionRef.size()) - { - cerr << "vecMention.size() does not equal to the num of mention, should resize() first" << endl; - return -1; - } + coref.nodePtr = new TiXmlElement(TAG_COREF); + for (int i = 0; i < vecCoref.size(); ++i) { + TiXmlElement *crPtr = new TiXmlElement(TAG_COREF_CR); + crPtr->SetAttribute(TAG_ID, i); - for (int i=0; i < vecMentionRef.size(); ++i) - { - const char *cszBeg = vecMentionRef[i].mentionPtr->Attribute(TAG_BEGIN); - const char *cszEnd = vecMentionRef[i].mentionPtr->Attribute(TAG_END); - if (cszBeg == NULL || cszEnd == NULL) - { - cerr << "mention attribute err in DOM" << endl; - return -1; - } - vecMention[i].first = atoi(cszBeg); - vecMention[i].second = atoi(cszEnd); - } - return 0; -} + coref.vecEntity.push_back( Entity() ); + Entity &entity = coref.vecEntity[coref.vecEntity.size() - 1]; + entity.entityPtr = crPtr; -int XML4NLP::GetCoreference(vector< vector< pair > > &vecCoref) const -{ - if (m_coref.nodePtr == NULL) - { - cerr << "has not done coreference" << endl; - return -1; - } - vecCoref.clear(); - TiXmlElement *crPtr = m_coref.nodePtr->FirstChildElement(TAG_COREF_CR); - for (; crPtr != NULL; crPtr = crPtr->NextSiblingElement(TAG_COREF_CR)) - { - vecCoref.push_back( vector< pair >() ); - vector< pair > &vecRef = vecCoref[vecCoref.size()-1]; - TiXmlElement *mentPtr = crPtr->FirstChildElement(TAG_COREF_MENT); - for (; mentPtr != NULL; mentPtr = mentPtr->NextSiblingElement(TAG_COREF_MENT)) - { - const char *cszBeg = mentPtr->Attribute(TAG_BEGIN); - const char *cszEnd = mentPtr->Attribute(TAG_END); - int uiBeg = static_cast(cszBeg != NULL ? atoi(cszBeg) : 0); - int uiEnd = static_cast(cszEnd != NULL ? atoi(cszEnd) : 0); - vecRef.push_back( make_pair(uiBeg, uiEnd) ); - } - } - return 0; -} + for (int j = 0; j < vecCoref[i].size(); ++j) { + TiXmlElement *mentPtr = new TiXmlElement(TAG_COREF_MENT); + mentPtr->SetAttribute(TAG_ID, j); + mentPtr->SetAttribute(TAG_BEGIN, vecCoref[i][j].first); + mentPtr->SetAttribute(TAG_END, vecCoref[i][j].second); + crPtr->LinkEndChild(mentPtr); -int XML4NLP::SetCoreference(const vector< vector< pair > > &vecCoref) -{ - if (m_coref.nodePtr != NULL) - { - cerr << "has already done coreference" << endl; - return -1; + entity.vecMention.push_back( Mention() ); + Mention &mention = entity.vecMention[entity.vecMention.size() - 1]; + mention.mentionPtr = mentPtr; } - m_coref.nodePtr = new TiXmlElement(TAG_COREF); - for (int i = 0; i < vecCoref.size(); ++i) - { - TiXmlElement *crPtr = new TiXmlElement(TAG_COREF_CR); - crPtr->SetAttribute(TAG_ID, i); - - m_coref.vecEntity.push_back( Entity() ); - Entity &entity = m_coref.vecEntity[m_coref.vecEntity.size() - 1]; - entity.entityPtr = crPtr; - - for (int j = 0; j < vecCoref[i].size(); ++j) - { - TiXmlElement *mentPtr = new TiXmlElement(TAG_COREF_MENT); - mentPtr->SetAttribute(TAG_ID, j); - mentPtr->SetAttribute(TAG_BEGIN, vecCoref[i][j].first); - mentPtr->SetAttribute(TAG_END, vecCoref[i][j].second); - crPtr->LinkEndChild(mentPtr); - - entity.vecMention.push_back( Mention() ); - Mention &mention = entity.vecMention[entity.vecMention.size() - 1]; - mention.mentionPtr = mentPtr; - } - - m_coref.nodePtr->LinkEndChild(crPtr); - } - m_tiXmlDoc.RootElement()->LinkEndChild(m_coref.nodePtr); + coref.nodePtr->LinkEndChild(crPtr); + } + m_tiXmlDoc.RootElement()->LinkEndChild(coref.nodePtr); - return 0; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// initialize the XML DOM tree. /// after the process LoadFile(), the DOM tree has been set up -/// but it is not fully conform to our need, +/// but it is not fully conform to our need, /// for example, the member "paragraphss" of the class Document has not been initialized, /// this function just do this. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::InitXmlStructure() { - TiXmlElement *xml4nlp = m_tiXmlDoc.RootElement(); - m_document.documentPtr = xml4nlp->FirstChildElement(TAG_DOC); - m_note.nodePtr = xml4nlp->FirstChildElement(TAG_NOTE); - // document summary, text classification and coreference is not - // provided in current version (v3.0.0) - // m_summary.nodePtr = xml4nlp->FirstChildElement(TAG_SUM); - // m_textclass.nodePtr = xml4nlp->FirstChildElement(TAG_CLASS); - // m_coref.nodePtr = xml4nlp->FirstChildElement(TAG_COREF); - - if (m_document.documentPtr == NULL) { // consider it as wrong for now. - cerr << "there is no \"" << TAG_DOC << "\" tag in xml file." << endl; - return -1; - } + TiXmlElement *xml4nlp = m_tiXmlDoc.RootElement(); + document.documentPtr = xml4nlp->FirstChildElement(TAG_DOC); + note.nodePtr = xml4nlp->FirstChildElement(TAG_NOTE); + // document summary, text classification and coreference is not + // provided in current version (v3.0.0) + // summary.nodePtr = xml4nlp->FirstChildElement(TAG_SUM); + // textclass.nodePtr = xml4nlp->FirstChildElement(TAG_CLASS); + // coref.nodePtr = xml4nlp->FirstChildElement(TAG_COREF); + + if (document.documentPtr == NULL) { // consider it as wrong for now. + cerr << "there is no \"" << TAG_DOC << "\" tag in xml file." << endl; + return -1; + } - if (0 != InitXmlDocument(m_document)) { - return -1; - } + if (0 != InitXmlDocument(document)) { + return -1; + } - if (m_coref.nodePtr != NULL) { - if (0 != InitXmlCoref(m_coref)) { - return -1; - } + if (coref.nodePtr != NULL) { + if (0 != InitXmlCoref(coref)) { + return -1; } + } - return 0; + return 0; } +int XML4NLP::InitXmlCoref(Coref &coref) { + TiXmlElement *entityPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); -void XML4NLP::CheckNoteForOldLtml() -{ - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - ClearAllNote(); - - // if (m_coref.nodePtr != NULL) SetNote(NOTE_CR); - // if (m_summary.nodePtr != NULL) SetNote(NOTE_SUM); - // if (m_textclass.nodePtr != NULL) SetNote(NOTE_CLASS); - - if ( m_document.paragraphs.empty() ) return; - if ( m_document.paragraphs[0].sentences.empty() ) return; - SetNote(NOTE_SENT); - if ( m_document.paragraphs[0].sentences[0].words.empty() ) return; - SetNote(NOTE_WORD); - TiXmlElement *wordPtr = m_document.paragraphs[0].sentences[0].words[0].wordPtr; - if ( wordPtr->Attribute(TAG_POS) != NULL ) SetNote(NOTE_POS); - if ( wordPtr->Attribute(TAG_NE) != NULL ) SetNote(NOTE_NE); - if ( wordPtr->Attribute(TAG_WSD) != NULL ) SetNote(NOTE_WSD); // consider only one attribute, excluding TAG_WSD_EXP - if ( wordPtr->Attribute(TAG_PSR_PARENT) != NULL ) SetNote(NOTE_PARSER); // excluding TAG_PSR_RELATE - if ( wordPtr->Attribute(TAG_SRL_ARG) != NULL ) SetNote(NOTE_SRL); // excluding TAG_SRL_TYPE -} - -int XML4NLP::InitXmlCoref(Coref &coref) -{ - TiXmlElement *entityPtr = coref.nodePtr->FirstChildElement(TAG_COREF_CR); - if (entityPtr == NULL) - { - return 0; - } - - do - { - if (0 != InitXmlEntity(coref.vecEntity, entityPtr)) return -1; - entityPtr = entityPtr->NextSiblingElement(TAG_COREF_CR); - } while (entityPtr != NULL); + if (entityPtr == NULL) { return 0; + } + + do { + if (0 != InitXmlEntity(coref.vecEntity, entityPtr)) return -1; + entityPtr = entityPtr->NextSiblingElement(TAG_COREF_CR); + } while (entityPtr != NULL); + return 0; } -int XML4NLP::InitXmlEntity(vector &vecEntity, TiXmlElement *entityPtr) -{ - vecEntity.push_back( Entity() ); - Entity &entity = vecEntity[vecEntity.size()-1]; - entity.entityPtr = entityPtr; - - TiXmlElement *mentionPtr = entityPtr->FirstChildElement(TAG_COREF_MENT); - if (mentionPtr == NULL) return 0; - - do - { - if (0 != InitXmlMention(entity.vecMention, mentionPtr)) return -1; - mentionPtr = mentionPtr->NextSiblingElement(TAG_COREF_MENT); - } while(mentionPtr != NULL); - return 0; +int XML4NLP::InitXmlEntity(vector &vecEntity, TiXmlElement *entityPtr) { + vecEntity.push_back( Entity() ); + Entity &entity = vecEntity[vecEntity.size()-1]; + entity.entityPtr = entityPtr; + + TiXmlElement *mentionPtr = entityPtr->FirstChildElement(TAG_COREF_MENT); + if (mentionPtr == NULL) return 0; + + do { + if (0 != InitXmlMention(entity.vecMention, mentionPtr)) return -1; + mentionPtr = mentionPtr->NextSiblingElement(TAG_COREF_MENT); + } while(mentionPtr != NULL); + return 0; } -int XML4NLP::InitXmlMention(vector &vecMention, TiXmlElement *mentionPtr) -{ - vecMention.push_back( Mention() ); - vecMention[vecMention.size() -1].mentionPtr = mentionPtr; - return 0; +int XML4NLP::InitXmlMention(vector &vecMention, TiXmlElement *mentionPtr) { + vecMention.push_back( Mention() ); + vecMention[vecMention.size() -1].mentionPtr = mentionPtr; + return 0; } -int XML4NLP::InitXmlDocument(Document_t &document) -{ - TiXmlElement *paragraphPtr = document.documentPtr->FirstChildElement(TAG_PARA); - if (paragraphPtr == NULL) // consider it as wrong for now. - { - cerr << "there is no \"" << TAG_PARA << "\" tag in xml file." << endl; - return -1; - } +int XML4NLP::InitXmlDocument(Document &document) { + TiXmlElement *paragraphPtr = document.documentPtr->FirstChildElement(TAG_PARA); + if (paragraphPtr == NULL) { + // consider it as wrong for now. + cerr << "there is no \"" << TAG_PARA << "\" tag in xml file." << endl; + return -1; + } - do - { - if (0 != InitXmlParagraph(document.paragraphs, paragraphPtr)) return -1; - paragraphPtr = paragraphPtr->NextSiblingElement(TAG_PARA); - } while (paragraphPtr != NULL); - return 0; + do { + if (0 != InitXmlParagraph(document.paragraphs, paragraphPtr)) return -1; + paragraphPtr = paragraphPtr->NextSiblingElement(TAG_PARA); + } while (paragraphPtr != NULL); + return 0; } -int XML4NLP::InitXmlParagraph(vector ¶graphs, TiXmlElement *paragraphPtr) +int XML4NLP::InitXmlParagraph(vector ¶graphs, TiXmlElement *paragraphPtr) { - paragraphs.push_back( Paragraph_t() ); - Paragraph_t ¶graph = paragraphs[paragraphs.size()-1]; - paragraph.paragraphPtr = paragraphPtr; + paragraphs.push_back( Paragraph() ); + Paragraph ¶graph = paragraphs[paragraphs.size()-1]; + paragraph.paragraphPtr = paragraphPtr; - TiXmlElement *stnsPtr = paragraphPtr->FirstChildElement(TAG_SENT); - if (stnsPtr == NULL) return 0; // have not split sentence + TiXmlElement *stnsPtr = paragraphPtr->FirstChildElement(TAG_SENT); + if (stnsPtr == NULL) return 0; // have not split sentence - // record the sentence info - do { - if (0 != InitXmlSentence(paragraph.sentences, stnsPtr)) return -1; - stnsPtr = stnsPtr->NextSiblingElement(TAG_SENT); - } while(stnsPtr != NULL); + // record the sentence info + do { + if (0 != InitXmlSentence(paragraph.sentences, stnsPtr)) return -1; + stnsPtr = stnsPtr->NextSiblingElement(TAG_SENT); + } while(stnsPtr != NULL); - return 0; + return 0; } -int XML4NLP::InitXmlSentence(vector &sentences, TiXmlElement *stnsPtr) +int XML4NLP::InitXmlSentence(vector &sentences, TiXmlElement *stnsPtr) { - sentences.push_back( Sentence_t() ); - Sentence_t &sentence = sentences[sentences.size()-1]; - sentence.sentencePtr = stnsPtr; + sentences.push_back( Sentence() ); + Sentence &sentence = sentences[sentences.size()-1]; + sentence.sentencePtr = stnsPtr; - TiXmlElement *wordPtr = stnsPtr->FirstChildElement(TAG_WORD); - if (wordPtr == NULL) return 0; // have not done word segment + TiXmlElement *wordPtr = stnsPtr->FirstChildElement(TAG_WORD); + if (wordPtr == NULL) return 0; // have not done word segment - do - { - if (0 != InitXmlWord(sentence.words, wordPtr)) return -1; - wordPtr = wordPtr->NextSiblingElement(TAG_WORD); - } while(wordPtr != NULL); + do + { + if (0 != InitXmlWord(sentence.words, wordPtr)) return -1; + wordPtr = wordPtr->NextSiblingElement(TAG_WORD); + } while(wordPtr != NULL); - return 0; + return 0; } -int XML4NLP::InitXmlWord(vector &words, TiXmlElement *wordPtr) -{ - words.push_back( Word_t() ); - words[words.size()-1].wordPtr = wordPtr; - return 0; +int XML4NLP::InitXmlWord(vector &words, TiXmlElement *wordPtr) { + words.push_back( Word() ); + words[words.size()-1].wordPtr = wordPtr; + return 0; } ///////////////////////////////////////////////////////////////////////////////////// /// build the initial DOM tree frame. -/// it creates the XML declaration and the XSL declaration instructions and creates +/// it creates the XML declaration and the XSL declaration instructions and creates /// a root element "xml4nlp" and a child node "doc". ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::BuildDOMFrame() { - TiXmlDeclaration * xmlDeclaration = new TiXmlDeclaration("1.0", "utf-8", ""); - TiXmlElement * xml4nlp = new TiXmlElement("xml4nlp"); - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_document.documentPtr = new TiXmlElement(TAG_DOC); + TiXmlDeclaration * xmlDeclaration = new TiXmlDeclaration("1.0", "utf-8", ""); + TiXmlElement * xml4nlp = new TiXmlElement("xml4nlp"); + note.nodePtr = new TiXmlElement(TAG_NOTE); + document.documentPtr = new TiXmlElement(TAG_DOC); - m_tiXmlDoc.LinkEndChild(xmlDeclaration); - m_tiXmlDoc.LinkEndChild(xml4nlp); + m_tiXmlDoc.LinkEndChild(xmlDeclaration); + m_tiXmlDoc.LinkEndChild(xml4nlp); - xml4nlp->LinkEndChild(m_note.nodePtr); - ClearAllNote(); - xml4nlp->LinkEndChild(m_document.documentPtr); + xml4nlp->LinkEndChild(note.nodePtr); + ClearAllNote(); + xml4nlp->LinkEndChild(document.documentPtr); - return 0; + return 0; } bool XML4NLP::LTMLValidation() { - // there should not be any attributes in `` - // but it wont matter - - // is the attributes in `note` legal - int state = 0; - state |= QueryNote(NOTE_SRL); state <<= 1; - state |= QueryNote(NOTE_NE); state <<= 1; - state |= QueryNote(NOTE_PARSER); state <<= 1; - state |= QueryNote(NOTE_POS); state <<= 1; - state |= QueryNote(NOTE_WORD); state <<= 1; - state |= QueryNote(NOTE_SENT); - - if (0 == state || // 0 - 0x01 == state || // 1 - 0x03 == state || // 11 - 0x07 == state || // 111 - 0x0f == state || // 1111 - 0x17 == state || // 10111 - 0x1f == state || // 11111 - 0x3f == state) { - } else { - return false; - } - - // if sent attribute in note is `y`, there should be an `cont` - // attribute in para node. - // travel through all the `para` node, query if there is a `cont` - // attribute - if (!(state & 0x01)) { - for (unsigned i = 0; i < m_document.paragraphs.size(); ++ i) { - const Paragraph_t & paragraph = m_document.paragraphs[i]; - if (!paragraph.sentences.size()) { - if (!paragraph.paragraphPtr->GetText()) { return false; } - } else { - for (unsigned j = 0; j < paragraph.sentences.size(); ++ j) { - const Sentence_t & sentence = paragraph.sentences[j]; - if (!sentence.sentencePtr->Attribute(TAG_CONT)) { return false; } - } - } + // there should not be any attributes in `` + // but it wont matter + if (!note.nodePtr->Attribute(NOTE_SENT) + || !note.nodePtr->Attribute(NOTE_WORD) + || !note.nodePtr->Attribute(NOTE_POS) + || !note.nodePtr->Attribute(NOTE_PARSER) + || !note.nodePtr->Attribute(NOTE_NE) + || !note.nodePtr->Attribute(NOTE_SRL)) { + return false; + } + + // is the attributes in `note` legal + int state = 0; + state |= QueryNote(NOTE_SRL); state <<= 1; + state |= QueryNote(NOTE_NE); state <<= 1; + state |= QueryNote(NOTE_PARSER); state <<= 1; + state |= QueryNote(NOTE_POS); state <<= 1; + state |= QueryNote(NOTE_WORD); state <<= 1; + state |= QueryNote(NOTE_SENT); + + if (0 == state || // 0 + 0x01 == state || // 1 + 0x03 == state || // 11 + 0x07 == state || // 111 + 0x0f == state || // 1111 + 0x17 == state || // 10111 + 0x1f == state || // 11111 + 0x3f == state) { + } else { + return false; + } + + // if sent attribute in note is `y`, there should be an `cont` + // attribute in para node. + // travel through all the `para` node, query if there is a `cont` + // attribute + if (!(state & 0x01)) { + for (unsigned i = 0; i < document.paragraphs.size(); ++ i) { + const Paragraph & paragraph = document.paragraphs[i]; + if (!paragraph.sentences.size()) { + if (!paragraph.paragraphPtr->GetText()) { return false; } + } else { + for (unsigned j = 0; j < paragraph.sentences.size(); ++ j) { + const Sentence & sentence = paragraph.sentences[j]; + if (!sentence.sentencePtr->Attribute(TAG_CONT)) { return false; } } + } } + } #define FOREACH(p, s, w) \ - for (unsigned i = 0; i < m_document.paragraphs.size(); ++ i) { \ - const Paragraph_t & p = m_document.paragraphs[i]; \ - for (unsigned j = 0; j < p.sentences.size(); ++ j) { \ - const Sentence_t & s = p.sentences[j]; \ - for (unsigned k = 0; k < s.words.size(); ++ k) { \ - const Word_t & w = s.words[k]; + for (unsigned i = 0; i < document.paragraphs.size(); ++ i) { \ + const Paragraph & p = document.paragraphs[i]; \ + for (unsigned j = 0; j < p.sentences.size(); ++ j) { \ + const Sentence & s = p.sentences[j]; \ + for (unsigned k = 0; k < s.words.size(); ++ k) { \ + const Word & w = s.words[k]; #define END }}} - FOREACH(p, s, w) - // segment check - if ((state & 0x02) && (!w.wordPtr->Attribute(TAG_CONT))) { return false; } - if ((state & 0x04) && (!w.wordPtr->Attribute(TAG_POS))) { return false; } - if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_PARENT))) { return false; } - if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_RELATE))) { return false; } - if ((state & 0x10) && (!w.wordPtr->Attribute(TAG_NE))) { return false; } - END + FOREACH(p, s, w) + // segment check + if ((state & 0x02) && (!w.wordPtr->Attribute(TAG_CONT))) { return false; } + if ((state & 0x04) && (!w.wordPtr->Attribute(TAG_POS))) { return false; } + if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_PARENT))) { return false; } + if ((state & 0x08) && (!w.wordPtr->Attribute(TAG_PSR_RELATE))) { return false; } + if ((state & 0x10) && (!w.wordPtr->Attribute(TAG_NE))) { return false; } + END #undef END #undef FOREACH - return true; + return true; } void XML4NLP::ClearAllNote() { - ClearNote(NOTE_SENT); - ClearNote(NOTE_WORD); - ClearNote(NOTE_POS); - ClearNote(NOTE_NE); - ClearNote(NOTE_PARSER); - ClearNote(NOTE_WSD); - ClearNote(NOTE_SRL); - // ClearNote(NOTE_CLASS); - // ClearNote(NOTE_SUM); - // ClearNote(NOTE_CR); + ClearNote(NOTE_SENT); + ClearNote(NOTE_WORD); + ClearNote(NOTE_POS); + ClearNote(NOTE_NE); + ClearNote(NOTE_PARSER); + ClearNote(NOTE_WSD); + ClearNote(NOTE_SRL); + // ClearNote(NOTE_CLASS); + // ClearNote(NOTE_SUM); + // ClearNote(NOTE_CR); } ///////////////////////////////////////////////////////////////////////////////////// @@ -1604,313 +1237,199 @@ void XML4NLP::ClearAllNote() { /// in the initial, a paragraph has only one sentence. ///////////////////////////////////////////////////////////////////////////////////// int XML4NLP::BuildParagraph(string& strParagraph, int paragraphIdx) { - if (strParagraph == "עٰȨ" - || strParagraph == "аͱ" - || strParagraph == "עǧȨ") { - strParagraph = "ӭʹùҵѧϢоԼƽ̨"; - } else { - } - TiXmlElement * documentPtr = m_document.documentPtr; - vector ¶graphs = m_document.paragraphs; + TiXmlElement * documentPtr = document.documentPtr; + vector ¶graphs = document.paragraphs; - paragraphs.push_back( Paragraph_t() ); - Paragraph_t ¶graph = paragraphs[paragraphs.size() - 1]; + paragraphs.push_back( Paragraph() ); + Paragraph ¶graph = paragraphs[paragraphs.size() - 1]; - paragraph.paragraphPtr = new TiXmlElement(TAG_PARA); - paragraph.paragraphPtr->SetAttribute(TAG_ID, paragraphIdx); - documentPtr->LinkEndChild(paragraph.paragraphPtr); + paragraph.paragraphPtr = new TiXmlElement(TAG_PARA); + paragraph.paragraphPtr->SetAttribute(TAG_ID, paragraphIdx); + documentPtr->LinkEndChild(paragraph.paragraphPtr); - TiXmlText *textPtr = new TiXmlText(strParagraph.c_str()); - paragraph.paragraphPtr->LinkEndChild( textPtr ); + TiXmlText *textPtr = new TiXmlText(strParagraph.c_str()); + paragraph.paragraphPtr->LinkEndChild( textPtr ); - return 0; + return 0; } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, - int paragraphIdx, - int sentenceIdx, - const char *attrName) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - const vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int pid, + int sid, + const char *attribute_name) const { + if (0 != CheckRange(pid, sid)) return -1; - /* - if (vecInfo.size() != words.size()) - { - // cerr << "vecInfo's size does not equal to word num in the sentence, should resize() first" << endl; + const vector & words = document.paragraphs[pid].sentences[sid].words; + if (words[0].wordPtr->Attribute(attribute_name) == NULL) { return -1; - } - */ + } - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + info.resize(words.size()); + for (int i = 0; i < words.size(); ++ i) { + info[i] = words[i].wordPtr->Attribute(attribute_name); + } + return 0; +} - for (int i = 0; i < words.size(); ++i) - { - vecInfo.push_back(words[i].wordPtr->Attribute(attrName)); - //vecInfo[i] = words[i].wordPtr->Attribute(attrName); - } - return 0; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int global_sid, + const char *attribute_name) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetInfoFromSentence(info, pid, sid, attribute_name); } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int sentenceIdx, const char *attrName) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - const vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; +int XML4NLP::GetInfoFromSentence(std::vector &info, + int pid, + int sid, + const char* attribute_name) const { + if (0 != CheckRange(pid, sid)) return -1; + + const vector & words = document.paragraphs[pid].sentences[sid].words; - /* - if (vecInfo.size() != words.size()) - { - // cerr << "vecInfo's size does not equal to word num in the sentence, should resize() first" << endl; + if (words[0].wordPtr->Attribute(attribute_name) == NULL) { return -1; - } - */ + } - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + info.clear(); + for (int i = 0; i < words.size(); ++ i) { + const char * cszAttrValue = words[i].wordPtr->Attribute(attribute_name); + info.push_back(cszAttrValue != NULL ? cszAttrValue : ""); + } + return 0; +} - for (int i = 0; i < words.size(); ++i) - { - vecInfo.push_back(words[i].wordPtr->Attribute(attrName)); - //vecInfo[i] = words[i].wordPtr->Attribute(attrName); - } - return 0; +int XML4NLP::GetInfoFromSentence(std::vector & info, + int global_sid, + const char* attribute_name) const { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return GetInfoFromSentence(info, pid, sid, attribute_name); } +int XML4NLP::SetInfoToSentence(const std::vector & info, + int pid, + int sid, + const char* attribute_name) { + if (0 != CheckRange(pid, sid)) return -1; -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) const -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; + std::vector & words = document.paragraphs[pid].sentences[sid].words; - const vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; + if (words.size() != info.size()) { + return -1; + } - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + if (words[0].wordPtr->Attribute(attribute_name) != NULL) { + return -1; + } - vecInfo.clear(); - for (int i = 0; i < words.size(); ++i) - { - const char *cszAttrValue = words[i].wordPtr->Attribute(attrName); - vecInfo.push_back(cszAttrValue != NULL ? cszAttrValue : ""); - } - return 0; + for (int i = 0; i < words.size(); ++ i) { + // std::cout << attribute_name << " " << info[i] << std::endl; + words[i].wordPtr->SetAttribute(attribute_name, info[i].c_str()); + } + return 0; } -int XML4NLP::GetInfoFromSentence(vector &vecInfo, int sentenceIdx, const char* attrName) const -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - const vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; +int XML4NLP::SetInfoToSentence(const std::vector & info, + int global_sid, + const char * attribute_name) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; - if (words[0].wordPtr->Attribute(attrName) == NULL) - { - // cerr << "Attribute \"" << attrName << "\" does not exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - - vecInfo.clear(); - for (int i = 0; i < words.size(); ++i) - { - const char *cszAttrValue = words[i].wordPtr->Attribute(attrName); - vecInfo.push_back(cszAttrValue != NULL ? cszAttrValue : ""); - } - return 0; + return SetInfoToSentence(info, pid, sid, attribute_name); } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::SetInfoToSentence(const std::vector & info, + int pid, + int sid, + const char * attribute_name) { + if (0 != CheckRange(pid, sid)) return -1; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } + std::vector & words = document.paragraphs[pid].sentences[sid].words; - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i].c_str()); - } - return 0; -} + if (words.size() != info.size()) { + return -1; + } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int sentenceIdx, const char* attrName) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + if (words[0].wordPtr->Attribute(attribute_name) != NULL) { + return -1; + } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i].c_str()); - } - return 0; + for (int i = 0; i < words.size(); ++ i) { + // std::cout << attribute_name << " " << info[i] << std::endl; + words[i].wordPtr->SetAttribute(attribute_name, info[i]); + } + return 0; } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int paragraphIdx, - int sentenceIdx, const char* attrName) -{ - if (0 != CheckRange(paragraphIdx, sentenceIdx)) return -1; - - vector &words = m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words; +int XML4NLP::SetInfoToSentence(const std::vector & info, + int global_sid, + const char * attribute_name) { + int pid, sid; + if (0 != DecodeGlobalId(global_sid, pid, sid)) return -1; + return SetInfoToSentence(info, pid, sid, attribute_name); +} - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paragraphIdx - // << " sentence " << sentenceIdx << endl; - return -1; - } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i]); - } - return 0; -} +int XML4NLP::CheckRange(int pid, int sid, int wid) const { + if (pid >= document.paragraphs.size()) { + return -1; + } -int XML4NLP::SetInfoToSentence(const vector &vecInfo, int sentenceIdx, const char* attrName) -{ - pair paraIdx_sentIdx; - if (0 != MapGlobalSentIdx2paraIdx_sentIdx(sentenceIdx, paraIdx_sentIdx)) return -1; - - vector &words = m_document.paragraphs[paraIdx_sentIdx.first].sentences[paraIdx_sentIdx.second].words; - if (words.size() != vecInfo.size()) - { - // cerr << "word number does not equal to attribute \"" << attrName << "\" num in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } - if (words[0].wordPtr->Attribute(attrName) != NULL) - { - // cerr << "Attribute \"" << attrName << "\" already exists in paragraph " << paraIdx_sentIdx.first - // << " sentence " << paraIdx_sentIdx.second << endl; - return -1; - } + if (sid >= document.paragraphs[pid].sentences.size()) { + return -1; + } - for (int i = 0; i < words.size(); ++i) - { - words[i].wordPtr->SetAttribute(attrName, vecInfo[i]); - } - return 0; + if (wid >= document.paragraphs[pid].sentences[sid].words.size()) { + return -1; + } + return 0; } +int XML4NLP::CheckRange(int pid, int sid) const { + if (pid >= document.paragraphs.size()) { + return -1; + } -int XML4NLP::CheckRange(int paragraphIdx, int sentenceIdx, int wordIdx) const -{ - if (paragraphIdx >= m_document.paragraphs.size()) - { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - if (sentenceIdx >= m_document.paragraphs[paragraphIdx].sentences.size()) - { - // cerr << "sentenceIdx is too large: " << sentenceIdx << " in paragraph : " << paragraphIdx << endl; - return -1; - } - if (wordIdx >= m_document.paragraphs[paragraphIdx].sentences[sentenceIdx].words.size()) - { - // cerr << "wordIdx is too large: " << wordIdx << " in sentence : " << sentenceIdx - // << " of paragraph : " << paragraphIdx << endl; - return -1; - } - return 0; -} + if (sid >= document.paragraphs[pid].sentences.size()) { + return -1; + } -int XML4NLP::CheckRange(int paragraphIdx, int sentenceIdx) const -{ - if (paragraphIdx >= m_document.paragraphs.size()) - { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - if (sentenceIdx >= m_document.paragraphs[paragraphIdx].sentences.size()) - { - // cerr << "sentenceIdx is too large: " << sentenceIdx << " in paragraph : " << paragraphIdx << endl; - return -1; - } - return 0; + return 0; } int XML4NLP::CheckRange(int paragraphIdx) const { - if (paragraphIdx >= m_document.paragraphs.size()) { - // cerr << "paragraphIdx is too large: " << paragraphIdx << endl; - return -1; - } - return 0; + if (paragraphIdx >= document.paragraphs.size()) { + return -1; + } + return 0; } bool XML4NLP::QueryNote(const char *cszNoteName) const { - if (m_note.nodePtr == NULL) return false; // OK? + if (note.nodePtr == NULL) return false; // OK? - return (strcmp(m_note.nodePtr->Attribute(cszNoteName), "y") == 0) ? true : false; + return (strcmp(note.nodePtr->Attribute(cszNoteName), "y") == 0) ? true : false; } int XML4NLP::SetNote(const char *cszNoteName) { - if (m_note.nodePtr == NULL) { - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - } - m_note.nodePtr->SetAttribute(cszNoteName, "y"); - return 0; + if (note.nodePtr == NULL) { + note.nodePtr = new TiXmlElement(TAG_NOTE); + m_tiXmlDoc.RootElement()->LinkEndChild( note.nodePtr ); + } + note.nodePtr->SetAttribute(cszNoteName, "y"); + return 0; } int XML4NLP::ClearNote(const char *cszNoteName) { - if (m_note.nodePtr == NULL) { - m_note.nodePtr = new TiXmlElement(TAG_NOTE); - m_tiXmlDoc.RootElement()->LinkEndChild( m_note.nodePtr ); - } + if (note.nodePtr == NULL) { + note.nodePtr = new TiXmlElement(TAG_NOTE); + m_tiXmlDoc.RootElement()->LinkEndChild( note.nodePtr ); + } - m_note.nodePtr->SetAttribute(cszNoteName, "n"); - return 0; + note.nodePtr->SetAttribute(cszNoteName, "n"); + return 0; } - diff --git a/src/__xml4nlp/Xml4nlp.h b/src/__xml4nlp/Xml4nlp.h index 5aa0b6683..acbba30d2 100644 --- a/src/__xml4nlp/Xml4nlp.h +++ b/src/__xml4nlp/Xml4nlp.h @@ -41,788 +41,786 @@ extern const char * const NOTE_CR; ///////////////////////////////////////////////////////////////////////////////////// class XML4NLP { public: - XML4NLP(); - virtual ~XML4NLP(); - - // -------------------------------------------------------------- - // Functions for DOM Tree Creation - // -------------------------------------------------------------- - /* - * Create DOM from file, read in each line of the file and store - * them in the xml tree. - * - * @param[in] filename the filename - */ - int CreateDOMFromFile(const char * filename); - - /* - * Create DOM from raw string text. - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int CreateDOMFromString(const std::string & str); - - /* - * A wrapper of CreateDOMFromString(const std::string & str); - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int CreateDOMFromString(const char * str); - - /* - * Load XML DOM from file - * - * @param[in] filename the file name - * @return int 0 on success, otherwise -1 - */ - int LoadXMLFromFile(const char * fileName); - - /* - * Load XML DOM from string - * - * @param[in] str the string - * @return int 0 on success, otherwise -1 - */ - int LoadXMLFromString(const char * str); - - /* - * Load XML DOM from string - * - * @param[in] str the string - */ - int LoadXMLFromString(const std::string & str); - - /* - * Clear the DOM tree - */ - void ClearDOM(); - - /* - * Save the DOM tree to file - * - * @param[in] filename the filename - * @return int 0 on success, otherwise -1 - */ - int SaveDOM(const char * fileName); - - /* - * Save the DOM tree to strin - * - * @param[out] strDocument the str - */ - void SaveDOM(string &strDocument) const; - - /* - * Get attributes value in `` - * - * @param[in] cszNoteName the note name - * @return bool return true on `` exists and - * attributes value equals "y", otherwise - * false. - */ - bool QueryNote(const char * cszNoteName) const; - - /* - * Set attributes value in `` to "y" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - int SetNote(const char * cszNoteName); - - /* - * Set attributes value in `` to "n" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - int ClearNote(const char * cszNoteName); - - /* - * Set all nlp attributes value in `` to "n" - * - * @param[in] cszNoteName the note name - * @return int return 0 - */ - void ClearAllNote(); - - // counting operation - /* - * count number of paragraph in document - * - * @return int the number of paragraph - */ - int CountParagraphInDocument() const; - - /* - * conut number of sentence in paragraph - * - * @param[in] pid the index number of paragraph - * @return int the number of paragraph - */ - int CountSentenceInParagraph(int pid) const; - - /* - * count number of all sentences in document - * - * @return int the number of all sentences in document - */ - int CountSentenceInDocument() const; - - /* - * Count number of words in sentence, given the index of paragraph - * and index of sentence. - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @return int - */ - int CountWordInSentence(int pid, int sid) const; - - /* - * Count number of words in sentence, given the global index - * of the sentence - * - * @param[in] global_sid the global index of a sentence - * @return int number of sentence - */ - int CountWordInSentence(int global_sid) const; - - /* - * Count number of words in paragraph - * - * @param[in] pid the index of paragraph - * @return int number of words in paragraph if legal - * pid is given, otherwise -1 - */ - int CountWordInParagraph(int pid) const; - - /* - * Count total number of words in paragraph - * - * @return int number of words - */ - int CountWordInDocument() const; - - /* - * Get content of paragraph and store it in string - * - * @param[in] pid the index of paragraph - * @param[out] strParagraph the output string - * @return int 0 on success, otherwise -1 - */ - int GetParagraph(int pid, string & strParagraph) const; - - /* - * Get content of paragraph - * - * @param[in] pid the index of paragraph - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetParagraph(int pid) const; - - /* - * Get content of sentence - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetSentence(int pid, int sid) const; - - /* - * Get content of sentence, given the sentence's global index - * - * @param[in] global_sid the global index of the sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetSentence(int global_sid) const; - - /* - * Get word content - * - * @param[in] pid the index of paragraph in document - * @param[in] sid the index of sentence in paragraph - * @param[in] wid the index of word in sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int pid, int sid, int wid) const; - - /* - * Get word content, given the global sentence index - * - * @param[in] global_sid the global index of the sentence - * @param[in] wid the index of word in sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int global_sid, int wid) const; - - /* - * Get word content, given the global index of word - * - * @param[in] global_wid the global index of the sentence - * @return const char * the pointer to the string, NULL on failure - */ - const char * GetWord(int glabal_wid) const; - - /* - * Get word's postag - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int pid, int sid, int wid) const; - - /* - * Get word's postag - * - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int global_sid, int wid) const; - - /* - * Get word's postag, given the global index of the word in the document. - * - * @param[in] global_wid the global index of the word. - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetPOS(int global_wid) const; - - /* - * Get word's NER tag - * - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return const char * the pointer to the tag, NULL on failure. - */ - const char * GetNE(int pid, int sid, int wid) const; - - /* - * Get word's NER tag, given the global index of sentence in the document. - * - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return const char * the pointer to the tag, NULL on failure. - */ - const char * GetNE(int global_sid, int wid) const; - - /* - * Get word's NER, given the global index of the word in the document. - * - * @param[in] global_wid the global index of the word. - * @return const char * the pointer to the string, NULL on failure. - */ - const char * GetNE(int glabalWordIdx) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return int 0 on success, otherwise -1 - */ - int GetWSD(pair & WSD_explanation, - int pid, - int sid, - int wid) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return int 0 on success, -1 on illegal index - */ - int GetWSD(pair & WSD_explanation, - int global_sid, - int wid) const; - - /* - * Get word's WSD result (WSD module is under construction) - * - * @param[out] WSD_explanation the explanation of the WSD - * @param[in] global_wid the global index of sentence - * @return int 0 on success, -1 on illegal index - */ - int GetWSD(pair & WSD_explanation, - int global_wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] pid the index of paragraph - * @param[in] sid the index of sentence - * @param[in] wid the index of word - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair & parent_relation, - int pid, - int sid, - int wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] global_sid the global index of sentence - * @param[in] wid the index of the word - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair & parent_relation, - int global_sid, - int wid) const; - - /* - * Get word's parsing result - * - * @param[out] parent_relation the (parent, relation) pair - * @param[in] global_wid the global index of sentence - * @return int 0 on success, -1 on illegal index - */ - int GetParse(pair &parent_relation, - int glabal_wid) const; - - /* - * Get sentences from paragraph - * - * @param[out] vecSentence the output vector - * @param[in] paragraphIdx the index to the paragraph - */ - int GetSentencesFromParagraph(vector & vecSentence, - int paragraphIdx) const; - - /* - * Get sentences from paragraph - * - * @param[out] vectSentence the output vector - * @param[in] paragraphIdx the index to the paragraph - */ - int GetSentencesFromParagraph(vector &vecSents, - int paragraphIdx) const; - - int SetSentencesToParagraph(const vector &vecSents, - int paragraphIdx); - - /* - * Get words from sentence - * - * @param[out] vecWord the word vector - * @param[in] paragraphIdx the index of paragraph - * @param[in] sentenceIdx the index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int paragraphIdx, - int sentenceIdx) const; - - /* - * Get words from sentence - * - * @param[out] vecWord the word vector - * @param[in] globalSentIdx the global index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int globalSentIdx) const; - - /* - * Get words from sentence, std::string interface - * - * @param[out] vecWord the word vector - * @param[in] paragraphIdx the index of paragraph - * @param[in] sentenceIdx the index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int paragraphIdx, - int sentenceIdx) const; - - /* - * Get words from sentence, std::string interface - * - * @param[out] vecWord the word vector - * @param[in] globalSentIdx the global index of sentence - */ - int GetWordsFromSentence(vector &vecWord, - int globalSentIdx) const; - - /* - * Set word to sentence - * - * @param[in] vecWord the words - * @param[in] paragraphIdx the index of paragraph - * @param[in] sentenceIdx the index of sentence - */ - int SetWordsToSentence(const vector &vecWord, - int paragraphIdx, - int sentenceIdx); - - /* - * Set word to sentence - * - * @param[in] vecWord the words - * @param[in] sentenceIdx the global index of sentence - */ - int SetWordsToSentence(const vector &vecWord, - int sentenceIdx); - - // for POS tagging - int GetPOSsFromSentence(vector & vecPOS, - int paragraphIdx, - int sentenceIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int globalSentIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int paragraphIdx, - int sentenceIdx) const; - - int GetPOSsFromSentence(vector & vecPOS, - int globalSentIdx) const; - - int SetPOSsToSentence(const vector & vecPOS, - int paragraphIdx, - int sentenceIdx); - - int SetPOSsToSentence(const vector & vecPOS, - int sentenceIdx); - - // for NE - int GetNEsFromSentence(vector &vecNE, - int paragraphIdx, - int sentenceIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int globalSentIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int paragraphIdx, - int sentenceIdx) const; - - int GetNEsFromSentence(vector &vecNE, - int globalSentIdx) const; - - int SetNEsToSentence(const vector &vecNE, - int paragraphIdx, - int sentenceIdx); - - int SetNEsToSentence(const vector &vecNE, - int sentenceIdx); - - int GetWSDsFromSentence(vector &vecWSD, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDsFromSentence(vector &vecWSD, - int sentenceIdx) const; - - int SetWSDsToSentence(const vector &vecWSD, - int paragraphIdx, - int sentenceIdx); - - int SetWSDsToSentence(const vector & vecWSD, - int sentenceIdx); - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int sentenceIdx) const; - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx) const; - - int GetWSDExplainsFromSentence(vector &vecWSDExplain, - int sentenceIdx) const; - - int SetWSDExplainsToSentence(const vector &vecWSDExplain, - int paragraphIdx, - int sentenceIdx); + XML4NLP(); + virtual ~XML4NLP(); + + // -------------------------------------------------------------- + // Functions for DOM Tree Creation + // -------------------------------------------------------------- + /* + * Create DOM from file, read in each line of the file and store + * them in the xml tree. + * + * @param[in] filename the filename + */ + int CreateDOMFromFile(const char * filename); + + /* + * Create DOM from raw string text. + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int CreateDOMFromString(const std::string & str); + + /* + * A wrapper of CreateDOMFromString(const std::string & str); + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int CreateDOMFromString(const char * str); + + /* + * Load XML DOM from file + * + * @param[in] filename the file name + * @return int 0 on success, otherwise -1 + */ + int LoadXMLFromFile(const char * fileName); + + /* + * Load XML DOM from string + * + * @param[in] str the string + * @return int 0 on success, otherwise -1 + */ + int LoadXMLFromString(const char * str); + + /* + * Load XML DOM from string + * + * @param[in] str the string + */ + int LoadXMLFromString(const std::string & str); + + /* + * Clear the DOM tree + */ + void ClearDOM(); + + /* + * Save the DOM tree to file + * + * @param[in] filename the filename + * @return int 0 on success, otherwise -1 + */ + int SaveDOM(const char * fileName); + + /* + * Save the DOM tree to strin + * + * @param[out] strDocument the str + */ + void SaveDOM(string &strDocument) const; + + /* + * Get attributes value in `` + * + * @param[in] note_name the name of the attribute in note + * @return bool return true on `` exists and attributes + * value equals "y", otherwise false. + */ + bool QueryNote(const char * note_name) const; + + /* + * Set attributes value in `` to "y" + * + * @param[in] note_name the name of the attribute in note + * @return int return 0 + */ + int SetNote(const char * note_name); + + /* + * Set attributes value in `` to "n" + * + * @param[in] cszNoteName the note name + * @return int return 0 + */ + int ClearNote(const char * note_name); + + /* + * Set all nlp attributes value in `` to "n" + * + * @param[in] cszNoteName the note name + * @return int return 0 + */ + void ClearAllNote(); + + // counting operation + /* + * count number of paragraph in document + * + * @return int the number of paragraph + */ + int CountParagraphInDocument() const; + + /* + * conut number of sentence in paragraph + * + * @param[in] pid the index number of paragraph + * @return int the number of paragraph + */ + int CountSentenceInParagraph(int pid) const; + + /* + * count number of all sentences in document + * + * @return int the number of all sentences in document + */ + int CountSentenceInDocument() const; + + /* + * Count number of words in sentence, given the index of paragraph + * and index of sentence. + * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @return int + */ + int CountWordInSentence(int pid, int sid) const; + + /* + * Count number of words in sentence, given the global index + * of the sentence + * + * @param[in] global_sid the global index of a sentence + * @return int number of sentence + */ + int CountWordInSentence(int global_sid) const; + + /* + * Count number of words in paragraph + * + * @param[in] pid the index of paragraph + * @return int number of words in paragraph if legal + * pid is given, otherwise -1 + */ + int CountWordInParagraph(int pid) const; + + /* + * Count total number of words in paragraph + * + * @return int number of words + */ + int CountWordInDocument() const; + + /* + * Get content of paragraph and store it in string + * + * @param[in] pid the index of paragraph + * @param[out] strParagraph the output string + * @return int 0 on success, otherwise -1 + */ + int GetParagraph(int pid, string & strParagraph) const; + + /* + * Get content of paragraph + * + * @param[in] pid the index of paragraph + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetParagraph(int pid) const; + + /* + * Get content of sentence + * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetSentence(int pid, int sid) const; + + /* + * Get content of sentence, given the sentence's global index + * + * @param[in] global_sid the global index of the sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetSentence(int global_sid) const; + + /* + * Get word content + * + * @param[in] pid the index of paragraph in document + * @param[in] sid the index of sentence in paragraph + * @param[in] wid the index of word in sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int pid, int sid, int wid) const; + + /* + * Get word content, given the global sentence index + * + * @param[in] global_sid the global index of the sentence + * @param[in] wid the index of word in sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int global_sid, int wid) const; + + /* + * Get word content, given the global index of word + * + * @param[in] global_wid the global index of the sentence + * @return const char * the pointer to the string, NULL on failure + */ + const char * GetWord(int glabal_wid) const; + + /* + * Get word's postag + * + * @param[in] pid the index of the paragraph + * @param[in] sid the index of the sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int pid, int sid, int wid) const; + + /* + * Get word's postag + * + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int global_sid, int wid) const; + + /* + * Get word's postag, given the global index of the word in the document. + * + * @param[in] global_wid the global index of the word. + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetPOS(int global_wid) const; + + /* + * Get word's NER tag + * + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return const char * the pointer to the tag, NULL on failure. + */ + const char * GetNE(int pid, int sid, int wid) const; + + /* + * Get word's NER tag, given the global index of sentence in the document. + * + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return const char * the pointer to the tag, NULL on failure. + */ + const char * GetNE(int global_sid, int wid) const; + + /* + * Get word's NER, given the global index of the word in the document. + * + * @param[in] global_wid the global index of the word. + * @return const char * the pointer to the string, NULL on failure. + */ + const char * GetNE(int glabalWordIdx) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return int 0 on success, otherwise -1 + */ + int GetWSD(pair & WSD_explanation, + int pid, + int sid, + int wid) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return int 0 on success, -1 on illegal index + */ + int GetWSD(pair & WSD_explanation, + int global_sid, + int wid) const; + + /* + * Get word's WSD result (WSD module is under construction) + * + * @param[out] WSD_explanation the explanation of the WSD + * @param[in] global_wid the global index of sentence + * @return int 0 on success, -1 on illegal index + */ + int GetWSD(pair & WSD_explanation, + int global_wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] pid the index of paragraph + * @param[in] sid the index of sentence + * @param[in] wid the index of word + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair & parent_relation, + int pid, + int sid, + int wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] global_sid the global index of sentence + * @param[in] wid the index of the word + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair & parent_relation, + int global_sid, + int wid) const; + + /* + * Get word's parsing result + * + * @param[out] parent_relation the (parent, relation) pair + * @param[in] global_wid the global index of sentence + * @return int 0 on success, -1 on illegal index + */ + int GetParse(pair &parent_relation, + int glabal_wid) const; + + /* + * Get sentences from paragraph + * + * @param[out] vecSentence the output vector + * @param[in] paragraphIdx the index to the paragraph + */ + int GetSentencesFromParagraph(vector & vecSentence, + int paragraphIdx) const; + + /* + * Get sentences from paragraph + * + * @param[out] vectSentence the output vector + * @param[in] paragraphIdx the index to the paragraph + */ + int GetSentencesFromParagraph(vector &vecSents, + int paragraphIdx) const; + + int SetSentencesToParagraph(const vector &vecSents, + int paragraphIdx); + + /* + * Get words from sentence + * + * @param[out] vecWord the word vector + * @param[in] paragraphIdx the index of paragraph + * @param[in] sentenceIdx the index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int paragraphIdx, + int sentenceIdx) const; + + /* + * Get words from sentence + * + * @param[out] vecWord the word vector + * @param[in] globalSentIdx the global index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int globalSentIdx) const; + + /* + * Get words from sentence, std::string interface + * + * @param[out] vecWord the word vector + * @param[in] paragraphIdx the index of paragraph + * @param[in] sentenceIdx the index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int paragraphIdx, + int sentenceIdx) const; + + /* + * Get words from sentence, std::string interface + * + * @param[out] vecWord the word vector + * @param[in] globalSentIdx the global index of sentence + */ + int GetWordsFromSentence(vector &vecWord, + int globalSentIdx) const; + + /* + * Set word to sentence + * + * @param[in] vecWord the words + * @param[in] paragraphIdx the index of paragraph + * @param[in] sentenceIdx the index of sentence + */ + int SetWordsToSentence(const vector &vecWord, + int paragraphIdx, + int sentenceIdx); + + /* + * Set word to sentence + * + * @param[in] vecWord the words + * @param[in] sentenceIdx the global index of sentence + */ + int SetWordsToSentence(const vector &vecWord, + int sentenceIdx); + + // for POS tagging + int GetPOSsFromSentence(vector & vecPOS, + int paragraphIdx, + int sentenceIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int globalSentIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int paragraphIdx, + int sentenceIdx) const; + + int GetPOSsFromSentence(vector & vecPOS, + int globalSentIdx) const; + + int SetPOSsToSentence(const vector & vecPOS, + int paragraphIdx, + int sentenceIdx); + + int SetPOSsToSentence(const vector & vecPOS, + int sentenceIdx); + + // for NE + int GetNEsFromSentence(vector &vecNE, + int paragraphIdx, + int sentenceIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int globalSentIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int paragraphIdx, + int sentenceIdx) const; + + int GetNEsFromSentence(vector &vecNE, + int globalSentIdx) const; + + int SetNEsToSentence(const vector &vecNE, + int paragraphIdx, + int sentenceIdx); + + int SetNEsToSentence(const vector &vecNE, + int sentenceIdx); + + int GetWSDsFromSentence(vector &vecWSD, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDsFromSentence(vector &vecWSD, + int sentenceIdx) const; + + int SetWSDsToSentence(const vector &vecWSD, + int paragraphIdx, + int sentenceIdx); + + int SetWSDsToSentence(const vector & vecWSD, + int sentenceIdx); + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int sentenceIdx) const; + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx) const; + + int GetWSDExplainsFromSentence(vector &vecWSDExplain, + int sentenceIdx) const; + + int SetWSDExplainsToSentence(const vector &vecWSDExplain, + int paragraphIdx, + int sentenceIdx); - int SetWSDExplainsToSentence(const vector &vecWSDExplain, - int sentenceIdx); + int SetWSDExplainsToSentence(const vector &vecWSDExplain, + int sentenceIdx); - // for Parser - int GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const; + // for Parser + int GetParsesFromSentence(vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx) const; - int GetParsesFromSentence(vector< pair > &vecParse, - int sentenceIdx) const; + int GetParsesFromSentence(vector< pair > &vecParse, + int sentenceIdx) const; - int SetParsesToSentence(const vector< pair > &vecParse, - int paragraphIdx, - int sentenceIdx); + int SetParsesToSentence(const vector< pair > &vecParse, + int paragraphIdx, + int sentenceIdx); - int SetParsesToSentence(const vector< pair > &vecParse, - int sentenceIdx); + int SetParsesToSentence(const vector< pair > &vecParse, + int sentenceIdx); - int SetParsesToSentence(const vector &vecHead, - const vector &vecRel, - int paragraphIdx, - int sentenceIdx); + int SetParsesToSentence(const vector &vecHead, + const vector &vecRel, + int paragraphIdx, + int sentenceIdx); - int SetParsesToSentence(const vector &vecHead, - const vector &vecRel, - int sentenceIdx); + int SetParsesToSentence(const vector &vecHead, + const vector &vecRel, + int sentenceIdx); - // for text summarization - const char* GetTextSummary() const; - int SetTextSummary(const char* textSum); + // for text summarization + const char* GetTextSummary() const; + int SetTextSummary(const char* textSum); - // for text classification - const char* GetTextClass() const; - int SetTextClass(const char* textClass); + // for text classification + const char* GetTextClass() const; + int SetTextClass(const char* textClass); - // for SRL - int CountPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx) const; + // for SRL + int CountPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx) const; - int CountPredArgToWord(int globalSentIdx, - int wordIdx) const; + int CountPredArgToWord(int globalSentIdx, + int wordIdx) const; - int CountPredArgToWord(int globalWordIdx) const; + int CountPredArgToWord(int globalWordIdx) const; - int GetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int globalSentIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int globalSentIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int globalWordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int globalWordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int GetPredArgToWord(int sentenceIdx, - int wordIdx, - vector &vecType, - vector< pair > &vecBegEnd) const; + int GetPredArgToWord(int sentenceIdx, + int wordIdx, + vector &vecType, + vector< pair > &vecBegEnd) const; - int SetPredArgToWord(int paragraphIdx, - int sentenceIdx, - int wordIdx, - const vector &vecType, - const vector< pair > &vecBegEnd); + int SetPredArgToWord(int paragraphIdx, + int sentenceIdx, + int wordIdx, + const vector &vecType, + const vector< pair > &vecBegEnd); - int SetPredArgToWord(int sentenceIdx, - int wordIdx, - const vector &vecType, - const vector< pair > &vecBegEnd); + int SetPredArgToWord(int sentenceIdx, + int wordIdx, + const vector &vecType, + const vector< pair > &vecBegEnd); - // for coreference resolution - int CountEntity() const; + // for coreference resolution + int CountEntity() const; - int CountMentionInEntity(int entityIdx); + int CountMentionInEntity(int entityIdx); - int GetMentionOfEntity(vector< pair > &vecMention, - int entityIdx) const; + int GetMentionOfEntity(vector< pair > &vecMention, + int entityIdx) const; - int GetCoreference(vector< vector< pair > >& vecCoref) const; + int GetCoreference(vector< vector< pair > >& vecCoref) const; - int SetCoreference(const vector< vector< pair > >& vecCoref); + int SetCoreference(const vector< vector< pair > >& vecCoref); public: - int MapGlobalSentIdx2paraIdx_sentIdx(int sentenceIdx, - pair ¶Idx_sentIdx) const; + int DecodeGlobalId(int global_sid, int & pid, int & sid) const; - int MapGlobalWordIdx2paraIdx_sentIdx_wordIdx(int globalWordIdx, - int ¶Idx, - int &sentIdx, - int &wordIdx) const; + int DecodeGlobalId(int globalWordIdx, + int ¶Idx, + int &sentIdx, + int &wordIdx) const; - int CheckRange(int paragraphIdx, - int sentenceIdx, - int wordIdx) const; + int CheckRange(int paragraphIdx, + int sentenceIdx, + int wordIdx) const; - int CheckRange(int paragraphIdx, - int sentenceIdx) const; + int CheckRange(int paragraphIdx, + int sentenceIdx) const; - int CheckRange(int paragraphIdx) const; + int CheckRange(int paragraphIdx) const; - void ReportTiXmlDocErr() const; + void ReportTiXmlDocErr() const; - int BuildParagraph(string &strParagraph, - int paragraphIdx); + int BuildParagraph(string &strParagraph, + int paragraphIdx); private: - typedef struct { - TiXmlElement *wordPtr; - } Word_t; - - typedef struct { - vector words; - TiXmlElement * sentencePtr; - } Sentence_t; - - typedef struct { - vector sentences; - TiXmlElement * paragraphPtr; - } Paragraph_t; - - typedef struct { - vector paragraphs; - TiXmlElement * documentPtr; - } Document_t; - - typedef struct { - TiXmlElement *nodePtr; - } Note, Summary, TextClass; - - typedef struct { - TiXmlElement *mentionPtr; - } Mention; - - typedef struct { - vector vecMention; - TiXmlElement *entityPtr; - } Entity; - - typedef struct { - vector vecEntity; - TiXmlElement *nodePtr; - } Coref; - + typedef struct { + TiXmlElement *wordPtr; + } Word; + + typedef struct { + vector words; + TiXmlElement * sentencePtr; + } Sentence; + + typedef struct { + vector sentences; + TiXmlElement * paragraphPtr; + } Paragraph; + + typedef struct { + vector paragraphs; + TiXmlElement * documentPtr; + } Document; + + typedef struct { + TiXmlElement *nodePtr; + } Note, Summary, TextClass; + + typedef struct { + TiXmlElement *mentionPtr; + } Mention; + + typedef struct { + vector vecMention; + TiXmlElement *entityPtr; + } Entity; + + typedef struct { + vector vecEntity; + TiXmlElement *nodePtr; + } Coref; + + typedef std::pair WSDResult; + typedef std::pair ParseResult; private: - // initialization during loading txt - int BuildDOMFrame(); - - // initialization during loading xml - int InitXmlStructure(); + // initialization during loading txt + int BuildDOMFrame(); - void CheckNoteForOldLtml(); + // initialization during loading xml + int InitXmlStructure(); - int InitXmlDocument(Document_t & document); + int InitXmlDocument(Document & document); - int InitXmlParagraph(vector & vecParagraph, - TiXmlElement *paragraphPtr); + int InitXmlParagraph(vector & vecParagraph, + TiXmlElement *paragraphPtr); - int InitXmlSentence(vector &vecSentence, - TiXmlElement *stnsPtr); + int InitXmlSentence(vector &vecSentence, + TiXmlElement *stnsPtr); - int InitXmlWord(vector &vecWord, - TiXmlElement *wordPtr); + int InitXmlWord(vector &vecWord, + TiXmlElement *wordPtr); - int InitXmlCoref(Coref &coref); + int InitXmlCoref(Coref &coref); - int InitXmlEntity(vector &vecEntity, - TiXmlElement *entityPtr); + int InitXmlEntity(vector &vecEntity, + TiXmlElement *entityPtr); - int InitXmlMention(vector &vecMention, - TiXmlElement *mentionPtr); + int InitXmlMention(vector &vecMention, + TiXmlElement *mentionPtr); - int GetInfoFromSentence(vector &vecInfo, - int paragraphIdx, - int sentenceIdx, - const char *attrName) const; + int GetInfoFromSentence(vector &vecInfo, + int paragraphIdx, + int sentenceIdx, + const char *attrName) const; - int GetInfoFromSentence(vector &vecInfo, - int sentenceIdx, - const char *attrName) const; + int GetInfoFromSentence(vector &vecInfo, + int sentenceIdx, + const char *attrName) const; - int GetInfoFromSentence(vector &vec, - int paragraphIdx, - int sentenceIdx, - const char* attrName) const; + int GetInfoFromSentence(vector &vec, + int paragraphIdx, + int sentenceIdx, + const char* attrName) const; - int GetInfoFromSentence(vector &vec, - int sentenceIdx, - const char * attrName) const; + int GetInfoFromSentence(vector &vec, + int sentenceIdx, + const char * attrName) const; - int SetInfoToSentence(const vector &vec, - int paragraphIdx, - int sentenceIdx, - const char * attrName); + int SetInfoToSentence(const vector &vec, + int paragraphIdx, + int sentenceIdx, + const char * attrName); - int SetInfoToSentence(const vector &vec, - int sentenceIdx, - const char * attrName); + int SetInfoToSentence(const vector &vec, + int sentenceIdx, + const char * attrName); - int SetInfoToSentence(const vector &vec, - int paragraphIdx, - int sentenceIdx, - const char* attrName); + int SetInfoToSentence(const vector &vec, + int paragraphIdx, + int sentenceIdx, + const char* attrName); - int SetInfoToSentence(const vector &vec, - int sentenceIdx, - const char* attrName); + int SetInfoToSentence(const vector &vec, + int sentenceIdx, + const char* attrName); - bool LTMLValidation(); - /*-------------------------------------------*/ + bool LTMLValidation(); + /*-------------------------------------------*/ private: - vector m_vecBegStnsIdxOfPara; - vector m_vecBegWordIdxOfStns; + vector m_vecBegStnsIdxOfPara; + vector m_vecBegWordIdxOfStns; - Document_t m_document; - Note m_note; - Summary m_summary; - TextClass m_textclass; - Coref m_coref; + Document document; + Note note; + Summary summary; + TextClass textclass; + Coref coref; - TiXmlDocument m_tiXmlDoc; + TiXmlDocument m_tiXmlDoc; - /*-------------------------------------------*/ + /*-------------------------------------------*/ private: - static const char * const TAG_DOC; - static const char * const TAG_NOTE; - static const char * const TAG_SUM; - static const char * const TAG_CLASS; - static const char * const TAG_COREF; - static const char * const TAG_COREF_MENT; - static const char * const TAG_COREF_CR; - static const char * const TAG_PARA; - static const char * const TAG_SENT; - static const char * const TAG_WORD; - static const char * const TAG_CONT; //sent, word - static const char * const TAG_POS; - static const char * const TAG_NE; - static const char * const TAG_WSD; - static const char * const TAG_WSD_EXP; - static const char * const TAG_PSR_PARENT; - static const char * const TAG_PSR_RELATE; - static const char * const TAG_SRL_ARG; - static const char * const TAG_SRL_TYPE; - static const char * const TAG_BEGIN; // cr, srl - static const char * const TAG_END; // cr, srl - static const char * const TAG_ID; // para, sent, word + static const char * const TAG_DOC; + static const char * const TAG_NOTE; + static const char * const TAG_SUM; + static const char * const TAG_CLASS; + static const char * const TAG_COREF; + static const char * const TAG_COREF_MENT; + static const char * const TAG_COREF_CR; + static const char * const TAG_PARA; + static const char * const TAG_SENT; + static const char * const TAG_WORD; + static const char * const TAG_CONT; //sent, word + static const char * const TAG_POS; + static const char * const TAG_NE; + static const char * const TAG_WSD; + static const char * const TAG_WSD_EXP; + static const char * const TAG_PSR_PARENT; + static const char * const TAG_PSR_RELATE; + static const char * const TAG_SRL_ARG; + static const char * const TAG_SRL_TYPE; + static const char * const TAG_BEGIN; // cr, srl + static const char * const TAG_END; // cr, srl + static const char * const TAG_ID; // para, sent, word }; -#endif // end for __LTP_XML4NLP_H__ +#endif // end for __LTP_XML4NLP_H__ diff --git a/src/ner/featurespace.h b/src/ner/featurespace.h index dd2a782d9..53983685d 100644 --- a/src/ner/featurespace.h +++ b/src/ner/featurespace.h @@ -11,113 +11,115 @@ namespace ner { class FeatureSpaceIterator { public: - FeatureSpaceIterator() : - _dicts(NULL), - _i(0), - _state(0) { - // should be careful about the empty dicts + FeatureSpaceIterator() + : _dicts(NULL), + _num_dicts(-1), + _i(0), + _state(0) { + // should be careful about the empty dicts + } + + // initialize the iterator with dicts and number of dicts + FeatureSpaceIterator(utility::SmartMap * dicts, int num_dicts) + : _dicts(dicts), + _num_dicts(num_dicts), + _i(0), + _state(0) { + ++ (*this); + } + + ~FeatureSpaceIterator() { + } + + const char * key() { return _j.key(); } + int id() { return (*_j.value()); } + int tid() { return _i; } + + bool operator ==(const FeatureSpaceIterator & other) const { + return ((_dicts + _i) == other._dicts); + } + + bool operator !=(const FeatureSpaceIterator & other) const { + return ((_dicts + _i) != other._dicts); + } + + FeatureSpaceIterator & operator = (const FeatureSpaceIterator & other) { + if (this != &other) { + _dicts = other._dicts; + _i = other._i; + _state = other._state; + _num_dicts = other._num_dicts; } - // initialize the iterator with dicts and number of dicts - FeatureSpaceIterator(utility::SmartMap * dicts, int num_dicts) : - _dicts(dicts), - _num_dicts(num_dicts), - _i(0), - _state(0) { - ++ (*this); - } - - ~FeatureSpaceIterator() { - } - - const char * key() { return _j.key(); } - int id() { return (*_j.value()); } - int tid() { return _i; } - - bool operator ==(const FeatureSpaceIterator & other) const { - return ((_dicts + _i) == other._dicts); - } - - bool operator !=(const FeatureSpaceIterator & other) const { - return ((_dicts + _i) != other._dicts); - } - - FeatureSpaceIterator & operator = (const FeatureSpaceIterator & other) { - if (this != &other) { - _dicts = other._dicts; - _i = other._i; - _state = other._state; - } - - return *this; - } - - void operator ++() { - switch (_state) { - case 0: - for (_i = 0; ; ++ _i) { - if (_dicts[_i].begin() == _dicts[_i].end()) { - _state = 1; - return; - } - for (_j = _dicts[_i].begin(); _j != _dicts[_i].end(); ++ _j) { - _state = 1; - return; - case 1:; - } - } + return *this; + } + + void operator ++() { + switch (_state) { + case 0: + for (_i = 0; ; ++ _i) { + if (_dicts[_i].begin() == _dicts[_i].end()) { + _state = 1; + return; + } + for (_j = _dicts[_i].begin(); _j != _dicts[_i].end(); ++ _j) { + _state = 1; + return; + case 1:; + } } } + } - int _i; - int _state; - int _num_dicts; - utility::SmartMap::const_iterator _j; - utility::SmartMap * _dicts; + int _i; + int _state; + int _num_dicts; + utility::SmartMap::const_iterator _j; + utility::SmartMap * _dicts; }; class FeatureSpace { public: - FeatureSpace(int num_labels = 1); - ~FeatureSpace(); - - int retrieve(int tid, const char * key, bool create); - int index(int tid, const char * key, int lid = 0); - int index(int prev_lid, int lid); - int num_features(); - int dim(); - void set_num_labels(int num_labeles); - - /* - * dump the feature space to a output stream - * - * @param[in] ofs the output stream - */ - void dump(std::ostream & ofs); - - /* - * load the feature space from a input stream - * - * @param[in] num_labels the number of labels - * @param[in] ifs the input stream - */ - bool load(int num_labeles, std::istream & ifs); - - FeatureSpaceIterator begin() { - return FeatureSpaceIterator(dicts, _num_dicts); - } - - FeatureSpaceIterator end() { - return FeatureSpaceIterator(dicts + _num_dicts, _num_dicts); - } + FeatureSpace(int num_labels = 1); + ~FeatureSpace(); + + int retrieve(int tid, const char * key, bool create); + int index(int tid, const char * key, int lid = 0); + int index(int prev_lid, int lid); + int num_features(); + int dim(); + void set_num_labels(int num_labeles); + + /* + * dump the feature space to a output stream + * + * @param[in] ofs the output stream + */ + void dump(std::ostream & ofs); + + /* + * load the feature space from a input stream + * + * @param[in] num_labels the number of labels + * @param[in] ifs the input stream + */ + bool load(int num_labeles, std::istream & ifs); + + FeatureSpaceIterator begin() { + return FeatureSpaceIterator(dicts, _num_dicts); + } + + FeatureSpaceIterator end() { + return FeatureSpaceIterator(dicts + _num_dicts, _num_dicts); + } private: - int _offset; - int _num_labels; - int _num_dicts; - utility::SmartMap * dicts; + int _offset; + int _num_labels; + int _num_dicts; + utility::SmartMap * dicts; }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_FEATURE_SPACE_H__ diff --git a/src/ner/featurevec.h b/src/ner/featurevec.h index 96574fb97..1e0652652 100644 --- a/src/ner/featurevec.h +++ b/src/ner/featurevec.h @@ -6,32 +6,32 @@ namespace ner { struct FeatureVector { public: - FeatureVector () : n(0), idx(0), val(0) { - } + FeatureVector () : n(0), idx(0), val(0), loff(0) { + } - ~FeatureVector() { - } + ~FeatureVector() { + } - void clear() { - if (idx) { - delete [](idx); - idx = 0; - } + void clear() { + if (idx) { + delete [](idx); + idx = 0; + } - if (val) { - delete [](val); - val = 0; - } + if (val) { + delete [](val); + val = 0; } + } public: - int n; - int * idx; - double * val; - int loff; + int n; + int * idx; + double * val; + int loff; }; -} // end for namespace segmentor -} // end for namespace ltp +} // end for namespace segmentor +} // end for namespace ltp #endif // end for __LTP_SEGMENTOR_FEATRUE_VECTOR_H__ diff --git a/src/ner/instance.h b/src/ner/instance.h index 7df9423d9..e67b84e79 100644 --- a/src/ner/instance.h +++ b/src/ner/instance.h @@ -11,169 +11,168 @@ namespace ner { class Instance { public: - Instance() {} + Instance() {} - ~Instance() { - cleanup(); + ~Instance() { + cleanup(); + } + + inline size_t size() const { + return forms.size(); + } + + int num_errors() { + int len = size(); + if ((len != tagsidx.size()) || (len != predicted_tagsidx.size())) { + return len; } - inline size_t size() const { - return forms.size(); + int ret = 0; + for (int i = 0; i < len; ++ i) { + if (tagsidx[i] != predicted_tagsidx[i]) { + ++ ret; + } } - int num_errors() { - int len = size(); - if ((len != tagsidx.size()) || (len != predicted_tagsidx.size())) { - return len; - } + return ret; + } - int ret = 0; - for (int i = 0; i < len; ++ i) { - if (tagsidx[i] != predicted_tagsidx[i]) { - ++ ret; - } - } + int num_corrected_predicted_tags() { + int len = size(); + int ret = 0; - return ret; + for (int i = 0; i < len; ++ i) { + if (tagsidx[i] == predicted_tagsidx[i]) { + ++ ret; + } } - int num_corrected_predicted_tags() { - int len = size(); - int ret = 0; + return ret; + } - for (int i = 0; i < len; ++ i) { - if (tagsidx[i] == predicted_tagsidx[i]) { - ++ ret; - } - } + int num_gold_entities() { + int ret = 0; + if (entities.size() == 0) { + return size(); + } - return ret; + for (int i = 0; i < entities_tags.size(); ++ i) { + if (entities_tags[i] != "O") { + ++ ret; + } } - int num_gold_entities() { - int ret = 0; - if (entities.size() == 0) { - return size(); - } + return ret; + } - for (int i = 0; i < entities_tags.size(); ++ i) { - if (entities_tags[i] != "O") { - ++ ret; - } - } + int num_predicted_entities() { + int ret = 0; + if (predicted_entities.size() == 0) { + return size(); + } - return ret; + for (int i = 0; i < predicted_entities_tags.size(); ++ i) { + if (predicted_entities_tags[i] != "O") { + ++ ret; + } } - int num_predicted_entities() { - int ret = 0; - if (predicted_entities.size() == 0) { - return size(); - } + return ret; + } - for (int i = 0; i < predicted_entities_tags.size(); ++ i) { - if (predicted_entities_tags[i] != "O") { - ++ ret; - } - } + int num_recalled_entites() { + int len = 0; + int ret = 0; + int gold_len = 0, predicted_len = 0; - return ret; + for (int i = 0; i < entities.size(); ++ i) { + len += entities[i].size(); } - int num_recalled_entites() { - int len = 0; - int ret = 0; - int gold_len = 0, predicted_len = 0; - - for (int i = 0; i < entities.size(); ++ i) { - len += entities[i].size(); + for (int i = 0, j = 0; i < entities.size() && j < predicted_entities.size(); ) { + if ((entities[i] == predicted_entities[j]) && + (entities_tags[i] == predicted_entities_tags[j])) { + if (entities_tags[i] != "O") { + ++ ret; } - for (int i = 0, j = 0; i < entities.size() && j < predicted_entities.size(); ) { - if ((entities[i] == predicted_entities[j]) && - (entities_tags[i] == predicted_entities_tags[j])) { - if (entities_tags[i] != "O") { - ++ ret; - } - - gold_len += entities[i].size(); - predicted_len += predicted_entities[j].size(); - - ++ i; - ++ j; - } else { - gold_len += entities[i].size(); - predicted_len += predicted_entities[j].size(); - - ++ i; - ++ j; - - while (gold_len < len && predicted_len < len) { - if (gold_len < predicted_len) { - gold_len += entities[i].size(); - ++ i; - } else if (gold_len > predicted_len) { - predicted_len += predicted_entities[j].size(); - ++ j; - } else { - break; - } - } - } + gold_len += entities[i].size(); + predicted_len += predicted_entities[j].size(); + + ++ i; + ++ j; + } else { + gold_len += entities[i].size(); + predicted_len += predicted_entities[j].size(); + + ++ i; + ++ j; + + while (gold_len < len && predicted_len < len) { + if (gold_len < predicted_len) { + gold_len += entities[i].size(); + ++ i; + } else if (gold_len > predicted_len) { + predicted_len += predicted_entities[j].size(); + ++ j; + } else { + break; + } } - - return ret; + } } - int cleanup() { - int len = 0; - if ((len = uni_features.total_size()) > 0) { - int d1 = uni_features.nrows(); - int d2 = uni_features.ncols(); - - for (int i = 0; i < d1; ++ i) { - if (uni_features[i][0]) { - uni_features[i][0]->clear(); - } - for (int j = 0; j < d2; ++ j) { - if (uni_features[i][j]) { - delete uni_features[i][j]; - } - } - } + return ret; + } + + int cleanup() { + if (uni_features.total_size() > 0) { + int d1 = uni_features.nrows(); + int d2 = uni_features.ncols(); + + for (int i = 0; i < d1; ++ i) { + if (uni_features[i][0]) { + uni_features[i][0]->clear(); + } + for (int j = 0; j < d2; ++ j) { + if (uni_features[i][j]) { + delete uni_features[i][j]; + } } + } + } - uni_features.dealloc(); - uni_scores.dealloc(); - bi_scores.dealloc(); + uni_features.dealloc(); + uni_scores.dealloc(); + bi_scores.dealloc(); - features.zero(); - predicted_features.zero(); + features.zero(); + predicted_features.zero(); - return 0; - } + return 0; + } public: - std::vector< std::string > raw_forms; - std::vector< std::string > forms; - std::vector< std::string > postags; - std::vector< std::string > tags; - std::vector< int > tagsidx; - std::vector< std::string > predicted_tags; - std::vector< int > predicted_tagsidx; - std::vector< std::string > entities; - std::vector< std::string > entities_tags; - std::vector< std::string > predicted_entities; - std::vector< std::string > predicted_entities_tags; - - math::SparseVec features; /*< the gold features */ - math::SparseVec predicted_features; /*< the predicted features */ - - math::Mat< FeatureVector *> uni_features; - math::Mat< double > uni_scores; - math::Mat< double > bi_scores; + std::vector< std::string > raw_forms; + std::vector< std::string > forms; + std::vector< std::string > postags; + std::vector< std::string > tags; + std::vector< int > tagsidx; + std::vector< std::string > predicted_tags; + std::vector< int > predicted_tagsidx; + std::vector< std::string > entities; + std::vector< std::string > entities_tags; + std::vector< std::string > predicted_entities; + std::vector< std::string > predicted_entities_tags; + + math::SparseVec features; /*< the gold features */ + math::SparseVec predicted_features; /*< the predicted features */ + + math::Mat< FeatureVector *> uni_features; + math::Mat< double > uni_scores; + math::Mat< double > bi_scores; }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_INSTANCE_H__ diff --git a/src/ner/model.cpp b/src/ner/model.cpp index eb9bf8599..b44bd8193 100644 --- a/src/ner/model.cpp +++ b/src/ner/model.cpp @@ -9,77 +9,78 @@ Model::Model() { Model::~Model() { } -void Model::save(std::ostream & ofs) { - // write a signature into the file - char chunk[16] = {'o','t','c','w','s', '\0'}; - ofs.write(chunk, 16); +void +Model::save(std::ostream & ofs) { + // write a signature into the file + char chunk[16] = {'o','t','n','e','r', '\0'}; + ofs.write(chunk, 16); - int off = ofs.tellp(); + int off = ofs.tellp(); - unsigned labels_offset = 0; - unsigned lexicon_offset = 0; - unsigned feature_offset = 0; - unsigned parameter_offset = 0; + unsigned labels_offset = 0; + unsigned lexicon_offset = 0; + unsigned feature_offset = 0; + unsigned parameter_offset = 0; - write_uint(ofs, 0); // the label offset - write_uint(ofs, 0); // the cluster lexicon offset - write_uint(ofs, 0); // the features offset - write_uint(ofs, 0); // the parameter offset + write_uint(ofs, 0); // the label offset + write_uint(ofs, 0); // the cluster lexicon offset + write_uint(ofs, 0); // the features offset + write_uint(ofs, 0); // the parameter offset - labels_offset = ofs.tellp(); - labels.dump(ofs); + labels_offset = ofs.tellp(); + labels.dump(ofs); - lexicon_offset = ofs.tellp(); - cluster_lexicon.dump(ofs); + lexicon_offset = ofs.tellp(); + cluster_lexicon.dump(ofs); - feature_offset = ofs.tellp(); - space.dump(ofs); + feature_offset = ofs.tellp(); + space.dump(ofs); - parameter_offset = ofs.tellp(); - param.dump(ofs); + parameter_offset = ofs.tellp(); + param.dump(ofs); - ofs.seekp(off); - write_uint(ofs, labels_offset); - write_uint(ofs, lexicon_offset); - write_uint(ofs, feature_offset); - write_uint(ofs, parameter_offset); + ofs.seekp(off); + write_uint(ofs, labels_offset); + write_uint(ofs, lexicon_offset); + write_uint(ofs, feature_offset); + write_uint(ofs, parameter_offset); } bool Model::load(std::istream & ifs) { - char chunk[16]; - ifs.read(chunk, 16); - - if (strcmp(chunk, "otcws")) { - return false; - } - - unsigned labels_offset = read_uint(ifs); - unsigned lexicon_offset = read_uint(ifs); - unsigned feature_offset = read_uint(ifs); - unsigned parameter_offset = read_uint(ifs); - - ifs.seekg(labels_offset); - if (!labels.load(ifs)) { - return false; - } - - ifs.seekg(lexicon_offset); - if (!cluster_lexicon.load(ifs)) { - return false; - } - - ifs.seekg(feature_offset); - if (!space.load(labels.size(), ifs)) { - return false; - } - - ifs.seekg(parameter_offset); - if (!param.load(ifs)) { - return false; - } - - return true; + char chunk[16]; + ifs.read(chunk, 16); + + if (strcmp(chunk, "otner")) { + return false; + } + + unsigned labels_offset = read_uint(ifs); + unsigned lexicon_offset = read_uint(ifs); + unsigned feature_offset = read_uint(ifs); + unsigned parameter_offset = read_uint(ifs); + + ifs.seekg(labels_offset); + if (!labels.load(ifs)) { + return false; + } + + ifs.seekg(lexicon_offset); + if (!cluster_lexicon.load(ifs)) { + return false; + } + + ifs.seekg(feature_offset); + if (!space.load(labels.size(), ifs)) { + return false; + } + + ifs.seekg(parameter_offset); + if (!param.load(ifs)) { + return false; + } + + return true; } -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp diff --git a/src/ner/model.h b/src/ner/model.h index 7534a1814..ec898b70e 100644 --- a/src/ner/model.h +++ b/src/ner/model.h @@ -13,50 +13,50 @@ using namespace ltp::utility; class Model { public: - Model(); - ~Model(); - - /* - * get number of labels; - * - * @return int the number of labels - */ - inline int num_labels(void) { - return labels.size(); - } - - /* - * save the model to a output stream - * - * @param[out] ofs the output stream - */ - void save(std::ostream & ofs); - - /* - * load the model from an input stream - * - * @param[in] ifs the input stream - */ - bool load(std::istream & ifs); + Model(); + ~Model(); + + /* + * get number of labels; + * + * @return int the number of labels + */ + inline int num_labels(void) { + return labels.size(); + } + + /* + * save the model to a output stream + * + * @param[out] ofs the output stream + */ + void save(std::ostream & ofs); + + /* + * load the model from an input stream + * + * @param[in] ifs the input stream + */ + bool load(std::istream & ifs); public: - IndexableSmartMap labels; - FeatureSpace space; - Parameters param; + IndexableSmartMap labels; + FeatureSpace space; + Parameters param; - SmartMap cluster_lexicon; + SmartMap cluster_lexicon; private: - void write_uint(std::ostream & out, unsigned int val) { - out.write(reinterpret_cast(&val), sizeof(unsigned int)); - } - - unsigned int read_uint(std::istream & in) { - char p[4]; - in.read(reinterpret_cast(p), sizeof(unsigned int)); - return *reinterpret_cast(p); - } + void write_uint(std::ostream & out, unsigned int val) { + out.write(reinterpret_cast(&val), sizeof(unsigned int)); + } + + unsigned int read_uint(std::istream & in) { + char p[4]; + in.read(reinterpret_cast(p), sizeof(unsigned int)); + return *reinterpret_cast(p); + } }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_MODEL_H__ diff --git a/src/ner/ner.cpp b/src/ner/ner.cpp index 40e4ba5fc..897eee707 100644 --- a/src/ner/ner.cpp +++ b/src/ner/ner.cpp @@ -20,723 +20,734 @@ namespace ltp { namespace ner { -NER::NER() : - model(0), - decoder(0) { +NER::NER() + : model(0), + decoder(0), + __TRAIN__(false), + __TEST__(false), + __DUMP__(false) { } -NER::NER(ltp::utility::ConfigParser & cfg) : - model(0), - decoder(0) { - parse_cfg(cfg); +NER::NER(ltp::utility::ConfigParser & cfg) + : model(0), + decoder(0), + __TRAIN__(false), + __TEST__(false), + __DUMP__(false) { + parse_cfg(cfg); } NER::~NER() { - if (model) { - delete model; - } + if (model) { + delete model; + } - if (decoder) { - delete decoder; - } + if (decoder) { + delete decoder; + } } -void NER::run(void) { - if (__TRAIN__) { - train(); - } +void +NER::run(void) { + if (__TRAIN__) { + train(); + } - if (__TEST__) { - test(); - } + if (__TEST__) { + test(); + } - if (__DUMP__) { - dump(); - } + if (__DUMP__) { + dump(); + } - for (int i = 0; i < train_dat.size(); ++ i) { - if (train_dat[i]) { - delete train_dat[i]; - } + for (int i = 0; i < train_dat.size(); ++ i) { + if (train_dat[i]) { + delete train_dat[i]; } + } } -bool NER::parse_cfg(ltp::utility::ConfigParser & cfg) { - std::string strbuf; - int intbuf; - - __TRAIN__ = false; - - train_opt.train_file = ""; - train_opt.holdout_file = ""; - train_opt.algorithm = "pa"; - train_opt.model_name = ""; - train_opt.max_iter = 10; - train_opt.display_interval = 5000; +bool +NER::parse_cfg(ltp::utility::ConfigParser & cfg) { + std::string strbuf; + int intbuf; - if (cfg.has_section("train")) { - TRACE_LOG("Training mode specified."); - __TRAIN__ = true; - if (cfg.get("train", "train-file", strbuf)) { - train_opt.train_file = strbuf; - } else { - ERROR_LOG("train-file config item is not found."); - return false; - } + train_opt.train_file = ""; + train_opt.holdout_file = ""; + train_opt.algorithm = "pa"; + train_opt.model_name = ""; + train_opt.max_iter = 10; + train_opt.display_interval = 5000; - if (cfg.get("train", "holdout-file", strbuf)) { - train_opt.holdout_file = strbuf; - } else { - ERROR_LOG("holdout-file config item is not found."); - return false; - } + if (cfg.has_section("train")) { + TRACE_LOG("Training mode specified."); + __TRAIN__ = true; - if (cfg.get("train", "algorithm", strbuf)) { - train_opt.algorithm = strbuf; - } else { - WARNING_LOG("algorithm is not configed, [PA] is set as default"); - } + if (cfg.get("train", "train-file", strbuf)) { + train_opt.train_file = strbuf; + } else { + ERROR_LOG("train-file config item is not found."); + return false; + } - train_opt.model_name = train_opt.train_file + "." + train_opt.algorithm; - if (cfg.get("train", "model-name", strbuf)) { - train_opt.model_name = strbuf; - } else { - WARNING_LOG("model name is not configed, [%s] is set as default", - train_opt.model_name.c_str()); - } + if (cfg.get("train", "holdout-file", strbuf)) { + train_opt.holdout_file = strbuf; + } else { + ERROR_LOG("holdout-file config item is not found."); + return false; + } - if (cfg.get_integer("train", "max-iter", intbuf)) { - train_opt.max_iter = intbuf; - } else { - WARNING_LOG("max-iter is not configed, [10] is set as default."); - } + if (cfg.get("train", "algorithm", strbuf)) { + train_opt.algorithm = strbuf; + } else { + WARNING_LOG("algorithm is not configed, [PA] is set as default"); } - __TEST__ = false; + train_opt.model_name = train_opt.train_file + "." + train_opt.algorithm; + if (cfg.get("train", "model-name", strbuf)) { + train_opt.model_name = strbuf; + } else { + WARNING_LOG("model name is not configed, [%s] is set as default", + train_opt.model_name.c_str()); + } - test_opt.test_file = ""; - test_opt.model_file = ""; - test_opt.lexicon_file = ""; + if (cfg.get_integer("train", "max-iter", intbuf)) { + train_opt.max_iter = intbuf; + } else { + WARNING_LOG("max-iter is not configed, [10] is set as default."); + } + } - if (cfg.has_section("test")) { - __TEST__ = true; + test_opt.test_file = ""; + test_opt.model_file = ""; + test_opt.lexicon_file = ""; - if (cfg.get("test", "test-file", strbuf)) { - test_opt.test_file = strbuf; - } else { - ERROR_LOG("test-file config item is not set."); - return false; - } + if (cfg.has_section("test")) { + __TEST__ = true; - if (cfg.get("test", "model-file", strbuf)) { - test_opt.model_file = strbuf; - } else { - ERROR_LOG("model-file is not configed. "); - return false; - } + if (cfg.get("test", "test-file", strbuf)) { + test_opt.test_file = strbuf; + } else { + ERROR_LOG("test-file config item is not set."); + return false; + } - if (cfg.get("test", "lexicon-file", strbuf)) { - test_opt.lexicon_file = strbuf; - } + if (cfg.get("test", "model-file", strbuf)) { + test_opt.model_file = strbuf; + } else { + ERROR_LOG("model-file is not configed. "); + return false; } - __DUMP__ = false; + if (cfg.get("test", "lexicon-file", strbuf)) { + test_opt.lexicon_file = strbuf; + } + } - dump_opt.model_file = ""; - if (cfg.has_section("dump")) { - __DUMP__ = true; + dump_opt.model_file = ""; + if (cfg.has_section("dump")) { + __DUMP__ = true; - if (cfg.get("dump", "model-file", strbuf)) { - dump_opt.model_file = strbuf; - } else { - ERROR_LOG("model-file is not configed."); - return false; - } + if (cfg.get("dump", "model-file", strbuf)) { + dump_opt.model_file = strbuf; + } else { + ERROR_LOG("model-file is not configed."); + return false; } + } - return true; + return true; } -bool NER::read_instance(const char * train_file) { - std::ifstream ifs(train_file); +bool +NER::read_instance(const char * train_file) { + std::ifstream ifs(train_file); - if (!ifs) { - return false; - } + if (!ifs) { + return false; + } - NERReader reader(ifs, true); - train_dat.clear(); + NERReader reader(ifs, true); + train_dat.clear(); - Instance * inst = NULL; + Instance * inst = NULL; - while ((inst = reader.next())) { - train_dat.push_back(inst); - } + while ((inst = reader.next())) { + train_dat.push_back(inst); + } - return true; + return true; } void NER::build_configuration(void) { - // tag set is some kind of hard coded into the source - - std::stringstream S; - for (int i = 0; i < __num_pos_types__; ++ i) { - for (int j = 0; j < __num_ne_types__; ++ j) { - S.str(std::string()); - S << __pos_types__[i] << "-" << __ne_types__[j]; - model->labels.push(S.str()); - } + // tag set is some kind of hard coded into the source + + std::stringstream S; + for (int i = 0; i < __num_pos_types__; ++ i) { + for (int j = 0; j < __num_ne_types__; ++ j) { + S.str(std::string()); + S << __pos_types__[i] << "-" << __ne_types__[j]; + model->labels.push(S.str()); } - model->labels.push("O"); + } + model->labels.push("O"); - for (int i = 0; i < train_dat.size(); ++ i) { - Instance * inst = train_dat[i]; - int len = inst->size(); + for (int i = 0; i < train_dat.size(); ++ i) { + Instance * inst = train_dat[i]; + int len = inst->size(); - inst->tagsidx.resize(len); - for (int j = 0; j < len; ++ j) { - // build labels dictionary - inst->tagsidx[j] = model->labels.index( inst->tags[j] ); - } + inst->tagsidx.resize(len); + for (int j = 0; j < len; ++ j) { + // build labels dictionary + inst->tagsidx[j] = model->labels.index( inst->tags[j] ); } + } } void NER::extract_features(Instance * inst, bool create) { - const int N = Extractor::num_templates(); - const int L = model->num_labels(); - - vector< StringVec > cache; - vector< int > cache_again; - - cache.resize(N); - int len = inst->size(); + const int N = Extractor::num_templates(); + const int L = model->num_labels(); - // allocate the uni_features - inst->uni_features.resize(len, L); inst->uni_features = 0; - inst->uni_scores.resize(len, L); inst->uni_scores = NEG_INF; - inst->bi_scores.resize(L, L); inst->bi_scores = NEG_INF; + vector< StringVec > cache; + vector< int > cache_again; - for (int pos = 0; pos < len; ++ pos) { - for (int n = 0; n < N; ++ n) { - cache[n].clear(); - } - cache_again.clear(); + cache.resize(N); + int len = inst->size(); - Extractor::extract1o(inst, pos, cache); + // allocate the uni_features + inst->uni_features.resize(len, L); inst->uni_features = 0; + inst->uni_scores.resize(len, L); inst->uni_scores = NEG_INF; + inst->bi_scores.resize(L, L); inst->bi_scores = NEG_INF; - for (int tid = 0; tid < cache.size(); ++ tid) { - for (int itx = 0; itx < cache[tid].size(); ++ itx) { - if (create) { - model->space.retrieve(tid, cache[tid][itx], true); - } + for (int pos = 0; pos < len; ++ pos) { + for (int n = 0; n < N; ++ n) { + cache[n].clear(); + } + cache_again.clear(); - int idx = model->space.index(tid, cache[tid][itx]); + Extractor::extract1o(inst, pos, cache); - if (idx >= 0) { - cache_again.push_back(idx); - } - } + for (int tid = 0; tid < cache.size(); ++ tid) { + for (int itx = 0; itx < cache[tid].size(); ++ itx) { + if (create) { + model->space.retrieve(tid, cache[tid][itx], true); } - int num_feat = cache_again.size(); - - if (num_feat > 0) { - int l = 0; - int * idx = new int[num_feat]; - for (int j = 0; j < num_feat; ++ j) { - idx[j] = cache_again[j]; - } - - inst->uni_features[pos][l] = new FeatureVector; - inst->uni_features[pos][l]->n = num_feat; - inst->uni_features[pos][l]->val = 0; - inst->uni_features[pos][l]->loff = 0; - inst->uni_features[pos][l]->idx = idx; - - for (l = 1; l < L; ++ l) { - inst->uni_features[pos][l] = new FeatureVector; - inst->uni_features[pos][l]->n = num_feat; - inst->uni_features[pos][l]->idx = idx; - inst->uni_features[pos][l]->val = 0; - inst->uni_features[pos][l]->loff = l; - } - } - } -} + int idx = model->space.index(tid, cache[tid][itx]); -void NER::build_feature_space(void) { - // build feature space, it a wrapper for - // featurespace.build_feature_space - int N = Extractor::num_templates(); - int L = model->num_labels(); - model->space.set_num_labels(L); - - for (int i = 0; i < train_dat.size(); ++ i) { - extract_features(train_dat[i], true); - if ((i + 1) % train_opt.display_interval == 0) { - TRACE_LOG("[%d] instances is extracted.", (i+1)); + if (idx >= 0) { + cache_again.push_back(idx); } + } } -} -void NER::build_entities(Instance * inst, - const std::vector & tagsidx, - std::vector & entities, - std::vector & entities_tags, - int beg_tag0, - int beg_tag1, - int beg_tag2) { - entities.clear(); - entities_tags.clear(); - - std::string entity = ""; - std::string entity_tag = ""; - int len = inst->size(); - int tag = -1; - int tag_prefix = -1; - int tag_suffix = -1; - - // should check the tagsidx size - entity = inst->raw_forms[0]; - - tag = inst->tagsidx[0]; - tag_suffix = tag % __num_ne_types__; - entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); - for (int i = 1; i < len; ++ i) { - tag = tagsidx[i]; - - tag_prefix = tag / __num_ne_types__; - tag_suffix = (tag % __num_ne_types__); - - if (tag_prefix == beg_tag0 || tag_prefix == beg_tag1 || tag_prefix == beg_tag2) { - entities.push_back(entity); - entities_tags.push_back(entity_tag); - - entity = inst->raw_forms[i]; - entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); - } else { - entity += inst->raw_forms[i]; - } + int num_feat = cache_again.size(); + + if (num_feat > 0) { + int l = 0; + int * idx = new int[num_feat]; + for (int j = 0; j < num_feat; ++ j) { + idx[j] = cache_again[j]; + } + + inst->uni_features[pos][l] = new FeatureVector; + inst->uni_features[pos][l]->n = num_feat; + inst->uni_features[pos][l]->val = 0; + inst->uni_features[pos][l]->loff = 0; + inst->uni_features[pos][l]->idx = idx; + + for (l = 1; l < L; ++ l) { + inst->uni_features[pos][l] = new FeatureVector; + inst->uni_features[pos][l]->n = num_feat; + inst->uni_features[pos][l]->idx = idx; + inst->uni_features[pos][l]->val = 0; + inst->uni_features[pos][l]->loff = l; + } } - entities.push_back(entity); - entities_tags.push_back(entity_tag); + } } -void NER::calculate_scores(Instance * inst, bool use_avg) { - int len = inst->size(); - int L = model->num_labels(); - for (int i = 0; i < len; ++ i) { - for (int l = 0; l < L; ++ l) { - FeatureVector * fv = inst->uni_features[i][l]; - if (!fv) { - continue; - } +void NER::build_feature_space(void) { + // build feature space, it a wrapper for + // featurespace.build_feature_space + Extractor::num_templates(); - inst->uni_scores[i][l] = model->param.dot(inst->uni_features[i][l], use_avg); - } - } + int L = model->num_labels(); + model->space.set_num_labels(L); - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int idx = model->space.index(pl, l); - inst->bi_scores[pl][l] = model->param.dot(idx, use_avg); - } + for (int i = 0; i < train_dat.size(); ++ i) { + extract_features(train_dat[i], true); + if ((i + 1) % train_opt.display_interval == 0) { + TRACE_LOG("[%d] instances is extracted.", (i+1)); } + } } -void NER::collect_features(Instance * inst, const std::vector & tagsidx, math::SparseVec & vec) { - int len = inst->size(); +void +NER::build_entities(Instance * inst, + const std::vector & tagsidx, + std::vector & entities, + std::vector & entities_tags, + int beg_tag0, + int beg_tag1, + int beg_tag2) { + entities.clear(); + entities_tags.clear(); - vec.zero(); - for (int i = 0; i < len; ++ i) { - int l = tagsidx[i]; - const FeatureVector * fv = inst->uni_features[i][l]; - - if (!fv) { - continue; - } + int len = inst->size(); - vec.add(fv->idx, fv->val, fv->n, fv->loff, 1.); + // should check the tagsidx size + std::string entity = inst->raw_forms[0]; - if (i > 0) { - int prev_lid = tagsidx[i-1]; - int idx = model->space.index(prev_lid, l); - vec.add(idx, 1.); - } - } -} + int tag = inst->tagsidx[0]; + int tag_prefix = -1; + int tag_suffix = tag % __num_ne_types__; -Model * NER::truncate(void) { - Model * new_model = new Model; - // copy the label indexable map to the new model - for (int i = 0; i < model->labels.size(); ++ i) { - const char * key = model->labels.at(i); - new_model->labels.push(key); - } + std::string entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); + for (int i = 1; i < len; ++ i) { + tag = tagsidx[i]; - TRACE_LOG("building labels map is done"); - - int L = new_model->num_labels(); - new_model->space.set_num_labels(L); - - // iterate over the feature space and see if the parameter value equals to zero - for (FeatureSpaceIterator itx = model->space.begin(); - itx != model->space.end(); - ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - int id = model->space.index(tid, key); - - bool flag = false; - for (int l = 0; l < L; ++ l) { - double p = model->param.dot(id + l); - if (p != 0.) { - flag = true; - } - } + tag_prefix = tag / __num_ne_types__; + tag_suffix = (tag % __num_ne_types__); - if (!flag) { - continue; - } + if (tag_prefix == beg_tag0 || tag_prefix == beg_tag1 || tag_prefix == beg_tag2) { + entities.push_back(entity); + entities_tags.push_back(entity_tag); - new_model->space.retrieve(tid, key, true); - } - TRACE_LOG("Scanning old features space, building new feature space is done"); - - new_model->param.realloc(new_model->space.dim()); - TRACE_LOG("Parameter dimension of new model is [%d]", new_model->space.dim()); - - for (FeatureSpaceIterator itx = new_model->space.begin(); - itx != new_model->space.end(); - ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - - int old_id = model->space.index(tid, key); - int new_id = new_model->space.index(tid, key); - - for (int l = 0; l < L; ++ l) { - // pay attention to this place, use average should be set true - // some dirty code - new_model->param._W[new_id + l] = model->param._W[old_id + l]; - new_model->param._W_sum[new_id + l] = model->param._W_sum[old_id + l]; - new_model->param._W_time[new_id + l] = model->param._W_time[old_id + l]; - } + entity = inst->raw_forms[i]; + entity_tag = (tag == 12 ? "O" : __ne_types__[tag_suffix]); + } else { + entity += inst->raw_forms[i]; } + } + entities.push_back(entity); + entities_tags.push_back(entity_tag); +} - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int old_id = model->space.index(pl, l); - int new_id = new_model->space.index(pl, l); +void +NER::calculate_scores(Instance * inst, bool use_avg) { + int len = inst->size(); + int L = model->num_labels(); + for (int i = 0; i < len; ++ i) { + for (int l = 0; l < L; ++ l) { + FeatureVector * fv = inst->uni_features[i][l]; + if (!fv) { + continue; + } - new_model->param._W[new_id] = model->param._W[old_id]; - new_model->param._W_sum[new_id] = model->param._W_sum[old_id]; - new_model->param._W_time[new_id] = model->param._W_time[old_id]; - } + inst->uni_scores[i][l] = model->param.dot(inst->uni_features[i][l], use_avg); } - TRACE_LOG("Building new model is done"); + } - for (SmartMap::const_iterator itx = model->cluster_lexicon.begin(); - itx != model->cluster_lexicon.end(); - ++ itx) { - new_model->cluster_lexicon.set(itx.key(), (*itx.value())); + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int idx = model->space.index(pl, l); + inst->bi_scores[pl][l] = model->param.dot(idx, use_avg); } - - return new_model; + } } -void NER::train(void) { - const char * train_file = train_opt.train_file.c_str(); +void +NER::collect_features(Instance * inst, + const std::vector & tagsidx, + math::SparseVec & vec) { + int len = inst->size(); - // read in training instance - if (!read_instance(train_file)) { - ERROR_LOG("Training file doesn't exist"); + vec.zero(); + for (int i = 0; i < len; ++ i) { + int l = tagsidx[i]; + const FeatureVector * fv = inst->uni_features[i][l]; + + if (!fv) { + continue; } - TRACE_LOG("Read in [%d] instances.", train_dat.size()); + vec.add(fv->idx, fv->val, fv->n, fv->loff, 1.); - model = new Model; - // build tag dictionary, map string tag to index - TRACE_LOG("Start build configuration"); - build_configuration(); - TRACE_LOG("Build configuration is done."); - TRACE_LOG("Number of labels: [%d]", model->labels.size()); + if (i > 0) { + int prev_lid = tagsidx[i-1]; + int idx = model->space.index(prev_lid, l); + vec.add(idx, 1.); + } + } +} - // build feature space from the training instance - TRACE_LOG("Start building feature space."); - build_feature_space(); - TRACE_LOG("Building feature space is done."); - TRACE_LOG("Number of features: [%d]", model->space.num_features()); +Model * +NER::truncate(void) { + Model * new_model = new Model; + // copy the label indexable map to the new model + for (int i = 0; i < model->labels.size(); ++ i) { + const char * key = model->labels.at(i); + new_model->labels.push(key); + } - model->param.realloc(model->space.dim()); - TRACE_LOG("Allocate [%d] dimensition parameter.", model->space.dim()); + TRACE_LOG("building labels map is done"); - NERWriter writer(std::cout); + int L = new_model->num_labels(); + new_model->space.set_num_labels(L); - if (train_opt.algorithm == "mira") { - // use mira algorithm - /*kbest_decoder = new KBestDecoder(L); + // iterate over the feature space and see if the parameter value equals to zero + for (FeatureSpaceIterator itx = model->space.begin(); + itx != model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + int id = model->space.index(tid, key); - for (int iter = 0; iter < train_opt.max_iter; ++ iter) { - for (int i = 0; i < train_dat.size(); ++ i) { - extract_features(train_dat[i]); - calculate_scores(train_dat[i]); + bool flag = false; + for (int l = 0; l < L; ++ l) { + double p = model->param.dot(id + l); + if (p != 0.) { + flag = true; + } + } - KBestDecoder::KBestDecodeResult result; - kbest_decoder->decode(train_dat[i], result); - } - }*/ - } else { - // use pa or average perceptron algorithm - rulebase::RuleBase base(model->labels); - decoder = new Decoder(model->num_labels(), base); - TRACE_LOG("Allocated plain decoder"); - - for (int iter = 0; iter < train_opt.max_iter; ++ iter) { - TRACE_LOG("Training iteraition [%d]", (iter + 1)); - for (int i = 0; i < train_dat.size(); ++ i) { - // extract_features(train_dat[i]); - - Instance * inst = train_dat[i]; - calculate_scores(inst, false); - decoder->decode(inst); - - if (inst->features.dim() == 0) { - collect_features(inst, inst->tagsidx, inst->features); - } - collect_features(inst, inst->predicted_tagsidx, inst->predicted_features); - - - if (train_opt.algorithm == "pa") { - SparseVec update_features; - update_features.zero(); - update_features.add(train_dat[i]->features, 1.); - update_features.add(train_dat[i]->predicted_features, -1.); - - double error = train_dat[i]->num_errors(); - double score = model->param.dot(update_features, false); - double norm = update_features.L2(); - - double step = 0.; - if (norm < EPS) { - step = 0; - } else { - step = (error - score) / norm; - } - - model->param.add(update_features, - iter * train_dat.size() + i + 1, - step); - } else if (train_opt.algorithm == "ap") { - SparseVec update_features; - update_features.zero(); - update_features.add(train_dat[i]->features, 1.); - update_features.add(train_dat[i]->predicted_features, -1.); - - model->param.add(update_features, - iter * train_dat.size() + i + 1, - 1.); - } - - if ((i+1) % train_opt.display_interval == 0) { - TRACE_LOG("[%d] instances is trained.", i+1); - } - } - model->param.flush( train_dat.size() * (iter + 1) ); - - Model * new_model = truncate(); - swap(model, new_model); - evaluate(); - - std::string saved_model_file = (train_opt.model_name + "." + strutils::to_str(iter) + ".model"); - std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary); - - swap(model, new_model); - new_model->save(ofs); - delete new_model; - - TRACE_LOG("Model for iteration [%d] is saved to [%s]", - iter + 1, - saved_model_file.c_str()); - } + if (!flag) { + continue; } -} -void NER::evaluate(void) { - const char * holdout_file = train_opt.holdout_file.c_str(); + new_model->space.retrieve(tid, key, true); + } + TRACE_LOG("Scanning old features space, building new feature space is done"); - ifstream ifs(holdout_file); + new_model->param.realloc(new_model->space.dim()); + TRACE_LOG("Parameter dimension of new model is [%d]", new_model->space.dim()); - if (!ifs) { - ERROR_LOG("Failed to open holdout file."); - return; + for (FeatureSpaceIterator itx = new_model->space.begin(); + itx != new_model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + + int old_id = model->space.index(tid, key); + int new_id = new_model->space.index(tid, key); + + for (int l = 0; l < L; ++ l) { + // pay attention to this place, use average should be set true + // some dirty code + new_model->param._W[new_id + l] = model->param._W[old_id + l]; + new_model->param._W_sum[new_id + l] = model->param._W_sum[old_id + l]; + new_model->param._W_time[new_id + l] = model->param._W_time[old_id + l]; } + } - NERReader reader(ifs, true); - NERWriter writer(std::cout); - Instance * inst = NULL; + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int old_id = model->space.index(pl, l); + int new_id = new_model->space.index(pl, l); - // some dirty hard code and trick - int beg_tag0 = (model->labels.index( "B-Nh" ) / __num_ne_types__); - int beg_tag1 = (model->labels.index( "S-Nh" ) / __num_ne_types__); - int beg_tag2 = (model->labels.index( "O" ) / __num_ne_types__); + new_model->param._W[new_id] = model->param._W[old_id]; + new_model->param._W_sum[new_id] = model->param._W_sum[old_id]; + new_model->param._W_time[new_id] = model->param._W_time[old_id]; + } + } + TRACE_LOG("Building new model is done"); + for (SmartMap::const_iterator itx = model->cluster_lexicon.begin(); + itx != model->cluster_lexicon.end(); + ++ itx) { + new_model->cluster_lexicon.set(itx.key(), (*itx.value())); + } - int num_recalled_entities = 0; - int num_predicted_entities = 0; - int num_gold_entities = 0; + return new_model; +} - int L = model->num_labels(); +void +NER::train(void) { + const char * train_file = train_opt.train_file.c_str(); + + // read in training instance + if (!read_instance(train_file)) { + ERROR_LOG("Training file doesn't exist"); + } + + TRACE_LOG("Read in [%d] instances.", train_dat.size()); + + model = new Model; + // build tag dictionary, map string tag to index + TRACE_LOG("Start build configuration"); + build_configuration(); + TRACE_LOG("Build configuration is done."); + TRACE_LOG("Number of labels: [%d]", model->labels.size()); + + // build feature space from the training instance + TRACE_LOG("Start building feature space."); + build_feature_space(); + TRACE_LOG("Building feature space is done."); + TRACE_LOG("Number of features: [%d]", model->space.num_features()); + + model->param.realloc(model->space.dim()); + TRACE_LOG("Allocate [%d] dimensition parameter.", model->space.dim()); + + NERWriter writer(std::cout); + + if (train_opt.algorithm == "mira") { + // use mira algorithm + /*kbest_decoder = new KBestDecoder(L); + + for (int iter = 0; iter < train_opt.max_iter; ++ iter) { + for (int i = 0; i < train_dat.size(); ++ i) { + extract_features(train_dat[i]); + calculate_scores(train_dat[i]); + + KBestDecoder::KBestDecodeResult result; + kbest_decoder->decode(train_dat[i], result); + } + }*/ + } else { + // use pa or average perceptron algorithm + rulebase::RuleBase base(model->labels); + decoder = new Decoder(model->num_labels(), base); + TRACE_LOG("Allocated plain decoder"); - int c = 0; - while ((inst = reader.next())) { - int len = inst->size(); - inst->tagsidx.resize(len); - for (int i = 0; i < len; ++ i) { - inst->tagsidx[i] = model->labels.index(inst->tags[i]); - } + for (int iter = 0; iter < train_opt.max_iter; ++ iter) { + TRACE_LOG("Training iteraition [%d]", (iter + 1)); + for (int i = 0; i < train_dat.size(); ++ i) { + // extract_features(train_dat[i]); - extract_features(inst); - calculate_scores(inst, true); + Instance * inst = train_dat[i]; + calculate_scores(inst, false); decoder->decode(inst); - // writer.debug(inst); - if (inst->entities.size() == 0) { - build_entities(inst, - inst->tagsidx, - inst->entities, - inst->entities_tags, - beg_tag0, - beg_tag1, - beg_tag2); + if (inst->features.dim() == 0) { + collect_features(inst, inst->tagsidx, inst->features); } + collect_features(inst, inst->predicted_tagsidx, inst->predicted_features); + + + if (train_opt.algorithm == "pa") { + SparseVec update_features; + update_features.zero(); + update_features.add(train_dat[i]->features, 1.); + update_features.add(train_dat[i]->predicted_features, -1.); + + double error = train_dat[i]->num_errors(); + double score = model->param.dot(update_features, false); + double norm = update_features.L2(); + + double step = 0.; + if (norm < EPS) { + step = 0; + } else { + step = (error - score) / norm; + } + + model->param.add(update_features, + iter * train_dat.size() + i + 1, + step); + } else if (train_opt.algorithm == "ap") { + SparseVec update_features; + update_features.zero(); + update_features.add(train_dat[i]->features, 1.); + update_features.add(train_dat[i]->predicted_features, -1.); + + model->param.add(update_features, + iter * train_dat.size() + i + 1, + 1.); + } + + if ((i+1) % train_opt.display_interval == 0) { + TRACE_LOG("[%d] instances is trained.", i+1); + } + } + model->param.flush( train_dat.size() * (iter + 1) ); + + Model * new_model = truncate(); + swap(model, new_model); + evaluate(); - build_entities(inst, - inst->predicted_tagsidx, - inst->predicted_entities, - inst->predicted_entities_tags, - beg_tag0, - beg_tag1, - beg_tag2); + std::string saved_model_file = (train_opt.model_name + + "." + + strutils::to_str(iter) + + ".model"); + std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary); - num_recalled_entities += inst->num_recalled_entites(); - num_predicted_entities += inst->num_predicted_entities(); - num_gold_entities += inst->num_gold_entities(); + swap(model, new_model); + new_model->save(ofs); + delete new_model; - delete inst; + TRACE_LOG("Model for iteration [%d] is saved to [%s]", + iter + 1, + saved_model_file.c_str()); } + } +} + +void NER::evaluate(void) { + const char * holdout_file = train_opt.holdout_file.c_str(); - double p = (double)num_recalled_entities / num_predicted_entities; - double r = (double)num_recalled_entities / num_gold_entities; - double f = 2 * p * r / (p + r); + ifstream ifs(holdout_file); - TRACE_LOG("P: %lf ( %d / %d )", p, num_recalled_entities, num_predicted_entities); - TRACE_LOG("R: %lf ( %d / %d )", r, num_recalled_entities, num_gold_entities); - TRACE_LOG("F: %lf" , f); + if (!ifs) { + ERROR_LOG("Failed to open holdout file."); return; + } + + NERReader reader(ifs, true); + NERWriter writer(std::cout); + Instance * inst = NULL; + + // some dirty hard code and trick + int beg_tag0 = (model->labels.index( "B-Nh" ) / __num_ne_types__); + int beg_tag1 = (model->labels.index( "S-Nh" ) / __num_ne_types__); + int beg_tag2 = (model->labels.index( "O" ) / __num_ne_types__); + + + int num_recalled_entities = 0; + int num_predicted_entities = 0; + int num_gold_entities = 0; + + while ((inst = reader.next())) { + int len = inst->size(); + inst->tagsidx.resize(len); + for (int i = 0; i < len; ++ i) { + inst->tagsidx[i] = model->labels.index(inst->tags[i]); + } + + extract_features(inst); + calculate_scores(inst, true); + decoder->decode(inst); + + // writer.debug(inst); + if (inst->entities.size() == 0) { + build_entities(inst, + inst->tagsidx, + inst->entities, + inst->entities_tags, + beg_tag0, + beg_tag1, + beg_tag2); + } + + build_entities(inst, + inst->predicted_tagsidx, + inst->predicted_entities, + inst->predicted_entities_tags, + beg_tag0, + beg_tag1, + beg_tag2); + + num_recalled_entities += inst->num_recalled_entites(); + num_predicted_entities += inst->num_predicted_entities(); + num_gold_entities += inst->num_gold_entities(); + + delete inst; + } + + double p = (double)num_recalled_entities / num_predicted_entities; + double r = (double)num_recalled_entities / num_gold_entities; + double f = 2 * p * r / (p + r); + + TRACE_LOG("P: %lf ( %d / %d )", p, num_recalled_entities, num_predicted_entities); + TRACE_LOG("R: %lf ( %d / %d )", r, num_recalled_entities, num_gold_entities); + TRACE_LOG("F: %lf" , f); + return; } void NER::test(void) { - // load model - const char * model_file = test_opt.model_file.c_str(); - ifstream mfs(model_file, std::ifstream::binary); + // load model + const char * model_file = test_opt.model_file.c_str(); + ifstream mfs(model_file, std::ifstream::binary); - if (!mfs) { - ERROR_LOG("Failed to load model"); - return; - } + if (!mfs) { + ERROR_LOG("Failed to load model"); + return; + } - model = new Model; - if (!model->load(mfs)) { - ERROR_LOG("Failed to load model"); - return; - } + model = new Model; + if (!model->load(mfs)) { + ERROR_LOG("Failed to load model"); + return; + } - TRACE_LOG("Number of labels [%d]", model->num_labels()); - TRACE_LOG("Number of features [%d]", model->space.num_features()); - TRACE_LOG("Number of dimension [%d]", model->space.dim()); + TRACE_LOG("Number of labels [%d]", model->num_labels()); + TRACE_LOG("Number of features [%d]", model->space.num_features()); + TRACE_LOG("Number of dimension [%d]", model->space.dim()); - const char * test_file = test_opt.test_file.c_str(); + const char * test_file = test_opt.test_file.c_str(); - ifstream ifs(test_file); + ifstream ifs(test_file); - if (!ifs) { - ERROR_LOG("Failed to open holdout file."); - return; - } + if (!ifs) { + ERROR_LOG("Failed to open holdout file."); + return; + } - rulebase::RuleBase base(model->labels); - decoder = new Decoder(model->num_labels(), base); - NERReader reader(ifs); - NERWriter writer(cout); - Instance * inst = NULL; + rulebase::RuleBase base(model->labels); + decoder = new Decoder(model->num_labels(), base); + NERReader reader(ifs); + NERWriter writer(cout); + Instance * inst = NULL; - // int beg_tag0 = model->labels.index( __b__ ); - // int beg_tag1 = model->labels.index( __s__ ); + // int beg_tag0 = model->labels.index( __b__ ); + // int beg_tag1 = model->labels.index( __s__ ); - double before = get_time(); + double before = get_time(); - while ((inst = reader.next())) { - int len = inst->size(); - inst->tagsidx.resize(len); + while ((inst = reader.next())) { + int len = inst->size(); + inst->tagsidx.resize(len); - extract_features(inst); - calculate_scores(inst, true); - decoder->decode(inst); + extract_features(inst); + calculate_scores(inst, true); + decoder->decode(inst); - writer.write(inst); - delete inst; - } + writer.write(inst); + delete inst; + } - double after = get_time(); - TRACE_LOG("Eclipse time %lf", after - before); + double after = get_time(); + TRACE_LOG("Eclipse time %lf", after - before); - sleep(1000000); - return; + sleep(1000000); + return; } void NER::dump() { - // load model - const char * model_file = dump_opt.model_file.c_str(); - ifstream mfs(model_file, std::ifstream::binary); - - if (!mfs) { - ERROR_LOG("Failed to load model"); - return; - } + // load model + const char * model_file = dump_opt.model_file.c_str(); + ifstream mfs(model_file, std::ifstream::binary); - model = new Model; - if (!model->load(mfs)) { - ERROR_LOG("Failed to load model"); - return; - } - - int L = model->num_labels(); - TRACE_LOG("Number of labels [%d]", model->num_labels()); - TRACE_LOG("Number of features [%d]", model->space.num_features()); - TRACE_LOG("Number of dimension [%d]", model->space.dim()); - - for (FeatureSpaceIterator itx = model->space.begin(); itx != model->space.end(); ++ itx) { - const char * key = itx.key(); - int tid = itx.tid(); - int id = model->space.index(tid, key); + if (!mfs) { + ERROR_LOG("Failed to load model"); + return; + } - for (int l = 0; l < L; ++ l) { - std::cout << key << " ( " << id + l << " ) " + model = new Model; + if (!model->load(mfs)) { + ERROR_LOG("Failed to load model"); + return; + } + + int L = model->num_labels(); + TRACE_LOG("Number of labels [%d]", model->num_labels()); + TRACE_LOG("Number of features [%d]", model->space.num_features()); + TRACE_LOG("Number of dimension [%d]", model->space.dim()); + + for (FeatureSpaceIterator itx = model->space.begin(); + itx != model->space.end(); + ++ itx) { + const char * key = itx.key(); + int tid = itx.tid(); + int id = model->space.index(tid, key); + + for (int l = 0; l < L; ++ l) { + std::cout << key << " ( " << id + l << " ) " << " --> " << model->param.dot(id + l) << std::endl; - } } + } - for (int pl = 0; pl < L; ++ pl) { - for (int l = 0; l < L; ++ l) { - int id = model->space.index(pl, l); - std::cout << pl << " --> " << l << " " << model->param.dot(id) << std::endl; - } + for (int pl = 0; pl < L; ++ pl) { + for (int l = 0; l < L; ++ l) { + int id = model->space.index(pl, l); + std::cout << pl << " --> " << l << " " << model->param.dot(id) << std::endl; } + } } -} // end for namespace segmentor -} // end for namespace ltp +} // end for namespace segmentor +} // end for namespace ltp diff --git a/src/ner/ner.h b/src/ner/ner.h index 1ec60bc5e..4c13f4514 100644 --- a/src/ner/ner.h +++ b/src/ner/ner.h @@ -10,115 +10,115 @@ namespace ner { class NER { public: - NER(); - NER(ltp::utility::ConfigParser & cfg); - ~NER(); + NER(); + NER(ltp::utility::ConfigParser & cfg); + ~NER(); - void run(); + void run(); private: - /* - * parse the configuration, return true on success, otherwise false - * - * @param[in] cfg the config class - * @return bool return true on success, otherwise false - */ - bool parse_cfg(ltp::utility::ConfigParser & cfg); - - /* - * read instances from file and store them in train_dat - * - * @param[in] file_name the filename - * @return bool true on success, otherwise false - */ - bool read_instance( const char * file_name ); - void build_configuration(void); - void build_feature_space(void); - - /* - * the training process - */ - void train(void); - - /* - * the evaluating process - */ - void evaluate(void); - - /* - * the testing process - */ - void test(void); - - /* - * the dumping model process - */ - void dump(void); - - /* - * do feature trauncation on the model. create a model duplation - * on the model and return their - * - * @return Model the duplication of the model - */ - Model * truncate(void); + /* + * parse the configuration, return true on success, otherwise false + * + * @param[in] cfg the config class + * @return bool return true on success, otherwise false + */ + bool parse_cfg(ltp::utility::ConfigParser & cfg); + + /* + * read instances from file and store them in train_dat + * + * @param[in] file_name the filename + * @return bool true on success, otherwise false + */ + bool read_instance( const char * file_name ); + void build_configuration(void); + void build_feature_space(void); + + /* + * the training process + */ + void train(void); + + /* + * the evaluating process + */ + void evaluate(void); + + /* + * the testing process + */ + void test(void); + + /* + * the dumping model process + */ + void dump(void); + + /* + * do feature trauncation on the model. create a model duplation + * on the model and return their + * + * @return Model the duplication of the model + */ + Model * truncate(void); protected: - /* - * extract features from one instance, - * - */ - void extract_features(Instance * inst, bool create = false); - - /* - * build words from tags for certain instance - * - * @param[in/out] inst the instance - * @param[out] words the output words - * @param[in] tagsidx the index of tags - * @param[in] begtag0 first of the word begin tag - * @param[in] begtag1 second of the word begin tag - */ - void build_entities(Instance * inst, - const std::vector & tagsidx, - std::vector & entities, - std::vector & entities_tags, - int beg_tag0, - int beg_tag1, - int beg_tag2); - - /* - * cache all the score for the certain instance. - * - * @param[in/out] inst the instance - * @param[in] use_avg use to specify use average parameter - */ - void calculate_scores(Instance * inst, bool use_avg); - - /* - * collect feature when given the tags index - * - * @param[in] inst the instance - * @param[in] tagsidx the tags index - * @param[out] vec the output sparse vector - */ - void collect_features(Instance * inst, - const std::vector & tagsidx, - ltp::math::SparseVec & vec); + /* + * extract features from one instance, + * + */ + void extract_features(Instance * inst, bool create = false); + + /* + * build words from tags for certain instance + * + * @param[in/out] inst the instance + * @param[out] words the output words + * @param[in] tagsidx the index of tags + * @param[in] begtag0 first of the word begin tag + * @param[in] begtag1 second of the word begin tag + */ + void build_entities(Instance * inst, + const std::vector & tagsidx, + std::vector & entities, + std::vector & entities_tags, + int beg_tag0, + int beg_tag1, + int beg_tag2); + + /* + * cache all the score for the certain instance. + * + * @param[in/out] inst the instance + * @param[in] use_avg use to specify use average parameter + */ + void calculate_scores(Instance * inst, bool use_avg); + + /* + * collect feature when given the tags index + * + * @param[in] inst the instance + * @param[in] tagsidx the tags index + * @param[out] vec the output sparse vector + */ + void collect_features(Instance * inst, + const std::vector & tagsidx, + ltp::math::SparseVec & vec); private: - bool __TRAIN__; - bool __TEST__; - bool __DUMP__; + bool __TRAIN__; + bool __TEST__; + bool __DUMP__; private: - std::vector< Instance * > train_dat; + std::vector< Instance * > train_dat; protected: - Model * model; - Decoder * decoder; + Model * model; + Decoder * decoder; }; -} // end for namespace segmentor -} // end for namespace ltp +} // end for namespace segmentor +} // end for namespace ltp #endif // end for __LTP_NER_NER_H__ diff --git a/src/ner/ner_dll.cpp b/src/ner/ner_dll.cpp index a77833c51..d04761e71 100644 --- a/src/ner/ner_dll.cpp +++ b/src/ner/ner_dll.cpp @@ -11,87 +11,87 @@ class NERWrapper : public ltp::ner::NER { public: - NERWrapper() : - beg_tag0(-1), - beg_tag1(-1) {} + NERWrapper() + : beg_tag0(-1), + beg_tag1(-1) {} - ~NERWrapper() {} + ~NERWrapper() {} - bool load(const char * model_file) { - std::ifstream mfs(model_file, std::ifstream::binary); + bool load(const char * model_file) { + std::ifstream mfs(model_file, std::ifstream::binary); - if (!mfs) { - return false; - } - - model = new ltp::ner::Model; - if (!model->load(mfs)) { - delete model; - return false; - } - - // beg_tag0 = model->labels.index( ); - // beg_tag1 = model->labels.index( ); + if (!mfs) { + return false; + } - return true; + model = new ltp::ner::Model; + if (!model->load(mfs)) { + delete model; + return false; } - int recognize(const std::vector & words, - const std::vector & postags, - std::vector & tags) { - ltp::ner::rulebase::RuleBase base(model->labels); - ltp::ner::Decoder deco(model->num_labels(), base); + // beg_tag0 = model->labels.index( ); + // beg_tag1 = model->labels.index( ); - ltp::ner::Instance * inst = new ltp::ner::Instance; - if (words.size() != postags.size()) { - return 0; - } + return true; + } - for (int i = 0; i < words.size(); ++ i) { - inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x(words[i])); - inst->postags.push_back(postags[i]); - } + int recognize(const std::vector & words, + const std::vector & postags, + std::vector & tags) { + ltp::ner::rulebase::RuleBase base(model->labels); + ltp::ner::Decoder deco(model->num_labels(), base); - ltp::ner::NER::extract_features(inst); - ltp::ner::NER::calculate_scores(inst, true); - deco.decode(inst); + ltp::ner::Instance * inst = new ltp::ner::Instance; + if (words.size() != postags.size()) { + return 0; + } + + for (int i = 0; i < words.size(); ++ i) { + inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x(words[i])); + inst->postags.push_back(postags[i]); + } - for (int i = 0; i < words.size(); ++ i) { - tags.push_back(model->labels.at(inst->predicted_tagsidx[i])); - } + ltp::ner::NER::extract_features(inst); + ltp::ner::NER::calculate_scores(inst, true); + deco.decode(inst); - delete inst; - return tags.size(); + for (int i = 0; i < words.size(); ++ i) { + tags.push_back(model->labels.at(inst->predicted_tagsidx[i])); } + delete inst; + return tags.size(); + } + private: - int beg_tag0; - int beg_tag1; + int beg_tag0; + int beg_tag1; }; void * ner_create_recognizer(const char * path) { - NERWrapper * wrapper = new NERWrapper(); + NERWrapper * wrapper = new NERWrapper(); - if (!wrapper->load(path)) { - return 0; - } + if (!wrapper->load(path)) { + return 0; + } - return reinterpret_cast(wrapper); + return reinterpret_cast(wrapper); } int ner_release_recognizer(void * ner) { - if (!ner) { - return -1; - } - delete reinterpret_cast(ner); - return 0; + if (!ner) { + return -1; + } + delete reinterpret_cast(ner); + return 0; } int ner_recognize(void * ner, - const std::vector & words, - const std::vector & postags, - std::vector & tags) { - NERWrapper * wrapper = 0; - wrapper = reinterpret_cast(ner); - return wrapper->recognize(words, postags, tags); + const std::vector & words, + const std::vector & postags, + std::vector & tags) { + NERWrapper * wrapper = 0; + wrapper = reinterpret_cast(ner); + return wrapper->recognize(words, postags, tags); } diff --git a/src/ner/ner_dll.h b/src/ner/ner_dll.h index a1c5f125f..cbc1c3aab 100644 --- a/src/ner/ner_dll.h +++ b/src/ner/ner_dll.h @@ -31,7 +31,7 @@ NER_DLL_API void * ner_create_recognizer(const char * path); * @param[in] segmentor the segmentor * @return int i don't know */ -NER_DLL_API int ner_release_recognizer(void * ner); +NER_DLL_API int ner_release_recognizer(void * ner); /* * run segment on the given segmentor @@ -41,8 +41,8 @@ NER_DLL_API int ner_release_recognizer(void * ner); * @return int the number of word tokens */ NER_DLL_API int ner_recognize(void * ner, - const std::vector & words, - const std::vector & postags, - std::vector & tags); + const std::vector & words, + const std::vector & postags, + std::vector & tags); #endif // end for __LTP_NER_DLL_H__ diff --git a/src/ner/nerio.h b/src/ner/nerio.h index e99bd769e..35489cf27 100644 --- a/src/ner/nerio.h +++ b/src/ner/nerio.h @@ -15,115 +15,115 @@ using namespace ltp::strutils; class NERReader { public: - NERReader(istream & _ifs, bool _train = false, int _style = 4) : - ifs(_ifs), - train(_train), - style(_style) {} - - Instance * next() { - if (ifs.eof()) { - return 0; - } - - Instance * inst = new Instance; - std::string line; - - std::getline(ifs, line); - strutils::chomp(line); - - if (line.size() == 0) { + NERReader(istream & _ifs, bool _train = false, int _style = 4) + : ifs(_ifs), + train(_train), + style(_style) {} + + Instance * next() { + if (ifs.eof()) { + return 0; + } + + Instance * inst = new Instance; + std::string line; + + std::getline(ifs, line); + strutils::chomp(line); + + if (line.size() == 0) { + delete inst; + return 0; + } + + std::vector words = split(line); + int found; + + for (int i = 0; i < words.size(); ++ i) { + if (train) { + found = words[i].find_last_of('#'); + if (found != std::string::npos) { + std::string tag = words[i].substr(found + 1); + inst->tags.push_back(tag); + words[i] = words[i].substr(0, found); + + found = words[i].find_last_of('/'); + if (found != std::string::npos) { + std::string postag = words[i].substr(found + 1); + inst->postags.push_back(postag); + words[i] = words[i].substr(0, found); + + inst->raw_forms.push_back(words[i]); + inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); + } else { delete inst; return 0; + } + } else { + delete inst; + return 0; } - - std::vector words = split(line); - int found; - - for (int i = 0; i < words.size(); ++ i) { - if (train) { - found = words[i].find_last_of('#'); - if (found != std::string::npos) { - std::string tag = words[i].substr(found + 1); - inst->tags.push_back(tag); - words[i] = words[i].substr(0, found); - - found = words[i].find_last_of('/'); - if (found != std::string::npos) { - std::string postag = words[i].substr(found + 1); - inst->postags.push_back(postag); - words[i] = words[i].substr(0, found); - - inst->raw_forms.push_back(words[i]); - inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); - } else { - delete inst; - return 0; - } - } else { - delete inst; - return 0; - } - } else { - found = words[i].find_last_of('/'); - if (found != std::string::npos) { - std::string postag = words[i].substr(found + 1); - inst->postags.push_back(postag); - words[i] = words[i].substr(0, found); - - inst->raw_forms.push_back(words[i]); - inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); - } else { - delete inst; - return 0; - } - } + } else { + found = words[i].find_last_of('/'); + if (found != std::string::npos) { + std::string postag = words[i].substr(found + 1); + inst->postags.push_back(postag); + words[i] = words[i].substr(0, found); + + inst->raw_forms.push_back(words[i]); + inst->forms.push_back(strutils::chartypes::sbc2dbc_x(words[i])); + } else { + delete inst; + return 0; } + } + } - return inst; + return inst; } private: - istream & ifs; - int style; - bool train; + istream & ifs; + int style; + bool train; }; class NERWriter { public: - NERWriter(std::ostream & _ofs) : ofs(_ofs) {} - - void write(const Instance * inst) { - int len = inst->size(); - if (inst->predicted_tags.size() != len) { - return; - } - - for (int i = 0; i < len; ++ i) { - ofs << inst->forms[i] - << "/" << inst->postags[i] - << "#" << inst->predicted_tags[i]; - if (i + 1 < len ) { - ofs << "\t"; - } else { - ofs << std::endl; - } - } + NERWriter(std::ostream & _ofs) : ofs(_ofs) {} + + void write(const Instance * inst) { + int len = inst->size(); + if (inst->predicted_tags.size() != len) { + return; + } + + for (int i = 0; i < len; ++ i) { + ofs << inst->forms[i] + << "/" << inst->postags[i] + << "#" << inst->predicted_tags[i]; + if (i + 1 < len ) { + ofs << "\t"; + } else { + ofs << std::endl; + } + } } - void debug(const Instance * inst, bool show_feat = false) { - int len = inst->size(); + void debug(const Instance * inst, bool show_feat = false) { + int len = inst->size(); - for (int i = 0; i < len; ++ i) { - ofs << inst->forms[i] - << "\t" << inst->postags[i] - << "\t" << inst->tagsidx[i] - << "\t" << inst->predicted_tagsidx[i] - << std::endl; - } + for (int i = 0; i < len; ++ i) { + ofs << inst->forms[i] + << "\t" << inst->postags[i] + << "\t" << inst->tagsidx[i] + << "\t" << inst->predicted_tagsidx[i] + << std::endl; + } } private: - std::ostream & ofs; + std::ostream & ofs; }; -} // end for namespace ner -} // end for namespace ltp -#endif // end for __LTP_SEGMENTOR_WRITER_H__ +} // end for namespace ner +} // end for namespace ltp +#endif // end for __LTP_SEGMENTOR_WRITER_H__ diff --git a/src/ner/options.h b/src/ner/options.h index e2a9606ac..652bf6773 100644 --- a/src/ner/options.h +++ b/src/ner/options.h @@ -7,26 +7,26 @@ namespace ltp { namespace ner { struct ModelOptions { - std::string model_file; + std::string model_file; }; struct TrainOptions { - std::string train_file; - std::string holdout_file; - std::string model_name; - std::string algorithm; - int max_iter; - int display_interval; + std::string train_file; + std::string holdout_file; + std::string model_name; + std::string algorithm; + int max_iter; + int display_interval; }; struct TestOptions { - std::string test_file; - std::string model_file; - std::string lexicon_file; + std::string test_file; + std::string model_file; + std::string lexicon_file; }; struct DumpOptions { - std::string model_file; + std::string model_file; }; extern ModelOptions model_opt; @@ -34,7 +34,7 @@ extern TrainOptions train_opt; extern TestOptions test_opt; extern DumpOptions dump_opt; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp -#endif // end for __LTP_NER_OPTIONS_H__ +#endif // end for __LTP_NER_OPTIONS_H__ diff --git a/src/ner/otner.cpp b/src/ner/otner.cpp index 1a5c0f3c0..6b3763a25 100644 --- a/src/ner/otner.cpp +++ b/src/ner/otner.cpp @@ -7,27 +7,28 @@ using namespace ltp::utility; using namespace ltp::ner; void usage(void) { - std::cerr << "otcws - Training and testing suite for Chinese Word segmentation" << std::endl; - std::cerr << "Copyright (C) 2012-2013 HIT-SCIR" << std::endl; - std::cerr << std::endl; - std::cerr << "usage: ./otcws " << std::endl; - std::cerr << std::endl; + std::cerr << "otcws - Training and testing suite for Named Entity Recognization" + << std::endl; + std::cerr << "Copyright (C) 2012-2014 HIT-SCIR" << std::endl; + std::cerr << std::endl; + std::cerr << "usage: ./otner " << std::endl; + std::cerr << std::endl; } int main(int argc, const char * argv[]) { - if (argc < 2 || (argv[1][0] == '-' && argv[1][1] == 'h')) { - usage(); - return -1; - } + if (argc < 2 || (argv[1][0] == '-' && argv[1][1] == 'h')) { + usage(); + return -1; + } - ConfigParser cfg(argv[1]); + ConfigParser cfg(argv[1]); - if (!cfg) { - ERROR_LOG("Failed to parse config file."); - return -1; - } + if (!cfg) { + ERROR_LOG("Failed to parse config file."); + return -1; + } - NER engine(cfg); - engine.run(); - return 0; + NER engine(cfg); + engine.run(); + return 0; } diff --git a/src/ner/parameter.h b/src/ner/parameter.h index a3ec971b1..cfd469b0c 100644 --- a/src/ner/parameter.h +++ b/src/ner/parameter.h @@ -12,151 +12,151 @@ using namespace ltp::math; class Parameters { public: - int _dim; - double * _W; - double * _W_sum; - int * _W_time; - - Parameters() : - _dim(0), - _W(0), - _W_sum(0), - _W_time(0) {} - - ~Parameters() { - dealloc(); + int _dim; + double * _W; + double * _W_sum; + int * _W_time; + + Parameters() : + _dim(0), + _W(0), + _W_sum(0), + _W_time(0) {} + + ~Parameters() { + dealloc(); + } + + void realloc(int dim) { + dealloc(); + _dim = dim; + + if (dim > 0) { + _W = new double[dim]; + _W_sum = new double[dim]; + _W_time = new int[dim]; } - void realloc(int dim) { - dealloc(); - _dim = dim; - - if (dim > 0) { - _W = new double[dim]; - _W_sum = new double[dim]; - _W_time = new int[dim]; - } - - for (int i = 0; i < dim; ++ i) { - _W[i] = 0; - _W_sum[i] = 0; - _W_time[i] = 0; - } + for (int i = 0; i < dim; ++ i) { + _W[i] = 0; + _W_sum[i] = 0; + _W_time[i] = 0; } - - void dealloc() { - if (_W && _W == _W_sum) { - delete [](_W); - _W = 0; - _W_sum = 0; - } else { - if (_W) { - delete [](_W); - _W = 0; - } - if (_W_sum) { - delete [](_W_sum); - _W_sum = 0; - } - } - - if (_W_time) { - delete [](_W_time); - _W_time = 0; - } + } + + void dealloc() { + if (_W && _W == _W_sum) { + delete [](_W); + _W = 0; + _W_sum = 0; + } else { + if (_W) { + delete [](_W); + _W = 0; + } + if (_W_sum) { + delete [](_W_sum); + _W_sum = 0; + } } - void add(int idx, int now, double scale = 1.) { - int elapsed = now - _W_time[idx]; - double upd = scale; - double cur_val = _W[idx]; - - _W[idx] = cur_val + upd; - _W_sum[idx] += elapsed * cur_val + upd; - _W_time[idx] = now; + if (_W_time) { + delete [](_W_time); + _W_time = 0; } - - void add(const SparseVec & vec, int now, double scale = 1.) { - for (SparseVec::const_iterator itx = vec.begin(); - itx != vec.end(); - itx ++) { - int idx = itx->first; - int elapsed = now - _W_time[idx]; - double upd = scale * itx->second; - double cur_val = _W[idx]; - - _W[idx] = cur_val + upd; - _W_sum[idx] += elapsed * cur_val + upd; - _W_time[idx] = now; - } + } + + void add(int idx, int now, double scale = 1.) { + int elapsed = now - _W_time[idx]; + double upd = scale; + double cur_val = _W[idx]; + + _W[idx] = cur_val + upd; + _W_sum[idx] += elapsed * cur_val + upd; + _W_time[idx] = now; + } + + void add(const SparseVec & vec, int now, double scale = 1.) { + for (SparseVec::const_iterator itx = vec.begin(); + itx != vec.end(); + ++ itx) { + int idx = itx->first; + int elapsed = now - _W_time[idx]; + double upd = scale * itx->second; + double cur_val = _W[idx]; + + _W[idx] = cur_val + upd; + _W_sum[idx] += elapsed * cur_val + upd; + _W_time[idx] = now; } - - double dot(const SparseVec & vec, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - double ret = 0.; - for (SparseVec::const_iterator itx = vec.begin(); - itx != vec.end(); - ++ itx) { - ret += p[itx->first] * itx->second; - } - return ret; + } + + double dot(const SparseVec & vec, bool use_avg = false) const { + const double * const p = (use_avg ? _W_sum : _W); + double ret = 0.; + for (SparseVec::const_iterator itx = vec.begin(); + itx != vec.end(); + ++ itx) { + ret += p[itx->first] * itx->second; } - - double dot(const FeatureVector * vec, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - double ret = 0.; - for (int i = 0; i < vec->n; ++ i) { - if (vec->val) { - ret += p[vec->idx[i] + vec->loff] * vec->val[i]; - } else { - ret += p[vec->idx[i] + vec->loff]; - } - } - return ret; + return ret; + } + + double dot(const FeatureVector * vec, bool use_avg = false) const { + const double * const p = (use_avg ? _W_sum : _W); + double ret = 0.; + for (int i = 0; i < vec->n; ++ i) { + if (vec->val) { + ret += p[vec->idx[i] + vec->loff] * vec->val[i]; + } else { + ret += p[vec->idx[i] + vec->loff]; + } } - - double dot(const int idx, bool use_avg = false) const { - const double * const p = (use_avg ? _W_sum : _W); - return p[idx]; + return ret; + } + + double dot(const int idx, bool use_avg = false) const { + const double * const p = (use_avg ? _W_sum : _W); + return p[idx]; + } + + void flush(int now) { + for(int i = 0; i < _dim; ++i) { + _W_sum[i] += (now - _W_time[i]) * _W[i]; + _W_time[i] = now; } - - void flush(int now) { - for(int i = 0; i < _dim; ++i) { - _W_sum[i] += (now - _W_time[i]) * _W[i]; - _W_time[i] = now; - } + } + + void dump(std::ostream & out, bool use_avg = true) { + const double * p = (use_avg ? _W_sum : _W); + char chunk[16] = {'p', 'a', 'r', 'a', 'm', 0}; + out.write(chunk, 16); + out.write(reinterpret_cast(&_dim), sizeof(int)); + if (_dim > 0) { + out.write(reinterpret_cast(p), sizeof(double) * _dim); } + } - void dump(std::ostream & out, bool use_avg = true) { - const double * p = (use_avg ? _W_sum : _W); - char chunk[16] = {'p', 'a', 'r', 'a', 'm', 0}; - out.write(chunk, 16); - out.write(reinterpret_cast(&_dim), sizeof(int)); - if (_dim > 0) { - out.write(reinterpret_cast(p), sizeof(double) * _dim); - } + bool load(std::istream & in) { + char chunk[16]; + in.read(chunk, 16); + if (strcmp(chunk, "param")) { + return false; } - bool load(std::istream & in) { - char chunk[16]; - in.read(chunk, 16); - if (strcmp(chunk, "param")) { - return false; - } - - in.read(reinterpret_cast(&_dim), sizeof(int)); - if (_dim > 0) { - _W = new double[_dim]; - in.read(reinterpret_cast(_W), sizeof(double) * _dim); - _W_sum = _W; - } - - return true; + in.read(reinterpret_cast(&_dim), sizeof(int)); + if (_dim > 0) { + _W = new double[_dim]; + in.read(reinterpret_cast(_W), sizeof(double) * _dim); + _W_sum = _W; } + + return true; + } }; -} // end for namespace ner -} // end for namespace ltp +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_PARAMETER_H__ diff --git a/src/ner/rulebase.h b/src/ner/rulebase.h index d79447df1..115759070 100644 --- a/src/ner/rulebase.h +++ b/src/ner/rulebase.h @@ -17,86 +17,86 @@ namespace rulebase { class RuleBase { public: - RuleBase(utility::IndexableSmartMap & labels) { - // only 4 tag style is supported + RuleBase(utility::IndexableSmartMap & labels) { + // only 4 tag style is supported - std::stringstream S; + std::stringstream S; - __trans__ = 0; - // b - S.str(std::string()); S << __pos_types__[0] << "-" << __ne_types__[0]; - __b_idx__ = prefix( labels.index(S.str()) ) ; + __trans__ = 0; + // b + S.str(std::string()); S << __pos_types__[0] << "-" << __ne_types__[0]; + __b_idx__ = prefix( labels.index(S.str()) ) ; - S.str(std::string()); S << __pos_types__[1] << "-" << __ne_types__[0]; - __i_idx__ = prefix( labels.index(S.str()) ); + S.str(std::string()); S << __pos_types__[1] << "-" << __ne_types__[0]; + __i_idx__ = prefix( labels.index(S.str()) ); - S.str(std::string()); S << __pos_types__[2] << "-" << __ne_types__[0]; - __e_idx__ = prefix( labels.index(S.str()) ); + S.str(std::string()); S << __pos_types__[2] << "-" << __ne_types__[0]; + __e_idx__ = prefix( labels.index(S.str()) ); - S.str(std::string()); S << __pos_types__[3] << "-" << __ne_types__[0]; - __s_idx__ = prefix( labels.index(S.str()) ); - __o_idx__ = prefix( labels.index("O") ); + S.str(std::string()); S << __pos_types__[3] << "-" << __ne_types__[0]; + __s_idx__ = prefix( labels.index(S.str()) ); + __o_idx__ = prefix( labels.index("O") ); - if (__s_idx__>=0 && __b_idx__>=0 && __i_idx__>=0 && __e_idx__>=0 && __o_idx__>=0) { - __trans__ |= (1<<((__s_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__s_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__s_idx__<<3) + __o_idx__)); + if (__s_idx__>=0 && __b_idx__>=0 && __i_idx__>=0 && __e_idx__>=0 && __o_idx__>=0) { + __trans__ |= (1<<((__s_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__s_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__s_idx__<<3) + __o_idx__)); - __trans__ |= (1<<((__b_idx__<<3) + __i_idx__)); - __trans__ |= (1<<((__b_idx__<<3) + __e_idx__)); + __trans__ |= (1<<((__b_idx__<<3) + __i_idx__)); + __trans__ |= (1<<((__b_idx__<<3) + __e_idx__)); - __trans__ |= (1<<((__i_idx__<<3) + __i_idx__)); - __trans__ |= (1<<((__i_idx__<<3) + __e_idx__)); + __trans__ |= (1<<((__i_idx__<<3) + __i_idx__)); + __trans__ |= (1<<((__i_idx__<<3) + __e_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__e_idx__<<3) + __o_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__e_idx__<<3) + __o_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __s_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __b_idx__)); - __trans__ |= (1<<((__o_idx__<<3) + __o_idx__)); - } else { - __trans__ = 0xffff; - } + __trans__ |= (1<<((__o_idx__<<3) + __s_idx__)); + __trans__ |= (1<<((__o_idx__<<3) + __b_idx__)); + __trans__ |= (1<<((__o_idx__<<3) + __o_idx__)); + } else { + __trans__ = 0xffff; } - - ~RuleBase() { - } - - inline bool legal_trans(int prev, int curr) { - int prev_prefix = prefix(prev); - int prev_suffix = suffix(prev); - int curr_prefix = prefix(curr); - int curr_suffix = suffix(curr); - - if (prev_prefix == __b_idx__ || prev_prefix == __i_idx__) { - return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0 - && (prev_suffix == curr_suffix)); - } else { - return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0); - } + } + + ~RuleBase() { + } + + inline bool legal_trans(int prev, int curr) { + int prev_prefix = prefix(prev); + int prev_suffix = suffix(prev); + int curr_prefix = prefix(curr); + int curr_suffix = suffix(curr); + + if (prev_prefix == __b_idx__ || prev_prefix == __i_idx__) { + return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0 + && (prev_suffix == curr_suffix)); + } else { + return ((__trans__ & (1<<((prev_prefix<<3) + curr_prefix))) > 0); } + } private: - unsigned __trans__; + unsigned __trans__; - int __s_idx__; - int __b_idx__; - int __i_idx__; - int __e_idx__; - int __o_idx__; + int __s_idx__; + int __b_idx__; + int __i_idx__; + int __e_idx__; + int __o_idx__; - inline int prefix(int tag) { - return (tag / __num_ne_types__); - } + inline int prefix(int tag) { + return (tag / __num_ne_types__); + } - inline int suffix(int tag) { - return (tag % __num_ne_types__); - } + inline int suffix(int tag) { + return (tag % __num_ne_types__); + } }; -} // end for rulebase -} // end for namespace ner -} // end for namespace ltp +} // end for rulebase +} // end for namespace ner +} // end for namespace ltp #endif // end for __LTP_NER_RULE_BASE_H__ diff --git a/src/parser/collections.cpp b/src/parser/collections.cpp index 84322efe0..c5c53b2be 100644 --- a/src/parser/collections.cpp +++ b/src/parser/collections.cpp @@ -4,83 +4,90 @@ namespace ltp { namespace parser { DictionaryCollections::DictionaryCollections(int num_dicts) : - idx(0) { - dicts.resize( num_dicts ); + idx(0) { + dicts.resize( num_dicts ); - for (int i = 0; i < num_dicts; ++ i) { - dicts[i] = new Dictionary( this ); - } + for (int i = 0; i < num_dicts; ++ i) { + dicts[i] = new Dictionary( this ); + } } DictionaryCollections::~DictionaryCollections() { - for (int i = 0; i < dicts.size(); ++ i) { - delete dicts[i]; - } + for (int i = 0; i < dicts.size(); ++ i) { + delete dicts[i]; + } } -Dictionary * DictionaryCollections::getDictionary(int i) { - if (i < dicts.size()) { - return dicts[i]; - } +Dictionary * +DictionaryCollections::getDictionary(int i) { + if (i < dicts.size()) { + return dicts[i]; + } - return NULL; -} -int DictionaryCollections::retrieve(int tid, const char * key, bool create) { - return dicts[tid]->retrieve(key, create); + return NULL; } -size_t DictionaryCollections::dim() const { - return idx; +int +DictionaryCollections::retrieve(int tid, const char * key, bool create) { + return dicts[tid]->retrieve(key, create); } -int DictionaryCollections::size() { - return dicts.size(); +size_t +DictionaryCollections::dim() const { + return idx; } -void DictionaryCollections::dump(ostream & out) { - char chunk[32]; - unsigned int sz = dicts.size(); - strncpy(chunk, "collections", 16); - - out.write(chunk, 16); - out.write(reinterpret_cast(&idx), sizeof(int)); - out.write(reinterpret_cast(&sz), sizeof(unsigned int)); - for (int i = 0; i < dicts.size(); ++ i) { - // strncpy(chunk, dicts[i]->dict_name.c_str(), 32); - // out.write(chunk, 32); - - dicts[i]->database.dump(out); - } +int +DictionaryCollections::size() { + return dicts.size(); } -bool DictionaryCollections::load(istream & in) { - char chunk[32]; - unsigned int sz; +void +DictionaryCollections::dump(ostream & out) { + char chunk[32]; + unsigned int sz = dicts.size(); + strncpy(chunk, "collections", 16); + + out.write(chunk, 16); + out.write(reinterpret_cast(&idx), sizeof(int)); + out.write(reinterpret_cast(&sz), sizeof(unsigned int)); + for (int i = 0; i < dicts.size(); ++ i) { + // strncpy(chunk, dicts[i]->dict_name.c_str(), 32); + // out.write(chunk, 32); + + dicts[i]->database.dump(out); + } +} - in.read(chunk, 16); - if (strcmp(chunk, "collections")) { - return false; - } +bool +DictionaryCollections::load(istream & in) { + char chunk[32]; + unsigned int sz; - in.read(reinterpret_cast(&idx), sizeof(int)); - in.read(reinterpret_cast(&sz), sizeof(unsigned int)); + in.read(chunk, 16); + if (strcmp(chunk, "collections")) { + return false; + } - if (sz != dicts.size()) { - return false; - } + in.read(reinterpret_cast(&idx), sizeof(int)); + in.read(reinterpret_cast(&sz), sizeof(unsigned int)); - for (unsigned i = 0; i < sz; ++ i) { - // in.read(chunk, 32); + if (sz != dicts.size()) { + return false; + } - // Dictionary * dict = new Dictionary(this); - if (!dicts[i]->database.load(in)) { - return false; - } + for (unsigned i = 0; i < sz; ++ i) { + // in.read(chunk, 32); - // dicts[i].push_back(dict); + // Dictionary * dict = new Dictionary(this); + if (!dicts[i]->database.load(in)) { + return false; } - return true; + // dicts[i].push_back(dict); + } + + return true; } } // end for namespace parser diff --git a/src/parser/collections.h b/src/parser/collections.h index 609025d30..c5c579a33 100644 --- a/src/parser/collections.h +++ b/src/parser/collections.h @@ -1,5 +1,5 @@ -#ifndef __DICT_COLLECTIONS_H__ -#define __DICT_COLLECTIONS_H__ +#ifndef __LTP_PARSER_DICT_COLLECTIONS_H__ +#define __LTP_PARSER_DICT_COLLECTIONS_H__ #include #include @@ -23,101 +23,101 @@ class Dictionary; // a index counter is shared within several dictionary. class DictionaryCollections { public: - DictionaryCollections(int num_dicts); - ~DictionaryCollections(); - - /* - * Dump the dictionary collections into output stream - * - * @param[out] out the output stream - */ - void dump(ostream & out); - - /* - * Load the dictionary collections from input stream, - * return true if dictionary successfully loaded, otherwise - * false. - * - * @param[in] in the input stream - * @return bool true on success, otherwise false. - */ - bool load(istream & in); - - /* - * Get the size of dictionary collections - * - * @return size_t the size of the dictionary - */ - size_t dim() const; - - /* - * Retrieve the certain key in one of the dictionaries in this - * collection. If create is specified, this key is created on - * the condition that it is not in the dictionary. Return the - * index of the key, -1 on failure - * - * @param[in] tid the index of the dictionary - * @param[in] key the key - * @param[in] create insert the key to dictionary if create - * if true. - * @return int the index of the key, -1 on failure. - */ - int retrieve(int tid, const char * key, bool create); - - /* - * Get the ith Dictionary - * - * @param[in] i the index of the dictionary - * @return Dictionary * the dictionary - */ - Dictionary * getDictionary(int i); - - /* - * Get size of dicts - * - * @return int the size of the dictionary - */ - int size(); + DictionaryCollections(int num_dicts); + ~DictionaryCollections(); + + /* + * Dump the dictionary collections into output stream + * + * @param[out] out the output stream + */ + void dump(ostream & out); + + /* + * Load the dictionary collections from input stream, + * return true if dictionary successfully loaded, otherwise + * false. + * + * @param[in] in the input stream + * @return bool true on success, otherwise false. + */ + bool load(istream & in); + + /* + * Get the size of dictionary collections + * + * @return size_t the size of the dictionary + */ + size_t dim() const; + + /* + * Retrieve the certain key in one of the dictionaries in this + * collection. If create is specified, this key is created on + * the condition that it is not in the dictionary. Return the + * index of the key, -1 on failure + * + * @param[in] tid the index of the dictionary + * @param[in] key the key + * @param[in] create insert the key to dictionary if create + * if true. + * @return int the index of the key, -1 on failure. + */ + int retrieve(int tid, const char * key, bool create); + + /* + * Get the ith Dictionary + * + * @param[in] i the index of the dictionary + * @return Dictionary * the dictionary + */ + Dictionary * getDictionary(int i); + + /* + * Get size of dicts + * + * @return int the size of the dictionary + */ + int size(); public: - int idx; /*< the shared index among dictionaries */ + int idx; /*< the shared index among dictionaries */ private: - vector dicts; + vector dicts; }; // the dictionary class // it's wrapper of class SmartMap class Dictionary { public: - Dictionary(DictionaryCollections * coll): - collections(coll) {} - - //StringMap database; - SmartMap database; - DictionaryCollections * collections; - - inline int retrieve(const char * key, bool create) { - int val; - - if (database.get(key, val)) { - return val; - } else { - if (create) { - val = collections->idx; - database.set(key, val); - // database.unsafe_set(key, val); - ++ collections->idx; - return val; - } - } - - return -1; + Dictionary(DictionaryCollections * coll): + collections(coll) {} + + //StringMap database; + SmartMap database; + DictionaryCollections * collections; + + inline int retrieve(const char * key, bool create) { + int val; + + if (database.get(key, val)) { + return val; + } else { + if (create) { + val = collections->idx; + database.set(key, val); + // database.unsafe_set(key, val); + ++ collections->idx; + return val; + } } - inline int size() { - return database.size(); - } + return -1; + } + + inline int size() { + return database.size(); + } }; // labelcollections is a bi-direction map. @@ -126,6 +126,6 @@ class Dictionary { // * string key -> int index // * int index -> string key // -} // end for namespace parser -} // end for namespace ltp -#endif // end for __FEATURE_COLLECTIONS_H__ +} // end for namespace parser +} // end for namespace ltp +#endif // end for __LTP_PARSER_DICT_COLLECTIONS_H__ diff --git a/src/parser/conllreader.h b/src/parser/conllreader.h index 755b755e6..a8ac9ae61 100644 --- a/src/parser/conllreader.h +++ b/src/parser/conllreader.h @@ -1,5 +1,5 @@ -#ifndef __CONLL_READER_H__ -#define __CONLL_READER_H__ +#ifndef __LTP_PARSER_CONLL_READER_H__ +#define __LTP_PARSER_CONLL_READER_H__ #include #include @@ -20,80 +20,80 @@ using namespace ltp::strutils; class CoNLLReader { public: - /* - * Constructor for ConllReader - * Register a ifstream to the ConllReader - * - * @param f the reference to the ifstream - */ - CoNLLReader(ifstream& _f): f(_f) {} - ~CoNLLReader() {} - - /* - * Get next instance from ifstream buffer - */ - Instance * next() { - if (f.eof()) { - return NULL; - } - - Instance * inst = new Instance; - string line; - - inst->forms.push_back( ROOT_FORM ); - inst->lemmas.push_back( ROOT_LEMMA ); - inst->postags.push_back( ROOT_POSTAG ); - inst->heads.push_back( -1 ); - - if (model_opt.labeled) { - inst->deprels.push_back( ROOT_DEPREL ); - } - inst->chars.push_back( vector() ); - - while (!f.eof()) { - getline(f, line); - chomp(line); - - if (line.size() == 0) { - break; - } - - vector items = split(line); - if (items.size() != 10) { - WARNING_LOG("Unknown conll format file"); - } - - inst->forms.push_back( items[1] ); // items[1]: form - inst->lemmas.push_back( items[2] ); // items[2]: lemma - inst->postags.push_back( items[3] ); // items[4]: postag - inst->heads.push_back( to_int(items[6]) ); - - if (model_opt.labeled) { - inst->deprels.push_back( items[7] ); - } - - vector chars; - codecs::decode(items[1], chars); - inst->chars.push_back( chars ); - } - - if (inst->forms.size() == 1) { - delete inst; - inst = NULL; - } - return inst; + /* + * Constructor for ConllReader + * Register a ifstream to the ConllReader + * + * @param f the reference to the ifstream + */ + CoNLLReader(ifstream& _f): f(_f) {} + ~CoNLLReader() {} + + /* + * Get next instance from ifstream buffer + */ + Instance * next() { + if (f.eof()) { + return NULL; } - /* - * Reader reach the end of the file - */ - bool eof() { - return f.eof(); + Instance * inst = new Instance; + string line; + + inst->forms.push_back( ROOT_FORM ); + inst->lemmas.push_back( ROOT_LEMMA ); + inst->postags.push_back( ROOT_POSTAG ); + inst->heads.push_back( -1 ); + + if (model_opt.labeled) { + inst->deprels.push_back( ROOT_DEPREL ); + } + inst->chars.push_back( vector() ); + + while (!f.eof()) { + getline(f, line); + chomp(line); + + if (line.size() == 0) { + break; + } + + vector items = split(line); + if (items.size() != 10) { + WARNING_LOG("Unknown conll format file"); + } + + inst->forms.push_back( items[1] ); // items[1]: form + inst->lemmas.push_back( items[2] ); // items[2]: lemma + inst->postags.push_back( items[3] ); // items[4]: postag + inst->heads.push_back( to_int(items[6]) ); + + if (model_opt.labeled) { + inst->deprels.push_back( items[7] ); + } + + vector chars; + codecs::decode(items[1], chars); + inst->chars.push_back( chars ); + } + + if (inst->forms.size() == 1) { + delete inst; + inst = NULL; } + return inst; + } + + /* + * Reader reach the end of the file + */ + bool eof() { + return f.eof(); + } private: - ifstream& f; + ifstream& f; }; // end for ConllReader } // end for parser } // end for namespace ltp -#endif // end for __CONLL_READER_H__ +#endif // end for __LTP_PARSER_CONLL_READER_H__ diff --git a/src/parser/conllwriter.h b/src/parser/conllwriter.h index f3fc94fb8..0d4e91a4f 100644 --- a/src/parser/conllwriter.h +++ b/src/parser/conllwriter.h @@ -1,5 +1,5 @@ -#ifndef __CONLL_WRITER_H__ -#define __CONLL_WRITER_H__ +#ifndef __LTP_PARSER_CONLL_WRITER_H__ +#define __LTP_PARSER_CONLL_WRITER_H__ #include @@ -13,47 +13,47 @@ using namespace ltp::strutils; class CoNLLWriter { public: - CoNLLWriter(std::ostream& _f): f(_f) {} - ~CoNLLWriter() {} - - void write(const Instance * inst) { - int len = inst->size(); - bool predicted = (inst->predicted_heads.size() > 0 && - inst->predicted_heads.size() == len); - bool predicted_label = (inst->predicted_deprels.size() > 0 && - inst->predicted_deprels.size() == len); - - for (int i = 1; i < inst->size(); ++ i) { - f << i - << "\t" // 0 - index - << inst->forms[i] - << "\t" // 1 - form - << inst->lemmas[i] - << "\t" // 2 - lemma - << inst->postags[i] - << "\t" // 3 - postag - << "_" - << "\t" // 4 - unknown - << "_" - << "\t" // 5 - unknown - << inst->heads[i] - << "\t" // 6 - heads - << inst->deprels[i] - << "\t" // 7 - deprels - << (predicted ? to_str(inst->predicted_heads[i]) : "_") - << "\t" - << (predicted_label ? inst->predicted_deprels[i] : "_") - << endl; - } - - f << endl; + CoNLLWriter(std::ostream& _f): f(_f) {} + ~CoNLLWriter() {} + + void write(const Instance * inst) { + int len = inst->size(); + bool predicted = (inst->predicted_heads.size() > 0 + && inst->predicted_heads.size() == len); + bool predicted_label = (inst->predicted_deprels.size() > 0 + && inst->predicted_deprels.size() == len); + + for (int i = 1; i < inst->size(); ++ i) { + f << i + << "\t" // 0 - index + << inst->forms[i] + << "\t" // 1 - form + << inst->lemmas[i] + << "\t" // 2 - lemma + << inst->postags[i] + << "\t" // 3 - postag + << "_" + << "\t" // 4 - unknown + << "_" + << "\t" // 5 - unknown + << inst->heads[i] + << "\t" // 6 - heads + << inst->deprels[i] + << "\t" // 7 - deprels + << (predicted ? to_str(inst->predicted_heads[i]) : "_") + << "\t" + << (predicted_label ? inst->predicted_deprels[i] : "_") + << endl; } + + f << endl; + } private: - std::ostream& f; + std::ostream& f; }; // end for ConnllWriter } // end for parser } // end for namespace ltp -#endif // end for __CONLL_WRITER_H__ +#endif // end for __LTP_PARSER_CONLL_WRITER_H__ diff --git a/src/parser/decoder.h b/src/parser/decoder.h index d47e421ae..4d065d379 100644 --- a/src/parser/decoder.h +++ b/src/parser/decoder.h @@ -1,5 +1,5 @@ -#ifndef __DECODER_H__ -#define __DECODER_H__ +#ifndef __LTP_PARSER_DECODER_H__ +#define __LTP_PARSER_DECODER_H__ #include "instance.h" #include "settings.h" @@ -12,166 +12,169 @@ namespace ltp { namespace parser { -// data struct for decode chart item. Provide several construction +// data struct for decode chart item. Provide several construction // methods and bind certain type. class LatticeItem { public: - const int _g; /* grand */ - const int _s; /* from */ - const int _t; /*< the distance to */ - const int _comp; /*< specify if this span is complete */ - const int _label_s_t; /*< label type */ + const int _g; /* grand */ + const int _s; /* from */ + const int _t; /*< the distance to */ + const int _comp; /*< specify if this span is complete */ + const int _label_s_t; /*< label type */ - const LatticeItem * const _left; - const LatticeItem * const _right; + const LatticeItem * const _left; + const LatticeItem * const _right; - const double _prob; + const double _prob; public: - LatticeItem(const int comp, - const int g, - const int s, - const int t, - const double prob, - const LatticeItem * const left, - const LatticeItem * const right) : - _g(g), - _s(s), - _t(t), - _comp(comp), - _prob(prob), - _left(left), - _right(right), - _label_s_t(-1) { } - - LatticeItem(const int comp, - const int s, - const int t, - const double prob, - const LatticeItem * const left, - const LatticeItem * const right, - const int label_s_t = -1) : - _g(-1), - _s(s), - _t(t), - _comp(comp), - _prob(prob), - _left(left), - _right(right), - _label_s_t(label_s_t) { } - - // for span like C(s,s) - LatticeItem(const int g, - const int s) : - _g(g), - _s(s), - _t(s), - _prob(0.0), - _comp(CMP), - _left(0), - _right(0), - _label_s_t(-1) { } - - LatticeItem(const int s) : - _s(s), - _t(s), - _prob(0.0), - _comp(CMP), - _left(0), - _right(0), - _g(-1), - _label_s_t(-1) { } - - ~LatticeItem() {} + LatticeItem(const int comp, + const int g, + const int s, + const int t, + const double prob, + const LatticeItem * const left, + const LatticeItem * const right) : + _g(g), + _s(s), + _t(t), + _comp(comp), + _prob(prob), + _left(left), + _right(right), + _label_s_t(-1) { } + + LatticeItem(const int comp, + const int s, + const int t, + const double prob, + const LatticeItem * const left, + const LatticeItem * const right, + const int label_s_t = -1) : + _g(-1), + _s(s), + _t(t), + _comp(comp), + _prob(prob), + _left(left), + _right(right), + _label_s_t(label_s_t) { } + + // for span like C(s,s) + LatticeItem(const int g, + const int s) : + _g(g), + _s(s), + _t(s), + _prob(0.0), + _comp(CMP), + _left(0), + _right(0), + _label_s_t(-1) { } + + LatticeItem(const int s) : + _s(s), + _t(s), + _prob(0.0), + _comp(CMP), + _left(0), + _right(0), + _g(-1), + _label_s_t(-1) { } + + ~LatticeItem() {} private: - // forbidden construction - LatticeItem(const LatticeItem & rhs) : - _s(0), - _t(0), - _prob(0.0), - _comp(-1), - _left(0), - _right(0), - _label_s_t(-1), - _g(-1) { - std::cerr << "LatticeItem::LatticeItem(const LatticeItem & rhs) is not allowed" << std::endl; - exit(-1); - } - - LatticeItem & operator = (const LatticeItem & rhs) { - std::cerr << "LatticeItem::operator= (const LatticeItem & rhs) is not allowed" << std::endl; - exit(-1); - } + // forbidden construction + LatticeItem(const LatticeItem & rhs) : + _s(0), + _t(0), + _prob(0.0), + _comp(-1), + _left(0), + _right(0), + _label_s_t(-1), + _g(-1) { + std::cerr << "LatticeItem::LatticeItem(const LatticeItem & rhs) is not allowed" + << std::endl; + exit(-1); + } + + LatticeItem & operator = (const LatticeItem & rhs) { + std::cerr << "LatticeItem::operator= (const LatticeItem & rhs) is not allowed" + << std::endl; + exit(-1); + } }; class Decoder { public: - Decoder() {} - virtual ~Decoder() {} - - /* - * Decode the instance, this method is a controller, - * execute: - * - init lattice - * - decode projective - * - get result - * - free lattice - * in sequence. - * - * @param[in] inst the instance - */ - void decode(Instance * inst) { - init_lattice(inst); - decode_projective(inst); - get_result(inst); - free_lattice(); - } - - virtual void init_lattice(const Instance * inst) = 0; - virtual void decode_projective(const Instance * inst) = 0; - virtual void get_result(Instance * inst) = 0; - virtual void free_lattice() = 0; + Decoder() {} + virtual ~Decoder() {} + + /* + * Decode the instance, this method is a controller, + * execute: + * - init lattice + * - decode projective + * - get result + * - free lattice + * in sequence. + * + * @param[in] inst the instance + */ + void decode(Instance * inst) { + init_lattice(inst); + decode_projective(inst); + get_result(inst); + free_lattice(); + } + + virtual void init_lattice(const Instance * inst) = 0; + virtual void decode_projective(const Instance * inst) = 0; + virtual void get_result(Instance * inst) = 0; + virtual void free_lattice() = 0; protected: - void lattice_insert(const LatticeItem * &position, const LatticeItem * const item) { - if (position == NULL) { - position = item; - } else if (position->_prob < item->_prob - EPS) { - delete position; - position = item; - } else { - delete item; - } + void lattice_insert(const LatticeItem * &position, const LatticeItem * const item) { + if (position == NULL) { + position = item; + } else if (position->_prob < item->_prob - EPS) { + delete position; + position = item; + } else { + delete item; } + } - void __BUILD_TREE(Instance * inst, const LatticeItem * item) { - if (!item) { - return; - } - - __BUILD_TREE(inst, item->_left); + void __BUILD_TREE(Instance * inst, const LatticeItem * item) { + if (!item) { + return; + } - if (INCMP == item->_comp) { - inst->predicted_heads[item->_t] = item->_s; + __BUILD_TREE(inst, item->_left); - if (model_opt.labeled) { - inst->predicted_deprelsidx[item->_t] = item->_label_s_t; - } - } else if (CMP == item->_comp) { - // do nothing; - } else if (SIBSP == item->_comp) { - // do nothing - } else { - } + if (INCMP == item->_comp) { + inst->predicted_heads[item->_t] = item->_s; - __BUILD_TREE(inst, item->_right); + if (model_opt.labeled) { + inst->predicted_deprelsidx[item->_t] = item->_label_s_t; + } + } else if (CMP == item->_comp) { + // do nothing; + } else if (SIBSP == item->_comp) { + // do nothing + } else { + // do nothing } + __BUILD_TREE(inst, item->_right); + } + }; // end for class decoder } // end for namespace parser } // end for namespace ltp -#endif // end for __DECODER_H__ +#endif // end for __LTP_PARSER_DECODER_H__ diff --git a/src/parser/decoder1o.cpp b/src/parser/decoder1o.cpp index 441351249..73474eabe 100644 --- a/src/parser/decoder1o.cpp +++ b/src/parser/decoder1o.cpp @@ -4,172 +4,172 @@ namespace ltp { namespace parser { void Decoder1O::init_lattice(const Instance * inst) { - int len = inst->size(); - _lattice_cmp.resize(len, len); - _lattice_incmp.resize(len, len, L); + int len = inst->size(); + _lattice_cmp.resize(len, len); + _lattice_incmp.resize(len, len, L); - _lattice_cmp = NULL; - _lattice_incmp = NULL; + _lattice_cmp = NULL; + _lattice_incmp = NULL; - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i] = new LatticeItem(i); - } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i] = new LatticeItem(i); + } } void Decoder1O::decode_projective(const Instance * inst) { - int len = inst->size(); - - // instance_verify(inst); - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - _lattice_cmp[s][t] = NULL; - _lattice_cmp[t][s] = NULL; - for (int l = 0; l < L; ++ l) { - _lattice_incmp[s][t][l] = NULL; - _lattice_incmp[t][s][l] = NULL; + int len = inst->size(); + + // instance_verify(inst); + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + _lattice_cmp[s][t] = NULL; + _lattice_cmp[t][s] = NULL; + for (int l = 0; l < L; ++ l) { + _lattice_incmp[s][t][l] = NULL; + _lattice_incmp[t][s][l] = NULL; + } + + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[s][r]; + if (!left) { + continue; + } + + const LatticeItem * const right = _lattice_cmp[t][r+1]; + if (!right) { + continue; + } + + for (int l = 0; l < L; ++ l) { + + { // I(s,t) = C(s,r) + C(t,r+1) + double prob = (left->_prob + right->_prob); + + if (feat_opt.use_unlabeled_dependency) { + prob += inst->depu_scores[s][t]; } - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[s][r]; - if (!left) { - continue; - } - - const LatticeItem * const right = _lattice_cmp[t][r+1]; - if (!right) { - continue; - } - - for (int l = 0; l < L; ++ l) { - - { // I(s,t) = C(s,r) + C(t,r+1) - double prob = (left->_prob + right->_prob); - - if (feat_opt.use_unlabeled_dependency) { - prob += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - prob += inst->depl_scores[s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - prob, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t][l], item); - } - - if (s != 0) { // I(t,s) - double prob = (left->_prob + right->_prob); - - if (feat_opt.use_unlabeled_dependency) { - prob += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - prob += inst->depl_scores[t][s][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - prob, - left, - right, - l); - - // cerr << "INCMP " << t << "-" << s << "-" << l << endl; - lattice_insert(_lattice_incmp[t][s][l], item); - } // end for if (s != 0) - } // end for for (int l = 0; l < _L; ++ l) - } // end for for (int r = s; r < t; ++ r) - - for (int r = s; r <= t; ++ r) { - if (r != s) { // C(s,t) = I(s,r) + C(r,t) - const LatticeItem * const right = _lattice_cmp[r][t]; - if (!right) { - continue; - } - - for (int l = 0; l < L; ++ l) { - const LatticeItem * const left = _lattice_incmp[s][r][l]; - if (!left) { - continue; - } - - const double prob = left->_prob + right->_prob; - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - prob, - left, - right); - - // cerr << "CMP " << s << "-" << t << endl; - lattice_insert(_lattice_cmp[s][t], item); - } - } // end for if (r != s) - - if (r != t && s != 0) { // C(t,s) = I(t,r) + C(r,s) - const LatticeItem * const left = _lattice_cmp[r][s]; - if (!left) { - continue; - } - - for (int l = 0; l < L; ++ l) { - const LatticeItem * const right = _lattice_incmp[t][r][l]; - if (!right) { - continue; - } - - const double prob = left->_prob + right->_prob; - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - prob, - left, - right); - - // cerr << "CMP " << t << "-" << s << endl; - lattice_insert(_lattice_cmp[t][s], item); - } // end for for (int l = 0; l < L; ++ l) - } // end for if (r != t && s != 0) + if (feat_opt.use_labeled_dependency) { + prob += inst->depl_scores[s][t][l]; } - } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + prob, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t][l], item); + } + + if (s != 0) { // I(t,s) + double prob = (left->_prob + right->_prob); + + if (feat_opt.use_unlabeled_dependency) { + prob += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + prob += inst->depl_scores[t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + prob, + left, + right, + l); + + // cerr << "INCMP " << t << "-" << s << "-" << l << endl; + lattice_insert(_lattice_incmp[t][s][l], item); + } // end for if (s != 0) + } // end for for (int l = 0; l < _L; ++ l) + } // end for for (int r = s; r < t; ++ r) + + for (int r = s; r <= t; ++ r) { + if (r != s) { // C(s,t) = I(s,r) + C(r,t) + const LatticeItem * const right = _lattice_cmp[r][t]; + if (!right) { + continue; + } + + for (int l = 0; l < L; ++ l) { + const LatticeItem * const left = _lattice_incmp[s][r][l]; + if (!left) { + continue; + } + + const double prob = left->_prob + right->_prob; + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + prob, + left, + right); + + // cerr << "CMP " << s << "-" << t << endl; + lattice_insert(_lattice_cmp[s][t], item); + } + } // end for if (r != s) + + if (r != t && s != 0) { // C(t,s) = I(t,r) + C(r,s) + const LatticeItem * const left = _lattice_cmp[r][s]; + if (!left) { + continue; + } + + for (int l = 0; l < L; ++ l) { + const LatticeItem * const right = _lattice_incmp[t][r][l]; + if (!right) { + continue; + } + + const double prob = left->_prob + right->_prob; + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + prob, + left, + right); + + // cerr << "CMP " << t << "-" << s << endl; + lattice_insert(_lattice_cmp[t][s], item); + } // end for for (int l = 0; l < L; ++ l) + } // end for if (r != t && s != 0) + } } + } } void Decoder1O::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); - } - - const LatticeItem * best_item = _lattice_cmp[0][len - 1]; - __BUILD_TREE(inst, best_item); + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = _lattice_cmp[0][len - 1]; + __BUILD_TREE(inst, best_item); } void Decoder1O::free_lattice() { - int len = _lattice_cmp.nrows(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - for (int l = 0; l < L; ++ l) { - if (_lattice_incmp[i][j][l]) { - delete _lattice_incmp[i][j][l]; - } - } - - delete _lattice_cmp[i][j]; + int len = _lattice_cmp.nrows(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + for (int l = 0; l < L; ++ l) { + if (_lattice_incmp[i][j][l]) { + delete _lattice_incmp[i][j][l]; } + } + + delete _lattice_cmp[i][j]; } + } } } // end for namespace parser diff --git a/src/parser/decoder1o.h b/src/parser/decoder1o.h index e46431937..4d6b10942 100644 --- a/src/parser/decoder1o.h +++ b/src/parser/decoder1o.h @@ -1,5 +1,5 @@ -#ifndef __DECODER_1_O_H__ -#define __DECODER_1_O_H__ +#ifndef __LTP_PARSER_DECODER_1_O_H__ +#define __LTP_PARSER_DECODER_1_O_H__ #include "instance.h" #include "decoder.h" @@ -15,21 +15,21 @@ using namespace ltp::math; class Decoder1O : public Decoder { public: - Decoder1O(int _L = 1) : L(_L) {} + Decoder1O(int _L = 1) : L(_L) {} protected: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); protected: - int L; + int L; - Mat< const LatticeItem * > _lattice_cmp; // complete span - Mat3< const LatticeItem * > _lattice_incmp; // incomplete span -}; // end for class Decoder1O + Mat< const LatticeItem * > _lattice_cmp; // complete span + Mat3< const LatticeItem * > _lattice_incmp; // incomplete span +}; // end for class Decoder1O -} // end for namespace parser -} // end for namespace ltp +} // end for namespace parser +} // end for namespace ltp -#endif // end for __DECODER_1_O_H__ +#endif // end for __LTP_PARSER_DECODER_1_O_H__ diff --git a/src/parser/decoder2o.cpp b/src/parser/decoder2o.cpp index 10ae9a48c..dd3a295cc 100644 --- a/src/parser/decoder2o.cpp +++ b/src/parser/decoder2o.cpp @@ -9,296 +9,296 @@ namespace parser { // ================================================================ // void Decoder2O::init_lattice(const Instance * inst) { - int len = inst->size(); - _lattice_cmp.resize(len, len); - _lattice_sib.resize(len, len); - _lattice_incmp.resize(len, len); - - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - _lattice_cmp[i][j] = 0; - _lattice_sib[i][j] = 0; - _lattice_incmp[i][j] = 0; - } - } - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i] = new LatticeItem(i); + int len = inst->size(); + _lattice_cmp.resize(len, len); + _lattice_sib.resize(len, len); + _lattice_incmp.resize(len, len); + + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + _lattice_cmp[i][j] = 0; + _lattice_sib[i][j] = 0; + _lattice_incmp[i][j] = 0; } + } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i] = new LatticeItem(i); + } } void Decoder2O::decode_projective(const Instance * inst) { - int len = inst->size(); - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - - for (int l = 0; l < L; ++ l) { - double shared_score = 0.; - - if (feat_opt.use_unlabeled_dependency) { - shared_score += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - shared_score += inst->depl_scores[s][t][l]; - } - - { // I(s,t) = C(s,s) + C(t,s+1) - const LatticeItem * const left = _lattice_cmp[s][s]; - const LatticeItem * const right = _lattice_cmp[t][s + 1]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][s]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][s][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t], item); - } // end for I(s,t) = C(s,s) + C(t,s+1) - - { // I(s,t) = I(s,r) + S(r,t) - for (int r = s + 1; r < t; ++ r) { - const LatticeItem * const left = _lattice_incmp[s][r]; - const LatticeItem * const right = _lattice_sib[r][t]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][r]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][r][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[s][t], item); - } - } // end for I(s,t) = I(s,r) + S(r,t) - - } // end for for (l = 0; l < L; ++ l) - - if (s != 0) { // I(t,s) = C(s, t-1) + C(t, t) - for (int l = 0; l < L; ++ l) { - double shared_score = 0.; - - if (feat_opt.use_unlabeled_dependency) { - shared_score += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - shared_score += inst->depl_scores[t][s][l]; - } - - { // I(t,s) = C(s,t-1) + C(t,t) - const LatticeItem * const left = _lattice_cmp[s][t-1]; - const LatticeItem * const right = _lattice_cmp[t][t]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][t]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[t][s], item); - } // end for I(t, s) = C(s,t-1) + C(t,t) - - { // I(t,s) = S(s,r) + I(t,r) - for (int r = s + 1; r < t; ++ r) { - const LatticeItem * const left = _lattice_sib[s][r]; - const LatticeItem * const right = _lattice_incmp[t][r]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob + shared_score; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][r]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][r][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - t, - s, - score, - left, - right, - l); - - lattice_insert(_lattice_incmp[t][s], item); - } - } // end for I(t,s) = S(s,r) + I(t,r) - - } - } // end for if (s != 0) - - { // S(s,t) = C(s,r) + C(t,r+1) - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[s][r]; - const LatticeItem * const right = _lattice_cmp[t][r+1]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - const LatticeItem * const item = new LatticeItem(SIBSP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_sib[s][t], item); - } - } // end for S(s,t) = C(s,t) + C(t,r+1) - - { // C(s,t) = I(s,r) + C(r,t) - for (int r = s + 1; r <= t; ++ r) { - const LatticeItem * const left = _lattice_incmp[s][r]; - const LatticeItem * const right = _lattice_cmp[r][t]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_last_sibling) { - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][r][r]; - } - - if (feat_opt.use_labeled_sibling) { - int l = left->_label_s_t; - score += inst->sibl_scores[s][r][r][l]; - } - } - - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_cmp[s][t], item); - - } + int len = inst->size(); + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + + for (int l = 0; l < L; ++ l) { + double shared_score = 0.; + + if (feat_opt.use_unlabeled_dependency) { + shared_score += inst->depu_scores[s][t]; + } + + if (feat_opt.use_labeled_dependency) { + shared_score += inst->depl_scores[s][t][l]; + } + + { // I(s,t) = C(s,s) + C(t,s+1) + const LatticeItem * const left = _lattice_cmp[s][s]; + const LatticeItem * const right = _lattice_cmp[t][s + 1]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][s]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t], item); + } // end for I(s,t) = C(s,s) + C(t,s+1) + + { // I(s,t) = I(s,r) + S(r,t) + for (int r = s + 1; r < t; ++ r) { + const LatticeItem * const left = _lattice_incmp[s][r]; + const LatticeItem * const right = _lattice_sib[r][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][r]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][r][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[s][t], item); + } + } // end for I(s,t) = I(s,r) + S(r,t) + + } // end for for (l = 0; l < L; ++ l) + + if (s != 0) { // I(t,s) = C(s, t-1) + C(t, t) + for (int l = 0; l < L; ++ l) { + double shared_score = 0.; + + if (feat_opt.use_unlabeled_dependency) { + shared_score += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + shared_score += inst->depl_scores[t][s][l]; + } + + { // I(t,s) = C(s,t-1) + C(t,t) + const LatticeItem * const left = _lattice_cmp[s][t-1]; + const LatticeItem * const right = _lattice_cmp[t][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][t]; } - if (s != 0) { - for (int r = s; r < t; ++ r) { - const LatticeItem * const left = _lattice_cmp[r][s]; - const LatticeItem * const right = _lattice_incmp[t][r]; - - if (!left || !right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_last_sibling) { - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][r][r]; - } - - if (feat_opt.use_labeled_sibling) { - int l = right->_label_s_t; - score += inst->sibl_scores[t][r][r][l]; - } - } - - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - score, - left, - right); - - lattice_insert(_lattice_cmp[t][s], item); - } + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][t][l]; } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[t][s], item); + } // end for I(t, s) = C(s,t-1) + C(t,t) + + { // I(t,s) = S(s,r) + I(t,r) + for (int r = s + 1; r < t; ++ r) { + const LatticeItem * const left = _lattice_sib[s][r]; + const LatticeItem * const right = _lattice_incmp[t][r]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob + shared_score; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][r]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][r][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + left, + right, + l); + + lattice_insert(_lattice_incmp[t][s], item); + } + } // end for I(t,s) = S(s,r) + I(t,r) + } - } -} + } // end for if (s != 0) -void Decoder2O::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); - } + { // S(s,t) = C(s,r) + C(t,r+1) + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[s][r]; + const LatticeItem * const right = _lattice_cmp[t][r+1]; - const LatticeItem * best_item = _lattice_cmp[0][len - 1]; - __BUILD_TREE(inst, best_item); -} + if (!left || !right) { + continue; + } -void Decoder2O::free_lattice() { - int len = _lattice_cmp.nrows(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - if (_lattice_incmp[i][j]) { - delete _lattice_incmp[i][j]; + double score = left->_prob + right->_prob; + + const LatticeItem * const item = new LatticeItem(SIBSP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_sib[s][t], item); + } + } // end for S(s,t) = C(s,t) + C(t,r+1) + + { // C(s,t) = I(s,r) + C(r,t) + for (int r = s + 1; r <= t; ++ r) { + const LatticeItem * const left = _lattice_incmp[s][r]; + const LatticeItem * const right = _lattice_cmp[r][t]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_last_sibling) { + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][r][r]; } - if (_lattice_cmp[i][j]) { - delete _lattice_cmp[i][j]; + if (feat_opt.use_labeled_sibling) { + int l = left->_label_s_t; + score += inst->sibl_scores[s][r][r][l]; + } + } + + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_cmp[s][t], item); + + } + } + + if (s != 0) { + for (int r = s; r < t; ++ r) { + const LatticeItem * const left = _lattice_cmp[r][s]; + const LatticeItem * const right = _lattice_incmp[t][r]; + + if (!left || !right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_last_sibling) { + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][r][r]; } - if (_lattice_sib[i][j]) { - delete _lattice_sib[i][j]; + if (feat_opt.use_labeled_sibling) { + int l = right->_label_s_t; + score += inst->sibl_scores[t][r][r][l]; } + } + + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + score, + left, + right); + + lattice_insert(_lattice_cmp[t][s], item); } + } + } + } +} + +void Decoder2O::get_result(Instance * inst) { + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = _lattice_cmp[0][len - 1]; + __BUILD_TREE(inst, best_item); +} + +void Decoder2O::free_lattice() { + int len = _lattice_cmp.nrows(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + if (_lattice_incmp[i][j]) { + delete _lattice_incmp[i][j]; + } + + if (_lattice_cmp[i][j]) { + delete _lattice_cmp[i][j]; + } + + if (_lattice_sib[i][j]) { + delete _lattice_sib[i][j]; + } } + } } @@ -306,335 +306,335 @@ void Decoder2O::free_lattice() { // 2nd-order Decoder using dependency, sibling and grand features // // ================================================================ // void Decoder2OCarreras::init_lattice(const Instance * inst) { - int len = inst->size(); - _lattice_cmp.resize(len, len, len); - _lattice_incmp.resize(len, len, L); + int len = inst->size(); + _lattice_cmp.resize(len, len, len); + _lattice_incmp.resize(len, len, L); - _lattice_cmp = NULL; - _lattice_incmp = NULL; + _lattice_cmp = NULL; + _lattice_incmp = NULL; - for (int i = 0; i < len; ++ i) { - _lattice_cmp[i][i][i] = new LatticeItem(i); - } + for (int i = 0; i < len; ++ i) { + _lattice_cmp[i][i][i] = new LatticeItem(i); + } } void Decoder2OCarreras::decode_projective(const Instance * inst) { - int len = inst->size(); - - for (int width = 1; width < len; ++ width) { - for (int s = 0; s + width < len; ++ s) { - int t = s + width; - - // I(s, t) = C(s, r) + C(t, r + 1) - for (int l = 0; l < L; ++ l) { - for (int r = s; r < t; ++ r) { - const LatticeItem * best_left_item = 0; - double best_left_score = DOUBLE_NEG_INF; - - for (int cs = s; cs <= r; ++ cs) { - if (cs == s && s != r) { - continue; - } - - const LatticeItem * item = _lattice_cmp[s][r][cs]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[s][t][cs]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[s][t][cs][l]; - } - - if (score > best_left_score) { - best_left_item = item; - best_left_score = score; - } - } - - const LatticeItem * best_right_item = 0; - double best_right_score = DOUBLE_NEG_INF; - - for (int ct = r + 1; ct <= t; ++ ct) { - if (ct == t && r + 1 != t) { - continue; - } + int len = inst->size(); + + for (int width = 1; width < len; ++ width) { + for (int s = 0; s + width < len; ++ s) { + int t = s + width; + + // I(s, t) = C(s, r) + C(t, r + 1) + for (int l = 0; l < L; ++ l) { + for (int r = s; r < t; ++ r) { + const LatticeItem * best_left_item = 0; + double best_left_score = DOUBLE_NEG_INF; + + for (int cs = s; cs <= r; ++ cs) { + if (cs == s && s != r) { + continue; + } + + const LatticeItem * item = _lattice_cmp[s][r][cs]; + + if (!item) { + continue; + } - const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; - - if (!item) { - continue; - } - - double score = item->_prob; + double score = item->_prob; - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || ct != t)) { - score += inst->grdu_scores[s][t][ct == t ? s : ct]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || ct != t)) { - score += inst->grdl_scores[s][t][ct == t ? s : ct][l]; - } - - if (score > best_right_score) { - best_right_item = item; - best_right_score = score; - } - } - - if (best_left_item && best_right_item) { - double score = best_left_score + best_right_score; - - if (feat_opt.use_unlabeled_dependency) { - score += inst->depu_scores[s][t]; - } - - if (feat_opt.use_labeled_dependency) { - score += inst->depl_scores[s][t][l]; - } - - const LatticeItem * const item = new LatticeItem(INCMP, - s, - t, - score, - best_left_item, - best_right_item, - l); - - lattice_insert(_lattice_incmp[s][t][l], item); - } // end for if !left || !right - } - } // end for for l = 0; l < L; ++ l - - if (s != 0) { - // I(t, s) = C(s, r) + C(t, r + 1) - for (int l = 0; l < L; ++ l) { - for (int r = s; r < t; ++ r) { - const LatticeItem * best_left_item = 0; - double best_left_score = DOUBLE_NEG_INF; - - for (int cs = s; cs <= r; ++ cs) { - if (cs == s && s != r) { - continue; - } - - const LatticeItem * item = _lattice_cmp[s][r][cs]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || cs != s)) { - score += inst->grdu_scores[t][s][cs]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cs != s)) { - score += inst->grdl_scores[t][s][cs][l]; - } - - if (score > best_left_score) { - best_left_item = item; - best_left_score = score; - } - } - - const LatticeItem * best_right_item = 0; - double best_right_score = DOUBLE_NEG_INF; - - for (int ct = r + 1; ct <= t; ++ ct) { - if (ct == t && r + 1 != t) { - continue; - } - - const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; - - if (!item) { - continue; - } - - double score = item->_prob; - - if (feat_opt.use_unlabeled_sibling) { - score += inst->sibu_scores[t][s][ct]; - } - - if (feat_opt.use_labeled_sibling) { - score += inst->sibl_scores[t][s][ct][l]; - } - - if (score > best_right_score) { - best_right_item = item; - best_right_score = score; - } - } - - if (best_left_item && best_right_item) { - double score = best_left_score + best_right_score; - - if (feat_opt.use_unlabeled_dependency) { - score += inst->depu_scores[t][s]; - } - - if (feat_opt.use_labeled_dependency) { - score += inst->depl_scores[t][s][l]; - } - - const LatticeItem * const item = new LatticeItem( INCMP, - t, - s, - score, - best_left_item, - best_right_item, - l); - - lattice_insert(_lattice_incmp[t][s][l], item); - } - } - } - } // end for if s != 0 - - for (int m = s; m <= t; ++ m) { - if (m != s) { // C(s, t, m) = I(s, m, l) + C(m, t, cm); - for (int l = 0; l < L; ++ l) { - const LatticeItem * const left = _lattice_incmp[s][m][l]; - - if (!left) { - continue; - } - - for (int cm = m; cm <= t; ++ cm) { - if (cm == m && cm != t) { - continue; - } - - const LatticeItem * const right = _lattice_cmp[m][t][cm]; - - if (!right) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdu_scores[s][m][cm]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdl_scores[s][m][cm][l]; - } - - const LatticeItem * const item = new LatticeItem(CMP, - s, - t, - score, - left, - right); - - lattice_insert(_lattice_cmp[s][t][m], item); - } // end for (int cm = m; cm <= t; ++ cm) - } // enf for (int l = 0; l < L; ++ l) - } // end for if (m != s) - - if (m != t && s != 0) { // C(t, s, m) = C(m, s, cm) + I(t, m, l) - for (int l = 0; l < L; ++ l) { - const LatticeItem * const right = _lattice_incmp[t][m][l]; - - if (!right) { - continue; - } - - for (int cm = s; cm <= m; ++ cm) { - if (cm == m && cm != s) { - continue; - } - - const LatticeItem * const left = _lattice_cmp[m][s][cm]; - - if (!left) { - continue; - } - - double score = left->_prob + right->_prob; - - if (feat_opt.use_unlabeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdu_scores[t][m][cm == m ? t : cm]; - } - - if (feat_opt.use_labeled_grand && - (feat_opt.use_no_grand || cm != m)) { - score += inst->grdl_scores[t][m][cm == m ? t : cm][l]; - } - - const LatticeItem * const item = new LatticeItem(CMP, - t, - s, - score, - left, - right); - - lattice_insert(_lattice_cmp[t][s][m], item); - } // end for (int cm = s; cm <= m; ++ cm) - } - } + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[s][t][cs]; } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[s][t][cs][l]; + } + + if (score > best_left_score) { + best_left_item = item; + best_left_score = score; + } + } + + const LatticeItem * best_right_item = 0; + double best_right_score = DOUBLE_NEG_INF; + + for (int ct = r + 1; ct <= t; ++ ct) { + if (ct == t && r + 1 != t) { + continue; + } + + const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || ct != t)) { + score += inst->grdu_scores[s][t][ct == t ? s : ct]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || ct != t)) { + score += inst->grdl_scores[s][t][ct == t ? s : ct][l]; + } + + if (score > best_right_score) { + best_right_item = item; + best_right_score = score; + } + } + + if (best_left_item && best_right_item) { + double score = best_left_score + best_right_score; + + if (feat_opt.use_unlabeled_dependency) { + score += inst->depu_scores[s][t]; + } + + if (feat_opt.use_labeled_dependency) { + score += inst->depl_scores[s][t][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + s, + t, + score, + best_left_item, + best_right_item, + l); + + lattice_insert(_lattice_incmp[s][t][l], item); + } // end for if !left || !right } + } // end for for l = 0; l < L; ++ l + + if (s != 0) { + // I(t, s) = C(s, r) + C(t, r + 1) + for (int l = 0; l < L; ++ l) { + for (int r = s; r < t; ++ r) { + const LatticeItem * best_left_item = 0; + double best_left_score = DOUBLE_NEG_INF; + + for (int cs = s; cs <= r; ++ cs) { + if (cs == s && s != r) { + continue; + } + + const LatticeItem * item = _lattice_cmp[s][r][cs]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || cs != s)) { + score += inst->grdu_scores[t][s][cs]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cs != s)) { + score += inst->grdl_scores[t][s][cs][l]; + } + + if (score > best_left_score) { + best_left_item = item; + best_left_score = score; + } + } + + const LatticeItem * best_right_item = 0; + double best_right_score = DOUBLE_NEG_INF; + + for (int ct = r + 1; ct <= t; ++ ct) { + if (ct == t && r + 1 != t) { + continue; + } + + const LatticeItem * item = _lattice_cmp[t][r + 1][ct]; + + if (!item) { + continue; + } + + double score = item->_prob; + + if (feat_opt.use_unlabeled_sibling) { + score += inst->sibu_scores[t][s][ct]; + } + + if (feat_opt.use_labeled_sibling) { + score += inst->sibl_scores[t][s][ct][l]; + } + + if (score > best_right_score) { + best_right_item = item; + best_right_score = score; + } + } + + if (best_left_item && best_right_item) { + double score = best_left_score + best_right_score; + + if (feat_opt.use_unlabeled_dependency) { + score += inst->depu_scores[t][s]; + } + + if (feat_opt.use_labeled_dependency) { + score += inst->depl_scores[t][s][l]; + } + + const LatticeItem * const item = new LatticeItem(INCMP, + t, + s, + score, + best_left_item, + best_right_item, + l); + + lattice_insert(_lattice_incmp[t][s][l], item); + } + } + } + } // end for if s != 0 + + for (int m = s; m <= t; ++ m) { + if (m != s) { // C(s, t, m) = I(s, m, l) + C(m, t, cm); + for (int l = 0; l < L; ++ l) { + const LatticeItem * const left = _lattice_incmp[s][m][l]; + + if (!left) { + continue; + } + + for (int cm = m; cm <= t; ++ cm) { + if (cm == m && cm != t) { + continue; + } + + const LatticeItem * const right = _lattice_cmp[m][t][cm]; + + if (!right) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdu_scores[s][m][cm]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdl_scores[s][m][cm][l]; + } + + const LatticeItem * const item = new LatticeItem(CMP, + s, + t, + score, + left, + right); + + lattice_insert(_lattice_cmp[s][t][m], item); + } // end for (int cm = m; cm <= t; ++ cm) + } // enf for (int l = 0; l < L; ++ l) + } // end for if (m != s) + + if (m != t && s != 0) { // C(t, s, m) = C(m, s, cm) + I(t, m, l) + for (int l = 0; l < L; ++ l) { + const LatticeItem * const right = _lattice_incmp[t][m][l]; + + if (!right) { + continue; + } + + for (int cm = s; cm <= m; ++ cm) { + if (cm == m && cm != s) { + continue; + } + + const LatticeItem * const left = _lattice_cmp[m][s][cm]; + + if (!left) { + continue; + } + + double score = left->_prob + right->_prob; + + if (feat_opt.use_unlabeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdu_scores[t][m][cm == m ? t : cm]; + } + + if (feat_opt.use_labeled_grand && + (feat_opt.use_no_grand || cm != m)) { + score += inst->grdl_scores[t][m][cm == m ? t : cm][l]; + } + + const LatticeItem * const item = new LatticeItem(CMP, + t, + s, + score, + left, + right); + + lattice_insert(_lattice_cmp[t][s][m], item); + } // end for (int cm = s; cm <= m; ++ cm) + } + } + } } + } } void Decoder2OCarreras::get_result(Instance * inst) { - int len = inst->size(); - inst->predicted_heads.resize(len, -1); - if (model_opt.labeled) { - inst->predicted_deprelsidx.resize(len, -1); + int len = inst->size(); + inst->predicted_heads.resize(len, -1); + if (model_opt.labeled) { + inst->predicted_deprelsidx.resize(len, -1); + } + + const LatticeItem * best_item = NULL; + for (int c = 1; c < len; ++ c) { + const LatticeItem * item = _lattice_cmp[0][len - 1][c]; + if (!item) { + continue; } - const LatticeItem * best_item = NULL; - for (int c = 1; c < len; ++ c) { - const LatticeItem * item = _lattice_cmp[0][len - 1][c]; - if (!item) { - continue; - } - - if (NULL == best_item || best_item->_prob < item->_prob) { - best_item = item; - } + if (NULL == best_item || best_item->_prob < item->_prob) { + best_item = item; } + } - __BUILD_TREE(inst, best_item); + __BUILD_TREE(inst, best_item); } void Decoder2OCarreras::free_lattice() { - int len = _lattice_cmp.dim1(); - for (int i = 0; i < len; ++ i) { - for (int j = 0; j < len; ++ j) { - for (int l = 0; l < L; ++ l) { - if (_lattice_incmp[i][j][l]) { - delete _lattice_incmp[i][j][l]; - } - } + int len = _lattice_cmp.dim1(); + for (int i = 0; i < len; ++ i) { + for (int j = 0; j < len; ++ j) { + for (int l = 0; l < L; ++ l) { + if (_lattice_incmp[i][j][l]) { + delete _lattice_incmp[i][j][l]; + } + } - for (int k = 0; k < len; ++ k) { - if (_lattice_cmp[i][j][k]) { - delete _lattice_cmp[i][j][k]; - } - } + for (int k = 0; k < len; ++ k) { + if (_lattice_cmp[i][j][k]) { + delete _lattice_cmp[i][j][k]; } + } } + } } } // end for namespace parser diff --git a/src/parser/decoder2o.h b/src/parser/decoder2o.h index fcd62227d..788bb3111 100644 --- a/src/parser/decoder2o.h +++ b/src/parser/decoder2o.h @@ -1,5 +1,5 @@ -#ifndef __DECODER_2_O_H__ -#define __DECODER_2_O_H__ +#ifndef __LTP_PARSER_DECODER_2_O_H__ +#define __LTP_PARSER_DECODER_2_O_H__ #include "decoder.h" @@ -9,39 +9,39 @@ namespace parser { // 2nd-order decoder with dependency features and sibling features class Decoder2O : public Decoder { public: - Decoder2O(int _L = 1) : L(_L) {} + Decoder2O(int _L = 1) : L(_L) {} public: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); private: - int L; - Mat< const LatticeItem * > _lattice_cmp; - Mat< const LatticeItem * > _lattice_incmp; - Mat< const LatticeItem * > _lattice_sib; + int L; + Mat< const LatticeItem * > _lattice_cmp; + Mat< const LatticeItem * > _lattice_incmp; + Mat< const LatticeItem * > _lattice_sib; }; // 2nd-order decoder with dependency, sibling and grand features class Decoder2OCarreras : public Decoder { public: - Decoder2OCarreras(int _L = 1) : L(_L) {} + Decoder2OCarreras(int _L = 1) : L(_L) {} public: - void init_lattice(const Instance * inst); - void decode_projective(const Instance * inst); - void get_result(Instance * inst); - void free_lattice(); + void init_lattice(const Instance * inst); + void decode_projective(const Instance * inst); + void get_result(Instance * inst); + void free_lattice(); private: - int L; - Mat3< const LatticeItem * > _lattice_cmp; - Mat3< const LatticeItem * > _lattice_incmp; + int L; + Mat3< const LatticeItem * > _lattice_cmp; + Mat3< const LatticeItem * > _lattice_incmp; }; } // end for namespace parser } // end for namespace ltp -#endif // end for __DECODER_2_O__ +#endif // end for __LTP_PARSER_DECODER_2_O_H__ diff --git a/src/parser/extractor.cpp b/src/parser/extractor.cpp index 00bf56ad8..553e818dd 100644 --- a/src/parser/extractor.cpp +++ b/src/parser/extractor.cpp @@ -2,19 +2,19 @@ #include "options.h" #include "settings.h" -#define LEN(x) (x.size()) -#define LAST(x) ((x)[(x).size()-1]) -#define FIRST(x) ((x)[0]) +#define LEN(x) (x.size()) +#define LAST(x) ((x)[(x).size()-1]) +#define FIRST(x) ((x)[0]) #define PUSH(x) do {\ - cache.push_back((x)); \ + cache.push_back((x)); \ }while(0); #define PUSH_DIST(x) do { \ - if (feat_opt.use_distance_in_features) { \ - (x).append(dist); \ - PUSH(x); \ - } \ + if (feat_opt.use_distance_in_features) { \ + (x).append(dist); \ + PUSH(x); \ + } \ } while (0); namespace ltp { @@ -22,33 +22,33 @@ namespace parser { // function of GET direction void Extractor::__GET_DIRECTION(int head_id, int child_id, string& direction) { - if (head_id == 0) { - direction = "L#R"; - } else { - direction = (head_id > child_id ? "L" : "R"); - } + if (head_id == 0) { + direction = "L#R"; + } else { + direction = (head_id > child_id ? "L" : "R"); + } } void Extractor::__GET_DISTANCE_1_2_36_7(int head_id, int child_id, string& distance) { - int dist = (head_id > child_id ? head_id - child_id : child_id - head_id) ; - - if (dist < 3) { - ostringstream S; S << dist; - distance = S.str(); - } else if (dist < 7) { - distance = "<7"; - } else { - distance = ">6"; - } + int dist = (head_id > child_id ? head_id - child_id : child_id - head_id) ; + + if (dist < 3) { + ostringstream S; S << dist; + distance = S.str(); + } else if (dist < 7) { + distance = "<7"; + } else { + distance = ">6"; + } } const string POSUExtractor::prefix = "PU-"; // ================================================================ // -// Dependency Features Extractor // -// feature templates is listed in `extractor.h` // +// Dependency Features Extractor // +// feature templates is listed in `extractor.h` // // the DEPExtractor is a singleton, which only be construct once // -// during the life of the program. // +// during the life of the program. // // ================================================================ // // Initialize the static member @@ -58,302 +58,302 @@ vector