Patch summary (free text ahead of the first file header is skipped by
`git apply` and `patch`):

  * src/segmentor/rulebase.h: engpattern now accepts digits and requires at
    least one letter or '-' in the token; the previous pattern is kept as a
    comment.
  * src/srl/DepSRL.cpp: DepSRL::LoadResource returns 0 as soon as either the
    srl or the prg model fails to load.
  * src/unittest/CMakeLists.txt: adds the boost include directory and the
    engpattern_unittest, parser_interface_unittest and srl_interface_unittest
    targets.
  * src/unittest/*.cpp: new gtest suites for the pattern, parser and SRL
    interfaces.

Leading whitespace inside the hunks is normalized; if a context line fails to
match the working tree, apply with `git apply --ignore-whitespace`.

diff --git a/src/segmentor/rulebase.h b/src/segmentor/rulebase.h
index 2bfb3a2d4..22ada95c1 100644
--- a/src/segmentor/rulebase.h
+++ b/src/segmentor/rulebase.h
@@ -33,7 +33,8 @@ const unsigned HAVE_ENG_ON_RIGHT = (1<<6);
 const unsigned HAVE_URI_ON_LEFT = (1<<7);
 const unsigned HAVE_URI_ON_RIGHT = (1<<8);
 
-static boost::regex engpattern("(([A-Za-z]+)([\\-'\\.][A-Za-z]+)*)");
+static boost::regex engpattern("([A-Za-z0-9\\.]*[A-Za-z\\-]((—||[\\-'\\.])[A-Za-z0-9]+)*)");
+//static boost::regex engpattern("(([A-Za-z]+)([\\-'\\.][A-Za-z]+)*)");
 static boost::regex uripattern("((https?|ftp|file)"
     "://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])");
 
diff --git a/src/srl/DepSRL.cpp b/src/srl/DepSRL.cpp
index e6b96bc29..c5708793f 100644
--- a/src/srl/DepSRL.cpp
+++ b/src/srl/DepSRL.cpp
@@ -21,10 +21,16 @@ int DepSRL::LoadResource(const string &ConfigDir)
     m_selectFeats = ConfigDir + "/srl.cfg";
     // load srl and prg model
     m_srlModel = new maxent::ME_Model;
-    m_srlModel->load(ConfigDir + "/srl.model");
+    bool tag = m_srlModel->load(ConfigDir + "/srl.model");
+    if(!tag) {
+        return 0;
+    }
 
     m_prgModel = new maxent::ME_Model;
-    m_prgModel->load(ConfigDir + "/prg.model");
+    tag = m_prgModel->load(ConfigDir + "/prg.model");
+    if(!tag) {
+        return 0;
+    }
 
     m_resourceLoaded = true;
 
diff --git a/src/unittest/CMakeLists.txt b/src/unittest/CMakeLists.txt
index 2ac1ed32b..182ef6044 100644
--- a/src/unittest/CMakeLists.txt
+++ b/src/unittest/CMakeLists.txt
@@ -4,10 +4,9 @@
 # - project output header
 include_directories (./
     ${SOURCE_DIR}/
-    ${THIRDPARTY_DIR}/gtest/include)
-
+    ${THIRDPARTY_DIR}/gtest/include
+    ${THIRDPARTY_DIR}/boost/include)
 link_directories (${LIBRARY_OUTPUT_PATH})
-
 # add unittest executable
 set (utils_strutils_unittest_SRC ./utils_strutils_unittest.cpp)
 add_executable (utils_strutils_unittest ${utils_strutils_unittest_SRC})
@@ -45,6 +44,13 @@ target_link_libraries (utils_strpaste_unittest gtest pthread)
 set_target_properties(utils_strpaste_unittest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
 
+set (engpattern_unittest_SRC ./engpattern_unittest.cpp)
+add_executable (engpattern_unittest ${engpattern_unittest_SRC})
+target_link_libraries (engpattern_unittest boost_regex gtest pthread)
+set_target_properties(engpattern_unittest PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
+
+
 set (segmentor_unittest_SRC ./segmentor_unittest.cpp)
 add_executable (segmentor_unittest ${segmentor_unittest_SRC})
 target_link_libraries (segmentor_unittest segmentor gtest pthread)
@@ -52,13 +58,15 @@ set_target_properties(segmentor_unittest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
 
 set (postagger_internal_unittest_SRC ./postagger_internal_unittest.cpp)
-add_executable (postagger_internal_unittest ${postagger_internal_unittest_SRC})
+add_executable (postagger_internal_unittest
+${postagger_internal_unittest_SRC})
 target_link_libraries (postagger_internal_unittest postagger gtest pthread)
 set_target_properties(postagger_internal_unittest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
 
 set (postagger_interface_unittest_SRC ./postagger_interface_unittest.cpp)
-add_executable (postagger_interface_unittest ${postagger_interface_unittest_SRC})
+add_executable (postagger_interface_unittest
+${postagger_interface_unittest_SRC})
 target_link_libraries (postagger_interface_unittest postagger gtest pthread)
 set_target_properties(postagger_interface_unittest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
@@ -68,3 +76,19 @@ add_executable (parser_unittest ${parser_unittest_SRC})
 target_link_libraries (parser_unittest parser gtest pthread)
 set_target_properties(parser_unittest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
+
+set (parser_interface_unittest_SRC ./parser_interface_unittest.cpp)
+add_executable (parser_interface_unittest
+${parser_interface_unittest_SRC})
+target_link_libraries (parser_interface_unittest parser gtest pthread)
+set_target_properties(parser_interface_unittest PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
+
+set (srl_interface_unittest_SRC ./srl_interface_unittest.cpp)
+add_executable (srl_interface_unittest
+${srl_interface_unittest_SRC})
+target_link_libraries (srl_interface_unittest srl gtest pthread)
+set_target_properties(srl_interface_unittest PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
+
+
diff --git a/src/unittest/engpattern_unittest.cpp b/src/unittest/engpattern_unittest.cpp
new file mode 100644
index 000000000..bcfa8932f
--- /dev/null
+++ b/src/unittest/engpattern_unittest.cpp
@@ -0,0 +1,30 @@
+#include <iostream>
+#include <gtest/gtest.h>
+#include <string>
+#include "boost/regex.hpp"
+#include "segmentor/rulebase.h"
+
+TEST(engpattern_unittest, english_word) {
+  std::string word = "78G";
+  EXPECT_EQ(true, boost::regex_match(word,ltp::segmentor::rulebase::engpattern));
+
+  word = "78-d";
+  EXPECT_EQ(true, boost::regex_match(word,ltp::segmentor::rulebase::engpattern));
+
+  word = "md3243";
+  EXPECT_EQ(true, boost::regex_match(word,ltp::segmentor::rulebase::engpattern));
+
+  word = "md-3243";
+  EXPECT_EQ(true, boost::regex_match(word,ltp::segmentor::rulebase::engpattern));
+}
+
+TEST(engpattern_unittest, number) {
+  std::string word = "1997";
+  EXPECT_EQ(false, boost::regex_match(word,ltp::segmentor::rulebase::engpattern));
+}
+
+
+int main(int argc, char ** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/unittest/parser_interface_unittest.cpp b/src/unittest/parser_interface_unittest.cpp
new file mode 100644
index 000000000..8dd82b814
--- /dev/null
+++ b/src/unittest/parser_interface_unittest.cpp
@@ -0,0 +1,149 @@
+// Provide some higher-level testcases
+// For some unittest on the internal data structure and function,
+// please write in the ./parser_internal_unittest.cpp in same
+// folder.
+#include <ctime>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <gtest/gtest.h>
+#include "parser/parser_dll.h"
+#include "utils/strutils.hpp"
+
+TEST(parser_interface_unittest, test_load_model_success) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+  parser_release_parser(engine);
+}
+
+TEST(parser_interface_unittest, test_load_model_fail) {
+  void * engine = parser_create_parser("/a/path/that/never/exist");
+  EXPECT_TRUE(NULL == engine);
+}
+
+const char * kNormalWords[] = {"我", "是", "猫", "。"};
+const char * kNormalPostags[] = {"r", "v", "n", "wp"};
+const int kNumNormalWords = 4;
+
+TEST(parser_interface_unittest, test_normal) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<int> heads;
+  std::vector<std::string> deprels;
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    words.push_back(kNormalWords[i]);
+    tags.push_back(kNormalPostags[i]);
+  }
+  int nr_words = parser_parse(engine, words, tags ,heads ,deprels);
+  // parsing a non-empty sentence should report a positive word count
+  EXPECT_GT(nr_words, 0);
+  parser_release_parser(engine);
+}
+
+TEST(parser_interface_unittest, test_empty_list) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<int> heads;
+  std::vector<std::string> deprels;
+
+  int nr_words = parser_parse(engine, words, tags ,heads ,deprels);
+  EXPECT_EQ(0, nr_words);
+  parser_release_parser(engine);
+}
+
+TEST(parser_interface_unittest, test_empty_word) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<int> heads;
+  std::vector<std::string> deprels;
+
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    if (i == 2) {
+      words.push_back("");
+    } else {
+      words.push_back(kNormalWords[i]);
+    }
+    tags.push_back(kNormalPostags[i]);
+  }
+
+  int nr_words = parser_parse(engine, words, tags, heads, deprels);
+  EXPECT_EQ(0, nr_words);
+  parser_release_parser(engine);
+}
+
+TEST(parser_interface_unittest, test_empty_tag) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<int> heads;
+  std::vector<std::string> deprels;
+
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    if (i == 2) {
+      tags.push_back("");
+    } else {
+      tags.push_back(kNormalPostags[i]);
+    }
+    words.push_back(kNormalWords[i]);
+  }
+
+  int nr_words = parser_parse(engine, words, tags, heads, deprels);
+  EXPECT_EQ(0, nr_words);
+  parser_release_parser(engine);
+}
+
+
+TEST(parser_interface_unittest, test_speed) {
+  void * engine = parser_create_parser("./ltp_data/parser.model");
+  EXPECT_TRUE(NULL != engine);
+
+  std::ifstream ifs("./test_data/unittest/test_data.postaggered");
+  std::string line;
+  std::string word;
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<int> heads;
+  std::vector<std::string> deprels;
+
+  int nr_tokens = 0;
+
+  long start_time = clock();
+  while (std::getline(ifs, line, '\n')) {
+    std::stringstream S(line);
+    words.clear();
+    tags.clear();
+    heads.clear();
+    deprels.clear();
+    while (S >> word) {
+      std::vector<std::string> sep = ltp::strutils::rsplit_by_sep(word, "_", 1);
+      if(sep.size()==2) {
+        words.push_back(sep[0]);
+        tags.push_back(sep[1]);
+      } else {
+        std::cerr << word<<std::endl;
+        parser_release_parser(engine);
+        return;
+      }
+    }
+    parser_parse(engine, words, tags, heads, deprels);
+    nr_tokens += words.size();
+  }
+  double throughput_per_millisecond = (nr_tokens / ((clock() - start_time) * 1000. / CLOCKS_PER_SEC));
+  std::cerr << throughput_per_millisecond << std::endl;
+  parser_release_parser(engine);
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/src/unittest/srl_interface_unittest.cpp b/src/unittest/srl_interface_unittest.cpp
new file mode 100644
index 000000000..620e08bc0
--- /dev/null
+++ b/src/unittest/srl_interface_unittest.cpp
@@ -0,0 +1,140 @@
+// Provide some higher-level testcases
+// For some unittest on the internal data structure and function,
+// please write in the ./srl_internal_unittest.cpp in same
+// folder.
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include <gtest/gtest.h>
+#include "srl/SRL_DLL.h"
+
+TEST(srl_interface_unittest, test_load_model_success) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  SRL_ReleaseResource();
+}
+
+TEST(srl_interface_unittest, test_load_model_fail) {
+  int tag = SRL_LoadResource("/a/path/that/never/exist");
+  EXPECT_EQ(-1, tag);
+}
+
+const char * kNormalWords[] = {"我", "是", "猫", "。"};
+const char * kNormalPostags[] = {"r", "v", "n", "wp"};
+const char * kNormalNes[] = {"O", "O", "O", "O"};
+const int kNormalHeads[] = {1,-1,1,1};
+const char * kNormalDeprels[] = {"SBV", "HED", "VOB", "WP"};
+const int kNumNormalWords = 4;
+
+TEST(srl_interface_unittest, test_normal) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<std::string> nes;
+  std::vector<std::pair<int,std::string > > parses;
+  std::vector< std::pair< int, std::vector< std::pair<std::string, std::pair< int, int > > > > > srls;
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    words.push_back(kNormalWords[i]);
+    tags.push_back(kNormalPostags[i]);
+    nes.push_back(kNormalNes[i]);
+    parses.push_back(std::make_pair(kNormalHeads[i], kNormalDeprels[i]));
+  }
+  int nr_words = DoSRL(words, tags , nes, parses ,srls);
+  // labeling a well-formed sentence should report a positive count
+  EXPECT_GT(nr_words, 0);
+  SRL_ReleaseResource();
+}
+
+TEST(srl_interface_unittest, test_empty_list) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<std::string> nes;
+  std::vector<std::pair<int,std::string > > parses;
+  std::vector< std::pair< int, std::vector< std::pair<std::string, std::pair< int, int > > > > > srls;
+
+  int nr_words = DoSRL(words, tags , nes, parses ,srls);
+  EXPECT_EQ(0, nr_words);
+  SRL_ReleaseResource();
+}
+
+TEST(srl_interface_unittest, test_empty_word) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<std::string> nes;
+  std::vector<std::pair<int,std::string > > parses;
+  std::vector< std::pair< int, std::vector< std::pair<std::string, std::pair< int, int > > > > > srls;
+
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    if (i == 2) {
+      words.push_back("");
+    } else {
+      words.push_back(kNormalWords[i]);
+    }
+    tags.push_back(kNormalPostags[i]);
+    nes.push_back(kNormalNes[i]);
+    parses.push_back(std::make_pair(kNormalHeads[i], kNormalDeprels[i]));
+  }
+
+  int nr_words = DoSRL( words, tags, nes, parses, srls);
+  EXPECT_EQ(-1, nr_words);
+  SRL_ReleaseResource();
+}
+
+TEST(srl_interface_unittest, test_different_size) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<std::string> nes;
+  std::vector<std::pair<int,std::string > > parses;
+  std::vector< std::pair< int, std::vector< std::pair<std::string, std::pair< int, int > > > > > srls;
+
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    if (i != 2) {
+      words.push_back(kNormalWords[i]);
+    }
+    tags.push_back(kNormalPostags[i]);
+    nes.push_back(kNormalNes[i]);
+    parses.push_back(std::make_pair(kNormalHeads[i], kNormalDeprels[i]));
+  }
+
+  int nr_words = DoSRL(words, tags, nes, parses, srls);
+  EXPECT_EQ(-1, nr_words);
+  SRL_ReleaseResource();
+}
+
+
+TEST(srl_interface_unittest, test_illeagel_head) {
+  int tag = SRL_LoadResource("./ltp_data/srl");
+  EXPECT_TRUE(0 == tag);
+  std::vector<std::string> words;
+  std::vector<std::string> tags;
+  std::vector<std::string> nes;
+  std::vector<std::pair<int,std::string > > parses;
+  std::vector< std::pair< int, std::vector< std::pair<std::string, std::pair< int, int > > > > > srls;
+
+  for (int i = 0; i < kNumNormalWords; ++ i) {
+    tags.push_back(kNormalPostags[i]);
+    words.push_back(kNormalWords[i]);
+    nes.push_back(kNormalNes[i]);
+    parses.push_back(std::make_pair(kNormalHeads[i], kNormalDeprels[i]));
+  }
+  parses[0].first = -2;
+
+  int nr_words = DoSRL(words, tags, nes, parses, srls);
+  EXPECT_EQ(-1, nr_words);
+  SRL_ReleaseResource();
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+