Skip to content

Commit

Permalink
Merge pull request HIT-SCIR#67 from ruoshui1126/parser
Browse files Browse the repository at this point in the history
engpattern & some unittests
  • Loading branch information
Oneplus committed May 8, 2014
2 parents c59fdb8 + c4bfbe9 commit caf49a8
Show file tree
Hide file tree
Showing 6 changed files with 352 additions and 8 deletions.
3 changes: 2 additions & 1 deletion src/segmentor/rulebase.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ const unsigned HAVE_ENG_ON_RIGHT = (1<<6);
const unsigned HAVE_URI_ON_LEFT = (1<<7);
const unsigned HAVE_URI_ON_RIGHT = (1<<8);

static boost::regex engpattern("(([A-Za-z]+)([\\-'\\.][A-Za-z]+)*)");
static boost::regex engpattern("([A-Za-z0-9\\.]*[A-Za-z\\-]((—||[\\-'\\.])[A-Za-z0-9]+)*)");
//static boost::regex engpattern("(([A-Za-z]+)([\\-'\\.][A-Za-z]+)*)");
static boost::regex uripattern("((https?|ftp|file)"
"://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])");

Expand Down
10 changes: 8 additions & 2 deletions src/srl/DepSRL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,16 @@ int DepSRL::LoadResource(const string &ConfigDir)
m_selectFeats = ConfigDir + "/srl.cfg";
// load srl and prg model
m_srlModel = new maxent::ME_Model;
m_srlModel->load(ConfigDir + "/srl.model");
bool tag = m_srlModel->load(ConfigDir + "/srl.model");
if(!tag) {
return 0;
}

m_prgModel = new maxent::ME_Model;
m_prgModel->load(ConfigDir + "/prg.model");
tag = m_prgModel->load(ConfigDir + "/prg.model");
if(!tag) {
return 0;
}

m_resourceLoaded = true;

Expand Down
34 changes: 29 additions & 5 deletions src/unittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
# - project output header
include_directories (./
${SOURCE_DIR}/
${THIRDPARTY_DIR}/gtest/include)

${THIRDPARTY_DIR}/gtest/include
${THIRDPARTY_DIR}/boost/include)
link_directories (${LIBRARY_OUTPUT_PATH})

# add unittest executable
set (utils_strutils_unittest_SRC ./utils_strutils_unittest.cpp)
add_executable (utils_strutils_unittest ${utils_strutils_unittest_SRC})
Expand Down Expand Up @@ -45,20 +44,29 @@ target_link_libraries (utils_strpaste_unittest gtest pthread)
set_target_properties(utils_strpaste_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)

set (engpattern_unittest_SRC ./engpattern_unittest.cpp)
add_executable (engpattern_unittest ${engpattern_unittest_SRC})
target_link_libraries (engpattern_unittest boost_regex gtest pthread)
set_target_properties(engpattern_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)


set (segmentor_unittest_SRC ./segmentor_unittest.cpp)
add_executable (segmentor_unittest ${segmentor_unittest_SRC})
target_link_libraries (segmentor_unittest segmentor gtest pthread)
set_target_properties(segmentor_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)

set (postagger_internal_unittest_SRC ./postagger_internal_unittest.cpp)
add_executable (postagger_internal_unittest ${postagger_internal_unittest_SRC})
add_executable (postagger_internal_unittest
${postagger_internal_unittest_SRC})
target_link_libraries (postagger_internal_unittest postagger gtest pthread)
set_target_properties(postagger_internal_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)

set (postagger_interface_unittest_SRC ./postagger_interface_unittest.cpp)
add_executable (postagger_interface_unittest ${postagger_interface_unittest_SRC})
add_executable (postagger_interface_unittest
${postagger_interface_unittest_SRC})
target_link_libraries (postagger_interface_unittest postagger gtest pthread)
set_target_properties(postagger_interface_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)
Expand All @@ -68,3 +76,19 @@ add_executable (parser_unittest ${parser_unittest_SRC})
target_link_libraries (parser_unittest parser gtest pthread)
set_target_properties(parser_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)

set (parser_interface_unittest_SRC ./parser_interface_unittest.cpp)
add_executable (parser_interface_unittest
${parser_interface_unittest_SRC})
target_link_libraries (parser_interface_unittest parser gtest pthread)
set_target_properties(parser_interface_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)

set (srl_interface_unittest_SRC ./srl_interface_unittest.cpp)
add_executable (srl_interface_unittest
${srl_interface_unittest_SRC})
target_link_libraries (srl_interface_unittest srl gtest pthread)
set_target_properties(srl_interface_unittest PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/unittest/)


30 changes: 30 additions & 0 deletions src/unittest/engpattern_unittest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <iostream>
#include <gtest/gtest.h>
#include <string>
#include "boost/regex.hpp"
#include "segmentor/rulebase.h"

// Tokens that mix ASCII letters with digits and/or a hyphen must be accepted
// by engpattern as one complete match (regex_match, not a partial search).
TEST(engpattern_unittest, english_word) {
  // Table-driven so that adding a case is one entry, and so that a failure
  // names the offending token instead of only printing "true vs false".
  const char * const kEnglishWords[] = {"78G", "78-d", "md3243", "md-3243"};
  const int kNumEnglishWords = sizeof(kEnglishWords) / sizeof(kEnglishWords[0]);

  for (int i = 0; i < kNumEnglishWords; ++ i) {
    const std::string word(kEnglishWords[i]);
    EXPECT_TRUE(boost::regex_match(word, ltp::segmentor::rulebase::engpattern))
      << "engpattern should match \"" << word << "\"";
  }
}

// A token made only of digits must not be accepted by engpattern.
TEST(engpattern_unittest, number) {
  const std::string word = "1997";
  // EXPECT_FALSE states the intent directly instead of comparing a bool
  // against the literal `false`; the message names the token on failure.
  EXPECT_FALSE(boost::regex_match(word, ltp::segmentor::rulebase::engpattern))
    << "engpattern should not match \"" << word << "\"";
}


// Test-runner entry point: passes the command line to InitGoogleTest and
// uses the value of RUN_ALL_TESTS() as the process exit status.
int main(int argc, char ** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  const int exit_status = RUN_ALL_TESTS();
  return exit_status;
}
145 changes: 145 additions & 0 deletions src/unittest/parser_interface_unittest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Provide some higher-level testcases
// For some unittest on the internal data structure and function,
// please write in the ./parser_internal_unittest.cpp in same
// folder.
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <gtest/gtest.h>
#include "parser/parser_dll.h"
#include "utils/strutils.hpp"

// Creating a parser from the bundled model file must give a non-NULL handle.
// The handle is handed back to parser_release_parser whatever the outcome,
// exactly as before.
TEST(parser_interface_unittest, test_load_model_success) {
  void * const handle = parser_create_parser("./ltp_data/parser.model");
  const bool model_loaded = (handle != NULL);
  EXPECT_TRUE(model_loaded);
  parser_release_parser(handle);
}

// Creating a parser from a path that does not exist must give a NULL handle.
TEST(parser_interface_unittest, test_load_model_fail) {
  void * engine = parser_create_parser("/a/path/that/never/exist");
  // Written like the NULL checks in the sibling tests. EXPECT_EQ(NULL, ptr)
  // depends on gtest special-casing a null literal as the first argument,
  // which is not portable across gtest releases and compilers.
  EXPECT_TRUE(NULL == engine);
  if (NULL != engine) {
    // Unexpected success: the test has already failed, but do not leak the
    // engine that was created.
    parser_release_parser(engine);
  }
}

// Shared fixture: one short sentence and its part-of-speech tags, index by
// index. The tests below push kNormalWords[i] / kNormalPostags[i] pairwise.
//
// NOTE(review): in the reviewed copy all four word literals were empty
// strings, which contradicts test_normal (expects a positive result) versus
// test_empty_word (expects 0 when a single word is empty). The text was
// presumably non-ASCII and lost in extraction; it is restored here as a
// pronoun / verb / noun / punctuation sentence matching the tags below --
// confirm against the upstream file. The literals are UTF-8 encoded.
const char * kNormalWords[] = {"我", "是", "猫", "。"};
const char * kNormalPostags[] = {"r", "v", "n", "wp"};
// Derived from the array so the count cannot drift from the data.
const int kNumNormalWords = sizeof(kNormalWords) / sizeof(kNormalWords[0]);

// Parses the shared fixture sentence (kNormalWords / kNormalPostags) and
// checks that parser_parse reports a positive result.
TEST(parser_interface_unittest, test_normal) {
  void * engine = parser_create_parser("./ltp_data/parser.model");
  // Fatal on purpose: without an engine the rest of the test would hand NULL
  // to parser_parse (not visible here; presumably it dereferences the handle).
  ASSERT_TRUE(NULL != engine);

  std::vector<std::string> words;
  std::vector<std::string> tags;
  std::vector<int> heads;
  std::vector<std::string> deprels;
  for (int i = 0; i < kNumNormalWords; ++ i) {
    words.push_back(kNormalWords[i]);
    tags.push_back(kNormalPostags[i]);
  }

  int nr_words = parser_parse(engine, words, tags, heads, deprels);
  // A successful parse must report a positive count.
  EXPECT_GT(nr_words, 0);
  parser_release_parser(engine);
}

// Parsing an empty sentence (no words, no tags) must return 0.
TEST(parser_interface_unittest, test_empty_list) {
  void * engine = parser_create_parser("./ltp_data/parser.model");
  // Fatal on purpose: without an engine the rest of the test would hand NULL
  // to parser_parse (not visible here; presumably it dereferences the handle).
  ASSERT_TRUE(NULL != engine);

  std::vector<std::string> words;
  std::vector<std::string> tags;
  std::vector<int> heads;
  std::vector<std::string> deprels;

  int nr_words = parser_parse(engine, words, tags, heads, deprels);
  EXPECT_EQ(0, nr_words);
  parser_release_parser(engine);
}

// Feeds the fixture sentence with the word at index 2 replaced by an empty
// string (tags left intact) and expects parser_parse to return 0.
TEST(parser_interface_unittest, test_empty_word) {
  void * engine = parser_create_parser("./ltp_data/parser.model");
  // Fatal on purpose: without an engine the rest of the test would hand NULL
  // to parser_parse (not visible here; presumably it dereferences the handle).
  ASSERT_TRUE(NULL != engine);

  std::vector<std::string> words;
  std::vector<std::string> tags;
  std::vector<int> heads;
  std::vector<std::string> deprels;

  for (int i = 0; i < kNumNormalWords; ++ i) {
    if (i == 2) {
      // The deliberately invalid entry.
      words.push_back("");
    } else {
      words.push_back(kNormalWords[i]);
    }
    tags.push_back(kNormalPostags[i]);
  }

  int nr_words = parser_parse(engine, words, tags, heads, deprels);
  EXPECT_EQ(0, nr_words);
  parser_release_parser(engine);
}

// Feeds the fixture sentence with the tag at index 2 replaced by an empty
// string (words left intact) and expects parser_parse to return 0.
TEST(parser_interface_unittest, test_empty_tag) {
  void * engine = parser_create_parser("./ltp_data/parser.model");
  // Fatal on purpose: without an engine the rest of the test would hand NULL
  // to parser_parse (not visible here; presumably it dereferences the handle).
  ASSERT_TRUE(NULL != engine);

  std::vector<std::string> words;
  std::vector<std::string> tags;
  std::vector<int> heads;
  std::vector<std::string> deprels;

  for (int i = 0; i < kNumNormalWords; ++ i) {
    if (i == 2) {
      // The deliberately invalid entry.
      tags.push_back("");
    } else {
      tags.push_back(kNormalPostags[i]);
    }
    words.push_back(kNormalWords[i]);
  }

  int nr_words = parser_parse(engine, words, tags, heads, deprels);
  EXPECT_EQ(0, nr_words);
  parser_release_parser(engine);
}


// Rough throughput measurement: parses every line of the pre-tagged test
// corpus and prints the number of tokens handled per millisecond of clock()
// time to stderr. Each input line is a whitespace-separated list of tokens,
// and every token must split into exactly two pieces (word, tag) on "_".
TEST(parser_interface_unittest, test_speed) {
  // Open the corpus before creating the engine, so that a missing file aborts
  // the test while there is still nothing to release.
  std::ifstream ifs("./test_data/unittest/test_data.postaggered");
  ASSERT_TRUE(ifs.is_open());

  void * engine = parser_create_parser("./ltp_data/parser.model");
  // Fatal on purpose: the loop below hands the engine to parser_parse.
  ASSERT_TRUE(NULL != engine);

  std::string line;
  std::string word;
  std::vector<std::string> words;
  std::vector<std::string> tags;
  std::vector<int> heads;
  std::vector<std::string> deprels;

  int nr_tokens = 0;

  const clock_t start_time = clock();
  while (std::getline(ifs, line, '\n')) {
    std::stringstream S(line);
    // The containers are reused across sentences; reset them for each line.
    words.clear();
    tags.clear();
    heads.clear();
    deprels.clear();
    while (S >> word) {
      std::vector<std::string> sep = ltp::strutils::rsplit_by_sep(word, "_", 1);
      if (sep.size() != 2) {
        // Malformed test data. Release the engine first (the previous early
        // return leaked it), then fail loudly instead of passing silently.
        parser_release_parser(engine);
        FAIL() << "malformed token in test data: " << word;
      }
      words.push_back(sep[0]);
      tags.push_back(sep[1]);
    }
    parser_parse(engine, words, tags, heads, deprels);
    nr_tokens += static_cast<int>(words.size());
  }

  // clock() counts in units of 1/CLOCKS_PER_SEC seconds; convert explicitly
  // rather than assuming a particular tick length.
  const double elapsed_ms = (clock() - start_time) * 1000. / CLOCKS_PER_SEC;
  // A run shorter than one tick would otherwise divide by zero.
  const double throughput_per_millisecond =
    (elapsed_ms > 0. ? nr_tokens / elapsed_ms : 0.);
  std::cerr << throughput_per_millisecond << std::endl;
  parser_release_parser(engine);
}

// Test-runner entry point: passes the command line to InitGoogleTest and
// uses the value of RUN_ALL_TESTS() as the process exit status.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  const int exit_status = RUN_ALL_TESTS();
  return exit_status;
}

Loading

0 comments on commit caf49a8

Please sign in to comment.