Skip to content

Commit

Permalink
Merge pull request HIT-SCIR#66 from niuox/ltp-unittest
Browse files Browse the repository at this point in the history
Ltp unittest
  • Loading branch information
Oneplus committed May 8, 2014
2 parents 1704112 + 6c42cfc commit c59fdb8
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 28 deletions.
38 changes: 18 additions & 20 deletions src/__util/MyLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ void replace_char_by_char(string &str, char c1, char c2)
}

void split_bychars(const string& str, vector<string> & vec, const char *sep)
{ //assert(vec.empty());
{ //assert(vec.empty());
vec.clear();
string::size_type pos1 = 0, pos2 = 0;
string word;
Expand Down Expand Up @@ -58,7 +58,6 @@ bool my_getline(ifstream &inf, string &line)
while (end >= 0 && (line[end] == '\r' || line[end] == '\n')) {
line.erase(end--);
}

return true;
}

Expand Down Expand Up @@ -132,7 +131,7 @@ void split_pair_vector(const vector< pair<int, string> > &vecPair, vector<int> &
}

void split_bychar(const string& str, vector<string>& vec,
const char separator)
const char separator)
{
//assert(vec.empty());
vec.clear();
Expand Down Expand Up @@ -208,13 +207,13 @@ void split_sentence(const string& line, vector<string>& vecSentence)
split_to_pair(line, vecPair);
int size = vecPair.size();
string sentence = "";

for(int i = 0; i < size; i++)
{
if (vecPair[i].first == "??" || vecPair[i].first == "??" || vecPair[i].first == "??")
if (vecPair[i].first == "" || vecPair[i].first == "" || vecPair[i].first == "")
{
sentence += vecPair[i].first + "/" + vecPair[i].second + " ";
if (i+1 < size && vecPair[i+1].first == "??")
if (i+1 < size && vecPair[i+1].first == "")
{
sentence += vecPair[i+1].first + "/" + vecPair[i+1].second + " ";
i++;
Expand All @@ -241,7 +240,7 @@ void chomp(string& str)
else
{
str = str.substr(pos1, pos2-pos1+1);
}
}
}

int common_substr_len(string str1, string str2)
Expand Down Expand Up @@ -272,13 +271,13 @@ int common_substr_len(string str1, string str2)
{
maxSubstrLen = substrLen;
}

if (maxSubstrLen >= minLen-posBeg-1)
{
return maxSubstrLen;
}
}
}
}
}
}
return 0;
}
Expand Down Expand Up @@ -306,7 +305,7 @@ bool is_chinese_char(string& str)
}
}

string separators = "???????????????????????????????棯??\",.?!:'/;??()%"; //all defined separators
string separators = "。,?!、:—“”《》()%¥℃/·\",.?!:'/;()%"; //all defined separators

bool is_separator(string& str)
{
Expand Down Expand Up @@ -353,7 +352,7 @@ void split_to_sentence_by_period(const string& line, vector<string>& vecSentence
int pos1 = 0, pos2 = 0;
string sentence;

while((pos2 = find_GB_char(line, "??", pos1)) != -1)
while((pos2 = find_GB_char(line, "", pos1)) != -1)
{
sentence = line.substr(pos1, pos2-pos1+2);
pos1 = pos2 + 2;
Expand Down Expand Up @@ -385,10 +384,10 @@ void split_by_separator(const string& str, vector<string>& vec, const string sep

bool is_chinese_number(const string& str)
{
if (str == "һ" || str == "??" || str == "??" || str == "??" || str == "??" ||
str == "??" || str == "??" || str == "??" || str == "??" || str == "ʮ" ||
str == "��" || str == "??" || str == "??" || str == "??" || str == "??" ||
str == "ǧ" || str == "??" || str == "??")
if (str == "" || str == "" || str == "" || str == "" || str == "" ||
str == "" || str == "" || str == "" || str == "" || str == "" ||
str == "" || str == "" || str == "" || str == "" || str == "" ||
str == "" || str == "" || str == "亿")
{
return true;
}
Expand All @@ -400,9 +399,9 @@ bool is_chinese_number(const string& str)

//void compute_time()
//{
// clock_t tick = clock();
// double t = (double)tick / CLK_TCK;
// cout << endl << "The time used: " << t << " seconds." << endl;
// clock_t tick = clock();
// double t = (double)tick / CLK_TCK;
// cout << endl << "The time used: " << t << " seconds." << endl;
//}

string word(string& word_pos)
Expand Down Expand Up @@ -446,4 +445,3 @@ void getCharacters_gbk(const string &str, vector<string> &vecCharacter) {
pos += char_num;
}
}

2 changes: 1 addition & 1 deletion src/__xml4nlp/Xml4nlp.h
Original file line number Diff line number Diff line change
Expand Up @@ -823,4 +823,4 @@ class XML4NLP {
static const char * const TAG_ID; // para, sent, word
};

#endif // end for __LTP_XML4NLP_H__
#endif // end for __LTP_XML4NLP_H__
2 changes: 1 addition & 1 deletion src/postagger/postag_dll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class PostaggerWrapper : public ltp::postagger::Postagger {
if (model->external_lexicon.size() != 0) {
for (int i = 0; i < len; ++ i) {
Bitset * mask = model->external_lexicon.get((inst->forms[i]).c_str());
if (NULL == mask) {
if (NULL != mask) {
inst->postag_constrain[i].merge((*mask));
} else {
inst->postag_constrain[i].allsetones();
Expand Down
6 changes: 1 addition & 5 deletions src/postagger/postagger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,9 +667,7 @@ Postagger::test(void) {

// load external lexicon
const char * lexicon_file = test_opt.lexicon_file.c_str();

load_constrain(model, lexicon_file);

const char * test_file = test_opt.test_file.c_str();

ifstream ifs(test_file);
Expand All @@ -696,12 +694,11 @@ Postagger::test(void) {
for (int i = 0; i < len; ++ i) {
inst->tagsidx[i] = model->labels.index(inst->tags[i]);
}

inst->postag_constrain.resize(len);
if (model->external_lexicon.size() != 0) {
for (int i = 0; i < len; ++ i) {
Bitset * mask = model->external_lexicon.get((inst->forms[i]).c_str());
if (NULL == mask) {
if (NULL != mask) {
inst->postag_constrain[i].merge((*mask));
} else {
inst->postag_constrain[i].allsetones();
Expand All @@ -712,7 +709,6 @@ Postagger::test(void) {
inst->postag_constrain[i].allsetones();
}
}

extract_features(inst);
calculate_scores(inst, true);

Expand Down
96 changes: 96 additions & 0 deletions src/unittest/postagger_internal_unittest.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
#include <iostream>
#include <fstream>
#include <map>
#include <vector>
#include <gtest/gtest.h>
#include "postagger/model.h"
#include "postagger/constrainutil.hpp"
#include "utils/smartmap.hpp"
#include "utils/tinybitset.hpp"

using namespace std;
using namespace ltp::utility;

const int kMaxTagIndex = 27;
const char * kTags[] = { "a", "b", "c",
Expand Down Expand Up @@ -73,6 +80,95 @@ TEST(postagger_unittest, test_load_constrain_with_known_and_unknown_tag) {
EXPECT_EQ(nr_constraints, 1);
}

/**
 * Cross-checks load_constrain() against an independent parse of the same
 * constraints file.
 *
 * Two word -> tag-list maps are built and compared entry by entry:
 *   - mapping_stl:      parsed here directly from the constraints file; the
 *                       first token of each line (passed through sbc2dbc_x)
 *                       is the key, the remaining tokens are the tags,
 *                       sorted and de-duplicated.
 *   - mapping_smartmap: read back from model.external_lexicon after
 *                       load_constrain(); every set bit of an entry's mask
 *                       is translated to its label through model.labels,
 *                       then the labels are sorted.
 *
 * The test requires both fixture files to be readable; otherwise it fails
 * immediately instead of comparing two empty maps.
 */
TEST(postagger_unittest, test_load_constrain_function_correction) {
  const std::string model_file = "./ltp_data/pos.model";
  const std::string constraints_file = "./test_data/unittest/postag-known.constrain";

  std::ifstream mfs(model_file.c_str());
  std::ifstream lfs(constraints_file.c_str());

  // A missing fixture would leave both maps empty and let every check below
  // pass vacuously, so treat an unreadable file as a fatal test failure.
  ASSERT_TRUE(mfs.is_open()) << "cannot open model file: " << model_file;
  ASSERT_TRUE(lfs.is_open()) << "cannot open constraints file: " << constraints_file;

  // Automatic storage: the model is released on every exit path, including
  // the early returns triggered by ASSERT_* failures.
  ltp::postagger::Model model;
  model.load(mfs);
  load_constrain(&model, constraints_file.c_str());

  typedef std::vector< int > int_vec;
  typedef std::vector< std::string > str_vec;
  typedef std::map< std::string, str_vec > Map;
  typedef Map::iterator MapIt;
  typedef Map::const_iterator MapConstIt;

  Map mapping_stl;
  Map mapping_smartmap;

  // Read the constraints file and build mapping_stl.
  std::string line;
  while (std::getline(lfs, line)) {
    line = ltp::strutils::chomp(line);
    if (line.size() == 0) {
      continue;
    }

    str_vec tokens = ltp::strutils::split(line);
    if (tokens.empty()) {
      // Nothing to key on; guards the tokens[0] access below.
      continue;
    }

    const std::string key = ltp::strutils::chartypes::sbc2dbc_x(tokens[0]);

    // operator[] creates the (possibly empty) entry on first sight of a key,
    // so a word listed without tags is still recorded.
    str_vec & tags = mapping_stl[key];
    for (size_t i = 1; i < tokens.size(); ++ i) {
      tags.push_back(tokens[i]);
    }
  }

  // Sort and de-duplicate every tag list in mapping_stl.
  for (MapIt it = mapping_stl.begin(); it != mapping_stl.end(); ++ it) {
    str_vec & tags = it->second;
    std::sort(tags.begin(), tags.end());
    tags.erase(std::unique(tags.begin(), tags.end()), tags.end());
  }

  // Traverse the model's external_lexicon and build mapping_smartmap.
  for (SmartMap<Bitset>::const_iterator itx = model.external_lexicon.begin();
      itx != model.external_lexicon.end();
      ++ itx) {
    const std::string key = itx.key();
    Bitset mask = (*(itx.value()));
    const int_vec ones = mask.getbitones();

    str_vec labels;
    for (size_t i = 0; i < ones.size(); ++ i) {
      const std::string label = model.labels.at(ones[i]);
      labels.push_back(label);
    }
    std::sort(labels.begin(), labels.end());
    mapping_smartmap.insert(Map::value_type(key, labels));
  }

  // Both maps must hold the same number of keys.
  EXPECT_EQ(mapping_stl.size(), mapping_smartmap.size());

  // Every key parsed from the file must be present in the lexicon with an
  // identical tag list. The first missing key or size mismatch aborts the
  // test, matching the original early-return behaviour.
  for (MapConstIt it = mapping_stl.begin(); it != mapping_stl.end(); ++ it) {
    const std::string & key = it->first;
    const str_vec & value_stl = it->second;

    MapConstIt key_it = mapping_smartmap.find(key);
    ASSERT_TRUE(mapping_smartmap.end() != key_it)
      << "key missing from external_lexicon: " << key;

    const str_vec & value_smartmap = key_it->second;
    ASSERT_EQ(value_stl.size(), value_smartmap.size())
      << "tag count mismatch for key: " << key;

    for (size_t i = 0; i < value_stl.size(); ++ i) {
      EXPECT_EQ(value_stl[i], value_smartmap[i]);
    }
  }
}


int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
2 changes: 1 addition & 1 deletion src/utils/tinybitset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace ltp {
namespace utility {

const int kBucketSize = int( sizeof(unsigned) ) * 8;
const int kN = log(kBucketSize) / log(2);
const int kN = int( log(kBucketSize) / log(2) );

struct Bitset{
private:
Expand Down

0 comments on commit c59fdb8

Please sign in to comment.