From 484855df4148ef8de0a5e3005a10df19483490ab Mon Sep 17 00:00:00 2001 From: Oneplus Date: Wed, 25 Sep 2013 02:56:18 +0800 Subject: [PATCH] add pos cmdline test, and multithreaded test --- examples/Makefile | 38 +++-- examples/multi_cws_cmdline.cpp | 14 +- examples/multi_pos_cmdline.cpp | 146 ++++++++++++++++++ examples/pos.cpp | 1 + examples/pos_cmdline.cpp | 82 ++++++++++ src/ner/ner_dll.cpp | 8 +- src/postagger/postag_dll.cpp | 6 +- src/segmentor/segment_dll.cpp | 7 +- tools/train/conf/parser/parser-o1.cnf | 2 +- tools/train/conf/parser/parser-o2carreras.cnf | 2 +- tools/train/conf/parser/parser-o2sib.cnf | 2 +- tools/train/rock.sh | 71 +++++++++ 12 files changed, 347 insertions(+), 32 deletions(-) create mode 100644 examples/multi_pos_cmdline.cpp create mode 100644 examples/pos_cmdline.cpp diff --git a/examples/Makefile b/examples/Makefile index df702872b..a39b2d4ce 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,4 +1,7 @@ -all: cws cws_cmdline pos par ner multi_cws_cmdline +all: cws cws_cmdline multi_cws_cmdline \ + pos pos_cmdline multi_pos_cmdline \ + par \ + ner cws: cws.cpp g++ -o cws cws.cpp -I./ \ @@ -12,11 +15,33 @@ cws_cmdline: cws_cmdline.cpp -I../thirdparty/boost/include \ -Wl,-dn -L../lib/ -lsegmentor -lboost_regex -Wl,-dy +multi_cws_cmdline: multi_cws_cmdline.cpp + g++ -o multi_cws_cmdline multi_cws_cmdline.cpp \ + thirdparty/tinythreadpp/tinythread.cpp \ + -I./ \ + -I../include/ \ + -I../thirdparty/boost/include/ \ + -I./thirdparty/tinythreadpp/ \ + -Wl,-dn -L../lib/ -lsegmentor -lboost_regex -Wl,-dy -lpthread + pos: pos.cpp g++ -o pos pos.cpp -I./ \ -I../include/ \ -L../lib/ -lpostagger +pos_cmdline: pos_cmdline.cpp + g++ -o pos_cmdline pos_cmdline.cpp -I./ \ + -I../include/ \ + -Wl,-dn -L../lib/ -lpostagger -Wl,-dy + +multi_pos_cmdline: multi_pos_cmdline.cpp + g++ -o multi_pos_cmdline multi_pos_cmdline.cpp \ + thirdparty/tinythreadpp/tinythread.cpp \ + -I./ \ + -I../include/ \ + -I./thirdparty/tinythreadpp/ \ + -Wl,-dn -L../lib/ 
-lpostagger -Wl,-dy -lpthread + ner: ner.cpp g++ -o ner ner.cpp -I./ \ -I../src/ner/ \ @@ -27,21 +52,14 @@ par: par.cpp -I../src/parser/ \ -L../lib -lparser -multi_cws_cmdline: multi_cws_cmdline.cpp - g++ -o multi_cws_cmdline multi_cws_cmdline.cpp \ - thirdparty/tinythreadpp/tinythread.cpp \ - -I./ \ - -I../include/ \ - -I../thirdparty/boost/include/ \ - -I./thirdparty/tinythreadpp/ \ - -Wl,-dn -L../lib/ -lsegmentor -lboost_regex -Wl,-dy -lpthread - .PHONY: clean clean: rm cws rm cws_cmdline rm pos + rm pos_cmdline rm ner rm par rm multi_cws_cmdline + rm multi_pos_cmdline diff --git a/examples/multi_cws_cmdline.cpp b/examples/multi_cws_cmdline.cpp index b207cd9c1..e76f61325 100644 --- a/examples/multi_cws_cmdline.cpp +++ b/examples/multi_cws_cmdline.cpp @@ -12,6 +12,7 @@ * is not compilable under MSVC */ #include +#include #include #include #include @@ -89,13 +90,14 @@ void multithreaded_segment( void * args) { } int main(int argc, char ** argv) { - if (argc < 2) { - std::cerr << "multi-cws [model path] [lexicon file]" << std::endl; + if (argc < 2 || (0 == strcmp(argv[1], "-h"))) { /* the model path is mandatory; argc < 2 also keeps the null argv[1] away from strcmp() */ + std::cerr << "Example: ./multi_cws_cmdline [model path] [lexicon file]" << std::endl; + std::cerr << std::endl; + std::cerr << "This program recieve input word sequence from stdin." << std::endl; + std::cerr << "One sentence per line." << std::endl; + return -1; } - string sentence; - vector result; - void * engine = 0; if (argc == 2) { engine = segmentor_create_segmentor(argv[1]); @@ -107,8 +109,6 @@ int main(int argc, char ** argv) { return -1; } - std::vector words; - int num_threads = thread::hardware_concurrency(); std::cerr << "TRACE: Model is loaded" << std::endl; std::cerr << "TRACE: Running " << num_threads << " thread(s)" << std::endl; diff --git a/examples/multi_pos_cmdline.cpp b/examples/multi_pos_cmdline.cpp new file mode 100644 index 000000000..6617fa182 --- /dev/null +++ b/examples/multi_pos_cmdline.cpp @@ -0,0 +1,146 @@ +/* + * Multi-threaded postagger test program. 
The user input a line + * of Chinese sentence an the program will output its segment + * result. + * + * @dependency package: tinythread - a portable c++ wrapper for + * multi-thread library. + * @author: LIU, Yijia + * @data: 2013-09-24 + * + * This program is special designed for UNIX user, for get time + * is not compilable under MSVC + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "postag_dll.h" +#include "tinythread.h" +#include "fast_mutex.h" + +using namespace std; +using namespace tthread; + +const int MAX_LEN = 1024; + +double get_time(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec / 1000000.0); +} + +class Dispatcher { +public: + Dispatcher( void * model ) { + _model = model; + } + + int next(std::vector &words) { + std::string line; + std::string word; + lock_guard guard(_mutex); + if (getline(std::cin, line, '\n')) { + std::stringstream S(line); + words.clear(); + while (S >> word) { words.push_back(word); } + } else { + return -1; + } + return 0; + } + + void output(const std::vector & words, + const std::vector &postags) { + lock_guard guard(_mutex); + if (words.size() != postags.size()) { + return; + } + + for (int i = 0; i < words.size(); ++ i) { + std::cout << words[i] << "_" << postags[i]; + std::cout << (i == words.size() - 1 ? 
'\n' : '|'); + } + return; + } + + void * model() { + return _model; + } + +private: + fast_mutex _mutex; + void * _model; + string _sentence; +}; + +void multithreaded_postag( void * args) { + std::vector words; + std::vector postags; + + Dispatcher * dispatcher = (Dispatcher *)args; + void * model = dispatcher->model(); + + while (true) { + int ret = dispatcher->next(words); + + if (ret < 0) + break; + + postags.clear(); + postagger_postag(model, words, postags); + dispatcher->output(words, postags); + } + + return; +} + +int main(int argc, char ** argv) { + if (argc < 2 || (0 == strcmp(argv[1], "-h"))) { /* the model path is mandatory; argc < 2 also keeps the null argv[1] away from strcmp() */ + std::cerr << "Usage: ./multi_pos_cmdline [model path]" << std::endl; + std::cerr << std::endl; + std::cerr << "This program recieve input word sequence from stdin." << std::endl; + std::cerr << "One sentence per line. Words are separated by space." << std::endl; + return -1; + } + + void * engine = postagger_create_postagger(argv[1]); + + if (!engine) { + return -1; + } + + int num_threads = thread::hardware_concurrency(); + std::cerr << "TRACE: Model is loaded" << std::endl; + std::cerr << "TRACE: Running " << num_threads << " thread(s)" << std::endl; + + Dispatcher * dispatcher = new Dispatcher( engine ); + + double tm = get_time(); + list thread_list; + for (int i = 0; i < num_threads; ++ i) { + thread * t = new thread( multithreaded_postag, (void *)dispatcher ); + thread_list.push_back( t ); + } + + for (list::iterator i = thread_list.begin(); + i != thread_list.end(); ++ i) { + thread * t = *i; + t->join(); + delete t; + } + + tm = get_time() - tm; + std::cerr << "TRACE: consume " + << tm + << " seconds." 
+ << std::endl; + + return 0; +} + diff --git a/examples/pos.cpp b/examples/pos.cpp index 99b3148a2..840d60f6b 100644 --- a/examples/pos.cpp +++ b/examples/pos.cpp @@ -5,6 +5,7 @@ int main(int argc, char * argv[]) { - if (argc < 1) { + if (argc < 2) { /* the model path is mandatory; with argc < 1 the usage message below could never be printed */ + std::cerr << "pos [model path]" << std::endl; return -1; } diff --git a/examples/pos_cmdline.cpp b/examples/pos_cmdline.cpp new file mode 100644 index 000000000..feca053b5 --- /dev/null +++ b/examples/pos_cmdline.cpp @@ -0,0 +1,82 @@ +/* + * Single-threaded postagger test program. The user inputs a line + * of space-separated words and the program will output its postag + * result. + * + * @dependency package: postagger library (postag_dll.h); no threading + * library is needed. + * @author: LIU, Yijia + * @date: 2013-09-24 + * + * This program is specially designed for UNIX users, because get_time + * is not compilable under MSVC + */ +#include +#include +#include +#include +#include +#include +#include +#include "postag_dll.h" + +double get_time(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec / 1000000.0); +} + +int main(int argc, char * argv[]) { + if (argc < 2 || (0 == strcmp(argv[1], "-h"))) { /* the model path is mandatory; argc < 2 also keeps the null argv[1] away from strcmp() */ + std::cerr << "Example: ./pos_cmdline [model path]" << std::endl; + std::cerr << std::endl; + std::cerr << "This program recieve input word sequence from stdin." << std::endl; + std::cerr << "One sentence per line. Words are separated by space." << std::endl; + std::cerr << std::endl; + return 1; + } + + void * engine = postagger_create_postagger(argv[1]); + if (!engine) { + std::cerr << "WARNINIG : Failed to load model." 
<< std::endl; + return -1; + } + + std::string line; + std::string word; + std::vector words; + std::vector postags; + + std::cerr << "TRACE: Model is loaded" << std::endl; + double tm = get_time(); + + while (std::getline(std::cin, line, '\n')) { + std::stringstream S(line); + words.clear(); + while (S >> word) { words.push_back(word); } + + if (words.size() == 0) { continue; } + int len = postagger_postag(engine, words, postags); + if (postags.size() != words.size()) { + std::cerr << "WARNINIG: Number of postags is different from number of words" + << std::endl; + } + + for (int i = 0; i < len; ++ i) { + std::cout << words[i] << "_" << postags[i]; + if (i+1 == len) std::cout <labels); - decoder = new ltp::ner::Decoder(model->num_labels(), base); - // beg_tag0 = model->labels.index( ); // beg_tag1 = model->labels.index( ); @@ -42,6 +39,9 @@ class NERWrapper : public ltp::ner::NER { int recognize(const std::vector & words, const std::vector & postags, std::vector & tags) { + ltp::ner::rulebase::RuleBase base(model->labels); + ltp::ner::Decoder deco(model->num_labels(), base); + ltp::ner::Instance * inst = new ltp::ner::Instance; if (words.size() != postags.size()) { return 0; @@ -54,7 +54,7 @@ class NERWrapper : public ltp::ner::NER { ltp::ner::NER::extract_features(inst); ltp::ner::NER::calculate_scores(inst, true); - decoder->decode(inst); + deco.decode(inst); for (int i = 0; i < words.size(); ++ i) { tags.push_back(model->labels.at(inst->predicted_tagsidx[i])); diff --git a/src/postagger/postag_dll.cpp b/src/postagger/postag_dll.cpp index aa8d37c93..a669072c0 100644 --- a/src/postagger/postag_dll.cpp +++ b/src/postagger/postag_dll.cpp @@ -27,21 +27,21 @@ class PostaggerWrapper : public ltp::postagger::Postagger { return false; } - decoder = new ltp::postagger::Decoder(model->num_labels()); - return true; } int postag(const std::vector & words, std::vector & tags) { ltp::postagger::Instance * inst = new ltp::postagger::Instance; + ltp::postagger::Decoder 
deco(model->num_labels()); + for (int i = 0; i < words.size(); ++ i) { inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x(words[i])); } ltp::postagger::Postagger::extract_features(inst); ltp::postagger::Postagger::calculate_scores(inst, true); - decoder->decode(inst); + deco.decode(inst); ltp::postagger::Postagger::build_labels(inst, tags); diff --git a/src/segmentor/segment_dll.cpp b/src/segmentor/segment_dll.cpp index 049f0afcc..bb815304a 100644 --- a/src/segmentor/segment_dll.cpp +++ b/src/segmentor/segment_dll.cpp @@ -75,15 +75,12 @@ class SegmentorWrapper : public ltp::segmentor::Segmentor { // allocate a new decoder so that the segmentor support multithreaded // decoding. this modification was committed by niuox - ltp::segmentor::Decoder * decoder_temp = new ltp::segmentor::Decoder( - model->num_labels(), - *baseAll); + ltp::segmentor::Decoder deco(model->num_labels(), *baseAll); - decoder_temp->decode(inst); + deco.decode(inst); ltp::segmentor::Segmentor::build_words(inst, inst->predicted_tagsidx, words, beg_tag0, beg_tag1); delete inst; - delete decoder_temp; return words.size(); } diff --git a/tools/train/conf/parser/parser-o1.cnf b/tools/train/conf/parser/parser-o1.cnf index 8dc4549b1..7335e7f7e 100644 --- a/tools/train/conf/parser/parser-o1.cnf +++ b/tools/train/conf/parser/parser-o1.cnf @@ -19,4 +19,4 @@ holdout-file = sample/parser/example-holdout.conll max-iter = 1 algorithm = pa -model-name = build/parser/example-o1 +model-name = build/parser/example-parser-o1 diff --git a/tools/train/conf/parser/parser-o2carreras.cnf b/tools/train/conf/parser/parser-o2carreras.cnf index 3937f6945..99e0c8a4b 100644 --- a/tools/train/conf/parser/parser-o2carreras.cnf +++ b/tools/train/conf/parser/parser-o2carreras.cnf @@ -22,4 +22,4 @@ holdout-file = sample/parser/example-holdout.conll max-iter = 1 algorithm = pa -model-name = build/parser/ldc-o2carreras +model-name = build/parser/example-parser-o2carreras diff --git a/tools/train/conf/parser/parser-o2sib.cnf 
b/tools/train/conf/parser/parser-o2sib.cnf index 66ce823d4..503868c72 100644 --- a/tools/train/conf/parser/parser-o2sib.cnf +++ b/tools/train/conf/parser/parser-o2sib.cnf @@ -19,4 +19,4 @@ holdout-file = sample/parser/example-holdout.conll max-iter = 1 algorithm = pa -model-name = build/parser/example-o2sib +model-name = build/parser/example-parser-o2sib diff --git a/tools/train/rock.sh b/tools/train/rock.sh index 75c6b77e6..d89752f8f 100755 --- a/tools/train/rock.sh +++ b/tools/train/rock.sh @@ -107,4 +107,75 @@ else echo "[3] TRACE: NER train model test is passed." fi +################################################# +# THE PARSER O1 SESSION # +################################################# + +PARSER_MODEL_DIR=$BUILD_DIR/parser +PARSER_MODEL_O1_PATH=$PARSER_MODEL_DIR/example-parser-o1.0.model + +PARSER_CONF_DIR=$CONF_DIR/parser +PARSER_CONF_TRAIN_O1_PATH=$PARSER_CONF_DIR/parser-o1.cnf + +PARSER_LOG_DIR=$LOG_DIR/parser +PARSER_LOG_TRAIN_O1_PATH=$PARSER_LOG_DIR/example-train.conll + +PARSER_EXE=./lgdpj + +mkdir -p $PARSER_MODEL_DIR +mkdir -p $PARSER_LOG_DIR + +$PARSER_EXE $PARSER_CONF_TRAIN_O1_PATH >& $PARSER_LOG_TRAIN_O1_PATH + +if [ ! -f $PARSER_MODEL_O1_PATH ]; then + echo "[4.1] ERROR: Parser-o1 model is not detected!" +else + echo "[4.1] TRACE: Parser-o1 train model test is passed." +fi + +################################################# +# THE PARSER O2 sibling SESSION # +################################################# + +PARSER_MODEL_DIR=$BUILD_DIR/parser +PARSER_MODEL_O2SIB_PATH=$PARSER_MODEL_DIR/example-parser-o2sib.0.model + +PARSER_CONF_DIR=$CONF_DIR/parser +PARSER_CONF_TRAIN_O2SIB_PATH=$PARSER_CONF_DIR/parser-o2sib.cnf + +PARSER_LOG_DIR=$LOG_DIR/parser +PARSER_LOG_TRAIN_O2SIB_PATH=$PARSER_LOG_DIR/example-train.conll + +PARSER_EXE=./lgdpj + +$PARSER_EXE $PARSER_CONF_TRAIN_O2SIB_PATH >& $PARSER_LOG_TRAIN_O2SIB_PATH + +if [ ! -f $PARSER_MODEL_O2SIB_PATH ]; then + echo "[4.2] ERROR: Parser-o2sib model is not detected!" 
+else + echo "[4.2] TRACE: Parser-o2sib train model test is passed." +fi + +################################################# +# THE PARSER O2 carreras SESSION # +################################################# + +PARSER_MODEL_DIR=$BUILD_DIR/parser +PARSER_MODEL_O2CARRERAS_PATH=$PARSER_MODEL_DIR/example-parser-o2carreras.0.model + +PARSER_CONF_DIR=$CONF_DIR/parser +PARSER_CONF_TRAIN_O2CARRERAS_PATH=$PARSER_CONF_DIR/parser-o2carreras.cnf + +PARSER_LOG_DIR=$LOG_DIR/parser +PARSER_LOG_TRAIN_O2CARRERAS_PATH=$PARSER_LOG_DIR/example-train.conll + +PARSER_EXE=./lgdpj + +$PARSER_EXE $PARSER_CONF_TRAIN_O2CARRERAS_PATH >& $PARSER_LOG_TRAIN_O2CARRERAS_PATH + +if [ ! -f "$PARSER_MODEL_O2CARRERAS_PATH" ]; then # test the o2carreras model itself, not the o2sib model produced by step 4.2 + echo "[4.3] ERROR: Parser-o2carreras model is not detected!" +else + echo "[4.3] TRACE: Parser-o2carreras train model test is passed." +fi