From 2276ee5ffb4df69ac1be5ef9cd9bfbb5a25210b8 Mon Sep 17 00:00:00 2001 From: niuox Date: Wed, 16 Apr 2014 17:42:30 +0800 Subject: [PATCH] pos internal&external lexicon debug code --- src/postagger/decoder.cpp | 51 +++++++++--------- src/postagger/decoder.h | 5 +- src/postagger/instance.h | 6 +++ src/postagger/model.cpp | 12 +++++ src/postagger/model.h | 7 ++- src/postagger/poslexicon.h | 62 ---------------------- src/postagger/postag_dll.cpp | 98 +++++++++++++++++++++------------- src/postagger/postag_dll.h | 1 + src/postagger/postagger.cpp | 100 ++++++++++++++++++++++++++++++----- src/postagger/postagger.h | 2 + src/utils/tinybitset.hpp | 74 ++++++++++++++++++++++++++ 11 files changed, 276 insertions(+), 142 deletions(-) delete mode 100644 src/postagger/poslexicon.h create mode 100644 src/utils/tinybitset.hpp diff --git a/src/postagger/decoder.cpp b/src/postagger/decoder.cpp index 5f8c086d7..eb2dd47c0 100644 --- a/src/postagger/decoder.cpp +++ b/src/postagger/decoder.cpp @@ -5,9 +5,9 @@ namespace postagger { void -Decoder::decode(Instance * inst,const Poslexicon* lexicon ) { +Decoder::decode(Instance * inst) { init_lattice(inst); - viterbi_decode(inst,lexicon); + viterbi_decode(inst); get_result(inst); free_lattice(); } @@ -40,32 +40,33 @@ void Decoder::viterbi_decode_inner(const Instance * inst,int i,int l){ } void -Decoder::viterbi_decode(const Instance * inst,const Poslexicon* lexicon) { +Decoder::viterbi_decode(const Instance * inst) { int len = inst->size(); - if (!lexicon){ - for (int i = 0; i < len; ++ i) { + bool external_lexicon_flag = false; + if(inst->external_lexicon_match_state.size() == len){ + external_lexicon_flag = true; + } + for (int i = 0; i < len; ++ i) { + if(external_lexicon_flag && inst->external_lexicon_match_state[i].isnotempty()) { for (int l = 0; l < L; ++ l) { - viterbi_decode_inner(inst,i,l); - } // end for label kinds - } // end for len - } // end for lexicon == NULL - else{ - std::vector lex_labels; - int lex_labels_size; - for (int i = 0; i < len; ++ i) { - if ( lexicon->get(inst->forms[i] ,lex_labels) ){ - lex_labels_size = lex_labels.size(); - for (int l_idx = 0; l_idx < lex_labels_size; ++l_idx) { - viterbi_decode_inner( inst,i,lex_labels[l_idx] ); - } // end for label kinds - }// end for if word found in lexicon - else{ - for (int l = 0; l < L; ++ l) { + if(inst->external_lexicon_match_state[i].get(l)){ + viterbi_decode_inner(inst,i,l); + } + } + } + else if(inst->internal_lexicon_match_state[i].isnotempty()) { + for (int l = 0; l < L; ++ l) { + if(inst->internal_lexicon_match_state[i].get(l)) { viterbi_decode_inner(inst,i,l); - } // end for label kinds - }// end for word not found in lexicon - } // end for len - } // end for lexicon != NULL + } + } + } + else{ + for (int l = 0; l < L; ++ l) { + viterbi_decode_inner(inst,i,l); + } + } + } } void diff --git a/src/postagger/decoder.h b/src/postagger/decoder.h index 90124f96b..9c6e6559e 100644 --- a/src/postagger/decoder.h +++ b/src/postagger/decoder.h @@ -6,7 +6,6 @@ #include #include "instance.h" #include "mat.h" -#include "poslexicon.h" namespace ltp { namespace postagger { @@ -36,11 +35,11 @@ class LatticeItem { class Decoder { public: Decoder (int _L) : L(_L) {} - void decode(Instance * inst,const Poslexicon* lexicon = NULL); + void decode(Instance * inst); private: void init_lattice(const Instance * inst); void viterbi_decode_inner(const Instance * inst,int i,int l); - void viterbi_decode(const Instance * inst,const Poslexicon* lexicon = NULL); + void viterbi_decode(const Instance * inst); void get_result(Instance * inst); void free_lattice(); diff --git a/src/postagger/instance.h b/src/postagger/instance.h index 90aecbc18..745fc52e6 100644 --- a/src/postagger/instance.h +++ b/src/postagger/instance.h @@ -5,10 +5,13 @@ #include "featurevec.h" #include "mat.h" #include "sparsevec.h" +#include "tinybitset.hpp" namespace ltp { namespace postagger { +using namespace ltp::utility; + class Instance { public: Instance() {} @@ -64,6 +67,9 @@ class Instance { std::vector< std::string > predicted_tags; std::vector< int > predicted_tagsidx; + std::vector internal_lexicon_match_state; + std::vector external_lexicon_match_state; + math::SparseVec features; /*< the gold features */ math::SparseVec predicted_features; /*< the predicted features */ diff --git a/src/postagger/model.cpp b/src/postagger/model.cpp index 2eb510635..2a48220c9 100644 --- a/src/postagger/model.cpp +++ b/src/postagger/model.cpp @@ -17,16 +17,21 @@ void Model::save(std::ostream & ofs) { int off = ofs.tellp(); unsigned labels_offset = 0; + unsigned lexicon_offset = 0; unsigned feature_offset = 0; unsigned parameter_offset = 0; write_uint(ofs, 0); // the label offset + write_uint(ofs, 0); // the internal lexicon offset write_uint(ofs, 0); // the features offset write_uint(ofs, 0); // the parameter offset labels_offset = ofs.tellp(); labels.dump(ofs); + lexicon_offset = ofs.tellp(); + internal_lexicon.dump(ofs); + feature_offset = ofs.tellp(); space.dump(ofs); @@ -35,6 +40,7 @@ void Model::save(std::ostream & ofs) { ofs.seekp(off); write_uint(ofs, labels_offset); + write_uint(ofs, lexicon_offset); write_uint(ofs, feature_offset); write_uint(ofs, parameter_offset); } @@ -48,6 +54,7 @@ bool Model::load(std::istream & ifs) { } unsigned labels_offset = read_uint(ifs); + unsigned lexicon_offset = read_uint(ifs); unsigned feature_offset = read_uint(ifs); unsigned parameter_offset = read_uint(ifs); @@ -56,6 +63,11 @@ bool Model::load(std::istream & ifs) { return false; } + ifs.seekg(lexicon_offset); + if (!internal_lexicon.load(ifs)){ + return false; + } + ifs.seekg(feature_offset); if (!space.load(labels.size(), ifs)) { return false; diff --git a/src/postagger/model.h b/src/postagger/model.h index 4a91498e5..4bf6f9b49 100644 --- a/src/postagger/model.h +++ b/src/postagger/model.h @@ -3,9 +3,8 @@ #include "featurespace.h" #include "parameter.h" - #include "smartmap.hpp" -#include "poslexicon.h" +#include "tinybitset.hpp" namespace ltp { namespace postagger { @@ -43,8 +42,8 @@ class Model { IndexableSmartMap labels; FeatureSpace space; Parameters param; - - Poslexicon poslexicon; + SmartMap internal_lexicon; + SmartMap external_lexicon; private: void write_uint(std::ostream & out, unsigned int val) { diff --git a/src/postagger/poslexicon.h b/src/postagger/poslexicon.h deleted file mode 100644 index 20fbc0818..000000000 --- a/src/postagger/poslexicon.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __LTP_POSTAGGER_POSLEXICON_H__ -#define __LTP_POSTAGGER_POSLEXICON_H__ - -#include -#include -#include -#include - -namespace ltp { -namespace postagger { -class Poslexicon{ - -public: - Poslexicon(){} - ~Poslexicon(){} - - bool get(const std::string key,std::vector & value) const{ - Lexiconmap_const_iter it = lexiconmap.find(key); - if( it == lexiconmap.end() ){ - return false; - } - else{ - value = it->second; - return true; - } - } - - void set(const std::string key,const std::vector value){ - Lexiconmap_iter it = lexiconmap.find(key); - if (it == lexiconmap.end() ){ - lexiconmap.insert(Lexiconmap::value_type(key, value)); - } - else{ - std::vector & origin = it->second; - origin.insert( origin.begin(),value.begin(),value.end() ); - sort(origin.begin(),origin.end()); - origin.erase( unique(origin.begin(),origin.end()),origin.end() ); - } - } - - void dump(){ - Lexiconmap_const_iter it = lexiconmap.begin(); - int lexicon_size; - for(;it != lexiconmap.end();it++){ - std::cout<first<<" ->"; - lexicon_size = (it->second).size(); - for(int i=0;isecond)[i]; - } - std::cout< > Lexiconmap; - typedef std::map >::iterator Lexiconmap_iter; - typedef std::map >::const_iterator Lexiconmap_const_iter; - Lexiconmap lexiconmap; -}; - -} // end for namespace postagger -} // end for namespace ltp -#endif // end for __LTP_POSTAGGER_POSLEXICON_H__ diff --git a/src/postagger/postag_dll.cpp b/src/postagger/postag_dll.cpp index 098210ac3..2321f0f49 100644 --- a/src/postagger/postag_dll.cpp +++ b/src/postagger/postag_dll.cpp @@ -8,6 +8,8 @@ #include "sbcdbc.hpp" #include +using namespace ltp::utility; + class PostaggerWrapper : public ltp::postagger::Postagger { public: PostaggerWrapper() {} @@ -26,43 +28,48 @@ class PostaggerWrapper : public ltp::postagger::Postagger { return false; } - if (NULL != lexicon_file) { - std::ifstream lfs(lexicon_file); - if (lfs) { - - std::string buffer; - std::vector key_values; - int key_values_size; - std::string key; - std::vector values; - int value; - - while (std::getline(lfs, buffer)) { - buffer = ltp::strutils::chomp(buffer); - if (buffer.size() == 0) { - continue; - } - key_values = ltp::strutils::split(buffer); - key_values_size = key_values.size(); - key = ltp::strutils::chartypes::sbc2dbc_x(key_values[0]); - values.clear(); - for(int i=1;ilabels.index(key_values[i]); - if (value != -1){ - values.push_back( value ); - } - else { - std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " is not existed in LTP labels set."< key_values; + int key_values_size; + std::string key; + int value; + Bitset * original_bitset; + while (std::getline(lfs, buffer)) { + buffer = ltp::strutils::chomp(buffer); + if (buffer.size() == 0) { + continue; + } + Bitset values; + key_values = ltp::strutils::split(buffer); + key_values_size = key_values.size(); + key = ltp::strutils::chartypes::sbc2dbc_x(key_values[0]); + for(int i=1;ilabels.index(key_values[i]); + if (value != -1){ + if(!(values.set(value))) { + std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " add external lexicon error."<external_lexicon.get(key.c_str()); + if(original_bitset){ + original_bitset->merge(values); } - sort(values.begin(),values.end()); - values.erase( unique(values.begin(),values.end()),values.end() ); - if (int(values.size()) > 0){ - model->poslexicon.set(key,values); + else{ + model->external_lexicon.set(key.c_str(),values); } - } - } - } + } + + } + } + } return true; } @@ -72,14 +79,33 @@ class PostaggerWrapper : public ltp::postagger::Postagger { ltp::postagger::Instance * inst = new ltp::postagger::Instance; ltp::postagger::Decoder deco(model->num_labels()); int wt = 0; + Bitset * original_bitset; + for (int i = 0; i < words.size(); ++ i) { inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x_wt(words[i],wt)); inst->wordtypes.push_back(wt); + original_bitset = model->internal_lexicon.get((inst->forms[i]).c_str()); + if(original_bitset){ + inst->internal_lexicon_match_state.push_back((*original_bitset)); + } + else{ + inst->internal_lexicon_match_state.push_back(Bitset()); + } + if( int(model->external_lexicon.size()) != 0){ + original_bitset = model->external_lexicon.get((inst->forms[i]).c_str()); + if(original_bitset){ + inst->external_lexicon_match_state.push_back((*original_bitset)); + } + else{ + inst->external_lexicon_match_state.push_back(Bitset()); + } + } } ltp::postagger::Postagger::extract_features(inst); ltp::postagger::Postagger::calculate_scores(inst, true); - deco.decode(inst,&(model->poslexicon) ); + //deco.decode(inst,&(model->external_lexicon) ); + deco.decode(inst); ltp::postagger::Postagger::build_labels(inst, tags); diff --git a/src/postagger/postag_dll.h b/src/postagger/postag_dll.h index 0bd12641c..04547360b 100644 --- a/src/postagger/postag_dll.h +++ b/src/postagger/postag_dll.h @@ -3,6 +3,7 @@ #include #include +#include "tinybitset.hpp" #define POSTAGGER_DLL_API #define POSTAGGER_DLL_API_EXPORT diff --git a/src/postagger/postagger.cpp b/src/postagger/postagger.cpp index 69a4417ec..7af0fc8e6 100644 --- a/src/postagger/postagger.cpp +++ b/src/postagger/postagger.cpp @@ -20,6 +20,8 @@ namespace ltp { namespace postagger { +using namespace ltp::utility; + Postagger::Postagger() : model(0), decoder(0), @@ -198,15 +200,53 @@ Postagger::read_instance(const char * train_file) { void Postagger::build_configuration(void) { // model->labels.push( __dummy__ ); + + + SmartMap tmp_internal_lexicon; + SmartMap wordfreq; + Bitset * original_bitset; + + //word frequency firstly for (int i = 0; i < train_dat.size(); ++ i) { Instance * inst = train_dat[i]; int len = inst->size(); - inst->tagsidx.resize(len); for (int j = 0; j < len; ++ j) { inst->tagsidx[j] = model->labels.push( inst->tags[j] ); + wordfreq.set(inst->forms[j].c_str(), true); + original_bitset = tmp_internal_lexicon.get((inst->forms[j]).c_str()); + if(original_bitset){ + original_bitset->set(inst->tagsidx[j]); + } + else{ + tmp_internal_lexicon.set((inst->forms[j]).c_str(),Bitset(inst->tagsidx[j]) ); + } + } + } + + for (SmartMap::const_iterator itx = wordfreq.begin(); + itx != wordfreq.end(); + ++ itx) { + if (itx.frequency() >= 5 ) { + original_bitset = tmp_internal_lexicon.get(itx.key()); + if(original_bitset){ + model->internal_lexicon.set(itx.key(), *(original_bitset) ); + } + } + } + + for (int i = 0; i < train_dat.size(); ++ i) { + Instance * inst = train_dat[i]; + int len = inst->size(); + inst->internal_lexicon_match_state.resize(len); + for (int j = 0; j < len; ++ j) { + original_bitset = model->internal_lexicon.get((inst->forms[j]).c_str()); + if(original_bitset){ + inst->internal_lexicon_match_state[j] = (*original_bitset); + } } } + } void @@ -444,6 +484,11 @@ Postagger::erase_rare_features(int * feature_group_updated_time) { } TRACE_LOG("Building new model is done"); + for(SmartMap::const_iterator itx = model->internal_lexicon.begin(); + itx != model->internal_lexicon.end(); + ++itx) { + new_model->internal_lexicon.set(itx.key(),*(itx.value()) ); + } return new_model; } @@ -506,10 +551,9 @@ Postagger::train(void) { Instance * inst = train_dat[i]; calculate_scores(inst, false); - /*in training,we do not need to add lexicon but if lexicon is added , it is ok too */ - //decoder->decode(inst,&(model->poslexicon) ); + //decoder->decode(inst,&(model->external_lexicon) ); decoder->decode(inst); if (inst->features.dim() == 0) { @@ -621,11 +665,18 @@ Postagger::evaluate(double &p) { int num_recalled_tags = 0; int num_tags = 0; + Bitset * original_bitset; + while ((inst = reader.next())) { int len = inst->size(); inst->tagsidx.resize(len); + inst->internal_lexicon_match_state.resize(len); for (int i = 0; i < len; ++ i) { inst->tagsidx[i] = model->labels.index(inst->tags[i]); + original_bitset = model->internal_lexicon.get((inst->forms[i]).c_str()); + if(original_bitset){ + inst->internal_lexicon_match_state[i] = (*original_bitset); + } } extract_features(inst, false); @@ -676,32 +727,38 @@ Postagger::test(void) { std::vector key_values; int key_values_size; std::string key; - std::vector values; int value; - + Bitset * original_bitset; while (std::getline(lfs, buffer)) { buffer = ltp::strutils::chomp(buffer); if (buffer.size() == 0) { continue; } + Bitset values; key_values = ltp::strutils::split(buffer); key_values_size = key_values.size(); key = ltp::strutils::chartypes::sbc2dbc_x(key_values[0]); - values.clear(); for(int i=1;ilabels.index(key_values[i]); if (value != -1){ - values.push_back( value ); + if(!(values.set(value))) { + std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " add external lexicon error."< 0){ - model->poslexicon.set(key,values); + if(values.isnotempty()){ + original_bitset = model->external_lexicon.get(key.c_str()); + if(original_bitset){ + original_bitset->merge(values); + } + else{ + model->external_lexicon.set(key.c_str(),values); + } } + } } } @@ -724,19 +781,38 @@ Postagger::test(void) { int num_tags = 0; double before = get_time(); + Bitset * original_bitset; while ((inst = reader.next())) { int len = inst->size(); inst->tagsidx.resize(len); + inst->internal_lexicon_match_state.resize(len); for (int i = 0; i < len; ++ i) { inst->tagsidx[i] = model->labels.index(inst->tags[i]); + original_bitset = model->internal_lexicon.get((inst->forms[i]).c_str()); + + if(original_bitset){ + inst->internal_lexicon_match_state[i] = (*original_bitset); + } + + if( int(model->external_lexicon.size()) != 0){ + original_bitset = model->external_lexicon.get((inst->forms[i]).c_str()); + if(original_bitset){ + inst->external_lexicon_match_state.push_back((*original_bitset)); + } + else{ + inst->external_lexicon_match_state.push_back(Bitset()); + } + } + } extract_features(inst); calculate_scores(inst, true); //in testing phrase,docode need poslexicon - decoder->decode(inst,&(model->poslexicon) ); + //decoder->decode(inst,&(model->external_lexicon) ); + decoder->decode(inst); build_labels(inst, inst->predicted_tags); writer.write(inst); diff --git a/src/postagger/postagger.h b/src/postagger/postagger.h index 2e62136c9..c4ed30ec7 100644 --- a/src/postagger/postagger.h +++ b/src/postagger/postagger.h @@ -4,6 +4,8 @@ #include "cfgparser.hpp" #include "model.h" #include "decoder.h" +#include "smartmap.hpp" +#include "tinybitset.hpp" namespace ltp { namespace postagger { diff --git a/src/utils/tinybitset.hpp b/src/utils/tinybitset.hpp new file mode 100644 index 000000000..8bde80e2b --- /dev/null +++ b/src/utils/tinybitset.hpp @@ -0,0 +1,74 @@ +#ifndef __SMALLBITSET__ +#define __SMALLBITSET__ + +#include +#include +#include +#include +#include + +namespace ltp { +namespace utility { + +struct Bitset{ + bool nonemptyflag; + unsigned bits[4]; + Bitset(){ + memset(bits,0,sizeof(bits)); + nonemptyflag = 0; + } + Bitset(int val){ + memset(bits,0,sizeof(bits)); + nonemptyflag = 0; + set(val); + } + inline bool isnotempty() const{ + return nonemptyflag; + } + inline bool set(int val){ + int bucket_cap = sizeof(bits)/sizeof(unsigned); + int bucket_size = sizeof(unsigned); + int bucket_index = val/bucket_size; + int bucket_off = val%bucket_size; + if (bucket_index<0 || bucket_index >= bucket_cap){ + return false; + } + bits[bucket_index] |= 1<= bucket_cap){ + return false; + } + if(bits[bucket_index] & (1<