Skip to content

Commit

Permalink
pos internal&external lexicon debug code
Browse files (browse the repository at this point in the history)
  • Loading branch information
niuox committed Apr 16, 2014
1 parent 0fda579 commit 2276ee5
Show file tree
Hide file tree
Showing 11 changed files with 276 additions and 142 deletions.
51 changes: 26 additions & 25 deletions src/postagger/decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ namespace postagger {


void
Decoder::decode(Instance * inst,const Poslexicon* lexicon ) {
Decoder::decode(Instance * inst) {
init_lattice(inst);
viterbi_decode(inst,lexicon);
viterbi_decode(inst);
get_result(inst);
free_lattice();
}
Expand Down Expand Up @@ -40,32 +40,33 @@ void Decoder::viterbi_decode_inner(const Instance * inst,int i,int l){
}

void
Decoder::viterbi_decode(const Instance * inst,const Poslexicon* lexicon) {
Decoder::viterbi_decode(const Instance * inst) {
int len = inst->size();
if (!lexicon){
for (int i = 0; i < len; ++ i) {
bool external_lexicon_flag = false;
if(inst->external_lexicon_match_state.size() == len){
external_lexicon_flag = true;
}
for (int i = 0; i < len; ++ i) {
if(external_lexicon_flag && inst->external_lexicon_match_state[i].isnotempty()) {
for (int l = 0; l < L; ++ l) {
viterbi_decode_inner(inst,i,l);
} // end for label kinds
} // end for len
} // end for lexicon == NULL
else{
std::vector <int> lex_labels;
int lex_labels_size;
for (int i = 0; i < len; ++ i) {
if ( lexicon->get(inst->forms[i] ,lex_labels) ){
lex_labels_size = lex_labels.size();
for (int l_idx = 0; l_idx < lex_labels_size; ++l_idx) {
viterbi_decode_inner( inst,i,lex_labels[l_idx] );
} // end for label kinds
}// end for if word found in lexicon
else{
for (int l = 0; l < L; ++ l) {
if(inst->external_lexicon_match_state[i].get(l)){
viterbi_decode_inner(inst,i,l);
}
}
}
else if(inst->internal_lexicon_match_state[i].isnotempty()) {
for (int l = 0; l < L; ++ l) {
if(inst->internal_lexicon_match_state[i].get(l)) {
viterbi_decode_inner(inst,i,l);
} // end for label kinds
}// end for word not found in lexicon
} // end for len
} // end for lexicon != NULL
}
}
}
else{
for (int l = 0; l < L; ++ l) {
viterbi_decode_inner(inst,i,l);
}
}
}
}

void
Expand Down
5 changes: 2 additions & 3 deletions src/postagger/decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <vector>
#include "instance.h"
#include "mat.h"
#include "poslexicon.h"

namespace ltp {
namespace postagger {
Expand Down Expand Up @@ -36,11 +35,11 @@ class LatticeItem {
class Decoder {
public:
Decoder (int _L) : L(_L) {}
void decode(Instance * inst,const Poslexicon* lexicon = NULL);
void decode(Instance * inst);
private:
void init_lattice(const Instance * inst);
void viterbi_decode_inner(const Instance * inst,int i,int l);
void viterbi_decode(const Instance * inst,const Poslexicon* lexicon = NULL);
void viterbi_decode(const Instance * inst);
void get_result(Instance * inst);
void free_lattice();

Expand Down
6 changes: 6 additions & 0 deletions src/postagger/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
#include "featurevec.h"
#include "mat.h"
#include "sparsevec.h"
#include "tinybitset.hpp"

namespace ltp {
namespace postagger {

using namespace ltp::utility;

class Instance {
public:
Instance() {}
Expand Down Expand Up @@ -64,6 +67,9 @@ class Instance {
std::vector< std::string > predicted_tags;
std::vector< int > predicted_tagsidx;

std::vector<Bitset> internal_lexicon_match_state;
std::vector<Bitset> external_lexicon_match_state;

math::SparseVec features; /*< the gold features */
math::SparseVec predicted_features; /*< the predicted features */

Expand Down
12 changes: 12 additions & 0 deletions src/postagger/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,21 @@ void Model::save(std::ostream & ofs) {
int off = ofs.tellp();

unsigned labels_offset = 0;
unsigned lexicon_offset = 0;
unsigned feature_offset = 0;
unsigned parameter_offset = 0;

write_uint(ofs, 0); // the label offset
write_uint(ofs, 0); // the internal lexicon offset
write_uint(ofs, 0); // the features offset
write_uint(ofs, 0); // the parameter offset

labels_offset = ofs.tellp();
labels.dump(ofs);

lexicon_offset = ofs.tellp();
internal_lexicon.dump(ofs);

feature_offset = ofs.tellp();
space.dump(ofs);

Expand All @@ -35,6 +40,7 @@ void Model::save(std::ostream & ofs) {

ofs.seekp(off);
write_uint(ofs, labels_offset);
write_uint(ofs, lexicon_offset);
write_uint(ofs, feature_offset);
write_uint(ofs, parameter_offset);
}
Expand All @@ -48,6 +54,7 @@ bool Model::load(std::istream & ifs) {
}

unsigned labels_offset = read_uint(ifs);
unsigned lexicon_offset = read_uint(ifs);
unsigned feature_offset = read_uint(ifs);
unsigned parameter_offset = read_uint(ifs);

Expand All @@ -56,6 +63,11 @@ bool Model::load(std::istream & ifs) {
return false;
}

ifs.seekg(lexicon_offset);
if (!internal_lexicon.load(ifs)){
return false;
}

ifs.seekg(feature_offset);
if (!space.load(labels.size(), ifs)) {
return false;
Expand Down
7 changes: 3 additions & 4 deletions src/postagger/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@

#include "featurespace.h"
#include "parameter.h"

#include "smartmap.hpp"
#include "poslexicon.h"
#include "tinybitset.hpp"

namespace ltp {
namespace postagger {
Expand Down Expand Up @@ -43,8 +42,8 @@ class Model {
IndexableSmartMap labels;
FeatureSpace space;
Parameters param;

Poslexicon poslexicon;
SmartMap<Bitset> internal_lexicon;
SmartMap<Bitset> external_lexicon;

private:
void write_uint(std::ostream & out, unsigned int val) {
Expand Down
62 changes: 0 additions & 62 deletions src/postagger/poslexicon.h

This file was deleted.

98 changes: 62 additions & 36 deletions src/postagger/postag_dll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "sbcdbc.hpp"
#include <iostream>

using namespace ltp::utility;

class PostaggerWrapper : public ltp::postagger::Postagger {
public:
PostaggerWrapper() {}
Expand All @@ -26,43 +28,48 @@ class PostaggerWrapper : public ltp::postagger::Postagger {
return false;
}

if (NULL != lexicon_file) {
std::ifstream lfs(lexicon_file);
if (lfs) {

std::string buffer;
std::vector<std::string> key_values;
int key_values_size;
std::string key;
std::vector<int> values;
int value;

while (std::getline(lfs, buffer)) {
buffer = ltp::strutils::chomp(buffer);
if (buffer.size() == 0) {
continue;
}
key_values = ltp::strutils::split(buffer);
key_values_size = key_values.size();
key = ltp::strutils::chartypes::sbc2dbc_x(key_values[0]);
values.clear();
for(int i=1;i<key_values_size;i++){
value = model->labels.index(key_values[i]);
if (value != -1){
values.push_back( value );
}
else {
std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " is not existed in LTP labels set."<<std::endl;
}
if (NULL != lexicon_file) {
std::ifstream lfs(lexicon_file);
if (lfs) {
std::string buffer;
std::vector<std::string> key_values;
int key_values_size;
std::string key;
int value;
Bitset * original_bitset;
while (std::getline(lfs, buffer)) {
buffer = ltp::strutils::chomp(buffer);
if (buffer.size() == 0) {
continue;
}
Bitset values;
key_values = ltp::strutils::split(buffer);
key_values_size = key_values.size();
key = ltp::strutils::chartypes::sbc2dbc_x(key_values[0]);
for(int i=1;i<key_values_size;i++){
value = model->labels.index(key_values[i]);
if (value != -1){
if(!(values.set(value))) {
std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " add external lexicon error."<<std::endl;
}
}
else {
std::cerr << "Tag named " << key_values[i] << " for word "<< key_values[0]<< " is not existed in LTP labels set."<<std::endl;
}
}
if(values.isnotempty()){
original_bitset = model->external_lexicon.get(key.c_str());
if(original_bitset){
original_bitset->merge(values);
}
sort(values.begin(),values.end());
values.erase( unique(values.begin(),values.end()),values.end() );
if (int(values.size()) > 0){
model->poslexicon.set(key,values);
else{
model->external_lexicon.set(key.c_str(),values);
}
}
}
}
}

}
}
}

return true;
}
Expand All @@ -72,14 +79,33 @@ class PostaggerWrapper : public ltp::postagger::Postagger {
ltp::postagger::Instance * inst = new ltp::postagger::Instance;
ltp::postagger::Decoder deco(model->num_labels());
int wt = 0;
Bitset * original_bitset;

for (int i = 0; i < words.size(); ++ i) {
inst->forms.push_back(ltp::strutils::chartypes::sbc2dbc_x_wt(words[i],wt));
inst->wordtypes.push_back(wt);
original_bitset = model->internal_lexicon.get((inst->forms[i]).c_str());
if(original_bitset){
inst->internal_lexicon_match_state.push_back((*original_bitset));
}
else{
inst->internal_lexicon_match_state.push_back(Bitset());
}
if( int(model->external_lexicon.size()) != 0){
original_bitset = model->external_lexicon.get((inst->forms[i]).c_str());
if(original_bitset){
inst->external_lexicon_match_state.push_back((*original_bitset));
}
else{
inst->external_lexicon_match_state.push_back(Bitset());
}
}
}

ltp::postagger::Postagger::extract_features(inst);
ltp::postagger::Postagger::calculate_scores(inst, true);
deco.decode(inst,&(model->poslexicon) );
//deco.decode(inst,&(model->external_lexicon) );
deco.decode(inst);

ltp::postagger::Postagger::build_labels(inst, tags);

Expand Down
1 change: 1 addition & 0 deletions src/postagger/postag_dll.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <iostream>
#include <vector>
#include "tinybitset.hpp"

#define POSTAGGER_DLL_API
#define POSTAGGER_DLL_API_EXPORT
Expand Down
Loading

0 comments on commit 2276ee5

Please sign in to comment.