Skip to content

Commit

Permalink
Finish Segmentor migration.
Browse files Browse the repository at this point in the history
  • Loading branch information
Oneplus committed May 13, 2015
1 parent 34a4d16 commit 46c7782
Show file tree
Hide file tree
Showing 23 changed files with 1,611 additions and 1,332 deletions.
69 changes: 16 additions & 53 deletions src/segmentor/decoder.cpp
Original file line number Diff line number Diff line change
@@ -1,69 +1,32 @@
#include "segmentor/decoder.h"
#include "segmentor/preprocessor.h"

namespace ltp {
namespace segmentor {

void Decoder::decode(Instance * inst, const ScoreMatrix* scm) {
init_lattice(inst->size(), L);
viterbi_decode(inst, scm);
get_result(inst);
free_lattice();
// Creates a constrain with no character-type table attached. While the
// pointer is null, can_emit() skips its character-type rule.
SegmentorConstrain::SegmentorConstrain()
  : chartypes(0) {}

void Decoder::viterbi_decode(const Instance * inst, const ScoreMatrix* scm) {
int len = inst->size();
for (int i = 0; i < len; ++ i) {
for (int l = 0; l < L; ++ l) {
if (false == base.legal_emit(inst->chartypes[i], l)) {
continue;
}

if (i == 0) {
LatticeItem * item = new LatticeItem(i, l, scm->uni_scores[i][l], NULL);
lattice_insert(lattice[i][l], item);
} else {
for (int pl = 0; pl < L; ++ pl) {
if (false == base.legal_trans(pl, l)) {
continue;
}

double score = 0.;
const LatticeItem * prev = lattice[i-1][pl];

if (!prev) {
continue;
}
void SegmentorConstrain::regist(const std::vector<int>* _chartypes) {
chartypes = _chartypes;
}

// std::cout << i << " " << pl << " " << l << std::endl;
score = scm->uni_scores[i][l] + scm->bi_scores[pl][l] + prev->score;
const LatticeItem * item = new LatticeItem(i, l, score, prev);
lattice_insert(lattice[i][l], item);
}
} // end for if i == 0
}
}
// Tells whether tag j may directly follow tag i.
//
// NOTE(review): assumes tag ids 0..3 stand for B, I, E, S, which matches
// can_emit() allowing only 0 or 3 at the first position -- confirm against
// the tag alphabet used by the model.
//
// Allowed transitions under that assumption:
//   B or I -> I or E   (the current word continues or ends)
//   E or S -> B or S   (a new word starts)
//
// The previous second clause tested `i == 3 || j == 3`, which rejected
// E -> B and accepted every transition out of S (including S -> I and
// S -> E).
//
// @param[in] i  the previous tag id
// @param[in] j  the current tag id
// @return       true if the transition i -> j is legal
bool SegmentorConstrain::can_tran(const size_t& i, const size_t& j) const {
  if (i == 0 || i == 1) {
    // Still inside a word: it must continue (1) or close (2).
    return (j == 1 || j == 2);
  }
  if (i == 2 || i == 3) {
    // The previous word is complete: the next tag must open a word (0 or 3).
    return (j == 0 || j == 3);
  }
  // Any other previous tag id has no legal successor.
  return false;
}

void Decoder::get_result(Instance * inst) {
int len = inst->size();
const LatticeItem * best_item = NULL;
for (int l = 0; l < L; ++ l) {
if (!lattice[len-1][l]) {
continue;
}
if (best_item == NULL || (lattice[len-1][l]->score > best_item->score)) {
best_item = lattice[len - 1][l];
bool SegmentorConstrain::can_emit(const size_t& i, const size_t& j) const {
if (i == 0 && !(j == 0 || j == 3)) { return false; }

if (chartypes) {
int flag = chartypes->at(i);
if (((flag&0x07)== Preprocessor::CHAR_ENG) || ((flag&0x07) == Preprocessor::CHAR_URI)) {
return (j == 3);
}
}

const LatticeItem * item = best_item;
inst->predicted_tagsidx.resize(len);
// backtracking
while (item) {
inst->predicted_tagsidx[item->i] = item->l;
// std::cout << item->i << " " << item->l << std::endl;
item = item->prev;
}
return true;
}

} // end for namespace segmentor
Expand Down
24 changes: 7 additions & 17 deletions src/segmentor/decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,22 @@
#include <iostream>
#include <vector>
#include "framework/decoder.h"
#include "segmentor/instance.h"
#include "segmentor/rulebase.h"
#include "segmentor/score_matrix.h"
#include "utils/math/mat.h"

namespace ltp {
namespace segmentor {

class Decoder: public framework::ViterbiDecoder {
class SegmentorConstrain: public framework::ViterbiDecodeConstrain {
private:
typedef framework::ViterbiLatticeItem LatticeItem;
int L;
rulebase::RuleBase base;
const std::vector<int>* chartypes;
public:
Decoder (int _l, rulebase::RuleBase & _base) : L(_l), base(_base) {}
SegmentorConstrain();

/**
* The main decoding process
* @param[in/out] the instance
* @param[in] the score matrix
*/
void decode(Instance * inst, const ScoreMatrix* scm);
void regist(const std::vector<int>* chartypes);

private:
void viterbi_decode(const Instance * inst, const ScoreMatrix* scm);
void get_result(Instance * inst);
bool can_tran(const size_t& i, const size_t& j) const;

bool can_emit(const size_t& i, const size_t& j) const;
};

} // end for namespace segmentor
Expand Down
88 changes: 37 additions & 51 deletions src/segmentor/extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
namespace ltp {
namespace segmentor {

namespace utils = ltp::utility;
using utility::Template;
using utility::StringVec;

std::vector<utils::Template *> Extractor::templates;
std::vector<Template *> Extractor::templates;

Extractor& Extractor::extractor() {
static Extractor instance_;
Expand All @@ -22,70 +23,55 @@ int Extractor::num_templates() {

Extractor::Extractor() {
// delimit feature templates
templates.push_back(new utils::Template("1={c-2}"));
templates.push_back(new utils::Template("2={c-1}"));
templates.push_back(new utils::Template("3={c-0}"));
templates.push_back(new utils::Template("4={c+1}"));
templates.push_back(new utils::Template("5={c+2}"));
templates.push_back(new utils::Template("6={c-2}-{c-1}"));
templates.push_back(new utils::Template("7={c-1}-{c-0}"));
templates.push_back(new utils::Template("8={c-0}-{c+1}"));
templates.push_back(new utils::Template("9={c+1}-{c+2}"));
//templates.push_back(new Template("10={c-2}-{c-0}"));
//templates.push_back(new Template("11={c-1}-{c+1}"));
//templates.push_back(new Template("12={c-0}-{c+2}"));
//templates.push_back(new Template("13={c-1}-{c-0}-{c+1}"));
templates.push_back(new utils::Template("14={ct-1}"));
templates.push_back(new utils::Template("15={ct-0}"));
templates.push_back(new utils::Template("16={ct+1}"));
templates.push_back(new utils::Template("17={lex1}"));
templates.push_back(new utils::Template("18={lex2}"));
templates.push_back(new utils::Template("19={lex3}"));
//templates.push_back(new Template("17={dup-1}"));
//templates.push_back(new Template("18={dup-0}"));
//templates.push_back(new Template("19={dup2-2}"));
//templates.push_back(new Template("20={dup2-1}"));
//templates.push_back(new Template("21={dup2-0}"));
templates.push_back(new Template("1={c-2}"));
templates.push_back(new Template("2={c-1}"));
templates.push_back(new Template("3={c-0}"));
templates.push_back(new Template("4={c+1}"));
templates.push_back(new Template("5={c+2}"));
templates.push_back(new Template("6={c-2}-{c-1}"));
templates.push_back(new Template("7={c-1}-{c-0}"));
templates.push_back(new Template("8={c-0}-{c+1}"));
templates.push_back(new Template("9={c+1}-{c+2}"));
templates.push_back(new Template("14={ct-1}"));
templates.push_back(new Template("15={ct-0}"));
templates.push_back(new Template("16={ct+1}"));
templates.push_back(new Template("17={lex1}"));
templates.push_back(new Template("18={lex2}"));
templates.push_back(new Template("19={lex3}"));
}

Extractor::~Extractor() {
for (int i = 0; i < templates.size(); ++ i) {
for (size_t i = 0; i < templates.size(); ++ i) {
delete templates[i];
}
}

int Extractor::extract1o(const Instance * inst, int idx,
std::vector< utils::StringVec > & cache) {
int Extractor::extract1o(const Instance& inst, int idx,
std::vector<StringVec>& cache) {

int len = inst->size();
size_t len = inst.size();
Template::Data data;

utils::Template::Data data;

#define EQU(x, y) (inst->forms[(x)] == inst->forms[(y)])
#define TYPE(x) (strutils::to_str(inst->chartypes[(x)]&0x07))
data.set( "c-2", (idx-2 < 0 ? BOS : inst->forms[idx-2]) );
data.set( "c-1", (idx-1 < 0 ? BOS : inst->forms[idx-1]) );
data.set( "c-0", inst->forms[idx] );
data.set( "c+1", (idx+1 >= len ? EOS : inst->forms[idx+1]) );
data.set( "c+2", (idx+2 >= len ? EOS : inst->forms[idx+2]) );
data.set( "ct-1", (idx-1 < 0 ? BOT : TYPE(idx-1)) );
data.set( "ct-0", TYPE(idx) );
data.set( "ct+1", (idx+1 >= len ? EOT : TYPE(idx+1)) );
// data.set( "dup-1", (idx-1 > 0 && EQU(idx-1, idx) ? "1" : "0") );
// data.set( "dup-0", (idx+1 < len && EQU(idx, idx+1) ? "1" : "0") );
// data.set( "dup2-2", (idx-2 > 0 && EQU(idx-2, idx) ? "1" : "0") );
// data.set( "dup2-1", (idx-1 > 0 && idx+1 < len && EQU(idx-1, idx+1) ? "1" : "0") );
// data.set( "dup2-0", (idx+2 < len && EQU(idx, idx+2) ? "1" : "0") );
data.set( "lex1", strutils::to_str(inst->lexicon_match_state[idx] & 0x0f));
data.set( "lex2", strutils::to_str((inst->lexicon_match_state[idx]>>4) & 0x0f));
data.set( "lex3", strutils::to_str((inst->lexicon_match_state[idx]>>8) & 0x0f));
#define EQU(x, y) (inst.forms[(x)] == inst.forms[(y)])
#define TYPE(x) (strutils::to_str(inst.chartypes[(x)]&0x07))
data.set( "c-2", (idx-2 < 0 ? BOS : inst.forms[idx-2]) );
data.set( "c-1", (idx-1 < 0 ? BOS : inst.forms[idx-1]) );
data.set( "c-0", inst.forms[idx] );
data.set( "c+1", (idx+1 >= len ? EOS : inst.forms[idx+1]) );
data.set( "c+2", (idx+2 >= len ? EOS : inst.forms[idx+2]) );
data.set( "ct-1", (idx-1 < 0 ? BOT : TYPE(idx-1)) );
data.set( "ct-0", TYPE(idx) );
data.set( "ct+1", (idx+1 >= len ? EOT : TYPE(idx+1)) );
data.set( "lex1", strutils::to_str(inst.lexicon_match_state[idx] & 0x0f));
data.set( "lex2", strutils::to_str((inst.lexicon_match_state[idx]>>4) & 0x0f));
data.set( "lex3", strutils::to_str((inst.lexicon_match_state[idx]>>8) & 0x0f));
#undef TYPE
#undef EQU

std::string feat;
feat.reserve(1024);
// render features
for (int i = 0; i < templates.size(); ++ i) {
for (size_t i = 0; i < templates.size(); ++ i) {
templates[i]->render(data, feat);
cache[i].push_back(feat);
}
Expand Down
8 changes: 3 additions & 5 deletions src/segmentor/extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
namespace ltp {
namespace segmentor {

namespace utils = ltp::utility;

/**
* A singleton for extracting features
*
Expand All @@ -29,13 +27,13 @@ class Extractor {
* @param[in] idx The index of the current form.
* @param[out] cache The feature cache; the feature rendered from template i is appended to cache[i].
*/
static int extract1o(const Instance * inst, int idx,
std::vector< utils::StringVec >& cache);
static int extract1o(const Instance& inst, int idx,
std::vector< utility::StringVec >& cache);
protected:
Extractor();
~Extractor();
private:
static std::vector< utils::Template* > templates;
static std::vector< utility::Template* > templates;
};

} // end for namespace segmentor
Expand Down
34 changes: 17 additions & 17 deletions src/segmentor/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@ class Instance {
}

/**
* return the number of tags that predicted wrong
* return the number of tags that predict wrong
* @return int the number
*/
int num_errors() {
int len = size();
if ((len != tagsidx.size()) || (len != predicted_tagsidx.size())) {
if ((len != tagsidx.size()) || (len != predict_tagsidx.size())) {
return -1;
}

int ret = 0;
for (int i = 0; i < len; ++ i) {
if (tagsidx[i] != predicted_tagsidx[i]) {
if (tagsidx[i] != predict_tagsidx[i]) {
++ ret;
}
}
Expand All @@ -42,7 +42,7 @@ class Instance {
}

int num_predicted_words() {
return predicted_words.size();
return predict_words.size();
}

int num_gold_words() {
Expand All @@ -51,39 +51,39 @@ class Instance {


/**
* calculate the number of words that predicted right
* calculate the number of words that predict right
* @return int the number
*/
int num_recalled_words() {
int len = 0;
int ret = 0;
int gold_len = 0, predicted_len = 0;
int gold_len = 0, predict_len = 0;

for (int i = 0; i < words.size(); ++ i) {
len += words[i].size();
}

for (int i = 0, j = 0; i < words.size() && j < predicted_words.size(); ) {
if (words[i] == predicted_words[j]) {
for (int i = 0, j = 0; i < words.size() && j < predict_words.size(); ) {
if (words[i] == predict_words[j]) {
++ ret;
gold_len += words[i].size();
predicted_len += predicted_words[j].size();
predict_len += predict_words[j].size();

++ i;
++ j;
} else {
gold_len += words[i].size();
predicted_len += predicted_words[j].size();
predict_len += predict_words[j].size();

++ i;
++ j;

while (gold_len < len && predicted_len < len) {
if (gold_len < predicted_len) {
while (gold_len < len && predict_len < len) {
if (gold_len < predict_len) {
gold_len += words[i].size();
++ i;
} else if (gold_len > predicted_len) {
predicted_len += predicted_words[j].size();
} else if (gold_len > predict_len) {
predict_len += predict_words[j].size();
++ j;
} else {
break;
Expand All @@ -101,10 +101,10 @@ class Instance {
std::vector< int > chartypes; // types of characters, digit, text, punct etc.
std::vector< std::string > tags; // tags of characters, {B I E S}
std::vector< int > tagsidx; // int tags
std::vector< std::string > predicted_tags;
std::vector< int > predicted_tagsidx;
std::vector< std::string > predict_tags;
std::vector< int > predict_tagsidx;
std::vector< std::string > words; // words of the input
std::vector< std::string > predicted_words;
std::vector< std::string > predict_words;
std::vector< int > lexicon_match_state;
};

Expand Down
Loading

0 comments on commit 46c7782

Please sign in to comment.