Skip to content

Commit

Permalink
fix dump bug in segmentor, implement zero feature truncation, model size is reduced

Browse files Browse the repository at this point in the history
  • Loading branch information
Oneplus committed Jul 29, 2013
1 parent 4de486f commit 04564e7
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 22 deletions.
25 changes: 10 additions & 15 deletions src/postagger/featurespace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,23 @@ namespace postagger {
FeatureSpace::FeatureSpace(int num_labels) :
_num_labels(num_labels),
_offset(0) {
dicts.resize(Extractor::num_templates());

for (int i = 0; i < dicts.size(); ++ i) {
dicts[i] = new utility::SmartMap<int>();
}
_num_dicts = Extractor::num_templates();
dicts = new utility::SmartMap<int>[_num_dicts];
}

FeatureSpace::~FeatureSpace(void) {
for (int i = 0; i < dicts.size(); ++ i) {
delete dicts[i];
}
delete [](dicts);
}

int FeatureSpace::retrieve(int tid, const char * key, bool create) {
int val;

if (dicts[tid]->get(key, val)) {
if (dicts[tid].get(key, val)) {
return val;
} else {
if (create) {
val = _offset;
dicts[tid]->set(key, val);
dicts[tid].set(key, val);
++ _offset;

return val;
Expand Down Expand Up @@ -65,15 +60,15 @@ void FeatureSpace::set_num_labels(int num_labels) {
}
void FeatureSpace::dump(std::ostream & ofs) {
char chunk[16];
unsigned int sz = dicts.size();
unsigned sz = _num_dicts;
strncpy(chunk, "featurespace", 16);

ofs.write(chunk, 16);
ofs.write(reinterpret_cast<const char *>(&_offset), sizeof(int));
ofs.write(reinterpret_cast<const char *>(&sz), sizeof(unsigned int));

for (int i = 0; i < dicts.size(); ++ i) {
dicts[i]->dump(ofs);
for (int i = 0; i < _num_dicts; ++ i) {
dicts[i].dump(ofs);
}
}

Expand All @@ -89,12 +84,12 @@ bool FeatureSpace::load(int num_labels, std::istream & ifs) {
ifs.read(reinterpret_cast<char *>(&_offset), sizeof(int));
ifs.read(reinterpret_cast<char *>(&sz), sizeof(unsigned int));

if (sz != dicts.size()) {
if (sz != _num_dicts) {
return false;
}

for (unsigned i = 0; i < sz; ++ i) {
if (!dicts[i]->load(ifs)) {
if (!dicts[i].load(ifs)) {
return false;
}
}
Expand Down
71 changes: 70 additions & 1 deletion src/postagger/featurespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,65 @@
namespace ltp {
namespace postagger {

// Iterator over every (template-id, key) feature entry stored in an array of
// utility::SmartMap<int> dictionaries (one map per feature template).
// Iteration visits all keys of _dicts[0], then _dicts[1], and so on.
//
// operator++ is written as a stackless coroutine: a switch whose `case 1`
// label sits inside the nested loops (the Duff's-device trick), so each call
// resumes the loops exactly where the previous call returned.
class FeatureSpaceIterator {
public:
// Default iterator: not bound to any dictionary array.
FeatureSpaceIterator() :
_dicts(NULL),
_i(0),
_state(0) {
// should be careful about the empty dicts
}

// Bind to `dicts` and advance to the first available entry.
// NOTE(review): end() is built from a past-the-end pointer, and this
// constructor immediately calls ++, which reads _dicts[0] — for the end
// iterator that is an out-of-bounds SmartMap access unless an empty
// sentinel map follows the array; confirm against FeatureSpace's layout.
FeatureSpaceIterator(utility::SmartMap<int> * dicts) :
_dicts(dicts),
_i(0),
_state(0) {
++ (*this);
}

~FeatureSpaceIterator() {
}

// Key string of the current feature entry.
const char * key() { return _j.key(); }
// Feature id (offset into the parameter vector) of the current entry.
int id() { return (*_j.value()); }
// Template id: index of the dictionary the current entry belongs to.
int tid() { return _i; }

// NOTE(review): equality compares only the current dictionary pointer
// (_dicts + _i) against the other iterator's base pointer; it is meant
// solely for `it != end()` style checks, not general iterator equality.
bool operator ==(const FeatureSpaceIterator & other) const { return ((_dicts + _i) == other._dicts); }
bool operator !=(const FeatureSpaceIterator & other) const { return ((_dicts + _i) != other._dicts); }

FeatureSpaceIterator & operator = (const FeatureSpaceIterator & other) {
if (this != &other) {
_dicts = other._dicts;
_i = other._i;
_state = other._state;
}

return *this;
}

// Advance to the next (tid, key) entry. Resumes at `case 1` when called
// on an already-started iterator (_state == 1).
// NOTE(review): the outer loop has no explicit upper bound on _i —
// termination relies on hitting a map whose begin() == end(). That means
// (a) an empty dictionary in the middle of the array ends iteration
// early, skipping later dictionaries, and (b) if no empty map follows
// the last dictionary, _dicts[_i] is read out of bounds. Resuming after
// an empty-map stop also increments a stale _j. Confirm callers never
// hit these cases (cf. the "be careful about the empty dicts" remark).
void operator ++() {
switch (_state) {
case 0:
for (_i = 0; ; ++ _i) {
if (_dicts[_i].begin() == _dicts[_i].end()) {
_state = 1;
return;
}
for (_j = _dicts[_i].begin(); _j != _dicts[_i].end(); ++ _j) {
_state = 1;
return;
case 1:;
}
}
}
}

int _i;      // current dictionary (template) index
int _state;  // coroutine resume point: 0 = not started, 1 = mid-iteration
utility::SmartMap<int>::const_iterator _j;  // position inside _dicts[_i]
utility::SmartMap<int> * _dicts;            // borrowed, not owned
};

class FeatureSpace {
public:
FeatureSpace(int num_labels = 1);
Expand All @@ -35,10 +94,20 @@ class FeatureSpace {
* @param[in] ifs the input stream
*/
bool load(int num_labeles, std::istream & ifs);

FeatureSpaceIterator begin() {
return FeatureSpaceIterator(dicts);
}

FeatureSpaceIterator end() {
return FeatureSpaceIterator(dicts + _num_dicts);
}

private:
int _offset;
int _num_dicts;
int _num_labels;
std::vector< utility::SmartMap<int> * > dicts;
utility::SmartMap<int> * dicts;
};

} // end for namespace postagger
Expand Down
1 change: 1 addition & 0 deletions src/postagger/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ namespace postagger {
ModelOptions model_opt;
TrainOptions train_opt;
TestOptions test_opt;
DumpOptions dump_opt;

} // end for namespace postagger
} // end for namespace ltp
5 changes: 5 additions & 0 deletions src/postagger/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,14 @@ struct TestOptions {
std::string model_file;
};

// Options for the model-dump mode (the [dump] config section): the model at
// `model_file` is loaded and its feature weights are printed (see
// Postagger::dump).
struct DumpOptions {
std::string model_file;  // path of the binary model file to load and dump
};

extern ModelOptions model_opt;
extern TrainOptions train_opt;
extern TestOptions test_opt;
extern DumpOptions dump_opt;

} // end for namespace postagger
} // end for namespace ltp
Expand Down
155 changes: 149 additions & 6 deletions src/postagger/postagger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,19 @@
#if _WIN32
#include <Windows.h>
#define sleep Sleep
#endif // end for _WIN32
#endif // end for _WIN32

namespace ltp {
namespace postagger {

Postagger::Postagger() {
Postagger::Postagger() :
model(0),
decoder(0) {
}

Postagger::Postagger(ltp::utility::ConfigParser & cfg) {
Postagger::Postagger(ltp::utility::ConfigParser & cfg) :
model(0),
decoder(0) {
parse_cfg(cfg);
}

Expand All @@ -44,6 +48,10 @@ void Postagger::run(void) {
if (__TEST__) {
test();
}

if (__DUMP__) {
dump();
}
}

bool Postagger::parse_cfg(ltp::utility::ConfigParser & cfg) {
Expand Down Expand Up @@ -121,7 +129,21 @@ bool Postagger::parse_cfg(ltp::utility::ConfigParser & cfg) {
}
}

return true;
__DUMP__ = false;
dump_opt.model_file = "";

if (cfg.has_section("dump")) {
__DUMP__ = true;

if (cfg.get("dump", "model-file", strbuf)) {
dump_opt.model_file = strbuf;
} else {
ERROR_LOG("model-file is not configed. ");
return false;
}
}

return true;
}

bool Postagger::read_instance(const char * train_file) {
Expand Down Expand Up @@ -294,6 +316,79 @@ void Postagger::collect_features(Instance * inst, const std::vector<int> & tagsi
}
}

// Build a truncated copy of `model` that keeps only features whose parameter
// block is non-zero for at least one label, shrinking the feature space (and
// hence the saved model size). Returns a heap-allocated Model owned by the
// caller; the member `model` is left untouched.
Model * Postagger::truncate(void) {
Model * new_model = new Model;
// copy the label indexable map to the new model
for (int i = 0; i < model->labels.size(); ++ i) {
const char * key = model->labels.at(i);
new_model->labels.push(key);
}
TRACE_LOG("building labels map is done");

int L = new_model->num_labels();
new_model->space.set_num_labels(L);

// iterate over the feature space and see if the parameter value equals to zero

// Pass 1: register in the new feature space every (tid, key) whose L-slot
// parameter block contains at least one non-zero value.
for (FeatureSpaceIterator itx = model->space.begin();
itx != model->space.end();
++ itx) {
const char * key = itx.key();
int tid = itx.tid();
int id = model->space.index(tid, key);
bool flag = false;  // true iff any of the L weights for this feature is non-zero

for (int l = 0; l < L; ++ l) {
// param.dot(id + l): parameter value at slot id+l — presumably the
// (averaged) weight; confirm against Parameters' implementation.
double p = model->param.dot(id + l);
if (p != 0.) {
flag = true;
}
}

if (!flag) {
continue;  // all-zero feature: drop it from the truncated model
}

new_model->space.retrieve(tid, key, true);  // create=true: allocate new id
}

TRACE_LOG("Scanning old features space, building new feature space is done");
new_model->param.realloc(new_model->space.dim());
TRACE_LOG("Parameter dimension of new model is [%d]", new_model->space.dim());

// Pass 2: copy the raw weight, summed-weight and timestamp arrays for every
// surviving feature from its old slot to its new slot.
for (FeatureSpaceIterator itx = new_model->space.begin();
itx != new_model->space.end();
++ itx) {
const char * key = itx.key();
int tid = itx.tid();

int old_id = model->space.index(tid, key);
int new_id = new_model->space.index(tid, key);

for (int l = 0; l < L; ++ l) {
// pay attention to this place, use average should be set true
// some dirty code
new_model->param._W[new_id + l] = model->param._W[old_id + l];
new_model->param._W_sum[new_id + l] = model->param._W_sum[old_id + l];
new_model->param._W_time[new_id + l] = model->param._W_time[old_id + l];
}
}

// Pass 3: copy the L x L label-transition parameters (space.index(prev_label,
// label) overload), which are kept in full — they are never truncated.
for (int pl = 0; pl < L; ++ pl) {
for (int l = 0; l < L; ++ l) {
int old_id = model->space.index(pl, l);
int new_id = new_model->space.index(pl, l);

new_model->param._W[new_id] = model->param._W[old_id];
new_model->param._W_sum[new_id] = model->param._W_sum[old_id];
new_model->param._W_time[new_id] = model->param._W_time[old_id];
}
}
TRACE_LOG("Building new model is done");

return new_model;
}

void Postagger::train(void) {
const char * train_file = train_opt.train_file.c_str();

Expand Down Expand Up @@ -391,11 +486,17 @@ void Postagger::train(void) {
TRACE_LOG("[%d] instances is trained.", train_dat.size());

model->param.flush( train_dat.size() * (iter + 1) );
Model * new_model = truncate();
swap(model, new_model);
evaluate();

std::string saved_model_file = (train_opt.model_name + "." + strutils::to_str(iter) + ".model");
std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary);
model->save(ofs);

swap(model, new_model);
new_model->save(ofs);
delete new_model;
// model->save(ofs);

TRACE_LOG("Model for iteration [%d] is saved to [%s]",
iter + 1,
Expand Down Expand Up @@ -511,5 +612,47 @@ void Postagger::test(void) {
return;
}

} //. end for namespace postagger
// Dump mode entry point: load the binary model named in dump_opt.model_file
// and print every feature weight, then every label-transition weight, to
// stdout in a human-readable "key ( id ) --> value" form.
void Postagger::dump() {
// load model
const char * model_file = dump_opt.model_file.c_str();
// NOTE(review): `ifstream` is unqualified — presumably a using-directive or
// using-declaration elsewhere in this file brings std::ifstream in; confirm.
ifstream mfs(model_file, std::ifstream::binary);

if (!mfs) {
ERROR_LOG("Failed to load model");
return;
}

// NOTE(review): overwrites the `model` member without freeing any existing
// one, and the just-allocated Model leaks on the failure return below —
// acceptable only if dump() runs once on a fresh Postagger; confirm.
model = new Model;
if (!model->load(mfs)) {
ERROR_LOG("Failed to load model");
return;
}

int L = model->num_labels();
TRACE_LOG("Number of labels [%d]", model->num_labels());
TRACE_LOG("Number of features [%d]", model->space.num_features());
TRACE_LOG("Number of dimension [%d]", model->space.dim());

// Emission features: each feature occupies L consecutive parameter slots,
// one per label, starting at its base id.
for (FeatureSpaceIterator itx = model->space.begin(); itx != model->space.end(); ++ itx) {
const char * key = itx.key();
int tid = itx.tid();
int id = model->space.index(tid, key);

for (int l = 0; l < L; ++ l) {
std::cout << key << " ( " << id + l << " ) "
<< " --> "
<< model->param.dot(id + l)
<< std::endl;
}
}

// Label-transition weights: space.index(prev_label, label) overload.
for (int pl = 0; pl < L; ++ pl) {
for (int l = 0; l < L; ++ l) {
int id = model->space.index(pl, l);
std::cout << pl << " --> " << l << " " << model->param.dot(id) << std::endl;
}
}
}

} // end for namespace postagger
} // end for namespace ltp
Loading

0 comments on commit 04564e7

Please sign in to comment.