Skip to content

Commit

Permalink
personal segmentor
Browse files Browse the repository at this point in the history
  • Loading branch information
ruoshui1126 committed Nov 6, 2014
1 parent 1b5b0fb commit b3e9d24
Show file tree
Hide file tree
Showing 13 changed files with 1,103 additions and 24 deletions.
9 changes: 8 additions & 1 deletion src/ner/ner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,9 @@ void NER::test(void) {
NERReader reader(ifs);
NERWriter writer(cout);
Instance * inst = NULL;
int beg_tag0 = (model->labels.index( "B-Nh" ) / __num_ne_types__);
int beg_tag1 = (model->labels.index( "S-Nh" ) / __num_ne_types__);
int beg_tag2 = (model->labels.index( "O" ) / __num_ne_types__);

// int beg_tag0 = model->labels.index( __b__ );
// int beg_tag1 = model->labels.index( __s__ );
Expand All @@ -693,14 +696,18 @@ void NER::test(void) {
calculate_scores(inst, true);
decoder->decode(inst);

inst->predicted_tags.resize(len);
for(int i = 0; i < len; ++i) {
inst->predicted_tags[i] = model->labels.at(inst->predicted_tagsidx[i]);
}
writer.write(inst);
delete inst;
}

double after = get_time();
TRACE_LOG("Eclipse time %lf", after - before);

sleep(1000000);
// sleep(1000000);
return;
}

Expand Down
2 changes: 1 addition & 1 deletion src/parser/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,7 @@ Parser::test() {
total_rels);
}

sleep(1000000);
//sleep(1000000);
}

// Enumerate all the subtrees in the whole tree space (without specified tree),
Expand Down
6 changes: 6 additions & 0 deletions src/segmentor/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,23 @@ set_target_properties (segmentor_shared_lib PROPERTIES
# TOOLKIT
# -----------------------------------------------
add_executable (otcws otcws.cpp ${segment_SRC})
add_executable (personal_otcws personal_otcws.cpp ${segment_SRC} personal_segmentor.cpp)
message (STATUS ${LIBRARY_OUTPUT_PATH} )
link_directories ( ${LIBRARY_OUTPUT_PATH} )

target_link_libraries (otcws boost_regex_static_lib)
target_link_libraries (personal_otcws boost_regex_static_lib)

# redirect the output binary to tools/train
set_target_properties (otcws
PROPERTIES
OUTPUT_NAME otcws
RUNTIME_OUTPUT_DIRECTORY ${TOOLS_DIR}/train/)

set_target_properties (personal_otcws
PROPERTIES
OUTPUT_NAME personal_otcws
RUNTIME_OUTPUT_DIRECTORY ${TOOLS_DIR}/train/)
configure_file (
segment_dll.h
${INCLUDE_OUTPUT_PATH}/ltp/segment_dll.h)
23 changes: 23 additions & 0 deletions src/segmentor/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,32 @@ class Instance {
}
}

if (personal_uni_features.total_size() > 0) {
int d1 = personal_uni_features.nrows();
int d2 = personal_uni_features.ncols();

for (int i = 0; i < d1; ++i) {
if (personal_uni_features[i][0]) {
personal_uni_features[i][0]->clear();
}
for (int j = 0; j < d2; ++j) {
if (personal_uni_features[i][j]) {
delete personal_uni_features[i][j];
}
}
}
}

uni_features.dealloc();
personal_uni_features.dealloc();
uni_scores.dealloc();
bi_scores.dealloc();

features.zero();
personal_features.zero();
predicted_features.zero();
personal_predicted_features.zero();


return 0;
}
Expand All @@ -125,9 +145,12 @@ class Instance {
std::vector< int > lexicon_match_state;

math::SparseVec features; /*< the gold features */
math::SparseVec personal_features; /*< the gold features */
math::SparseVec predicted_features; /*< the predicted features */
math::SparseVec personal_predicted_features; /*< the predicted features */

math::Mat< math::FeatureVector *> uni_features;
math::Mat< math::FeatureVector *> personal_uni_features;
math::Mat< double > uni_scores;
math::Mat< double > bi_scores;
};
Expand Down
2 changes: 2 additions & 0 deletions src/segmentor/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Model::save(std::ostream & ofs) {
// write a signature into the file
char chunk[16] = {'o','t','c','w','s', '\0'};
ofs.write(chunk, 16);
ofs.write(reinterpret_cast<const char *>(&end_time), sizeof(int));

int off = ofs.tellp();

Expand Down Expand Up @@ -54,6 +55,7 @@ Model::load(std::istream & ifs) {
if (strcmp(chunk, "otcws")) {
return false;
}
ifs.read(reinterpret_cast<char *>(&end_time), sizeof(int));

unsigned labels_offset = read_uint(ifs);
unsigned lexicon_offset = read_uint(ifs);
Expand Down
1 change: 1 addition & 0 deletions src/segmentor/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class Model {
bool load(std::istream & ifs);

public:
int end_time;
IndexableSmartMap labels;
FeatureSpace space;
Parameters param;
Expand Down
2 changes: 2 additions & 0 deletions src/segmentor/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ struct TrainOptions {
std::string train_file;
std::string holdout_file;
std::string model_name;
std::string personal_model_name;
std::string algorithm;
int max_iter;
int display_interval;
Expand All @@ -23,6 +24,7 @@ struct TrainOptions {
struct TestOptions {
std::string test_file;
std::string model_file;
std::string personal_model_file;
std::string lexicon_file;
};

Expand Down
26 changes: 22 additions & 4 deletions src/segmentor/parameter.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,25 +115,42 @@ class Parameters {
return ret;
}

// Dot product of a sparse feature vector with the "flushed" weights.
//
// For every active feature the flushed weight is
//   _W_sum[idx] + _W[idx] * (end_time - beg_time)
// i.e. the same per-index quantity the scalar dot_flush_time(idx, ...) overload
// returns. Presumably this is the accumulated weight as it would look after
// flush(end_time) for a weight last flushed at beg_time -- confirm with callers.
//
// @param vec       sparse feature vector; idx[i] + loff is the parameter index,
//                  val (when non-null) holds the per-feature values.
// @param beg_time  start timestamp of the un-flushed interval.
// @param end_time  end timestamp of the un-flushed interval.
// @return          sum over features of flushed_weight * value, where value is
//                  taken as 1 when vec->val is null.
double dot_flush_time(const FeatureVector * vec, int beg_time, int end_time) const {
  const int elapsed = end_time - beg_time;
  double ret = 0;
  for (int i = 0; i < vec->n; ++ i) {
    const int idx = vec->idx[i] + vec->loff;
    const double flushed = _W_sum[idx] + _W[idx] * elapsed;
    // The feature value must scale the whole flushed weight; previously only
    // the _W[idx] * elapsed term was multiplied and _W_sum[idx] was added
    // unscaled (operator-precedence slip).
    ret += (vec->val ? flushed * vec->val[i] : flushed);
  }
  return ret;
}

// Look up the weight stored for a single parameter index.
//
// @param idx      parameter index.
// @param use_avg  when true read from _W_sum, otherwise from _W.
// @return         the selected array's entry at idx.
double dot(const int idx, bool use_avg = false) const {
  if (use_avg) {
    return _W_sum[idx];
  }
  return _W[idx];
}

// Flushed weight for a single parameter index:
// _W_sum[idx] plus _W[idx] scaled by the interval length (end_time - beg_time).
double dot_flush_time(const int idx, int beg_time, int end_time) const {
  const int elapsed = end_time - beg_time;
  return _W_sum[idx] + _W[idx] * elapsed;
}

// Bring every parameter's accumulated sum up to timestamp `now`.
//
// For each of the _dim parameters, adds _W[k] times the time elapsed since
// that parameter's recorded _W_time[k] into _W_sum[k], then records `now`
// as its new timestamp.
void flush(int now) {
  for (int k = 0; k < _dim; ++k) {
    _W_sum[k] = _W_sum[k] + (now - _W_time[k]) * _W[k];
    _W_time[k] = now;
  }
}

void dump(std::ostream & out, bool use_avg = true) {
const double * p = (use_avg ? _W_sum : _W);
void dump(std::ostream & out) {
char chunk[16] = {'p', 'a', 'r', 'a', 'm', 0};
out.write(chunk, 16);
out.write(reinterpret_cast<const char *>(&_dim), sizeof(int));
if (_dim > 0) {
out.write(reinterpret_cast<const char *>(p), sizeof(double) * _dim);
out.write(reinterpret_cast<const char *>(_W), sizeof(double) * _dim);
out.write(reinterpret_cast<const char *>(_W_sum), sizeof(double) * _dim);
}
}

Expand All @@ -147,8 +164,9 @@ class Parameters {
in.read(reinterpret_cast<char *>(&_dim), sizeof(int));
if (_dim > 0) {
_W = new double[_dim];
_W_sum = new double[_dim];
in.read(reinterpret_cast<char *>(_W), sizeof(double) * _dim);
_W_sum = _W;
in.read(reinterpret_cast<char *>(_W_sum), sizeof(double) * _dim);
}

return true;
Expand Down
34 changes: 34 additions & 0 deletions src/segmentor/personal_otcws.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include <iostream>
#include "utils/cfgparser.hpp"
#include "utils/logging.hpp"
#include "segmentor/personal_segmentor.h"

using namespace ltp::utility;
using namespace ltp::segmentor;

// Print the command-line help for this tool to stderr.
//
// The text names the executable "personal_otcws" (the name the build gives
// this binary) rather than "otcws", which was left over from the file this
// entry point was copied from.
void usage(void) {
  std::cerr << "personal_otcws - Training and testing suite for Chinese Word segmentation"
            << std::endl;
  std::cerr << "Copyright (C) 2012-2014 HIT-SCIR" << std::endl;
  std::cerr << std::endl;
  std::cerr << "usage: ./personal_otcws <config_file>" << std::endl;
  std::cerr << std::endl;
}

// Entry point: expects the path of a config file as the first argument.
//
// Prints the usage text and returns -1 when no argument is given or the first
// argument starts with "-h". Returns -1 as well when the config parser
// evaluates to false after construction. Otherwise builds a
// Personal_Segmentor from the config, runs it, and returns 0.
int main(int argc, const char * argv[]) {
  // No config file supplied.
  if (argc < 2) {
    usage();
    return -1;
  }

  // Help requested ("-h...").
  const char * const first_arg = argv[1];
  if (first_arg[0] == '-' && first_arg[1] == 'h') {
    usage();
    return -1;
  }

  ConfigParser cfg(first_arg);
  if (!cfg) {
    ERROR_LOG("Failed to parse config file.");
    return -1;
  }

  Personal_Segmentor segmentor(cfg);
  segmentor.run();
  return 0;
}
Loading

0 comments on commit b3e9d24

Please sign in to comment.