Skip to content

Commit

Permalink
Import boost program options, rewrite Postagger frontend
Browse files Browse the repository at this point in the history
  • Loading branch information
Oneplus committed May 6, 2015
1 parent 8b45324 commit cde7198
Show file tree
Hide file tree
Showing 82 changed files with 9,675 additions and 2,308 deletions.
24 changes: 24 additions & 0 deletions src/framework/frontend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __LTP_FRAMEWORK_FRONTEND_H__
#define __LTP_FRAMEWORK_FRONTEND_H__

namespace ltp {
namespace framework {

/**
 * The mode a frontend is asked to run in.
 *
 * NOTE(review): judging by the enumerator names these presumably select
 * training, testing and model dumping respectively -- confirm against the
 * concrete frontends that consume this enum.
 */
enum FrontendMode {
  kLearn,
  kTest,
  kDump
};

/**
 * Base class for frontends: it only records the FrontendMode it was
 * constructed with and exposes it to derived classes.
 */
class Frontend {
protected:
  FrontendMode mode;  //!< the mode supplied at construction time

public:
  /**
   * Store the requested mode.
   *
   * The constructor is intentionally left non-explicit so that existing
   * code relying on the implicit FrontendMode -> Frontend conversion keeps
   * compiling.
   *
   * @param[in] _mode the mode this frontend runs in
   */
  Frontend(const FrontendMode& _mode): mode(_mode) {}
};

} // namespace framework
} // namespace ltp

#endif // end for __LTP_FRAMEWORK_FRONTEND_H__
37 changes: 37 additions & 0 deletions src/framework/io.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef __LTP_FRAMEWORK_IO_H__
#define __LTP_FRAMEWORK_IO_H__

#include <cstddef>
#include <iostream>
#include <vector>

namespace ltp {
namespace framework {

/**
 * Wraps an input stream and offers a helper for counting its lines.
 *
 * The stream is held by reference and is not owned: it must outlive the
 * Reader.
 */
class Reader {
protected:
  std::istream& is;  //!< the wrapped input stream (not owned)

public:
  /**
   * @param[in] _is the stream to read from
   */
  Reader(std::istream& _is): is(_is) {}

  /**
   * Count the newline characters between the current read position and the
   * end of the stream.
   *
   * Afterwards the stream's error state is cleared and the read position is
   * moved to the very beginning of the stream (not to where it was when the
   * call was made). A final line that is not terminated by '\n' is not
   * counted.
   *
   * @return the number of '\n' characters that were read
   */
  size_t number_of_lines() {
    // The scratch buffer lives on the heap: a 1 MiB automatic array can
    // exhaust the stack on platforms/threads with a small default stack.
    const size_t kBufferSize = 1024 * 1024;
    std::vector<char> buffer(kBufferSize);
    char* const chunk = &buffer[0];
    size_t retval = 0;

    while (true) {
      is.read(chunk, static_cast<std::streamsize>(kBufferSize));
      // gcount() reports how many characters the last read() extracted; a
      // short final chunk is still processed, and the next pass reads 0.
      const std::streamsize cc = is.gcount();
      if (0 == cc) { break; }
      for (std::streamsize i = 0; i < cc; ++i) {
        if (chunk[i] == '\n') { ++retval; }
      }
    }

    // Reaching the end sets eofbit/failbit; clear them so seekg() works.
    is.clear();
    is.seekg(0, std::ios_base::beg);
    return retval;
  }
};

} // namespace framework
} // namespace ltp

#endif // end for __LTP_FRAMEWORK_IO_H__

21 changes: 11 additions & 10 deletions src/framework/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,20 @@ struct ModelOptions {
};

struct TrainOptions {
std::string train_file;
std::string holdout_file;
std::string model_name;
std::string algorithm;
int max_iter;
int display_interval;
int rare_feature_threshold;
std::string train_file;
std::string holdout_file;
std::string model_name;
std::string algorithm;
int max_iter;
int display_interval;
int rare_feature_threshold;
};

struct TestOptions {
std::string test_file;
std::string model_file;
std::string lexicon_file;
std::string test_file;
std::string model_file;
std::string lexicon_file;
bool evaluate;
};

struct DumpOptions {
Expand Down
Empty file modified src/ner/decoder.h
100755 → 100644
Empty file.
7 changes: 3 additions & 4 deletions src/postagger/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ include_directories ( ${SOURCE_DIR}/ )
set (postagger_VERSION "0.0.2")

set (postagger_SRC
options.cpp
decoder.cpp
featurespace.cpp
model.cpp
Expand All @@ -30,10 +29,10 @@ set_target_properties (postagger_shared_lib PROPERTIES

add_library (postagger postag_dll.cpp ${postagger_SRC})

add_executable (otpos otpos.cpp ${postagger_SRC})
add_executable (otpos otpos.cpp postagger_frontend.cpp io.cpp ${postagger_SRC})
# redirect the output binary to tools/train
set_target_properties (otpos
PROPERTIES
target_link_libraries (otpos boost_program_options_static_lib)
set_target_properties (otpos PROPERTIES
OUTPUT_NAME otpos
RUNTIME_OUTPUT_DIRECTORY ${TOOLS_DIR}/train/)

Expand Down
3 changes: 3 additions & 0 deletions src/postagger/decode_context.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#ifndef __LTP_POSTAGGER_DECODE_CONTEXT_H__
#define __LTP_POSTAGGER_DECODE_CONTEXT_H__

#include "utils/math/mat.h"
#include "utils/math/sparsevec.h"
#include "utils/math/featurevec.h"
namespace ltp {
namespace postagger {

Expand Down
65 changes: 19 additions & 46 deletions src/postagger/extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
namespace ltp {
namespace postagger {

using strutils::codecs::decode;
using strutils::to_str;
using utility::StringVec;
using utility::Template;

std::vector<Template *> Extractor::templates;

Extractor& Extractor::extractor() {
Expand All @@ -28,25 +33,8 @@ Extractor::Extractor() {
templates.push_back(new Template("6={c-1}-{c-0}"));
templates.push_back(new Template("7={c-0}-{c+1}"));
templates.push_back(new Template("8={c-1}-{c+1}"));
//templates.push_back(new Template("9={ct-1}"));
//templates.push_back(new Template("10={ct-0}"));
//templates.push_back(new Template("11={ct+1}"));
//templates.push_back(new Template("7={c-1}-{c-0}-{c+1}"));
//templates.push_back(new Template("9={len}"));
//templates.push_back(new Template("9={ch-0,0}-{ch-0,n}"));
// templates.push_back(new Template("10={ch-1,n}-{ch-0,0}"));
//templates.push_back(new Template("11={ch-0,n}-{ch+1,0}"));
templates.push_back(new Template("12={prefix}"));
templates.push_back(new Template("13={suffix}"));
//templates.push_back(new Template("14={pos}"));
//templates.push_back(new Template("14={ct-1}"));
//templates.push_back(new Template("15={ct-0}"));
//templates.push_back(new Template("16={ct+1}"));
//templates.push_back(new Template("17={dup-1}"));
//templates.push_back(new Template("18={dup-0}"));
//templates.push_back(new Template("19={dup2-2}"));
//templates.push_back(new Template("20={dup2-1}"));
//templates.push_back(new Template("21={dup2-0}"));
}

Extractor::~Extractor() {
Expand All @@ -55,57 +43,42 @@ Extractor::~Extractor() {
}
}

int Extractor::extract1o(Instance * inst, int idx, std::vector< StringVec > & cache) {
int Extractor::extract1o(const Instance* inst, int idx,
std::vector<StringVec>& cache) {
int len = inst->size();

if (inst->chars.size() == 0) {
inst->chars.resize(len);
for (int i = 0; i < len; ++ i) {
strutils::codecs::decode(inst->forms[i], inst->chars[i]);
}
}
std::vector<std::string> chars;
decode(inst->forms[idx], chars);

Template::Data data;

//#define TYPE(x) (strutils::to_str(inst->wordtypes[(x)]))

data.set( "c-2", (idx-2 < 0 ? BOS : inst->forms[idx-2]) );
data.set( "c-1", (idx-1 < 0 ? BOS : inst->forms[idx-1]) );
data.set( "c-0", inst->forms[idx] );
data.set( "c+1", (idx+1 >= len ? EOS : inst->forms[idx+1]) );
data.set( "c+2", (idx+2 >= len ? EOS : inst->forms[idx+2]) );
//data.set( "ct-1", (idx-1 < 0 ? BOT : TYPE(idx-1)) );
//data.set( "ct-0", TYPE(idx) );
//data.set( "ct+1", (idx+1 >= len ? EOT : TYPE(idx+1)) );

int length = inst->forms[idx].size(); length = (length < 5 ? length : 5);
data.set( "len", strutils::to_str(length));
data.set( "len", to_str(length));

// data.set( "ch-1,n", (idx-1 < 0 ? BOC : inst->chars[idx-1][inst->chars[idx-1].size()-1]));
// data.set( "ch-0,0", inst->chars[idx][0] );
// data.set( "ch-0,n", inst->chars[idx][inst->chars[idx].size()-1]);
// data.set( "ch+1,0", (idx+1 >= len ? EOC : inst->chars[idx+1][0]));

string feat;
std::string feat;
feat.reserve(1024);

int N = templates.size();

// 1-9 basic feature
for (int i = 0; i < N - 2; ++ i) {
templates[i]->render(data, feat);
cache[i].push_back(feat);
}

// 12-13 prefix and suffix feature.
for (int i = N - 2; i < N; ++ i) {
string prefix = "";
string suffix = "";
int num_chars = inst->chars[idx].size();
std::string prefix = "";
std::string suffix = "";
int num_chars = chars.size();
for (int j = 0; j < num_chars && j < 3; ++ j) {
prefix = prefix + inst->chars[idx][j];
suffix = inst->chars[idx][num_chars-j-1] + suffix;

prefix = prefix + chars[j];
suffix = chars[num_chars-j-1] + suffix;
data.set( "prefix", prefix);
data.set( "suffix", suffix);

templates[i]->render(data, feat);
cache[i].push_back(feat);
}
Expand Down
8 changes: 3 additions & 5 deletions src/postagger/extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,17 @@
namespace ltp {
namespace postagger {

using namespace std;
using namespace ltp::utility;

class Extractor {
public:
static Extractor& extractor();
static int num_templates();
static int extract1o(Instance * inst, int idx, vector< StringVec > & cache);
static int extract1o(const Instance* inst, int idx,
std::vector<utility::StringVec>& cache);
protected:
Extractor();
~Extractor();
private:
static vector< Template * > templates;
static std::vector< utility::Template * > templates;
};

} // end for namespace postagger
Expand Down
23 changes: 11 additions & 12 deletions src/postagger/featurespace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ FeatureSpace::~FeatureSpace(void) {
delete [](dicts);
}

int FeatureSpace::retrieve(int tid, const char * key, bool create) {
int FeatureSpace::retrieve(int tid, const char* key, bool create) {
int val;

if (dicts[tid].get(key, val)) {
Expand All @@ -25,42 +25,41 @@ int FeatureSpace::retrieve(int tid, const char * key, bool create) {
val = _offset;
dicts[tid].set(key, val);
++ _offset;

return val;
}
}

return -1;
}

int FeatureSpace::index(int tid, const char * key, int lid) {
int idx = retrieve(tid, key, false);
if (idx < 0) {
return -1;
int FeatureSpace::index(int tid, const char* key, int lid) const {
int val;
if (dicts[tid].get(key, val)) {
return val* _num_labels+ lid;
}

return idx * _num_labels + lid;
return -1;
}

int FeatureSpace::index(int prev_lid, int lid) {
int FeatureSpace::index(int prev_lid, int lid) const {
return _offset * _num_labels + prev_lid * _num_labels + lid;
}

int FeatureSpace::num_feature_groups() {
int FeatureSpace::num_feature_groups() const {
return _offset + _num_labels;
}

int FeatureSpace::num_features() {
int FeatureSpace::num_features() const {
return _offset;
}

int FeatureSpace::dim() {
int FeatureSpace::dim() const {
return _offset * _num_labels + _num_labels * _num_labels;
}

void FeatureSpace::set_num_labels(int num_labels) {
_num_labels = num_labels;
}

void FeatureSpace::dump(std::ostream & ofs) {
char chunk[16];
unsigned sz = _num_dicts;
Expand Down
14 changes: 7 additions & 7 deletions src/postagger/featurespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,21 @@ class FeatureSpace {
~FeatureSpace();

int retrieve(int tid, const char * key, bool create);
int index(int tid, const char * key, int lid = 0);
int index(int prev_lid, int lid);
int num_features();
int dim();
int index(int tid, const char * key, int lid = 0) const;
int index(int prev_lid, int lid) const;
int num_features() const;
int dim() const;
int num_feature_groups() const;
void set_num_labels(int num_labeles);
int num_feature_groups();

/*
/**
* dump the feature space to a output stream
*
* @param[in] ofs the output stream
*/
void dump(std::ostream & ofs);

/*
/**
* load the feature space from a input stream
*
* @param[in] num_labels the number of labels
Expand Down
3 changes: 0 additions & 3 deletions src/postagger/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,12 @@ class Instance {
public:
std::vector< std::string > raw_forms;
std::vector< std::string > forms;
//std::vector< int > wordtypes;
std::vector< std::string > tags;
std::vector< int > tagsidx;
std::vector< std::string > predicted_tags;
std::vector< int > predicted_tagsidx;

std::vector< Bitset > postag_constrain; /*< the postag constrain for decode */

std::vector< std::vector< std::string> > chars;
};

} // end for namespace postagger
Expand Down
Loading

0 comments on commit cde7198

Please sign in to comment.