forked from HIT-SCIR/ltp
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
84 changed files
with
13,017 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Build rules for the lgdpj executable.

# Header search path: this directory plus the parser, utils and math
# directories under SOURCE_DIR.
include_directories(
    ./
    ${SOURCE_DIR}/parser
    ${SOURCE_DIR}/utils
    ${SOURCE_DIR}/math)

# Print the value of SOURCE_DIR while configuring.
message(STATUS ${SOURCE_DIR})

# Source files compiled into lgdpj.
set(lgdpj_SRC
    collections.cpp
    model.cpp
    featurespace.cpp
    extractor.cpp
    options.cpp
    decoder1o.cpp
    decoder2o.cpp
    parser.cpp
    lgdpj.cpp)

add_executable(lgdpj ${lgdpj_SRC})
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#include "collections.h" | ||
|
||
namespace ltp { | ||
namespace parser { | ||
|
||
/*
 * Build a collection holding num_dicts dictionaries. Each dictionary is
 * handed a pointer back to this collection; the shared index counter
 * starts at zero.
 *
 *  @param[in]  num_dicts   the number of dictionaries to allocate
 */
DictionaryCollections::DictionaryCollections(int num_dicts)
  : idx(0) {
  dicts.reserve( num_dicts );

  int remaining = num_dicts;
  while (remaining > 0) {
    dicts.push_back( new Dictionary( this ) );
    -- remaining;
  }
}
|
||
int DictionaryCollections::retrieve(int tid, const char * key, bool create) { | ||
return dicts[tid]->retrieve(key, create); | ||
} | ||
|
||
size_t DictionaryCollections::dim() const{ | ||
return idx; | ||
} | ||
|
||
/*
 * Serialize the collection to a binary stream. Layout:
 *
 *  - 16 bytes: the tag "collections", zero padded
 *  - int:      the shared index counter
 *  - unsigned: the number of dictionaries
 *  - the database of every dictionary, in order
 *
 *  @param[out] out   the output stream
 */
void DictionaryCollections::dump(ostream & out) {
  // The array initializer zero-fills the bytes after the tag text.
  char header[16] = "collections";
  const unsigned int num_dicts = dicts.size();

  out.write(header, sizeof(header));
  out.write(reinterpret_cast<const char *>(&idx), sizeof(int));
  out.write(reinterpret_cast<const char *>(&num_dicts), sizeof(unsigned int));

  for (vector<Dictionary *>::iterator it = dicts.begin();
       it != dicts.end(); ++ it) {
    (*it)->database.dump(out);
  }
}
|
||
/*
 * Load the collection from a binary stream written by dump().
 *
 * The stream must start with the 16-byte "collections" tag and must
 * describe exactly as many dictionaries as this collection already
 * holds. The shared index counter is only overwritten once the header
 * has been read and validated.
 *
 *  @param[in]  in    the input stream
 *  @return     bool  true on success; false on a short read, a wrong
 *                    tag, a dictionary-count mismatch or a failed
 *                    dictionary load.
 */
bool DictionaryCollections::load(istream & in) {
  // Zero-filled so the 16 bytes read below are always followed by a
  // terminator, whatever the stream contains.
  char chunk[32] = {0};

  in.read(chunk, 16);
  if (!in || strcmp(chunk, "collections")) {
    return false;
  }

  int loaded_idx = 0;
  unsigned int sz = 0;

  in.read(reinterpret_cast<char *>(&loaded_idx), sizeof(int));
  in.read(reinterpret_cast<char *>(&sz), sizeof(unsigned int));

  // A truncated header leaves the values above meaningless.
  if (!in || sz != dicts.size()) {
    return false;
  }

  idx = loaded_idx;

  for (unsigned i = 0; i < sz; ++ i) {
    if (!dicts[i]->database.load(in)) {
      return false;
    }
  }

  return true;
}
|
||
} // end for namespace parser | ||
} // end for namespace ltp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#ifndef __DICT_COLLECTIONS_H__ | ||
#define __DICT_COLLECTIONS_H__ | ||
|
||
#include <iostream> | ||
#include <vector> | ||
#include "stringmap.hpp" | ||
#include "smartmap.hpp" | ||
|
||
#include "instance.h" | ||
|
||
namespace ltp { | ||
namespace parser { | ||
|
||
using namespace std; | ||
using namespace ltp::utility; | ||
|
||
|
||
// declaration of dictionary, this is specially needed | ||
// by the observer design pattern | ||
class Dictionary; | ||
|
||
// class of a collection of dictionary | ||
// a index counter is shared within several dictionary. | ||
class DictionaryCollections { | ||
public: | ||
DictionaryCollections(int num_dicts); | ||
~DictionaryCollections() {} | ||
|
||
/* | ||
* Dump the dictionary collections into output stream | ||
* | ||
* @param[out] out the output stream | ||
*/ | ||
void dump(ostream & out); | ||
|
||
/* | ||
* Load the dictionary collections from input stream, | ||
* return true if dictionary successfully loaded, otherwise | ||
* false. | ||
* | ||
* @param[in] in the input stream | ||
* @return bool true on success, otherwise false. | ||
*/ | ||
bool load(istream & in); | ||
|
||
/* | ||
* Get the size of dictionary collections | ||
* | ||
* @return size_t the size of the dictionary | ||
*/ | ||
size_t dim() const; | ||
|
||
/* | ||
* Retrieve the certain key in one of the dictionaries in this | ||
* collection. If create is specified, this key is created on | ||
* the condition that it is not in the dictionary. Return the | ||
* index of the key, -1 on failure | ||
* | ||
* @param[in] tid the index of the dictionary | ||
* @param[in] key the key | ||
* @param[in] create insert the key to dictionary if create | ||
* if true. | ||
* @return int the index of the key, -1 on failure. | ||
*/ | ||
int retrieve(int tid, const char * key, bool create); | ||
|
||
public: | ||
int idx; /*< the shared index among dictionaries */ | ||
|
||
private: | ||
vector<Dictionary *> dicts; | ||
}; | ||
|
||
// the dictionary class | ||
// it's wrapper of class SmartMap<int> | ||
class Dictionary { | ||
public: | ||
Dictionary(DictionaryCollections * coll): | ||
collections(coll) {} | ||
|
||
//StringMap<int> database; | ||
SmartMap<int> database; | ||
DictionaryCollections * collections; | ||
|
||
inline int retrieve(const char * key, bool create) { | ||
int val; | ||
|
||
if (database.get(key, val)) { | ||
return val; | ||
} else { | ||
if (create) { | ||
val = collections->idx; | ||
database.set(key, val); | ||
// database.unsafe_set(key, val); | ||
++ collections->idx; | ||
return val; | ||
} | ||
} | ||
|
||
return -1; | ||
} | ||
|
||
inline int size() { | ||
return database.size(); | ||
} | ||
}; | ||
|
||
// labelcollections is a bi-direction map. | ||
// it support two way of retrieving | ||
// | ||
// * string key -> int index | ||
// * int index -> string key | ||
// | ||
} // end for namespace parser | ||
} // end for namespace ltp | ||
#endif // end for __DICT_COLLECTIONS_H__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#ifndef __CONLL_READER_H__ | ||
#define __CONLL_READER_H__ | ||
|
||
#include <iostream> | ||
#include <fstream> | ||
|
||
#include "codecs.hpp" | ||
#include "strutils.hpp" | ||
#include "logging.hpp" | ||
|
||
#include "settings.h" | ||
#include "instance.h" | ||
#include "options.h" | ||
|
||
namespace ltp { | ||
namespace parser { | ||
|
||
using namespace std; | ||
using namespace ltp::strutils; | ||
|
||
class CoNLLReader { | ||
public: | ||
/* | ||
* Constructor for ConllReader | ||
* Register a ifstream to the ConllReader | ||
* | ||
* @param f the reference to the ifstream | ||
*/ | ||
CoNLLReader(ifstream& _f): f(_f) {} | ||
~CoNLLReader() {} | ||
|
||
/* | ||
* Get next instance from ifstream buffer | ||
*/ | ||
Instance * next() { | ||
if (f.eof()) { | ||
return NULL; | ||
} | ||
|
||
Instance * inst = new Instance; | ||
string line; | ||
|
||
inst->forms.push_back( ROOT_FORM ); | ||
inst->lemmas.push_back( ROOT_LEMMA ); | ||
inst->postags.push_back( ROOT_POSTAG ); | ||
inst->heads.push_back( -1 ); | ||
if (model_opt.labeled) { | ||
inst->deprels.push_back( ROOT_DEPREL ); | ||
} | ||
inst->chars.push_back( vector<string>() ); | ||
|
||
while (!f.eof()) { | ||
getline(f, line); | ||
chomp(line); | ||
|
||
if (line.size() == 0) { | ||
break; | ||
} | ||
|
||
vector<string> items = split(line); | ||
if (items.size() != 10) { | ||
WARNING_LOG("Unknown conll format file"); | ||
} | ||
|
||
inst->forms.push_back( items[1] ); // items[1]: form | ||
inst->lemmas.push_back( items[2] ); // items[2]: lemma | ||
inst->postags.push_back( items[3] ); // items[4]: postag | ||
inst->heads.push_back( to_int(items[6]) ); | ||
|
||
if (model_opt.labeled) { | ||
inst->deprels.push_back( items[7] ); | ||
} | ||
|
||
vector<string> chars; | ||
codecs::decode(items[1], chars); | ||
inst->chars.push_back( chars ); | ||
} | ||
|
||
if (inst->forms.size() == 1) { | ||
delete inst; | ||
inst = NULL; | ||
} | ||
return inst; | ||
} | ||
|
||
/* | ||
* Reader reach the end of the file | ||
*/ | ||
bool eof() { | ||
return f.eof(); | ||
} | ||
private: | ||
ifstream& f; | ||
}; // end for ConllReader | ||
} // end for parser | ||
} // end for namespace ltp | ||
|
||
#endif // end for __CONLL_READER_H__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#ifndef __CONLL_WRITER_H__ | ||
#define __CONLL_WRITER_H__ | ||
|
||
#include <iostream> | ||
|
||
#include "strutils.hpp" | ||
#include "instance.h" | ||
|
||
namespace ltp { | ||
namespace parser { | ||
|
||
using namespace ltp::strutils; | ||
|
||
class CoNLLWriter { | ||
public: | ||
CoNLLWriter(std::ostream& _f): f(_f) {} | ||
~CoNLLWriter() {} | ||
|
||
void write(const Instance * inst) { | ||
int len = inst->size(); | ||
bool predicted = (inst->predicted_heads.size() > 0 && | ||
inst->predicted_heads.size() == len); | ||
bool predicted_label = (inst->predicted_deprels.size() > 0 && | ||
inst->predicted_deprels.size() == len); | ||
|
||
for (int i = 1; i < inst->size(); ++ i) { | ||
f << i + 1 | ||
<< "\t" // 0 - index | ||
<< inst->forms[i] | ||
<< "\t" // 1 - form | ||
<< inst->lemmas[i] | ||
<< "\t" // 2 - lemma | ||
<< inst->postags[i] | ||
<< "\t" // 3 - postag | ||
<< "_" | ||
<< "\t" // 4 - unknown | ||
<< "_" | ||
<< "\t" // 5 - unknown | ||
<< inst->heads[i] | ||
<< "\t" // 6 - heads | ||
<< inst->deprels[i] | ||
<< "\t" // 7 - deprels | ||
<< (predicted ? to_str(inst->predicted_heads[i]) : "_") | ||
<< "\t" | ||
<< (predicted_label ? inst->predicted_deprels[i] : "_") | ||
<< endl; | ||
} | ||
|
||
f << endl; | ||
} | ||
private: | ||
std::ostream& f; | ||
}; // end for ConnllWriter | ||
|
||
} // end for parser | ||
} // end for namespace ltp | ||
|
||
|
||
#endif // end for __CONLL_WRITER_H__ |
Oops, something went wrong.