forked from kpu/kenlm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
225 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.

NPLM is a work in progress.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#include "lm/wrappers/nplm.hh" | ||
#include "util/exception.hh" | ||
#include "util/file.hh" | ||
|
||
#include <algorithm> | ||
|
||
#include <string.h> | ||
|
||
#include "neuralLM.h" | ||
|
||
namespace lm { | ||
namespace np { | ||
|
||
// Pull the special-token indices (<s>, </s>, <unk>) from the NPLM vocabulary
// and keep a reference to it for later word lookups.
Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
  : base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
    vocab_(vocab) {}
|
||
Vocabulary::~Vocabulary() {} | ||
|
||
// Map a word string to its index via the wrapped nplm::vocabulary.
WordIndex Vocabulary::Index(const std::string &str) const {
  return vocab_.lookup_word(str);
}
|
||
bool Model::Recognize(const std::string &name) { | ||
try { | ||
util::scoped_fd file(util::OpenReadOrThrow(name.c_str())); | ||
char magic_check[16]; | ||
util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check)); | ||
const char nnlm_magic[] = "\\config\nversion "; | ||
return !memcmp(magic_check, nnlm_magic, 16); | ||
} catch (const util::Exception &) { | ||
return false; | ||
} | ||
} | ||
|
||
// Load an NPLM model from `file` and set up the begin-sentence and
// null-context states required by the ModelFacade interface.
// Throws util::Exception if the network's order exceeds NPLM_MAX_ORDER.
// Fix: corrected the typo "defintion" -> "definition" in the error message.
Model::Model(const std::string &file) : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()) {
  UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the definition of NPLM_MAX_ORDER and recompile.");
  State begin_sentence, null_context;
  // Begin-sentence history is all <s>; the null context is all <null>.
  std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("<s>"));
  null_word_ = base_instance_->lookup_word("<null>");
  std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_);

  Init(begin_sentence, null_context, vocab_, base_instance_->get_order());
}
|
||
Model::~Model() {} | ||
|
||
// Score new_word given the history in `from`, writing the successor state to
// out_state.  State words are stored in natural (left-to-right) order.
FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
  // backend_ is a boost::thread_specific_ptr (see header): each thread lazily
  // clones the shared base instance, so scoring needs no locking.
  nplm::neuralLM *lm = backend_.get();
  if (!lm) {
    lm = new nplm::neuralLM(*base_instance_);
    backend_.reset(lm);
  }
  // State is in natural word order.
  FullScoreReturn ret;
  // Copy the order-1 history words into NPLM's staging n-gram, then append
  // the word being scored as the final position.
  for (int i = 0; i < lm->get_order() - 1; ++i) {
    lm->staging_ngram()(i) = from.words[i];
  }
  lm->staging_ngram()(lm->get_order() - 1) = new_word;
  ret.prob = lm->lookup_from_staging();
  // Always say full order.
  ret.ngram_length = lm->get_order();
  // Shift everything down by one.
  memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
  out_state.words[lm->get_order() - 2] = new_word;
  // Fill in trailing words with zeros so state comparison works.
  memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
  return ret;
}
|
||
// TODO: optimize with direct call?
// Score new_word given a context supplied in reverse order (most recent word
// first), as the base API specifies, by rebuilding the natural-order State
// the backend expects and delegating to FullScore.
FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
  // State is in natural word order.  The API here specifies reverse order.
  const std::size_t known = std::min<std::size_t>(Order() - 1, context_rend - context_rbegin);
  State scratch;
  lm::WordIndex *const recent = scratch.words + Order() - 1 - known;
  // Pad missing leading history with the null word.
  std::fill(scratch.words, recent, null_word_);
  // The known words go at the end, flipped back to natural order.
  std::reverse_copy(context_rbegin, context_rbegin + known, recent);
  return FullScore(scratch, new_word, out_state);
}
|
||
} // namespace np | ||
} // namespace lm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#ifndef LM_WRAPPER_NPLM__ | ||
#define LM_WRAPPER_NPLM__ | ||
|
||
#include "lm/facade.hh" | ||
#include "lm/max_order.hh" | ||
#include "util/string_piece.hh" | ||
|
||
#include <boost/thread/tss.hpp> | ||
#include <boost/scoped_ptr.hpp> | ||
|
||
/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang | ||
* and Victoria Fossum." | ||
* http://nlg.isi.edu/software/nplm/ | ||
*/ | ||
|
||
namespace nplm { | ||
class vocabulary; | ||
class neuralLM; | ||
} // namespace nplm | ||
|
||
namespace lm { | ||
namespace np { | ||
|
||
// Word-to-index mapping that delegates to the underlying nplm::vocabulary.
class Vocabulary : public base::Vocabulary {
  public:
    // `vocab` is held by reference and must outlive this wrapper.
    Vocabulary(const nplm::vocabulary &vocab);

    ~Vocabulary();

    WordIndex Index(const std::string &str) const;

    // TODO: lobby them to support StringPiece
    // Copies into a std::string because NPLM's lookup takes std::string only.
    WordIndex Index(const StringPiece &str) const {
      return Index(std::string(str.data(), str.size()));
    }

  private:
    // Unowned reference to the backing NPLM vocabulary.
    const nplm::vocabulary &vocab_;
};
|
||
// Sorry for imposing my limitations on your code.
// Compile-time cap on the NPLM order this wrapper supports; the Model
// constructor throws if the loaded network's order exceeds it.
#define NPLM_MAX_ORDER 7

// Scoring state: the preceding words in natural (left-to-right) order.
// Slots beyond the model's order - 1 are zero-filled so state comparison
// works (see Model::FullScore in nplm.cc).
struct State {
  WordIndex words[NPLM_MAX_ORDER - 1];
};
|
||
// KenLM-style facade over a neural probabilistic LM (nplm::neuralLM).
class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
  private:
    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;

  public:
    // Does this look like an NPLM?  Checks the file's magic header; returns
    // false on any read failure.
    static bool Recognize(const std::string &file);

    explicit Model(const std::string &file);

    ~Model();

    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;

    // Context is given in reverse order (most recent word first), per the
    // base API; implemented by rebuilding a natural-order State.
    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

  private:
    // Shared read-only instance loaded from the model file.
    boost::scoped_ptr<nplm::neuralLM> base_instance_;

    // Per-thread clone of base_instance_, created lazily in FullScore so
    // concurrent scoring needs no locking.
    mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;

    Vocabulary vocab_;

    // Index of "<null>", used to pad missing context in FullScoreForgotState.
    lm::WordIndex null_word_;
};
|
||
} // namespace np | ||
} // namespace lm | ||
|
||
#endif // LM_WRAPPER_NPLM__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters