Skip to content

Commit

Permalink
Merge branch 'master' into pruning2
Browse files Browse the repository at this point in the history
  • Loading branch information
emjotde committed Apr 7, 2014
2 parents 7f0c68c + e669eb5 commit 5cc2bd0
Show file tree
Hide file tree
Showing 11 changed files with 225 additions and 10 deletions.
9 changes: 8 additions & 1 deletion compile_query_only.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,19 @@ rm {lm,util}/*.o 2>/dev/null
set -e

CXX=${CXX:-g++}

CXXFLAGS+=" -I. -O3 -DNDEBUG -DKENLM_MAX_ORDER=6"

#If this fails for you, consider using bjam.
if [ ${#NPLM} != 0 ]; then
CXXFLAGS+=" -DHAVE_NPLM -lneuralLM -L$NPLM/src -I$NPLM/src -lboost_thread-mt -fopenmp"
ADDED_PATHS="lm/wrappers/*.cc"
fi
echo 'Compiling with '$CXX $CXXFLAGS

#Grab all cc files in these directories except those ending in test.cc or main.cc
objects=""
for i in util/double-conversion/*.cc util/*.cc lm/*.cc; do
for i in util/double-conversion/*.cc util/*.cc lm/*.cc $ADDED_PATHS; do
if [ "${i%test.cc}" == "$i" ] && [ "${i%main.cc}" == "$i" ]; then
$CXX $CXXFLAGS -c $i -o ${i%.cc}.o
objects="$objects ${i%.cc}.o"
Expand Down
11 changes: 10 additions & 1 deletion lm/Jamfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@ update-if-changed $(ORDER-LOG) $(max-order) ;

max-order += <dependency>$(ORDER-LOG) ;

fakelib kenlm : [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;
wrappers = ;
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm ;
}

fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;

import testing ;

Expand Down
19 changes: 14 additions & 5 deletions lm/facade.hh
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,27 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
typedef StateT State;
typedef VocabularyT Vocabulary;

// Default Score function calls FullScore and keeps only the probability,
// discarding the rest of the FullScoreReturn. Model can override this
// with something cheaper.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
  return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}

/* Translate from void* to State */
FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScore(
*reinterpret_cast<const State*>(in_state),
new_word,
*reinterpret_cast<State*>(out_state));
}

// Type-erased wrapper for FullScoreForgotState: the context arrives as a
// [context_rbegin, context_rend) range (reverse word order per the base
// API); only the output state needs translating from void* to State.
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
  return static_cast<const Child*>(this)->FullScoreForgotState(
      context_rbegin,
      context_rend,
      new_word,
      *reinterpret_cast<State*>(out_state));
}

// By default Score is implemented in terms of FullScore, taking just the
// probability field. A concrete Model may override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
  return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}

float Score(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->Score(
*reinterpret_cast<const State*>(in_state),
Expand Down
9 changes: 9 additions & 0 deletions lm/query_main.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#include "lm/ngram_query.hh"

#ifdef WITH_NPLM
#include "lm/wrappers/nplm.hh"
#endif

int main(int argc, char *argv[]) {
if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) {
std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl;
Expand Down Expand Up @@ -35,6 +39,11 @@ int main(int argc, char *argv[]) {
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
abort();
}
#ifdef WITH_NPLM
} else if (lm::np::Model::Recognize(argv[1])) {
lm::np::Model model(argv[1]);
Query(model, sentence_context, std::cin, std::cout);
#endif
} else {
Query<ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
}
Expand Down
3 changes: 3 additions & 0 deletions lm/virtual_interface.hh
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ class Model {
// Requires in_state != out_state
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;

// Prefer to use FullScore. The context words should be provided in reverse order.
virtual FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;

unsigned char Order() const { return order_; }

const Vocabulary &BaseVocabulary() const { return *base_vocab_; }
Expand Down
3 changes: 3 additions & 0 deletions lm/wrappers/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.

NPLM is a work in progress.
86 changes: 86 additions & 0 deletions lm/wrappers/nplm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include "lm/wrappers/nplm.hh"
#include "util/exception.hh"
#include "util/file.hh"

#include <algorithm>

#include <string.h>

#include "neuralLM.h"

namespace lm {
namespace np {

// Map NPLM's sentence-boundary and unknown tokens onto the base
// Vocabulary's begin/end/unknown slots; keep a (non-owning) reference to
// the underlying nplm vocabulary for string lookups.
Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
  : base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
    vocab_(vocab) {}

Vocabulary::~Vocabulary() {}

// Delegate string lookup to NPLM.  Out-of-vocabulary handling is whatever
// nplm::vocabulary::lookup_word does (presumably maps to <unk> — TODO confirm).
WordIndex Vocabulary::Index(const std::string &str) const {
  return vocab_.lookup_word(str);
}

bool Model::Recognize(const std::string &name) {
try {
util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
char magic_check[16];
util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check));
const char nnlm_magic[] = "\\config\nversion ";
return !memcmp(magic_check, nnlm_magic, 16);
} catch (const util::Exception &) {
return false;
}
}

// Load the base model once; per-thread clones are made lazily in FullScore.
// Throws util::Exception if the model's order exceeds the compiled-in
// NPLM_MAX_ORDER (State has a fixed-size word array).
Model::Model(const std::string &file) : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()) {
  // Fixed typo in the user-facing message: "defintion" -> "definition".
  UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the definition of NPLM_MAX_ORDER and recompile.");
  // Sentinel states: begin-of-sentence context is all <s>; the null context
  // is all <null> padding words.
  // NOTE(review): these fill all NPLM_MAX_ORDER - 1 slots, while FullScore
  // zero-pads words beyond order - 1 — confirm state comparison never mixes
  // a sentinel state with a scored one when order < NPLM_MAX_ORDER.
  State begin_sentence, null_context;
  std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("<s>"));
  null_word_ = base_instance_->lookup_word("<null>");
  std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_);

  Init(begin_sentence, null_context, vocab_, base_instance_->get_order());
}

Model::~Model() {}

// Score new_word given the context in `from`, writing the successor
// context to out_state.  Thread-safe via a lazily-created per-thread clone
// of the base model (scoring mutates the neuralLM's staging ngram).
FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
  nplm::neuralLM *lm = backend_.get();
  if (!lm) {
    // First call on this thread: clone the shared base instance.
    lm = new nplm::neuralLM(*base_instance_);
    backend_.reset(lm);
  }
  // State is in natural word order.
  FullScoreReturn ret;
  // Copy the order-1 context words, then the predicted word, into NPLM's
  // staging buffer.
  for (int i = 0; i < lm->get_order() - 1; ++i) {
    lm->staging_ngram()(i) = from.words[i];
  }
  lm->staging_ngram()(lm->get_order() - 1) = new_word;
  ret.prob = lm->lookup_from_staging();
  // Always say full order: the model consumes a fixed-size window (filled
  // above), so there is no shorter-match length to report.
  ret.ngram_length = lm->get_order();
  // Shift everything down by one.
  memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
  out_state.words[lm->get_order() - 2] = new_word;
  // Fill in trailing words with zeros so state comparison works.
  memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
  return ret;
}

// TODO: optimize with direct call?
// Score new_word from a raw reversed context range instead of a State.
// Builds a natural-order State (left-padded with <null> when the supplied
// context is shorter than order - 1) and delegates to FullScore.
FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
  // The base API supplies context in reverse order; State is natural order.
  const std::size_t keep = std::min<std::size_t>(Order() - 1, context_rend - context_rbegin);
  State natural;
  // Left-pad with <null> words, then lay the kept context down at the end,
  // flipping it back into natural order.
  WordIndex *pad_end = natural.words + Order() - 1 - keep;
  std::fill(natural.words, pad_end, null_word_);
  std::reverse_copy(context_rbegin, context_rbegin + keep, pad_end);
  return FullScore(natural, new_word, out_state);
}

} // namespace np
} // namespace lm
77 changes: 77 additions & 0 deletions lm/wrappers/nplm.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef LM_WRAPPER_NPLM__
#define LM_WRAPPER_NPLM__

#include "lm/facade.hh"
#include "lm/max_order.hh"
#include "util/string_piece.hh"

#include <boost/thread/tss.hpp>
#include <boost/scoped_ptr.hpp>

/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
* and Victoria Fossum."
* http://nlg.isi.edu/software/nplm/
*/

namespace nplm {
class vocabulary;
class neuralLM;
} // namespace nplm

namespace lm {
namespace np {

// Adapts nplm::vocabulary to KenLM's base::Vocabulary interface.
class Vocabulary : public base::Vocabulary {
  public:
    Vocabulary(const nplm::vocabulary &vocab);

    ~Vocabulary();

    // String -> word id, delegating to nplm::vocabulary::lookup_word.
    WordIndex Index(const std::string &str) const;

    // TODO: lobby them to support StringPiece
    WordIndex Index(const StringPiece &str) const {
      return Index(std::string(str.data(), str.size()));
    }

  private:
    // Non-owning reference; the lifetime is managed by whoever constructed us.
    const nplm::vocabulary &vocab_;
};

// Sorry for imposing my limitations on your code.
#define NPLM_MAX_ORDER 7

// Fixed-size context window in natural (left-to-right) word order; unused
// trailing slots are zero-filled so byte-wise state comparison works.
struct State {
  WordIndex words[NPLM_MAX_ORDER - 1];
};

// KenLM-style facade over an NPLM neural language model.
class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
  private:
    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;

  public:
    // Does this look like an NPLM?  Checks the file's magic header without
    // loading the whole model.
    static bool Recognize(const std::string &file);

    explicit Model(const std::string &file);

    ~Model();

    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;

    // Context words are supplied in reverse order over
    // [context_rbegin, context_rend).
    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

  private:
    // Loaded once at construction; cloned per thread into backend_.
    boost::scoped_ptr<nplm::neuralLM> base_instance_;

    // Thread-local clone, created lazily, because scoring mutates the
    // neuralLM's internal staging buffer.
    mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;

    Vocabulary vocab_;

    // Id of NPLM's <null> token, used to left-pad short contexts.
    lm::WordIndex null_word_;
};

} // namespace np
} // namespace lm

#endif // LM_WRAPPER_NPLM__
4 changes: 3 additions & 1 deletion util/file_piece.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ StringPiece FilePiece::ReadLine(char delim) {
}
}
if (at_end_) {
if (position_ == position_end_) Shift();
if (position_ == position_end_) {
Shift();
}
return Consume(position_end_);
}
skip = position_end_ - position_;
Expand Down
10 changes: 9 additions & 1 deletion util/file_piece.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <iosfwd>
#include <string>

#include <assert.h>
#include <stdint.h>

namespace util {
Expand Down Expand Up @@ -66,8 +67,14 @@ class FilePiece {

// Skip spaces defined by isspace.
// Advances position_ past consecutive delimiter bytes, refilling the
// buffer via Shift() as needed; returns silently at end of file.
void SkipSpaces(const bool *delim = kSpaces) {
  assert(position_ <= position_end_);
  for (; ; ++position_) {
    // Removed a stale duplicate of this check (a leftover diff pre-image
    // line) that called Shift() unconditionally before the guarded call
    // below, shifting twice at the end of the buffer.
    if (position_ == position_end_) {
      Shift();
      // And break out at end of file.
      if (position_ == position_end_) return;
    }
    assert(position_ < position_end_);
    if (!delim[static_cast<unsigned char>(*position_)]) return;
  }
}
Expand All @@ -86,6 +93,7 @@ class FilePiece {
template <class T> T ReadNumber();

StringPiece Consume(const char *to) {
assert(to >= position_);
StringPiece ret(position_, to - position_);
position_ = to;
return ret;
Expand Down
4 changes: 3 additions & 1 deletion util/mmap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ void scoped_memory::call_realloc(std::size_t size) {
if (!new_data) {
reset();
} else {
reset(new_data, size, MALLOC_ALLOCATED);
data_ = new_data;
size_ = size;
source_ = MALLOC_ALLOCATED;
}
}

Expand Down

0 comments on commit 5cc2bd0

Please sign in to comment.