Skip to content

Commit

Permalink
Merge Lane's changes and a few other things from Moses
Browse files Browse the repository at this point in the history
  • Loading branch information
kpu committed Aug 16, 2012
1 parent 8893bec commit 21055bf
Show file tree
Hide file tree
Showing 16 changed files with 63 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dist
*.o
query
build_binary
kenlm_max_order
lm/left_test
lm/model_test
util/bit_packing_test
Expand Down
3 changes: 2 additions & 1 deletion Jamroot
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sanity ;
sanity.boost 103600 ;
project : requirements [ sanity.reqs ] ;
max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
project : requirements [ sanity.reqs ] <define>KENLM_MAX_ORDER=$(max-order) ;
project : default-build <threading>multi <warnings>on <variant>release ;

sanity.external-lib z ;
Expand Down
2 changes: 1 addition & 1 deletion clean.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
rm -rf */*.o query build_binary */*_test lm/test.binary* lm/test.arpa?????? util/file_piece.cc.gz
rm -rf */*.o query kenlm_max_order build_binary */*_test lm/test.binary* lm/test.arpa?????? util/file_piece.cc.gz
3 changes: 2 additions & 1 deletion compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@

#don't need to use if compiling with moses Makefiles already

rm {lm,util}/*.o 2>/dev/null
set -e

rm {lm,util}/*.o
for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,file,mmap,usage} lm/{bhiksha,binary_format,config,lm_exception,model,quantize,read_arpa,search_hashed,search_trie,trie,trie_sort,value_build,virtual_interface,vocab}; do
g++ -I. -O3 -DNDEBUG $CXXFLAGS -c $i.cc -o $i.o
done
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/build_binary.cc {lm,util}/*.o -lz -o build_binary
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/ngram_query.cc {lm,util}/*.o -lz -o query
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/max_order.cc -o kenlm_max_order
3 changes: 2 additions & 1 deletion lm/Jamfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ run model_test.cc ../util//kenutil kenlm ..//boost_unit_test_framework : : test.

exe query : ngram_query.cc kenlm ../util//kenutil ;
exe build_binary : build_binary.cc kenlm ../util//kenutil ;
exe kenlm_max_order : max_order.cc : <include>.. ;

install legacy : build_binary query
install legacy : build_binary query kenlm_max_order
: <location>$(TOP)/lm <install-type>EXE <install-dependencies>on <link>shared:<dll-path>$(TOP)/lm <link>shared:<install-type>LIB ;
2 changes: 1 addition & 1 deletion lm/left.hh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ template <class M> class RuleScore {
return;
}

float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1];
float *back = backoffs, *back2 = backoffs2;
unsigned char next_use = out_.right.length;

Expand Down
6 changes: 6 additions & 0 deletions lm/max_order.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include "lm/max_order.hh"
#include <iostream>

// Reports the n-gram order limit compiled into this build (KENLM_MAX_ORDER)
// on standard error. Command-line arguments are accepted but not consulted.
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;
  std::cerr << "KenLM was compiled with a maximum supported n-gram order set to ";
  std::cerr << KENLM_MAX_ORDER;
  std::cerr << "." << std::endl;
  return 0;
}
26 changes: 12 additions & 14 deletions lm/max_order.hh
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
#ifndef LM_MAX_ORDER__
#define LM_MAX_ORDER__
namespace lm {
namespace ngram {
// If you need higher order, change this and recompile.
// Having this limit means that State can be
// (kMaxOrder - 1) * sizeof(float) bytes instead of
// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
const unsigned char kMaxOrder = 6;

} // namespace ngram
} // namespace lm

#endif // LM_MAX_ORDER__
/* Maximum n-gram order supported by this build.
 * IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
 * If not, this is the default maximum order.
 * Having this limit means that State can be
 * (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of
 * sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
 */
#ifndef KENLM_MAX_ORDER
#define KENLM_MAX_ORDER 6
#endif
/* Advice appended to the error reported when a model's order exceeds
 * KENLM_MAX_ORDER. The bjam flag named here has to match the option name the
 * Jamroot reads ("max-kenlm-order"); otherwise following the advice has no
 * effect.
 */
#ifndef KENLM_ORDER_MESSAGE
#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
#endif
11 changes: 9 additions & 2 deletions lm/model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "lm/search_hashed.hh"
#include "lm/search_trie.hh"
#include "lm/read_arpa.hh"
#include "util/have.hh"
#include "util/murmur_hash.hh"

#include <algorithm>
Expand Down Expand Up @@ -47,7 +48,14 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.Order());
}

namespace {
// Guards model loading against orders this build cannot represent.
// order: the model's n-gram order; both visible callers pass the size of the
//   per-order counts vector.
// When order exceeds KENLM_MAX_ORDER, the UTIL_THROW_IF macro (defined
// elsewhere; presumably throws the named exception type) reports a
// FormatLoadException whose message gives the model's order, the compiled
// limit, and the rebuild advice in KENLM_ORDER_MESSAGE.
void CheckMaxOrder(size_t order) {
UTIL_THROW_IF(order > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << order << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
}
} // namespace

template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
CheckMaxOrder(params.counts.size());
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
search_.LoadedBinary();
Expand All @@ -60,8 +68,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
ReadARPACounts(f, counts);

if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile.");
CheckMaxOrder(counts.size());
if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");

Expand Down
1 change: 0 additions & 1 deletion lm/model.hh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "lm/facade.hh"
#include "lm/max_order.hh"
#include "lm/quantize.hh"
#include "lm/search_hashed.hh"
#include "lm/search_trie.hh"
Expand Down
2 changes: 1 addition & 1 deletion lm/quantize.hh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class SeparatelyQuantize {
const Bins &LongestTable() const { return longest_; }

private:
Bins tables_[kMaxOrder - 1][2];
Bins tables_[KENLM_MAX_ORDER - 1][2];

Bins longest_;

Expand Down
9 changes: 9 additions & 0 deletions lm/read_arpa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
#include <string.h>
#include <stdint.h>

#ifdef WIN32
#include <float.h>
#endif

namespace lm {

// 1 for '\t', '\n', and ' '. This is stricter than isspace.
Expand Down Expand Up @@ -95,8 +99,13 @@ void ReadBackoff(util::FilePiece &in, float &backoff) {
backoff = in.ReadFloat();
if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff;
{
#ifdef WIN32
int float_class = _fpclass(backoff);
UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);
#else
int float_class = fpclassify(backoff);
UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
#endif
}
UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
break;
Expand Down
20 changes: 10 additions & 10 deletions lm/search_trie.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ const float kBadProb = std::numeric_limits<float>::infinity();
class SRISucks {
public:
SRISucks() {
for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i)
for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i)
i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
}

Expand All @@ -196,7 +196,7 @@ class SRISucks {
}

void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
}
messages_[0].Apply(it_, unigram_file);
Expand All @@ -221,10 +221,10 @@ class SRISucks {

private:
// This used to be one array. Then I needed to separate it by order for quantization to work.
std::vector<float> values_[kMaxOrder - 1];
BackoffMessages messages_[kMaxOrder - 1];
std::vector<float> values_[KENLM_MAX_ORDER - 1];
BackoffMessages messages_[KENLM_MAX_ORDER - 1];

float *it_[kMaxOrder - 1];
float *it_[KENLM_MAX_ORDER - 1];
};

class FindBlanks {
Expand Down Expand Up @@ -337,7 +337,7 @@ struct Gram {
template <class Doing> class BlankManager {
public:
BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb;
for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb;
}

void Visit(const WordIndex *to, unsigned char length, float prob) {
Expand Down Expand Up @@ -373,10 +373,10 @@ template <class Doing> class BlankManager {
private:
const unsigned char total_order_;

WordIndex been_[kMaxOrder];
WordIndex been_[KENLM_MAX_ORDER];
unsigned char been_length_;

float basis_[kMaxOrder];
float basis_[KENLM_MAX_ORDER];

Doing &doing_;
};
Expand Down Expand Up @@ -470,8 +470,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace

template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
RecordReader inputs[kMaxOrder - 1];
RecordReader contexts[kMaxOrder - 1];
RecordReader inputs[KENLM_MAX_ORDER - 1];
RecordReader contexts[KENLM_MAX_ORDER - 1];

for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
Expand Down
10 changes: 5 additions & 5 deletions lm/state.hh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class State {

// Call this before using raw memcmp.
void ZeroRemaining() {
for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
words[i] = 0;
backoff[i] = 0.0;
}
Expand All @@ -42,8 +42,8 @@ class State {

// You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
// This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
WordIndex words[kMaxOrder - 1];
float backoff[kMaxOrder - 1];
WordIndex words[KENLM_MAX_ORDER - 1];
float backoff[KENLM_MAX_ORDER - 1];
unsigned char length;
};

Expand Down Expand Up @@ -72,11 +72,11 @@ struct Left {
}

void ZeroRemaining() {
for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i)
for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i)
*i = 0;
}

uint64_t pointers[kMaxOrder - 1];
uint64_t pointers[KENLM_MAX_ORDER - 1];
unsigned char length;
bool full;
};
Expand Down
2 changes: 1 addition & 1 deletion lm/trie_sort.hh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class SortedFiles {

util::scoped_fd unigram_;

util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
};

} // namespace trie
Expand Down
2 changes: 1 addition & 1 deletion lm/value.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "lm/weights.hh"
#include "util/bit_packing.hh"

#include <inttypes.h>
#include <stdint.h>

namespace lm {
namespace ngram {
Expand Down

0 comments on commit 21055bf

Please sign in to comment.