Skip to content

Commit

Permalink
Merge from lazy. Includes lower-order rest costs for probing.
Browse files Browse the repository at this point in the history
  • Loading branch information
kpu committed Jun 3, 2012
1 parent d59f09f commit ceb3841
Show file tree
Hide file tree
Showing 50 changed files with 1,472 additions and 828 deletions.
2 changes: 1 addition & 1 deletion compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

set -e

for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,file,mmap} lm/{bhiksha,binary_format,config,lm_exception,model,quantize,read_arpa,search_hashed,search_trie,trie,trie_sort,virtual_interface,vocab}; do
for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,file,mmap,usage} lm/{bhiksha,binary_format,config,lm_exception,model,quantize,read_arpa,search_hashed,search_trie,trie,trie_sort,value_build,virtual_interface,vocab}; do
g++ -I. -O3 -DNDEBUG $CXXFLAGS -c $i.cc -o $i.o
done
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/build_binary.cc {lm,util}/*.o -lz -o build_binary
Expand Down
2 changes: 1 addition & 1 deletion lm/Jamfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
lib kenlm : bhiksha.cc binary_format.cc config.cc lm_exception.cc model.cc quantize.cc read_arpa.cc search_hashed.cc search_trie.cc trie.cc trie_sort.cc virtual_interface.cc vocab.cc ../util//kenutil : <include>.. : : <include>.. <library>../util//kenutil ;
lib kenlm : bhiksha.cc binary_format.cc config.cc lm_exception.cc model.cc quantize.cc read_arpa.cc search_hashed.cc search_trie.cc trie.cc trie_sort.cc value_build.cc virtual_interface.cc vocab.cc ../util//kenutil : <include>.. : : <include>.. <library>../util//kenutil ;

import testing ;

Expand Down
2 changes: 1 addition & 1 deletion lm/bhiksha.hh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

namespace lm {
namespace ngram {
struct Config;
class Config;

namespace trie {

Expand Down
2 changes: 1 addition & 1 deletion lm/binary_format.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ struct Sanity {
}
};

const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};

std::size_t TotalHeaderSize(unsigned char order) {
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
Expand Down
103 changes: 69 additions & 34 deletions lm/build_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ void Usage(const char *name) {
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
"-w mmap|after determines how writing is done.\n"
" mmap maps the binary file and writes to it. Default for trie.\n"
" after allocates anonymous memory, builds, and writes. Default for probing.\n\n"
" after allocates anonymous memory, builds, and writes. Default for probing.\n"
"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n"
" the same data structure as being built. All files must have the same\n"
" vocabulary. For probing, the unigrams must be in the same order.\n\n"
"type is either probing or trie. Default is probing.\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
Expand Down Expand Up @@ -66,16 +70,28 @@ uint8_t ParseBitCount(const char *from) {
return val;
}

// Splits a space-delimited list of file names into individual strings,
// replacing any previous contents of to.  Tokenization is on single
// spaces only: consecutive or trailing spaces yield empty entries,
// exactly as the original single-pass scanner did.
void ParseFileList(const char *from, std::vector<std::string> &to) {
  to.clear();
  const char *token_begin = from;
  for (const char *cursor = from; ; ++cursor) {
    if (*cursor == ' ' || *cursor == '\0') {
      // Emit the token accumulated since the last delimiter (may be empty).
      to.push_back(std::string(token_begin, cursor - token_begin));
      if (!*cursor) return;
      token_begin = cursor + 1;
    }
  }
}

void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::vector<uint64_t> counts;
util::FilePiece f(file);
lm::ReadARPACounts(f, counts);
std::size_t sizes[5];
std::size_t sizes[6];
sizes[0] = ProbingModel::Size(counts, config);
sizes[1] = TrieModel::Size(counts, config);
sizes[2] = QuantTrieModel::Size(counts, config);
sizes[3] = ArrayTrieModel::Size(counts, config);
sizes[4] = QuantArrayTrieModel::Size(counts, config);
sizes[1] = RestProbingModel::Size(counts, config);
sizes[2] = TrieModel::Size(counts, config);
sizes[3] = QuantTrieModel::Size(counts, config);
sizes[4] = ArrayTrieModel::Size(counts, config);
sizes[5] = QuantArrayTrieModel::Size(counts, config);
std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
std::size_t divide;
Expand All @@ -99,10 +115,11 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
std::cout << prefix << "B\n"
"probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n"
"trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
"trie " << std::setw(length) << (sizes[3] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
"trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
"probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n"
"trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
"trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
"trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
}

void ProbingQuantizationUnsupported() {
Expand All @@ -118,10 +135,10 @@ int main(int argc, char *argv[]) {
using namespace lm::ngram;

try {
bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
lm::ngram::Config config;
int opt;
while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) {
switch(opt) {
case 'q':
config.prob_bits = ParseBitCount(optarg);
Expand Down Expand Up @@ -164,6 +181,11 @@ int main(int argc, char *argv[]) {
case 'i':
config.positive_log_probability = lm::SILENT;
break;
case 'r':
rest = true;
ParseFileList(optarg, config.rest_lower_files);
config.rest_function = Config::REST_LOWER;
break;
default:
Usage(argv[0]);
}
Expand All @@ -174,35 +196,48 @@ int main(int argc, char *argv[]) {
}
if (optind + 1 == argc) {
ShowSizes(argv[optind], config);
} else if (optind + 2 == argc) {
return 0;
}
const char *model_type;
const char *from_file;

if (optind + 2 == argc) {
model_type = "probing";
from_file = argv[optind];
config.write_mmap = argv[optind + 1];
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
ProbingModel(argv[optind], config);
} else if (optind + 3 == argc) {
const char *model_type = argv[optind];
const char *from_file = argv[optind + 1];
model_type = argv[optind];
from_file = argv[optind + 1];
config.write_mmap = argv[optind + 2];
if (!strcmp(model_type, "probing")) {
if (!set_write_method) config.write_method = Config::WRITE_AFTER;
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
} else {
Usage(argv[0]);
}
if (!strcmp(model_type, "probing")) {
if (!set_write_method) config.write_method = Config::WRITE_AFTER;
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
if (rest) {
RestProbingModel(from_file, config);
} else {
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
if (!set_write_method) config.write_method = Config::WRITE_MMAP;
if (quantize) {
if (bhiksha) {
QuantArrayTrieModel(from_file, config);
} else {
QuantTrieModel(from_file, config);
}
}
} else if (!strcmp(model_type, "trie")) {
if (rest) {
std::cerr << "Rest + trie is not supported yet." << std::endl;
return 1;
}
if (!set_write_method) config.write_method = Config::WRITE_MMAP;
if (quantize) {
if (bhiksha) {
QuantArrayTrieModel(from_file, config);
} else {
if (bhiksha) {
ArrayTrieModel(from_file, config);
} else {
TrieModel(from_file, config);
}
QuantTrieModel(from_file, config);
}
} else {
Usage(argv[0]);
if (bhiksha) {
ArrayTrieModel(from_file, config);
} else {
TrieModel(from_file, config);
}
}
} else {
Usage(argv[0]);
Expand Down
1 change: 1 addition & 0 deletions lm/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Config::Config() :
write_mmap(NULL),
write_method(WRITE_AFTER),
include_vocab(true),
rest_function(REST_MAX),
prob_bits(8),
backoff_bits(8),
pointer_bhiksha_bits(22),
Expand Down
22 changes: 17 additions & 5 deletions lm/config.hh
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#ifndef LM_CONFIG__
#define LM_CONFIG__

#include <iosfwd>

#include "lm/lm_exception.hh"
#include "util/mmap.hh"

#include <iosfwd>
#include <string>
#include <vector>

/* Configuration for ngram model. Separate header to reduce pollution. */

namespace lm {
Expand Down Expand Up @@ -63,23 +65,33 @@ struct Config {
const char *temporary_directory_prefix;

// Level of complaining to do when loading from ARPA instead of binary format.
typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
ARPALoadComplain arpa_complain;

// While loading an ARPA file, also write out this binary format file. Set
// to NULL to disable.
const char *write_mmap;

typedef enum {
enum WriteMethod {
WRITE_MMAP, // Map the file directly.
WRITE_AFTER // Write after we're done.
} WriteMethod;
};
WriteMethod write_method;

// Include the vocab in the binary file? Only effective if write_mmap != NULL.
bool include_vocab;


// Left rest options. Only used when the model includes rest costs.
enum RestFunction {
REST_MAX, // Maximum of any score to the left
REST_LOWER, // Use lower-order files given below.
};
RestFunction rest_function;
// Only used for REST_LOWER.
std::vector<std::string> rest_lower_files;



// Quantization options. Only effective for QuantTrieModel. One value is
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
Expand Down
Loading

0 comments on commit ceb3841

Please sign in to comment.