forked from kpu/kenlm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Renaming and avoid decoding prob if we don't have to
git-svn-id: file:///dev/shm/somefilter.svn@455 e102df66-1e2e-11dd-9b44-c24451a4db5e
- Loading branch information
Showing
15 changed files
with
206 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
#include "lm/ngram.hh" | ||
#include "lm/model.hh" | ||
|
||
#include <iostream> | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
#include "lm/ngram_config.hh" | ||
#include "lm/config.hh" | ||
|
||
#include <iostream> | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
#include "lm/ngram.hh" | ||
#include "lm/model.hh" | ||
|
||
#include <cstdlib> | ||
#include <fstream> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#ifndef LM_SEARCH_HASHED__ | ||
#define LM_SEARCH_HASHED__ | ||
|
||
#include "lm/binary_format.hh" | ||
#include "lm/config.hh" | ||
#include "lm/read_arpa.hh" | ||
#include "lm/weights.hh" | ||
|
||
#include "util/key_value_packing.hh" | ||
#include "util/probing_hash_table.hh" | ||
#include "util/sorted_uniform.hh" | ||
|
||
#include <algorithm> | ||
#include <vector> | ||
|
||
namespace util { class FilePiece; } | ||
|
||
namespace lm { | ||
namespace ngram { | ||
namespace detail { | ||
|
||
inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { | ||
uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast<uint64_t>(next) * 17894857484156487943ULL); | ||
return ret; | ||
} | ||
|
||
struct HashedSearch { | ||
typedef uint64_t Node; | ||
|
||
class Unigram { | ||
public: | ||
Unigram() {} | ||
|
||
Unigram(void *start, std::size_t /*allocated*/) : unigram_(static_cast<ProbBackoff*>(start)) {} | ||
|
||
static std::size_t Size(uint64_t count) { | ||
return (count + 1) * sizeof(ProbBackoff); // +1 for hallucinate <unk> | ||
} | ||
|
||
const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index]; } | ||
|
||
ProbBackoff &Unknown() { return unigram_[0]; } | ||
|
||
void LoadedBinary() {} | ||
|
||
// For building. | ||
ProbBackoff *Raw() { return unigram_; } | ||
|
||
private: | ||
ProbBackoff *unigram_; | ||
}; | ||
|
||
Unigram unigram; | ||
|
||
bool LookupUnigram(WordIndex word, float &prob, float &backoff, Node &next) const { | ||
const ProbBackoff &entry = unigram.Lookup(word); | ||
prob = entry.prob; | ||
backoff = entry.backoff; | ||
next = static_cast<Node>(word); | ||
return true; | ||
} | ||
}; | ||
|
||
template <class MiddleT, class LongestT> struct TemplateHashedSearch : public HashedSearch { | ||
typedef MiddleT Middle; | ||
std::vector<Middle> middle; | ||
|
||
typedef LongestT Longest; | ||
Longest longest; | ||
|
||
static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) { | ||
std::size_t ret = Unigram::Size(counts[0]); | ||
for (unsigned char n = 1; n < counts.size() - 1; ++n) { | ||
ret += Middle::Size(counts[n], config.probing_multiplier); | ||
} | ||
return ret + Longest::Size(counts.back(), config.probing_multiplier); | ||
} | ||
|
||
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) { | ||
std::size_t allocated = Unigram::Size(counts[0]); | ||
unigram = Unigram(start, allocated); | ||
start += allocated; | ||
for (unsigned int n = 2; n < counts.size(); ++n) { | ||
allocated = Middle::Size(counts[n - 1], config.probing_multiplier); | ||
middle.push_back(Middle(start, allocated)); | ||
start += allocated; | ||
} | ||
allocated = Longest::Size(counts.back(), config.probing_multiplier); | ||
longest = Longest(start, allocated); | ||
start += allocated; | ||
return start; | ||
} | ||
|
||
template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab); | ||
|
||
bool LookupMiddle(const Middle &middle, WordIndex word, float &prob, float &backoff, Node &node) const { | ||
node = CombineWordHash(node, word); | ||
typename Middle::ConstIterator found; | ||
if (!middle.Find(node, found)) return false; | ||
prob = found->GetValue().prob; | ||
backoff = found->GetValue().backoff; | ||
return true; | ||
} | ||
|
||
bool LookupMiddleNoProb(const Middle &middle, WordIndex word, float &backoff, Node &node) const { | ||
node = CombineWordHash(node, word); | ||
typename Middle::ConstIterator found; | ||
if (!middle.Find(node, found)) return false; | ||
backoff = found->GetValue().backoff; | ||
return true; | ||
} | ||
|
||
bool LookupLongest(WordIndex word, float &prob, Node &node) const { | ||
node = CombineWordHash(node, word); | ||
typename Longest::ConstIterator found; | ||
if (!longest.Find(node, found)) return false; | ||
prob = found->GetValue().prob; | ||
return true; | ||
} | ||
|
||
// Geenrate a node without necessarily checking that it actually exists. | ||
// Optionally return false if it's know to not exist. | ||
bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { | ||
assert(begin != end); | ||
node = static_cast<Node>(*begin); | ||
for (const WordIndex *i = begin + 1; i < end; ++i) { | ||
node = CombineWordHash(node, *i); | ||
} | ||
return true; | ||
} | ||
}; | ||
|
||
// std::identity is an SGI extension :-( | ||
struct IdentityHash : public std::unary_function<uint64_t, size_t> { | ||
size_t operator()(uint64_t arg) const { return static_cast<size_t>(arg); } | ||
}; | ||
|
||
struct ProbingHashedSearch : public TemplateHashedSearch< | ||
util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, ProbBackoff>, IdentityHash>, | ||
util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, Prob>, IdentityHash> > { | ||
|
||
static const ModelType kModelType = HASH_PROBING; | ||
}; | ||
|
||
struct SortedHashedSearch : public TemplateHashedSearch< | ||
util::SortedUniformMap<util::ByteAlignedPacking<uint64_t, ProbBackoff> >, | ||
util::SortedUniformMap<util::ByteAlignedPacking<uint64_t, Prob> > > { | ||
|
||
static const ModelType kModelType = HASH_SORTED; | ||
}; | ||
|
||
} // namespace detail | ||
} // namespace ngram | ||
} // namespace lm | ||
|
||
#endif // LM_SEARCH_HASHED__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters