Skip to content

Commit

Permalink
Merge branch 'master' of github.com:kpu/kenlm
Browse files Browse the repository at this point in the history
  • Loading branch information
kpu committed Mar 29, 2018
2 parents 66cb4a6 + 03aa6d2 commit 65f4219
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 19 deletions.
24 changes: 21 additions & 3 deletions lm/read_arpa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

namespace lm {

// 1 for '\t', '\n', and ' '. This is stricter than isspace.
const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
// 1 for '\t', '\n', '\r', and ' '. This is stricter than isspace. Apparently ARPA allows vertical tab inside a word.
const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

namespace {

Expand Down Expand Up @@ -85,6 +85,11 @@ void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
}

// Read exactly one character from `in` and require it to be '\n'.
// The callers visible in this file invoke it right after reading a '\r',
// so that a "\r\n" pair is accepted as a line ending.
// Throws FormatLoadException, quoting the offending character, when the
// character read is anything other than '\n'.
void ConsumeNewline(util::FilePiece &in) {
char follow = in.get();
UTIL_THROW_IF('\n' != follow, FormatLoadException, "Expected newline got '" << follow << "'");
}

void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
switch (in.get()) {
case '\t':
Expand All @@ -94,6 +99,9 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff");
}
break;
case '\r':
ConsumeNewline(in);
// Intentionally no break.
case '\n':
break;
default:
Expand All @@ -120,8 +128,18 @@ void ReadBackoff(util::FilePiece &in, float &backoff) {
UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
#endif
}
UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
switch (char got = in.get()) {
case '\r':
ConsumeNewline(in);
case '\n':
break;
default:
UTIL_THROW(FormatLoadException, "Expected newline after backoffs, got " << got);
}
break;
case '\r':
ConsumeNewline(in);
// Intentionally no break.
case '\n':
backoff = ngram::kNoExtensionBackoff;
break;
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ def compile_test(header, library):
FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc')
FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))]

LIBS = ['stdc++']
if platform.system() != 'Darwin':
LIBS.append('rt')
if platform.system() == 'Linux':
LIBS = ['stdc++', 'rt']
elif platform.system() == 'Darwin':
LIBS = ['stdc++']
else:
LIBS = []

#We don't need -std=c++11 but python seems to be compiled with it now. https://github.com/kpu/kenlm/issues/86
ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11']
Expand Down
4 changes: 2 additions & 2 deletions util/bit_packing.hh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ typedef union { float f; uint32_t i; } FloatEnc;

// Read a float whose 32-bit encoding is located at bit offset `bit_off`
// relative to `base`.
//
// The raw word comes from ReadOff(base, bit_off) and is shifted right by
// BitPackShift(bit_off & 7, 32), i.e. by an amount derived from the offset
// within the byte. The result is narrowed to 32 bits with an explicit cast
// (ReadOff presumably returns a wider integer -- confirm against its
// declaration) and reinterpreted as a float through the FloatEnc union.
inline float ReadFloat32(const void *base, uint64_t bit_off) {
  FloatEnc encoded;
  // Single, explicitly narrowed store; the earlier uncast assignment was a
  // dead store that this one overwrote.
  encoded.i = static_cast<uint32_t>(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32));
  return encoded.f;
}
inline void WriteFloat32(void *base, uint64_t bit_off, float value) {
Expand All @@ -135,7 +135,7 @@ inline void UnsetSign(float &to) {

// Read a float from bit offset `bit_off` relative to `base`, using a shift
// computed for a 31-bit field, and force its sign bit on.
//
// The raw word from ReadOff(base, bit_off) is shifted right by
// BitPackShift(bit_off & 7, 31) and explicitly narrowed to 32 bits. The
// sign bit is then set unconditionally, so the returned value always has
// its sign bit set regardless of what occupied that bit position after the
// shift. The result is reinterpreted as a float through the FloatEnc union.
inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
  FloatEnc encoded;
  // Single, explicitly narrowed store; the earlier uncast assignment was a
  // dead store that this one overwrote.
  encoded.i = static_cast<uint32_t>(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31));
  // Sign bit set means negative.
  encoded.i |= kSignBit;
  return encoded.f;
}
Expand Down
2 changes: 1 addition & 1 deletion util/exception.hh
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class OverflowException : public Exception {

// Convert a 64-bit quantity to std::size_t, refusing values that do not fit.
//
// Throws OverflowException when `value` exceeds the largest std::size_t.
// Otherwise returns `value` narrowed with an explicit cast, which is safe
// because the range was just checked.
// The template parameter `len` is not used in this generic body; it
// presumably carries sizeof(std::size_t) so that a specialization can skip
// the check -- confirm against the callers.
template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) {
  UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code.");
  // One return only: the uncast `return value;` that preceded this made it
  // unreachable and relied on an implicit narrowing conversion.
  return static_cast<std::size_t>(value);
}

template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) {
Expand Down
2 changes: 1 addition & 1 deletion util/file_piece.hh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class FilePiece {
if (position_ == position_end_) {
try {
Shift();
} catch (const util::EndOfFileException &e) { return false; }
} catch (const util::EndOfFileException &) { return false; }
// And break out at end of file.
if (position_ == position_end_) return false;
}
Expand Down
45 changes: 36 additions & 9 deletions windows/kenlm.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -161,37 +161,58 @@
<None Include="..\lm\max_order.hh" />
<None Include="..\lm\model.hh" />
<None Include="..\lm\model_type.hh" />
<None Include="..\lm\ngram_query.hh" />
<None Include="..\lm\partial.hh" />
<None Include="..\lm\quantize.hh" />
<None Include="..\lm\read_arpa.hh" />
<None Include="..\lm\return.hh" />
<None Include="..\lm\search_hashed.hh" />
<None Include="..\lm\search_trie.hh" />
<None Include="..\lm\test.arpa" />
<None Include="..\lm\test_nounk.arpa" />
<None Include="..\lm\sizes.hh" />
<None Include="..\lm\state.hh" />
<None Include="..\lm\trie.hh" />
<None Include="..\lm\trie_sort.hh" />
<None Include="..\lm\value_build.hh" />
<None Include="..\lm\value.hh" />
<None Include="..\lm\virtual_interface.hh" />
<None Include="..\lm\vocab.hh" />
<None Include="..\lm\weights.hh" />
<None Include="..\lm\word_index.hh" />
<None Include="..\util\bit_packing.hh" />
<None Include="..\util\ersatz_progress.hh" />
<None Include="..\util\exception.hh" />
<None Include="..\util\fake_ifstream.hh" />
<None Include="..\util\fake_ostream.hh" />
<None Include="..\util\file.hh" />
<None Include="..\util\file_piece.hh" />
<None Include="..\util\file_stream.hh" />
<None Include="..\util\fixed_array.hh" />
<None Include="..\util\float_to_string.hh" />
<None Include="..\util\getopt.hh" />
<None Include="..\util\have.hh" />
<None Include="..\util\integer_to_string.hh" />
<None Include="..\util\joint_sort.hh" />
<None Include="..\util\key_value_packing.hh" />
<None Include="..\util\keep_buffer.hh" />
<None Include="..\util\mmap.hh" />
<None Include="..\util\multi_intersection.hh" />
<None Include="..\util\murmur_hash.hh" />
<None Include="..\util\parallel_read.hh" />
<None Include="..\util\pcqueue.hh" />
<None Include="..\util\pool.hh" />
<None Include="..\util\probing_hash_table.hh" />
<None Include="..\util\proxy_iterator.hh" />
<None Include="..\util\read_compressed.hh" />
<None Include="..\util\scoped.hh" />
<None Include="..\util\sized_iterator.hh" />
<None Include="..\util\sorted_uniform.hh" />
<None Include="..\util\spaces.hh" />
<None Include="..\util\string_piece_hash.hh" />
<None Include="..\util\string_piece.hh" />
<None Include="..\util\string_stream.hh" />
<None Include="..\util\sum_exp.hh" />
<None Include="..\util\thread_pool.hh" />
<None Include="..\util\tokenize_piece.hh" />
<None Include="..\util\usage.hh" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\lm\value_build.cc" />
Expand All @@ -203,10 +224,6 @@
<ClCompile Include="..\util\double-conversion\fast-dtoa.cc" />
<ClCompile Include="..\util\double-conversion\fixed-dtoa.cc" />
<ClCompile Include="..\util\double-conversion\strtod.cc" />
<ClCompile Include="..\util\pool.cc" />
<ClCompile Include="..\util\read_compressed.cc" />
<ClCompile Include="..\util\scoped.cc" />
<ClCompile Include="..\util\usage.cc" />
<ClCompile Include="..\lm\bhiksha.cc" />
<ClCompile Include="..\lm\binary_format.cc" />
<ClCompile Include="..\lm\config.cc" />
Expand All @@ -219,16 +236,26 @@
<ClCompile Include="..\lm\sizes.cc" />
<ClCompile Include="..\lm\trie.cc" />
<ClCompile Include="..\lm\trie_sort.cc" />
<ClCompile Include="..\lm\value_build.cc" />
<ClCompile Include="..\lm\virtual_interface.cc" />
<ClCompile Include="..\lm\vocab.cc" />
<ClCompile Include="..\util\bit_packing.cc" />
<ClCompile Include="..\util\ersatz_progress.cc" />
<ClCompile Include="..\util\exception.cc" />
<ClCompile Include="..\util\file.cc" />
<ClCompile Include="..\util\file_piece.cc" />
<ClCompile Include="..\util\getopt.c" />
<ClCompile Include="..\util\float_to_string.cc" />
<ClCompile Include="..\util\integer_to_string.cc" />
<ClCompile Include="..\util\mmap.cc" />
<ClCompile Include="..\util\murmur_hash.cc" />
<ClCompile Include="..\util\parallel_read.cc" />
<ClCompile Include="..\util\pool.cc" />
<ClCompile Include="..\util\read_compressed.cc" />
<ClCompile Include="..\util\scoped.cc" />
<ClCompile Include="..\util\spaces.cc" />
<ClCompile Include="..\util\string_piece.cc" />
<ClCompile Include="..\util\usage.cc" />
<ClCompile Include="..\util\getopt.c" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\util\double-conversion\bignum-dtoa.h" />
Expand All @@ -245,4 +272,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

0 comments on commit 65f4219

Please sign in to comment.