diff --git a/lm/read_arpa.cc b/lm/read_arpa.cc index dc05a6539..6ee9bfb2a 100644 --- a/lm/read_arpa.cc +++ b/lm/read_arpa.cc @@ -19,8 +19,8 @@ namespace lm { -// 1 for '\t', '\n', and ' '. This is stricter than isspace. -const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +// 1 for '\t', '\n', '\r', and ' '. This is stricter than isspace. Apparently ARPA allows vertical tab inside a word. +const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; namespace { @@ -85,6 +85,11 @@ void ReadNGramHeader(util::FilePiece &in, unsigned int length) { if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead"); } +void ConsumeNewline(util::FilePiece &in) { + char follow = in.get(); + UTIL_THROW_IF('\n' != follow, FormatLoadException, "Expected newline got '" << follow << "'"); +} + void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { switch (in.get()) { case '\t': @@ -94,6 +99,9 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff"); } break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. case '\n': break; default: @@ -120,8 +128,18 @@ void ReadBackoff(util::FilePiece &in, float &backoff) { UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); #endif } - UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff"); + switch (char got = in.get()) { + case '\r': + ConsumeNewline(in); + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected newline after backoffs, got " << got); + } break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. case '\n': backoff = ngram::kNoExtensionBackoff; break; diff --git a/setup.py b/setup.py index 9d40c0195..eceb70186 100644 --- a/setup.py +++ b/setup.py @@ -13,9 +13,12 @@ def compile_test(header, library): FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc') FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))] -LIBS = ['stdc++'] -if platform.system() != 'Darwin': - LIBS.append('rt') +if platform.system() == 'Linux': + LIBS = ['stdc++', 'rt'] +elif platform.system() == 'Darwin': + LIBS = ['stdc++'] +else: + LIBS = [] #We don't need -std=c++11 but python seems to be compiled with it now. https://github.com/kpu/kenlm/issues/86 ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11'] diff --git a/util/bit_packing.hh b/util/bit_packing.hh index b24fd9c1f..77abc0df5 100644 --- a/util/bit_packing.hh +++ b/util/bit_packing.hh @@ -108,7 +108,7 @@ typedef union { float f; uint32_t i; } FloatEnc; inline float ReadFloat32(const void *base, uint64_t bit_off) { FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32)); return encoded.f; } inline void WriteFloat32(void *base, uint64_t bit_off, float value) { @@ -135,7 +135,7 @@ inline void UnsetSign(float &to) { inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31)); // Sign bit set means negative. encoded.i |= kSignBit; return encoded.f; diff --git a/util/exception.hh b/util/exception.hh index 03543a9b0..614a88fa6 100644 --- a/util/exception.hh +++ b/util/exception.hh @@ -134,7 +134,7 @@ class OverflowException : public Exception { template inline std::size_t CheckOverflowInternal(uint64_t value) { UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code."); - return value; + return static_cast(value); } template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { diff --git a/util/file_piece.hh b/util/file_piece.hh index 67b28848c..042a78e9e 100644 --- a/util/file_piece.hh +++ b/util/file_piece.hh @@ -103,7 +103,7 @@ class FilePiece { if (position_ == position_end_) { try { Shift(); - } catch (const util::EndOfFileException &e) { return false; } + } catch (const util::EndOfFileException &) { return false; } // And break out at end of file. if (position_ == position_end_) return false; } diff --git a/windows/kenlm.vcxproj b/windows/kenlm.vcxproj index 26eff0a67..3238129d3 100644 --- a/windows/kenlm.vcxproj +++ b/windows/kenlm.vcxproj @@ -161,15 +161,19 @@ + + - - + + + + @@ -177,21 +181,38 @@ + + + + + + - + + + + + + + + + + + + @@ -203,10 +224,6 @@ - - - - @@ -219,6 +236,7 @@ + @@ -226,9 +244,18 @@ - + + + + + + + + + + @@ -245,4 +272,4 @@ - \ No newline at end of file +