diff --git a/lm/read_arpa.cc b/lm/read_arpa.cc
index c4c537a1f..8e9a770d2 100644
--- a/lm/read_arpa.cc
+++ b/lm/read_arpa.cc
@@ -20,7 +20,13 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {
 template void GenericReadARPACounts(F &in, std::vector &number) {
   number.clear();
   StringPiece line;
-  if (!IsEntirelyWhiteSpace(line = in.ReadLine())) UTIL_THROW(FormatLoadException, "First line was \"" << line << "\" not blank");
+  if (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
+    // 0x1f 0x8b is the gzip magic number; compare the second byte as unsigned because 0x8b does not fit in a signed char.
+    if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
+      UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, run\nzcat " << in.FileName() << " |kenlm/build_binary /dev/stdin " << in.FileName() << ".binary\nIf this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
+    }
+    UTIL_THROW(FormatLoadException, "First line was \"" << line << "\" not blank");
+  }
   if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "second line was \"" << line << "\" not \\data\\.");
   while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
     if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \"");
@@ -72,6 +78,11 @@ class FakeFilePiece {
       return ret;
     }
 
+    const char *FileName() const {
+      // This is only used for error messages and we don't know the file name. . .
+      return "$file";
+    }
+
   private:
     std::istream &in_;
     std::string buffer_;
diff --git a/util/file_piece.cc b/util/file_piece.cc
index 8ac5f1218..069218aba 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -51,6 +51,7 @@ FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_
 }
 
 void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+  file_name_ = name;
   if (total_size_ == kBadSize) {
     fallback_to_read_ = true;
     if (show_progress)
diff --git a/util/file_piece.hh b/util/file_piece.hh
index 9756250f2..a14327843 100644
--- a/util/file_piece.hh
+++ b/util/file_piece.hh
@@ -65,6 +65,9 @@ class FilePiece {
     void ForceFallbackToRead() {
       fallback_to_read_ = true;
     }
+
+    // Name recorded by Initialize(); intended for error messages.
+    const std::string &FileName() const { return file_name_; }
 
   private:
     void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
@@ -98,6 +101,8 @@ class FilePiece {
     bool fallback_to_read_;
 
     ErsatzProgress progress_;
+
+    std::string file_name_;
 };
 
 } // namespace util