Skip to content

Commit

Permalink
Make merge_test independent of endianness
Browse files Browse the repository at this point in the history
  • Loading branch information
kpu committed Aug 29, 2020
1 parent a88fa7d commit 7af2468
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 55 deletions.
24 changes: 8 additions & 16 deletions lm/interpolate/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,48 +16,40 @@ if(EIGEN3_FOUND)
tune_instances.cc
tune_weights.cc
universal_vocab.cc)

# Detect OpenMP so the interpolation code's matrix operations can run in
# parallel.  Flags are appended to the global CMAKE_*_FLAGS, matching the
# legacy style used throughout this file.
find_package(OpenMP)
if (NOT OPENMP_FOUND)
  message(STATUS "OpenMP support would be nice for parallelizing matrix operations.")
else()
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()

# The interpolation library proper; kenlm is the core LM library target
# defined higher up the source tree.
add_library(kenlm_interpolate ${KENLM_INTERPOLATE_SOURCE})
# NOTE(review): plain (keyword-less) target_link_libraries signature —
# presumably consistent with the rest of the project; do not mix with
# PRIVATE/PUBLIC keywords on this target.
target_link_libraries(kenlm_interpolate kenlm)

# Command-line executables built from this directory.
set(KENLM_INTERPOLATE_EXES
interpolate
streaming_example)

# NOTE(review): THREADS is set here but not referenced in the visible
# lines (test registration below names pthread literally) — confirm a
# helper elsewhere consumes this variable.
if (NOT MSVC)
set(THREADS pthread)
endif()

# Libraries every executable and test in this directory links against.
set(KENLM_INTERPOLATE_LIBS
kenlm_interpolate)

# AddExes is a project helper macro (presumably from the project's shared
# CMake functions — TODO confirm) creating one executable per EXES entry.
AddExes(EXES ${KENLM_INTERPOLATE_EXES}
LIBRARIES ${KENLM_INTERPOLATE_LIBS})

# Defect fixed: the diff extraction interleaved the pre-change AddTests
# opener (without merge_vocab_test) with the post-change one, leaving two
# unbalanced openings for a single call, and kept the old merge_vocab_test
# registration that points at the merge_test fixture files this commit
# deletes.  Keep only the post-change form: merge_vocab_test generates its
# vocab fixtures at runtime (see merge_vocab_test.cc), so it needs no
# extra arguments and is registered with the other plain tests.
if(BUILD_TESTING)
  AddTests(TESTS backoff_reunification_test bounded_sequence_encoding_test merge_vocab_test normalize_test tune_derivatives_test
           LIBRARIES ${KENLM_INTERPOLATE_LIBS} pthread)

  # tune_instances_test needs an extra command line parameter
  KenLMAddTest(TEST tune_instances_test
               LIBRARIES ${KENLM_INTERPOLATE_LIBS}
               TEST_ARGS -- ${CMAKE_CURRENT_SOURCE_DIR}/../common/test_data)
endif()
else()
message(WARNING "Not building interpolation. You have an old version of Eigen3, ${EIGEN3_VERSION}, which has a race condition: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=466. Please install Eigen 3.1.0 or above.")
Expand Down
Binary file removed lm/interpolate/merge_test/test1
Binary file not shown.
Binary file removed lm/interpolate/merge_test/test2
Binary file not shown.
Binary file removed lm/interpolate/merge_test/test3
Binary file not shown.
Binary file removed lm/interpolate/merge_test/test_bad_order
Binary file not shown.
1 change: 0 additions & 1 deletion lm/interpolate/merge_test/test_no_unk

This file was deleted.

117 changes: 79 additions & 38 deletions lm/interpolate/merge_vocab_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,80 @@
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/file_stream.hh"
#include "util/tokenize_piece.hh"

#include <algorithm>
#include <cstring>
#include <vector>

namespace lm {
namespace interpolate {
namespace {

// Stupid bjam permutes the command line arguments randomly.
// A vocabulary word paired with its 64-bit MurmurHash, ordered by hash.
// Sorting by hash lets the tests predict the order in which the vocab
// merging code emits words, independent of byte order of the input text.
struct VocabEntry {
  explicit VocabEntry(StringPiece value)
      : str(value),
        hash(util::MurmurHash64A(value.data(), value.size())) {}

  // Ordering is by hash alone; the string text does not participate.
  bool operator<(const VocabEntry &other) const {
    return hash < other.hash;
  }

  StringPiece str;  // the word text (non-owning view — lifetime managed by caller)
  uint64_t hash;    // MurmurHash64A of the word's bytes
};

int WriteVocabFile(const std::vector<VocabEntry> &vocab, util::scoped_fd &file) {
file.reset(util::MakeTemp(util::DefaultTempDirectory()));
{
util::FileStream out(file.get(), 128);
for (std::vector<VocabEntry>::const_iterator i = vocab.begin(); i != vocab.end(); ++i) {
out << i->str << '\0';
}
}
util::SeekOrThrow(file.get(), 0);
return file.get();
}

// Split a tab-delimited word list into VocabEntry records.  "<unk>" is
// prepended and pinned at index 0; the remaining entries are sorted by
// hash, the order the merged vocabulary is expected to appear in.
std::vector<VocabEntry> ParseVocab(StringPiece words) {
  std::vector<VocabEntry> ret;
  ret.push_back(VocabEntry("<unk>"));
  util::TokenIter<util::SingleCharacter> token(words, '\t');
  while (token) {
    ret.push_back(VocabEntry(*token));
    ++token;
  }
  // Skip index 0 so <unk> stays first regardless of its hash.
  std::sort(ret.begin() + 1, ret.end());
  return ret;
}

// Convenience overload: tokenize a tab-delimited word list and write the
// resulting vocabulary to a temp file; returns the file descriptor.
int WriteVocabFile(StringPiece words, util::scoped_fd &file) {
  const std::vector<VocabEntry> parsed = ParseVocab(words);
  return WriteVocabFile(parsed, file);
}

class TestFiles {
public:
TestFiles() {
char **argv = boost::unit_test::framework::master_test_suite().argv;
int argc = boost::unit_test::framework::master_test_suite().argc;
BOOST_REQUIRE_EQUAL(6, argc);
for (int i = 1; i < argc; ++i) {
EndsWithAssign(argv[i], "test1", test[0]);
EndsWithAssign(argv[i], "test2", test[1]);
EndsWithAssign(argv[i], "test3", test[2]);
EndsWithAssign(argv[i], "no_unk", no_unk);
EndsWithAssign(argv[i], "bad_order", bad_order);
}
TestFiles() {}
int Test0() {
return WriteVocabFile("this\tis\ta\tfirst\tcut", test[0]);
}

void EndsWithAssign(char *arg, StringPiece value, util::scoped_fd &to) {
StringPiece str(arg);
if (str.size() < value.size()) return;
if (std::memcmp(str.data() + str.size() - value.size(), value.data(), value.size())) return;
to.reset(util::OpenReadOrThrow(arg));
int Test1() {
return WriteVocabFile("is this\tthis a\tfirst cut\ta first", test[1]);
}

int Test2() {
return WriteVocabFile("is\tsecd\ti", test[2]);
}
int NoUNK() {
std::vector<VocabEntry> no_unk_vec;
no_unk_vec.push_back(VocabEntry("toto"));
return WriteVocabFile(no_unk_vec, no_unk);
}
int BadOrder() {
std::vector<VocabEntry> bad_order_vec;
bad_order_vec.push_back(VocabEntry("<unk>"));
bad_order_vec.push_back(VocabEntry("0"));
bad_order_vec.push_back(VocabEntry("1"));
bad_order_vec.push_back(VocabEntry("2"));
bad_order_vec.push_back(VocabEntry("a"));
return WriteVocabFile(bad_order_vec, bad_order);
}
private:
util::scoped_fd test[3], no_unk, bad_order;
};

Expand All @@ -51,16 +95,16 @@ BOOST_AUTO_TEST_CASE(MergeVocabTest) {
TestFiles files;

util::FixedArray<int> used_files(3);
used_files.push_back(files.test[0].get());
used_files.push_back(files.test[1].get());
used_files.push_back(files.test[2].get());
used_files.push_back(files.Test0());
used_files.push_back(files.Test1());
used_files.push_back(files.Test2());

std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
model_max_idx.push_back(10);

util::scoped_fd combined(util::MakeTemp("temporary"));
util::scoped_fd combined(util::MakeTemp(util::DefaultTempDirectory()));

UniversalVocab universal_vocab(model_max_idx);
{
Expand All @@ -75,30 +119,27 @@ BOOST_AUTO_TEST_CASE(MergeVocabTest) {
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11);
#if BYTE_ORDER == LITTLE_ENDIAN
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4);
#elif BYTE_ORDER == BIG_ENDIAN
// MurmurHash has a different ordering of the vocabulary.
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 5);
#endif
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10);

util::SeekOrThrow(combined.get(), 0);
util::FilePiece f(combined.release());
BOOST_CHECK_EQUAL("<unk>", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("a", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("is this", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("this a", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("first cut", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("this", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("a first", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("cut", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("is", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("i", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("secd", f.ReadLine('\0'));
BOOST_CHECK_EQUAL("first", f.ReadLine('\0'));
std::vector<VocabEntry> expected = ParseVocab("a\tis this\tthis a\tfirst cut\tthis\ta first\tcut\tis\ti\tsecd\tfirst");
for (std::vector<VocabEntry>::const_iterator i = expected.begin(); i != expected.end(); ++i) {
BOOST_CHECK_EQUAL(i->str, f.ReadLine('\0'));
}
BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException);
}

BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) {
TestFiles files;
util::FixedArray<int> used_files(1);
used_files.push_back(files.no_unk.get());
used_files.push_back(files.NoUNK());

std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
Expand All @@ -112,8 +153,8 @@ BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) {
TestFiles files;

util::FixedArray<int> used_files(2);
used_files.push_back(files.test[0].get());
used_files.push_back(files.bad_order.get());
used_files.push_back(files.Test0());
used_files.push_back(files.BadOrder());

std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
Expand Down

0 comments on commit 7af2468

Please sign in to comment.