Skip to content

Commit

Permalink
Working version of the grammar extractor.
Browse files Browse the repository at this point in the history
  • Loading branch information
pauldb89 committed Feb 14, 2013
1 parent 252fb16 commit 9a026ba
Show file tree
Hide file tree
Showing 80 changed files with 3,007 additions and 700 deletions.
90 changes: 76 additions & 14 deletions extractor/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,36 +1,84 @@
bin_PROGRAMS = compile run_extractor

noinst_PROGRAMS = \
alignment_test \
binary_search_merger_test \
data_array_test \
feature_count_source_target_test \
feature_is_source_singleton_test \
feature_is_source_target_singleton_test \
feature_max_lex_source_given_target_test \
feature_max_lex_target_given_source_test \
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
intersector_test \
linear_merger_test \
matching_comparator_test \
matching_test \
matchings_finder_test \
phrase_test \
precomputation_test \
rule_extractor_helper_test \
rule_extractor_test \
rule_factory_test \
sampler_test \
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
translation_table_test \
veb_test

TESTS = sampler_test
#TESTS = binary_search_merger_test \
# data_array_test \
# intersector_test \
# linear_merger_test \
# matching_comparator_test \
# matching_test \
# matchings_finder_test \
# phrase_test \
# precomputation_test \
# suffix_array_test \
# veb_test
TESTS = alignment_test \
binary_search_merger_test \
data_array_test \
feature_count_source_target_test \
feature_is_source_singleton_test \
feature_is_source_target_singleton_test \
feature_max_lex_source_given_target_test \
feature_max_lex_target_given_source_test \
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
intersector_test \
linear_merger_test \
matching_comparator_test \
matching_test \
matchings_finder_test \
phrase_test \
precomputation_test \
rule_extractor_helper_test \
rule_extractor_test \
rule_factory_test \
sampler_test \
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
translation_table_test \
veb_test

alignment_test_SOURCES = alignment_test.cc
alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
binary_search_merger_test_SOURCES = binary_search_merger_test.cc
binary_search_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
data_array_test_SOURCES = data_array_test.cc
data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_count_source_target_test_SOURCES = features/count_source_target_test.cc
feature_count_source_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_is_source_singleton_test_SOURCES = features/is_source_singleton_test.cc
feature_is_source_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_is_source_target_singleton_test_SOURCES = features/is_source_target_singleton_test.cc
feature_is_source_target_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_max_lex_source_given_target_test_SOURCES = features/max_lex_source_given_target_test.cc
feature_max_lex_source_given_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
feature_max_lex_target_given_source_test_SOURCES = features/max_lex_target_given_source_test.cc
feature_max_lex_target_given_source_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
feature_sample_source_count_test_SOURCES = features/sample_source_count_test.cc
feature_sample_source_count_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_target_given_source_coherent_test_SOURCES = features/target_given_source_coherent_test.cc
feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
grammar_extractor_test_SOURCES = grammar_extractor_test.cc
grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
intersector_test_SOURCES = intersector_test.cc
intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
linear_merger_test_SOURCES = linear_merger_test.cc
Expand All @@ -45,10 +93,22 @@ phrase_test_SOURCES = phrase_test.cc
phrase_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
precomputation_test_SOURCES = precomputation_test.cc
precomputation_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
suffix_array_test_SOURCES = suffix_array_test.cc
suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_extractor_helper_test_SOURCES = rule_extractor_helper_test.cc
rule_extractor_helper_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_extractor_test_SOURCES = rule_extractor_test.cc
rule_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_factory_test_SOURCES = rule_factory_test.cc
rule_factory_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
sampler_test_SOURCES = sampler_test.cc
sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
scorer_test_SOURCES = scorer_test.cc
scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
suffix_array_test_SOURCES = suffix_array_test.cc
suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc
target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
translation_table_test_SOURCES = translation_table_test.cc
translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
veb_test_SOURCES = veb_test.cc
veb_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a

Expand Down Expand Up @@ -93,10 +153,12 @@ libextractor_a_SOURCES = \
precomputation.cc \
rule.cc \
rule_extractor.cc \
rule_extractor_helper.cc \
rule_factory.cc \
sampler.cc \
scorer.cc \
suffix_array.cc \
target_phrase_extractor.cc \
translation_table.cc \
veb.cc \
veb_bitset.cc \
Expand Down
6 changes: 5 additions & 1 deletion extractor/alignment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ Alignment::Alignment(const string& filename) {
alignments.shrink_to_fit();
}

const vector<pair<int, int> >& Alignment::GetLinks(int sentence_index) const {
Alignment::Alignment() {}

Alignment::~Alignment() {}

vector<pair<int, int> > Alignment::GetLinks(int sentence_index) const {
return alignments[sentence_index];
}

Expand Down
7 changes: 6 additions & 1 deletion extractor/alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,15 @@ class Alignment {
public:
Alignment(const string& filename);

const vector<pair<int, int> >& GetLinks(int sentence_index) const;
virtual vector<pair<int, int> > GetLinks(int sentence_index) const;

void WriteBinary(const fs::path& filepath);

virtual ~Alignment();

protected:
Alignment();

private:
vector<vector<pair<int, int> > > alignments;
};
Expand Down
31 changes: 31 additions & 0 deletions extractor/alignment_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include <gtest/gtest.h>

#include <memory>
#include <string>

#include "alignment.h"

using namespace std;
using namespace ::testing;

namespace {

class AlignmentTest : public Test {
protected:
virtual void SetUp() {
alignment = make_shared<Alignment>("sample_alignment.txt");
}

shared_ptr<Alignment> alignment;
};

// Checks the links GetLinks() reports for sentences 0 and 1 of the sample
// alignment file loaded by the fixture.
TEST_F(AlignmentTest, TestGetLinks) {
  const vector<pair<int, int> > first_sentence_links = {
    {0, 0}, {1, 1}, {2, 2}
  };
  EXPECT_EQ(first_sentence_links, alignment->GetLinks(0));

  const vector<pair<int, int> > second_sentence_links = {{1, 0}, {2, 1}};
  EXPECT_EQ(second_sentence_links, alignment->GetLinks(1));
}

} // namespace
6 changes: 4 additions & 2 deletions extractor/binary_search_merger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ BinarySearchMerger::~BinarySearchMerger() {}

void BinarySearchMerger::Merge(
vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
const vector<int>::iterator& prefix_start,
const vector<int>::iterator& prefix_end,
const vector<int>::iterator& suffix_start,
const vector<int>::iterator& suffix_end,
int prefix_subpatterns, int suffix_subpatterns) const {
if (IsIntersectionVoid(prefix_start, prefix_end, suffix_start, suffix_end,
prefix_subpatterns, suffix_subpatterns, suffix)) {
Expand Down
6 changes: 4 additions & 2 deletions extractor/binary_search_merger.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ class BinarySearchMerger {

virtual void Merge(
vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
const vector<int>::iterator& prefix_start,
const vector<int>::iterator& prefix_end,
const vector<int>::iterator& suffix_start,
const vector<int>::iterator& suffix_end,
int prefix_subpatterns, int suffix_subpatterns) const;

static double BAEZA_YATES_FACTOR;
Expand Down
19 changes: 14 additions & 5 deletions extractor/data_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
namespace fs = boost::filesystem;
using namespace std;

int DataArray::END_OF_FILE = 0;
int DataArray::NULL_WORD = 0;
int DataArray::END_OF_LINE = 1;
string DataArray::END_OF_FILE_STR = "__END_OF_FILE__";
string DataArray::NULL_WORD_STR = "__NULL__";
string DataArray::END_OF_LINE_STR = "__END_OF_LINE__";

DataArray::DataArray() {
Expand Down Expand Up @@ -47,9 +47,9 @@ DataArray::DataArray(const string& filename, const Side& side) {
}

// Seeds the vocabulary with the two reserved tokens.
//
// Insertion order matters: a word's id has to equal its index in `id2word`.
// NULL_WORD is defined as 0 and END_OF_LINE as 1, so the null word must be
// pushed first and the end-of-line marker second.
// NOTE(review): assumes `id2word` is empty when this is called -- confirm
// against the constructors that invoke it.
void DataArray::InitializeDataArray() {
  word2id[NULL_WORD_STR] = NULL_WORD;
  id2word.push_back(NULL_WORD_STR);
  word2id[END_OF_LINE_STR] = END_OF_LINE;
  id2word.push_back(END_OF_LINE_STR);
}

Expand Down Expand Up @@ -87,6 +87,10 @@ int DataArray::AtIndex(int index) const {
return data[index];
}

// Returns the word string stored at position `index` of the data array.
// Looks up the word id at that position and maps it through `id2word`.
// No bounds checking is performed on `index`.
string DataArray::GetWordAtIndex(int index) const {
  const int word_id = data[index];
  return id2word[word_id];
}

// Returns the number of entries held in `data`, converted to int.
int DataArray::GetSize() const {
  const int num_entries = data.size();
  return num_entries;
}
Expand All @@ -103,6 +107,11 @@ int DataArray::GetSentenceStart(int position) const {
return sentence_start[position];
}

// Returns the length of sentence `sentence_id`, computed as the distance
// between its start offset and the next sentence's start offset.
// NOTE(review): reads sentence_start[sentence_id + 1], so this assumes
// `sentence_start` holds an entry past the last sentence -- confirm.
int DataArray::GetSentenceLength(int sentence_id) const {
  const int current_start = sentence_start[sentence_id];
  const int next_start = sentence_start[sentence_id + 1];
  // Subtract one so the end of line marker is not counted.
  return next_start - current_start - 1;
}

// Returns the entry of the `sentence_id` member vector at `position`.
// No bounds checking is performed on `position`.
int DataArray::GetSentenceId(int position) const {
  const int id_at_position = sentence_id[position];
  return id_at_position;
}
Expand Down
16 changes: 10 additions & 6 deletions extractor/data_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
#define _DATA_ARRAY_H_

#include <string>
#include <tr1/unordered_map>
#include <unordered_map>
#include <vector>

#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;
using namespace std;
using namespace tr1;

enum Side {
SOURCE,
Expand All @@ -18,9 +17,9 @@ enum Side {

class DataArray {
public:
static int END_OF_FILE;
static int NULL_WORD;
static int END_OF_LINE;
static string END_OF_FILE_STR;
static string NULL_WORD_STR;
static string END_OF_LINE_STR;

DataArray(const string& filename);
Expand All @@ -33,6 +32,8 @@ class DataArray {

virtual int AtIndex(int index) const;

virtual string GetWordAtIndex(int index) const;

virtual int GetSize() const;

virtual int GetVocabularySize() const;
Expand All @@ -43,9 +44,12 @@ class DataArray {

virtual string GetWord(int word_id) const;

int GetNumSentences() const;
virtual int GetNumSentences() const;

virtual int GetSentenceStart(int position) const;

int GetSentenceStart(int position) const;
//TODO(pauldb): Add unit tests.
virtual int GetSentenceLength(int sentence_id) const;

virtual int GetSentenceId(int position) const;

Expand Down
32 changes: 32 additions & 0 deletions extractor/features/count_source_target_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include <gtest/gtest.h>

#include <memory>
#include <string>

#include "count_source_target.h"

using namespace std;
using namespace ::testing;

namespace {

class CountSourceTargetTest : public Test {
protected:
virtual void SetUp() {
feature = make_shared<CountSourceTarget>();
}

shared_ptr<CountSourceTarget> feature;
};

// The feature must report the name "CountEF".
TEST_F(CountSourceTargetTest, TestGetName) {
  const string expected_name = "CountEF";
  EXPECT_EQ(expected_name, feature->GetName());
}

// Scores a context built from two empty phrases and checks the result.
TEST_F(CountSourceTargetTest, TestScore) {
  Phrase phrase;
  // FeatureContext arguments, in order: source phrase, target phrase,
  // source_phrase_count = 0.5, pair_count = 9, num_samples = 13.
  FeatureContext context(phrase, phrase, 0.5, 9, 13);
  // Score() returns a double, so compare with a floating-point aware
  // assertion instead of exact equality.
  // NOTE(review): the expected 1.0 presumably comes from log10(1 + pair_count)
  // -- verify against count_source_target.cc.
  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
}

} // namespace
2 changes: 2 additions & 0 deletions extractor/features/feature.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "feature.h"

// Definition of the static constant declared in the Feature class.
// NOTE(review): presumably an upper bound that feature scores are capped
// at -- confirm against the feature implementations.
const double Feature::MAX_SCORE = 99.0;

// Out-of-line definition of the virtual destructor declared in feature.h.
Feature::~Feature() = default;
10 changes: 7 additions & 3 deletions extractor/features/feature.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@ using namespace std;

struct FeatureContext {
FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase,
double sample_source_count, int pair_count) :
double source_phrase_count, int pair_count, int num_samples) :
source_phrase(source_phrase), target_phrase(target_phrase),
sample_source_count(sample_source_count), pair_count(pair_count) {}
source_phrase_count(source_phrase_count), pair_count(pair_count),
num_samples(num_samples) {}

Phrase source_phrase;
Phrase target_phrase;
double sample_source_count;
double source_phrase_count;
int pair_count;
int num_samples;
};

class Feature {
Expand All @@ -26,6 +28,8 @@ class Feature {

virtual string GetName() const = 0;

virtual ~Feature();

static const double MAX_SCORE;
};

Expand Down
2 changes: 1 addition & 1 deletion extractor/features/is_source_singleton.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <cmath>

// Returns 1.0 when the context's source_phrase_count is exactly 1 and 0.0
// otherwise (the boolean comparison result is converted to double).
double IsSourceSingleton::Score(const FeatureContext& context) const {
  return context.source_phrase_count == 1;
}

string IsSourceSingleton::GetName() const {
Expand Down
Loading

0 comments on commit 9a026ba

Please sign in to comment.