Skip to content

Commit

Permalink
More consistent filenames and settings parameters (ad-freiburg#612)
Browse files Browse the repository at this point in the history
  • Loading branch information
joka921 authored Feb 24, 2022
1 parent 25d5d64 commit a9f4961
Show file tree
Hide file tree
Showing 15 changed files with 47 additions and 162 deletions.
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,3 @@ target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(VocabularyConverterMain src/VocabularyConverterMain.cpp)
target_link_libraries(VocabularyConverterMain index ${CMAKE_THREAD_LIBS_INIT})
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"num-triples-per-batch" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":false
}
2 changes: 1 addition & 1 deletion examples/olympics.settings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"ascii-prefixes-only": true,
"num-triples-per-partial-vocab": 50000000
"num-triples-per-batch": 50000000
}
2 changes: 1 addition & 1 deletion examples/scientists.settings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"ascii-prefixes-only": true,
"num-triples-per-partial-vocab": 10000000
"num-triples-per-batch": 10000000
}
2 changes: 1 addition & 1 deletion examples/wikidata.settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@
"ignore-punctuation": true
},
"ascii-prefixes-only": true,
"num-triples-per-partial-vocab" : 50000000
"num-triples-per-batch" : 50000000
}
4 changes: 2 additions & 2 deletions master.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ index:
fi

remove_index:
rm -f $(DB).index.* $(DB).literals-index $(DB).vocabulary $(DB).prefixes $(DB).meta-data.json $(DB).index-log.txt
rm -f $(DB).index.* $(DB).vocabulary.* $(DB).prefixes $(DB).meta-data.json $(DB).index-log.txt

# Create wordsfile and docsfile from all literals of the given NT file.
# Using this as input for a SPARQL+Text index build will effectively enable
Expand Down Expand Up @@ -183,7 +183,7 @@ clear_unpinned:
# STATISTICS on cache, memory, and the number of triples per predicate.

disk_usage:
du -hc $(DB).index.* $(DB).literals-index $(DB).vocabulary $(DB).prefixes $(DB).meta-data.json
du -hc $(DB).index.* $(DB).vocabulary.* $(DB).prefixes $(DB).meta-data.json

cachestats:
@curl -Gs $(QLEVER_API) --data-urlencode "cmd=cachestats" \
Expand Down
127 changes: 0 additions & 127 deletions src/VocabularyConverterMain.cpp

This file was deleted.

2 changes: 1 addition & 1 deletion src/VocabularyMergerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ int main(int argc, char** argv) {
size_t numFiles = atoi(argv[2]);

VocabularyMerger m;
std::ofstream file(basename + ".vocabulary");
std::ofstream file(basename + INTERNAL_VOCAB_SUFFIX);
AD_CHECK(file.is_open());
auto internalVocabularyAction = [&file](const auto& word) {
file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
Expand Down
2 changes: 2 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ static const int DEFAULT_NOF_VALUE_EXPONENT_DIGITS = 20;
static const int DEFAULT_NOF_VALUE_MANTISSA_DIGITS = 30;
static const int DEFAULT_NOF_DATE_YEAR_DIGITS = 19;

static const std::string INTERNAL_VOCAB_SUFFIX = ".vocabulary.internal";
static const std::string EXTERNAL_VOCAB_SUFFIX = ".vocabulary.external";
static const std::string MMAP_FILE_SUFFIX = ".meta";
static const std::string CONFIGURATION_FILE = ".meta-data.json";
static const std::string PREFIX_FILE = ".prefixes";
Expand Down
5 changes: 3 additions & 2 deletions src/index/Index.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,9 @@ void Index::passContextFileIntoVector(const string& contextFile,
// initialized before
_vocab = RdfsVocabulary{};
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");
_vocab.readFromFile(
_onDiskBase + INTERNAL_VOCAB_SUFFIX,
_onDiskLiterals ? _onDiskBase + EXTERNAL_VOCAB_SUFFIX : "");

TextVec::bufwriter_type writer(vec);
ad_utility::HashMap<Id, Score> wordsInContext;
Expand Down
36 changes: 18 additions & 18 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ Index::Index() : _usePatterns(false) {}
template <class Parser>
IndexBuilderDataAsPsoSorter Index::createIdTriplesAndVocab(
const string& ntFile) {
auto vocabData =
passFileForVocabulary<Parser>(ntFile, _numTriplesPerPartialVocab);
auto indexBuilderData =
passFileForVocabulary<Parser>(ntFile, _numTriplesPerBatch);
// first save the total number of words, this is needed to initialize the
// dense IndexMetaData variants
_totalVocabularySize = vocabData.nofWords;
_totalVocabularySize = indexBuilderData.nofWords;
LOG(DEBUG) << "Number of words in internal and external vocabulary: "
<< _totalVocabularySize << std::endl;

Expand All @@ -47,18 +47,18 @@ IndexBuilderDataAsPsoSorter Index::createIdTriplesAndVocab(
if (_onDiskLiterals) {
_vocab.externalizeLiteralsFromTextFile(
_onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME,
_onDiskBase + ".literals-index");
_onDiskBase + EXTERNAL_VOCAB_SUFFIX);
}
deleteTemporaryFile(_onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME);
// clear vocabulary to save ram (only information from partial binary files
// used from now on). This will preserve information about externalized
// Prefixes etc.
_vocab.clear();
auto psoSorter = convertPartialToGlobalIds(*vocabData.idTriples,
vocabData.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);
auto psoSorter = convertPartialToGlobalIds(
*indexBuilderData.idTriples, indexBuilderData.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);

return {vocabData, std::move(psoSorter)};
return {indexBuilderData, std::move(psoSorter)};
}
namespace {
// Return a lambda that takes a triple of IDs and returns true iff the predicate
Expand Down Expand Up @@ -117,14 +117,14 @@ void Index::createFromFile(const string& filename) {
// If we have no compression, this will also copy the whole vocabulary.
// but since we expect compression to be the default case, this should not
// hurt.
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFile = _onDiskBase + INTERNAL_VOCAB_SUFFIX;
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
if (_vocabPrefixCompressed) {
// We have to use the "normally" sorted vocabulary for the prefix
// compression.
std::string vocabFileForPrefixCalculation =
_onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
_onDiskBase + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX;
prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
NUM_COMPRESSION_PREFIXES, 1, true);
deleteTemporaryFile(vocabFileForPrefixCalculation);
Expand Down Expand Up @@ -346,7 +346,7 @@ IndexBuilderDataAsStxxlVector Index::passFileForVocabulary(
<< "(internal only) ..." << std::endl;
VocabularyMerger m;
std::ofstream compressionOutfile(_onDiskBase + TMP_BASENAME_COMPRESSION +
".vocabulary");
INTERNAL_VOCAB_SUFFIX);
AD_CHECK(compressionOutfile.is_open());
auto internalVocabularyActionCompression =
[&compressionOutfile](const auto& word) {
Expand All @@ -371,7 +371,7 @@ IndexBuilderDataAsStxxlVector Index::passFileForVocabulary(
return (*cmp)(a, b, decltype(_vocab)::SortLevel::TOTAL);
};
auto wordWriter =
_vocab.makeUncompressingWordWriter(_onDiskBase + ".vocabulary");
_vocab.makeUncompressingWordWriter(_onDiskBase + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyAction = [&wordWriter](const auto& word) {
wordWriter.push(word.data(), word.size());
};
Expand Down Expand Up @@ -649,8 +649,9 @@ void Index::addPatternsToExistingIndex() {
void Index::createFromOnDiskIndex(const string& onDiskBase) {
setOnDiskBase(onDiskBase);
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");
_vocab.readFromFile(
_onDiskBase + INTERNAL_VOCAB_SUFFIX,
_onDiskLiterals ? _onDiskBase + EXTERNAL_VOCAB_SUFFIX : "");

_totalVocabularySize = _vocab.size() + _vocab.getExternalVocab().size();
LOG(DEBUG) << "Number of words in internal and external vocabulary: "
Expand Down Expand Up @@ -1043,11 +1044,10 @@ void Index::initializeVocabularySettingsBuild() {
}
}

if (j.count("num-triples-per-partial-vocab")) {
_numTriplesPerPartialVocab = size_t{j["num-triples-per-partial-vocab"]};
if (j.count("num-triples-per-batch")) {
_numTriplesPerBatch = size_t{j["num-triples-per-batch"]};
LOG(INFO)
<< "You specified \"num-triples-per-partial-vocab = "
<< _numTriplesPerPartialVocab
<< "You specified \"num-triples-per-batch = " << _numTriplesPerBatch
<< "\", choose a lower value if the index builder runs out of memory"
<< std::endl;
}
Expand Down
6 changes: 5 additions & 1 deletion src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,10 @@ class Index {

void setPrefixCompression(bool compressed);

// Override the number of triples per batch for the index build. The value is
// stored in `_numTriplesPerBatch` (default: `NUM_TRIPLES_PER_PARTIAL_VOCAB`)
// and is handed to `passFileForVocabulary` by `createIdTriplesAndVocab`. The
// same member is also set from the "num-triples-per-batch" key of the settings
// JSON in `initializeVocabularySettingsBuild`.
void setNumTriplesPerBatch(uint64_t numTriplesPerBatch) {
_numTriplesPerBatch = numTriplesPerBatch;
}

const string& getTextName() const { return _textMeta.getName(); }

const string& getKbName() const { return _PSO.metaData().getName(); }
Expand Down Expand Up @@ -477,7 +481,7 @@ class Index {
size_t _fullHasPredicateSize;

size_t _parserBatchSize = PARSER_BATCH_SIZE;
size_t _numTriplesPerPartialVocab = NUM_TRIPLES_PER_PARTIAL_VOCAB;
size_t _numTriplesPerBatch = NUM_TRIPLES_PER_PARTIAL_VOCAB;
/**
* @brief Maps pattern ids to sets of predicate ids.
*/
Expand Down
1 change: 1 addition & 0 deletions test/GroupByTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class GroupByTest : public ::testing::Test {
_index.setKbName("group_by_test");
_index.setTextName("group_by_test");
_index.setOnDiskBase("group_ty_test");
_index.setNumTriplesPerBatch(2);
_index.createFromFile<TurtleParserAuto>("group_by_test.nt");
_index.addTextFromContextFile("group_by_test.words");
_index.buildDocsDB("group_by_test.documents");
Expand Down
6 changes: 6 additions & 0 deletions test/IndexTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ TEST(IndexTest, createFromTsvTest) {
{
Index index;
index.setOnDiskBase("_testindex");
index.setNumTriplesPerBatch(2);
index.createFromFile<TsvParser>("_testtmp2.tsv");
}
Index index;
Expand Down Expand Up @@ -145,6 +146,7 @@ TEST(IndexTest, createFromTsvTest) {
{
Index index;
index.setOnDiskBase("_testindex");
index.setNumTriplesPerBatch(2);
index.createFromFile<TsvParser>("_testtmp2.tsv");
}
Index index;
Expand Down Expand Up @@ -243,6 +245,7 @@ TEST_F(CreatePatternsFixture, createPatterns) {
Index index;
index.setUsePatterns(true);
index.setOnDiskBase("_testindex");
index.setNumTriplesPerBatch(2);
index.createFromFile<TsvParser>("_testtmppatterns.tsv");
}
Index index;
Expand Down Expand Up @@ -307,6 +310,7 @@ TEST(IndexTest, createFromOnDiskIndexTest) {
{
Index indexPrim;
indexPrim.setOnDiskBase("_testindex2");
indexPrim.setNumTriplesPerBatch(2);
indexPrim.createFromFile<TsvParser>("_testtmp3.tsv");
}

Expand Down Expand Up @@ -357,6 +361,7 @@ TEST(IndexTest, scanTest) {
{
Index index;
index.setOnDiskBase("_testindex");
index.setNumTriplesPerBatch(2);
index.createFromFile<TsvParser>("_testtmp2.tsv");
}

Expand Down Expand Up @@ -442,6 +447,7 @@ TEST(IndexTest, scanTest) {
{
Index index;
index.setOnDiskBase("_testindex");
index.setNumTriplesPerBatch(2);
index.createFromFile<TsvParser>("_testtmp2.tsv");
}
Index index;
Expand Down
Loading

0 comments on commit a9f4961

Please sign in to comment.