Skip to content

Commit

Permalink
Fix bug in num of docs in -1k generated corpus (elastic#1318)
Browse files Browse the repository at this point in the history
This commit fixes a bug causing the generation of 1001 docs in the test-mode-specific -1k corpus files
created by the create-track subcommand (due to an off-by-one error).

Closes elastic#1317
  • Loading branch information
dliappis authored Sep 1, 2021
1 parent 1b56ef4 commit db06c98
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 9 deletions.
2 changes: 1 addition & 1 deletion esrally/tracker/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def dump_documents(client, index, out_path, total_docs, progress_message_suffix=
logger.info("Dumping corpus for index [%s] to [%s].", index, out_path)
query = {"query": {"match_all": {}}}
for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
if n > total_docs:
if n >= total_docs:
break
data = (json.dumps(doc["_source"], separators=(",", ":")) + "\n").encode("utf-8")

Expand Down
29 changes: 21 additions & 8 deletions it/tracker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ def test_cluster():

@it.rally_in_mem
def test_create_track(cfg, tmp_path, test_cluster):
# prepare some data
# use 0.05% of geonames corpus to generate data. We need something small but >1000 docs to properly test
# the -1k corpus too.
cmd = (
f"--test-mode --pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} "
f" --track=geonames --challenge=append-no-conflicts-index-only --quiet"
f"--pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track=geonames "
f'--challenge=append-no-conflicts-index-only --track-params="ingest_percentage:0.05" --on-error=abort '
f'--include-tasks="delete-index,create-index,check-cluster-health,index-append" --quiet'
)
assert it.race(cfg, cmd) == 0

Expand All @@ -59,19 +61,30 @@ def test_create_track(cfg, tmp_path, test_cluster):
== 0
)

base_generated_corpora = "geonames-documents"
expected_files = [
"track.json",
"geonames.json",
"geonames-documents-1k.json",
"geonames-documents.json",
"geonames-documents-1k.json.bz2",
"geonames-documents.json.bz2",
f"{base_generated_corpora}-1k.json",
f"{base_generated_corpora}.json",
f"{base_generated_corpora}-1k.json.bz2",
f"{base_generated_corpora}.json.bz2",
]

for f in expected_files:
full_path = track_path / f
assert full_path.exists(), f"Expected file to exist at path [{full_path}]"

# run a benchmark with the created track
with open(track_path / f"{base_generated_corpora}-1k.json", "rt") as f:
num_lines = sum(1 for line in f)
assert (
num_lines == 1000
), f"Corpora [{base_generated_corpora}-1k.json] used by test-mode is [{num_lines}] lines but should be 1000 lines"

# run a benchmark in test mode with the created track
cmd = f"--test-mode --pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track-path={track_path}"
assert it.race(cfg, cmd) == 0

# and also run a normal (short) benchmark using the created track
cmd = f"--pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track-path={track_path}"
assert it.race(cfg, cmd) == 0

0 comments on commit db06c98

Please sign in to comment.