Fix bug in num of docs in -1k generated corpus (elastic#1318)

This commit fixes a bug causing the generation of 1001 docs in the test-mode specific -1k corpus files created by the create-track subcommand (due to an of Off-by-one error). Closes elastic#1317
gizas · Sep 1, 2021 · db06c98 · db06c98
1 parent 1b56ef4
commit db06c98
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 9 deletions.
diff --git a/esrally/tracker/corpus.py b/esrally/tracker/corpus.py
@@ -81,7 +81,7 @@ def dump_documents(client, index, out_path, total_docs, progress_message_suffix=
             logger.info("Dumping corpus for index [%s] to [%s].", index, out_path)
             query = {"query": {"match_all": {}}}
             for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
-                if n > total_docs:
+                if n >= total_docs:
                     break
                 data = (json.dumps(doc["_source"], separators=(",", ":")) + "\n").encode("utf-8")
 

diff --git a/it/tracker_test.py b/it/tracker_test.py
@@ -39,10 +39,12 @@ def test_cluster():
 
 @it.rally_in_mem
 def test_create_track(cfg, tmp_path, test_cluster):
-    # prepare some data
+    # use 0.05% of geonames corpus to generate data. We need something small but >1000 docs to properly test
+    # the -1k corpus too.
     cmd = (
-        f"--test-mode --pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} "
-        f" --track=geonames --challenge=append-no-conflicts-index-only --quiet"
+        f"--pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track=geonames "
+        f'--challenge=append-no-conflicts-index-only --track-params="ingest_percentage:0.05" --on-error=abort '
+        f'--include-tasks="delete-index,create-index,check-cluster-health,index-append" --quiet'
     )
     assert it.race(cfg, cmd) == 0
 
@@ -59,19 +61,30 @@ def test_create_track(cfg, tmp_path, test_cluster):
         == 0
     )
 
+    base_generated_corpora = "geonames-documents"
     expected_files = [
         "track.json",
         "geonames.json",
-        "geonames-documents-1k.json",
-        "geonames-documents.json",
-        "geonames-documents-1k.json.bz2",
-        "geonames-documents.json.bz2",
+        f"{base_generated_corpora}-1k.json",
+        f"{base_generated_corpora}.json",
+        f"{base_generated_corpora}-1k.json.bz2",
+        f"{base_generated_corpora}.json.bz2",
     ]
 
     for f in expected_files:
         full_path = track_path / f
         assert full_path.exists(), f"Expected file to exist at path [{full_path}]"
 
-    # run a benchmark with the created track
+    with open(track_path / f"{base_generated_corpora}-1k.json", "rt") as f:
+        num_lines = sum(1 for line in f)
+    assert (
+        num_lines == 1000
+    ), f"Corpora [{base_generated_corpora}-1k.json] used by test-mode is [{num_lines}] lines but should be 1000 lines"
+
+    # run a benchmark in test mode with the created track
     cmd = f"--test-mode --pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track-path={track_path}"
     assert it.race(cfg, cmd) == 0
+
+    # and also run a normal (short) benchmark using the created track
+    cmd = f"--pipeline=benchmark-only --target-hosts=127.0.0.1:{test_cluster.http_port} --track-path={track_path}"
+    assert it.race(cfg, cmd) == 0