Commit

update xgrammar testing scripts

mmoskal committed Dec 20, 2024
1 parent 69263ec commit 3f60753
Showing 4 changed files with 195 additions and 72 deletions.
5 changes: 5 additions & 0 deletions json_stats/scripts/kill_xgr.sh
@@ -0,0 +1,5 @@
#!/bin/sh

kill `ps fax|grep xgr_test | awk '{print $1}'`
kill -9 `ps fax|grep xgr_test | awk '{print $1}'`
ps fax|grep xgr_test
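
A caveat with this pattern: ps | grep can match the grep process itself, so one of the kill invocations may target an already-gone PID (harmless, but noisy). For illustration only, not part of the commit, a stdlib Python equivalent that skips its own process:

#!/usr/bin/env python3
# Hypothetical Python rendering of kill_xgr.sh: SIGTERM, then SIGKILL,
# for every process whose command line mentions "xgr_test".
import os
import signal
import subprocess

def pids_matching(pattern: str) -> list[int]:
    # "pid=,args=" suppresses the header row (POSIX ps).
    out = subprocess.check_output(["ps", "ax", "-o", "pid=,args="], text=True)
    pids = []
    for line in out.splitlines():
        pid, _, cmd = line.strip().partition(" ")
        if pattern in cmd and int(pid) != os.getpid():
            pids.append(int(pid))
    return pids

for sig in (signal.SIGTERM, signal.SIGKILL):
    for pid in pids_matching("xgr_test"):
        try:
            os.kill(pid, sig)
        except ProcessLookupError:
            pass  # exited between listing and kill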
75 changes: 75 additions & 0 deletions json_stats/scripts/xgr/xgr_combine.py
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import json
import glob

output_path = "tmp/xgr/"


class Stats:
def __init__(self) -> None:
self.ttfm_us = 0
self.max_ttfm_us = 0
self.masks_us = 0
self.max_mask_us = 0
self.num_tokens = 0
self.num_schemas = 0
self.num_schemas_ok = 0
self.num_compilation_errors = 0
self.num_validation_errors = 0
self.num_tests = 0
self.num_valid_tests = 0
self.num_invalid_tests = 0

def log_fraction_plot(times: list[int]):
    # Survival curve with geometrically spaced cutoffs: for each sample above
    # the current cutoff, emit "cutoff (ms), fraction of samples still above",
    # then grow the cutoff by ~1.3x.
    times.sort()
cutoff = 1
mult = 1.3
count = 0
csv = "cutoff time,count left\n"
total = len(times)
for t in times:
if t > cutoff:
csv += f"{cutoff/1000.0},{(total - count)/total}\n"
cutoff = int(cutoff * mult) + 1
count += 1
return csv

def main():
files = glob.glob(output_path + "*.json")
files = sorted(files)
stats = Stats()
ttfm_us = []
all_masks_us = []
    for fn in files:
        with open(fn) as f:
            data = json.load(f)
if "num_tests" not in data:
continue
stats.num_schemas += 1
stats.num_tests += data["num_tests"]
if "compile_error" in data:
stats.num_compilation_errors += 1
else:
stats.ttfm_us += data["ttfm_us"]
ttfm_us.append(data["ttfm_us"])
stats.max_ttfm_us = max(data["max_ttfm_us"], stats.max_ttfm_us)
stats.masks_us += data["masks_us"]
stats.max_mask_us = max(data["max_mask_us"], stats.max_mask_us)
stats.num_tokens += data["num_tokens"]
if "validation_error" in data:
stats.num_validation_errors += 1
else:
stats.num_schemas_ok += 1
stats.num_valid_tests += data["num_valid_tests"]
stats.num_invalid_tests += data["num_invalid_tests"]
all_masks_us.extend(data["all_mask_us"])
print(json.dumps(stats.__dict__, indent=2))
with open("tmp/xgr_ttfm_us.csv", "w") as f:
f.write(log_fraction_plot(ttfm_us))
with open("tmp/xgr_masks_us.csv", "w") as f:
f.write(log_fraction_plot(all_masks_us))



main()
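
To make the CSV semantics concrete: log_fraction_plot emits a survival curve, one row per sample that exceeds the current cutoff, with the cutoff (microseconds internally, milliseconds in the first column) growing by roughly 1.3x per row and the second column giving the fraction of samples still above it. A quick sanity check, assuming the function has been pasted into a REPL (importing the module directly would run main() and expect tmp/xgr/ to exist):

samples = [50, 100, 200, 400, 800]  # synthetic latencies in microseconds
print(log_fraction_plot(samples))
# cutoff time,count left
# 0.001,1.0      <- all 5 samples exceed a 1 us cutoff
# 0.002,0.8      <- 4 of 5 samples exceed 2 us
# ... and so on until the cutoff overtakes the largest sample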
22 changes: 8 additions & 14 deletions json_stats/scripts/xgr/xgr_multi.py
@@ -51,15 +51,15 @@ def process_file(files: List[str]):
             "message": str(e)
         }

-files = []
+file_name = []
 for arg in sys.argv[1:]:
     if arg.endswith(".json"):
-        files.append(arg)
+        file_name.append(arg)
     else:
-        files.extend(glob.glob(arg + "/*.json"))
-print(len(files), file=sys.stderr)
+        file_name.extend(glob.glob(arg + "/*.json"))
+print(len(file_name), file=sys.stderr)
 missing_files = []
-for f in files:
+for f in file_name:
     file_base = f.split("/")[-1]
     output_name = f"{output_base}/{file_base}"
     if not os.path.exists(output_name):
@@ -82,16 +82,10 @@ def process_file(files: List[str]):
 with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
     futures = {executor.submit(process_file, f): f for f in chunks}
     for future in concurrent.futures.as_completed(futures):
-        files = futures[future]
+        file_name = futures[future]
         try:
             r = future.result()
-            cnt += len(files)
-            print(cnt)
-            rs = json.dumps(r)
-            with open(log_file, "a") as f:
-                f.write(f"FILES: {files}\n{rs}\n")
-            # print(f"OK: {files}")
+            print(file_name)
         except Exception as e:
-            with open(log_file, "a") as f:
-                f.write(f"ERROR {files}: {repr(e)}")
-            print(f"ERROR: {files}", repr(e))
+            print(f"ERROR: {file_name}", repr(e))
165 changes: 107 additions & 58 deletions json_stats/scripts/xgr/xgr_test.py
@@ -6,87 +6,136 @@
 import os
 import random
 import time
+import resource

 import xgrammar as xgr
 import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoConfig

-positive_base = os.environ.get("HOME") + "/src/json-data/positive"
-output_base = os.environ.get("HOME") + "/src/json-data/xgr_output"
+output_path = "tmp/xgr/"


-def do_process(file: str):
+def time_us(prev: float) -> int:
+    return int((time.monotonic() - prev) * 1000000)
+
+
+def process_file(file: str):
+    id = os.path.basename(file)
+    output_name = output_path + id
+    if os.path.exists(output_name):
+        return None
+
+    with open(output_name, "w") as f:
+        f.write(json.dumps({ "pending_file": 1 }, indent=2))
+
     with open(file) as f:
         pos_data = json.loads(f.read())

     schema = json.dumps(pos_data["schema"])
-    instance = json.dumps(pos_data["tests"][0]["data"], indent=4)

     token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
-    tokens = tokenizer.encode(instance, add_special_tokens=False)

+    all_mask_us = []
     status = {
         "file": file,
-        "ok": False,
-        "num_tokens": len(tokens),
-        "accepted_tokens": 0,
+        "id": id,
+        "ttfm_us": 0,
+        "max_ttfm_us": 0,
+        "masks_us": 0,
+        "max_mask_us": 0,
+        "num_tokens": 0,
+        "num_tests": len(pos_data["tests"]),
+        "all_mask_us": all_mask_us,
+        "num_valid_tests": 0,
+        "num_invalid_tests": 0,
     }

     try:
         t0 = time.monotonic()
-        compiled_grammar = compiler.compile_json_schema(schema, indent=4)
+        compiled_grammar = compiler.compile_json_schema(
+            schema, any_whitespace=True, strict_mode=False
+        )
         matcher = xgr.GrammarMatcher(compiled_grammar)
     except Exception as e:
         status["compile_error"] = repr(e)
+        with open(output_name, "w") as f:
+            f.write(json.dumps(status, indent=2))
         return status

-    status["compile_time"] = int((time.monotonic() - t0) * 1000)
+    status["ttfm_us"] = time_us(t0)
+    status["max_ttfm_us"] = status["ttfm_us"]
+
+    masks_us = 0
+    max_mask_us = 0
+    num_tokens = 0
+
+    for i, test in enumerate(pos_data["tests"]):
+        instance = json.dumps(test["data"], indent=None)
+        tokens = tokenizer.encode(instance, add_special_tokens=False)
+
+        t1 = time.monotonic()
+        accepted = True
+        for tidx, t in enumerate(tokens):
+            t2 = time.monotonic()
+            matcher.fill_next_token_bitmask(token_bitmask)
+            ok = matcher.accept_token(t)
+            mask_time = time_us(t2)
+            num_tokens += 1
+            masks_us += mask_time
+            all_mask_us.append(mask_time)
+            if mask_time > max_mask_us:
+                max_mask_us = mask_time
+            if not ok:
+                accepted = False
+                break
+
+        if accepted and not test["valid"]:
+            status["validation_error"] = f"test #{i}: should reject but didn't"
+        elif not accepted and test["valid"]:
+            status["validation_error"] = f"test #{i}: should accept but didn't"
+        else:
+            if test["valid"]:
+                status["num_valid_tests"] += 1
+            else:
+                status["num_invalid_tests"] += 1
+
+    status["masks_us"] = masks_us
+    status["max_mask_us"] = max_mask_us
+    status["num_tokens"] = num_tokens

-    t1 = time.monotonic()
-    for i, t in enumerate(tokens):
-        matcher.fill_next_token_bitmask(token_bitmask)
-        ok = matcher.accept_token(t)
-        if not ok:
-            break
-        status["accepted_tokens"] = i + 1
-
-    status["ok"] = status["accepted_tokens"] == len(tokens)
     with open(output_name, "w") as f:
         f.write(json.dumps(status, indent=2))
     return status


-def process_file(file: str):
-    file_base = file.split("/")[-1]
-    output_name = f"{output_base}/{file_base}"
-    if os.path.exists(output_name):
-        return
-
-    print("PROCESSING: " + file, file=sys.stderr)
-    status = do_process(file)
-    print("RESULT: " + json.dumps(status), file=sys.stderr)
-    with open(output_name, "w") as f:
-        f.write(json.dumps(status, indent=4))
-
-
-# Get tokenizer info
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-config = AutoConfig.from_pretrained(model_id)
-# This can be larger than tokenizer.vocab_size due to paddings
-full_vocab_size = config.vocab_size
-tokenizer_info = xgr.TokenizerInfo.from_huggingface(
-    tokenizer, vocab_size=full_vocab_size
-)
-compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=1)
-
-files = []
-for arg in sys.argv[1:]:
-    if arg.endswith(".json"):
-        files.append(arg)
-    else:
-        files.extend(glob.glob(arg + "/*.json"))
-print(len(files), file=sys.stderr)
-random.shuffle(files)
-
-for f in files:
-    process_file(f)
+def main():
+    global tokenizer_info, compiler, tokenizer
+
+    limit_gb = 32
+    limit_bytes = limit_gb * 1024 * 1024 * 1024
+    resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
+
+    # Get tokenizer info
+    model_id = "meta-llama/Llama-3.1-8B-Instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    config = AutoConfig.from_pretrained(model_id)
+    # This can be larger than tokenizer.vocab_size due to paddings
+    full_vocab_size = config.vocab_size
+    tokenizer_info = xgr.TokenizerInfo.from_huggingface(
+        tokenizer, vocab_size=full_vocab_size
+    )
+    compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=1)
+
+    files = []
+    for arg in sys.argv[1:]:
+        if arg.endswith(".json"):
+            files.append(arg)
+        else:
+            files.extend(glob.glob(arg + "/*.json"))
+    print(len(files), file=sys.stderr)
+    random.shuffle(files)
+
+    os.makedirs(output_path, exist_ok=True)
+
+    for f in files:
+        print(f, file=sys.stderr)
+        process_file(f)
+
+main()
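
The other notable addition is the resource.setrlimit call: capping RLIMIT_AS means a schema whose grammar compilation exhausts memory raises MemoryError inside the worker, where it is caught and recorded as a compile_error, rather than having the kernel OOM killer take out the whole run. A standalone POSIX-only sketch of the pattern (the 32 GB figure mirrors the script; tune it per machine):

import resource

def cap_address_space(limit_gb: int) -> None:
    # RLIMIT_AS bounds total virtual memory for this process; allocations
    # beyond the cap raise MemoryError instead of triggering the OOM killer.
    limit_bytes = limit_gb * 1024 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))

cap_address_space(32)
try:
    buf = bytearray(64 * 1024 * 1024 * 1024)  # deliberately above the cap
except MemoryError as e:
    print("caught:", repr(e))  # the process survives to report the failure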
