# evaluation.py (forked from NVlabs/verilog-eval)
from collections import defaultdict, Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List, Union, Iterable, Dict, Tuple, Optional
import itertools
import numpy as np
import tqdm
from verilog_eval.data import read_problems, stream_jsonl, write_jsonl
from verilog_eval.execution import check_correctness, clean_up_simulation


def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
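# Illustrative check (not part of the original library, just a worked example):
# with n = 10 samples per problem and c correct samples, the unbiased pass@1
# estimate reduces to c / n, so two problems with c = [2, 0] give:
#
#     estimate_pass_at_k(10, [2, 0], 1)  # -> array([0.2, 0.0])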
def contain_passing_completion(
    problem: Dict,
    completions: List[str],
    n_workers: int = 4,
    timeout: float = 30.0,
    unit_test_length: Optional[int] = None,
    clean_up: bool = True,
) -> Tuple[bool, str]:
    """
    Checks the given completions for a single problem in parallel and returns
    (True, completion) for the first one that passes its test suite, or
    (False, "") if none of them pass.
    """
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        for idx, completion in enumerate(completions):
            args = (problem, completion, timeout, idx, unit_test_length)
            future = executor.submit(check_correctness, *args)
            futures.append(future)

        for future in as_completed(futures):
            result = future.result()
            if result["passed"]:
                return True, completions[result["completion_id"]]

    # Note: simulation artifacts are cleaned up only when no completion
    # passes, because a passing completion returns early above.
    if clean_up:
        clean_up_simulation()

    return False, ""
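# Usage sketch (illustrative only; `problem` is assumed to be one entry from
# read_problems() and `candidates` a list of generated completion strings):
#
#     passed, code = contain_passing_completion(problem, candidates, timeout=30.0)
#     if passed:
#         print("Found a passing completion:")
#         print(code)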
def evaluate_functional_correctness(
    sample_file: str,
    problem_file: str,
    k: List[int] = [1, 10, 100],
    n_workers: int = 4,
    timeout: float = 30.0,
    unit_test: bool = False,
    clean_up: bool = True,
):
    """
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl"
    """
    problems = read_problems(problem_file)

    # Check the generated samples against test suites.
    with ProcessPoolExecutor(max_workers=n_workers) as executor:

        futures = []
        completion_id = Counter()
        n_samples = 0
        results = defaultdict(list)

        print("Reading samples...")
        for sample in tqdm.tqdm(stream_jsonl(sample_file)):
            task_id = sample["task_id"]
            completion = sample["completion"]
            if unit_test:
                # Cap the unit test length at 100 (the fifth positional
                # argument of check_correctness, i.e. unit_test_length).
                args = (problems[task_id], completion, timeout, completion_id[task_id], 100)
            else:
                args = (problems[task_id], completion, timeout, completion_id[task_id])
            future = executor.submit(check_correctness, *args)
            futures.append(future)
            completion_id[task_id] += 1
            n_samples += 1

        assert len(completion_id) == len(problems), "Some problems are not attempted."

        print("Running test suites...")
        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))

    if clean_up:
        # Remove temporary simulation artifacts left behind by the test runs.
        clean_up_simulation()
    # Calculate pass@k.
    total, correct = [], []
    for result in results.values():
        result.sort()
        passed = [r[1]["passed"] for r in result]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    # Only report pass@k for values of k that every problem has enough samples for.
    ks = k
    pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
                 for k in ks if (total >= k).all()}
    # Finally, save the results in one file:
    def combine_results():
        for sample in stream_jsonl(sample_file):
            task_id = sample["task_id"]
            result = results[task_id].pop(0)
            sample["result"] = result[1]["result"]
            sample["passed"] = result[1]["passed"]
            yield sample

    out_file = sample_file + "_results.jsonl"
    print(f"Writing results to {out_file}...")
    write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))

    return pass_at_k
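# Usage sketch (illustrative only; the file names below are placeholders rather
# than paths shipped with this repository, and the printed numbers are made up):
#
#     pass_at_k = evaluate_functional_correctness(
#         sample_file="samples.jsonl",
#         problem_file="problems.jsonl",
#         k=[1, 5, 10],
#     )
#     print(pass_at_k)  # e.g. {'pass@1': 0.31, 'pass@5': 0.52, 'pass@10': 0.60}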