# ------------------------------------------------------------------------------------------------
# License
# ------------------------------------------------------------------------------------------------
# Copyright (c) 2024 LSeu-Open
#
# This code is licensed under the MIT License.
# See LICENSE-CODE file in the root directory
# ------------------------------------------------------------------------------------------------
# Description
# ------------------------------------------------------------------------------------------------
# This script is used to score large language models based on the following criteria:
# - Entity benchmarks
# - Dev benchmarks
# - Community score
# - Technical specifications
# This file contains the ModelScorer class; to run the scoring system, use the Run_Scoring.py file
# This is Alpha v0.3 of the scoring system
# ------------------------------------------------------------------------------------------------
# ModelScorer class
# ------------------------------------------------------------------------------------------------
class ModelScorer:
    """
    A class for scoring and evaluating large language models based on multiple criteria.

    This class implements a comprehensive scoring system that evaluates models on:
    - Entity benchmarks (25 points max)
    - Dev benchmarks (35 points max)
    - Community engagement (20 points max)
    - Technical specifications (20 points max)

    The final score is calculated out of 100 points total.

    Attributes:
        model_name (str): Name of the model being scored. Defaults to "Unnamed Model".
        external_score (float): Combined score from entity and dev benchmarks (set externally).
        community_score (float): Score based on community engagement (set externally).
        technical_score (float): Score based on technical specs (set externally).
    """

    def __init__(self, model_name="Unnamed Model"):
        """
        Initialize a ModelScorer instance.

        Args:
            model_name (str, optional): Name of the model to score. Defaults to "Unnamed Model".
        """
        self.model_name = model_name

    def calculate_entity_benchmarks(self, benchmark_scores):
        """
        Calculate entity benchmarks score out of 25 points maximum.

        Evaluates performance on core entity benchmarks: artificial analysis,
        live code bench, big code models and open LLM evaluations.

        Args:
            benchmark_scores (dict): Dictionary mapping benchmark names to scores (0-1 range)

        Returns:
            float: Weighted score out of 25 points
        """
        if not benchmark_scores:
            return 0

        # Define relative weights for each benchmark
        weights = {
            'artificial_analysis': 25,
            'live_code_bench': 25,
            'big_code_models': 25,
            'open_llm': 25,
        }

        score = 0
        available_weights = 0

        # Calculate weighted average of available scores; skip unknown benchmark
        # names instead of raising a KeyError
        for bench, result in benchmark_scores.items():
            if result is not None and bench in weights:
                score += result * weights[bench]
                available_weights += weights[bench]

        # Scale to 25 points maximum if we have scores
        if available_weights > 0:
            return (score / available_weights) * 25
        return 0
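
    # Example (hypothetical values): with only two of the four benchmarks reported,
    # e.g. {'artificial_analysis': 0.8, 'open_llm': 0.6}, the available weight is 50,
    # so the score is ((0.8 * 25 + 0.6 * 25) / 50) * 25 = 17.5 out of 25.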

    def calculate_dev_benchmarks(self, benchmark_scores):
        """
        Calculate dev benchmarks score out of 35 points maximum.

        Evaluates performance across a wide range of development benchmarks including:
        - Language understanding (MMLU, BigBench)
        - Reasoning (DROP, HellaSwag)
        - Math & coding (MATH, HumanEval)
        - And many others

        Args:
            benchmark_scores (dict): Dictionary mapping benchmark names to scores (0-1 range)

        Returns:
            float: Weighted score out of 35 points
        """
        if not benchmark_scores:
            return 0

        # Define relative weights for each benchmark based on importance
        weights = {
            'MMLU': 3.0,
            'MMLU Pro': 8.0,
            'BigBench': 3.0,
            'DROP': 7.0,
            'HellaSwag': 7.0,
            'GPQA': 2.0,
            'ARC-C': 2.0,
            'LiveBench': 1.5,
            'LatestEval': 1.5,
            'AlignBench': 4.0,
            'Wild Bench': 4.0,
            'MT-bench': 4.0,
            'IFEval': 4.0,
            'Arena-Hard': 4.5,
            'TruthfulQA': 4.5,
            'MATH': 4.0,
            'GSM-8K': 4.0,
            'MGSM': 7.0,
            'HumanEval': 3.0,
            'HumanEval Plus': 3.0,
            'MBPP': 3.0,
            'MBPP Plus': 3.0,
            'SWE-bench': 2.0,
            'API-Bank': 2.0,
            'BFCL': 5.0,
            'Gorilla Benchmark': 2.0,
            'Nexus': 2.0
        }

        score = 0
        available_weights = 0

        # Calculate weighted average of available scores; skip unknown benchmark names
        for bench, result in benchmark_scores.items():
            if result is not None and bench in weights:
                score += result * weights[bench]
                available_weights += weights[bench]

        # Scale to 35 points maximum if we have scores
        if available_weights > 0:
            return (score / available_weights) * 35
        return 0
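
    # Example (hypothetical values): {'MMLU': 0.75, 'MATH': 0.55} gives
    # ((0.75 * 3.0 + 0.55 * 4.0) / 7.0) * 35 = 22.25 out of 35; benchmarks
    # missing from the weights table are simply skipped.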

    def calculate_external_benchmarks(self, entity_benchmarks, dev_benchmarks=None):
        """
        Calculate total external benchmarks score out of 60 points maximum.

        Combines entity benchmarks (25 points) and dev benchmarks (35 points).

        Args:
            entity_benchmarks (dict): Entity benchmark scores
            dev_benchmarks (dict, optional): Dev benchmark scores. If None, uses entity_benchmarks

        Returns:
            float: Combined external benchmark score out of 60 points
        """
        if dev_benchmarks is None:
            dev_benchmarks = entity_benchmarks  # For backward compatibility

        entity_score = self.calculate_entity_benchmarks(entity_benchmarks)
        dev_score = self.calculate_dev_benchmarks(dev_benchmarks)
        return entity_score + dev_score
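
    # Example (continuing the hypothetical values above): 17.5 entity points
    # plus 22.25 dev points yield an external score of 39.75 out of 60.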

    def calculate_community_score(self, elo_rating):
        """
        Calculate community engagement score out of 20 points maximum.

        Converts an ELO rating to a normalized score between 0-20 points.

        Args:
            elo_rating (float): Model's ELO rating from community evaluations

        Returns:
            float: Community score out of 20 points, or None if no rating provided
        """
        if elo_rating is None:
            return None

        base_elo = 1000  # Minimum expected ELO
        max_elo = 1402   # Maximum expected ELO

        # Normalize to a percentage, then scale to 20 points.
        # Note: ratings outside [base_elo, max_elo] are not clamped.
        normalized_score = ((elo_rating - base_elo) / (max_elo - base_elo)) * 100
        return normalized_score * 0.2  # 20 points max
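
    # Example: an ELO of 1201 gives ((1201 - 1000) / 402) * 100 = 50.0,
    # scaled to 50.0 * 0.2 = 10.0 community points out of 20.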

    def _calculate_price_score(self, price):
        """
        Calculate score based on model's price point (8 points max).

        Args:
            price (float): Price per million tokens in USD

        Returns:
            int: Score from 1-8 based on price brackets, or 0 if price is None
        """
        if price is None:
            return 0
        if price < 1: return 8
        elif price < 3: return 7
        elif price < 5: return 6
        elif price < 10: return 5
        elif price < 20: return 4
        elif price < 40: return 3
        elif price < 80: return 2
        else: return 1
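
    # Example: a price of $2.50 per million tokens falls in the $1-3 bracket
    # and scores 7 of the 8 available points.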

    def _calculate_context_score(self, context_size):
        """
        Calculate score based on context window size (6 points max).

        Args:
            context_size (int): Maximum context window size in tokens

        Returns:
            int: Score from 1-6 based on context size brackets, or 0 if context_size is None
        """
        if context_size is None:
            return 0
        if context_size > 200000: return 6
        elif context_size > 100000: return 5
        elif context_size > 32000: return 4
        elif context_size > 16000: return 3
        elif context_size > 8000: return 2
        else: return 1
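
    # Example: a 128,000-token context window is above 100,000 but not above
    # 200,000, so it scores 5 of the 6 available points.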

    def calculate_size_perf_ratio(self, benchmark_score, param_count):
        """
        Calculate performance-to-size efficiency ratio score (6 points max).

        Evaluates how well the model performs relative to its parameter count.
        Higher scores indicate better efficiency.

        Args:
            benchmark_score (float): Average benchmark performance (0-100)
            param_count (float): Number of parameters in billions

        Returns:
            float: Efficiency score from 2-6 points
        """
        # Base score from performance brackets
        if benchmark_score > 85:
            base_score = 6.0  # Excellent (>85)
        elif benchmark_score > 75:
            base_score = 5.0  # Good (>75 to 85)
        elif benchmark_score > 65:
            base_score = 4.0  # Decent (>65 to 75)
        elif benchmark_score > 55:
            base_score = 3.0  # Moderate (>55 to 65)
        else:
            base_score = 2.0  # Poor (<=55)

        # Apply a size-based cap: larger models must clear higher bars for efficiency
        if param_count >= 70:
            return min(base_score, 3.0)
        elif param_count >= 40:
            return min(base_score, 4.0)
        elif param_count >= 30:
            return min(base_score, 5.0)
        elif param_count >= 15:
            return min(base_score, 5.5)
        else:
            return min(base_score, 6.0)
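
    # Example: a 70B-parameter model scoring 80 on benchmarks gets a base score
    # of 5.0 (>75 to 85 bracket), then is capped at 3.0 by the >=70B size bracket.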

    def _calculate_ratio_score(self, ratio):
        """
        Calculate score based on size-performance ratio (6 points max).

        Args:
            ratio (float): Size-performance ratio score

        Returns:
            int: Score from 2-6 based on ratio brackets, or 0 if ratio is None
        """
        if ratio is None:
            return 0
        if ratio > 90: return 6
        elif ratio > 80: return 5
        elif ratio > 70: return 4
        elif ratio > 60: return 3
        else: return 2
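
    # Example: a ratio of 85 falls in the >80 to 90 bracket and scores 5 points.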

    def calculate_technical_score(self, price, context_window, size_perf_ratio):
        """
        Calculate technical specifications score out of 20 points maximum.

        Combines scores for:
        - Price efficiency (8 points)
        - Context window size (6 points)
        - Size-performance ratio (6 points)

        Args:
            price (float): Price per million tokens in USD
            context_window (int): Maximum context window size in tokens
            size_perf_ratio (float): Performance to parameter count ratio

        Returns:
            float: Combined technical score out of 20 points
        """
        price_score = self._calculate_price_score(price)
        context_score = self._calculate_context_score(context_window)
        ratio_score = self._calculate_ratio_score(size_perf_ratio)
        return price_score + context_score + ratio_score
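
    # Example (hypothetical values): price=2.50 (7 points), context_window=128000
    # (5 points) and size_perf_ratio=85 (5 points) combine to 17 of the 20 points.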

    def calculate_final_score(self):
        """
        Calculate final comprehensive score out of 100 points.

        Combines:
        - External benchmarks (60 points)
        - Community score (20 points)
        - Technical score (20 points)

        All component scores must be set before calling this method.

        Returns:
            float: Final score out of 100 points, rounded to two decimals

        Raises:
            ValueError: If any component score is not set
        """
        # Verify all required scores are set
        if not hasattr(self, 'external_score') or \
           not hasattr(self, 'community_score') or \
           not hasattr(self, 'technical_score'):
            raise ValueError("All component scores must be set before calculating final score")

        # Calculate total score
        final_score = self.external_score + self.community_score + self.technical_score

        # Print detailed breakdown
        print(f"\n=== Score Breakdown for {self.model_name} ===")
        print(f"External Score: {self.external_score:>6.2f}/60")
        print(f"Community Score: {self.community_score:>6.2f}/20")
        print(f"Technical Score: {self.technical_score:>6.2f}/20")
        print(f"Final Score: {final_score:>6.2f}/100")
        print("=" * 40)
        return round(final_score, 2)
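

# ------------------------------------------------------------------------------------------------
# Usage sketch
# ------------------------------------------------------------------------------------------------
# A minimal, illustrative example of how the class above might be driven; the supported
# entry point is Run_Scoring.py. All benchmark values, the ELO rating, the price, the
# context window and the ratio below are hypothetical.
if __name__ == "__main__":
    scorer = ModelScorer("Example Model")

    # Component scores are stored as attributes, which calculate_final_score() expects.
    scorer.external_score = scorer.calculate_external_benchmarks(
        entity_benchmarks={'artificial_analysis': 0.8, 'open_llm': 0.6},
        dev_benchmarks={'MMLU': 0.75, 'MATH': 0.55},
    )
    scorer.community_score = scorer.calculate_community_score(elo_rating=1201)
    scorer.technical_score = scorer.calculate_technical_score(
        price=2.50,             # USD per million tokens -> 7 points
        context_window=128000,  # tokens -> 5 points
        size_perf_ratio=85,     # assumed 0-100 scale -> 5 points
    )
    scorer.calculate_final_score()  # prints the breakdown and returns the rounded total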