Upload 2 files
Browse files
- app.py +1 -1
- report_generation.jsonl +2 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 17 Dec 2025.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -45,3 +45,5 @@
|
|
| 45 |
{"Model": "DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.7, "Physics": 30.0, "Chemistry": 55.8, "Finance": 33.0, "Consulting": 64.1, "Extraction": 39.1, "Reasoning": 45.9, "Style": 58.2, "Response Characters": 4346, "Input Tokens": 456, "Output Tokens": 13245, "Cost": 0.87}
|
| 46 |
{"Model": "DeepSeek-AI/DeepSeek-V3.2", "Category": "Open-weight Instruct", "Overall": 50.5, "Physics": 42.1, "Chemistry": 57.8, "Finance": 37.1, "Consulting": 65.0, "Extraction": 43.3, "Reasoning": 51.7, "Style": 53.1, "Response Characters": 8266, "Input Tokens": 456, "Output Tokens": 2755, "Cost": 0.2}
|
| 47 |
{"Model": "OpenAI/GPT-5.2 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 55.1, "Physics": 42.7, "Chemistry": 72.6, "Finance": 35.6, "Consulting": 69.4, "Extraction": 47.3, "Reasoning": 57.8, "Style": 75.7, "Response Characters": 7010, "Input Tokens": 3191, "Output Tokens": 22965, "Cost": 52.34}
|
|
|
|
|
|
|
|
|
| 45 |
{"Model": "DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.7, "Physics": 30.0, "Chemistry": 55.8, "Finance": 33.0, "Consulting": 64.1, "Extraction": 39.1, "Reasoning": 45.9, "Style": 58.2, "Response Characters": 4346, "Input Tokens": 456, "Output Tokens": 13245, "Cost": 0.87}
|
| 46 |
{"Model": "DeepSeek-AI/DeepSeek-V3.2", "Category": "Open-weight Instruct", "Overall": 50.5, "Physics": 42.1, "Chemistry": 57.8, "Finance": 37.1, "Consulting": 65.0, "Extraction": 43.3, "Reasoning": 51.7, "Style": 53.1, "Response Characters": 8266, "Input Tokens": 456, "Output Tokens": 2755, "Cost": 0.2}
|
| 47 |
{"Model": "OpenAI/GPT-5.2 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 55.1, "Physics": 42.7, "Chemistry": 72.6, "Finance": 35.6, "Consulting": 69.4, "Extraction": 47.3, "Reasoning": 57.8, "Style": 75.7, "Response Characters": 7010, "Input Tokens": 3191, "Output Tokens": 22965, "Cost": 52.34}
|
| 48 |
+
{"Model": "Google/Gemini-3-Flash-Preview (Thinking)", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 37.8, "Chemistry": 72.1, "Finance": 37.2, "Consulting": 66.6, "Extraction": 45.9, "Reasoning": 52.1, "Style": 68.5, "Response Characters": 6766, "Input Tokens": 489, "Output Tokens": 9120, "Cost": 4.42}
|
| 49 |
+
{"Model": "Google/Gemini-3-Flash-Preview", "Category": "Closed-source Instruct", "Overall": 50.2, "Physics": 35.2, "Chemistry": 64.4, "Finance": 35.8, "Consulting": 65.3, "Extraction": 43.1, "Reasoning": 50.2, "Style": 68.2, "Response Characters": 3989, "Input Tokens": 480, "Output Tokens": 1335, "Cost": 0.68}
|