fzarnecki committed
Commit f9bc6f5 · 1 Parent(s): 654cff7

Added citation

Files changed (2)
  1. app.py +1 -6
  2. content.py +14 -7
app.py CHANGED
@@ -18,12 +18,7 @@ from src.utils import(
     load_raw_model_data,
     build_year_column_mapping,
 )
-from content import LLMLAGBENCH_INTRO, LEADERBOARD_INTRO, MODEL_COMPARISON_INTRO, AUTHORS
-
-
-# TODO move to file
-CIT_BTN_TEXT = ""
-CIT_BTN_LABEL = ""
+from content import LLMLAGBENCH_INTRO, LEADERBOARD_INTRO, MODEL_COMPARISON_INTRO, AUTHORS, CIT_BTN_TEXT, CIT_BTN_LABEL
 
 
 ### CONFIGURATION
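For context, the two constants now live in content.py and are imported where the UI is built. A minimal sketch of how they might be wired up, assuming the Space is a Gradio app (typical for Hugging Face leaderboards; the actual layout code is not part of this diff, and the accordion/textbox structure here is hypothetical):

```python
# Hypothetical wiring for the citation panel; only CIT_BTN_TEXT and
# CIT_BTN_LABEL come from the repo, the rest is an assumed Gradio layout.
import gradio as gr

from content import CIT_BTN_TEXT, CIT_BTN_LABEL

with gr.Blocks() as demo:
    # Collapsible panel labeled with the emoji string from content.py.
    with gr.Accordion(CIT_BTN_LABEL, open=False):
        gr.Textbox(
            value=CIT_BTN_TEXT,     # the BibTeX entry defined in content.py
            label="BibTeX",
            lines=10,
            show_copy_button=True,  # one-click copy of the citation
        )

if __name__ == "__main__":
    demo.launch()
```

Keeping these strings in content.py rather than inline in app.py (which is what the removed `# TODO move to file` placeholder pointed at) keeps all user-facing text in one module.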
content.py CHANGED
@@ -1,8 +1,3 @@
-"""
-Content text for the LLMLagBench application.
-Contains descriptive text for various sections of the UI.
-"""
-
 # Section under main title
 LLMLAGBENCH_INTRO = """
 Large Language Models (LLMs) are pretrained on textual data up to a specific temporal cutoff, creating
@@ -11,7 +6,7 @@ external sources. More subtly, when this limitation is unknown or ignored, LLMs
 outdated time-sensitive information with general knowledge during reasoning tasks, **potentially
 compromising response accuracy**.
 
-LLMLagBench provides a systematic approach for **identifying the earliest probable temporal boundaries** of
+LLMLagBench (https://arxiv.org/abs/2511.12116) provides a systematic approach for **identifying the earliest probable temporal boundaries** of
 an LLM's training data by evaluating its knowledge of recent events. The benchmark comprises **1,700+ curated questions** about events sampled from news reports published between 2020-2025 (we plan to update the question set regularly). Each
 question could not be accurately answered before the event was reported in news media. We evaluate model
 responses using a **0-2 scale faithfulness metric** and apply the **PELT (Pruned Exact Linear Time)** changepoint
@@ -62,4 +57,16 @@ AUTHORS = """
 <div style='text-align: center; font-size: 0.9em; color: #666; margin-top: 5px; margin-bottom: 15px;'>
 <em>Piotr Pęzik, Konrad Kaczyński, Maria Szymańska, Filip Żarnecki, Zuzanna Deckert, Jakub Kwiatkowski, Wojciech Janowski</em>
 </div>
-"""
+"""
+
+CIT_BTN_TEXT = """@misc{pęzik2025llmlagbenchidentifyingtemporaltraining,
+      title={LLMLagBench: Identifying Temporal Training Boundaries in Large Language Models},
+      author={Piotr Pęzik and Konrad Kaczyński and Maria Szymańska and Filip Żarnecki and Zuzanna Deckert and Jakub Kwiatkowski and Wojciech Janowski},
+      year={2025},
+      eprint={2511.12116},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2511.12116},
+}"""
+
+CIT_BTN_LABEL = "📄 BibTeX Citation"
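The intro text above references the PELT changepoint algorithm for locating a model's probable training cutoff from its scores on dated questions. As a rough illustration of that idea (not the benchmark's actual pipeline), here is a sketch using the `ruptures` library with made-up per-month faithfulness means; the binning and penalty value are assumed placeholders:

```python
# Illustrative only: detect a drop in faithfulness scores over time with PELT.
# The scores, monthly binning, and penalty below are invented for the example.
import numpy as np
import ruptures as rpt

# Mean faithfulness (0-2 scale) per month, ordered by event date.
# A model whose training data ends mid-series scores well early on,
# then drops once questions concern events after its cutoff.
monthly_scores = np.array([1.8, 1.9, 1.7, 1.8, 1.9, 0.4, 0.3, 0.5, 0.2, 0.3])

# PELT minimizes total segment cost plus a penalty per changepoint.
algo = rpt.Pelt(model="rbf").fit(monthly_scores)
breakpoints = algo.predict(pen=3)  # assumed penalty; tune for sensitivity

print(breakpoints)  # e.g. [5, 10]: a changepoint after month 5, series ends at 10
```

The detected changepoint marks the earliest point where performance degrades, which the benchmark interprets as the probable boundary of the model's training data.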