import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import time
import os

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    LLM_BENCHMARKS_TEXT_1,
    LLM_BENCHMARKS_TEXT_2,
    CROSS_EVALUATION_METRICS,
    NOTE_GENERATION_METRICS,
    HEALTHBENCH_METRICS,
    TITLE,
    FIVE_PILLAR_DIAGRAM,
)
from src.display.css_html_js import custom_css
from src.display.css_v2 import v2_css
from src.display.utils import (
    DATASET_BENCHMARK_COLS,
    OPEN_ENDED_BENCHMARK_COLS,
    MED_SAFETY_BENCHMARK_COLS,
    MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
    ACI_BENCHMARK_COLS,
    SOAP_BENCHMARK_COLS,
    HEALTHBENCH_BENCHMARK_COLS,
    HEALTHBENCH_HARD_BENCHMARK_COLS,
    DATASET_COLS,
    OPEN_ENDED_COLS,
    MED_SAFETY_COLS,
    MEDICAL_SUMMARIZATION_COLS,
    ACI_COLS,
    SOAP_COLS,
    HEALTHBENCH_COLS,
    HEALTHBENCH_HARD_COLS,
    EVAL_COLS,
    EVAL_TYPES,
    NUMERIC_INTERVALS,
    TYPES,
    AutoEvalColumn,
    ModelType,
    Precision,
    WeightType,
    fields,
    render_generation_templates,
    OpenEndedArabic_COLS,
    OpenEndedArabic_BENCHMARK_COLS,
    OpenEndedFrench_COLS,
    OpenEndedFrench_BENCHMARK_COLS,
    OpenEndedPortuguese_COLS,
    OpenEndedPortuguese_BENCHMARK_COLS,
    OpenEndedRomanian_COLS,
    OpenEndedRomanian_BENCHMARK_COLS,
    OpenEndedGreek_COLS,
    OpenEndedGreek_BENCHMARK_COLS,
    OpenEndedSpanish_COLS,
    OpenEndedSpanish_BENCHMARK_COLS,
    ClosedEndedMultilingual_COLS,
    ClosedEndedMultilingual_BENCHMARK_COLS,
    EHRSQL_ZERO_SHOT_COLS,
    EHRSQL_ZERO_SHOT_BENCHMARK_COLS,
    EHRSQL_FEW_SHOT_COLS,
    EHRSQL_FEW_SHOT_BENCHMARK_COLS,
    MEDCALC_DIRECT_ANSWER_COLS,
    MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS,
    MEDCALC_ONE_SHOT_COT_COLS,
    MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS,
    MEDCALC_ZERO_SHOT_COT_COLS,
    MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS,
    MEDEC_ZERO_SHOT_COLS,
    MEDEC_ZERO_SHOT_BENCHMARK_COLS,
    MEDEC_ONE_SHOT_COLS,
    MEDEC_ONE_SHOT_BENCHMARK_COLS,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df
from src.populate_optimized import load_all_datasets_parallel
from src.submission.submit import add_new_eval
from src.ui.shared import set_all_datasets
from src.ui.leaderboard_v1 import build_v1_leaderboard_tabs
from src.ui.leaderboard_v2 import build_v2_leaderboard_tabs
from src.ui.leaderboard_v2_ui import set_v2_datasets

# =====================================================================================
# 1. SETUP AND DATA LOADING
# =====================================================================================

def restart_space():
    # Restart the Hugging Face Space via the Hub API.
    API.restart_space(repo_id=REPO_ID)


print("Downloading evaluation data...")
try:
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
    print("Downloads complete.")
except Exception as e:
    print(f"An error occurred during download: {e}")
    restart_space()

print("Loading all dataframes into a central dictionary (optimized parallel loading)...")
start_time = time.time()

# Use optimized parallel loading with caching
ALL_DATASETS = load_all_datasets_parallel(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    max_workers=4  # Adjust based on CPU cores
)

end_time = time.time()
print(f"✓ All dataframes loaded in {end_time - start_time:.2f} seconds.")
print(f"  Loaded {len(ALL_DATASETS)} datasets")

# Initialize shared module with datasets
set_all_datasets(ALL_DATASETS)

# Initialize v2 datasets
set_v2_datasets(ALL_DATASETS)

# Evaluation Queue DataFrames
(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(
    EVAL_REQUESTS_PATH, EVAL_COLS
)

# =====================================================================================
# 4. GRADIO DEMO UI (Main application layout)
# =====================================================================================

# Combine CSS for both v1 and v2
combined_css = custom_css + v2_css

demo = gr.Blocks(css=combined_css)
with demo:
    # Top-level tabs for v1/v2 leaderboards (moved to topmost position)
    with gr.Tabs(elem_classes="tab-buttons") as version_tabs:
        # =============================================================================
        # Leaderboard v2 (New - Reordered Evaluations)
        # =============================================================================
        with gr.TabItem("Leaderboard v2", id="leaderboard-v2"):
            build_v2_leaderboard_tabs(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df)

        # =============================================================================
        # Leaderboard v1 (Current - Unchanged Order)
        # =============================================================================
        with gr.TabItem("Leaderboard v1", id="leaderboard-v1"):
            build_v1_leaderboard_tabs(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df)

    # Citation section (shared across both versions)
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Schedule an automatic Space restart every 24 hours (86400 seconds).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(allowed_paths=["./assets/"], share=True, ssr_mode=False)