From f5a2b0789201c78afde93b3675fe43d91d3e4c90 Mon Sep 17 00:00:00 2001
From: lsabor
Date: Sun, 14 Dec 2025 12:04:14 -0800
Subject: [PATCH] adds daily task to update global bot leaderboard, refactors
 command for convenience

---
 misc/management/commands/cron.py              |   8 +
 scoring/jobs.py                               |  17 +
 .../commands/update_global_bot_leaderboard.py | 562 +++++++++---------
 3 files changed, 306 insertions(+), 281 deletions(-)

diff --git a/misc/management/commands/cron.py b/misc/management/commands/cron.py
index cf9360552e..0734505f5d 100644
--- a/misc/management/commands/cron.py
+++ b/misc/management/commands/cron.py
@@ -26,6 +26,7 @@
 from scoring.jobs import (
     finalize_leaderboards,
+    update_global_bot_leaderboard,
     update_global_comment_and_question_leaderboards,
 )
 from scoring.utils import update_medal_points_and_ranks
@@ -200,6 +201,13 @@ def handle(self, *args, **options):
             max_instances=1,
             replace_existing=True,
         )
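+        # Refresh the Global Bot Leaderboard once per day at 05:00 UTC.
+        # close_old_connections(...) is assumed to wrap the job callable the
+        # same way as for the other scheduled jobs here, cleaning up stale
+        # database connections around each run.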
+        scheduler.add_job(
+            close_old_connections(update_global_bot_leaderboard),
+            trigger=CronTrigger.from_crontab("0 5 * * *"),  # Every day at 05:00 UTC
+            id="update_global_bot_leaderboard",
+            max_instances=1,
+            replace_existing=True,
+        )
 
         #
         # Comment Jobs
diff --git a/scoring/jobs.py b/scoring/jobs.py
index 89c6900a89..1d3b60dace 100644
--- a/scoring/jobs.py
+++ b/scoring/jobs.py
@@ -6,9 +6,26 @@
 from scoring.models import Leaderboard
 from scoring.utils import update_project_leaderboard
 
+from scoring.management.commands.update_global_bot_leaderboard import (
+    run_update_global_bot_leaderboard,
+)
+
 logger = logging.getLogger(__name__)
 
 
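+# Daily entry point used by the cron scheduler: a thin wrapper that skips the
+# run until the leaderboard exists (the management command creates it on its
+# first run via get_or_create) and logs failures rather than letting them
+# propagate into the scheduler. Minimal manual invocation (a sketch, assuming
+# a Django shell):
+#
+#     from scoring.jobs import update_global_bot_leaderboard
+#     update_global_bot_leaderboard()
+#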
DONE") - - # Gather head to head scores - user1_ids, user2_ids, question_ids, scores, weights = gather_data( - users, questions ) - - # choose baseline player if not already chosen - if not baseline_player: - baseline_player = max( - set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count - ) - # get variance of average scores (used in rescaling) - avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights) - var_avg_scores = ( - np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0 + .order_by("id") + .distinct("id") + ) + ############### + # make sure they have at least 100 resolved questions + print("initialize list") + question_list = list(questions) + print("Filtering users.") + scored_question_counts: dict[int, int] = defaultdict(int) + c = users.count() + i = 0 + for user in users: + i += 1 + print(i, "/", c, end="\r") + scored_question_counts[user.id] = ( + Score.objects.filter(user=user, question__in=question_list) + .distinct("question_id") + .count() ) - - # compute skills initially - skills = get_skills( - user1_ids=user1_ids, - user2_ids=user2_ids, - question_ids=question_ids, - scores=scores, - weights=weights, - baseline_player=baseline_player, - var_avg_scores=var_avg_scores, - verbose=False, + excluded_ids = [uid for uid, count in scored_question_counts.items() if count < 100] + users = users.exclude(id__in=excluded_ids) + ############### + print("Initializing... DONE") + + # Gather head to head scores + user1_ids, user2_ids, question_ids, scores, weights = gather_data(users, questions) + + # choose baseline player if not already chosen + if not baseline_player: + baseline_player = max( + set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count ) + # get variance of average scores (used in rescaling) + avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights) + var_avg_scores = ( + np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0 + ) - # Compute bootstrap confidence intervals - ci_lower, ci_upper = bootstrap_skills( - user1_ids, - user2_ids, - question_ids, - scores, - weights, - var_avg_scores, - baseline_player=baseline_player, - bootstrap_iterations=bootstrap_iterations, - ) - print() + # compute skills initially + skills = get_skills( + user1_ids=user1_ids, + user2_ids=user2_ids, + question_ids=question_ids, + scores=scores, + weights=weights, + baseline_player=baseline_player, + var_avg_scores=var_avg_scores, + verbose=False, + ) - ordered_skills = sorted( - [(user, skill) for user, skill in skills.items()], key=lambda x: -x[1] - ) - player_stats: dict[int | str, list] = defaultdict(lambda: [0, set()]) - for u1id, u2id, qid in zip(user1_ids, user2_ids, question_ids): - player_stats[u1id][0] += 1 - player_stats[u1id][1].add(qid) - player_stats[u2id][0] += 1 - player_stats[u2id][1].add(qid) - - ########################################################################## - ########################################################################## - ########################################################################## - ########################################################################## - # UPDATE Leaderboard - print("Updating leaderboard...", end="\r") - leaderboard, _ = Leaderboard.objects.get_or_create( - name="Global Bot Leaderboard", - project=Project.objects.get(type=Project.ProjectTypes.SITE_MAIN), - score_type=LeaderboardScoreTypes.MANUAL, - bot_status=Project.BotLeaderboardStatus.BOTS_ONLY, - ) - entry_dict = { - entry.user_id or entry.aggregation_method: entry - for entry 
+
+    # choose baseline player if not already chosen
+    if not baseline_player:
+        baseline_player = max(
+            set(user1_ids) | set(user2_ids), key=(user1_ids + user2_ids).count
+        )
+    # get variance of average scores (used in rescaling)
+    avg_scores = get_avg_scores(user1_ids, user2_ids, scores, weights)
+    var_avg_scores = (
+        np.var(np.array(list(avg_scores.values()))) if len(avg_scores) > 1 else 0
+    )
+
+    # compute skills initially
+    skills = get_skills(
+        user1_ids=user1_ids,
+        user2_ids=user2_ids,
+        question_ids=question_ids,
+        scores=scores,
+        weights=weights,
+        baseline_player=baseline_player,
+        var_avg_scores=var_avg_scores,
+        verbose=False,
+    )
+
+    # Compute bootstrap confidence intervals
+    ci_lower, ci_upper = bootstrap_skills(
+        user1_ids,
+        user2_ids,
+        question_ids,
+        scores,
+        weights,
+        var_avg_scores,
+        baseline_player=baseline_player,
+        bootstrap_iterations=bootstrap_iterations,
+    )
+    print()
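+    # ci_lower/ci_upper map each player to the 2.5th and 97.5th percentile of
+    # skill across the bootstrap resamples, i.e. a rough 95% interval; with
+    # bootstrap_iterations = 30 the endpoints are coarse estimates.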
+
+    ordered_skills = sorted(
+        [(user, skill) for user, skill in skills.items()], key=lambda x: -x[1]
+    )
+    player_stats: dict[int | str, list] = defaultdict(lambda: [0, set()])
+    for u1id, u2id, qid in zip(user1_ids, user2_ids, question_ids):
+        player_stats[u1id][0] += 1
+        player_stats[u1id][1].add(qid)
+        player_stats[u2id][0] += 1
+        player_stats[u2id][1].add(qid)
+
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    # UPDATE Leaderboard
+    print("Updating leaderboard...", end="\r")
+    leaderboard, _ = Leaderboard.objects.get_or_create(
+        name="Global Bot Leaderboard",
+        project=Project.objects.get(type=Project.ProjectTypes.SITE_MAIN),
+        score_type=LeaderboardScoreTypes.MANUAL,
+        bot_status=Project.BotLeaderboardStatus.BOTS_ONLY,
+    )
+    entry_dict = {
+        entry.user_id or entry.aggregation_method: entry
+        for entry in list(leaderboard.entries.all())
+    }
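+    # Upsert semantics: existing rows are reused from entry_dict as entries
+    # are written, anything left unclaimed is deleted after the loop; excluded
+    # bots keep a (hidden) row but do not consume a rank.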
+    rank = 1
+    question_count = len(set(question_ids))
+    seen = set()
+    for uid, skill in ordered_skills:
+        contribution_count = len(player_stats[uid][1])
+
+        excluded = False
+        if isinstance(uid, int):
+            user = User.objects.get(id=uid)
+            bot_details = user.metadata["bot_details"]
+            if not bot_details.get("display_in_leaderboard"):
+                excluded = True
+
+        entry: LeaderboardEntry = entry_dict.pop(uid, LeaderboardEntry())
+        entry.user_id = uid if isinstance(uid, int) else None
+        entry.aggregation_method = uid if isinstance(uid, str) else None
+        entry.leaderboard = leaderboard
+        entry.score = skill
+        entry.rank = rank
+        entry.excluded = excluded
+        entry.show_when_excluded = False
+        entry.contribution_count = contribution_count
+        entry.coverage = contribution_count / question_count
+        entry.calculated_on = timezone.now()
+        entry.ci_lower = ci_lower.get(uid, None)
+        entry.ci_upper = ci_upper.get(uid, None)
+        # TODO: support for more efficient saving once this is implemented
+        # for leaderboards with more than 100 entries
+        entry.save()
+        seen.add(entry.id)
+
+        if not excluded:
+            rank += 1
+    print("Updating leaderboard... DONE")
+    # delete unseen entries
+    leaderboard.entries.exclude(id__in=seen).delete()
+    print()
+
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    # DISPLAY
+    print("Results:")
+    print(
+        "| 2.5% "
+        "| Skill "
+        "| 97.5% "
+        "| Match "
+        "| Quest. "
+        "| ID "
+        "| Username "
+    )
+    print(
+        "| Match "
+        "| "
+        "| Match "
+        "| Count "
+        "| Count "
+        "| "
+        "| "
+    )
+    print(
+        "=========================================="
+        "=========================================="
+    )
+    unevaluated = (
+        set(user1_ids) | set(user2_ids) | set(users.values_list("id", flat=True))
+    )
+    for uid, skill in ordered_skills:
+        if isinstance(uid, str):
+            username = uid
+        else:
+            username = User.objects.get(id=uid).username
+        unevaluated.remove(uid)
+        lower = ci_lower.get(uid, 0)
+        upper = ci_upper.get(uid, 0)
+        print(
+            f"| {round(lower, 2):>6} "
+            f"| {round(skill, 2):>6} "
+            f"| {round(upper, 2):>6} "
+            f"| {player_stats[uid][0]:>6} "
+            f"| {len(player_stats[uid][1]):>6} "
+            f"| {uid if isinstance(uid, int) else '':>6} "
+            f"| {username}"
+        )
+    for uid in unevaluated:
+        if isinstance(uid, str):
+            username = uid
+        else:
+            username = User.objects.get(id=uid).username
+        print(
+            "| ------ "
+            "| ------ "
+            "| ------ "
+            "| ------ "
+            "| ------ "
+            f"| {uid if isinstance(uid, int) else '':>6} "
+            f"| {username}"
+        )
+    print()
+
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    ##########################################################################
+    # TESTS
+    skills_array = np.array(list(skills.values()))
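+    # Diagnostics only; nothing below feeds back into the leaderboard. The
+    # correlation sanity-checks skill against average score, and the three
+    # complementary normality tests (Shapiro-Wilk for small samples,
+    # Anderson-Darling weighting the tails, Kolmogorov-Smirnov against a
+    # fitted normal) profile the shape of the skill distribution.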
+
+    # 1. Correlation between skill and avg_score
+    x = []
+    y = []
+    for uid in user1_ids:
+        x.append(skills.get(uid, 0))
+        y.append(avg_scores.get(uid, 0))
+    correlation = np.corrcoef(x, y)
+    print(f"\nCorrelation between skill and avg_score: {correlation[0][1]}")
+
+    # 2. Shapiro-Wilk test (good for small to medium samples)
+    if len(skills_array) >= 3:
+        shapiro_stat, shapiro_p = stats.shapiro(skills_array)
+        print(
+            f" Shapiro-Wilk test: statistic={shapiro_stat:.4f}, p-value={shapiro_p:.4f}"
+        )
+        if shapiro_p > 0.05:
+            print(" → Skills appear normally distributed (p > 0.05)")
+        else:
+            print(" → Skills may not be normally distributed (p ≤ 0.05)")
+
+    # 3. Anderson-Darling test (more sensitive to tails)
+    anderson_result = stats.anderson(skills_array, dist="norm")
+    print(f" Anderson-Darling test: statistic={anderson_result.statistic:.4f}")
+    # Check at 5% significance level
+    critical_5pct = anderson_result.critical_values[2]  # Index 2 is 5% level
+    print(f" Critical value at 5%: {critical_5pct:.4f}")
+    if anderson_result.statistic < critical_5pct:
+        print(" → Skills appear normally distributed (stat < critical)")
+    else:
+        print(" → Skills may not be normally distributed (stat ≥ critical)")
+
+    # 4. Kolmogorov-Smirnov test (compare to normal distribution)
+    ks_stat, ks_p = stats.kstest(
+        skills_array, "norm", args=(skills_array.mean(), skills_array.std())
+    )
+    print(f" Kolmogorov-Smirnov test: statistic={ks_stat:.4f}, p-value={ks_p:.4f}")
+    if ks_p > 0.05:
+        print(" → Skills appear normally distributed (p > 0.05)")
+    else:
+        print(" → Skills may not be normally distributed (p ≤ 0.05)")
+
+    # 5. Summary statistics
+    print("\nSkill distribution summary:")
+    print(f" Mean: {skills_array.mean():.2f}")
+    print(f" Std: {skills_array.std():.2f}")
+    print(f" Skewness: {stats.skew(skills_array):.4f}")
+    print(f" Kurtosis: {stats.kurtosis(skills_array):.4f}")
+    print()
+
+
+class Command(BaseCommand):
+    help = """
+    Update the global bots leaderboard
+    """
+
+    def handle(self, *args, **options) -> None:
+        run_update_global_bot_leaderboard()
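+
+
+# Note: `python manage.py update_global_bot_leaderboard` runs the same code
+# path as the daily job registered in misc/management/commands/cron.py.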