60 changes: 60 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
"""add config in evals run table
Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():
⚠️ Potential issue | 🟡 Minor

Add return type hints to migration functions.

Both upgrade() and downgrade() are missing return type hints; per the coding guidelines, all functions should have them.

📝 Proposed fix
-def upgrade():
+def upgrade() -> None:
-def downgrade():
+def downgrade() -> None:

Also applies to: 45-45

🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py at line
20, the migration functions upgrade() and downgrade() lack return type hints;
update both function definitions (upgrade and downgrade) to include explicit
return types (e.g., change "def upgrade():" and "def downgrade():" to "def
upgrade() -> None:" and "def downgrade() -> None:") so they conform to the
project's typing guidelines.

    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_id",
            sa.Uuid(),
            nullable=True,
            comment="Reference to the stored config used",
        ),
    )
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_version",
            sa.Integer(),
            nullable=True,
            comment="Version of the config used",
        ),
    )
    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
    op.drop_column("evaluation_run", "config")
Comment on lines +22 to +41
⚠️ Potential issue | 🔴 Critical

Critical: Data loss and foreign key constraint naming issues.

This migration has several critical problems:

  1. Data loss: Line 41 drops the config column without migrating existing data to the new config_id/config_version columns. Any existing evaluation runs will lose their configuration data permanently.

  2. Foreign key constraint naming: Line 40 creates a foreign key with None as the constraint name, so the constraint name is auto-generated. The downgrade (Line 57) also passes None when dropping the constraint; Alembic cannot resolve None to that generated name, so the downgrade will fail.

Required actions:

  1. Add a data migration step before dropping the config column (a hedged sketch is included after this comment). You'll need to:

    • Parse each existing config JSONB object
    • Look up or create corresponding config records with appropriate versions
    • Update config_id and config_version for each evaluation_run
    • Or, if data migration isn't feasible, add a comment explaining why data loss is acceptable
  2. Specify an explicit constraint name instead of None:

🔧 Proposed fix for FK constraint naming
-    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.create_foreign_key(
+        "fk_evaluation_run_config_id", 
+        "evaluation_run", 
+        "config", 
+        ["config_id"], 
+        ["id"]
+    )

And update the downgrade:

-    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")

Committable suggestion skipped: line range outside the PR's diff.
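A hedged sketch of the data-migration step from required action 1, to be placed in upgrade() before op.drop_column("evaluation_run", "config"). It assumes a minimal config(id, blob) schema in which each legacy JSONB blob becomes version 1 of a newly created config row; the real config/config_version tables almost certainly need more columns, so treat this as an outline rather than a drop-in fix:

    # Hypothetical backfill; run before dropping evaluation_run.config.
    # Assumes config(id, blob) — adjust to the project's actual config schema.
    import json
    import uuid

    bind = op.get_bind()
    rows = bind.execute(
        sa.text("SELECT id, config FROM evaluation_run WHERE config IS NOT NULL")
    ).fetchall()
    for run_id, legacy_config in rows:
        new_config_id = uuid.uuid4()
        bind.execute(
            sa.text("INSERT INTO config (id, blob) VALUES (:id, CAST(:blob AS jsonb))"),
            {"id": str(new_config_id), "blob": json.dumps(legacy_config)},
        )
        bind.execute(
            sa.text(
                "UPDATE evaluation_run SET config_id = :cid, config_version = 1 "
                "WHERE id = :rid"
            ),
            {"cid": str(new_config_id), "rid": run_id},
        )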

    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
Comment on lines +47 to +56
⚠️ Potential issue | 🔴 Critical

Critical: Downgrade will fail with existing data.

The downgrade re-adds the config column with nullable=False (Line 53). If the evaluation_run table contains any records when downgrading, this operation will fail because PostgreSQL cannot add a non-nullable column to a table with existing rows without specifying a default value.

Either:

  1. Make the column nullable during downgrade: nullable=True
  2. Provide a server default value
  3. Add a data migration to populate the column before setting it non-nullable (a sketch of this approach follows after this comment)
🔧 Proposed fix (Option 1: Make nullable)
     op.add_column(
         "evaluation_run",
         sa.Column(
             "config",
             postgresql.JSONB(astext_type=sa.Text()),
             autoincrement=False,
-            nullable=False,
+            nullable=True,
             comment="Evaluation configuration (model, instructions, etc.)",
         ),
     )
🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py around
lines 47-56, the downgrade currently re-adds the "config" column on the
"evaluation_run" table using op.add_column with sa.Column(..., nullable=False)
which will fail if rows exist; update that op.add_column call in the downgrade
to use nullable=True (or alternatively add a server_default or a prior data
migration to populate values before setting non-nullable), ensuring the column
is created nullable during downgrade to avoid PostgreSQL errors.
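If the column must come back as NOT NULL, a hedged sketch of option 3 is to re-add it nullable, backfill it from the referenced config rows, and only then tighten the constraint. The config.blob column used in the backfill is an assumption, not the project's actual schema:

    # Sketch only: re-add as nullable, backfill, then enforce NOT NULL.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # Backfill from the config table; "blob" is an assumed column name.
    op.execute(
        """
        UPDATE evaluation_run AS er
        SET config = c.blob
        FROM config AS c
        WHERE er.config_id = c.id
        """
    )
    # Rows with no resolvable config fall back to an empty object before NOT NULL is enforced.
    op.execute("UPDATE evaluation_run SET config = '{}'::jsonb WHERE config IS NULL")
    op.alter_column("evaluation_run", "config", nullable=False)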

    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
    op.drop_column("evaluation_run", "config_version")
    op.drop_column("evaluation_run", "config_id")
    # ### end Alembic commands ###
1 change: 1 addition & 0 deletions backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_model_from_config,
save_score,
)
from app.crud.evaluations.cron import (
62 changes: 58 additions & 4 deletions backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
import logging
from typing import Any
from uuid import UUID

from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.models import EvaluationRun
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob

logger = logging.getLogger(__name__)

@@ -16,7 +20,8 @@ def create_evaluation_run(
run_name: str,
dataset_name: str,
dataset_id: int,
config: dict,
config_id: UUID,
config_version: int,
organization_id: int,
project_id: int,
) -> EvaluationRun:
@@ -28,7 +33,8 @@ def create_evaluation_run(
run_name: Name of the evaluation run/experiment
dataset_name: Name of the dataset being used
dataset_id: ID of the dataset
config: Configuration dict for the evaluation
config_id: UUID of the stored config
config_version: Version number of the config
organization_id: Organization ID
project_id: Project ID

@@ -39,7 +45,8 @@
run_name=run_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
status="pending",
organization_id=organization_id,
project_id=project_id,
@@ -56,7 +63,10 @@ def create_evaluation_run(
logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
raise

logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
logger.info(
f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
f"config_id={config_id}, config_version={config_version}"
)

return eval_run

@@ -293,3 +303,47 @@ def save_score(
f"traces={len(score.get('traces', []))}"
)
return eval_run


def resolve_model_from_config(
    session: Session,
    eval_run: EvaluationRun,
) -> str:
    """
    Resolve the model name from the evaluation run's config.

    Args:
        session: Database session
        eval_run: EvaluationRun instance

    Returns:
        Model name from config

    Raises:
        ValueError: If config is missing, invalid, or has no model
    """
    if not eval_run.config_id or not eval_run.config_version:
        raise ValueError(
            f"Evaluation run {eval_run.id} has no config reference "
            f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
        )

    config_version_crud = ConfigVersionCrud(
        session=session,
        config_id=eval_run.config_id,
        project_id=eval_run.project_id,
    )

    config, error = resolve_config_blob(
        config_crud=config_version_crud,
        config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
    )

    if error or config is None:
        raise ValueError(
            f"Config resolution failed for evaluation {eval_run.id} "
            f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
        )

    model = config.completion.params.get("model")
    return model
14 changes: 1 addition & 13 deletions backend/app/crud/evaluations/embeddings.py
@@ -363,19 +363,7 @@ def start_embedding_batch(
logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

# Get embedding model from config (default: text-embedding-3-large)
embedding_model = eval_run.config.get(
"embedding_model", "text-embedding-3-large"
)

# Validate and fallback to default if invalid
try:
validate_embedding_model(embedding_model)
except ValueError as e:
logger.warning(
f"Invalid embedding model '{embedding_model}' in config: {e}. "
f"Falling back to text-embedding-3-large"
)
embedding_model = "text-embedding-3-large"
embedding_model = "text-embedding-3-large"

# Step 1: Build embedding JSONL with trace_ids
jsonl_data = build_embedding_jsonl(
10 changes: 5 additions & 5 deletions backend/app/crud/evaluations/processing.py
@@ -26,7 +26,7 @@
upload_batch_results_to_object_store,
)
from app.crud.evaluations.batch import fetch_dataset_items
from app.crud.evaluations.core import update_evaluation_run
from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config
from app.crud.evaluations.embeddings import (
calculate_average_similarity,
parse_embedding_results,
@@ -254,16 +254,16 @@ async def process_completed_evaluation(
if not results:
raise ValueError("No valid results found in batch output")

# Extract model from config for cost tracking
model = eval_run.config.get("model") if eval_run.config else None

# Step 5: Create Langfuse dataset run with traces
# Use model stored at creation time for cost tracking
model = resolve_model_from_config(session=session, eval_run=eval_run)

trace_id_mapping = create_langfuse_dataset_run(
langfuse=langfuse,
dataset_name=eval_run.dataset_name,
model=model,
run_name=eval_run.run_name,
results=results,
model=model,
)

# Store object store URL in database
24 changes: 14 additions & 10 deletions backend/app/models/evaluation.py
@@ -1,5 +1,6 @@
from datetime import datetime
from typing import TYPE_CHECKING, Any, Optional
from uuid import UUID

from pydantic import BaseModel, Field
from sqlalchemy import Column, Index, Text, UniqueConstraint
@@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True):
sa_column_kwargs={"comment": "Name of the Langfuse dataset used"},
)

# Config field - dict requires sa_column
config: dict[str, Any] = SQLField(
default_factory=dict,
sa_column=Column(
JSONB,
nullable=False,
comment="Evaluation configuration (model, instructions, etc.)",
),
description="Evaluation configuration",
config_id: UUID = SQLField(
foreign_key="config.id",
nullable=True,
description="Reference to the stored config used for this evaluation",
sa_column_kwargs={"comment": "Reference to the stored config used"},
)
config_version: int = SQLField(
nullable=True,
ge=1,
description="Version of the config used for this evaluation",
sa_column_kwargs={"comment": "Version of the config used"},
)

# Dataset reference
@@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel):
id: int
run_name: str
dataset_name: str
config: dict[str, Any]
config_id: UUID | None
config_version: int | None
dataset_id: int
batch_job_id: int | None
embedding_batch_job_id: int | None
26 changes: 18 additions & 8 deletions backend/app/tests/api/routes/test_evaluation.py
@@ -1,6 +1,7 @@
import io
from typing import Any
from unittest.mock import Mock, patch
from uuid import uuid4

import pytest
from fastapi.testclient import TestClient
@@ -558,21 +559,22 @@ def test_start_batch_evaluation_missing_model(
response = client.post(
"/api/v1/evaluations/",
json={
"experiment_name": "test_no_model",
"dataset_id": 1, # Dummy ID, error should come before this is checked
"config": invalid_config,
"experiment_name": "test_no_config",
"dataset_id": 1, # Dummy ID, config validation happens first
"config_id": str(uuid4()), # Non-existent config
"config_version": 1,
},
headers=user_api_key_header,
)

# Should fail with either 400 (model missing) or 404 (dataset not found)
# Should fail with either 400 (config not found) or 404 (dataset/config not found)
assert response.status_code in [400, 404]
response_data = response.json()
error_str = response_data.get(
"detail", response_data.get("message", str(response_data))
)
# Should fail with either "model" missing or "dataset not found" (both acceptable)
assert "model" in error_str.lower() or "not found" in error_str.lower()
# Should mention config or not found
assert "config" in error_str.lower() or "not found" in error_str.lower()

def test_start_batch_evaluation_without_authentication(
self, client, sample_evaluation_config
@@ -758,11 +760,15 @@ def test_get_evaluation_run_trace_info_not_completed(
create_test_dataset: EvaluationDataset,
) -> None:
"""Test requesting trace info for incomplete evaluation returns error."""
# Create a config for the evaluation run
config = create_test_config(db, project_id=user_api_key.project.id)

eval_run = EvaluationRun(
run_name="test_pending_run",
dataset_name=create_test_dataset.name,
dataset_id=create_test_dataset.id,
config={"model": "gpt-4o"},
config_id=config.id,
config_version=1,
status="pending",
total_items=3,
organization_id=user_api_key.organization_id,
@@ -794,11 +800,15 @@ def test_get_evaluation_run_trace_info_completed(
create_test_dataset: EvaluationDataset,
) -> None:
"""Test requesting trace info for completed evaluation returns cached scores."""
# Create a config for the evaluation run
config = create_test_config(db, project_id=user_api_key.project.id)

eval_run = EvaluationRun(
run_name="test_completed_run",
dataset_name=create_test_dataset.name,
dataset_id=create_test_dataset.id,
config={"model": "gpt-4o"},
config_id=config.id,
config_version=1,
status="completed",
total_items=3,
score={
Expand Down