diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
new file mode 100644
index 00000000..449768b3
--- /dev/null
+++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
+"""add config in evals run table
+
+Revision ID: 041
+Revises: 040
+Create Date: 2025-12-15 14:03:22.082746
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel.sql.sqltypes
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "041"
+down_revision = "040"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_id",
+            sa.Uuid(),
+            nullable=True,
+            comment="Reference to the stored config used",
+        ),
+    )
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_version",
+            sa.Integer(),
+            nullable=True,
+            comment="Version of the config used",
+        ),
+    )
+    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.drop_column("evaluation_run", "config")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config",
+            postgresql.JSONB(astext_type=sa.Text()),
+            autoincrement=False,
+            nullable=False,
+            comment="Evaluation configuration (model, instructions, etc.)",
+        ),
+    )
+    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_column("evaluation_run", "config_version")
+    op.drop_column("evaluation_run", "config_id")
+    # ### end Alembic commands ###
diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py
index bb095413..cb158ea7 100644
--- a/backend/app/crud/evaluations/__init__.py
+++ b/backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
     create_evaluation_run,
     get_evaluation_run_by_id,
     list_evaluation_runs,
+    resolve_model_from_config,
     save_score,
 )
 from app.crud.evaluations.cron import (
diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py
index b2b118df..b64aa820 100644
--- a/backend/app/crud/evaluations/core.py
+++ b/backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
 import logging
 from typing import Any
+from uuid import UUID

 from langfuse import Langfuse
 from sqlmodel import Session, select

 from app.core.util import now
+from app.crud.config.version import ConfigVersionCrud
 from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
 from app.models import EvaluationRun
+from app.models.llm.request import LLMCallConfig
+from app.services.llm.jobs import resolve_config_blob

 logger = logging.getLogger(__name__)

@@ -16,7 +20,8 @@ def create_evaluation_run(
     run_name: str,
     dataset_name: str,
     dataset_id: int,
-    config: dict,
+    config_id: UUID,
+    config_version: int,
     organization_id: int,
     project_id: int,
 ) -> EvaluationRun:
@@ -28,7 +33,8 @@
         run_name: Name of the evaluation run/experiment
         dataset_name: Name of the dataset being used
         dataset_id: ID of the dataset
-        config: Configuration dict for the evaluation
+        config_id: UUID of the stored config
+        config_version: Version number of the config
         organization_id: Organization ID
         project_id: Project ID

@@ -39,7 +45,8 @@ eval_run = EvaluationRun(
         run_name=run_name,
         dataset_name=dataset_name,
         dataset_id=dataset_id,
-        config=config,
+        config_id=config_id,
+        config_version=config_version,
         status="pending",
         organization_id=organization_id,
         project_id=project_id,
@@ -56,7 +63,10 @@
         logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
         raise

-    logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
+    logger.info(
+        f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
+        f"config_id={config_id}, config_version={config_version}"
+    )
     return eval_run


@@ -293,3 +303,47 @@ def save_score(
         f"traces={len(score.get('traces', []))}"
     )
     return eval_run
+
+
+def resolve_model_from_config(
+    session: Session,
+    eval_run: EvaluationRun,
+) -> str | None:
+    """
+    Resolve the model name from the evaluation run's config.
+
+    Args:
+        session: Database session
+        eval_run: EvaluationRun instance
+
+    Returns:
+        Model name from config, or None if the config does not specify one
+
+    Raises:
+        ValueError: If the config reference is missing or config resolution fails
+    """
+    if not eval_run.config_id or not eval_run.config_version:
+        raise ValueError(
+            f"Evaluation run {eval_run.id} has no config reference "
+            f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
+        )
+
+    config_version_crud = ConfigVersionCrud(
+        session=session,
+        config_id=eval_run.config_id,
+        project_id=eval_run.project_id,
+    )
+
+    config, error = resolve_config_blob(
+        config_crud=config_version_crud,
+        config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
+    )
+
+    if error or config is None:
+        raise ValueError(
+            f"Config resolution failed for evaluation {eval_run.id} "
+            f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
+        )
+
+    model = config.completion.params.get("model")
+    return model
diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py
index 17ead39a..20fabfd1 100644
--- a/backend/app/crud/evaluations/embeddings.py
+++ b/backend/app/crud/evaluations/embeddings.py
@@ -363,19 +363,7 @@
     logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

     # Get embedding model from config (default: text-embedding-3-large)
-    embedding_model = eval_run.config.get(
-        "embedding_model", "text-embedding-3-large"
-    )
-
-    # Validate and fallback to default if invalid
-    try:
-        validate_embedding_model(embedding_model)
-    except ValueError as e:
-        logger.warning(
" - f"Falling back to text-embedding-3-large" - ) - embedding_model = "text-embedding-3-large" + embedding_model = "text-embedding-3-large" # Step 1: Build embedding JSONL with trace_ids jsonl_data = build_embedding_jsonl( diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index fbc2d231..c8baf4d5 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -26,7 +26,7 @@ upload_batch_results_to_object_store, ) from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import update_evaluation_run +from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, @@ -254,16 +254,16 @@ async def process_completed_evaluation( if not results: raise ValueError("No valid results found in batch output") - # Extract model from config for cost tracking - model = eval_run.config.get("model") if eval_run.config else None - # Step 5: Create Langfuse dataset run with traces + # Use model stored at creation time for cost tracking + model = resolve_model_from_config(session=session, eval_run=eval_run) + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, + model=model, run_name=eval_run.run_name, results=results, - model=model, ) # Store object store URL in database diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27..6ae4542f 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Optional +from uuid import UUID from pydantic import BaseModel, Field from sqlalchemy import Column, Index, Text, UniqueConstraint @@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Name of the Langfuse dataset used"}, ) - # Config field - dict requires sa_column - config: dict[str, Any] = SQLField( - default_factory=dict, - sa_column=Column( - JSONB, - nullable=False, - comment="Evaluation configuration (model, instructions, etc.)", - ), - description="Evaluation configuration", + config_id: UUID = SQLField( + foreign_key="config.id", + nullable=True, + description="Reference to the stored config used for this evaluation", + sa_column_kwargs={"comment": "Reference to the stored config used"}, + ) + config_version: int = SQLField( + nullable=True, + ge=1, + description="Version of the config used for this evaluation", + sa_column_kwargs={"comment": "Version of the config used"}, ) # Dataset reference @@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config: dict[str, Any] + config_id: UUID | None + config_version: int | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 813fd483..a5dc34af 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,6 +1,7 @@ import io from typing import Any from unittest.mock import Mock, patch +from uuid import uuid4 import pytest from fastapi.testclient import TestClient @@ -558,21 +559,22 @@ def test_start_batch_evaluation_missing_model( response = client.post( "/api/v1/evaluations/", json={ - "experiment_name": "test_no_model", - "dataset_id": 
-                "dataset_id": 1,  # Dummy ID, error should come before this is checked
-                "config": invalid_config,
+                "experiment_name": "test_no_config",
+                "dataset_id": 1,  # Dummy ID, config validation happens first
+                "config_id": str(uuid4()),  # Non-existent config
+                "config_version": 1,
             },
             headers=user_api_key_header,
         )

-        # Should fail with either 400 (model missing) or 404 (dataset not found)
+        # Should fail with either 400 (config not found) or 404 (dataset/config not found)
         assert response.status_code in [400, 404]
         response_data = response.json()
         error_str = response_data.get(
             "detail", response_data.get("message", str(response_data))
         )
-        # Should fail with either "model" missing or "dataset not found" (both acceptable)
-        assert "model" in error_str.lower() or "not found" in error_str.lower()
+        # Should mention config or not found
+        assert "config" in error_str.lower() or "not found" in error_str.lower()

     def test_start_batch_evaluation_without_authentication(
         self, client, sample_evaluation_config
@@ -758,11 +760,15 @@ def test_get_evaluation_run_trace_info_not_completed(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test requesting trace info for incomplete evaluation returns error."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project.id)
+
         eval_run = EvaluationRun(
             run_name="test_pending_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="pending",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -794,11 +800,15 @@ def test_get_evaluation_run_trace_info_completed(
         create_test_dataset: EvaluationDataset,
     ) -> None:
         """Test requesting trace info for completed evaluation returns cached scores."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project.id)
+
         eval_run = EvaluationRun(
             run_name="test_completed_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="completed",
             total_items=3,
             score={