Added testing setup #17
base: main
Changes from all commits: f503349, 2aaa996, c1c3ff4, e14fb66, c2c81c2, dd3012c, 609e6a9, c81b717, c4d102c, 110caae
.gitignore (adds the new eval output artifacts to the existing ignore list):

```diff
@@ -8,4 +8,6 @@ htmlcov
 .venv
 .DS_Store
 .env
 .env.test
+metrics.json
+predictions.csv
```
app/eval/common/io.py (new file, +14 lines):

```python
from pathlib import Path
import json
import pandas as pd


def write_csv(df: pd.DataFrame, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)


def write_json(obj: dict, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)
```
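For reference, a quick hypothetical usage of these helpers (the DataFrame and output paths below are made up for illustration, not taken from the PR's datasets):

```python
from pathlib import Path
import pandas as pd

from app.eval.common.io import write_csv, write_json

# Toy data for illustration only.
df = pd.DataFrame({"commentText": ["hello", "bad word"], "label": [0, 1]})

# Both helpers create the parent directory if it does not exist yet.
write_csv(df, Path("outputs/example/predictions.csv"))
write_json({"num_samples": len(df)}, Path("outputs/example/metrics.json"))
```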
app/eval/common/metrics.py (new file, +19 lines):

```python
def compute_binary_metrics(y_true, y_pred):
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred))

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
```
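A small worked example of `compute_binary_metrics` (toy labels, not from the test datasets):

```python
from app.eval.common.metrics import compute_binary_metrics

# Toy example: three positives and two negatives in the gold labels.
y_true = [1, 1, 1, 0, 0]
y_pred = [1, 0, 1, 1, 0]

m = compute_binary_metrics(y_true, y_pred)
# tp=2, fn=1, fp=1, tn=1 -> precision = recall = f1 = 2/3
print(m["precision"], m["recall"], m["f1"])
```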
app/eval/common/profiling.py (new file, +19 lines):

```python
import time
import tracemalloc

class Profiler:
    def __enter__(self):
        self.latencies = []
        tracemalloc.start()
        return self

    def record(self, fn, *args):
        start = time.perf_counter()
        result = fn(*args)
        self.latencies.append((time.perf_counter() - start) * 1000)
        return result

    def __exit__(self, *args):
        _, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        self.peak_memory_mb = peak / (1024 * 1024)
```
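A minimal usage sketch of `Profiler` outside the eval scripts (the profiled function here is a stand-in, not part of the PR):

```python
import time

from app.eval.common.profiling import Profiler

def slow_add(a, b):
    time.sleep(0.01)  # stand-in for a validator call
    return a + b

with Profiler() as p:
    for i in range(5):
        p.record(slow_add, i, i)  # returns the wrapped call's result

# Latencies are in milliseconds; peak_memory_mb is set in __exit__.
print(f"mean latency: {sum(p.latencies) / len(p.latencies):.2f} ms")
print(f"peak memory: {p.peak_memory_mb:.3f} MB")
```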
Two additional files have large diffs that are not rendered by default.
Lexical-slur evaluation script (new file, +45 lines):

```python
from pathlib import Path
import pandas as pd
from guardrails.validators import FailResult

from app.core.validators.lexical_slur import LexicalSlur
from app.eval.common.metrics import compute_binary_metrics
from app.eval.common.profiling import Profiler
from app.eval.common.io import write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "lexical_slur"

df = pd.read_csv(BASE_DIR / "datasets" / "lexical_slur_testing_dataset.csv")

validator = LexicalSlur()

with Profiler() as p:
    df["result"] = df["commentText"].astype(str).apply(
        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
    )

df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
df["y_true"] = df["label"]

metrics = compute_binary_metrics(df["y_true"], df["y_pred"])

# ---- Save outputs ----
write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "lexical_slur",
        "num_samples": len(df),
        "metrics": metrics,
        "performance": {
            "latency_ms": {
                "mean": sum(p.latencies) / len(p.latencies),
                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
                "max": max(p.latencies),
            },
            "memory_mb": p.peak_memory_mb,
        },
    },
    OUT_DIR / "metrics.json",
)
```

Review comment on lines +35 to +40 — Guard latency stats for empty datasets. If the dataset is empty, `p.latencies` is empty, so the mean divides by zero (and the p95 index fails). Proposed guard:

```diff
+latencies = p.latencies
+if latencies:
+    latency_stats = {
+        "mean": sum(latencies) / len(latencies),
+        "p95": sorted(latencies)[int(len(latencies) * 0.95)],
+        "max": max(latencies),
+    }
+else:
+    latency_stats = {"mean": 0.0, "p95": 0.0, "max": 0.0}
+
 write_json(
     {
         "guardrail": "lexical_slur",
         "num_samples": len(df),
         "metrics": metrics,
         "performance": {
             "latency_ms": {
-                "mean": sum(p.latencies) / len(p.latencies),
-                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
-                "max": max(p.latencies),
+                **latency_stats,
             },
             "memory_mb": p.peak_memory_mb,
         },
     },
     OUT_DIR / "metrics.json",
 )
```
backend/app/eval/pii/entity_metrics.py (new file, +83 lines):

```python
import re
from collections import defaultdict
from typing import Iterable, Dict, Set

# Matches placeholders like [PHONE_NUMBER], <IN_PAN>, etc.
ENTITY_PATTERN = re.compile(r"[\[<]([A-Z0-9_]+)[\]>]")


def extract_entities(text: str) -> Set[str]:
    """
    Extract entity labels from a masked/anonymized string.
    Examples:
        "Call me at [PHONE_NUMBER]" -> {"PHONE_NUMBER"}
        "<IN_PAN> <PHONE_NUMBER>" -> {"IN_PAN", "PHONE_NUMBER"}
    """
    if not isinstance(text, str):
        return set()
    return set(ENTITY_PATTERN.findall(text))


def compare_entities(gold: Set[str], pred: Set[str]):
    """
    Compare gold vs predicted entity sets.
    """
    tp = gold & pred  # correctly detected
    fn = gold - pred  # missed entities
    fp = pred - gold  # hallucinated entities
    return tp, fp, fn


def compute_entity_metrics(
    gold_texts: Iterable[str],
    pred_texts: Iterable[str],
) -> Dict[str, dict]:
    """
    Compute per-entity TP / FP / FN counts across the dataset.
    """
    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    for gold_txt, pred_txt in zip(gold_texts, pred_texts):
        gold_entities = extract_entities(gold_txt)
        pred_entities = extract_entities(pred_txt)

        tp, fp, fn = compare_entities(gold_entities, pred_entities)

        for e in tp:
            stats[e]["tp"] += 1
        for e in fp:
            stats[e]["fp"] += 1
        for e in fn:
            stats[e]["fn"] += 1

    return finalize_entity_metrics(stats)


def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]:
    """
    Convert raw counts into precision / recall / F1 per entity.
    """
    report = {}

    for entity, s in stats.items():
        tp, fp, fn = s["tp"], s["fp"], s["fn"]

        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall)
            else 0.0
        )

        report[entity] = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    return report
```

Review comment on line +41 — Add `strict=True` to the `zip()` call. Ruff flags it as B905 (`zip()` without an explicit `strict=` parameter); since the project requires Python ≥ 3.10, a length mismatch between `gold_texts` and `pred_texts` can raise immediately instead of silently truncating the comparison. Proposed fix:

```diff
-    for gold_txt, pred_txt in zip(gold_texts, pred_texts):
+    for gold_txt, pred_txt in zip(gold_texts, pred_texts, strict=True):
```
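Separately from the `strict=` note, a quick illustration of how the entity matching behaves (the masked strings below are hypothetical):

```python
from app.eval.pii.entity_metrics import extract_entities, compute_entity_metrics

gold = ["Call me at [PHONE_NUMBER]", "<IN_PAN> issued to [PERSON]"]
pred = ["Call me at [PHONE_NUMBER]", "<IN_PAN> issued to John"]  # PERSON missed

print(extract_entities(gold[1]))  # {'IN_PAN', 'PERSON'}

report = compute_entity_metrics(gold, pred)
# PHONE_NUMBER and IN_PAN each get tp=1; PERSON gets fn=1, so its recall is 0.0
print(report["PERSON"]["recall"])
```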
backend/app/eval/pii/run.py (new file, +39 lines):

```python
from pathlib import Path
import pandas as pd
from guardrails.validators import FailResult

from app.core.validators.pii_remover import PIIRemover
from app.eval.pii.entity_metrics import compute_entity_metrics
from app.eval.common.io import write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "pii_remover"

df = pd.read_csv(BASE_DIR / "datasets" / "pii_detection_testing_dataset.csv")

validator = PIIRemover()

def run_pii(text: str) -> str:
    result = validator._validate(text)
    if isinstance(result, FailResult):
        return result.fix_value
    return text

df["anonymized"] = df["source_text"].astype(str).apply(run_pii)

entity_report = compute_entity_metrics(
    df["target_text"],
    df["anonymized"],
)

# ---- Save outputs ----
write_csv(df, OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "pii_remover",
        "num_samples": len(df),
        "entity_metrics": entity_report,
    },
    OUT_DIR / "metrics.json",
)
```

Review comment on lines +29 to +31 — Avoid exporting raw PII in predictions artifacts. Proposed safer export:

```diff
-# ---- Save outputs ----
-write_csv(df, OUT_DIR / "predictions.csv")
+# ---- Save outputs ----
+# Avoid exporting raw PII by default
+write_csv(df.drop(columns=["source_text"]), OUT_DIR / "predictions.csv")
```
Review comment on `compute_binary_metrics` (app/eval/common/metrics.py) — Add `strict=True` to the `zip()` calls to prevent silent truncation. The `zip()` calls on lines 2–5 will silently drop items if `y_true` and `y_pred` have different lengths, producing incorrect metrics. Since the project requires Python ≥ 3.10, use `strict=True` to raise a `ValueError` immediately on a length mismatch. Ruff flags each of these calls as B905 (`zip()` without an explicit `strict=` parameter).
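The committable suggestion for this comment is not rendered above. A sketch of the change the reviewer describes (adding `strict=True` to each of the four `zip()` calls; this is an illustration, not the exact committed patch) would look like:

```python
def compute_binary_metrics(y_true, y_pred):
    # strict=True (Python 3.10+) raises ValueError on a length mismatch
    # instead of silently truncating to the shorter input.
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
```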