diff --git a/.gitignore b/.gitignore
index fede9d0..47c77ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ htmlcov
 .venv
 .DS_Store
 .env
-.env.test
\ No newline at end of file
+.env.test
+metrics.json
+predictions.csv
\ No newline at end of file
diff --git a/backend/README.md b/backend/README.md
index 3e4d65c..48d7e05 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -93,6 +93,53 @@
 The tests run with Pytest, modify and add tests to `./backend/tests/`.
 
 If you use GitHub Actions the tests will run automatically.
 
+## Running evaluation tests
+
+We can benchmark validators such as PII Remover and Lexical Slur Detection on curated datasets.
+
+Download the datasets from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). The folder contains one subfolder per validator, each holding a testing dataset in CSV format. Download these CSV files and store them in the `backend/app/evaluation/datasets/` folder. Once the datasets are in place, we can run the evaluation script for each validator.
+
+For lexical slur match, ban list, and gender assumption bias, accuracy testing adds little value because these validators are deterministic. However, we curated a dataset for lexical slur match so it can be reused later for the toxicity detection validator.
+
+Each validator produces:
+- predictions.csv – row-level outputs for debugging and analysis
+- metrics.json – aggregated accuracy + performance metrics
+
+Standardized output structure:
+```
+app/evaluation/outputs/
+  lexical_slur/
+    predictions.csv
+    metrics.json
+  pii_remover/
+    predictions.csv
+    metrics.json
+```
+
+- To evaluate the Lexical Slur validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`
+
+Expected outputs:
+```
+app/evaluation/outputs/lexical_slur/
+├── predictions.csv
+└── metrics.json
+```
+predictions.csv contains row-level inputs, predictions, and labels.
+
+metrics.json contains binary classification metrics and performance stats (latency + peak memory); see the illustrative shapes at the end of this section.
+
+- To evaluate the PII validator, run the PII evaluation script: `python app/evaluation/pii/run.py`
+
+Expected outputs:
+```
+app/evaluation/outputs/pii_remover/
+├── predictions.csv
+└── metrics.json
+```
+predictions.csv contains the original text, the anonymized output, and the ground-truth masked text.
+
+metrics.json contains entity-level precision, recall, and F1 per PII type.
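+
+For reference, the lexical slur `metrics.json` roughly follows the shape below; the keys mirror what `app/evaluation/lexical_slur/run.py` writes, and the values are omitted here rather than shown as if they were real results:
+```
+{
+  "guardrail": "lexical_slur",
+  "num_samples": ...,
+  "metrics": {"tp": ..., "tn": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...},
+  "performance": {
+    "latency_ms": {"mean": ..., "p95": ..., "max": ...},
+    "memory_mb": ...
+  }
+}
+```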
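+
+The PII `metrics.json` roughly follows the shape below; the keys mirror what `app/evaluation/pii/run.py` writes, the entity names depend on the placeholders found in the dataset (PHONE_NUMBER and IN_PAN are only examples), and the values are again omitted:
+```
+{
+  "guardrail": "pii_remover",
+  "num_samples": ...,
+  "entity_metrics": {
+    "PHONE_NUMBER": {"tp": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...},
+    "IN_PAN": {"tp": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...}
+  }
+}
+```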
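+
+To add an evaluation for another validator, you can follow the same pattern and reuse the shared helpers in `app/evaluation/common/helper.py`. A minimal sketch, assuming a hypothetical `MyValidator` class, dataset file, and `text`/`label` columns (none of these exist in the repo):
+```
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.my_validator import MyValidator  # placeholder validator
+from app.evaluation.common.helper import compute_binary_metrics, Profiler, write_csv, write_json
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "my_validator"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "my_validator_testing_dataset.csv")  # placeholder dataset
+validator = MyValidator()
+
+with Profiler() as p:  # records per-call latency and peak memory
+    df["result"] = df["text"].astype(str).apply(
+        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
+    )
+
+df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
+metrics = compute_binary_metrics(df["label"], df["y_pred"])
+
+write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")
+write_json({"guardrail": "my_validator", "num_samples": len(df), "metrics": metrics}, OUT_DIR / "metrics.json")
+```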
+
 ### Test running stack
 
 If your stack is already up and you just want to run the tests, you can use:
diff --git a/backend/app/evaluation/common/helper.py b/backend/app/evaluation/common/helper.py
new file mode 100644
index 0000000..6133fd5
--- /dev/null
+++ b/backend/app/evaluation/common/helper.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+import json
+import pandas as pd
+import time
+import tracemalloc
+
+def write_csv(df: pd.DataFrame, path: Path):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(path, index=False)
+
+
+def write_json(obj: dict, path: Path):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        json.dump(obj, f, indent=2)
+
+def compute_binary_metrics(y_true, y_pred):
+    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
+    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
+    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
+    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
+
+    precision = tp / (tp + fp) if tp + fp else 0.0
+    recall = tp / (tp + fn) if tp + fn else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+
+    return {
+        "tp": tp,
+        "tn": tn,
+        "fp": fp,
+        "fn": fn,
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+    }
+
+
+class Profiler:
+    def __enter__(self):
+        self.latencies = []
+        tracemalloc.start()
+        return self
+
+    def record(self, fn, *args):
+        start = time.perf_counter()
+        result = fn(*args)
+        self.latencies.append((time.perf_counter() - start) * 1000)
+        return result
+
+    def __exit__(self, *args):
+        _, peak = tracemalloc.get_traced_memory()
+        tracemalloc.stop()
+        self.peak_memory_mb = peak / (1024 * 1024)
diff --git a/backend/app/evaluation/lexical_slur/run.py b/backend/app/evaluation/lexical_slur/run.py
new file mode 100644
index 0000000..7cd1b3c
--- /dev/null
+++ b/backend/app/evaluation/lexical_slur/run.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.lexical_slur import LexicalSlur
+from app.evaluation.common.helper import (
+    compute_binary_metrics,
+    Profiler,
+    write_csv,
+    write_json,
+)
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "lexical_slur"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "lexical_slur_testing_dataset.csv")
+
+validator = LexicalSlur()
+
+with Profiler() as p:
+    df["result"] = df["commentText"].astype(str).apply(
+        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
+    )
+
+df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
+df["y_true"] = df["label"]
+
+metrics = compute_binary_metrics(df["y_true"], df["y_pred"])
+
+# ---- Save outputs ----
+write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")
+
+write_json(
+    {
+        "guardrail": "lexical_slur",
+        "num_samples": len(df),
+        "metrics": metrics,
+        "performance": {
+            "latency_ms": {
+                "mean": sum(p.latencies) / len(p.latencies),
+                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
+                "max": max(p.latencies),
+            },
+            "memory_mb": p.peak_memory_mb,
+        },
+    },
+    OUT_DIR / "metrics.json",
+)
diff --git a/backend/app/evaluation/pii/entity_metrics.py b/backend/app/evaluation/pii/entity_metrics.py
new file mode 100644
index 0000000..b56a198
--- /dev/null
+++ b/backend/app/evaluation/pii/entity_metrics.py
@@ -0,0 +1,83 @@
+import re
+from collections import defaultdict
+from typing import Iterable, Dict, Set
+
+# Matches placeholders like [PHONE_NUMBER], <PHONE_NUMBER>, etc.
+ENTITY_PATTERN = re.compile(r"[\[<]([A-Z0-9_]+)[\]>]")
+
+
+def extract_entities(text: str) -> Set[str]:
+    """
+    Extract entity labels from a masked/anonymized string.
+
+    Examples:
+        "Call me at [PHONE_NUMBER]" -> {"PHONE_NUMBER"}
+        "<IN_PAN> <PHONE_NUMBER>" -> {"IN_PAN", "PHONE_NUMBER"}
+    """
+    if not isinstance(text, str):
+        return set()
+    return set(ENTITY_PATTERN.findall(text))
+
+
+def compare_entities(gold: Set[str], pred: Set[str]):
+    """
+    Compare gold vs predicted entity sets.
+    """
+    tp = gold & pred  # correctly detected
+    fn = gold - pred  # missed entities
+    fp = pred - gold  # hallucinated entities
+    return tp, fp, fn
+
+
+def compute_entity_metrics(
+    gold_texts: Iterable[str],
+    pred_texts: Iterable[str],
+) -> Dict[str, dict]:
+    """
+    Compute per-entity TP / FP / FN counts across the dataset.
+    """
+    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
+
+    for gold_txt, pred_txt in zip(gold_texts, pred_texts, strict=True):
+        gold_entities = extract_entities(gold_txt)
+        pred_entities = extract_entities(pred_txt)
+
+        tp, fp, fn = compare_entities(gold_entities, pred_entities)
+
+        for e in tp:
+            stats[e]["tp"] += 1
+        for e in fp:
+            stats[e]["fp"] += 1
+        for e in fn:
+            stats[e]["fn"] += 1
+
+    return finalize_entity_metrics(stats)
+
+
+def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]:
+    """
+    Convert raw counts into precision / recall / F1 per entity.
+    """
+    report = {}
+
+    for entity, s in stats.items():
+        tp, fp, fn = s["tp"], s["fp"], s["fn"]
+
+        precision = tp / (tp + fp) if (tp + fp) else 0.0
+        recall = tp / (tp + fn) if (tp + fn) else 0.0
+        f1 = (
+            2 * precision * recall / (precision + recall)
+            if (precision + recall)
+            else 0.0
+        )
+
+        report[entity] = {
+            "tp": tp,
+            "fp": fp,
+            "fn": fn,
+            "precision": precision,
+            "recall": recall,
+            "f1": f1,
+        }
+
+    return report
diff --git a/backend/app/evaluation/pii/run.py b/backend/app/evaluation/pii/run.py
new file mode 100644
index 0000000..4466479
--- /dev/null
+++ b/backend/app/evaluation/pii/run.py
@@ -0,0 +1,39 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.pii_remover import PIIRemover
+from app.evaluation.pii.entity_metrics import compute_entity_metrics
+from app.evaluation.common.helper import write_csv, write_json
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "pii_remover"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "pii_detection_testing_dataset.csv")
+
+validator = PIIRemover()
+
+def run_pii(text: str) -> str:
+    result = validator._validate(text)
+    if isinstance(result, FailResult):
+        return result.fix_value
+    return text
+
+df["anonymized"] = df["source_text"].astype(str).apply(run_pii)
+
+entity_report = compute_entity_metrics(
+    df["target_text"],
+    df["anonymized"],
+)
+
+# ---- Save outputs ----
+write_csv(df, OUT_DIR / "predictions.csv")
+
+write_json(
+    {
+        "guardrail": "pii_remover",
+        "num_samples": len(df),
+        "entity_metrics": entity_report,
+    },
+    OUT_DIR / "metrics.json",
+)
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 689cb0b..979075e 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     "pandas>=2.3.2",
     "numpy>=1.24.0",
     "python-dotenv<2.0.0,>=1.0.0",
+    "scikit-learn>=1.6.0,<2.0.0",
 ]
 
 [tool.uv]