diff --git a/.gitignore b/.gitignore
index fede9d0..47c77ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ htmlcov
 .venv
 .DS_Store
 .env
-.env.test
\ No newline at end of file
+.env.test
+metrics.json
+predictions.csv
\ No newline at end of file
diff --git a/backend/README.md b/backend/README.md
index 3e4d65c..48d7e05 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -93,6 +93,53 @@
 The tests run with Pytest, modify and add tests to `./backend/tests/`.
 
 If you use GitHub Actions the tests will run automatically.
 
+## Running evaluation tests
+
+We can benchmark validators such as PII Remover and Lexical Slur Detection on curated datasets.
+
+Download the datasets from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). The folder contains one subfolder per validator, each holding a testing dataset in CSV format. Download these CSV files and store them in the `backend/app/evaluation/datasets/` folder. Once the datasets are in place, we can run the evaluation script for each validator.
+
+For lexical slur match, ban list, and gender assumption bias, accuracy testing adds little value because these validators are deterministic. However, we curated a dataset for lexical slur match so it can be reused later for the toxicity detection validator.
+
+Each validator produces:
+- predictions.csv – row-level outputs for debugging and analysis
+- metrics.json – aggregated accuracy + performance metrics
+
+Standardized output structure:
+```
+app/evaluation/outputs/
+  lexical_slur/
+    predictions.csv
+    metrics.json
+  pii_remover/
+    predictions.csv
+    metrics.json
+```
+
+- To evaluate the Lexical Slur validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`
+
+Expected outputs:
+```
+app/evaluation/outputs/lexical_slur/
+├── predictions.csv
+└── metrics.json
+```
+predictions.csv contains row-level inputs, predictions, and labels.
+
+metrics.json contains binary classification metrics and performance stats (latency + peak memory); see the illustrative shapes at the end of this section.
+
+- To evaluate the PII validator, run the PII evaluation script: `python app/evaluation/pii/run.py`
+
+Expected outputs:
+```
+app/evaluation/outputs/pii_remover/
+├── predictions.csv
+└── metrics.json
+```
+predictions.csv contains the original text, the anonymized output, and the ground-truth masked text.
+
+metrics.json contains entity-level precision, recall, and F1 per PII type.
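+
+For reference, the lexical slur `metrics.json` roughly follows the shape below; the keys mirror what `app/evaluation/lexical_slur/run.py` writes, and the values are omitted here rather than shown as if they were real results:
+```
+{
+  "guardrail": "lexical_slur",
+  "num_samples": ...,
+  "metrics": {"tp": ..., "tn": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...},
+  "performance": {
+    "latency_ms": {"mean": ..., "p95": ..., "max": ...},
+    "memory_mb": ...
+  }
+}
+```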
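+
+The PII `metrics.json` roughly follows the shape below; the keys mirror what `app/evaluation/pii/run.py` writes, the entity names depend on the placeholders found in the dataset (PHONE_NUMBER and IN_PAN are only examples), and the values are again omitted:
+```
+{
+  "guardrail": "pii_remover",
+  "num_samples": ...,
+  "entity_metrics": {
+    "PHONE_NUMBER": {"tp": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...},
+    "IN_PAN": {"tp": ..., "fp": ..., "fn": ..., "precision": ..., "recall": ..., "f1": ...}
+  }
+}
+```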
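+
+To add an evaluation for another validator, you can follow the same pattern and reuse the shared helpers in `app/evaluation/common/helper.py`. A minimal sketch, assuming a hypothetical `MyValidator` class, dataset file, and `text`/`label` columns (none of these exist in the repo):
+```
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.my_validator import MyValidator  # placeholder validator
+from app.evaluation.common.helper import compute_binary_metrics, Profiler, write_csv, write_json
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "my_validator"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "my_validator_testing_dataset.csv")  # placeholder dataset
+validator = MyValidator()
+
+with Profiler() as p:  # records per-call latency and peak memory
+    df["result"] = df["text"].astype(str).apply(
+        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
+    )
+
+df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
+metrics = compute_binary_metrics(df["label"], df["y_pred"])
+
+write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")
+write_json({"guardrail": "my_validator", "num_samples": len(df), "metrics": metrics}, OUT_DIR / "metrics.json")
+```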
+
 ### Test running stack
 
 If your stack is already up and you just want to run the tests, you can use:
diff --git a/backend/app/evaluation/common/helper.py b/backend/app/evaluation/common/helper.py
new file mode 100644
index 0000000..6133fd5
--- /dev/null
+++ b/backend/app/evaluation/common/helper.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+import json
+import pandas as pd
+import time
+import tracemalloc
+
+def write_csv(df: pd.DataFrame, path: Path):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(path, index=False)
+
+
+def write_json(obj: dict, path: Path):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        json.dump(obj, f, indent=2)
+
+def compute_binary_metrics(y_true, y_pred):
+    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
+    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
+    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
+    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
+
+    precision = tp / (tp + fp) if tp + fp else 0.0
+    recall = tp / (tp + fn) if tp + fn else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+
+    return {
+        "tp": tp,
+        "tn": tn,
+        "fp": fp,
+        "fn": fn,
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+    }
+
+
+class Profiler:
+    def __enter__(self):
+        self.latencies = []
+        tracemalloc.start()
+        return self
+
+    def record(self, fn, *args):
+        start = time.perf_counter()
+        result = fn(*args)
+        self.latencies.append((time.perf_counter() - start) * 1000)
+        return result
+
+    def __exit__(self, *args):
+        _, peak = tracemalloc.get_traced_memory()
+        tracemalloc.stop()
+        self.peak_memory_mb = peak / (1024 * 1024)
diff --git a/backend/app/evaluation/lexical_slur/run.py b/backend/app/evaluation/lexical_slur/run.py
new file mode 100644
index 0000000..7cd1b3c
--- /dev/null
+++ b/backend/app/evaluation/lexical_slur/run.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.lexical_slur import LexicalSlur
+from app.evaluation.common.helper import (
+    compute_binary_metrics,
+    Profiler,
+    write_csv,
+    write_json,
+)
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "lexical_slur"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "lexical_slur_testing_dataset.csv")
+
+validator = LexicalSlur()
+
+with Profiler() as p:
+    df["result"] = df["commentText"].astype(str).apply(
+        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
+    )
+
+df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
+df["y_true"] = df["label"]
+
+metrics = compute_binary_metrics(df["y_true"], df["y_pred"])
+
+# ---- Save outputs ----
+write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")
+
+write_json(
+    {
+        "guardrail": "lexical_slur",
+        "num_samples": len(df),
+        "metrics": metrics,
+        "performance": {
+            "latency_ms": {
+                "mean": sum(p.latencies) / len(p.latencies),
+                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
+                "max": max(p.latencies),
+            },
+            "memory_mb": p.peak_memory_mb,
+        },
+    },
+    OUT_DIR / "metrics.json",
+)
diff --git a/backend/app/evaluation/pii/entity_metrics.py b/backend/app/evaluation/pii/entity_metrics.py
new file mode 100644
index 0000000..b56a198
--- /dev/null
+++ b/backend/app/evaluation/pii/entity_metrics.py
@@ -0,0 +1,83 @@
+import re
+from collections import defaultdict
+from typing import Iterable, Dict, Set
+
+# Matches placeholders like [PHONE_NUMBER], <PHONE_NUMBER>, etc.
+ENTITY_PATTERN = re.compile(r"[\[<]([A-Z0-9_]+)[\]>]")
+
+
+def extract_entities(text: str) -> Set[str]:
+    """
+    Extract entity labels from a masked/anonymized string.
+
+    Examples:
+        "Call me at [PHONE_NUMBER]" -> {"PHONE_NUMBER"}
+        "<IN_PAN> <PHONE_NUMBER>" -> {"IN_PAN", "PHONE_NUMBER"}
+    """
+    if not isinstance(text, str):
+        return set()
+    return set(ENTITY_PATTERN.findall(text))
+
+
+def compare_entities(gold: Set[str], pred: Set[str]):
+    """
+    Compare gold vs predicted entity sets.
+    """
+    tp = gold & pred  # correctly detected
+    fn = gold - pred  # missed entities
+    fp = pred - gold  # hallucinated entities
+    return tp, fp, fn
+
+
+def compute_entity_metrics(
+    gold_texts: Iterable[str],
+    pred_texts: Iterable[str],
+) -> Dict[str, dict]:
+    """
+    Compute per-entity TP / FP / FN counts across the dataset.
+    """
+    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
+
+    for gold_txt, pred_txt in zip(gold_texts, pred_texts, strict=True):
+        gold_entities = extract_entities(gold_txt)
+        pred_entities = extract_entities(pred_txt)
+
+        tp, fp, fn = compare_entities(gold_entities, pred_entities)
+
+        for e in tp:
+            stats[e]["tp"] += 1
+        for e in fp:
+            stats[e]["fp"] += 1
+        for e in fn:
+            stats[e]["fn"] += 1
+
+    return finalize_entity_metrics(stats)
+
+
+def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]:
+    """
+    Convert raw counts into precision / recall / F1 per entity.
+    """
+    report = {}
+
+    for entity, s in stats.items():
+        tp, fp, fn = s["tp"], s["fp"], s["fn"]
+
+        precision = tp / (tp + fp) if (tp + fp) else 0.0
+        recall = tp / (tp + fn) if (tp + fn) else 0.0
+        f1 = (
+            2 * precision * recall / (precision + recall)
+            if (precision + recall)
+            else 0.0
+        )
+
+        report[entity] = {
+            "tp": tp,
+            "fp": fp,
+            "fn": fn,
+            "precision": precision,
+            "recall": recall,
+            "f1": f1,
+        }
+
+    return report
diff --git a/backend/app/evaluation/pii/run.py b/backend/app/evaluation/pii/run.py
new file mode 100644
index 0000000..4466479
--- /dev/null
+++ b/backend/app/evaluation/pii/run.py
@@ -0,0 +1,39 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.pii_remover import PIIRemover
+from app.evaluation.pii.entity_metrics import compute_entity_metrics
+from app.evaluation.common.helper import write_csv, write_json
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "pii_remover"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "pii_detection_testing_dataset.csv")
+
+validator = PIIRemover()
+
+def run_pii(text: str) -> str:
+    result = validator._validate(text)
+    if isinstance(result, FailResult):
+        return result.fix_value
+    return text
+
+df["anonymized"] = df["source_text"].astype(str).apply(run_pii)
+
+entity_report = compute_entity_metrics(
+    df["target_text"],
+    df["anonymized"],
+)
+
+# ---- Save outputs ----
+write_csv(df, OUT_DIR / "predictions.csv")
+
+write_json(
+    {
+        "guardrail": "pii_remover",
+        "num_samples": len(df),
+        "entity_metrics": entity_report,
+    },
+    OUT_DIR / "metrics.json",
+)
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 689cb0b..979075e 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     "pandas>=2.3.2",
     "numpy>=1.24.0",
     "python-dotenv<2.0.0,>=1.0.0",
+    "scikit-learn>=1.6.0,<2.0.0",
 ]
 
 [tool.uv]