Added testing setup #17
base: main
Changes from all commits: f503349, 2aaa996, c1c3ff4, e14fb66, c2c81c2, dd3012c, 609e6a9, c81b717, c4d102c, 110caae
.gitignore (adds the new eval output artifacts to the existing ignore list):

```diff
@@ -8,4 +8,6 @@ htmlcov
 .venv
 .DS_Store
 .env
 .env.test
+metrics.json
+predictions.csv
```
app/eval/common/io.py (new file, +14 lines):

```python
from pathlib import Path
import json
import pandas as pd


def write_csv(df: pd.DataFrame, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)


def write_json(obj: dict, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)
```
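For reference, a quick hypothetical usage of these helpers (the DataFrame and output paths below are made up for illustration, not taken from the PR's datasets):

```python
from pathlib import Path
import pandas as pd

from app.eval.common.io import write_csv, write_json

# Toy data for illustration only.
df = pd.DataFrame({"commentText": ["hello", "bad word"], "label": [0, 1]})

# Both helpers create the parent directory if it does not exist yet.
write_csv(df, Path("outputs/example/predictions.csv"))
write_json({"num_samples": len(df)}, Path("outputs/example/metrics.json"))
```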
app/eval/common/metrics.py (new file, +19 lines):

```python
def compute_binary_metrics(y_true, y_pred):
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred))

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
```
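A small worked example of `compute_binary_metrics` (toy labels, not from the test datasets):

```python
from app.eval.common.metrics import compute_binary_metrics

# Toy example: three positives and two negatives in the gold labels.
y_true = [1, 1, 1, 0, 0]
y_pred = [1, 0, 1, 1, 0]

m = compute_binary_metrics(y_true, y_pred)
# tp=2, fn=1, fp=1, tn=1 -> precision = recall = f1 = 2/3
print(m["precision"], m["recall"], m["f1"])
```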
app/eval/common/profiling.py (new file, +19 lines):

```python
import time
import tracemalloc

class Profiler:
    def __enter__(self):
        self.latencies = []
        tracemalloc.start()
        return self

    def record(self, fn, *args):
        start = time.perf_counter()
        result = fn(*args)
        self.latencies.append((time.perf_counter() - start) * 1000)
        return result

    def __exit__(self, *args):
        _, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        self.peak_memory_mb = peak / (1024 * 1024)
```
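A minimal usage sketch of `Profiler` outside the eval scripts (the profiled function here is a stand-in, not part of the PR):

```python
import time

from app.eval.common.profiling import Profiler

def slow_add(a, b):
    time.sleep(0.01)  # stand-in for a validator call
    return a + b

with Profiler() as p:
    for i in range(5):
        p.record(slow_add, i, i)  # returns the wrapped call's result

# Latencies are in milliseconds; peak_memory_mb is set in __exit__.
print(f"mean latency: {sum(p.latencies) / len(p.latencies):.2f} ms")
print(f"peak memory: {p.peak_memory_mb:.3f} MB")
```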
Two additional files have large diffs that are not rendered by default.
Lexical-slur evaluation script (new file, +45 lines):

```python
from pathlib import Path
import pandas as pd
from guardrails.validators import FailResult

from app.core.validators.lexical_slur import LexicalSlur
from app.eval.common.metrics import compute_binary_metrics
from app.eval.common.profiling import Profiler
from app.eval.common.io import write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "lexical_slur"

df = pd.read_csv(BASE_DIR / "datasets" / "lexical_slur_testing_dataset.csv")

validator = LexicalSlur()

with Profiler() as p:
    df["result"] = df["commentText"].astype(str).apply(
        lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)
    )

df["y_pred"] = df["result"].apply(lambda r: int(isinstance(r, FailResult)))
df["y_true"] = df["label"]

metrics = compute_binary_metrics(df["y_true"], df["y_pred"])

# ---- Save outputs ----
write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "lexical_slur",
        "num_samples": len(df),
        "metrics": metrics,
        "performance": {
            "latency_ms": {
                "mean": sum(p.latencies) / len(p.latencies),
                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
                "max": max(p.latencies),
            },
            "memory_mb": p.peak_memory_mb,
        },
    },
    OUT_DIR / "metrics.json",
)
```

Review comment on lines +35 to +40 — Guard latency stats for empty datasets. If the dataset is empty, `p.latencies` is empty, so the mean divides by zero (and the p95 index fails). Proposed guard:

```diff
+latencies = p.latencies
+if latencies:
+    latency_stats = {
+        "mean": sum(latencies) / len(latencies),
+        "p95": sorted(latencies)[int(len(latencies) * 0.95)],
+        "max": max(latencies),
+    }
+else:
+    latency_stats = {"mean": 0.0, "p95": 0.0, "max": 0.0}
+
 write_json(
     {
         "guardrail": "lexical_slur",
         "num_samples": len(df),
         "metrics": metrics,
         "performance": {
             "latency_ms": {
-                "mean": sum(p.latencies) / len(p.latencies),
-                "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
-                "max": max(p.latencies),
+                **latency_stats,
             },
             "memory_mb": p.peak_memory_mb,
         },
     },
     OUT_DIR / "metrics.json",
 )
```
backend/app/eval/pii/entity_metrics.py (new file, +83 lines):

```python
import re
from collections import defaultdict
from typing import Iterable, Dict, Set

# Matches placeholders like [PHONE_NUMBER], <IN_PAN>, etc.
ENTITY_PATTERN = re.compile(r"[\[<]([A-Z0-9_]+)[\]>]")


def extract_entities(text: str) -> Set[str]:
    """
    Extract entity labels from a masked/anonymized string.
    Examples:
        "Call me at [PHONE_NUMBER]" -> {"PHONE_NUMBER"}
        "<IN_PAN> <PHONE_NUMBER>" -> {"IN_PAN", "PHONE_NUMBER"}
    """
    if not isinstance(text, str):
        return set()
    return set(ENTITY_PATTERN.findall(text))


def compare_entities(gold: Set[str], pred: Set[str]):
    """
    Compare gold vs predicted entity sets.
    """
    tp = gold & pred  # correctly detected
    fn = gold - pred  # missed entities
    fp = pred - gold  # hallucinated entities
    return tp, fp, fn


def compute_entity_metrics(
    gold_texts: Iterable[str],
    pred_texts: Iterable[str],
) -> Dict[str, dict]:
    """
    Compute per-entity TP / FP / FN counts across the dataset.
    """
    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    for gold_txt, pred_txt in zip(gold_texts, pred_texts):
        gold_entities = extract_entities(gold_txt)
        pred_entities = extract_entities(pred_txt)

        tp, fp, fn = compare_entities(gold_entities, pred_entities)

        for e in tp:
            stats[e]["tp"] += 1
        for e in fp:
            stats[e]["fp"] += 1
        for e in fn:
            stats[e]["fn"] += 1

    return finalize_entity_metrics(stats)


def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]:
    """
    Convert raw counts into precision / recall / F1 per entity.
    """
    report = {}

    for entity, s in stats.items():
        tp, fp, fn = s["tp"], s["fp"], s["fn"]

        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall)
            else 0.0
        )

        report[entity] = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    return report
```

Review comment on line +41 — Add `strict=True` to the `zip()` call. Ruff flags it as B905 (`zip()` without an explicit `strict=` parameter); since the project requires Python ≥ 3.10, a length mismatch between `gold_texts` and `pred_texts` can raise immediately instead of silently truncating the comparison. Proposed fix:

```diff
-    for gold_txt, pred_txt in zip(gold_texts, pred_texts):
+    for gold_txt, pred_txt in zip(gold_texts, pred_texts, strict=True):
```
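Separately from the `strict=` note, a quick illustration of how the entity matching behaves (the masked strings below are hypothetical):

```python
from app.eval.pii.entity_metrics import extract_entities, compute_entity_metrics

gold = ["Call me at [PHONE_NUMBER]", "<IN_PAN> issued to [PERSON]"]
pred = ["Call me at [PHONE_NUMBER]", "<IN_PAN> issued to John"]  # PERSON missed

print(extract_entities(gold[1]))  # {'IN_PAN', 'PERSON'}

report = compute_entity_metrics(gold, pred)
# PHONE_NUMBER and IN_PAN each get tp=1; PERSON gets fn=1, so its recall is 0.0
print(report["PERSON"]["recall"])
```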
backend/app/eval/pii/run.py (new file, +39 lines):

```python
from pathlib import Path
import pandas as pd
from guardrails.validators import FailResult

from app.core.validators.pii_remover import PIIRemover
from app.eval.pii.entity_metrics import compute_entity_metrics
from app.eval.common.io import write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "pii_remover"

df = pd.read_csv(BASE_DIR / "datasets" / "pii_detection_testing_dataset.csv")

validator = PIIRemover()

def run_pii(text: str) -> str:
    result = validator._validate(text)
    if isinstance(result, FailResult):
        return result.fix_value
    return text

df["anonymized"] = df["source_text"].astype(str).apply(run_pii)

entity_report = compute_entity_metrics(
    df["target_text"],
    df["anonymized"],
)

# ---- Save outputs ----
write_csv(df, OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "pii_remover",
        "num_samples": len(df),
        "entity_metrics": entity_report,
    },
    OUT_DIR / "metrics.json",
)
```

Review comment on lines +29 to +31 — Avoid exporting raw PII in predictions artifacts. Proposed safer export:

```diff
-# ---- Save outputs ----
-write_csv(df, OUT_DIR / "predictions.csv")
+# ---- Save outputs ----
+# Avoid exporting raw PII by default
+write_csv(df.drop(columns=["source_text"]), OUT_DIR / "predictions.csv")
```
Review comment on `compute_binary_metrics` (app/eval/common/metrics.py) — Add `strict=True` to the `zip()` calls to prevent silent truncation. The `zip()` calls on lines 2–5 will silently drop items if `y_true` and `y_pred` have different lengths, producing incorrect metrics. Since the project requires Python ≥ 3.10, use `strict=True` to raise a `ValueError` immediately on a length mismatch. Ruff flags each of these calls as B905 (`zip()` without an explicit `strict=` parameter).
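The committable suggestion for this comment is not rendered above. A sketch of the change the reviewer describes (adding `strict=True` to each of the four `zip()` calls; this is an illustration, not the exact committed patch) would look like:

```python
def compute_binary_metrics(y_true, y_pred):
    # strict=True (Python 3.10+) raises ValueError on a length mismatch
    # instead of silently truncating to the shorter input.
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
```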