fix(ml): integrate Cleanlab to filter noisy SOC labels and prevent model poisoning

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-13 02:11:25 +02:00
parent c60ce97f23
commit 9d27abf43c
2 changed files with 37 additions and 0 deletions

View File

@ -29,6 +29,13 @@ if TORCH_AVAILABLE:
if XGB_AVAILABLE:
import xgboost as xgb
from sklearn.model_selection import cross_val_predict
# Cleanlab is treated as an optional dependency: when the import fails,
# CLEANLAB_AVAILABLE is set to False instead of raising, and code that
# checks this flag skips the label-noise filtering step.
try:
from cleanlab.filter import find_label_issues
CLEANLAB_AVAILABLE = True
except ImportError:
CLEANLAB_AVAILABLE = False
# ─── Caches de modèles ─────────────────────────────────────────────────────
@ -285,6 +292,35 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
return None, None
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
# ── Cleanlab: filter noisy SOC labels before the final fit ──────────
# Best-effort step: if anything below fails, the error is logged and
# training continues on the raw, unfiltered labels.
if CLEANLAB_AVAILABLE:
    try:
        # Small, fast model used only to produce out-of-fold class
        # probabilities for Cleanlab; it is discarded afterwards.
        quick_model = xgb.XGBClassifier(
            n_estimators=80, max_depth=4, learning_rate=0.15,
            eval_metric='logloss', random_state=42, n_jobs=-1,
            tree_method='hist',
        )
        pred_probs = cross_val_predict(
            quick_model, X, y, cv=3, method='predict_proba',
        )
        # cleanlab.filter.find_label_issues returns a boolean mask with
        # one entry per sample (True = suspected label issue) when
        # return_indices_ranked_by is not given. It is NOT a DataFrame,
        # so it must not be indexed by an 'is_label_issue' column.
        noisy_mask = np.asarray(
            find_label_issues(labels=y, pred_probs=pred_probs),
            dtype=bool,
        )
        n_total = len(noisy_mask)
        n_noisy = int(noisy_mask.sum())
        if n_noisy > 0:
            keep = ~noisy_mask
            X, y = X[keep], y[keep]
            pct = n_noisy / n_total * 100
            log_info(
                f"[XGB][{name}] Cleanlab : {n_noisy}/{n_total} "
                f"labels bruyants supprimés ({pct:.1f}%)"
            )
            # The class balance changed after dropping samples, so the
            # positive-class weight is recomputed on the filtered labels.
            scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
    except Exception as exc:
        log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
model = xgb.XGBClassifier(
n_estimators=200, max_depth=6, learning_rate=0.1,
scale_pos_weight=scale_pos, eval_metric='logloss',

View File

@ -7,5 +7,6 @@ hdbscan>=0.8.38
isotree>=0.6.1
torch>=2.0
xgboost>=2.0
cleanlab>=2.6
pyyaml>=6.0
ja4-common @ file:///app/shared/ja4_common