diff --git a/services/bot-detector/bot_detector/models.py b/services/bot-detector/bot_detector/models.py
index 59f891e..0591148 100644
--- a/services/bot-detector/bot_detector/models.py
+++ b/services/bot-detector/bot_detector/models.py
@@ -29,6 +29,14 @@ if TORCH_AVAILABLE:
 
 if XGB_AVAILABLE:
     import xgboost as xgb
+    from sklearn.model_selection import cross_val_predict
+
+# cleanlab is optional: label-noise filtering is skipped when it is missing.
+try:
+    from cleanlab.filter import find_label_issues
+    CLEANLAB_AVAILABLE = True
+except ImportError:
+    CLEANLAB_AVAILABLE = False
 
 # ─── Caches de modèles ─────────────────────────────────────────────────────
 
@@ -285,6 +293,41 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
         return None, None
 
     scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
+
+    # ── Cleanlab: filter out noisy SOC labels ───────────────────────────
+    # Best-effort step: on any failure the raw labels are kept as they are.
+    if CLEANLAB_AVAILABLE:
+        try:
+            quick_model = xgb.XGBClassifier(
+                n_estimators=80, max_depth=4, learning_rate=0.15,
+                eval_metric='logloss', random_state=42, n_jobs=-1,
+                tree_method='hist',
+            )
+            # Out-of-fold class probabilities from a small, fast model.
+            pred_probs = cross_val_predict(
+                quick_model, X, y, cv=3, method='predict_proba',
+            )
+            # With default arguments find_label_issues() returns a boolean
+            # mask over the samples (True = suspected label issue), not a
+            # DataFrame, so the result is used directly as a mask.
+            issue_mask = np.asarray(
+                find_label_issues(labels=y, pred_probs=pred_probs),
+                dtype=bool,
+            )
+            keep = ~issue_mask
+            n_noisy = int(issue_mask.sum())
+            # Drop samples only if both classes survive the filtering.
+            if n_noisy > 0 and len(np.unique(y[keep])) > 1:
+                X, y = X[keep], y[keep]
+                pct = n_noisy / len(keep) * 100
+                log_info(
+                    f"[XGB][{name}] Cleanlab : {n_noisy}/{len(keep)} "
+                    f"labels bruyants supprimés ({pct:.1f}%)"
+                )
+                scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
+        except Exception as exc:
+            log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
+
     model = xgb.XGBClassifier(
         n_estimators=200, max_depth=6, learning_rate=0.1,
         scale_pos_weight=scale_pos, eval_metric='logloss',
diff --git a/services/bot-detector/bot_detector/requirements.txt b/services/bot-detector/bot_detector/requirements.txt
index 80af210..4aa5a0b 100644
--- a/services/bot-detector/bot_detector/requirements.txt
+++ b/services/bot-detector/bot_detector/requirements.txt
@@ -7,5 +7,6 @@ hdbscan>=0.8.38
 isotree>=0.6.1
 torch>=2.0
 xgboost>=2.0
+cleanlab>=2.6
 pyyaml>=6.0
 ja4-common @ file:///app/shared/ja4_common