fix(ml): integrate Cleanlab to filter noisy SOC labels and prevent model poisoning
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -29,6 +29,13 @@ if TORCH_AVAILABLE:
|
||||
|
||||
if XGB_AVAILABLE:
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import cross_val_predict
|
||||
|
||||
try:
|
||||
from cleanlab.filter import find_label_issues
|
||||
CLEANLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
CLEANLAB_AVAILABLE = False
|
||||
|
||||
|
||||
# ─── Caches de modèles ─────────────────────────────────────────────────────
|
||||
@ -285,6 +292,35 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
||||
return None, None
|
||||
|
||||
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
||||
|
||||
# ── Cleanlab: drop noisy SOC labels before the final fit ────────────
# Best-effort step: any failure below leaves X, y and scale_pos untouched.
if CLEANLAB_AVAILABLE:
    try:
        # Small, fast model used only to obtain out-of-fold probabilities.
        quick_model = xgb.XGBClassifier(
            n_estimators=80, max_depth=4, learning_rate=0.15,
            eval_metric='logloss', random_state=42, n_jobs=-1,
            tree_method='hist',
        )
        pred_probs = cross_val_predict(
            quick_model, X, y, cv=3, method='predict_proba',
        )
        # With return_indices_ranked_by left unset, find_label_issues
        # returns a boolean mask over the samples (True = suspected label
        # issue), not a DataFrame. Indexing it with 'is_label_issue' raised
        # on every call and the except below hid it, so nothing was filtered.
        issue_mask = np.asarray(
            find_label_issues(labels=y, pred_probs=pred_probs),
            dtype=bool,
        )
        n_total = len(issue_mask)
        n_noisy = int(issue_mask.sum())
        if n_noisy > 0:
            keep = ~issue_mask
            X, y = X[keep], y[keep]
            pct = n_noisy / n_total * 100
            log_info(
                f"[XGB][{name}] Cleanlab : {n_noisy}/{n_total} "
                f"labels bruyants supprimés ({pct:.1f}%)"
            )
            # Class balance changed after filtering; recompute the weight.
            scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
    except Exception as exc:
        # Deliberate fallback: train on the raw labels rather than abort.
        log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
|
||||
|
||||
model = xgb.XGBClassifier(
|
||||
n_estimators=200, max_depth=6, learning_rate=0.1,
|
||||
scale_pos_weight=scale_pos, eval_metric='logloss',
|
||||
|
||||
@ -7,5 +7,6 @@ hdbscan>=0.8.38
|
||||
isotree>=0.6.1
|
||||
torch>=2.0
|
||||
xgboost>=2.0
|
||||
cleanlab>=2.6
|
||||
pyyaml>=6.0
|
||||
ja4-common @ file:///app/shared/ja4_common
|
||||
|
||||
Reference in New Issue
Block a user