fix(ml): integrate Cleanlab to filter noisy SOC labels and prevent model poisoning
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -29,6 +29,13 @@ if TORCH_AVAILABLE:
|
|||||||
|
|
||||||
if XGB_AVAILABLE:
|
if XGB_AVAILABLE:
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
|
from sklearn.model_selection import cross_val_predict
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cleanlab.filter import find_label_issues
|
||||||
|
CLEANLAB_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
CLEANLAB_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
# ─── Caches de modèles ─────────────────────────────────────────────────────
|
# ─── Caches de modèles ─────────────────────────────────────────────────────
|
||||||
@ -285,6 +292,35 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
||||||
|
|
||||||
|
# ── Cleanlab: filter noisy SOC labels before the final fit ──────────
# Best-effort step: any failure is logged and the raw labels are kept,
# so training proceeds exactly as it would without Cleanlab.
if CLEANLAB_AVAILABLE:
    try:
        # Throwaway, shallow model used only to obtain out-of-fold class
        # probabilities for every sample (3-fold cross-validation).
        quick_model = xgb.XGBClassifier(
            n_estimators=80, max_depth=4, learning_rate=0.15,
            eval_metric='logloss', random_state=42, n_jobs=-1,
            tree_method='hist',
        )
        pred_probs = cross_val_predict(
            quick_model, X, y, cv=3, method='predict_proba',
        )
        # cleanlab.filter.find_label_issues returns a boolean mask over
        # the samples by default (True = suspected label issue). It is
        # NOT a DataFrame: indexing it with 'is_label_issue' raises
        # IndexError, which previously sent every run to the except
        # branch and silently disabled the filtering.
        issue_mask = np.asarray(
            find_label_issues(labels=np.asarray(y), pred_probs=pred_probs),
            dtype=bool,
        )
        n_total = len(issue_mask)
        n_noisy = int(issue_mask.sum())
        if n_noisy > 0:
            keep = ~issue_mask
            if np.unique(np.asarray(y)[keep]).size < 2:
                # Dropping the flagged samples would leave a single
                # class, which the final classifier cannot learn from;
                # keep the raw labels instead.
                log_info(
                    f"[XGB][{name}] Cleanlab : filtrage ignoré, "
                    f"une seule classe restante"
                )
            else:
                X, y = X[keep], y[keep]
                pct = n_noisy / n_total * 100
                log_info(
                    f"[XGB][{name}] Cleanlab : {n_noisy}/{n_total} "
                    f"labels bruyants supprimés ({pct:.1f}%)"
                )
                # Class balance changed after filtering: recompute the
                # negative/positive ratio used as scale_pos_weight.
                scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
    except Exception as exc:
        log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
|
||||||
|
|
||||||
model = xgb.XGBClassifier(
|
model = xgb.XGBClassifier(
|
||||||
n_estimators=200, max_depth=6, learning_rate=0.1,
|
n_estimators=200, max_depth=6, learning_rate=0.1,
|
||||||
scale_pos_weight=scale_pos, eval_metric='logloss',
|
scale_pos_weight=scale_pos, eval_metric='logloss',
|
||||||
|
|||||||
@ -7,5 +7,6 @@ hdbscan>=0.8.38
|
|||||||
isotree>=0.6.1
|
isotree>=0.6.1
|
||||||
torch>=2.0
|
torch>=2.0
|
||||||
xgboost>=2.0
|
xgboost>=2.0
|
||||||
|
cleanlab>=2.6
|
||||||
pyyaml>=6.0
|
pyyaml>=6.0
|
||||||
ja4-common @ file:///app/shared/ja4_common
|
ja4-common @ file:///app/shared/ja4_common
|
||||||
|
|||||||
Reference in New Issue
Block a user