feat(ml): replace logistic regression with MLP fusion, KS drift detection with ADWIN, and batch retraining with online learning

Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP
(Linear(3,16)->BN->ReLU->Dropout->Linear(16,1)->Sigmoid) for non-linear
fusion of EIF, NF, and XGBoost scores. Replace KS-test + quantile digest
drift detection with ADWIN (adaptive sliding window, Hoeffding bound).
Replace weekly XGBoost batch retraining with River HoeffdingAdaptiveTree
for incremental online learning (learn_one per cycle); the XGBoost
predict_proba scoring path is kept as a fallback when the loaded model
is not a River model. Update all thesis documentation sections (2.4.2c,
2.4.3, 3.8, discussion, conclusion).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-13 16:32:34 +02:00
parent c6cb12981c
commit 7894d39f1c
12 changed files with 502 additions and 306 deletions

View File

@ -138,7 +138,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
unknown_traffic['raw_anomaly_score'] = raw_scores
unknown_traffic['model_name'] = name
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
# Modèle supervisé — troisième voix (Hoeffding Adaptive Tree ou XGBoost fallback)
unknown_traffic['xgb_prob'] = 0.0
xgb_model_ref = None # Référence pour SHAP TreeExplainer (§2.4.5)
if XGB_AVAILABLE and XGB_WEIGHT > 0:
@ -146,35 +146,36 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
xgb_client = get_client()
xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
if xgb_model is not None and xgb_feats is not None:
# XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue)
xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
# River HAT utilise predict_proba_many(DataFrame), XGBoost utilise predict_proba(ndarray)
if hasattr(xgb_model, 'predict_proba_many'):
proba_df = xgb_model.predict_proba_many(X_xgb[xgb_cols])
xgb_probs = proba_df[1].values if 1 in proba_df.columns else np.zeros(len(X_xgb))
else:
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
unknown_traffic['xgb_prob'] = xgb_probs
xgb_model_ref = xgb_model
log_info(f"[{name}] XGBoost : xgb_mean={xgb_probs.mean():.4f}")
log_info(f"[{name}] Supervisé : score moyen={xgb_probs.mean():.4f}")
except Exception as exc:
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
log_info(f"[{name}] Supervisé scoring échoué : {exc} — EIF+NF seuls.")
# §8 — Score final via MetaLearner (ou poids fixes en fallback)
# §8 — Score final via MetaLearner MLP (ou poids fixes en fallback)
meta_learner = get_meta_learner(name)
eif_norm_arr = unknown_traffic['anomaly_score'].values.copy()
ae_norm_arr = normalize_scores(-unknown_traffic['ae_recon_error'].values)
xgb_prob_arr = unknown_traffic['xgb_prob'].values
hits_arr = unknown_traffic.get('hits', pd.Series(1, index=unknown_traffic.index)).values
corr_arr = unknown_traffic.get('correlated', pd.Series(0, index=unknown_traffic.index)).values
final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr,
hits_arr, corr_arr)
final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr)
unknown_traffic['anomaly_score'] = final_scores
if meta_learner.is_trained:
log_info(
f"[{name}] §8 MetaLearner actif ({meta_learner._n_samples} labels) — "
f"[{name}] §8 MetaFusionMLP actif ({meta_learner._n_samples} labels) — "
f"score moyen={final_scores.mean():.4f}"
)
elif unknown_traffic['xgb_prob'].mean() > 0:
log_info(f"[{name}] §8 Poids fixes EIF+AE+XGB (MetaLearner pas encore entraîné).")
log_info(f"[{name}] §8 Poids fixes EIF+NF+XGB (MetaFusionMLP pas encore entraîné).")
# §8 — Entraînement du MetaLearner sur les labels du cycle courant
# (accumulation progressive — activation dès MIN_SAMPLES labels)