feat(ml): replace logistic regression with MLP fusion and KS drift with ADWIN online learning

Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP
(Linear(3,16)->BN->ReLU->Dropout->Linear(16,1)->Sigmoid) for non-linear
fusion of EIF, NF, and XGBoost scores. Replace KS-test + quantile digest
drift detection with ADWIN (adaptive sliding window, Hoeffding bound).
Replace weekly XGBoost batch retraining with River HoeffdingAdaptiveTree
for incremental online learning (learn_one per cycle). Update all thesis
documentation sections (2.4.2c, 2.4.3, 3.8, discussion, conclusion).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-13 16:32:34 +02:00
parent c6cb12981c
commit 7894d39f1c
12 changed files with 502 additions and 306 deletions

View File

@ -1,10 +1,11 @@
"""Gestion des modèles : chargement, entraînement, versionnement.
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), Hoeffding Adaptive Tree (River).
"""
import os
import json
import glob
import pickle
import joblib
import numpy as np
import pandas as pd
@ -18,7 +19,7 @@ from .config import (
IsolationForest, StandardScaler,
)
from .log import log_info, log_decision, append_training_history
from .scoring import compute_drift_score
from .scoring import ADWINDriftMonitor
# Imports conditionnels depuis config (déjà importés une seule fois)
if EIF_AVAILABLE:
@ -27,6 +28,12 @@ if EIF_AVAILABLE:
if TORCH_AVAILABLE:
from .config import torch, nn
try:
from river import forest as river_forest
RIVER_AVAILABLE = True
except ImportError:
RIVER_AVAILABLE = False
if XGB_AVAILABLE:
import xgboost as xgb
from sklearn.model_selection import cross_val_predict
@ -36,11 +43,14 @@ if XGB_AVAILABLE:
CLEANLAB_AVAILABLE = True
except ImportError:
CLEANLAB_AVAILABLE = False
else:
CLEANLAB_AVAILABLE = False
# ─── Caches de modèles ─────────────────────────────────────────────────────
_model_cache: dict = {}
_xgb_cache: dict = {}
_drift_monitors: dict[str, ADWINDriftMonitor] = {}
# ═══════════════════════════════════════════════════════════════════════════════
@ -258,9 +268,100 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
"""Charge ou entraîne le modèle XGBoost supervisé.
"""Charge ou met à jour le modèle supervisé en ligne (Hoeffding Adaptive Tree).
Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible.
Remplace le XGBClassifier hebdomadaire par un HoeffdingAdaptiveTreeClassifier
de River, mis à jour incrémentalement à chaque cycle via learn_one().
Retourne (model, list[str] features) ou (None, None) si indisponible.
Le model retourné expose predict_proba_many(df) → DataFrame.
"""
if not (XGB_AVAILABLE or RIVER_AVAILABLE) or XGB_WEIGHT <= 0:
return None, None
model_path = _river_model_path(name)
meta_path = _xgb_meta_path(name)
# Charger le modèle River existant
model = None
xgb_features = features
n_seen = 0
if os.path.exists(model_path):
try:
with open(model_path, 'rb') as f:
model = pickle.load(f)
with open(meta_path) as f:
meta = json.load(f)
xgb_features = meta.get('features', features)
n_seen = meta.get('n_total_labels', 0)
log_info(f"[River][{name}] HAT rechargé ({n_seen} labels cumulés, {len(xgb_features)} features).")
except Exception as exc:
log_info(f"[River][{name}] Erreur chargement : {exc} — nouveau modèle.")
model = None
# Créer un nouveau modèle si nécessaire
if model is None:
try:
model = river_forest.HoeffdingAdaptiveTreeClassifier(
grace_period=50, max_depth=12, seed=42,
)
except Exception:
# Fallback vers XGBoost batch si River indisponible
return _load_or_train_xgb_batch(name, client, features, cycle_id)
# ── Apprentissage incrémental sur les labels du cycle ──────────────
X, y, usable_features = _load_xgb_labels(client, features)
if X is not None and usable_features is not None:
xgb_features = usable_features
X_df = pd.DataFrame(X, columns=xgb_features)
n_new = 0
for i in range(len(X_df)):
try:
x_dict = {col: float(X_df.iloc[i][col]) for col in xgb_features}
model.learn_one(x_dict, int(y[i]))
n_new += 1
except Exception:
continue
n_seen += n_new
# Persister le modèle mis à jour
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
pickle.dump(model, f)
meta = {
'trained_at': datetime.now().isoformat(),
'n_total_labels': n_seen,
'n_new_labels': n_new,
'n_features': len(xgb_features),
'features': xgb_features,
'model_name': name,
'algorithm': 'HoeffdingAdaptiveTreeClassifier',
}
with open(meta_path, 'w') as f:
json.dump(meta, f, indent=2)
log_info(f"[River][{name}] +{n_new} labels incrémentaux ({n_seen} total) — HAT mis à jour.")
log_decision('RIVER_UPDATED', cycle_id, name, meta)
else:
if n_seen == 0:
log_info(f"[River][{name}] Pas de labels — modèle supervisé désactivé ce cycle.")
return None, None
log_info(f"[River][{name}] Pas de nouveaux labels — HAT existant réutilisé ({n_seen} labels).")
return model, xgb_features
def _river_model_path(name: str) -> str:
"""Chemin du modèle River sérialisé."""
return os.path.join(MODEL_DIR, f'river_hat_{name}.pkl')
def _load_or_train_xgb_batch(name, client, features, cycle_id):
"""Fallback : entraîne un XGBoost classique si River est indisponible.
Conservé pour la compatibilité si river n'est pas installé.
Retourne (XGBClassifier, list[str] features) ou (None, None).
"""
if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
return None, None
@ -268,87 +369,36 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
model_path = _xgb_model_path(name)
meta_path = _xgb_meta_path(name)
# Charger le modèle existant si récent
if os.path.exists(model_path) and os.path.exists(meta_path):
try:
with open(meta_path) as f:
meta = json.load(f)
trained_at = datetime.fromisoformat(meta['trained_at'])
age_h = (datetime.now() - trained_at).total_seconds() / 3600
if age_h < XGB_RETRAIN_INTERVAL_H:
model = xgb.XGBClassifier()
model.load_model(model_path)
log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
return model, meta.get('features', features)
except Exception as exc:
log_info(f"[XGB][{name}] Erreur chargement : {exc}")
model = xgb.XGBClassifier()
model.load_model(model_path)
return model, meta.get('features', features)
except Exception:
pass
# Entraîner un nouveau modèle
X, y, xgb_features = _load_xgb_labels(client, features)
if X is None:
log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
# Tenter de réutiliser un modèle ancien
if os.path.exists(model_path) and os.path.exists(meta_path):
try:
model = xgb.XGBClassifier()
model.load_model(model_path)
with open(meta_path) as f:
meta = json.load(f)
return model, meta.get('features', features)
except Exception:
pass
return None, None
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
# ── Cleanlab : filtrage des labels SOC bruyants ─────────────────────
if CLEANLAB_AVAILABLE:
try:
quick_model = xgb.XGBClassifier(
n_estimators=80, max_depth=4, learning_rate=0.15,
eval_metric='logloss', random_state=42, n_jobs=-1,
tree_method='hist',
)
pred_probs = cross_val_predict(
quick_model, X, y, cv=3, method='predict_proba',
)
issues = find_label_issues(
labels=y, pred_probs=pred_probs,
)
noisy_idx = issues[issues['is_label_issue'] == True].index.to_numpy()
if len(noisy_idx) > 0:
keep = np.ones(len(y), dtype=bool)
keep[noisy_idx] = False
X, y = X[keep], y[keep]
pct = len(noisy_idx) / (len(keep)) * 100
log_info(
f"[XGB][{name}] Cleanlab : {len(noisy_idx)}/{len(keep)} "
f"labels bruyants supprimés ({pct:.1f}%)"
)
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
except Exception as exc:
log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
model = xgb.XGBClassifier(
n_estimators=200, max_depth=6, learning_rate=0.1,
scale_pos_weight=scale_pos, eval_metric='logloss',
random_state=42, n_jobs=-1,
tree_method='hist',
random_state=42, n_jobs=-1, tree_method='hist',
)
model.fit(X, y, verbose=False)
model.save_model(model_path)
meta = {
'trained_at': datetime.now().isoformat(),
'n_labels': len(y), 'n_positive': int(y.sum()),
'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
'features': xgb_features,
'scale_pos_weight': scale_pos, 'model_name': name,
'features': xgb_features, 'model_name': name,
}
with open(meta_path, 'w') as f:
json.dump(meta, f, indent=2)
log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
log_decision('XGB_TRAINED', cycle_id, name, meta)
return model, xgb_features
@ -369,17 +419,27 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
age_h = (datetime.now() - trained_at).total_seconds() / 3600
age_ok = age_h < RETRAIN_INTERVAL_H
# A1 — Dérive conceptuelle : comparer la distribution actuelle avec celle de l'entraînement
# A1 — Dérive conceptuelle via ADWIN (fenêtre glissante adaptative)
drift_score = 0.0
drift_forced = False
if age_ok and 'baseline_stats' in meta:
drift_score = compute_drift_score(meta['baseline_stats'], human_baseline, features,
name=name, cycle_id=cycle_id)
# Obtenir ou créer le moniteur ADWIN pour ce modèle
if name not in _drift_monitors:
_drift_monitors[name] = ADWINDriftMonitor(features)
drift_monitor = _drift_monitors[name]
if drift_monitor.available:
# Alimenter ADWIN avec les moyennes de features du cycle courant
feature_means = {}
for f in features:
if f in human_baseline.columns:
feature_means[f] = float(human_baseline[f].mean())
drift_score = drift_monitor.check_drift(feature_means, name=name, cycle_id=cycle_id)
if drift_score >= DRIFT_THRESHOLD:
drift_forced = True
log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
log_info(f"[{name}] Dérive ADWIN détectée ({drift_score:.0%} features) — retraining forcé.")
log_decision('DRIFT_DETECTED', cycle_id, name, {
'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
'version_id': meta['version_id'], 'drift_rate': round(drift_score, 3),
'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
})
@ -465,18 +525,12 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
return joblib.load(model_path), ae_prev, meta.get('features', features)
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
# A1/§4 — Sauvegarder les statistiques de distribution avec quantile digest 9 points
# (p5…p95) pour une meilleure fidélité de la détection de dérive KS+KL
# A1 — Statistiques de référence pour la baseline (mean/std uniquement,
# la détection de dérive est assurée par ADWIN en temps réel)
baseline_stats = {
f: {
'mean': float(X[f].mean()), 'std': float(X[f].std()),
'p5': float(X[f].quantile(0.05)),
'p10': float(X[f].quantile(0.10)),
'p25': float(X[f].quantile(0.25)),
'p50': float(X[f].quantile(0.50)),
'p75': float(X[f].quantile(0.75)),
'p90': float(X[f].quantile(0.90)),
'p95': float(X[f].quantile(0.95)),
'mean': float(X[f].mean()),
'std': float(X[f].std()),
}
for f in features
}