feat(ml): replace logistic regression with MLP fusion and KS drift with ADWIN online learning
Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP (Linear(3,16)->BN->ReLU->Dropout->Linear(16,1)->Sigmoid) for non-linear fusion of EIF, NF, and XGBoost scores. Replace KS-test + quantile digest drift detection with ADWIN (adaptive sliding window, Hoeffding bound). Replace weekly XGBoost batch retraining with River HoeffdingAdaptiveTree for incremental online learning (learn_one per cycle). Update all thesis documentation sections (2.4.2c, 2.4.3, 3.8, discussion, conclusion). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -1,10 +1,11 @@
|
||||
"""Gestion des modèles : chargement, entraînement, versionnement.
|
||||
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), Hoeffding Adaptive Tree (River).
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import pickle
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -18,7 +19,7 @@ from .config import (
|
||||
IsolationForest, StandardScaler,
|
||||
)
|
||||
from .log import log_info, log_decision, append_training_history
|
||||
from .scoring import compute_drift_score
|
||||
from .scoring import ADWINDriftMonitor
|
||||
|
||||
# Imports conditionnels depuis config (déjà importés une seule fois)
|
||||
if EIF_AVAILABLE:
|
||||
@ -27,6 +28,12 @@ if EIF_AVAILABLE:
|
||||
if TORCH_AVAILABLE:
|
||||
from .config import torch, nn
|
||||
|
||||
try:
|
||||
from river import forest as river_forest
|
||||
RIVER_AVAILABLE = True
|
||||
except ImportError:
|
||||
RIVER_AVAILABLE = False
|
||||
|
||||
if XGB_AVAILABLE:
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import cross_val_predict
|
||||
@ -36,11 +43,14 @@ if XGB_AVAILABLE:
|
||||
CLEANLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
CLEANLAB_AVAILABLE = False
|
||||
else:
|
||||
CLEANLAB_AVAILABLE = False
|
||||
|
||||
|
||||
# ─── Caches de modèles ─────────────────────────────────────────────────────
|
||||
_model_cache: dict = {}
|
||||
_xgb_cache: dict = {}
|
||||
_drift_monitors: dict[str, ADWINDriftMonitor] = {}
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -258,9 +268,100 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
|
||||
|
||||
|
||||
def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
||||
"""Charge ou entraîne le modèle XGBoost supervisé.
|
||||
"""Charge ou met à jour le modèle supervisé en ligne (Hoeffding Adaptive Tree).
|
||||
|
||||
Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible.
|
||||
Remplace le XGBClassifier hebdomadaire par un HoeffdingAdaptiveTreeClassifier
|
||||
de River, mis à jour incrémentalement à chaque cycle via learn_one().
|
||||
|
||||
Retourne (model, list[str] features) ou (None, None) si indisponible.
|
||||
Le model retourné expose predict_proba_many(df) → DataFrame.
|
||||
"""
|
||||
if not (XGB_AVAILABLE or RIVER_AVAILABLE) or XGB_WEIGHT <= 0:
|
||||
return None, None
|
||||
|
||||
model_path = _river_model_path(name)
|
||||
meta_path = _xgb_meta_path(name)
|
||||
|
||||
# Charger le modèle River existant
|
||||
model = None
|
||||
xgb_features = features
|
||||
n_seen = 0
|
||||
|
||||
if os.path.exists(model_path):
|
||||
try:
|
||||
with open(model_path, 'rb') as f:
|
||||
model = pickle.load(f)
|
||||
with open(meta_path) as f:
|
||||
meta = json.load(f)
|
||||
xgb_features = meta.get('features', features)
|
||||
n_seen = meta.get('n_total_labels', 0)
|
||||
log_info(f"[River][{name}] HAT rechargé ({n_seen} labels cumulés, {len(xgb_features)} features).")
|
||||
except Exception as exc:
|
||||
log_info(f"[River][{name}] Erreur chargement : {exc} — nouveau modèle.")
|
||||
model = None
|
||||
|
||||
# Créer un nouveau modèle si nécessaire
|
||||
if model is None:
|
||||
try:
|
||||
model = river_forest.HoeffdingAdaptiveTreeClassifier(
|
||||
grace_period=50, max_depth=12, seed=42,
|
||||
)
|
||||
except Exception:
|
||||
# Fallback vers XGBoost batch si River indisponible
|
||||
return _load_or_train_xgb_batch(name, client, features, cycle_id)
|
||||
|
||||
# ── Apprentissage incrémental sur les labels du cycle ──────────────
|
||||
X, y, usable_features = _load_xgb_labels(client, features)
|
||||
if X is not None and usable_features is not None:
|
||||
xgb_features = usable_features
|
||||
X_df = pd.DataFrame(X, columns=xgb_features)
|
||||
n_new = 0
|
||||
for i in range(len(X_df)):
|
||||
try:
|
||||
x_dict = {col: float(X_df.iloc[i][col]) for col in xgb_features}
|
||||
model.learn_one(x_dict, int(y[i]))
|
||||
n_new += 1
|
||||
except Exception:
|
||||
continue
|
||||
n_seen += n_new
|
||||
|
||||
# Persister le modèle mis à jour
|
||||
os.makedirs(os.path.dirname(model_path), exist_ok=True)
|
||||
with open(model_path, 'wb') as f:
|
||||
pickle.dump(model, f)
|
||||
meta = {
|
||||
'trained_at': datetime.now().isoformat(),
|
||||
'n_total_labels': n_seen,
|
||||
'n_new_labels': n_new,
|
||||
'n_features': len(xgb_features),
|
||||
'features': xgb_features,
|
||||
'model_name': name,
|
||||
'algorithm': 'HoeffdingAdaptiveTreeClassifier',
|
||||
}
|
||||
with open(meta_path, 'w') as f:
|
||||
json.dump(meta, f, indent=2)
|
||||
|
||||
log_info(f"[River][{name}] +{n_new} labels incrémentaux ({n_seen} total) — HAT mis à jour.")
|
||||
log_decision('RIVER_UPDATED', cycle_id, name, meta)
|
||||
else:
|
||||
if n_seen == 0:
|
||||
log_info(f"[River][{name}] Pas de labels — modèle supervisé désactivé ce cycle.")
|
||||
return None, None
|
||||
log_info(f"[River][{name}] Pas de nouveaux labels — HAT existant réutilisé ({n_seen} labels).")
|
||||
|
||||
return model, xgb_features
|
||||
|
||||
|
||||
def _river_model_path(name: str) -> str:
|
||||
"""Chemin du modèle River sérialisé."""
|
||||
return os.path.join(MODEL_DIR, f'river_hat_{name}.pkl')
|
||||
|
||||
|
||||
def _load_or_train_xgb_batch(name, client, features, cycle_id):
|
||||
"""Fallback : entraîne un XGBoost classique si River est indisponible.
|
||||
|
||||
Conservé pour la compatibilité si river n'est pas installé.
|
||||
Retourne (XGBClassifier, list[str] features) ou (None, None).
|
||||
"""
|
||||
if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
|
||||
return None, None
|
||||
@ -268,87 +369,36 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
||||
model_path = _xgb_model_path(name)
|
||||
meta_path = _xgb_meta_path(name)
|
||||
|
||||
# Charger le modèle existant si récent
|
||||
if os.path.exists(model_path) and os.path.exists(meta_path):
|
||||
try:
|
||||
with open(meta_path) as f:
|
||||
meta = json.load(f)
|
||||
trained_at = datetime.fromisoformat(meta['trained_at'])
|
||||
age_h = (datetime.now() - trained_at).total_seconds() / 3600
|
||||
if age_h < XGB_RETRAIN_INTERVAL_H:
|
||||
model = xgb.XGBClassifier()
|
||||
model.load_model(model_path)
|
||||
log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
|
||||
return model, meta.get('features', features)
|
||||
except Exception as exc:
|
||||
log_info(f"[XGB][{name}] Erreur chargement : {exc}")
|
||||
model = xgb.XGBClassifier()
|
||||
model.load_model(model_path)
|
||||
return model, meta.get('features', features)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Entraîner un nouveau modèle
|
||||
X, y, xgb_features = _load_xgb_labels(client, features)
|
||||
if X is None:
|
||||
log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
|
||||
# Tenter de réutiliser un modèle ancien
|
||||
if os.path.exists(model_path) and os.path.exists(meta_path):
|
||||
try:
|
||||
model = xgb.XGBClassifier()
|
||||
model.load_model(model_path)
|
||||
with open(meta_path) as f:
|
||||
meta = json.load(f)
|
||||
return model, meta.get('features', features)
|
||||
except Exception:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
||||
|
||||
# ── Cleanlab : filtrage des labels SOC bruyants ─────────────────────
|
||||
if CLEANLAB_AVAILABLE:
|
||||
try:
|
||||
quick_model = xgb.XGBClassifier(
|
||||
n_estimators=80, max_depth=4, learning_rate=0.15,
|
||||
eval_metric='logloss', random_state=42, n_jobs=-1,
|
||||
tree_method='hist',
|
||||
)
|
||||
pred_probs = cross_val_predict(
|
||||
quick_model, X, y, cv=3, method='predict_proba',
|
||||
)
|
||||
issues = find_label_issues(
|
||||
labels=y, pred_probs=pred_probs,
|
||||
)
|
||||
noisy_idx = issues[issues['is_label_issue'] == True].index.to_numpy()
|
||||
if len(noisy_idx) > 0:
|
||||
keep = np.ones(len(y), dtype=bool)
|
||||
keep[noisy_idx] = False
|
||||
X, y = X[keep], y[keep]
|
||||
pct = len(noisy_idx) / (len(keep)) * 100
|
||||
log_info(
|
||||
f"[XGB][{name}] Cleanlab : {len(noisy_idx)}/{len(keep)} "
|
||||
f"labels bruyants supprimés ({pct:.1f}%)"
|
||||
)
|
||||
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
||||
except Exception as exc:
|
||||
log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
|
||||
|
||||
model = xgb.XGBClassifier(
|
||||
n_estimators=200, max_depth=6, learning_rate=0.1,
|
||||
scale_pos_weight=scale_pos, eval_metric='logloss',
|
||||
random_state=42, n_jobs=-1,
|
||||
tree_method='hist',
|
||||
random_state=42, n_jobs=-1, tree_method='hist',
|
||||
)
|
||||
model.fit(X, y, verbose=False)
|
||||
|
||||
model.save_model(model_path)
|
||||
meta = {
|
||||
'trained_at': datetime.now().isoformat(),
|
||||
'n_labels': len(y), 'n_positive': int(y.sum()),
|
||||
'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
|
||||
'features': xgb_features,
|
||||
'scale_pos_weight': scale_pos, 'model_name': name,
|
||||
'features': xgb_features, 'model_name': name,
|
||||
}
|
||||
with open(meta_path, 'w') as f:
|
||||
json.dump(meta, f, indent=2)
|
||||
|
||||
log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
|
||||
log_decision('XGB_TRAINED', cycle_id, name, meta)
|
||||
return model, xgb_features
|
||||
|
||||
@ -369,17 +419,27 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
age_h = (datetime.now() - trained_at).total_seconds() / 3600
|
||||
age_ok = age_h < RETRAIN_INTERVAL_H
|
||||
|
||||
# A1 — Dérive conceptuelle : comparer la distribution actuelle avec celle de l'entraînement
|
||||
# A1 — Dérive conceptuelle via ADWIN (fenêtre glissante adaptative)
|
||||
drift_score = 0.0
|
||||
drift_forced = False
|
||||
if age_ok and 'baseline_stats' in meta:
|
||||
drift_score = compute_drift_score(meta['baseline_stats'], human_baseline, features,
|
||||
name=name, cycle_id=cycle_id)
|
||||
|
||||
# Obtenir ou créer le moniteur ADWIN pour ce modèle
|
||||
if name not in _drift_monitors:
|
||||
_drift_monitors[name] = ADWINDriftMonitor(features)
|
||||
drift_monitor = _drift_monitors[name]
|
||||
|
||||
if drift_monitor.available:
|
||||
# Alimenter ADWIN avec les moyennes de features du cycle courant
|
||||
feature_means = {}
|
||||
for f in features:
|
||||
if f in human_baseline.columns:
|
||||
feature_means[f] = float(human_baseline[f].mean())
|
||||
drift_score = drift_monitor.check_drift(feature_means, name=name, cycle_id=cycle_id)
|
||||
if drift_score >= DRIFT_THRESHOLD:
|
||||
drift_forced = True
|
||||
log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
|
||||
log_info(f"[{name}] Dérive ADWIN détectée ({drift_score:.0%} features) — retraining forcé.")
|
||||
log_decision('DRIFT_DETECTED', cycle_id, name, {
|
||||
'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
|
||||
'version_id': meta['version_id'], 'drift_rate': round(drift_score, 3),
|
||||
'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
|
||||
})
|
||||
|
||||
@ -465,18 +525,12 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
||||
|
||||
# A1/§4 — Sauvegarder les statistiques de distribution avec quantile digest 9 points
|
||||
# (p5…p95) pour une meilleure fidélité de la détection de dérive KS+KL
|
||||
# A1 — Statistiques de référence pour la baseline (mean/std uniquement,
|
||||
# la détection de dérive est assurée par ADWIN en temps réel)
|
||||
baseline_stats = {
|
||||
f: {
|
||||
'mean': float(X[f].mean()), 'std': float(X[f].std()),
|
||||
'p5': float(X[f].quantile(0.05)),
|
||||
'p10': float(X[f].quantile(0.10)),
|
||||
'p25': float(X[f].quantile(0.25)),
|
||||
'p50': float(X[f].quantile(0.50)),
|
||||
'p75': float(X[f].quantile(0.75)),
|
||||
'p90': float(X[f].quantile(0.90)),
|
||||
'p95': float(X[f].quantile(0.95)),
|
||||
'mean': float(X[f].mean()),
|
||||
'std': float(X[f].std()),
|
||||
}
|
||||
for f in features
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user