feat(ml): replace logistic regression with MLP fusion and KS drift with ADWIN online learning

Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP (Linear(3,16)->BN->ReLU->Dropout->Linear(16,1)->Sigmoid) for non-linear fusion of EIF, NF, and XGBoost scores. Replace KS-test + quantile digest drift detection with ADWIN (adaptive sliding window, Hoeffding bound). Replace weekly XGBoost batch retraining with River HoeffdingAdaptiveTree for incremental online learning (learn_one per cycle). Update all thesis documentation sections (2.4.2c, 2.4.3, 3.8, discussion, conclusion). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-13 16:32:34 +02:00
parent c6cb12981c
commit 7894d39f1c
12 changed files with 502 additions and 306 deletions
--- a/services/bot-detector/bot_detector/models.py
+++ b/services/bot-detector/bot_detector/models.py
@ -1,10 +1,11 @@
 """Gestion des modèles : chargement, entraînement, versionnement.

-IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
+IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), Hoeffding Adaptive Tree (River).
 """
 import os
 import json
 import glob
+import pickle
 import joblib
 import numpy as np
 import pandas as pd
@ -18,7 +19,7 @@ from .config import (
    IsolationForest, StandardScaler,
 )
 from .log import log_info, log_decision, append_training_history
-from .scoring import compute_drift_score
+from .scoring import ADWINDriftMonitor

 # Imports conditionnels depuis config (déjà importés une seule fois)
 if EIF_AVAILABLE:
@ -27,6 +28,12 @@ if EIF_AVAILABLE:
 if TORCH_AVAILABLE:
    from .config import torch, nn

+try:
+    from river import forest as river_forest
+    RIVER_AVAILABLE = True
+except ImportError:
+    RIVER_AVAILABLE = False
+
 if XGB_AVAILABLE:
    import xgboost as xgb
    from sklearn.model_selection import cross_val_predict
@ -36,11 +43,14 @@ if XGB_AVAILABLE:
        CLEANLAB_AVAILABLE = True
    except ImportError:
        CLEANLAB_AVAILABLE = False
+else:
+    CLEANLAB_AVAILABLE = False


 # ─── Caches de modèles ─────────────────────────────────────────────────────
 _model_cache: dict = {}
 _xgb_cache: dict = {}
+_drift_monitors: dict[str, ADWINDriftMonitor] = {}


 # ═══════════════════════════════════════════════════════════════════════════════
@ -258,9 +268,100 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -


 def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
-    """Charge ou entraîne le modèle XGBoost supervisé.
+    """Charge ou met à jour le modèle supervisé en ligne (Hoeffding Adaptive Tree).

-    Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible.
+    Remplace le XGBClassifier hebdomadaire par un HoeffdingAdaptiveTreeClassifier
+    de River, mis à jour incrémentalement à chaque cycle via learn_one().
+
+    Retourne (model, list[str] features) ou (None, None) si indisponible.
+    Le model retourné expose predict_proba_many(df) → DataFrame.
+    """
+    if not (XGB_AVAILABLE or RIVER_AVAILABLE) or XGB_WEIGHT <= 0:
+        return None, None
+
+    model_path = _river_model_path(name)
+    meta_path = _xgb_meta_path(name)
+
+    # Charger le modèle River existant
+    model = None
+    xgb_features = features
+    n_seen = 0
+
+    if os.path.exists(model_path):
+        try:
+            with open(model_path, 'rb') as f:
+                model = pickle.load(f)
+            with open(meta_path) as f:
+                meta = json.load(f)
+            xgb_features = meta.get('features', features)
+            n_seen = meta.get('n_total_labels', 0)
+            log_info(f"[River][{name}] HAT rechargé ({n_seen} labels cumulés, {len(xgb_features)} features).")
+        except Exception as exc:
+            log_info(f"[River][{name}] Erreur chargement : {exc} — nouveau modèle.")
+            model = None
+
+    # Créer un nouveau modèle si nécessaire
+    if model is None:
+        try:
+            model = river_forest.HoeffdingAdaptiveTreeClassifier(
+                grace_period=50, max_depth=12, seed=42,
+            )
+        except Exception:
+            # Fallback vers XGBoost batch si River indisponible
+            return _load_or_train_xgb_batch(name, client, features, cycle_id)
+
+    # ── Apprentissage incrémental sur les labels du cycle ──────────────
+    X, y, usable_features = _load_xgb_labels(client, features)
+    if X is not None and usable_features is not None:
+        xgb_features = usable_features
+        X_df = pd.DataFrame(X, columns=xgb_features)
+        n_new = 0
+        for i in range(len(X_df)):
+            try:
+                x_dict = {col: float(X_df.iloc[i][col]) for col in xgb_features}
+                model.learn_one(x_dict, int(y[i]))
+                n_new += 1
+            except Exception:
+                continue
+        n_seen += n_new
+
+        # Persister le modèle mis à jour
+        os.makedirs(os.path.dirname(model_path), exist_ok=True)
+        with open(model_path, 'wb') as f:
+            pickle.dump(model, f)
+        meta = {
+            'trained_at': datetime.now().isoformat(),
+            'n_total_labels': n_seen,
+            'n_new_labels': n_new,
+            'n_features': len(xgb_features),
+            'features': xgb_features,
+            'model_name': name,
+            'algorithm': 'HoeffdingAdaptiveTreeClassifier',
+        }
+        with open(meta_path, 'w') as f:
+            json.dump(meta, f, indent=2)
+
+        log_info(f"[River][{name}] +{n_new} labels incrémentaux ({n_seen} total) — HAT mis à jour.")
+        log_decision('RIVER_UPDATED', cycle_id, name, meta)
+    else:
+        if n_seen == 0:
+            log_info(f"[River][{name}] Pas de labels — modèle supervisé désactivé ce cycle.")
+            return None, None
+        log_info(f"[River][{name}] Pas de nouveaux labels — HAT existant réutilisé ({n_seen} labels).")
+
+    return model, xgb_features
+
+
+def _river_model_path(name: str) -> str:
+    """Chemin du modèle River sérialisé."""
+    return os.path.join(MODEL_DIR, f'river_hat_{name}.pkl')
+
+
+def _load_or_train_xgb_batch(name, client, features, cycle_id):
+    """Fallback : entraîne un XGBoost classique si River est indisponible.
+
+    Conservé pour la compatibilité si river n'est pas installé.
+    Retourne (XGBClassifier, list[str] features) ou (None, None).
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None, None
@ -268,87 +369,36 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)

-    # Charger le modèle existant si récent
    if os.path.exists(model_path) and os.path.exists(meta_path):
        try:
            with open(meta_path) as f:
                meta = json.load(f)
-            trained_at = datetime.fromisoformat(meta['trained_at'])
-            age_h = (datetime.now() - trained_at).total_seconds() / 3600
-            if age_h < XGB_RETRAIN_INTERVAL_H:
-                model = xgb.XGBClassifier()
-                model.load_model(model_path)
-                log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
-                return model, meta.get('features', features)
-        except Exception as exc:
-            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
+            model = xgb.XGBClassifier()
+            model.load_model(model_path)
+            return model, meta.get('features', features)
+        except Exception:
+            pass

-    # Entraîner un nouveau modèle
    X, y, xgb_features = _load_xgb_labels(client, features)
    if X is None:
-        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
-        # Tenter de réutiliser un modèle ancien
-        if os.path.exists(model_path) and os.path.exists(meta_path):
-            try:
-                model = xgb.XGBClassifier()
-                model.load_model(model_path)
-                with open(meta_path) as f:
-                    meta = json.load(f)
-                return model, meta.get('features', features)
-            except Exception:
-                pass
        return None, None

    scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
-
-    # ── Cleanlab : filtrage des labels SOC bruyants ─────────────────────
-    if CLEANLAB_AVAILABLE:
-        try:
-            quick_model = xgb.XGBClassifier(
-                n_estimators=80, max_depth=4, learning_rate=0.15,
-                eval_metric='logloss', random_state=42, n_jobs=-1,
-                tree_method='hist',
-            )
-            pred_probs = cross_val_predict(
-                quick_model, X, y, cv=3, method='predict_proba',
-            )
-            issues = find_label_issues(
-                labels=y, pred_probs=pred_probs,
-            )
-            noisy_idx = issues[issues['is_label_issue'] == True].index.to_numpy()
-            if len(noisy_idx) > 0:
-                keep = np.ones(len(y), dtype=bool)
-                keep[noisy_idx] = False
-                X, y = X[keep], y[keep]
-                pct = len(noisy_idx) / (len(keep)) * 100
-                log_info(
-                    f"[XGB][{name}] Cleanlab : {len(noisy_idx)}/{len(keep)} "
-                    f"labels bruyants supprimés ({pct:.1f}%)"
-                )
-                scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
-        except Exception as exc:
-            log_info(f"[XGB][{name}] Cleanlab échoué, labels bruts conservés : {exc}")
-
    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        scale_pos_weight=scale_pos, eval_metric='logloss',
-        random_state=42, n_jobs=-1,
-        tree_method='hist',
+        random_state=42, n_jobs=-1, tree_method='hist',
    )
    model.fit(X, y, verbose=False)
-
    model.save_model(model_path)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': int(y.sum()),
        'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
-        'features': xgb_features,
-        'scale_pos_weight': scale_pos, 'model_name': name,
+        'features': xgb_features, 'model_name': name,
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)
-
-    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
    log_decision('XGB_TRAINED', cycle_id, name, meta)
    return model, xgb_features

@ -369,17 +419,27 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
        age_h = (datetime.now() - trained_at).total_seconds() / 3600
        age_ok = age_h < RETRAIN_INTERVAL_H

-        # A1 — Dérive conceptuelle : comparer la distribution actuelle avec celle de l'entraînement
+        # A1 — Dérive conceptuelle via ADWIN (fenêtre glissante adaptative)
        drift_score = 0.0
        drift_forced = False
-        if age_ok and 'baseline_stats' in meta:
-            drift_score = compute_drift_score(meta['baseline_stats'], human_baseline, features,
-                                              name=name, cycle_id=cycle_id)
+
+        # Obtenir ou créer le moniteur ADWIN pour ce modèle
+        if name not in _drift_monitors:
+            _drift_monitors[name] = ADWINDriftMonitor(features)
+        drift_monitor = _drift_monitors[name]
+
+        if drift_monitor.available:
+            # Alimenter ADWIN avec les moyennes de features du cycle courant
+            feature_means = {}
+            for f in features:
+                if f in human_baseline.columns:
+                    feature_means[f] = float(human_baseline[f].mean())
+            drift_score = drift_monitor.check_drift(feature_means, name=name, cycle_id=cycle_id)
            if drift_score >= DRIFT_THRESHOLD:
                drift_forced = True
-                log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
+                log_info(f"[{name}] Dérive ADWIN détectée ({drift_score:.0%} features) — retraining forcé.")
                log_decision('DRIFT_DETECTED', cycle_id, name, {
-                    'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
+                    'version_id': meta['version_id'], 'drift_rate': round(drift_score, 3),
                    'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
                })

@ -465,18 +525,12 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
            return joblib.load(model_path), ae_prev, meta.get('features', features)
        log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")

-    # A1/§4 — Sauvegarder les statistiques de distribution avec quantile digest 9 points
-    # (p5…p95) pour une meilleure fidélité de la détection de dérive KS+KL
+    # A1 — Statistiques de référence pour la baseline (mean/std uniquement,
+    # la détection de dérive est assurée par ADWIN en temps réel)
    baseline_stats = {
        f: {
-            'mean': float(X[f].mean()), 'std': float(X[f].std()),
-            'p5':  float(X[f].quantile(0.05)),
-            'p10': float(X[f].quantile(0.10)),
-            'p25': float(X[f].quantile(0.25)),
-            'p50': float(X[f].quantile(0.50)),
-            'p75': float(X[f].quantile(0.75)),
-            'p90': float(X[f].quantile(0.90)),
-            'p95': float(X[f].quantile(0.95)),
+            'mean': float(X[f].mean()),
+            'std': float(X[f].std()),
        }
        for f in features
    }