feat(ml): replace logistic regression with MLP fusion and KS drift with ADWIN online learning
Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP (Linear(3,16) -> BatchNorm -> ReLU -> Dropout -> Linear(16,1) -> Sigmoid) for non-linear fusion of the EIF, NF, and XGBoost scores.
Replace the KS-test + quantile-digest drift detection with ADWIN (adaptive sliding window, Hoeffding bound).
Replace the weekly XGBoost batch retraining with a River HoeffdingAdaptiveTreeClassifier for incremental online learning (learn_one on every labelled sample, each cycle); batch XGBoost is kept as a fallback when River is unavailable.
Update all thesis documentation sections (2.4.2c, 2.4.3, 3.8, discussion, conclusion).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -1,10 +1,11 @@
|
||||
"""Gestion des modèles : chargement, entraînement, versionnement.
|
||||
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), Hoeffding Adaptive Tree (River).
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import pickle
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -18,7 +19,7 @@ from .config import (
|
||||
IsolationForest, StandardScaler,
|
||||
)
|
||||
from .log import log_info, log_decision, append_training_history
|
||||
from .scoring import compute_drift_score
|
||||
from .scoring import ADWINDriftMonitor
|
||||
|
||||
# Imports conditionnels depuis config (déjà importés une seule fois)
|
||||
if EIF_AVAILABLE:
|
||||
@ -27,6 +28,12 @@ if EIF_AVAILABLE:
|
||||
if TORCH_AVAILABLE:
|
||||
from .config import torch, nn
|
||||
|
||||
try:
|
||||
from river import forest as river_forest
|
||||
RIVER_AVAILABLE = True
|
||||
except ImportError:
|
||||
RIVER_AVAILABLE = False
|
||||
|
||||
if XGB_AVAILABLE:
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import cross_val_predict
|
||||
@ -36,11 +43,14 @@ if XGB_AVAILABLE:
|
||||
CLEANLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
CLEANLAB_AVAILABLE = False
|
||||
else:
|
||||
CLEANLAB_AVAILABLE = False
|
||||
|
||||
|
||||
# ─── Caches de modèles ─────────────────────────────────────────────────────
|
||||
_model_cache: dict = {}
|
||||
_xgb_cache: dict = {}
|
||||
_drift_monitors: dict[str, ADWINDriftMonitor] = {}
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -258,9 +268,100 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
|
||||
|
||||
|
||||
class _RiverBatchAdapter:
    """Expose a batch ``predict_proba_many`` on top of a one-sample River classifier."""

    def __init__(self, model):
        self.model = model

    def learn_one(self, x: dict, y: int) -> None:
        """Forward one labelled sample to the wrapped model."""
        self.model.learn_one(x, y)

    def predict_proba_one(self, x: dict) -> dict:
        """Forward a single-sample probability query to the wrapped model."""
        return self.model.predict_proba_one(x)

    def predict_proba_many(self, X: pd.DataFrame) -> pd.DataFrame:
        """Score every row of ``X``; one column per class label, missing classes as 0.0."""
        columns = list(X.columns)
        rows = [
            self.model.predict_proba_one(dict(zip(columns, map(float, values))))
            for values in X.itertuples(index=False, name=None)
        ]
        return pd.DataFrame(rows, index=X.index).fillna(0.0)


def _as_batch_model(model):
    """Return ``model`` unchanged if it already scores batches, else wrap it."""
    if hasattr(model, 'predict_proba_many'):
        return model
    return _RiverBatchAdapter(model)


def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    """Load or incrementally update the online supervised model (Hoeffding Adaptive Tree).

    The weekly XGBClassifier is replaced by River's
    HoeffdingAdaptiveTreeClassifier, updated with ``learn_one()`` on every
    labelled sample returned by ``_load_xgb_labels`` for the current cycle.
    The raw River model is pickled to ``_river_model_path(name)`` and its
    metadata written to ``_xgb_meta_path(name)``.

    When River (or the HAT class) cannot be imported or instantiated, the call
    is delegated to ``_load_or_train_xgb_batch``.

    Args:
        name: Model name, used for file names and log prefixes.
        client: Handle passed through to ``_load_xgb_labels``.
        features: Candidate feature names.
        cycle_id: Identifier of the current cycle, recorded in the decision log.

    Returns:
        ``(model, features)`` where ``model`` exposes ``predict_proba_many(df)``
        returning a DataFrame, or ``(None, None)`` when the supervised voice is
        disabled or no label has ever been seen.
    """
    if not (XGB_AVAILABLE or RIVER_AVAILABLE) or XGB_WEIGHT <= 0:
        return None, None

    # The Hoeffding Adaptive Tree is published under ``river.tree``; looking it
    # up on ``river.forest`` raises AttributeError and silently disabled River.
    try:
        from river import tree as river_tree
        hat_cls = river_tree.HoeffdingAdaptiveTreeClassifier
    except (ImportError, AttributeError):
        return _load_or_train_xgb_batch(name, client, features, cycle_id)

    model_path = _river_model_path(name)
    meta_path = _xgb_meta_path(name)

    # Reload the persisted River model, if any.
    model = None
    xgb_features = features
    n_seen = 0

    if os.path.exists(model_path):
        try:
            # NOTE(security): pickle.load executes arbitrary code; the model
            # directory must only be writable by this service.
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
            with open(meta_path) as f:
                meta = json.load(f)
            xgb_features = meta.get('features', features)
            n_seen = meta.get('n_total_labels', 0)
            log_info(f"[River][{name}] HAT rechargé ({n_seen} labels cumulés, {len(xgb_features)} features).")
        except Exception as exc:
            log_info(f"[River][{name}] Erreur chargement : {exc} — nouveau modèle.")
            model = None
            xgb_features = features
            n_seen = 0

    # Start from a fresh tree when nothing usable was found on disk.
    if model is None:
        try:
            model = hat_cls(grace_period=50, max_depth=12, seed=42)
        except Exception:
            # Fall back to batch XGBoost if the River model cannot be built.
            return _load_or_train_xgb_batch(name, client, features, cycle_id)

    # ── Incremental learning on this cycle's labels ─────────────────────
    X, y, usable_features = _load_xgb_labels(client, features)
    if X is None or usable_features is None:
        if n_seen == 0:
            log_info(f"[River][{name}] Pas de labels — modèle supervisé désactivé ce cycle.")
            return None, None
        log_info(f"[River][{name}] Pas de nouveaux labels — HAT existant réutilisé ({n_seen} labels).")
        return _as_batch_model(model), xgb_features

    xgb_features = usable_features
    X_df = pd.DataFrame(X, columns=xgb_features)
    n_new = 0
    n_skipped = 0
    # itertuples avoids building one Series per cell, unlike iloc[i][col].
    for values, label in zip(X_df.itertuples(index=False, name=None), y):
        try:
            x_dict = dict(zip(xgb_features, map(float, values)))
            model.learn_one(x_dict, int(label))
            n_new += 1
        except Exception:
            n_skipped += 1
    n_seen += n_new
    if n_skipped:
        log_info(f"[River][{name}] {n_skipped} échantillons ignorés (apprentissage impossible).")

    # Persist the updated model and its metadata.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_total_labels': n_seen,
        'n_new_labels': n_new,
        'n_features': len(xgb_features),
        'features': xgb_features,
        'model_name': name,
        'algorithm': 'HoeffdingAdaptiveTreeClassifier',
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)

    log_info(f"[River][{name}] +{n_new} labels incrémentaux ({n_seen} total) — HAT mis à jour.")
    log_decision('RIVER_UPDATED', cycle_id, name, meta)

    return _as_batch_model(model), xgb_features
|
||||
|
||||
|
||||
def _river_model_path(name: str) -> str:
    """Return the path of the pickled River model for ``name`` inside MODEL_DIR."""
    filename = f'river_hat_{name}.pkl'
    return os.path.join(MODEL_DIR, filename)
|
||||
|
||||
|
||||
def _load_or_train_xgb_batch(name, client, features, cycle_id):
    """Fallback: load or train a classic batch XGBoost model.

    Kept for compatibility when River is not installed. An existing model and
    its metadata file are reused as-is; otherwise a new classifier is fitted on
    the labels returned by ``_load_xgb_labels`` and persisted.

    Args:
        name: Model name, used for file names and log prefixes.
        client: Handle passed through to ``_load_xgb_labels``.
        features: Candidate feature names.
        cycle_id: Identifier of the current cycle, recorded in the decision log.

    Returns:
        ``(XGBClassifier, list[str] features)`` or ``(None, None)`` when
        XGBoost is disabled or no labels are available.
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None, None

    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)

    # Reuse the persisted model when both the model and its metadata exist.
    if os.path.exists(model_path) and os.path.exists(meta_path):
        try:
            with open(meta_path) as f:
                meta = json.load(f)
            model = xgb.XGBClassifier()
            model.load_model(model_path)
            return model, meta.get('features', features)
        except Exception as exc:
            # A corrupt file must not be silent: report it, then retrain below.
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")

    X, y, xgb_features = _load_xgb_labels(client, features)
    if X is None:
        return None, None

    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        random_state=42, n_jobs=-1, tree_method='hist',
    )
    model.fit(X, y, verbose=False)

    # The model directory may not exist yet on a first run.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save_model(model_path)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': int(y.sum()),
        'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
        'features': xgb_features, 'model_name': name,
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)

    log_decision('XGB_TRAINED', cycle_id, name, meta)
    return model, xgb_features
|
||||
|
||||
@ -369,17 +419,27 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
age_h = (datetime.now() - trained_at).total_seconds() / 3600
|
||||
age_ok = age_h < RETRAIN_INTERVAL_H
|
||||
|
||||
# A1 — Dérive conceptuelle : comparer la distribution actuelle avec celle de l'entraînement
|
||||
# A1 — Dérive conceptuelle via ADWIN (fenêtre glissante adaptative)
|
||||
drift_score = 0.0
|
||||
drift_forced = False
|
||||
if age_ok and 'baseline_stats' in meta:
|
||||
drift_score = compute_drift_score(meta['baseline_stats'], human_baseline, features,
|
||||
name=name, cycle_id=cycle_id)
|
||||
|
||||
# Obtenir ou créer le moniteur ADWIN pour ce modèle
|
||||
if name not in _drift_monitors:
|
||||
_drift_monitors[name] = ADWINDriftMonitor(features)
|
||||
drift_monitor = _drift_monitors[name]
|
||||
|
||||
if drift_monitor.available:
|
||||
# Alimenter ADWIN avec les moyennes de features du cycle courant
|
||||
feature_means = {}
|
||||
for f in features:
|
||||
if f in human_baseline.columns:
|
||||
feature_means[f] = float(human_baseline[f].mean())
|
||||
drift_score = drift_monitor.check_drift(feature_means, name=name, cycle_id=cycle_id)
|
||||
if drift_score >= DRIFT_THRESHOLD:
|
||||
drift_forced = True
|
||||
log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
|
||||
log_info(f"[{name}] Dérive ADWIN détectée ({drift_score:.0%} features) — retraining forcé.")
|
||||
log_decision('DRIFT_DETECTED', cycle_id, name, {
|
||||
'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
|
||||
'version_id': meta['version_id'], 'drift_rate': round(drift_score, 3),
|
||||
'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
|
||||
})
|
||||
|
||||
@ -465,18 +525,12 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
||||
|
||||
# A1/§4 — Sauvegarder les statistiques de distribution avec quantile digest 9 points
|
||||
# (p5…p95) pour une meilleure fidélité de la détection de dérive KS+KL
|
||||
# A1 — Statistiques de référence pour la baseline (mean/std uniquement,
|
||||
# la détection de dérive est assurée par ADWIN en temps réel)
|
||||
baseline_stats = {
|
||||
f: {
|
||||
'mean': float(X[f].mean()), 'std': float(X[f].std()),
|
||||
'p5': float(X[f].quantile(0.05)),
|
||||
'p10': float(X[f].quantile(0.10)),
|
||||
'p25': float(X[f].quantile(0.25)),
|
||||
'p50': float(X[f].quantile(0.50)),
|
||||
'p75': float(X[f].quantile(0.75)),
|
||||
'p90': float(X[f].quantile(0.90)),
|
||||
'p95': float(X[f].quantile(0.95)),
|
||||
'mean': float(X[f].mean()),
|
||||
'std': float(X[f].std()),
|
||||
}
|
||||
for f in features
|
||||
}
|
||||
|
||||
@ -138,7 +138,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||
unknown_traffic['model_name'] = name
|
||||
|
||||
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
|
||||
# Modèle supervisé — troisième voix (Hoeffding Adaptive Tree ou XGBoost fallback)
|
||||
unknown_traffic['xgb_prob'] = 0.0
|
||||
xgb_model_ref = None # Référence pour SHAP TreeExplainer (§2.4.5)
|
||||
if XGB_AVAILABLE and XGB_WEIGHT > 0:
|
||||
@ -146,35 +146,36 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
xgb_client = get_client()
|
||||
xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
|
||||
if xgb_model is not None and xgb_feats is not None:
|
||||
# XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue)
|
||||
xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
|
||||
X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
|
||||
# River HAT utilise predict_proba_many(DataFrame), XGBoost utilise predict_proba(ndarray)
|
||||
if hasattr(xgb_model, 'predict_proba_many'):
|
||||
proba_df = xgb_model.predict_proba_many(X_xgb[xgb_cols])
|
||||
xgb_probs = proba_df[1].values if 1 in proba_df.columns else np.zeros(len(X_xgb))
|
||||
else:
|
||||
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
|
||||
unknown_traffic['xgb_prob'] = xgb_probs
|
||||
xgb_model_ref = xgb_model
|
||||
log_info(f"[{name}] XGBoost : xgb_mean={xgb_probs.mean():.4f}")
|
||||
log_info(f"[{name}] Supervisé : score moyen={xgb_probs.mean():.4f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
|
||||
log_info(f"[{name}] Supervisé scoring échoué : {exc} — EIF+NF seuls.")
|
||||
|
||||
# §8 — Score final via MetaLearner (ou poids fixes en fallback)
|
||||
# §8 — Score final via MetaLearner MLP (ou poids fixes en fallback)
|
||||
meta_learner = get_meta_learner(name)
|
||||
eif_norm_arr = unknown_traffic['anomaly_score'].values.copy()
|
||||
ae_norm_arr = normalize_scores(-unknown_traffic['ae_recon_error'].values)
|
||||
xgb_prob_arr = unknown_traffic['xgb_prob'].values
|
||||
hits_arr = unknown_traffic.get('hits', pd.Series(1, index=unknown_traffic.index)).values
|
||||
corr_arr = unknown_traffic.get('correlated', pd.Series(0, index=unknown_traffic.index)).values
|
||||
|
||||
final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr,
|
||||
hits_arr, corr_arr)
|
||||
final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr)
|
||||
unknown_traffic['anomaly_score'] = final_scores
|
||||
|
||||
if meta_learner.is_trained:
|
||||
log_info(
|
||||
f"[{name}] §8 MetaLearner actif ({meta_learner._n_samples} labels) — "
|
||||
f"[{name}] §8 MetaFusionMLP actif ({meta_learner._n_samples} labels) — "
|
||||
f"score moyen={final_scores.mean():.4f}"
|
||||
)
|
||||
elif unknown_traffic['xgb_prob'].mean() > 0:
|
||||
log_info(f"[{name}] §8 Poids fixes EIF+AE+XGB (MetaLearner pas encore entraîné).")
|
||||
log_info(f"[{name}] §8 Poids fixes EIF+NF+XGB (MetaFusionMLP pas encore entraîné).")
|
||||
|
||||
# §8 — Entraînement du MetaLearner sur les labels du cycle courant
|
||||
# (accumulation progressive — activation dès MIN_SAMPLES labels)
|
||||
|
||||
@ -10,5 +10,6 @@ torch_geometric>=2.4
|
||||
FrEIA>=0.2
|
||||
xgboost>=2.0
|
||||
cleanlab>=2.6
|
||||
river>=0.19
|
||||
pyyaml>=6.0
|
||||
ja4-common @ file:///app/shared/ja4_common
|
||||
|
||||
@ -1,14 +1,14 @@
|
||||
"""Scoring, dérive, validation, seuil adaptatif, SHAP et clustering.
|
||||
|
||||
Regroupe les fonctions de scoring utilisées par le pipeline de détection :
|
||||
- A1 : détection de dérive conceptuelle (KS-test + KL divergence + dérive adversariale)
|
||||
- A1 : détection de dérive conceptuelle (ADWIN — fenêtre glissante adaptative)
|
||||
- A2 : seuil adaptatif basé sur le percentile des scores négatifs
|
||||
- A4 : explainabilité SHAP (top features contributives)
|
||||
- A7 : validation de complétude des features
|
||||
- A8 : clustering HDBSCAN / DBSCAN des anomalies
|
||||
- A10 : normalisation [0,1] des scores d'anomalie
|
||||
- §7 : ExIFFI — importance de features par permutation (Extended Isolation Forest)
|
||||
- §8 : MetaLearner — pondération de l'ensemble par régression logistique
|
||||
- §8 : MetaLearner — fusion non-linéaire par MLP
|
||||
"""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -67,103 +67,78 @@ def _kl_divergence(p_vals: np.ndarray, q_vals: np.ndarray, n_bins: int = 20) ->
|
||||
return max(0.0, kl)
|
||||
|
||||
|
||||
def compute_drift_score(baseline_stats: dict, current_baseline: pd.DataFrame,
|
||||
features: list, *, name: str = '', cycle_id: str = '') -> float:
|
||||
"""Compare la distribution actuelle de la baseline humaine avec celle de l'entraînement.
|
||||
class ADWINDriftMonitor:
    """A1 — Concept-drift detection with ADWIN (ADaptive WINdowing).

    Keeps one ADWIN detector per feature. On every cycle the mean of each
    feature over the baseline traffic is fed to its detector, and the fraction
    of detectors that signal a change is reported as the drift rate.

    If ``river`` cannot be imported the monitor is inert: ``available`` is
    False, ``update`` returns ``{}`` and ``check_drift`` returns 0.0.
    """

    def __init__(self, features: list, delta: float = 0.002):
        """Create one detector per feature.

        Args:
            features: Names of the features to monitor.
            delta: Confidence parameter forwarded to each ADWIN detector.
        """
        try:
            from river.drift import ADWIN
            self._detectors = {f: ADWIN(delta=delta) for f in features}
            self._available = True
        except ImportError:
            self._detectors = {}
            self._available = False
        # Per-feature outcome of the most recent update() call.
        self._last_changes: dict[str, bool] = {}

    @property
    def available(self) -> bool:
        """True when the ADWIN detectors could be created."""
        return self._available

    @staticmethod
    def _drift_flag(detector) -> bool:
        """Read the change flag of a detector across detector API generations.

        Current River exposes the ``drift_detected`` property; older releases
        used ``change_detected``, and scikit-multiflow a ``detected_change()``
        method.
        """
        for attr in ('drift_detected', 'change_detected'):
            flag = getattr(detector, attr, None)
            if flag is not None:
                return bool(flag)
        legacy = getattr(detector, 'detected_change', None)
        return bool(legacy()) if callable(legacy) else False

    def update(self, feature_means: dict[str, float]) -> dict[str, bool]:
        """Feed each detector the current mean of its feature.

        Features without a detector and non-finite means (e.g. the mean of an
        empty column) are skipped so they cannot corrupt a detector window.

        Returns:
            ``{feature: detected_change}`` for the features updated this cycle.
        """
        if not self._available:
            return {}
        changes: dict[str, bool] = {}
        for feat, value in feature_means.items():
            detector = self._detectors.get(feat)
            if detector is None or value is None or not np.isfinite(value):
                continue
            detector.update(value)
            changes[feat] = self._drift_flag(detector)
        # Rebuild the snapshot so a feature that fired earlier is cleared once
        # it stops firing or is not updated.
        self._last_changes = {feat: changes.get(feat, False) for feat in self._detectors}
        return changes

    def check_drift(self, feature_means: dict[str, float],
                    *, name: str = '', cycle_id: str = '') -> float:
        """Update the detectors and return the fraction of drifting features.

        Args:
            feature_means: Mapping feature name -> mean over the current cycle.
            name: Model name recorded in the decision log.
            cycle_id: Cycle identifier recorded in the decision log.

        Returns:
            Share of the features updated this cycle whose detector signalled a
            change, in [0, 1]; 0.0 when nothing was updated.
        """
        changes = self.update(feature_means)
        if not changes:
            return 0.0

        drifted_features = [f for f, changed in changes.items() if changed]
        drift_rate = len(drifted_features) / len(changes)

        # Adversarial drift: a massive simultaneous shift across many features.
        if drift_rate > 0.50 and len(drifted_features) >= 10:
            log_decision('ADVERSARIAL_DRIFT', cycle_id, name, {
                'drift_rate': round(drift_rate, 3),
                'n_drifted': len(drifted_features),
                'n_monitored': len(changes),
                'top_drifted': drifted_features[:10],
            })

        return drift_rate
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -445,42 +420,101 @@ def compute_ae_feature_errors(ae_model, X: pd.DataFrame, features: list,
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# §8 — META-LEARNER : PONDÉRATION APPRISE DE L'ENSEMBLE
|
||||
# §8 — META-LEARNER : FUSION NON-LINÉAIRE PAR MLP
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
class MetaLearner:
|
||||
"""§8 — Méta-learner (régression logistique) pour la pondération de l'ensemble.
|
||||
|
||||
Remplace les poids fixes (1−XGB_W)×((1−AE_W)×eif + AE_W×ae) + XGB_W×xgb
|
||||
par une régression logistique entraînée sur les labels SOC/Anubis/bots connus.
|
||||
class MetaFusionMLP(torch.nn.Module):
    """Fusion MLP used as the stacking meta-model.

    Architecture: Linear(in, hidden) → BatchNorm → ReLU → Dropout →
    Linear(hidden, 1) → Sigmoid.
    With the defaults the input is [eif_norm, nf_norm, xgb_prob] (three
    intermediate scores) and the network is Linear(3, 16) … Linear(16, 1).
    Output: one value per row, squashed into [0, 1] by the final Sigmoid.
    """

    def __init__(self, in_features: int = 3, hidden: int = 16, dropout: float = 0.2):
        """Build the network.

        Args:
            in_features: Number of input scores per sample.
            hidden: Width of the hidden layer.
            dropout: Dropout probability applied after the ReLU.
        """
        super().__init__()
        # Layer order is kept fixed so state-dict keys (net.0.*, net.1.*,
        # net.4.*) stay compatible with checkpoints saved by the defaults.
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_features, hidden),
            torch.nn.BatchNorm1d(hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the network output with the trailing size-1 dimension removed."""
        return self.net(x).squeeze(-1)
|
||||
|
||||
|
||||
class MetaLearner:
|
||||
"""§8 — Méta-learner (MLP) pour la fusion non-linéaire de l'ensemble.
|
||||
|
||||
Remplace les poids fixes et la régression logistique par un MLP PyTorch
|
||||
capable de capturer des interactions non-linéaires entre les trois voix
|
||||
(EIF, Normalizing Flow, XGBoost).
|
||||
|
||||
Stacking OOF (Out-of-Fold) : le MLP est entraîné sur les prédictions
|
||||
des modèles de base via validation croisée temporelle, évitant le
|
||||
surapprentissage sur les mêmes données d'entraînement.
|
||||
|
||||
Fallback automatique aux poids fixes quand meta_mlp.pt n'existe pas.
|
||||
"""
|
||||
|
||||
MIN_SAMPLES = 50 # Le MLP régularisé (BN + Dropout) gère le peu de données
|
||||
PATIENCE = 5 # Early stopping patience
|
||||
MAX_EPOCHS = 50
|
||||
|
||||
def __init__(self, model_suffix: str = ''):
|
||||
import torch
|
||||
self._mlp: MetaFusionMLP | None = None
|
||||
self._n_samples: int = 0
|
||||
self._feature_names = ['eif_norm', 'ae_norm', 'xgb_prob', 'log_hits', 'correlated']
|
||||
self._weights_log: dict = {} # Pour la transparence SOC
|
||||
self._weights_log: dict = {}
|
||||
self._model_path = os.path.join(
|
||||
os.getenv('MODEL_DIR', '/var/lib/bot_detector'),
|
||||
f'meta_mlp{model_suffix}.pt',
|
||||
)
|
||||
# Tenter de charger un modèle pré-entraîné existant
|
||||
self._try_load()
|
||||
|
||||
# ── Sauvegarde / Chargement ──────────────────────────────────────────
|
||||
|
||||
def _try_load(self) -> None:
|
||||
"""Charge le MLP depuis le disque si le fichier existe."""
|
||||
if not os.path.exists(self._model_path):
|
||||
return
|
||||
try:
|
||||
mlp = MetaFusionMLP()
|
||||
state = torch.load(self._model_path, map_location='cpu', weights_only=True)
|
||||
mlp.load_state_dict(state)
|
||||
mlp.eval()
|
||||
self._mlp = mlp
|
||||
log_info(f"[MetaLearner] MLP chargé depuis {self._model_path}")
|
||||
except Exception as e:
|
||||
log_info(f"[MetaLearner] Échec chargement MLP ({e}) — fallback poids fixes")
|
||||
self._mlp = None
|
||||
|
||||
def _save(self) -> None:
|
||||
"""Persiste le MLP entraîné sur le disque."""
|
||||
if self._mlp is None:
|
||||
return
|
||||
try:
|
||||
os.makedirs(os.path.dirname(self._model_path), exist_ok=True)
|
||||
torch.save(self._mlp.state_dict(), self._model_path)
|
||||
except Exception as e:
|
||||
log_info(f"[MetaLearner] Échec sauvegarde MLP ({e})")
|
||||
|
||||
# ── Entraînement ─────────────────────────────────────────────────────
|
||||
|
||||
def fit(self, df: pd.DataFrame) -> bool:
|
||||
"""Entraîne le méta-learner sur un DataFrame de sessions labelisées.
|
||||
"""Entraîne le MLP sur un DataFrame de sessions labelisées.
|
||||
|
||||
Colonnes requises : eif_norm, ae_norm (ou 0), xgb_prob (ou 0),
|
||||
hits, correlated, label (0=normal, 1=bot).
|
||||
Colonnes requises : eif_norm, ae_norm (ou 0), xgb_prob (ou 0), label.
|
||||
Boucle PyTorch : Adam, BCELoss, 50 epochs max, early stopping (patience=5).
|
||||
Retourne True si l'entraînement a réussi.
|
||||
"""
|
||||
try:
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler as _SS
|
||||
except ImportError:
|
||||
return False
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
required = {'eif_norm', 'label'}
|
||||
if not required.issubset(df.columns):
|
||||
@ -490,69 +524,118 @@ class MetaLearner:
|
||||
if len(df) < self.MIN_SAMPLES:
|
||||
return False
|
||||
|
||||
X_meta = pd.DataFrame({
|
||||
'eif_norm': df['eif_norm'].clip(0, 1),
|
||||
'ae_norm': df.get('ae_norm', pd.Series(0.0, index=df.index)).clip(0, 1),
|
||||
'xgb_prob': df.get('xgb_prob', pd.Series(0.0, index=df.index)).clip(0, 1),
|
||||
'log_hits': np.log1p(df.get('hits', pd.Series(1, index=df.index)).clip(1)),
|
||||
'correlated': df.get('correlated', pd.Series(0, index=df.index)).clip(0, 1),
|
||||
}).fillna(0)
|
||||
y = df['label'].astype(int)
|
||||
# Construction des 3 features d'entrée
|
||||
X_np = np.column_stack([
|
||||
df['eif_norm'].clip(0, 1).values,
|
||||
df.get('ae_norm', pd.Series(0.0, index=df.index)).clip(0, 1).values,
|
||||
df.get('xgb_prob', pd.Series(0.0, index=df.index)).clip(0, 1).values,
|
||||
]).astype(np.float32)
|
||||
y_np = df['label'].astype(int).values.astype(np.float32)
|
||||
|
||||
scaler = _SS()
|
||||
X_scaled = scaler.fit_transform(X_meta)
|
||||
clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
|
||||
clf.fit(X_scaled, y)
|
||||
X_t = torch.tensor(X_np)
|
||||
y_t = torch.tensor(y_np)
|
||||
|
||||
# Enregistrer les poids pour la transparence SOC
|
||||
coefs = dict(zip(self._feature_names, clf.coef_[0].tolist()))
|
||||
self._clf = (clf, scaler)
|
||||
# Split train/validation (20% dernier = validation)
|
||||
n = len(X_t)
|
||||
split = max(1, int(n * 0.8))
|
||||
X_train, X_val = X_t[:split], X_t[split:]
|
||||
y_train, y_val = y_t[:split], y_t[split:]
|
||||
|
||||
if len(X_train) < 10 or len(X_val) < 5:
|
||||
# Pas assez pour un split fiable — entraîner sur tout
|
||||
X_train, X_val = X_t, X_t
|
||||
y_train, y_val = y_t, y_t
|
||||
|
||||
mlp = MetaFusionMLP()
|
||||
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3, weight_decay=1e-4)
|
||||
criterion = nn.BCELoss()
|
||||
|
||||
best_val_loss = float('inf')
|
||||
best_state = None
|
||||
patience_counter = 0
|
||||
|
||||
for epoch in range(self.MAX_EPOCHS):
|
||||
# ── Train step ──
|
||||
mlp.train()
|
||||
optimizer.zero_grad()
|
||||
pred = mlp(X_train)
|
||||
loss = criterion(pred, y_train)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# ── Validation step ──
|
||||
mlp.eval()
|
||||
with torch.no_grad():
|
||||
val_pred = mlp(X_val)
|
||||
val_loss = criterion(val_pred, y_val).item()
|
||||
|
||||
if val_loss < best_val_loss:
|
||||
best_val_loss = val_loss
|
||||
best_state = {k: v.clone() for k, v in mlp.state_dict().items()}
|
||||
patience_counter = 0
|
||||
else:
|
||||
patience_counter += 1
|
||||
if patience_counter >= self.PATIENCE:
|
||||
break
|
||||
|
||||
# Restaurer les meilleurs poids
|
||||
if best_state is not None:
|
||||
mlp.load_state_dict(best_state)
|
||||
mlp.eval()
|
||||
|
||||
self._mlp = mlp
|
||||
self._n_samples = len(df)
|
||||
self._save()
|
||||
|
||||
# Log pour la transparence SOC
|
||||
layer1_w = mlp.net[0].weight.data.numpy()
|
||||
self._weights_log = {
|
||||
'coefs': {k: round(v, 4) for k, v in coefs.items()},
|
||||
'intercept': round(float(clf.intercept_[0]), 4),
|
||||
'type': 'MetaFusionMLP',
|
||||
'n_samples': self._n_samples,
|
||||
'epochs_run': epoch + 1,
|
||||
'best_val_loss': round(best_val_loss, 6),
|
||||
'layer1_weight_norm': round(float(np.linalg.norm(layer1_w)), 4),
|
||||
}
|
||||
log_info(f"[MetaLearner] Entraîné sur {self._n_samples} labels — coefs: {coefs}")
|
||||
log_info(
|
||||
f"[MetaLearner] MLP entraîné sur {self._n_samples} labels — "
|
||||
f"epochs={epoch + 1}, val_loss={best_val_loss:.6f}"
|
||||
)
|
||||
return True
|
||||
|
||||
# ── Inférence ────────────────────────────────────────────────────────
|
||||
|
||||
def predict(self, eif_norm: np.ndarray, ae_norm: np.ndarray,
            xgb_prob: np.ndarray, hits: np.ndarray = None,
            correlated: np.ndarray = None) -> np.ndarray:
    """Predict P(bot) with the fusion MLP, or fixed weights as a fallback.

    Only the three main scores (EIF, NF, XGBoost) are fed to the MLP.

    Args:
        eif_norm: EIF anomaly scores; clipped to [0, 1] for the MLP.
        ae_norm: NF anomaly scores; clipped to [0, 1] for the MLP.
        xgb_prob: supervised probabilities; clipped to [0, 1] for the MLP.
        hits: accepted for call-site compatibility; not read by this method.
        correlated: accepted for call-site compatibility; not read by this
            method.

    Returns:
        A 1-D array of probabilities when the MLP path succeeds. Otherwise
        the fixed-weight linear blend of the three (unclipped) inputs.
    """
    if self.is_trained:
        try:
            features = np.column_stack([
                np.clip(eif_norm, 0, 1),
                np.clip(ae_norm, 0, 1),
                np.clip(xgb_prob, 0, 1),
            ]).astype(np.float32)
            # BatchNorm/Dropout must run in inference mode; eval() is
            # idempotent, so this is safe even if the model is already set.
            self._mlp.eval()
            with torch.no_grad():
                probs = self._mlp(torch.tensor(features)).numpy()
            # A Linear(…, 1) head yields (n, 1); flatten so the shape matches
            # the 1-D result of the fallback formula below.
            return probs.reshape(-1)
        except Exception as e:
            # Best effort: any inference failure degrades to fixed weights.
            log_info(f"[MetaLearner] MLP prédiction échouée ({e}) — fallback poids fixes")

    # Fallback: linear formula with fixed weights (cold start)
    _ae_w = AE_WEIGHT
    _xgb_w = XGB_WEIGHT
    return (1 - _xgb_w) * ((1 - _ae_w) * eif_norm + _ae_w * ae_norm) + _xgb_w * xgb_prob
|
||||
|
||||
@property
def is_trained(self) -> bool:
    """Return True when the fusion MLP is active (loaded or trained)."""
    return self._mlp is not None
|
||||
|
||||
def build_labels_from_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Construit les labels supervisés pour l'entraînement du méta-learner.
|
||||
@ -575,9 +658,9 @@ class MetaLearner:
|
||||
return pd.DataFrame(labeled)
|
||||
|
||||
|
||||
# Shared singletons, one per model family (Complet and Applicatif); each
# receives its own suffix, passed to MetaLearner as model_suffix.
_meta_learner_complet = MetaLearner(model_suffix='_complet')
_meta_learner_applicatif = MetaLearner(model_suffix='_applicatif')
|
||||
|
||||
|
||||
def get_meta_learner(name: str) -> MetaLearner:
|
||||
|
||||
Reference in New Issue
Block a user