feat(e2e): add distributed E2E test framework with parametric traffic generation
Add run-e2e-test.sh with CLI parameters (--hits, --http-ratio, --dns, --tls, --src-ips, --keep-analysis, --up) for configurable traffic generation. Traffic runs from VM endpoints with multiple source IPs (alias IPs on eth0) to produce distinct sessions for the ML pipeline. Fix curl TLS flags (--tlsv1.2 instead of --tls-v1-2), skip redundant local verification in distributed mode, and fix dashboard is_available() cache that never retried after ClickHouse recovery. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -203,6 +203,78 @@ class TrafficNormalizingFlow:
|
||||
return nf
|
||||
|
||||
|
||||
class NFEnsemble:
    """Deep ensemble of normalizing flows used for uncertainty quantification.

    Each member is an independent ``TrafficNormalizingFlow`` trained on a
    bootstrap sample (drawn with replacement) of the human baseline. The
    inter-model variance is intended to separate organic drift (low variance:
    the members agree) from adversarial drift (high variance: the members
    disagree about the novelty).

    Reference: Lakshminarayanan et al., 2017 — "Simple and Scalable Predictive
    Uncertainty Estimation using Deep Ensembles" (NeurIPS).
    """

    # Number of members created by the constructor (M in the reference paper).
    ENSEMBLE_SIZE = 5

    def __init__(self, n_features: int):
        """Create ``ENSEMBLE_SIZE`` untrained flows over ``n_features`` inputs.

        Raises:
            RuntimeError: if PyTorch is not available in this process.
        """
        if not TORCH_AVAILABLE:
            raise RuntimeError("PyTorch non disponible — NFEnsemble désactivé.")
        self.n_features = n_features
        self.models = [TrafficNormalizingFlow(n_features) for _ in range(self.ENSEMBLE_SIZE)]

    def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
            batch_size: int = 256) -> dict:
        """Train every member on its own bootstrap resample of ``X``.

        Args:
            X: training samples, indexed along the first axis.
            epochs, lr, batch_size: forwarded unchanged to each member's ``fit``.

        Returns:
            dict with ``final_losses`` (one entry per member, taken from the
            member's ``final_loss`` statistic), ``mean_loss``,
            ``ensemble_size`` (number of members actually trained) and
            ``n_samples``.
        """
        n = len(X)
        all_losses = []
        for nf in self.models:
            # Bootstrap: same size as X, drawn with replacement, so each
            # member sees a different view of the baseline.
            idx = np.random.choice(n, size=n, replace=True)
            stats = nf.fit(X[idx], epochs=epochs, lr=lr, batch_size=batch_size)
            all_losses.append(stats['final_loss'])
        return {
            'final_losses': all_losses,
            'mean_loss': float(np.mean(all_losses)),
            # Report the real member count rather than the class constant so
            # the statistic stays truthful for ensembles restored from disk.
            'ensemble_size': len(self.models),
            'n_samples': n,
        }

    def predict_anomalies(self, X: np.ndarray) -> tuple:
        """Return ``(mean_score, uncertainty_score)`` as two ``np.ndarray``.

        mean_score: per-sample mean of the members' ``score_samples`` outputs
            (documented upstream as -log p(x); the member implementation is
            not visible here).
        uncertainty_score: per-sample variance of those scores across members
            (NumPy default, i.e. population variance).
        """
        scores = np.stack([nf.score_samples(X) for nf in self.models], axis=0)
        return scores.mean(axis=0), scores.var(axis=0)

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """Compatibility shim: return only the mean score, like a single flow."""
        mean, _ = self.predict_anomalies(X)
        return mean

    def encode(self, X: np.ndarray) -> np.ndarray:
        """Return the element-wise mean of the members' latent encodings.

        NOTE(review): members are trained independently, so their latent
        axes are presumably not aligned — confirm the averaged latent is
        meaningful for downstream consumers.
        """
        latents = np.stack([nf.encode(X) for nf in self.models], axis=0)
        return latents.mean(axis=0)

    def state_dict(self) -> dict:
        """Return a serializable snapshot of the ensemble and all its members."""
        return {
            # Actual member count, so the metadata always matches 'members'.
            'ensemble_size': len(self.models),
            'n_features': self.n_features,
            'members': [nf.state_dict() for nf in self.models],
        }

    @classmethod
    def load_state_dict(cls, state: dict) -> 'NFEnsemble':
        """Rebuild an ensemble from a snapshot produced by ``state_dict``.

        The restored ensemble contains exactly the saved members: it is never
        padded with untrained flows and accepts a saved size that differs
        from ``ENSEMBLE_SIZE``.

        Raises:
            KeyError: if ``n_features`` or ``members`` is missing from ``state``.
            RuntimeError: if PyTorch is not available in this process.
            ValueError: if the snapshot contains no members.
        """
        ens = cls(state['n_features'])
        members = state['members']
        if not members:
            # An ensemble with no trained member cannot score anything; fail
            # loudly instead of returning freshly initialised flows.
            raise ValueError("NFEnsemble state contains no members.")
        # Replace the freshly constructed flows wholesale so that no untrained
        # member survives when the snapshot holds fewer than ENSEMBLE_SIZE.
        ens.models = [TrafficNormalizingFlow.load_state_dict(m) for m in members]
        return ens
|
||||
|
||||
|
||||
def _ae_model_path(name: str, version_id: str) -> str:
    """Return the path under ``MODEL_DIR`` of the ``.pt`` checkpoint for a model version."""
    filename = 'ae_{}_{}.pt'.format(name, version_id)
    return os.path.join(MODEL_DIR, filename)
|
||||
|
||||
@ -411,7 +483,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||
met à jour le fichier pointeur et purge les anciennes versions.
|
||||
|
||||
Retourne (IsolationForest, TrafficNormalizingFlow|None, list[str] features).
|
||||
Retourne (IsolationForest, NFEnsemble|None, list[str] features).
|
||||
"""
|
||||
model_path, meta = _get_current_version(name)
|
||||
if model_path and meta:
|
||||
@ -455,8 +527,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_path = _ae_model_path(name, meta['version_id'])
|
||||
if os.path.exists(ae_path):
|
||||
try:
|
||||
ae_loaded = TrafficNormalizingFlow.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Normalizing Flow v{meta['version_id']} rechargé.")
|
||||
ae_loaded = NFEnsemble.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] NFEnsemble v{meta['version_id']} rechargé (M={NFEnsemble.ENSEMBLE_SIZE}).")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||
return joblib.load(model_path), ae_loaded, meta.get('features', features)
|
||||
@ -519,7 +591,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
|
||||
if os.path.exists(ae_prev_path):
|
||||
try:
|
||||
ae_prev = TrafficNormalizingFlow.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
ae_prev = NFEnsemble.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
except Exception:
|
||||
pass
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
@ -539,17 +611,17 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
||||
joblib.dump(model, new_model_path)
|
||||
|
||||
# Entraînement du Normalizing Flow en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
# Entraînement du NFEnsemble (M=5) en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
ae_model = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_model = TrafficNormalizingFlow(n_features=len(features))
|
||||
ae_model = NFEnsemble(n_features=len(features))
|
||||
ae_stats = ae_model.fit(X_train.values)
|
||||
ae_path = _ae_model_path(name, version_id)
|
||||
torch.save(ae_model.state_dict(), ae_path)
|
||||
log_info(f"[{name}] Normalizing Flow entraîné : NLL={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
log_info(f"[{name}] NFEnsemble entraîné (M={NFEnsemble.ENSEMBLE_SIZE}) : NLL moyen={ae_stats['mean_loss']:.6f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Normalizing Flow training échoué : {exc} — NF désactivé.")
|
||||
log_info(f"[{name}] NFEnsemble training échoué : {exc} — NF désactivé.")
|
||||
ae_model = None
|
||||
|
||||
previous_version = meta.get('version_id', None) if meta else None
|
||||
|
||||
Reference in New Issue
Block a user