feat(ml): replace Autoencoder with RealNVP Normalizing Flow and add SessionTransformer embeddings
Replace TrafficAutoEncoder (MSE reconstruction scoring) with TrafficNormalizingFlow (RealNVP via FrEIA, 4 affine coupling blocks, anomaly score = -log p(x)) for mathematically rigorous density estimation.

Add SessionTransformer module producing 32-dimensional sequence embeddings from raw HTTP request sequences (path, method, timing) via a lightweight TransformerEncoder, replacing path_transition_entropy and cadence_cv features.

Update thesis documentation sections 2.4.2b and 3.8 accordingly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
"""Gestion des modèles : chargement, entraînement, versionnement.
|
||||
|
||||
IsolationForest (EIF), Autoencoder (PyTorch), XGBoost supervisé.
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
@ -76,40 +76,51 @@ def _purge_old_versions(name: str):
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
class TrafficAutoEncoder:
|
||||
"""Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
|
||||
class TrafficNormalizingFlow:
|
||||
"""Normalizing Flow (RealNVP) pour détection d'anomalies par vraisemblance.
|
||||
|
||||
Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
|
||||
Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
|
||||
Score = MSE(input, reconstruction) par échantillon.
|
||||
L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
|
||||
Architecture : 4 blocs de couplage affine (AllInOneBlock), sous-réseaux MLP
|
||||
(2 couches, 64 neurones, ReLU). L'espace latent = input_dim (pas de bottleneck).
|
||||
Score d'anomalie = -log p(x), estimé via le changement de variable.
|
||||
L'espace latent peut servir de features compressées pour HDBSCAN.
|
||||
"""
|
||||
|
||||
def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
|
||||
def __init__(self, n_features: int, latent_dim: int = 0):
|
||||
if not TORCH_AVAILABLE:
|
||||
raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
|
||||
raise RuntimeError("PyTorch non disponible — Normalizing Flow désactivé.")
|
||||
try:
|
||||
import FrEIA.framework as Ff
|
||||
import FrEIA.modules as Fm
|
||||
except ImportError:
|
||||
raise RuntimeError("FrEIA non disponible — installer : pip install FrEIA")
|
||||
self.n_features = n_features
|
||||
self.latent_dim = latent_dim
|
||||
self.device = torch.device('cpu')
|
||||
self._build_model()
|
||||
self._scaler_min = None
|
||||
self._scaler_range = None
|
||||
|
||||
def _subnet_fc(self, c_in, c_out):
|
||||
"""Sous-réseau MLP pour les blocs de couplage (2 couches, 64 neurones)."""
|
||||
return nn.Sequential(
|
||||
nn.Linear(c_in, 64), nn.ReLU(),
|
||||
nn.Linear(64, 64), nn.ReLU(),
|
||||
nn.Linear(64, c_out),
|
||||
)
|
||||
|
||||
def _build_model(self):
|
||||
# Architecture fixe n→64→32→16→32→64→n (§2.4.3 thèse)
|
||||
dim1 = 64
|
||||
dim2 = 32
|
||||
self.encoder = nn.Sequential(
|
||||
nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, self.latent_dim),
|
||||
).to(self.device)
|
||||
self.decoder = nn.Sequential(
|
||||
nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, self.n_features), nn.Sigmoid(),
|
||||
).to(self.device)
|
||||
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||
import FrEIA.framework as Ff
|
||||
import FrEIA.modules as Fm
|
||||
|
||||
nodes = [Ff.InputNode(self.n_features, name='input')]
|
||||
for i in range(4):
|
||||
nodes.append(Ff.Node(
|
||||
nodes[-1],
|
||||
Fm.AllInOneBlock,
|
||||
{'subnet_constructor': self._subnet_fc, 'affine_clamping': 2.0},
|
||||
name=f'coupling_{i}',
|
||||
))
|
||||
nodes.append(Ff.OutputNode(nodes[-1], name='output'))
|
||||
self.flow = Ff.GraphINN(nodes, verbose=False).to(self.device)
|
||||
|
||||
def _to_tensor(self, X: np.ndarray) -> 'torch.Tensor':
|
||||
"""Normalise [0,1] via min-max puis convertit en Tensor."""
|
||||
@ -119,27 +130,30 @@ class TrafficAutoEncoder:
|
||||
X_norm = X
|
||||
return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
|
||||
|
||||
def log_likelihood(self, x: 'torch.Tensor') -> 'torch.Tensor':
|
||||
"""Calcule log p(x) = log p_z(f(x)) + log|det J_f(x)|."""
|
||||
z, log_det = self.flow(x)
|
||||
log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.n_features * np.log(2 * np.pi)
|
||||
return log_pz + log_det
|
||||
|
||||
def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
|
||||
batch_size: int = 256) -> dict:
|
||||
"""Entraîne l'autoencoder sur la baseline humaine (données normales)."""
|
||||
"""Entraîne le Normalizing Flow sur la baseline humaine (données normales)."""
|
||||
self._scaler_min = X.min(axis=0)
|
||||
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||
X_t = self._to_tensor(X)
|
||||
|
||||
dataset = torch.utils.data.TensorDataset(X_t)
|
||||
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = torch.optim.Adam(self.flow.parameters(), lr=lr, weight_decay=1e-5)
|
||||
|
||||
self.encoder.train()
|
||||
self.decoder.train()
|
||||
self.flow.train()
|
||||
losses = []
|
||||
for epoch in range(epochs):
|
||||
epoch_loss = 0.0
|
||||
for (batch,) in loader:
|
||||
latent = self.encoder(batch)
|
||||
recon = self.decoder(latent)
|
||||
loss = criterion(recon, batch)
|
||||
log_p = self.log_likelihood(batch)
|
||||
loss = -log_p.mean() # NLL
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
@ -148,41 +162,35 @@ class TrafficAutoEncoder:
|
||||
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||
|
||||
def score_samples(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
|
||||
self.encoder.eval()
|
||||
self.decoder.eval()
|
||||
"""Retourne -log p(x) par échantillon (plus élevé = plus anomal)."""
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
latent = self.encoder(X_t)
|
||||
recon = self.decoder(latent)
|
||||
mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
|
||||
return mse
|
||||
return -self.log_likelihood(X_t).numpy()
|
||||
|
||||
def encode(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'espace latent (pour HDBSCAN clustering)."""
|
||||
self.encoder.eval()
|
||||
"""Retourne l'espace latent z = f(x) (pour HDBSCAN clustering)."""
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return self.encoder(X_t).numpy()
|
||||
z, _ = self.flow(X_t)
|
||||
return z.numpy()
|
||||
|
||||
def state_dict(self) -> dict:
|
||||
return {
|
||||
'encoder': self.encoder.state_dict(),
|
||||
'decoder': self.decoder.state_dict(),
|
||||
'flow': self.flow.state_dict(),
|
||||
'scaler_min': self._scaler_min,
|
||||
'scaler_range': self._scaler_range,
|
||||
'n_features': self.n_features,
|
||||
'latent_dim': self.latent_dim,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
|
||||
ae = cls(state['n_features'], state['latent_dim'])
|
||||
ae._scaler_min = state['scaler_min']
|
||||
ae._scaler_range = state['scaler_range']
|
||||
ae.encoder.load_state_dict(state['encoder'])
|
||||
ae.decoder.load_state_dict(state['decoder'])
|
||||
return ae
|
||||
def load_state_dict(cls, state: dict) -> 'TrafficNormalizingFlow':
|
||||
nf = cls(state['n_features'])
|
||||
nf._scaler_min = state['scaler_min']
|
||||
nf._scaler_range = state['scaler_range']
|
||||
nf.flow.load_state_dict(state['flow'])
|
||||
return nf
|
||||
|
||||
|
||||
def _ae_model_path(name: str, version_id: str) -> str:
|
||||
@ -353,7 +361,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||
met à jour le fichier pointeur et purge les anciennes versions.
|
||||
|
||||
Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features).
|
||||
Retourne (IsolationForest, TrafficNormalizingFlow|None, list[str] features).
|
||||
"""
|
||||
model_path, meta = _get_current_version(name)
|
||||
if model_path and meta:
|
||||
@ -387,8 +395,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_path = _ae_model_path(name, meta['version_id'])
|
||||
if os.path.exists(ae_path):
|
||||
try:
|
||||
ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
|
||||
ae_loaded = TrafficNormalizingFlow.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Normalizing Flow v{meta['version_id']} rechargé.")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||
return joblib.load(model_path), ae_loaded, meta.get('features', features)
|
||||
@ -451,7 +459,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
|
||||
if os.path.exists(ae_prev_path):
|
||||
try:
|
||||
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
ae_prev = TrafficNormalizingFlow.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
except Exception:
|
||||
pass
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
@ -477,17 +485,17 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
||||
joblib.dump(model, new_model_path)
|
||||
|
||||
# Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
# Entraînement du Normalizing Flow en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
ae_model = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_model = TrafficAutoEncoder(n_features=len(features))
|
||||
ae_model = TrafficNormalizingFlow(n_features=len(features))
|
||||
ae_stats = ae_model.fit(X_train.values)
|
||||
ae_path = _ae_model_path(name, version_id)
|
||||
torch.save(ae_model.state_dict(), ae_path)
|
||||
log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
log_info(f"[{name}] Normalizing Flow entraîné : NLL={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
|
||||
log_info(f"[{name}] Normalizing Flow training échoué : {exc} — NF désactivé.")
|
||||
ae_model = None
|
||||
|
||||
previous_version = meta.get('version_id', None) if meta else None
|
||||
@ -498,7 +506,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'model_name': name, 'previous_version': previous_version,
|
||||
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
|
||||
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
|
||||
'autoencoder': ae_model is not None,
|
||||
'autoencoder': ae_model is not None, # NF en réalité, clé conservée pour rétro-compatibilité
|
||||
'ae_weight': AE_WEIGHT if ae_model else 0.0,
|
||||
'validation': {
|
||||
'val_size': len(X_val), 'train_size': len(X_train),
|
||||
@ -512,7 +520,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
append_training_history({k: v for k, v in new_meta.items() if k != 'baseline_stats'})
|
||||
_purge_old_versions(name)
|
||||
|
||||
log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path} (AE={'oui' if ae_model is not None else 'non'})")
|
||||
log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path} (NF={'oui' if ae_model is not None else 'non'})")
|
||||
log_decision('MODEL_TRAINED', cycle_id, name, {
|
||||
'version_id': version_id, 'previous_version': previous_version,
|
||||
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
||||
|
||||
Reference in New Issue
Block a user