feat(ml): replace Autoencoder with RealNVP Normalizing Flow and add SessionTransformer embeddings
Replace TrafficAutoEncoder (MSE reconstruction scoring) with TrafficNormalizingFlow (RealNVP via FrEIA, 4 affine coupling blocks, anomaly score = -log p(x)) for mathematically rigorous density estimation. Add SessionTransformer module producing 32-dimensional sequence embeddings from raw HTTP request sequences (path, method, timing) via a lightweight TransformerEncoder, replacing path_transition_entropy and cadence_cv features. Update thesis documentation sections 2.4.2b and 3.8 accordingly. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -79,6 +79,11 @@ AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
|
||||
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
|
||||
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
|
||||
|
||||
# Location of the serialized SessionTransformer weights. The SESSION_TRANSFORMER_PATH
# environment variable overrides the default file under MODEL_DIR.
# NOTE(review): session_transformer.py (added by this same change) builds its own copy of
# this constant from the environment instead of importing this one — confirm which
# definition is meant to be authoritative.
SESSION_TRANSFORMER_PATH = os.getenv(
    'SESSION_TRANSFORMER_PATH',
    os.path.join(MODEL_DIR, 'session_transformer.pt')
)
|
||||
|
||||
# ─── XGBoost — troisième voix supervisée ────────────────────────────────────
|
||||
|
||||
XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1)
|
||||
|
||||
@ -168,6 +168,22 @@ def fetch_and_analyze():
|
||||
except Exception as e:
|
||||
log_info(f'[Thèse §5] view_thesis_features_1h inaccessible : {e} — features avancées ignorées.')
|
||||
|
||||
# ── §5.2 — Embeddings Transformer de séquence (remplace path_transition_entropy + cadence_cv)
|
||||
try:
|
||||
from .session_transformer import extract_sequence_embeddings
|
||||
df_embs = extract_sequence_embeddings(df, client)
|
||||
if df_embs is not None and not df_embs.empty:
|
||||
df = df.merge(df_embs, on=['src_ip', 'ja4', 'host'], how='left')
|
||||
for i in range(32):
|
||||
col = f'seq_emb_{i}'
|
||||
if col in df.columns:
|
||||
df[col] = df[col].fillna(0.0)
|
||||
log_info(f'[Transformer §5.2] {len(df_embs)} sessions enrichies avec 32 embeddings séquentiels.')
|
||||
except Exception as e:
|
||||
log_info(f'[Transformer §5.2] Embeddings indisponibles : {e}')
|
||||
for i in range(32):
|
||||
df[f'seq_emb_{i}'] = 0.0
|
||||
|
||||
df = preprocess_df(df)
|
||||
|
||||
# §5 — Enrichissement avec le score de flotte JA4×ASN (bipartite fleet detection)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
"""Gestion des modèles : chargement, entraînement, versionnement.
|
||||
|
||||
IsolationForest (EIF), Autoencoder (PyTorch), XGBoost supervisé.
|
||||
IsolationForest (EIF), Normalizing Flow (PyTorch/FrEIA), XGBoost supervisé.
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
@ -76,40 +76,51 @@ def _purge_old_versions(name: str):
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
class TrafficAutoEncoder:
|
||||
"""Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
|
||||
class TrafficNormalizingFlow:
|
||||
"""Normalizing Flow (RealNVP) pour détection d'anomalies par vraisemblance.
|
||||
|
||||
Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
|
||||
Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
|
||||
Score = MSE(input, reconstruction) par échantillon.
|
||||
L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
|
||||
Architecture : 4 blocs de couplage affine (AllInOneBlock), sous-réseaux MLP
|
||||
(2 couches, 64 neurones, ReLU). L'espace latent = input_dim (pas de bottleneck).
|
||||
Score d'anomalie = -log p(x), estimé via le changement de variable.
|
||||
L'espace latent peut servir de features compressées pour HDBSCAN.
|
||||
"""
|
||||
|
||||
def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
|
||||
def __init__(self, n_features: int, latent_dim: int = 0):
|
||||
if not TORCH_AVAILABLE:
|
||||
raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
|
||||
raise RuntimeError("PyTorch non disponible — Normalizing Flow désactivé.")
|
||||
try:
|
||||
import FrEIA.framework as Ff
|
||||
import FrEIA.modules as Fm
|
||||
except ImportError:
|
||||
raise RuntimeError("FrEIA non disponible — installer : pip install FrEIA")
|
||||
self.n_features = n_features
|
||||
self.latent_dim = latent_dim
|
||||
self.device = torch.device('cpu')
|
||||
self._build_model()
|
||||
self._scaler_min = None
|
||||
self._scaler_range = None
|
||||
|
||||
def _subnet_fc(self, c_in, c_out):
    """Build the MLP used as the sub-network of one coupling block.

    The network maps ``c_in`` inputs to ``c_out`` outputs through two hidden
    layers of 64 units, each followed by a ReLU.
    """
    hidden = 64
    # Layers are instantiated in forward order so parameter initialisation
    # consumes the RNG in the same sequence as a literal nn.Sequential(...) call.
    layers = [
        nn.Linear(c_in, hidden),
        nn.ReLU(),
        nn.Linear(hidden, hidden),
        nn.ReLU(),
        nn.Linear(hidden, c_out),
    ]
    return nn.Sequential(*layers)
|
||||
|
||||
def _build_model(self):
|
||||
# Architecture fixe n→64→32→16→32→64→n (§2.4.3 thèse)
|
||||
dim1 = 64
|
||||
dim2 = 32
|
||||
self.encoder = nn.Sequential(
|
||||
nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, self.latent_dim),
|
||||
).to(self.device)
|
||||
self.decoder = nn.Sequential(
|
||||
nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, self.n_features), nn.Sigmoid(),
|
||||
).to(self.device)
|
||||
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||
import FrEIA.framework as Ff
|
||||
import FrEIA.modules as Fm
|
||||
|
||||
nodes = [Ff.InputNode(self.n_features, name='input')]
|
||||
for i in range(4):
|
||||
nodes.append(Ff.Node(
|
||||
nodes[-1],
|
||||
Fm.AllInOneBlock,
|
||||
{'subnet_constructor': self._subnet_fc, 'affine_clamping': 2.0},
|
||||
name=f'coupling_{i}',
|
||||
))
|
||||
nodes.append(Ff.OutputNode(nodes[-1], name='output'))
|
||||
self.flow = Ff.GraphINN(nodes, verbose=False).to(self.device)
|
||||
|
||||
def _to_tensor(self, X: np.ndarray) -> 'torch.Tensor':
|
||||
"""Normalise [0,1] via min-max puis convertit en Tensor."""
|
||||
@ -119,27 +130,30 @@ class TrafficAutoEncoder:
|
||||
X_norm = X
|
||||
return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
|
||||
|
||||
def log_likelihood(self, x: 'torch.Tensor') -> 'torch.Tensor':
    """Return log p(x) for every row of ``x`` via the change-of-variables formula.

    log p(x) = log N(f(x); 0, I) + log|det J_f(x)|, where ``self.flow(x)`` yields
    both the latent code f(x) and the log-determinant term.
    """
    latent, log_jac_det = self.flow(x)
    # Standard-normal log-density of the latent code: quadratic term minus the
    # Gaussian normalisation constant for an n_features-dimensional vector.
    quadratic = -0.5 * (latent ** 2).sum(dim=1)
    normaliser = 0.5 * self.n_features * np.log(2 * np.pi)
    log_prior = quadratic - normaliser
    return log_prior + log_jac_det
|
||||
|
||||
def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
|
||||
batch_size: int = 256) -> dict:
|
||||
"""Entraîne l'autoencoder sur la baseline humaine (données normales)."""
|
||||
"""Entraîne le Normalizing Flow sur la baseline humaine (données normales)."""
|
||||
self._scaler_min = X.min(axis=0)
|
||||
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||
X_t = self._to_tensor(X)
|
||||
|
||||
dataset = torch.utils.data.TensorDataset(X_t)
|
||||
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = torch.optim.Adam(self.flow.parameters(), lr=lr, weight_decay=1e-5)
|
||||
|
||||
self.encoder.train()
|
||||
self.decoder.train()
|
||||
self.flow.train()
|
||||
losses = []
|
||||
for epoch in range(epochs):
|
||||
epoch_loss = 0.0
|
||||
for (batch,) in loader:
|
||||
latent = self.encoder(batch)
|
||||
recon = self.decoder(latent)
|
||||
loss = criterion(recon, batch)
|
||||
log_p = self.log_likelihood(batch)
|
||||
loss = -log_p.mean() # NLL
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
@ -148,41 +162,35 @@ class TrafficAutoEncoder:
|
||||
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||
|
||||
def score_samples(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
|
||||
self.encoder.eval()
|
||||
self.decoder.eval()
|
||||
"""Retourne -log p(x) par échantillon (plus élevé = plus anomal)."""
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
latent = self.encoder(X_t)
|
||||
recon = self.decoder(latent)
|
||||
mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
|
||||
return mse
|
||||
return -self.log_likelihood(X_t).numpy()
|
||||
|
||||
def encode(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'espace latent (pour HDBSCAN clustering)."""
|
||||
self.encoder.eval()
|
||||
"""Retourne l'espace latent z = f(x) (pour HDBSCAN clustering)."""
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return self.encoder(X_t).numpy()
|
||||
z, _ = self.flow(X_t)
|
||||
return z.numpy()
|
||||
|
||||
def state_dict(self) -> dict:
|
||||
return {
|
||||
'encoder': self.encoder.state_dict(),
|
||||
'decoder': self.decoder.state_dict(),
|
||||
'flow': self.flow.state_dict(),
|
||||
'scaler_min': self._scaler_min,
|
||||
'scaler_range': self._scaler_range,
|
||||
'n_features': self.n_features,
|
||||
'latent_dim': self.latent_dim,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
|
||||
ae = cls(state['n_features'], state['latent_dim'])
|
||||
ae._scaler_min = state['scaler_min']
|
||||
ae._scaler_range = state['scaler_range']
|
||||
ae.encoder.load_state_dict(state['encoder'])
|
||||
ae.decoder.load_state_dict(state['decoder'])
|
||||
return ae
|
||||
def load_state_dict(cls, state: dict) -> 'TrafficNormalizingFlow':
|
||||
nf = cls(state['n_features'])
|
||||
nf._scaler_min = state['scaler_min']
|
||||
nf._scaler_range = state['scaler_range']
|
||||
nf.flow.load_state_dict(state['flow'])
|
||||
return nf
|
||||
|
||||
|
||||
def _ae_model_path(name: str, version_id: str) -> str:
|
||||
@ -353,7 +361,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||
met à jour le fichier pointeur et purge les anciennes versions.
|
||||
|
||||
Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features).
|
||||
Retourne (IsolationForest, TrafficNormalizingFlow|None, list[str] features).
|
||||
"""
|
||||
model_path, meta = _get_current_version(name)
|
||||
if model_path and meta:
|
||||
@ -387,8 +395,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_path = _ae_model_path(name, meta['version_id'])
|
||||
if os.path.exists(ae_path):
|
||||
try:
|
||||
ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
|
||||
ae_loaded = TrafficNormalizingFlow.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Normalizing Flow v{meta['version_id']} rechargé.")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||
return joblib.load(model_path), ae_loaded, meta.get('features', features)
|
||||
@ -451,7 +459,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
|
||||
if os.path.exists(ae_prev_path):
|
||||
try:
|
||||
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
ae_prev = TrafficNormalizingFlow.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
except Exception:
|
||||
pass
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
@ -477,17 +485,17 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
||||
joblib.dump(model, new_model_path)
|
||||
|
||||
# Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
# Entraînement du Normalizing Flow en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
ae_model = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_model = TrafficAutoEncoder(n_features=len(features))
|
||||
ae_model = TrafficNormalizingFlow(n_features=len(features))
|
||||
ae_stats = ae_model.fit(X_train.values)
|
||||
ae_path = _ae_model_path(name, version_id)
|
||||
torch.save(ae_model.state_dict(), ae_path)
|
||||
log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
log_info(f"[{name}] Normalizing Flow entraîné : NLL={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
|
||||
log_info(f"[{name}] Normalizing Flow training échoué : {exc} — NF désactivé.")
|
||||
ae_model = None
|
||||
|
||||
previous_version = meta.get('version_id', None) if meta else None
|
||||
@ -498,7 +506,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'model_name': name, 'previous_version': previous_version,
|
||||
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
|
||||
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
|
||||
'autoencoder': ae_model is not None,
|
||||
'autoencoder': ae_model is not None, # NF en réalité, clé conservée pour rétro-compatibilité
|
||||
'ae_weight': AE_WEIGHT if ae_model else 0.0,
|
||||
'validation': {
|
||||
'val_size': len(X_val), 'train_size': len(X_train),
|
||||
@ -512,7 +520,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
append_training_history({k: v for k, v in new_meta.items() if k != 'baseline_stats'})
|
||||
_purge_old_versions(name)
|
||||
|
||||
log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path} (AE={'oui' if ae_model is not None else 'non'})")
|
||||
log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path} (NF={'oui' if ae_model is not None else 'non'})")
|
||||
log_decision('MODEL_TRAINED', cycle_id, name, {
|
||||
'version_id': version_id, 'previous_version': previous_version,
|
||||
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
||||
|
||||
@ -15,7 +15,7 @@ from .config import (
|
||||
)
|
||||
from .log import log_info, log_decision
|
||||
from .infra import score_to_threat_level, get_client
|
||||
from .models import load_or_train_model, load_or_train_xgb, TrafficAutoEncoder
|
||||
from .models import load_or_train_model, load_or_train_xgb, TrafficNormalizingFlow
|
||||
from .scoring import (
|
||||
validate_features, compute_adaptive_threshold, normalize_scores,
|
||||
compute_shap_top_features, build_reason, cluster_anomalies,
|
||||
@ -115,19 +115,19 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
|
||||
log_info(f'[{name}] Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')
|
||||
|
||||
# Combinaison EIF + Autoencoder si disponible
|
||||
# Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT
|
||||
# Combinaison EIF + Normalizing Flow si disponible
|
||||
# Score final = (1-α) * eif_norm + α * nf_norm où α = AE_WEIGHT
|
||||
if ae_model is not None and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_recon_errors = ae_model.score_samples(X_test.values)
|
||||
ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal
|
||||
nf_neg_ll = ae_model.score_samples(X_test.values) # -log p(x)
|
||||
nf_norm = normalize_scores(-nf_neg_ll) # plus élevé = plus anomal
|
||||
eif_norm = normalize_scores(raw_scores)
|
||||
combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
|
||||
unknown_traffic['ae_recon_error'] = ae_recon_errors
|
||||
combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * nf_norm
|
||||
unknown_traffic['ae_recon_error'] = nf_neg_ll # nom conservé pour rétro-compatibilité
|
||||
unknown_traffic['anomaly_score'] = combined_norm
|
||||
log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
|
||||
log_info(f"[{name}] Score combiné EIF+NF (α={AE_WEIGHT}): nf_mean={nf_neg_ll.mean():.6f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
|
||||
log_info(f"[{name}] NF scoring échoué : {exc} — utilisation EIF seul.")
|
||||
unknown_traffic['ae_recon_error'] = 0.0
|
||||
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||
else:
|
||||
|
||||
@ -37,10 +37,12 @@ FEATURES = [
|
||||
'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence',
|
||||
# HTTP
|
||||
'missing_accept_enc_ratio', 'http_scheme_ratio',
|
||||
# Thèse §5
|
||||
'path_transition_entropy',
|
||||
'cadence_cv', 'burst_ratio', 'pause_ratio',
|
||||
# Thèse §5 — timing (burst_ratio, pause_ratio, lag1_autocorrelation, benford_deviation)
|
||||
# path_transition_entropy et cadence_cv remplacés par embeddings Transformer
|
||||
'burst_ratio', 'pause_ratio',
|
||||
'lag1_autocorrelation', 'benford_deviation',
|
||||
# §5.2 — Session Transformer embeddings (remplace path_transition_entropy + cadence_cv)
|
||||
*[f'seq_emb_{i}' for i in range(32)],
|
||||
'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
|
||||
# §5.8b — Similarité Jaccard cross-domaine (chemins partagés entre hosts)
|
||||
'cross_domain_path_similarity',
|
||||
|
||||
@ -6,6 +6,7 @@ scipy>=1.14
|
||||
hdbscan>=0.8.38
|
||||
isotree>=0.6.1
|
||||
torch>=2.0
|
||||
FrEIA>=0.2
|
||||
xgboost>=2.0
|
||||
cleanlab>=2.6
|
||||
pyyaml>=6.0
|
||||
|
||||
296
services/bot-detector/bot_detector/session_transformer.py
Normal file
296
services/bot-detector/bot_detector/session_transformer.py
Normal file
@ -0,0 +1,296 @@
|
||||
"""Embedding de séquence de session via Transformer.
|
||||
|
||||
Remplace path_transition_entropy et cadence_cv par un vecteur dense de 32
|
||||
dimensions capturant la structure séquentielle complète d'une session HTTP.
|
||||
|
||||
Architecture :
|
||||
- Embedding chemin (hash-based, vocab 8192) + méthode (6 tokens) + delta_t projeté
|
||||
- TransformerEncoder (2 couches, 4 têtes, d_model=64)
|
||||
- Mean pooling temporel → Linear(64, 32) → seq_emb_0..seq_emb_31
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .config import TORCH_AVAILABLE, MODEL_DIR
|
||||
from .log import log_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constantes
|
||||
# ---------------------------------------------------------------------------
|
||||
EMBEDDING_DIM = 32  # size of the vector produced per session (seq_emb_0 .. seq_emb_31)
D_MODEL = 64  # hidden width shared by the input embeddings and the encoder
N_HEADS = 4  # attention heads per encoder layer
N_LAYERS = 2  # number of stacked encoder layers
FFN_DIM = 128  # feed-forward width inside each encoder layer
PATH_VOCAB_SIZE = 8192  # number of hash buckets request paths are folded into
METHOD_VOCAB_SIZE = 6  # GET POST PUT DELETE PATCH, plus one catch-all id for HEAD and anything else
MAX_SEQ_LEN = 512  # length of the precomputed positional table; longer inputs are truncated

# Location of the serialized weights; the SESSION_TRANSFORMER_PATH environment
# variable overrides the default file under MODEL_DIR.
SESSION_TRANSFORMER_PATH = os.getenv(
    'SESSION_TRANSFORMER_PATH',
    os.path.join(MODEL_DIR, 'session_transformer.pt')
)

# Explicit ids for the five named methods; any other method falls back to
# METHOD_VOCAB_SIZE - 1 in _encode_method.
METHOD_MAP = {
    'GET': 0, 'POST': 1, 'PUT': 2,
    'DELETE': 3, 'PATCH': 4,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Modèle
|
||||
# ---------------------------------------------------------------------------
|
||||
if TORCH_AVAILABLE:
    import torch
    import torch.nn as nn

    class SessionTransformer(nn.Module):
        """Compact Transformer encoder that maps an HTTP request sequence to one vector.

        Each time step is the sum of a path embedding, a method embedding and a
        linear projection of the inter-request delay, plus a sinusoidal positional
        encoding. The encoder output is averaged over the time axis and projected
        to ``out_dim`` values.

        No padding mask is applied: every position handed to ``forward``,
        including any padding the caller added, takes part in attention and in
        the mean.
        """

        def __init__(
            self,
            d_model: int = D_MODEL,
            nhead: int = N_HEADS,
            num_layers: int = N_LAYERS,
            dim_feedforward: int = FFN_DIM,
            path_vocab: int = PATH_VOCAB_SIZE,
            method_vocab: int = METHOD_VOCAB_SIZE,
            out_dim: int = EMBEDDING_DIM,
        ):
            super().__init__()
            self.d_model = d_model

            # Per-step input encoders. Attribute names double as state_dict keys.
            self.path_emb = nn.Embedding(path_vocab, d_model)
            self.method_emb = nn.Embedding(method_vocab, d_model)
            self.delta_proj = nn.Linear(1, d_model)

            layer = nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=0.1,
                batch_first=True,
            )
            self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

            self.head = nn.Linear(d_model, out_dim)

            # Sinusoidal positions are computed once for MAX_SEQ_LEN steps and
            # stored as a buffer named '_pe'.
            pos_table = self._build_pos_encoding(MAX_SEQ_LEN, d_model)
            self.register_buffer('_pe', pos_table)

        @staticmethod
        def _build_pos_encoding(max_len: int, d_model: int) -> 'torch.Tensor':
            """Return the ``(max_len, d_model)`` sinusoidal positional table."""
            table = torch.zeros(max_len, d_model)
            steps = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
            inv_freq = torch.exp(
                torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
            )
            angles = steps * inv_freq
            table[:, 0::2] = torch.sin(angles)  # even channels
            table[:, 1::2] = torch.cos(angles)  # odd channels
            return table

        def forward(
            self,
            path_ids: 'torch.Tensor',    # (B, S) long
            method_ids: 'torch.Tensor',  # (B, S) long
            delta_ts: 'torch.Tensor',    # (B, S) float
        ) -> 'torch.Tensor':
            """Encode a batch of sequences and return a ``(B, out_dim)`` tensor."""
            _, n_steps = path_ids.shape
            # Inputs longer than the positional table are cut to its length.
            keep = min(n_steps, MAX_SEQ_LEN)
            path_ids, method_ids, delta_ts = (
                t[:, :keep] for t in (path_ids, method_ids, delta_ts)
            )

            tokens = self.path_emb(path_ids) + self.method_emb(method_ids)
            tokens = tokens + self.delta_proj(delta_ts.unsqueeze(-1))
            tokens = tokens + self._pe[:keep].unsqueeze(0)

            encoded = self.transformer(tokens)  # (B, S, d_model)
            return self.head(encoded.mean(dim=1))  # (B, out_dim)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fonction d'extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
def _hash_path(path: str) -> int:
    """Map an HTTP path to a bucket id in ``[0, PATH_VOCAB_SIZE)``, stable across runs.

    The built-in ``hash()`` is salted per interpreter process for ``str``
    (unless PYTHONHASHSEED is pinned), so it assigns a different bucket to the
    same path after every restart; the ids fed to the path embedding would then
    never match the ids any saved weights were trained with. CRC-32 of the UTF-8
    bytes gives the same value in every process and on every machine.

    NOTE(review): any code that trains the SessionTransformer must hash paths
    with this same function.
    """
    import zlib  # function-scope import keeps this fix self-contained

    # str() tolerates a missing/NULL path; 'replace' guarantees encode() cannot
    # raise on malformed text coming from raw logs.
    raw = str(path).encode('utf-8', errors='replace')
    return zlib.crc32(raw) % PATH_VOCAB_SIZE
|
||||
|
||||
|
||||
def _encode_method(method: str) -> int:
    """Return the embedding id of an HTTP method.

    Methods listed in METHOD_MAP (matched case-insensitively) get their own id.
    Everything else shares the last id, ``METHOD_VOCAB_SIZE - 1``. That includes
    a missing or non-string value, which would otherwise raise on ``.upper()``
    and abort the whole embedding extraction.
    """
    fallback = METHOD_VOCAB_SIZE - 1
    if not isinstance(method, str):
        return fallback
    return METHOD_MAP.get(method.upper(), fallback)
|
||||
|
||||
|
||||
def _zero_embeddings(df_sessions: pd.DataFrame) -> pd.DataFrame:
    """Return the (src_ip, ja4, host) keys of ``df_sessions`` with all-zero seq_emb_* columns."""
    result = df_sessions[['src_ip', 'ja4', 'host']].copy()
    for i in range(EMBEDDING_DIM):
        result[f'seq_emb_{i}'] = 0.0
    return result


def extract_sequence_embeddings(
    df_sessions: pd.DataFrame,
    client,
) -> pd.DataFrame:
    """Compute one Transformer embedding per session.

    Parameters
    ----------
    df_sessions : DataFrame holding the session keys (src_ip, ja4, host).
    client : database client exposing ``query_df(sql)``.

    Returns
    -------
    DataFrame with columns src_ip, ja4, host, seq_emb_0..seq_emb_{EMBEDDING_DIM-1},
    one row per row of ``df_sessions``. Sessions with fewer than two logged
    requests, or absent from the logs, get an all-zero vector. All-zero vectors
    are also returned for every session when PyTorch is unavailable, the query
    fails, or the query returns nothing.

    Notes
    -----
    Each token pairs a request with the delay since the previous request of the
    same session, so the first request of a session contributes no token.

    NOTE(review): the query reads every request of the last hour and is not
    restricted to the keys present in ``df_sessions``.
    """
    emb_cols = [f'seq_emb_{i}' for i in range(EMBEDDING_DIM)]

    # -- Fallback when PyTorch is missing ------------------------------------
    if not TORCH_AVAILABLE:
        log_info('[Transformer] PyTorch non disponible — embeddings à zéro.')
        return _zero_embeddings(df_sessions)

    # -- Build / load the model ----------------------------------------------
    # The model is instantiated under a forked, fixed-seed RNG: when no weights
    # file exists the "random" embeddings are then at least the same projection
    # on every call instead of changing from one cycle to the next. The global
    # RNG state is left untouched.
    with torch.random.fork_rng(devices=[]):
        torch.manual_seed(0)
        model = SessionTransformer()
    model.eval()

    if os.path.isfile(SESSION_TRANSFORMER_PATH):
        state = torch.load(SESSION_TRANSFORMER_PATH, map_location='cpu', weights_only=True)
        model.load_state_dict(state)
        log_info(f'[Transformer] Poids chargés depuis {SESSION_TRANSFORMER_PATH}')
    else:
        log_info(
            f'[Transformer] ATTENTION : {SESSION_TRANSFORMER_PATH} introuvable — '
            f'embeddings aléatoires (modèle non pré-entraîné).'
        )

    # -- Fetch the raw request sequences --------------------------------------
    # Only the columns consumed below are selected.
    query = """
        SELECT
            src_ip,
            ja4,
            host,
            time,
            method,
            path
        FROM ja4_logs.http_logs
        WHERE time >= now() - INTERVAL 1 HOUR
        ORDER BY src_ip, ja4, host, time
    """
    try:
        raw = client.query_df(query)
    except Exception as e:  # best-effort enrichment: degrade to zeros, never crash the cycle
        log_info(f'[Transformer] Erreur requête http_logs : {e} — embeddings à zéro.')
        return _zero_embeddings(df_sessions)

    if raw is None or raw.empty:
        log_info('[Transformer] Aucun log brut dans http_logs — embeddings à zéro.')
        return _zero_embeddings(df_sessions)

    raw.columns = [c.split('.')[-1] for c in raw.columns]
    raw['src_ip'] = raw['src_ip'].astype(str)
    raw['time'] = pd.to_datetime(raw['time'])

    # -- Encode each session, bucketed by token count -------------------------
    # Working on Series (not on lists turned into a DatetimeIndex) is what makes
    # .diff().dt and positional slicing valid here.
    buckets: dict = {}
    for key, grp in raw.groupby(['src_ip', 'ja4', 'host'], sort=False):
        if len(grp) < 2:
            continue  # no inter-request delay to encode; stays at the zero default

        grp = grp.tail(MAX_SEQ_LEN)  # keep the most recent requests

        # Delay since the previous request in ms, compressed as log1p(ms) / 10.
        gaps_ms = grp['time'].diff().dt.total_seconds().iloc[1:].fillna(0) * 1000
        delta_norm = (np.log1p(gaps_ms.clip(lower=0).to_numpy()) / 10.0).astype(np.float32)

        path_ids = np.array([_hash_path(p) for p in grp['path'].iloc[1:]], dtype=np.int64)
        method_ids = np.array([_encode_method(m) for m in grp['method'].iloc[1:]], dtype=np.int64)

        session_key = (str(key[0]), key[1], key[2])
        buckets.setdefault(len(path_ids), []).append(
            (session_key, path_ids, method_ids, delta_norm)
        )

    if not buckets:
        log_info('[Transformer] Aucune session avec ≥2 requêtes — embeddings à zéro.')
        return _zero_embeddings(df_sessions)

    # -- Batched inference -----------------------------------------------------
    # Sessions are batched only with sessions of the same length, so no padding
    # is needed. The model applies no padding mask; padded positions would be
    # attended to and averaged, making a session's vector depend on the longest
    # session in its batch. Batches are also capped by token count to bound
    # attention memory.
    token_budget = 16384
    results = {}
    with torch.no_grad():
        for seq_len, members in buckets.items():
            step = max(1, token_budget // seq_len)
            for start in range(0, len(members), step):
                chunk = members[start:start + step]
                out = model(
                    torch.from_numpy(np.stack([m[1] for m in chunk])),
                    torch.from_numpy(np.stack([m[2] for m in chunk])),
                    torch.from_numpy(np.stack([m[3] for m in chunk])),
                ).numpy()  # (len(chunk), EMBEDDING_DIM)
                for member, vec in zip(chunk, out):
                    results[member[0]] = vec

    # -- Assemble the result, one row per input session ------------------------
    keys = df_sessions[['src_ip', 'ja4', 'host']].copy().reset_index(drop=True)
    keys['src_ip'] = keys['src_ip'].astype(str)

    emb = np.zeros((len(keys), EMBEDDING_DIM), dtype=np.float64)
    for pos, key in enumerate(zip(keys['src_ip'], keys['ja4'], keys['host'])):
        vec = results.get(key)
        if vec is not None:
            emb[pos] = vec

    return pd.concat([keys, pd.DataFrame(emb, columns=emb_cols)], axis=1)
|
||||
@ -349,31 +349,39 @@ def test_ae_torch_availability_flag():
|
||||
assert isinstance(avail, bool)
|
||||
|
||||
|
||||
def _make_ae(n_features, latent_dim=4):
|
||||
"""Build a standalone TrafficAutoEncoder for testing (avoids importing bot_detector module)."""
|
||||
def _make_nf(n_features):
|
||||
"""Build a standalone TrafficNormalizingFlow for testing (avoids importing bot_detector module)."""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
class _AE:
|
||||
def __init__(self, n_feat, ldim):
|
||||
class _NF:
|
||||
def __init__(self, n_feat):
|
||||
self.n_features = n_feat
|
||||
self.latent_dim = ldim
|
||||
self.device = torch.device('cpu')
|
||||
dim1 = min(64, max(n_feat, ldim + 4))
|
||||
dim2 = min(32, max(dim1 // 2, ldim + 2))
|
||||
self.encoder = nn.Sequential(
|
||||
nn.Linear(n_feat, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, ldim),
|
||||
)
|
||||
self.decoder = nn.Sequential(
|
||||
nn.Linear(ldim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, n_feat), nn.Sigmoid(),
|
||||
)
|
||||
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||
self._scaler_min = None
|
||||
self._scaler_range = None
|
||||
self._build_model()
|
||||
|
||||
def _subnet_fc(self, c_in, c_out):
|
||||
return nn.Sequential(
|
||||
nn.Linear(c_in, 64), nn.ReLU(),
|
||||
nn.Linear(64, 64), nn.ReLU(),
|
||||
nn.Linear(64, c_out),
|
||||
)
|
||||
|
||||
def _build_model(self):
|
||||
import FrEIA.framework as Ff
|
||||
import FrEIA.modules as Fm
|
||||
nodes = [Ff.InputNode(self.n_features, name='input')]
|
||||
for i in range(4):
|
||||
nodes.append(Ff.Node(
|
||||
nodes[-1],
|
||||
Fm.AllInOneBlock,
|
||||
{'subnet_constructor': self._subnet_fc, 'affine_clamping': 2.0},
|
||||
name=f'coupling_{i}',
|
||||
))
|
||||
nodes.append(Ff.OutputNode(nodes[-1], name='output'))
|
||||
self.flow = Ff.GraphINN(nodes, verbose=False).to(self.device)
|
||||
|
||||
def _to_tensor(self, X):
|
||||
if self._scaler_min is not None:
|
||||
@ -382,119 +390,134 @@ def _make_ae(n_features, latent_dim=4):
|
||||
X_n = X
|
||||
return torch.tensor(np.clip(X_n, 0, 1), dtype=torch.float32)
|
||||
|
||||
def log_likelihood(self, x):
|
||||
z, log_det = self.flow(x)
|
||||
log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.n_features * np.log(2 * np.pi)
|
||||
return log_pz + log_det
|
||||
|
||||
def fit(self, X, epochs=50, lr=1e-3, batch_size=256):
|
||||
self._scaler_min = X.min(axis=0)
|
||||
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||
X_t = self._to_tensor(X)
|
||||
dataset = torch.utils.data.TensorDataset(X_t)
|
||||
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||
criterion = nn.MSELoss()
|
||||
self.encoder.train(); self.decoder.train()
|
||||
optimizer = torch.optim.Adam(self.flow.parameters(), lr=lr, weight_decay=1e-5)
|
||||
self.flow.train()
|
||||
losses = []
|
||||
for _ in range(epochs):
|
||||
epoch_loss = 0.0
|
||||
for (batch,) in loader:
|
||||
latent = self.encoder(batch)
|
||||
recon = self.decoder(latent)
|
||||
loss = criterion(recon, batch)
|
||||
log_p = self.log_likelihood(batch)
|
||||
loss = -log_p.mean()
|
||||
optimizer.zero_grad(); loss.backward(); optimizer.step()
|
||||
epoch_loss += loss.item() * len(batch)
|
||||
losses.append(epoch_loss / len(X_t))
|
||||
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||
|
||||
def score_samples(self, X):
|
||||
self.encoder.eval(); self.decoder.eval()
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return ((self.decoder(self.encoder(X_t)) - X_t) ** 2).mean(dim=1).numpy()
|
||||
return -self.log_likelihood(X_t).numpy()
|
||||
|
||||
def encode(self, X):
|
||||
self.encoder.eval()
|
||||
self.flow.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return self.encoder(X_t).numpy()
|
||||
z, _ = self.flow(X_t)
|
||||
return z.numpy()
|
||||
|
||||
def state_dict(self):
|
||||
return {'encoder': self.encoder.state_dict(), 'decoder': self.decoder.state_dict(),
|
||||
return {'flow': self.flow.state_dict(),
|
||||
'scaler_min': self._scaler_min, 'scaler_range': self._scaler_range,
|
||||
'n_features': self.n_features, 'latent_dim': self.latent_dim}
|
||||
'n_features': self.n_features}
|
||||
|
||||
@classmethod
|
||||
def load_state_dict(cls, state):
|
||||
ae = cls(state['n_features'], state['latent_dim'])
|
||||
ae._scaler_min = state['scaler_min']
|
||||
ae._scaler_range = state['scaler_range']
|
||||
ae.encoder.load_state_dict(state['encoder'])
|
||||
ae.decoder.load_state_dict(state['decoder'])
|
||||
return ae
|
||||
nf = cls(state['n_features'])
|
||||
nf._scaler_min = state['scaler_min']
|
||||
nf._scaler_range = state['scaler_range']
|
||||
nf.flow.load_state_dict(state['flow'])
|
||||
return nf
|
||||
|
||||
return _AE(n_features, latent_dim)
|
||||
return _NF(n_features)
|
||||
|
||||
|
||||
def test_ae_class_train_and_score():
|
||||
"""TrafficAutoEncoder trains on normal data and scores anomalies higher."""
|
||||
def test_nf_class_train_and_score():
|
||||
"""TrafficNormalizingFlow trains on normal data and scores anomalies higher."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
try:
|
||||
import FrEIA
|
||||
except ImportError:
|
||||
pytest.skip("FrEIA not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
n_features = 10
|
||||
X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1)
|
||||
X_anomaly = rng.uniform(0.8, 1.0, (20, n_features))
|
||||
|
||||
ae = _make_ae(n_features, latent_dim=4)
|
||||
stats = ae.fit(X_normal, epochs=30, lr=1e-3)
|
||||
assert stats['final_loss'] > 0, "Loss should be positive"
|
||||
nf = _make_nf(n_features)
|
||||
stats = nf.fit(X_normal, epochs=30, lr=1e-3)
|
||||
assert stats['final_loss'] > 0, "NLL should be positive"
|
||||
assert stats['epochs'] == 30
|
||||
assert stats['n_samples'] == 200
|
||||
|
||||
normal_scores = ae.score_samples(X_normal)
|
||||
anomaly_scores = ae.score_samples(X_anomaly)
|
||||
normal_scores = nf.score_samples(X_normal) # -log p(x)
|
||||
anomaly_scores = nf.score_samples(X_anomaly)
|
||||
assert np.mean(anomaly_scores) > np.mean(normal_scores), \
|
||||
f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})"
|
||||
f"Anomaly -logp ({np.mean(anomaly_scores):.4f}) should > normal -logp ({np.mean(normal_scores):.4f})"
|
||||
|
||||
|
||||
def test_ae_encode_latent_space():
|
||||
"""Autoencoder encode() returns correct dimensionality."""
|
||||
def test_nf_encode_latent_space():
|
||||
"""Normalizing Flow encode() returns same dimensionality as input (bijection)."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
try:
|
||||
import FrEIA
|
||||
except ImportError:
|
||||
pytest.skip("FrEIA not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
X = rng.normal(0.5, 0.1, (50, 8)).clip(0, 1)
|
||||
|
||||
ae = _make_ae(8, latent_dim=4)
|
||||
ae.fit(X, epochs=5)
|
||||
latent = ae.encode(X)
|
||||
assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}"
|
||||
nf = _make_nf(8)
|
||||
nf.fit(X, epochs=5)
|
||||
z = nf.encode(X)
|
||||
assert z.shape == (50, 8), f"Latent shape should be (50, 8), got {z.shape}"
|
||||
|
||||
|
||||
def test_ae_state_dict_save_load():
|
||||
"""Autoencoder can save and load state dict."""
|
||||
def test_nf_state_dict_save_load():
|
||||
"""Normalizing Flow can save and load state dict."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
try:
|
||||
import FrEIA
|
||||
except ImportError:
|
||||
pytest.skip("FrEIA not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1)
|
||||
|
||||
ae = _make_ae(6, latent_dim=3)
|
||||
ae.fit(X, epochs=10)
|
||||
scores_before = ae.score_samples(X)
|
||||
nf = _make_nf(6)
|
||||
nf.fit(X, epochs=10)
|
||||
scores_before = nf.score_samples(X)
|
||||
|
||||
state = ae.state_dict()
|
||||
ae2 = type(ae).load_state_dict(state)
|
||||
scores_after = ae2.score_samples(X)
|
||||
state = nf.state_dict()
|
||||
nf2 = type(nf).load_state_dict(state)
|
||||
scores_after = nf2.score_samples(X)
|
||||
np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5,
|
||||
err_msg="Scores should be identical after load")
|
||||
|
||||
|
||||
def test_ae_weight_combination():
|
||||
"""Combined score should be weighted average of EIF and AE components."""
|
||||
def test_nf_weight_combination():
|
||||
"""Combined score should be weighted average of EIF and NF components."""
|
||||
eif_norm = np.array([0.2, 0.8, 0.5])
|
||||
ae_norm = np.array([0.3, 0.9, 0.4])
|
||||
alpha = 0.30
|
||||
|
||||
Reference in New Issue
Block a user