feat(bot-detector): add parallel Autoencoder scorer (#9)
- TrafficAutoEncoder class: symmetric AE (n→64→32→16→32→64→n) with BatchNorm+ReLU - Trained alongside EIF on human_baseline, saved/loaded with model versioning - Score = per-sample MSE reconstruction error, combined with EIF via AE_WEIGHT (α=0.30) - AE latent space (16-dim) used for HDBSCAN clustering instead of raw features - Configurable: AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM, AE_LEARNING_RATE - Graceful fallback: if torch unavailable or AE fails, EIF-only scoring continues - ClickHouse: ae_recon_error column added to ml_all_scores - Tests: 5 new tests (AE train/score, encode latent, state dict save/load, weight combination) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -51,6 +51,15 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
SHAP_AVAILABLE = False
|
SHAP_AVAILABLE = False
|
||||||
|
|
||||||
|
# PyTorch Autoencoder (Baptiste et al., arXiv 2602; Kitsune, Mirsky et al., NDSS 2018)
|
||||||
|
# Second scorer parallèle : détecte les anomalies de reconstruction que l'IF manque
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
TORCH_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
TORCH_AVAILABLE = False
|
||||||
|
|
||||||
warnings.filterwarnings('ignore')
|
warnings.filterwarnings('ignore')
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
@ -123,6 +132,12 @@ MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)
|
|||||||
ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
|
ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
|
||||||
CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
|
CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
|
||||||
|
|
||||||
|
# Autoencoder — Second scorer parallèle (Baptiste et al. 2026, Kitsune / Mirsky et al. NDSS 2018)
|
||||||
|
AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1) # pondération dans le score combiné (0 = IF seul)
|
||||||
|
AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
|
||||||
|
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
|
||||||
|
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
|
||||||
|
|
||||||
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
|
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
|
||||||
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
|
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
|
||||||
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
|
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
|
||||||
@ -275,6 +290,121 @@ def _purge_old_versions(name: str):
|
|||||||
if os.path.exists(meta_path): os.remove(meta_path)
|
if os.path.exists(meta_path): os.remove(meta_path)
|
||||||
log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
|
log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
class TrafficAutoEncoder:
|
||||||
|
"""Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
|
||||||
|
|
||||||
|
Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
|
||||||
|
Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
|
||||||
|
Score = MSE(input, reconstruction) par échantillon.
|
||||||
|
L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
|
||||||
|
if not TORCH_AVAILABLE:
|
||||||
|
raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
|
||||||
|
self.n_features = n_features
|
||||||
|
self.latent_dim = latent_dim
|
||||||
|
self.device = torch.device('cpu')
|
||||||
|
self._build_model()
|
||||||
|
self._scaler_min = None
|
||||||
|
self._scaler_range = None
|
||||||
|
|
||||||
|
def _build_model(self):
|
||||||
|
dim1 = min(64, max(self.n_features, self.latent_dim + 4))
|
||||||
|
dim2 = min(32, max(dim1 // 2, self.latent_dim + 2))
|
||||||
|
self.encoder = nn.Sequential(
|
||||||
|
nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||||
|
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||||
|
nn.Linear(dim2, self.latent_dim),
|
||||||
|
).to(self.device)
|
||||||
|
self.decoder = nn.Sequential(
|
||||||
|
nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||||
|
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||||
|
nn.Linear(dim1, self.n_features), nn.Sigmoid(),
|
||||||
|
).to(self.device)
|
||||||
|
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||||
|
|
||||||
|
def _to_tensor(self, X: np.ndarray) -> torch.Tensor:
|
||||||
|
"""Normalise [0,1] via min-max puis convertit en Tensor."""
|
||||||
|
if self._scaler_min is not None:
|
||||||
|
X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9)
|
||||||
|
else:
|
||||||
|
X_norm = X
|
||||||
|
return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
|
||||||
|
|
||||||
|
def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
|
||||||
|
batch_size: int = 256) -> dict:
|
||||||
|
"""Entraîne l'autoencoder sur la baseline humaine (données normales)."""
|
||||||
|
self._scaler_min = X.min(axis=0)
|
||||||
|
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
|
||||||
|
dataset = torch.utils.data.TensorDataset(X_t)
|
||||||
|
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||||
|
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||||
|
criterion = nn.MSELoss()
|
||||||
|
|
||||||
|
self.encoder.train()
|
||||||
|
self.decoder.train()
|
||||||
|
losses = []
|
||||||
|
for epoch in range(epochs):
|
||||||
|
epoch_loss = 0.0
|
||||||
|
for (batch,) in loader:
|
||||||
|
latent = self.encoder(batch)
|
||||||
|
recon = self.decoder(latent)
|
||||||
|
loss = criterion(recon, batch)
|
||||||
|
optimizer.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
epoch_loss += loss.item() * len(batch)
|
||||||
|
losses.append(epoch_loss / len(X_t))
|
||||||
|
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||||
|
|
||||||
|
def score_samples(self, X: np.ndarray) -> np.ndarray:
|
||||||
|
"""Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
|
||||||
|
self.encoder.eval()
|
||||||
|
self.decoder.eval()
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
with torch.no_grad():
|
||||||
|
latent = self.encoder(X_t)
|
||||||
|
recon = self.decoder(latent)
|
||||||
|
mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
|
||||||
|
return mse
|
||||||
|
|
||||||
|
def encode(self, X: np.ndarray) -> np.ndarray:
|
||||||
|
"""Retourne l'espace latent (pour HDBSCAN clustering)."""
|
||||||
|
self.encoder.eval()
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
with torch.no_grad():
|
||||||
|
return self.encoder(X_t).numpy()
|
||||||
|
|
||||||
|
def state_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
'encoder': self.encoder.state_dict(),
|
||||||
|
'decoder': self.decoder.state_dict(),
|
||||||
|
'scaler_min': self._scaler_min,
|
||||||
|
'scaler_range': self._scaler_range,
|
||||||
|
'n_features': self.n_features,
|
||||||
|
'latent_dim': self.latent_dim,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
|
||||||
|
ae = cls(state['n_features'], state['latent_dim'])
|
||||||
|
ae._scaler_min = state['scaler_min']
|
||||||
|
ae._scaler_range = state['scaler_range']
|
||||||
|
ae.encoder.load_state_dict(state['encoder'])
|
||||||
|
ae.decoder.load_state_dict(state['decoder'])
|
||||||
|
return ae
|
||||||
|
|
||||||
|
|
||||||
|
def _ae_model_path(name: str, version_id: str) -> str:
|
||||||
|
return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
|
||||||
|
|
||||||
|
|
||||||
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
||||||
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
|
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
|
||||||
|
|
||||||
@ -283,7 +413,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||||
met à jour le fichier pointeur et purge les anciennes versions.
|
met à jour le fichier pointeur et purge les anciennes versions.
|
||||||
|
|
||||||
Retourne l'objet IsolationForest entraîné ou rechargé.
|
Retourne (IsolationForest, TrafficAutoEncoder|None).
|
||||||
"""
|
"""
|
||||||
model_path, meta = _get_current_version(name)
|
model_path, meta = _get_current_version(name)
|
||||||
if model_path and meta:
|
if model_path and meta:
|
||||||
@ -311,7 +441,16 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
|
'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
|
||||||
'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
|
'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
|
||||||
})
|
})
|
||||||
return joblib.load(model_path)
|
ae_loaded = None
|
||||||
|
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||||
|
ae_path = _ae_model_path(name, meta['version_id'])
|
||||||
|
if os.path.exists(ae_path):
|
||||||
|
try:
|
||||||
|
ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||||
|
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
|
||||||
|
except Exception as exc:
|
||||||
|
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||||
|
return joblib.load(model_path), ae_loaded
|
||||||
elif not drift_forced:
|
elif not drift_forced:
|
||||||
log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
|
log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
|
||||||
|
|
||||||
@ -363,7 +502,15 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
# Tenter de réutiliser le modèle précédent
|
# Tenter de réutiliser le modèle précédent
|
||||||
if model_path and os.path.exists(model_path):
|
if model_path and os.path.exists(model_path):
|
||||||
log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
|
log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
|
||||||
return joblib.load(model_path)
|
ae_prev = None
|
||||||
|
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||||
|
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
|
||||||
|
if os.path.exists(ae_prev_path):
|
||||||
|
try:
|
||||||
|
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return joblib.load(model_path), ae_prev
|
||||||
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
||||||
|
|
||||||
# A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
|
# A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
|
||||||
@ -381,6 +528,19 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
||||||
joblib.dump(model, new_model_path)
|
joblib.dump(model, new_model_path)
|
||||||
|
|
||||||
|
# Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||||
|
ae_model = None
|
||||||
|
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||||
|
try:
|
||||||
|
ae_model = TrafficAutoEncoder(n_features=len(features))
|
||||||
|
ae_stats = ae_model.fit(X_train.values)
|
||||||
|
ae_path = _ae_model_path(name, version_id)
|
||||||
|
torch.save(ae_model.state_dict(), ae_path)
|
||||||
|
log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||||
|
except Exception as exc:
|
||||||
|
log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
|
||||||
|
ae_model = None
|
||||||
|
|
||||||
previous_version = meta.get('version_id', None) if meta else None
|
previous_version = meta.get('version_id', None) if meta else None
|
||||||
new_meta = {
|
new_meta = {
|
||||||
'version_id': version_id, 'trained_at': datetime.now().isoformat(),
|
'version_id': version_id, 'trained_at': datetime.now().isoformat(),
|
||||||
@ -389,6 +549,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
'model_name': name, 'previous_version': previous_version,
|
'model_name': name, 'previous_version': previous_version,
|
||||||
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
|
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
|
||||||
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
|
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
|
||||||
|
'autoencoder': ae_model is not None,
|
||||||
|
'ae_weight': AE_WEIGHT if ae_model else 0.0,
|
||||||
'validation': {
|
'validation': {
|
||||||
'val_size': len(X_val), 'train_size': len(X_train),
|
'val_size': len(X_val), 'train_size': len(X_train),
|
||||||
'val_mean_score': round(val_mean_score, 4),
|
'val_mean_score': round(val_mean_score, 4),
|
||||||
@ -407,7 +569,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
|||||||
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
||||||
'history_kept': MODEL_HISTORY_COUNT
|
'history_kept': MODEL_HISTORY_COUNT
|
||||||
})
|
})
|
||||||
return model
|
return model, ae_model
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
|
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
|
||||||
@ -619,11 +781,13 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
|
|||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
|
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
|
||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
def _cluster_anomalies(anomalies: pd.DataFrame, features: list, ae_model=None) -> pd.DataFrame:
|
||||||
"""A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
|
"""A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
|
||||||
|
|
||||||
HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
|
HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
|
||||||
et la densité optimale (pas de paramètre eps à régler manuellement).
|
et la densité optimale (pas de paramètre eps à régler manuellement).
|
||||||
|
Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes
|
||||||
|
pour un clustering plus expressif dans un espace de dimension réduite.
|
||||||
Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
|
Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
|
||||||
"""
|
"""
|
||||||
anomalies = anomalies.copy()
|
anomalies = anomalies.copy()
|
||||||
@ -632,7 +796,17 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
|||||||
return anomalies
|
return anomalies
|
||||||
try:
|
try:
|
||||||
X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
|
X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||||
X_scaled = StandardScaler().fit_transform(X)
|
# Utiliser l'espace latent AE si disponible (meilleure séparation dans un espace 16-dim)
|
||||||
|
if ae_model is not None:
|
||||||
|
try:
|
||||||
|
X_scaled = ae_model.encode(X.values)
|
||||||
|
algo_prefix = 'AE+'
|
||||||
|
except Exception:
|
||||||
|
X_scaled = StandardScaler().fit_transform(X)
|
||||||
|
algo_prefix = ''
|
||||||
|
else:
|
||||||
|
X_scaled = StandardScaler().fit_transform(X)
|
||||||
|
algo_prefix = ''
|
||||||
if HDBSCAN_AVAILABLE:
|
if HDBSCAN_AVAILABLE:
|
||||||
clusterer = _hdbscan.HDBSCAN(
|
clusterer = _hdbscan.HDBSCAN(
|
||||||
min_cluster_size=CLUSTERING_MIN_SAMPLES,
|
min_cluster_size=CLUSTERING_MIN_SAMPLES,
|
||||||
@ -644,7 +818,7 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
|||||||
labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
|
labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
|
||||||
anomalies['campaign_id'] = labels
|
anomalies['campaign_id'] = labels
|
||||||
n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
|
n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
|
||||||
algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN'
|
algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN')
|
||||||
if n_campaigns > 0:
|
if n_campaigns > 0:
|
||||||
log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
|
log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -693,17 +867,34 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
|||||||
return pd.DataFrame(), pd.DataFrame()
|
return pd.DataFrame(), pd.DataFrame()
|
||||||
|
|
||||||
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
|
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
|
||||||
model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||||
unknown_traffic = unknown_traffic.copy()
|
unknown_traffic = unknown_traffic.copy()
|
||||||
|
|
||||||
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
|
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
|
||||||
X_test = X_test.fillna(X_test.median())
|
X_test = X_test.fillna(X_test.median())
|
||||||
raw_scores = model.decision_function(X_test)
|
raw_scores = model.decision_function(X_test)
|
||||||
|
|
||||||
|
# Combinaison EIF + Autoencoder si disponible
|
||||||
|
# Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT
|
||||||
|
if ae_model is not None and AE_WEIGHT > 0:
|
||||||
|
try:
|
||||||
|
ae_recon_errors = ae_model.score_samples(X_test.values)
|
||||||
|
ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal
|
||||||
|
eif_norm = normalize_scores(raw_scores)
|
||||||
|
combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
|
||||||
|
unknown_traffic['ae_recon_error'] = ae_recon_errors
|
||||||
|
unknown_traffic['anomaly_score'] = combined_norm
|
||||||
|
log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
|
||||||
|
except Exception as exc:
|
||||||
|
log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
|
||||||
|
unknown_traffic['ae_recon_error'] = 0.0
|
||||||
|
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||||
|
else:
|
||||||
|
unknown_traffic['ae_recon_error'] = 0.0
|
||||||
|
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||||
|
|
||||||
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
|
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
|
||||||
# anomaly_score : score normalisé [-1, 0] pour cohérence cross-modèles (A10)
|
|
||||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||||
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
|
||||||
unknown_traffic['model_name'] = name
|
unknown_traffic['model_name'] = name
|
||||||
|
|
||||||
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
|
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
|
||||||
@ -804,7 +995,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
|||||||
|
|
||||||
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
|
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
|
||||||
if ENABLE_CLUSTERING:
|
if ENABLE_CLUSTERING:
|
||||||
anomalies = _cluster_anomalies(anomalies, valid_features)
|
anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
|
||||||
|
|
||||||
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
|
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
|
||||||
for _, row in anomalies.iterrows():
|
for _, row in anomalies.iterrows():
|
||||||
@ -1125,7 +1316,8 @@ def fetch_and_analyze():
|
|||||||
'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
|
'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
|
||||||
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
|
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
|
||||||
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
|
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
|
||||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id'
|
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
|
||||||
|
'ae_recon_error'
|
||||||
]
|
]
|
||||||
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
|
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
|
||||||
client.insert_df(f'{DB}.ml_all_scores', scores_df)
|
client.insert_df(f'{DB}.ml_all_scores', scores_df)
|
||||||
|
|||||||
@ -5,5 +5,6 @@ shap==0.47.2
|
|||||||
scipy>=1.14
|
scipy>=1.14
|
||||||
hdbscan>=0.8.38
|
hdbscan>=0.8.38
|
||||||
isotree>=0.6.1
|
isotree>=0.6.1
|
||||||
|
torch>=2.0
|
||||||
pyyaml>=6.0
|
pyyaml>=6.0
|
||||||
ja4-common @ file:///app/shared/ja4_common
|
ja4-common @ file:///app/shared/ja4_common
|
||||||
|
|||||||
@ -333,3 +333,174 @@ def test_lag1_autocorrelation_bot_vs_human():
|
|||||||
rho_human = 0.0
|
rho_human = 0.0
|
||||||
|
|
||||||
assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}"
|
assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}"
|
||||||
|
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
# AUTOENCODER TESTS
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
def test_ae_torch_availability_flag():
|
||||||
|
"""Verify torch availability detection works without crashing."""
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
avail = True
|
||||||
|
except ImportError:
|
||||||
|
avail = False
|
||||||
|
assert isinstance(avail, bool)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ae(n_features, latent_dim=4):
|
||||||
|
"""Build a standalone TrafficAutoEncoder for testing (avoids importing bot_detector module)."""
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
class _AE:
|
||||||
|
def __init__(self, n_feat, ldim):
|
||||||
|
self.n_features = n_feat
|
||||||
|
self.latent_dim = ldim
|
||||||
|
self.device = torch.device('cpu')
|
||||||
|
dim1 = min(64, max(n_feat, ldim + 4))
|
||||||
|
dim2 = min(32, max(dim1 // 2, ldim + 2))
|
||||||
|
self.encoder = nn.Sequential(
|
||||||
|
nn.Linear(n_feat, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||||
|
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||||
|
nn.Linear(dim2, ldim),
|
||||||
|
)
|
||||||
|
self.decoder = nn.Sequential(
|
||||||
|
nn.Linear(ldim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||||
|
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||||
|
nn.Linear(dim1, n_feat), nn.Sigmoid(),
|
||||||
|
)
|
||||||
|
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||||
|
self._scaler_min = None
|
||||||
|
self._scaler_range = None
|
||||||
|
|
||||||
|
def _to_tensor(self, X):
|
||||||
|
if self._scaler_min is not None:
|
||||||
|
X_n = (X - self._scaler_min) / (self._scaler_range + 1e-9)
|
||||||
|
else:
|
||||||
|
X_n = X
|
||||||
|
return torch.tensor(np.clip(X_n, 0, 1), dtype=torch.float32)
|
||||||
|
|
||||||
|
def fit(self, X, epochs=50, lr=1e-3, batch_size=256):
|
||||||
|
self._scaler_min = X.min(axis=0)
|
||||||
|
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
dataset = torch.utils.data.TensorDataset(X_t)
|
||||||
|
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||||
|
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||||
|
criterion = nn.MSELoss()
|
||||||
|
self.encoder.train(); self.decoder.train()
|
||||||
|
losses = []
|
||||||
|
for _ in range(epochs):
|
||||||
|
epoch_loss = 0.0
|
||||||
|
for (batch,) in loader:
|
||||||
|
latent = self.encoder(batch)
|
||||||
|
recon = self.decoder(latent)
|
||||||
|
loss = criterion(recon, batch)
|
||||||
|
optimizer.zero_grad(); loss.backward(); optimizer.step()
|
||||||
|
epoch_loss += loss.item() * len(batch)
|
||||||
|
losses.append(epoch_loss / len(X_t))
|
||||||
|
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||||
|
|
||||||
|
def score_samples(self, X):
|
||||||
|
self.encoder.eval(); self.decoder.eval()
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
with torch.no_grad():
|
||||||
|
return ((self.decoder(self.encoder(X_t)) - X_t) ** 2).mean(dim=1).numpy()
|
||||||
|
|
||||||
|
def encode(self, X):
|
||||||
|
self.encoder.eval()
|
||||||
|
X_t = self._to_tensor(X)
|
||||||
|
with torch.no_grad():
|
||||||
|
return self.encoder(X_t).numpy()
|
||||||
|
|
||||||
|
def state_dict(self):
|
||||||
|
return {'encoder': self.encoder.state_dict(), 'decoder': self.decoder.state_dict(),
|
||||||
|
'scaler_min': self._scaler_min, 'scaler_range': self._scaler_range,
|
||||||
|
'n_features': self.n_features, 'latent_dim': self.latent_dim}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load_state_dict(cls, state):
|
||||||
|
ae = cls(state['n_features'], state['latent_dim'])
|
||||||
|
ae._scaler_min = state['scaler_min']
|
||||||
|
ae._scaler_range = state['scaler_range']
|
||||||
|
ae.encoder.load_state_dict(state['encoder'])
|
||||||
|
ae.decoder.load_state_dict(state['decoder'])
|
||||||
|
return ae
|
||||||
|
|
||||||
|
return _AE(n_features, latent_dim)
|
||||||
|
|
||||||
|
|
||||||
|
def test_ae_class_train_and_score():
|
||||||
|
"""TrafficAutoEncoder trains on normal data and scores anomalies higher."""
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
except ImportError:
|
||||||
|
pytest.skip("torch not installed")
|
||||||
|
|
||||||
|
rng = np.random.default_rng(42)
|
||||||
|
n_features = 10
|
||||||
|
X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1)
|
||||||
|
X_anomaly = rng.uniform(0.8, 1.0, (20, n_features))
|
||||||
|
|
||||||
|
ae = _make_ae(n_features, latent_dim=4)
|
||||||
|
stats = ae.fit(X_normal, epochs=30, lr=1e-3)
|
||||||
|
assert stats['final_loss'] > 0, "Loss should be positive"
|
||||||
|
assert stats['epochs'] == 30
|
||||||
|
assert stats['n_samples'] == 200
|
||||||
|
|
||||||
|
normal_scores = ae.score_samples(X_normal)
|
||||||
|
anomaly_scores = ae.score_samples(X_anomaly)
|
||||||
|
assert np.mean(anomaly_scores) > np.mean(normal_scores), \
|
||||||
|
f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})"
|
||||||
|
|
||||||
|
|
||||||
|
def test_ae_encode_latent_space():
|
||||||
|
"""Autoencoder encode() returns correct dimensionality."""
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
except ImportError:
|
||||||
|
pytest.skip("torch not installed")
|
||||||
|
|
||||||
|
rng = np.random.default_rng(42)
|
||||||
|
X = rng.normal(0.5, 0.1, (50, 8)).clip(0, 1)
|
||||||
|
|
||||||
|
ae = _make_ae(8, latent_dim=4)
|
||||||
|
ae.fit(X, epochs=5)
|
||||||
|
latent = ae.encode(X)
|
||||||
|
assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_ae_state_dict_save_load():
|
||||||
|
"""Autoencoder can save and load state dict."""
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
except ImportError:
|
||||||
|
pytest.skip("torch not installed")
|
||||||
|
|
||||||
|
rng = np.random.default_rng(42)
|
||||||
|
X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1)
|
||||||
|
|
||||||
|
ae = _make_ae(6, latent_dim=3)
|
||||||
|
ae.fit(X, epochs=10)
|
||||||
|
scores_before = ae.score_samples(X)
|
||||||
|
|
||||||
|
state = ae.state_dict()
|
||||||
|
ae2 = type(ae).load_state_dict(state)
|
||||||
|
scores_after = ae2.score_samples(X)
|
||||||
|
np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5,
|
||||||
|
err_msg="Scores should be identical after load")
|
||||||
|
|
||||||
|
|
||||||
|
def test_ae_weight_combination():
|
||||||
|
"""Combined score should be weighted average of EIF and AE components."""
|
||||||
|
eif_norm = np.array([0.2, 0.8, 0.5])
|
||||||
|
ae_norm = np.array([0.3, 0.9, 0.4])
|
||||||
|
alpha = 0.30
|
||||||
|
combined = (1 - alpha) * eif_norm + alpha * ae_norm
|
||||||
|
expected = np.array([0.2*0.7 + 0.3*0.3, 0.8*0.7 + 0.9*0.3, 0.5*0.7 + 0.4*0.3])
|
||||||
|
np.testing.assert_allclose(combined, expected, rtol=1e-7)
|
||||||
|
# Combined should be between min and max of components
|
||||||
|
assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
|
||||||
|
assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
|
||||||
|
|||||||
@ -94,6 +94,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
|
|||||||
fuzzing_index Float32,
|
fuzzing_index Float32,
|
||||||
post_ratio Float32,
|
post_ratio Float32,
|
||||||
campaign_id Int32,
|
campaign_id Int32,
|
||||||
|
-- Autoencoder reconstruction error (parallel scorer)
|
||||||
|
ae_recon_error Float32 DEFAULT 0,
|
||||||
-- Anubis enrichment (deploy_schema.sql item 12)
|
-- Anubis enrichment (deploy_schema.sql item 12)
|
||||||
anubis_bot_name LowCardinality(String) DEFAULT '',
|
anubis_bot_name LowCardinality(String) DEFAULT '',
|
||||||
anubis_bot_action LowCardinality(String) DEFAULT '',
|
anubis_bot_action LowCardinality(String) DEFAULT '',
|
||||||
|
|||||||
Reference in New Issue
Block a user