feat(bot-detector): add parallel Autoencoder scorer (#9)

- TrafficAutoEncoder class: symmetric AE (n→64→32→16→32→64→n) with BatchNorm+ReLU - Trained alongside EIF on human_baseline, saved/loaded with model versioning - Score = per-sample MSE reconstruction error, combined with EIF via AE_WEIGHT (α=0.30) - AE latent space (16-dim) used for HDBSCAN clustering instead of raw features - Configurable: AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM, AE_LEARNING_RATE - Graceful fallback: if torch unavailable or AE fails, EIF-only scoring continues - ClickHouse: ae_recon_error column added to ml_all_scores - Tests: 5 new tests (AE train/score, encode latent, state dict save/load, weight combination) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 02:40:39 +02:00
parent f6e2d3c0ca
commit 57cf6c3828
4 changed files with 378 additions and 12 deletions
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@ -51,6 +51,15 @@ try:
 except ImportError:
    SHAP_AVAILABLE = False
 # PyTorch Autoencoder (Baptiste et al., arXiv 2602; Kitsune, Mirsky et al., NDSS 2018)
 # Second scorer parallèle : détecte les anomalies de reconstruction que l'IF manque
 try:
    import torch
    import torch.nn as nn
    TORCH_AVAILABLE = True
 except ImportError:
    TORCH_AVAILABLE = False
 warnings.filterwarnings('ignore')
 # ═══════════════════════════════════════════════════════════════════════════════
@ -123,6 +132,12 @@ MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)
 ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
 CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
 # Autoencoder — Second scorer parallèle (Baptiste et al. 2026, Kitsune / Mirsky et al. NDSS 2018)
 AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1)  # pondération dans le score combiné (0 = IF seul)
 AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
 AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
 AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
 # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
 # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
 STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -275,6 +290,121 @@ def _purge_old_versions(name: str):
        if os.path.exists(meta_path): os.remove(meta_path)
        log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
 # ═══════════════════════════════════════════════════════════════════════════════
 # AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
 # ═══════════════════════════════════════════════════════════════════════════════
 class TrafficAutoEncoder:
    """Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
    Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
    Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
    Score = MSE(input, reconstruction) par échantillon.
    L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
    """
    def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
        if not TORCH_AVAILABLE:
            raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
        self.n_features = n_features
        self.latent_dim = latent_dim
        self.device = torch.device('cpu')
        self._build_model()
        self._scaler_min = None
        self._scaler_range = None
    def _build_model(self):
        dim1 = min(64, max(self.n_features, self.latent_dim + 4))
        dim2 = min(32, max(dim1 // 2, self.latent_dim + 2))
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, self.latent_dim),
        ).to(self.device)
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, self.n_features), nn.Sigmoid(),
        ).to(self.device)
        self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
    def _to_tensor(self, X: np.ndarray) -> torch.Tensor:
        """Normalise [0,1] via min-max puis convertit en Tensor."""
        if self._scaler_min is not None:
            X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9)
        else:
            X_norm = X
        return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
    def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
            batch_size: int = 256) -> dict:
        """Entraîne l'autoencoder sur la baseline humaine (données normales)."""
        self._scaler_min = X.min(axis=0)
        self._scaler_range = X.max(axis=0) - self._scaler_min
        X_t = self._to_tensor(X)
        dataset = torch.utils.data.TensorDataset(X_t)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
        criterion = nn.MSELoss()
        self.encoder.train()
        self.decoder.train()
        losses = []
        for epoch in range(epochs):
            epoch_loss = 0.0
            for (batch,) in loader:
                latent = self.encoder(batch)
                recon = self.decoder(latent)
                loss = criterion(recon, batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * len(batch)
            losses.append(epoch_loss / len(X_t))
        return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
        self.encoder.eval()
        self.decoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            latent = self.encoder(X_t)
            recon = self.decoder(latent)
            mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
        return mse
    def encode(self, X: np.ndarray) -> np.ndarray:
        """Retourne l'espace latent (pour HDBSCAN clustering)."""
        self.encoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            return self.encoder(X_t).numpy()
    def state_dict(self) -> dict:
        return {
            'encoder': self.encoder.state_dict(),
            'decoder': self.decoder.state_dict(),
            'scaler_min': self._scaler_min,
            'scaler_range': self._scaler_range,
            'n_features': self.n_features,
            'latent_dim': self.latent_dim,
        }
    @classmethod
    def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
        ae = cls(state['n_features'], state['latent_dim'])
        ae._scaler_min = state['scaler_min']
        ae._scaler_range = state['scaler_range']
        ae.encoder.load_state_dict(state['encoder'])
        ae.decoder.load_state_dict(state['decoder'])
        return ae
 def _ae_model_path(name: str, version_id: str) -> str:
    return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
 def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
    """Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
@ -283,7 +413,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
    dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
    met à jour le fichier pointeur et purge les anciennes versions.
-    Retourne l'objet IsolationForest entraîné ou rechargé.
+    Retourne (IsolationForest, TrafficAutoEncoder|None).
    """
    model_path, meta = _get_current_version(name)
    if model_path and meta:
@ -311,7 +441,16 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
                'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
                'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
            })
-            return joblib.load(model_path)
+            ae_loaded = None
            if TORCH_AVAILABLE and AE_WEIGHT > 0:
                ae_path = _ae_model_path(name, meta['version_id'])
                if os.path.exists(ae_path):
                    try:
                        ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
                        log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
                    except Exception as exc:
                        log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
            return joblib.load(model_path), ae_loaded
        elif not drift_forced:
            log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
@ -363,7 +502,15 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
        # Tenter de réutiliser le modèle précédent
        if model_path and os.path.exists(model_path):
            log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
-            return joblib.load(model_path)
+            ae_prev = None
            if TORCH_AVAILABLE and AE_WEIGHT > 0:
                ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
                if os.path.exists(ae_prev_path):
                    try:
                        ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
                    except Exception:
                        pass
            return joblib.load(model_path), ae_prev
        log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
    # A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
@ -381,6 +528,19 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
    new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
    joblib.dump(model, new_model_path)
    # Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
    ae_model = None
    if TORCH_AVAILABLE and AE_WEIGHT > 0:
        try:
            ae_model = TrafficAutoEncoder(n_features=len(features))
            ae_stats = ae_model.fit(X_train.values)
            ae_path = _ae_model_path(name, version_id)
            torch.save(ae_model.state_dict(), ae_path)
            log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
        except Exception as exc:
            log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
            ae_model = None
    previous_version = meta.get('version_id', None) if meta else None
    new_meta = {
        'version_id': version_id, 'trained_at': datetime.now().isoformat(),
@ -389,6 +549,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
        'model_name': name, 'previous_version': previous_version,
        'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
        'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
        'autoencoder': ae_model is not None,
        'ae_weight': AE_WEIGHT if ae_model else 0.0,
        'validation': {
            'val_size': len(X_val), 'train_size': len(X_train),
            'val_mean_score': round(val_mean_score, 4),
@ -407,7 +569,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
        'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
        'history_kept': MODEL_HISTORY_COUNT
    })
-    return model
+    return model, ae_model
 # ═══════════════════════════════════════════════════════════════════════════════
 # A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
@ -619,11 +781,13 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
 # ═══════════════════════════════════════════════════════════════════════════════
 # A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
 # ═══════════════════════════════════════════════════════════════════════════════
-def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
+def _cluster_anomalies(anomalies: pd.DataFrame, features: list, ae_model=None) -> pd.DataFrame:
    """A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
    HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
    et la densité optimale (pas de paramètre eps à régler manuellement).
    Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes
    pour un clustering plus expressif dans un espace de dimension réduite.
    Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
    """
    anomalies = anomalies.copy()
@ -632,7 +796,17 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
        return anomalies
    try:
        X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
-        X_scaled = StandardScaler().fit_transform(X)
+        # Utiliser l'espace latent AE si disponible (meilleure séparation dans un espace 16-dim)
        if ae_model is not None:
            try:
                X_scaled = ae_model.encode(X.values)
                algo_prefix = 'AE+'
            except Exception:
                X_scaled = StandardScaler().fit_transform(X)
                algo_prefix = ''
        else:
            X_scaled = StandardScaler().fit_transform(X)
            algo_prefix = ''
        if HDBSCAN_AVAILABLE:
            clusterer = _hdbscan.HDBSCAN(
                min_cluster_size=CLUSTERING_MIN_SAMPLES,
@ -644,7 +818,7 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
            labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
        anomalies['campaign_id'] = labels
        n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
-        algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN'
+        algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN')
        if n_campaigns > 0:
            log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
    except Exception as e:
@ -693,17 +867,34 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
        return pd.DataFrame(), pd.DataFrame()
    # A1 — Dérive conceptuelle intégrée dans load_or_train_model
-    model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
+    model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
    unknown_traffic = unknown_traffic.copy()
    X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
    X_test = X_test.fillna(X_test.median())
    raw_scores = model.decision_function(X_test)
    # Combinaison EIF + Autoencoder si disponible
    # Score final = (1-α) * eif_norm + α * ae_norm   où α = AE_WEIGHT
    if ae_model is not None and AE_WEIGHT > 0:
        try:
            ae_recon_errors = ae_model.score_samples(X_test.values)
            ae_norm = normalize_scores(-ae_recon_errors)  # plus élevé = plus anomal
            eif_norm = normalize_scores(raw_scores)
            combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
            unknown_traffic['ae_recon_error'] = ae_recon_errors
            unknown_traffic['anomaly_score'] = combined_norm
            log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
        except Exception as exc:
            log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
            unknown_traffic['ae_recon_error'] = 0.0
            unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    else:
        unknown_traffic['ae_recon_error'] = 0.0
        unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    # raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
    # anomaly_score     : score normalisé [-1, 0] pour cohérence cross-modèles (A10)
    unknown_traffic['raw_anomaly_score'] = raw_scores
    unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    unknown_traffic['model_name'] = name
    # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
@ -804,7 +995,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
        # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
        if ENABLE_CLUSTERING:
-            anomalies = _cluster_anomalies(anomalies, valid_features)
+            anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
        anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
        for _, row in anomalies.iterrows():
@ -1125,7 +1316,8 @@ def fetch_and_analyze():
                'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
                'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
                'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
-                'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id'
+                'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
                'ae_recon_error'
            ]
            scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
            client.insert_df(f'{DB}.ml_all_scores', scores_df)
--- a/services/bot-detector/bot_detector/requirements.txt
+++ b/services/bot-detector/bot_detector/requirements.txt
@ -5,5 +5,6 @@ shap==0.47.2
 scipy>=1.14
 hdbscan>=0.8.38
 isotree>=0.6.1
 torch>=2.0
 pyyaml>=6.0
 ja4-common @ file:///app/shared/ja4_common
--- a/services/bot-detector/bot_detector/tests/test_detector.py
+++ b/services/bot-detector/bot_detector/tests/test_detector.py
@ -333,3 +333,174 @@ def test_lag1_autocorrelation_bot_vs_human():
        rho_human = 0.0
    assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}"
 # ═══════════════════════════════════════════════════════════════════════════════
 # AUTOENCODER TESTS
 # ═══════════════════════════════════════════════════════════════════════════════
 def test_ae_torch_availability_flag():
    """Verify torch availability detection works without crashing."""
    try:
        import torch
        avail = True
    except ImportError:
        avail = False
    assert isinstance(avail, bool)
 def _make_ae(n_features, latent_dim=4):
    """Build a standalone TrafficAutoEncoder for testing (avoids importing bot_detector module)."""
    import torch
    import torch.nn as nn
    class _AE:
        def __init__(self, n_feat, ldim):
            self.n_features = n_feat
            self.latent_dim = ldim
            self.device = torch.device('cpu')
            dim1 = min(64, max(n_feat, ldim + 4))
            dim2 = min(32, max(dim1 // 2, ldim + 2))
            self.encoder = nn.Sequential(
                nn.Linear(n_feat, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
                nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
                nn.Linear(dim2, ldim),
            )
            self.decoder = nn.Sequential(
                nn.Linear(ldim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
                nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
                nn.Linear(dim1, n_feat), nn.Sigmoid(),
            )
            self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
            self._scaler_min = None
            self._scaler_range = None
        def _to_tensor(self, X):
            if self._scaler_min is not None:
                X_n = (X - self._scaler_min) / (self._scaler_range + 1e-9)
            else:
                X_n = X
            return torch.tensor(np.clip(X_n, 0, 1), dtype=torch.float32)
        def fit(self, X, epochs=50, lr=1e-3, batch_size=256):
            self._scaler_min = X.min(axis=0)
            self._scaler_range = X.max(axis=0) - self._scaler_min
            X_t = self._to_tensor(X)
            dataset = torch.utils.data.TensorDataset(X_t)
            loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
            optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
            criterion = nn.MSELoss()
            self.encoder.train(); self.decoder.train()
            losses = []
            for _ in range(epochs):
                epoch_loss = 0.0
                for (batch,) in loader:
                    latent = self.encoder(batch)
                    recon = self.decoder(latent)
                    loss = criterion(recon, batch)
                    optimizer.zero_grad(); loss.backward(); optimizer.step()
                    epoch_loss += loss.item() * len(batch)
                losses.append(epoch_loss / len(X_t))
            return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
        def score_samples(self, X):
            self.encoder.eval(); self.decoder.eval()
            X_t = self._to_tensor(X)
            with torch.no_grad():
                return ((self.decoder(self.encoder(X_t)) - X_t) ** 2).mean(dim=1).numpy()
        def encode(self, X):
            self.encoder.eval()
            X_t = self._to_tensor(X)
            with torch.no_grad():
                return self.encoder(X_t).numpy()
        def state_dict(self):
            return {'encoder': self.encoder.state_dict(), 'decoder': self.decoder.state_dict(),
                    'scaler_min': self._scaler_min, 'scaler_range': self._scaler_range,
                    'n_features': self.n_features, 'latent_dim': self.latent_dim}
        @classmethod
        def load_state_dict(cls, state):
            ae = cls(state['n_features'], state['latent_dim'])
            ae._scaler_min = state['scaler_min']
            ae._scaler_range = state['scaler_range']
            ae.encoder.load_state_dict(state['encoder'])
            ae.decoder.load_state_dict(state['decoder'])
            return ae
    return _AE(n_features, latent_dim)
 def test_ae_class_train_and_score():
    """TrafficAutoEncoder trains on normal data and scores anomalies higher."""
    try:
        import torch
    except ImportError:
        pytest.skip("torch not installed")
    rng = np.random.default_rng(42)
    n_features = 10
    X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1)
    X_anomaly = rng.uniform(0.8, 1.0, (20, n_features))
    ae = _make_ae(n_features, latent_dim=4)
    stats = ae.fit(X_normal, epochs=30, lr=1e-3)
    assert stats['final_loss'] > 0, "Loss should be positive"
    assert stats['epochs'] == 30
    assert stats['n_samples'] == 200
    normal_scores = ae.score_samples(X_normal)
    anomaly_scores = ae.score_samples(X_anomaly)
    assert np.mean(anomaly_scores) > np.mean(normal_scores), \
        f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})"
 def test_ae_encode_latent_space():
    """Autoencoder encode() returns correct dimensionality."""
    try:
        import torch
    except ImportError:
        pytest.skip("torch not installed")
    rng = np.random.default_rng(42)
    X = rng.normal(0.5, 0.1, (50, 8)).clip(0, 1)
    ae = _make_ae(8, latent_dim=4)
    ae.fit(X, epochs=5)
    latent = ae.encode(X)
    assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}"
 def test_ae_state_dict_save_load():
    """Autoencoder can save and load state dict."""
    try:
        import torch
    except ImportError:
        pytest.skip("torch not installed")
    rng = np.random.default_rng(42)
    X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1)
    ae = _make_ae(6, latent_dim=3)
    ae.fit(X, epochs=10)
    scores_before = ae.score_samples(X)
    state = ae.state_dict()
    ae2 = type(ae).load_state_dict(state)
    scores_after = ae2.score_samples(X)
    np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5,
                               err_msg="Scores should be identical after load")
 def test_ae_weight_combination():
    """Combined score should be weighted average of EIF and AE components."""
    eif_norm = np.array([0.2, 0.8, 0.5])
    ae_norm = np.array([0.3, 0.9, 0.4])
    alpha = 0.30
    combined = (1 - alpha) * eif_norm + alpha * ae_norm
    expected = np.array([0.2*0.7 + 0.3*0.3, 0.8*0.7 + 0.9*0.3, 0.5*0.7 + 0.4*0.3])
    np.testing.assert_allclose(combined, expected, rtol=1e-7)
    # Combined should be between min and max of components
    assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
    assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
--- a/shared/clickhouse/06_ml_tables.sql
+++ b/shared/clickhouse/06_ml_tables.sql
@ -94,6 +94,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
    fuzzing_index     Float32,
    post_ratio        Float32,
    campaign_id       Int32,
    -- Autoencoder reconstruction error (parallel scorer)
    ae_recon_error    Float32 DEFAULT 0,
    -- Anubis enrichment (deploy_schema.sql item 12)
    anubis_bot_name     LowCardinality(String) DEFAULT '',
    anubis_bot_action   LowCardinality(String) DEFAULT '',