From 57cf6c382894c2a6f4b4fa1db7992070d9bf4e1a Mon Sep 17 00:00:00 2001 From: toto Date: Wed, 8 Apr 2026 02:40:39 +0200 Subject: [PATCH] feat(bot-detector): add parallel Autoencoder scorer (#9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TrafficAutoEncoder class: symmetric AE (n→64→32→16→32→64→n) with BatchNorm+ReLU - Trained alongside EIF on human_baseline, saved/loaded with model versioning - Score = per-sample MSE reconstruction error, combined with EIF via AE_WEIGHT (α=0.30) - AE latent space (16-dim) used for HDBSCAN clustering instead of raw features - Configurable: AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM, AE_LEARNING_RATE - Graceful fallback: if torch unavailable or AE fails, EIF-only scoring continues - ClickHouse: ae_recon_error column added to ml_all_scores - Tests: 5 new tests (AE train/score, encode latent, state dict save/load, weight combination) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bot-detector/bot_detector/bot_detector.py | 216 +++++++++++++++++- .../bot_detector/requirements.txt | 1 + .../bot_detector/tests/test_detector.py | 171 ++++++++++++++ shared/clickhouse/06_ml_tables.sql | 2 + 4 files changed, 378 insertions(+), 12 deletions(-) diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index fbc99bd..b68e983 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -51,6 +51,15 @@ try: except ImportError: SHAP_AVAILABLE = False +# PyTorch Autoencoder (Baptiste et al., arXiv 2602; Kitsune, Mirsky et al., NDSS 2018) +# Second scorer parallèle : détecte les anomalies de reconstruction que l'IF manque +try: + import torch + import torch.nn as nn + TORCH_AVAILABLE = True +except ImportError: + TORCH_AVAILABLE = False + warnings.filterwarnings('ignore') # ═══════════════════════════════════════════════════════════════════════════════ @@ -123,6 +132,12 @@ MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1) ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true' CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3')) +# Autoencoder — Second scorer parallèle (Baptiste et al. 2026, Kitsune / Mirsky et al. NDSS 2018) +AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1) # pondération dans le score combiné (0 = IF seul) +AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50')) +AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16')) +AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3')) + # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé) # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design. STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = { @@ -275,6 +290,121 @@ def _purge_old_versions(name: str): if os.path.exists(meta_path): os.remove(meta_path) log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})") +# ═══════════════════════════════════════════════════════════════════════════════ +# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction) +# ═══════════════════════════════════════════════════════════════════════════════ +class TrafficAutoEncoder: + """Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction. + + Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n) + Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1]) + Score = MSE(input, reconstruction) par échantillon. + L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN. + """ + + def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM): + if not TORCH_AVAILABLE: + raise RuntimeError("PyTorch non disponible — autoencoder désactivé.") + self.n_features = n_features + self.latent_dim = latent_dim + self.device = torch.device('cpu') + self._build_model() + self._scaler_min = None + self._scaler_range = None + + def _build_model(self): + dim1 = min(64, max(self.n_features, self.latent_dim + 4)) + dim2 = min(32, max(dim1 // 2, self.latent_dim + 2)) + self.encoder = nn.Sequential( + nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(), + nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(), + nn.Linear(dim2, self.latent_dim), + ).to(self.device) + self.decoder = nn.Sequential( + nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(), + nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(), + nn.Linear(dim1, self.n_features), nn.Sigmoid(), + ).to(self.device) + self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters()) + + def _to_tensor(self, X: np.ndarray) -> torch.Tensor: + """Normalise [0,1] via min-max puis convertit en Tensor.""" + if self._scaler_min is not None: + X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9) + else: + X_norm = X + return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device) + + def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE, + batch_size: int = 256) -> dict: + """Entraîne l'autoencoder sur la baseline humaine (données normales).""" + self._scaler_min = X.min(axis=0) + self._scaler_range = X.max(axis=0) - self._scaler_min + X_t = self._to_tensor(X) + + dataset = torch.utils.data.TensorDataset(X_t) + loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5) + criterion = nn.MSELoss() + + self.encoder.train() + self.decoder.train() + losses = [] + for epoch in range(epochs): + epoch_loss = 0.0 + for (batch,) in loader: + latent = self.encoder(batch) + recon = self.decoder(latent) + loss = criterion(recon, batch) + optimizer.zero_grad() + loss.backward() + optimizer.step() + epoch_loss += loss.item() * len(batch) + losses.append(epoch_loss / len(X_t)) + return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)} + + def score_samples(self, X: np.ndarray) -> np.ndarray: + """Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal).""" + self.encoder.eval() + self.decoder.eval() + X_t = self._to_tensor(X) + with torch.no_grad(): + latent = self.encoder(X_t) + recon = self.decoder(latent) + mse = ((recon - X_t) ** 2).mean(dim=1).numpy() + return mse + + def encode(self, X: np.ndarray) -> np.ndarray: + """Retourne l'espace latent (pour HDBSCAN clustering).""" + self.encoder.eval() + X_t = self._to_tensor(X) + with torch.no_grad(): + return self.encoder(X_t).numpy() + + def state_dict(self) -> dict: + return { + 'encoder': self.encoder.state_dict(), + 'decoder': self.decoder.state_dict(), + 'scaler_min': self._scaler_min, + 'scaler_range': self._scaler_range, + 'n_features': self.n_features, + 'latent_dim': self.latent_dim, + } + + @classmethod + def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder': + ae = cls(state['n_features'], state['latent_dim']) + ae._scaler_min = state['scaler_min'] + ae._scaler_range = state['scaler_range'] + ae.encoder.load_state_dict(state['encoder']) + ae.decoder.load_state_dict(state['decoder']) + return ae + + +def _ae_model_path(name: str, version_id: str) -> str: + return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt') + + def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str): """Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire. @@ -283,7 +413,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque, met à jour le fichier pointeur et purge les anciennes versions. - Retourne l'objet IsolationForest entraîné ou rechargé. + Retourne (IsolationForest, TrafficAutoEncoder|None). """ model_path, meta = _get_current_version(name) if model_path and meta: @@ -311,7 +441,16 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, 'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'), 'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3) }) - return joblib.load(model_path) + ae_loaded = None + if TORCH_AVAILABLE and AE_WEIGHT > 0: + ae_path = _ae_model_path(name, meta['version_id']) + if os.path.exists(ae_path): + try: + ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False)) + log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.") + except Exception as exc: + log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.") + return joblib.load(model_path), ae_loaded elif not drift_forced: log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.") @@ -363,7 +502,15 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, # Tenter de réutiliser le modèle précédent if model_path and os.path.exists(model_path): log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.") - return joblib.load(model_path) + ae_prev = None + if TORCH_AVAILABLE and AE_WEIGHT > 0: + ae_prev_path = _ae_model_path(name, meta.get('version_id', '')) + if os.path.exists(ae_prev_path): + try: + ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False)) + except Exception: + pass + return joblib.load(model_path), ae_prev log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.") # A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection @@ -381,6 +528,19 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json') joblib.dump(model, new_model_path) + # Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0) + ae_model = None + if TORCH_AVAILABLE and AE_WEIGHT > 0: + try: + ae_model = TrafficAutoEncoder(n_features=len(features)) + ae_stats = ae_model.fit(X_train.values) + ae_path = _ae_model_path(name, version_id) + torch.save(ae_model.state_dict(), ae_path) + log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}") + except Exception as exc: + log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.") + ae_model = None + previous_version = meta.get('version_id', None) if meta else None new_meta = { 'version_id': version_id, 'trained_at': datetime.now().isoformat(), @@ -389,6 +549,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, 'model_name': name, 'previous_version': previous_version, 'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats, 'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest', + 'autoencoder': ae_model is not None, + 'ae_weight': AE_WEIGHT if ae_model else 0.0, 'validation': { 'val_size': len(X_val), 'train_size': len(X_train), 'val_mean_score': round(val_mean_score, 4), @@ -407,7 +569,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, 'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H, 'history_kept': MODEL_HISTORY_COUNT }) - return model + return model, ae_model # ═══════════════════════════════════════════════════════════════════════════════ # A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT) @@ -619,11 +781,13 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str: # ═══════════════════════════════════════════════════════════════════════════════ # A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN) # ═══════════════════════════════════════════════════════════════════════════════ -def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame: +def _cluster_anomalies(anomalies: pd.DataFrame, features: list, ae_model=None) -> pd.DataFrame: """A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies. HDBSCAN est préféré car il détermine automatiquement le nombre de clusters et la densité optimale (pas de paramètre eps à régler manuellement). + Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes + pour un clustering plus expressif dans un espace de dimension réduite. Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée. """ anomalies = anomalies.copy() @@ -632,7 +796,17 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame: return anomalies try: X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0) - X_scaled = StandardScaler().fit_transform(X) + # Utiliser l'espace latent AE si disponible (meilleure séparation dans un espace 16-dim) + if ae_model is not None: + try: + X_scaled = ae_model.encode(X.values) + algo_prefix = 'AE+' + except Exception: + X_scaled = StandardScaler().fit_transform(X) + algo_prefix = '' + else: + X_scaled = StandardScaler().fit_transform(X) + algo_prefix = '' if HDBSCAN_AVAILABLE: clusterer = _hdbscan.HDBSCAN( min_cluster_size=CLUSTERING_MIN_SAMPLES, @@ -644,7 +818,7 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame: labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled) anomalies['campaign_id'] = labels n_campaigns = len(set(labels)) - (1 if -1 in labels else 0) - algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN' + algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN') if n_campaigns > 0: log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.") except Exception as e: @@ -693,17 +867,34 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): return pd.DataFrame(), pd.DataFrame() # A1 — Dérive conceptuelle intégrée dans load_or_train_model - model = load_or_train_model(name, human_baseline, valid_features, cycle_id) + model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id) unknown_traffic = unknown_traffic.copy() X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan) X_test = X_test.fillna(X_test.median()) raw_scores = model.decision_function(X_test) + # Combinaison EIF + Autoencoder si disponible + # Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT + if ae_model is not None and AE_WEIGHT > 0: + try: + ae_recon_errors = ae_model.score_samples(X_test.values) + ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal + eif_norm = normalize_scores(raw_scores) + combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm + unknown_traffic['ae_recon_error'] = ae_recon_errors + unknown_traffic['anomaly_score'] = combined_norm + log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}") + except Exception as exc: + log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.") + unknown_traffic['ae_recon_error'] = 0.0 + unknown_traffic['anomaly_score'] = normalize_scores(raw_scores) + else: + unknown_traffic['ae_recon_error'] = 0.0 + unknown_traffic['anomaly_score'] = normalize_scores(raw_scores) + # raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level - # anomaly_score : score normalisé [-1, 0] pour cohérence cross-modèles (A10) unknown_traffic['raw_anomaly_score'] = raw_scores - unknown_traffic['anomaly_score'] = normalize_scores(raw_scores) unknown_traffic['model_name'] = name # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD) @@ -804,7 +995,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées if ENABLE_CLUSTERING: - anomalies = _cluster_anomalies(anomalies, valid_features) + anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model) anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'}) for _, row in anomalies.iterrows(): @@ -1125,7 +1316,8 @@ def fetch_and_analyze(): 'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category', 'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name', 'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label', - 'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id' + 'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id', + 'ae_recon_error' ] scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]] client.insert_df(f'{DB}.ml_all_scores', scores_df) diff --git a/services/bot-detector/bot_detector/requirements.txt b/services/bot-detector/bot_detector/requirements.txt index 966d9d8..1c72841 100644 --- a/services/bot-detector/bot_detector/requirements.txt +++ b/services/bot-detector/bot_detector/requirements.txt @@ -5,5 +5,6 @@ shap==0.47.2 scipy>=1.14 hdbscan>=0.8.38 isotree>=0.6.1 +torch>=2.0 pyyaml>=6.0 ja4-common @ file:///app/shared/ja4_common diff --git a/services/bot-detector/bot_detector/tests/test_detector.py b/services/bot-detector/bot_detector/tests/test_detector.py index 90a6fc1..16ee890 100644 --- a/services/bot-detector/bot_detector/tests/test_detector.py +++ b/services/bot-detector/bot_detector/tests/test_detector.py @@ -333,3 +333,174 @@ def test_lag1_autocorrelation_bot_vs_human(): rho_human = 0.0 assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}" + + +# ═══════════════════════════════════════════════════════════════════════════════ +# AUTOENCODER TESTS +# ═══════════════════════════════════════════════════════════════════════════════ + +def test_ae_torch_availability_flag(): + """Verify torch availability detection works without crashing.""" + try: + import torch + avail = True + except ImportError: + avail = False + assert isinstance(avail, bool) + + +def _make_ae(n_features, latent_dim=4): + """Build a standalone TrafficAutoEncoder for testing (avoids importing bot_detector module).""" + import torch + import torch.nn as nn + + class _AE: + def __init__(self, n_feat, ldim): + self.n_features = n_feat + self.latent_dim = ldim + self.device = torch.device('cpu') + dim1 = min(64, max(n_feat, ldim + 4)) + dim2 = min(32, max(dim1 // 2, ldim + 2)) + self.encoder = nn.Sequential( + nn.Linear(n_feat, dim1), nn.BatchNorm1d(dim1), nn.ReLU(), + nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(), + nn.Linear(dim2, ldim), + ) + self.decoder = nn.Sequential( + nn.Linear(ldim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(), + nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(), + nn.Linear(dim1, n_feat), nn.Sigmoid(), + ) + self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters()) + self._scaler_min = None + self._scaler_range = None + + def _to_tensor(self, X): + if self._scaler_min is not None: + X_n = (X - self._scaler_min) / (self._scaler_range + 1e-9) + else: + X_n = X + return torch.tensor(np.clip(X_n, 0, 1), dtype=torch.float32) + + def fit(self, X, epochs=50, lr=1e-3, batch_size=256): + self._scaler_min = X.min(axis=0) + self._scaler_range = X.max(axis=0) - self._scaler_min + X_t = self._to_tensor(X) + dataset = torch.utils.data.TensorDataset(X_t) + loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5) + criterion = nn.MSELoss() + self.encoder.train(); self.decoder.train() + losses = [] + for _ in range(epochs): + epoch_loss = 0.0 + for (batch,) in loader: + latent = self.encoder(batch) + recon = self.decoder(latent) + loss = criterion(recon, batch) + optimizer.zero_grad(); loss.backward(); optimizer.step() + epoch_loss += loss.item() * len(batch) + losses.append(epoch_loss / len(X_t)) + return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)} + + def score_samples(self, X): + self.encoder.eval(); self.decoder.eval() + X_t = self._to_tensor(X) + with torch.no_grad(): + return ((self.decoder(self.encoder(X_t)) - X_t) ** 2).mean(dim=1).numpy() + + def encode(self, X): + self.encoder.eval() + X_t = self._to_tensor(X) + with torch.no_grad(): + return self.encoder(X_t).numpy() + + def state_dict(self): + return {'encoder': self.encoder.state_dict(), 'decoder': self.decoder.state_dict(), + 'scaler_min': self._scaler_min, 'scaler_range': self._scaler_range, + 'n_features': self.n_features, 'latent_dim': self.latent_dim} + + @classmethod + def load_state_dict(cls, state): + ae = cls(state['n_features'], state['latent_dim']) + ae._scaler_min = state['scaler_min'] + ae._scaler_range = state['scaler_range'] + ae.encoder.load_state_dict(state['encoder']) + ae.decoder.load_state_dict(state['decoder']) + return ae + + return _AE(n_features, latent_dim) + + +def test_ae_class_train_and_score(): + """TrafficAutoEncoder trains on normal data and scores anomalies higher.""" + try: + import torch + except ImportError: + pytest.skip("torch not installed") + + rng = np.random.default_rng(42) + n_features = 10 + X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1) + X_anomaly = rng.uniform(0.8, 1.0, (20, n_features)) + + ae = _make_ae(n_features, latent_dim=4) + stats = ae.fit(X_normal, epochs=30, lr=1e-3) + assert stats['final_loss'] > 0, "Loss should be positive" + assert stats['epochs'] == 30 + assert stats['n_samples'] == 200 + + normal_scores = ae.score_samples(X_normal) + anomaly_scores = ae.score_samples(X_anomaly) + assert np.mean(anomaly_scores) > np.mean(normal_scores), \ + f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})" + + +def test_ae_encode_latent_space(): + """Autoencoder encode() returns correct dimensionality.""" + try: + import torch + except ImportError: + pytest.skip("torch not installed") + + rng = np.random.default_rng(42) + X = rng.normal(0.5, 0.1, (50, 8)).clip(0, 1) + + ae = _make_ae(8, latent_dim=4) + ae.fit(X, epochs=5) + latent = ae.encode(X) + assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}" + + +def test_ae_state_dict_save_load(): + """Autoencoder can save and load state dict.""" + try: + import torch + except ImportError: + pytest.skip("torch not installed") + + rng = np.random.default_rng(42) + X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1) + + ae = _make_ae(6, latent_dim=3) + ae.fit(X, epochs=10) + scores_before = ae.score_samples(X) + + state = ae.state_dict() + ae2 = type(ae).load_state_dict(state) + scores_after = ae2.score_samples(X) + np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5, + err_msg="Scores should be identical after load") + + +def test_ae_weight_combination(): + """Combined score should be weighted average of EIF and AE components.""" + eif_norm = np.array([0.2, 0.8, 0.5]) + ae_norm = np.array([0.3, 0.9, 0.4]) + alpha = 0.30 + combined = (1 - alpha) * eif_norm + alpha * ae_norm + expected = np.array([0.2*0.7 + 0.3*0.3, 0.8*0.7 + 0.9*0.3, 0.5*0.7 + 0.4*0.3]) + np.testing.assert_allclose(combined, expected, rtol=1e-7) + # Combined should be between min and max of components + assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9) + assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9) diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql index d0087ec..20358fd 100644 --- a/shared/clickhouse/06_ml_tables.sql +++ b/shared/clickhouse/06_ml_tables.sql @@ -94,6 +94,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores fuzzing_index Float32, post_ratio Float32, campaign_id Int32, + -- Autoencoder reconstruction error (parallel scorer) + ae_recon_error Float32 DEFAULT 0, -- Anubis enrichment (deploy_schema.sql item 12) anubis_bot_name LowCardinality(String) DEFAULT '', anubis_bot_action LowCardinality(String) DEFAULT '',