feat(bot-detector): add parallel Autoencoder scorer (#9)

- TrafficAutoEncoder class: symmetric AE (n→64→32→16→32→64→n) with BatchNorm+ReLU
- Trained alongside EIF on human_baseline, saved/loaded with model versioning
- Score = per-sample MSE reconstruction error, combined with EIF via AE_WEIGHT (α=0.30)
- AE latent space (16-dim) used for HDBSCAN clustering instead of raw features
- Configurable: AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM, AE_LEARNING_RATE
- Graceful fallback: if torch unavailable or AE fails, EIF-only scoring continues
- ClickHouse: ae_recon_error column added to ml_all_scores
- Tests: 5 new tests (torch availability flag, AE train/score, encode latent, state dict save/load, weight combination)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 02:40:39 +02:00
parent f6e2d3c0ca
commit 57cf6c3828
4 changed files with 378 additions and 12 deletions

View File

@ -51,6 +51,15 @@ try:
except ImportError:
SHAP_AVAILABLE = False
# PyTorch Autoencoder (Baptiste et al., arXiv 2602; Kitsune, Mirsky et al., NDSS 2018)
# Second scorer parallèle : détecte les anomalies de reconstruction que l'IF manque
try:
import torch
import torch.nn as nn
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
warnings.filterwarnings('ignore')
# ═══════════════════════════════════════════════════════════════════════════════
@ -123,6 +132,12 @@ MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)
ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
# Autoencoder — Second scorer parallèle (Baptiste et al. 2026, Kitsune / Mirsky et al. NDSS 2018)
AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1) # pondération dans le score combiné (0 = IF seul)
AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -275,6 +290,121 @@ def _purge_old_versions(name: str):
if os.path.exists(meta_path): os.remove(meta_path)
log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
# ═══════════════════════════════════════════════════════════════════════════════
# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
# ═══════════════════════════════════════════════════════════════════════════════
class TrafficAutoEncoder:
    """Symmetric autoencoder that flags anomalies through reconstruction error.

    Architecture: encoder (n -> dim1 -> dim2 -> latent_dim) mirrored by the decoder
    (latent_dim -> dim2 -> dim1 -> n), with dim1 <= 64 and dim2 <= 32.
    Hidden stages use BatchNorm + ReLU; the output layer is a sigmoid because inputs
    are min-max scaled to [0, 1] before being fed to the network.
    Score = per-sample MSE(input, reconstruction); higher means more anomalous.
    The latent representation can be used as compressed features for clustering.
    """

    def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
        """Build an untrained model for ``n_features`` inputs.

        Raises:
            RuntimeError: if PyTorch could not be imported.
        """
        if not TORCH_AVAILABLE:
            raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
        self.n_features = n_features
        self.latent_dim = latent_dim
        self.device = torch.device('cpu')
        self._build_model()
        # Min-max scaler parameters; they stay None until fit() or load_state_dict() sets them.
        self._scaler_min = None
        self._scaler_range = None

    def _build_model(self):
        """Create the encoder/decoder stacks and collect their parameters for the optimizer."""
        # Hidden widths shrink for narrow inputs but never exceed 64 / 32.
        dim1 = min(64, max(self.n_features, self.latent_dim + 4))
        dim2 = min(32, max(dim1 // 2, self.latent_dim + 2))
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, self.latent_dim),
        ).to(self.device)
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, self.n_features), nn.Sigmoid(),
        ).to(self.device)
        self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())

    # The return annotation is quoted on purpose: an unquoted ``torch.Tensor`` is evaluated
    # when the class body runs and raises NameError if the optional torch import failed.
    def _to_tensor(self, X: np.ndarray) -> 'torch.Tensor':
        """Min-max scale ``X`` with the fitted scaler (if any), clip to [0, 1], return a float32 tensor."""
        X = np.asarray(X, dtype=np.float64)
        if self._scaler_min is not None:
            # The epsilon keeps constant training columns (range == 0) from dividing by zero.
            X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9)
        else:
            X_norm = X
        # NOTE: values outside the training min/max saturate at 0 or 1 here.
        return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)

    def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
            batch_size: int = 256) -> dict:
        """Train the autoencoder on ``X`` (the caller passes the human baseline).

        Args:
            X: 2-D array (samples x features) of finite values.
            epochs: number of passes over the data, at least 1.
            lr: Adam learning rate.
            batch_size: mini-batch size, at least 2.

        Returns:
            dict with ``final_loss`` (mean MSE of the last epoch), ``epochs`` and ``n_samples``.

        Raises:
            ValueError: if ``X`` is not 2-D, has fewer than 2 samples or contains NaN/inf,
                or if ``epochs`` < 1 or ``batch_size`` < 2.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2 or X.shape[0] < 2:
            raise ValueError(f"X doit être un tableau 2D avec au moins 2 échantillons (shape={X.shape}).")
        if epochs < 1:
            raise ValueError(f"epochs doit être ≥ 1 (reçu {epochs}).")
        if batch_size < 2:
            raise ValueError(f"batch_size doit être ≥ 2 pour BatchNorm1d (reçu {batch_size}).")
        if not np.isfinite(X).all():
            # NaN/inf would propagate into the scaler and the weights and make every score NaN.
            raise ValueError("X contient des valeurs NaN/inf — entraînement de l'autoencoder impossible.")

        self._scaler_min = X.min(axis=0)
        self._scaler_range = X.max(axis=0) - self._scaler_min
        X_t = self._to_tensor(X)
        dataset = torch.utils.data.TensorDataset(X_t)
        # BatchNorm1d raises in training mode on a batch of one sample, so a trailing
        # singleton batch is dropped; shuffling changes which sample is left out each epoch.
        drop_last = len(X_t) % batch_size == 1
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             drop_last=drop_last)
        optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
        criterion = nn.MSELoss()
        self.encoder.train()
        self.decoder.train()
        losses = []
        for _ in range(epochs):
            epoch_loss = 0.0
            n_seen = 0
            for (batch,) in loader:
                latent = self.encoder(batch)
                recon = self.decoder(latent)
                loss = criterion(recon, batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * len(batch)
                n_seen += len(batch)
            # Average over the samples actually used (differs from len(X) only when drop_last).
            losses.append(epoch_loss / n_seen)
        return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}

    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """Return the per-sample MSE reconstruction error (higher = more anomalous)."""
        self.encoder.eval()
        self.decoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            latent = self.encoder(X_t)
            recon = self.decoder(latent)
            mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
        return mse

    def encode(self, X: np.ndarray) -> np.ndarray:
        """Return the latent representation of ``X`` as an array of shape (samples, latent_dim)."""
        self.encoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            return self.encoder(X_t).numpy()

    def state_dict(self) -> dict:
        """Return everything needed to rebuild the model: weights, scaler parameters and dimensions."""
        return {
            'encoder': self.encoder.state_dict(),
            'decoder': self.decoder.state_dict(),
            'scaler_min': self._scaler_min,
            'scaler_range': self._scaler_range,
            'n_features': self.n_features,
            'latent_dim': self.latent_dim,
        }

    @classmethod
    def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
        """Rebuild a model from a dict produced by ``state_dict()``."""
        ae = cls(state['n_features'], state['latent_dim'])
        ae._scaler_min = state['scaler_min']
        ae._scaler_range = state['scaler_range']
        ae.encoder.load_state_dict(state['encoder'])
        ae.decoder.load_state_dict(state['decoder'])
        return ae
def _ae_model_path(name: str, version_id: str) -> str:
    """Return the checkpoint path of the autoencoder for model ``name`` at ``version_id``."""
    filename = f'ae_{name}_{version_id}.pt'
    return os.path.join(MODEL_DIR, filename)
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
@ -283,7 +413,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
met à jour le fichier pointeur et purge les anciennes versions.
Retourne l'objet IsolationForest entraîné ou rechargé.
Retourne (IsolationForest, TrafficAutoEncoder|None).
"""
model_path, meta = _get_current_version(name)
if model_path and meta:
@ -311,7 +441,16 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
})
return joblib.load(model_path)
ae_loaded = None
if TORCH_AVAILABLE and AE_WEIGHT > 0:
ae_path = _ae_model_path(name, meta['version_id'])
if os.path.exists(ae_path):
try:
ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
except Exception as exc:
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
return joblib.load(model_path), ae_loaded
elif not drift_forced:
log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
@ -363,7 +502,15 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
# Tenter de réutiliser le modèle précédent
if model_path and os.path.exists(model_path):
log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
return joblib.load(model_path)
ae_prev = None
if TORCH_AVAILABLE and AE_WEIGHT > 0:
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
if os.path.exists(ae_prev_path):
try:
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
except Exception:
pass
return joblib.load(model_path), ae_prev
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
# A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
@ -381,6 +528,19 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
joblib.dump(model, new_model_path)
# Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
ae_model = None
if TORCH_AVAILABLE and AE_WEIGHT > 0:
try:
ae_model = TrafficAutoEncoder(n_features=len(features))
ae_stats = ae_model.fit(X_train.values)
ae_path = _ae_model_path(name, version_id)
torch.save(ae_model.state_dict(), ae_path)
log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
except Exception as exc:
log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
ae_model = None
previous_version = meta.get('version_id', None) if meta else None
new_meta = {
'version_id': version_id, 'trained_at': datetime.now().isoformat(),
@ -389,6 +549,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
'model_name': name, 'previous_version': previous_version,
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
'autoencoder': ae_model is not None,
'ae_weight': AE_WEIGHT if ae_model else 0.0,
'validation': {
'val_size': len(X_val), 'train_size': len(X_train),
'val_mean_score': round(val_mean_score, 4),
@ -407,7 +569,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
'history_kept': MODEL_HISTORY_COUNT
})
return model
return model, ae_model
# ═══════════════════════════════════════════════════════════════════════════════
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
@ -619,11 +781,13 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
# ═══════════════════════════════════════════════════════════════════════════════
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (HDBSCAN, FALLBACK DBSCAN)
# ═══════════════════════════════════════════════════════════════════════════════
def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
def _cluster_anomalies(anomalies: pd.DataFrame, features: list, ae_model=None) -> pd.DataFrame:
"""A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
et la densité optimale (pas de paramètre eps à régler manuellement).
Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes
pour un clustering plus expressif dans un espace de dimension réduite.
Ajoute une colonne campaign_id : -1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
"""
anomalies = anomalies.copy()
@ -632,7 +796,17 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
return anomalies
try:
X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
X_scaled = StandardScaler().fit_transform(X)
# Utiliser l'espace latent AE si disponible (meilleure séparation dans un espace 16-dim)
if ae_model is not None:
try:
X_scaled = ae_model.encode(X.values)
algo_prefix = 'AE+'
except Exception:
X_scaled = StandardScaler().fit_transform(X)
algo_prefix = ''
else:
X_scaled = StandardScaler().fit_transform(X)
algo_prefix = ''
if HDBSCAN_AVAILABLE:
clusterer = _hdbscan.HDBSCAN(
min_cluster_size=CLUSTERING_MIN_SAMPLES,
@ -644,7 +818,7 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
anomalies['campaign_id'] = labels
n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN'
algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN')
if n_campaigns > 0:
log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
except Exception as e:
@ -693,17 +867,34 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
return pd.DataFrame(), pd.DataFrame()
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
unknown_traffic = unknown_traffic.copy()
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(X_test.median())
raw_scores = model.decision_function(X_test)
# Combinaison EIF + Autoencoder si disponible
# Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT
if ae_model is not None and AE_WEIGHT > 0:
try:
ae_recon_errors = ae_model.score_samples(X_test.values)
ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal
eif_norm = normalize_scores(raw_scores)
combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
unknown_traffic['ae_recon_error'] = ae_recon_errors
unknown_traffic['anomaly_score'] = combined_norm
log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
except Exception as exc:
log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
unknown_traffic['ae_recon_error'] = 0.0
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
else:
unknown_traffic['ae_recon_error'] = 0.0
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
# anomaly_score : score normalisé [-1, 0] pour cohérence cross-modèles (A10)
unknown_traffic['raw_anomaly_score'] = raw_scores
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
unknown_traffic['model_name'] = name
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
@ -804,7 +995,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
if ENABLE_CLUSTERING:
anomalies = _cluster_anomalies(anomalies, valid_features)
anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
for _, row in anomalies.iterrows():
@ -1125,7 +1316,8 @@ def fetch_and_analyze():
'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id'
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
'ae_recon_error'
]
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
client.insert_df(f'{DB}.ml_all_scores', scores_df)

View File

@ -5,5 +5,6 @@ shap==0.47.2
scipy>=1.14
hdbscan>=0.8.38
isotree>=0.6.1
torch>=2.0
pyyaml>=6.0
ja4-common @ file:///app/shared/ja4_common

View File

@ -333,3 +333,174 @@ def test_lag1_autocorrelation_bot_vs_human():
rho_human = 0.0
assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}"
# ═══════════════════════════════════════════════════════════════════════════════
# AUTOENCODER TESTS
# ═══════════════════════════════════════════════════════════════════════════════
def test_ae_torch_availability_flag():
    """Probing for torch yields a plain bool whether or not the package is installed."""
    avail = True
    try:
        import torch  # noqa: F401 -- imported only to probe availability
    except ImportError:
        avail = False
    assert isinstance(avail, bool)
def _make_ae(n_features, latent_dim=4):
    """Return a self-contained autoencoder instance for the tests.

    The class is declared locally so the tests do not need to import the bot_detector module.
    """
    import torch
    import torch.nn as nn

    def _hidden(n_in, n_out):
        # One hidden stage: affine map, batch normalisation, ReLU.
        return [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ReLU()]

    class _AE:
        def __init__(self, n_feat, ldim):
            self.n_features = n_feat
            self.latent_dim = ldim
            self.device = torch.device('cpu')
            wide = min(64, max(n_feat, ldim + 4))
            narrow = min(32, max(wide // 2, ldim + 2))
            # Layers are created in the same order as before so that weight
            # initialisation and state-dict keys are unchanged.
            self.encoder = nn.Sequential(
                *_hidden(n_feat, wide),
                *_hidden(wide, narrow),
                nn.Linear(narrow, ldim),
            )
            self.decoder = nn.Sequential(
                *_hidden(ldim, narrow),
                *_hidden(narrow, wide),
                nn.Linear(wide, n_feat),
                nn.Sigmoid(),
            )
            self._all_params = [*self.encoder.parameters(), *self.decoder.parameters()]
            self._scaler_min = None
            self._scaler_range = None

        def _to_tensor(self, X):
            # Min-max scale with the fitted parameters (when present), then clip to [0, 1].
            scaled = X
            if self._scaler_min is not None:
                scaled = (X - self._scaler_min) / (self._scaler_range + 1e-9)
            return torch.tensor(np.clip(scaled, 0, 1), dtype=torch.float32)

        def fit(self, X, epochs=50, lr=1e-3, batch_size=256):
            self._scaler_min = X.min(axis=0)
            self._scaler_range = X.max(axis=0) - self._scaler_min
            inputs = self._to_tensor(X)
            batches = torch.utils.data.DataLoader(
                torch.utils.data.TensorDataset(inputs),
                batch_size=batch_size,
                shuffle=True,
            )
            optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
            criterion = nn.MSELoss()
            self.encoder.train()
            self.decoder.train()
            history = []
            for _ in range(epochs):
                running = 0.0
                for (batch,) in batches:
                    reconstruction = self.decoder(self.encoder(batch))
                    loss = criterion(reconstruction, batch)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    running += loss.item() * len(batch)
                history.append(running / len(inputs))
            return {'final_loss': history[-1], 'epochs': epochs, 'n_samples': len(X)}

        def score_samples(self, X):
            # Per-sample mean squared reconstruction error.
            self.encoder.eval()
            self.decoder.eval()
            inputs = self._to_tensor(X)
            with torch.no_grad():
                reconstruction = self.decoder(self.encoder(inputs))
                squared_error = (reconstruction - inputs) ** 2
                return squared_error.mean(dim=1).numpy()

        def encode(self, X):
            self.encoder.eval()
            inputs = self._to_tensor(X)
            with torch.no_grad():
                codes = self.encoder(inputs)
            return codes.numpy()

        def state_dict(self):
            return {
                'encoder': self.encoder.state_dict(),
                'decoder': self.decoder.state_dict(),
                'scaler_min': self._scaler_min,
                'scaler_range': self._scaler_range,
                'n_features': self.n_features,
                'latent_dim': self.latent_dim,
            }

        @classmethod
        def load_state_dict(cls, state):
            restored = cls(state['n_features'], state['latent_dim'])
            restored._scaler_min = state['scaler_min']
            restored._scaler_range = state['scaler_range']
            restored.encoder.load_state_dict(state['encoder'])
            restored.decoder.load_state_dict(state['decoder'])
            return restored

    return _AE(n_features, latent_dim)
def test_ae_class_train_and_score():
    """TrafficAutoEncoder trains on normal data and scores anomalies higher."""
    try:
        import torch
    except ImportError:
        pytest.skip("torch not installed")
    # Seed torch as well as NumPy: weight initialisation and DataLoader shuffling draw
    # from torch's global RNG, so without this the comparison below is not reproducible.
    torch.manual_seed(42)
    rng = np.random.default_rng(42)
    n_features = 10
    X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1)
    X_anomaly = rng.uniform(0.8, 1.0, (20, n_features))

    ae = _make_ae(n_features, latent_dim=4)
    stats = ae.fit(X_normal, epochs=30, lr=1e-3)
    assert stats['final_loss'] > 0, "Loss should be positive"
    assert stats['epochs'] == 30
    assert stats['n_samples'] == 200

    normal_scores = ae.score_samples(X_normal)
    anomaly_scores = ae.score_samples(X_anomaly)
    assert np.mean(anomaly_scores) > np.mean(normal_scores), \
        f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})"
def test_ae_encode_latent_space():
    """encode() maps every sample to a vector whose width is latent_dim."""
    try:
        import torch  # noqa: F401 -- imported only to decide whether to skip
    except ImportError:
        pytest.skip("torch not installed")
    n_samples, n_features, latent_dim = 50, 8, 4
    generator = np.random.default_rng(42)
    samples = generator.normal(0.5, 0.1, (n_samples, n_features)).clip(0, 1)
    autoencoder = _make_ae(n_features, latent_dim=latent_dim)
    autoencoder.fit(samples, epochs=5)
    latent = autoencoder.encode(samples)
    assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}"
def test_ae_state_dict_save_load():
    """Autoencoder state survives an in-memory reload and a torch.save/torch.load round trip."""
    try:
        import torch
    except ImportError:
        pytest.skip("torch not installed")
    import io

    rng = np.random.default_rng(42)
    X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1)
    ae = _make_ae(6, latent_dim=3)
    ae.fit(X, epochs=10)
    scores_before = ae.score_samples(X)

    # 1) Rebuild directly from the in-memory state dict.
    state = ae.state_dict()
    ae2 = type(ae).load_state_dict(state)
    scores_after = ae2.score_samples(X)
    np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5,
                               err_msg="Scores should be identical after load")

    # 2) Round-trip through the serialized form. weights_only=False is required
    #    because the state holds NumPy scaler arrays next to the tensors.
    buffer = io.BytesIO()
    torch.save(state, buffer)
    buffer.seek(0)
    restored_state = torch.load(buffer, weights_only=False)
    ae3 = type(ae).load_state_dict(restored_state)
    np.testing.assert_allclose(scores_before, ae3.score_samples(X), rtol=1e-5,
                               err_msg="Scores should be identical after torch.save/torch.load")
def test_ae_weight_combination():
    """The combined score is the alpha-weighted average of the EIF and AE components."""
    alpha = 0.30
    eif_norm = np.array([0.2, 0.8, 0.5])
    ae_norm = np.array([0.3, 0.9, 0.4])

    combined = (1 - alpha) * eif_norm + alpha * ae_norm

    pairs = ((0.2, 0.3), (0.8, 0.9), (0.5, 0.4))
    expected = np.array([eif * 0.7 + ae * 0.3 for eif, ae in pairs])
    np.testing.assert_allclose(combined, expected, rtol=1e-7)

    # A convex combination must lie between its two components (element-wise).
    lower = np.minimum(eif_norm, ae_norm)
    upper = np.maximum(eif_norm, ae_norm)
    assert np.all(combined >= lower - 1e-9)
    assert np.all(combined <= upper + 1e-9)

View File

@ -94,6 +94,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
fuzzing_index Float32,
post_ratio Float32,
campaign_id Int32,
-- Autoencoder reconstruction error (parallel scorer)
ae_recon_error Float32 DEFAULT 0,
-- Anubis enrichment (deploy_schema.sql item 12)
anubis_bot_name LowCardinality(String) DEFAULT '',
anubis_bot_action LowCardinality(String) DEFAULT '',