feat(bot-detector): add parallel Autoencoder scorer (#9)
- TrafficAutoEncoder class: symmetric AE (n→64→32→16→32→64→n) with BatchNorm+ReLU - Trained alongside EIF on human_baseline, saved/loaded with model versioning - Score = per-sample MSE reconstruction error, combined with EIF via AE_WEIGHT (α=0.30) - AE latent space (16-dim) used for HDBSCAN clustering instead of raw features - Configurable: AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM, AE_LEARNING_RATE - Graceful fallback: if torch unavailable or AE fails, EIF-only scoring continues - ClickHouse: ae_recon_error column added to ml_all_scores - Tests: 5 new tests (AE train/score, encode latent, state dict save/load, weight combination) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -51,6 +51,15 @@ try:
|
||||
except ImportError:
|
||||
SHAP_AVAILABLE = False
|
||||
|
||||
# PyTorch Autoencoder (Baptiste et al., arXiv 2602; Kitsune, Mirsky et al., NDSS 2018)
|
||||
# Second scorer parallèle : détecte les anomalies de reconstruction que l'IF manque
|
||||
try:
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
TORCH_AVAILABLE = True
|
||||
except ImportError:
|
||||
TORCH_AVAILABLE = False
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -123,6 +132,12 @@ MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)
|
||||
ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
|
||||
CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
|
||||
|
||||
# Autoencoder — Second scorer parallèle (Baptiste et al. 2026, Kitsune / Mirsky et al. NDSS 2018)
|
||||
AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1) # pondération dans le score combiné (0 = IF seul)
|
||||
AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
|
||||
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
|
||||
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
|
||||
|
||||
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
|
||||
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
|
||||
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
|
||||
@ -275,6 +290,121 @@ def _purge_old_versions(name: str):
|
||||
if os.path.exists(meta_path): os.remove(meta_path)
|
||||
log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
class TrafficAutoEncoder:
|
||||
"""Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
|
||||
|
||||
Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
|
||||
Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
|
||||
Score = MSE(input, reconstruction) par échantillon.
|
||||
L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
|
||||
"""
|
||||
|
||||
def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
|
||||
if not TORCH_AVAILABLE:
|
||||
raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
|
||||
self.n_features = n_features
|
||||
self.latent_dim = latent_dim
|
||||
self.device = torch.device('cpu')
|
||||
self._build_model()
|
||||
self._scaler_min = None
|
||||
self._scaler_range = None
|
||||
|
||||
def _build_model(self):
|
||||
dim1 = min(64, max(self.n_features, self.latent_dim + 4))
|
||||
dim2 = min(32, max(dim1 // 2, self.latent_dim + 2))
|
||||
self.encoder = nn.Sequential(
|
||||
nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, self.latent_dim),
|
||||
).to(self.device)
|
||||
self.decoder = nn.Sequential(
|
||||
nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, self.n_features), nn.Sigmoid(),
|
||||
).to(self.device)
|
||||
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||
|
||||
def _to_tensor(self, X: np.ndarray) -> torch.Tensor:
|
||||
"""Normalise [0,1] via min-max puis convertit en Tensor."""
|
||||
if self._scaler_min is not None:
|
||||
X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9)
|
||||
else:
|
||||
X_norm = X
|
||||
return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
|
||||
|
||||
def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
|
||||
batch_size: int = 256) -> dict:
|
||||
"""Entraîne l'autoencoder sur la baseline humaine (données normales)."""
|
||||
self._scaler_min = X.min(axis=0)
|
||||
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||
X_t = self._to_tensor(X)
|
||||
|
||||
dataset = torch.utils.data.TensorDataset(X_t)
|
||||
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||
criterion = nn.MSELoss()
|
||||
|
||||
self.encoder.train()
|
||||
self.decoder.train()
|
||||
losses = []
|
||||
for epoch in range(epochs):
|
||||
epoch_loss = 0.0
|
||||
for (batch,) in loader:
|
||||
latent = self.encoder(batch)
|
||||
recon = self.decoder(latent)
|
||||
loss = criterion(recon, batch)
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
epoch_loss += loss.item() * len(batch)
|
||||
losses.append(epoch_loss / len(X_t))
|
||||
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||
|
||||
def score_samples(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
|
||||
self.encoder.eval()
|
||||
self.decoder.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
latent = self.encoder(X_t)
|
||||
recon = self.decoder(latent)
|
||||
mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
|
||||
return mse
|
||||
|
||||
def encode(self, X: np.ndarray) -> np.ndarray:
|
||||
"""Retourne l'espace latent (pour HDBSCAN clustering)."""
|
||||
self.encoder.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return self.encoder(X_t).numpy()
|
||||
|
||||
def state_dict(self) -> dict:
|
||||
return {
|
||||
'encoder': self.encoder.state_dict(),
|
||||
'decoder': self.decoder.state_dict(),
|
||||
'scaler_min': self._scaler_min,
|
||||
'scaler_range': self._scaler_range,
|
||||
'n_features': self.n_features,
|
||||
'latent_dim': self.latent_dim,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
|
||||
ae = cls(state['n_features'], state['latent_dim'])
|
||||
ae._scaler_min = state['scaler_min']
|
||||
ae._scaler_range = state['scaler_range']
|
||||
ae.encoder.load_state_dict(state['encoder'])
|
||||
ae.decoder.load_state_dict(state['decoder'])
|
||||
return ae
|
||||
|
||||
|
||||
def _ae_model_path(name: str, version_id: str) -> str:
|
||||
return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
|
||||
|
||||
|
||||
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
||||
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
|
||||
|
||||
@ -283,7 +413,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||
met à jour le fichier pointeur et purge les anciennes versions.
|
||||
|
||||
Retourne l'objet IsolationForest entraîné ou rechargé.
|
||||
Retourne (IsolationForest, TrafficAutoEncoder|None).
|
||||
"""
|
||||
model_path, meta = _get_current_version(name)
|
||||
if model_path and meta:
|
||||
@ -311,7 +441,16 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
|
||||
'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
|
||||
})
|
||||
return joblib.load(model_path)
|
||||
ae_loaded = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
ae_path = _ae_model_path(name, meta['version_id'])
|
||||
if os.path.exists(ae_path):
|
||||
try:
|
||||
ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
|
||||
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||
return joblib.load(model_path), ae_loaded
|
||||
elif not drift_forced:
|
||||
log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
|
||||
|
||||
@ -363,7 +502,15 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
# Tenter de réutiliser le modèle précédent
|
||||
if model_path and os.path.exists(model_path):
|
||||
log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
|
||||
return joblib.load(model_path)
|
||||
ae_prev = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
|
||||
if os.path.exists(ae_prev_path):
|
||||
try:
|
||||
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
except Exception:
|
||||
pass
|
||||
return joblib.load(model_path), ae_prev
|
||||
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
||||
|
||||
# A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
|
||||
@ -381,6 +528,19 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
|
||||
joblib.dump(model, new_model_path)
|
||||
|
||||
# Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
|
||||
ae_model = None
|
||||
if TORCH_AVAILABLE and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_model = TrafficAutoEncoder(n_features=len(features))
|
||||
ae_stats = ae_model.fit(X_train.values)
|
||||
ae_path = _ae_model_path(name, version_id)
|
||||
torch.save(ae_model.state_dict(), ae_path)
|
||||
log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
|
||||
ae_model = None
|
||||
|
||||
previous_version = meta.get('version_id', None) if meta else None
|
||||
new_meta = {
|
||||
'version_id': version_id, 'trained_at': datetime.now().isoformat(),
|
||||
@ -389,6 +549,8 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'model_name': name, 'previous_version': previous_version,
|
||||
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
|
||||
'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
|
||||
'autoencoder': ae_model is not None,
|
||||
'ae_weight': AE_WEIGHT if ae_model else 0.0,
|
||||
'validation': {
|
||||
'val_size': len(X_val), 'train_size': len(X_train),
|
||||
'val_mean_score': round(val_mean_score, 4),
|
||||
@ -407,7 +569,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
||||
'history_kept': MODEL_HISTORY_COUNT
|
||||
})
|
||||
return model
|
||||
return model, ae_model
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
|
||||
@ -619,11 +781,13 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
||||
def _cluster_anomalies(anomalies: pd.DataFrame, features: list, ae_model=None) -> pd.DataFrame:
|
||||
"""A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
|
||||
|
||||
HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
|
||||
et la densité optimale (pas de paramètre eps à régler manuellement).
|
||||
Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes
|
||||
pour un clustering plus expressif dans un espace de dimension réduite.
|
||||
Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
|
||||
"""
|
||||
anomalies = anomalies.copy()
|
||||
@ -632,7 +796,17 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
||||
return anomalies
|
||||
try:
|
||||
X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||
# Utiliser l'espace latent AE si disponible (meilleure séparation dans un espace 16-dim)
|
||||
if ae_model is not None:
|
||||
try:
|
||||
X_scaled = ae_model.encode(X.values)
|
||||
algo_prefix = 'AE+'
|
||||
except Exception:
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
algo_prefix = ''
|
||||
else:
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
algo_prefix = ''
|
||||
if HDBSCAN_AVAILABLE:
|
||||
clusterer = _hdbscan.HDBSCAN(
|
||||
min_cluster_size=CLUSTERING_MIN_SAMPLES,
|
||||
@ -644,7 +818,7 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
|
||||
labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
|
||||
anomalies['campaign_id'] = labels
|
||||
n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
|
||||
algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN'
|
||||
algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN')
|
||||
if n_campaigns > 0:
|
||||
log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
|
||||
except Exception as e:
|
||||
@ -693,17 +867,34 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
return pd.DataFrame(), pd.DataFrame()
|
||||
|
||||
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
|
||||
model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||
model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||
unknown_traffic = unknown_traffic.copy()
|
||||
|
||||
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
|
||||
X_test = X_test.fillna(X_test.median())
|
||||
raw_scores = model.decision_function(X_test)
|
||||
|
||||
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
|
||||
# anomaly_score : score normalisé [-1, 0] pour cohérence cross-modèles (A10)
|
||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||
# Combinaison EIF + Autoencoder si disponible
|
||||
# Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT
|
||||
if ae_model is not None and AE_WEIGHT > 0:
|
||||
try:
|
||||
ae_recon_errors = ae_model.score_samples(X_test.values)
|
||||
ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal
|
||||
eif_norm = normalize_scores(raw_scores)
|
||||
combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
|
||||
unknown_traffic['ae_recon_error'] = ae_recon_errors
|
||||
unknown_traffic['anomaly_score'] = combined_norm
|
||||
log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
|
||||
unknown_traffic['ae_recon_error'] = 0.0
|
||||
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||
else:
|
||||
unknown_traffic['ae_recon_error'] = 0.0
|
||||
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||
|
||||
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
|
||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||
unknown_traffic['model_name'] = name
|
||||
|
||||
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
|
||||
@ -804,7 +995,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
|
||||
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
|
||||
if ENABLE_CLUSTERING:
|
||||
anomalies = _cluster_anomalies(anomalies, valid_features)
|
||||
anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
|
||||
|
||||
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
|
||||
for _, row in anomalies.iterrows():
|
||||
@ -1125,7 +1316,8 @@ def fetch_and_analyze():
|
||||
'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
|
||||
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
|
||||
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
|
||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id'
|
||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
|
||||
'ae_recon_error'
|
||||
]
|
||||
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
|
||||
client.insert_df(f'{DB}.ml_all_scores', scores_df)
|
||||
|
||||
@ -5,5 +5,6 @@ shap==0.47.2
|
||||
scipy>=1.14
|
||||
hdbscan>=0.8.38
|
||||
isotree>=0.6.1
|
||||
torch>=2.0
|
||||
pyyaml>=6.0
|
||||
ja4-common @ file:///app/shared/ja4_common
|
||||
|
||||
@ -333,3 +333,174 @@ def test_lag1_autocorrelation_bot_vs_human():
|
||||
rho_human = 0.0
|
||||
|
||||
assert abs(rho_human) < 0.5, f"Human autocorrelation should be low, got {rho_human:.3f}"
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# AUTOENCODER TESTS
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def test_ae_torch_availability_flag():
|
||||
"""Verify torch availability detection works without crashing."""
|
||||
try:
|
||||
import torch
|
||||
avail = True
|
||||
except ImportError:
|
||||
avail = False
|
||||
assert isinstance(avail, bool)
|
||||
|
||||
|
||||
def _make_ae(n_features, latent_dim=4):
|
||||
"""Build a standalone TrafficAutoEncoder for testing (avoids importing bot_detector module)."""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
class _AE:
|
||||
def __init__(self, n_feat, ldim):
|
||||
self.n_features = n_feat
|
||||
self.latent_dim = ldim
|
||||
self.device = torch.device('cpu')
|
||||
dim1 = min(64, max(n_feat, ldim + 4))
|
||||
dim2 = min(32, max(dim1 // 2, ldim + 2))
|
||||
self.encoder = nn.Sequential(
|
||||
nn.Linear(n_feat, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, ldim),
|
||||
)
|
||||
self.decoder = nn.Sequential(
|
||||
nn.Linear(ldim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
|
||||
nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
|
||||
nn.Linear(dim1, n_feat), nn.Sigmoid(),
|
||||
)
|
||||
self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
|
||||
self._scaler_min = None
|
||||
self._scaler_range = None
|
||||
|
||||
def _to_tensor(self, X):
|
||||
if self._scaler_min is not None:
|
||||
X_n = (X - self._scaler_min) / (self._scaler_range + 1e-9)
|
||||
else:
|
||||
X_n = X
|
||||
return torch.tensor(np.clip(X_n, 0, 1), dtype=torch.float32)
|
||||
|
||||
def fit(self, X, epochs=50, lr=1e-3, batch_size=256):
|
||||
self._scaler_min = X.min(axis=0)
|
||||
self._scaler_range = X.max(axis=0) - self._scaler_min
|
||||
X_t = self._to_tensor(X)
|
||||
dataset = torch.utils.data.TensorDataset(X_t)
|
||||
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
|
||||
criterion = nn.MSELoss()
|
||||
self.encoder.train(); self.decoder.train()
|
||||
losses = []
|
||||
for _ in range(epochs):
|
||||
epoch_loss = 0.0
|
||||
for (batch,) in loader:
|
||||
latent = self.encoder(batch)
|
||||
recon = self.decoder(latent)
|
||||
loss = criterion(recon, batch)
|
||||
optimizer.zero_grad(); loss.backward(); optimizer.step()
|
||||
epoch_loss += loss.item() * len(batch)
|
||||
losses.append(epoch_loss / len(X_t))
|
||||
return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
|
||||
|
||||
def score_samples(self, X):
|
||||
self.encoder.eval(); self.decoder.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return ((self.decoder(self.encoder(X_t)) - X_t) ** 2).mean(dim=1).numpy()
|
||||
|
||||
def encode(self, X):
|
||||
self.encoder.eval()
|
||||
X_t = self._to_tensor(X)
|
||||
with torch.no_grad():
|
||||
return self.encoder(X_t).numpy()
|
||||
|
||||
def state_dict(self):
|
||||
return {'encoder': self.encoder.state_dict(), 'decoder': self.decoder.state_dict(),
|
||||
'scaler_min': self._scaler_min, 'scaler_range': self._scaler_range,
|
||||
'n_features': self.n_features, 'latent_dim': self.latent_dim}
|
||||
|
||||
@classmethod
|
||||
def load_state_dict(cls, state):
|
||||
ae = cls(state['n_features'], state['latent_dim'])
|
||||
ae._scaler_min = state['scaler_min']
|
||||
ae._scaler_range = state['scaler_range']
|
||||
ae.encoder.load_state_dict(state['encoder'])
|
||||
ae.decoder.load_state_dict(state['decoder'])
|
||||
return ae
|
||||
|
||||
return _AE(n_features, latent_dim)
|
||||
|
||||
|
||||
def test_ae_class_train_and_score():
|
||||
"""TrafficAutoEncoder trains on normal data and scores anomalies higher."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
n_features = 10
|
||||
X_normal = rng.normal(0.5, 0.1, (200, n_features)).clip(0, 1)
|
||||
X_anomaly = rng.uniform(0.8, 1.0, (20, n_features))
|
||||
|
||||
ae = _make_ae(n_features, latent_dim=4)
|
||||
stats = ae.fit(X_normal, epochs=30, lr=1e-3)
|
||||
assert stats['final_loss'] > 0, "Loss should be positive"
|
||||
assert stats['epochs'] == 30
|
||||
assert stats['n_samples'] == 200
|
||||
|
||||
normal_scores = ae.score_samples(X_normal)
|
||||
anomaly_scores = ae.score_samples(X_anomaly)
|
||||
assert np.mean(anomaly_scores) > np.mean(normal_scores), \
|
||||
f"Anomaly MSE ({np.mean(anomaly_scores):.4f}) should > normal MSE ({np.mean(normal_scores):.4f})"
|
||||
|
||||
|
||||
def test_ae_encode_latent_space():
|
||||
"""Autoencoder encode() returns correct dimensionality."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
X = rng.normal(0.5, 0.1, (50, 8)).clip(0, 1)
|
||||
|
||||
ae = _make_ae(8, latent_dim=4)
|
||||
ae.fit(X, epochs=5)
|
||||
latent = ae.encode(X)
|
||||
assert latent.shape == (50, 4), f"Latent shape should be (50, 4), got {latent.shape}"
|
||||
|
||||
|
||||
def test_ae_state_dict_save_load():
|
||||
"""Autoencoder can save and load state dict."""
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
pytest.skip("torch not installed")
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
X = rng.normal(0.5, 0.1, (100, 6)).clip(0, 1)
|
||||
|
||||
ae = _make_ae(6, latent_dim=3)
|
||||
ae.fit(X, epochs=10)
|
||||
scores_before = ae.score_samples(X)
|
||||
|
||||
state = ae.state_dict()
|
||||
ae2 = type(ae).load_state_dict(state)
|
||||
scores_after = ae2.score_samples(X)
|
||||
np.testing.assert_allclose(scores_before, scores_after, rtol=1e-5,
|
||||
err_msg="Scores should be identical after load")
|
||||
|
||||
|
||||
def test_ae_weight_combination():
|
||||
"""Combined score should be weighted average of EIF and AE components."""
|
||||
eif_norm = np.array([0.2, 0.8, 0.5])
|
||||
ae_norm = np.array([0.3, 0.9, 0.4])
|
||||
alpha = 0.30
|
||||
combined = (1 - alpha) * eif_norm + alpha * ae_norm
|
||||
expected = np.array([0.2*0.7 + 0.3*0.3, 0.8*0.7 + 0.9*0.3, 0.5*0.7 + 0.4*0.3])
|
||||
np.testing.assert_allclose(combined, expected, rtol=1e-7)
|
||||
# Combined should be between min and max of components
|
||||
assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
|
||||
assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
|
||||
|
||||
@ -94,6 +94,8 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
|
||||
fuzzing_index Float32,
|
||||
post_ratio Float32,
|
||||
campaign_id Int32,
|
||||
-- Autoencoder reconstruction error (parallel scorer)
|
||||
ae_recon_error Float32 DEFAULT 0,
|
||||
-- Anubis enrichment (deploy_schema.sql item 12)
|
||||
anubis_bot_name LowCardinality(String) DEFAULT '',
|
||||
anubis_bot_action LowCardinality(String) DEFAULT '',
|
||||
|
||||
Reference in New Issue
Block a user