""" Moteur de clustering K-means++ multi-métriques (numpy + scipy vectorisé). Ref: Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull) sklearn-style API — centroids, labels_, inertia_ Features (21 dimensions, normalisées [0,1]) : 0 ttl_n : TTL initial normalisé 1 mss_n : MSS normalisé → type réseau 2 scale_n : facteur de mise à l'échelle TCP 3 win_n : fenêtre TCP normalisée 4 score_n : score anomalie ML (abs) 5 velocity_n : vélocité de requêtes (log1p) 6 fuzzing_n : index de fuzzing (log1p) 7 headless_n : ratio sessions headless 8 post_n : ratio POST/total 9 ip_id_zero_n : ratio IP-ID=0 (Linux/spoofé) 10 entropy_n : entropie temporelle 11 browser_n : score navigateur moderne 12 alpn_n : mismatch ALPN/protocole 13 alpn_absent_n : ratio ALPN absent 14 h2_n : efficacité H2 multiplexing (log1p) 15 hdr_conf_n : confiance ordre headers 16 ua_ch_n : mismatch User-Agent-Client-Hints 17 asset_n : ratio assets statiques 18 direct_n : ratio accès directs 19 ja4_div_n : diversité JA4 (log1p) 20 ua_rot_n : UA rotatif (booléen) """ from __future__ import annotations import math import logging import numpy as np from dataclasses import dataclass, field from scipy.spatial import ConvexHull log = logging.getLogger(__name__) # ─── Définition des features ────────────────────────────────────────────────── FEATURES: list[tuple[str, str, object]] = [ # TCP stack ("ttl", "TTL Initial", lambda v: min(1.0, (v or 0) / 255.0)), ("mss", "MSS Réseau", lambda v: min(1.0, (v or 0) / 1460.0)), ("scale", "Scale TCP", lambda v: min(1.0, (v or 0) / 14.0)), ("win", "Fenêtre TCP", lambda v: min(1.0, (v or 0) / 65535.0)), # Anomalie ML ("avg_score", "Score Anomalie", lambda v: min(1.0, float(v or 0))), ("avg_velocity", "Vélocité (rps)", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))), ("avg_fuzzing", "Fuzzing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))), ("pct_headless", "Headless", lambda v: min(1.0, float(v or 0))), ("avg_post", "Ratio POST", lambda v: min(1.0, float(v or 0))), # IP-ID ("ip_id_zero", "IP-ID Zéro", lambda v: min(1.0, float(v or 0))), # Temporel ("entropy", "Entropie Temporelle", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))), # Navigateur ("browser_score", "Score Navigateur", lambda v: min(1.0, float(v or 0) / 50.0)), # TLS / Protocole ("alpn_mismatch", "ALPN Mismatch", lambda v: min(1.0, float(v or 0))), ("alpn_missing", "ALPN Absent", lambda v: min(1.0, float(v or 0))), ("h2_eff", "H2 Multiplexing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(20))), ("hdr_conf", "Ordre Headers", lambda v: min(1.0, float(v or 0))), ("ua_ch_mismatch","UA-CH Mismatch", lambda v: min(1.0, float(v or 0))), # Comportement HTTP ("asset_ratio", "Ratio Assets", lambda v: min(1.0, float(v or 0))), ("direct_ratio", "Accès Direct", lambda v: min(1.0, float(v or 0))), # Diversité JA4 ("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))), # UA rotatif ("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0), ] FEATURE_KEYS = [f[0] for f in FEATURES] FEATURE_NAMES = [f[1] for f in FEATURES] FEATURE_NORMS = [f[2] for f in FEATURES] N_FEATURES = len(FEATURES) # ─── Construction du vecteur de features ───────────────────────────────────── def build_feature_vector(row: dict) -> list[float]: """Construit le vecteur normalisé [0,1]^21 depuis un dict SQL.""" return [norm(row.get(key, 0)) for key, _, norm in FEATURES] # ─── Standardisation z-score ────────────────────────────────────────────────── def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Z-score standardisation : chaque feature est centrée et mise à l'échelle par sa déviation standard. Ref: Bishop (2006) PRML §9.1 — preprocessing recommandé pour K-means. Retourne (X_std, mean, std) pour pouvoir projeter de nouveaux points. """ mean = X.mean(axis=0) std = X.std(axis=0) std[std < 1e-8] = 1.0 # évite la division par zéro pour features constantes return (X - mean) / std, mean, std # ─── K-means++ vectorisé (numpy) ───────────────────────────────────────────── @dataclass class KMeansResult: centroids: np.ndarray # (k, n_features) labels: np.ndarray # (n_points,) int32 inertia: float n_iter: int def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3, seed: int = 42) -> KMeansResult: """ K-means++ entièrement vectorisé avec numpy. n_init exécutions, meilleure inertie conservée. """ rng = np.random.default_rng(seed) n, d = X.shape best: KMeansResult | None = None for _ in range(n_init): # ── Initialisation K-means++ ────────────────────────────────────── centers = [X[rng.integers(n)].copy()] for _ in range(k - 1): D = _min_sq_dist(X, np.array(centers)) # Garantit des probabilités non-négatives (erreurs float, points dupliqués) D = np.clip(D, 0.0, None) total = D.sum() if total < 1e-12: # Tous les points sont confondus — tirage aléatoire centers.append(X[rng.integers(n)].copy()) else: probs = D / total centers.append(X[rng.choice(n, p=probs)].copy()) centers_arr = np.array(centers) # (k, d) # ── Iterations ─────────────────────────────────────────────────── labels = np.zeros(n, dtype=np.int32) for it in range(max_iter): # Assignation vectorisée : (n, k) distance² dists = _sq_dists(X, centers_arr) # (n, k) new_labels = np.argmin(dists, axis=1).astype(np.int32) if it > 0 and np.all(new_labels == labels): break # convergence labels = new_labels # Mise à jour des centroïdes for j in range(k): mask = labels == j if mask.any(): centers_arr[j] = X[mask].mean(axis=0) inertia = float(np.sum(np.min(_sq_dists(X, centers_arr), axis=1))) result = KMeansResult(centers_arr, labels, inertia, it + 1) if best is None or inertia < best.inertia: best = result return best # type: ignore[return-value] def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray: """Distance² entre chaque point de X et chaque centroïde de C. O(n·k·d).""" # ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ X2 = np.sum(X ** 2, axis=1, keepdims=True) # (n, 1) C2 = np.sum(C ** 2, axis=1, keepdims=True).T # (1, k) return X2 + C2 - 2.0 * X @ C.T # (n, k) def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray: """Distance² minimale de chaque point aux centroïdes existants.""" return np.min(_sq_dists(X, C), axis=1) # ─── PCA 2D (numpy) ────────────────────────────────────────────────────────── def pca_2d(X: np.ndarray) -> np.ndarray: """ PCA-2D vectorisée. Retourne les coordonnées normalisées [0,1] × [0,1]. """ mean = X.mean(axis=0) Xc = X - mean # Power iteration pour les 2 premières composantes rng = np.random.default_rng(0) v1 = _power_iter(Xc, rng.standard_normal(Xc.shape[1])) proj1 = Xc @ v1 # Déflation (Hotelling) Xc2 = Xc - np.outer(proj1, v1) v2 = _power_iter(Xc2, rng.standard_normal(Xc.shape[1])) proj2 = Xc2 @ v2 coords = np.column_stack([proj1, proj2]) # Normalisation [0,1] mn, mx = coords.min(axis=0), coords.max(axis=0) rng_ = mx - mn rng_[rng_ == 0] = 1.0 return (coords - mn) / rng_ def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray: """Power iteration : trouve le premier vecteur propre de XᵀX.""" for _ in range(n_iter): v = X.T @ (X @ v) norm = np.linalg.norm(v) if norm < 1e-12: break v /= norm return v # ─── Enveloppe convexe (hull) par cluster ──────────────────────────────────── def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray, k: int, min_pts: int = 4) -> dict[int, list[list[float]]]: """ Calcule l'enveloppe convexe (convex hull) des points PCA pour chaque cluster. Retourne {cluster_idx: [[x,y], ...]} (polygone fermé). """ hulls: dict[int, list[list[float]]] = {} for j in range(k): pts = coords_2d[labels == j] if len(pts) < min_pts: # Pas assez de points : bounding box if len(pts) > 0: mx_, my_ = pts.mean(axis=0) r = max(0.01, pts.std(axis=0).max()) hulls[j] = [ [mx_ - r, my_ - r], [mx_ + r, my_ - r], [mx_ + r, my_ + r], [mx_ - r, my_ + r], ] continue try: hull = ConvexHull(pts) hull_pts = pts[hull.vertices].tolist() # Fermer le polygone hull_pts.append(hull_pts[0]) hulls[j] = hull_pts except Exception: hulls[j] = [] return hulls # ─── Nommage et scoring ─────────────────────────────────────────────────────── def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str: """Nom lisible basé sur les features dominantes du centroïde.""" s = centroid # alias ttl_raw = float(raw_stats.get("mean_ttl", 0)) mss_raw = float(raw_stats.get("mean_mss", 0)) # Scanners / bots masscan if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25: return "🤖 Masscan Scanner" if s[4] > 0.70 and s[6] > 0.5: return "🤖 Bot agressif" if s[16] > 0.80: return "🤖 UA-CH Mismatch" if s[7] > 0.70: return "🤖 Headless Browser" if s[4] > 0.50: return "⚠️ Anomalie ML haute" if s[3] > 0.85 and ttl_raw > 120: return "🖥️ Windows" if s[0] > 0.22 and s[0] < 0.28 and mss_raw > 1400: return "🐧 Linux" if s[1] < 0.90 and s[1] > 0.95: return "📡 VPN/Proxy" if mss_raw < 1380 and mss_raw > 0: return "🌐 Tunnel réseau" if s[5] > 0.60: return "⚡ Trafic rapide" if s[4] < 0.10 and s[5] < 0.10: return "✅ Trafic sain" return "📊 Cluster mixte" def risk_score_from_centroid(centroid: np.ndarray) -> float: """Score de risque [0,1] agrégé depuis le centroïde.""" s = centroid return float(np.clip( 0.40 * s[4] + # score ML 0.15 * s[6] + # fuzzing 0.15 * s[16] + # UA-CH mismatch 0.10 * s[7] + # headless 0.10 * s[5] + # vélocité 0.10 * s[9], # IP-ID zéro 0.0, 1.0 ))