Files
dashboard/backend/services/clustering_engine.py
SOC Analyst fc3392779b feat: slider sensibilité + z-score standardization pour clustering plus précis
Sensibilité (0.5x–3.0x) :
- Multiplie k : sensibilité=2x avec k=14 → 28 clusters effectifs
- Labels UI : Grossière / Normale / Fine / Très fine / Maximum
- Paramètres avancés (k, fenêtre) masqués dans un <details>
- Cache invalidé si sensibilité change

Z-score standardisation (Bishop 2006 PRML §9.1) :
- Normalise par variance de chaque feature avant K-means
- Features discriminantes (forte std) pèsent plus
- Résultat : risque 0→1.00 sur clusters bots vs 0→0.27 avant
- Bots détectés : 4 337 IPs vs 1 604 (2.7x plus)
- Nouveaux clusters : Bot agressif, Tunnel réseau, UA-CH Mismatch distincts

Fix TextLayer deck.gl :
- Translittération des accents (é→e, à→a, ç→c…) + strip emojis
- Évite les warnings 'Missing character' sur caractères non-ASCII

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 10:07:23 +01:00

295 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Moteur de clustering K-means++ multi-métriques (numpy + scipy vectorisé).
Ref:
Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
sklearn-style API — centroids, labels_, inertia_
Features (21 dimensions, normalisées [0,1]) :
0 ttl_n : TTL initial normalisé
1 mss_n : MSS normalisé → type réseau
2 scale_n : facteur de mise à l'échelle TCP
3 win_n : fenêtre TCP normalisée
4 score_n : score anomalie ML (abs)
5 velocity_n : vélocité de requêtes (log1p)
6 fuzzing_n : index de fuzzing (log1p)
7 headless_n : ratio sessions headless
8 post_n : ratio POST/total
9 ip_id_zero_n : ratio IP-ID=0 (Linux/spoofé)
10 entropy_n : entropie temporelle
11 browser_n : score navigateur moderne
12 alpn_n : mismatch ALPN/protocole
13 alpn_absent_n : ratio ALPN absent
14 h2_n : efficacité H2 multiplexing (log1p)
15 hdr_conf_n : confiance ordre headers
16 ua_ch_n : mismatch User-Agent-Client-Hints
17 asset_n : ratio assets statiques
18 direct_n : ratio accès directs
19 ja4_div_n : diversité JA4 (log1p)
20 ua_rot_n : UA rotatif (booléen)
"""
from __future__ import annotations
import math
import logging
import numpy as np
from dataclasses import dataclass, field
from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Définition des features ──────────────────────────────────────────────────
def _as_float(v: object) -> float:
    """Coerce a raw SQL value to float; None and other falsy values become 0.0."""
    return float(v or 0)


def _ratio(cap: float = 1.0):
    """Build a normaliser computing ``min(1.0, value / cap)``.

    Only the upper bound is clamped: negative inputs stay negative.
    """
    def normalise(v: object) -> float:
        return min(1.0, _as_float(v) / cap)
    return normalise


def _log_ratio(cap: float):
    """Build a normaliser computing ``min(1.0, log1p(value) / log1p(cap))``."""
    denom = math.log1p(cap)

    def normalise(v: object) -> float:
        return min(1.0, math.log1p(_as_float(v)) / denom)
    return normalise


def _flag(v: object) -> float:
    """Return 1.0 when the value is strictly positive, else 0.0."""
    return 1.0 if _as_float(v) > 0 else 0.0


# Each entry is (SQL row key, display label, normaliser callable).
# Every normaliser goes through _as_float, so Decimal or numeric-string values
# are handled the same way for all features (the TCP features previously
# divided the raw value directly and raised TypeError on Decimal input).
FEATURES: list[tuple[str, str, object]] = [
    # TCP stack
    ("ttl", "TTL Initial", _ratio(255.0)),
    ("mss", "MSS Réseau", _ratio(1460.0)),
    ("scale", "Scale TCP", _ratio(14.0)),
    ("win", "Fenêtre TCP", _ratio(65535.0)),
    # ML anomaly
    ("avg_score", "Score Anomalie", _ratio()),
    ("avg_velocity", "Vélocité (rps)", _log_ratio(100)),
    ("avg_fuzzing", "Fuzzing", _log_ratio(300)),
    ("pct_headless", "Headless", _ratio()),
    ("avg_post", "Ratio POST", _ratio()),
    # IP-ID
    ("ip_id_zero", "IP-ID Zéro", _ratio()),
    # Temporal
    ("entropy", "Entropie Temporelle", _log_ratio(10)),
    # Browser
    ("browser_score", "Score Navigateur", _ratio(50.0)),
    # TLS / protocol
    ("alpn_mismatch", "ALPN Mismatch", _ratio()),
    ("alpn_missing", "ALPN Absent", _ratio()),
    ("h2_eff", "H2 Multiplexing", _log_ratio(20)),
    ("hdr_conf", "Ordre Headers", _ratio()),
    ("ua_ch_mismatch", "UA-CH Mismatch", _ratio()),
    # HTTP behaviour
    ("asset_ratio", "Ratio Assets", _ratio()),
    ("direct_ratio", "Accès Direct", _ratio()),
    # JA4 diversity
    ("ja4_count", "Diversité JA4", _log_ratio(30)),
    # Rotating UA
    ("ua_rotating", "UA Rotatif", _flag),
]
# Parallel views of the table, index-aligned with the feature vector.
FEATURE_KEYS = [f[0] for f in FEATURES]
FEATURE_NAMES = [f[1] for f in FEATURES]
FEATURE_NORMS = [f[2] for f in FEATURES]
N_FEATURES = len(FEATURES)
# ─── Construction du vecteur de features ─────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the feature vector for one SQL row.

    Walks FEATURES in order, looks up each key in ``row`` (a missing key is
    read as 0) and applies that feature's normaliser.
    """
    vector: list[float] = []
    for key, _label, normalise in FEATURES:
        raw_value = row.get(key, 0)
        vector.append(normalise(raw_value))
    return vector
# ─── Standardisation z-score ──────────────────────────────────────────────────
def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Z-score standardisation: centre every column on its mean and divide it by
    its standard deviation.

    Returns ``(X_std, mean, std)`` so the same transform can be applied to
    other points later. Columns whose standard deviation is below 1e-8 get a
    scale of 1.0, so a constant feature ends up as zeros rather than dividing
    by zero.
    """
    center = np.mean(X, axis=0)
    scale = np.std(X, axis=0)
    near_constant = scale < 1e-8
    scale[near_constant] = 1.0
    X_std = (X - center) / scale
    return X_std, center, scale
# ─── K-means++ vectorisé (numpy) ─────────────────────────────────────────────
@dataclass
class KMeansResult:
    """Outcome of one K-means run.

    Attributes:
        centroids: cluster centres, shape (k, n_features).
        labels: cluster index of every point, shape (n_points,), int32.
        inertia: sum of squared distances from each point to its nearest centroid.
        n_iter: number of iterations the run performed.
    """

    centroids: np.ndarray
    labels: np.ndarray
    inertia: float
    n_iter: int
def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
              seed: int = 42) -> KMeansResult:
    """
    K-means with k-means++ seeding, vectorised with numpy.

    Runs ``n_init`` independent restarts drawn from a single generator seeded
    with ``seed`` and keeps the run with the lowest inertia.

    Args:
        X: data matrix, one row per point (must be 2-D).
        k: number of clusters requested.
        max_iter: maximum number of assignment/update iterations per restart.
        n_init: number of restarts.
        seed: seed of the random generator.

    Returns:
        The best KMeansResult. Its labels and inertia are both computed against
        the returned centroids, including when ``max_iter`` is exhausted before
        convergence. If ``n_init`` is not positive no run happens and None is
        returned.
    """
    rng = np.random.default_rng(seed)
    n, _d = X.shape
    best: KMeansResult | None = None
    for _run in range(n_init):
        # ── k-means++ seeding ─────────────────────────────────────────────
        centers = [X[rng.integers(n)].copy()]
        for _extra in range(k - 1):
            D = _min_sq_dist(X, np.array(centers))
            # Sampling weights must be non-negative (float error, duplicate points).
            D = np.clip(D, 0.0, None)
            total = D.sum()
            if total < 1e-12:
                # Every point coincides with a chosen centre: fall back to a uniform draw.
                centers.append(X[rng.integers(n)].copy())
            else:
                centers.append(X[rng.choice(n, p=D / total)].copy())
        centers_arr = np.array(centers)  # (k, d)

        # ── Lloyd iterations ──────────────────────────────────────────────
        labels = np.zeros(n, dtype=np.int32)
        n_iter = 0  # stays 0 when max_iter <= 0 instead of leaving the counter unbound
        for it in range(max_iter):
            n_iter = it + 1
            dists = _sq_dists(X, centers_arr)  # (n, k)
            new_labels = np.argmin(dists, axis=1).astype(np.int32)
            if it > 0 and np.array_equal(new_labels, labels):
                break  # converged: assignment did not change
            labels = new_labels
            # Centroid update; a cluster that lost all its points keeps its centre.
            for j in range(k):
                mask = labels == j
                if mask.any():
                    centers_arr[j] = X[mask].mean(axis=0)

        # Final assignment against the final centroids. After convergence this
        # reproduces the current labels; when max_iter ran out it refreshes the
        # labels so that they agree with the centroids and with the inertia.
        final_dists = _sq_dists(X, centers_arr)
        labels = np.argmin(final_dists, axis=1).astype(np.int32)
        inertia = float(np.sum(np.min(final_dists, axis=1)))

        if best is None or inertia < best.inertia:
            best = KMeansResult(centers_arr, labels, inertia, n_iter)
    return best  # type: ignore[return-value]
def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Squared Euclidean distance between every row of X and every row of C.

    Uses the expansion ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ and returns an
    array of shape (n, k) for X of shape (n, d) and C of shape (k, d).

    The expansion can come out slightly below zero through floating-point
    cancellation when a point (nearly) coincides with a centroid, so the
    result is clamped at 0: a squared distance is never negative.
    """
    x_sq = np.sum(X ** 2, axis=1, keepdims=True)    # (n, 1)
    c_sq = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
    dists = x_sq + c_sq - 2.0 * X @ C.T             # (n, k)
    return np.maximum(dists, 0.0)
def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """For each row of X, the smallest squared distance to any row of C."""
    pairwise = _sq_dists(X, C)
    return pairwise.min(axis=1)
# ─── PCA 2D (numpy) ──────────────────────────────────────────────────────────
def pca_2d(X: np.ndarray) -> np.ndarray:
    """
    Project X onto two directions found by power iteration and rescale each
    resulting coordinate to [0, 1].

    The second direction is computed on the data after removing the component
    along the first one (deflation). The start vectors come from a generator
    seeded with 0, so the output is deterministic for a given X.
    """
    centered = X - X.mean(axis=0)
    n_dims = centered.shape[1]
    gen = np.random.default_rng(0)

    # First direction and the scores along it.
    axis1 = _power_iter(centered, gen.standard_normal(n_dims))
    scores1 = centered @ axis1

    # Remove the first component, then repeat for the second direction.
    deflated = centered - np.outer(scores1, axis1)
    axis2 = _power_iter(deflated, gen.standard_normal(n_dims))
    scores2 = deflated @ axis2

    coords = np.column_stack([scores1, scores2])

    # Min-max rescale per column; a zero-width column is divided by 1 instead.
    lo = coords.min(axis=0)
    hi = coords.max(axis=0)
    span = hi - lo
    span[span == 0] = 1.0
    return (coords - lo) / span
def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
    """Power iteration on XᵀX starting from ``v``.

    Each step applies XᵀX to the vector and renormalises it to unit length.
    Stops early, returning the un-normalised vector, if its norm falls below
    1e-12. With ``n_iter`` <= 0 the start vector is returned unchanged.
    """
    remaining = n_iter
    while remaining > 0:
        remaining -= 1
        v = X.T @ (X @ v)  # rebinds v, so the caller's array is never modified
        length = np.linalg.norm(v)
        if length < 1e-12:
            break
        v /= length
    return v
# ─── Enveloppe convexe (hull) par cluster ────────────────────────────────────
def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
    """
    Compute one outline polygon per cluster from 2-D coordinates.

    Args:
        coords_2d: point coordinates, one (x, y) row per point.
        labels: cluster index of each point, aligned with ``coords_2d``.
        k: number of clusters; indices 0..k-1 are examined.
        min_pts: clusters with fewer points than this get a square instead of
            a convex hull.

    Returns:
        ``{cluster_idx: [[x, y], ...]}`` where every non-empty polygon is
        closed (first vertex repeated at the end) and made of plain floats.
        Empty clusters are omitted. A cluster whose hull cannot be computed
        maps to ``[]``.
    """
    hulls: dict[int, list[list[float]]] = {}
    for j in range(k):
        pts = coords_2d[labels == j]
        if len(pts) == 0:
            continue
        if len(pts) < min_pts:
            # Too few points for a hull: square centred on the mean, with a
            # half-side of the largest per-axis std (at least 0.01).
            cx, cy = (float(c) for c in pts.mean(axis=0))
            r = max(0.01, float(pts.std(axis=0).max()))
            square = [
                [cx - r, cy - r], [cx + r, cy - r],
                [cx + r, cy + r], [cx - r, cy + r],
            ]
            # Close the ring, as the hull branch below does.
            square.append(list(square[0]))
            hulls[j] = square
            continue
        try:
            hull = ConvexHull(pts)
        except Exception as exc:
            # Best effort: keep the other clusters, but leave a trace rather
            # than dropping this outline silently.
            log.debug("Convex hull failed for cluster %d (%d points): %s",
                      j, len(pts), exc)
            hulls[j] = []
            continue
        ring = pts[hull.vertices].tolist()
        # Close the polygon.
        ring.append(ring[0])
        hulls[j] = ring
    return hulls
# ─── Nommage et scoring ───────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """Return a human-readable cluster name from its centroid and raw stats.

    Rules are evaluated top to bottom and the first match wins.

    Args:
        centroid: feature vector indexed in FEATURES order (0 ttl, 1 mss,
            2 scale, 3 win, 4 score, 5 velocity, 6 fuzzing, 7 headless,
            16 UA-CH mismatch).
        raw_stats: un-normalised cluster statistics; ``mean_ttl`` and
            ``mean_mss`` are read, and a missing or None value counts as 0.

    Returns:
        The display label of the first matching rule, or "📊 Cluster mixte".
    """
    s = centroid  # alias
    # `or 0` also covers keys that are present but None (NULL aggregates).
    ttl_raw = float(raw_stats.get("mean_ttl") or 0)
    mss_raw = float(raw_stats.get("mean_mss") or 0)

    # Scanners / masscan bots. The MSS is a mean, so it is compared as an
    # interval: `in range(...)` only matches integer-valued floats.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    if s[4] > 0.70 and s[6] > 0.5:
        return "🤖 Bot agressif"
    if s[16] > 0.80:
        return "🤖 UA-CH Mismatch"
    if s[7] > 0.70:
        return "🤖 Headless Browser"
    if s[4] > 0.50:
        return "⚠️ Anomalie ML haute"
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    # Normalised MSS inside (0.90, 0.95). The former test
    # `s[1] < 0.90 and s[1] > 0.95` could never be true.
    if 0.90 < s[1] < 0.95:
        return "📡 VPN/Proxy"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.60:
        return "⚡ Trafic rapide"
    if s[4] < 0.10 and s[5] < 0.10:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """Weighted sum of six centroid features, clipped to [0, 1]."""
    # (feature index, weight), accumulated in this exact order.
    weighted_terms = (
        (4, 0.40),   # ML score
        (6, 0.15),   # fuzzing
        (16, 0.15),  # UA-CH mismatch
        (7, 0.10),   # headless
        (5, 0.10),   # velocity
        (9, 0.10),   # IP-ID zero
    )
    first_idx, first_weight = weighted_terms[0]
    total = first_weight * centroid[first_idx]
    for idx, weight in weighted_terms[1:]:
        total = total + weight * centroid[idx]
    clipped = np.clip(total, 0.0, 1.0)
    return float(clipped)