feat(clustering): palette diversifiée, suppression scores anomalie/robot, visualisation éclatée

- Suppression de 'Score Anomalie' (avg_score) des features de clustering (31 → 30)
- Suppression de 'Score de détection robot' (mean_score) de la sidebar et de l'API
- Suppression de bot_ips / high_risk_ips des stats (métriques dérivées des scores supprimés)
- Redistribution des poids dans risk_score_from_centroid (nouveaux poids) : UA-CH mismatch 7 → 17 %, fuzzing 9 → 14 %, headless 6 → 10 %, vélocité 5 → 9 %, ip_id_zero 5 → 7 %
- Mise à jour des indices de features dans name_cluster et risk_score_from_centroid
- Palette 24 couleurs spectrales (cluster_color) → bleu/violet/rose/teal/amber/cyan/lime... Les couleurs identifient les clusters, non leur niveau de risque
- Remplacement de la légende CRITICAL/HIGH/MEDIUM/LOW par la liste des clusters actifs
- Ajout de spread_clusters() : répulsion itérative des centroïdes trop proches (50 itérations, min_dist=0.16 ; valeur par défaut dans la signature : 0.14) → les clusters se repoussent mutuellement → visualisation plus lisible
- Interface TypeScript mise à jour (suppression de mean_score, bot_ips, high_risk_ips)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 14:01:14 +01:00
parent da6fef87fd
commit 08d003a050
3 changed files with 160 additions and 86 deletions
--- a/backend/services/clustering_engine.py
+++ b/backend/services/clustering_engine.py
@ -127,9 +127,7 @@ FEATURES: list[tuple[str, str, object]] = [
    ("scale",         "Scale TCP",             lambda v: min(1.0, (v or 0) / 14.0)),
    ("win",           "Fenêtre TCP",           lambda v: min(1.0, (v or 0) / 65535.0)),
    # Anomalie ML
-    ("avg_score",     "Score Anomalie",        lambda v: min(1.0, float(v or 0))),
-    ("avg_velocity",  "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),
-    ("avg_fuzzing",   "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
+    ("avg_velocity",  "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),    ("avg_fuzzing",   "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
    ("pct_headless",  "Headless",              lambda v: min(1.0, float(v or 0))),
    ("avg_post",      "Ratio POST",            lambda v: min(1.0, float(v or 0))),
    # IP-ID
@ -353,51 +351,48 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    n = len(s)
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
-    country_risk_v = s[21] if n > 21 else 0.0
-    asn_cloud      = s[22] if n > 22 else 0.0
-    accept_lang    = s[23] if n > 23 else 1.0
-    accept_enc     = s[24] if n > 24 else 1.0
-    sec_fetch      = s[25] if n > 25 else 0.0
-    hdr_count      = s[26] if n > 26 else 0.5
-    hfp_popular    = s[27] if n > 27 else 0.5
-    hfp_rotating   = s[28] if n > 28 else 0.0
+    country_risk_v = s[20] if n > 20 else 0.0
+    asn_cloud      = s[21] if n > 21 else 0.0
+    accept_lang    = s[22] if n > 22 else 1.0
+    accept_enc     = s[23] if n > 23 else 1.0
+    sec_fetch      = s[24] if n > 24 else 0.0
+    hdr_count      = s[25] if n > 25 else 0.5
+    hfp_popular    = s[26] if n > 26 else 0.5
+    hfp_rotating   = s[27] if n > 27 else 0.0

    # Scanner pur : ni Accept-Language ni Accept-Encoding, et très peu de headers
    if accept_lang < 0.15 and accept_enc < 0.15 and hdr_count < 0.25:
        return "🤖 Scanner pur (no headers)"
-    # Fingerprint tournant ET suspect : bot qui change de profil headers
-    if hfp_rotating > 0.6 and s[4] > 0.15:
+    # Fingerprint tournant : bot qui change de profil headers
+    if hfp_rotating > 0.6:
        return "🔄 Bot fingerprint tournant"
-    # Fingerprint très rare et anomalie : bot artisanal unique
-    if hfp_popular < 0.15 and s[4] > 0.20:
+    # Fingerprint très rare : bot artisanal unique
+    if hfp_popular < 0.15:
        return "🕵️ Fingerprint rare suspect"
    # Scanners Masscan
    if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25:
        return "🤖 Masscan Scanner"
-    # Bots offensifs agressifs (fuzzing + anomalie)
-    if s[4] > 0.40 and s[6] > 0.3:
+    # Bots offensifs agressifs (vélocité élevée + fuzzing)
+    if s[4] > 0.40 and s[5] > 0.3:
        return "🤖 Bot agressif"
    # Bot qui simule un navigateur mais sans les vrais headers
-    if s[16] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3:
+    if s[15] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3:
        return "🤖 Bot UA simulé"
-    # Pays à très haut risque avec trafic anormal
-    if country_risk_v > 0.75 and (s[4] > 0.10 or asn_cloud > 0.5):
+    # Pays à très haut risque avec infrastructure cloud
+    if country_risk_v > 0.75 and asn_cloud > 0.5:
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch
-    if s[16] > 0.50 and asn_cloud > 0.70:
+    if s[15] > 0.50 and asn_cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
-    if s[16] > 0.60:
+    if s[15] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser (Puppeteer/Playwright) : a les headers Sec-Fetch mais headless
-    if s[7] > 0.50 and sec_fetch > 0.5:
+    if s[6] > 0.50 and sec_fetch > 0.5:
        return "🤖 Headless Browser"
-    if s[7] > 0.50:
+    if s[6] > 0.50:
        return "🤖 Headless (no Sec-Fetch)"
-    # Anomalie ML significative
-    if s[4] > 0.35:
-        return "⚠️ Anomalie ML"
    # Cloud pur (CDN/crawler légitime ?)
-    if asn_cloud > 0.85 and s[4] < 0.15:
+    if asn_cloud > 0.85:
        return "☁️ Infrastructure cloud"
    # Pays à risque élevé sans autre signal
    if country_risk_v > 0.60:
@ -413,9 +408,9 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
        return "🐧 Linux"
    if mss_raw < 1380 and mss_raw > 0:
        return "🌐 Tunnel réseau"
-    if s[5] > 0.40:
+    if s[4] > 0.40:
        return "⚡ Trafic rapide"
-    if s[4] < 0.08 and s[5] < 0.10 and asn_cloud < 0.30:
+    if s[4] < 0.10 and asn_cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"

@ -423,34 +418,130 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
 def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """
    Score de risque [0,1] depuis le centroïde (espace original [0,1]).
-    31 features — poids calibrés pour sommer à 1.0.
+    30 features (avg_score supprimé) — poids calibrés pour sommer à 1.0.
+    Indices décalés de -1 après suppression de avg_score (ancien idx 4).
    """
    s = centroid
    n = len(s)
-    country_risk_v = s[21] if n > 21 else 0.0
-    asn_cloud      = s[22] if n > 22 else 0.0
-    no_accept_lang = 1.0 - (s[23] if n > 23 else 1.0)
-    no_encoding    = 1.0 - (s[24] if n > 24 else 1.0)
-    no_sec_fetch   = 1.0 - (s[25] if n > 25 else 0.0)
-    few_headers    = 1.0 - (s[26] if n > 26 else 0.5)
-    # Fingerprint rare = suspect (faible popularité), fingerprint tournant = bot
-    hfp_rare       = 1.0 - (s[27] if n > 27 else 0.5)
-    hfp_rotating   = s[28] if n > 28 else 0.0
+    country_risk_v = s[20] if n > 20 else 0.0
+    asn_cloud      = s[21] if n > 21 else 0.0
+    no_accept_lang = 1.0 - (s[22] if n > 22 else 1.0)
+    no_encoding    = 1.0 - (s[23] if n > 23 else 1.0)
+    no_sec_fetch   = 1.0 - (s[24] if n > 24 else 0.0)
+    few_headers    = 1.0 - (s[25] if n > 25 else 0.5)
+    hfp_rare       = 1.0 - (s[26] if n > 26 else 0.5)
+    hfp_rotating   = s[27] if n > 27 else 0.0

+    # [4]=vélocité  [5]=fuzzing  [6]=headless  [8]=ip_id_zero  [15]=ua_ch_mismatch
+    # Poids redistribués depuis l'ancien score ML anomalie (0.25) vers les signaux restants
    return float(np.clip(
-        0.25 * s[4]          +   # score ML anomalie (principal)
-        0.09 * s[6]          +   # fuzzing
-        0.07 * s[16]         +   # UA-CH mismatch
-        0.06 * s[7]          +   # headless
-        0.05 * s[5]          +   # vélocité
-        0.05 * s[9]          +   # IP-ID zéro
+        0.14 * s[5]          +   # fuzzing
+        0.17 * s[15]         +   # UA-CH mismatch (fort signal impersonation navigateur)
+        0.10 * s[6]          +   # headless
+        0.09 * s[4]          +   # vélocité (rps)
+        0.07 * s[8]          +   # IP-ID zéro
        0.09 * country_risk_v+   # risque pays source
        0.06 * asn_cloud     +   # infrastructure cloud/VPN
        0.04 * no_accept_lang+   # absence Accept-Language
        0.04 * no_encoding   +   # absence Accept-Encoding
-        0.04 * no_sec_fetch  +   # absence Sec-Fetch (pas un vrai navigateur)
-        0.04 * few_headers   +   # très peu de headers (scanner/curl)
-        0.06 * hfp_rare      +   # fingerprint headers rare = suspect
+        0.04 * no_sec_fetch  +   # absence Sec-Fetch
+        0.04 * few_headers   +   # très peu de headers
+        0.06 * hfp_rare      +   # fingerprint rare = suspect
        0.06 * hfp_rotating,     # rotation de fingerprint = bot
        0.0, 1.0
    ))
+
+
+# ─── Palette de couleurs diversifiée (non liée au risque) ────────────────────
+# 24 couleurs réparties sur le spectre pour distinguer les clusters visuellement.
+# Choix : teintes variées (non strictement équidistantes) avec alternance de saturation/luminosité.
+
+_CLUSTER_PALETTE: list[str] = [
+    "#3b82f6",  # blue
+    "#8b5cf6",  # violet
+    "#ec4899",  # pink
+    "#14b8a6",  # teal
+    "#f59e0b",  # amber
+    "#06b6d4",  # cyan
+    "#a3e635",  # lime
+    "#f97316",  # orange
+    "#6366f1",  # indigo
+    "#10b981",  # emerald
+    "#e879f9",  # fuchsia
+    "#fbbf24",  # yellow
+    "#60a5fa",  # light blue
+    "#c084fc",  # light purple
+    "#fb7185",  # rose
+    "#34d399",  # light green
+    "#38bdf8",  # sky
+    "#a78bfa",  # lavender
+    "#fdba74",  # peach
+    "#4ade80",  # green
+    "#f472b6",  # light pink
+    "#67e8f9",  # light cyan
+    "#d97706",  # dark amber
+    "#7c3aed",  # dark violet
+]
+
+
+def cluster_color(cluster_idx: int) -> str:
+    """Couleur distinctive pour un cluster, cyclique sur la palette."""
+    return _CLUSTER_PALETTE[cluster_idx % len(_CLUSTER_PALETTE)]
+
+
+# ─── Dispersion des clusters dans l'espace 2D ────────────────────────────────
+
+def spread_clusters(coords_2d: np.ndarray, labels: np.ndarray, k: int,
+                    n_iter: int = 50, min_dist: float = 0.14) -> np.ndarray:
+    """
+    Repousse les centroïdes trop proches par répulsion itérative (spring repulsion).
+    Chaque point suit le déplacement de son centroïde.
+
+    Paramètres
+    ----------
+    min_dist : distance minimale souhaitée entre centroïdes (espace [0,1]).
+               Augmenter pour plus d'éclatement.
+    n_iter   : nombre d'itérations de la physique de répulsion.
+    """
+    rng = np.random.default_rng(0)
+    centroids = np.zeros((k, 2))
+    counts = np.zeros(k, dtype=int)
+    for j in range(k):
+        mask = labels == j
+        if mask.any():
+            centroids[j] = coords_2d[mask].mean(axis=0)
+            counts[j] = int(mask.sum())
+
+    orig = centroids.copy()
+
+    for _ in range(n_iter):
+        forces = np.zeros_like(centroids)
+        for i in range(k):
+            if counts[i] == 0:
+                continue
+            for j in range(k):
+                if i == j or counts[j] == 0:
+                    continue
+                delta = centroids[i] - centroids[j]
+                dist = float(np.linalg.norm(delta))
+                if dist < 1e-8:
+                    delta = rng.uniform(-0.02, 0.02, size=2)
+                    dist = float(np.linalg.norm(delta)) + 1e-8
+                if dist < min_dist:
+                    # Force décroissant linéairement avec la distance : 1 au contact, 0 à min_dist
+                    magnitude = (min_dist - dist) / min_dist
+                    forces[i] += magnitude * (delta / dist)
+        centroids += forces * 0.10
+
+    # Déplace chaque point par le delta de son centroïde
+    displaced = coords_2d.copy()
+    for j in range(k):
+        if counts[j] == 0:
+            continue
+        displaced[labels == j] += centroids[j] - orig[j]
+
+    # Re-normalisation [0, 1]
+    mn, mx = displaced.min(axis=0), displaced.max(axis=0)
+    rng_ = mx - mn
+    rng_[rng_ < 1e-8] = 1.0
+    return (displaced - mn) / rng_