feat(clustering): palette diversifiée, suppression scores anomalie/robot, visualisation éclatée

- Suppression de « Score Anomalie » (avg_score) des features de clustering (31 → 30)
- Suppression de « Score de détection robot » (mean_score) de la sidebar et de l'API
- Suppression de bot_ips / high_risk_ips des stats (métriques dérivées des scores supprimés)
- Redistribution du poids de l'ancien score d'anomalie (0.25) dans risk_score_from_centroid : UA-CH mismatch 7 → 17 %, fuzzing 9 → 14 %, headless 6 → 10 %, vélocité 5 → 9 %, ip_id_zero 5 → 7 %
- Mise à jour des indices de features dans name_cluster et risk_score_from_centroid
- Palette de 24 couleurs spectrales (cluster_color) : bleu/violet/rose/teal/amber/cyan/lime… Les couleurs identifient les clusters, non leur niveau de risque
- Remplacement de la légende CRITICAL/HIGH/MEDIUM/LOW par la liste des clusters actifs
- Ajout de spread_clusters() : répulsion itérative des centroïdes trop proches (appelée avec n_iter=60, min_dist=0.16 ; valeurs par défaut 50 et 0.14) → les clusters se repoussent mutuellement, visualisation plus lisible
- Interface TypeScript mise à jour (suppression de mean_score, bot_ips, high_risk_ips)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 14:01:14 +01:00
parent da6fef87fd
commit 08d003a050
3 changed files with 160 additions and 86 deletions
--- a/backend/routes/clustering.py
+++ b/backend/routes/clustering.py
@ -24,6 +24,7 @@ from ..services.clustering_engine import (
    FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
    build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
    name_cluster, risk_score_from_centroid, standardize,
    cluster_color, spread_clusters,
 )
 log = logging.getLogger(__name__)
@ -42,19 +43,9 @@ _CACHE_TTL = 1800  # 30 minutes
 _LOCK = threading.Lock()
 _EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")
-# ─── Couleurs menace ──────────────────────────────────────────────────────────
+# ─── Palette de couleurs (remplace l'ancienne logique menace) ─────────────────
-_THREAT_COLOR = {
+# Les couleurs sont désormais attribuées par index de cluster pour maximiser
-    0.70: "#dc2626",   # Critique
+# la distinction visuelle, indépendamment du niveau de risque.
-    0.45: "#f97316",   # Élevé
-    0.25: "#eab308",   # Modéré
-    0.00: "#22c55e",   # Sain
-}
-def _risk_to_color(risk: float) -> str:
-    for threshold, color in sorted(_THREAT_COLOR.items(), reverse=True):
-        if risk >= threshold:
-            return color
-    return "#6b7280"
 # ─── SQL : TOUTES les IPs sans LIMIT ─────────────────────────────────────────
@ -205,7 +196,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
        # ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
        coords = pca_2d(X64)   # (n, 2), normalisé [0,1]
-        # ── 5b. Enveloppes convexes par cluster ──────────────────────────
+        # ── 5b. Dispersion — repousse les clusters trop proches ──────────
        coords = spread_clusters(coords, km.labels, k_actual,
                                 n_iter=60, min_dist=0.16)
        # ── 5c. Enveloppes convexes par cluster ──────────────────────────
        hulls = compute_hulls(coords, km.labels, k_actual)
        # ── 6. Agrégation par cluster ─────────────────────────────────────
@ -242,7 +237,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
            raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
            label_name = name_cluster(centroids_orig[j], raw_stats)
            risk       = float(risk_score_from_centroid(centroids_orig[j]))
-            color      = _risk_to_color(risk)
+            color      = cluster_color(j)
            # Centroïde 2D = moyenne des coords du cluster
            cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5]
@ -282,7 +277,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
                "mean_mss":     round(mean_mss, 0),
                "mean_scale":   round(mean_scale, 1),
                "mean_win":     round(mean_win, 0),
-                "mean_score":   round(avg_f("avg_score"),   4),
                "mean_velocity":round(avg_f("avg_velocity"),3),
                "mean_fuzzing": round(avg_f("avg_fuzzing"), 3),
                "mean_headless":round(avg_f("pct_headless"),3),
@ -338,8 +332,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
        # ── 9. Stockage résultat + cache IPs ─────────────────────────────
        total_ips  = sum(n_["ip_count"]  for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
-        bot_ips    = sum(n_["ip_count"]  for n_ in nodes if n_["risk_score"] > 0.45 or "🤖" in n_["label"])
-        high_ips   = sum(n_["ip_count"]  for n_ in nodes if n_["risk_score"] > 0.25)
        elapsed    = round(time.time() - t0, 2)
        result_dict = {
@ -349,8 +341,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
                "total_clusters": len(nodes),
                "total_ips":      total_ips,
                "total_hits":     total_hits,
-                "bot_ips":        bot_ips,
-                "high_risk_ips":  high_ips,
                "n_samples":      n,
                "k":              k_actual,
                "k_base":         k,
--- a/backend/services/clustering_engine.py
+++ b/backend/services/clustering_engine.py
@ -127,9 +127,7 @@ FEATURES: list[tuple[str, str, object]] = [
    ("scale",         "Scale TCP",             lambda v: min(1.0, (v or 0) / 14.0)),
    ("win",           "Fenêtre TCP",           lambda v: min(1.0, (v or 0) / 65535.0)),
    # Anomalie ML
-    ("avg_score",     "Score Anomalie",        lambda v: min(1.0, float(v or 0))),
+    ("avg_velocity",  "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),    ("avg_fuzzing",   "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
-    ("avg_velocity",  "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),
-    ("avg_fuzzing",   "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
    ("pct_headless",  "Headless",              lambda v: min(1.0, float(v or 0))),
    ("avg_post",      "Ratio POST",            lambda v: min(1.0, float(v or 0))),
    # IP-ID
@ -353,51 +351,48 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    n = len(s)
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
-    country_risk_v = s[21] if n > 21 else 0.0
+    country_risk_v = s[20] if n > 20 else 0.0
-    asn_cloud      = s[22] if n > 22 else 0.0
+    asn_cloud      = s[21] if n > 21 else 0.0
-    accept_lang    = s[23] if n > 23 else 1.0
+    accept_lang    = s[22] if n > 22 else 1.0
-    accept_enc     = s[24] if n > 24 else 1.0
+    accept_enc     = s[23] if n > 23 else 1.0
-    sec_fetch      = s[25] if n > 25 else 0.0
+    sec_fetch      = s[24] if n > 24 else 0.0
-    hdr_count      = s[26] if n > 26 else 0.5
+    hdr_count      = s[25] if n > 25 else 0.5
-    hfp_popular    = s[27] if n > 27 else 0.5
+    hfp_popular    = s[26] if n > 26 else 0.5
-    hfp_rotating   = s[28] if n > 28 else 0.0
+    hfp_rotating   = s[27] if n > 27 else 0.0
    # Scanner pur : aucun header browser, fingerprint rare, peu de headers
    if accept_lang < 0.15 and accept_enc < 0.15 and hdr_count < 0.25:
        return "🤖 Scanner pur (no headers)"
-    # Fingerprint tournant ET suspect : bot qui change de profil headers
+    # Fingerprint tournant : bot qui change de profil headers
-    if hfp_rotating > 0.6 and s[4] > 0.15:
+    if hfp_rotating > 0.6:
        return "🔄 Bot fingerprint tournant"
-    # Fingerprint très rare et anomalie : bot artisanal unique
+    # Fingerprint très rare : bot artisanal unique
-    if hfp_popular < 0.15 and s[4] > 0.20:
+    if hfp_popular < 0.15:
        return "🕵️ Fingerprint rare suspect"
    # Scanners Masscan
    if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25:
        return "🤖 Masscan Scanner"
-    # Bots offensifs agressifs (fuzzing + anomalie)
+    # Bots offensifs agressifs (fuzzing élevé)
-    if s[4] > 0.40 and s[6] > 0.3:
+    if s[4] > 0.40 and s[5] > 0.3:
        return "🤖 Bot agressif"
    # Bot qui simule un navigateur mais sans les vrais headers
-    if s[16] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3:
+    if s[15] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3:
        return "🤖 Bot UA simulé"
-    # Pays à très haut risque avec trafic anormal
+    # Pays à très haut risque avec infrastructure cloud
-    if country_risk_v > 0.75 and (s[4] > 0.10 or asn_cloud > 0.5):
+    if country_risk_v > 0.75 and asn_cloud > 0.5:
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch
-    if s[16] > 0.50 and asn_cloud > 0.70:
+    if s[15] > 0.50 and asn_cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
-    if s[16] > 0.60:
+    if s[15] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser (Puppeteer/Playwright) : a les headers Sec-Fetch mais headless
-    if s[7] > 0.50 and sec_fetch > 0.5:
+    if s[6] > 0.50 and sec_fetch > 0.5:
        return "🤖 Headless Browser"
-    if s[7] > 0.50:
+    if s[6] > 0.50:
        return "🤖 Headless (no Sec-Fetch)"
-    # Anomalie ML significative
-    if s[4] > 0.35:
-        return "⚠️ Anomalie ML"
    # Cloud pur (CDN/crawler légitime ?)
-    if asn_cloud > 0.85 and s[4] < 0.15:
+    if asn_cloud > 0.85:
        return "☁️ Infrastructure cloud"
    # Pays à risque élevé sans autre signal
    if country_risk_v > 0.60:
@ -413,9 +408,9 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
        return "🐧 Linux"
    if mss_raw < 1380 and mss_raw > 0:
        return "🌐 Tunnel réseau"
-    if s[5] > 0.40:
+    if s[4] > 0.40:
        return "⚡ Trafic rapide"
-    if s[4] < 0.08 and s[5] < 0.10 and asn_cloud < 0.30:
+    if s[4] < 0.10 and asn_cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
@ -423,34 +418,130 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
 def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """
    Score de risque [0,1] depuis le centroïde (espace original [0,1]).
-    31 features — poids calibrés pour sommer à 1.0.
+    30 features (avg_score supprimé) — poids calibrés pour sommer à 1.0.
    Indices décalés de -1 après suppression de avg_score (ancien idx 4).
    """
    s = centroid
    n = len(s)
-    country_risk_v = s[21] if n > 21 else 0.0
+    country_risk_v = s[20] if n > 20 else 0.0
-    asn_cloud      = s[22] if n > 22 else 0.0
+    asn_cloud      = s[21] if n > 21 else 0.0
-    no_accept_lang = 1.0 - (s[23] if n > 23 else 1.0)
+    no_accept_lang = 1.0 - (s[22] if n > 22 else 1.0)
-    no_encoding    = 1.0 - (s[24] if n > 24 else 1.0)
+    no_encoding    = 1.0 - (s[23] if n > 23 else 1.0)
-    no_sec_fetch   = 1.0 - (s[25] if n > 25 else 0.0)
+    no_sec_fetch   = 1.0 - (s[24] if n > 24 else 0.0)
-    few_headers    = 1.0 - (s[26] if n > 26 else 0.5)
+    few_headers    = 1.0 - (s[25] if n > 25 else 0.5)
-    # Fingerprint rare = suspect (faible popularité), fingerprint tournant = bot
+    hfp_rare       = 1.0 - (s[26] if n > 26 else 0.5)
-    hfp_rare       = 1.0 - (s[27] if n > 27 else 0.5)
+    hfp_rotating   = s[27] if n > 27 else 0.0
-    hfp_rotating   = s[28] if n > 28 else 0.0
    # [4]=vélocité  [5]=fuzzing  [6]=headless  [8]=ip_id_zero  [15]=ua_ch_mismatch
    # Poids redistribués depuis l'ancien score ML anomalie (0.25) vers les signaux restants
    return float(np.clip(
-        0.25 * s[4]          +   # score ML anomalie (principal)
+        0.14 * s[5]          +   # fuzzing
-        0.09 * s[6]          +   # fuzzing
+        0.17 * s[15]         +   # UA-CH mismatch (fort signal impersonation navigateur)
-        0.07 * s[16]         +   # UA-CH mismatch
+        0.10 * s[6]          +   # headless
-        0.06 * s[7]          +   # headless
+        0.09 * s[4]          +   # vélocité (rps)
-        0.05 * s[5]          +   # vélocité
+        0.07 * s[8]          +   # IP-ID zéro
-        0.05 * s[9]          +   # IP-ID zéro
        0.09 * country_risk_v+   # risque pays source
        0.06 * asn_cloud     +   # infrastructure cloud/VPN
        0.04 * no_accept_lang+   # absence Accept-Language
        0.04 * no_encoding   +   # absence Accept-Encoding
-        0.04 * no_sec_fetch  +   # absence Sec-Fetch (pas un vrai navigateur)
+        0.04 * no_sec_fetch  +   # absence Sec-Fetch
-        0.04 * few_headers   +   # très peu de headers (scanner/curl)
+        0.04 * few_headers   +   # très peu de headers
-        0.06 * hfp_rare      +   # fingerprint headers rare = suspect
+        0.06 * hfp_rare      +   # fingerprint rare = suspect
        0.06 * hfp_rotating,     # rotation de fingerprint = bot
        0.0, 1.0
    ))
 # ─── Diversified colour palette (unrelated to risk) ──────────────────────────
 # 24 colours used purely to tell clusters apart visually; a colour carries no
 # information about how risky a cluster is.
 # NOTE(review): the original author describes these as hues spaced ~15° apart
 # with alternating saturation/lightness — not checked against the hex values.
 _CLUSTER_PALETTE: list[str] = [
    "#3b82f6",  # blue
    "#8b5cf6",  # violet
    "#ec4899",  # pink
    "#14b8a6",  # teal
    "#f59e0b",  # amber
    "#06b6d4",  # cyan
    "#a3e635",  # lime
    "#f97316",  # orange
    "#6366f1",  # indigo
    "#10b981",  # emerald
    "#e879f9",  # fuchsia
    "#fbbf24",  # yellow
    "#60a5fa",  # light blue
    "#c084fc",  # light purple
    "#fb7185",  # rose
    "#34d399",  # light green
    "#38bdf8",  # sky
    "#a78bfa",  # lavender
    "#fdba74",  # peach
    "#4ade80",  # green
    "#f472b6",  # light pink
    "#67e8f9",  # light cyan
    "#d97706",  # dark amber
    "#7c3aed",  # dark violet
 ]
 def cluster_color(cluster_idx: int) -> str:
    """Return the palette colour for a cluster index, wrapping around the palette."""
    # Modulo keeps any integer (including negatives) inside the palette bounds.
    slot = cluster_idx % len(_CLUSTER_PALETTE)
    return _CLUSTER_PALETTE[slot]
 # ─── Dispersion des clusters dans l'espace 2D ────────────────────────────────
 def spread_clusters(coords_2d: np.ndarray, labels: np.ndarray, k: int,
                     n_iter: int = 50, min_dist: float = 0.14) -> np.ndarray:
    """
    Push apart cluster centroids that lie too close together in the 2D layout.

    Centroids closer than ``min_dist`` repel each other over ``n_iter``
    iterations. Every point is then translated by the total displacement of
    its own cluster centroid, and the result is min-max rescaled per axis.

    Parameters
    ----------
    coords_2d : array-like of shape (n, 2); the input is not modified.
    labels    : array-like of length n holding each point's cluster index.
                Points whose label falls outside ``0..k-1`` are never shifted
                but still take part in the final rescaling.
    k         : number of cluster slots; slots without any point are ignored.
    n_iter    : number of repulsion iterations.
    min_dist  : desired minimum distance between centroids; raise it to
                spread the layout further.

    Returns
    -------
    A new array shaped like ``coords_2d`` with each axis rescaled to [0, 1]
    (an axis with zero extent maps to 0). Empty input yields an empty copy.
    """
    coords_2d = np.asarray(coords_2d)
    labels = np.asarray(labels)
    # The min/max re-normalisation below cannot reduce over an empty array,
    # so there is nothing to lay out in that case.
    if coords_2d.size == 0:
        return coords_2d.copy()

    # Fixed seed: the jitter applied to coincident centroids stays reproducible.
    rng = np.random.default_rng(0)
    centroids = np.zeros((k, 2))
    counts = np.zeros(k, dtype=int)
    for j in range(k):
        mask = labels == j
        if mask.any():
            centroids[j] = coords_2d[mask].mean(axis=0)
            counts[j] = int(mask.sum())
    orig = centroids.copy()
    for _ in range(n_iter):
        forces = np.zeros_like(centroids)
        for i in range(k):
            if counts[i] == 0:
                continue
            for j in range(k):
                if i == j or counts[j] == 0:
                    continue
                delta = centroids[i] - centroids[j]
                dist = float(np.linalg.norm(delta))
                if dist < 1e-8:
                    # Coincident centroids have no direction to separate
                    # along: pick a small random one instead.
                    delta = rng.uniform(-0.02, 0.02, size=2)
                    dist = float(np.linalg.norm(delta)) + 1e-8
                if dist < min_dist:
                    # Force grows linearly with the shortfall below min_dist:
                    # 0 at min_dist, approaching 1 as the distance goes to 0.
                    magnitude = (min_dist - dist) / min_dist
                    forces[i] += magnitude * (delta / dist)
        # Damped step: apply only a tenth of the accumulated force per iteration.
        centroids += forces * 0.10
    # Shift every point by the displacement of its own centroid.
    displaced = coords_2d.copy()
    for j in range(k):
        if counts[j] == 0:
            continue
        displaced[labels == j] += centroids[j] - orig[j]
    # Re-normalise to [0, 1]; a degenerate axis is divided by 1 instead of 0.
    mn, mx = displaced.min(axis=0), displaced.max(axis=0)
    rng_ = mx - mn
    rng_[rng_ < 1e-8] = 1.0
    return (displaced - mn) / rng_
--- a/frontend/src/components/ClusteringView.tsx
+++ b/frontend/src/components/ClusteringView.tsx
@ -35,7 +35,6 @@ interface ClusterNode {
  hit_count: number;
  mean_ttl: number;
  mean_mss: number;
-  mean_score: number;
  mean_velocity: number;
  mean_fuzzing: number;
  mean_headless: number;
@ -60,8 +59,6 @@ interface ClusterStats {
  total_clusters: number;
  total_ips: number;
  total_hits: number;
-  bot_ips: number;
-  high_risk_ips: number;
  n_samples: number;
  k: number;
  elapsed_s: number;
@ -427,8 +424,6 @@ export default function ClusteringView() {
            <div className="font-semibold text-sm mb-2">Résultats</div>
            <Stat label="Clusters" value={data.stats.total_clusters} tooltip={TIPS.k_actual} />
            <Stat label="IPs totales" value={data.stats.total_ips.toLocaleString()} tooltip={TIPS.pca_2d} />
-            <Stat label="IPs bots 🤖" value={data.stats.bot_ips.toLocaleString()} color="text-red-400" tooltip={TIPS.ips_bots} />
-            <Stat label="Risque élevé" value={data.stats.high_risk_ips.toLocaleString()} color="text-orange-400" tooltip={TIPS.high_risk} />
            <Stat label="Hits totaux" value={data.stats.total_hits.toLocaleString()} tooltip={TIPS.total_hits} />
            <Stat label="Calcul" value={`${data.stats.elapsed_s}s`} tooltip={TIPS.calc_time} />
          </div>
@ -504,7 +499,7 @@ export default function ClusteringView() {
            </div>
            <p className="text-white font-semibold text-lg tracking-wide">Clustering en cours…</p>
            <p className="text-text-secondary text-sm mt-1">
-              K-means++ · 31 features · {Math.round(k * sensitivity)} clusters · toutes les IPs
+              K-means++ · 30 features · {Math.round(k * sensitivity)} clusters · toutes les IPs
            </p>
            <p className="text-text-disabled text-xs mt-2 animate-pulse">Mise à jour automatique toutes les 3 secondes</p>
          </div>
@ -527,19 +522,18 @@ export default function ClusteringView() {
            {/* Légende overlay */}
            <div style={{ position: 'absolute', bottom: 16, left: 16, pointerEvents: 'all' }}>
              <div className="bg-black/70 rounded-lg p-2 text-xs flex flex-col gap-1">
-                {([
+                <div className="text-white/50 text-[10px] uppercase tracking-wide mb-1">Clusters</div>
-                  ['#dc2626', 'CRITICAL', TIPS.risk_critical],
+                {data?.nodes?.slice(0, 6).map((n) => (
-                  ['#f97316', 'HIGH',     TIPS.risk_high],
+                  <div key={n.id} className="flex items-center gap-2">
-                  ['#eab308', 'MEDIUM',   TIPS.risk_medium],
+                    <span className="w-3 h-3 rounded-full flex-shrink-0" style={{ background: n.color }} />
-                  ['#22c55e', 'LOW',      TIPS.risk_low],
+                    <span className="text-white/70 truncate max-w-[120px]">{n.label}</span>
-                ] as const).map(([c, l, tip]) => (
-                  <div key={l} className="flex items-center gap-2" title={tip}>
-                    <span className="w-3 h-3 rounded-full flex-shrink-0" style={{ background: c }} />
-                    <span className="text-white/80 cursor-help">{l}</span>
                  </div>
                ))}
                {(data?.nodes?.length ?? 0) > 6 && (
                  <div className="text-white/30 text-[10px]">+{(data?.nodes?.length ?? 0) - 6} autres…</div>
                )}
                <div className="mt-1 pt-1 border-t border-white/10 text-white/40 text-[10px] cursor-help" title={TIPS.features_31}>
-                  31 features · PCA 2D ⓘ
+                  30 features · PCA 2D ⓘ
                </div>
              </div>
            </div>
@ -666,7 +660,6 @@ function ClusterSidebar({ node, ipDetails, ipTotal, ipPage, clusterPoints, onClo
          <div className="font-semibold mb-2">Stack TCP</div>
          <Stat label="TTL moyen" value={node.mean_ttl} tooltip={TIPS.mean_ttl} />
          <Stat label="MSS moyen" value={node.mean_mss} tooltip={TIPS.mean_mss} />
-          <Stat label="Score ML" value={`${(node.mean_score * 100).toFixed(1)}%`} tooltip={TIPS.mean_score} />
          <Stat label="Vélocité" value={node.mean_velocity?.toFixed ? `${node.mean_velocity.toFixed(2)} rps` : '-'} tooltip={TIPS.mean_velocity} />
          <Stat label="Headless" value={node.mean_headless ? `${(node.mean_headless * 100).toFixed(0)}%` : '-'} tooltip={TIPS.mean_headless} />
          <Stat label="UA-CH Mismatch" value={node.mean_ua_ch ? `${(node.mean_ua_ch * 100).toFixed(0)}%` : '-'} tooltip={TIPS.mean_ua_ch} />