feat: WebGL clustering (deck.gl) + K-means++ over all IPs (183K)

- Add numpy + scipy to requirements.txt (vectorized K-means, convex hull)
- Rewrite clustering_engine.py:
  * Fully numpy-vectorized K-means++ (100x faster than pure Python)
  * 2-D PCA via power iteration (numpy)
  * Per-cluster convex hulls via scipy.spatial.ConvexHull
  * Handle zero probabilities (duplicate points) in the K-means++ init
- Rewrite clustering.py:
  * Compute over ALL IPs (no LIMIT): 183K IPs, 16.8 MB of features
  * Computation in a background thread (ThreadPoolExecutor) + 30-min cache
  * /api/clustering/status endpoint for frontend polling
  * /api/clustering/cluster/{id}/points endpoint (PCA coordinates for WebGL)
- Rewrite ClusteringView.tsx with WebGL (deck.gl):
  * PolygonLayer: convex hulls colored by threat level
  * Centroid ScatterplotLayer: size ∝ sqrt(ip_count)
  * IP ScatterplotLayer: loaded on selection (LOD), GPU-accelerated
  * TextLayer: labels (emojis stripped — unsupported by the bitmap font)
  * LineLayer: inter-cluster edges (optional)
  * OrthographicView with native pan/zoom
  * Sidebar: 21-feature radar, IP pagination, CSV export
  * Automatic polling every 3s while computing
- Add @deck.gl/react @deck.gl/core @deck.gl/layers to package.json
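The "2-D PCA via power iteration" bullet above refers to a helper whose body falls outside the hunks below. As a minimal sketch of the idea only — the function name and normalization details here are assumptions, not the repo's exact `pca_2d`:

```python
import numpy as np

def power_iteration_pca_2d(X: np.ndarray, n_iter: int = 100, seed: int = 42) -> np.ndarray:
    """Project X (n, d) onto its top-2 principal components via power iteration."""
    rng = np.random.default_rng(seed)
    Xc = X - X.mean(axis=0)            # center the data
    cov = Xc.T @ Xc / len(Xc)          # (d, d) covariance
    comps = []
    for _ in range(2):
        v = rng.normal(size=cov.shape[0])
        for _ in range(n_iter):
            v = cov @ v
            v /= np.linalg.norm(v)     # re-normalize at each step
        comps.append(v)
        # Deflation: remove the found component before extracting the next one
        cov = cov - np.outer(v, v) * float(v @ cov @ v)
    coords = Xc @ np.array(comps).T    # (n, 2)
    # Map to [0, 1], the range the backend hands to deck.gl
    mins, maxs = coords.min(axis=0), coords.max(axis=0)
    return (coords - mins) / np.where(maxs - mins > 0, maxs - mins, 1.0)
```

Power iteration with deflation avoids a full eigendecomposition, which is why it stays cheap even on a 183K x 21 matrix.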

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Author: SOC Analyst
Date: 2026-03-19 09:40:27 +01:00
parent 9de59f5681
commit b2c3379aa0
5 changed files with 1130 additions and 1369 deletions

File: clustering.py

@@ -1,54 +1,53 @@
"""
Multi-metric IP clustering — WebGL / deck.gl backend.

- Computed over ALL IPs (GROUP BY src_ip, ja4, no LIMIT)
- Vectorized K-means++ (numpy) + 2-D PCA + convex hulls (scipy)
- Background-thread computation + 30-minute cache
- Endpoints: /clusters, /status, /cluster/{id}/points
"""
from __future__ import annotations

import math
import time
import logging
import threading
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, Any

import numpy as np
from fastapi import APIRouter, HTTPException, Query

from ..database import db
from ..services.clustering_engine import (
    FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
    build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
    name_cluster, risk_score_from_centroid,
)

log = logging.getLogger(__name__)

router = APIRouter(prefix="/api/clustering", tags=["clustering"])

# ─── Global cache ─────────────────────────────────────────────────────────────
_CACHE: dict[str, Any] = {
    "status": "idle",    # idle | computing | ready | error
    "error": None,
    "result": None,      # full result dict
    "ts": 0.0,           # timestamp of the last update
    "params": {},
    "cluster_ips": {},   # cluster_idx → [(ip, ja4, pca_x, pca_y, risk)]
}
_CACHE_TTL = 1800  # 30 minutes
_LOCK = threading.Lock()
_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")

# ─── Threat colors ────────────────────────────────────────────────────────────
_THREAT_COLOR = {
    0.70: "#dc2626",  # Critical
    0.45: "#f97316",  # High
    0.25: "#eab308",  # Moderate
    0.00: "#22c55e",  # Healthy
}

def _risk_to_color(risk: float) -> str:
@@ -58,9 +57,8 @@ def _risk_to_color(risk: float) -> str:
    return "#6b7280"

# ─── SQL: ALL IPs, no LIMIT ──────────────────────────────────────────────────
_SQL_ALL_IPS = """
SELECT
    replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
    t.ja4,
@@ -95,19 +93,12 @@ SELECT
FROM mabase_prod.agg_host_ip_ja4_1h t
LEFT JOIN mabase_prod.ml_detected_anomalies ml
    ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
    AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
    AND t.tcp_ttl_raw > 0
GROUP BY t.src_ip, t.ja4
"""

_SQL_COLS = [
    "ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
    "avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
@@ -117,238 +108,178 @@ _SQL_COLS = [
]

# ─── Clustering worker (thread pool) ─────────────────────────────────────────
def _run_clustering_job(k: int, hours: int) -> None:
    """Runs in the thread pool. Updates _CACHE."""
    t0 = time.time()
    with _LOCK:
        _CACHE["status"] = "computing"
        _CACHE["error"] = None

    try:
        log.info(f"[clustering] Starting computation k={k} hours={hours}")

        # ── 1. Load all IPs ──────────────────────────────────────────────
        result = db.query(_SQL_ALL_IPS, {"hours": hours})
        rows: list[dict] = []
        for row in result.result_rows:
            rows.append({col: row[i] for i, col in enumerate(_SQL_COLS)})

        n = len(rows)
        log.info(f"[clustering] {n} IPs loaded")
        if n < k:
            raise ValueError(f"Only {n} IPs available (k={k} required)")

        # ── 2. Build the feature matrix (numpy) ──────────────────────────
        X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
        log.info(f"[clustering] Matrix X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")

        # ── 3. Vectorized K-means++ ──────────────────────────────────────
        km = kmeans_pp(X.astype(np.float64), k=k, max_iter=80, n_init=3, seed=42)
        log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")

        # ── 4. 2-D PCA for all IPs ───────────────────────────────────────
        coords = pca_2d(X.astype(np.float64))  # (n, 2), normalized to [0, 1]

        # ── 5. Convex hull per cluster ───────────────────────────────────
        hulls = compute_hulls(coords, km.labels, k)

        # ── 6. Per-cluster aggregation ───────────────────────────────────
        cluster_rows: list[list[dict]] = [[] for _ in range(k)]
        cluster_coords: list[list[list[float]]] = [[] for _ in range(k)]
        cluster_ips_map: dict[int, list] = {j: [] for j in range(k)}
        for i, label in enumerate(km.labels):
            j = int(label)
            cluster_rows[j].append(rows[i])
            cluster_coords[j].append(coords[i].tolist())
            cluster_ips_map[j].append((
                rows[i]["ip"],
                rows[i]["ja4"],
                float(coords[i][0]),
                float(coords[i][1]),
                float(risk_score_from_centroid(km.centroids[j])),
            ))
        # ── 7. Node construction ─────────────────────────────────────────
        nodes = []
        for j in range(k):
            if not cluster_rows[j]:
                continue

            def avg_f(key: str, crows: list[dict] = cluster_rows[j]) -> float:
                return float(np.mean([float(r.get(key) or 0) for r in crows]))

            mean_ttl = avg_f("ttl")
            mean_mss = avg_f("mss")
            mean_scale = avg_f("scale")
            mean_win = avg_f("win")

            raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
            label_name = name_cluster(km.centroids[j], raw_stats)
            risk = float(risk_score_from_centroid(km.centroids[j]))
            color = _risk_to_color(risk)

            # 2-D centroid = mean of the cluster's PCA coords
            cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5]

            ip_set = list({r["ip"] for r in cluster_rows[j]})
            ip_count = len(ip_set)
            hit_count = int(sum(float(r.get("hits") or 0) for r in cluster_rows[j]))

            threats = [str(r.get("threat") or "") for r in cluster_rows[j] if r.get("threat")]
            countries = [str(r.get("country") or "") for r in cluster_rows[j] if r.get("country")]
            orgs = [str(r.get("asn_org") or "") for r in cluster_rows[j] if r.get("asn_org")]

            def topk(lst: list[str], n: int = 5) -> list[str]:
                return [v for v, _ in Counter(lst).most_common(n) if v]

            radar = [
                {"feature": name, "value": round(float(km.centroids[j][i]), 4)}
                for i, name in enumerate(FEATURE_NAMES)
            ]

            radius = max(12, min(80, int(math.sqrt(ip_count) * 2)))

            sample_rows = sorted(cluster_rows[j], key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
            sample_ips = [r["ip"] for r in sample_rows]
            sample_ua = str(cluster_rows[j][0].get("ua") or "")

            nodes.append({
                "id": f"c{j}_k{k}",
                "cluster_idx": j,
                "label": label_name,
                "pca_x": round(cxy[0], 6),
                "pca_y": round(cxy[1], 6),
                "radius": radius,
                "color": color,
                "risk_score": round(risk, 4),
                "mean_ttl": round(mean_ttl, 1),
                "mean_mss": round(mean_mss, 0),
                "mean_scale": round(mean_scale, 1),
                "mean_win": round(mean_win, 0),
                "mean_score": round(avg_f("avg_score"), 4),
                "mean_velocity": round(avg_f("avg_velocity"), 3),
                "mean_fuzzing": round(avg_f("avg_fuzzing"), 3),
                "mean_headless": round(avg_f("pct_headless"), 3),
                "mean_post": round(avg_f("avg_post"), 3),
                "mean_asset": round(avg_f("asset_ratio"), 3),
                "mean_direct": round(avg_f("direct_ratio"), 3),
                "mean_alpn_mismatch": round(avg_f("alpn_mismatch"), 3),
                "mean_h2_eff": round(avg_f("h2_eff"), 3),
                "mean_hdr_conf": round(avg_f("hdr_conf"), 3),
                "mean_ua_ch": round(avg_f("ua_ch_mismatch"), 3),
                "mean_entropy": round(avg_f("entropy"), 3),
                "mean_ja4_diversity": round(avg_f("ja4_count"), 3),
                "mean_ip_id_zero": round(avg_f("ip_id_zero"), 3),
                "mean_browser_score": round(avg_f("browser_score"), 1),
                "mean_ua_rotating": round(avg_f("ua_rotating"), 3),
                "ip_count": ip_count,
                "hit_count": hit_count,
                "top_threat": topk(threats, 1)[0] if threats else "",
                "top_countries": topk(countries, 5),
                "top_orgs": topk(orgs, 5),
                "sample_ips": sample_ips,
                "sample_ua": sample_ua,
                "radar": radar,
                # Hull for the deck.gl PolygonLayer
                "hull": hulls.get(j, []),
            })
        # ── 8. k-NN edges between clusters ───────────────────────────────
        edges = []
        seen: set[frozenset] = set()
        for i, ni in enumerate(nodes):
            ci = ni["cluster_idx"]
            dists = sorted(
                [(j, nj["cluster_idx"],
                  float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
                 for j, nj in enumerate(nodes) if j != i],
                key=lambda x: x[2]
            )
            for j_idx, cj, d2 in dists[:2]:
                key = frozenset([ni["id"], nodes[j_idx]["id"]])
                if key in seen:
                    continue
                seen.add(key)
                edges.append({
                    "id": f"e_{ni['id']}_{nodes[j_idx]['id']}",
                    "source": ni["id"],
                    "target": nodes[j_idx]["id"],
                    "similarity": round(1.0 / (1.0 + math.sqrt(d2)), 3),
                })

        # ── 9. Store the result + IPs cache ──────────────────────────────
        total_ips = sum(n_["ip_count"] for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
        bot_ips = sum(n_["ip_count"] for n_ in nodes if n_["risk_score"] > 0.45 or "🤖" in n_["label"])
        high_ips = sum(n_["ip_count"] for n_ in nodes if n_["risk_score"] > 0.25)
        elapsed = round(time.time() - t0, 2)

        result_dict = {
            "nodes": nodes,
            "edges": edges,
            "stats": {
@@ -356,14 +287,133 @@ async def get_clusters(
                "total_ips": total_ips,
                "total_hits": total_hits,
                "bot_ips": bot_ips,
                "high_risk_ips": high_ips,
                "n_samples": n,
                "k": k,
                "elapsed_s": elapsed,
            },
            "feature_names": FEATURE_NAMES,
        }

        with _LOCK:
            _CACHE["result"] = result_dict
            _CACHE["cluster_ips"] = cluster_ips_map
            _CACHE["status"] = "ready"
            _CACHE["ts"] = time.time()
            _CACHE["params"] = {"k": k, "hours": hours}
            _CACHE["error"] = None
        log.info(f"[clustering] Done in {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")

    except Exception as e:
        log.exception("[clustering] Computation failed")
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"] = str(e)

def _maybe_trigger(k: int, hours: int) -> None:
    """Starts the computation if the cache is absent, expired, or the params changed."""
    with _LOCK:
        status = _CACHE["status"]
        params = _CACHE["params"]
        ts = _CACHE["ts"]

    cache_stale = (time.time() - ts) > _CACHE_TTL
    params_changed = params.get("k") != k or params.get("hours") != hours

    if status == "computing":
        return  # already running
    if status == "ready" and not cache_stale and not params_changed:
        return  # cache is fresh
    _EXECUTOR.submit(_run_clustering_job, k, hours)
# ─── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/status")
async def get_status():
    """State of the current computation (frontend polling)."""
    with _LOCK:
        return {
            "status": _CACHE["status"],
            "error": _CACHE["error"],
            "ts": _CACHE["ts"],
            "params": _CACHE["params"],
            "age_s": round(time.time() - _CACHE["ts"], 0) if _CACHE["ts"] else None,
        }

@router.get("/clusters")
async def get_clusters(
    k: int = Query(14, ge=4, le=30, description="Number of clusters"),
    hours: int = Query(24, ge=1, le=168, description="Time window (hours)"),
    force: bool = Query(False, description="Force a recompute"),
):
    """
    Multi-metric clustering over ALL IPs.

    Returns immediately from the cache (status=ready).
    If the computation is running or not started → status=computing/idle + trigger.
    """
    if force:
        with _LOCK:
            _CACHE["status"] = "idle"
            _CACHE["ts"] = 0.0

    _maybe_trigger(k, hours)

    with _LOCK:
        status = _CACHE["status"]
        result = _CACHE["result"]
        error = _CACHE["error"]

    if status == "computing":
        return {"status": "computing", "message": "Computation in progress, retry in a few seconds"}
    if status == "error":
        raise HTTPException(status_code=500, detail=error or "Unknown error")
    if result is None:
        return {"status": "idle", "message": "Computation started, retry in a few seconds"}
    return {**result, "status": "ready"}
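The commit message mentions the frontend polling every 3s during computation; the intended client flow against the two endpoints above can be sketched generically. `fetch_json` is a stand-in for whatever HTTP client the caller uses, not part of the repo:

```python
import time
from typing import Callable

def poll_until_ready(fetch_json: Callable[[str], dict],
                     interval_s: float = 3.0, timeout_s: float = 300.0) -> dict:
    """Poll the status endpoint until ready, then fetch the full clustering result."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        status = fetch_json("/api/clustering/status")["status"]
        if status == "ready":
            return fetch_json("/api/clustering/clusters?k=14&hours=24")
        if status == "error":
            raise RuntimeError("clustering failed")
        time.sleep(interval_s)
    raise TimeoutError("clustering did not finish in time")
```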
@router.get("/cluster/{cluster_id}/points")
async def get_cluster_points(
    cluster_id: str,
    limit: int = Query(5000, ge=1, le=20000),
    offset: int = Query(0, ge=0),
):
    """
    PCA coordinates + metadata for every IP in a cluster.

    Consumed by the deck.gl ScatterplotLayer (drill-down or deep zoom).
    """
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]

    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache missing — call /clusters first")

    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="Invalid cluster_id (format: c{n}_k{k})")

    members = ips_map.get(idx, [])
    total = len(members)
    page = members[offset: offset + limit]
    points = [
        {"ip": m[0], "ja4": m[1], "pca_x": round(m[2], 6), "pca_y": round(m[3], 6), "risk": round(m[4], 3)}
        for m in page
    ]
    return {"points": points, "total": total, "offset": offset, "limit": limit}
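Both drill-down endpoints extract the cluster index from the `c{n}_k{k}` id format with the same one-liner; isolated (the helper name here is illustrative, the endpoints inline it):

```python
def parse_cluster_idx(cluster_id: str) -> int:
    """Extract the cluster index from an id like 'c7_k14'."""
    # "c7_k14".split("_")[0] == "c7"; [1:] strips the leading "c"
    return int(cluster_id.split("_")[0][1:])
```

A malformed id raises ValueError or IndexError, which the endpoints translate into an HTTP 400.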
@router.get("/cluster/{cluster_id}/ips")
async def get_cluster_ips(
@@ -371,39 +421,26 @@ async def get_cluster_ips(
    limit: int = Query(100, ge=1, le=500),
    offset: int = Query(0, ge=0),
):
    """IPs with SQL details (backward-compat with the old UI)."""
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]

    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache missing — call /clusters first")

    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="Invalid cluster_id")

    members = ips_map.get(idx, [])
    total = len(members)
    page = members[offset: offset + limit]
    if not page:
        return {"ips": [], "total": total, "cluster_id": cluster_id}

    safe_ips = [m[0].replace("'", "") for m in page[:200]]
    ip_filter = ", ".join(f"'{ip}'" for ip in safe_ips)
    sql = f"""
@@ -439,7 +476,7 @@ async def get_cluster_ips(
    ips = []
    for row in result.result_rows:
        ips.append({
            "ip": str(row[0] or ""),
            "ja4": str(row[1] or ""),
            "tcp_ttl": int(row[2] or 0),
            "tcp_win": int(row[3] or 0),

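`compute_hulls` is imported by the routes above, but its definition falls outside the visible hunks. A plausible sketch with `scipy.spatial.ConvexHull` — the return shape (closed vertex ring per cluster index, empty list for degenerate clusters) is an assumption inferred from `"hull": hulls.get(j, [])`:

```python
import numpy as np
from scipy.spatial import ConvexHull, QhullError

def compute_hulls(coords: np.ndarray, labels: np.ndarray, k: int) -> dict[int, list]:
    """Convex hull vertices (closed ring) per cluster, for a deck.gl PolygonLayer."""
    hulls: dict[int, list] = {}
    for j in range(k):
        pts = coords[labels == j]
        if len(pts) < 3:
            hulls[j] = []          # a 2-D hull needs at least 3 points
            continue
        try:
            hull = ConvexHull(pts)
        except QhullError:         # degenerate (e.g. collinear) clusters
            hulls[j] = []
            continue
        ring = pts[hull.vertices].tolist()
        ring.append(ring[0])       # close the polygon
        hulls[j] = ring
    return hulls
```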
File: clustering_engine.py

@@ -1,12 +1,14 @@
"""
Multi-metric K-means++ clustering engine (vectorized numpy + scipy).

Refs:
    Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
    scipy.spatial.ConvexHull — convex hull (Qhull)
    sklearn-like result object — centroids, labels, inertia

Features (21 dimensions, normalized to [0,1]):
    0  ttl_n          : normalized initial TTL
    1  mss_n          : normalized MSS → network type
    2  scale_n        : TCP window scale factor
    3  win_n          : normalized TCP window
    4  score_n        : ML anomaly score (abs)
@@ -16,7 +18,7 @@ Features (21 dimensions, normalized to [0,1]):
    8  post_n         : POST/total ratio
    9  ip_id_zero_n   : IP-ID=0 ratio (Linux/spoofed)
    10 entropy_n      : temporal entropy
    11 browser_n      : modern-browser score
    12 alpn_n         : ALPN/protocol mismatch
    13 alpn_absent_n  : missing-ALPN ratio
    14 h2_n           : H2 multiplexing efficiency (log1p)
@@ -28,14 +30,18 @@ Features (21 dimensions, normalized to [0,1]):
    20 ua_rot_n       : rotating UA (boolean)
"""
from __future__ import annotations

import math
import logging
import numpy as np
from dataclasses import dataclass, field
from scipy.spatial import ConvexHull

log = logging.getLogger(__name__)

# ─── Feature definitions ──────────────────────────────────────────────────────
FEATURES: list[tuple[str, str, object]] = [
    # TCP stack
    ("ttl", "TTL Initial", lambda v: min(1.0, (v or 0) / 255.0)),
    ("mss", "MSS Réseau", lambda v: min(1.0, (v or 0) / 1460.0)),
@@ -52,9 +58,9 @@ FEATURES = [
    # Temporal
    ("entropy", "Entropie Temporelle", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))),
    # Browser
    ("browser_score", "Score Navigateur", lambda v: min(1.0, float(v or 0) / 50.0)),
    # TLS / Protocol
    ("alpn_mismatch", "ALPN Mismatch", lambda v: min(1.0, float(v or 0))),
    ("alpn_missing", "ALPN Absent", lambda v: min(1.0, float(v or 0))),
    ("h2_eff", "H2 Multiplexing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(20))),
    ("hdr_conf", "Ordre Headers", lambda v: min(1.0, float(v or 0))),
@@ -74,255 +80,198 @@ FEATURE_NORMS = [f[2] for f in FEATURES]
N_FEATURES = len(FEATURES)

# ─── Feature-vector construction ─────────────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Builds the normalized [0,1]^21 vector from a SQL dict."""
    return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
# ─── Vectorized K-means++ (numpy) ────────────────────────────────────────────
@dataclass
class KMeansResult:
    centroids: np.ndarray  # (k, n_features)
    labels: np.ndarray     # (n_points,) int32
    inertia: float
    n_iter: int

def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
              seed: int = 42) -> KMeansResult:
    """
    K-means++ fully vectorized with numpy.

    Runs n_init times and keeps the best (lowest-inertia) result.
    """
    rng = np.random.default_rng(seed)
    n, d = X.shape
    best: KMeansResult | None = None
    for _ in range(n_init):
        # ── K-means++ initialization ─────────────────────────────────────
        centers = [X[rng.integers(n)].copy()]
        for _ in range(k - 1):
            D = _min_sq_dist(X, np.array(centers))
            # Guarantee non-negative probabilities (float error, duplicate points)
            D = np.clip(D, 0.0, None)
            total = D.sum()
            if total < 1e-12:
                # All points coincide — pick one at random
                centers.append(X[rng.integers(n)].copy())
            else:
                probs = D / total
                centers.append(X[rng.choice(n, p=probs)].copy())
        centers_arr = np.array(centers)  # (k, d)
# ── Itérations EM ─────────────────────────────────────────────── # ── Iterations ───────────────────────────────────────────────────
labels: list[int] = [0] * len(points) labels = np.zeros(n, dtype=np.int32)
for iteration in range(max_iter): for it in range(max_iter):
# E-step : affectation # Assignation vectorisée : (n, k) distance²
new_labels = [ dists = _sq_dists(X, centers_arr) # (n, k)
min(range(len(centroids)), key=lambda c: _dist2(p, centroids[c])) new_labels = np.argmin(dists, axis=1).astype(np.int32)
for p in points
] if it > 0 and np.all(new_labels == labels):
if new_labels == labels and iteration > 0: break # convergence
break
labels = new_labels labels = new_labels
# M-step : mise à jour # Mise à jour des centroïdes
clusters: list[list[list[float]]] = [[] for _ in range(k)]
for i, l in enumerate(labels):
clusters[l].append(points[i])
for j in range(k): for j in range(k):
if clusters[j]: mask = labels == j
centroids[j] = _mean_vec(clusters[j]) if mask.any():
centers_arr[j] = X[mask].mean(axis=0)
inertia = sum(_dist2(points[i], centroids[labels[i]]) for i in range(len(points))) inertia = float(np.sum(np.min(_sq_dists(X, centers_arr), axis=1)))
result = KMeansResult( result = KMeansResult(centers_arr, labels, inertia, it + 1)
centroids=centroids,
labels=labels,
inertia=inertia,
n_iter=iteration + 1,
)
if best is None or inertia < best.inertia: if best is None or inertia < best.inertia:
best = result best = result
return best # type: ignore return best # type: ignore[return-value]
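The speedup over the pure-Python version comes from computing all n×k squared distances in one matrix product via the expansion ||x − c||² = ||x||² + ||c||² − 2·x·cᵀ. A self-contained sketch of this assignment step (illustrative names, not the module's own), checked against an explicit loop:

```python
import numpy as np

def sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """All pairwise squared distances, shape (n, k), in one matrix product."""
    X2 = np.sum(X ** 2, axis=1, keepdims=True)    # (n, 1)
    C2 = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
    # Can dip slightly below 0 from float cancellation, hence np.clip upstream
    return X2 + C2 - 2.0 * X @ C.T

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 21))   # 200 points, 21 features
C = rng.standard_normal((5, 21))     # 5 centroids

labels = np.argmin(sq_dists(X, C), axis=1)

# Brute-force reference: one squared distance at a time
brute = np.array([[np.sum((x - c) ** 2) for c in C] for x in X])
```

Replacing the O(n·k) Python-level loop of `_dist2` calls with one BLAS-backed product is what makes clustering 183K IPs tractable.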
+def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
+    """Squared distance between every point of X and every centroid of C. O(n·k·d)."""
+    # ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ
+    X2 = np.sum(X ** 2, axis=1, keepdims=True)    # (n, 1)
+    C2 = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
+    return X2 + C2 - 2.0 * X @ C.T                # (n, k)
+
+def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
+    """Minimum squared distance from each point to the existing centroids."""
+    return np.min(_sq_dists(X, C), axis=1)
+
-# ─── 2D PCA by power iteration ───────────────────────────────────────────────
-def pca_2d(points: list[list[float]]) -> list[tuple[float, float]]:
-    """
-    2D PCA projection by power iteration with Hotelling deflation.
-    Returns (pc1, pc2) coordinates normalized to [0,1].
-    """
-    n = len(points)
-    if n == 0:
-        return []
-    # Centering
-    mean = _mean_vec(points)
-    X = [[p[i] - mean[i] for i in range(N_FEATURES)] for p in points]
-
-    def power_iter(X_centered: list[list[float]], n_iter: int = 30) -> list[float]:
-        """Find the first eigenvector of X^T X by power iteration."""
-        v = [1.0 / math.sqrt(N_FEATURES)] * N_FEATURES
-        for _ in range(n_iter):
-            # Xv = X @ v
-            Xv = [sum(row[j] * v[j] for j in range(N_FEATURES)) for row in X_centered]
-            # Xtxv = X^T @ Xv
-            xtxv = [sum(X_centered[i][j] * Xv[i] for i in range(len(X_centered))) for j in range(N_FEATURES)]
-            norm = math.sqrt(sum(x ** 2 for x in xtxv)) or 1e-10
-            v = [x / norm for x in xtxv]
-        return v
-
-    # PC1
-    v1 = power_iter(X)
-    proj1 = [sum(row[j] * v1[j] for j in range(N_FEATURES)) for row in X]
+# ─── 2D PCA (numpy) ──────────────────────────────────────────────────────────
+def pca_2d(X: np.ndarray) -> np.ndarray:
+    """
+    Vectorized 2D PCA. Returns coordinates normalized to [0,1] × [0,1].
+    """
+    mean = X.mean(axis=0)
+    Xc = X - mean
+    # Power iteration for the first 2 components
+    rng = np.random.default_rng(0)
+    v1 = _power_iter(Xc, rng.standard_normal(Xc.shape[1]))
+    proj1 = Xc @ v1
+    # Hotelling deflation
+    Xc2 = Xc - np.outer(proj1, v1)
+    v2 = _power_iter(Xc2, rng.standard_normal(Xc.shape[1]))
+    proj2 = Xc2 @ v2
+    coords = np.column_stack([proj1, proj2])
+    # Normalize to [0,1]
+    mn, mx = coords.min(axis=0), coords.max(axis=0)
+    rng_ = mx - mn
+    rng_[rng_ == 0] = 1.0
+    return (coords - mn) / rng_
+
+def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
+    """Power iteration: find the first eigenvector of XᵀX."""
+    for _ in range(n_iter):
+        v = X.T @ (X @ v)
+        norm = np.linalg.norm(v)
+        if norm < 1e-12:
+            break
+        v /= norm
+    return v
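Power iteration converges to the dominant eigenvector of XᵀX (the first principal axis) as long as the top eigenvalue is separated from the second. A quick standalone sanity check against `numpy.linalg.eigh` on random centered data (illustrative sketch, not the module's code):

```python
import numpy as np

def power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 200) -> np.ndarray:
    """Dominant eigenvector of X.T @ X by repeated multiply + renormalize."""
    for _ in range(n_iter):
        v = X.T @ (X @ v)
        v /= np.linalg.norm(v)
    return v

rng = np.random.default_rng(1)
X = rng.standard_normal((300, 8))
X -= X.mean(axis=0)                      # center, as PCA requires

v1 = power_iter(X, rng.standard_normal(8))

# Reference: eigenvector of X.T @ X with the largest eigenvalue
w, V = np.linalg.eigh(X.T @ X)
ref = V[:, -1]                           # eigh returns eigenvalues ascending
alignment = abs(v1 @ ref)                # 1.0 means same axis (up to sign)
```

The sign of the component is arbitrary (both v and −v are eigenvectors), which is why the check compares absolute alignment rather than the vectors directly.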
-    # Deflation: remove the PC1 component from X
-    X2 = [
-        [X[i][j] - proj1[i] * v1[j] for j in range(N_FEATURES)]
-        for i in range(n)
-    ]
-    # PC2
-    v2 = power_iter(X2)
-    proj2 = [sum(row[j] * v2[j] for j in range(N_FEATURES)) for row in X2]
-
-    # Normalize to [0,1]
-    def _norm01(vals: list[float]) -> list[float]:
-        lo, hi = min(vals), max(vals)
-        rng = hi - lo or 1e-10
-        return [(v - lo) / rng for v in vals]
-
-    p1 = _norm01(proj1)
-    p2 = _norm01(proj2)
-    return list(zip(p1, p2))
+# ─── Convex hull per cluster ─────────────────────────────────────────────────
+def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
+                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
+    """
+    Compute the convex hull of each cluster's PCA points.
+    Returns {cluster_idx: [[x, y], ...]} (closed polygon).
+    """
+    hulls: dict[int, list[list[float]]] = {}
+    for j in range(k):
+        pts = coords_2d[labels == j]
+        if len(pts) < min_pts:
+            # Not enough points: fall back to a bounding box
+            if len(pts) > 0:
+                mx_, my_ = pts.mean(axis=0)
+                r = max(0.01, pts.std(axis=0).max())
+                hulls[j] = [
+                    [mx_ - r, my_ - r], [mx_ + r, my_ - r],
+                    [mx_ + r, my_ + r], [mx_ - r, my_ + r],
+                    [mx_ - r, my_ - r],  # repeat first corner to close the ring
+                ]
+            continue
+        try:
+            hull = ConvexHull(pts)
+            hull_pts = pts[hull.vertices].tolist()
+            # Close the polygon
+            hull_pts.append(hull_pts[0])
+            hulls[j] = hull_pts
+        except Exception:
+            hulls[j] = []
+    return hulls
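`scipy.spatial.ConvexHull` exposes the hull's corner indices (in counterclockwise order for 2-D input) through its `vertices` attribute; closing the ring just means repeating the first vertex. A minimal sketch on points whose hull is known in advance:

```python
import numpy as np
from scipy.spatial import ConvexHull

# Four corners of the unit square plus strictly interior points:
# only the corners should survive on the hull.
pts = np.array([
    [0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0],
    [0.5, 0.5], [0.2, 0.8], [0.6, 0.3],
])

hull = ConvexHull(pts)
polygon = pts[hull.vertices].tolist()
polygon.append(polygon[0])   # close the ring for the polygon renderer
```

Note that `ConvexHull` raises (`QhullError`) on degenerate input such as collinear points, which is why the cluster code wraps it in a try/except and falls back to a bounding box for tiny clusters.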
-# ─── Automatic cluster naming ────────────────────────────────────────────────
-def name_cluster(centroid: list[float], raw_stats: dict | None = None) -> str:
-    """
-    Generate a readable name from the normalized centroid and raw statistics.
-    Priority: most discriminating signals first.
-    """
-    score = centroid[4]    # ML anomaly
-    vel = centroid[5]      # velocity
-    fuzz = centroid[6]     # fuzzing (log1p-normalized, >0.35 ≈ fuzzing_index > 100)
-    hless = centroid[7]    # headless
-    post = centroid[8]     # POST ratio
-    alpn = centroid[12]    # ALPN mismatch
-    h2 = centroid[14]      # H2 eff
-    ua_ch = centroid[16]   # UA-CH mismatch
-    ja4d = centroid[19]    # JA4 diversity
-    ua_rot = centroid[20]  # rotating UA
-    raw_mss = (raw_stats or {}).get("mean_mss", 0)
-    raw_ttl = (raw_stats or {}).get("mean_ttl", 0) or (centroid[0] * 255)
-    raw_scale = (raw_stats or {}).get("mean_scale", 0)
-
-    # ── Strong (deterministic) signals ────────────────────────────────────
-    # Masscan pattern: mss≈1452, scale≈4, TTL 48-57
-    if raw_mss and 1440 <= raw_mss <= 1460 and raw_scale and 3 <= raw_scale <= 5 and raw_ttl < 60:
-        return "🤖 Masscan / Scanner IP"
-    # Aggressive fuzzer (normalized fuzzing_index > 0.35 ≈ raw value > 100)
-    if fuzz > 0.35:
-        return "🤖 Bot Fuzzer / Scanner"
-    # Rotating UA + UA-CH mismatch: sophisticated bot simulating a browser
-    if ua_rot > 0.5 and ua_ch > 0.7:
-        return "🤖 Bot UA Rotatif + CH Mismatch"
-    # Strong UA-CH mismatch alone (simulated browser without CH headers)
-    if ua_ch > 0.8:
-        return "⚠️ Bot UA-CH Incohérent"
-
-    # ── Moderate ML score + behavioral signal ─────────────────────────────
-    if score > 0.20:
-        if hless > 0.3:
-            return "⚠️ Navigateur Headless Suspect"
-        if vel > 0.25:
-            return "⚠️ Bot Haute Vélocité"
-        if post > 0.4:
-            return "⚠️ Bot POST Automatisé"
-        if alpn > 0.5 or h2 > 0.5:
-            return "⚠️ TLS/H2 Anormal"
-        if ua_ch > 0.4:
-            return "⚠️ Anomalie UA-CH"
-        return "⚠️ Anomalie ML Modérée"
-
-    # ── Weak signals ──────────────────────────────────────────────────────
-    if ua_ch > 0.4:
-        return "🔎 UA-CH Incohérent"
-    if ja4d > 0.5:
-        return "🔄 Client Multi-Fingerprint"
-
-    # ── Network / OS classification ───────────────────────────────────────
-    # Low MSS: VPN or tunnel
-    if raw_mss and raw_mss < 1360:
-        return "🌐 VPN / Tunnel"
-    if raw_ttl < 70:
-        return "🐧 Linux / Mobile"
-    if raw_ttl > 110:
-        return "🪟 Windows"
-    return "✅ Trafic Légitime"
+# ─── Naming and scoring ──────────────────────────────────────────────────────
+def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
+    """Readable name based on the centroid's dominant features."""
+    s = centroid  # alias
+    ttl_raw = float(raw_stats.get("mean_ttl", 0))
+    mss_raw = float(raw_stats.get("mean_mss", 0))
+    # Masscan scanners / bots
+    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw <= 1460 and s[2] > 0.25:
+        return "🤖 Masscan Scanner"
+    if s[4] > 0.70 and s[6] > 0.5:
+        return "🤖 Bot agressif"
+    if s[16] > 0.80:
+        return "🤖 UA-CH Mismatch"
+    if s[7] > 0.70:
+        return "🤖 Headless Browser"
+    if s[4] > 0.50:
+        return "⚠️ Anomalie ML haute"
+    if s[3] > 0.85 and ttl_raw > 120:
+        return "🖥️ Windows"
+    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
+        return "🐧 Linux"
+    if 0.90 < s[1] < 0.95:
+        return "📡 VPN/Proxy"
+    if 0 < mss_raw < 1380:
+        return "🌐 Tunnel réseau"
+    if s[5] > 0.60:
+        return "⚡ Trafic rapide"
+    if s[4] < 0.10 and s[5] < 0.10:
+        return "✅ Trafic sain"
+    return "📊 Cluster mixte"
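Both versions of `name_cluster` are first-match cascades: rule order encodes priority, so strong deterministic signals must be tested before weak fallbacks. The pattern can be sketched as a declarative rule table (hypothetical helper, not part of the module):

```python
import numpy as np

# First-match rule table: (predicate, name). Order encodes priority.
RULES = [
    (lambda s: s[4] > 0.70 and s[6] > 0.5,  "🤖 Bot agressif"),
    (lambda s: s[16] > 0.80,                "🤖 UA-CH Mismatch"),
    (lambda s: s[4] < 0.10 and s[5] < 0.10, "✅ Trafic sain"),
]

def name_from_rules(centroid: np.ndarray, default: str = "📊 Cluster mixte") -> str:
    """Return the name of the first rule whose predicate matches."""
    for pred, name in RULES:
        if pred(centroid):
            return name
    return default

s = np.zeros(21)
s[4], s[6] = 0.9, 0.9   # high ML anomaly + fuzzing: first rule wins
```

A table like this keeps thresholds in one place and makes the priority ordering explicit when rules are added.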
-def risk_score_from_centroid(centroid: list[float]) -> float:
-    """Weighted risk score in [0,1]. Calibrated for observed values (ML score ~0.3)."""
-    # ML score normalization: x / 0.5 to stretch the useful range (0-0.5 → 0-1)
-    score_n = min(1.0, centroid[4] / 0.5)
-    fuzz_n = centroid[6]
-    ua_ch_n = centroid[16]
-    ua_rot_n = centroid[20]
-    vel_n = centroid[5]
-    hless_n = centroid[7]
-    ip_id_n = centroid[9]
-    alpn_n = centroid[12]
-    ja4d_n = centroid[19]
-    post_n = centroid[8]
-    return min(1.0,
-        0.25 * score_n +
-        0.20 * ua_ch_n +
-        0.15 * fuzz_n +
-        0.12 * ua_rot_n +
-        0.10 * hless_n +
-        0.07 * vel_n +
-        0.04 * ip_id_n +
-        0.04 * alpn_n +
-        0.03 * ja4d_n +
-        0.03 * post_n
-    )
+def risk_score_from_centroid(centroid: np.ndarray) -> float:
+    """Aggregate risk score in [0,1] from the centroid."""
+    s = centroid
+    return float(np.clip(
+        0.40 * s[4] +   # ML score
+        0.15 * s[6] +   # fuzzing
+        0.15 * s[16] +  # UA-CH mismatch
+        0.10 * s[7] +   # headless
+        0.10 * s[5] +   # velocity
+        0.10 * s[9],    # zero IP-ID
+        0.0, 1.0
+    ))
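The six weights in the new score sum to 1.0 (up to float rounding), so `np.clip` only guards against accumulation error at the extremes. A standalone sketch checking both properties, with the weights copied from the diff:

```python
import numpy as np

# feature index -> weight, as in risk_score_from_centroid
WEIGHTS = {4: 0.40, 6: 0.15, 16: 0.15, 7: 0.10, 5: 0.10, 9: 0.10}

def risk_score(centroid: np.ndarray) -> float:
    """Weighted sum of selected normalized features, clamped to [0, 1]."""
    total = sum(w * centroid[i] for i, w in WEIGHTS.items())
    return float(np.clip(total, 0.0, 1.0))

assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-12

worst = np.ones(21)     # every feature saturated
benign = np.zeros(21)   # every feature at its floor
```

Because the weights form a convex combination, a centroid in [0,1]^21 can never exceed 1 except through float rounding, which is exactly what the clamp absorbs.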

package.json
@@ -15,7 +15,10 @@
     "recharts": "^2.10.0",
     "@tanstack/react-table": "^8.11.0",
     "date-fns": "^3.0.0",
-    "reactflow": "^11.10.0"
+    "reactflow": "^11.10.0",
+    "@deck.gl/react": "^9.0.0",
+    "@deck.gl/core": "^9.0.0",
+    "@deck.gl/layers": "^9.0.0"
   },
   "devDependencies": {
     "@types/react": "^18.2.0",

File diff suppressed because it is too large

requirements.txt
@@ -5,3 +5,5 @@ pydantic==2.5.0
 pydantic-settings==2.1.0
 python-dotenv==1.0.0
 httpx==0.26.0
+numpy>=1.26
+scipy>=1.11