feat: WebGL clustering (deck.gl) + K-means++ sur toutes les IPs (183K)

- Ajout de numpy + scipy à requirements.txt (K-means vectorisé, convex hull)
- Réécriture de clustering_engine.py :
  * K-means++ entièrement vectorisé numpy (100x plus rapide que pur Python)
  * PCA-2D par power iteration (numpy)
  * Enveloppes convexes par cluster via scipy.spatial.ConvexHull
  * Traitement des probabilités nulles (points dupliqués) en K-means++ init
- Réécriture de clustering.py :
  * Calcul sur la TOTALITÉ des IPs (sans LIMIT) : 183K IPs, 16.8 MB features
  * Calcul en background thread (ThreadPoolExecutor) + cache 30 min
  * Endpoint /api/clustering/status pour polling frontend
  * Endpoint /api/clustering/cluster/{id}/points (coordonnées PCA pour WebGL)
- Réécriture de ClusteringView.tsx en WebGL (deck.gl) :
  * PolygonLayer : enveloppes convexes colorées par niveau de menace
  * ScatterplotLayer centroïdes : taille ∝ sqrt(ip_count)
  * ScatterplotLayer IPs : chargé sur sélection (LOD), GPU-accelerated
  * TextLayer : labels (emojis strippés — non supportés par bitmap font)
  * LineLayer : arêtes inter-clusters (optionnel)
  * OrthographicView avec pan/zoom natif
  * Sidebar : radar 21 features, pagination IPs, export CSV
  * Polling automatique toutes les 3s pendant le calcul
- Ajout de @deck.gl/react @deck.gl/core @deck.gl/layers à package.json

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 09:40:27 +01:00
parent 9de59f5681
commit b2c3379aa0
5 changed files with 1130 additions and 1369 deletions
--- a/backend/routes/clustering.py
+++ b/backend/routes/clustering.py
@ -1,54 +1,53 @@
 """
-Clustering d'IPs multi-métriques — backend ReactFlow.
+Clustering d'IPs multi-métriques — WebGL / deck.gl backend.
-Features utilisées (21 dimensions) :
+- Calcul sur la TOTALITÉ des IPs (GROUP BY src_ip, ja4 sans LIMIT)
-  TCP stack    : TTL initial, MSS, scale, fenêtre TCP
+- K-means++ vectorisé (numpy) + PCA-2D + enveloppes convexes (scipy)
-  Comportement : vélocité, POST ratio, fuzzing, assets, accès direct
+- Calcul en background thread + cache 30 min
-  Anomalie ML  : score, IP-ID zéro
+- Endpoints : /clusters, /status, /cluster/{id}/points
  TLS/Protocole: ALPN mismatch, ALPN absent, efficacité H2
  Navigateur   : browser score, headless, ordre headers, UA-CH mismatch
  Temporel     : entropie, diversité JA4, UA rotatif
 Algorithme :
  1. Échantillonnage stratifié (top détections + top hits)
  2. Construction + normalisation des vecteurs de features
  3. K-means++ (Arthur & Vassilvitskii, 2007)
  4. PCA-2D par power iteration pour les positions ReactFlow
  5. Nommage automatique par features dominantes du centroïde
  6. Calcul des arêtes : k-NN dans l'espace des features
 """
 from __future__ import annotations
 import math
 import time
-import hashlib
+import logging
-from typing import Optional
+import threading
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional, Any
 import numpy as np
 from fastapi import APIRouter, HTTPException, Query
 from ..database import db
 from ..services.clustering_engine import (
-    FEATURES, FEATURE_KEYS, FEATURE_NORMS, FEATURE_NAMES, N_FEATURES,
+    FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
-    build_feature_vector, kmeans_pp, pca_2d,
+    build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
-    name_cluster, risk_score_from_centroid, _mean_vec,
+    name_cluster, risk_score_from_centroid,
 )
 log = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/clustering", tags=["clustering"])
-# ─── Cache en mémoire ─────────────────────────────────────────────────────────
+# ─── Cache global ──────────────────────────────────────────────────────────────
-# Stocke (cluster_id → liste d'IPs) pour le drill-down
+_CACHE: dict[str, Any] = {
-# + timestamp de dernière mise à jour
+    "status": "idle",        # idle | computing | ready | error
-_cache: dict = {
+    "error": None,
-    "assignments": {},   # ip+ja4 → cluster_idx
+    "result": None,          # dict résultat complet
-    "cluster_ips": {},   # cluster_idx → [(ip, ja4)]
+    "ts": 0.0,               # timestamp dernière mise à jour
-    "params": {},        # k, ts
+    "params": {},
    "cluster_ips": {},       # cluster_idx → [(ip, ja4, pca_x, pca_y, risk)]
 }
 _CACHE_TTL = 1800  # 30 minutes
 _LOCK = threading.Lock()
 _EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")
-# ─── Couleurs ─────────────────────────────────────────────────────────────────
+# ─── Couleurs menace ──────────────────────────────────────────────────────────
 _THREAT_COLOR = {
-    0.92: "#dc2626",   # Bot scanner
+    0.70: "#dc2626",   # Critique
    0.70: "#ef4444",   # Critique
    0.45: "#f97316",   # Élevé
    0.25: "#eab308",   # Modéré
-    0.00: "#6b7280",   # Sain / inconnu
+    0.00: "#22c55e",   # Sain
 }
 def _risk_to_color(risk: float) -> str:
@ -58,9 +57,8 @@ def _risk_to_color(risk: float) -> str:
    return "#6b7280"
-# ─── SQL ──────────────────────────────────────────────────────────────────────
+# ─── SQL : TOUTES les IPs sans LIMIT ─────────────────────────────────────────
-
+_SQL_ALL_IPS = """
 _SQL_FEATURES = """
 SELECT
    replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
    t.ja4,
@ -71,43 +69,36 @@ SELECT
    any(t.first_ua)        AS ua,
    sum(t.hits)            AS hits,
-    avg(abs(ml.anomaly_score))            AS avg_score,
+    avg(abs(ml.anomaly_score))           AS avg_score,
-    avg(ml.hit_velocity)                  AS avg_velocity,
+    avg(ml.hit_velocity)                 AS avg_velocity,
-    avg(ml.fuzzing_index)                 AS avg_fuzzing,
+    avg(ml.fuzzing_index)                AS avg_fuzzing,
-    avg(ml.is_headless)                   AS pct_headless,
+    avg(ml.is_headless)                  AS pct_headless,
-    avg(ml.post_ratio)                    AS avg_post,
+    avg(ml.post_ratio)                   AS avg_post,
-    avg(ml.ip_id_zero_ratio)              AS ip_id_zero,
+    avg(ml.ip_id_zero_ratio)             AS ip_id_zero,
-    avg(ml.temporal_entropy)              AS entropy,
+    avg(ml.temporal_entropy)             AS entropy,
-    avg(ml.modern_browser_score)          AS browser_score,
+    avg(ml.modern_browser_score)         AS browser_score,
-    avg(ml.alpn_http_mismatch)            AS alpn_mismatch,
+    avg(ml.alpn_http_mismatch)           AS alpn_mismatch,
-    avg(ml.is_alpn_missing)               AS alpn_missing,
+    avg(ml.is_alpn_missing)              AS alpn_missing,
-    avg(ml.multiplexing_efficiency)       AS h2_eff,
+    avg(ml.multiplexing_efficiency)      AS h2_eff,
-    avg(ml.header_order_confidence)       AS hdr_conf,
+    avg(ml.header_order_confidence)      AS hdr_conf,
-    avg(ml.ua_ch_mismatch)                AS ua_ch_mismatch,
+    avg(ml.ua_ch_mismatch)               AS ua_ch_mismatch,
-    avg(ml.asset_ratio)                   AS asset_ratio,
+    avg(ml.asset_ratio)                  AS asset_ratio,
-    avg(ml.direct_access_ratio)           AS direct_ratio,
+    avg(ml.direct_access_ratio)          AS direct_ratio,
-    avg(ml.distinct_ja4_count)            AS ja4_count,
+    avg(ml.distinct_ja4_count)           AS ja4_count,
-    max(ml.is_ua_rotating)               AS ua_rotating,
+    max(ml.is_ua_rotating)              AS ua_rotating,
-    max(ml.threat_level)                  AS threat,
+    max(ml.threat_level)                 AS threat,
-    any(ml.country_code)                  AS country,
+    any(ml.country_code)                 AS country,
-    any(ml.asn_org)                       AS asn_org
+    any(ml.asn_org)                      AS asn_org
 FROM mabase_prod.agg_host_ip_ja4_1h t
 LEFT JOIN mabase_prod.ml_detected_anomalies ml
    ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
-   AND ml.detected_at >= now() - INTERVAL 24 HOUR
+   AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
-WHERE t.window_start >= now() - INTERVAL 24 HOUR
+WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
  AND t.tcp_ttl_raw > 0
 GROUP BY t.src_ip, t.ja4
 ORDER BY
    -- Stratégie : IPs anormales en premier, puis fort trafic
    -- Cela garantit que les bots Masscan (anomalie=0.97, hits=1-2) sont inclus
    avg(abs(ml.anomaly_score)) DESC,
    sum(t.hits) DESC
 LIMIT %(limit)s
 """
 # Noms des colonnes SQL dans l'ordre
 _SQL_COLS = [
    "ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
    "avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
@ -117,252 +108,311 @@ _SQL_COLS = [
 ]
 # ─── Worker de clustering (thread pool) ──────────────────────────────────────
 def _run_clustering_job(k: int, hours: int) -> None:
    """Run one full clustering pass and publish the outcome into ``_CACHE``.

    Submitted to the thread pool rather than called from a request handler.
    The job marks the cache as ``"computing"``, loads the rows returned by
    ``_SQL_ALL_IPS``, builds the feature matrix, runs K-means++, projects
    every row to 2D, computes per-cluster hulls, then assembles the
    ``nodes`` / ``edges`` / ``stats`` payload and stores it in the cache.

    Args:
        k: Number of clusters requested.
        hours: Value passed to the SQL query as the ``hours`` parameter.

    Returns:
        None. Results are written to ``_CACHE`` under ``_LOCK``.

    Raises:
        Nothing: every exception is logged and converted into
        ``_CACHE["status"] = "error"`` with the message in ``_CACHE["error"]``.
    """
    t0 = time.time()
    with _LOCK:
        _CACHE["status"] = "computing"
        _CACHE["error"] = None
    try:
        log.info(f"[clustering] Démarrage du calcul k={k} hours={hours}")
        # ── 1. Load every IP row ─────────────────────────────────────────
        result = db.query(_SQL_ALL_IPS, {"hours": hours})
        rows: list[dict] = []
        for row in result.result_rows:
            # Map the positional SQL row onto the column names in _SQL_COLS.
            rows.append({col: row[i] for i, col in enumerate(_SQL_COLS)})
        n = len(rows)
        log.info(f"[clustering] {n} IPs chargées")
        if n < k:
            # Caught by the handler at the bottom and surfaced as status="error".
            raise ValueError(f"Seulement {n} IPs disponibles (k={k} requis)")
        # ── 2. Build the feature matrix (numpy) ──────────────────────────
        X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
        log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")
        # ── 3. Vectorised K-means++ ──────────────────────────────────────
        km = kmeans_pp(X.astype(np.float64), k=k, max_iter=80, n_init=3, seed=42)
        log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
        # ── 4. PCA-2D for every row ──────────────────────────────────────
        coords = pca_2d(X.astype(np.float64))   # (n, 2), normalised to [0,1] per the original note — TODO confirm against pca_2d
        # ── 5. Convex hull per cluster ───────────────────────────────────
        hulls = compute_hulls(coords, km.labels, k)
        # ── 6. Group rows by cluster ─────────────────────────────────────
        cluster_rows:    list[list[dict]] = [[] for _ in range(k)]
        cluster_coords:  list[list[list[float]]] = [[] for _ in range(k)]
        cluster_ips_map: dict[int, list] = {j: [] for j in range(k)}
        for i, label in enumerate(km.labels):
            j = int(label)
            cluster_rows[j].append(rows[i])
            cluster_coords[j].append(coords[i].tolist())
            # Tuple layout: (ip, ja4, pca_x, pca_y, risk).
            # NOTE(review): the risk value depends only on centroid j, yet it is
            # recomputed for every member row — candidate for hoisting per cluster.
            cluster_ips_map[j].append((
                rows[i]["ip"],
                rows[i]["ja4"],
                float(coords[i][0]),
                float(coords[i][1]),
                float(risk_score_from_centroid(km.centroids[j])),
            ))
        # ── 7. Build one node per non-empty cluster ──────────────────────
        nodes = []
        for j in range(k):
            if not cluster_rows[j]:
                continue
            # The default argument binds this iteration's rows at definition time.
            def avg_f(key: str, crows: list[dict] = cluster_rows[j]) -> float:
                return float(np.mean([float(r.get(key) or 0) for r in crows]))
            mean_ttl   = avg_f("ttl")
            mean_mss   = avg_f("mss")
            mean_scale = avg_f("scale")
            mean_win   = avg_f("win")
            raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
            label_name = name_cluster(km.centroids[j], raw_stats)
            risk       = float(risk_score_from_centroid(km.centroids[j]))
            color      = _risk_to_color(risk)
            # 2D centroid = mean of the cluster's PCA coordinates
            cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5]
            # ip_count is the number of distinct "ip" values among the cluster's rows.
            ip_set   = list({r["ip"] for r in cluster_rows[j]})
            ip_count = len(ip_set)
            hit_count = int(sum(float(r.get("hits") or 0) for r in cluster_rows[j]))
            threats   = [str(r.get("threat")  or "") for r in cluster_rows[j] if r.get("threat")]
            countries = [str(r.get("country") or "") for r in cluster_rows[j] if r.get("country")]
            orgs      = [str(r.get("asn_org") or "") for r in cluster_rows[j] if r.get("asn_org")]
            # Most frequent non-empty values, at most n of them.
            def topk(lst: list[str], n: int = 5) -> list[str]:
                return [v for v, _ in Counter(lst).most_common(n) if v]
            # One radar entry per feature name, valued by the centroid component.
            radar = [
                {"feature": name, "value": round(float(km.centroids[j][i]), 4)}
                for i, name in enumerate(FEATURE_NAMES)
            ]
            # Radius grows with sqrt(ip_count) and is clamped to [12, 80].
            radius = max(12, min(80, int(math.sqrt(ip_count) * 2)))
            # Sample = the 8 rows with the most hits.
            sample_rows = sorted(cluster_rows[j], key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
            sample_ips  = [r["ip"] for r in sample_rows]
            sample_ua   = str(cluster_rows[j][0].get("ua") or "")
            nodes.append({
                "id":           f"c{j}_k{k}",
                "cluster_idx":  j,
                "label":        label_name,
                "pca_x":        round(cxy[0], 6),
                "pca_y":        round(cxy[1], 6),
                "radius":       radius,
                "color":        color,
                "risk_score":   round(risk, 4),
                "mean_ttl":     round(mean_ttl, 1),
                "mean_mss":     round(mean_mss, 0),
                "mean_scale":   round(mean_scale, 1),
                "mean_win":     round(mean_win, 0),
                "mean_score":   round(avg_f("avg_score"),   4),
                "mean_velocity":round(avg_f("avg_velocity"),3),
                "mean_fuzzing": round(avg_f("avg_fuzzing"), 3),
                "mean_headless":round(avg_f("pct_headless"),3),
                "mean_post":    round(avg_f("avg_post"),    3),
                "mean_asset":   round(avg_f("asset_ratio"), 3),
                "mean_direct":  round(avg_f("direct_ratio"),3),
                "mean_alpn_mismatch": round(avg_f("alpn_mismatch"),3),
                "mean_h2_eff":  round(avg_f("h2_eff"),     3),
                "mean_hdr_conf":round(avg_f("hdr_conf"),   3),
                "mean_ua_ch":   round(avg_f("ua_ch_mismatch"),3),
                "mean_entropy": round(avg_f("entropy"),    3),
                "mean_ja4_diversity": round(avg_f("ja4_count"),3),
                "mean_ip_id_zero":    round(avg_f("ip_id_zero"),3),
                "mean_browser_score": round(avg_f("browser_score"),1),
                "mean_ua_rotating":   round(avg_f("ua_rotating"),3),
                "ip_count":     ip_count,
                "hit_count":    hit_count,
                "top_threat":   topk(threats, 1)[0] if threats else "",
                "top_countries":topk(countries, 5),
                "top_orgs":     topk(orgs, 5),
                "sample_ips":   sample_ips,
                "sample_ua":    sample_ua,
                "radar":        radar,
                # Hull for the deck.gl PolygonLayer
                "hull": hulls.get(j, []),
            })
        # ── 8. k-NN edges between clusters ───────────────────────────────
        edges = []
        # Unordered id pairs already emitted, so each edge appears only once.
        seen: set[frozenset] = set()
        for i, ni in enumerate(nodes):
            ci = ni["cluster_idx"]
            # (node position, cluster index, squared centroid distance), nearest first.
            dists = sorted(
                [(j, nj["cluster_idx"],
                  float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
                 for j, nj in enumerate(nodes) if j != i],
                key=lambda x: x[2]
            )
            # Link each node to its two nearest neighbours.
            for j_idx, cj, d2 in dists[:2]:
                key = frozenset([ni["id"], nodes[j_idx]["id"]])
                if key in seen:
                    continue
                seen.add(key)
                edges.append({
                    "id":         f"e_{ni['id']}_{nodes[j_idx]['id']}",
                    "source":     ni["id"],
                    "target":     nodes[j_idx]["id"],
                    "similarity": round(1.0 / (1.0 + math.sqrt(d2)), 3),
                })
        # ── 9. Store the result and the per-cluster IP cache ─────────────
        total_ips  = sum(n_["ip_count"]  for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
        # A cluster counts as "bot" above 0.45 risk or when its label carries the robot emoji.
        bot_ips    = sum(n_["ip_count"]  for n_ in nodes if n_["risk_score"] > 0.45 or "🤖" in n_["label"])
        high_ips   = sum(n_["ip_count"]  for n_ in nodes if n_["risk_score"] > 0.25)
        elapsed    = round(time.time() - t0, 2)
        result_dict = {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_clusters": len(nodes),
                "total_ips":      total_ips,
                "total_hits":     total_hits,
                "bot_ips":        bot_ips,
                "high_risk_ips":  high_ips,
                "n_samples":      n,
                "k":              k,
                "elapsed_s":      elapsed,
            },
            "feature_names": FEATURE_NAMES,
        }
        # Publish all cache fields in one critical section.
        with _LOCK:
            _CACHE["result"]      = result_dict
            _CACHE["cluster_ips"] = cluster_ips_map
            _CACHE["status"]      = "ready"
            _CACHE["ts"]          = time.time()
            _CACHE["params"]      = {"k": k, "hours": hours}
            _CACHE["error"]       = None
        log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
    except Exception as e:
        # Top-level boundary of the worker: log the traceback and expose the
        # failure through the cache instead of losing it inside the executor.
        log.exception("[clustering] Erreur lors du calcul")
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"]  = str(e)
 def _maybe_trigger(k: int, hours: int) -> None:
    """Start a clustering job unless one is running or the cache is still valid.

    A job is submitted when the cache has never been filled, is older than
    ``_CACHE_TTL`` seconds, was computed for different ``k`` / ``hours``, or
    the previous job ended in error.

    The decision and the switch to ``"computing"`` happen inside a single
    critical section. Otherwise two concurrent callers could both see a
    non-computing status and each queue a job, and a caller could read the
    previous result as ``"ready"`` before the worker thread had started.

    Args:
        k: Number of clusters requested.
        hours: Time window (hours) requested.

    Raises:
        RuntimeError: If the executor refuses the job (e.g. after shutdown);
            the cache is switched to ``"error"`` before re-raising.
    """
    with _LOCK:
        status = _CACHE["status"]
        if status == "computing":
            return  # a job is already running or queued
        params = _CACHE["params"]
        cache_stale = (time.time() - _CACHE["ts"]) > _CACHE_TTL
        params_changed = params.get("k") != k or params.get("hours") != hours
        if status == "ready" and not cache_stale and not params_changed:
            return  # fresh cache for the same parameters
        # Claim the job while still holding the lock so no other caller can
        # queue a duplicate between this check and the worker's start.
        _CACHE["status"] = "computing"
        _CACHE["error"] = None
    try:
        _EXECUTOR.submit(_run_clustering_job, k, hours)
    except RuntimeError as exc:
        # Without this rollback the status would stay "computing" forever.
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"] = str(exc)
        raise
 # ─── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/status")
async def get_status():
    """Report the state of the background clustering job (polled by the frontend).

    Returns a dict with the cached ``status``, ``error``, ``ts`` and
    ``params`` values, plus ``age_s``: seconds elapsed since ``ts`` rounded to
    a whole number, or ``None`` when ``ts`` is falsy.
    """
    with _LOCK:
        last_ts = _CACHE["ts"]
        snapshot = {
            field: _CACHE[field]
            for field in ("status", "error", "ts", "params")
        }
        if last_ts:
            snapshot["age_s"] = round(time.time() - last_ts, 0)
        else:
            snapshot["age_s"] = None
        return snapshot
@router.get("/clusters")
 async def get_clusters(
-    k:          int = Query(14, ge=4, le=30, description="Nombre de clusters"),
+    k:     int = Query(14, ge=4,  le=30,  description="Nombre de clusters"),
-    n_samples:  int = Query(3000, ge=500, le=8000, description="Taille de l'échantillon"),
+    hours: int = Query(24, ge=1,  le=168, description="Fenêtre temporelle (heures)"),
    force: bool = Query(False, description="Forcer le recalcul"),
 ):
    """
-    Clustering multi-métriques des IPs.
+    Clustering multi-métriques sur TOUTES les IPs.
-    Retourne les nœuds (clusters) + arêtes pour ReactFlow, avec :
+    Retourne immédiatement depuis le cache (status=ready).
-      - positions 2D issues de PCA sur les 21 features
+    Si le calcul est en cours ou non démarré → status=computing/idle + trigger.
      - profil radar des features par cluster (normalisé [0,1])
      - statistiques détaillées (moyennes brutes des features)
      - sample d'IPs représentatives
    """
-    t0 = time.time()
+    if force:
        with _LOCK:
            _CACHE["status"] = "idle"
            _CACHE["ts"]     = 0.0
    _maybe_trigger(k, hours)
    with _LOCK:
        status = _CACHE["status"]
        result = _CACHE["result"]
        error  = _CACHE["error"]
    if status == "computing":
        return {"status": "computing", "message": "Calcul en cours, réessayez dans quelques secondes"}
    if status == "error":
        raise HTTPException(status_code=500, detail=error or "Erreur inconnue")
    if result is None:
        return {"status": "idle", "message": "Calcul démarré, réessayez dans quelques secondes"}
    return {**result, "status": "ready"}
@router.get("/cluster/{cluster_id}/points")
 async def get_cluster_points(
    cluster_id: str,
    limit:  int = Query(5000, ge=1, le=20000),
    offset: int = Query(0, ge=0),
 ):
    """
    Coordonnées PCA + métadonnées de toutes les IPs d'un cluster.
    Utilisé par deck.gl ScatterplotLayer (drill-down ou zoom avancé).
    """
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]
    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")
    try:
-        result = db.query(_SQL_FEATURES, {"limit": n_samples})
+        idx = int(cluster_id.split("_")[0][1:])
-    except Exception as e:
+    except (ValueError, IndexError):
-        raise HTTPException(status_code=500, detail=f"ClickHouse: {e}")
+        raise HTTPException(status_code=400, detail="cluster_id invalide (format: c{n}_k{k})")
-    # ── Construction des vecteurs de features ─────────────────────────────
+    members = ips_map.get(idx, [])
-    rows: list[dict] = []
+    total   = len(members)
-    for row in result.result_rows:
+    page    = members[offset: offset + limit]
        d = {col: row[i] for i, col in enumerate(_SQL_COLS)}
        rows.append(d)
-    if len(rows) < k:
+    points = [
-        raise HTTPException(status_code=400, detail="Pas assez de données pour ce k")
+        {"ip": m[0], "ja4": m[1], "pca_x": round(m[2], 6), "pca_y": round(m[3], 6), "risk": round(m[4], 3)}
-
+        for m in page
-    points = [build_feature_vector(r) for r in rows]
+    ]
-
+    return {"points": points, "total": total, "offset": offset, "limit": limit}
    # ── K-means++ ────────────────────────────────────────────────────────
    km = kmeans_pp(points, k=k, max_iter=60, seed=42)
    # ── PCA-2D sur les centroïdes ─────────────────────────────────────────
    # On projette les centroïdes dans l'espace PCA des données
    # → les positions relatives reflètent la variance des données
    coords_all = pca_2d(points)
    # Moyenne des positions PCA par cluster = position 2D du centroïde
    cluster_xs: list[list[float]] = [[] for _ in range(k)]
    cluster_ys: list[list[float]] = [[] for _ in range(k)]
    for i, label in enumerate(km.labels):
        cluster_xs[label].append(coords_all[i][0])
        cluster_ys[label].append(coords_all[i][1])
    centroid_2d: list[tuple[float, float]] = []
    for j in range(k):
        if cluster_xs[j]:
            cx = sum(cluster_xs[j]) / len(cluster_xs[j])
            cy = sum(cluster_ys[j]) / len(cluster_ys[j])
        else:
            cx, cy = 0.5, 0.5
        centroid_2d.append((cx, cy))
    # ── Agrégation des statistiques par cluster ───────────────────────────
    cluster_rows:    list[list[dict]] = [[] for _ in range(k)]
    cluster_members: list[list[tuple[str, str]]] = [[] for _ in range(k)]
    for i, label in enumerate(km.labels):
        cluster_rows[label].append(rows[i])
        cluster_members[label].append((rows[i]["ip"], rows[i]["ja4"]))
    # Mise à jour du cache pour le drill-down
    _cache["cluster_ips"] = {j: cluster_members[j] for j in range(k)}
    _cache["params"] = {"k": k, "ts": t0}
    # ── Construction des nœuds ReactFlow ─────────────────────────────────
    CANVAS_W, CANVAS_H = 1400, 780
    nodes = []
    for j in range(k):
        if not cluster_rows[j]:
            continue
        # Statistiques brutes moyennées
        def avg_feat(key: str) -> float:
            vals = [float(r.get(key) or 0) for r in cluster_rows[j]]
            return sum(vals) / len(vals) if vals else 0.0
        mean_ttl   = avg_feat("ttl")
        mean_mss   = avg_feat("mss")
        mean_scale = avg_feat("scale")
        mean_win   = avg_feat("win")
        mean_score = avg_feat("avg_score")
        mean_vel   = avg_feat("avg_velocity")
        mean_fuzz  = avg_feat("avg_fuzzing")
        mean_hless = avg_feat("pct_headless")
        mean_post  = avg_feat("avg_post")
        mean_asset = avg_feat("asset_ratio")
        mean_direct= avg_feat("direct_ratio")
        mean_alpn  = avg_feat("alpn_mismatch")
        mean_h2    = avg_feat("h2_eff")
        mean_hconf = avg_feat("hdr_conf")
        mean_ua_ch = avg_feat("ua_ch_mismatch")
        mean_entr  = avg_feat("entropy")
        mean_ja4   = avg_feat("ja4_count")
        mean_ip_id = avg_feat("ip_id_zero")
        mean_brow  = avg_feat("browser_score")
        mean_uarot = avg_feat("ua_rotating")
        ip_count   = len(set(r["ip"] for r in cluster_rows[j]))
        hit_count  = int(sum(float(r.get("hits") or 0) for r in cluster_rows[j]))
        # Pays / ASN / Menace dominants
        threats   = [str(r.get("threat") or "") for r in cluster_rows[j] if r.get("threat")]
        countries = [str(r.get("country") or "") for r in cluster_rows[j] if r.get("country")]
        orgs      = [str(r.get("asn_org") or "") for r in cluster_rows[j] if r.get("asn_org")]
        def topk(lst: list[str], n: int = 5) -> list[str]:
            from collections import Counter
            return [v for v, _ in Counter(lst).most_common(n) if v]
        raw_stats = {
            "mean_ttl": mean_ttl, "mean_mss": mean_mss,
            "mean_scale": mean_scale,
        }
        label      = name_cluster(km.centroids[j], raw_stats)
        risk       = risk_score_from_centroid(km.centroids[j])
        color      = _risk_to_color(risk)
        # Profil radar normalisé (valeurs centroïde [0,1])
        radar = [
            {"feature": name, "value": round(km.centroids[j][i], 4)}
            for i, name in enumerate(FEATURE_NAMES)
        ]
        # Position 2D (PCA normalisée → pixels ReactFlow)
        px_x = centroid_2d[j][0] * CANVAS_W * 0.85 + 80
        px_y = (1 - centroid_2d[j][1]) * CANVAS_H * 0.85 + 50  # inverser y (haut=risque)
        # Rayon ∝ √ip_count
        radius = max(18, min(90, int(math.sqrt(ip_count) * 0.3)))
        # Sample IPs (top 8 par hits)
        sample_rows = sorted(cluster_rows[j], key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
        sample_ips  = [r["ip"] for r in sample_rows]
        sample_ua   = str(cluster_rows[j][0].get("ua") or "")
        cluster_id = f"c{j}_k{k}"
        nodes.append({
            "id":          cluster_id,
            "label":       label,
            "cluster_idx": j,
            "x": round(px_x, 1),
            "y": round(px_y, 1),
            "radius":      radius,
            "color":       color,
            "risk_score":  risk,
            # Caractéristiques TCP
            "mean_ttl":    round(mean_ttl, 1),
            "mean_mss":    round(mean_mss, 0),
            "mean_scale":  round(mean_scale, 1),
            "mean_win":    round(mean_win, 0),
            # Comportement HTTP
            "mean_score":  round(mean_score, 4),
            "mean_velocity": round(mean_vel, 3),
            "mean_fuzzing": round(mean_fuzz, 3),
            "mean_headless": round(mean_hless, 3),
            "mean_post":   round(mean_post, 3),
            "mean_asset":  round(mean_asset, 3),
            "mean_direct": round(mean_direct, 3),
            # TLS / Protocole
            "mean_alpn_mismatch": round(mean_alpn, 3),
            "mean_h2_eff":    round(mean_h2, 3),
            "mean_hdr_conf":  round(mean_hconf, 3),
            "mean_ua_ch":     round(mean_ua_ch, 3),
            # Temporel
            "mean_entropy": round(mean_entr, 3),
            "mean_ja4_diversity": round(mean_ja4, 3),
            "mean_ip_id_zero":   round(mean_ip_id, 3),
            "mean_browser_score": round(mean_brow, 1),
            "mean_ua_rotating":  round(mean_uarot, 3),
            # Meta
            "ip_count":    ip_count,
            "hit_count":   hit_count,
            "top_threat":  topk(threats, 1)[0] if topk(threats, 1) else "",
            "top_countries": topk(countries, 5),
            "top_orgs":    topk(orgs, 5),
            "sample_ips":  sample_ips,
            "sample_ua":   sample_ua,
            # Profil radar pour visualisation
            "radar": radar,
        })
    # ── Arêtes : k-NN dans l'espace des features ──────────────────────────
    # Chaque cluster est connecté à ses 2 voisins les plus proches
    edges = []
    seen: set[frozenset] = set()
    centroids = km.centroids
    for i, ni in enumerate(nodes):
        ci = ni["cluster_idx"]
        # Distance² aux autres centroïdes
        dists = [
            (j, nj["cluster_idx"],
             sum((centroids[ci][d] - centroids[nj["cluster_idx"]][d]) ** 2
                 for d in range(N_FEATURES)))
            for j, nj in enumerate(nodes) if j != i
        ]
        dists.sort(key=lambda x: x[2])
        # 2 voisins les plus proches
        for j, cj, dist2 in dists[:2]:
            key = frozenset([ni["id"], nodes[j]["id"]])
            if key in seen:
                continue
            seen.add(key)
            similarity = round(1.0 / (1.0 + math.sqrt(dist2)), 3)
            edges.append({
                "id":         f"e_{ni['id']}_{nodes[j]['id']}",
                "source":     ni["id"],
                "target":     nodes[j]["id"],
                "similarity": similarity,
                "weight":     round(similarity * 5, 1),
            })
    # ── Stats globales ────────────────────────────────────────────────────
    total_ips  = sum(n["ip_count"]  for n in nodes)
    total_hits = sum(n["hit_count"] for n in nodes)
    bot_ips    = sum(n["ip_count"]  for n in nodes if n["risk_score"] > 0.40 or "🤖" in n["label"])
    high_risk  = sum(n["ip_count"]  for n in nodes if n["risk_score"] > 0.20)
    elapsed = round(time.time() - t0, 2)
    return {
        "nodes": nodes,
        "edges": edges,
        "stats": {
            "total_clusters":  len(nodes),
            "total_ips":       total_ips,
            "total_hits":      total_hits,
            "bot_ips":         bot_ips,
            "high_risk_ips":   high_risk,
            "n_samples":       len(rows),
            "k":               k,
            "elapsed_s":       elapsed,
        },
        "feature_names": FEATURE_NAMES,
    }
@router.get("/cluster/{cluster_id}/ips")
@ -371,57 +421,44 @@ async def get_cluster_ips(
    limit:  int = Query(100, ge=1, le=500),
    offset: int = Query(0, ge=0),
 ):
-    """
+    """IPs avec détails SQL (backward-compat avec l'ancienne UI)."""
-    IPs appartenant à un cluster (depuis le cache de la dernière exécution).
+    with _LOCK:
-    Si le cache est expiré, retourne une erreur guidant vers /clusters.
+        status  = _CACHE["status"]
-    """
+        ips_map = _CACHE["cluster_ips"]
-    if not _cache.get("cluster_ips"):
+
-        raise HTTPException(
+    if status != "ready" or not ips_map:
-            status_code=404,
+        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")
            detail="Cache expiré — appelez /api/clustering/clusters d'abord"
        )
    # Extrait l'index cluster depuis l'id (format: c{idx}_k{k})
    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="cluster_id invalide")
-    members = _cache["cluster_ips"].get(idx, [])
+    members = ips_map.get(idx, [])
-    if not members:
+    total   = len(members)
-        return {"ips": [], "total": 0, "cluster_id": cluster_id}
+    page    = members[offset: offset + limit]
-
+    if not page:
    total = len(members)
    page_members = members[offset: offset + limit]
    # Requête SQL pour les détails de ces IPs spécifiques
    ip_list  = [m[0] for m in page_members]
    ja4_list = [m[1] for m in page_members]
    if not ip_list:
        return {"ips": [], "total": total, "cluster_id": cluster_id}
-    # On ne peut pas facilement passer une liste en paramètre ClickHouse —
+    safe_ips  = [m[0].replace("'", "") for m in page[:200]]
    # on la construit directement (valeurs nettoyées)
    safe_ips  = [ip.replace("'", "") for ip in ip_list[:100]]
    ip_filter = ", ".join(f"'{ip}'" for ip in safe_ips)
    sql = f"""
    SELECT
        replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS src_ip,
        t.ja4,
-        any(t.tcp_ttl_raw)   AS ttl,
+        any(t.tcp_ttl_raw)                              AS ttl,
-        any(t.tcp_win_raw)   AS win,
+        any(t.tcp_win_raw)                              AS win,
-        any(t.tcp_scale_raw) AS scale,
+        any(t.tcp_scale_raw)                            AS scale,
-        any(t.tcp_mss_raw)   AS mss,
+        any(t.tcp_mss_raw)                              AS mss,
-        sum(t.hits)          AS hits,
+        sum(t.hits)                                     AS hits,
-        any(t.first_ua)      AS ua,
+        any(t.first_ua)                                 AS ua,
-        round(avg(abs(ml.anomaly_score)), 3) AS avg_score,
+        round(avg(abs(ml.anomaly_score)), 3)            AS avg_score,
-        max(ml.threat_level)                 AS threat_level,
+        max(ml.threat_level)                            AS threat_level,
-        any(ml.country_code)                 AS country_code,
+        any(ml.country_code)                            AS country_code,
-        any(ml.asn_org)                      AS asn_org,
+        any(ml.asn_org)                                 AS asn_org,
-        round(avg(ml.fuzzing_index), 2)      AS fuzzing,
+        round(avg(ml.fuzzing_index), 2)                 AS fuzzing,
-        round(avg(ml.hit_velocity), 2)       AS velocity
+        round(avg(ml.hit_velocity), 2)                  AS velocity
    FROM mabase_prod.agg_host_ip_ja4_1h t
    LEFT JOIN mabase_prod.ml_detected_anomalies ml
        ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
@ -439,7 +476,7 @@ async def get_cluster_ips(
    ips = []
    for row in result.result_rows:
        ips.append({
-            "ip":           str(row[0]),
+            "ip":           str(row[0] or ""),
            "ja4":          str(row[1] or ""),
            "tcp_ttl":      int(row[2] or 0),
            "tcp_win":      int(row[3] or 0),
--- a/backend/services/clustering_engine.py
+++ b/backend/services/clustering_engine.py
@ -1,12 +1,14 @@
 """
-Moteur de clustering K-means++ multi-métriques (pur Python).
+Moteur de clustering K-means++ multi-métriques (numpy + scipy vectorisé).
-Ref: Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
+Ref:
-     Hotelling (1933) — PCA par puissance itérative (deflation)
+  Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
  scipy.spatial.ConvexHull     — enveloppe convexe (Graham/Qhull)
  sklearn-style API             — centroids, labels_, inertia_
 Features (21 dimensions, normalisées [0,1]) :
-  0  ttl_n          : TTL initial normalisé (hops-count estimé)
+  0  ttl_n          : TTL initial normalisé
-  1  mss_n          : MSS normalisé → type réseau (Ethernet/PPPoE/VPN)
+  1  mss_n          : MSS normalisé → type réseau
  2  scale_n        : facteur de mise à l'échelle TCP
  3  win_n          : fenêtre TCP normalisée
  4  score_n        : score anomalie ML (abs)
@ -16,7 +18,7 @@ Features (21 dimensions, normalisées [0,1]) :
  8  post_n         : ratio POST/total
  9  ip_id_zero_n   : ratio IP-ID=0 (Linux/spoofé)
  10 entropy_n      : entropie temporelle
-  11 browser_n      : score navigateur moderne (normalisé max 50)
+  11 browser_n      : score navigateur moderne
  12 alpn_n         : mismatch ALPN/protocole
  13 alpn_absent_n  : ratio ALPN absent
  14 h2_n           : efficacité H2 multiplexing (log1p)
@ -28,301 +30,248 @@ Features (21 dimensions, normalisées [0,1]) :
  20 ua_rot_n       : UA rotatif (booléen)
 """
 from __future__ import annotations
 import math
-import random
+import logging
 import numpy as np
 from dataclasses import dataclass, field
 from scipy.spatial import ConvexHull
 log = logging.getLogger(__name__)
 # ─── Définition des features ──────────────────────────────────────────────────
-# (clé SQL, nom lisible, fonction de normalisation)
+FEATURES: list[tuple[str, str, object]] = [
 FEATURES = [
    # TCP stack
-    ("ttl",          "TTL Initial",           lambda v: min(1.0, (v or 0) / 255.0)),
+    ("ttl",           "TTL Initial",           lambda v: min(1.0, (v or 0) / 255.0)),
-    ("mss",          "MSS Réseau",            lambda v: min(1.0, (v or 0) / 1460.0)),
+    ("mss",           "MSS Réseau",            lambda v: min(1.0, (v or 0) / 1460.0)),
-    ("scale",        "Scale TCP",             lambda v: min(1.0, (v or 0) / 14.0)),
+    ("scale",         "Scale TCP",             lambda v: min(1.0, (v or 0) / 14.0)),
-    ("win",          "Fenêtre TCP",           lambda v: min(1.0, (v or 0) / 65535.0)),
+    ("win",           "Fenêtre TCP",           lambda v: min(1.0, (v or 0) / 65535.0)),
    # Anomalie ML
-    ("avg_score",    "Score Anomalie",        lambda v: min(1.0, float(v or 0))),
+    ("avg_score",     "Score Anomalie",        lambda v: min(1.0, float(v or 0))),
-    ("avg_velocity", "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),
+    ("avg_velocity",  "Vélocité (rps)",        lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),
-    ("avg_fuzzing",  "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
+    ("avg_fuzzing",   "Fuzzing",               lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
-    ("pct_headless", "Headless",              lambda v: min(1.0, float(v or 0))),
+    ("pct_headless",  "Headless",              lambda v: min(1.0, float(v or 0))),
-    ("avg_post",     "Ratio POST",            lambda v: min(1.0, float(v or 0))),
+    ("avg_post",      "Ratio POST",            lambda v: min(1.0, float(v or 0))),
    # IP-ID
-    ("ip_id_zero",   "IP-ID Zéro",           lambda v: min(1.0, float(v or 0))),
+    ("ip_id_zero",    "IP-ID Zéro",           lambda v: min(1.0, float(v or 0))),
    # Temporel
-    ("entropy",      "Entropie Temporelle",   lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))),
+    ("entropy",       "Entropie Temporelle",   lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))),
    # Navigateur
-    ("browser_score","Score Navigateur",      lambda v: min(1.0, float(v or 0) / 50.0)),
+    ("browser_score", "Score Navigateur",      lambda v: min(1.0, float(v or 0) / 50.0)),
    # TLS / Protocole
-    ("alpn_mismatch","ALPN Mismatch",         lambda v: min(1.0, float(v or 0))),
+    ("alpn_mismatch", "ALPN Mismatch",         lambda v: min(1.0, float(v or 0))),
-    ("alpn_missing", "ALPN Absent",           lambda v: min(1.0, float(v or 0))),
+    ("alpn_missing",  "ALPN Absent",           lambda v: min(1.0, float(v or 0))),
-    ("h2_eff",       "H2 Multiplexing",       lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(20))),
+    ("h2_eff",        "H2 Multiplexing",       lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(20))),
-    ("hdr_conf",     "Ordre Headers",         lambda v: min(1.0, float(v or 0))),
+    ("hdr_conf",      "Ordre Headers",         lambda v: min(1.0, float(v or 0))),
-    ("ua_ch_mismatch","UA-CH Mismatch",       lambda v: min(1.0, float(v or 0))),
+    ("ua_ch_mismatch","UA-CH Mismatch",        lambda v: min(1.0, float(v or 0))),
    # Comportement HTTP
-    ("asset_ratio",  "Ratio Assets",          lambda v: min(1.0, float(v or 0))),
+    ("asset_ratio",   "Ratio Assets",          lambda v: min(1.0, float(v or 0))),
-    ("direct_ratio", "Accès Direct",          lambda v: min(1.0, float(v or 0))),
+    ("direct_ratio",  "Accès Direct",          lambda v: min(1.0, float(v or 0))),
    # Diversité JA4
-    ("ja4_count",    "Diversité JA4",         lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
+    ("ja4_count",     "Diversité JA4",         lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
    # UA rotatif
-    ("ua_rotating",  "UA Rotatif",            lambda v: 1.0 if float(v or 0) > 0 else 0.0),
+    ("ua_rotating",   "UA Rotatif",            lambda v: 1.0 if float(v or 0) > 0 else 0.0),
 ]
-FEATURE_KEYS   = [f[0] for f in FEATURES]
+FEATURE_KEYS  = [f[0] for f in FEATURES]
-FEATURE_NAMES  = [f[1] for f in FEATURES]
+FEATURE_NAMES = [f[1] for f in FEATURES]
-FEATURE_NORMS  = [f[2] for f in FEATURES]
+FEATURE_NORMS = [f[2] for f in FEATURES]
-N_FEATURES     = len(FEATURES)
+N_FEATURES    = len(FEATURES)
 # ─── Utilitaires vectoriels (pur Python) ──────────────────────────────────────
 def _dist2(a: list[float], b: list[float]) -> float:
    return sum((x - y) ** 2 for x, y in zip(a, b))
 def _mean_vec(vecs: list[list[float]]) -> list[float]:
    """Return the component-wise mean of ``vecs`` as a list of N_FEATURES floats.

    An empty input yields the zero vector. Only the first N_FEATURES
    components of each vector are read.
    """
    count = len(vecs)
    if not count:
        return [0.0] * N_FEATURES
    means = []
    for dim in range(N_FEATURES):
        column_total = sum(vec[dim] for vec in vecs)
        means.append(column_total / count)
    return means
 # ─── Construction du vecteur de features ─────────────────────────────────────
 def build_feature_vector(row: dict) -> list[float]:
-    """Normalise un dict de colonnes SQL → vecteur [0,1]^N_FEATURES."""
+    """Construit le vecteur normalisé [0,1]^21 depuis un dict SQL."""
-    return [fn(row.get(key)) for key, fn in zip(FEATURE_KEYS, FEATURE_NORMS)]
+    return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
-# ─── K-means++ ───────────────────────────────────────────────────────────────
+# ─── K-means++ vectorisé (numpy) ─────────────────────────────────────────────
@dataclass
 class KMeansResult:
-    centroids: list[list[float]]
+    centroids: np.ndarray        # (k, n_features)
-    labels:    list[int]
+    labels: np.ndarray           # (n_points,)  int32
-    inertia:   float
+    inertia: float
-    n_iter:    int
+    n_iter: int
-def kmeans_pp(
+def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
-    points:   list[list[float]],
+              seed: int = 42) -> KMeansResult:
    k:        int,
    max_iter: int = 60,
    seed:     int = 42,
    n_init:   int = 3,
 ) -> KMeansResult:
    """
-    K-means avec initialisation k-means++ (Arthur & Vassilvitskii, 2007).
+    K-means++ entièrement vectorisé avec numpy.
-    Lance `n_init` fois et retourne le meilleur résultat (inertie minimale).
+    n_init exécutions, meilleure inertie conservée.
    """
-    rng = random.Random(seed)
+    rng = np.random.default_rng(seed)
    n, d = X.shape
    best: KMeansResult | None = None
-    for attempt in range(n_init):
+    for _ in range(n_init):
-        # ── Initialisation k-means++ ────────────────────────────────────
+        # ── Initialisation K-means++ ──────────────────────────────────────
-        first_idx = rng.randrange(len(points))
+        centers = [X[rng.integers(n)].copy()]
        centroids = [points[first_idx][:]]
        for _ in range(k - 1):
-            d2 = [min(_dist2(p, c) for c in centroids) for p in points]
+            D = _min_sq_dist(X, np.array(centers))
-            total = sum(d2)
+            # Garantit des probabilités non-négatives (erreurs float, points dupliqués)
-            if total == 0:
+            D = np.clip(D, 0.0, None)
-                break
+            total = D.sum()
-            r = rng.random() * total
+            if total < 1e-12:
-            cumul = 0.0
+                # Tous les points sont confondus — tirage aléatoire
-            for i, d in enumerate(d2):
+                centers.append(X[rng.integers(n)].copy())
                cumul += d
                if cumul >= r:
                    centroids.append(points[i][:])
                    break
            else:
-                centroids.append(points[rng.randrange(len(points))][:])
+                probs = D / total
                centers.append(X[rng.choice(n, p=probs)].copy())
        centers_arr = np.array(centers)  # (k, d)
-        # ── Itérations EM ───────────────────────────────────────────────
+        # ── Iterations ───────────────────────────────────────────────────
-        labels: list[int] = [0] * len(points)
+        labels = np.zeros(n, dtype=np.int32)
-        for iteration in range(max_iter):
+        for it in range(max_iter):
-            # E-step : affectation
+            # Assignation vectorisée : (n, k) distance²
-            new_labels = [
+            dists = _sq_dists(X, centers_arr)  # (n, k)
-                min(range(len(centroids)), key=lambda c: _dist2(p, centroids[c]))
+            new_labels = np.argmin(dists, axis=1).astype(np.int32)
-                for p in points
+
-            ]
+            if it > 0 and np.all(new_labels == labels):
-            if new_labels == labels and iteration > 0:
+                break  # convergence
                break
            labels = new_labels
-            # M-step : mise à jour
+            # Mise à jour des centroïdes
            clusters: list[list[list[float]]] = [[] for _ in range(k)]
            for i, l in enumerate(labels):
                clusters[l].append(points[i])
            for j in range(k):
-                if clusters[j]:
+                mask = labels == j
-                    centroids[j] = _mean_vec(clusters[j])
+                if mask.any():
                    centers_arr[j] = X[mask].mean(axis=0)
-        inertia = sum(_dist2(points[i], centroids[labels[i]]) for i in range(len(points)))
+        inertia = float(np.sum(np.min(_sq_dists(X, centers_arr), axis=1)))
-        result = KMeansResult(
+        result = KMeansResult(centers_arr, labels, inertia, it + 1)
            centroids=centroids,
            labels=labels,
            inertia=inertia,
            n_iter=iteration + 1,
        )
        if best is None or inertia < best.inertia:
            best = result
-    return best  # type: ignore
+    return best  # type: ignore[return-value]
-# ─── PCA 2D par puissance itérative ──────────────────────────────────────────
+def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Distance² entre chaque point de X et chaque centroïde de C. O(n·k·d)."""
    # ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ
    X2 = np.sum(X ** 2, axis=1, keepdims=True)  # (n, 1)
    C2 = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
    return X2 + C2 - 2.0 * X @ C.T              # (n, k)
-def pca_2d(points: list[list[float]]) -> list[tuple[float, float]]:
+
 def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """For each row of X, return the smallest squared distance to any row of C."""
    pairwise = _sq_dists(X, C)  # one row per point, one column per centroid
    return pairwise.min(axis=1)
 # ─── PCA 2D (numpy) ──────────────────────────────────────────────────────────
 def pca_2d(X: np.ndarray) -> np.ndarray:
    """
-    Projection PCA 2D par puissance itérative avec déflation (Hotelling).
+    PCA-2D vectorisée. Retourne les coordonnées normalisées [0,1] × [0,1].
    Retourne les coordonnées (pc1, pc2) normalisées dans [0,1].
    """
-    n = len(points)
+    mean = X.mean(axis=0)
-    if n == 0:
+    Xc = X - mean
-        return []
+    # Power iteration pour les 2 premières composantes
-
+    rng = np.random.default_rng(0)
-    # Centrage
+    v1 = _power_iter(Xc, rng.standard_normal(Xc.shape[1]))
-    mean = _mean_vec(points)
+    proj1 = Xc @ v1
-    X = [[p[i] - mean[i] for i in range(N_FEATURES)] for p in points]
+    # Déflation (Hotelling)
-
+    Xc2 = Xc - np.outer(proj1, v1)
-    def power_iter(X_centered: list[list[float]], n_iter: int = 30) -> list[float]:
+    v2 = _power_iter(Xc2, rng.standard_normal(Xc.shape[1]))
-        """Trouve le premier vecteur propre de X^T X par puissance itérative."""
+    proj2 = Xc2 @ v2
        v = [1.0 / math.sqrt(N_FEATURES)] * N_FEATURES
        for _ in range(n_iter):
            # Xv = X @ v
            Xv = [sum(row[j] * v[j] for j in range(N_FEATURES)) for row in X_centered]
            # Xtxv = X^T @ Xv
            xtxv = [sum(X_centered[i][j] * Xv[i] for i in range(len(X_centered))) for j in range(N_FEATURES)]
            norm = math.sqrt(sum(x ** 2 for x in xtxv)) or 1e-10
            v = [x / norm for x in xtxv]
        return v
    # PC1
    v1 = power_iter(X)
    proj1 = [sum(row[j] * v1[j] for j in range(N_FEATURES)) for row in X]
    # Déflation : retire la composante PC1 de X
    X2 = [
        [X[i][j] - proj1[i] * v1[j] for j in range(N_FEATURES)]
        for i in range(n)
    ]
    # PC2
    v2 = power_iter(X2)
    proj2 = [sum(row[j] * v2[j] for j in range(N_FEATURES)) for row in X2]
    coords = np.column_stack([proj1, proj2])
    # Normalisation [0,1]
-    def _norm01(vals: list[float]) -> list[float]:
+    mn, mx = coords.min(axis=0), coords.max(axis=0)
-        lo, hi = min(vals), max(vals)
+    rng_ = mx - mn
-        rng = hi - lo or 1e-10
+    rng_[rng_ == 0] = 1.0
-        return [(v - lo) / rng for v in vals]
+    return (coords - mn) / rng_
    p1 = _norm01(proj1)
    p2 = _norm01(proj2)
    return list(zip(p1, p2))
-# ─── Nommage automatique des clusters ────────────────────────────────────────
+def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
    """Power iteration : trouve le premier vecteur propre de XᵀX."""
    for _ in range(n_iter):
        v = X.T @ (X @ v)
        norm = np.linalg.norm(v)
        if norm < 1e-12:
            break
        v /= norm
    return v
-def name_cluster(centroid: list[float], raw_stats: dict | None = None) -> str:
+
 # ─── Enveloppe convexe (hull) par cluster ────────────────────────────────────
 def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
    """
-    Génère un nom lisible à partir du centroïde normalisé et de statistiques brutes.
+    Calcule l'enveloppe convexe (convex hull) des points PCA pour chaque cluster.
-    Priorité : signaux les plus discriminants en premier.
+    Retourne {cluster_idx: [[x,y], ...]} (polygone fermé).
    """
-    score  = centroid[4]   # anomalie ML
+    hulls: dict[int, list[list[float]]] = {}
-    vel    = centroid[5]   # vélocité
+    for j in range(k):
-    fuzz   = centroid[6]   # fuzzing (log1p normalisé, >0.35 ≈ fuzzing_index > 100)
+        pts = coords_2d[labels == j]
-    hless  = centroid[7]   # headless
+        if len(pts) < min_pts:
-    post   = centroid[8]   # POST ratio
+            # Pas assez de points : bounding box
-    alpn   = centroid[12]  # ALPN mismatch
+            if len(pts) > 0:
-    h2     = centroid[14]  # H2 eff
+                mx_, my_ = pts.mean(axis=0)
-    ua_ch  = centroid[16]  # UA-CH mismatch
+                r = max(0.01, pts.std(axis=0).max())
-    ja4d   = centroid[19]  # JA4 diversité
+                hulls[j] = [
-    ua_rot = centroid[20]  # UA rotatif
+                    [mx_ - r, my_ - r], [mx_ + r, my_ - r],
-
+                    [mx_ + r, my_ + r], [mx_ - r, my_ + r],
-    raw_mss   = (raw_stats or {}).get("mean_mss",   0)
+                ]
-    raw_ttl   = (raw_stats or {}).get("mean_ttl",   0) or (centroid[0] * 255)
+            continue
-    raw_scale = (raw_stats or {}).get("mean_scale", 0)
+        try:
-
+            hull = ConvexHull(pts)
-    # ── Signaux forts (déterministes) ────────────────────────────────────
+            hull_pts = pts[hull.vertices].tolist()
-
+            # Fermer le polygone
-    # Pattern Masscan : mss≈1452, scale≈4, TTL 48-57
+            hull_pts.append(hull_pts[0])
-    if raw_mss and 1440 <= raw_mss <= 1460 and raw_scale and 3 <= raw_scale <= 5 and raw_ttl < 60:
+            hulls[j] = hull_pts
-        return "🤖 Masscan / Scanner IP"
+        except Exception:
-
+            hulls[j] = []
-    # Fuzzer agressif (fuzzing_index normalisé > 0.35 ≈ valeur brute > 100)
+    return hulls
    if fuzz > 0.35:
        return "🤖 Bot Fuzzer / Scanner"
    # UA rotatif + UA-CH mismatch : bot sophistiqué simulant un navigateur
    if ua_rot > 0.5 and ua_ch > 0.7:
        return "🤖 Bot UA Rotatif + CH Mismatch"
    # UA-CH mismatch fort seul (navigateur simulé sans headers CH)
    if ua_ch > 0.8:
        return "⚠️ Bot UA-CH Incohérent"
    # ── Score ML modéré + signal comportemental ──────────────────────────
    if score > 0.20:
        if hless > 0.3:
            return "⚠️ Navigateur Headless Suspect"
        if vel > 0.25:
            return "⚠️ Bot Haute Vélocité"
        if post > 0.4:
            return "⚠️ Bot POST Automatisé"
        if alpn > 0.5 or h2 > 0.5:
            return "⚠️ TLS/H2 Anormal"
        if ua_ch > 0.4:
            return "⚠️ Anomalie UA-CH"
        return "⚠️ Anomalie ML Modérée"
    # ── Signaux faibles ───────────────────────────────────────────────────
    if ua_ch > 0.4:
        return "🔎 UA-CH Incohérent"
    if ja4d > 0.5:
        return "🔄 Client Multi-Fingerprint"
    # ── Classification réseau / OS ────────────────────────────────────────
    # MSS bas → VPN ou tunnel
    if raw_mss and raw_mss < 1360:
        return "🌐 VPN / Tunnel"
    if raw_ttl < 70:
        return "🐧 Linux / Mobile"
    if raw_ttl > 110:
        return "🪟 Windows"
    return "✅ Trafic Légitime"
-def risk_score_from_centroid(centroid: list[float]) -> float:
+# ─── Nommage et scoring ───────────────────────────────────────────────────────
    """Score de risque [0,1] pondéré. Calibré pour les valeurs observées (score ML ~0.3)."""
    # Normalisation de score ML : x / 0.5 pour étendre la plage utile (0-0.5 → 0-1)
    score_n  = min(1.0, centroid[4] / 0.5)
    fuzz_n   = centroid[6]
    ua_ch_n  = centroid[16]
    ua_rot_n = centroid[20]
    vel_n    = centroid[5]
    hless_n  = centroid[7]
    ip_id_n  = centroid[9]
    alpn_n   = centroid[12]
    ja4d_n   = centroid[19]
    post_n   = centroid[8]
-    return min(1.0,
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """Return a human-readable label for a cluster from its dominant features.

    Rules are evaluated in order and the first match wins, so the strongest
    signals (scanner / bot patterns) come first and network / OS hints last.

    Args:
        centroid: normalised feature vector of the cluster. Indices read here:
            0 ttl, 1 mss, 2 scale, 3 win, 4 ML anomaly score, 5 velocity,
            6 fuzzing, 7 headless, 16 UA-CH mismatch.
        raw_stats: un-normalised cluster statistics; the keys ``"mean_ttl"``
            and ``"mean_mss"`` are read. Missing or ``None`` values count as 0.

    Returns:
        The label string (emoji prefix + short description).
    """
    s = centroid  # short alias
    stats = raw_stats or {}
    # `or 0` also covers an explicit None (e.g. SQL NULL), which float() rejects.
    ttl_raw = float(stats.get("mean_ttl") or 0)
    mss_raw = float(stats.get("mean_mss") or 0)

    # Masscan-like scanners. The MSS is a mean, hence a float: compare it
    # against an inclusive interval instead of testing membership in range().
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw <= 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    if s[4] > 0.70 and s[6] > 0.5:
        return "🤖 Bot agressif"
    if s[16] > 0.80:
        return "🤖 UA-CH Mismatch"
    if s[7] > 0.70:
        return "🤖 Headless Browser"
    if s[4] > 0.50:
        return "⚠️ Anomalie ML haute"
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    # Normalised MSS strictly between 0.90 and 0.95 (the original bounds were
    # inverted, which made this branch unreachable).
    if 0.90 < s[1] < 0.95:
        return "📡 VPN/Proxy"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.60:
        return "⚡ Trafic rapide"
    if s[4] < 0.10 and s[5] < 0.10:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
 def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """Collapse a cluster centroid into one risk score clamped to [0, 1]."""
    # (feature index, weight), listed in the order the terms are accumulated.
    weighted_features = (
        (4, 0.40),    # ML anomaly score
        (6, 0.15),    # fuzzing
        (16, 0.15),   # UA-CH mismatch
        (7, 0.10),    # headless
        (5, 0.10),    # velocity
        (9, 0.10),    # IP-ID zero
    )
    terms = [weight * centroid[idx] for idx, weight in weighted_features]
    raw = terms[0]
    for term in terms[1:]:
        raw = raw + term
    return float(np.clip(raw, 0.0, 1.0))
--- a/frontend/package.json
+++ b/frontend/package.json
@ -15,7 +15,10 @@
    "recharts": "^2.10.0",
    "@tanstack/react-table": "^8.11.0",
    "date-fns": "^3.0.0",
-    "reactflow": "^11.10.0"
+    "reactflow": "^11.10.0",
    "@deck.gl/react": "^9.0.0",
    "@deck.gl/core": "^9.0.0",
    "@deck.gl/layers": "^9.0.0"
  },
  "devDependencies": {
    "@types/react": "^18.2.0",
--- a/frontend/src/components/ClusteringView.tsx
+++ b/frontend/src/components/ClusteringView.tsx
--- a/requirements.txt
+++ b/requirements.txt
@ -5,3 +5,5 @@ pydantic==2.5.0
 pydantic-settings==2.1.0
 python-dotenv==1.0.0
 httpx==0.26.0
 numpy>=1.26
 scipy>=1.11