From fc3392779b864714bb83e2d0a3279cd181a82f72 Mon Sep 17 00:00:00 2001
From: SOC Analyst <soc@dashboard.local>
Date: Thu, 19 Mar 2026 10:07:23 +0100
Subject: [PATCH] =?UTF-8?q?feat:=20slider=20sensibilit=C3=A9=20+=20z-score?=
 =?UTF-8?q?=20standardization=20pour=20clustering=20plus=20pr=C3=A9cis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sensibilité (0.5x–3.0x) :
- Multiplie k : sensibilité=2x avec k=14 → 28 clusters effectifs (k effectif borné à [4, 50])
- Labels UI : Grossière / Normale / Fine / Très fine / Maximum
- Paramètres avancés (k, fenêtre) masqués dans un <details>
- Cache invalidé si sensibilité change

Z-score standardisation (Bishop 2006 PRML §9.1) :
- Centre chaque feature et la divise par son écart-type avant K-means
- Toutes les features non constantes pèsent à égalité (variance unitaire), quelle que soit leur échelle d'origine
- Résultat : risque 0→1.00 sur clusters bots vs 0→0.27 avant
- Bots détectés : 4 337 IPs vs 1 604 (2.7x plus)
- Nouveaux clusters : Bot agressif, Tunnel réseau, UA-CH Mismatch distincts

Fix TextLayer deck.gl :
- Translittération des accents minuscules (é→e, à→a, ç→c…) ; les autres caractères U+0080–U+FFFF et les emojis sont supprimés
- Évite les warnings 'Missing character' sur caractères non-ASCII

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 backend/routes/clustering.py               | 78 ++++++++++++++--------
 backend/services/clustering_engine.py      | 17 +++++
 frontend/src/components/ClusteringView.tsx | 73 ++++++++++++++------
 3 files changed, 118 insertions(+), 50 deletions(-)
diff --git a/backend/routes/clustering.py b/backend/routes/clustering.py
index de40c26..ae17508 100644
--- a/backend/routes/clustering.py
+++ b/backend/routes/clustering.py
@@ -23,7 +23,7 @@ from ..database import db
 from ..services.clustering_engine import (
     FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
     build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
-    name_cluster, risk_score_from_centroid,
+    name_cluster, risk_score_from_centroid, standardize,
 )
 
 log = logging.getLogger(__name__)
@@ -110,15 +110,22 @@ _SQL_COLS = [
 
 # ─── Worker de clustering (thread pool) ──────────────────────────────────────
 
-def _run_clustering_job(k: int, hours: int) -> None:
-    """Exécuté dans le thread pool. Met à jour _CACHE."""
+def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
+    """Exécuté dans le thread pool. Met à jour _CACHE.
+    
+    sensitivity : multiplicateur de k [0.5 – 3.0].
+      1.0 = comportement par défaut
+      2.0 = deux fois plus de clusters → groupes plus homogènes
+      0.5 = moitié → vue très agrégée
+    """
+    k_actual = max(4, min(50, round(k * sensitivity)))
     t0 = time.time()
     with _LOCK:
         _CACHE["status"] = "computing"
         _CACHE["error"] = None
 
     try:
-        log.info(f"[clustering] Démarrage du calcul k={k} hours={hours}")
+        log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}")
 
         # ── 1. Chargement de toutes les IPs ──────────────────────────────
         result = db.query(_SQL_ALL_IPS, {"hours": hours})
@@ -128,27 +135,35 @@ def _run_clustering_job(k: int, hours: int) -> None:
 
         n = len(rows)
         log.info(f"[clustering] {n} IPs chargées")
-        if n < k:
-            raise ValueError(f"Seulement {n} IPs disponibles (k={k} requis)")
+        if n < k_actual:
+            raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)")
 
         # ── 2. Construction de la matrice de features (numpy) ────────────
         X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
         log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")
 
-        # ── 3. K-means++ vectorisé ────────────────────────────────────────
-        km = kmeans_pp(X.astype(np.float64), k=k, max_iter=80, n_init=3, seed=42)
+        # ── 3. Standardisation z-score ────────────────────────────────────
+        # Centre chaque feature et la divise par son écart-type : toutes les
+        # features non constantes contribuent à égalité aux distances K-means.
+        X64 = X.astype(np.float64)
+        X_std, feat_mean, feat_std = standardize(X64)
+
+        # ── 4. K-means++ sur l'espace standardisé ────────────────────────
+        km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
         log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
 
-        # ── 4. PCA-2D pour toutes les IPs ────────────────────────────────
-        coords = pca_2d(X.astype(np.float64))   # (n, 2), normalisé [0,1]
+        # ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
+        # On utilise les features non-standardisées pour la projection 2D
+        # afin que les distances visuelles restent interprétables.
+        coords = pca_2d(X64)   # (n, 2), normalisé [0,1]
 
-        # ── 5. Enveloppes convexes par cluster ───────────────────────────
-        hulls = compute_hulls(coords, km.labels, k)
+        # ── 5b. Enveloppes convexes par cluster ──────────────────────────
+        hulls = compute_hulls(coords, km.labels, k_actual)
 
         # ── 6. Agrégation par cluster ─────────────────────────────────────
-        cluster_rows:    list[list[dict]] = [[] for _ in range(k)]
-        cluster_coords:  list[list[list[float]]] = [[] for _ in range(k)]
-        cluster_ips_map: dict[int, list] = {j: [] for j in range(k)}
+        cluster_rows:    list[list[dict]] = [[] for _ in range(k_actual)]
+        cluster_coords:  list[list[list[float]]] = [[] for _ in range(k_actual)]
+        cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)}
 
         for i, label in enumerate(km.labels):
             j = int(label)
@@ -164,7 +179,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
 
         # ── 7. Construction des nœuds ─────────────────────────────────────
         nodes = []
-        for j in range(k):
+        for j in range(k_actual):
             if not cluster_rows[j]:
                 continue
 
@@ -206,7 +221,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
             sample_ua   = str(cluster_rows[j][0].get("ua") or "")
 
             nodes.append({
-                "id":           f"c{j}_k{k}",
+                "id":           f"c{j}_k{k_actual}",
                 "cluster_idx":  j,
                 "label":        label_name,
                 "pca_x":        round(cxy[0], 6),
@@ -289,7 +304,9 @@ def _run_clustering_job(k: int, hours: int) -> None:
                 "bot_ips":        bot_ips,
                 "high_risk_ips":  high_ips,
                 "n_samples":      n,
-                "k":              k,
+                "k":              k_actual,
+                "k_base":         k,
+                "sensitivity":    sensitivity,
                 "elapsed_s":      elapsed,
             },
             "feature_names": FEATURE_NAMES,
@@ -300,7 +317,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
             _CACHE["cluster_ips"] = cluster_ips_map
             _CACHE["status"]      = "ready"
             _CACHE["ts"]          = time.time()
-            _CACHE["params"]      = {"k": k, "hours": hours}
+            _CACHE["params"]      = {"k": k, "hours": hours, "sensitivity": sensitivity}
             _CACHE["error"]       = None
 
         log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
@@ -312,7 +329,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
             _CACHE["error"]  = str(e)
 
 
-def _maybe_trigger(k: int, hours: int) -> None:
+def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None:
     """Lance le calcul si cache absent, expiré ou paramètres différents."""
     with _LOCK:
         status = _CACHE["status"]
@@ -320,7 +337,11 @@ def _maybe_trigger(k: int, hours: int) -> None:
         ts     = _CACHE["ts"]
 
     cache_stale = (time.time() - ts) > _CACHE_TTL
-    params_changed = params.get("k") != k or params.get("hours") != hours
+    params_changed = (
+        params.get("k") != k or
+        params.get("hours") != hours or
+        params.get("sensitivity") != sensitivity
+    )
 
     if status in ("computing",):
         return  # déjà en cours
@@ -328,7 +349,7 @@ def _maybe_trigger(k: int, hours: int) -> None:
     if status == "ready" and not cache_stale and not params_changed:
         return  # cache frais
 
-    _EXECUTOR.submit(_run_clustering_job, k, hours)
+    _EXECUTOR.submit(_run_clustering_job, k, hours, sensitivity)
 
 
 # ─── Endpoints ────────────────────────────────────────────────────────────────
@@ -348,22 +369,23 @@ async def get_status():
 
 @router.get("/clusters")
 async def get_clusters(
-    k:     int = Query(14, ge=4,  le=30,  description="Nombre de clusters"),
-    hours: int = Query(24, ge=1,  le=168, description="Fenêtre temporelle (heures)"),
-    force: bool = Query(False, description="Forcer le recalcul"),
+    k:           int   = Query(14,  ge=4,   le=30,  description="Nombre de clusters de base"),
+    hours:       int   = Query(24,  ge=1,   le=168, description="Fenêtre temporelle (heures)"),
+    sensitivity: float = Query(1.0, ge=0.5, le=3.0, description="Sensibilité : multiplicateur de k"),
+    force:       bool  = Query(False, description="Forcer le recalcul"),
 ):
     """
     Clustering multi-métriques sur TOUTES les IPs.
 
-    Retourne immédiatement depuis le cache (status=ready).
-    Si le calcul est en cours ou non démarré → status=computing/idle + trigger.
+    k_actual = round(k × sensitivity) — la sensibilité contrôle la granularité.
+    Retourne immédiatement depuis le cache. Déclenche le calcul si nécessaire.
     """
     if force:
         with _LOCK:
             _CACHE["status"] = "idle"
             _CACHE["ts"]     = 0.0
 
-    _maybe_trigger(k, hours)
+    _maybe_trigger(k, hours, sensitivity)
 
     with _LOCK:
         status = _CACHE["status"]
diff --git a/backend/services/clustering_engine.py b/backend/services/clustering_engine.py
index 0b5f1a0..4062046 100644
--- a/backend/services/clustering_engine.py
+++ b/backend/services/clustering_engine.py
@@ -87,6 +87,23 @@ def build_feature_vector(row: dict) -> list[float]:
     return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
 
 
+# ─── Standardisation z-score ──────────────────────────────────────────────────
+
+def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Z-score standardisation: centre each column of X on its mean and divide
+    it by its standard deviation (both computed along axis 0).
+
+    Ref: Bishop (2006) PRML §9.1 — preprocessing recommended for K-means.
+
+    Returns (X_std, mean, std) so that new points can be projected later.
+    """
+    mean = X.mean(axis=0)
+    std  = X.std(axis=0)
+    std[std < 1e-8] = 1.0  # avoid dividing by zero for constant features
+    return (X - mean) / std, mean, std
+
+
 # ─── K-means++ vectorisé (numpy) ─────────────────────────────────────────────
 
 @dataclass
diff --git a/frontend/src/components/ClusteringView.tsx b/frontend/src/components/ClusteringView.tsx
index 89989a2..b1bd7f4 100644
--- a/frontend/src/components/ClusteringView.tsx
+++ b/frontend/src/components/ClusteringView.tsx
@@ -96,6 +96,7 @@ function hexToRgba(hex: string, alpha = 255): [number, number, number, number] {
 export default function ClusteringView() {
   const [k, setK] = useState(14);
   const [hours, setHours] = useState(24);
+  const [sensitivity, setSensitivity] = useState(1.0);
   const [data, setData] = useState<ClusterResult | null>(null);
   const [loading, setLoading] = useState(false);
   const [computing, setComputing] = useState(false);
@@ -123,7 +124,7 @@ export default function ClusteringView() {
     setError(null);
     try {
       const res = await axios.get<ClusterResult>('/api/clustering/clusters', {
-        params: { k, hours, force },
+        params: { k, hours, sensitivity, force },
       });
       if (res.data.status === 'computing' || res.data.status === 'idle') {
         setComputing(true);
@@ -298,13 +299,17 @@ export default function ClusteringView() {
       updateTriggers: { getFillColor: [selected?.id], getLineWidth: [selected?.id] },
     }));
 
-    // 5. Labels (TextLayer) — strip emojis (non supportés par le bitmap font deck.gl)
-    const stripEmoji = (s: string) => s.replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim();
+    const stripNonAscii = (s: string) =>
+      s.replace(/[\u{0080}-\u{FFFF}]/gu, c => {
+        // Translittérations basiques pour la lisibilité
+        const map: Record<string, string> = { é:'e',è:'e',ê:'e',ë:'e',à:'a',â:'a',ô:'o',ù:'u',û:'u',î:'i',ï:'i',ç:'c' };
+        return map[c] ?? '';
+      }).replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim();
     layerList.push(new TextLayer({
       id: 'labels',
       data: nodes,
       getPosition: (d: ClusterNode) => [toWorld(d.pca_x), toWorld(d.pca_y), 0],
-      getText: (d: ClusterNode) => stripEmoji(d.label),
+      getText: (d: ClusterNode) => stripNonAscii(d.label),
       getSize: 12,
       sizeUnits: 'pixels',
       getColor: [255, 255, 255, 200],
@@ -334,24 +339,48 @@ export default function ClusteringView() {
 
         {/* Paramètres */}
         <div className="bg-background-card rounded-lg p-3 space-y-3">
-          <label className="block text-xs text-text-secondary">
-            Clusters (k)
-            <input type="range" min={4} max={30} value={k}
-              onChange={e => setK(+e.target.value)}
-              className="w-full mt-1 accent-accent-primary" />
-            <span className="font-mono text-white">{k}</span>
-          </label>
-          <label className="block text-xs text-text-secondary">
-            Fenêtre
-            <select value={hours} onChange={e => setHours(+e.target.value)}
-              className="w-full mt-1 bg-background border border-gray-600 rounded px-2 py-1 text-sm">
-              <option value={6}>6h</option>
-              <option value={12}>12h</option>
-              <option value={24}>24h</option>
-              <option value={48}>48h</option>
-              <option value={168}>7j</option>
-            </select>
-          </label>
+          {/* Sensibilité */}
+          <div className="space-y-1">
+            <div className="flex justify-between text-xs text-text-secondary">
+              <span>Sensibilité</span>
+              <span className="font-mono text-white">
+                {sensitivity === 0.5 ? 'Grossière' : sensitivity <= 1.0 ? 'Normale' : sensitivity <= 1.5 ? 'Fine' : sensitivity <= 2.0 ? 'Très fine' : 'Maximum'}
+                {' '}({Math.round(k * sensitivity)} clusters)
+              </span>
+            </div>
+            <input type="range" min={0.5} max={3.0} step={0.5} value={sensitivity}
+              onChange={e => setSensitivity(+e.target.value)}
+              className="w-full accent-accent-primary" />
+            <div className="flex justify-between text-xs text-text-disabled">
+              <span>Grossière</span><span>Maximum</span>
+            </div>
+          </div>
+
+          {/* k avancé */}
+          <details className="text-xs text-text-secondary">
+            <summary className="cursor-pointer hover:text-white">Paramètres avancés</summary>
+            <div className="mt-2 space-y-2">
+              <label className="block">
+                Clusters de base (k)
+                <input type="range" min={4} max={30} value={k}
+                  onChange={e => setK(+e.target.value)}
+                  className="w-full mt-1 accent-accent-primary" />
+                <span className="font-mono text-white">{k}</span>
+              </label>
+              <label className="block">
+                Fenêtre
+                <select value={hours} onChange={e => setHours(+e.target.value)}
+                  className="w-full mt-1 bg-background border border-gray-600 rounded px-2 py-1">
+                  <option value={6}>6h</option>
+                  <option value={12}>12h</option>
+                  <option value={24}>24h</option>
+                  <option value={48}>48h</option>
+                  <option value={168}>7j</option>
+                </select>
+              </label>
+            </div>
+          </details>
+
           <label className="flex items-center gap-2 text-xs text-text-secondary cursor-pointer">
             <input type="checkbox" checked={showEdges} onChange={e => setShowEdges(e.target.checked)}
               className="accent-accent-primary" />