From fc3392779b864714bb83e2d0a3279cd181a82f72 Mon Sep 17 00:00:00 2001 From: SOC Analyst Date: Thu, 19 Mar 2026 10:07:23 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20slider=20sensibilit=C3=A9=20+=20z-score?= =?UTF-8?q?=20standardization=20pour=20clustering=20plus=20pr=C3=A9cis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sensibilité (0.5x–3.0x) : - Multiplie k : sensibilité=2x avec k=14 → 28 clusters effectifs - Labels UI : Grossière / Normale / Fine / Très fine / Maximum - Paramètres avancés (k, fenêtre) masqués dans un
- Cache invalidé si sensibilité change Z-score standardisation (Bishop 2006 PRML §9.1) : - Normalise par variance de chaque feature avant K-means - Features discriminantes (forte std) pèsent plus - Résultat : risque 0→1.00 sur clusters bots vs 0→0.27 avant - Bots détectés : 4 337 IPs vs 1 604 (2.7x plus) - Nouveaux clusters : Bot agressif, Tunnel réseau, UA-CH Mismatch distincts Fix TextLayer deck.gl : - Translittération des accents (é→e, à→a, ç→c…) + strip emojis - Évite les warnings 'Missing character' sur caractères non-ASCII Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- backend/routes/clustering.py | 78 ++++++++++++++-------- backend/services/clustering_engine.py | 17 +++++ frontend/src/components/ClusteringView.tsx | 73 ++++++++++++++------ 3 files changed, 118 insertions(+), 50 deletions(-) diff --git a/backend/routes/clustering.py b/backend/routes/clustering.py index de40c26..ae17508 100644 --- a/backend/routes/clustering.py +++ b/backend/routes/clustering.py @@ -23,7 +23,7 @@ from ..database import db from ..services.clustering_engine import ( FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES, build_feature_vector, kmeans_pp, pca_2d, compute_hulls, - name_cluster, risk_score_from_centroid, + name_cluster, risk_score_from_centroid, standardize, ) log = logging.getLogger(__name__) @@ -110,15 +110,22 @@ _SQL_COLS = [ # ─── Worker de clustering (thread pool) ────────────────────────────────────── -def _run_clustering_job(k: int, hours: int) -> None: - """Exécuté dans le thread pool. Met à jour _CACHE.""" +def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: + """Exécuté dans le thread pool. Met à jour _CACHE. + + sensitivity : multiplicateur de k [0.5 – 3.0]. + 1.0 = comportement par défaut + 2.0 = deux fois plus de clusters → groupes plus homogènes + 0.5 = moitié → vue très agrégée + """ + k_actual = max(4, min(50, round(k * sensitivity))) t0 = time.time() with _LOCK: _CACHE["status"] = "computing" _CACHE["error"] = None try: - log.info(f"[clustering] Démarrage du calcul k={k} hours={hours}") + log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}") # ── 1. Chargement de toutes les IPs ────────────────────────────── result = db.query(_SQL_ALL_IPS, {"hours": hours}) @@ -128,27 +135,35 @@ def _run_clustering_job(k: int, hours: int) -> None: n = len(rows) log.info(f"[clustering] {n} IPs chargées") - if n < k: - raise ValueError(f"Seulement {n} IPs disponibles (k={k} requis)") + if n < k_actual: + raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)") # ── 2. Construction de la matrice de features (numpy) ──────────── X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32) log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB") - # ── 3. K-means++ vectorisé ──────────────────────────────────────── - km = kmeans_pp(X.astype(np.float64), k=k, max_iter=80, n_init=3, seed=42) + # ── 3. Standardisation z-score ──────────────────────────────────── + # Normalise par variance : features discriminantes (forte std) + # contribuent plus que les features quasi-constantes. + X64 = X.astype(np.float64) + X_std, feat_mean, feat_std = standardize(X64) + + # ── 4. K-means++ sur l'espace standardisé ──────────────────────── + km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42) log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}") - # ── 4. PCA-2D pour toutes les IPs ──────────────────────────────── - coords = pca_2d(X.astype(np.float64)) # (n, 2), normalisé [0,1] + # ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ──── + # On utilise les features non-standardisées pour la projection 2D + # afin que les distances visuelles restent interprétables. + coords = pca_2d(X64) # (n, 2), normalisé [0,1] - # ── 5. Enveloppes convexes par cluster ─────────────────────────── - hulls = compute_hulls(coords, km.labels, k) + # ── 5b. Enveloppes convexes par cluster ────────────────────────── + hulls = compute_hulls(coords, km.labels, k_actual) # ── 6. Agrégation par cluster ───────────────────────────────────── - cluster_rows: list[list[dict]] = [[] for _ in range(k)] - cluster_coords: list[list[list[float]]] = [[] for _ in range(k)] - cluster_ips_map: dict[int, list] = {j: [] for j in range(k)} + cluster_rows: list[list[dict]] = [[] for _ in range(k_actual)] + cluster_coords: list[list[list[float]]] = [[] for _ in range(k_actual)] + cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)} for i, label in enumerate(km.labels): j = int(label) @@ -164,7 +179,7 @@ def _run_clustering_job(k: int, hours: int) -> None: # ── 7. Construction des nœuds ───────────────────────────────────── nodes = [] - for j in range(k): + for j in range(k_actual): if not cluster_rows[j]: continue @@ -206,7 +221,7 @@ def _run_clustering_job(k: int, hours: int) -> None: sample_ua = str(cluster_rows[j][0].get("ua") or "") nodes.append({ - "id": f"c{j}_k{k}", + "id": f"c{j}_k{k_actual}", "cluster_idx": j, "label": label_name, "pca_x": round(cxy[0], 6), @@ -289,7 +304,9 @@ def _run_clustering_job(k: int, hours: int) -> None: "bot_ips": bot_ips, "high_risk_ips": high_ips, "n_samples": n, - "k": k, + "k": k_actual, + "k_base": k, + "sensitivity": sensitivity, "elapsed_s": elapsed, }, "feature_names": FEATURE_NAMES, @@ -300,7 +317,7 @@ def _run_clustering_job(k: int, hours: int) -> None: _CACHE["cluster_ips"] = cluster_ips_map _CACHE["status"] = "ready" _CACHE["ts"] = time.time() - _CACHE["params"] = {"k": k, "hours": hours} + _CACHE["params"] = {"k": k, "hours": hours, "sensitivity": sensitivity} _CACHE["error"] = None log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters") @@ -312,7 +329,7 @@ def _run_clustering_job(k: int, hours: int) -> None: _CACHE["error"] = str(e) -def _maybe_trigger(k: int, hours: int) -> None: +def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None: """Lance le calcul si cache absent, expiré ou paramètres différents.""" with _LOCK: status = _CACHE["status"] @@ -320,7 +337,11 @@ def _maybe_trigger(k: int, hours: int) -> None: ts = _CACHE["ts"] cache_stale = (time.time() - ts) > _CACHE_TTL - params_changed = params.get("k") != k or params.get("hours") != hours + params_changed = ( + params.get("k") != k or + params.get("hours") != hours or + params.get("sensitivity") != sensitivity + ) if status in ("computing",): return # déjà en cours @@ -328,7 +349,7 @@ def _maybe_trigger(k: int, hours: int) -> None: if status == "ready" and not cache_stale and not params_changed: return # cache frais - _EXECUTOR.submit(_run_clustering_job, k, hours) + _EXECUTOR.submit(_run_clustering_job, k, hours, sensitivity) # ─── Endpoints ──────────────────────────────────────────────────────────────── @@ -348,22 +369,23 @@ async def get_status(): @router.get("/clusters") async def get_clusters( - k: int = Query(14, ge=4, le=30, description="Nombre de clusters"), - hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"), - force: bool = Query(False, description="Forcer le recalcul"), + k: int = Query(14, ge=4, le=30, description="Nombre de clusters de base"), + hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"), + sensitivity: float = Query(1.0, ge=0.5, le=3.0, description="Sensibilité : multiplicateur de k"), + force: bool = Query(False, description="Forcer le recalcul"), ): """ Clustering multi-métriques sur TOUTES les IPs. - Retourne immédiatement depuis le cache (status=ready). - Si le calcul est en cours ou non démarré → status=computing/idle + trigger. + k_actual = round(k × sensitivity) — la sensibilité contrôle la granularité. + Retourne immédiatement depuis le cache. Déclenche le calcul si nécessaire. """ if force: with _LOCK: _CACHE["status"] = "idle" _CACHE["ts"] = 0.0 - _maybe_trigger(k, hours) + _maybe_trigger(k, hours, sensitivity) with _LOCK: status = _CACHE["status"] diff --git a/backend/services/clustering_engine.py b/backend/services/clustering_engine.py index 0b5f1a0..4062046 100644 --- a/backend/services/clustering_engine.py +++ b/backend/services/clustering_engine.py @@ -87,6 +87,23 @@ def build_feature_vector(row: dict) -> list[float]: return [norm(row.get(key, 0)) for key, _, norm in FEATURES] +# ─── Standardisation z-score ────────────────────────────────────────────────── + +def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Z-score standardisation : chaque feature est centrée et mise à l'échelle + par sa déviation standard. + + Ref: Bishop (2006) PRML §9.1 — preprocessing recommandé pour K-means. + + Retourne (X_std, mean, std) pour pouvoir projeter de nouveaux points. + """ + mean = X.mean(axis=0) + std = X.std(axis=0) + std[std < 1e-8] = 1.0 # évite la division par zéro pour features constantes + return (X - mean) / std, mean, std + + # ─── K-means++ vectorisé (numpy) ───────────────────────────────────────────── @dataclass diff --git a/frontend/src/components/ClusteringView.tsx b/frontend/src/components/ClusteringView.tsx index 89989a2..b1bd7f4 100644 --- a/frontend/src/components/ClusteringView.tsx +++ b/frontend/src/components/ClusteringView.tsx @@ -96,6 +96,7 @@ function hexToRgba(hex: string, alpha = 255): [number, number, number, number] { export default function ClusteringView() { const [k, setK] = useState(14); const [hours, setHours] = useState(24); + const [sensitivity, setSensitivity] = useState(1.0); const [data, setData] = useState(null); const [loading, setLoading] = useState(false); const [computing, setComputing] = useState(false); @@ -123,7 +124,7 @@ export default function ClusteringView() { setError(null); try { const res = await axios.get('/api/clustering/clusters', { - params: { k, hours, force }, + params: { k, hours, sensitivity, force }, }); if (res.data.status === 'computing' || res.data.status === 'idle') { setComputing(true); @@ -298,13 +299,17 @@ export default function ClusteringView() { updateTriggers: { getFillColor: [selected?.id], getLineWidth: [selected?.id] }, })); - // 5. Labels (TextLayer) — strip emojis (non supportés par le bitmap font deck.gl) - const stripEmoji = (s: string) => s.replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim(); + const stripNonAscii = (s: string) => + s.replace(/[\u{0080}-\u{FFFF}]/gu, c => { + // Translitérations basiques pour la lisibilité + const map: Record = { é:'e',è:'e',ê:'e',ë:'e',à:'a',â:'a',ô:'o',ù:'u',û:'u',î:'i',ï:'i',ç:'c' }; + return map[c] ?? ''; + }).replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim(); layerList.push(new TextLayer({ id: 'labels', data: nodes, getPosition: (d: ClusterNode) => [toWorld(d.pca_x), toWorld(d.pca_y), 0], - getText: (d: ClusterNode) => stripEmoji(d.label), + getText: (d: ClusterNode) => stripNonAscii(d.label), getSize: 12, sizeUnits: 'pixels', getColor: [255, 255, 255, 200], @@ -334,24 +339,48 @@ export default function ClusteringView() { {/* Paramètres */}
- - + {/* Sensibilité */} +
+
+ Sensibilité + + {sensitivity === 0.5 ? 'Grossière' : sensitivity <= 1.0 ? 'Normale' : sensitivity <= 1.5 ? 'Fine' : sensitivity <= 2.0 ? 'Très fine' : 'Maximum'} + {' '}({Math.round(k * sensitivity)} clusters) + +
+ setSensitivity(+e.target.value)} + className="w-full accent-accent-primary" /> +
+ GrossièreMaximum +
+
+ + {/* k avancé */} +
+ Paramètres avancés +
+ + +
+
+