From 08d003a0505428282d23bf36c416faaed2cd5faf Mon Sep 17 00:00:00 2001 From: SOC Analyst Date: Thu, 19 Mar 2026 14:01:14 +0100 Subject: [PATCH] =?UTF-8?q?feat(clustering):=20palette=20diversifi=C3=A9e,?= =?UTF-8?q?=20suppression=20scores=20anomalie/robot,=20visualisation=20?= =?UTF-8?q?=C3=A9clat=C3=A9e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Suppression de 'Score Anomalie' (avg_score) des 31→30 features de clustering - Suppression de 'Score de détection robot' (mean_score) de la sidebar et de l'API - Suppression de bot_ips / high_risk_ips des stats (métriques dérivées des scores supprimés) - Redistribution des poids dans risk_score_from_centroid: UA-CH mismatch +17%, fuzzing +14%, headless +10%, vélocité +9%, ip_id_zero +7% - Mise à jour des indices feature dans name_cluster et risk_score_from_centroid - Palette 24 couleurs spectrales (cluster_color) → bleu/violet/rose/teal/amber/cyan/lime... Les couleurs identifient les clusters, non leur niveau de risque - Remplacement de la légende CRITICAL/HIGH/MEDIUM/LOW par la liste des clusters actifs - Ajout de spread_clusters(): répulsion itérative des centroïdes trop proches (60 iter) min_dist=0.16 → les clusters se repoussent mutuellement → visualisation plus lisible - Interface TypeScript mise à jour (suppression mean_score, bot_ips, high_risk_ips) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- backend/routes/clustering.py | 30 ++-- backend/services/clustering_engine.py | 189 +++++++++++++++------ frontend/src/components/ClusteringView.tsx | 27 ++- 3 files changed, 160 insertions(+), 86 deletions(-) diff --git a/backend/routes/clustering.py b/backend/routes/clustering.py index 1a305ab..35b9969 100644 --- a/backend/routes/clustering.py +++ b/backend/routes/clustering.py @@ -24,6 +24,7 @@ from ..services.clustering_engine import ( FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES, build_feature_vector, kmeans_pp, pca_2d, compute_hulls, 
name_cluster, risk_score_from_centroid, standardize, + cluster_color, spread_clusters, ) log = logging.getLogger(__name__) @@ -42,19 +43,9 @@ _CACHE_TTL = 1800 # 30 minutes _LOCK = threading.Lock() _EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering") -# ─── Couleurs menace ────────────────────────────────────────────────────────── -_THREAT_COLOR = { - 0.70: "#dc2626", # Critique - 0.45: "#f97316", # Élevé - 0.25: "#eab308", # Modéré - 0.00: "#22c55e", # Sain -} - -def _risk_to_color(risk: float) -> str: - for threshold, color in sorted(_THREAT_COLOR.items(), reverse=True): - if risk >= threshold: - return color - return "#6b7280" +# ─── Palette de couleurs (remplace l'ancienne logique menace) ───────────────── +# Les couleurs sont désormais attribuées par index de cluster pour maximiser +# la distinction visuelle, indépendamment du niveau de risque. # ─── SQL : TOUTES les IPs sans LIMIT ───────────────────────────────────────── @@ -205,7 +196,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: # ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ──── coords = pca_2d(X64) # (n, 2), normalisé [0,1] - # ── 5b. Enveloppes convexes par cluster ────────────────────────── + # ── 5b. Dispersion — repousse les clusters trop proches ────────── + coords = spread_clusters(coords, km.labels, k_actual, + n_iter=60, min_dist=0.16) + + # ── 5c. Enveloppes convexes par cluster ────────────────────────── hulls = compute_hulls(coords, km.labels, k_actual) # ── 6. 
Agrégation par cluster ───────────────────────────────────── @@ -242,7 +237,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale} label_name = name_cluster(centroids_orig[j], raw_stats) risk = float(risk_score_from_centroid(centroids_orig[j])) - color = _risk_to_color(risk) + color = cluster_color(j) # Centroïde 2D = moyenne des coords du cluster cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5] @@ -282,7 +277,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: "mean_mss": round(mean_mss, 0), "mean_scale": round(mean_scale, 1), "mean_win": round(mean_win, 0), - "mean_score": round(avg_f("avg_score"), 4), "mean_velocity":round(avg_f("avg_velocity"),3), "mean_fuzzing": round(avg_f("avg_fuzzing"), 3), "mean_headless":round(avg_f("pct_headless"),3), @@ -338,8 +332,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: # ── 9. 
Stockage résultat + cache IPs ───────────────────────────── total_ips = sum(n_["ip_count"] for n_ in nodes) total_hits = sum(n_["hit_count"] for n_ in nodes) - bot_ips = sum(n_["ip_count"] for n_ in nodes if n_["risk_score"] > 0.45 or "🤖" in n_["label"]) - high_ips = sum(n_["ip_count"] for n_ in nodes if n_["risk_score"] > 0.25) elapsed = round(time.time() - t0, 2) result_dict = { @@ -349,8 +341,6 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: "total_clusters": len(nodes), "total_ips": total_ips, "total_hits": total_hits, - "bot_ips": bot_ips, - "high_risk_ips": high_ips, "n_samples": n, "k": k_actual, "k_base": k, diff --git a/backend/services/clustering_engine.py b/backend/services/clustering_engine.py index 2591c74..2d20ea8 100644 --- a/backend/services/clustering_engine.py +++ b/backend/services/clustering_engine.py @@ -127,9 +127,7 @@ FEATURES: list[tuple[str, str, object]] = [ ("scale", "Scale TCP", lambda v: min(1.0, (v or 0) / 14.0)), ("win", "Fenêtre TCP", lambda v: min(1.0, (v or 0) / 65535.0)), # Anomalie ML - ("avg_score", "Score Anomalie", lambda v: min(1.0, float(v or 0))), - ("avg_velocity", "Vélocité (rps)", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))), - ("avg_fuzzing", "Fuzzing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))), + ("avg_velocity", "Vélocité (rps)", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))), ("avg_fuzzing", "Fuzzing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))), ("pct_headless", "Headless", lambda v: min(1.0, float(v or 0))), ("avg_post", "Ratio POST", lambda v: min(1.0, float(v or 0))), # IP-ID @@ -353,51 +351,48 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str: n = len(s) ttl_raw = float(raw_stats.get("mean_ttl", 0)) mss_raw = float(raw_stats.get("mean_mss", 0)) - country_risk_v = s[21] if n > 21 else 0.0 - asn_cloud = s[22] if n > 22 else 0.0 - accept_lang = s[23] if n > 23 else 1.0 - accept_enc = 
s[24] if n > 24 else 1.0 - sec_fetch = s[25] if n > 25 else 0.0 - hdr_count = s[26] if n > 26 else 0.5 - hfp_popular = s[27] if n > 27 else 0.5 - hfp_rotating = s[28] if n > 28 else 0.0 + country_risk_v = s[20] if n > 20 else 0.0 + asn_cloud = s[21] if n > 21 else 0.0 + accept_lang = s[22] if n > 22 else 1.0 + accept_enc = s[23] if n > 23 else 1.0 + sec_fetch = s[24] if n > 24 else 0.0 + hdr_count = s[25] if n > 25 else 0.5 + hfp_popular = s[26] if n > 26 else 0.5 + hfp_rotating = s[27] if n > 27 else 0.0 # Scanner pur : aucun header browser, fingerprint rare, peu de headers if accept_lang < 0.15 and accept_enc < 0.15 and hdr_count < 0.25: return "🤖 Scanner pur (no headers)" - # Fingerprint tournant ET suspect : bot qui change de profil headers - if hfp_rotating > 0.6 and s[4] > 0.15: + # Fingerprint tournant : bot qui change de profil headers + if hfp_rotating > 0.6: return "🔄 Bot fingerprint tournant" - # Fingerprint très rare et anomalie : bot artisanal unique - if hfp_popular < 0.15 and s[4] > 0.20: + # Fingerprint très rare : bot artisanal unique + if hfp_popular < 0.15: return "🕵️ Fingerprint rare suspect" # Scanners Masscan if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25: return "🤖 Masscan Scanner" - # Bots offensifs agressifs (fuzzing + anomalie) - if s[4] > 0.40 and s[6] > 0.3: + # Bots offensifs agressifs (fuzzing élevé) + if s[4] > 0.40 and s[5] > 0.3: return "🤖 Bot agressif" # Bot qui simule un navigateur mais sans les vrais headers - if s[16] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3: + if s[15] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3: return "🤖 Bot UA simulé" - # Pays à très haut risque avec trafic anormal - if country_risk_v > 0.75 and (s[4] > 0.10 or asn_cloud > 0.5): + # Pays à très haut risque avec infrastructure cloud + if country_risk_v > 0.75 and asn_cloud > 0.5: return "🌏 Source pays risqué" # Cloud + UA-CH mismatch - if s[16] > 0.50 and asn_cloud > 0.70: + if s[15] > 0.50 and asn_cloud > 0.70: 
return "☁️ Bot cloud UA-CH" - if s[16] > 0.60: + if s[15] > 0.60: return "🤖 UA-CH Mismatch" # Headless browser (Puppeteer/Playwright) : a les headers Sec-Fetch mais headless - if s[7] > 0.50 and sec_fetch > 0.5: + if s[6] > 0.50 and sec_fetch > 0.5: return "🤖 Headless Browser" - if s[7] > 0.50: + if s[6] > 0.50: return "🤖 Headless (no Sec-Fetch)" - # Anomalie ML significative - if s[4] > 0.35: - return "⚠️ Anomalie ML" # Cloud pur (CDN/crawler légitime ?) - if asn_cloud > 0.85 and s[4] < 0.15: + if asn_cloud > 0.85: return "☁️ Infrastructure cloud" # Pays à risque élevé sans autre signal if country_risk_v > 0.60: @@ -413,9 +408,9 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str: return "🐧 Linux" if mss_raw < 1380 and mss_raw > 0: return "🌐 Tunnel réseau" - if s[5] > 0.40: + if s[4] > 0.40: return "⚡ Trafic rapide" - if s[4] < 0.08 and s[5] < 0.10 and asn_cloud < 0.30: + if s[4] < 0.10 and asn_cloud < 0.30: return "✅ Trafic sain" return "📊 Cluster mixte" @@ -423,34 +418,130 @@ def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str: def risk_score_from_centroid(centroid: np.ndarray) -> float: """ Score de risque [0,1] depuis le centroïde (espace original [0,1]). - 31 features — poids calibrés pour sommer à 1.0. + 30 features (avg_score supprimé) — poids calibrés pour sommer à 1.0. + Indices décalés de -1 après suppression de avg_score (ancien idx 4). 
""" s = centroid n = len(s) - country_risk_v = s[21] if n > 21 else 0.0 - asn_cloud = s[22] if n > 22 else 0.0 - no_accept_lang = 1.0 - (s[23] if n > 23 else 1.0) - no_encoding = 1.0 - (s[24] if n > 24 else 1.0) - no_sec_fetch = 1.0 - (s[25] if n > 25 else 0.0) - few_headers = 1.0 - (s[26] if n > 26 else 0.5) - # Fingerprint rare = suspect (faible popularité), fingerprint tournant = bot - hfp_rare = 1.0 - (s[27] if n > 27 else 0.5) - hfp_rotating = s[28] if n > 28 else 0.0 + country_risk_v = s[20] if n > 20 else 0.0 + asn_cloud = s[21] if n > 21 else 0.0 + no_accept_lang = 1.0 - (s[22] if n > 22 else 1.0) + no_encoding = 1.0 - (s[23] if n > 23 else 1.0) + no_sec_fetch = 1.0 - (s[24] if n > 24 else 0.0) + few_headers = 1.0 - (s[25] if n > 25 else 0.5) + hfp_rare = 1.0 - (s[26] if n > 26 else 0.5) + hfp_rotating = s[27] if n > 27 else 0.0 + # [4]=vélocité [5]=fuzzing [6]=headless [8]=ip_id_zero [15]=ua_ch_mismatch + # Poids redistribués depuis l'ancien score ML anomalie (0.25) vers les signaux restants return float(np.clip( - 0.25 * s[4] + # score ML anomalie (principal) - 0.09 * s[6] + # fuzzing - 0.07 * s[16] + # UA-CH mismatch - 0.06 * s[7] + # headless - 0.05 * s[5] + # vélocité - 0.05 * s[9] + # IP-ID zéro + 0.14 * s[5] + # fuzzing + 0.17 * s[15] + # UA-CH mismatch (fort signal impersonation navigateur) + 0.10 * s[6] + # headless + 0.09 * s[4] + # vélocité (rps) + 0.07 * s[8] + # IP-ID zéro 0.09 * country_risk_v+ # risque pays source 0.06 * asn_cloud + # infrastructure cloud/VPN 0.04 * no_accept_lang+ # absence Accept-Language 0.04 * no_encoding + # absence Accept-Encoding - 0.04 * no_sec_fetch + # absence Sec-Fetch (pas un vrai navigateur) - 0.04 * few_headers + # très peu de headers (scanner/curl) - 0.06 * hfp_rare + # fingerprint headers rare = suspect + 0.04 * no_sec_fetch + # absence Sec-Fetch + 0.04 * few_headers + # très peu de headers + 0.06 * hfp_rare + # fingerprint rare = suspect 0.06 * hfp_rotating, # rotation de fingerprint = bot 0.0, 1.0 )) + + +# 
─── Palette de couleurs diversifiée (non liée au risque) ──────────────────── +# 24 couleurs couvrant tout le spectre HSL pour distinguer les clusters visuellement. +# Choix: teintes espacées de ~15° avec alternance de saturation/luminosité. + +_CLUSTER_PALETTE: list[str] = [ + "#3b82f6", # blue + "#8b5cf6", # violet + "#ec4899", # pink + "#14b8a6", # teal + "#f59e0b", # amber + "#06b6d4", # cyan + "#a3e635", # lime + "#f97316", # orange + "#6366f1", # indigo + "#10b981", # emerald + "#e879f9", # fuchsia + "#fbbf24", # yellow + "#60a5fa", # light blue + "#c084fc", # light purple + "#fb7185", # rose + "#34d399", # light green + "#38bdf8", # sky + "#a78bfa", # lavender + "#fdba74", # peach + "#4ade80", # green + "#f472b6", # light pink + "#67e8f9", # light cyan + "#d97706", # dark amber + "#7c3aed", # dark violet +] + + +def cluster_color(cluster_idx: int) -> str: + """Couleur distinctive pour un cluster, cyclique sur la palette.""" + return _CLUSTER_PALETTE[cluster_idx % len(_CLUSTER_PALETTE)] + + +# ─── Dispersion des clusters dans l'espace 2D ──────────────────────────────── + +def spread_clusters(coords_2d: np.ndarray, labels: np.ndarray, k: int, + n_iter: int = 50, min_dist: float = 0.14) -> np.ndarray: + """ + Repousse les centroïdes trop proches par répulsion itérative (spring repulsion). + Chaque point suit le déplacement de son centroïde. + + Paramètres + ---------- + min_dist : distance minimale souhaitée entre centroïdes (espace [0,1]). + Augmenter pour plus d'éclatement. + n_iter : nombre d'itérations de la physique de répulsion. 
+ """ + rng = np.random.default_rng(0) + centroids = np.zeros((k, 2)) + counts = np.zeros(k, dtype=int) + for j in range(k): + mask = labels == j + if mask.any(): + centroids[j] = coords_2d[mask].mean(axis=0) + counts[j] = int(mask.sum()) + + orig = centroids.copy() + + for _ in range(n_iter): + forces = np.zeros_like(centroids) + for i in range(k): + if counts[i] == 0: + continue + for j in range(k): + if i == j or counts[j] == 0: + continue + delta = centroids[i] - centroids[j] + dist = float(np.linalg.norm(delta)) + if dist < 1e-8: + delta = rng.uniform(-0.02, 0.02, size=2) + dist = float(np.linalg.norm(delta)) + 1e-8 + if dist < min_dist: + # Force inversement proportionnelle à l'écart + magnitude = (min_dist - dist) / min_dist + forces[i] += magnitude * (delta / dist) + centroids += forces * 0.10 + + # Déplace chaque point par le delta de son centroïde + displaced = coords_2d.copy() + for j in range(k): + if counts[j] == 0: + continue + displaced[labels == j] += centroids[j] - orig[j] + + # Re-normalisation [0, 1] + mn, mx = displaced.min(axis=0), displaced.max(axis=0) + rng_ = mx - mn + rng_[rng_ < 1e-8] = 1.0 + return (displaced - mn) / rng_ diff --git a/frontend/src/components/ClusteringView.tsx b/frontend/src/components/ClusteringView.tsx index e6010d2..960f85a 100644 --- a/frontend/src/components/ClusteringView.tsx +++ b/frontend/src/components/ClusteringView.tsx @@ -35,7 +35,6 @@ interface ClusterNode { hit_count: number; mean_ttl: number; mean_mss: number; - mean_score: number; mean_velocity: number; mean_fuzzing: number; mean_headless: number; @@ -60,8 +59,6 @@ interface ClusterStats { total_clusters: number; total_ips: number; total_hits: number; - bot_ips: number; - high_risk_ips: number; n_samples: number; k: number; elapsed_s: number; @@ -427,8 +424,6 @@ export default function ClusteringView() {
Résultats
- - @@ -504,7 +499,7 @@ export default function ClusteringView() {

Clustering en cours…

- K-means++ · 31 features · {Math.round(k * sensitivity)} clusters · toutes les IPs + K-means++ · 30 features · {Math.round(k * sensitivity)} clusters · toutes les IPs

Mise à jour automatique toutes les 3 secondes

@@ -527,19 +522,18 @@ export default function ClusteringView() { {/* Légende overlay */}
- {([ - ['#dc2626', 'CRITICAL', TIPS.risk_critical], - ['#f97316', 'HIGH', TIPS.risk_high], - ['#eab308', 'MEDIUM', TIPS.risk_medium], - ['#22c55e', 'LOW', TIPS.risk_low], - ] as const).map(([c, l, tip]) => ( -
- - {l} +
Clusters
+ {data?.nodes?.slice(0, 6).map((n) => ( +
+ + {n.label}
))} + {(data?.nodes?.length ?? 0) > 6 && ( +
+{(data?.nodes?.length ?? 0) - 6} autres…
+ )}
- 31 features · PCA 2D ⓘ + 30 features · PCA 2D ⓘ
@@ -666,7 +660,6 @@ function ClusterSidebar({ node, ipDetails, ipTotal, ipPage, clusterPoints, onClo
Stack TCP
-