feat(clustering): ASN + country risk features, fix risk scoring and cluster labels

- Add country_risk() feature (index 21): lookup table of 30 countries (CN/RU/KP=1.0, US/DE=0.1)
- Add asn_cloud_score() feature (index 22): matches the ASN org name against 50+ keywords (cloud/CDN/VPN/crawler)
- N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: 'Source pays risqué' ≈0.35, cloud bots 0.23-0.28, healthy traffic ≈0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SOC Analyst
2026-03-19 10:15:21 +01:00
parent fc3392779b
commit f456c807db
2 changed files with 128 additions and 28 deletions

View File

@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
# Centroïdes dans l'espace original [0,1] pour affichage radar
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
# On utilise les features non-standardisées pour la projection 2D
# afin que les distances visuelles restent interprétables.
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
rows[i]["ja4"],
float(coords[i][0]),
float(coords[i][1]),
float(risk_score_from_centroid(km.centroids[j])),
float(risk_score_from_centroid(centroids_orig[j])),
))
# ── 7. Construction des nœuds ─────────────────────────────────────
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
mean_win = avg_f("win")
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
label_name = name_cluster(km.centroids[j], raw_stats)
risk = float(risk_score_from_centroid(km.centroids[j]))
label_name = name_cluster(centroids_orig[j], raw_stats)
risk = float(risk_score_from_centroid(centroids_orig[j]))
color = _risk_to_color(risk)
# Centroïde 2D = moyenne des coords du cluster
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
return [v for v, _ in Counter(lst).most_common(n) if v]
radar = [
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
for i, name in enumerate(FEATURE_NAMES)
]
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
ci = ni["cluster_idx"]
dists = sorted(
[(j, nj["cluster_idx"],
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
for j, nj in enumerate(nodes) if j != i],
key=lambda x: x[2]
)