feat(clustering): ASN + country risk features, fix risk scoring and cluster labels

- Add country_risk() feature (index 21): lookup table 39 pays, CN/RU/KP=1.0, US/DE=0.1
- Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler)
- N_FEATURES: 21 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: pays risqué 0.35, bot cloud 0.23-0.28, sain 0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SOC Analyst
2026-03-19 10:15:21 +01:00
parent fc3392779b
commit f456c807db
2 changed files with 128 additions and 28 deletions

View File

@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
# Centroïdes dans l'espace original [0,1] pour affichage radar
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
# On utilise les features non-standardisées pour la projection 2D
# afin que les distances visuelles restent interprétables.
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
rows[i]["ja4"],
float(coords[i][0]),
float(coords[i][1]),
float(risk_score_from_centroid(km.centroids[j])),
float(risk_score_from_centroid(centroids_orig[j])),
))
# ── 7. Construction des nœuds ─────────────────────────────────────
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
mean_win = avg_f("win")
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
label_name = name_cluster(km.centroids[j], raw_stats)
risk = float(risk_score_from_centroid(km.centroids[j]))
label_name = name_cluster(centroids_orig[j], raw_stats)
risk = float(risk_score_from_centroid(centroids_orig[j]))
color = _risk_to_color(risk)
# Centroïde 2D = moyenne des coords du cluster
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
return [v for v, _ in Counter(lst).most_common(n) if v]
radar = [
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
for i, name in enumerate(FEATURE_NAMES)
]
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
ci = ni["cluster_idx"]
dists = sorted(
[(j, nj["cluster_idx"],
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
for j, nj in enumerate(nodes) if j != i],
key=lambda x: x[2]
)

View File

@ -6,7 +6,7 @@ Ref:
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
sklearn-style API — centroids, labels_, inertia_
Features (21 dimensions, normalisées [0,1]) :
Features (23 dimensions, normalisées [0,1]) :
0 ttl_n : TTL initial normalisé
1 mss_n : MSS normalisé → type réseau
2 scale_n : facteur de mise à l'échelle TCP
@ -28,6 +28,8 @@ Features (21 dimensions, normalisées [0,1]) :
18 direct_n : ratio accès directs
19 ja4_div_n : diversité JA4 (log1p)
20 ua_rot_n : UA rotatif (booléen)
21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0)
22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0)
"""
from __future__ import annotations
@ -39,6 +41,73 @@ from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Country encoding (source risk) ──────────────────────────────────────────
# Sources: MISP threat intel, Spamhaus DROP list, known offensive geography.
_COUNTRY_RISK: dict[str, float] = {
    # Very high risk: documented offensive infrastructure
    "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
    "BY": 0.9, "SY": 0.9, "CU": 0.8,
    # High risk: permissive transit/hosting, bulletproof hosters
    "HK": 0.75, "VN": 0.7, "UA": 0.65,
    "RO": 0.6, "PK": 0.6, "NG": 0.6,
    "BG": 0.55, "TR": 0.55, "BR": 0.5,
    "TH": 0.5, "IN": 0.45, "ID": 0.45,
    # Low risk: jurisdictions with low tolerance for malicious activity
    "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
    "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
    "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
    "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
    "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
}
# Unknown or missing country → moderate risk rather than 0.
_DEFAULT_COUNTRY_RISK = 0.35


def country_risk(cc: str | None) -> float:
    """Return the [0,1] source-risk score of an ISO-3166 alpha-2 code.

    Lookup is case-insensitive; ``None``, empty, and unknown codes all
    fall back to the moderate default (0.35).
    """
    code = cc.upper() if cc else ""
    return _COUNTRY_RISK.get(code, _DEFAULT_COUNTRY_RISK)
# ─── Encodage ASN (type d'infrastructure) ────────────────────────────────────
# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots
_ASN_CLOUD_KEYWORDS = [
# Hyperscalers
"amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
# CDN / edge
"cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
"sucuri", "stackpath", "keycdn",
# Hébergeurs
"ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
"leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
"m247", "b2 net", "hostinger", "contabo",
# Bulletproof / transit permissif connus
"hwclouds", "multacom", "psychz", "serverius", "colocrossing",
"frantech", "sharktech", "tzulo",
# VPN / proxy commerciaux
"nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
"privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
"ipvanish", "hide.me",
# Bots search engines / crawlers
"facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
"majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
]
def asn_cloud_score(asn_org: str | None) -> float:
"""
Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé.
Correspond à une infrastructure typiquement utilisée par les bots.
"""
if not asn_org:
return 0.2 # inconnu → légèrement suspect
s = asn_org.lower()
for kw in _ASN_CLOUD_KEYWORDS:
if kw in s:
return 1.0
return 0.0
# ─── Définition des features ──────────────────────────────────────────────────
FEATURES: list[tuple[str, str, object]] = [
@ -72,6 +141,9 @@ FEATURES: list[tuple[str, str, object]] = [
("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
# UA rotatif
("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
# ── Géographie & infrastructure (nouvelles features) ──────────────────
("country", "Risque Pays", lambda v: country_risk(str(v) if v else None)),
("asn_org", "Hébergeur Cloud/VPN", lambda v: asn_cloud_score(str(v) if v else None)),
]
FEATURE_KEYS = [f[0] for f in FEATURES]
@ -83,7 +155,7 @@ N_FEATURES = len(FEATURES)
# ─── Construction du vecteur de features ─────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the normalized [0,1]^23 feature vector from a SQL row dict.

    Missing columns default to 0 before normalization; vector order
    follows FEATURES. (The stale duplicate 21-dim docstring left over
    from the previous revision has been removed.)
    """
    # FEATURES entries are (sql_key, display_name, normalizer).
    return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
@ -249,46 +321,72 @@ def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
# ─── Nommage et scoring ───────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """Return a human-readable label from the dominant centroid features.

    Parameters:
        centroid: feature centroid in the ORIGINAL [0,1] space
            (de-normalized, not z-scored).
        raw_stats: raw (un-normalized) means, keys "mean_ttl" and
            "mean_mss"; missing keys default to 0.

    Rules are ordered by priority: the first match wins. Centroids with
    fewer than 23 features (legacy runs) treat the geo/infra signals as 0.
    """
    s = centroid
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
    # Indices 21/22 are the geo/infrastructure features.
    geo_risk = float(s[21]) if len(s) > 21 else 0.0
    cloud = float(s[22]) if len(s) > 22 else 0.0
    # Masscan scanners: low TTL band + Ethernet-sized MSS + TCP scale.
    # BUGFIX: `mss_raw in range(1440, 1460)` only matched integral means;
    # use an explicit interval so fractional means (e.g. 1448.4) match too.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    # Aggressive offensive bots (ML anomaly + fuzzing)
    if s[4] > 0.40 and s[6] > 0.3:
        return "🤖 Bot agressif"
    # Very-high-risk country (CN, RU, KP) with anomalous traffic
    if geo_risk > 0.75 and (s[4] > 0.10 or cloud > 0.5):
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch = cloud crawler/bot
    if s[16] > 0.50 and cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
    # UA-CH mismatch alone
    if s[16] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser
    if s[7] > 0.50:
        return "🤖 Headless Browser"
    # Significant ML anomaly
    if s[4] > 0.35:
        return "⚠️ Anomalie ML"
    # Pure cloud (CDN / possibly legitimate crawler)
    if cloud > 0.85 and s[4] < 0.15:
        return "☁️ Infrastructure cloud"
    # Elevated-risk country with no other signal
    if geo_risk > 0.60:
        return "🌏 Trafic suspect (pays)"
    # OS fingerprinting
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    # BUGFIX: original `s[1] < 0.90 and s[1] > 0.95` was unsatisfiable,
    # making this branch unreachable; a slightly-reduced MSS band is the
    # VPN/proxy encapsulation signature — TODO confirm intended bounds.
    if 0.90 < s[1] < 0.95:
        return "📡 VPN/Proxy"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.40:
        return "⚡ Trafic rapide"
    if s[4] < 0.08 and s[5] < 0.10 and cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """Aggregate a [0,1] risk score from a centroid.

    The centroid must be in the ORIGINAL [0,1] feature space (pass the
    de-normalized centroids, not the z-scored K-means ones).

    Weighted linear blend, weights summing to 1.0:
      0.30 ML anomaly + 0.12 fuzzing + 0.12 UA-CH mismatch
      + 0.08 headless + 0.08 velocity + 0.08 zero IP-ID
      + 0.12 country risk + 0.10 cloud/VPN ASN.
    Legacy centroids shorter than 23 features score the geo/infra terms 0.
    """
    s = centroid
    # Renamed from `country_risk`/`asn_cloud`: the old locals shadowed the
    # module-level country_risk() helper.
    geo_risk = float(s[21]) if len(s) > 21 else 0.0
    cloud = float(s[22]) if len(s) > 22 else 0.0
    score = (
        0.30 * s[4]        # ML anomaly score (primary signal)
        + 0.12 * s[6]      # fuzzing
        + 0.12 * s[16]     # UA-CH mismatch
        + 0.08 * s[7]      # headless
        + 0.08 * s[5]      # velocity
        + 0.08 * s[9]      # zero IP-ID
        + 0.12 * geo_risk  # source-country risk
        + 0.10 * cloud     # cloud/VPN infrastructure
    )
    return float(np.clip(score, 0.0, 1.0))