diff --git a/backend/routes/clustering.py b/backend/routes/clustering.py index ae17508..6b683a7 100644 --- a/backend/routes/clustering.py +++ b/backend/routes/clustering.py @@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42) log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}") + # Centroïdes dans l'espace original [0,1] pour affichage radar + # (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1]) + centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0) + # ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ──── - # On utilise les features non-standardisées pour la projection 2D - # afin que les distances visuelles restent interprétables. coords = pca_2d(X64) # (n, 2), normalisé [0,1] # ── 5b. Enveloppes convexes par cluster ────────────────────────── @@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: rows[i]["ja4"], float(coords[i][0]), float(coords[i][1]), - float(risk_score_from_centroid(km.centroids[j])), + float(risk_score_from_centroid(centroids_orig[j])), )) # ── 7. 
Construction des nœuds ───────────────────────────────────── @@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: mean_win = avg_f("win") raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale} - label_name = name_cluster(km.centroids[j], raw_stats) - risk = float(risk_score_from_centroid(km.centroids[j])) + label_name = name_cluster(centroids_orig[j], raw_stats) + risk = float(risk_score_from_centroid(centroids_orig[j])) color = _risk_to_color(risk) # Centroïde 2D = moyenne des coords du cluster @@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: return [v for v, _ in Counter(lst).most_common(n) if v] radar = [ - {"feature": name, "value": round(float(km.centroids[j][i]), 4)} + {"feature": name, "value": round(float(centroids_orig[j][i]), 4)} for i, name in enumerate(FEATURE_NAMES) ] @@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None: ci = ni["cluster_idx"] dists = sorted( [(j, nj["cluster_idx"], - float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2))) + float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2))) for j, nj in enumerate(nodes) if j != i], key=lambda x: x[2] ) diff --git a/backend/services/clustering_engine.py b/backend/services/clustering_engine.py index 4062046..e7433de 100644 --- a/backend/services/clustering_engine.py +++ b/backend/services/clustering_engine.py @@ -6,7 +6,7 @@ Ref: scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull) sklearn-style API — centroids, labels_, inertia_ -Features (21 dimensions, normalisées [0,1]) : +Features (23 dimensions, normalisées [0,1]) : 0 ttl_n : TTL initial normalisé 1 mss_n : MSS normalisé → type réseau 2 scale_n : facteur de mise à l'échelle TCP @@ -28,6 +28,8 @@ Features (21 dimensions, normalisées [0,1]) : 18 direct_n : ratio accès directs 19 ja4_div_n : diversité JA4 (log1p) 20 ua_rot_n : UA 
rotatif (booléen) + 21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0) + 22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0) """ from __future__ import annotations @@ -39,6 +41,73 @@ from scipy.spatial import ConvexHull log = logging.getLogger(__name__) +# ─── Encodage pays (risque source) ─────────────────────────────────────────── +# Source: MISP threat intel, Spamhaus DROP list, géographie offensive connue + +_COUNTRY_RISK: dict[str, float] = { + # Très haut risque : infrastructure offensive documentée + "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0, + "BY": 0.9, "SY": 0.9, "CU": 0.8, + # Haut risque : transit/hébergement permissif, bulletproof hosters + "HK": 0.75, "VN": 0.7, "UA": 0.65, + "RO": 0.6, "PK": 0.6, "NG": 0.6, + "BG": 0.55, "TR": 0.55, "BR": 0.5, + "TH": 0.5, "IN": 0.45, "ID": 0.45, + # Risque faible : pays à faible tolérance envers activité malveillante + "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1, + "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15, + "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1, + "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15, + "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4, +} +_DEFAULT_COUNTRY_RISK = 0.35 # pays inconnu → risque modéré + + +def country_risk(cc: str | None) -> float: + """Score de risque [0,1] d'un code pays ISO-3166.""" + return _COUNTRY_RISK.get((cc or "").upper(), _DEFAULT_COUNTRY_RISK) + + +# ─── Encodage ASN (type d'infrastructure) ──────────────────────────────────── +# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots + +_ASN_CLOUD_KEYWORDS = [ + # Hyperscalers + "amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei", + # CDN / edge + "cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva", + "sucuri", "stackpath", "keycdn", + # Hébergeurs + "ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode", + "leaseweb", "choopa", "packet", "equinix", "serverius", "combahton", + "m247", "b2 net", "hostinger", "contabo", + # 
Bulletproof / transit permissif connus + "hwclouds", "multacom", "psychz", "colocrossing", + "frantech", "sharktech", "tzulo", + # VPN / proxy commerciaux + "nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark", + "privateinternetaccess", "pia ", "cyberghost", "hotspot shield", + "ipvanish", "hide.me", + # Bots search engines / crawlers + "facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs", + "majestic", "moz ", "babbar", "sistrix", "criteo", "peer39", +] + + +def asn_cloud_score(asn_org: str | None) -> float: + """ + Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé. + Correspond à une infrastructure typiquement utilisée par les bots. + """ + if not asn_org: + return 0.2 # inconnu → légèrement suspect + s = asn_org.lower() + for kw in _ASN_CLOUD_KEYWORDS: + if kw in s: + return 1.0 + return 0.0 + + # ─── Définition des features ────────────────────────────────────────────────── FEATURES: list[tuple[str, str, object]] = [ @@ -72,6 +141,9 @@ FEATURES: list[tuple[str, str, object]] = [ ("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))), # UA rotatif ("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0), + # ── Géographie & infrastructure (nouvelles features) ────────────────── + ("country", "Risque Pays", lambda v: country_risk(str(v) if v else None)), + ("asn_org", "Hébergeur Cloud/VPN", lambda v: asn_cloud_score(str(v) if v else None)), ] FEATURE_KEYS = [f[0] for f in FEATURES] @@ -83,7 +155,7 @@ N_FEATURES = len(FEATURES) # ─── Construction du vecteur de features ───────────────────────────────────── def build_feature_vector(row: dict) -> list[float]: - """Construit le vecteur normalisé [0,1]^21 depuis un dict SQL.""" + """Construit le vecteur normalisé [0,1]^23 depuis un dict SQL.""" return [norm(row.get(key, 0)) for key, _, norm in FEATURES] @@ -249,46 +321,72 @@ def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray, # ─── Nommage et scoring 
─────────────────────────────────────────────────────── def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str: - """Nom lisible basé sur les features dominantes du centroïde.""" - s = centroid # alias + """Nom lisible basé sur les features dominantes du centroïde [0,1].""" + s = centroid ttl_raw = float(raw_stats.get("mean_ttl", 0)) mss_raw = float(raw_stats.get("mean_mss", 0)) + # Indices : 21=country_risk, 22=asn_cloud + country_risk = s[21] if len(s) > 21 else 0.0 + asn_cloud = s[22] if len(s) > 22 else 0.0 - # Scanners / bots masscan + # Scanners Masscan if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25: return "🤖 Masscan Scanner" - if s[4] > 0.70 and s[6] > 0.5: + # Bots offensifs agressifs (fuzzing + anomalie + pays risqué) + if s[4] > 0.40 and s[6] > 0.3: return "🤖 Bot agressif" - if s[16] > 0.80: + # Pays à très haut risque (CN, RU, KP) avec trafic anormal + if country_risk > 0.75 and (s[4] > 0.10 or asn_cloud > 0.5): + return "🌏 Source pays risqué" + # Cloud + UA-CH mismatch = crawler/bot cloud + if s[16] > 0.50 and asn_cloud > 0.70: + return "☁️ Bot cloud UA-CH" + # UA-CH mismatch seul + if s[16] > 0.60: return "🤖 UA-CH Mismatch" - if s[7] > 0.70: + # Headless browser + if s[7] > 0.50: return "🤖 Headless Browser" - if s[4] > 0.50: - return "⚠️ Anomalie ML haute" + # Anomalie ML significative + if s[4] > 0.35: + return "⚠️ Anomalie ML" + # Cloud pur (CDN/crawler légitime ?) 
+ if asn_cloud > 0.85 and s[4] < 0.15: + return "☁️ Infrastructure cloud" + # Pays à risque élevé sans autre signal + if country_risk > 0.60: + return "🌏 Trafic suspect (pays)" + # OS fingerprinting if s[3] > 0.85 and ttl_raw > 120: return "🖥️ Windows" if s[0] > 0.22 and s[0] < 0.28 and mss_raw > 1400: return "🐧 Linux" - if s[1] < 0.90 and s[1] > 0.95: - return "📡 VPN/Proxy" if mss_raw < 1380 and mss_raw > 0: return "🌐 Tunnel réseau" - if s[5] > 0.60: + if s[5] > 0.40: return "⚡ Trafic rapide" - if s[4] < 0.10 and s[5] < 0.10: + if s[4] < 0.08 and s[5] < 0.10 and asn_cloud < 0.30: return "✅ Trafic sain" return "📊 Cluster mixte" def risk_score_from_centroid(centroid: np.ndarray) -> float: - """Score de risque [0,1] agrégé depuis le centroïde.""" + """ + Score de risque [0,1] agrégé depuis le centroïde (espace original [0,1]). + Intègre pays et infrastructure cloud. + """ s = centroid + country_risk = s[21] if len(s) > 21 else 0.0 + asn_cloud = s[22] if len(s) > 22 else 0.0 + return float(np.clip( - 0.40 * s[4] + # score ML - 0.15 * s[6] + # fuzzing - 0.15 * s[16] + # UA-CH mismatch - 0.10 * s[7] + # headless - 0.10 * s[5] + # vélocité - 0.10 * s[9], # IP-ID zéro + 0.30 * s[4] + # score ML anomalie (principal) + 0.12 * s[6] + # fuzzing + 0.12 * s[16] + # UA-CH mismatch + 0.08 * s[7] + # headless + 0.08 * s[5] + # vélocité + 0.08 * s[9] + # IP-ID zéro + 0.12 * country_risk + # risque pays source + 0.10 * asn_cloud, # infrastructure cloud/VPN 0.0, 1.0 ))