Files
dashboard/backend/services/clustering_engine.py
SOC Analyst f456c807db feat(clustering): ASN + country risk features, fix risk scoring and cluster labels
- Add country_risk() feature (index 21): lookup table 30 pays, CN/RU/KP=1.0, US/DE=0.1
- Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler)
- N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: pays risqué 0.35, bot cloud 0.23-0.28, sain 0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 10:15:21 +01:00

393 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Moteur de clustering K-means++ multi-métriques (numpy + scipy vectorisé).
Ref:
Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
sklearn-style API — centroids, labels_, inertia_
Features (23 dimensions, normalisées [0,1]) :
0 ttl_n : TTL initial normalisé
1 mss_n : MSS normalisé → type réseau
2 scale_n : facteur de mise à l'échelle TCP
3 win_n : fenêtre TCP normalisée
4 score_n : score anomalie ML (abs)
5 velocity_n : vélocité de requêtes (log1p)
6 fuzzing_n : index de fuzzing (log1p)
7 headless_n : ratio sessions headless
8 post_n : ratio POST/total
9 ip_id_zero_n : ratio IP-ID=0 (Linux/spoofé)
10 entropy_n : entropie temporelle
11 browser_n : score navigateur moderne
12 alpn_n : mismatch ALPN/protocole
13 alpn_absent_n : ratio ALPN absent
14 h2_n : efficacité H2 multiplexing (log1p)
15 hdr_conf_n : confiance ordre headers
16 ua_ch_n : mismatch User-Agent-Client-Hints
17 asset_n : ratio assets statiques
18 direct_n : ratio accès directs
19 ja4_div_n : diversité JA4 (log1p)
20 ua_rot_n : UA rotatif (booléen)
21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0)
22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0)
"""
from __future__ import annotations
import math
import logging
import numpy as np
from dataclasses import dataclass, field
from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Country encoding (source risk) ──────────────────────────────────────────
# Sources: MISP threat intel, Spamhaus DROP list, known offensive geography.
_COUNTRY_RISK: dict[str, float] = {
    # Very high risk: documented offensive infrastructure
    "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
    "BY": 0.9, "SY": 0.9, "CU": 0.8,
    # High risk: permissive transit/hosting, bulletproof hosters
    "HK": 0.75, "VN": 0.7, "UA": 0.65,
    "RO": 0.6, "PK": 0.6, "NG": 0.6,
    "BG": 0.55, "TR": 0.55, "BR": 0.5,
    "TH": 0.5, "IN": 0.45, "ID": 0.45,
    # Low risk: countries with low tolerance for malicious activity
    "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
    "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
    "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
    "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
    "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
}
# Unknown country → moderate risk.
_DEFAULT_COUNTRY_RISK = 0.35


def country_risk(cc: str | None) -> float:
    """Return the [0,1] risk score of an ISO-3166 country code (case-insensitive)."""
    code = cc.upper() if cc else ""
    return _COUNTRY_RISK.get(code, _DEFAULT_COUNTRY_RISK)
# ─── Encodage ASN (type d'infrastructure) ────────────────────────────────────
# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots
_ASN_CLOUD_KEYWORDS = [
# Hyperscalers
"amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
# CDN / edge
"cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
"sucuri", "stackpath", "keycdn",
# Hébergeurs
"ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
"leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
"m247", "b2 net", "hostinger", "contabo",
# Bulletproof / transit permissif connus
"hwclouds", "multacom", "psychz", "serverius", "colocrossing",
"frantech", "sharktech", "tzulo",
# VPN / proxy commerciaux
"nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
"privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
"ipvanish", "hide.me",
# Bots search engines / crawlers
"facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
"majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
]
def asn_cloud_score(asn_org: str | None) -> float:
"""
Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé.
Correspond à une infrastructure typiquement utilisée par les bots.
"""
if not asn_org:
return 0.2 # inconnu → légèrement suspect
s = asn_org.lower()
for kw in _ASN_CLOUD_KEYWORDS:
if kw in s:
return 1.0
return 0.0
# ─── Feature definitions ─────────────────────────────────────────────────────
# Each entry is (SQL column key, display label, normaliser: raw value → [0,1]).
FEATURES: list[tuple[str, str, object]] = [
    # TCP stack fingerprint
    ("ttl", "TTL Initial", lambda x: min(1.0, (x or 0) / 255.0)),
    ("mss", "MSS Réseau", lambda x: min(1.0, (x or 0) / 1460.0)),
    ("scale", "Scale TCP", lambda x: min(1.0, (x or 0) / 14.0)),
    ("win", "Fenêtre TCP", lambda x: min(1.0, (x or 0) / 65535.0)),
    # ML anomaly / behaviour
    ("avg_score", "Score Anomalie", lambda x: min(1.0, float(x or 0))),
    ("avg_velocity", "Vélocité (rps)", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(100))),
    ("avg_fuzzing", "Fuzzing", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(300))),
    ("pct_headless", "Headless", lambda x: min(1.0, float(x or 0))),
    ("avg_post", "Ratio POST", lambda x: min(1.0, float(x or 0))),
    # IP-ID behaviour
    ("ip_id_zero", "IP-ID Zéro", lambda x: min(1.0, float(x or 0))),
    # Timing
    ("entropy", "Entropie Temporelle", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(10))),
    # Browser
    ("browser_score", "Score Navigateur", lambda x: min(1.0, float(x or 0) / 50.0)),
    # TLS / protocol
    ("alpn_mismatch", "ALPN Mismatch", lambda x: min(1.0, float(x or 0))),
    ("alpn_missing", "ALPN Absent", lambda x: min(1.0, float(x or 0))),
    ("h2_eff", "H2 Multiplexing", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(20))),
    ("hdr_conf", "Ordre Headers", lambda x: min(1.0, float(x or 0))),
    ("ua_ch_mismatch", "UA-CH Mismatch", lambda x: min(1.0, float(x or 0))),
    # HTTP behaviour
    ("asset_ratio", "Ratio Assets", lambda x: min(1.0, float(x or 0))),
    ("direct_ratio", "Accès Direct", lambda x: min(1.0, float(x or 0))),
    # JA4 diversity
    ("ja4_count", "Diversité JA4", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(30))),
    # Rotating-UA flag
    ("ua_rotating", "UA Rotatif", lambda x: 1.0 if float(x or 0) > 0 else 0.0),
    # ── Geography & infrastructure ────────────────────────────────────────
    ("country", "Risque Pays", lambda x: country_risk(str(x) if x else None)),
    ("asn_org", "Hébergeur Cloud/VPN", lambda x: asn_cloud_score(str(x) if x else None)),
]

# Parallel views over FEATURES (kept in definition order).
FEATURE_KEYS = [key for key, _, _ in FEATURES]
FEATURE_NAMES = [label for _, label, _ in FEATURES]
FEATURE_NORMS = [norm for _, _, norm in FEATURES]
N_FEATURES = len(FEATURES)


# ─── Feature-vector construction ─────────────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the normalised [0,1]^23 vector from an SQL row dict (missing keys → 0)."""
    return [normalise(row.get(key, 0)) for key, _, normalise in FEATURES]
# ─── Z-score standardisation ─────────────────────────────────────────────────
def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Z-score standardisation: centre each feature and divide by its std dev.

    Ref: Bishop (2006) PRML §9.1 — recommended preprocessing for K-means.

    Returns (X_std, mean, std) so new points can be projected with the
    same transform.
    """
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    # Constant features would divide by ~0; treat them as unit variance.
    sigma = np.where(sigma < 1e-8, 1.0, sigma)
    return (X - mu) / sigma, mu, sigma
# ─── Vectorised K-means++ (numpy) ────────────────────────────────────────────
@dataclass
class KMeansResult:
    """Result of the best initialisation of a K-means run."""
    centroids: np.ndarray  # (k, n_features) final cluster centres
    labels: np.ndarray     # (n_points,) int32 cluster assignment per point
    inertia: float         # sum of squared distances to assigned centroids
    n_iter: int            # Lloyd iterations run by the winning initialisation


def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
              seed: int = 42) -> KMeansResult:
    """
    Fully vectorised K-means++ (Arthur & Vassilvitskii, 2007).

    Runs n_init independent initialisations and keeps the lowest inertia.

    Args:
        X: (n, d) data matrix.
        k: number of clusters (1 <= k <= n).
        max_iter: maximum Lloyd iterations per initialisation.
        n_init: number of restarts.
        seed: RNG seed for reproducibility.
    """
    # Fix: work on a float64 copy — with an integer X, centroid updates
    # (`centers_arr[j] = mean`) would silently truncate to int.
    X = np.asarray(X, dtype=np.float64)
    rng = np.random.default_rng(seed)
    n, _ = X.shape
    best: KMeansResult | None = None
    for _ in range(n_init):
        # ── K-means++ seeding: each next centre drawn with prob ∝ D²(x) ──
        centers = [X[rng.integers(n)].copy()]
        for _ in range(k - 1):
            D = _min_sq_dist(X, np.array(centers))
            # Guard against negative float residue (duplicated points).
            D = np.clip(D, 0.0, None)
            total = D.sum()
            if total < 1e-12:
                # All points coincide — fall back to a uniform draw.
                centers.append(X[rng.integers(n)].copy())
            else:
                centers.append(X[rng.choice(n, p=D / total)].copy())
        centers_arr = np.array(centers)  # (k, d)
        # ── Lloyd iterations ─────────────────────────────────────────────
        labels = np.zeros(n, dtype=np.int32)
        it = 0  # fix: defined even if max_iter <= 0
        for it in range(max_iter):
            # Vectorised assignment: (n, k) squared distances
            dists = _sq_dists(X, centers_arr)
            new_labels = np.argmin(dists, axis=1).astype(np.int32)
            if it > 0 and np.array_equal(new_labels, labels):
                break  # converged
            labels = new_labels
            # Centroid update; an empty cluster keeps its previous centre.
            for j in range(k):
                mask = labels == j
                if mask.any():
                    centers_arr[j] = X[mask].mean(axis=0)
        # Fix: clip — the ‖x‖²+‖c‖²−2x·c expansion can go slightly negative.
        min_d = np.clip(np.min(_sq_dists(X, centers_arr), axis=1), 0.0, None)
        inertia = float(min_d.sum())
        result = KMeansResult(centers_arr, labels, inertia, it + 1)
        if best is None or inertia < best.inertia:
            best = result
    return best  # type: ignore[return-value]


def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Squared distance between every point of X and every centroid of C. O(n·k·d)."""
    # ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ  (avoids an (n, k, d) temporary)
    X2 = np.sum(X ** 2, axis=1, keepdims=True)    # (n, 1)
    C2 = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
    return X2 + C2 - 2.0 * X @ C.T                # (n, k)


def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Minimal squared distance from each point to the existing centroids."""
    return np.min(_sq_dists(X, C), axis=1)
# ─── PCA 2D (numpy) ──────────────────────────────────────────────────────────
def pca_2d(X: np.ndarray) -> np.ndarray:
"""
PCA-2D vectorisée. Retourne les coordonnées normalisées [0,1] × [0,1].
"""
mean = X.mean(axis=0)
Xc = X - mean
# Power iteration pour les 2 premières composantes
rng = np.random.default_rng(0)
v1 = _power_iter(Xc, rng.standard_normal(Xc.shape[1]))
proj1 = Xc @ v1
# Déflation (Hotelling)
Xc2 = Xc - np.outer(proj1, v1)
v2 = _power_iter(Xc2, rng.standard_normal(Xc.shape[1]))
proj2 = Xc2 @ v2
coords = np.column_stack([proj1, proj2])
# Normalisation [0,1]
mn, mx = coords.min(axis=0), coords.max(axis=0)
rng_ = mx - mn
rng_[rng_ == 0] = 1.0
return (coords - mn) / rng_
def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
"""Power iteration : trouve le premier vecteur propre de XᵀX."""
for _ in range(n_iter):
v = X.T @ (X @ v)
norm = np.linalg.norm(v)
if norm < 1e-12:
break
v /= norm
return v
# ─── Convex hull per cluster ─────────────────────────────────────────────────
def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
    """
    Compute the convex hull of the 2-D PCA points of each cluster.

    Args:
        coords_2d: (n, 2) point coordinates.
        labels: (n,) cluster index per point.
        k: number of clusters.
        min_pts: below this count a bounding box replaces the Qhull call.

    Returns:
        {cluster_idx: [[x, y], ...]} — a CLOSED polygon (first vertex repeated
        last) for every non-empty cluster; [] when Qhull fails (degenerate
        geometry such as collinear points). Empty clusters get no entry.
    """
    hulls: dict[int, list[list[float]]] = {}
    for j in range(k):
        pts = coords_2d[labels == j]
        if len(pts) < min_pts:
            # Too few points for a hull: square box around the centroid.
            if len(pts) > 0:
                cx, cy = pts.mean(axis=0)
                r = max(0.01, pts.std(axis=0).max())
                box = [
                    [cx - r, cy - r], [cx + r, cy - r],
                    [cx + r, cy + r], [cx - r, cy + r],
                ]
                # Fix: close the polygon, matching the documented contract
                # (the hull branch below already repeats the first vertex).
                box.append(list(box[0]))
                hulls[j] = box
            continue
        try:
            hull = ConvexHull(pts)
            polygon = pts[hull.vertices].tolist()
            polygon.append(polygon[0])  # close the polygon
            hulls[j] = polygon
        except Exception:
            # Qhull raises on degenerate input (collinear/duplicate points).
            hulls[j] = []
    return hulls
# ─── Naming and scoring ──────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """
    Human-readable name derived from the dominant features of a centroid.

    Args:
        centroid: centroid in the original [0,1] feature space (20 or 23 dims).
        raw_stats: per-cluster raw aggregates; reads "mean_ttl" and "mean_mss".

    Rules are checked in priority order: offensive signatures first, then
    geography/infrastructure, then OS fingerprinting, then benign fallbacks.
    """
    s = centroid
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
    # Indices: 21 = country_risk, 22 = asn_cloud (absent on legacy 20-dim centroids).
    # Renamed locals: the old name `country_risk` shadowed the module-level function.
    c_risk = s[21] if len(s) > 21 else 0.0
    cloud = s[22] if len(s) > 22 else 0.0
    # Masscan scanners: low-ish TTL, Ethernet-class MSS, TCP scale present.
    # Fix: `mss_raw in range(1440, 1460)` was False for any non-integer float
    # mean (e.g. 1448.3), silently disabling this rule — use a real interval.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    # Aggressive offensive bots (fuzzing + ML anomaly)
    if s[4] > 0.40 and s[6] > 0.3:
        return "🤖 Bot agressif"
    # Very-high-risk country (CN, RU, KP) with anomalous traffic
    if c_risk > 0.75 and (s[4] > 0.10 or cloud > 0.5):
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch = cloud-hosted crawler/bot
    if s[16] > 0.50 and cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
    # UA-CH mismatch alone
    if s[16] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser
    if s[7] > 0.50:
        return "🤖 Headless Browser"
    # Significant ML anomaly
    if s[4] > 0.35:
        return "⚠️ Anomalie ML"
    # Pure cloud (legitimate CDN/crawler?)
    if cloud > 0.85 and s[4] < 0.15:
        return "☁️ Infrastructure cloud"
    # High-risk country without any other signal
    if c_risk > 0.60:
        return "🌏 Trafic suspect (pays)"
    # OS fingerprinting
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.40:
        return "⚡ Trafic rapide"
    if s[4] < 0.08 and s[5] < 0.10 and cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """
    Aggregate risk score in [0,1] from a centroid in the original [0,1] space.

    Weighted sum (weights total 1.0) of the most discriminative features,
    including source-country risk and cloud infrastructure, clipped to [0,1].
    """
    s = centroid
    # Geo/infra features are absent on legacy 20-dimension centroids.
    geo_risk = s[21] if len(s) > 21 else 0.0
    infra = s[22] if len(s) > 22 else 0.0
    weighted = (
        0.30 * s[4]        # ML anomaly score (dominant signal)
        + 0.12 * s[6]      # fuzzing index
        + 0.12 * s[16]     # UA-CH mismatch
        + 0.08 * s[7]      # headless ratio
        + 0.08 * s[5]      # request velocity
        + 0.08 * s[9]      # IP-ID zero ratio
        + 0.12 * geo_risk  # source-country risk
        + 0.10 * infra     # cloud/VPN infrastructure
    )
    return float(np.clip(weighted, 0.0, 1.0))