Files
dashboard/backend/services/clustering_engine.py
SOC Analyst f456c807db feat(clustering): ASN + country risk features, fix risk scoring and cluster labels
- Add country_risk() feature (index 21): lookup table 30 pays, CN/RU/KP=1.0, US/DE=0.1
- Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler)
- N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: pays risqué 0.35, bot cloud 0.23-0.28, sain 0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 10:15:21 +01:00

393 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Moteur de clustering K-means++ multi-métriques (numpy + scipy vectorisé).
Ref:
Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
sklearn-style API — centroids, labels_, inertia_
Features (23 dimensions, normalisées [0,1]) :
0 ttl_n : TTL initial normalisé
1 mss_n : MSS normalisé → type réseau
2 scale_n : facteur de mise à l'échelle TCP
3 win_n : fenêtre TCP normalisée
4 score_n : score anomalie ML (abs)
5 velocity_n : vélocité de requêtes (log1p)
6 fuzzing_n : index de fuzzing (log1p)
7 headless_n : ratio sessions headless
8 post_n : ratio POST/total
9 ip_id_zero_n : ratio IP-ID=0 (Linux/spoofé)
10 entropy_n : entropie temporelle
11 browser_n : score navigateur moderne
12 alpn_n : mismatch ALPN/protocole
13 alpn_absent_n : ratio ALPN absent
14 h2_n : efficacité H2 multiplexing (log1p)
15 hdr_conf_n : confiance ordre headers
16 ua_ch_n : mismatch User-Agent-Client-Hints
17 asset_n : ratio assets statiques
18 direct_n : ratio accès directs
19 ja4_div_n : diversité JA4 (log1p)
20 ua_rot_n : UA rotatif (booléen)
21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0)
22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0)
"""
from __future__ import annotations
import math
import logging
import numpy as np
from dataclasses import dataclass, field
from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Country encoding (source risk) ──────────────────────────────────────────
# Sources: MISP threat intel, Spamhaus DROP list, known offensive geography.
_COUNTRY_RISK: dict[str, float] = {
    # Very high risk: documented offensive infrastructure
    "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
    "BY": 0.9, "SY": 0.9, "CU": 0.8,
    # High risk: permissive transit/hosting, bulletproof hosters
    "HK": 0.75, "VN": 0.7, "UA": 0.65,
    "RO": 0.6, "PK": 0.6, "NG": 0.6,
    "BG": 0.55, "TR": 0.55, "BR": 0.5,
    "TH": 0.5, "IN": 0.45, "ID": 0.45,
    # Low risk: countries with low tolerance for malicious activity
    "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
    "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
    "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
    "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
    "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
}
# Unknown country → moderate risk.
_DEFAULT_COUNTRY_RISK = 0.35


def country_risk(cc: str | None) -> float:
    """Return the [0,1] risk score of an ISO-3166 country code (case-insensitive)."""
    code = cc.upper() if cc else ""
    return _COUNTRY_RISK.get(code, _DEFAULT_COUNTRY_RISK)
# ─── Encodage ASN (type d'infrastructure) ────────────────────────────────────
# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots
_ASN_CLOUD_KEYWORDS = [
# Hyperscalers
"amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
# CDN / edge
"cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
"sucuri", "stackpath", "keycdn",
# Hébergeurs
"ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
"leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
"m247", "b2 net", "hostinger", "contabo",
# Bulletproof / transit permissif connus
"hwclouds", "multacom", "psychz", "serverius", "colocrossing",
"frantech", "sharktech", "tzulo",
# VPN / proxy commerciaux
"nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
"privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
"ipvanish", "hide.me",
# Bots search engines / crawlers
"facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
"majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
]
def asn_cloud_score(asn_org: str | None) -> float:
"""
Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé.
Correspond à une infrastructure typiquement utilisée par les bots.
"""
if not asn_org:
return 0.2 # inconnu → légèrement suspect
s = asn_org.lower()
for kw in _ASN_CLOUD_KEYWORDS:
if kw in s:
return 1.0
return 0.0
# ─── Feature definitions ─────────────────────────────────────────────────────
# Each entry is (SQL column key, display label, normaliser: raw value → [0,1]).
FEATURES: list[tuple[str, str, object]] = [
    # TCP stack fingerprint
    ("ttl", "TTL Initial", lambda x: min(1.0, (x or 0) / 255.0)),
    ("mss", "MSS Réseau", lambda x: min(1.0, (x or 0) / 1460.0)),
    ("scale", "Scale TCP", lambda x: min(1.0, (x or 0) / 14.0)),
    ("win", "Fenêtre TCP", lambda x: min(1.0, (x or 0) / 65535.0)),
    # ML anomaly / behaviour
    ("avg_score", "Score Anomalie", lambda x: min(1.0, float(x or 0))),
    ("avg_velocity", "Vélocité (rps)", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(100))),
    ("avg_fuzzing", "Fuzzing", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(300))),
    ("pct_headless", "Headless", lambda x: min(1.0, float(x or 0))),
    ("avg_post", "Ratio POST", lambda x: min(1.0, float(x or 0))),
    # IP-ID behaviour
    ("ip_id_zero", "IP-ID Zéro", lambda x: min(1.0, float(x or 0))),
    # Timing
    ("entropy", "Entropie Temporelle", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(10))),
    # Browser
    ("browser_score", "Score Navigateur", lambda x: min(1.0, float(x or 0) / 50.0)),
    # TLS / protocol
    ("alpn_mismatch", "ALPN Mismatch", lambda x: min(1.0, float(x or 0))),
    ("alpn_missing", "ALPN Absent", lambda x: min(1.0, float(x or 0))),
    ("h2_eff", "H2 Multiplexing", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(20))),
    ("hdr_conf", "Ordre Headers", lambda x: min(1.0, float(x or 0))),
    ("ua_ch_mismatch", "UA-CH Mismatch", lambda x: min(1.0, float(x or 0))),
    # HTTP behaviour
    ("asset_ratio", "Ratio Assets", lambda x: min(1.0, float(x or 0))),
    ("direct_ratio", "Accès Direct", lambda x: min(1.0, float(x or 0))),
    # JA4 diversity
    ("ja4_count", "Diversité JA4", lambda x: min(1.0, math.log1p(float(x or 0)) / math.log1p(30))),
    # Rotating-UA flag
    ("ua_rotating", "UA Rotatif", lambda x: 1.0 if float(x or 0) > 0 else 0.0),
    # ── Geography & infrastructure ────────────────────────────────────────
    ("country", "Risque Pays", lambda x: country_risk(str(x) if x else None)),
    ("asn_org", "Hébergeur Cloud/VPN", lambda x: asn_cloud_score(str(x) if x else None)),
]

# Parallel views over FEATURES (kept in definition order).
FEATURE_KEYS = [key for key, _, _ in FEATURES]
FEATURE_NAMES = [label for _, label, _ in FEATURES]
FEATURE_NORMS = [norm for _, _, norm in FEATURES]
N_FEATURES = len(FEATURES)


# ─── Feature-vector construction ─────────────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the normalised [0,1]^23 vector from an SQL row dict (missing keys → 0)."""
    return [normalise(row.get(key, 0)) for key, _, normalise in FEATURES]
# ─── Z-score standardisation ─────────────────────────────────────────────────
def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Z-score standardisation: centre each feature and divide by its std dev.

    Ref: Bishop (2006) PRML §9.1 — recommended preprocessing for K-means.

    Returns (X_std, mean, std) so new points can be projected with the
    same transform.
    """
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    # Constant features would divide by ~0; treat them as unit variance.
    sigma = np.where(sigma < 1e-8, 1.0, sigma)
    return (X - mu) / sigma, mu, sigma
# ─── Vectorised K-means++ (numpy) ────────────────────────────────────────────
@dataclass
class KMeansResult:
    """Result of the best initialisation of a K-means run."""
    centroids: np.ndarray  # (k, n_features) final cluster centres
    labels: np.ndarray     # (n_points,) int32 cluster assignment per point
    inertia: float         # sum of squared distances to assigned centroids
    n_iter: int            # Lloyd iterations run by the winning initialisation


def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
              seed: int = 42) -> KMeansResult:
    """
    Fully vectorised K-means++ (Arthur & Vassilvitskii, 2007).

    Runs n_init independent initialisations and keeps the lowest inertia.

    Args:
        X: (n, d) data matrix.
        k: number of clusters (1 <= k <= n).
        max_iter: maximum Lloyd iterations per initialisation.
        n_init: number of restarts.
        seed: RNG seed for reproducibility.
    """
    # Fix: work on a float64 copy — with an integer X, centroid updates
    # (`centers_arr[j] = mean`) would silently truncate to int.
    X = np.asarray(X, dtype=np.float64)
    rng = np.random.default_rng(seed)
    n, _ = X.shape
    best: KMeansResult | None = None
    for _ in range(n_init):
        # ── K-means++ seeding: each next centre drawn with prob ∝ D²(x) ──
        centers = [X[rng.integers(n)].copy()]
        for _ in range(k - 1):
            D = _min_sq_dist(X, np.array(centers))
            # Guard against negative float residue (duplicated points).
            D = np.clip(D, 0.0, None)
            total = D.sum()
            if total < 1e-12:
                # All points coincide — fall back to a uniform draw.
                centers.append(X[rng.integers(n)].copy())
            else:
                centers.append(X[rng.choice(n, p=D / total)].copy())
        centers_arr = np.array(centers)  # (k, d)
        # ── Lloyd iterations ─────────────────────────────────────────────
        labels = np.zeros(n, dtype=np.int32)
        it = 0  # fix: defined even if max_iter <= 0
        for it in range(max_iter):
            # Vectorised assignment: (n, k) squared distances
            dists = _sq_dists(X, centers_arr)
            new_labels = np.argmin(dists, axis=1).astype(np.int32)
            if it > 0 and np.array_equal(new_labels, labels):
                break  # converged
            labels = new_labels
            # Centroid update; an empty cluster keeps its previous centre.
            for j in range(k):
                mask = labels == j
                if mask.any():
                    centers_arr[j] = X[mask].mean(axis=0)
        # Fix: clip — the ‖x‖²+‖c‖²−2x·c expansion can go slightly negative.
        min_d = np.clip(np.min(_sq_dists(X, centers_arr), axis=1), 0.0, None)
        inertia = float(min_d.sum())
        result = KMeansResult(centers_arr, labels, inertia, it + 1)
        if best is None or inertia < best.inertia:
            best = result
    return best  # type: ignore[return-value]


def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Squared distance between every point of X and every centroid of C. O(n·k·d)."""
    # ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ  (avoids an (n, k, d) temporary)
    X2 = np.sum(X ** 2, axis=1, keepdims=True)    # (n, 1)
    C2 = np.sum(C ** 2, axis=1, keepdims=True).T  # (1, k)
    return X2 + C2 - 2.0 * X @ C.T                # (n, k)


def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """Minimal squared distance from each point to the existing centroids."""
    return np.min(_sq_dists(X, C), axis=1)
# ─── PCA 2D (numpy) ──────────────────────────────────────────────────────────
def pca_2d(X: np.ndarray) -> np.ndarray:
"""
PCA-2D vectorisée. Retourne les coordonnées normalisées [0,1] × [0,1].
"""
mean = X.mean(axis=0)
Xc = X - mean
# Power iteration pour les 2 premières composantes
rng = np.random.default_rng(0)
v1 = _power_iter(Xc, rng.standard_normal(Xc.shape[1]))
proj1 = Xc @ v1
# Déflation (Hotelling)
Xc2 = Xc - np.outer(proj1, v1)
v2 = _power_iter(Xc2, rng.standard_normal(Xc.shape[1]))
proj2 = Xc2 @ v2
coords = np.column_stack([proj1, proj2])
# Normalisation [0,1]
mn, mx = coords.min(axis=0), coords.max(axis=0)
rng_ = mx - mn
rng_[rng_ == 0] = 1.0
return (coords - mn) / rng_
def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
"""Power iteration : trouve le premier vecteur propre de XᵀX."""
for _ in range(n_iter):
v = X.T @ (X @ v)
norm = np.linalg.norm(v)
if norm < 1e-12:
break
v /= norm
return v
# ─── Convex hull per cluster ─────────────────────────────────────────────────
def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
    """
    Compute the convex hull of the 2-D PCA points of each cluster.

    Args:
        coords_2d: (n, 2) point coordinates.
        labels: (n,) cluster index per point.
        k: number of clusters.
        min_pts: below this count a bounding box replaces the Qhull call.

    Returns:
        {cluster_idx: [[x, y], ...]} — a CLOSED polygon (first vertex repeated
        last) for every non-empty cluster; [] when Qhull fails (degenerate
        geometry such as collinear points). Empty clusters get no entry.
    """
    hulls: dict[int, list[list[float]]] = {}
    for j in range(k):
        pts = coords_2d[labels == j]
        if len(pts) < min_pts:
            # Too few points for a hull: square box around the centroid.
            if len(pts) > 0:
                cx, cy = pts.mean(axis=0)
                r = max(0.01, pts.std(axis=0).max())
                box = [
                    [cx - r, cy - r], [cx + r, cy - r],
                    [cx + r, cy + r], [cx - r, cy + r],
                ]
                # Fix: close the polygon, matching the documented contract
                # (the hull branch below already repeats the first vertex).
                box.append(list(box[0]))
                hulls[j] = box
            continue
        try:
            hull = ConvexHull(pts)
            polygon = pts[hull.vertices].tolist()
            polygon.append(polygon[0])  # close the polygon
            hulls[j] = polygon
        except Exception:
            # Qhull raises on degenerate input (collinear/duplicate points).
            hulls[j] = []
    return hulls
# ─── Naming and scoring ──────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """
    Human-readable name derived from the dominant features of a centroid.

    Args:
        centroid: centroid in the original [0,1] feature space (20 or 23 dims).
        raw_stats: per-cluster raw aggregates; reads "mean_ttl" and "mean_mss".

    Rules are checked in priority order: offensive signatures first, then
    geography/infrastructure, then OS fingerprinting, then benign fallbacks.
    """
    s = centroid
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
    # Indices: 21 = country_risk, 22 = asn_cloud (absent on legacy 20-dim centroids).
    # Renamed locals: the old name `country_risk` shadowed the module-level function.
    c_risk = s[21] if len(s) > 21 else 0.0
    cloud = s[22] if len(s) > 22 else 0.0
    # Masscan scanners: low-ish TTL, Ethernet-class MSS, TCP scale present.
    # Fix: `mss_raw in range(1440, 1460)` was False for any non-integer float
    # mean (e.g. 1448.3), silently disabling this rule — use a real interval.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    # Aggressive offensive bots (fuzzing + ML anomaly)
    if s[4] > 0.40 and s[6] > 0.3:
        return "🤖 Bot agressif"
    # Very-high-risk country (CN, RU, KP) with anomalous traffic
    if c_risk > 0.75 and (s[4] > 0.10 or cloud > 0.5):
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch = cloud-hosted crawler/bot
    if s[16] > 0.50 and cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
    # UA-CH mismatch alone
    if s[16] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser
    if s[7] > 0.50:
        return "🤖 Headless Browser"
    # Significant ML anomaly
    if s[4] > 0.35:
        return "⚠️ Anomalie ML"
    # Pure cloud (legitimate CDN/crawler?)
    if cloud > 0.85 and s[4] < 0.15:
        return "☁️ Infrastructure cloud"
    # High-risk country without any other signal
    if c_risk > 0.60:
        return "🌏 Trafic suspect (pays)"
    # OS fingerprinting
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.40:
        return "⚡ Trafic rapide"
    if s[4] < 0.08 and s[5] < 0.10 and cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """
    Aggregate risk score in [0,1] from a centroid in the original [0,1] space.

    Weighted sum (weights total 1.0) of the most discriminative features,
    including source-country risk and cloud infrastructure, clipped to [0,1].
    """
    s = centroid
    # Geo/infra features are absent on legacy 20-dimension centroids.
    geo_risk = s[21] if len(s) > 21 else 0.0
    infra = s[22] if len(s) > 22 else 0.0
    weighted = (
        0.30 * s[4]        # ML anomaly score (dominant signal)
        + 0.12 * s[6]      # fuzzing index
        + 0.12 * s[16]     # UA-CH mismatch
        + 0.08 * s[7]      # headless ratio
        + 0.08 * s[5]      # request velocity
        + 0.08 * s[9]      # IP-ID zero ratio
        + 0.12 * geo_risk  # source-country risk
        + 0.10 * infra     # cloud/VPN infrastructure
    )
    return float(np.clip(weighted, 0.0, 1.0))