feat: clustering multi-métriques + TCP fingerprinting amélioré
- TCP fingerprinting: 20 signatures OS (p0f-style), scoring multi-signal
TTL/MSS/scale/fenêtre, détection Masscan 97% confiance, réseau path
(Ethernet/PPPoE/VPN/Tunnel), estimation hop-count
- Clustering IPs: K-means++ (Arthur & Vassilvitskii 2007) sur 21 features
TCP stack + anomalie ML + TLS/protocole + navigateur + temporel
PCA-2D par puissance itérative (Hotelling) pour positionnement
- Visualisation redesign: 2 vues lisibles
- Tableau de bord: grille de cartes groupées par niveau de risque
(Bots / Suspects / Légitimes), métriques clés + mini-barres
- Graphe de relations: ReactFlow avec nœuds-cartes en colonnes
par niveau de menace, arêtes colorées par similarité, légende
- Sidebar: RadarChart comportemental + toutes métriques + export CSV
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
458
backend/routes/clustering.py
Normal file
458
backend/routes/clustering.py
Normal file
@ -0,0 +1,458 @@
|
||||
"""
|
||||
Clustering d'IPs multi-métriques — backend ReactFlow.
|
||||
|
||||
Features utilisées (21 dimensions) :
|
||||
TCP stack : TTL initial, MSS, scale, fenêtre TCP
|
||||
Comportement : vélocité, POST ratio, fuzzing, assets, accès direct
|
||||
Anomalie ML : score, IP-ID zéro
|
||||
TLS/Protocole: ALPN mismatch, ALPN absent, efficacité H2
|
||||
Navigateur : browser score, headless, ordre headers, UA-CH mismatch
|
||||
Temporel : entropie, diversité JA4, UA rotatif
|
||||
|
||||
Algorithme :
|
||||
1. Échantillonnage stratifié (top détections + top hits)
|
||||
2. Construction + normalisation des vecteurs de features
|
||||
3. K-means++ (Arthur & Vassilvitskii, 2007)
|
||||
4. PCA-2D par power iteration pour les positions ReactFlow
|
||||
5. Nommage automatique par features dominantes du centroïde
|
||||
6. Calcul des arêtes : k-NN dans l'espace des features
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import time
|
||||
import hashlib
|
||||
from typing import Optional
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
|
||||
from ..database import db
|
||||
from ..services.clustering_engine import (
|
||||
FEATURES, FEATURE_KEYS, FEATURE_NORMS, FEATURE_NAMES, N_FEATURES,
|
||||
build_feature_vector, kmeans_pp, pca_2d,
|
||||
name_cluster, risk_score_from_centroid, _mean_vec,
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/api/clustering", tags=["clustering"])
|
||||
|
||||
# ─── In-memory cache ──────────────────────────────────────────────────────────
# Stores (cluster_id → list of IPs) from the last /clusters run for the
# drill-down endpoint, plus the parameters/timestamp of that run.
# NOTE(review): module-level mutable state shared across requests — assumes a
# single-process deployment; confirm before scaling out.
_cache: dict = {
    "assignments": {},  # ip+ja4 → cluster_idx
    "cluster_ips": {},  # cluster_idx → [(ip, ja4)]
    "params": {},       # k, ts
}
|
||||
|
||||
# ─── Couleurs ─────────────────────────────────────────────────────────────────
|
||||
_THREAT_COLOR = {
|
||||
0.92: "#dc2626", # Bot scanner
|
||||
0.70: "#ef4444", # Critique
|
||||
0.45: "#f97316", # Élevé
|
||||
0.25: "#eab308", # Modéré
|
||||
0.00: "#6b7280", # Sain / inconnu
|
||||
}
|
||||
|
||||
def _risk_to_color(risk: float) -> str:
|
||||
for threshold, color in sorted(_THREAT_COLOR.items(), reverse=True):
|
||||
if risk >= threshold:
|
||||
return color
|
||||
return "#6b7280"
|
||||
|
||||
|
||||
# ─── SQL ──────────────────────────────────────────────────────────────────────

# Per-(src_ip, ja4) feature extraction over the last 24 h: raw TCP/traffic
# aggregates LEFT-JOINed with ML anomaly metrics. Ordered most-anomalous first,
# then by traffic volume, so low-hit scanners are kept within the LIMIT.
# Column order must stay in sync with _SQL_COLS below.
_SQL_FEATURES = """
SELECT
    replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
    t.ja4,
    any(t.tcp_ttl_raw) AS ttl,
    any(t.tcp_win_raw) AS win,
    any(t.tcp_scale_raw) AS scale,
    any(t.tcp_mss_raw) AS mss,
    any(t.first_ua) AS ua,
    sum(t.hits) AS hits,

    avg(abs(ml.anomaly_score)) AS avg_score,
    avg(ml.hit_velocity) AS avg_velocity,
    avg(ml.fuzzing_index) AS avg_fuzzing,
    avg(ml.is_headless) AS pct_headless,
    avg(ml.post_ratio) AS avg_post,
    avg(ml.ip_id_zero_ratio) AS ip_id_zero,
    avg(ml.temporal_entropy) AS entropy,
    avg(ml.modern_browser_score) AS browser_score,
    avg(ml.alpn_http_mismatch) AS alpn_mismatch,
    avg(ml.is_alpn_missing) AS alpn_missing,
    avg(ml.multiplexing_efficiency) AS h2_eff,
    avg(ml.header_order_confidence) AS hdr_conf,
    avg(ml.ua_ch_mismatch) AS ua_ch_mismatch,
    avg(ml.asset_ratio) AS asset_ratio,
    avg(ml.direct_access_ratio) AS direct_ratio,
    avg(ml.distinct_ja4_count) AS ja4_count,
    max(ml.is_ua_rotating) AS ua_rotating,

    max(ml.threat_level) AS threat,
    any(ml.country_code) AS country,
    any(ml.asn_org) AS asn_org
FROM mabase_prod.agg_host_ip_ja4_1h t
LEFT JOIN mabase_prod.ml_detected_anomalies ml
    ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
    AND ml.detected_at >= now() - INTERVAL 24 HOUR
WHERE t.window_start >= now() - INTERVAL 24 HOUR
  AND t.tcp_ttl_raw > 0
GROUP BY t.src_ip, t.ja4
ORDER BY
    -- Stratégie : IPs anormales en premier, puis fort trafic
    -- Cela garantit que les bots Masscan (anomalie=0.97, hits=1-2) sont inclus
    avg(abs(ml.anomaly_score)) DESC,
    sum(t.hits) DESC
LIMIT %(limit)s
"""
|
||||
|
||||
# SQL column names, in the same order as the SELECT list of _SQL_FEATURES.
_SQL_COLS = [
    "ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
    "avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
    "ip_id_zero", "entropy", "browser_score", "alpn_mismatch", "alpn_missing",
    "h2_eff", "hdr_conf", "ua_ch_mismatch", "asset_ratio", "direct_ratio",
    "ja4_count", "ua_rotating", "threat", "country", "asn_org",
]
|
||||
|
||||
|
||||
# ─── Endpoints ────────────────────────────────────────────────────────────────

@router.get("/clusters")
async def get_clusters(
    k: int = Query(14, ge=4, le=30, description="Nombre de clusters"),
    n_samples: int = Query(3000, ge=500, le=8000, description="Taille de l'échantillon"),
):
    """Multi-metric IP clustering.

    Returns the cluster nodes + edges for ReactFlow, with:
    - 2D positions from PCA over the 21 features
    - per-cluster radar profile of the centroid (normalized [0, 1])
    - detailed statistics (raw feature means)
    - a sample of representative IPs

    Side effect: refreshes the module-level drill-down cache.
    Raises 500 on a ClickHouse failure, 400 when fewer rows than k.
    """
    # Local import: `collections` is not imported at module level; hoisted here
    # (the original re-imported it on every call of a per-loop helper).
    from collections import Counter

    def _avg(rows_subset: list[dict], key: str) -> float:
        """Mean of a (possibly NULL) numeric column over a set of rows."""
        vals = [float(r.get(key) or 0) for r in rows_subset]
        return sum(vals) / len(vals) if vals else 0.0

    def _topk(values: list[str], n: int = 5) -> list[str]:
        """Up to n most frequent non-empty values."""
        return [v for v, _ in Counter(values).most_common(n) if v]

    t0 = time.time()
    try:
        result = db.query(_SQL_FEATURES, {"limit": n_samples})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"ClickHouse: {e}")

    # ── Build feature vectors ─────────────────────────────────────────────
    rows: list[dict] = [
        {col: row[i] for i, col in enumerate(_SQL_COLS)}
        for row in result.result_rows
    ]
    if len(rows) < k:
        raise HTTPException(status_code=400, detail="Pas assez de données pour ce k")

    points = [build_feature_vector(r) for r in rows]

    # ── K-means++ ─────────────────────────────────────────────────────────
    km = kmeans_pp(points, k=k, max_iter=60, seed=42)

    # ── PCA-2D on the data, centroids positioned at the per-cluster mean ──
    # Projecting all points (not just centroids) makes relative positions
    # reflect the variance of the data itself.
    coords_all = pca_2d(points)
    cluster_xs: list[list[float]] = [[] for _ in range(k)]
    cluster_ys: list[list[float]] = [[] for _ in range(k)]
    for i, label in enumerate(km.labels):
        cluster_xs[label].append(coords_all[i][0])
        cluster_ys[label].append(coords_all[i][1])

    centroid_2d: list[tuple[float, float]] = []
    for j in range(k):
        if cluster_xs[j]:
            cx = sum(cluster_xs[j]) / len(cluster_xs[j])
            cy = sum(cluster_ys[j]) / len(cluster_ys[j])
        else:
            cx, cy = 0.5, 0.5  # empty cluster → canvas center
        centroid_2d.append((cx, cy))

    # ── Aggregate rows / members per cluster ──────────────────────────────
    cluster_rows: list[list[dict]] = [[] for _ in range(k)]
    cluster_members: list[list[tuple[str, str]]] = [[] for _ in range(k)]
    for i, label in enumerate(km.labels):
        cluster_rows[label].append(rows[i])
        cluster_members[label].append((rows[i]["ip"], rows[i]["ja4"]))

    # Refresh the drill-down cache
    _cache["cluster_ips"] = {j: cluster_members[j] for j in range(k)}
    _cache["params"] = {"k": k, "ts": t0}

    # ── Build ReactFlow nodes ─────────────────────────────────────────────
    CANVAS_W, CANVAS_H = 1400, 780

    nodes = []
    for j in range(k):
        rows_j = cluster_rows[j]
        if not rows_j:
            continue

        # Raw (un-normalized) feature means for display
        mean_ttl = _avg(rows_j, "ttl")
        mean_mss = _avg(rows_j, "mss")
        mean_scale = _avg(rows_j, "scale")
        mean_win = _avg(rows_j, "win")

        ip_count = len({r["ip"] for r in rows_j})
        hit_count = int(sum(float(r.get("hits") or 0) for r in rows_j))

        # Dominant country / ASN / threat level
        threats = [str(r.get("threat") or "") for r in rows_j if r.get("threat")]
        countries = [str(r.get("country") or "") for r in rows_j if r.get("country")]
        orgs = [str(r.get("asn_org") or "") for r in rows_j if r.get("asn_org")]
        top_threats = _topk(threats, 1)

        raw_stats = {
            "mean_ttl": mean_ttl, "mean_mss": mean_mss,
            "mean_scale": mean_scale,
        }
        label = name_cluster(km.centroids[j], raw_stats)
        risk = risk_score_from_centroid(km.centroids[j])
        color = _risk_to_color(risk)

        # Normalized radar profile (centroid values, [0, 1])
        radar = [
            {"feature": name, "value": round(km.centroids[j][i], 4)}
            for i, name in enumerate(FEATURE_NAMES)
        ]

        # 2D position (normalized PCA → ReactFlow pixels); y is inverted so
        # higher values render towards the top of the canvas.
        px_x = centroid_2d[j][0] * CANVAS_W * 0.85 + 80
        px_y = (1 - centroid_2d[j][1]) * CANVAS_H * 0.85 + 50

        # Radius ∝ √ip_count, clamped to [18, 90]
        radius = max(18, min(90, int(math.sqrt(ip_count) * 0.3)))

        # Sample IPs (top 8 by hits)
        sample_rows = sorted(rows_j, key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
        sample_ips = [r["ip"] for r in sample_rows]
        sample_ua = str(rows_j[0].get("ua") or "")

        cluster_id = f"c{j}_k{k}"

        nodes.append({
            "id": cluster_id,
            "label": label,
            "cluster_idx": j,
            "x": round(px_x, 1),
            "y": round(px_y, 1),
            "radius": radius,
            "color": color,
            "risk_score": risk,

            # TCP stack characteristics
            "mean_ttl": round(mean_ttl, 1),
            "mean_mss": round(mean_mss, 0),
            "mean_scale": round(mean_scale, 1),
            "mean_win": round(mean_win, 0),

            # HTTP behaviour
            "mean_score": round(_avg(rows_j, "avg_score"), 4),
            "mean_velocity": round(_avg(rows_j, "avg_velocity"), 3),
            "mean_fuzzing": round(_avg(rows_j, "avg_fuzzing"), 3),
            "mean_headless": round(_avg(rows_j, "pct_headless"), 3),
            "mean_post": round(_avg(rows_j, "avg_post"), 3),
            "mean_asset": round(_avg(rows_j, "asset_ratio"), 3),
            "mean_direct": round(_avg(rows_j, "direct_ratio"), 3),

            # TLS / protocol
            "mean_alpn_mismatch": round(_avg(rows_j, "alpn_mismatch"), 3),
            "mean_h2_eff": round(_avg(rows_j, "h2_eff"), 3),
            "mean_hdr_conf": round(_avg(rows_j, "hdr_conf"), 3),
            "mean_ua_ch": round(_avg(rows_j, "ua_ch_mismatch"), 3),

            # Temporal
            "mean_entropy": round(_avg(rows_j, "entropy"), 3),
            "mean_ja4_diversity": round(_avg(rows_j, "ja4_count"), 3),
            "mean_ip_id_zero": round(_avg(rows_j, "ip_id_zero"), 3),
            "mean_browser_score": round(_avg(rows_j, "browser_score"), 1),
            "mean_ua_rotating": round(_avg(rows_j, "ua_rotating"), 3),

            # Meta
            "ip_count": ip_count,
            "hit_count": hit_count,
            "top_threat": top_threats[0] if top_threats else "",
            "top_countries": _topk(countries, 5),
            "top_orgs": _topk(orgs, 5),
            "sample_ips": sample_ips,
            "sample_ua": sample_ua,

            # Radar profile for the visualization
            "radar": radar,
        })

    # ── Edges: k-NN in feature space ──────────────────────────────────────
    # Each cluster is linked to its 2 nearest neighbours (deduplicated).
    edges = []
    seen: set[frozenset] = set()
    centroids = km.centroids

    for i, ni in enumerate(nodes):
        ci = ni["cluster_idx"]
        # Squared distance to every other node's centroid
        dists = []
        for j2, nj in enumerate(nodes):
            if j2 == i:
                continue
            cj = nj["cluster_idx"]
            d2 = sum((centroids[ci][d] - centroids[cj][d]) ** 2
                     for d in range(N_FEATURES))
            dists.append((j2, d2))
        dists.sort(key=lambda x: x[1])
        # Two nearest neighbours
        for j2, dist2 in dists[:2]:
            key = frozenset([ni["id"], nodes[j2]["id"]])
            if key in seen:
                continue
            seen.add(key)
            similarity = round(1.0 / (1.0 + math.sqrt(dist2)), 3)
            edges.append({
                "id": f"e_{ni['id']}_{nodes[j2]['id']}",
                "source": ni["id"],
                "target": nodes[j2]["id"],
                "similarity": similarity,
                "weight": round(similarity * 5, 1),
            })

    # ── Global stats ──────────────────────────────────────────────────────
    total_ips = sum(n["ip_count"] for n in nodes)
    total_hits = sum(n["hit_count"] for n in nodes)
    bot_ips = sum(n["ip_count"] for n in nodes if n["risk_score"] > 0.40 or "🤖" in n["label"])
    high_risk = sum(n["ip_count"] for n in nodes if n["risk_score"] > 0.20)

    elapsed = round(time.time() - t0, 2)

    return {
        "nodes": nodes,
        "edges": edges,
        "stats": {
            "total_clusters": len(nodes),
            "total_ips": total_ips,
            "total_hits": total_hits,
            "bot_ips": bot_ips,
            "high_risk_ips": high_risk,
            "n_samples": len(rows),
            "k": k,
            "elapsed_s": elapsed,
        },
        "feature_names": FEATURE_NAMES,
    }
|
||||
|
||||
|
||||
@router.get("/cluster/{cluster_id}/ips")
|
||||
async def get_cluster_ips(
|
||||
cluster_id: str,
|
||||
limit: int = Query(100, ge=1, le=500),
|
||||
offset: int = Query(0, ge=0),
|
||||
):
|
||||
"""
|
||||
IPs appartenant à un cluster (depuis le cache de la dernière exécution).
|
||||
Si le cache est expiré, retourne une erreur guidant vers /clusters.
|
||||
"""
|
||||
if not _cache.get("cluster_ips"):
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Cache expiré — appelez /api/clustering/clusters d'abord"
|
||||
)
|
||||
|
||||
# Extrait l'index cluster depuis l'id (format: c{idx}_k{k})
|
||||
try:
|
||||
idx = int(cluster_id.split("_")[0][1:])
|
||||
except (ValueError, IndexError):
|
||||
raise HTTPException(status_code=400, detail="cluster_id invalide")
|
||||
|
||||
members = _cache["cluster_ips"].get(idx, [])
|
||||
if not members:
|
||||
return {"ips": [], "total": 0, "cluster_id": cluster_id}
|
||||
|
||||
total = len(members)
|
||||
page_members = members[offset: offset + limit]
|
||||
|
||||
# Requête SQL pour les détails de ces IPs spécifiques
|
||||
ip_list = [m[0] for m in page_members]
|
||||
ja4_list = [m[1] for m in page_members]
|
||||
|
||||
if not ip_list:
|
||||
return {"ips": [], "total": total, "cluster_id": cluster_id}
|
||||
|
||||
# On ne peut pas facilement passer une liste en paramètre ClickHouse —
|
||||
# on la construit directement (valeurs nettoyées)
|
||||
safe_ips = [ip.replace("'", "") for ip in ip_list[:100]]
|
||||
ip_filter = ", ".join(f"'{ip}'" for ip in safe_ips)
|
||||
|
||||
sql = f"""
|
||||
SELECT
|
||||
replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS src_ip,
|
||||
t.ja4,
|
||||
any(t.tcp_ttl_raw) AS ttl,
|
||||
any(t.tcp_win_raw) AS win,
|
||||
any(t.tcp_scale_raw) AS scale,
|
||||
any(t.tcp_mss_raw) AS mss,
|
||||
sum(t.hits) AS hits,
|
||||
any(t.first_ua) AS ua,
|
||||
round(avg(abs(ml.anomaly_score)), 3) AS avg_score,
|
||||
max(ml.threat_level) AS threat_level,
|
||||
any(ml.country_code) AS country_code,
|
||||
any(ml.asn_org) AS asn_org,
|
||||
round(avg(ml.fuzzing_index), 2) AS fuzzing,
|
||||
round(avg(ml.hit_velocity), 2) AS velocity
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h t
|
||||
LEFT JOIN mabase_prod.ml_detected_anomalies ml
|
||||
ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
|
||||
AND ml.detected_at >= now() - INTERVAL 24 HOUR
|
||||
WHERE t.window_start >= now() - INTERVAL 24 HOUR
|
||||
AND replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') IN ({ip_filter})
|
||||
GROUP BY t.src_ip, t.ja4
|
||||
ORDER BY hits DESC
|
||||
"""
|
||||
try:
|
||||
result = db.query(sql)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
ips = []
|
||||
for row in result.result_rows:
|
||||
ips.append({
|
||||
"ip": str(row[0]),
|
||||
"ja4": str(row[1] or ""),
|
||||
"tcp_ttl": int(row[2] or 0),
|
||||
"tcp_win": int(row[3] or 0),
|
||||
"tcp_scale": int(row[4] or 0),
|
||||
"tcp_mss": int(row[5] or 0),
|
||||
"hits": int(row[6] or 0),
|
||||
"ua": str(row[7] or ""),
|
||||
"avg_score": float(row[8] or 0),
|
||||
"threat_level": str(row[9] or ""),
|
||||
"country_code": str(row[10] or ""),
|
||||
"asn_org": str(row[11] or ""),
|
||||
"fuzzing": float(row[12] or 0),
|
||||
"velocity": float(row[13] or 0),
|
||||
})
|
||||
|
||||
return {"ips": ips, "total": total, "cluster_id": cluster_id}
|
||||
@ -7,6 +7,7 @@ agg_host_ip_ja4_1h (rotation JA4), view_ip_recurrence, view_ai_features_1h.
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from ..database import db
|
||||
from ..services.tcp_fingerprint import fingerprint_os, detect_spoof, declared_os_from_ua
|
||||
|
||||
router = APIRouter(prefix="/api/investigation", tags=["investigation"])
|
||||
|
||||
@ -62,32 +63,45 @@ async def get_ip_full_summary(ip: str):
|
||||
"top_hosts": [str(h) for h in (bf_row[3] or [])] if bf_row else [],
|
||||
}
|
||||
|
||||
# ── 3. TCP spoofing ────────────────────────────────────────────────────
|
||||
# ── 3. TCP spoofing — fingerprinting multi-signal ─────────────────────
|
||||
tcp_sql = """
|
||||
SELECT tcp_ttl, first_ua
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
SELECT
|
||||
any(tcp_ttl_raw) AS ttl,
|
||||
any(tcp_win_raw) AS win,
|
||||
any(tcp_scale_raw) AS scale,
|
||||
any(tcp_mss_raw) AS mss,
|
||||
any(first_ua) AS ua
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
|
||||
AND tcp_ttl > 0
|
||||
AND window_start >= now() - INTERVAL 24 HOUR
|
||||
AND tcp_ttl_raw > 0
|
||||
LIMIT 1
|
||||
"""
|
||||
tcp_res = db.query(tcp_sql, {"ip": clean_ip})
|
||||
tcp_data = {"detected": False, "tcp_ttl": None, "suspected_os": None}
|
||||
if tcp_res.result_rows:
|
||||
ttl = int(tcp_res.result_rows[0][0])
|
||||
if 52 <= ttl <= 65:
|
||||
sus_os = "Linux/Mac"
|
||||
elif 110 <= ttl <= 135:
|
||||
sus_os = "Windows"
|
||||
else:
|
||||
sus_os = "Unknown"
|
||||
ua = str(tcp_res.result_rows[0][1] or "")
|
||||
dec_os = "Windows" if "Windows" in ua else ("macOS" if "Mac OS X" in ua else "Linux/Android" if "Linux" in ua else "Unknown")
|
||||
spoof = sus_os != "Unknown" and dec_os != "Unknown" and sus_os != dec_os
|
||||
r = tcp_res.result_rows[0]
|
||||
ttl = int(r[0] or 0)
|
||||
win = int(r[1] or 0)
|
||||
scale = int(r[2] or 0)
|
||||
mss = int(r[3] or 0)
|
||||
ua = str(r[4] or "")
|
||||
fp = fingerprint_os(ttl, win, scale, mss)
|
||||
dec_os = declared_os_from_ua(ua)
|
||||
spoof_res = detect_spoof(fp, dec_os)
|
||||
tcp_data = {
|
||||
"detected": spoof,
|
||||
"tcp_ttl": ttl,
|
||||
"suspected_os": sus_os,
|
||||
"declared_os": dec_os,
|
||||
"detected": spoof_res.is_spoof,
|
||||
"tcp_ttl": ttl,
|
||||
"tcp_mss": mss,
|
||||
"tcp_win_scale": scale,
|
||||
"initial_ttl": fp.initial_ttl,
|
||||
"hop_count": fp.hop_count,
|
||||
"suspected_os": fp.os_name,
|
||||
"declared_os": dec_os,
|
||||
"confidence": fp.confidence,
|
||||
"network_path": fp.network_path,
|
||||
"is_bot_tool": fp.is_bot_tool,
|
||||
"spoof_reason": spoof_res.reason,
|
||||
}
|
||||
|
||||
# ── 4. JA4 rotation ────────────────────────────────────────────────────
|
||||
@ -146,7 +160,9 @@ async def get_ip_full_summary(ip: str):
|
||||
risk = 0
|
||||
risk += min(50, ml_data["max_score"] * 50)
|
||||
if bf_data["active"]: risk += 20
|
||||
if tcp_data["detected"]: risk += 15
|
||||
if tcp_data["detected"]:
|
||||
if tcp_data.get("is_bot_tool"): risk += 30 # outil de scan connu
|
||||
else: risk += 15 # spoof OS
|
||||
if rot_data["rotating"]: risk += min(15, rot_data["distinct_ja4_count"] * 3)
|
||||
if pers_data["persistent"]: risk += min(10, pers_data["recurrence"] * 2)
|
||||
risk = min(100, round(risk))
|
||||
|
||||
@ -1,130 +1,95 @@
|
||||
"""
|
||||
Endpoints pour la détection du TCP spoofing (TTL / window size anormaux)
|
||||
Endpoints pour la détection du TCP spoofing / fingerprinting OS
|
||||
|
||||
Règle de corrélation :
|
||||
- TTL=0 ou tcp_window_size=0 → données TCP absentes (proxy/LB) → pas de corrélation possible
|
||||
- TTL 55-65 → fingerprint Linux/Mac (initial TTL 64)
|
||||
- TTL 120-135 → fingerprint Windows (initial TTL 128)
|
||||
- TTL 110-120 → fingerprint Windows (initial TTL 128, quelques sauts)
|
||||
- Toute autre valeur → OS indéterminé → pas de flag spoofing
|
||||
- spoof_flag = True UNIQUEMENT si OS fingerprinting TCP possible ET incompatible avec l'UA
|
||||
Approche multi-signal (p0f-style) :
|
||||
- TTL initial estimé → famille OS (Linux/Mac=64, Windows=128, Cisco/BSD=255)
|
||||
- MSS → type de réseau (Ethernet=1460, PPPoE=1452, VPN=1380-1420)
|
||||
- Taille de fenêtre → signature OS précise
|
||||
- Facteur d'échelle → affine la version kernel/stack TCP
|
||||
|
||||
Détection bots : signatures connues (Masscan/ZMap/Mirai) identifiées par combinaison
|
||||
win+scale+mss indépendamment de l'UA.
|
||||
"""
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
|
||||
from ..database import db
|
||||
from ..services.tcp_fingerprint import (
|
||||
fingerprint_os,
|
||||
detect_spoof,
|
||||
declared_os_from_ua,
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/api/tcp-spoofing", tags=["tcp_spoofing"])
|
||||
|
||||
# Plages TTL qui permettent une corrélation fiable
|
||||
_TTL_LINUX = (range(52, 66), "Linux/Mac") # initial 64, 1-12 sauts
|
||||
_TTL_WINDOWS = (range(110, 136), "Windows") # initial 128, 1-18 sauts
|
||||
_TTL_CISCO = (range(240, 256), "Cisco/BSD") # initial 255
|
||||
|
||||
|
||||
def _suspected_os(ttl: int) -> str:
|
||||
"""Retourne l'OS probable à partir du TTL observé.
|
||||
Retourne 'Unknown' si le TTL ne permet pas une corrélation fiable
|
||||
(TTL=0 = pas de données TCP, ou hors plage connue).
|
||||
"""
|
||||
if ttl <= 0:
|
||||
return "Unknown" # Pas de données TCP (proxy/CDN)
|
||||
for rng, name in (_TTL_LINUX, _TTL_WINDOWS, _TTL_CISCO):
|
||||
if ttl in rng:
|
||||
return name
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def _declared_os(ua: str) -> str:
|
||||
ua = ua or ""
|
||||
if "Windows" in ua:
|
||||
return "Windows"
|
||||
if "Mac OS X" in ua:
|
||||
return "macOS"
|
||||
if "Linux" in ua or "Android" in ua:
|
||||
return "Linux/Android"
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def _is_spoof(suspected_os: str, declared_os: str) -> bool:
|
||||
"""Spoof confirmé uniquement si on a un fingerprint TCP fiable ET une incompatibilité d'OS."""
|
||||
if suspected_os == "Unknown" or declared_os == "Unknown":
|
||||
return False # Pas de corrélation possible
|
||||
# Linux/Mac fingerprint TCP mais UA déclare Windows
|
||||
if suspected_os == "Linux/Mac" and declared_os == "Windows":
|
||||
return True
|
||||
# Windows fingerprint TCP mais UA déclare Linux/Android ou macOS
|
||||
if suspected_os == "Windows" and declared_os in ("Linux/Android", "macOS"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@router.get("/overview")
|
||||
async def get_tcp_spoofing_overview():
|
||||
"""Statistiques globales : seules les entrées avec données TCP valides sont analysées."""
|
||||
"""Statistiques globales avec fingerprinting multi-signal (TTL + MSS + fenêtre + scale)."""
|
||||
try:
|
||||
sql = """
|
||||
SELECT
|
||||
count() AS total_entries,
|
||||
uniq(src_ip) AS unique_ips,
|
||||
countIf(tcp_ttl = 0) AS no_tcp_data,
|
||||
countIf(tcp_ttl > 0) AS with_tcp_data,
|
||||
countIf(tcp_ttl BETWEEN 52 AND 65) AS linux_fingerprint,
|
||||
countIf(tcp_ttl BETWEEN 110 AND 135) AS windows_fingerprint
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
count() AS total_entries,
|
||||
uniq(src_ip) AS unique_ips,
|
||||
countIf(tcp_ttl_raw = 0) AS no_tcp_data,
|
||||
countIf(tcp_ttl_raw > 0) AS with_tcp_data,
|
||||
countIf(tcp_ttl_raw > 0 AND tcp_ttl_raw <= 64) AS linux_mac_fp,
|
||||
countIf(tcp_ttl_raw > 64 AND tcp_ttl_raw <= 128) AS windows_fp,
|
||||
countIf(tcp_ttl_raw > 128) AS cisco_bsd_fp,
|
||||
countIf(tcp_win_raw = 5808 AND tcp_mss_raw = 1452 AND tcp_scale_raw = 4) AS bot_scanner_fp
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR
|
||||
"""
|
||||
result = db.query(sql)
|
||||
row = result.result_rows[0]
|
||||
total_entries = int(row[0])
|
||||
unique_ips = int(row[1])
|
||||
no_tcp_data = int(row[2])
|
||||
with_tcp_data = int(row[3])
|
||||
linux_fp = int(row[4])
|
||||
windows_fp = int(row[5])
|
||||
|
||||
# Distribution TTL uniquement pour les entrées avec données TCP valides
|
||||
# Distribution TTL (top 15)
|
||||
ttl_sql = """
|
||||
SELECT
|
||||
tcp_ttl,
|
||||
count() AS cnt,
|
||||
uniq(src_ip) AS ips
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
WHERE tcp_ttl > 0
|
||||
GROUP BY tcp_ttl
|
||||
ORDER BY cnt DESC
|
||||
LIMIT 15
|
||||
SELECT tcp_ttl_raw AS ttl, count() AS cnt, uniq(src_ip) AS ips
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
|
||||
GROUP BY ttl ORDER BY cnt DESC LIMIT 15
|
||||
"""
|
||||
ttl_res = db.query(ttl_sql)
|
||||
ttl_distribution = [
|
||||
{"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
|
||||
for r in ttl_res.result_rows
|
||||
]
|
||||
|
||||
# Distribution window_size pour entrées avec données TCP
|
||||
# Distribution MSS — nouveau signal clé (top 12)
|
||||
mss_sql = """
|
||||
SELECT tcp_mss_raw AS mss, count() AS cnt, uniq(src_ip) AS ips
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_mss_raw > 0
|
||||
GROUP BY mss ORDER BY cnt DESC LIMIT 12
|
||||
"""
|
||||
mss_res = db.query(mss_sql)
|
||||
|
||||
# Distribution fenêtre (top 10)
|
||||
win_sql = """
|
||||
SELECT
|
||||
tcp_window_size,
|
||||
count() AS cnt
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
WHERE tcp_ttl > 0
|
||||
GROUP BY tcp_window_size
|
||||
ORDER BY cnt DESC
|
||||
LIMIT 10
|
||||
SELECT tcp_win_raw AS win, count() AS cnt
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
|
||||
GROUP BY win ORDER BY cnt DESC LIMIT 10
|
||||
"""
|
||||
win_res = db.query(win_sql)
|
||||
window_size_distribution = [
|
||||
{"window_size": int(r[0]), "count": int(r[1])}
|
||||
for r in win_res.result_rows
|
||||
]
|
||||
|
||||
return {
|
||||
"total_entries": total_entries,
|
||||
"unique_ips": unique_ips,
|
||||
"no_tcp_data": no_tcp_data,
|
||||
"with_tcp_data": with_tcp_data,
|
||||
"linux_fingerprint": linux_fp,
|
||||
"windows_fingerprint": windows_fp,
|
||||
"ttl_distribution": ttl_distribution,
|
||||
"window_size_distribution": window_size_distribution,
|
||||
"total_entries": int(row[0]),
|
||||
"unique_ips": int(row[1]),
|
||||
"no_tcp_data": int(row[2]),
|
||||
"with_tcp_data": int(row[3]),
|
||||
"linux_mac_fingerprint": int(row[4]),
|
||||
"windows_fingerprint": int(row[5]),
|
||||
"cisco_bsd_fingerprint": int(row[6]),
|
||||
"bot_scanner_fingerprint": int(row[7]),
|
||||
"ttl_distribution": [
|
||||
{"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
|
||||
for r in ttl_res.result_rows
|
||||
],
|
||||
"mss_distribution": [
|
||||
{"mss": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
|
||||
for r in mss_res.result_rows
|
||||
],
|
||||
"window_size_distribution": [
|
||||
{"window_size": int(r[0]), "count": int(r[1])}
|
||||
for r in win_res.result_rows
|
||||
],
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@ -134,54 +99,75 @@ async def get_tcp_spoofing_overview():
|
||||
async def get_tcp_spoofing_list(
|
||||
limit: int = Query(100, ge=1, le=1000),
|
||||
offset: int = Query(0, ge=0),
|
||||
spoof_only: bool = Query(False, description="Ne retourner que les vrais spoofs (TTL corrélable + OS mismatch)"),
|
||||
spoof_only: bool = Query(False, description="Retourner uniquement les spoofs/bots confirmés"),
|
||||
):
|
||||
"""Liste des entrées avec données TCP valides (tcp_ttl > 0).
|
||||
Entrées sans données TCP (TTL=0) exclues : pas de corrélation possible.
|
||||
Si spoof_only=True, retourne uniquement les entrées avec fingerprint OS identifiable (Linux/Mac TTL 52-65).
|
||||
"""Liste avec fingerprinting multi-signal (TTL + MSS + fenêtre + scale).
|
||||
Inclut les champs enrichis : mss, win_scale, initial_ttl, hop_count, confidence, network_path, is_bot_tool.
|
||||
"""
|
||||
try:
|
||||
# Filtre SQL : seules les entrées avec TTL valide, et si spoof_only les plages corrélables
|
||||
if spoof_only:
|
||||
# Seules les plages de TTL qui permettent une identification OS fiable
|
||||
ttl_filter = "tcp_ttl BETWEEN 52 AND 65 OR tcp_ttl BETWEEN 110 AND 135 OR tcp_ttl BETWEEN 240 AND 255"
|
||||
else:
|
||||
ttl_filter = "tcp_ttl > 0"
|
||||
|
||||
count_sql = f"SELECT count() FROM mabase_prod.view_tcp_spoofing_detected WHERE {ttl_filter}"
|
||||
count_sql = """
|
||||
SELECT count() FROM (
|
||||
SELECT src_ip, ja4
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
|
||||
GROUP BY src_ip, ja4
|
||||
)
|
||||
"""
|
||||
total = int(db.query(count_sql).result_rows[0][0])
|
||||
|
||||
sql = f"""
|
||||
sql = """
|
||||
SELECT
|
||||
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS src_ip,
|
||||
ja4, tcp_ttl, tcp_window_size, first_ua
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
WHERE {ttl_filter}
|
||||
ORDER BY tcp_ttl ASC
|
||||
ja4,
|
||||
any(tcp_ttl_raw) AS tcp_ttl,
|
||||
any(tcp_win_raw) AS tcp_window_size,
|
||||
any(tcp_scale_raw) AS tcp_win_scale,
|
||||
any(tcp_mss_raw) AS tcp_mss,
|
||||
any(first_ua) AS first_ua,
|
||||
sum(hits) AS hits
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
|
||||
GROUP BY src_ip, ja4
|
||||
ORDER BY hits DESC
|
||||
LIMIT %(limit)s OFFSET %(offset)s
|
||||
"""
|
||||
result = db.query(sql, {"limit": limit, "offset": offset})
|
||||
items = []
|
||||
for row in result.result_rows:
|
||||
ip = str(row[0])
|
||||
ja4 = str(row[1] or "")
|
||||
ttl = int(row[2])
|
||||
window_size = int(row[3])
|
||||
ua = str(row[4] or "")
|
||||
sus_os = _suspected_os(ttl)
|
||||
dec_os = _declared_os(ua)
|
||||
spoof_flag = _is_spoof(sus_os, dec_os)
|
||||
if spoof_only and not spoof_flag:
|
||||
ip = str(row[0])
|
||||
ja4 = str(row[1] or "")
|
||||
ttl = int(row[2] or 0)
|
||||
win = int(row[3] or 0)
|
||||
scale = int(row[4] or 0)
|
||||
mss = int(row[5] or 0)
|
||||
ua = str(row[6] or "")
|
||||
hits = int(row[7] or 0)
|
||||
|
||||
fp = fingerprint_os(ttl, win, scale, mss)
|
||||
dec_os = declared_os_from_ua(ua)
|
||||
spoof_res = detect_spoof(fp, dec_os)
|
||||
|
||||
if spoof_only and not spoof_res.is_spoof:
|
||||
continue
|
||||
|
||||
items.append({
|
||||
"ip": ip,
|
||||
"ja4": ja4,
|
||||
"tcp_ttl": ttl,
|
||||
"tcp_window_size": window_size,
|
||||
"tcp_window_size": win,
|
||||
"tcp_win_scale": scale,
|
||||
"tcp_mss": mss,
|
||||
"hits": hits,
|
||||
"first_ua": ua,
|
||||
"suspected_os": sus_os,
|
||||
"suspected_os": fp.os_name,
|
||||
"initial_ttl": fp.initial_ttl,
|
||||
"hop_count": fp.hop_count,
|
||||
"confidence": fp.confidence,
|
||||
"network_path": fp.network_path,
|
||||
"is_bot_tool": fp.is_bot_tool,
|
||||
"declared_os": dec_os,
|
||||
"spoof_flag": spoof_flag,
|
||||
"spoof_flag": spoof_res.is_spoof,
|
||||
"spoof_reason": spoof_res.reason,
|
||||
})
|
||||
return {"items": items, "total": total}
|
||||
except Exception as e:
|
||||
@ -190,29 +176,44 @@ async def get_tcp_spoofing_list(
|
||||
|
||||
@router.get("/matrix")
|
||||
async def get_tcp_spoofing_matrix():
|
||||
"""Matrice suspected_os × declared_os — uniquement entrées avec TTL valide."""
|
||||
"""Matrice OS suspecté × OS déclaré avec fingerprinting multi-signal."""
|
||||
try:
|
||||
sql = """
|
||||
SELECT tcp_ttl, first_ua
|
||||
FROM mabase_prod.view_tcp_spoofing_detected
|
||||
WHERE tcp_ttl > 0
|
||||
SELECT
|
||||
any(tcp_ttl_raw) AS ttl,
|
||||
any(tcp_win_raw) AS win,
|
||||
any(tcp_scale_raw) AS scale,
|
||||
any(tcp_mss_raw) AS mss,
|
||||
any(first_ua) AS ua,
|
||||
count() AS cnt
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
|
||||
GROUP BY src_ip, ja4
|
||||
"""
|
||||
result = db.query(sql)
|
||||
counts: dict = {}
|
||||
for row in result.result_rows:
|
||||
ttl = int(row[0])
|
||||
ua = str(row[1] or "")
|
||||
sus_os = _suspected_os(ttl)
|
||||
dec_os = _declared_os(ua)
|
||||
key = (sus_os, dec_os)
|
||||
counts[key] = counts.get(key, 0) + 1
|
||||
ttl = int(row[0] or 0)
|
||||
win = int(row[1] or 0)
|
||||
scale = int(row[2] or 0)
|
||||
mss = int(row[3] or 0)
|
||||
ua = str(row[4] or "")
|
||||
cnt = int(row[5] or 1)
|
||||
|
||||
fp = fingerprint_os(ttl, win, scale, mss)
|
||||
dec_os = declared_os_from_ua(ua)
|
||||
spoof_res = detect_spoof(fp, dec_os)
|
||||
|
||||
key = (fp.os_name, dec_os, spoof_res.is_spoof, fp.is_bot_tool)
|
||||
counts[key] = counts.get(key, 0) + cnt
|
||||
|
||||
matrix = [
|
||||
{
|
||||
"suspected_os": k[0],
|
||||
"declared_os": k[1],
|
||||
"count": v,
|
||||
"is_spoof": _is_spoof(k[0], k[1]),
|
||||
"is_spoof": k[2],
|
||||
"is_bot_tool": k[3],
|
||||
}
|
||||
for k, v in counts.items()
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user