feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized

Services: - ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap) - logcorrelator: JA4 log correlation engine (Go, ClickHouse) - mod_reqin_log: Apache module (C, JSON request logging) - bot_detector: ML bot detection pipeline (Python) - dashboard: FastAPI/Streamlit analytics UI (Python) Shared libraries: - shared/go/ja4common: logger, config, shutdown, ipfilter (Go module) - shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package) - shared/clickhouse/: canonical SQL migrations (10 files) Build & packaging: - Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10) - go.work workspace linking sentinel, correlator, ja4common - Makefile with test-all, build-all, rpm-* targets Fixes applied: - go.work: 1.21 → 1.24.6 (required by sentinel) - correlator Dockerfiles: golang:1.21 → golang:1.24 - replace directives in go.mod for ja4common local path - pyproject.toml: setuptools.backends → setuptools.build_meta - Removed static libpcap linking (unavailable on Rocky 9) - Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32) - Rewrote corrupted test files (logger_test.go × 2) Test coverage: - correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%) - sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse) Documentation: - README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-07 16:42:59 +02:00
commit d469e39da7
278 changed files with 1621301 additions and 0 deletions
--- a/services/dashboard/backend/routes/fingerprints.py
+++ b/services/dashboard/backend/routes/fingerprints.py
@@ -0,0 +1,827 @@
+"""
+Endpoints pour l'analyse des fingerprints JA4 et User-Agents
+
+Objectifs:
+  - Détecter le spoofing JA4 (fingerprint TLS qui prétend être un navigateur mais
+    dont les User-Agents, les headers HTTP ou les métriques comportementales trahissent
+    une origine bot/script)
+  - Construire une matrice JA4 × User-Agent pour visualiser les associations suspectes
+  - Analyser la distribution des User-Agents pour identifier les rotateurs et les bots
+    qui usurpent des UA de navigateurs légitimes
+"""
+from fastapi import APIRouter, HTTPException, Query
+import re
+
+from ..database import db
+
+router = APIRouter(prefix="/api/fingerprints", tags=["fingerprints"])
+
+
+# ─── Helpers ──────────────────────────────────────────────────────────────────
+
+# Patterns indiquant clairement un bot/script sans simulation de navigateur
+_BOT_PATTERNS = re.compile(
+    r"bot|crawler|spider|scraper|python|curl|wget|go-http|java/|axios|"
+    r"libwww|httpclient|okhttp|requests|aiohttp|httpx|playwright|puppeteer|"
+    r"selenium|headless|phantomjs",
+    re.IGNORECASE,
+)
+
+# Navigateurs légitimes communs — un JA4 de type "browser" devrait venir avec ces UAs
+_BROWSER_PATTERNS = re.compile(
+    r"mozilla|chrome|safari|firefox|edge|opera|trident",
+    re.IGNORECASE,
+)
+
+
+def _classify_ua(ua: str) -> str:
+    """Retourne 'bot', 'browser', ou 'script'"""
+    if not ua:
+        return "empty"
+    if _BOT_PATTERNS.search(ua):
+        return "bot"
+    if _BROWSER_PATTERNS.search(ua):
+        return "browser"
+    return "script"
+
+
+# =============================================================================
+# ENDPOINT 1 — Détection de spoofing JA4
+# =============================================================================
+
+@router.get("/spoofing")
+async def get_ja4_spoofing(
+    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
+    min_detections: int = Query(10, ge=1, description="Nombre minimum de détections"),
+    limit: int = Query(50, ge=1, le=200),
+):
+    """
+    Identifie les JA4 fingerprints suspects de spoofing navigateur.
+
+    Un JA4 est considéré suspect quand:
+      - Il présente un taux élevé de ua_ch_mismatch (header UA ≠ Client Hints)
+      - Son modern_browser_score est élevé mais les UAs associés sont des bots/scripts
+      - Il apparaît avec un taux élevé de sni_host_mismatch ou alpn_http_mismatch
+      - is_rare_ja4 = true avec un volume important
+
+    Retourne un score de confiance de spoofing [0-100] pour chaque JA4.
+    """
+    try:
+        # Agrégation par JA4 avec tous les indicateurs de spoofing
+        query = """
+        SELECT
+            ja4,
+            count() AS total_detections,
+            uniq(src_ip) AS unique_ips,
+
+            -- Indicateurs de mismatch
+            countIf(ua_ch_mismatch = true) AS ua_ch_mismatch_count,
+            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
+            countIf(sni_host_mismatch = true) AS sni_mismatch_count,
+            round(countIf(sni_host_mismatch = true) * 100.0 / count(), 2) AS sni_mismatch_pct,
+            countIf(alpn_http_mismatch = true) AS alpn_mismatch_count,
+            round(countIf(alpn_http_mismatch = true) * 100.0 / count(), 2) AS alpn_mismatch_pct,
+
+            -- Indicateurs comportementaux
+            avg(modern_browser_score) AS avg_browser_score,
+            countIf(is_rare_ja4 = true) AS rare_ja4_count,
+            round(countIf(is_rare_ja4 = true) * 100.0 / count(), 2) AS rare_ja4_pct,
+            countIf(is_ua_rotating = true) AS ua_rotating_count,
+            round(countIf(is_ua_rotating = true) * 100.0 / count(), 2) AS ua_rotating_pct,
+
+            -- Métriques TLS/TCP
+            countIf(is_alpn_missing = true) AS alpn_missing_count,
+            avg(distinct_ja4_count) AS avg_distinct_ja4_per_ip,
+
+            -- Répartition threat levels
+            countIf(threat_level = 'CRITICAL') AS critical_count,
+            countIf(threat_level = 'HIGH') AS high_count,
+
+            -- Botnet indicators
+            avg(ja4_asn_concentration) AS avg_asn_concentration,
+            avg(ja4_country_concentration) AS avg_country_concentration,
+
+            argMax(threat_level, detected_at) AS last_threat_level
+        FROM ml_detected_anomalies
+        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
+          AND ja4 != '' AND ja4 IS NOT NULL
+        GROUP BY ja4
+        HAVING total_detections >= %(min_detections)s
+        ORDER BY ua_ch_mismatch_pct DESC, total_detections DESC
+        LIMIT %(limit)s
+        """
+
+        result = db.query(query, {
+            "hours": hours,
+            "min_detections": min_detections,
+            "limit": limit,
+        })
+
+        # Fetch top UA per JA4 from view_dashboard_user_agents
+        ja4_list = [str(r[0]) for r in result.result_rows if r[0]]
+        ua_by_ja4: dict = {}
+        if ja4_list:
+            ja4_sql = ", ".join(f"'{j}'" for j in ja4_list[:100])
+            ua_q = f"""
+            SELECT ja4, groupArray(5)(ua) AS top_uas
+            FROM (
+                SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
+                FROM view_dashboard_user_agents
+                WHERE ja4 IN ({ja4_sql})
+                  AND hour >= now() - INTERVAL {hours} HOUR
+                  AND ua != ''
+                GROUP BY ja4, ua
+                ORDER BY ja4, cnt DESC
+            )
+            GROUP BY ja4
+            """
+            try:
+                ua_res = db.query(ua_q)
+                for ua_row in ua_res.result_rows:
+                    j4 = str(ua_row[0])
+                    if ua_row[1]:
+                        ua_by_ja4[j4] = list(ua_row[1])
+            except Exception:
+                pass
+
+        items = []
+        for row in result.result_rows:
+            ja4 = str(row[0])
+            ua_ch_mismatch_pct = float(row[4] or 0)
+            sni_mismatch_pct = float(row[6] or 0)
+            alpn_mismatch_pct = float(row[8] or 0)
+            avg_browser_score = float(row[9] or 0)
+            rare_ja4_pct = float(row[11] or 0)
+            ua_rotating_pct = float(row[13] or 0)
+            alpn_missing_count = int(row[14] or 0)
+            total = int(row[1] or 1)
+
+            top_uas = ua_by_ja4.get(ja4, [])
+            ua_classes = [_classify_ua(u) for u in top_uas]
+            has_bot_ua = any(c == "bot" for c in ua_classes)
+            has_browser_ua = any(c == "browser" for c in ua_classes)
+
+            # Spoofing confidence score [0-100]:
+            # UA/CH mismatch est le signal le plus fort (poids 40)
+            # Browser UA avec score navigateur élevé mais indicateurs bot (poids 25)
+            # SNI/ALPN mismatches (poids 15)
+            # is_rare_ja4 avec gros volume (poids 10)
+            # UA rotating (poids 10)
+            spoof_score = min(100, round(
+                ua_ch_mismatch_pct * 0.40
+                + (avg_browser_score * 25 / 100 if has_bot_ua else 0)
+                + sni_mismatch_pct * 0.10
+                + alpn_mismatch_pct * 0.05
+                + rare_ja4_pct * 0.10
+                + ua_rotating_pct * 0.10
+                + (10 if alpn_missing_count > total * 0.3 else 0)
+            ))
+
+            # Classification du JA4
+            if spoof_score >= 60:
+                classification = "spoofed_browser"
+            elif has_bot_ua and avg_browser_score < 30:
+                classification = "known_bot"
+            elif has_browser_ua and ua_ch_mismatch_pct < 10:
+                classification = "legitimate_browser"
+            else:
+                classification = "suspicious"
+
+            items.append({
+                "ja4": ja4,
+                "classification": classification,
+                "spoofing_score": spoof_score,
+                "total_detections": int(row[1] or 0),
+                "unique_ips": int(row[2] or 0),
+                "indicators": {
+                    "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
+                    "sni_mismatch_pct": sni_mismatch_pct,
+                    "alpn_mismatch_pct": alpn_mismatch_pct,
+                    "avg_browser_score": round(avg_browser_score, 1),
+                    "rare_ja4_pct": rare_ja4_pct,
+                    "ua_rotating_pct": ua_rotating_pct,
+                    "alpn_missing_count": alpn_missing_count,
+                    "avg_asn_concentration": round(float(row[18] or 0), 3),
+                    "avg_country_concentration": round(float(row[19] or 0), 3),
+                },
+                "top_user_agents": [
+                    {"ua": u, "type": _classify_ua(u)} for u in top_uas
+                ],
+                "threat_breakdown": {
+                    "critical": int(row[16] or 0),
+                    "high": int(row[17] or 0),
+                    "last_level": str(row[20] or "LOW"),
+                },
+            })
+
+        # Trier: spoofed_browser d'abord, puis par score
+        items.sort(key=lambda x: (-x["spoofing_score"], -x["total_detections"]))
+
+        return {
+            "items": items,
+            "total": len(items),
+            "period_hours": hours,
+            "summary": {
+                "spoofed_browser": sum(1 for i in items if i["classification"] == "spoofed_browser"),
+                "known_bot": sum(1 for i in items if i["classification"] == "known_bot"),
+                "suspicious": sum(1 for i in items if i["classification"] == "suspicious"),
+                "legitimate_browser": sum(1 for i in items if i["classification"] == "legitimate_browser"),
+            },
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
+
+
+# =============================================================================
+# ENDPOINT 2 — Matrice JA4 × User-Agent
+# =============================================================================
+
+@router.get("/ja4-ua-matrix")
+async def get_ja4_ua_matrix(
+    hours: int = Query(24, ge=1, le=168),
+    min_ips: int = Query(3, ge=1, description="Nombre minimum d'IPs pour inclure un JA4"),
+    limit: int = Query(30, ge=1, le=100),
+):
+    """
+    Matrice JA4 × User-Agent.
+
+    Pour chaque JA4:
+      - Top User-Agents associés (depuis view_dashboard_entities)
+      - Taux de ua_ch_mismatch
+      - Classification UA (bot / browser / script)
+      - Indicateur de spoofing si browser_score élevé + UA non-navigateur
+    """
+    try:
+        # Stats JA4 depuis ml_detected_anomalies
+        stats_query = """
+        SELECT
+            ja4,
+            uniq(src_ip) AS unique_ips,
+            count() AS total_detections,
+            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
+            avg(modern_browser_score) AS avg_browser_score,
+            countIf(is_rare_ja4 = true) AS rare_count,
+            countIf(is_ua_rotating = true) AS rotating_count,
+            argMax(threat_level, detected_at) AS last_threat
+        FROM ml_detected_anomalies
+        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
+          AND ja4 != '' AND ja4 IS NOT NULL
+        GROUP BY ja4
+        HAVING unique_ips >= %(min_ips)s
+        ORDER BY ua_ch_mismatch_pct DESC, unique_ips DESC
+        LIMIT %(limit)s
+        """
+
+        stats_res = db.query(stats_query, {"hours": hours, "min_ips": min_ips, "limit": limit})
+        ja4_list = [str(r[0]) for r in stats_res.result_rows]
+
+        if not ja4_list:
+            return {"items": [], "total": 0, "period_hours": hours}
+
+        # UAs par JA4 depuis view_dashboard_user_agents
+        ja4_sql = ", ".join(f"'{j}'" for j in ja4_list)
+        ua_query = f"""
+        SELECT
+            ja4,
+            ua,
+            sum(requests) AS cnt
+        FROM view_dashboard_user_agents
+        ARRAY JOIN user_agents AS ua
+        WHERE ja4 IN ({ja4_sql})
+          AND hour >= now() - INTERVAL {hours} HOUR
+          AND ua != ''
+        GROUP BY ja4, ua
+        ORDER BY ja4, cnt DESC
+        """
+
+        ua_by_ja4: dict = {}
+        try:
+            ua_res = db.query(ua_query)
+            for row in ua_res.result_rows:
+                j4 = str(row[0])
+                if j4 not in ua_by_ja4:
+                    ua_by_ja4[j4] = []
+                if len(ua_by_ja4[j4]) < 8:
+                    ua_by_ja4[j4].append({"ua": str(row[1]), "count": int(row[2] or 0)})
+        except Exception:
+            pass
+
+        items = []
+        for row in stats_res.result_rows:
+            ja4 = str(row[0])
+            unique_ips = int(row[1] or 0)
+            ua_ch_mismatch_pct = float(row[3] or 0)
+            avg_browser_score = float(row[4] or 0)
+
+            top_uas = ua_by_ja4.get(ja4, [])
+            ua_total = sum(u["count"] for u in top_uas) or 1
+
+            classified_uas = []
+            for u in top_uas:
+                ua_type = _classify_ua(u["ua"])
+                classified_uas.append({
+                    "ua": u["ua"],
+                    "count": u["count"],
+                    "pct": round(u["count"] * 100 / ua_total, 1),
+                    "type": ua_type,
+                })
+
+            bot_pct = sum(u["pct"] for u in classified_uas if u["type"] == "bot")
+            browser_pct = sum(u["pct"] for u in classified_uas if u["type"] == "browser")
+
+            # Spoofing flag: JA4 ressemble à un navigateur (browser_score élevé)
+            # mais les UAs sont des bots/scripts
+            is_spoofing = avg_browser_score > 50 and bot_pct > 30 and ua_ch_mismatch_pct > 20
+
+            items.append({
+                "ja4": ja4,
+                "unique_ips": unique_ips,
+                "total_detections": int(row[2] or 0),
+                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
+                "avg_browser_score": round(avg_browser_score, 1),
+                "rare_count": int(row[5] or 0),
+                "rotating_count": int(row[6] or 0),
+                "last_threat": str(row[7] or "LOW"),
+                "user_agents": classified_uas,
+                "ua_summary": {
+                    "bot_pct": round(bot_pct, 1),
+                    "browser_pct": round(browser_pct, 1),
+                    "script_pct": round(100 - bot_pct - browser_pct, 1),
+                    "total_distinct": len(top_uas),
+                },
+                "is_spoofing_suspect": is_spoofing,
+            })
+
+        return {
+            "items": items,
+            "total": len(items),
+            "period_hours": hours,
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
+
+
+# =============================================================================
+# ENDPOINT 3 — Analyse globale des User-Agents
+# =============================================================================
+
+@router.get("/ua-analysis")
+async def get_ua_analysis(
+    hours: int = Query(24, ge=1, le=168),
+    limit: int = Query(50, ge=1, le=200),
+):
+    """
+    Analyse globale des User-Agents dans les détections.
+
+    Identifie:
+      - UAs de type bot/script
+      - UAs browser légitimes vs UAs browser utilisés par des bots (via ua_ch_mismatch)
+      - UAs rares/suspects qui tournent (is_ua_rotating)
+      - Distribution JA4 par UA pour détecter les UAs multi-fingerprints (rotation)
+    """
+    try:
+        # Top UAs globaux depuis view_dashboard_user_agents
+        ua_global_query = """
+        SELECT
+            ua,
+            sum(requests) AS ip_count
+        FROM view_dashboard_user_agents
+        ARRAY JOIN user_agents AS ua
+        WHERE hour >= now() - INTERVAL %(hours)s HOUR
+          AND ua != ''
+        GROUP BY ua
+        ORDER BY ip_count DESC
+        LIMIT %(limit)s
+        """
+
+        ua_global_res = db.query(ua_global_query, {"hours": hours, "limit": limit})
+        top_uas = [str(r[0]) for r in ua_global_res.result_rows]
+
+        # Pour chaque UA, chercher ses JA4 via view_dashboard_user_agents
+        ua_sql = ", ".join(f"'{u.replace(chr(39), chr(39)*2)}'" for u in top_uas[:50]) if top_uas else "''"
+        ja4_per_ua_query = f"""
+        SELECT
+            ua,
+            uniq(ja4) AS unique_ja4s,
+            groupUniqArray(3)(ja4) AS sample_ja4s
+        FROM view_dashboard_user_agents
+        ARRAY JOIN user_agents AS ua
+        WHERE ua IN ({ua_sql})
+          AND hour >= now() - INTERVAL {hours} HOUR
+          AND ua != ''
+          AND ja4 != ''
+        GROUP BY ua
+        """
+        ja4_by_ua: dict = {}
+        try:
+            ja4_res = db.query(ja4_per_ua_query)
+            for r in ja4_res.result_rows:
+                ja4_by_ua[str(r[0])] = {
+                    "unique_ja4s": int(r[1] or 0),
+                    "sample_ja4s": list(r[2] or []),
+                }
+        except Exception:
+            pass
+
+        # IPs avec is_ua_rotating depuis ml_detected_anomalies
+        rotating_query = """
+        SELECT
+            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
+            avg(ua_ch_mismatch) AS avg_ua_ch_mismatch
+        FROM ml_detected_anomalies
+        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
+          AND is_ua_rotating = true
+        GROUP BY clean_ip
+        ORDER BY avg_ua_ch_mismatch DESC
+        """
+        rotating_ips: list = []
+        try:
+            rot_res = db.query(rotating_query, {"hours": hours})
+            rotating_ips = [str(r[0]) for r in rot_res.result_rows]
+        except Exception:
+            pass
+
+        # Construire la réponse
+        items = []
+        for row in ua_global_res.result_rows:
+            ua = str(row[0])
+            ip_count = int(row[1] or 0)
+            ua_type = _classify_ua(ua)
+            ja4_info = ja4_by_ua.get(ua, {"unique_ja4s": 0, "sample_ja4s": []})
+
+            # UA multi-JA4 est suspect: un vrai navigateur a généralement 1-2 JA4
+            multi_ja4_flag = ja4_info["unique_ja4s"] > 3
+
+            items.append({
+                "user_agent": ua,
+                "type": ua_type,
+                "ip_count": ip_count,
+                "unique_ja4_count": ja4_info["unique_ja4s"],
+                "sample_ja4s": ja4_info["sample_ja4s"],
+                "is_multi_ja4_suspect": multi_ja4_flag,
+                "risk_flags": _build_ua_risk_flags(ua, ua_type, ja4_info["unique_ja4s"], ip_count),
+            })
+
+        # IPs avec rotation d'UA
+        ua_rotating_stats = {
+            "rotating_ip_count": len(rotating_ips),
+            "sample_rotating_ips": rotating_ips[:10],
+        }
+
+        return {
+            "items": items,
+            "total": len(items),
+            "period_hours": hours,
+            "ua_rotating_stats": ua_rotating_stats,
+            "summary": {
+                "bot_count": sum(1 for i in items if i["type"] == "bot"),
+                "browser_count": sum(1 for i in items if i["type"] == "browser"),
+                "script_count": sum(1 for i in items if i["type"] == "script"),
+                "multi_ja4_suspect_count": sum(1 for i in items if i["is_multi_ja4_suspect"]),
+            },
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
+
+
+def _build_ua_risk_flags(ua: str, ua_type: str, unique_ja4s: int, ip_count: int) -> list:
+    flags = []
+    if ua_type == "bot":
+        flags.append("ua_bot_signature")
+    elif ua_type == "script":
+        flags.append("ua_script_library")
+    if unique_ja4s > 5:
+        flags.append("ja4_rotation_suspect")
+    if unique_ja4s > 3 and ua_type == "browser":
+        flags.append("browser_ua_multi_fingerprint")
+    if ip_count > 100:
+        flags.append("high_volume")
+    return flags
+
+
+# =============================================================================
+# ENDPOINT 4 — JA4 d'un IP spécifique: analyse de cohérence UA/JA4
+# =============================================================================
+
+@router.get("/ip/{ip}/coherence")
+async def get_ip_fingerprint_coherence(ip: str):
+    """
+    Analyse la cohérence JA4/UA pour une IP spécifique.
+
+    Répond à la question: "Cette IP spoofait-elle son fingerprint?"
+
+    Calcule un score de cohérence basé sur:
+      - Correspondance entre JA4 (TLS client fingerprint) et User-Agent
+      - ua_ch_mismatch (User-Agent vs Client Hints)
+      - modern_browser_score vs type d'UA réel
+      - Nombre de JA4 distincts utilisés (rotation)
+      - sni_host_mismatch, alpn_http_mismatch
+    """
+    try:
+        # Données depuis ml_detected_anomalies
+        ml_query = """
+        SELECT
+            ja4,
+            ua_ch_mismatch,
+            modern_browser_score,
+            sni_host_mismatch,
+            alpn_http_mismatch,
+            is_alpn_missing,
+            is_rare_ja4,
+            is_ua_rotating,
+            distinct_ja4_count,
+            header_count,
+            has_accept_language,
+            has_cookie,
+            has_referer,
+            header_order_shared_count,
+            detected_at,
+            threat_level,
+            window_mss_ratio,
+            tcp_jitter_variance,
+            multiplexing_efficiency
+        FROM ml_detected_anomalies
+        WHERE src_ip = %(ip)s
+        ORDER BY detected_at DESC
+        """
+        ml_res = db.query(ml_query, {"ip": ip})
+
+        if not ml_res.result_rows:
+            raise HTTPException(status_code=404, detail="IP non trouvée dans les détections")
+
+        # User-agents réels depuis view_dashboard_user_agents
+        ua_query = """
+        SELECT ua, sum(requests) AS cnt
+        FROM view_dashboard_user_agents
+        ARRAY JOIN user_agents AS ua
+        WHERE toString(src_ip) = %(ip)s
+          AND hour >= now() - INTERVAL 72 HOUR
+          AND ua != ''
+        GROUP BY ua ORDER BY cnt DESC
+        """
+        ua_res = db.query(ua_query, {"ip": ip})
+        top_uas = [{"ua": str(r[0]), "count": int(r[1] or 0), "type": _classify_ua(str(r[0]))}
+                   for r in ua_res.result_rows]
+
+        # Agréger les indicateurs de la dernière session
+        rows = ml_res.result_rows
+        latest = rows[0]
+        total_rows = len(rows)
+
+        ua_ch_mismatch_count = sum(1 for r in rows if r[1])
+        sni_mismatch_count = sum(1 for r in rows if r[3])
+        alpn_mismatch_count = sum(1 for r in rows if r[4])
+        is_rare_count = sum(1 for r in rows if r[6])
+        is_rotating = any(r[7] for r in rows)
+        distinct_ja4s = {str(r[0]) for r in rows if r[0]}
+        avg_browser_score = sum(int(r[2] or 0) for r in rows) / total_rows
+
+        # UA analysis
+        has_browser_ua = any(u["type"] == "browser" for u in top_uas)
+        has_bot_ua = any(u["type"] == "bot" for u in top_uas)
+        primary_ua_type = top_uas[0]["type"] if top_uas else "empty"
+
+        # Calcul du score de spoofing
+        spoof_score = min(100, round(
+            (ua_ch_mismatch_count / total_rows * 100) * 0.40
+            + (avg_browser_score * 0.20 if has_bot_ua else 0)
+            + (sni_mismatch_count / total_rows * 100) * 0.10
+            + (alpn_mismatch_count / total_rows * 100) * 0.05
+            + (len(distinct_ja4s) * 5 if len(distinct_ja4s) > 2 else 0)
+            + (15 if is_rotating else 0)
+            + (10 if is_rare_count > total_rows * 0.5 else 0)
+        ))
+
+        # Verdict
+        if spoof_score >= 70:
+            verdict = "high_confidence_spoofing"
+        elif spoof_score >= 40:
+            verdict = "suspicious_spoofing"
+        elif has_bot_ua and avg_browser_score < 20:
+            verdict = "known_bot_no_spoofing"
+        elif has_browser_ua and spoof_score < 20:
+            verdict = "legitimate_browser"
+        else:
+            verdict = "inconclusive"
+
+        # Explication humaine
+        explanation = []
+        if ua_ch_mismatch_count > total_rows * 0.3:
+            explanation.append(f"UA-Client-Hints mismatch sur {round(ua_ch_mismatch_count*100/total_rows)}% des requêtes")
+        if has_bot_ua and avg_browser_score > 40:
+            explanation.append(f"JA4 ressemble à un navigateur (score {round(avg_browser_score)}/100) mais UA est de type bot")
+        if len(distinct_ja4s) > 2:
+            explanation.append(f"{len(distinct_ja4s)} JA4 distincts utilisés → rotation de fingerprint")
+        if is_rotating:
+            explanation.append("is_ua_rotating détecté → rotation d'User-Agent confirmée")
+        if sni_mismatch_count > 0:
+            explanation.append(f"SNI ≠ Host header sur {sni_mismatch_count}/{total_rows} requêtes")
+        if not explanation:
+            explanation.append("Aucun indicateur de spoofing majeur détecté")
+
+        return {
+            "ip": ip,
+            "verdict": verdict,
+            "spoofing_score": spoof_score,
+            "explanation": explanation,
+            "indicators": {
+                "ua_ch_mismatch_rate": round(ua_ch_mismatch_count / total_rows * 100, 1),
+                "sni_mismatch_rate": round(sni_mismatch_count / total_rows * 100, 1),
+                "alpn_mismatch_rate": round(alpn_mismatch_count / total_rows * 100, 1),
+                "avg_browser_score": round(avg_browser_score, 1),
+                "distinct_ja4_count": len(distinct_ja4s),
+                "is_ua_rotating": is_rotating,
+                "rare_ja4_rate": round(is_rare_count / total_rows * 100, 1),
+            },
+            "fingerprints": {
+                "ja4_list": list(distinct_ja4s),
+                "latest_ja4": str(latest[0] or ""),
+            },
+            "user_agents": top_uas,
+            "latest_detection": {
+                "detected_at": latest[14].isoformat() if latest[14] else "",
+                "threat_level": str(latest[15] or "LOW"),
+                "modern_browser_score": int(latest[2] or 0),
+                "header_count": int(latest[9] or 0),
+                "has_accept_language": bool(latest[10]),
+                "has_cookie": bool(latest[11]),
+                "has_referer": bool(latest[12]),
+                "header_order_shared_count": int(latest[13] or 0),
+            },
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
+
+
+# =============================================================================
+# ENDPOINT 5 — JA4 légitimes (baseline / whitelist)
+# =============================================================================
+
+@router.get("/legitimate-ja4")
+async def get_legitimate_ja4(
+    hours: int = Query(168, ge=24, le=720, description="Fenêtre pour établir la baseline"),
+    min_ips: int = Query(50, ge=5, description="Nombre minimum d'IPs pour qualifier un JA4 de légitime"),
+):
+    """
+    Établit une baseline des JA4 fingerprints légitimes.
+
+    Un JA4 est considéré légitime si:
+      - Il est utilisé par un grand nombre d'IPs distinctes (> min_ips)
+      - Son taux de ua_ch_mismatch est faible (< 5%)
+      - Son modern_browser_score est élevé (> 60)
+      - Il n'est PAS is_rare_ja4
+      - Ses UAs sont dominés par des navigateurs connus
+
+    Utile comme whitelist pour réduire les faux positifs.
+    """
+    try:
+        query = """
+        SELECT
+            ja4,
+            uniq(src_ip) AS unique_ips,
+            count() AS total_detections,
+            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
+            avg(modern_browser_score) AS avg_browser_score,
+            countIf(is_rare_ja4 = true) AS rare_count,
+            round(countIf(threat_level = 'CRITICAL') * 100.0 / count(), 2) AS critical_pct,
+            round(countIf(threat_level = 'HIGH') * 100.0 / count(), 2) AS high_pct
+        FROM ml_detected_anomalies
+        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
+          AND ja4 != '' AND ja4 IS NOT NULL
+        GROUP BY ja4
+        HAVING unique_ips >= %(min_ips)s
+          AND ua_ch_mismatch_pct < 5.0
+          AND avg_browser_score > 60
+          AND rare_count = 0
+        ORDER BY unique_ips DESC
+        """
+
+        result = db.query(query, {"hours": hours, "min_ips": min_ips})
+
+        items = [
+            {
+                "ja4": str(row[0]),
+                "unique_ips": int(row[1] or 0),
+                "total_detections": int(row[2] or 0),
+                "ua_ch_mismatch_pct": float(row[3] or 0),
+                "avg_browser_score": round(float(row[4] or 0), 1),
+                "critical_pct": float(row[6] or 0),
+                "high_pct": float(row[7] or 0),
+                "legitimacy_confidence": min(100, round(
+                    (1 - float(row[3] or 0) / 100) * 40
+                    + float(row[4] or 0) * 0.40
+                    + min(int(row[1] or 0) / min_ips, 1) * 20
+                )),
+            }
+            for row in result.result_rows
+        ]
+
+        return {
+            "items": items,
+            "total": len(items),
+            "period_hours": hours,
+            "note": "Ces JA4 sont candidats à une whitelist. Vérifier manuellement avant de whitelister.",
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
+
+
+# =============================================================================
+# ENDPOINT — Corrélation JA4 × ASN / Pays (C5)
+# Détecte les JA4 fortement concentrés sur un seul ASN ou pays
+# → signal de botnet ciblé ou d'infrastructure de test/attaque partagée
+# =============================================================================
+
+@router.get("/asn-correlation")
+async def get_ja4_asn_correlation(
+    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
+    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
+    limit: int = Query(50, ge=1, le=200),
+):
+    """
+    Identifie les JA4 fingerprints fortement concentrés sur un seul ASN ou pays.
+    Un JA4 avec asn_concentration ≥ 0.7 signifie que ≥70% des IPs utilisant ce fingerprint
+    proviennent du même ASN → infrastructure de bot partagée ou datacenter suspect.
+    """
+    try:
+        # Two-pass: first aggregate per (ja4, asn) to get IP counts per ASN,
+        # then aggregate per ja4 to compute concentration ratio
+        sql = """
+        SELECT
+            ja4,
+            sum(ips_per_combo)                                          AS unique_ips,
+            uniq(src_asn)                                               AS unique_asns,
+            uniq(src_country_code)                                      AS unique_countries,
+            toString(argMax(src_asn, ips_per_combo))                    AS top_asn_number,
+            argMax(asn_name, ips_per_combo)                             AS top_asn_name,
+            argMax(src_country_code, country_ips)                       AS dominant_country,
+            sum(total_hits)                                             AS total_hits,
+            round(max(ips_per_combo) / greatest(sum(ips_per_combo), 1), 3) AS asn_concentration,
+            round(max(country_ips)   / greatest(sum(ips_per_combo), 1), 3) AS country_concentration
+        FROM (
+            SELECT
+                ja4,
+                src_asn,
+                src_country_code,
+                any(src_as_name) AS asn_name,
+                uniq(src_ip)     AS ips_per_combo,
+                uniq(src_ip)     AS country_ips,
+                sum(hits)        AS total_hits
+            FROM mabase_prod.agg_host_ip_ja4_1h
+            WHERE window_start >= now() - INTERVAL 24 HOUR
+              AND ja4 != ''
+            GROUP BY ja4, src_asn, src_country_code
+        )
+        GROUP BY ja4
+        HAVING unique_ips >= %(min_ips)s
+           AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
+        ORDER BY asn_concentration DESC, unique_ips DESC
+        LIMIT %(limit)s
+        """
+        result = db.query(sql, {"min_ips": min_ips, "min_conc": min_concentration, "limit": limit})
+        items = []
+        for row in result.result_rows:
+            ja4                   = str(row[0])
+            unique_ips            = int(row[1])
+            unique_asns           = int(row[2])
+            unique_countries      = int(row[3])
+            top_asn_number        = str(row[4] or "")
+            top_asn_name          = str(row[5] or "")
+            dominant_country      = str(row[6] or "")
+            total_hits            = int(row[7] or 0)
+            asn_concentration     = float(row[8] or 0)
+            country_concentration = float(row[9] or 0)
+
+            if asn_concentration >= 0.85:
+                corr_type, risk = "asn_monopoly", "high"
+            elif asn_concentration >= min_concentration:
+                corr_type, risk = "asn_dominant", "medium"
+            elif country_concentration >= min_concentration:
+                corr_type, risk = "geo_targeted", "medium"
+            else:
+                corr_type, risk = "distributed", "low"
+
+            items.append({
+                "ja4":                  ja4,
+                "unique_ips":           unique_ips,
+                "unique_asns":          unique_asns,
+                "unique_countries":     unique_countries,
+                "top_asn_name":         top_asn_name,
+                "top_asn_number":       top_asn_number,
+                "dominant_country":     dominant_country,
+                "total_hits":           total_hits,
+                "asn_concentration":    asn_concentration,
+                "country_concentration":country_concentration,
+                "correlation_type":     corr_type,
+                "risk":                 risk,
+            })
+        return {"items": items, "total": len(items)}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")