"""
Endpoints pour l'analyse des fingerprints JA4 et User-Agents

Objectifs:
  - Détecter le spoofing JA4 (fingerprint TLS qui prétend être un navigateur mais
    dont les User-Agents, les headers HTTP ou les métriques comportementales trahissent
    une origine bot/script)
  - Construire une matrice JA4 × User-Agent pour visualiser les associations suspectes
  - Analyser la distribution des User-Agents pour identifier les rotateurs et les bots
    qui usurpent des UA de navigateurs légitimes
"""
import logging
import re

from fastapi import APIRouter, HTTPException, Query

from ..database import db

# Router collecting every endpoint of this module under the /api/fingerprints prefix.
router = APIRouter(prefix="/api/fingerprints", tags=["fingerprints"])


# ─── Helpers ──────────────────────────────────────────────────────────────────

# Substrings that clearly identify a bot/script which does not try to imitate
# a browser. Compiled case-insensitively; meant to be used with `.search()`,
# so a hit anywhere in the User-Agent counts.
_BOT_PATTERNS = re.compile(
    r"bot|crawler|spider|scraper|python|curl|wget|go-http|java/|axios|"
    r"libwww|httpclient|okhttp|requests|aiohttp|httpx|playwright|puppeteer|"
    r"selenium|headless|phantomjs",
    re.IGNORECASE,
)

# Common legitimate browsers — a "browser"-type JA4 is expected to come with
# User-Agents containing one of these tokens (case-insensitive).
_BROWSER_PATTERNS = re.compile(
    r"mozilla|chrome|safari|firefox|edge|opera|trident",
    re.IGNORECASE,
)


def _classify_ua(ua: str) -> str:
    """Classify a User-Agent string.

    Returns:
        'empty'   when ``ua`` is falsy (None or ""),
        'bot'     when a bot/script signature matches (checked first, so a
                  "Mozilla ... Googlebot" UA is a bot, not a browser),
        'browser' when only a browser token matches,
        'script'  for anything else.
    """
    if ua:
        if _BOT_PATTERNS.search(ua) is not None:
            return "bot"
        return "browser" if _BROWSER_PATTERNS.search(ua) is not None else "script"
    return "empty"


# =============================================================================
# ENDPOINT 1 — Détection de spoofing JA4
# =============================================================================

@router.get("/spoofing")
async def get_ja4_spoofing(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
    min_detections: int = Query(10, ge=1, description="Nombre minimum de détections"),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Rank JA4 fingerprints by how likely they are to be spoofing a browser.

    A JA4 is considered suspicious when:
      - it shows a high ua_ch_mismatch rate (User-Agent header != Client Hints)
      - its modern_browser_score is high while its User-Agents are bots/scripts
      - it shows a high sni_host_mismatch or alpn_http_mismatch rate
      - is_rare_ja4 is set on a large share of its detections

    Args:
        hours: look-back window, in hours.
        min_detections: minimum number of detections for a JA4 to be listed.
        limit: maximum number of JA4s returned.

    Returns:
        A dict with ``items`` (one entry per JA4, carrying a spoofing score in
        [0, 100], a classification, the indicators and the top User-Agents),
        ``total``, ``period_hours`` and a per-classification ``summary``.

    Raises:
        HTTPException: 500 when the main aggregation query (or the response
            construction) fails. The User-Agent lookup is best-effort: its
            failure is logged and the JA4s are returned without UA data.
    """
    try:
        # Per-JA4 aggregation of every spoofing indicator.
        query = """
        SELECT
            ja4,
            count() AS total_detections,
            uniq(src_ip) AS unique_ips,

            -- Indicateurs de mismatch
            countIf(ua_ch_mismatch = true) AS ua_ch_mismatch_count,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            countIf(sni_host_mismatch = true) AS sni_mismatch_count,
            round(countIf(sni_host_mismatch = true) * 100.0 / count(), 2) AS sni_mismatch_pct,
            countIf(alpn_http_mismatch = true) AS alpn_mismatch_count,
            round(countIf(alpn_http_mismatch = true) * 100.0 / count(), 2) AS alpn_mismatch_pct,

            -- Indicateurs comportementaux
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_ja4_count,
            round(countIf(is_rare_ja4 = true) * 100.0 / count(), 2) AS rare_ja4_pct,
            countIf(is_ua_rotating = true) AS ua_rotating_count,
            round(countIf(is_ua_rotating = true) * 100.0 / count(), 2) AS ua_rotating_pct,

            -- Métriques TLS/TCP
            countIf(is_alpn_missing = true) AS alpn_missing_count,
            avg(distinct_ja4_count) AS avg_distinct_ja4_per_ip,

            -- Répartition threat levels
            countIf(threat_level = 'CRITICAL') AS critical_count,
            countIf(threat_level = 'HIGH') AS high_count,

            -- Botnet indicators
            avg(ja4_asn_concentration) AS avg_asn_concentration,
            avg(ja4_country_concentration) AS avg_country_concentration,

            argMax(threat_level, detected_at) AS last_threat_level
        FROM ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING total_detections >= %(min_detections)s
        ORDER BY ua_ch_mismatch_pct DESC, total_detections DESC
        LIMIT %(limit)s
        """

        result = db.query(query, {
            "hours": hours,
            "min_detections": min_detections,
            "limit": limit,
        })

        # Top 5 User-Agents per JA4, from view_dashboard_user_agents.
        # Every returned JA4 is looked up (at most `limit` <= 200 short
        # strings): a JA4 without UA data can never be classified
        # "known_bot" or "legitimate_browser".
        ja4_list = [str(r[0]) for r in result.result_rows if r[0]]
        ua_by_ja4: dict = {}
        if ja4_list:
            # The JA4 list is bound as a parameter instead of being
            # interpolated into the SQL text, so quoting is left to the driver.
            # NOTE(review): assumes db.query() renders a tuple parameter as a
            # parenthesised literal list, as clickhouse-connect does — confirm.
            ua_q = """
            SELECT ja4, groupArray(5)(ua) AS top_uas
            FROM (
                SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
                FROM view_dashboard_user_agents
                WHERE ja4 IN %(ja4_list)s
                  AND hour >= now() - INTERVAL %(hours)s HOUR
                  AND ua != ''
                GROUP BY ja4, ua
                ORDER BY ja4, cnt DESC
            )
            GROUP BY ja4
            """
            try:
                ua_res = db.query(ua_q, {"ja4_list": tuple(ja4_list), "hours": hours})
                for ua_row in ua_res.result_rows:
                    if ua_row[1]:
                        ua_by_ja4[str(ua_row[0])] = list(ua_row[1])
            except Exception:
                # Best-effort enrichment: keep serving the JA4 statistics, but
                # leave a trace instead of failing silently.
                logging.getLogger(__name__).warning(
                    "JA4 spoofing: User-Agent lookup failed, continuing without UA data",
                    exc_info=True,
                )

        items = []
        for row in result.result_rows:
            ja4 = str(row[0])
            total_detections = int(row[1] or 0)
            ua_ch_mismatch_pct = float(row[4] or 0)
            sni_mismatch_pct = float(row[6] or 0)
            alpn_mismatch_pct = float(row[8] or 0)
            avg_browser_score = float(row[9] or 0)
            rare_ja4_pct = float(row[11] or 0)
            ua_rotating_pct = float(row[13] or 0)
            alpn_missing_count = int(row[14] or 0)

            # Classify each UA once; reused for the flags and for the response.
            classified_uas = [
                {"ua": u, "type": _classify_ua(u)} for u in ua_by_ja4.get(ja4, [])
            ]
            has_bot_ua = any(c["type"] == "bot" for c in classified_uas)
            has_browser_ua = any(c["type"] == "browser" for c in classified_uas)

            # Spoofing confidence score, capped at 100:
            #   - UA/CH mismatch rate is the strongest signal (weight 40)
            #   - browser-like score while a bot UA is present (weight 25)
            #   - SNI / ALPN mismatch rates (weights 10 + 5)
            #   - rare-JA4 rate (weight 10)
            #   - UA-rotation rate (weight 10)
            #   - flat +10 when ALPN is missing on more than 30% of detections
            spoof_score = min(100, round(
                ua_ch_mismatch_pct * 0.40
                + (avg_browser_score * 25 / 100 if has_bot_ua else 0)
                + sni_mismatch_pct * 0.10
                + alpn_mismatch_pct * 0.05
                + rare_ja4_pct * 0.10
                + ua_rotating_pct * 0.10
                + (10 if alpn_missing_count > (total_detections or 1) * 0.3 else 0)
            ))

            # JA4 classification (first matching rule wins).
            if spoof_score >= 60:
                classification = "spoofed_browser"
            elif has_bot_ua and avg_browser_score < 30:
                classification = "known_bot"
            elif has_browser_ua and ua_ch_mismatch_pct < 10:
                classification = "legitimate_browser"
            else:
                classification = "suspicious"

            items.append({
                "ja4": ja4,
                "classification": classification,
                "spoofing_score": spoof_score,
                "total_detections": total_detections,
                "unique_ips": int(row[2] or 0),
                "indicators": {
                    "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                    "sni_mismatch_pct": sni_mismatch_pct,
                    "alpn_mismatch_pct": alpn_mismatch_pct,
                    "avg_browser_score": round(avg_browser_score, 1),
                    "rare_ja4_pct": rare_ja4_pct,
                    "ua_rotating_pct": ua_rotating_pct,
                    "alpn_missing_count": alpn_missing_count,
                    "avg_asn_concentration": round(float(row[18] or 0), 3),
                    "avg_country_concentration": round(float(row[19] or 0), 3),
                },
                "top_user_agents": classified_uas,
                "threat_breakdown": {
                    "critical": int(row[16] or 0),
                    "high": int(row[17] or 0),
                    "last_level": str(row[20] or "LOW"),
                },
            })

        # Highest score first (so "spoofed_browser" entries, score >= 60, lead),
        # ties broken by detection volume.
        items.sort(key=lambda x: (-x["spoofing_score"], -x["total_detections"]))

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "summary": {
                "spoofed_browser": sum(1 for i in items if i["classification"] == "spoofed_browser"),
                "known_bot": sum(1 for i in items if i["classification"] == "known_bot"),
                "suspicious": sum(1 for i in items if i["classification"] == "suspicious"),
                "legitimate_browser": sum(1 for i in items if i["classification"] == "legitimate_browser"),
            },
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")


# =============================================================================
# ENDPOINT 2 — Matrice JA4 × User-Agent
# =============================================================================

@router.get("/ja4-ua-matrix")
async def get_ja4_ua_matrix(
    hours: int = Query(24, ge=1, le=168),
    min_ips: int = Query(3, ge=1, description="Nombre minimum d'IPs pour inclure un JA4"),
    limit: int = Query(30, ge=1, le=100),
):
    """
    JA4 x User-Agent matrix.

    For each JA4:
      - top associated User-Agents (from view_dashboard_user_agents)
      - ua_ch_mismatch rate
      - UA classification (bot / browser / script)
      - spoofing flag when the browser score is high but the UAs are bots

    Args:
        hours: look-back window, in hours.
        min_ips: minimum number of distinct IPs for a JA4 to be listed.
        limit: maximum number of JA4s returned.

    Returns:
        A dict with ``items``, ``total`` and ``period_hours``. Each item lists
        at most 8 User-Agents; ``pct`` values and ``ua_summary`` are computed
        over ALL User-Agents seen for the JA4, not only the listed ones.

    Raises:
        HTTPException: 500 when the JA4 statistics query fails. The
            User-Agent lookup is best-effort (failure is logged).
    """
    try:
        # JA4 statistics from ml_detected_anomalies.
        stats_query = """
        SELECT
            ja4,
            uniq(src_ip) AS unique_ips,
            count() AS total_detections,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_count,
            countIf(is_ua_rotating = true) AS rotating_count,
            argMax(threat_level, detected_at) AS last_threat
        FROM ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
        ORDER BY ua_ch_mismatch_pct DESC, unique_ips DESC
        LIMIT %(limit)s
        """

        stats_res = db.query(stats_query, {"hours": hours, "min_ips": min_ips, "limit": limit})
        ja4_list = [str(r[0]) for r in stats_res.result_rows]

        if not ja4_list:
            return {"items": [], "total": 0, "period_hours": hours}

        # User-Agents per JA4 from view_dashboard_user_agents. Values are bound
        # as parameters rather than interpolated into the SQL text.
        # NOTE(review): assumes db.query() renders a tuple parameter as a
        # parenthesised literal list, as clickhouse-connect does — confirm.
        ua_query = """
        SELECT
            ja4,
            ua,
            sum(requests) AS cnt
        FROM view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE ja4 IN %(ja4_list)s
          AND hour >= now() - INTERVAL %(hours)s HOUR
          AND ua != ''
        GROUP BY ja4, ua
        ORDER BY ja4, cnt DESC
        """

        max_listed_uas = 8
        # ja4 -> {"top": [...], "total": requests, "distinct": n, "by_type": {type: requests}}
        ua_stats: dict = {}
        try:
            ua_res = db.query(ua_query, {"ja4_list": tuple(ja4_list), "hours": hours})
            for ua_row in ua_res.result_rows:
                ua_text = str(ua_row[1])
                ua_count = int(ua_row[2] or 0)
                ua_type = _classify_ua(ua_text)
                stats = ua_stats.setdefault(
                    str(ua_row[0]),
                    {"top": [], "total": 0, "distinct": 0, "by_type": {}},
                )
                # Totals cover every UA so the shares are not inflated by
                # the truncation to the top entries.
                stats["total"] += ua_count
                stats["distinct"] += 1
                stats["by_type"][ua_type] = stats["by_type"].get(ua_type, 0) + ua_count
                # Rows arrive sorted by count within each JA4, so the first
                # ones are the most frequent.
                if len(stats["top"]) < max_listed_uas:
                    stats["top"].append({"ua": ua_text, "count": ua_count, "type": ua_type})
        except Exception:
            # Best-effort enrichment: keep serving the JA4 statistics.
            logging.getLogger(__name__).warning(
                "JA4/UA matrix: User-Agent lookup failed, continuing without UA data",
                exc_info=True,
            )

        items = []
        for row in stats_res.result_rows:
            ja4 = str(row[0])
            unique_ips = int(row[1] or 0)
            ua_ch_mismatch_pct = float(row[3] or 0)
            avg_browser_score = float(row[4] or 0)

            stats = ua_stats.get(ja4, {"top": [], "total": 0, "distinct": 0, "by_type": {}})
            ua_total = stats["total"] or 1  # avoid dividing by zero

            classified_uas = [
                {
                    "ua": u["ua"],
                    "count": u["count"],
                    "pct": round(u["count"] * 100 / ua_total, 1),
                    "type": u["type"],
                }
                for u in stats["top"]
            ]

            # Each share is computed from its own counter: with no UA data all
            # three are 0 (the former "100 - bot - browser" reported 100%
            # scripts in that case and could go negative through rounding).
            bot_pct = stats["by_type"].get("bot", 0) * 100 / ua_total
            browser_pct = stats["by_type"].get("browser", 0) * 100 / ua_total
            script_pct = stats["by_type"].get("script", 0) * 100 / ua_total

            # Spoofing flag: the JA4 looks like a browser (high browser score)
            # but the User-Agents are bots and UA/Client-Hints disagree.
            is_spoofing = avg_browser_score > 50 and bot_pct > 30 and ua_ch_mismatch_pct > 20

            items.append({
                "ja4": ja4,
                "unique_ips": unique_ips,
                "total_detections": int(row[2] or 0),
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "avg_browser_score": round(avg_browser_score, 1),
                "rare_count": int(row[5] or 0),
                "rotating_count": int(row[6] or 0),
                "last_threat": str(row[7] or "LOW"),
                "user_agents": classified_uas,
                "ua_summary": {
                    "bot_pct": round(bot_pct, 1),
                    "browser_pct": round(browser_pct, 1),
                    "script_pct": round(script_pct, 1),
                    "total_distinct": stats["distinct"],
                },
                "is_spoofing_suspect": is_spoofing,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")


# =============================================================================
# ENDPOINT 3 — Analyse globale des User-Agents
# =============================================================================

@router.get("/ua-analysis")
async def get_ua_analysis(
    hours: int = Query(24, ge=1, le=168),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Global analysis of the User-Agents seen in the detections.

    Identifies:
      - bot/script User-Agents
      - browser User-Agents used with many different JA4s (fingerprint rotation)
      - IPs flagged with is_ua_rotating

    Args:
        hours: look-back window, in hours.
        limit: maximum number of User-Agents returned.

    Returns:
        A dict with ``items`` (one entry per User-Agent), ``total``,
        ``period_hours``, ``ua_rotating_stats`` and a ``summary``.
        Note: ``ip_count`` holds ``sum(requests)`` from
        view_dashboard_user_agents; the key name is kept for compatibility.

    Raises:
        HTTPException: 500 when the top-UA query fails. The JA4-per-UA and
            rotating-IP lookups are best-effort (failures are logged).
    """
    log = logging.getLogger(__name__)
    try:
        # Global top User-Agents from view_dashboard_user_agents.
        ua_global_query = """
        SELECT
            ua,
            sum(requests) AS ip_count
        FROM view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE hour >= now() - INTERVAL %(hours)s HOUR
          AND ua != ''
        GROUP BY ua
        ORDER BY ip_count DESC
        LIMIT %(limit)s
        """

        ua_global_res = db.query(ua_global_query, {"hours": hours, "limit": limit})
        top_uas = [str(r[0]) for r in ua_global_res.result_rows]

        # JA4s seen with each User-Agent. User-Agents are attacker-controlled
        # text: they are bound as parameters so that the driver does the
        # quoting (doubling single quotes alone is bypassed by a backslash).
        # NOTE(review): assumes db.query() renders a tuple parameter as a
        # parenthesised literal list, as clickhouse-connect does — confirm.
        ja4_per_ua_query = """
        SELECT
            ua,
            uniq(ja4) AS unique_ja4s,
            groupUniqArray(3)(ja4) AS sample_ja4s
        FROM view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE ua IN %(ua_list)s
          AND hour >= now() - INTERVAL %(hours)s HOUR
          AND ua != ''
          AND ja4 != ''
        GROUP BY ua
        """
        ja4_by_ua: dict = {}
        # Every listed UA is looked up, in batches that keep each statement
        # short; nothing is queried when there is no UA at all.
        ua_batch_size = 50
        try:
            for start in range(0, len(top_uas), ua_batch_size):
                ja4_res = db.query(ja4_per_ua_query, {
                    "ua_list": tuple(top_uas[start:start + ua_batch_size]),
                    "hours": hours,
                })
                for r in ja4_res.result_rows:
                    ja4_by_ua[str(r[0])] = {
                        "unique_ja4s": int(r[1] or 0),
                        "sample_ja4s": list(r[2] or []),
                    }
        except Exception:
            log.warning(
                "UA analysis: JA4-per-UA lookup failed, continuing without JA4 data",
                exc_info=True,
            )

        # IPs flagged is_ua_rotating in ml_detected_anomalies.
        rotating_query = """
        SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
            avg(ua_ch_mismatch) AS avg_ua_ch_mismatch
        FROM ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND is_ua_rotating = true
        GROUP BY clean_ip
        ORDER BY avg_ua_ch_mismatch DESC
        """
        rotating_ips: list = []
        try:
            rot_res = db.query(rotating_query, {"hours": hours})
            rotating_ips = [str(r[0]) for r in rot_res.result_rows]
        except Exception:
            log.warning(
                "UA analysis: rotating-IP lookup failed, continuing without it",
                exc_info=True,
            )

        # Build the response.
        items = []
        for row in ua_global_res.result_rows:
            ua = str(row[0])
            ip_count = int(row[1] or 0)
            ua_type = _classify_ua(ua)
            ja4_info = ja4_by_ua.get(ua, {"unique_ja4s": 0, "sample_ja4s": []})

            # A UA seen with many JA4s is suspicious: a real browser usually
            # shows 1-2 JA4s.
            multi_ja4_flag = ja4_info["unique_ja4s"] > 3

            items.append({
                "user_agent": ua,
                "type": ua_type,
                "ip_count": ip_count,
                "unique_ja4_count": ja4_info["unique_ja4s"],
                "sample_ja4s": ja4_info["sample_ja4s"],
                "is_multi_ja4_suspect": multi_ja4_flag,
                "risk_flags": _build_ua_risk_flags(ua, ua_type, ja4_info["unique_ja4s"], ip_count),
            })

        # IPs rotating their User-Agent.
        ua_rotating_stats = {
            "rotating_ip_count": len(rotating_ips),
            "sample_rotating_ips": rotating_ips[:10],
        }

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "ua_rotating_stats": ua_rotating_stats,
            "summary": {
                "bot_count": sum(1 for i in items if i["type"] == "bot"),
                "browser_count": sum(1 for i in items if i["type"] == "browser"),
                "script_count": sum(1 for i in items if i["type"] == "script"),
                "multi_ja4_suspect_count": sum(1 for i in items if i["is_multi_ja4_suspect"]),
            },
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")


def _build_ua_risk_flags(ua: str, ua_type: str, unique_ja4s: int, ip_count: int) -> list:
    """Return the list of risk flags applying to one User-Agent.

    Args:
        ua: the User-Agent string (not inspected here; the decision relies
            on ``ua_type``).
        ua_type: classification of the UA ('bot', 'browser', 'script', ...).
        unique_ja4s: number of distinct JA4s seen with this UA.
        ip_count: volume figure compared against the high-volume threshold.

    Returns:
        Flags in a fixed order: signature flag, JA4 rotation, browser
        multi-fingerprint, high volume.
    """
    signature_flag = {
        "bot": "ua_bot_signature",
        "script": "ua_script_library",
    }.get(ua_type)

    # (condition, flag) pairs, listed in output order.
    rules = (
        (signature_flag is not None, signature_flag),
        (unique_ja4s > 5, "ja4_rotation_suspect"),
        (unique_ja4s > 3 and ua_type == "browser", "browser_ua_multi_fingerprint"),
        (ip_count > 100, "high_volume"),
    )
    return [flag for applies, flag in rules if applies]


# =============================================================================
# ENDPOINT 4 — UA/JA4 coherence analysis for a specific IP
# =============================================================================

@router.get("/ip/{ip}/coherence")
async def get_ip_fingerprint_coherence(ip: str):
    """
    Analyse the JA4/UA coherence of a single IP.

    Answers the question: "Was this IP spoofing its fingerprint?"

    The spoofing score is built from:
      - ua_ch_mismatch rate (User-Agent vs Client Hints)
      - modern_browser_score vs the actual type of the User-Agents
      - number of distinct JA4s used (rotation)
      - sni_host_mismatch and alpn_http_mismatch rates
      - is_ua_rotating and is_rare_ja4 flags

    Args:
        ip: the source IP, matched as-is against ``src_ip``.

    Returns:
        A dict with the ``verdict``, the ``spoofing_score`` in [0, 100], a
        human-readable ``explanation`` (runtime strings, in French), the
        aggregated ``indicators``, the ``fingerprints``, the ``user_agents``
        of the last 72 hours and the ``latest_detection`` details.

    Raises:
        HTTPException: 404 when the IP has no row in ml_detected_anomalies,
            500 on any other failure.
    """
    try:
        # Detection rows for this IP, most recent first (no time bound).
        ml_query = """
        SELECT
            ja4,
            ua_ch_mismatch,
            modern_browser_score,
            sni_host_mismatch,
            alpn_http_mismatch,
            is_alpn_missing,
            is_rare_ja4,
            is_ua_rotating,
            distinct_ja4_count,
            header_count,
            has_accept_language,
            has_cookie,
            has_referer,
            header_order_shared_count,
            detected_at,
            threat_level,
            window_mss_ratio,
            tcp_jitter_variance,
            multiplexing_efficiency
        FROM ml_detected_anomalies
        WHERE src_ip = %(ip)s
        ORDER BY detected_at DESC
        """
        ml_res = db.query(ml_query, {"ip": ip})

        if not ml_res.result_rows:
            raise HTTPException(status_code=404, detail="IP non trouvée dans les détections")

        # Actual User-Agents from view_dashboard_user_agents (last 72 hours).
        ua_query = """
        SELECT ua, sum(requests) AS cnt
        FROM view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE toString(src_ip) = %(ip)s
          AND hour >= now() - INTERVAL 72 HOUR
          AND ua != ''
        GROUP BY ua ORDER BY cnt DESC
        """
        ua_res = db.query(ua_query, {"ip": ip})
        top_uas = [{"ua": str(r[0]), "count": int(r[1] or 0), "type": _classify_ua(str(r[0]))}
                   for r in ua_res.result_rows]

        # Aggregate the indicators over ALL detection rows of the IP; `latest`
        # is only used for the "latest_detection" section.
        rows = ml_res.result_rows
        latest = rows[0]
        total_rows = len(rows)

        ua_ch_mismatch_count = sum(1 for r in rows if r[1])
        sni_mismatch_count = sum(1 for r in rows if r[3])
        alpn_mismatch_count = sum(1 for r in rows if r[4])
        is_rare_count = sum(1 for r in rows if r[6])
        is_rotating = any(r[7] for r in rows)
        # Sorted so the response is stable from one call to the next.
        distinct_ja4s = sorted({str(r[0]) for r in rows if r[0]})
        # float(): do not truncate fractional scores before averaging.
        avg_browser_score = sum(float(r[2] or 0) for r in rows) / total_rows

        # User-Agent analysis.
        has_browser_ua = any(u["type"] == "browser" for u in top_uas)
        has_bot_ua = any(u["type"] == "bot" for u in top_uas)

        # Spoofing score, capped at 100.
        spoof_score = min(100, round(
            (ua_ch_mismatch_count / total_rows * 100) * 0.40
            + (avg_browser_score * 0.20 if has_bot_ua else 0)
            + (sni_mismatch_count / total_rows * 100) * 0.10
            + (alpn_mismatch_count / total_rows * 100) * 0.05
            + (len(distinct_ja4s) * 5 if len(distinct_ja4s) > 2 else 0)
            + (15 if is_rotating else 0)
            + (10 if is_rare_count > total_rows * 0.5 else 0)
        ))

        # Verdict (first matching rule wins).
        if spoof_score >= 70:
            verdict = "high_confidence_spoofing"
        elif spoof_score >= 40:
            verdict = "suspicious_spoofing"
        elif has_bot_ua and avg_browser_score < 20:
            verdict = "known_bot_no_spoofing"
        elif has_browser_ua and spoof_score < 20:
            verdict = "legitimate_browser"
        else:
            verdict = "inconclusive"

        # Human-readable explanation.
        explanation = []
        if ua_ch_mismatch_count > total_rows * 0.3:
            explanation.append(f"UA-Client-Hints mismatch sur {round(ua_ch_mismatch_count*100/total_rows)}% des requêtes")
        if has_bot_ua and avg_browser_score > 40:
            explanation.append(f"JA4 ressemble à un navigateur (score {round(avg_browser_score)}/100) mais UA est de type bot")
        if len(distinct_ja4s) > 2:
            explanation.append(f"{len(distinct_ja4s)} JA4 distincts utilisés → rotation de fingerprint")
        if is_rotating:
            explanation.append("is_ua_rotating détecté → rotation d'User-Agent confirmée")
        if sni_mismatch_count > 0:
            explanation.append(f"SNI ≠ Host header sur {sni_mismatch_count}/{total_rows} requêtes")
        if not explanation:
            explanation.append("Aucun indicateur de spoofing majeur détecté")

        return {
            "ip": ip,
            "verdict": verdict,
            "spoofing_score": spoof_score,
            "explanation": explanation,
            "indicators": {
                "ua_ch_mismatch_rate": round(ua_ch_mismatch_count / total_rows * 100, 1),
                "sni_mismatch_rate": round(sni_mismatch_count / total_rows * 100, 1),
                "alpn_mismatch_rate": round(alpn_mismatch_count / total_rows * 100, 1),
                "avg_browser_score": round(avg_browser_score, 1),
                "distinct_ja4_count": len(distinct_ja4s),
                "is_ua_rotating": is_rotating,
                "rare_ja4_rate": round(is_rare_count / total_rows * 100, 1),
            },
            "fingerprints": {
                "ja4_list": distinct_ja4s,
                "latest_ja4": str(latest[0] or ""),
            },
            "user_agents": top_uas,
            "latest_detection": {
                "detected_at": latest[14].isoformat() if latest[14] else "",
                "threat_level": str(latest[15] or "LOW"),
                "modern_browser_score": int(latest[2] or 0),
                "header_count": int(latest[9] or 0),
                "has_accept_language": bool(latest[10]),
                "has_cookie": bool(latest[11]),
                "has_referer": bool(latest[12]),
                "header_order_shared_count": int(latest[13] or 0),
            },
        }

    except HTTPException:
        # Let the 404 raised above through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")


# =============================================================================
# ENDPOINT 5 — JA4 légitimes (baseline / whitelist)
# =============================================================================

@router.get("/legitimate-ja4")
async def get_legitimate_ja4(
    hours: int = Query(168, ge=24, le=720, description="Fenêtre pour établir la baseline"),
    min_ips: int = Query(50, ge=5, description="Nombre minimum d'IPs pour qualifier un JA4 de légitime"),
):
    """
    Build a baseline of legitimate JA4 fingerprints.

    A JA4 qualifies when, over the window:
      - it is used by at least ``min_ips`` distinct IPs
      - its ua_ch_mismatch rate is below 5%
      - its average modern_browser_score is above 60
      - none of its detections is flagged is_rare_ja4

    Meant as whitelist candidates to cut false positives.

    Args:
        hours: look-back window used to establish the baseline, in hours.
        min_ips: minimum number of distinct IPs.

    Returns:
        A dict with ``items`` (sorted by distinct IPs, descending), ``total``,
        ``period_hours`` and a ``note``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        query = """
        SELECT
            ja4,
            uniq(src_ip) AS unique_ips,
            count() AS total_detections,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_count,
            round(countIf(threat_level = 'CRITICAL') * 100.0 / count(), 2) AS critical_pct,
            round(countIf(threat_level = 'HIGH') * 100.0 / count(), 2) AS high_pct
        FROM ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
          AND ua_ch_mismatch_pct < 5.0
          AND avg_browser_score > 60
          AND rare_count = 0
        ORDER BY unique_ips DESC
        """

        bindings = {"hours": hours, "min_ips": min_ips}
        result = db.query(query, bindings)

        items = []
        for record in result.result_rows:
            ip_total = int(record[1] or 0)
            mismatch_pct = float(record[3] or 0)
            browser_score = float(record[4] or 0)

            # Confidence: 40 pts for a low mismatch rate, 40 pts for the
            # browser score, 20 pts for the IP volume relative to min_ips.
            raw_confidence = round(
                (1 - mismatch_pct / 100) * 40
                + browser_score * 0.40
                + min(ip_total / min_ips, 1) * 20
            )

            entry = {
                "ja4": str(record[0]),
                "unique_ips": ip_total,
                "total_detections": int(record[2] or 0),
                "ua_ch_mismatch_pct": mismatch_pct,
                "avg_browser_score": round(browser_score, 1),
                "critical_pct": float(record[6] or 0),
                "high_pct": float(record[7] or 0),
                "legitimacy_confidence": min(100, raw_confidence),
            }
            items.append(entry)

        response = {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "note": "Ces JA4 sont candidats à une whitelist. Vérifier manuellement avant de whitelister.",
        }
        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")


# =============================================================================
# ENDPOINT 6 — JA4 × ASN / country correlation (C5)
# Detects JA4s that are heavily concentrated on a single ASN or country
# → signal of a targeted botnet or of shared test/attack infrastructure
# =============================================================================

@router.get("/asn-correlation")
async def get_ja4_asn_correlation(
    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
    limit: int = Query(50, ge=1, le=200),
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
):
    """
    Identify JA4 fingerprints heavily concentrated on a single ASN or country.

    A JA4 with asn_concentration >= 0.7 means that >= 70% of the IPs using
    this fingerprint come from the same ASN → shared bot infrastructure or
    suspicious datacenter. country_concentration is the same ratio computed
    per country.

    Args:
        min_concentration: minimum ASN or country concentration, in [0, 1].
        min_ips: minimum number of IPs per JA4.
        limit: maximum number of JA4s returned.
        hours: look-back window, in hours (defaults to the former fixed 24h).

    Returns:
        A dict with ``items``, ``total`` and ``period_hours``. Each item
        carries both concentrations, the dominant ASN/country and a
        ``correlation_type`` / ``risk`` pair.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Three levels:
        #   1. innermost: distinct IPs per (ja4, asn, country) combination;
        #   2. middle: window sums roll these up per (ja4, asn) and per
        #      (ja4, country). The two totals must be computed separately:
        #      taking both from the same per-combination count makes the
        #      country concentration a copy of the ASN concentration;
        #   3. outer: per-JA4 ratios of the largest ASN / country total.
        # NOTE(review): IP counts are summed across combinations, which
        # presumes one IP maps to a single (ASN, country) pair — confirm.
        sql = """
        SELECT
            ja4,
            sum(ips_per_combo)                                          AS unique_ips,
            uniq(src_asn)                                               AS unique_asns,
            uniq(src_country_code)                                      AS unique_countries,
            toString(argMax(src_asn, asn_ips))                          AS top_asn_number,
            argMax(asn_name, asn_ips)                                   AS top_asn_name,
            argMax(src_country_code, country_ips)                       AS dominant_country,
            sum(total_hits)                                             AS total_hits,
            round(max(asn_ips)     / greatest(sum(ips_per_combo), 1), 3) AS asn_concentration,
            round(max(country_ips) / greatest(sum(ips_per_combo), 1), 3) AS country_concentration
        FROM (
            SELECT
                ja4,
                src_asn,
                src_country_code,
                asn_name,
                ips_per_combo,
                total_hits,
                sum(ips_per_combo) OVER (PARTITION BY ja4, src_asn)          AS asn_ips,
                sum(ips_per_combo) OVER (PARTITION BY ja4, src_country_code) AS country_ips
            FROM (
                SELECT
                    ja4,
                    src_asn,
                    src_country_code,
                    any(src_as_name) AS asn_name,
                    uniq(src_ip)     AS ips_per_combo,
                    sum(hits)        AS total_hits
                FROM mabase_prod.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL %(hours)s HOUR
                  AND ja4 != ''
                GROUP BY ja4, src_asn, src_country_code
            )
        )
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
           AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
        ORDER BY asn_concentration DESC, unique_ips DESC
        LIMIT %(limit)s
        """
        result = db.query(sql, {
            "min_ips": min_ips,
            "min_conc": min_concentration,
            "limit": limit,
            "hours": hours,
        })
        items = []
        for row in result.result_rows:
            asn_concentration = float(row[8] or 0)
            country_concentration = float(row[9] or 0)

            # First matching rule wins: ASN signals take precedence over the
            # country signal.
            if asn_concentration >= 0.85:
                corr_type, risk = "asn_monopoly", "high"
            elif asn_concentration >= min_concentration:
                corr_type, risk = "asn_dominant", "medium"
            elif country_concentration >= min_concentration:
                corr_type, risk = "geo_targeted", "medium"
            else:
                corr_type, risk = "distributed", "low"

            items.append({
                "ja4": str(row[0]),
                "unique_ips": int(row[1]),
                "unique_asns": int(row[2]),
                "unique_countries": int(row[3]),
                "top_asn_name": str(row[5] or ""),
                "top_asn_number": str(row[4] or ""),
                "dominant_country": str(row[6] or ""),
                "total_hits": int(row[7] or 0),
                "asn_concentration": asn_concentration,
                "country_concentration": country_concentration,
                "correlation_type": corr_type,
                "risk": risk,
            })
        return {"items": items, "total": len(items), "period_hours": hours}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")