refactor(bot-detector): extract monolith into modular package

Split bot_detector.py (~1982 lines) into 10 focused modules: - config.py: all configuration constants and optional imports - log.py: logging utilities (log_info, log_decision, append_training_history) - infra.py: ClickHouse client, health check HTTP server, shutdown - browser.py: multifactorial browser identification (5 axes) - scoring.py: drift detection, feature validation, SHAP, clustering - models.py: EIF, Autoencoder, XGBoost model management - preprocessing.py: data preprocessing and feature list definitions - pipeline.py: core semi-supervised scoring loop - cycle.py: main analysis cycle orchestration - __main__.py: entry point with startup banner Update Dockerfile to copy package directory and use python -m bot_detector. All 36 existing tests pass unchanged. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-09 01:02:04 +02:00
parent 2d04288e95
commit 1f103392ac
12 changed files with 2112 additions and 2 deletions
--- a/services/bot-detector/bot_detector/Dockerfile
+++ b/services/bot-detector/bot_detector/Dockerfile
@ -13,6 +13,6 @@ RUN pip install --no-cache-dir /app/shared/ja4_common/
 COPY services/bot-detector/bot_detector/requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-COPY services/bot-detector/bot_detector/bot_detector.py .
+COPY services/bot-detector/bot_detector/ ./bot_detector/
-CMD ["python", "bot_detector.py"]
+CMD ["python", "-m", "bot_detector"]
--- a/services/bot-detector/bot_detector/init.py
+++ b/services/bot-detector/bot_detector/init.py
@ -0,0 +1 @@
 """Package bot_detector — détection de bots par ML semi-supervisé."""
--- a/services/bot-detector/bot_detector/main.py
+++ b/services/bot-detector/bot_detector/main.py
@ -0,0 +1,41 @@
 """Point d'entrée du bot-detector : boucle principale d'analyse."""
 import time
 from .config import *  # noqa: F403 — All config for startup banner
 from .log import log_info, log_decision
 from .cycle import fetch_and_analyze
 if __name__ == '__main__':
    log_info('')
    log_info('╔═══════════════════════════════════════════════════════════════╗')
    log_info('║   BOT DETECTOR IA v12 — Pipeline de détection semi-supervisé ║')
    log_info('╚═══════════════════════════════════════════════════════════════╝')
    log_info(f'  Base de données       : {DB}')
    log_info(f'  Contamination EIF     : {CONTAMINATION}')
    log_info(f'  Seuil anomalie        : {ANOMALY_THRESHOLD} (adaptatif p{ANOMALY_PERCENTILE})')
    log_info(f'  Cycle d\'analyse       : toutes les {CYCLE_INTERVAL}s')
    log_info(f'  Retraining            : toutes les {RETRAIN_INTERVAL_H}h (drift seuil={DRIFT_THRESHOLD:.0%})')
    log_info(f'  Modèles               : {MODEL_DIR}')
    log_info(f'  Autoencoder (AE)      : poids={AE_WEIGHT} {"(PyTorch disponible)" if TORCH_AVAILABLE else "(PyTorch absent — désactivé)"}')
    log_info(f'  XGBoost               : poids={XGB_WEIGHT} (min labels={XGB_MIN_LABELS})')
    log_info(f'  SHAP explainabilité   : {"activé" if ENABLE_SHAP else "désactivé (shap non installé)" if not SHAP_AVAILABLE else "désactivé"}')
    log_info(f'  Clustering HDBSCAN    : {"activé" if ENABLE_CLUSTERING else "désactivé"}')
    log_info(f'  Dédup inter-cycles    : TTL={DEDUP_TTL_MIN}min')
    log_info(f'  Récurrence            : weight={RECURRENCE_WEIGHT}')
    log_info(f'  Multi-fenêtres (24h)  : {"activé" if ENABLE_MULTIWINDOW else "désactivé"}')
    log_info(f'  Feature ratio minimum : {MIN_VALID_FEATURE_RATIO:.0%}')
    log_info(f'  Anubis                : ALLOW→KNOWN_BOT, DENY→forcé menace (score IF réel)')
    log_info(f'  Browser légitime      : confidence≥{BROWSER_CONFIDENCE_THRESHOLD} (5 axes), cohorte≥{BROWSER_COHORT_RATIO}')
    log_info(f'  Feedback SOC          : {"activé" if ENABLE_FEEDBACK else "désactivé"} (fenêtre={FEEDBACK_WINDOW_DAYS}j)')
    log_info('')
    log_decision('SERVICE_START', 'boot', '', {
        'db': DB, 'contamination': CONTAMINATION, 'anomaly_threshold': ANOMALY_THRESHOLD,
        'cycle_interval': CYCLE_INTERVAL, 'retrain_interval_h': RETRAIN_INTERVAL_H
    })
    log_info(f'En attente du premier cycle dans {CYCLE_INTERVAL}s…')
    while True:
        try:
            fetch_and_analyze()
        except Exception as e:
            log_info(f"Erreur globale : {e}")
        time.sleep(CYCLE_INTERVAL)
--- a/services/bot-detector/bot_detector/browser.py
+++ b/services/bot-detector/bot_detector/browser.py
@ -0,0 +1,170 @@
 """A9 — Identification multifactorielle des navigateurs.
 5 axes indépendants combinés pour identifier les navigateurs légitimes.
 Chaque axe produit un score [0,1]. La combinaison pondérée produit
 browser_confidence [0,1], remplaçant l'ancien système binaire JA4-only.
 """
 import pandas as pd
 import numpy as np
 # Profils structurels JA4 des navigateurs connus (TCP).
 # Plages de cipher/extension count caractéristiques par famille TLS.
 # Sources : FoxIO ja4plus-mapping.csv, ja4db.com, captures réelles.
 _BROWSER_JA4_PROFILES = {
    'Chromium': {'tls': '13', 'alpn': ('h2', 'h3'),
                 'ciphers': range(13, 19), 'exts': range(13, 20)},
    'Firefox':  {'tls': '13', 'alpn': ('h2', 'h3'),
                 'ciphers': range(15, 20), 'exts': range(13, 18)},
    'Safari':   {'tls': '13', 'alpn': ('h2', 'h3'),
                 'ciphers': range(17, 25), 'exts': range(11, 17)},
    'Tor_Browser': {'tls': '13', 'alpn': ('h2',),
                    'ciphers': range(14, 18), 'exts': range(12, 17)},
 }
 # Pondération des 5 axes pour le score browser_confidence.
 _AXIS_WEIGHTS = {
    'ja4_known':     0.25,   # Axe 1 — Signature JA4 dans dict_browser_ja4
    'ja4_struct':    0.15,   # Axe 2 — Structure JA4 (TLS1.3, h2, nb ciphers/ext)
    'http_modern':   0.25,   # Axe 3 — Headers HTTP modernes (sec-ch-ua, Sec-Fetch-*)
    'nav_behavior':  0.15,   # Axe 4 — Comportement de navigation (assets, referers)
    'tls_coherence': 0.20,   # Axe 5 — Cohérence TLS/TCP (pas de mismatch)
 }
 def _parse_ja4_columns(ja4_series: pd.Series) -> pd.DataFrame:
    """Parse la partie structurelle du JA4 (10 premiers caractères) vectorisé.
    Format JA4 : {proto}{tls_ver}{sni}{cipher_cnt}{ext_cnt}{alpn}_hash_hash
    Exemple    : t13d1516h2_8daaf6152771_02713d6af862
    Retourne un DataFrame avec : ja4_tls, ja4_ciphers, ja4_exts, ja4_alpn.
    """
    s = ja4_series.fillna('').astype(str)
    valid = s.str.len() >= 10
    result = pd.DataFrame(index=ja4_series.index)
    result['ja4_tls'] = s.str[1:3].where(valid, '00')
    result['ja4_ciphers'] = pd.to_numeric(s.str[4:6].where(valid, '0'), errors='coerce').fillna(0).astype(int)
    result['ja4_exts'] = pd.to_numeric(s.str[6:8].where(valid, '0'), errors='coerce').fillna(0).astype(int)
    result['ja4_alpn'] = s.str[8:10].where(valid, '00')
    return result
 def _compute_browser_axes(df: pd.DataFrame) -> pd.DataFrame:
    """Calcule les 5 axes d'identification navigateur pour chaque session.
    Axe 1 (ja4_known)     : JA4 présent dans dict_browser_ja4 (0 ou 1)
    Axe 2 (ja4_struct)    : Structure JA4 compatible navigateur (TLS1.3 + h2/h3 + ciphers/exts)
    Axe 3 (http_modern)   : Headers HTTP modernes (sec-ch-ua, Accept-Language, Sec-Fetch-*)
    Axe 4 (nav_behavior)  : Comportement navigation (cookies, referer, assets, accès direct)
    Axe 5 (tls_coherence) : Cohérence TLS/TCP (pas de mismatch ALPN, window scaling, TLS1.3)
    Retourne un DataFrame avec les colonnes axis_* et browser_confidence.
    """
    n = len(df)
    axes = pd.DataFrame(index=df.index)
    # ── Axe 1 — Signature JA4 connue ──
    bf = df.get('browser_family', pd.Series('', index=df.index)).fillna('').astype(str)
    axes['axis_ja4_known'] = (bf != '').astype(float)
    # ── Axe 2 — Structure JA4 ──
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))
    is_tls13 = (ja4_parsed['ja4_tls'] == '13').astype(float)
    is_h2h3 = ja4_parsed['ja4_alpn'].isin(['h2', 'h3']).astype(float)
    # Plage de ciphers et extensions typique d'un navigateur (10-25)
    c = ja4_parsed['ja4_ciphers']
    e = ja4_parsed['ja4_exts']
    ciphers_ok = ((c >= 10) & (c <= 25)).astype(float)
    exts_ok = ((e >= 10) & (e <= 25)).astype(float)
    axes['axis_ja4_struct'] = (
        is_tls13 * 0.35 + is_h2h3 * 0.25 + ciphers_ok * 0.20 + exts_ok * 0.20
    )
    # ── Axe 3 — Headers HTTP modernes ──
    mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0)
    hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0)
    sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1)
    gar = df.get('generic_accept_ratio', pd.Series(1, index=df.index)).fillna(1)
    uam = df.get('ua_ch_mismatch', pd.Series(0, index=df.index)).fillna(0)
    axes['axis_http_modern'] = (
        (mbs >= 50).astype(float) * 0.35
        + (hal > 0).astype(float) * 0.20
        + (sfa < 0.3).astype(float) * 0.25
        + (gar < 0.3).astype(float) * 0.10
        + (uam == 0).astype(float) * 0.10
    )
    # ── Axe 4 — Comportement de navigation ──
    hck = df.get('has_cookie', pd.Series(0, index=df.index)).fillna(0)
    hrf = df.get('has_referer', pd.Series(0, index=df.index)).fillna(0)
    asr = df.get('asset_ratio', pd.Series(0, index=df.index)).fillna(0)
    dar = df.get('direct_access_ratio', pd.Series(1, index=df.index)).fillna(1)
    axes['axis_nav_behavior'] = (
        (hck > 0).astype(float) * 0.25
        + (hrf > 0).astype(float) * 0.25
        + (asr > 0.15).astype(float) * 0.25
        + (dar < 0.5).astype(float) * 0.25
    )
    # ── Axe 5 — Cohérence TLS/TCP ──
    alm = df.get('alpn_http_mismatch', pd.Series(0, index=df.index)).fillna(0)
    nws = df.get('no_window_scale_ratio', pd.Series(0, index=df.index)).fillna(0)
    t12 = df.get('tls12_ratio', pd.Series(0, index=df.index)).fillna(0)
    h10 = df.get('http10_ratio', pd.Series(0, index=df.index)).fillna(0)
    iam = df.get('is_alpn_missing', pd.Series(0, index=df.index)).fillna(0)
    axes['axis_tls_coherence'] = (
        (alm == 0).astype(float) * 0.25
        + (nws == 0).astype(float) * 0.20
        + (t12 < 0.1).astype(float) * 0.20
        + (h10 == 0).astype(float) * 0.15
        + (iam == 0).astype(float) * 0.20
    )
    # ── Score combiné pondéré ──
    axes['browser_confidence'] = sum(
        axes[f'axis_{k}'] * w for k, w in _AXIS_WEIGHTS.items()
    )
    return axes
 def _infer_browser_family(df: pd.DataFrame, ja4_parsed: pd.DataFrame,
                          axes: pd.DataFrame) -> pd.Series:
    """Infère la famille navigateur par analyse structurelle quand dict_browser_ja4 manque.
    Priorité :
    1. browser_family connu (dict_browser_ja4) → conservé
    2. Profil structurel JA4 (cipher/ext count) → famille probable
    3. Sinon → '' (inconnu)
    La famille inférée nécessite browser_confidence ≥ 0.45 pour éviter les faux positifs
    (un bot avec un JA4 structurellement similaire mais sans comportement navigateur).
    """
    bf = df.get('browser_family', pd.Series('', index=df.index)).fillna('').astype(str)
    result = bf.copy()
    unknown_mask = (result == '')
    if not unknown_mask.any():
        return result
    # Inférence structurelle sur les sessions sans famille connue
    c = ja4_parsed.loc[unknown_mask, 'ja4_ciphers']
    e = ja4_parsed.loc[unknown_mask, 'ja4_exts']
    tls = ja4_parsed.loc[unknown_mask, 'ja4_tls']
    alpn = ja4_parsed.loc[unknown_mask, 'ja4_alpn']
    conf = axes.loc[unknown_mask, 'browser_confidence']
    # Seuil minimal : le comportement global doit être suffisamment navigateur
    eligible = conf >= 0.45
    for family, profile in _BROWSER_JA4_PROFILES.items():
        match = (
            eligible
            & (tls == profile['tls'])
            & alpn.isin(profile['alpn'])
            & c.isin(profile['ciphers'])
            & e.isin(profile['exts'])
        )
        # Affecter seulement si pas encore attribué
        match = match & (result.loc[unknown_mask] == '')
        result.loc[match[match].index] = family
    return result
--- a/services/bot-detector/bot_detector/config.py
+++ b/services/bot-detector/bot_detector/config.py
@ -0,0 +1,153 @@
 """Configuration centralisée du bot-detector.
 Toutes les variables d'environnement, constantes et imports optionnels.
 Aucun effet de bord (pas de logging, pas de connexion) — pur configuration.
 """
 import os
 import re
 import warnings
 warnings.filterwarnings('ignore')
 # ─── Utilitaire de lecture d'env var ────────────────────────────────────────
 def _require_float(name, default, lo=None, hi=None):
    """Lit une variable d'environnement comme flottant et valide la plage si spécifiée.
    Lève SystemExit si la valeur est non numérique ou hors plage (lo, hi) exclusive.
    """
    raw = os.getenv(name, str(default))
    try:
        v = float(raw)
    except ValueError:
        raise SystemExit(f"[CONFIG] {name}={raw!r} invalide — doit être un nombre décimal.")
    if lo is not None and not (lo < v < hi):
        raise SystemExit(f"[CONFIG] {name}={v} hors plage ({lo} < valeur < {hi}).")
    return v
 # ─── Validation identifiants SQL ────────────────────────────────────────────
 _SAFE_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
 # ─── ClickHouse ─────────────────────────────────────────────────────────────
 DB = os.getenv('CLICKHOUSE_DB_PROCESSING', os.getenv('CLICKHOUSE_DB', 'ja4_processing'))
 DB_LOGS = os.getenv('CLICKHOUSE_DB_LOGS', 'ja4_logs')
 for _db_name, _db_val in [('CLICKHOUSE_DB_PROCESSING', DB), ('CLICKHOUSE_DB_LOGS', DB_LOGS)]:
    if not _SAFE_IDENTIFIER_RE.match(_db_val):
        raise SystemExit(f"[CONFIG] {_db_name}={_db_val!r} invalide — doit être un identifiant SQL valide.")
 # ─── Isolation Forest ───────────────────────────────────────────────────────
 CONTAMINATION = _require_float('ISOLATION_CONTAMINATION', 0.001, 0, 0.5)
 N_ESTIMATORS = int(os.getenv('N_ESTIMATORS', '300'))
 ANOMALY_THRESHOLD = _require_float('ANOMALY_THRESHOLD', -0.05)
 ANOMALY_PERCENTILE = int(os.getenv('ANOMALY_PERCENTILE', '5'))
 # ─── Modèles ───────────────────────────────────────────────────────────────
 MODEL_DIR = os.getenv('MODEL_DIR', '/var/lib/bot_detector')
 MODEL_HISTORY_COUNT = int(os.getenv('MODEL_HISTORY_COUNT', '10'))
 RETRAIN_INTERVAL_H = int(os.getenv('RETRAIN_INTERVAL_HOURS', '24'))
 DRIFT_THRESHOLD = _require_float('DRIFT_THRESHOLD', 0.30, 0, 1)
 MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)
 MAX_FAILURES = int(os.getenv('MAX_CONSECUTIVE_FAILURES', '3'))
 # ─── Cycle & Logging ───────────────────────────────────────────────────────
 CYCLE_INTERVAL = int(os.getenv('CYCLE_INTERVAL_SEC', '300'))
 LOG_FILE = os.getenv('BOT_DETECTOR_LOG', '/var/log/bot_detector/decisions.jsonl')
 LOG_BACKUP_COUNT = int(os.getenv('LOG_BACKUP_COUNT', '7'))
 TRAINING_HISTORY_FILE = os.path.join(MODEL_DIR, 'training_history.jsonl')
 # ─── Health check ───────────────────────────────────────────────────────────
 HEALTH_PORT = int(os.getenv('HEALTH_PORT', '8080'))
 # ─── Déduplication et récurrence ────────────────────────────────────────────
 DEDUP_TTL_MIN = int(os.getenv('DEDUP_TTL_MIN', '60'))
 RECURRENCE_WEIGHT = _require_float('RECURRENCE_WEIGHT', 0.005)
 # ─── Autoencoder (AE) — second scorer parallèle ────────────────────────────
 AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1)
 AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
 AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
 AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
 # ─── XGBoost — troisième voix supervisée ────────────────────────────────────
 XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1)
 XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100'))
 XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168'))
 # ─── A9 — Classification multifactorielle des navigateurs ──────────────────
 BROWSER_CONFIDENCE_THRESHOLD = _require_float('BROWSER_CONFIDENCE_THRESHOLD', 0.55, 0, 1)
 BROWSER_COHORT_RATIO = _require_float('BROWSER_COHORT_RATIO', 0.70, 0, 1)
 # ─── SHAP / Clustering / Multi-fenêtres / Feedback ─────────────────────────
 ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
 CLUSTERING_MIN_SAMPLES = int(os.getenv('CLUSTERING_MIN_SAMPLES', '3'))
 ENABLE_MULTIWINDOW = os.getenv('ENABLE_MULTIWINDOW', 'false').lower() == 'true'
 MULTIWINDOW_VIEW = os.getenv('MULTIWINDOW_VIEW', 'view_ai_features_24h')
 if not _SAFE_IDENTIFIER_RE.match(MULTIWINDOW_VIEW):
    raise SystemExit(f"[CONFIG] MULTIWINDOW_VIEW={MULTIWINDOW_VIEW!r} invalide.")
 ENABLE_FEEDBACK = os.getenv('ENABLE_FEEDBACK', 'true').lower() == 'true'
 FEEDBACK_WINDOW_DAYS = int(os.getenv('FEEDBACK_WINDOW_DAYS', '7'))
 # ─── Features structurellement exclues par modèle ──────────────────────────
 STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
    'Complet':    ['orphan_ratio'],
    'Applicatif': ['orphan_ratio', 'is_rare_ja4', 'tcp_shared_count',
                   'request_size_variance', 'mss_mobile_mismatch',
                   'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
                   'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
                   'ja4_drift_ratio'],
 }
 # ─── Imports optionnels (bibliothèques lourdes) ────────────────────────────
 try:
    from isotree import IsolationForest as ExtendedIsolationForest
    EIF_AVAILABLE = True
 except ImportError:
    EIF_AVAILABLE = False
 from sklearn.ensemble import IsolationForest
 try:
    import hdbscan as _hdbscan
    HDBSCAN_AVAILABLE = True
 except ImportError:
    HDBSCAN_AVAILABLE = False
 from sklearn.cluster import DBSCAN
 from sklearn.preprocessing import StandardScaler
 try:
    import shap as _shap
    SHAP_AVAILABLE = True
 except ImportError:
    SHAP_AVAILABLE = False
 ENABLE_SHAP = SHAP_AVAILABLE and os.getenv('ENABLE_SHAP', 'true').lower() == 'true'
 try:
    import torch
    import torch.nn as nn
    TORCH_AVAILABLE = True
 except ImportError:
    TORCH_AVAILABLE = False
 try:
    import xgboost as xgb
    XGB_AVAILABLE = True
 except ImportError:
    XGB_AVAILABLE = False
--- a/services/bot-detector/bot_detector/cycle.py
+++ b/services/bot-detector/bot_detector/cycle.py
@ -0,0 +1,371 @@
 """Cycle principal d'analyse : récupération, scoring et insertion des résultats.
 Orchestre un cycle complet : requête ClickHouse, preprocessing, scoring
 (Complet + Applicatif), feedback SOC, déduplication et insertion des résultats.
 """
 import pandas as pd
 from datetime import datetime
 from .config import (
    DB, DB_LOGS, CYCLE_INTERVAL, DEDUP_TTL_MIN,
    ENABLE_MULTIWINDOW, MULTIWINDOW_VIEW,
    ENABLE_FEEDBACK, FEEDBACK_WINDOW_DAYS, MAX_FAILURES,
    BROWSER_CONFIDENCE_THRESHOLD, BROWSER_COHORT_RATIO,
 )
 from .log import log_info, log_decision
 from .infra import get_client, set_healthy
 from .preprocessing import preprocess_df, FEATURES_BASE, FEATURES_COMPLET
 from .pipeline import run_semi_supervised_logic
 # ═══════════════════════════════════════════════════════════════════════════════
 # A5 — DÉDUPLICATION INTER-CYCLES AVEC TTL
 # ═══════════════════════════════════════════════════════════════════════════════
 # ═══════════════════════════════════════════════════════════════════════════════
 # FEEDBACK LOOP — Intégration des classifications SOC dans la baseline
 # ═══════════════════════════════════════════════════════════════════════════════
 def _load_soc_feedback(client) -> dict:
    """Charge les classifications SOC récentes pour ajuster la baseline.
    Retourne un dict {src_ip: classification} où classification est
    'true_positive', 'false_positive', 'suspicious', etc.
    Les faux positifs sont exclus du scoring (considérés comme humains),
    les vrais positifs sont exclus de la baseline humaine.
    """
    if not ENABLE_FEEDBACK:
        return {}
    try:
        feedback_df = client.query_df(
            f"SELECT entity_id AS src_ip, "
            f"  argMax(JSONExtractString(details, 'classification'), timestamp) AS classification "
            f"FROM {DB}.audit_logs "
            f"WHERE action = 'create_classification' "
            f"  AND entity_type = 'ip' "
            f"  AND timestamp >= now() - INTERVAL {FEEDBACK_WINDOW_DAYS} DAY "
            f"GROUP BY entity_id"
        )
        if feedback_df is None or feedback_df.empty:
            return {}
        result = dict(zip(feedback_df['src_ip'], feedback_df['classification']))
        log_info(f"[Feedback] {len(result)} classification(s) SOC chargées ({FEEDBACK_WINDOW_DAYS}j).")
        return result
    except Exception as e:
        log_info(f"[Feedback] Impossible de charger les classifications SOC : {e}")
        return {}
 # ═══════════════════════════════════════════════════════════════════════════════
 # A5 — DÉDUPLICATION INTER-CYCLES AVEC TTL
 # ═══════════════════════════════════════════════════════════════════════════════
 def _filter_recent_detections(client, all_anom: pd.DataFrame) -> pd.DataFrame:
    """
    A5 : Filtre les IPs déjà insérées dans ml_detected_anomalies dans les DEDUP_TTL_MIN dernières minutes.
    Exception : une IP est réinsérée si son nouveau score est ≥ 0.05 points plus bas (aggravation).
    """
    if DEDUP_TTL_MIN <= 0 or all_anom.empty:
        return all_anom
    try:
        recent_df = client.query_df(
            f"SELECT src_ip, min(anomaly_score) AS best_score "
            f"FROM {DB}.ml_detected_anomalies "
            f"WHERE detected_at > now() - INTERVAL {DEDUP_TTL_MIN} MINUTE "
            f"GROUP BY src_ip"
        )
        if recent_df.empty:
            return all_anom
        recent_map = dict(zip(recent_df['src_ip'], recent_df['best_score']))
        def _should_insert(row):
            """Détermine si une anomalie doit être réinsérée selon l'évolution du score."""
            prev = recent_map.get(row['src_ip'])
            if prev is None:
                return True
            # Réinsérer seulement si le score brut s'est significativement aggravé
            return float(row.get('raw_anomaly_score', row['anomaly_score'])) < float(prev) - 0.05
        mask = all_anom.apply(_should_insert, axis=1)
        filtered = all_anom[mask]
        skipped = len(all_anom) - len(filtered)
        if skipped > 0:
            log_info(f"[Dedup TTL={DEDUP_TTL_MIN}min] {skipped} IP(s) filtrée(s) (déjà détectées récemment).")
        return filtered
    except Exception as e:
        log_info(f"[Dedup] Erreur lors de la déduplication TTL : {e}")
        return all_anom
 # ═══════════════════════════════════════════════════════════════════════════════
 # CYCLE PRINCIPAL
 # ═══════════════════════════════════════════════════════════════════════════════
 _consecutive_failures = 0
 def fetch_and_analyze():
    """Exécute un cycle complet de détection : requête ClickHouse, scoring et insertion des résultats.
    Récupère le trafic depuis la vue view_ai_features_1h (et optionnellement view_ai_features_24h),
    applique run_semi_supervised_logic sur les deux modèles (Complet / Applicatif),
    insère les scores dans ml_all_scores et les anomalies dans ml_detected_anomalies.
    Met à jour l'état de santé et _consecutive_failures en cas d'échec de requête.
    """
    global _consecutive_failures
    cycle_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_info('')
    log_info('=' * 70)
    log_info(f'  CYCLE {cycle_id}')
    log_info('=' * 70)
    client = get_client()
    # ── Récupération du trafic (fenêtre 1h) ──────────────────────────────────
    try:
        df = client.query_df(f'SELECT * FROM {DB}.view_ai_features_1h')
    except Exception as e:
        log_info(f'ERREUR REQUETE: {e}')
        _consecutive_failures += 1
        if _consecutive_failures >= MAX_FAILURES:
            set_healthy(False)
        log_decision('CONSECUTIVE_FAILURES', cycle_id, '', {'count': _consecutive_failures, 'error': str(e)})
        return
    _consecutive_failures = 0
    set_healthy(True)
    if df is None or df.empty:
        log_info('[Données] Aucun trafic trouvé dans view_ai_features_1h.')
        return
    log_info(f'[Données] {len(df)} sessions chargées depuis view_ai_features_1h ({len(df.columns)} colonnes).')
    # ── Enrichissement avec les features avancées de la thèse §5 ─────────────
    try:
        df_thesis = client.query_df(f'SELECT * FROM {DB}.view_thesis_features_1h')
        if df_thesis is not None and not df_thesis.empty:
            df_thesis.columns = [c.split('.')[-1] for c in df_thesis.columns]
            df.columns = [c.split('.')[-1] for c in df.columns]
            thesis_cols = [c for c in df_thesis.columns if c not in ('window_start', 'src_ip', 'ja4', 'host')]
            df = df.merge(
                df_thesis, on=['window_start', 'src_ip', 'ja4', 'host'],
                how='left', suffixes=('', '_thesis')
            )
            for col in thesis_cols:
                if col in df.columns:
                    df[col] = df[col].fillna(df[col].median() if df[col].notna().any() else 0)
            log_info(f'[Thèse §5] {len(df_thesis)} sessions enrichies avec {len(thesis_cols)} features avancées.')
        else:
            log_info('[Thèse §5] view_thesis_features_1h vide — features avancées indisponibles.')
    except Exception as e:
        log_info(f'[Thèse §5] view_thesis_features_1h inaccessible : {e} — features avancées ignorées.')
    df = preprocess_df(df)
    # ── Résumé des données chargées ───────────────────────────────────────────
    n_total       = len(df)
    n_correlated  = int((df.get('correlated', pd.Series()) == 1).sum())
    n_uncorrelated = n_total - n_correlated
    n_isp         = int((df.get('asn_label', pd.Series()) == 'isp').sum())
    n_datacenter  = int((df.get('asn_label', pd.Series()) == 'datacenter').sum())
    n_cdn         = int((df.get('asn_label', pd.Series()) == 'cdn').sum())
    n_known_bot   = int((df.get('bot_name', pd.Series()) != '').sum())
    n_anubis_allow = int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum())
    n_anubis_deny  = int((df.get('anubis_bot_action', pd.Series()) == 'DENY').sum())
    n_anubis_weigh = int((df.get('anubis_bot_action', pd.Series()) == 'WEIGH').sum())
    n_unique_ips  = int(df['src_ip'].nunique()) if 'src_ip' in df.columns else 0
    n_unique_ja4  = int(df['ja4'].nunique()) if 'ja4' in df.columns else 0
    log_info(f'[Données] Après preprocessing : {n_total} sessions, {n_unique_ips} IP uniques, {n_unique_ja4} JA4 uniques.')
    log_info(f'[Données]   Corrélées (L3-L7)  : {n_correlated:>6}  |  Non-corrélées (L7) : {n_uncorrelated:>6}')
    log_info(f'[Données]   ASN ISP            : {n_isp:>6}  |  Datacenter           : {n_datacenter:>6}  |  CDN : {n_cdn}')
    log_info(f'[Données]   Bots connus (dict) : {n_known_bot:>6}  |  Anubis ALLOW         : {n_anubis_allow:>6}')
    log_info(f'[Données]   Anubis DENY        : {n_anubis_deny:>6}  |  Anubis WEIGH         : {n_anubis_weigh:>6}')
    # Distribution navigateurs : dict_browser_ja4 (connu) + inféré (structurel)
    if 'inferred_browser_family' in df.columns:
        ibf_counts = df['inferred_browser_family'].value_counts()
        ibf_known = ibf_counts[ibf_counts.index != '']
        if not ibf_known.empty:
            ibf_summary = ', '.join(f'{fam}={cnt}' for fam, cnt in ibf_known.head(7).items())
            n_dict = int((df.get('browser_family', pd.Series('')).fillna('').astype(str) != '').sum())
            n_inferred = int(ibf_known.sum()) - n_dict
            log_info(f'[Données]   Navigateurs       : {ibf_known.sum():>6} sessions ({len(ibf_known)} familles : {ibf_summary})')
            log_info(f'[Données]     Dict JA4 connu   : {n_dict:>6}  |  Inféré structurel   : {max(0, n_inferred):>6}')
    elif 'browser_family' in df.columns:
        bf_counts = df['browser_family'].value_counts()
        bf_known = bf_counts[bf_counts.index != '']
        if not bf_known.empty:
            bf_summary = ', '.join(f'{fam}={cnt}' for fam, cnt in bf_known.head(5).items())
            log_info(f'[Données]   Navigateurs JA4   : {bf_known.sum():>6} sessions ({len(bf_known)} familles : {bf_summary})')
    # Distribution browser_confidence
    if 'browser_confidence' in df.columns:
        bc = df['browser_confidence']
        n_high = int((bc >= BROWSER_CONFIDENCE_THRESHOLD).sum())
        log_info(f'[Données]   browser_confidence: mean={bc.mean():.3f}, ≥seuil({BROWSER_CONFIDENCE_THRESHOLD})={n_high}')
    log_decision('CYCLE_START', cycle_id, '', {
        'total_rows': n_total,
        'human_rows': n_isp,
        'known_bot_rows': n_known_bot,
        'correlated_rows': n_correlated,
        'anubis_allow_rows': n_anubis_allow,
        'anubis_deny_rows':  n_anubis_deny,
        'anubis_weigh_rows': n_anubis_weigh,
        'multiwindow': ENABLE_MULTIWINDOW,
    })
    try:
        rec_df = client.query_df(f'SELECT src_ip, recurrence FROM {DB}.view_ip_recurrence')
        recurrence_map = dict(zip(rec_df['src_ip'], rec_df['recurrence']))
    except Exception:
        recurrence_map = {}
    # ── Feedback SOC : ajuster la baseline selon les classifications humaines ─
    soc_feedback = _load_soc_feedback(client)
    if soc_feedback:
        fp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('false_positive', 'legitimate')}
        tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
        if fp_ips:
            # Les faux positifs confirmés rejoignent le pool humain
            mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
            df.loc[mask_fp, 'asn_label'] = 'isp'
            log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
        if tp_ips:
            # Les vrais positifs confirmés sont exclus de la baseline humaine
            mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
            df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
            log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
        log_decision('SOC_FEEDBACK', cycle_id, '', {
            'fp_ips': len(fp_ips), 'tp_ips': len(tp_ips),
            'total_classifications': len(soc_feedback),
        })
    # ── Features par modèle (voir DOCUMENTATION.md §4) ───────────────────────
    feats = FEATURES_BASE
    feats_complet = FEATURES_COMPLET
    # ── Analyse fenêtre 1h ────────────────────────────────────────────────────
    df_corr = df[df['correlated'] == 1].copy()
    df_uncorr = df[df['correlated'] == 0].copy()
    log_info('')
    log_info(f'── Modèle Complet (L3→L7, corrélé) : {len(df_corr)} sessions, {len(feats_complet)} features ──')
    anom_a, scored_a = run_semi_supervised_logic(df_corr, feats_complet, 'Complet', cycle_id, recurrence_map)
    log_info('')
    log_info(f'── Modèle Applicatif (L7 seul, non-corrélé) : {len(df_uncorr)} sessions, {len(feats)} features ──')
    anom_b, scored_b = run_semi_supervised_logic(df_uncorr, feats, 'Applicatif', cycle_id, recurrence_map)
    all_anom = pd.concat([anom_a, anom_b], ignore_index=True)
    all_scored = pd.concat([scored_a, scored_b], ignore_index=True)
    # ── A3 : Analyse fenêtre 24h (optionnelle) ────────────────────────────────
    if ENABLE_MULTIWINDOW:
        try:
            df_24h = client.query_df(f'SELECT * FROM {DB}.{MULTIWINDOW_VIEW}')
            if df_24h is not None and not df_24h.empty:
                df_24h = preprocess_df(df_24h)
                log_info(f"[24h] {len(df_24h)} sessions dans la fenêtre 24h.")
                anom_c, scored_c = run_semi_supervised_logic(df_24h[df_24h['correlated'] == 1].copy(), feats_complet, 'Complet_24h', cycle_id, recurrence_map)
                anom_d, scored_d = run_semi_supervised_logic(df_24h[df_24h['correlated'] == 0].copy(), feats, 'Applicatif_24h', cycle_id, recurrence_map)
                all_anom_24h = pd.concat([anom_c, anom_d], ignore_index=True)
                all_scored_24h = pd.concat([scored_c, scored_d], ignore_index=True)
                # Fusion : pour les IPs présentes dans les deux fenêtres, conserver le score le plus bas
                if not all_anom_24h.empty:
                    all_anom = pd.concat([all_anom, all_anom_24h], ignore_index=True)
                    log_info(f"[24h] Fusion 1h+24h : {len(all_anom)} entrées avant déduplication.")
                all_scored = pd.concat([all_scored, all_scored_24h], ignore_index=True)
            else:
                log_info(f"[24h] Vue {MULTIWINDOW_VIEW} vide — analyse mono-fenêtre.")
        except Exception as e:
            log_info(f"[24h] Vue {MULTIWINDOW_VIEW} inaccessible : {e} — analyse mono-fenêtre.")
    # ── Insertion de toutes les classifications dans ml_all_scores ───────────
    if not all_scored.empty:
        try:
            now = datetime.now().replace(microsecond=0)
            all_scored['detected_at'] = now
            all_scored['ja4'] = all_scored['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
            # Utiliser la famille inférée (multifactorielle) pour browser_family
            if 'inferred_browser_family' in all_scored.columns:
                all_scored['browser_family'] = all_scored['inferred_browser_family']
            all_scores_cols = [
                'detected_at', 'window_start', 'src_ip', 'ja4', 'host', 'bot_name',
                'browser_family',
                'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
                'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
                'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
                'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
                'ae_recon_error', 'xgb_prob'
            ]
            scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
            client.insert_df(f'{DB}.ml_all_scores', scores_df)
            log_info(f'[ml_all_scores] {len(scores_df)} sessions scorées enregistrées.')
        except Exception as e:
            log_info(f'[ml_all_scores] ERREUR INSERTION: {e}')
    if not all_anom.empty:
        all_anom = all_anom.sort_values('raw_anomaly_score', ascending=True).drop_duplicates(subset=['src_ip'], keep='first')
        log_info(f'[Dédup] Intra-cycle : {len(all_anom)} IP uniques après déduplication.')
        # A5 — Déduplication inter-cycles avec TTL
        all_anom = _filter_recent_detections(client, all_anom)
        if all_anom.empty:
            log_info('[Dédup] Toutes les anomalies filtrées par TTL — rien à insérer.')
            log_decision('CYCLE_END', cycle_id, '', {'inserted': 0, 'anomalies': 0, 'known_bots': 0, 'critical': 0, 'high': 0, 'dedup_ttl_min': DEDUP_TTL_MIN})
            return
        all_anom['detected_at'] = datetime.now().replace(microsecond=0)
        fake_nav_col = 'is_fake_navigation'
        all_anom['is_headless'] = all_anom[fake_nav_col].astype(int) if fake_nav_col in all_anom.columns else 0
        cols = [
            'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score',
            'raw_anomaly_score', 'campaign_id',
            'threat_level', 'model_name', 'recurrence',
            'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
            'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
            'tcp_jitter_variance', 'tcp_shared_count', 'true_window_size', 'window_mss_ratio',
            'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
            'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
            'modern_browser_score', 'is_headless', 'ua_ch_mismatch',
            'header_order_shared_count', 'ip_id_zero_ratio', 'request_size_variance',
            'multiplexing_efficiency', 'mss_mobile_mismatch',
            'correlated', 'reason', 'asset_ratio', 'direct_access_ratio', 'is_ua_rotating',
            'distinct_ja4_count', 'src_port_density', 'ja4_asn_concentration',
            'ja4_country_concentration', 'is_rare_ja4',
            'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
            'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
            'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
        ]
        try:
            final_df = all_anom[[c for c in cols if c in all_anom.columns]]
            client.insert_df(f'{DB}.ml_detected_anomalies', final_df)
            n_critical = int((final_df.get('threat_level', pd.Series()) == 'CRITICAL').sum())
            n_high = int((final_df.get('threat_level', pd.Series()) == 'HIGH').sum())
            n_medium = int((final_df.get('threat_level', pd.Series()) == 'MEDIUM').sum())
            n_known = int((final_df.get('bot_name', pd.Series()) != '').sum())
            log_info('')
            log_info(f'╔══ RÉSULTAT CYCLE {cycle_id} ══════════════════════════════════')
            log_info(f'║  {len(final_df)} menaces insérées dans ml_detected_anomalies')
            log_info(f'║  CRITICAL={n_critical}  HIGH={n_high}  MEDIUM={n_medium}  KNOWN_BOT={n_known}')
            if not all_scored.empty:
                log_info(f'║  {len(all_scored)} sessions scorées dans ml_all_scores')
            log_info(f'╚═══════════════════════════════════════════════════════════')
            log_decision('CYCLE_END', cycle_id, '', {
                'inserted': len(final_df),
                'anomalies': int((final_df.get('bot_name', pd.Series()) == '').sum()),
                'known_bots': n_known,
                'critical': n_critical,
                'high': n_high,
                'dedup_ttl_min': DEDUP_TTL_MIN,
            })
        except Exception as e:
            log_info(f'ERREUR INSERTION: {e}')
    else:
        log_info('')
        log_info(f'╔══ RÉSULTAT CYCLE {cycle_id} ══════════════════════════════════')
        log_info(f'║  Aucune menace détectée ce cycle.')
        if not all_scored.empty:
            log_info(f'║  {len(all_scored)} sessions scorées dans ml_all_scores')
        tl_dist = all_scored['threat_level'].value_counts().to_dict() if not all_scored.empty and 'threat_level' in all_scored.columns else {}
        if tl_dist:
            log_info(f'║  Distribution : {", ".join(f"{k}={v}" for k, v in sorted(tl_dist.items()))}')
        log_info(f'╚═══════════════════════════════════════════════════════════')
        log_decision('CYCLE_END', cycle_id, '', {'inserted': 0, 'anomalies': 0, 'known_bots': 0, 'critical': 0, 'high': 0, 'dedup_ttl_min': DEDUP_TTL_MIN})
--- a/services/bot-detector/bot_detector/infra.py
+++ b/services/bot-detector/bot_detector/infra.py
@ -0,0 +1,89 @@
 """Infrastructure : client ClickHouse, health check HTTP, arrêt propre.
 Exécute le serveur de santé en thread daemon dès l'import.
 """
 import signal
 import sys
 import threading
 from http.server import HTTPServer, BaseHTTPRequestHandler
 from ja4_common.clickhouse import get_client as _ja4_get_client
 from .config import HEALTH_PORT
 from .log import log_info, log_decision
 # ─── Arrêt propre ───────────────────────────────────────────────────────────
 def _shutdown(sig, frame):
    """Gestionnaire de signal SIGTERM/SIGINT : journalise l'arrêt et quitte proprement."""
    log_info(f"Signal {sig} reçu — arrêt propre.")
    log_decision('SERVICE_STOP', 'shutdown', '', {'signal': sig})
    sys.exit(0)
 signal.signal(signal.SIGTERM, _shutdown)
 signal.signal(signal.SIGINT, _shutdown)
 # ─── Health check ───────────────────────────────────────────────────────────
 _service_healthy = True
 _health_lock = threading.Lock()
 def set_healthy(healthy: bool):
    """Met à jour l'état de santé du service (thread-safe)."""
    global _service_healthy
    with _health_lock:
        _service_healthy = healthy
 def is_healthy() -> bool:
    """Retourne l'état de santé courant."""
    with _health_lock:
        return _service_healthy
 class _HealthHandler(BaseHTTPRequestHandler):
    """Gestionnaire HTTP minimal pour le point de santé du service."""
    def do_GET(self):
        """Répond à la requête GET : renvoie 200 OK ou 503 DEGRADED selon l'état du service."""
        healthy = is_healthy()
        code = 200 if healthy else 503
        self.send_response(code)
        self.end_headers()
        self.wfile.write(b'OK' if healthy else b'DEGRADED')
    def log_message(self, *args):
        """Supprime les logs HTTP internes pour ne pas polluer la sortie standard."""
        pass
 threading.Thread(
    target=lambda: HTTPServer(('', HEALTH_PORT), _HealthHandler).serve_forever(),
    daemon=True
 ).start()
 # ─── Client ClickHouse ──────────────────────────────────────────────────────
 def get_client():
    """Return the shared ja4_common ClickHouse client, reconnecting on ping failure."""
    return _ja4_get_client().connect()
 def score_to_threat_level(score: float) -> str:
    """Convertit un score d'anomalie brut IsolationForest en niveau de menace textuel.
    Seuils : CRITICAL < −0.30 | HIGH < −0.15 | MEDIUM < −0.05 | LOW < 0 | NORMAL ≥ 0.
    """
    if score < -0.30:
        return 'CRITICAL'
    if score < -0.15:
        return 'HIGH'
    if score < -0.05:
        return 'MEDIUM'
    if score < 0:
        return 'LOW'
    return 'NORMAL'
--- a/services/bot-detector/bot_detector/log.py
+++ b/services/bot-detector/bot_detector/log.py
@ -0,0 +1,65 @@
 """Logging et journalisation des décisions IA.
 Fournit log_info() et log_decision() utilisés par tous les modules.
 """
 import os
 import json
 import logging
 from datetime import datetime
 from logging.handlers import RotatingFileHandler
 from .config import (
    LOG_FILE, LOG_BACKUP_COUNT, TRAINING_HISTORY_FILE,
    CONTAMINATION, ANOMALY_THRESHOLD, MODEL_DIR,
 )
 # ─── Initialisation des répertoires ─────────────────────────────────────────
 os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
 os.makedirs(MODEL_DIR, exist_ok=True)
 # ─── Logger principal ───────────────────────────────────────────────────────
 logger = logging.getLogger('bot_detector')
 logger.setLevel(logging.DEBUG)
 _console_handler = logging.StreamHandler()
 _console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s', '%Y-%m-%d %H:%M:%S'))
 logger.addHandler(_console_handler)
 _file_handler = RotatingFileHandler(
    LOG_FILE, maxBytes=50 * 1024 * 1024, backupCount=LOG_BACKUP_COUNT, encoding='utf-8'
 )
 _file_handler.setFormatter(logging.Formatter('%(message)s'))
 logger.addHandler(_file_handler)
 def log_info(message: str):
    """Enregistre un message de niveau INFO dans le logger du service."""
    logger.info(message)
 def log_decision(event: str, cycle_id: str, model: str = '', row: dict = None):
    """Enregistre un événement de décision IA au format JSONL dans le fichier de log rotatif.
    Chaque ligne contient l'horodatage, le cycle_id, l'événement, le modèle,
    la contamination, le seuil et les données supplémentaires de ``row``.
    """
    entry = {
        'ts': datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
        'cycle_id': cycle_id,
        'event': event,
        'model': model,
        'contamination': CONTAMINATION,
        'threshold': ANOMALY_THRESHOLD,
    }
    if row:
        entry.update(row)
    _file_handler.stream.write(json.dumps(entry, ensure_ascii=False, default=str) + '\n')
    _file_handler.stream.flush()
 def append_training_history(entry: dict):
    """Ajoute une entrée de métadonnées d'entraînement au fichier d'historique JSONL."""
    with open(TRAINING_HISTORY_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(entry, ensure_ascii=False, default=str) + '\n')
--- a/services/bot-detector/bot_detector/models.py
+++ b/services/bot-detector/bot_detector/models.py
@ -0,0 +1,478 @@
 """Gestion des modèles : chargement, entraînement, versionnement.
 IsolationForest (EIF), Autoencoder (PyTorch), XGBoost supervisé.
 """
 import os
 import json
 import glob
 import joblib
 import numpy as np
 import pandas as pd
 from datetime import datetime
 from .config import (
    MODEL_DIR, MODEL_HISTORY_COUNT, RETRAIN_INTERVAL_H, DRIFT_THRESHOLD,
    N_ESTIMATORS, CONTAMINATION, ANOMALY_THRESHOLD, AE_WEIGHT, AE_EPOCHS, AE_LATENT_DIM,
    AE_LEARNING_RATE, XGB_WEIGHT, XGB_MIN_LABELS, XGB_RETRAIN_INTERVAL_H,
    EIF_AVAILABLE, TORCH_AVAILABLE, XGB_AVAILABLE, DB,
    IsolationForest, StandardScaler,
 )
 from .log import log_info, log_decision, append_training_history
 from .scoring import compute_drift_score
 # Imports conditionnels depuis config (déjà importés une seule fois)
 if EIF_AVAILABLE:
    from .config import ExtendedIsolationForest
 if TORCH_AVAILABLE:
    from .config import torch, nn
 if XGB_AVAILABLE:
    import xgboost as xgb
 # ─── Caches de modèles ─────────────────────────────────────────────────────
 _model_cache: dict = {}
 _xgb_cache: dict = {}
 # ═══════════════════════════════════════════════════════════════════════════════
 # GESTION DES MODÈLES
 # ═══════════════════════════════════════════════════════════════════════════════
 def _current_pointer_path(name: str) -> str:
    """Retourne le chemin du fichier pointeur vers la version courante du modèle ``name``."""
    return os.path.join(MODEL_DIR, f'model_{name}.current')
 def _get_current_version(name: str):
    """Lit le fichier pointeur et retourne (chemin_modèle, métadonnées) ou (None, None) si absent."""
    pointer = _current_pointer_path(name)
    if not os.path.exists(pointer): return None, None
    with open(pointer) as f: version_id = f.read().strip()
    model_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.joblib')
    meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
    if not os.path.exists(model_path) or not os.path.exists(meta_path): return None, None
    with open(meta_path) as f: meta = json.load(f)
    return model_path, meta
 def _purge_old_versions(name: str):
    """Supprime les versions excédentaires du modèle ``name`` en ne conservant que MODEL_HISTORY_COUNT fichiers."""
    pattern = os.path.join(MODEL_DIR, f'model_{name}_*.joblib')
    versions = sorted(glob.glob(pattern))
    to_delete = versions[:-MODEL_HISTORY_COUNT] if len(versions) > MODEL_HISTORY_COUNT else []
    for joblib_path in to_delete:
        version_id = os.path.basename(joblib_path).replace(f'model_{name}_', '').replace('.joblib', '')
        meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
        os.remove(joblib_path)
        if os.path.exists(meta_path): os.remove(meta_path)
        log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
 # ═══════════════════════════════════════════════════════════════════════════════
 # AUTOENCODER — Second scorer parallèle (détection d'anomalies par reconstruction)
 # ═══════════════════════════════════════════════════════════════════════════════
 class TrafficAutoEncoder:
    """Autoencoder symétrique pour détection d'anomalies par erreur de reconstruction.
    Architecture : encoder (n→64→32→latent_dim) — decoder (latent_dim→32→64→n)
    Activation : ReLU + BatchNorm (encoder/decoder), sigmoid (sortie — données normalisées [0,1])
    Score = MSE(input, reconstruction) par échantillon.
    L'espace latent (16-dim par défaut) peut servir de features compressées pour HDBSCAN.
    """
    def __init__(self, n_features: int, latent_dim: int = AE_LATENT_DIM):
        if not TORCH_AVAILABLE:
            raise RuntimeError("PyTorch non disponible — autoencoder désactivé.")
        self.n_features = n_features
        self.latent_dim = latent_dim
        self.device = torch.device('cpu')
        self._build_model()
        self._scaler_min = None
        self._scaler_range = None
    def _build_model(self):
        dim1 = min(64, max(self.n_features, self.latent_dim + 4))
        dim2 = min(32, max(dim1 // 2, self.latent_dim + 2))
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, self.latent_dim),
        ).to(self.device)
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, dim2), nn.BatchNorm1d(dim2), nn.ReLU(),
            nn.Linear(dim2, dim1), nn.BatchNorm1d(dim1), nn.ReLU(),
            nn.Linear(dim1, self.n_features), nn.Sigmoid(),
        ).to(self.device)
        self._all_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
    def _to_tensor(self, X: np.ndarray) -> 'torch.Tensor':
        """Normalise [0,1] via min-max puis convertit en Tensor."""
        if self._scaler_min is not None:
            X_norm = (X - self._scaler_min) / (self._scaler_range + 1e-9)
        else:
            X_norm = X
        return torch.tensor(np.clip(X_norm, 0, 1), dtype=torch.float32, device=self.device)
    def fit(self, X: np.ndarray, epochs: int = AE_EPOCHS, lr: float = AE_LEARNING_RATE,
            batch_size: int = 256) -> dict:
        """Entraîne l'autoencoder sur la baseline humaine (données normales)."""
        self._scaler_min = X.min(axis=0)
        self._scaler_range = X.max(axis=0) - self._scaler_min
        X_t = self._to_tensor(X)
        dataset = torch.utils.data.TensorDataset(X_t)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.Adam(self._all_params, lr=lr, weight_decay=1e-5)
        criterion = nn.MSELoss()
        self.encoder.train()
        self.decoder.train()
        losses = []
        for epoch in range(epochs):
            epoch_loss = 0.0
            for (batch,) in loader:
                latent = self.encoder(batch)
                recon = self.decoder(latent)
                loss = criterion(recon, batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * len(batch)
            losses.append(epoch_loss / len(X_t))
        return {'final_loss': losses[-1], 'epochs': epochs, 'n_samples': len(X)}
    def score_samples(self, X: np.ndarray) -> np.ndarray:
        """Retourne l'erreur de reconstruction MSE par échantillon (plus élevé = plus anomal)."""
        self.encoder.eval()
        self.decoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            latent = self.encoder(X_t)
            recon = self.decoder(latent)
            mse = ((recon - X_t) ** 2).mean(dim=1).numpy()
        return mse
    def encode(self, X: np.ndarray) -> np.ndarray:
        """Retourne l'espace latent (pour HDBSCAN clustering)."""
        self.encoder.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            return self.encoder(X_t).numpy()
    def state_dict(self) -> dict:
        return {
            'encoder': self.encoder.state_dict(),
            'decoder': self.decoder.state_dict(),
            'scaler_min': self._scaler_min,
            'scaler_range': self._scaler_range,
            'n_features': self.n_features,
            'latent_dim': self.latent_dim,
        }
    @classmethod
    def load_state_dict(cls, state: dict) -> 'TrafficAutoEncoder':
        ae = cls(state['n_features'], state['latent_dim'])
        ae._scaler_min = state['scaler_min']
        ae._scaler_range = state['scaler_range']
        ae.encoder.load_state_dict(state['encoder'])
        ae.decoder.load_state_dict(state['decoder'])
        return ae
 def _ae_model_path(name: str, version_id: str) -> str:
    return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
 # ═══════════════════════════════════════════════════════════════════════════════
 # XGBOOST — Troisième voix supervisée (labels historiques + feedback SOC)
 # ═══════════════════════════════════════════════════════════════════════════════
 def _xgb_model_path(name: str) -> str:
    return os.path.join(MODEL_DIR, f'xgb_{name}.json')
 def _xgb_meta_path(name: str) -> str:
    return os.path.join(MODEL_DIR, f'xgb_{name}.meta.json')
 def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple:
    """Charge les labels historiques depuis ml_all_scores + view_ai_features_1h.
    Les labels (threat_level) viennent de ml_all_scores, les features de
    view_ai_features_1h via une jointure sur (src_ip, ja4, host).
    Les features absentes de la vue (ex: thesis §5 features) sont ignorées.
    Positifs : threat_level IN ('HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') → label=1
    Négatifs : threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER') → label=0
    Retourne (X, y, usable_features) ou (None, None, None) si insuffisant.
    """
    try:
        # Découvrir les colonnes disponibles dans la vue
        cols_result = client.query(
            f"SELECT name FROM system.columns "
            f"WHERE database = '{DB}' AND table = 'view_ai_features_1h'"
        )
        available_cols = {row[0] for row in cols_result.result_rows} if cols_result.result_rows else set()
        usable_features = [f for f in features if f in available_cols]
        if len(usable_features) < 10:
            log_info(f"[XGB] Seulement {len(usable_features)} features disponibles dans view_ai_features_1h — insuffisant.")
            return None, None, None
        feature_cols = ', '.join(f'f.{c}' for c in usable_features)
        result = client.query(
            f"SELECT {feature_cols}, s.threat_level "
            f"FROM {DB}.ml_all_scores AS s "
            f"INNER JOIN {DB}.view_ai_features_1h AS f "
            f"  ON s.src_ip = f.src_ip AND s.ja4 = f.ja4 AND s.host = f.host "
            f"WHERE s.threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
            f"AND s.window_start >= now() - INTERVAL 7 DAY "
            f"ORDER BY rand() LIMIT 50000"
        )
        if not result.result_rows:
            return None, None, None
        cols = usable_features + ['threat_level']
        df = pd.DataFrame(result.result_rows, columns=cols)
        df[usable_features] = df[usable_features].apply(pd.to_numeric, errors='coerce')
        df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=usable_features)
        y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
        if y.sum() < 10 or len(y) < min_labels:
            return None, None, None
        X = df[usable_features].values
        return X, y.values, usable_features
    except Exception as exc:
        log_info(f"[XGB] Erreur chargement labels : {exc}")
        return None, None, None
 def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    """Charge ou entraîne le modèle XGBoost supervisé.
    Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible.
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None, None
    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)
    # Charger le modèle existant si récent
    if os.path.exists(model_path) and os.path.exists(meta_path):
        try:
            with open(meta_path) as f:
                meta = json.load(f)
            trained_at = datetime.fromisoformat(meta['trained_at'])
            age_h = (datetime.now() - trained_at).total_seconds() / 3600
            if age_h < XGB_RETRAIN_INTERVAL_H:
                model = xgb.XGBClassifier()
                model.load_model(model_path)
                log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
                return model, meta.get('features', features)
        except Exception as exc:
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
    # Entraîner un nouveau modèle
    X, y, xgb_features = _load_xgb_labels(client, features)
    if X is None:
        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
        # Tenter de réutiliser un modèle ancien
        if os.path.exists(model_path) and os.path.exists(meta_path):
            try:
                model = xgb.XGBClassifier()
                model.load_model(model_path)
                with open(meta_path) as f:
                    meta = json.load(f)
                return model, meta.get('features', features)
            except Exception:
                pass
        return None, None
    scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        scale_pos_weight=scale_pos, eval_metric='logloss',
        random_state=42, n_jobs=-1,
        tree_method='hist',
    )
    model.fit(X, y, verbose=False)
    model.save_model(model_path)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': int(y.sum()),
        'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
        'features': xgb_features,
        'scale_pos_weight': scale_pos, 'model_name': name,
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)
    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
    log_decision('XGB_TRAINED', cycle_id, name, meta)
    return model, xgb_features
 def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
    """Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
    Réutilise le modèle si son âge est inférieur à RETRAIN_INTERVAL_H et si aucune
    dérive conceptuelle significative n'est détectée (A1). En cas d'expiration ou de
    dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
    met à jour le fichier pointeur et purge les anciennes versions.
    Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features).
    """
    model_path, meta = _get_current_version(name)
    if model_path and meta:
        trained_at = datetime.fromisoformat(meta['trained_at'])
        age_h = (datetime.now() - trained_at).total_seconds() / 3600
        age_ok = age_h < RETRAIN_INTERVAL_H
        # A1 — Dérive conceptuelle : comparer la distribution actuelle avec celle de l'entraînement
        drift_score = 0.0
        drift_forced = False
        if age_ok and 'baseline_stats' in meta:
            drift_score = compute_drift_score(meta['baseline_stats'], human_baseline, features)
            if drift_score >= DRIFT_THRESHOLD:
                drift_forced = True
                log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
                log_decision('DRIFT_DETECTED', cycle_id, name, {
                    'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
                    'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
                })
        if age_ok and not drift_forced:
            log_info(f"[{name}] Modèle v{meta['version_id']} valide ({age_h:.1f}h / {RETRAIN_INTERVAL_H}h, drift={drift_score:.0%}) — réutilisation.")
            log_decision('MODEL_LOADED', cycle_id, name, {
                'version_id': meta['version_id'], 'model_age_hours': round(age_h, 2),
                'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
                'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
            })
            ae_loaded = None
            if TORCH_AVAILABLE and AE_WEIGHT > 0:
                ae_path = _ae_model_path(name, meta['version_id'])
                if os.path.exists(ae_path):
                    try:
                        ae_loaded = TrafficAutoEncoder.load_state_dict(torch.load(ae_path, weights_only=False))
                        log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
                    except Exception as exc:
                        log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
            return joblib.load(model_path), ae_loaded, meta.get('features', features)
        elif not drift_forced:
            log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
    version_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_info(f"[{name}] Entraînement EIF v{version_id} — {len(human_baseline)} sessions ISP, {len(features)} features, contamination={CONTAMINATION}")
    X = human_baseline[features].replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    # Feature pruning : retirer les features à variance quasi-nulle (inutiles pour les arbres)
    PRUNE_VARIANCE_THRESHOLD = float(os.getenv('PRUNE_VARIANCE_THRESHOLD', '1e-6'))
    feature_variances = X.var()
    low_var_features = feature_variances[feature_variances < PRUNE_VARIANCE_THRESHOLD].index.tolist()
    if low_var_features:
        log_info(f"[{name}] Élagage : {len(low_var_features)} feature(s) à variance < {PRUNE_VARIANCE_THRESHOLD} retirées : {low_var_features}")
        X = X.drop(columns=low_var_features)
        features = [f for f in features if f not in low_var_features]
        log_decision('FEATURE_PRUNED', cycle_id, name, {'pruned': low_var_features, 'remaining': len(features)})
    # Validation split : réserver 20% pour évaluation offline
    val_size = max(1, int(len(X) * 0.2))
    X_train = X.iloc[:-val_size]
    X_val = X.iloc[-val_size:]
    if EIF_AVAILABLE:
        model = ExtendedIsolationForest(
            ntrees=300, ndim=min(3, len(features)),
            sample_size='auto', missing_action='impute',
            random_seed=42, nthreads=-1
        )
    else:
        model = IsolationForest(n_estimators=300, contamination=CONTAMINATION, random_state=42, n_jobs=-1)
    model.fit(X_train)
    # Évaluation offline : score moyen sur la validation (devrait être > 0 pour du trafic humain sklearn)
    val_scores = model.decision_function(X_val)
    # Unifier la convention : négatif = anomal (isotree: 0.5 - score)
    if EIF_AVAILABLE:
        val_scores = 0.5 - val_scores
    val_mean_score = float(np.mean(val_scores))
    val_anomaly_rate = float(np.mean(val_scores < 0))
    log_info(f"[{name}] Validation : score moyen={val_mean_score:.4f}, taux anomalie={val_anomaly_rate:.2%} ({len(X_val)} échantillons)")
    # GATE CONDITION : rejeter le modèle si la baseline semble contaminée
    VAL_ANOMALY_GATE = float(os.getenv('VAL_ANOMALY_GATE', '0.20'))
    if val_anomaly_rate > VAL_ANOMALY_GATE:
        log_info(f"[{name}] ⚠ REJET : val_anomaly_rate={val_anomaly_rate:.2%} > gate={VAL_ANOMALY_GATE:.0%} — baseline probablement contaminée.")
        log_decision('MODEL_REJECTED', cycle_id, name, {
            'val_anomaly_rate': round(val_anomaly_rate, 4), 'gate': VAL_ANOMALY_GATE,
            'val_mean_score': round(val_mean_score, 4), 'version_id': version_id,
        })
        # Tenter de réutiliser le modèle précédent
        if model_path and os.path.exists(model_path):
            log_info(f"[{name}] Conservation du modèle précédent v{meta.get('version_id', '?')}.")
            ae_prev = None
            if TORCH_AVAILABLE and AE_WEIGHT > 0:
                ae_prev_path = _ae_model_path(name, meta.get('version_id', ''))
                if os.path.exists(ae_prev_path):
                    try:
                        ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
                    except Exception:
                        pass
            return joblib.load(model_path), ae_prev, meta.get('features', features)
        log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
    # A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
    baseline_stats = {
        f: {
            'mean': float(X[f].mean()), 'std': float(X[f].std()),
            'p10': float(X[f].quantile(0.10)), 'p25': float(X[f].quantile(0.25)),
            'p50': float(X[f].quantile(0.50)), 'p75': float(X[f].quantile(0.75)),
            'p90': float(X[f].quantile(0.90)),
        }
        for f in features
    }
    new_model_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.joblib')
    new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
    joblib.dump(model, new_model_path)
    # Entraînement de l'Autoencoder en parallèle (si PyTorch disponible et AE_WEIGHT > 0)
    ae_model = None
    if TORCH_AVAILABLE and AE_WEIGHT > 0:
        try:
            ae_model = TrafficAutoEncoder(n_features=len(features))
            ae_stats = ae_model.fit(X_train.values)
            ae_path = _ae_model_path(name, version_id)
            torch.save(ae_model.state_dict(), ae_path)
            log_info(f"[{name}] Autoencoder entraîné : loss={ae_stats['final_loss']:.6f}, epochs={ae_stats['epochs']}")
        except Exception as exc:
            log_info(f"[{name}] Autoencoder training échoué : {exc} — AE désactivé.")
            ae_model = None
    previous_version = meta.get('version_id', None) if meta else None
    new_meta = {
        'version_id': version_id, 'trained_at': datetime.now().isoformat(),
        'human_samples': len(human_baseline), 'contamination': CONTAMINATION,
        'threshold': ANOMALY_THRESHOLD, 'features': features,
        'model_name': name, 'previous_version': previous_version,
        'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
        'algorithm': 'ExtendedIsolationForest' if EIF_AVAILABLE else 'IsolationForest',
        'autoencoder': ae_model is not None,
        'ae_weight': AE_WEIGHT if ae_model else 0.0,
        'validation': {
            'val_size': len(X_val), 'train_size': len(X_train),
            'val_mean_score': round(val_mean_score, 4),
            'val_anomaly_rate': round(val_anomaly_rate, 4),
        }
    }
    with open(new_meta_path, 'w') as f: json.dump(new_meta, f, indent=2)
    with open(_current_pointer_path(name), 'w') as f: f.write(version_id)
    append_training_history({k: v for k, v in new_meta.items() if k != 'baseline_stats'})
    _purge_old_versions(name)
    log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path} (AE={'oui' if ae_model is not None else 'non'})")
    log_decision('MODEL_TRAINED', cycle_id, name, {
        'version_id': version_id, 'previous_version': previous_version,
        'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
        'history_kept': MODEL_HISTORY_COUNT
    })
    return model, ae_model, features
--- a/services/bot-detector/bot_detector/pipeline.py
+++ b/services/bot-detector/bot_detector/pipeline.py
@ -0,0 +1,353 @@
 """Pipeline de détection semi-supervisée.
 Fonction principale run_semi_supervised_logic() : triage, scoring EIF, AE, XGB,
 classification navigateur, détection d'anomalies, SHAP et clustering.
 """
 import numpy as np
 import pandas as pd
 from .config import (
    DB, CONTAMINATION, AE_WEIGHT, XGB_WEIGHT, RECURRENCE_WEIGHT,
    ANOMALY_THRESHOLD, ANOMALY_PERCENTILE, ENABLE_CLUSTERING,
    ENABLE_SHAP, EIF_AVAILABLE, TORCH_AVAILABLE, XGB_AVAILABLE,
    BROWSER_CONFIDENCE_THRESHOLD, BROWSER_COHORT_RATIO,
    MIN_VALID_FEATURE_RATIO, STRUCTURAL_EXCLUDED_FEATURES,
 )
 from .log import log_info, log_decision
 from .infra import score_to_threat_level, get_client
 from .models import load_or_train_model, load_or_train_xgb, TrafficAutoEncoder
 from .scoring import (
    validate_features, compute_adaptive_threshold, normalize_scores,
    compute_shap_top_features, build_reason, cluster_anomalies,
 )
 # ═══════════════════════════════════════════════════════════════════════════════
 # ANALYSE SEMI-SUPERVISÉE
 # ═══════════════════════════════════════════════════════════════════════════════
 def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    """Applique le pipeline de détection semi-supervisée sur un sous-ensemble du trafic.
    Trifurque le trafic en bots connus, bots Anubis ALLOW et trafic inconnu,
    entraîne ou charge le modèle IsolationForest sur la baseline humaine,
    score le trafic inconnu, applique les améliorations A2/A4/A6/A8,
    et retourne (threats, all_scored) sous forme de DataFrames.
    Effets de bord : écriture dans les logs de décision via log_decision.
    """
    # 1. Bots connus (dict_bot_ip / dict_bot_ja4) → exclus du scoring IF
    known_bots = df[df['bot_name'] != ''].copy()
    rest = df[df['bot_name'] == ''].copy()
    # 2. Bots Anubis ALLOW → bots légitimes, exclus du scoring IF
    anubis_allow = rest[rest['anubis_bot_action'] == 'ALLOW'].copy()
    # 3. Tout le reste passe par l'IsolationForest pour un score réel :
    #    - DENY  : menaces identifiées par règles Anubis → IF donne le score de sévérité
    #    - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
    #    Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
    unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
    human_baseline  = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
    log_info(f'[{name}] ── Triage ──────────────────────────────────────')
    log_info(f'[{name}]   Total sessions       : {len(df):>6}')
    log_info(f'[{name}]   Bots connus (dict)   : {len(known_bots):>6}')
    log_info(f'[{name}]   Anubis ALLOW         : {len(anubis_allow):>6}')
    log_info(f'[{name}]   Trafic à scorer (IF) : {len(unknown_traffic):>6}')
    log_info(f'[{name}]   Baseline ISP (human) : {len(human_baseline):>6}  (seuil min=500)')
    # A7 — Valider les features avant tout traitement
    valid_features = validate_features(df, features, name, cycle_id)
    if valid_features is None:
        return pd.DataFrame(), pd.DataFrame()
    if len(human_baseline) < 500:
        log_info(f"[{name}] ⚠ Données humaines insuffisantes ({len(human_baseline)} < 500) — cycle ignoré.")
        log_info(f"[{name}]   Distribution asn_label dans le trafic à scorer :")
        if 'asn_label' in unknown_traffic.columns:
            for label, cnt in unknown_traffic['asn_label'].value_counts().head(8).items():
                log_info(f"[{name}]     {label:>15} : {cnt}")
        log_decision('SKIPPED_LOW_DATA', cycle_id, name, {
            'human_count': len(human_baseline), 'unknown_count': len(unknown_traffic)
        })
        return pd.DataFrame(), pd.DataFrame()
    log_info(f'[{name}] ── Modèle EIF ─────────────────────────────────')
    log_info(f'[{name}]   Features validées : {len(valid_features)}/{len(features)} ({", ".join(valid_features[:5])}{"…" if len(valid_features) > 5 else ""})')
    # A1 — Dérive conceptuelle intégrée dans load_or_train_model
    model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id)
    # Utiliser les features du modèle (possiblement différentes après pruning/chargement)
    scoring_features = [f for f in model_features if f in unknown_traffic.columns]
    unknown_traffic = unknown_traffic.copy()
    X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
    X_test = X_test.fillna(X_test.median())
    raw_scores = model.decision_function(X_test)
    # isotree renvoie des scores dans [0, 1] : 0.5 = frontière, >0.5 = anomal
    # sklearn renvoie des scores centrés sur 0 : <0 = anomal, >0 = normal
    # Conversion : sklearn_equiv = 0.5 - isotree_score
    #   isotree 0.8 → -0.3 (CRITICAL)  |  isotree 0.5 → 0.0 (frontière)
    #   isotree 0.3 → +0.2 (NORMAL)
    if EIF_AVAILABLE:
        raw_scores = 0.5 - raw_scores
    log_info(f'[{name}]   Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')
    # Combinaison EIF + Autoencoder si disponible
    # Score final = (1-α) * eif_norm + α * ae_norm   où α = AE_WEIGHT
    if ae_model is not None and AE_WEIGHT > 0:
        try:
            ae_recon_errors = ae_model.score_samples(X_test.values)
            ae_norm = normalize_scores(-ae_recon_errors)  # plus élevé = plus anomal
            eif_norm = normalize_scores(raw_scores)
            combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm
            unknown_traffic['ae_recon_error'] = ae_recon_errors
            unknown_traffic['anomaly_score'] = combined_norm
            log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}")
        except Exception as exc:
            log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.")
            unknown_traffic['ae_recon_error'] = 0.0
            unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    else:
        unknown_traffic['ae_recon_error'] = 0.0
        unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    # raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
    unknown_traffic['raw_anomaly_score'] = raw_scores
    unknown_traffic['model_name'] = name
    # XGBoost supervisé — troisième voix (si labels historiques disponibles)
    unknown_traffic['xgb_prob'] = 0.0
    if XGB_AVAILABLE and XGB_WEIGHT > 0:
        try:
            xgb_client = get_client()
            xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
            if xgb_model is not None and xgb_feats is not None:
                # XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue)
                xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
                X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
                xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
                unknown_traffic['xgb_prob'] = xgb_probs
                # Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
                # anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
                α_xgb = XGB_WEIGHT
                unknown_traffic['anomaly_score'] = (
                    (1 - α_xgb) * unknown_traffic['anomaly_score'] + α_xgb * xgb_probs
                )
                log_info(f"[{name}] Score combiné EIF+AE+XGB (β={α_xgb}): xgb_mean={xgb_probs.mean():.4f}")
        except Exception as exc:
            log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
    # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
    effective_threshold = compute_adaptive_threshold(raw_scores)
    log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")
    # A6 — Pénaliser les IPs récurrentes sur le score BRUT avant comparaison au seuil
    if RECURRENCE_WEIGHT > 0:
        recurrences = unknown_traffic['src_ip'].map(recurrence_map).fillna(0)
        penalty = np.log1p(recurrences.values) * RECURRENCE_WEIGHT
        unknown_traffic['raw_anomaly_score'] = unknown_traffic['raw_anomaly_score'] - penalty
    # Assigner threat_level à TOUTES les sessions scorées (pour ml_all_scores)
    unknown_traffic['threat_level'] = unknown_traffic['raw_anomaly_score'].apply(score_to_threat_level)
    unknown_traffic['recurrence'] = unknown_traffic['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
    unknown_traffic['campaign_id'] = -1
    # Extraire les DENY (maintenant avec leur vrai score IF) et forcer leur threat_level
    deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
    unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'
    # ── A9 — Classification multifactorielle des navigateurs légitimes ─────────
    # 5 axes indépendants : JA4 connu, structure JA4, headers HTTP modernes,
    # comportement de navigation, cohérence TLS/TCP.
    # browser_confidence [0..1] = combinaison pondérée des 5 axes.
    # La classification n'exige plus que le JA4 soit dans le dictionnaire statique :
    # un JA4 inconnu avec des signaux HTTP/TLS/nav forts sera quand même reconnu.
    _bconf = unknown_traffic.get('browser_confidence', pd.Series(0, index=unknown_traffic.index)).fillna(0)
    _ibf = unknown_traffic.get('inferred_browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str)
    browser_legit_mask = (
        (_bconf >= BROWSER_CONFIDENCE_THRESHOLD) &                  # confiance multifactorielle
        (_ibf != '') &                                              # famille identifiée (dict ou inférée)
        (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & # pas de menace IF
        (~deny_mask)                                                # pas un DENY Anubis
    )
    # Propagation par cohorte JA4 : si ≥ BROWSER_COHORT_RATIO% des sessions d'un JA4
    # sont déjà classées navigateur, propager aux sessions restantes du même JA4.
    if browser_legit_mask.any():
        ja4_col = unknown_traffic['ja4']
        legit_per_ja4 = ja4_col[browser_legit_mask].value_counts()
        total_per_ja4 = ja4_col.value_counts()
        ratio_per_ja4 = (legit_per_ja4 / total_per_ja4).dropna()
        cohort_ja4s = set(ratio_per_ja4[ratio_per_ja4 >= BROWSER_COHORT_RATIO].index)
        if cohort_ja4s:
            cohort_mask = (
                ja4_col.isin(cohort_ja4s) &
                (~browser_legit_mask) &                                     # pas déjà classé
                (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) &
                (~deny_mask)
            )
            browser_legit_mask = browser_legit_mask | cohort_mask
            n_cohort = cohort_mask.sum()
            if n_cohort > 0:
                log_info(f"[{name}] Propagation cohorte JA4 : {n_cohort} sessions supplémentaires ({len(cohort_ja4s)} JA4)")
    if browser_legit_mask.any():
        unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER'
        # Utiliser la famille inférée (dict ou structurel)
        _family_disp = _ibf[browser_legit_mask].where(_ibf[browser_legit_mask] != '', 'Unknown')
        unknown_traffic.loc[browser_legit_mask, 'reason'] = (
            '[Navigateur légitime] ' + _family_disp +
            ' (confiance=' + _bconf[browser_legit_mask].round(2).astype(str) + ')'
        )
        n_legit = browser_legit_mask.sum()
        families = _ibf[browser_legit_mask].value_counts().to_dict()
        # Log des axes moyens pour diagnostic
        ax_means = {}
        for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
                    'axis_nav_behavior', 'axis_tls_coherence']:
            col = unknown_traffic.get(ax, None)
            if col is not None:
                ax_means[ax.replace('axis_', '')] = round(float(col[browser_legit_mask].mean()), 3)
        log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}")
        log_info(f"[{name}]   Axes moyens : {ax_means}")
        log_decision('LEGITIMATE_BROWSER', cycle_id, name, {
            'count': int(n_legit), 'families': families,
            'mean_confidence': round(float(_bconf[browser_legit_mask].mean()), 3),
            'axis_means': ax_means,
        })
    # Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores
    all_scored = unknown_traffic.copy()
    if not known_bots.empty:
        known_bots = known_bots.copy()
        known_bots['anomaly_score'] = 0.0
        known_bots['raw_anomaly_score'] = 0.0
        known_bots['ae_recon_error'] = 0.0
        known_bots['xgb_prob'] = 0.0
        known_bots['threat_level'] = 'KNOWN_BOT'
        known_bots['model_name'] = name
        known_bots['campaign_id'] = -1
        known_bots['reason'] = '[Identification] Bot légitime: ' + known_bots['bot_name']
        known_bots['recurrence'] = known_bots['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in known_bots.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('bot_name', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1))
            })
    # ── Anubis ALLOW : bots légitimes identifiés par règles Anubis ───────────
    if not anubis_allow.empty:
        anubis_allow = anubis_allow.copy()
        anubis_allow['anomaly_score'] = 0.0
        anubis_allow['raw_anomaly_score'] = 0.0
        anubis_allow['ae_recon_error'] = 0.0
        anubis_allow['xgb_prob'] = 0.0
        anubis_allow['threat_level'] = 'KNOWN_BOT'
        anubis_allow['bot_name'] = anubis_allow['anubis_bot_name']
        anubis_allow['model_name'] = name
        anubis_allow['campaign_id'] = -1
        anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name']
        anubis_allow['recurrence'] = anubis_allow['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in anubis_allow.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })
    # ── Anubis DENY : scorés par IF, toujours inclus dans les threats ────────
    # Extraits de unknown_traffic après scoring — ils ont leur vrai score IF.
    anubis_deny = unknown_traffic[deny_mask].copy()
    if not anubis_deny.empty:
        anubis_deny['reason'] = '[Anubis DENY] ' + anubis_deny['anubis_bot_name'].fillna('') + \
            ' | ' + anubis_deny['raw_anomaly_score'].apply(lambda s: f'IF={s:.4f}')
        log_info(f"[{name}] Anubis DENY: {len(anubis_deny)} IP(s) scorées par IF "
                 f"(score moyen: {anubis_deny['raw_anomaly_score'].mean():.4f}).")
        for _, row in anubis_deny.iterrows():
            log_decision('ANUBIS_DENY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })
    # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY et non-navigateur légitime
    # Les DENY sont toujours des threats, indépendamment du seuil IF
    # Les LEGITIMATE_BROWSER sont exclus des anomalies (navigateurs confirmés)
    non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')]
    anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
    if not anomalies.empty:
        log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
        anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        # A4 — Explainabilité SHAP : top features responsables de chaque anomalie
        X_anomalies = X_test.loc[anomalies.index]
        shap_tops = compute_shap_top_features(model, X_anomalies, valid_features)
        anomalies['reason'] = [
            build_reason(name, row, shap)
            for (_, row), shap in zip(anomalies.iterrows(), shap_tops)
        ]
        # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
        if ENABLE_CLUSTERING:
            anomalies = cluster_anomalies(anomalies, scoring_features, ae_model=ae_model)
        anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
        for _, row in anomalies.iterrows():
            log_decision('ANOMALY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'threat_level': row.get('threat_level', ''), 'recurrence': int(row.get('recurrence', 1)),
                'hit_velocity': round(float(row.get('hit_velocity', 0)), 2),
                'fuzzing_index': round(float(row.get('fuzzing_index', 0)), 2),
                'post_ratio': round(float(row.get('post_ratio', 0)), 3),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_detail': row.get('asn_detail', ''), 'asn_domain': row.get('asn_domain', ''),
                'country_code': row.get('country_code', ''), 'asn_label': row.get('asn_label', ''),
                'ja4': row.get('ja4', ''), 'host': row.get('host', ''),
                'correlated': int(row.get('correlated', 0)), 'campaign_id': int(row.get('campaign_id', -1)),
                'effective_threshold': round(effective_threshold, 4), 'reason': row.get('reason', '')
            })
    threats = pd.concat([df for df in [
        anomalies      if not anomalies.empty      else None,
        known_bots     if not known_bots.empty     else None,
        anubis_allow   if not anubis_allow.empty   else None,
        anubis_deny    if not anubis_deny.empty    else None,
    ] if df is not None], ignore_index=True)
    # Inclure anubis_allow dans all_scored pour traçabilité dans ml_all_scores.
    # Ces IPs sont exclues de l'analyse IF mais doivent apparaître dans la table
    # de scores avec threat_level='KNOWN_BOT' et anomaly_score=0.0.
    if not anubis_allow.empty:
        all_scored = pd.concat([all_scored, anubis_allow], ignore_index=True)
    # ── Résumé du modèle ─────────────────────────────────────────────────────
    n_threats = len(threats) if not threats.empty else 0
    n_anomalies = len(anomalies) if not anomalies.empty else 0
    n_legit_browser = int(browser_legit_mask.sum()) if browser_legit_mask is not None else 0
    n_deny = len(anubis_deny) if not anubis_deny.empty else 0
    tl_counts = threats['threat_level'].value_counts().to_dict() if not threats.empty else {}
    tl_str = ', '.join(f'{k}={v}' for k, v in sorted(tl_counts.items())) if tl_counts else 'aucune'
    log_info(f'[{name}] ── Résultat ────────────────────────────────────')
    log_info(f'[{name}]   Menaces totales     : {n_threats:>6}  ({tl_str})')
    log_info(f'[{name}]   Anomalies IF        : {n_anomalies:>6}  (seuil={effective_threshold:.4f})')
    log_info(f'[{name}]   Navigateurs légit.  : {n_legit_browser:>6}')
    log_info(f'[{name}]   Anubis DENY (forcé) : {n_deny:>6}')
    log_info(f'[{name}]   Sessions scorées    : {len(all_scored):>6}  (→ ml_all_scores)')
    return threats, all_scored
--- a/services/bot-detector/bot_detector/preprocessing.py
+++ b/services/bot-detector/bot_detector/preprocessing.py
@ -0,0 +1,110 @@
 """Prétraitement des données et listes de features.
 Normalise les colonnes, enrichit via l'identification multifactorielle des
 navigateurs, et définit les listes de features pour chaque modèle.
 """
 import pandas as pd
 import numpy as np
 from .config import BROWSER_CONFIDENCE_THRESHOLD
 from .log import log_info
 from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
 # ═══════════════════════════════════════════════════════════════════════════════
 # LISTES DE FEATURES PAR MODÈLE
 # ═══════════════════════════════════════════════════════════════════════════════
 # Features communes (L7 HTTP pur, disponibles correlated=0 et 1)
 FEATURES = [
    'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio',
    'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count',
    'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
    'modern_browser_score', 'ua_ch_mismatch', 'ip_id_zero_ratio',
    'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch',
    'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count',
    'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4',
    'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
    'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
    # B4-B7 : features L7 pures
    'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Browser multifactoriel
    'is_known_browser', 'browser_consistency_score', 'browser_confidence',
    'axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
    'axis_nav_behavior', 'axis_tls_coherence',
    # HTTP
    'missing_accept_enc_ratio', 'http_scheme_ratio',
    # Thèse §5
    'path_transition_entropy',
    'cadence_cv', 'burst_ratio', 'pause_ratio',
    'lag1_autocorrelation', 'benford_deviation',
    'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
 ]
 # Features supplémentaires pour le modèle Complet (données TCP/TLS requises)
 FEATURES_COMPLET = FEATURES + [
    'tcp_jitter_variance', 'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
    # B1-B3, B8 : features TLS/TCP
    'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
    # TTL fingerprinting OS + TCP window scale
    'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
    # §5.5 — Dérive JA4 intra-session
    'ja4_drift_ratio',
 ]
 # ═══════════════════════════════════════════════════════════════════════════════
 # PRÉTRAITEMENT
 # ═══════════════════════════════════════════════════════════════════════════════
 def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise les colonnes et remplit les valeurs manquantes (commun 1h et 24h)."""
    df.columns = [c.split('.')[-1] for c in df.columns]
    for col in ['src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
                'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
                'country_code', 'asn_label']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)
    # ── A9 — Identification multifactorielle des navigateurs ──────────────────
    browser_axes = _compute_browser_axes(df)
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))
    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)
    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence']:
        df[ax] = browser_axes[ax]
    # Rétro-compatibilité
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + browser_axes['axis_http_modern'].apply(lambda x: 1 if x >= 0.5 else 0)
        + browser_axes['axis_nav_behavior'].apply(lambda x: 1 if x >= 0.5 else 0)
        + browser_axes['axis_tls_coherence'].apply(lambda x: 1 if x >= 0.5 else 0)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)
    # anubis_is_flagged : signal de suspicion modéré
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
        (~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
    ).astype(int)
    # Imputation intelligente
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
    }
    for col in df.columns:
        if col in binary_features:
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32'):
            df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(df[col].median())
    return df
--- a/services/bot-detector/bot_detector/scoring.py
+++ b/services/bot-detector/bot_detector/scoring.py
@ -0,0 +1,279 @@
 """Scoring, dérive, validation, seuil adaptatif, SHAP et clustering.
 Regroupe les fonctions de scoring utilisées par le pipeline de détection :
  - A1 : détection de dérive conceptuelle (KS-test / Z-score)
  - A2 : seuil adaptatif basé sur le percentile des scores négatifs
  - A4 : explainabilité SHAP (top features contributives)
  - A7 : validation de complétude des features
  - A8 : clustering HDBSCAN / DBSCAN des anomalies
  - A10 : normalisation [0,1] des scores d'anomalie
 """
 import numpy as np
 import pandas as pd
 from .config import (
    ANOMALY_THRESHOLD, ANOMALY_PERCENTILE,
    DRIFT_THRESHOLD, MIN_VALID_FEATURE_RATIO,
    ENABLE_SHAP, SHAP_AVAILABLE, ENABLE_CLUSTERING, CLUSTERING_MIN_SAMPLES,
    EIF_AVAILABLE, HDBSCAN_AVAILABLE,
    STRUCTURAL_EXCLUDED_FEATURES,
 )
 from .log import log_info, log_decision
 # Imports conditionnels depuis config
 if HDBSCAN_AVAILABLE:
    from .config import _hdbscan
 from .config import DBSCAN, StandardScaler
 if ENABLE_SHAP:
    from .config import _shap
 # Cache par modèle : dernier état des features invalides (pour ne logguer que les changements).
 _feature_warning_cache: dict = {}
 # ═══════════════════════════════════════════════════════════════════════════════
 # A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE
 # ═══════════════════════════════════════════════════════════════════════════════
 def _compute_drift_score(baseline_stats: dict, current_baseline: pd.DataFrame,
                         features: list) -> float:
    """Compare la distribution actuelle de la baseline humaine avec celle de l'entraînement.
    Utilise le test de Kolmogorov-Smirnov bilatéral par feature. La distribution
    d'entraînement est reconstruite à partir d'un quantile digest (p10..p90) par
    interpolation linéaire — bien plus fidèle qu'une approximation N(μ,σ) pour les
    features asymétriques ou multimodales.
    Retourne la fraction de features en dérive significative (p < 0.05).
    """
    if not baseline_stats or current_baseline.empty:
        return 0.0
    try:
        from scipy.stats import ks_2samp
    except ImportError:
        return _compute_drift_score_zscore(baseline_stats, current_baseline, features)
    drifted = 0
    tested = 0
    rng = np.random.default_rng(42)
    for feat in features:
        if feat not in baseline_stats or feat not in current_baseline.columns:
            continue
        stats = baseline_stats[feat]
        curr_values = current_baseline[feat].dropna()
        if len(curr_values) < 30:
            continue
        trained_std = stats.get('std', 0)
        if trained_std < 1e-9:
            continue
        quantile_keys = ['p10', 'p25', 'p50', 'p75', 'p90']
        if all(k in stats for k in quantile_keys):
            quantile_probs = np.array([0.10, 0.25, 0.50, 0.75, 0.90])
            quantile_vals = np.array([stats[k] for k in quantile_keys])
            u = rng.uniform(0, 1, size=len(curr_values))
            synthetic_trained = np.interp(u, quantile_probs, quantile_vals)
        else:
            synthetic_trained = rng.normal(stats['mean'], trained_std, size=len(curr_values))
        _, p_value = ks_2samp(curr_values.values, synthetic_trained)
        if p_value < 0.05:
            drifted += 1
        tested += 1
    return drifted / max(tested, 1)
 def _compute_drift_score_zscore(baseline_stats: dict, current_baseline: pd.DataFrame,
                                features: list) -> float:
    """Fallback Z-score pour la détection de dérive quand scipy n'est pas disponible."""
    if not baseline_stats or current_baseline.empty:
        return 0.0
    drifted = 0
    tested = 0
    for feat in features:
        if feat not in baseline_stats or feat not in current_baseline.columns:
            continue
        stats = baseline_stats[feat]
        curr_mean = current_baseline[feat].mean()
        trained_std = stats.get('std', 0)
        if trained_std < 1e-9:
            continue
        z = abs(curr_mean - stats['mean']) / trained_std
        if z > 2.0:
            drifted += 1
        tested += 1
    return drifted / max(tested, 1)
 # ═══════════════════════════════════════════════════════════════════════════════
 # A7 — VALIDATION DE COMPLÉTUDE DES FEATURES
 # ═══════════════════════════════════════════════════════════════════════════════
 def validate_features(df: pd.DataFrame, features: list, name: str,
                      cycle_id: str):
    """Vérifie que les features sont présentes et non constantes dans le DataFrame.
    Retourne la liste des features valides, ou None si le ratio est insuffisant.
    """
    structural = STRUCTURAL_EXCLUDED_FEATURES.get(name, [])
    active_features = [f for f in features if f not in structural]
    missing = [f for f in active_features if f not in df.columns]
    present = [f for f in active_features if f in df.columns]
    zero_val = [f for f in present if df[f].nunique() == 1 and df[f].max() == 0]
    unique_val = [f for f in present if df[f].nunique() == 1 and df[f].max() != 0]
    constant = zero_val + unique_val
    valid = [f for f in present if f not in constant]
    current_state = (frozenset(missing), frozenset(zero_val), frozenset(unique_val))
    state_changed = _feature_warning_cache.get(name) != current_state
    _feature_warning_cache[name] = current_state
    if structural:
        log_info(f"[{name}] Features exclues (structurelles / L4 indisponible) : {structural}")
    if state_changed:
        if missing:
            log_info(f"[{name}] Features absentes du schéma : {missing}")
        if zero_val:
            log_info(f"[{name}] Features à 0 (pipeline non-alimenté) : {zero_val}")
        if unique_val:
            log_info(f"[{name}] Features non-discriminantes (agrégat global) : {unique_val}")
        if missing or zero_val or unique_val:
            log_decision('FEATURE_WARNING', cycle_id, name, {
                'structural': structural, 'missing': missing,
                'zero': zero_val, 'unique_nonzero': unique_val,
                'valid_count': len(valid), 'total': len(active_features)
            })
    ratio = len(valid) / max(len(active_features), 1)
    if ratio < MIN_VALID_FEATURE_RATIO:
        log_info(f"[{name}] Ratio features valides insuffisant ({ratio:.0%} < {MIN_VALID_FEATURE_RATIO:.0%}) — cycle ignoré.")
        log_decision('SKIPPED_INVALID_FEATURES', cycle_id, name, {
            'valid_ratio': round(ratio, 3), 'threshold': MIN_VALID_FEATURE_RATIO
        })
        return None
    return valid
 # ═══════════════════════════════════════════════════════════════════════════════
 # A2 / A10 — SEUIL ADAPTATIF ET NORMALISATION
 # ═══════════════════════════════════════════════════════════════════════════════
 def compute_adaptive_threshold(scores: np.ndarray) -> float:
    """A2 : Seuil adaptatif basé sur le percentile des scores négatifs."""
    neg_scores = scores[scores < 0]
    if len(neg_scores) == 0:
        return ANOMALY_THRESHOLD
    adaptive = float(np.percentile(neg_scores, ANOMALY_PERCENTILE))
    return min(adaptive, ANOMALY_THRESHOLD)
 def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """A10 : Normalise les scores en [0, 1] avec 1 = le plus anomal."""
    result = np.zeros_like(scores)
    mask = scores < 0
    if mask.sum() == 0:
        return result
    s_min = scores[mask].min()
    if s_min == 0:
        return result
    result[mask] = np.clip(-scores[mask] / (-s_min + 1e-9), 0.0, 1.0)
    return result
 # ═══════════════════════════════════════════════════════════════════════════════
 # A4 — EXPLAINABILITÉ SHAP
 # ═══════════════════════════════════════════════════════════════════════════════
 def _compute_shap_top_features(model, X: pd.DataFrame, features: list,
                               n_top: int = 5) -> list:
    """
    Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features
    les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie).
    Retourne une liste de dicts {feature: shap_value} par ligne.
    Utilise TreeExplainer pour sklearn, et un échantillon Permutation pour isotree.
    """
    if not ENABLE_SHAP or X.empty:
        return [{}] * len(X)
    try:
        if EIF_AVAILABLE:
            sample_size = min(100, len(X))
            X_sample = X.sample(n=sample_size, random_state=42) if len(X) > sample_size else X
            explainer = _shap.Explainer(model.decision_function, X_sample)
            shap_values = explainer(X).values
        else:
            explainer = _shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X)
        result = []
        for sv in shap_values:
            pairs = sorted(zip(features, sv), key=lambda x: x[1])
            result.append({f: round(float(v), 4) for f, v in pairs[:n_top]})
        return result
    except Exception as e:
        log_info(f"[SHAP] Erreur de calcul SHAP: {e}")
        return [{}] * len(X)
 def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
    """Construit le champ reason enrichi avec le top SHAP ou les métriques clés."""
    # Utilise le score brut pour l'affichage (plus interprétable que le score normalisé)
    score = round(float(row.get('raw_anomaly_score', row.get('anomaly_score', 0))), 3)
    threat = row.get('threat_level', '')
    if shap_top:
        top_str = ' | '.join(f"{f}({v:+.3f})" for f, v in shap_top.items())
        return f"[{name}] Score: {score} | SHAP: {top_str} | Threat: {threat}"
    vel = round(float(row.get('hit_velocity', 0)), 1)
    fuzz = round(float(row.get('fuzzing_index', 0)), 1)
    return f"[{name}] Score: {score} | Vel: {vel} req/s | Fuzzing: {fuzz} | Threat: {threat}"
 # ═══════════════════════════════════════════════════════════════════════════════
 # A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES
 # ═══════════════════════════════════════════════════════════════════════════════
 def _cluster_anomalies(anomalies: pd.DataFrame, features: list,
                       ae_model=None) -> pd.DataFrame:
    """A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
    HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
    et la densité optimale (pas de paramètre eps à régler manuellement).
    Si un autoencoder est disponible, utilise l'espace latent (16-dim) au lieu des features brutes
    pour un clustering plus expressif dans un espace de dimension réduite.
    Ajoute une colonne campaign_id : −1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
    """
    anomalies = anomalies.copy()
    if len(anomalies) < CLUSTERING_MIN_SAMPLES:
        anomalies['campaign_id'] = -1
        return anomalies
    try:
        X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
        if ae_model is not None:
            try:
                X_scaled = ae_model.encode(X.values)
                algo_prefix = 'AE+'
            except Exception:
                X_scaled = StandardScaler().fit_transform(X)
                algo_prefix = ''
        else:
            X_scaled = StandardScaler().fit_transform(X)
            algo_prefix = ''
        if HDBSCAN_AVAILABLE:
            clusterer = _hdbscan.HDBSCAN(
                min_cluster_size=CLUSTERING_MIN_SAMPLES,
                min_samples=max(2, CLUSTERING_MIN_SAMPLES - 1),
                cluster_selection_method='eom'
            )
            labels = clusterer.fit_predict(X_scaled)
        else:
            labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
        anomalies['campaign_id'] = labels
        n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
        algo = algo_prefix + ('HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN')
        if n_campaigns > 0:
            log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
    except Exception as e:
        log_info(f"[Clustering] Erreur de clustering: {e}")
        anomalies['campaign_id'] = -1
    return anomalies
		`@ -0,0 +1 @@`
							`"""Package bot_detector — détection de bots par ML semi-supervisé."""`