# ja4-platform/services/bot-detector/bot_detector/pipeline.py

"""Pipeline de détection semi-supervisée.

Fonction principale run_semi_supervised_logic() : triage, scoring EIF, AE, XGB,
classification navigateur, détection d'anomalies, SHAP et clustering.
"""
import numpy as np
import pandas as pd

from .config import (
    DB, CONTAMINATION, AE_WEIGHT, XGB_WEIGHT, RECURRENCE_WEIGHT,
    ANOMALY_THRESHOLD, ANOMALY_PERCENTILE, ENABLE_CLUSTERING,
    ENABLE_SHAP, EIF_AVAILABLE, TORCH_AVAILABLE, XGB_AVAILABLE,
    BROWSER_CONFIDENCE_THRESHOLD, BROWSER_COHORT_RATIO,
    MIN_VALID_FEATURE_RATIO, STRUCTURAL_EXCLUDED_FEATURES,
)
from .log import log_info, log_decision
from .infra import score_to_threat_level, get_client
from .models import load_or_train_model, load_or_train_xgb, TrafficNormalizingFlow
from .scoring import (
    validate_features, compute_adaptive_threshold, normalize_scores,
    compute_shap_top_features, build_reason, cluster_anomalies,
    compute_exiffi_importance, compute_ae_feature_errors, get_meta_learner,
    FINGERPRINT_COHERENCE_THRESHOLD,
)
from .browser_matcher import log_dual_mode_comparison, BROWSER_MATCHER_ENABLED, BROWSER_MATCHER_REPLACE


# ═══════════════════════════════════════════════════════════════════════════════
# ANALYSE SEMI-SUPERVISÉE
# ═══════════════════════════════════════════════════════════════════════════════
def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    """Run the semi-supervised detection pipeline on one slice of traffic.

    Steps, in order:

    1. Triage ``df`` into known bots (non-empty ``bot_name``), Anubis ALLOW
       sessions, and the remaining traffic that gets scored.
    2. Build the human baseline (scored traffic with ``asn_label == 'isp'``,
       minus sessions below ``FINGERPRINT_COHERENCE_THRESHOLD`` when the
       ``fingerprint_coherence_score`` column exists).
    3. Validate features, load or train the anomaly model, and score the
       traffic; optionally blend with the secondary model returned by
       ``load_or_train_model`` and with a supervised probability, then fuse
       everything through the meta-learner.
    4. Apply the recurrence penalty, assign ``threat_level``, force
       ``ANUBIS_DENY``, and classify legitimate browsers.
    5. Keep sessions below the adaptive threshold as anomalies, explain them
       (SHAP / ExIFFI) and optionally cluster them into campaigns.

    Args:
        df: Session DataFrame. The code reads at least ``bot_name``,
            ``anubis_bot_action``, ``asn_label``, ``src_ip`` and ``ja4``;
            ``anubis_bot_name`` is read when ALLOW/DENY rows exist.
        features: Candidate feature column names, filtered by
            ``validate_features``.
        name: Model name; used as log prefix and stored in ``model_name``.
        cycle_id: Identifier forwarded to ``log_decision`` and model loaders.
        recurrence_map: Mapping usable with ``Series.map`` giving a prior
            occurrence count per ``src_ip``; missing IPs count as 0.

    Returns:
        A ``(threats, all_scored)`` tuple of DataFrames. ``threats`` is the
        concatenation of anomalies, known bots, Anubis ALLOW and Anubis DENY
        rows (an empty DataFrame when none exist). ``all_scored`` holds every
        scored session plus the Anubis ALLOW rows. Two empty DataFrames are
        returned when feature validation fails or the baseline is too small.

    Side effects: writes to the decision log through ``log_decision`` and to
    the info log through ``log_info``.
    """
    # Minimum number of baseline sessions required to process this cycle.
    min_baseline_size = 500

    # 1. Known bots (original note: dict_bot_ip / dict_bot_ja4) are excluded
    #    from model scoring. Copies are taken here so later column
    #    assignments never touch the caller's frame.
    known_bots = df[df['bot_name'] != ''].copy()
    rest = df[df['bot_name'] == ''].copy()

    # 2. Anubis ALLOW: treated as legitimate bots, excluded from model scoring.
    anubis_allow = rest[rest['anubis_bot_action'] == 'ALLOW'].copy()

    # 3. Everything else is scored by the model:
    #    - DENY rows get a real score but are ALWAYS reported as threats,
    #      regardless of the threshold.
    #    - WEIGH / unknown rows are scored normally.
    unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
    human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']

    log_info(f'[{name}] ── Triage ──────────────────────────────────────')
    log_info(f'[{name}]   Total sessions       : {len(df):>6}')
    log_info(f'[{name}]   Bots connus (dict)   : {len(known_bots):>6}')
    log_info(f'[{name}]   Anubis ALLOW         : {len(anubis_allow):>6}')
    log_info(f'[{name}]   Trafic à scorer (IF) : {len(unknown_traffic):>6}')
    log_info(f'[{name}]   Baseline ISP (human) : {len(human_baseline):>6}  (seuil min={min_baseline_size})')

    # §3 — Drop ISP sessions with a low fingerprint coherence score from the
    # baseline. Original rationale: such sessions are likely residential
    # proxies or misconfigured devices that would contaminate the baseline.
    if 'fingerprint_coherence_score' in human_baseline.columns:
        low_coh = human_baseline['fingerprint_coherence_score'] < FINGERPRINT_COHERENCE_THRESHOLD
        n_low_coh = int(low_coh.sum())
        if n_low_coh > 0:
            human_baseline = human_baseline[~low_coh]
            log_info(
                f'[{name}]   Baseline après filtre cohérence (<{FINGERPRINT_COHERENCE_THRESHOLD}) : '
                f'{len(human_baseline):>6}  ({n_low_coh} exclues)'
            )
            log_decision('LOW_COHERENCE_EXCLUDED', cycle_id, name, {
                'n_excluded': n_low_coh, 'threshold': FINGERPRINT_COHERENCE_THRESHOLD,
                'baseline_after': len(human_baseline),
            })

    # A7 — Validate features before any processing.
    valid_features = validate_features(df, features, name, cycle_id)
    if valid_features is None:
        return pd.DataFrame(), pd.DataFrame()

    if len(human_baseline) < min_baseline_size:
        log_info(f"[{name}] ⚠ Données humaines insuffisantes ({len(human_baseline)} < {min_baseline_size}) — cycle ignoré.")
        log_info(f"[{name}]   Distribution asn_label dans le trafic à scorer :")
        if 'asn_label' in unknown_traffic.columns:
            for label, cnt in unknown_traffic['asn_label'].value_counts().head(8).items():
                log_info(f"[{name}]     {label:>15} : {cnt}")
        log_decision('SKIPPED_LOW_DATA', cycle_id, name, {
            'human_count': len(human_baseline), 'unknown_count': len(unknown_traffic)
        })
        return pd.DataFrame(), pd.DataFrame()

    log_info(f'[{name}] ── Modèle EIF ─────────────────────────────────')
    feature_preview = ", ".join(valid_features[:5]) + ("…" if len(valid_features) > 5 else "")
    log_info(f'[{name}]   Features validées : {len(valid_features)}/{len(features)} ({feature_preview})')

    # A1 — Concept-drift handling lives inside load_or_train_model.
    model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id)
    # Score with the model's own feature list: it may differ from
    # valid_features after pruning or after loading a persisted model.
    scoring_features = [f for f in model_features if f in unknown_traffic.columns]

    X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
    X_test = X_test.fillna(X_test.median())
    raw_scores = model.decision_function(X_test)

    # Original author's note on score scales:
    #   isotree returns scores in [0, 1]: 0.5 = boundary, >0.5 = anomalous
    #   sklearn returns scores centred on 0: <0 = anomalous, >0 = normal
    # Conversion: sklearn_equiv = 0.5 - isotree_score
    #   isotree 0.8 -> -0.3 | isotree 0.5 -> 0.0 | isotree 0.3 -> +0.2
    if EIF_AVAILABLE:
        raw_scores = 0.5 - raw_scores

    log_info(f'[{name}]   Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')

    # Blend the primary score with the secondary model when available:
    #   combined = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * nf_norm
    if ae_model is not None and AE_WEIGHT > 0:
        try:
            # Original note: score_samples yields -log p(x).
            nf_neg_ll = ae_model.score_samples(X_test.values)
            # NOTE(review): the sign convention of normalize_scores is not
            # visible here; the negation is kept exactly as in the original.
            nf_norm = normalize_scores(-nf_neg_ll)
            eif_norm = normalize_scores(raw_scores)
            combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * nf_norm
            # Column name kept for backward compatibility.
            unknown_traffic['ae_recon_error'] = nf_neg_ll
            unknown_traffic['anomaly_score'] = combined_norm
            log_info(f"[{name}] Score combiné EIF+NF (α={AE_WEIGHT}): nf_mean={nf_neg_ll.mean():.6f}")
        except Exception as exc:
            # Best effort: fall back to the primary model alone.
            log_info(f"[{name}] NF scoring échoué : {exc} — utilisation EIF seul.")
            unknown_traffic['ae_recon_error'] = 0.0
            unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    else:
        unknown_traffic['ae_recon_error'] = 0.0
        unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)

    # raw_anomaly_score: raw model score, compared to the threshold and used
    # to derive threat_level.
    unknown_traffic['raw_anomaly_score'] = raw_scores
    unknown_traffic['model_name'] = name

    # Supervised model — third voice (original note: Hoeffding Adaptive Tree
    # or XGBoost fallback).
    unknown_traffic['xgb_prob'] = 0.0
    xgb_model_ref = None  # Kept for the SHAP explainer further down (§2.4.5).
    if XGB_AVAILABLE and XGB_WEIGHT > 0:
        try:
            xgb_client = get_client()
            xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
            if xgb_model is not None and xgb_feats is not None:
                xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
                X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
                # Models exposing predict_proba_many take a DataFrame;
                # otherwise predict_proba is called on the ndarray.
                if hasattr(xgb_model, 'predict_proba_many'):
                    proba_df = xgb_model.predict_proba_many(X_xgb[xgb_cols])
                    xgb_probs = proba_df[1].values if 1 in proba_df.columns else np.zeros(len(X_xgb))
                else:
                    xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
                unknown_traffic['xgb_prob'] = xgb_probs
                xgb_model_ref = xgb_model
                log_info(f"[{name}] Supervisé : score moyen={xgb_probs.mean():.4f}")
        except Exception as exc:
            # Best effort: xgb_prob stays at 0.0.
            log_info(f"[{name}] Supervisé scoring échoué : {exc} — EIF+NF seuls.")

    # §8 — Final score through the meta-learner (original note: MLP, with
    # fixed weights as fallback).
    meta_learner = get_meta_learner(name)
    eif_norm_arr = unknown_traffic['anomaly_score'].values.copy()
    ae_norm_arr = normalize_scores(-unknown_traffic['ae_recon_error'].values)
    xgb_prob_arr = unknown_traffic['xgb_prob'].values

    final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr)
    unknown_traffic['anomaly_score'] = final_scores

    if meta_learner.is_trained:
        log_info(
            f"[{name}] §8 MetaFusionMLP actif ({meta_learner._n_samples} labels) — "
            f"score moyen={final_scores.mean():.4f}"
        )
    elif unknown_traffic['xgb_prob'].mean() > 0:
        log_info(f"[{name}] §8 Poids fixes EIF+NF+XGB (MetaFusionMLP pas encore entraîné).")

    # §8 — Train the meta-learner on the labels of the current cycle.
    try:
        labeled_df = meta_learner.build_labels_from_df(unknown_traffic)
        if not labeled_df.empty:
            unknown_traffic_labeled = labeled_df.copy()
            # NOTE(review): these arrays have one entry per scored session;
            # if build_labels_from_df returns a subset, the assignment raises
            # and is reported by the except below — confirm its contract.
            unknown_traffic_labeled['eif_norm'] = normalize_scores(raw_scores)
            unknown_traffic_labeled['ae_norm'] = ae_norm_arr
            if meta_learner.fit(unknown_traffic_labeled):
                log_decision('META_LEARNER_TRAINED', cycle_id, name, meta_learner._weights_log)
    except Exception as exc:
        log_info(f"[{name}] MetaLearner entraînement échoué : {exc}")

    # §7 — ExIFFI feature importance for the primary model, computed for all
    # scored sessions. The placeholder dicts are shared and never mutated.
    exiffi_tops: list = [{}] * len(unknown_traffic)
    if len(unknown_traffic) > 0:
        try:
            exiffi_tops = compute_exiffi_importance(model, X_test, scoring_features)
        except Exception as exc:
            # Best effort, but leave a trace instead of failing silently.
            log_info(f"[{name}] ExIFFI échoué : {exc} — importances ignorées.")

    # A2 — Adaptive threshold computed on the RAW scores (same scale as
    # ANOMALY_THRESHOLD), before the recurrence penalty is applied.
    effective_threshold = compute_adaptive_threshold(raw_scores)
    log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")

    # A6 — Penalise recurring IPs on the RAW score before thresholding.
    recurrences = unknown_traffic['src_ip'].map(recurrence_map).fillna(0)
    if RECURRENCE_WEIGHT > 0:
        penalty = np.log1p(recurrences.values) * RECURRENCE_WEIGHT
        unknown_traffic['raw_anomaly_score'] = unknown_traffic['raw_anomaly_score'] - penalty

    # Assign threat_level to EVERY scored session (for ml_all_scores).
    unknown_traffic['threat_level'] = unknown_traffic['raw_anomaly_score'].apply(score_to_threat_level)
    unknown_traffic['recurrence'] = recurrences.astype(int) + 1
    unknown_traffic['campaign_id'] = -1

    # DENY rows keep their model score but get a forced threat_level.
    deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
    unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'

    # ── A9 — Legitimate-browser classification ───────────────────────────────
    # Relies on the precomputed browser_confidence and inferred_browser_family
    # columns (both default to "absent" when the column is missing). Original
    # note: browser_confidence in [0..1] is a weighted combination of several
    # independent axes, so a JA4 outside the static dictionary can still be
    # recognised.
    _bconf = unknown_traffic.get('browser_confidence', pd.Series(0, index=unknown_traffic.index)).fillna(0)
    _ibf = unknown_traffic.get('inferred_browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str)
    browser_legit_mask = (
        (_bconf >= BROWSER_CONFIDENCE_THRESHOLD) &                  # confidence high enough
        (_ibf != '') &                                              # a family was identified
        (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & # model sees no threat
        (~deny_mask)                                                # not an Anubis DENY
    )

    # JA4 cohort propagation: when at least BROWSER_COHORT_RATIO of a JA4's
    # sessions are already classified as browser, extend the classification
    # to the remaining NORMAL/LOW, non-DENY sessions of that JA4.
    if browser_legit_mask.any():
        ja4_col = unknown_traffic['ja4']
        legit_per_ja4 = ja4_col[browser_legit_mask].value_counts()
        total_per_ja4 = ja4_col.value_counts()
        ratio_per_ja4 = (legit_per_ja4 / total_per_ja4).dropna()
        cohort_ja4s = set(ratio_per_ja4[ratio_per_ja4 >= BROWSER_COHORT_RATIO].index)
        if cohort_ja4s:
            cohort_mask = (
                ja4_col.isin(cohort_ja4s) &
                (~browser_legit_mask) &                                     # not classified yet
                (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) &
                (~deny_mask)
            )
            browser_legit_mask = browser_legit_mask | cohort_mask
            n_cohort = cohort_mask.sum()
            if n_cohort > 0:
                log_info(f"[{name}] Propagation cohorte JA4 : {n_cohort} sessions supplémentaires ({len(cohort_ja4s)} JA4)")

    if browser_legit_mask.any():
        unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER'
        # Cohort-propagated sessions may have no family: display 'Unknown'.
        _family_disp = _ibf[browser_legit_mask].where(_ibf[browser_legit_mask] != '', 'Unknown')
        unknown_traffic.loc[browser_legit_mask, 'reason'] = (
            '[Navigateur légitime] ' + _family_disp +
            ' (confiance=' + _bconf[browser_legit_mask].round(2).astype(str) + ')'
        )
        n_legit = browser_legit_mask.sum()
        families = _ibf[browser_legit_mask].value_counts().to_dict()
        # Mean of each available axis column, for diagnostics only.
        ax_means = {}
        for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
                    'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']:
            col = unknown_traffic.get(ax, None)
            if col is not None:
                ax_means[ax.replace('axis_', '')] = round(float(col[browser_legit_mask].mean()), 3)
        log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}")
        log_info(f"[{name}]   Axes moyens : {ax_means}")
        log_decision('LEGITIMATE_BROWSER', cycle_id, name, {
            'count': int(n_legit), 'families': families,
            'mean_confidence': round(float(_bconf[browser_legit_mask].mean()), 3),
            'axis_means': ax_means,
        })

    # ── A9b — DUAL_MODE: log browser_matcher vs browser_confidence decisions ──
    # When BROWSER_MATCHER_REPLACE is set, the matcher's decision is applied
    # on top of the A9 result.
    if BROWSER_MATCHER_ENABLED and 'bm_decision' in unknown_traffic.columns:
        log_dual_mode_comparison(unknown_traffic, cycle_id, name)
        if BROWSER_MATCHER_REPLACE:
            bm_legit = unknown_traffic['bm_decision'] == 'LEGITIMATE_BROWSER'
            if bm_legit.any():
                unknown_traffic.loc[bm_legit, 'threat_level'] = 'LEGITIMATE_BROWSER'
                unknown_traffic.loc[bm_legit, 'reason'] = (
                    '[BrowserMatcher] '
                    + unknown_traffic.loc[bm_legit, 'bm_family'].fillna('Unknown')
                    + ' (score=' + unknown_traffic.loc[bm_legit, 'bm_score'].round(2).astype(str) + ')'
                )
                log_info(
                    f"[{name}][BrowserMatcher] {bm_legit.sum()} bypass(es) appliqué(s) "
                    f"(BROWSER_MATCHER_REPLACE=true)"
                )
            # Grey zone: shrink the raw score toward 0 by up to 50 %,
            # proportionally to bm_score. threat_level is NOT recomputed.
            # NOTE(review): scaling also pulls positive (normal) scores toward
            # the threshold — confirm this is intended.
            bm_partial = unknown_traffic['bm_decision'] == 'PARTIAL'
            if bm_partial.any():
                partial_scores = unknown_traffic.loc[bm_partial, 'bm_score'].fillna(0.0)
                unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] = (
                    unknown_traffic.loc[bm_partial, 'raw_anomaly_score']
                    * (1 - 0.5 * partial_scores.values)
                )

    # Snapshot of every scored session (before threshold filtering) — for
    # ml_all_scores.
    all_scored = unknown_traffic.copy()

    # ── Known bots: neutral scores, one decision log entry per row ───────────
    if not known_bots.empty:
        known_bots['anomaly_score'] = 0.0
        known_bots['raw_anomaly_score'] = 0.0
        known_bots['ae_recon_error'] = 0.0
        known_bots['xgb_prob'] = 0.0
        known_bots['threat_level'] = 'KNOWN_BOT'
        known_bots['model_name'] = name
        known_bots['campaign_id'] = -1
        known_bots['reason'] = '[Identification] Bot légitime: ' + known_bots['bot_name']
        known_bots['recurrence'] = known_bots['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in known_bots.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('bot_name', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1))
            })

    # ── Anubis ALLOW: legitimate bots identified by Anubis rules ─────────────
    if not anubis_allow.empty:
        anubis_allow['anomaly_score'] = 0.0
        anubis_allow['raw_anomaly_score'] = 0.0
        anubis_allow['ae_recon_error'] = 0.0
        anubis_allow['xgb_prob'] = 0.0
        anubis_allow['threat_level'] = 'KNOWN_BOT'
        anubis_allow['bot_name'] = anubis_allow['anubis_bot_name']
        anubis_allow['model_name'] = name
        anubis_allow['campaign_id'] = -1
        anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name']
        anubis_allow['recurrence'] = anubis_allow['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in anubis_allow.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })

    # ── Anubis DENY: scored by the model, always reported as threats ─────────
    # Extracted after scoring so the rows carry their real model score.
    anubis_deny = unknown_traffic[deny_mask].copy()
    if not anubis_deny.empty:
        anubis_deny['reason'] = (
            '[Anubis DENY] ' + anubis_deny['anubis_bot_name'].fillna('')
            + ' | ' + anubis_deny['raw_anomaly_score'].apply(lambda s: f'IF={s:.4f}')
        )
        log_info(f"[{name}] Anubis DENY: {len(anubis_deny)} IP(s) scorées par IF "
                 f"(score moyen: {anubis_deny['raw_anomaly_score'].mean():.4f}).")
        for _, row in anubis_deny.iterrows():
            log_decision('ANUBIS_DENY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })

    # Threshold on raw_anomaly_score (recurrence penalty included), limited to
    # traffic that is neither DENY (always a threat) nor LEGITIMATE_BROWSER
    # (confirmed browsers are never anomalies).
    non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')]
    anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
    if not anomalies.empty:
        log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
        # 'recurrence' is already present: inherited from unknown_traffic.

        # A4 — SHAP explainability (original note: TreeExplainer on the
        # supervised model when available, otherwise on the primary model).
        # The feature names passed must match X_anomalies' columns, which are
        # scoring_features — the same list ExIFFI and clustering use.
        X_anomalies = X_test.loc[anomalies.index]
        shap_tops = compute_shap_top_features(model, X_anomalies, scoring_features,
                                               xgb_model=xgb_model_ref)

        # §7 — Pick the precomputed ExIFFI entry for each anomaly by index.
        if len(exiffi_tops) == len(unknown_traffic):
            _exiffi_map = dict(zip(unknown_traffic.index, exiffi_tops))
            exiffi_for_anomalies = [_exiffi_map.get(idx, {}) for idx in anomalies.index]
        else:
            exiffi_for_anomalies = [{}] * len(anomalies)
        anomalies['reason'] = [
            build_reason(name, row, shap, exiffi)
            for (_, row), shap, exiffi
            in zip(anomalies.iterrows(), shap_tops, exiffi_for_anomalies)
        ]

        # A8 — Clustering to identify coordinated campaigns.
        if ENABLE_CLUSTERING:
            anomalies = cluster_anomalies(anomalies, scoring_features, ae_model=ae_model)

            # P2 — Campaign-size escalation: HIGH -> CRITICAL for members of
            # a cluster with at least 5 sessions. Negative ids are skipped
            # (presumably the noise / unclustered label).
            if 'campaign_id' in anomalies.columns:
                cid_counts = anomalies['campaign_id'].value_counts()
                for cid, size in cid_counts.items():
                    if cid < 0:
                        continue
                    if size >= 5:
                        mask = (anomalies['campaign_id'] == cid) & (anomalies['threat_level'] == 'HIGH')
                        n_escalated = mask.sum()
                        if n_escalated > 0:
                            anomalies.loc[mask, 'threat_level'] = 'CRITICAL'
                            anomalies.loc[mask, 'reason'] = anomalies.loc[mask, 'reason'] + \
                                f' [Escalade campagne #{cid}, {size} IPs coordonnées]'
                            log_info(f"[{name}] Escalade campagne #{cid}: {n_escalated} IP(s) HIGH→CRITICAL ({size} membres)")

        # An empty JA4 is reported as clear-text HTTP.
        anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
        for _, row in anomalies.iterrows():
            log_decision('ANOMALY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'threat_level': row.get('threat_level', ''), 'recurrence': int(row.get('recurrence', 1)),
                'hit_velocity': round(float(row.get('hit_velocity', 0)), 2),
                'fuzzing_index': round(float(row.get('fuzzing_index', 0)), 2),
                'post_ratio': round(float(row.get('post_ratio', 0)), 3),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_detail': row.get('asn_detail', ''), 'asn_domain': row.get('asn_domain', ''),
                'country_code': row.get('country_code', ''), 'asn_label': row.get('asn_label', ''),
                'ja4': row.get('ja4', ''), 'host': row.get('host', ''),
                'correlated': int(row.get('correlated', 0)), 'campaign_id': int(row.get('campaign_id', -1)),
                'effective_threshold': round(effective_threshold, 4), 'reason': row.get('reason', '')
            })

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when no category produced any row.
    threat_parts = [
        part for part in (anomalies, known_bots, anubis_allow, anubis_deny)
        if not part.empty
    ]
    threats = pd.concat(threat_parts, ignore_index=True) if threat_parts else pd.DataFrame()

    # Propagate campaign_id from clustered anomalies to all_scored, which was
    # snapshotted before clustering (all its campaign_id values are -1).
    # The assignment aligns on the index.
    if not anomalies.empty and 'campaign_id' in anomalies.columns:
        cid_map = anomalies['campaign_id']
        matched = all_scored.index.isin(cid_map.index)
        if matched.any():
            all_scored.loc[matched, 'campaign_id'] = cid_map

    # Add the Anubis ALLOW rows to all_scored for traceability in
    # ml_all_scores (threat_level='KNOWN_BOT', anomaly_score=0.0).
    if not anubis_allow.empty:
        all_scored = pd.concat([all_scored, anubis_allow], ignore_index=True)

    # ── Model summary ────────────────────────────────────────────────────────
    n_threats = len(threats)
    n_anomalies = len(anomalies)
    n_legit_browser = int(browser_legit_mask.sum())
    n_deny = len(anubis_deny)
    tl_counts = threats['threat_level'].value_counts().to_dict() if not threats.empty else {}
    tl_str = ', '.join(f'{k}={v}' for k, v in sorted(tl_counts.items())) if tl_counts else 'aucune'
    log_info(f'[{name}] ── Résultat ────────────────────────────────────')
    log_info(f'[{name}]   Menaces totales     : {n_threats:>6}  ({tl_str})')
    log_info(f'[{name}]   Anomalies IF        : {n_anomalies:>6}  (seuil={effective_threshold:.4f})')
    log_info(f'[{name}]   Navigateurs légit.  : {n_legit_browser:>6}')
    log_info(f'[{name}]   Anubis DENY (forcé) : {n_deny:>6}')
    log_info(f'[{name}]   Sessions scorées    : {len(all_scored):>6}  (→ ml_all_scores)')

    return threats, all_scored