"""Pipeline de détection semi-supervisée. Fonction principale run_semi_supervised_logic() : triage, scoring EIF, AE, XGB, classification navigateur, détection d'anomalies, SHAP et clustering. """ import numpy as np import pandas as pd from .config import ( DB, CONTAMINATION, AE_WEIGHT, XGB_WEIGHT, RECURRENCE_WEIGHT, ANOMALY_THRESHOLD, ANOMALY_PERCENTILE, ENABLE_CLUSTERING, ENABLE_SHAP, EIF_AVAILABLE, TORCH_AVAILABLE, XGB_AVAILABLE, BROWSER_CONFIDENCE_THRESHOLD, BROWSER_COHORT_RATIO, MIN_VALID_FEATURE_RATIO, STRUCTURAL_EXCLUDED_FEATURES, ) from .log import log_info, log_decision from .infra import score_to_threat_level, get_client from .models import load_or_train_model, load_or_train_xgb, TrafficAutoEncoder from .scoring import ( validate_features, compute_adaptive_threshold, normalize_scores, compute_shap_top_features, build_reason, cluster_anomalies, compute_exiffi_importance, compute_ae_feature_errors, get_meta_learner, FINGERPRINT_COHERENCE_THRESHOLD, ) from .browser_matcher import log_dual_mode_comparison, BROWSER_MATCHER_ENABLED, BROWSER_MATCHER_REPLACE # ═══════════════════════════════════════════════════════════════════════════════ # ANALYSE SEMI-SUPERVISÉE # ═══════════════════════════════════════════════════════════════════════════════ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): """Applique le pipeline de détection semi-supervisée sur un sous-ensemble du trafic. Trifurque le trafic en bots connus, bots Anubis ALLOW et trafic inconnu, entraîne ou charge le modèle IsolationForest sur la baseline humaine, score le trafic inconnu, applique les améliorations A2/A4/A6/A8, et retourne (threats, all_scored) sous forme de DataFrames. Effets de bord : écriture dans les logs de décision via log_decision. """ # 1. Bots connus (dict_bot_ip / dict_bot_ja4) → exclus du scoring IF known_bots = df[df['bot_name'] != ''].copy() rest = df[df['bot_name'] == ''].copy() # 2. Bots Anubis ALLOW → bots légitimes, exclus du scoring IF anubis_allow = rest[rest['anubis_bot_action'] == 'ALLOW'].copy() # 3. Tout le reste passe par l'IsolationForest pour un score réel : # - DENY : menaces identifiées par règles Anubis → IF donne le score de sévérité # - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH) # Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF. unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy() human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp'] log_info(f'[{name}] ── Triage ──────────────────────────────────────') log_info(f'[{name}] Total sessions : {len(df):>6}') log_info(f'[{name}] Bots connus (dict) : {len(known_bots):>6}') log_info(f'[{name}] Anubis ALLOW : {len(anubis_allow):>6}') log_info(f'[{name}] Trafic à scorer (IF) : {len(unknown_traffic):>6}') log_info(f'[{name}] Baseline ISP (human) : {len(human_baseline):>6} (seuil min=500)') # §3 — Exclure les sessions ISP à faible cohérence de fingerprint de la baseline humaine # Ces sessions ISP avec un fingerprint incohérent sont probablement des proxies résidentiels # ou des appareils mal configurés qui contamineraient la baseline. if 'fingerprint_coherence_score' in human_baseline.columns: low_coh = human_baseline['fingerprint_coherence_score'] < FINGERPRINT_COHERENCE_THRESHOLD n_low_coh = int(low_coh.sum()) if n_low_coh > 0: human_baseline = human_baseline[~low_coh] log_info( f'[{name}] Baseline après filtre cohérence (<{FINGERPRINT_COHERENCE_THRESHOLD}) : ' f'{len(human_baseline):>6} ({n_low_coh} exclues)' ) log_decision('LOW_COHERENCE_EXCLUDED', cycle_id, name, { 'n_excluded': n_low_coh, 'threshold': FINGERPRINT_COHERENCE_THRESHOLD, 'baseline_after': len(human_baseline), }) # A7 — Valider les features avant tout traitement valid_features = validate_features(df, features, name, cycle_id) if valid_features is None: return pd.DataFrame(), pd.DataFrame() if len(human_baseline) < 500: log_info(f"[{name}] ⚠ Données humaines insuffisantes ({len(human_baseline)} < 500) — cycle ignoré.") log_info(f"[{name}] Distribution asn_label dans le trafic à scorer :") if 'asn_label' in unknown_traffic.columns: for label, cnt in unknown_traffic['asn_label'].value_counts().head(8).items(): log_info(f"[{name}] {label:>15} : {cnt}") log_decision('SKIPPED_LOW_DATA', cycle_id, name, { 'human_count': len(human_baseline), 'unknown_count': len(unknown_traffic) }) return pd.DataFrame(), pd.DataFrame() log_info(f'[{name}] ── Modèle EIF ─────────────────────────────────') log_info(f'[{name}] Features validées : {len(valid_features)}/{len(features)} ({", ".join(valid_features[:5])}{"…" if len(valid_features) > 5 else ""})') # A1 — Dérive conceptuelle intégrée dans load_or_train_model model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id) # Utiliser les features du modèle (possiblement différentes après pruning/chargement) scoring_features = [f for f in model_features if f in unknown_traffic.columns] unknown_traffic = unknown_traffic.copy() X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan) X_test = X_test.fillna(X_test.median()) raw_scores = model.decision_function(X_test) # isotree renvoie des scores dans [0, 1] : 0.5 = frontière, >0.5 = anomal # sklearn renvoie des scores centrés sur 0 : <0 = anomal, >0 = normal # Conversion : sklearn_equiv = 0.5 - isotree_score # isotree 0.8 → -0.3 (CRITICAL) | isotree 0.5 → 0.0 (frontière) # isotree 0.3 → +0.2 (NORMAL) if EIF_AVAILABLE: raw_scores = 0.5 - raw_scores log_info(f'[{name}] Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})') # Combinaison EIF + Autoencoder si disponible # Score final = (1-α) * eif_norm + α * ae_norm où α = AE_WEIGHT if ae_model is not None and AE_WEIGHT > 0: try: ae_recon_errors = ae_model.score_samples(X_test.values) ae_norm = normalize_scores(-ae_recon_errors) # plus élevé = plus anomal eif_norm = normalize_scores(raw_scores) combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * ae_norm unknown_traffic['ae_recon_error'] = ae_recon_errors unknown_traffic['anomaly_score'] = combined_norm log_info(f"[{name}] Score combiné EIF+AE (α={AE_WEIGHT}): ae_mean={ae_recon_errors.mean():.6f}") except Exception as exc: log_info(f"[{name}] AE scoring échoué : {exc} — utilisation EIF seul.") unknown_traffic['ae_recon_error'] = 0.0 unknown_traffic['anomaly_score'] = normalize_scores(raw_scores) else: unknown_traffic['ae_recon_error'] = 0.0 unknown_traffic['anomaly_score'] = normalize_scores(raw_scores) # raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level unknown_traffic['raw_anomaly_score'] = raw_scores unknown_traffic['model_name'] = name # XGBoost supervisé — troisième voix (si labels historiques disponibles) unknown_traffic['xgb_prob'] = 0.0 if XGB_AVAILABLE and XGB_WEIGHT > 0: try: xgb_client = get_client() xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id) if xgb_model is not None and xgb_feats is not None: # XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue) xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns] X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0) xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1] unknown_traffic['xgb_prob'] = xgb_probs log_info(f"[{name}] XGBoost : xgb_mean={xgb_probs.mean():.4f}") except Exception as exc: log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.") # §8 — Score final via MetaLearner (ou poids fixes en fallback) meta_learner = get_meta_learner(name) eif_norm_arr = unknown_traffic['anomaly_score'].values.copy() ae_norm_arr = normalize_scores(-unknown_traffic['ae_recon_error'].values) xgb_prob_arr = unknown_traffic['xgb_prob'].values hits_arr = unknown_traffic.get('hits', pd.Series(1, index=unknown_traffic.index)).values corr_arr = unknown_traffic.get('correlated', pd.Series(0, index=unknown_traffic.index)).values final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr, hits_arr, corr_arr) unknown_traffic['anomaly_score'] = final_scores if meta_learner.is_trained: log_info( f"[{name}] §8 MetaLearner actif ({meta_learner._n_samples} labels) — " f"score moyen={final_scores.mean():.4f}" ) elif unknown_traffic['xgb_prob'].mean() > 0: log_info(f"[{name}] §8 Poids fixes EIF+AE+XGB (MetaLearner pas encore entraîné).") # §8 — Entraînement du MetaLearner sur les labels du cycle courant # (accumulation progressive — activation dès MIN_SAMPLES labels) try: labeled_df = meta_learner.build_labels_from_df(unknown_traffic) if not labeled_df.empty: unknown_traffic_labeled = labeled_df.copy() unknown_traffic_labeled['eif_norm'] = normalize_scores(raw_scores) unknown_traffic_labeled['ae_norm'] = ae_norm_arr if meta_learner.fit(unknown_traffic_labeled): log_decision('META_LEARNER_TRAINED', cycle_id, name, meta_learner._weights_log) except Exception as exc: log_info(f"[{name}] MetaLearner entraînement échoué : {exc}") # §7 — ExIFFI : importance de features pour l'EIF (quand SHAP désactivé) exiffi_tops: list = [{}] * len(unknown_traffic) if not ENABLE_SHAP and len(unknown_traffic) > 0: try: exiffi_tops = compute_exiffi_importance(model, X_test, scoring_features) except Exception: pass # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD) effective_threshold = compute_adaptive_threshold(raw_scores) log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})") # A6 — Pénaliser les IPs récurrentes sur le score BRUT avant comparaison au seuil if RECURRENCE_WEIGHT > 0: recurrences = unknown_traffic['src_ip'].map(recurrence_map).fillna(0) penalty = np.log1p(recurrences.values) * RECURRENCE_WEIGHT unknown_traffic['raw_anomaly_score'] = unknown_traffic['raw_anomaly_score'] - penalty # Assigner threat_level à TOUTES les sessions scorées (pour ml_all_scores) unknown_traffic['threat_level'] = unknown_traffic['raw_anomaly_score'].apply(score_to_threat_level) unknown_traffic['recurrence'] = unknown_traffic['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1 unknown_traffic['campaign_id'] = -1 # Extraire les DENY (maintenant avec leur vrai score IF) et forcer leur threat_level deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY' unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY' # ── A9 — Classification multifactorielle des navigateurs légitimes ───────── # 5 axes indépendants : JA4 connu, structure JA4, headers HTTP modernes, # comportement de navigation, cohérence TLS/TCP. # browser_confidence [0..1] = combinaison pondérée des 5 axes. # La classification n'exige plus que le JA4 soit dans le dictionnaire statique : # un JA4 inconnu avec des signaux HTTP/TLS/nav forts sera quand même reconnu. _bconf = unknown_traffic.get('browser_confidence', pd.Series(0, index=unknown_traffic.index)).fillna(0) _ibf = unknown_traffic.get('inferred_browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str) browser_legit_mask = ( (_bconf >= BROWSER_CONFIDENCE_THRESHOLD) & # confiance multifactorielle (_ibf != '') & # famille identifiée (dict ou inférée) (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & # pas de menace IF (~deny_mask) # pas un DENY Anubis ) # Propagation par cohorte JA4 : si ≥ BROWSER_COHORT_RATIO% des sessions d'un JA4 # sont déjà classées navigateur, propager aux sessions restantes du même JA4. if browser_legit_mask.any(): ja4_col = unknown_traffic['ja4'] legit_per_ja4 = ja4_col[browser_legit_mask].value_counts() total_per_ja4 = ja4_col.value_counts() ratio_per_ja4 = (legit_per_ja4 / total_per_ja4).dropna() cohort_ja4s = set(ratio_per_ja4[ratio_per_ja4 >= BROWSER_COHORT_RATIO].index) if cohort_ja4s: cohort_mask = ( ja4_col.isin(cohort_ja4s) & (~browser_legit_mask) & # pas déjà classé (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & (~deny_mask) ) browser_legit_mask = browser_legit_mask | cohort_mask n_cohort = cohort_mask.sum() if n_cohort > 0: log_info(f"[{name}] Propagation cohorte JA4 : {n_cohort} sessions supplémentaires ({len(cohort_ja4s)} JA4)") if browser_legit_mask.any(): unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER' # Utiliser la famille inférée (dict ou structurel) _family_disp = _ibf[browser_legit_mask].where(_ibf[browser_legit_mask] != '', 'Unknown') unknown_traffic.loc[browser_legit_mask, 'reason'] = ( '[Navigateur légitime] ' + _family_disp + ' (confiance=' + _bconf[browser_legit_mask].round(2).astype(str) + ')' ) n_legit = browser_legit_mask.sum() families = _ibf[browser_legit_mask].value_counts().to_dict() # Log des axes moyens pour diagnostic ax_means = {} for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern', 'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']: col = unknown_traffic.get(ax, None) if col is not None: ax_means[ax.replace('axis_', '')] = round(float(col[browser_legit_mask].mean()), 3) log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}") log_info(f"[{name}] Axes moyens : {ax_means}") log_decision('LEGITIMATE_BROWSER', cycle_id, name, { 'count': int(n_legit), 'families': families, 'mean_confidence': round(float(_bconf[browser_legit_mask].mean()), 3), 'axis_means': ax_means, }) # ── A9b — DUAL_MODE : journaliser les décisions browser_matcher vs browser_confidence ── # Quand BROWSER_MATCHER_REPLACE=true, browser_matcher pilote le bypass à la place. if BROWSER_MATCHER_ENABLED and 'bm_decision' in unknown_traffic.columns: log_dual_mode_comparison(unknown_traffic, cycle_id, name) if BROWSER_MATCHER_REPLACE: # Appliquer la décision du matcher (remplace le résultat du bloc A9 ci-dessus) bm_legit = unknown_traffic['bm_decision'] == 'LEGITIMATE_BROWSER' if bm_legit.any(): unknown_traffic.loc[bm_legit, 'threat_level'] = 'LEGITIMATE_BROWSER' unknown_traffic.loc[bm_legit, 'reason'] = ( '[BrowserMatcher] ' + unknown_traffic.loc[bm_legit, 'bm_family'].fillna('Unknown') + ' (score=' + unknown_traffic.loc[bm_legit, 'bm_score'].round(2).astype(str) + ')' ) log_info( f"[{name}][BrowserMatcher] {bm_legit.sum()} bypass(es) appliqué(s) " f"(BROWSER_MATCHER_REPLACE=true)" ) # Atténuation par score partiel pour les zones grises bm_partial = unknown_traffic['bm_decision'] == 'PARTIAL' if bm_partial.any(): partial_scores = unknown_traffic.loc[bm_partial, 'bm_score'].fillna(0.0) unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] = ( unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] * (1 - 0.5 * partial_scores.values) ) # Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores all_scored = unknown_traffic.copy() if not known_bots.empty: known_bots = known_bots.copy() known_bots['anomaly_score'] = 0.0 known_bots['raw_anomaly_score'] = 0.0 known_bots['ae_recon_error'] = 0.0 known_bots['xgb_prob'] = 0.0 known_bots['threat_level'] = 'KNOWN_BOT' known_bots['model_name'] = name known_bots['campaign_id'] = -1 known_bots['reason'] = '[Identification] Bot légitime: ' + known_bots['bot_name'] known_bots['recurrence'] = known_bots['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1 for _, row in known_bots.iterrows(): log_decision('KNOWN_BOT', cycle_id, name, { 'src_ip': row.get('src_ip', ''), 'bot_name': row.get('bot_name', ''), 'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''), 'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''), 'recurrence': int(row.get('recurrence', 1)) }) # ── Anubis ALLOW : bots légitimes identifiés par règles Anubis ─────────── if not anubis_allow.empty: anubis_allow = anubis_allow.copy() anubis_allow['anomaly_score'] = 0.0 anubis_allow['raw_anomaly_score'] = 0.0 anubis_allow['ae_recon_error'] = 0.0 anubis_allow['xgb_prob'] = 0.0 anubis_allow['threat_level'] = 'KNOWN_BOT' anubis_allow['bot_name'] = anubis_allow['anubis_bot_name'] anubis_allow['model_name'] = name anubis_allow['campaign_id'] = -1 anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name'] anubis_allow['recurrence'] = anubis_allow['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1 for _, row in anubis_allow.iterrows(): log_decision('KNOWN_BOT', cycle_id, name, { 'src_ip': row.get('src_ip', ''), 'bot_name': row.get('anubis_bot_name', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''), 'anubis_bot_action': row.get('anubis_bot_action', ''), 'anubis_bot_category': row.get('anubis_bot_category', ''), 'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''), 'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''), 'recurrence': int(row.get('recurrence', 1)), }) # ── Anubis DENY : scorés par IF, toujours inclus dans les threats ──────── # Extraits de unknown_traffic après scoring — ils ont leur vrai score IF. anubis_deny = unknown_traffic[deny_mask].copy() if not anubis_deny.empty: anubis_deny['reason'] = '[Anubis DENY] ' + anubis_deny['anubis_bot_name'].fillna('') + \ ' | ' + anubis_deny['raw_anomaly_score'].apply(lambda s: f'IF={s:.4f}') log_info(f"[{name}] Anubis DENY: {len(anubis_deny)} IP(s) scorées par IF " f"(score moyen: {anubis_deny['raw_anomaly_score'].mean():.4f}).") for _, row in anubis_deny.iterrows(): log_decision('ANUBIS_DENY', cycle_id, name, { 'src_ip': row.get('src_ip', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''), 'anubis_bot_action': row.get('anubis_bot_action', ''), 'anubis_bot_category': row.get('anubis_bot_category', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4), 'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4), 'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''), 'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''), 'recurrence': int(row.get('recurrence', 1)), }) # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY et non-navigateur légitime # Les DENY sont toujours des threats, indépendamment du seuil IF # Les LEGITIMATE_BROWSER sont exclus des anomalies (navigateurs confirmés) non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')] anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy() if not anomalies.empty: log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).") anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1 # A4 — Explainabilité SHAP : top features responsables de chaque anomalie X_anomalies = X_test.loc[anomalies.index] shap_tops = compute_shap_top_features(model, X_anomalies, valid_features) # §7 — ExIFFI : utiliser les tops ExIFFI précalculés quand SHAP est inactif # Construire un mapping index → exiffi_top pour accès rapide if len(exiffi_tops) == len(unknown_traffic): _exiffi_map = dict(zip(unknown_traffic.index, exiffi_tops)) exiffi_for_anomalies = [_exiffi_map.get(idx, {}) for idx in anomalies.index] else: exiffi_for_anomalies = [{}] * len(anomalies) anomalies['reason'] = [ build_reason(name, row, shap, exiffi) for (_, row), shap, exiffi in zip(anomalies.iterrows(), shap_tops, exiffi_for_anomalies) ] # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées if ENABLE_CLUSTERING: anomalies = cluster_anomalies(anomalies, scoring_features, ae_model=ae_model) # P2 — Escalade par taille de campagne : les IPs dans un cluster # coordonné de grande taille sont plus menaçantes que des IPs isolées. # Escalader HIGH → CRITICAL si cluster_size ≥ 5. if 'campaign_id' in anomalies.columns: cid_counts = anomalies['campaign_id'].value_counts() for cid, size in cid_counts.items(): if cid < 0: continue if size >= 5: mask = (anomalies['campaign_id'] == cid) & (anomalies['threat_level'] == 'HIGH') n_escalated = mask.sum() if n_escalated > 0: anomalies.loc[mask, 'threat_level'] = 'CRITICAL' anomalies.loc[mask, 'reason'] = anomalies.loc[mask, 'reason'] + \ f' [Escalade campagne #{cid}, {size} IPs coordonnées]' log_info(f"[{name}] Escalade campagne #{cid}: {n_escalated} IP(s) HIGH→CRITICAL ({size} membres)") anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'}) for _, row in anomalies.iterrows(): log_decision('ANOMALY', cycle_id, name, { 'src_ip': row.get('src_ip', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4), 'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4), 'threat_level': row.get('threat_level', ''), 'recurrence': int(row.get('recurrence', 1)), 'hit_velocity': round(float(row.get('hit_velocity', 0)), 2), 'fuzzing_index': round(float(row.get('fuzzing_index', 0)), 2), 'post_ratio': round(float(row.get('post_ratio', 0)), 3), 'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''), 'asn_detail': row.get('asn_detail', ''), 'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''), 'asn_label': row.get('asn_label', ''), 'ja4': row.get('ja4', ''), 'host': row.get('host', ''), 'correlated': int(row.get('correlated', 0)), 'campaign_id': int(row.get('campaign_id', -1)), 'effective_threshold': round(effective_threshold, 4), 'reason': row.get('reason', '') }) threats = pd.concat([df for df in [ anomalies if not anomalies.empty else None, known_bots if not known_bots.empty else None, anubis_allow if not anubis_allow.empty else None, anubis_deny if not anubis_deny.empty else None, ] if df is not None], ignore_index=True) # Propager campaign_id des anomalies clusterisées vers all_scored # (all_scored a été capturé avant clustering, ses campaign_id sont tous -1) if not anomalies.empty and 'campaign_id' in anomalies.columns: cid_map = anomalies.set_index(anomalies.index)['campaign_id'] matched = all_scored.index.isin(cid_map.index) if matched.any(): all_scored.loc[matched, 'campaign_id'] = cid_map # Inclure anubis_allow dans all_scored pour traçabilité dans ml_all_scores. # Ces IPs sont exclues de l'analyse IF mais doivent apparaître dans la table # de scores avec threat_level='KNOWN_BOT' et anomaly_score=0.0. if not anubis_allow.empty: all_scored = pd.concat([all_scored, anubis_allow], ignore_index=True) # ── Résumé du modèle ───────────────────────────────────────────────────── n_threats = len(threats) if not threats.empty else 0 n_anomalies = len(anomalies) if not anomalies.empty else 0 n_legit_browser = int(browser_legit_mask.sum()) if browser_legit_mask is not None else 0 n_deny = len(anubis_deny) if not anubis_deny.empty else 0 tl_counts = threats['threat_level'].value_counts().to_dict() if not threats.empty else {} tl_str = ', '.join(f'{k}={v}' for k, v in sorted(tl_counts.items())) if tl_counts else 'aucune' log_info(f'[{name}] ── Résultat ────────────────────────────────────') log_info(f'[{name}] Menaces totales : {n_threats:>6} ({tl_str})') log_info(f'[{name}] Anomalies IF : {n_anomalies:>6} (seuil={effective_threshold:.4f})') log_info(f'[{name}] Navigateurs légit. : {n_legit_browser:>6}') log_info(f'[{name}] Anubis DENY (forcé) : {n_deny:>6}') log_info(f'[{name}] Sessions scorées : {len(all_scored):>6} (→ ml_all_scores)') return threats, all_scored