Files
ja4-platform/services/bot-detector/bot_detector/pipeline.py
Jacquin Antoine 7894d39f1c feat(ml): replace logistic regression with MLP fusion and KS drift with ADWIN online learning
Replace the LogisticRegression meta-learner with a PyTorch MetaFusionMLP
(Linear(3,16)->BN->ReLU->Dropout->Linear(16,1)->Sigmoid) for non-linear
fusion of EIF, NF, and XGBoost scores. Replace KS-test + quantile digest
drift detection with ADWIN (adaptive sliding window, Hoeffding bound).
Replace weekly XGBoost batch retraining with River HoeffdingAdaptiveTree
for incremental online learning (learn_one per cycle). Update all thesis
documentation sections (2.4.2c, 2.4.3, 3.8, discussion, conclusion).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-13 16:32:34 +02:00

474 lines
28 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Pipeline de détection semi-supervisée.
Fonction principale run_semi_supervised_logic() : triage, scoring EIF, AE, XGB,
classification navigateur, détection d'anomalies, SHAP et clustering.
"""
import numpy as np
import pandas as pd
from .config import (
DB, CONTAMINATION, AE_WEIGHT, XGB_WEIGHT, RECURRENCE_WEIGHT,
ANOMALY_THRESHOLD, ANOMALY_PERCENTILE, ENABLE_CLUSTERING,
ENABLE_SHAP, EIF_AVAILABLE, TORCH_AVAILABLE, XGB_AVAILABLE,
BROWSER_CONFIDENCE_THRESHOLD, BROWSER_COHORT_RATIO,
MIN_VALID_FEATURE_RATIO, STRUCTURAL_EXCLUDED_FEATURES,
)
from .log import log_info, log_decision
from .infra import score_to_threat_level, get_client
from .models import load_or_train_model, load_or_train_xgb, TrafficNormalizingFlow
from .scoring import (
validate_features, compute_adaptive_threshold, normalize_scores,
compute_shap_top_features, build_reason, cluster_anomalies,
compute_exiffi_importance, compute_ae_feature_errors, get_meta_learner,
FINGERPRINT_COHERENCE_THRESHOLD,
)
from .browser_matcher import log_dual_mode_comparison, BROWSER_MATCHER_ENABLED, BROWSER_MATCHER_REPLACE
# ═══════════════════════════════════════════════════════════════════════════════
# ANALYSE SEMI-SUPERVISÉE
# ═══════════════════════════════════════════════════════════════════════════════
def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    """Run the semi-supervised detection pipeline on one traffic subset.

    The traffic is split three ways: known bots (non-empty ``bot_name``),
    Anubis ``ALLOW`` bots, and everything else ("unknown" traffic). The
    anomaly model returned by ``load_or_train_model`` is fitted or loaded
    from the ISP-labelled baseline, the unknown traffic is scored, the
    adaptive threshold / recurrence penalty / browser classification /
    explainability / clustering steps are applied, and the result is
    assembled into two DataFrames.

    Args:
        df: Session-level DataFrame. The code reads at least ``bot_name``,
            ``anubis_bot_action``, ``anubis_bot_name``, ``asn_label``,
            ``src_ip`` and ``ja4``; other columns are optional.
        features: Candidate feature column names, validated by
            ``validate_features`` before use.
        name: Model / traffic-subset name, used in logs and stored in
            ``model_name``.
        cycle_id: Identifier of the current cycle, forwarded to
            ``log_decision``.
        recurrence_map: Mapping ``src_ip`` -> number of previous detections,
            used with ``Series.map``.

    Returns:
        ``(threats, all_scored)``. ``threats`` concatenates anomalies, known
        bots, Anubis ALLOW and Anubis DENY rows; ``all_scored`` holds every
        scored session plus the Anubis ALLOW rows. Both are empty DataFrames
        when feature validation fails or when the human baseline has fewer
        than 500 sessions.

    Side effects:
        Writes to the info log and the decision log (``log_info`` /
        ``log_decision``).
    """
    # ── Triage ──────────────────────────────────────────────────────────────
    # 1. Known bots (dictionary matches) are excluded from model scoring.
    known_bots = df[df['bot_name'] != ''].copy()
    rest = df[df['bot_name'] == ''].copy()
    # 2. Anubis ALLOW rows are legitimate bots, also excluded from scoring.
    anubis_allow = rest[rest['anubis_bot_action'] == 'ALLOW'].copy()
    # 3. Everything else is scored by the model. DENY rows get a real score
    #    for severity but are ALWAYS reported as threats, whatever the
    #    threshold says.
    unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
    human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
    log_info(f'[{name}] ── Triage ──────────────────────────────────────')
    log_info(f'[{name}] Total sessions : {len(df):>6}')
    log_info(f'[{name}] Bots connus (dict) : {len(known_bots):>6}')
    log_info(f'[{name}] Anubis ALLOW : {len(anubis_allow):>6}')
    log_info(f'[{name}] Trafic à scorer (IF) : {len(unknown_traffic):>6}')
    log_info(f'[{name}] Baseline ISP (human) : {len(human_baseline):>6} (seuil min=500)')

    # §3 — Drop ISP sessions whose fingerprint coherence is below the
    # threshold so that they do not contaminate the human baseline.
    if 'fingerprint_coherence_score' in human_baseline.columns:
        low_coh = human_baseline['fingerprint_coherence_score'] < FINGERPRINT_COHERENCE_THRESHOLD
        n_low_coh = int(low_coh.sum())
        if n_low_coh > 0:
            human_baseline = human_baseline[~low_coh]
            log_info(
                f'[{name}] Baseline après filtre cohérence (<{FINGERPRINT_COHERENCE_THRESHOLD}) : '
                f'{len(human_baseline):>6} ({n_low_coh} exclues)'
            )
            log_decision('LOW_COHERENCE_EXCLUDED', cycle_id, name, {
                'n_excluded': n_low_coh, 'threshold': FINGERPRINT_COHERENCE_THRESHOLD,
                'baseline_after': len(human_baseline),
            })

    # A7 — Validate the features before doing any work.
    valid_features = validate_features(df, features, name, cycle_id)
    if valid_features is None:
        return pd.DataFrame(), pd.DataFrame()

    if len(human_baseline) < 500:
        log_info(f"[{name}] ⚠ Données humaines insuffisantes ({len(human_baseline)} < 500) — cycle ignoré.")
        log_info(f"[{name}] Distribution asn_label dans le trafic à scorer :")
        if 'asn_label' in unknown_traffic.columns:
            for label, cnt in unknown_traffic['asn_label'].value_counts().head(8).items():
                log_info(f"[{name}] {label:>15} : {cnt}")
        log_decision('SKIPPED_LOW_DATA', cycle_id, name, {
            'human_count': len(human_baseline), 'unknown_count': len(unknown_traffic)
        })
        return pd.DataFrame(), pd.DataFrame()

    # ── Anomaly model ───────────────────────────────────────────────────────
    log_info(f'[{name}] ── Modèle EIF ─────────────────────────────────')
    # Show at most five feature names; mark the list as truncated otherwise.
    features_preview = ', '.join(valid_features[:5])
    if len(valid_features) > 5:
        features_preview += '…'
    log_info(f'[{name}] Features validées : {len(valid_features)}/{len(features)} ({features_preview})')

    # A1 — Concept-drift handling lives inside load_or_train_model.
    model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id)
    # Score with the model's own feature list: it may differ from
    # valid_features after pruning or after loading a persisted model.
    scoring_features = [f for f in model_features if f in unknown_traffic.columns]
    unknown_traffic = unknown_traffic.copy()
    X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
    X_test = X_test.fillna(X_test.median())
    raw_scores = model.decision_function(X_test)
    # isotree returns scores in [0, 1] (0.5 = boundary, > 0.5 = anomalous),
    # sklearn returns scores centred on 0 (< 0 = anomalous). Convert isotree
    # output to the sklearn convention: sklearn_equiv = 0.5 - isotree_score.
    if EIF_AVAILABLE:
        raw_scores = 0.5 - raw_scores
    log_info(f'[{name}] Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')

    # Blend EIF with the normalizing flow when one is available:
    # combined = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * nf_norm.
    if ae_model is not None and AE_WEIGHT > 0:
        try:
            nf_neg_ll = ae_model.score_samples(X_test.values)  # -log p(x)
            nf_norm = normalize_scores(-nf_neg_ll)
            eif_norm = normalize_scores(raw_scores)
            combined_norm = (1 - AE_WEIGHT) * eif_norm + AE_WEIGHT * nf_norm
            # Column name kept for backward compatibility.
            unknown_traffic['ae_recon_error'] = nf_neg_ll
            unknown_traffic['anomaly_score'] = combined_norm
            log_info(f"[{name}] Score combiné EIF+NF (α={AE_WEIGHT}): nf_mean={nf_neg_ll.mean():.6f}")
        except Exception as exc:
            # Best effort: fall back to the EIF score alone.
            log_info(f"[{name}] NF scoring échoué : {exc} — utilisation EIF seul.")
            unknown_traffic['ae_recon_error'] = 0.0
            unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
    else:
        unknown_traffic['ae_recon_error'] = 0.0
        unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)

    # raw_anomaly_score is the value compared with the threshold and mapped
    # to a threat level.
    unknown_traffic['raw_anomaly_score'] = raw_scores
    unknown_traffic['model_name'] = name

    # ── Supervised model (third voice) ──────────────────────────────────────
    unknown_traffic['xgb_prob'] = 0.0
    xgb_model_ref = None  # kept for the SHAP explainer further down
    if XGB_AVAILABLE and XGB_WEIGHT > 0:
        try:
            xgb_client = get_client()
            xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
            if xgb_model is not None and xgb_feats is not None:
                xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
                X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
                # Models exposing predict_proba_many take a DataFrame and
                # return one column per class; others take an ndarray.
                if hasattr(xgb_model, 'predict_proba_many'):
                    proba_df = xgb_model.predict_proba_many(X_xgb[xgb_cols])
                    xgb_probs = proba_df[1].values if 1 in proba_df.columns else np.zeros(len(X_xgb))
                else:
                    xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
                unknown_traffic['xgb_prob'] = xgb_probs
                xgb_model_ref = xgb_model
                log_info(f"[{name}] Supervisé : score moyen={xgb_probs.mean():.4f}")
        except Exception as exc:
            # Best effort: keep xgb_prob at 0.0 and carry on with EIF+NF.
            log_info(f"[{name}] Supervisé scoring échoué : {exc} — EIF+NF seuls.")

    # ── §8 Meta-learner fusion ──────────────────────────────────────────────
    meta_learner = get_meta_learner(name)
    # NOTE(review): at inference the "EIF" input is the current anomaly_score
    # (already blended with NF when NF is active), whereas training below
    # uses normalize_scores(raw_scores). Confirm this asymmetry is intended.
    eif_norm_arr = unknown_traffic['anomaly_score'].values.copy()
    ae_norm_arr = normalize_scores(-unknown_traffic['ae_recon_error'].values)
    xgb_prob_arr = unknown_traffic['xgb_prob'].values
    final_scores = meta_learner.predict(eif_norm_arr, ae_norm_arr, xgb_prob_arr)
    unknown_traffic['anomaly_score'] = final_scores
    if meta_learner.is_trained:
        log_info(
            f"[{name}] §8 MetaFusionMLP actif ({meta_learner._n_samples} labels) — "
            f"score moyen={final_scores.mean():.4f}"
        )
    elif unknown_traffic['xgb_prob'].mean() > 0:
        log_info(f"[{name}] §8 Poids fixes EIF+NF+XGB (MetaFusionMLP pas encore entraîné).")

    # §8 — Train the meta-learner on the labels of the current cycle.
    try:
        labeled_df = meta_learner.build_labels_from_df(unknown_traffic)
        if not labeled_df.empty:
            unknown_traffic_labeled = labeled_df.copy()
            # NOTE(review): these arrays have one entry per unknown_traffic
            # row; if build_labels_from_df returns a row subset, the
            # assignment raises and training is skipped by the handler below.
            unknown_traffic_labeled['eif_norm'] = normalize_scores(raw_scores)
            unknown_traffic_labeled['ae_norm'] = ae_norm_arr
            if meta_learner.fit(unknown_traffic_labeled):
                log_decision('META_LEARNER_TRAINED', cycle_id, name, meta_learner._weights_log)
    except Exception as exc:
        log_info(f"[{name}] MetaLearner entraînement échoué : {exc}")

    # ── §7 ExIFFI feature importance ────────────────────────────────────────
    exiffi_tops: list = [{} for _ in range(len(unknown_traffic))]
    if len(unknown_traffic) > 0:
        try:
            exiffi_tops = compute_exiffi_importance(model, X_test, scoring_features)
        except Exception as exc:
            # Best effort: keep the empty placeholders but leave a trace
            # instead of swallowing the failure silently.
            log_info(f"[{name}] ExIFFI échoué : {exc} — importances ignorées.")

    # A2 — Adaptive threshold computed on the RAW scores (same scale as
    # ANOMALY_THRESHOLD).
    effective_threshold = compute_adaptive_threshold(raw_scores)
    log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")

    # A6 — Penalise recurring IPs on the raw score before thresholding.
    recurrence_counts = unknown_traffic['src_ip'].map(recurrence_map).fillna(0)
    if RECURRENCE_WEIGHT > 0:
        penalty = np.log1p(recurrence_counts.values) * RECURRENCE_WEIGHT
        unknown_traffic['raw_anomaly_score'] = unknown_traffic['raw_anomaly_score'] - penalty

    # Every scored session gets a threat level (needed for ml_all_scores).
    unknown_traffic['threat_level'] = unknown_traffic['raw_anomaly_score'].apply(score_to_threat_level)
    unknown_traffic['recurrence'] = recurrence_counts.astype(int) + 1
    unknown_traffic['campaign_id'] = -1

    # DENY rows keep their model score but their threat level is forced.
    deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
    unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'

    # ── A9 Legitimate-browser classification ────────────────────────────────
    # A session qualifies when browser_confidence reaches the threshold, a
    # browser family is known, the model sees no threat (NORMAL/LOW) and the
    # session is not an Anubis DENY.
    _bconf = unknown_traffic.get('browser_confidence', pd.Series(0, index=unknown_traffic.index)).fillna(0)
    _ibf = unknown_traffic.get('inferred_browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str)
    low_threat_mask = unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])
    browser_legit_mask = (
        (_bconf >= BROWSER_CONFIDENCE_THRESHOLD) &
        (_ibf != '') &
        low_threat_mask &
        (~deny_mask)
    )

    # JA4 cohort propagation: when the share of already-classified sessions
    # of a JA4 reaches BROWSER_COHORT_RATIO, extend the classification to the
    # remaining low-threat, non-DENY sessions of that JA4.
    if browser_legit_mask.any():
        ja4_col = unknown_traffic['ja4']
        legit_per_ja4 = ja4_col[browser_legit_mask].value_counts()
        total_per_ja4 = ja4_col.value_counts()
        ratio_per_ja4 = (legit_per_ja4 / total_per_ja4).dropna()
        cohort_ja4s = set(ratio_per_ja4[ratio_per_ja4 >= BROWSER_COHORT_RATIO].index)
        if cohort_ja4s:
            cohort_mask = (
                ja4_col.isin(cohort_ja4s) &
                (~browser_legit_mask) &
                low_threat_mask &
                (~deny_mask)
            )
            browser_legit_mask = browser_legit_mask | cohort_mask
            n_cohort = cohort_mask.sum()
            if n_cohort > 0:
                log_info(f"[{name}] Propagation cohorte JA4 : {n_cohort} sessions supplémentaires ({len(cohort_ja4s)} JA4)")

    if browser_legit_mask.any():
        unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER'
        # Cohort-propagated sessions may have no family of their own.
        legit_families = _ibf[browser_legit_mask]
        _family_disp = legit_families.where(legit_families != '', 'Unknown')
        unknown_traffic.loc[browser_legit_mask, 'reason'] = (
            '[Navigateur légitime] ' + _family_disp +
            ' (confiance=' + _bconf[browser_legit_mask].round(2).astype(str) + ')'
        )
        n_legit = browser_legit_mask.sum()
        families = legit_families.value_counts().to_dict()
        # Mean of each available axis column, logged for diagnosis.
        ax_means = {}
        for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
                   'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']:
            col = unknown_traffic.get(ax, None)
            if col is not None:
                ax_means[ax.replace('axis_', '')] = round(float(col[browser_legit_mask].mean()), 3)
        log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}")
        log_info(f"[{name}] Axes moyens : {ax_means}")
        log_decision('LEGITIMATE_BROWSER', cycle_id, name, {
            'count': int(n_legit), 'families': families,
            'mean_confidence': round(float(_bconf[browser_legit_mask].mean()), 3),
            'axis_means': ax_means,
        })

    # ── A9b Browser matcher (dual mode / replace mode) ──────────────────────
    # Dual mode only logs the comparison; with BROWSER_MATCHER_REPLACE the
    # matcher decision is applied to the scored traffic.
    if BROWSER_MATCHER_ENABLED and 'bm_decision' in unknown_traffic.columns:
        log_dual_mode_comparison(unknown_traffic, cycle_id, name)
        if BROWSER_MATCHER_REPLACE:
            # Never bypass an Anubis DENY: its threat level is forced above
            # and the A9 classification excludes DENY rows as well.
            bm_legit = (unknown_traffic['bm_decision'] == 'LEGITIMATE_BROWSER') & (~deny_mask)
            if bm_legit.any():
                unknown_traffic.loc[bm_legit, 'threat_level'] = 'LEGITIMATE_BROWSER'
                unknown_traffic.loc[bm_legit, 'reason'] = (
                    '[BrowserMatcher] '
                    + unknown_traffic.loc[bm_legit, 'bm_family'].fillna('Unknown')
                    + ' (score=' + unknown_traffic.loc[bm_legit, 'bm_score'].round(2).astype(str) + ')'
                )
                log_info(
                    f"[{name}][BrowserMatcher] {bm_legit.sum()} bypass(es) appliqué(s) "
                    f"(BROWSER_MATCHER_REPLACE=true)"
                )
            # Grey zone: scale the raw score by (1 - 0.5 * bm_score).
            bm_partial = unknown_traffic['bm_decision'] == 'PARTIAL'
            if bm_partial.any():
                partial_scores = unknown_traffic.loc[bm_partial, 'bm_score'].fillna(0.0)
                unknown_traffic.loc[bm_partial, 'raw_anomaly_score'] = (
                    unknown_traffic.loc[bm_partial, 'raw_anomaly_score']
                    * (1 - 0.5 * partial_scores.values)
                )

    # Snapshot of every scored session, taken before threshold filtering.
    all_scored = unknown_traffic.copy()

    # ── Known bots ──────────────────────────────────────────────────────────
    if not known_bots.empty:
        known_bots = known_bots.copy()
        known_bots['anomaly_score'] = 0.0
        known_bots['raw_anomaly_score'] = 0.0
        known_bots['ae_recon_error'] = 0.0
        known_bots['xgb_prob'] = 0.0
        known_bots['threat_level'] = 'KNOWN_BOT'
        known_bots['model_name'] = name
        known_bots['campaign_id'] = -1
        known_bots['reason'] = '[Identification] Bot légitime: ' + known_bots['bot_name']
        known_bots['recurrence'] = known_bots['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in known_bots.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('bot_name', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1))
            })

    # ── Anubis ALLOW: legitimate bots identified by Anubis rules ────────────
    if not anubis_allow.empty:
        anubis_allow = anubis_allow.copy()
        anubis_allow['anomaly_score'] = 0.0
        anubis_allow['raw_anomaly_score'] = 0.0
        anubis_allow['ae_recon_error'] = 0.0
        anubis_allow['xgb_prob'] = 0.0
        anubis_allow['threat_level'] = 'KNOWN_BOT'
        anubis_allow['bot_name'] = anubis_allow['anubis_bot_name']
        anubis_allow['model_name'] = name
        anubis_allow['campaign_id'] = -1
        # fillna('') as in the DENY branch, so a missing bot name does not
        # turn the whole reason into NaN.
        anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name'].fillna('')
        anubis_allow['recurrence'] = anubis_allow['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
        for _, row in anubis_allow.iterrows():
            log_decision('KNOWN_BOT', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })

    # ── Anubis DENY: scored by the model, always reported as threats ────────
    anubis_deny = unknown_traffic[deny_mask].copy()
    if not anubis_deny.empty:
        anubis_deny['reason'] = (
            '[Anubis DENY] ' + anubis_deny['anubis_bot_name'].fillna('')
            + ' | ' + anubis_deny['raw_anomaly_score'].apply(lambda s: f'IF={s:.4f}')
        )
        log_info(f"[{name}] Anubis DENY: {len(anubis_deny)} IP(s) scorées par IF "
                 f"(score moyen: {anubis_deny['raw_anomaly_score'].mean():.4f}).")
        for _, row in anubis_deny.iterrows():
            log_decision('ANUBIS_DENY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''),
                'anubis_bot_action': row.get('anubis_bot_action', ''),
                'anubis_bot_category': row.get('anubis_bot_category', ''),
                'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
                'recurrence': int(row.get('recurrence', 1)),
            })

    # ── Threshold filtering ─────────────────────────────────────────────────
    # Only non-DENY, non-legitimate-browser traffic is compared with the
    # threshold (raw score, recurrence penalty included).
    non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')]
    anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
    if not anomalies.empty:
        log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
        anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1

        # A4 — SHAP explainability. X_anomalies is sliced from X_test, whose
        # columns are scoring_features, so those are the names to pass.
        X_anomalies = X_test.loc[anomalies.index]
        shap_tops = compute_shap_top_features(model, X_anomalies, scoring_features,
                                              xgb_model=xgb_model_ref)

        # §7 — Look up the precomputed ExIFFI tops by session index.
        if len(exiffi_tops) == len(unknown_traffic):
            _exiffi_map = dict(zip(unknown_traffic.index, exiffi_tops))
            exiffi_for_anomalies = [_exiffi_map.get(idx, {}) for idx in anomalies.index]
        else:
            exiffi_for_anomalies = [{}] * len(anomalies)
        anomalies['reason'] = [
            build_reason(name, row, shap, exiffi)
            for (_, row), shap, exiffi
            in zip(anomalies.iterrows(), shap_tops, exiffi_for_anomalies)
        ]

        # A8 — Cluster the anomalies to surface coordinated campaigns.
        if ENABLE_CLUSTERING:
            anomalies = cluster_anomalies(anomalies, scoring_features, ae_model=ae_model)
            # P2 — Escalate HIGH to CRITICAL inside campaigns with at least
            # five members; negative campaign ids are not campaigns.
            if 'campaign_id' in anomalies.columns:
                cid_counts = anomalies['campaign_id'].value_counts()
                for cid, size in cid_counts.items():
                    if cid < 0 or size < 5:
                        continue
                    mask = (anomalies['campaign_id'] == cid) & (anomalies['threat_level'] == 'HIGH')
                    n_escalated = mask.sum()
                    if n_escalated > 0:
                        anomalies.loc[mask, 'threat_level'] = 'CRITICAL'
                        anomalies.loc[mask, 'reason'] = anomalies.loc[mask, 'reason'] + \
                            f' [Escalade campagne #{cid}, {size} IPs coordonnées]'
                        log_info(f"[{name}] Escalade campagne #{cid}: {n_escalated} IP(s) HIGH→CRITICAL ({size} membres)")

        anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
        for _, row in anomalies.iterrows():
            log_decision('ANOMALY', cycle_id, name, {
                'src_ip': row.get('src_ip', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
                'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
                'threat_level': row.get('threat_level', ''), 'recurrence': int(row.get('recurrence', 1)),
                'hit_velocity': round(float(row.get('hit_velocity', 0)), 2),
                'fuzzing_index': round(float(row.get('fuzzing_index', 0)), 2),
                'post_ratio': round(float(row.get('post_ratio', 0)), 3),
                'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
                'asn_detail': row.get('asn_detail', ''), 'asn_domain': row.get('asn_domain', ''),
                'country_code': row.get('country_code', ''), 'asn_label': row.get('asn_label', ''),
                'ja4': row.get('ja4', ''), 'host': row.get('host', ''),
                'correlated': int(row.get('correlated', 0)), 'campaign_id': int(row.get('campaign_id', -1)),
                'effective_threshold': round(effective_threshold, 4), 'reason': row.get('reason', '')
            })

    # ── Assemble the outputs ────────────────────────────────────────────────
    # pd.concat raises ValueError on an empty list, which happens on a clean
    # cycle (no anomaly, no known bot, no ALLOW, no DENY): return an empty
    # frame in that case instead of crashing.
    threat_parts = [
        part for part in (anomalies, known_bots, anubis_allow, anubis_deny)
        if not part.empty
    ]
    threats = pd.concat(threat_parts, ignore_index=True) if threat_parts else pd.DataFrame()

    # all_scored was captured before clustering (campaign_id all -1): copy
    # the campaign ids of the clustered anomalies back, aligned on the index.
    if not anomalies.empty and 'campaign_id' in anomalies.columns:
        cid_map = anomalies['campaign_id']
        matched = all_scored.index.isin(cid_map.index)
        if matched.any():
            all_scored.loc[matched, 'campaign_id'] = cid_map

    # Anubis ALLOW rows are not scored but must appear in ml_all_scores
    # (threat_level='KNOWN_BOT', anomaly_score=0.0) for traceability.
    if not anubis_allow.empty:
        all_scored = pd.concat([all_scored, anubis_allow], ignore_index=True)

    # ── Summary ─────────────────────────────────────────────────────────────
    n_threats = len(threats) if not threats.empty else 0
    n_anomalies = len(anomalies) if not anomalies.empty else 0
    n_legit_browser = int(browser_legit_mask.sum())
    n_deny = len(anubis_deny) if not anubis_deny.empty else 0
    tl_counts = threats['threat_level'].value_counts().to_dict() if not threats.empty else {}
    tl_str = ', '.join(f'{k}={v}' for k, v in sorted(tl_counts.items())) if tl_counts else 'aucune'
    log_info(f'[{name}] ── Résultat ────────────────────────────────────')
    log_info(f'[{name}] Menaces totales : {n_threats:>6} ({tl_str})')
    log_info(f'[{name}] Anomalies IF : {n_anomalies:>6} (seuil={effective_threshold:.4f})')
    log_info(f'[{name}] Navigateurs légit. : {n_legit_browser:>6}')
    log_info(f'[{name}] Anubis DENY (forcé) : {n_deny:>6}')
    log_info(f'[{name}] Sessions scorées : {len(all_scored):>6} (→ ml_all_scores)')
    return threats, all_scored