feat(bot-detector): upgrade to state-of-the-art detection pipeline

- Fix UnboundLocalError on global _consecutive_failures/_service_healthy
- Add SQL identifier validation for DB names at startup
- Replace Z-score drift detection with KS test (scipy.stats.ks_2samp)
- Replace DBSCAN with HDBSCAN (adaptive clustering, no epsilon needed)
- Fix NaN→0 blanket imputation with per-feature median/sentinel strategy
- Add 80/20 temporal train/validation split with offline metrics logging
- Integrate thesis §5 features from view_thesis_features_1h:
  path_transition_entropy, cadence_cv, burst/pause ratios,
  host_diversity, host_sweep_speed, host_coverage_uniformity,
  ja4_drift_ratio (Complet model only)
- Add SOC feedback loop: read classifications from audit_logs,
  reclassify FP IPs as human, exclude TP IPs from baseline
- Update dependencies: clickhouse-connect 0.8.12, scikit-learn 1.6.1,
  pandas 2.2.3, shap 0.47.2, add scipy>=1.14, hdbscan>=0.8.38

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 02:09:18 +02:00
parent 6d02f21c1e
commit 3ae8c7d9c9
2 changed files with 205 additions and 20 deletions

View File

@ -23,6 +23,12 @@ import clickhouse_connect
from logging.handlers import RotatingFileHandler
from http.server import HTTPServer, BaseHTTPRequestHandler
from sklearn.ensemble import IsolationForest
try:
import hdbscan as _hdbscan
HDBSCAN_AVAILABLE = True
except ImportError:
HDBSCAN_AVAILABLE = False
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import warnings
@ -60,6 +66,12 @@ def _require_float(name, default, lo=None, hi=None):
DB = os.getenv('CLICKHOUSE_DB_PROCESSING', os.getenv('CLICKHOUSE_DB', 'ja4_processing'))
DB_LOGS = os.getenv('CLICKHOUSE_DB_LOGS', 'ja4_logs')
import re
_SAFE_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
for _db_name, _db_val in [('CLICKHOUSE_DB_PROCESSING', DB), ('CLICKHOUSE_DB_LOGS', DB_LOGS)]:
if not _SAFE_IDENTIFIER_RE.match(_db_val):
raise SystemExit(f"[CONFIG] {_db_name}={_db_val!r} invalide — doit être un identifiant SQL valide.")
CONTAMINATION = _require_float('ISOLATION_CONTAMINATION', 0.001, 0, 0.5)
ANOMALY_THRESHOLD = _require_float('ANOMALY_THRESHOLD', -0.05)
LOG_FILE = os.getenv('BOT_DETECTOR_LOG', '/var/log/bot_detector/decisions.jsonl')
@ -81,6 +93,8 @@ ANOMALY_PERCENTILE = int(os.getenv('ANOMALY_PERCENTILE', '5'))
# A3 — Analyse multi-fenêtres
ENABLE_MULTIWINDOW = os.getenv('ENABLE_MULTIWINDOW', 'false').lower() == 'true'
MULTIWINDOW_VIEW = os.getenv('MULTIWINDOW_VIEW', 'view_ai_features_24h')
if not _SAFE_IDENTIFIER_RE.match(MULTIWINDOW_VIEW):
raise SystemExit(f"[CONFIG] MULTIWINDOW_VIEW={MULTIWINDOW_VIEW!r} invalide — doit être un identifiant SQL valide.")
# A4 — Explainabilité SHAP
ENABLE_SHAP = SHAP_AVAILABLE and os.getenv('ENABLE_SHAP', 'true').lower() == 'true'
@ -107,7 +121,9 @@ STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
# B features TLS/TCP : indisponibles pour trafic non-corrélé
'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
# L4 uniquement : TTL et window scale indisponibles sans capture TCP
'avg_ttl', 'ttl_std', 'no_window_scale_ratio'],
'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
# §5.5 JA4 Drift nécessite une corrélation fiable pour le JA4
'ja4_drift_ratio'],
}
TRAINING_HISTORY_FILE = os.path.join(MODEL_DIR, 'training_history.jsonl')
@ -287,9 +303,22 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
version_id = datetime.now().strftime('%Y%m%d_%H%M%S')
log_info(f"[{name}] Entraînement version {version_id} sur {len(human_baseline)} sessions humaines... (contamination={CONTAMINATION})")
X = human_baseline[features].replace([np.inf, -np.inf], np.nan).fillna(0)
X = human_baseline[features].replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())
# Validation split : réserver 20% pour évaluation offline
val_size = max(1, int(len(X) * 0.2))
X_train = X.iloc[:-val_size]
X_val = X.iloc[-val_size:]
model = IsolationForest(n_estimators=300, contamination=CONTAMINATION, random_state=42, n_jobs=-1)
model.fit(X)
model.fit(X_train)
# Évaluation offline : score moyen sur la validation (devrait être > 0 pour du trafic humain)
val_scores = model.decision_function(X_val)
val_mean_score = float(np.mean(val_scores))
val_anomaly_rate = float(np.mean(val_scores < 0))
log_info(f"[{name}] Validation : score moyen={val_mean_score:.4f}, taux anomalie={val_anomaly_rate:.2%} ({len(X_val)} échantillons)")
# A1 — Sauvegarder les statistiques de distribution de la baseline pour la détection de dérive future
baseline_stats = {
@ -307,7 +336,12 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
'human_samples': len(human_baseline), 'contamination': CONTAMINATION,
'threshold': ANOMALY_THRESHOLD, 'features': features,
'model_name': name, 'previous_version': previous_version,
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats
'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats,
'validation': {
'val_size': len(X_val), 'train_size': len(X_train),
'val_mean_score': round(val_mean_score, 4),
'val_anomaly_rate': round(val_anomaly_rate, 4),
}
}
with open(new_meta_path, 'w') as f: json.dump(new_meta, f, indent=2)
with open(_current_pointer_path(name), 'w') as f: f.write(version_id)
@ -327,11 +361,47 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_drift_score(baseline_stats: dict, current_baseline: pd.DataFrame, features: list) -> float:
"""
Compare la distribution actuelle de la baseline humaine avec celle utilisée à l'entraînement.
Utilise un test de Kolmogorov-Smirnov par feature. Retourne la fraction de features déroutantes.
"""Compare la distribution actuelle de la baseline humaine avec celle de l'entraînement.
Utilise le test de Kolmogorov-Smirnov bilatéral par feature pour détecter
les changements de distribution (forme, moyenne, variance).
Retourne la fraction de features en dérive significative (p < 0.05).
Une valeur >= DRIFT_THRESHOLD déclenche un retraining forcé.
"""
if not baseline_stats or current_baseline.empty:
return 0.0
try:
from scipy.stats import ks_2samp
except ImportError:
# Fallback Z-score si scipy indisponible
return _compute_drift_score_zscore(baseline_stats, current_baseline, features)
drifted = 0
tested = 0
for feat in features:
if feat not in baseline_stats or feat not in current_baseline.columns:
continue
stats = baseline_stats[feat]
curr_values = current_baseline[feat].dropna()
if len(curr_values) < 30:
continue
# Reconstruire un échantillon synthétique de la distribution d'entraînement
# à partir des statistiques sauvegardées (mean, std, p25, p75)
trained_std = stats.get('std', 0)
if trained_std < 1e-9:
continue
# Générer un échantillon normal avec les mêmes paramètres
rng = np.random.default_rng(42)
synthetic_trained = rng.normal(stats['mean'], trained_std, size=len(curr_values))
_, p_value = ks_2samp(curr_values.values, synthetic_trained)
if p_value < 0.05:
drifted += 1
tested += 1
return drifted / max(tested, 1)
def _compute_drift_score_zscore(baseline_stats: dict, current_baseline: pd.DataFrame, features: list) -> float:
"""Fallback Z-score pour la détection de dérive quand scipy n'est pas disponible."""
if not baseline_stats or current_baseline.empty:
return 0.0
drifted = 0
@ -344,9 +414,7 @@ def _compute_drift_score(baseline_stats: dict, current_baseline: pd.DataFrame, f
trained_std = stats.get('std', 0)
if trained_std < 1e-9:
continue
# Z-score : écart entre la moyenne actuelle et celle de l'entraînement
z = abs(curr_mean - stats['mean']) / trained_std
# Un z > 2 indique une dérive significative de la distribution
if z > 2.0:
drifted += 1
tested += 1
@ -494,8 +562,10 @@ def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
# ═══════════════════════════════════════════════════════════════════════════════
def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
"""
A8 : Applique DBSCAN sur les features normalisées des anomalies.
"""A8 : Applique HDBSCAN (ou DBSCAN en fallback) sur les features normalisées des anomalies.
HDBSCAN est préféré car il détermine automatiquement le nombre de clusters
et la densité optimale (pas de paramètre eps à régler manuellement).
Ajoute une colonne campaign_id : 1 = IP isolée, ≥0 = identifiant de campagne coordonnée.
"""
anomalies = anomalies.copy()
@ -505,13 +575,22 @@ def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
try:
X = anomalies[features].replace([np.inf, -np.inf], np.nan).fillna(0)
X_scaled = StandardScaler().fit_transform(X)
if HDBSCAN_AVAILABLE:
clusterer = _hdbscan.HDBSCAN(
min_cluster_size=CLUSTERING_MIN_SAMPLES,
min_samples=max(2, CLUSTERING_MIN_SAMPLES - 1),
cluster_selection_method='eom'
)
labels = clusterer.fit_predict(X_scaled)
else:
labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(X_scaled)
anomalies['campaign_id'] = labels
n_campaigns = len(set(labels)) - (1 if -1 in labels else 0)
algo = 'HDBSCAN' if HDBSCAN_AVAILABLE else 'DBSCAN'
if n_campaigns > 0:
log_info(f"[DBSCAN] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
log_info(f"[{algo}] {n_campaigns} campagne(s) détectée(s) parmi {len(anomalies)} anomalies.")
except Exception as e:
log_info(f"[DBSCAN] Erreur de clustering: {e}")
log_info(f"[Clustering] Erreur de clustering: {e}")
anomalies['campaign_id'] = -1
return anomalies
@ -559,7 +638,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
unknown_traffic = unknown_traffic.copy()
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan).fillna(0)
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(X_test.median())
raw_scores = model.decision_function(X_test)
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
@ -700,6 +780,46 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
return threats, all_scored
# ═══════════════════════════════════════════════════════════════════════════════
# A5 — DÉDUPLICATION INTER-CYCLES AVEC TTL
# ═══════════════════════════════════════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════════════════════════
# FEEDBACK LOOP — Intégration des classifications SOC dans la baseline
# ═══════════════════════════════════════════════════════════════════════════════
ENABLE_FEEDBACK = os.getenv('ENABLE_FEEDBACK', 'true').lower() == 'true'
FEEDBACK_WINDOW_DAYS = int(os.getenv('FEEDBACK_WINDOW_DAYS', '7'))
def _load_soc_feedback(client) -> dict:
"""Charge les classifications SOC récentes pour ajuster la baseline.
Retourne un dict {src_ip: classification} où classification est
'true_positive', 'false_positive', 'suspicious', etc.
Les faux positifs sont exclus du scoring (considérés comme humains),
les vrais positifs sont exclus de la baseline humaine.
"""
if not ENABLE_FEEDBACK:
return {}
try:
feedback_df = client.query_df(
f"SELECT entity_id AS src_ip, "
f" argMax(JSONExtractString(details, 'classification'), timestamp) AS classification "
f"FROM {DB}.audit_logs "
f"WHERE action = 'create_classification' "
f" AND entity_type = 'ip' "
f" AND timestamp >= now() - INTERVAL {FEEDBACK_WINDOW_DAYS} DAY "
f"GROUP BY entity_id"
)
if feedback_df is None or feedback_df.empty:
return {}
result = dict(zip(feedback_df['src_ip'], feedback_df['classification']))
log_info(f"[Feedback] {len(result)} classification(s) SOC chargées ({FEEDBACK_WINDOW_DAYS}j).")
return result
except Exception as e:
log_info(f"[Feedback] Impossible de charger les classifications SOC : {e}")
return {}
# ═══════════════════════════════════════════════════════════════════════════════
# A5 — DÉDUPLICATION INTER-CYCLES AVEC TTL
# ═══════════════════════════════════════════════════════════════════════════════
@ -748,7 +868,6 @@ def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label']:
if col in df.columns:
df[col] = df[col].fillna('').astype(str)
df.fillna(0, inplace=True)
# ── Features numériques dérivées des labels Anubis (pour IsolationForest) ──
# anubis_is_flagged : 1 si le trafic est marqué WEIGH/CHALLENGE par Anubis
@ -758,6 +877,20 @@ def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
(~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
).astype(int)
# ── Imputation intelligente des valeurs manquantes ──
# Les features binaires (0/1) et les ratios utilisent -1 comme sentinelle pour "inconnu"
# afin de ne pas biaiser le modèle (0 = valeur légitime pour beaucoup de features).
binary_features = {
'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch',
'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
}
for col in df.columns:
if col in binary_features:
df[col] = df[col].fillna(-1)
elif df[col].dtype in ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32'):
df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(df[col].median())
return df
@ -773,6 +906,7 @@ def fetch_and_analyze():
insère les scores dans ml_all_scores et les anomalies dans ml_detected_anomalies.
Met à jour _service_healthy et _consecutive_failures en cas d'échec de requête.
"""
global _consecutive_failures, _service_healthy
cycle_id = datetime.now().strftime('%Y%m%d_%H%M%S')
log_info('=== Lancement cycle IA ===')
@ -796,6 +930,26 @@ def fetch_and_analyze():
log_info('Aucun trafic trouvé.')
return
# ── Enrichissement avec les features avancées de la thèse §5 ─────────────
try:
df_thesis = client.query_df(f'SELECT * FROM {DB}.view_thesis_features_1h')
if df_thesis is not None and not df_thesis.empty:
df_thesis.columns = [c.split('.')[-1] for c in df_thesis.columns]
df.columns = [c.split('.')[-1] for c in df.columns]
thesis_cols = [c for c in df_thesis.columns if c not in ('window_start', 'src_ip', 'ja4', 'host')]
df = df.merge(
df_thesis, on=['window_start', 'src_ip', 'ja4', 'host'],
how='left', suffixes=('', '_thesis')
)
for col in thesis_cols:
if col in df.columns:
df[col] = df[col].fillna(df[col].median() if df[col].notna().any() else 0)
log_info(f'[Thèse §5] {len(df_thesis)} sessions enrichies avec {len(thesis_cols)} features avancées.')
else:
log_info('[Thèse §5] view_thesis_features_1h vide — features avancées indisponibles.')
except Exception as e:
log_info(f'[Thèse §5] view_thesis_features_1h inaccessible : {e} — features avancées ignorées.')
df = _preprocess_df(df)
log_decision('CYCLE_START', cycle_id, '', {
@ -815,6 +969,26 @@ def fetch_and_analyze():
except Exception:
recurrence_map = {}
# ── Feedback SOC : ajuster la baseline selon les classifications humaines ─
soc_feedback = _load_soc_feedback(client)
if soc_feedback:
fp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('false_positive', 'legitimate')}
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
if fp_ips:
# Les faux positifs confirmés rejoignent le pool humain
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human')
df.loc[mask_fp, 'asn_label'] = 'human'
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).")
if tp_ips:
# Les vrais positifs confirmés sont exclus de la baseline humaine
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human')
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
log_decision('SOC_FEEDBACK', cycle_id, '', {
'fp_ips': len(fp_ips), 'tp_ips': len(tp_ips),
'total_classifications': len(soc_feedback),
})
# ── Features par modèle (voir DOCUMENTATION.md §4) ───────────────────────
# Features communes aux deux modèles (L7 HTTP pur, disponibles correlated=0 et 1)
feats = [
@ -833,6 +1007,13 @@ def fetch_and_analyze():
'anubis_is_flagged',
# HTTP : header incomplet et usage HTTP plain (disponibles pour les deux modèles)
'missing_accept_enc_ratio', 'http_scheme_ratio',
# ── Thèse §5 : features avancées (optionnelles — ignorées si indisponibles) ──
# §5.1 — Entropie de séquence de chemins
'path_transition_entropy',
# §5.3 — Cadence inter-requêtes
'cadence_cv', 'burst_ratio', 'pause_ratio',
# §5.8 — Cross-domain (par IP, sans décomposition host)
'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
]
# Features supplémentaires pour le modèle Complet (nécessitent des données TCP/TLS)
feats_complet = feats + [
@ -841,6 +1022,8 @@ def fetch_and_analyze():
'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
# TTL fingerprinting OS + TCP window scale (L4 uniquement)
'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
# §5.5 — Dérive JA4 intra-session (nécessite corrélation pour JA4 fiable)
'ja4_drift_ratio',
]
# ── Analyse fenêtre 1h ────────────────────────────────────────────────────

View File

@ -1,6 +1,8 @@
clickhouse-connect==0.8.0
pandas==2.2.0
scikit-learn==1.4.0
shap==0.44.1
clickhouse-connect==0.8.12
pandas==2.2.3
scikit-learn==1.6.1
shap==0.47.2
scipy>=1.14
hdbscan>=0.8.38
pyyaml>=6.0
ja4-common @ file:///app/shared/ja4_common