refactor(bot-detector): extract monolith into modular package
Split bot_detector.py (~1982 lines) into 10 focused modules:
- config.py: all configuration constants and optional imports
- log.py: logging utilities (log_info, log_decision, append_training_history)
- infra.py: ClickHouse client, health check HTTP server, shutdown
- browser.py: multifactorial browser identification (5 axes)
- scoring.py: drift detection, feature validation, SHAP, clustering
- models.py: EIF, Autoencoder, XGBoost model management
- preprocessing.py: data preprocessing and feature list definitions
- pipeline.py: core semi-supervised scoring loop
- cycle.py: main analysis cycle orchestration
- __main__.py: entry point with startup banner

Update Dockerfile to copy the package directory and use `python -m bot_detector`.
All 36 existing tests pass unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
110
services/bot-detector/bot_detector/preprocessing.py
Normal file
110
services/bot-detector/bot_detector/preprocessing.py
Normal file
@ -0,0 +1,110 @@
|
||||
"""Data preprocessing and feature lists.

Normalises the columns, enriches the data via multifactor browser
identification, and defines the feature lists used by each model.
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from .config import BROWSER_CONFIDENCE_THRESHOLD
|
||||
from .log import log_info
|
||||
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# PER-MODEL FEATURE LISTS
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Common features (pure L7 HTTP, available for both correlated=0 and correlated=1).
# Order matters to any consumer that indexes columns positionally, so the
# sequence below is kept exactly as originally declared.
FEATURES = [
    'hits',
    'hit_velocity',
    'fuzzing_index',
    'post_ratio',
    'port_exhaustion_ratio',
    'orphan_ratio',
    'max_keepalives',
    'tcp_shared_count',
    'header_order_shared_count',
    'header_count',
    'has_accept_language',
    'has_cookie',
    'has_referer',
    'modern_browser_score',
    'ua_ch_mismatch',
    'ip_id_zero_ratio',
    'request_size_variance',
    'multiplexing_efficiency',
    'mss_mobile_mismatch',
    'asset_ratio',
    'direct_access_ratio',
    'is_ua_rotating',
    'distinct_ja4_count',
    'src_port_density',
    'ja4_asn_concentration',
    'ja4_country_concentration',
    'is_rare_ja4',
    'header_order_confidence',
    'distinct_header_orders',
    'temporal_entropy',
    'path_diversity_ratio',
    'url_depth_variance',
    'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio',
    'sec_fetch_absence_rate',
    'generic_accept_ratio',
    'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multifactor browser identification
    'is_known_browser',
    'browser_consistency_score',
    'browser_confidence',
    'axis_ja4_known',
    'axis_ja4_struct',
    'axis_http_modern',
    'axis_nav_behavior',
    'axis_tls_coherence',
    # HTTP
    'missing_accept_enc_ratio',
    'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv',
    'burst_ratio',
    'pause_ratio',
    'lag1_autocorrelation',
    'benford_deviation',
    'host_diversity',
    'host_sweep_speed',
    'host_coverage_uniformity',
]
|
||||
|
||||
# Additional features for the "Complet" (full) model, which requires TCP/TLS data.
# The list is the common FEATURES followed by the TCP/TLS-specific ones.
FEATURES_COMPLET = [
    *FEATURES,
    'tcp_jitter_variance',
    'alpn_http_mismatch',
    'is_alpn_missing',
    'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio',
    'syn_timing_cv',
    'tls12_ratio',
    'ip_df_variance',
    # OS TTL fingerprinting + TCP window scale
    'avg_ttl',
    'ttl_std',
    'no_window_scale_ratio',
    # §5.5 — intra-session JA4 drift
    'ja4_drift_ratio',
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# PREPROCESSING
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise columns, add browser features and fill missing values.

    Shared by the 1h and 24h analyses. The frame is modified in place and
    also returned.

    Processing steps, in order:

    1. Keep only the part of each column name after the last ``.``.
    2. Cast the identifier/label columns that are present to ``str``,
       replacing missing values with ``''``.
    3. Add the multifactor browser-identification columns
       (``inferred_browser_family``, ``browser_confidence``, the five
       ``axis_*`` columns) and the backward-compatible
       ``is_known_browser`` / ``browser_consistency_score`` columns.
    4. Derive ``anubis_is_flagged``.
    5. Impute: binary features get ``-1`` for missing values; other numeric
       columns have ``±inf`` turned into missing values and are then filled
       with the column median.

    Args:
        df: Raw feature frame. Column names are expected to be strings.

    Returns:
        The same ``DataFrame`` object, preprocessed.
    """
    df.columns = [c.split('.')[-1] for c in df.columns]

    text_columns = [
        'src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
        'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
        'country_code', 'asn_label',
    ]
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multifactor browser identification ───────────────────────────────
    browser_axes = _compute_browser_axes(df)
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))

    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)

    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence']:
        df[ax] = browser_axes[ax]

    # Backward compatibility: legacy columns derived from the new axes.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # Vectorised threshold checks; a missing axis value compares False and so
    # contributes 0, exactly like the previous row-wise lambda did.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is set and the
    # action is neither ALLOW, DENY nor empty.
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
        (~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Smart imputation
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
    }
    # Kept as a tuple: membership relies on dtype == 'name' equality.
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')
    for col in df.columns:
        if col in binary_features:
            # -1 marks "unknown" so it stays distinct from a real 0/1 value.
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Drop ±inf BEFORE taking the median; otherwise an inf-heavy
            # column yields an infinite median and inf is written back.
            finite = df[col].replace([np.inf, -np.inf], np.nan)
            median = finite.median()
            # An all-missing column has no median; fall back to 0 so no NaN
            # survives the imputation step.
            df[col] = finite.fillna(0 if pd.isna(median) else median)

    return df
|
||||
Reference in New Issue
Block a user