Split bot_detector.py (~1982 lines) into 10 focused modules: - config.py: all configuration constants and optional imports - log.py: logging utilities (log_info, log_decision, append_training_history) - infra.py: ClickHouse client, health check HTTP server, shutdown - browser.py: multifactorial browser identification (5 axes) - scoring.py: drift detection, feature validation, SHAP, clustering - models.py: EIF, Autoencoder, XGBoost model management - preprocessing.py: data preprocessing and feature list definitions - pipeline.py: core semi-supervised scoring loop - cycle.py: main analysis cycle orchestration - __main__.py: entry point with startup banner Update Dockerfile to copy package directory and use python -m bot_detector. All 36 existing tests pass unchanged. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
111 lines
5.7 KiB
Python
111 lines
5.7 KiB
Python
"""Prétraitement des données et listes de features.
|
|
|
|
Normalise les colonnes, enrichit via l'identification multifactorielle des
|
|
navigateurs, et définit les listes de features pour chaque modèle.
|
|
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from .config import BROWSER_CONFIDENCE_THRESHOLD
|
|
from .log import log_info
|
|
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
# LISTES DE FEATURES PAR MODÈLE
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Common features (pure L7 HTTP, available for correlated=0 and 1)
FEATURES = [
    'hits',
    'hit_velocity',
    'fuzzing_index',
    'post_ratio',
    'port_exhaustion_ratio',
    'orphan_ratio',
    'max_keepalives',
    'tcp_shared_count',
    'header_order_shared_count',
    'header_count',
    'has_accept_language',
    'has_cookie',
    'has_referer',
    'modern_browser_score',
    'ua_ch_mismatch',
    'ip_id_zero_ratio',
    'request_size_variance',
    'multiplexing_efficiency',
    'mss_mobile_mismatch',
    'asset_ratio',
    'direct_access_ratio',
    'is_ua_rotating',
    'distinct_ja4_count',
    'src_port_density',
    'ja4_asn_concentration',
    'ja4_country_concentration',
    'is_rare_ja4',
    'header_order_confidence',
    'distinct_header_orders',
    'temporal_entropy',
    'path_diversity_ratio',
    'url_depth_variance',
    'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio',
    'sec_fetch_absence_rate',
    'generic_accept_ratio',
    'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multifactor browser identification
    'is_known_browser',
    'browser_consistency_score',
    'browser_confidence',
    'axis_ja4_known',
    'axis_ja4_struct',
    'axis_http_modern',
    'axis_nav_behavior',
    'axis_tls_coherence',
    # HTTP
    'missing_accept_enc_ratio',
    'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv',
    'burst_ratio',
    'pause_ratio',
    'lag1_autocorrelation',
    'benford_deviation',
    'host_diversity',
    'host_sweep_speed',
    'host_coverage_uniformity',
]
|
|
|
|
# Additional features for the "Complet" model (TCP/TLS data required)
FEATURES_COMPLET = FEATURES + [
    'tcp_jitter_variance',
    'alpn_http_mismatch',
    'is_alpn_missing',
    'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio',
    'syn_timing_cv',
    'tls12_ratio',
    'ip_df_variance',
    # TTL-based OS fingerprinting + TCP window scale
    'avg_ttl',
    'ttl_std',
    'no_window_scale_ratio',
    # §5.5 — intra-session JA4 drift
    'ja4_drift_ratio',
]
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
# PRÉTRAITEMENT
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize columns, add browser-identification features, impute gaps.

    Shared by the 1h and 24h analyses. The frame is modified in place and the
    same object is returned.

    Steps, in order:

    1. Keep only the part after the last dot of every column name
       (``t.src_ip`` -> ``src_ip``).
    2. Cast the identifier/label columns that are present to ``str``, with
       missing values replaced by ``''``.
    3. Add the multifactor browser-identification columns
       (``inferred_browser_family``, ``browser_confidence`` and the five
       ``axis_*`` columns) plus the backward-compatible ``is_known_browser``
       and ``browser_consistency_score``.
    4. Add ``anubis_is_flagged``.
    5. Impute: ``-1`` for missing binary features; for the other
       float/int columns, +/-inf becomes missing and missing values take the
       column median.

    Args:
        df: Rows to preprocess. Column labels must be strings.

    Returns:
        ``df`` itself, after the in-place transformations above.
    """
    # Drop any dotted qualifier from the column names.
    df.columns = [c.split('.')[-1] for c in df.columns]

    text_columns = (
        'src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
        'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
        'country_code', 'asn_label',
    )
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multifactor browser identification ───────────────────────────────
    browser_axes = _compute_browser_axes(df)
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))

    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)

    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ('axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence'):
        df[ax] = browser_axes[ax]

    # Backward compatibility: legacy columns derived from the axes.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # One point per satisfied criterion. The comparisons are vectorized; a
    # missing axis value compares False and therefore contributes 0.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is set but the
    # recorded action is neither ALLOW nor DENY (nor empty).
    no_value = pd.Series('', index=df.index)
    anubis_name = df.get('anubis_bot_name', no_value)
    anubis_action = df.get('anubis_bot_action', no_value)
    df['anubis_is_flagged'] = (
        (anubis_name != '') & ~anubis_action.isin(['ALLOW', 'DENY', ''])
    ).astype(int)

    # Imputation
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')
    for col in df.columns:
        if col in binary_features:
            # -1 keeps "missing" distinct from the 0/1 values.
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Discard +/-inf BEFORE taking the median: otherwise infinite
            # values skew it, and can make the fill value itself infinite.
            cleaned = df[col].replace([np.inf, -np.inf], np.nan)
            median = cleaned.median()
            # A column with no finite value has no median; it stays missing.
            if pd.notna(median):
                cleaned = cleaned.fillna(median)
            df[col] = cleaned

    return df
|