refactor(bot-detector): extract monolith into modular package

Split bot_detector.py (~1982 lines) into 10 focused modules:
- config.py: all configuration constants and optional imports
- log.py: logging utilities (log_info, log_decision, append_training_history)
- infra.py: ClickHouse client, health check HTTP server, shutdown
- browser.py: multifactorial browser identification (5 axes)
- scoring.py: drift detection, feature validation, SHAP, clustering
- models.py: EIF, Autoencoder, XGBoost model management
- preprocessing.py: data preprocessing and feature list definitions
- pipeline.py: core semi-supervised scoring loop
- cycle.py: main analysis cycle orchestration
- __main__.py: entry point with startup banner

Update Dockerfile to copy package directory and use python -m bot_detector.

All 36 existing tests pass unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-09 01:02:04 +02:00
parent 2d04288e95
commit 1f103392ac
12 changed files with 2112 additions and 2 deletions

View File

@@ -0,0 +1,89 @@
"""Infrastructure : client ClickHouse, health check HTTP, arrêt propre.
Exécute le serveur de santé en thread daemon dès l'import.
"""
import signal
import sys
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from ja4_common.clickhouse import get_client as _ja4_get_client
from .config import HEALTH_PORT
from .log import log_info, log_decision
# ─── Arrêt propre ───────────────────────────────────────────────────────────
def _shutdown(sig, frame):
    """Signal handler for SIGTERM/SIGINT: log the stop event, then exit cleanly.

    Args:
        sig: Number of the signal that was received.
        frame: Interrupted stack frame passed by the signal machinery (unused).

    Raises:
        SystemExit: Always raised, with exit status 0, via ``sys.exit``.
    """
    stop_message = f"Signal {sig} reçu — arrêt propre."
    log_info(stop_message)
    stop_context = {'signal': sig}
    log_decision('SERVICE_STOP', 'shutdown', '', stop_context)
    sys.exit(0)
# Register _shutdown for both SIGTERM and SIGINT as an import-time side effect.
# NOTE(review): signal.signal() may only be called from the main thread of the
# main interpreter, so this module has to be imported from the main thread first.
signal.signal(signal.SIGTERM, _shutdown)
signal.signal(signal.SIGINT, _shutdown)
# ─── Health check ───────────────────────────────────────────────────────────
# Current health flag served by the health endpoint; the service starts healthy.
_service_healthy = True
# Guards every read and write of _service_healthy.
_health_lock = threading.Lock()


def set_healthy(healthy: bool):
    """Store a new health state for the service.

    The assignment is performed while holding the module-level lock.

    Args:
        healthy: Value to record as the current health state.
    """
    global _service_healthy
    with _health_lock:
        _service_healthy = healthy


def is_healthy() -> bool:
    """Return the most recently stored health state, read under the lock."""
    with _health_lock:
        current_state = _service_healthy
    return current_state
class _HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP request handler backing the service health endpoint."""

    def do_GET(self):
        """Answer a GET request with the current health state.

        Sends ``200`` with body ``OK`` when ``is_healthy()`` is true, otherwise
        ``503`` with body ``DEGRADED``. No extra headers are added by this method.
        """
        if is_healthy():
            status, payload = 200, b'OK'
        else:
            status, payload = 503, b'DEGRADED'
        self.send_response(status)
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, *args):
        """Discard the base class's per-request log output (intentional no-op)."""
        return None
# Start the health endpoint as an import-time side effect. The thread is a
# daemon, so it does not keep the process alive once the main thread exits.
# The HTTPServer is constructed inside the thread (within the lambda), so a bind
# failure on HEALTH_PORT is raised in that thread, not in the importing code.
threading.Thread(
    # An empty host string binds on all available interfaces.
    target=lambda: HTTPServer(('', HEALTH_PORT), _HealthHandler).serve_forever(),
    daemon=True
).start()
# ─── Client ClickHouse ──────────────────────────────────────────────────────
def get_client():
    """Return the result of ``connect()`` on the ja4_common ClickHouse client.

    NOTE(review): the previous docstring said the client is shared and that it
    reconnects on ping failure; that logic lives in ``ja4_common.clickhouse`` and
    is not visible from this module — confirm against that package.
    """
    ja4_client = _ja4_get_client()
    return ja4_client.connect()
def score_to_threat_level(score: float) -> str:
    """Map a raw IsolationForest anomaly score to a textual threat level.

    Lower (more negative) scores map to more severe levels. Every bound is
    exclusive, so a score exactly equal to a bound falls into the next, less
    severe level:

        CRITICAL: score < -0.30
        HIGH:     -0.30 <= score < -0.15
        MEDIUM:   -0.15 <= score < -0.05
        LOW:      -0.05 <= score < 0
        NORMAL:   score >= 0

    Args:
        score: Raw anomaly score.

    Returns:
        One of 'CRITICAL', 'HIGH', 'MEDIUM', 'LOW' or 'NORMAL'.
    """
    # Ordered from most to least severe: the first bound the score is strictly
    # below decides the level.
    tiers = (
        (-0.30, 'CRITICAL'),
        (-0.15, 'HIGH'),
        (-0.05, 'MEDIUM'),
        (0, 'LOW'),
    )
    for upper_bound, level in tiers:
        if score < upper_bound:
            return level
    return 'NORMAL'