Files
ja4-platform/services/bot-detector/bot_detector/config.py
toto 1f103392ac refactor(bot-detector): extract monolith into modular package
Split bot_detector.py (~1982 lines) into 10 focused modules:
- config.py: all configuration constants and optional imports
- log.py: logging utilities (log_info, log_decision, append_training_history)
- infra.py: ClickHouse client, health check HTTP server, shutdown
- browser.py: multifactorial browser identification (5 axes)
- scoring.py: drift detection, feature validation, SHAP, clustering
- models.py: EIF, Autoencoder, XGBoost model management
- preprocessing.py: data preprocessing and feature list definitions
- pipeline.py: core semi-supervised scoring loop
- cycle.py: main analysis cycle orchestration
- __main__.py: entry point with startup banner

Update Dockerfile to copy package directory and use python -m bot_detector.

All 36 existing tests pass unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-09 01:02:04 +02:00

154 lines
7.0 KiB
Python

"""Configuration centralisée du bot-detector.
Holds every environment variable, constant and optional import.
Pure configuration: no logging and no connection. Note that importing it still
silences all Python warnings and raises SystemExit on an invalid setting.
"""
import os
import re
import warnings
# Silence every Python warning for the whole process (no category or module
# restriction is passed to the filter).
warnings.filterwarnings('ignore')
# ─── Environment-variable reading helper ────────────────────────────────────
def _require_float(name, default, lo=None, hi=None):
    """Read environment variable *name* as a float and validate its range.

    Args:
        name: Name of the environment variable to read.
        default: Value used when the variable is unset; it is converted with
            ``str`` and parsed exactly like a user-supplied value.
        lo: Optional exclusive lower bound; ``None`` disables the check.
        hi: Optional exclusive upper bound; ``None`` disables the check.

    Returns:
        The parsed float.

    Raises:
        SystemExit: If the value cannot be parsed as a float, or does not lie
            strictly inside the requested bounds.
    """
    raw = os.getenv(name, str(default))
    try:
        v = float(raw)
    except ValueError:
        raise SystemExit(
            f"[CONFIG] {name}={raw!r} invalide — doit être un nombre décimal."
        ) from None

    # Each bound is validated independently so a caller may pass only one of
    # them.  The negated comparisons also reject NaN whenever a bound is set.
    too_low = lo is not None and not (lo < v)
    too_high = hi is not None and not (v < hi)
    if too_low or too_high:
        if lo is None:
            expected = f"valeur < {hi}"
        elif hi is None:
            expected = f"{lo} < valeur"
        else:
            expected = f"{lo} < valeur < {hi}"
        raise SystemExit(f"[CONFIG] {name}={v} hors plage ({expected}).")
    return v
# ─── SQL identifier validation ──────────────────────────────────────────────
# Accepts only plain identifiers: a letter or underscore followed by letters,
# digits or underscores.  The pattern ends with \Z rather than $ because $ also
# matches just before a trailing newline, which would let a value such as
# "db\n" pass a .match() check.
_SAFE_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*\Z')
# ─── ClickHouse ─────────────────────────────────────────────────────────────
# Processing database: CLICKHOUSE_DB_PROCESSING wins, then CLICKHOUSE_DB, then
# the built-in default.
DB = os.environ.get(
    'CLICKHOUSE_DB_PROCESSING',
    os.environ.get('CLICKHOUSE_DB', 'ja4_processing'),
)
DB_LOGS = os.environ.get('CLICKHOUSE_DB_LOGS', 'ja4_logs')

# Abort with SystemExit when a database name is not a plain identifier.
for _db_name, _db_val in (
    ('CLICKHOUSE_DB_PROCESSING', DB),
    ('CLICKHOUSE_DB_LOGS', DB_LOGS),
):
    if _SAFE_IDENTIFIER_RE.match(_db_val) is None:
        raise SystemExit(f"[CONFIG] {_db_name}={_db_val!r} invalide — doit être un identifiant SQL valide.")
# ─── Isolation Forest ───────────────────────────────────────────────────────
# Contamination is validated to lie strictly between 0 and 0.5.
CONTAMINATION = _require_float('ISOLATION_CONTAMINATION', 0.001, lo=0, hi=0.5)
N_ESTIMATORS = int(os.environ.get('N_ESTIMATORS', '300'))
# No range is enforced on the threshold.
ANOMALY_THRESHOLD = _require_float('ANOMALY_THRESHOLD', -0.05)
ANOMALY_PERCENTILE = int(os.environ.get('ANOMALY_PERCENTILE', '5'))
# ─── Models ─────────────────────────────────────────────────────────────────
MODEL_DIR = os.environ.get('MODEL_DIR', '/var/lib/bot_detector')
MODEL_HISTORY_COUNT = int(os.environ.get('MODEL_HISTORY_COUNT', '10'))
# The environment variable is RETRAIN_INTERVAL_HOURS; the constant uses _H.
RETRAIN_INTERVAL_H = int(os.environ.get('RETRAIN_INTERVAL_HOURS', '24'))
# Both ratios are validated to lie strictly between 0 and 1.
DRIFT_THRESHOLD = _require_float('DRIFT_THRESHOLD', 0.30, lo=0, hi=1)
MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, lo=0, hi=1)
# The environment variable is MAX_CONSECUTIVE_FAILURES.
MAX_FAILURES = int(os.environ.get('MAX_CONSECUTIVE_FAILURES', '3'))
# ─── Cycle & logging ────────────────────────────────────────────────────────
CYCLE_INTERVAL = int(os.environ.get('CYCLE_INTERVAL_SEC', '300'))
LOG_FILE = os.environ.get('BOT_DETECTOR_LOG', '/var/log/bot_detector/decisions.jsonl')
LOG_BACKUP_COUNT = int(os.environ.get('LOG_BACKUP_COUNT', '7'))
# The training history file is placed inside MODEL_DIR.
TRAINING_HISTORY_FILE = os.path.join(MODEL_DIR, 'training_history.jsonl')
# ─── Health check ───────────────────────────────────────────────────────────
# Port taken from HEALTH_PORT, defaulting to 8080.
HEALTH_PORT = int(os.environ.get('HEALTH_PORT', '8080'))
# ─── Deduplication and recurrence ───────────────────────────────────────────
DEDUP_TTL_MIN = int(os.environ.get('DEDUP_TTL_MIN', '60'))
# No range is enforced on the recurrence weight.
RECURRENCE_WEIGHT = _require_float('RECURRENCE_WEIGHT', default=0.005)
# ─── Autoencoder (AE) — second, parallel scorer ─────────────────────────────
# Weight is validated to lie strictly between 0 and 1.
AE_WEIGHT = _require_float('AE_WEIGHT', 0.30, 0, 1)
AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
# Parsed through _require_float like every other float setting, so a malformed
# value stops startup with a [CONFIG] message rather than a raw ValueError
# traceback.  No range is enforced.
AE_LEARNING_RATE = _require_float('AE_LEARNING_RATE', 1e-3)
# ─── XGBoost — third, supervised voice ──────────────────────────────────────
# Weight is validated to lie strictly between 0 and 1.
XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, lo=0, hi=1)
XGB_MIN_LABELS = int(os.environ.get('XGB_MIN_LABELS', '100'))
# The environment variable is XGB_RETRAIN_INTERVAL_HOURS (default 168).
XGB_RETRAIN_INTERVAL_H = int(os.environ.get('XGB_RETRAIN_INTERVAL_HOURS', '168'))
# ─── A9 — Multifactorial browser classification ─────────────────────────────
# Both values are validated to lie strictly between 0 and 1.
BROWSER_CONFIDENCE_THRESHOLD = _require_float(
    'BROWSER_CONFIDENCE_THRESHOLD', 0.55, lo=0, hi=1,
)
BROWSER_COHORT_RATIO = _require_float(
    'BROWSER_COHORT_RATIO', 0.70, lo=0, hi=1,
)
# ─── SHAP / Clustering / Multi-window / Feedback ────────────────────────────
# A flag is on only when its variable equals "true", compared case-insensitively.
ENABLE_CLUSTERING = 'true' == os.environ.get('ENABLE_CLUSTERING', 'true').lower()
CLUSTERING_MIN_SAMPLES = int(os.environ.get('CLUSTERING_MIN_SAMPLES', '3'))
ENABLE_MULTIWINDOW = 'true' == os.environ.get('ENABLE_MULTIWINDOW', 'false').lower()
MULTIWINDOW_VIEW = os.environ.get('MULTIWINDOW_VIEW', 'view_ai_features_24h')
# The view name must be a plain identifier; abort with SystemExit otherwise.
if _SAFE_IDENTIFIER_RE.match(MULTIWINDOW_VIEW) is None:
    raise SystemExit(f"[CONFIG] MULTIWINDOW_VIEW={MULTIWINDOW_VIEW!r} invalide.")
ENABLE_FEEDBACK = 'true' == os.environ.get('ENABLE_FEEDBACK', 'true').lower()
FEEDBACK_WINDOW_DAYS = int(os.environ.get('FEEDBACK_WINDOW_DAYS', '7'))
# ─── Features structurally excluded per model ───────────────────────────────
# Maps a key (presumably a model name — confirm against the consumers) to the
# list of feature names excluded for it.
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
    'Complet': [
        'orphan_ratio',
    ],
    'Applicatif': [
        'orphan_ratio',
        'is_rare_ja4',
        'tcp_shared_count',
        'request_size_variance',
        'mss_mobile_mismatch',
        'ja3_diversity_ratio',
        'syn_timing_cv',
        'tls12_ratio',
        'ip_df_variance',
        'avg_ttl',
        'ttl_std',
        'no_window_scale_ratio',
        'ja4_drift_ratio',
    ],
}
# ─── Optional imports (heavy libraries) ─────────────────────────────────────
# isotree is optional: EIF_AVAILABLE records whether its IsolationForest
# (aliased to ExtendedIsolationForest) could be imported.
# NOTE(review): indentation is not visible in this view — confirm whether the
# sklearn IsolationForest import below is unconditional or belongs to the
# ImportError fallback.
try:
from isotree import IsolationForest as ExtendedIsolationForest
EIF_AVAILABLE = True
except ImportError:
EIF_AVAILABLE = False
from sklearn.ensemble import IsolationForest
# hdbscan is optional: HDBSCAN_AVAILABLE records whether it could be imported
# (the module is bound to the private alias _hdbscan).
# NOTE(review): indentation is not visible in this view — confirm whether the
# sklearn DBSCAN / StandardScaler imports below are unconditional or belong to
# the ImportError fallback.
try:
import hdbscan as _hdbscan
HDBSCAN_AVAILABLE = True
except ImportError:
HDBSCAN_AVAILABLE = False
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# shap is optional: SHAP_AVAILABLE records whether it could be imported
# (the module is bound to the private alias _shap).
try:
    import shap as _shap
except ImportError:
    SHAP_AVAILABLE = False
else:
    SHAP_AVAILABLE = True
# SHAP is enabled only when the library imported AND ENABLE_SHAP is "true"
# (case-insensitive; the default is "true").
ENABLE_SHAP = SHAP_AVAILABLE and os.environ.get('ENABLE_SHAP', 'true').lower() == 'true'
# torch is optional: TORCH_AVAILABLE is True only when both torch and torch.nn
# (bound as nn) imported successfully.
try:
    import torch
    import torch.nn as nn
except ImportError:
    TORCH_AVAILABLE = False
else:
    TORCH_AVAILABLE = True
# xgboost is optional: XGB_AVAILABLE records whether it could be imported
# (the module is bound as xgb).
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True