feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized
Services: - ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap) - logcorrelator: JA4 log correlation engine (Go, ClickHouse) - mod_reqin_log: Apache module (C, JSON request logging) - bot_detector: ML bot detection pipeline (Python) - dashboard: FastAPI/Streamlit analytics UI (Python) Shared libraries: - shared/go/ja4common: logger, config, shutdown, ipfilter (Go module) - shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package) - shared/clickhouse/: canonical SQL migrations (10 files) Build & packaging: - Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10) - go.work workspace linking sentinel, correlator, ja4common - Makefile with test-all, build-all, rpm-* targets Fixes applied: - go.work: 1.21 → 1.24.6 (required by sentinel) - correlator Dockerfiles: golang:1.21 → golang:1.24 - replace directives in go.mod for ja4common local path - pyproject.toml: setuptools.backends → setuptools.build_meta - Removed static libpcap linking (unavailable on Rocky 9) - Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32) - Rewrote corrupted test files (logger_test.go × 2) Test coverage: - correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%) - sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse) Documentation: - README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
15
services/bot-detector/bot_detector/Dockerfile
Normal file
15
services/bot-detector/bot_detector/Dockerfile
Normal file
@ -0,0 +1,15 @@
|
||||
# Runtime image for the bot_detector service.
# NOTE(review): the COPY sources below look relative to the monorepo root, so the
# build context is presumably the repository root — confirm against the build scripts.
FROM python:3.11-slim

# Do not write .pyc files and keep stdout/stderr unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# Install the shared ja4_common package first.
COPY shared/python/ja4_common/ /app/shared/ja4_common/
RUN pip install --no-cache-dir /app/shared/ja4_common/

# Then the service's own dependencies.
COPY services/bot-detector/bot_detector/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Finally the service entry point.
COPY services/bot-detector/bot_detector/bot_detector.py .

CMD ["python", "bot_detector.py"]
|
||||
10
services/bot-detector/bot_detector/Dockerfile.tests
Normal file
10
services/bot-detector/bot_detector/Dockerfile.tests
Normal file
@ -0,0 +1,10 @@
|
||||
# Test image for the bot_detector service: same dependency layers as the runtime
# image, plus pytest, with the whole service directory copied in.
FROM python:3.11-slim

WORKDIR /app

# Shared ja4_common package.
COPY shared/python/ja4_common/ /app/shared/ja4_common/
RUN pip install --no-cache-dir /app/shared/ja4_common/

# Service dependencies, then the test tooling.
COPY services/bot-detector/bot_detector/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir pytest pytest-mock

# Service sources (including tests/). The working directory is still /app here.
COPY services/bot-detector/bot_detector/ /app/bot_detector/

CMD ["pytest", "bot_detector/tests/", "-v"]
|
||||
906
services/bot-detector/bot_detector/bot_detector.py
Normal file
906
services/bot-detector/bot_detector/bot_detector.py
Normal file
@ -0,0 +1,906 @@
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import signal
|
||||
import sys
|
||||
import logging
|
||||
import threading
|
||||
import joblib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import clickhouse_connect
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import warnings
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import shap as _shap
|
||||
SHAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
SHAP_AVAILABLE = False
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# CONFIGURATION
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _require_float(name, default, lo=None, hi=None):
    """Read a float setting from the environment, aborting startup when it is invalid.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is not set.
        lo: Optional exclusive lower bound.
        hi: Optional exclusive upper bound.

    Returns:
        The parsed float.

    Raises:
        SystemExit: If the value is not a decimal number or violates a bound.
    """
    raw = os.getenv(name, str(default))
    try:
        v = float(raw)
    except ValueError:
        raise SystemExit(f"[CONFIG] {name}={raw!r} invalide — doit être un nombre décimal.") from None
    # Each bound is checked on its own so that passing only `lo` or only `hi` works.
    # The comparisons are written as "not (v > lo)" so that NaN is rejected as soon
    # as any bound is given.
    too_low = lo is not None and not v > lo
    too_high = hi is not None and not v < hi
    if too_low or too_high:
        expected = " < ".join(str(part) for part in (lo, "valeur", hi) if part is not None)
        raise SystemExit(f"[CONFIG] {name}={v} hors plage ({expected}).")
    return v
|
||||
|
||||
def _require_int(name, default, lo=None, hi=None):
    """Read an integer setting from the environment, aborting startup when it is invalid.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is not set.
        lo: Optional inclusive lower bound.
        hi: Optional inclusive upper bound.

    Returns:
        The parsed integer.

    Raises:
        SystemExit: If the value is not an integer or violates a bound.
    """
    raw = os.getenv(name, str(default))
    try:
        v = int(raw)
    except ValueError:
        raise SystemExit(f"[CONFIG] {name}={raw!r} invalide — doit être un nombre entier.") from None
    if (lo is not None and v < lo) or (hi is not None and v > hi):
        expected = " ≤ ".join(str(part) for part in (lo, "valeur", hi) if part is not None)
        raise SystemExit(f"[CONFIG] {name}={v} hors plage ({expected}).")
    return v


# ClickHouse database name.
# NOTE: interpolated into SQL through f-strings (e.g. f'SELECT * FROM {DB}.view_ai_features_1h').
# The value is expected to come only from controlled environment variables
# (docker-compose, K8s, ...) and never from user input, so the SQL-injection risk
# was judged negligible by the original author.
DB = os.getenv('CLICKHOUSE_DB', 'mabase_prod')

CONTAMINATION = _require_float('ISOLATION_CONTAMINATION', 0.001, 0, 0.5)
ANOMALY_THRESHOLD = _require_float('ANOMALY_THRESHOLD', -0.05)
LOG_FILE = os.getenv('BOT_DETECTOR_LOG', '/var/log/bot_detector/decisions.jsonl')
LOG_BACKUP_COUNT = _require_int('LOG_BACKUP_COUNT', 7)
MODEL_DIR = os.getenv('MODEL_DIR', '/var/lib/bot_detector')
RETRAIN_INTERVAL_H = _require_int('RETRAIN_INTERVAL_HOURS', 24)
MODEL_HISTORY_COUNT = _require_int('MODEL_HISTORY_COUNT', 10)
MAX_FAILURES = _require_int('MAX_CONSECUTIVE_FAILURES', 3)
# Out-of-range ports would only fail later, when the health server tries to bind.
HEALTH_PORT = _require_int('HEALTH_PORT', 8080, 0, 65535)
CYCLE_INTERVAL = _require_int('CYCLE_INTERVAL_SEC', 300)

# ── Improvements A1 / A2 / A3 / A4 / A5 / A6 / A7 / A8 / A10 ────────────────
# A1 — Concept drift
DRIFT_THRESHOLD = _require_float('DRIFT_THRESHOLD', 0.30, 0, 1)

# A2 — Adaptive threshold (passed to np.percentile, which only accepts 0..100)
ANOMALY_PERCENTILE = _require_int('ANOMALY_PERCENTILE', 5, 0, 100)

# A3 — Multi-window analysis
ENABLE_MULTIWINDOW = os.getenv('ENABLE_MULTIWINDOW', 'false').lower() == 'true'
MULTIWINDOW_VIEW = os.getenv('MULTIWINDOW_VIEW', 'view_ai_features_24h')

# A4 — SHAP explainability (only when the optional shap import succeeded)
ENABLE_SHAP = SHAP_AVAILABLE and os.getenv('ENABLE_SHAP', 'true').lower() == 'true'

# A5 — Cross-cycle deduplication with TTL
DEDUP_TTL_MIN = _require_int('DEDUP_TTL_MIN', 60)

# A6 — Recurrence weighting
RECURRENCE_WEIGHT = _require_float('RECURRENCE_WEIGHT', 0.005)

# A7 — Feature completeness validation
MIN_VALID_FEATURE_RATIO = _require_float('MIN_VALID_FEATURE_RATIO', 0.50, 0, 1)

# A8 — Behavioural clustering of anomalies
ENABLE_CLUSTERING = os.getenv('ENABLE_CLUSTERING', 'true').lower() == 'true'
CLUSTERING_MIN_SAMPLES = _require_int('CLUSTERING_MIN_SAMPLES', 3)

# Features that are structurally unavailable per model (no L4 data for uncorrelated
# traffic). Their absence is by design, so they do not raise "pipeline" warnings.
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
    'Complet': ['orphan_ratio'],
    'Applicatif': [
        'orphan_ratio', 'is_rare_ja4', 'tcp_shared_count',
        'request_size_variance', 'mss_mobile_mismatch',
        # TLS/TCP "B" features: unavailable for uncorrelated traffic
        'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
        # L4 only: TTL and window scale are unavailable without TCP capture
        'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
    ],
}

TRAINING_HISTORY_FILE = os.path.join(MODEL_DIR, 'training_history.jsonl')
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# LOGGING
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Create the output directories up front. os.path.dirname() returns '' when LOG_FILE
# is a bare file name, and os.makedirs('') raises, so that case is skipped.
_log_dir = os.path.dirname(LOG_FILE)
if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

logger = logging.getLogger('bot_detector')
logger.setLevel(logging.DEBUG)

# Console output: timestamped, human-readable lines.
_console_handler = logging.StreamHandler()
_console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s', '%Y-%m-%d %H:%M:%S'))
logger.addHandler(_console_handler)

# Rotating decision log (50 MiB per file, LOG_BACKUP_COUNT backups), raw message only.
# NOTE(review): because this handler is attached to `logger`, plain log_info() text
# is written to the same file as the JSON decision lines — confirm this is intended.
_file_handler = RotatingFileHandler(
    LOG_FILE, maxBytes=50 * 1024 * 1024, backupCount=LOG_BACKUP_COUNT, encoding='utf-8'
)
_file_handler.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(_file_handler)
|
||||
|
||||
# Wrapper court pour homogénéiser les appels de logging (évite d'importer logger partout).
|
||||
def log_info(message: str):
    """Emit ``message`` at INFO level on the module-level ``logger``."""
    logger.log(logging.INFO, message)
|
||||
|
||||
def log_decision(event: str, cycle_id: str, model: str = '', row: dict = None):
    """Write one structured decision record as a JSON line to the file handler's stream.

    Every record carries the timestamp, cycle id, event name, model name and the
    configured contamination / threshold; the entries of ``row`` (if any) are merged
    on top and may override those base fields.

    Args:
        event: Event name stored under ``event``.
        cycle_id: Identifier of the current cycle.
        model: Model name, empty when not applicable.
        row: Optional extra fields for the record.
    """
    record = {
        'ts': datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
        'cycle_id': cycle_id,
        'event': event,
        'model': model,
        'contamination': CONTAMINATION,
        'threshold': ANOMALY_THRESHOLD,
    }
    if row:
        record.update(row)
    # default=str stringifies any value json cannot encode natively.
    line = json.dumps(record, ensure_ascii=False, default=str) + '\n'
    # Written straight to the handler's stream, bypassing the logging machinery.
    _file_handler.stream.write(line)
    _file_handler.stream.flush()
|
||||
|
||||
def _append_training_history(entry: dict):
    """Append ``entry`` as a single JSON line to TRAINING_HISTORY_FILE."""
    with open(TRAINING_HISTORY_FILE, 'a', encoding='utf-8') as history:
        serialized = json.dumps(entry, ensure_ascii=False, default=str)
        history.write(serialized + '\n')
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# ARRÊT PROPRE ET HEALTH CHECK
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _shutdown(sig, frame):
    """Signal handler: log the stop (console and decision log), then exit with status 0."""
    log_info(f"Signal {sig} reçu — arrêt propre.")
    log_decision('SERVICE_STOP', 'shutdown', '', {'signal': sig})
    raise SystemExit(0)


# Same handler for both termination signals.
for _signum in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_signum, _shutdown)
|
||||
|
||||
# Module-level health flag read by the health endpoint on each request.
_service_healthy = True


class _HealthHandler(BaseHTTPRequestHandler):
    """Minimal health endpoint: 200 / OK while healthy, 503 / DEGRADED otherwise."""

    def do_GET(self):
        """Answer any GET according to the current ``_service_healthy`` flag."""
        if _service_healthy:
            status, body = 200, b'OK'
        else:
            status, body = 503, b'DEGRADED'
        self.send_response(status)
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *args):
        """Discard the default per-request log line."""
        return None
|
||||
|
||||
def _run_health_server():
    """Bind the health endpoint on HEALTH_PORT (all interfaces) and serve until exit."""
    HTTPServer(('', HEALTH_PORT), _HealthHandler).serve_forever()


# Daemon thread: it does not keep the process alive once the main thread exits.
threading.Thread(target=_run_health_server, daemon=True).start()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# CONNEXION CLICKHOUSE — delegated to ja4_common shared client
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
from ja4_common.clickhouse import get_client as _ja4_get_client
|
||||
|
||||
def get_client():
    """Return ``connect()`` called on the shared ja4_common ClickHouse client.

    NOTE(review): the previous docstring stated that this reconnects on ping
    failure; any such logic lives in ja4_common and is not visible here — confirm.
    """
    shared_client = _ja4_get_client()
    return shared_client.connect()
|
||||
|
||||
def score_to_threat_level(score: float) -> str:
    """Map an anomaly score to a threat label.

    Bands (upper bounds are exclusive):
    CRITICAL < -0.30 | HIGH < -0.15 | MEDIUM < -0.05 | LOW < 0 | NORMAL otherwise.
    """
    # (exclusive upper bound, label), ordered from most to least severe.
    bands = (
        (-0.30, 'CRITICAL'),
        (-0.15, 'HIGH'),
        (-0.05, 'MEDIUM'),
        (0, 'LOW'),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return 'NORMAL'
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# GESTION DES MODÈLES
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _current_pointer_path(name: str) -> str:
    """Return the path, under MODEL_DIR, of the pointer file for model ``name``."""
    pointer_name = f'model_{name}.current'
    return os.path.join(MODEL_DIR, pointer_name)
|
||||
|
||||
def _get_current_version(name: str):
    """Resolve the current saved version of model ``name``.

    Reads the version id from the pointer file, then loads the matching metadata.

    Returns:
        ``(model_path, meta)`` when the pointer, the ``.joblib`` file and a readable
        metadata dict all exist; ``(None, None)`` otherwise, which makes the caller
        train a new model.
    """
    # Open directly instead of exists()-then-open(): the file may disappear in between.
    try:
        with open(_current_pointer_path(name), encoding='utf-8') as f:
            version_id = f.read().strip()
    except FileNotFoundError:
        return None, None
    if not version_id:
        # Empty pointer, e.g. a write that was interrupted after truncation.
        return None, None

    model_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.joblib')
    meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
    if not os.path.exists(model_path):
        return None, None
    try:
        with open(meta_path, encoding='utf-8') as f:
            meta = json.load(f)
    except FileNotFoundError:
        return None, None
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Corrupt metadata must not fail every cycle: fall back to retraining.
        log_info(f"[{name}] Métadonnées illisibles ({meta_path}) : {exc} — retraining.")
        return None, None
    if not isinstance(meta, dict):
        return None, None
    return model_path, meta
|
||||
|
||||
def _purge_old_versions(name: str):
    """Delete the oldest saved versions of model ``name`` beyond the history limit.

    Versions are ordered by file name; version ids are timestamps, so the
    lexicographic order is chronological. For each purged ``.joblib`` file the
    matching ``.meta.json`` is removed as well.
    """
    prefix = f'model_{name}_'
    suffix = '.joblib'
    versions = sorted(glob.glob(os.path.join(MODEL_DIR, f'{prefix}*{suffix}')))

    # Always keep at least the newest version. With a limit of 0 the former
    # `versions[:-0]` slice was empty (nothing was ever purged), and a negative
    # limit could delete the model that was just trained.
    keep = max(MODEL_HISTORY_COUNT, 1)
    excess = max(len(versions) - keep, 0)

    for joblib_path in versions[:excess]:
        version_id = os.path.basename(joblib_path)[len(prefix):-len(suffix)]
        meta_path = os.path.join(MODEL_DIR, f'{prefix}{version_id}.meta.json')
        os.remove(joblib_path)
        if os.path.exists(meta_path):
            os.remove(meta_path)
        log_info(f"[{name}] Version purgée : {version_id} (limite={MODEL_HISTORY_COUNT})")
|
||||
|
||||
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
    """Return an IsolationForest for ``name``, reusing the saved model when still valid.

    The current saved version is reused only when all of the following hold:
    it is younger than RETRAIN_INTERVAL_H hours, it was trained on exactly
    ``features`` (same names, same order), and the drift score against its stored
    baseline statistics is below DRIFT_THRESHOLD. Otherwise a new model is fitted
    on ``human_baseline[features]``, saved with its metadata, made current,
    appended to the training history, and old versions are purged.

    Args:
        name: Model name, used in file names and log records.
        human_baseline: Training rows; must contain every column in ``features``.
        features: Ordered list of feature columns.
        cycle_id: Identifier of the current cycle, used in decision records.

    Returns:
        The loaded or newly fitted model.
    """
    model_path, meta = _get_current_version(name)
    if model_path and meta:
        trained_at = datetime.fromisoformat(meta['trained_at'])
        age_h = (datetime.now() - trained_at).total_seconds() / 3600
        age_ok = age_h < RETRAIN_INTERVAL_H

        # A fitted model can only score the column list it was trained on. The list
        # of valid features can change between cycles, so a mismatch forces a
        # retrain instead of failing later at scoring time. Metadata without a
        # 'features' entry keeps the previous behaviour.
        trained_features = meta.get('features')
        features_changed = trained_features is not None and list(trained_features) != list(features)

        # A1 — concept drift: compare the current distribution with the training one.
        drift_score = 0.0
        drift_forced = False
        if age_ok and not features_changed and 'baseline_stats' in meta:
            drift_score = _compute_drift_score(meta['baseline_stats'], human_baseline, features)
            if drift_score >= DRIFT_THRESHOLD:
                drift_forced = True
                log_info(f"[{name}] Dérive détectée ({drift_score:.0%} features) — retraining forcé.")
                log_decision('DRIFT_DETECTED', cycle_id, name, {
                    'version_id': meta['version_id'], 'drift_score': round(drift_score, 3),
                    'drift_threshold': DRIFT_THRESHOLD, 'model_age_hours': round(age_h, 2)
                })

        if features_changed:
            log_info(f"[{name}] Features modifiées depuis l'entraînement de v{meta['version_id']} — retraining forcé.")
        elif age_ok and not drift_forced:
            log_info(f"[{name}] Modèle v{meta['version_id']} valide ({age_h:.1f}h / {RETRAIN_INTERVAL_H}h, drift={drift_score:.0%}) — réutilisation.")
            log_decision('MODEL_LOADED', cycle_id, name, {
                'version_id': meta['version_id'], 'model_age_hours': round(age_h, 2),
                'trained_at': meta['trained_at'], 'human_samples': meta.get('human_samples', '?'),
                'retrain_in_hours': round(RETRAIN_INTERVAL_H - age_h, 1), 'drift_score': round(drift_score, 3)
            })
            return joblib.load(model_path)
        elif not drift_forced:
            log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")

    version_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_info(f"[{name}] Entraînement version {version_id} sur {len(human_baseline)} sessions humaines... (contamination={CONTAMINATION})")

    # Infinite and missing values are replaced by 0 before fitting.
    X = human_baseline[features].replace([np.inf, -np.inf], np.nan).fillna(0)
    model = IsolationForest(n_estimators=300, contamination=CONTAMINATION, random_state=42, n_jobs=-1)
    model.fit(X)

    # A1 — keep the baseline distribution statistics for future drift detection.
    baseline_stats = {
        feat: {
            'mean': float(X[feat].mean()), 'std': float(X[feat].std()),
            'p25': float(X[feat].quantile(0.25)), 'p75': float(X[feat].quantile(0.75)),
        }
        for feat in features
    }

    new_model_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.joblib')
    new_meta_path = os.path.join(MODEL_DIR, f'model_{name}_{version_id}.meta.json')
    joblib.dump(model, new_model_path)

    previous_version = meta.get('version_id', None) if meta else None
    new_meta = {
        'version_id': version_id, 'trained_at': datetime.now().isoformat(),
        'human_samples': len(human_baseline), 'contamination': CONTAMINATION,
        'threshold': ANOMALY_THRESHOLD, 'features': features,
        'model_name': name, 'previous_version': previous_version,
        'retrain_interval': RETRAIN_INTERVAL_H, 'baseline_stats': baseline_stats
    }
    with open(new_meta_path, 'w') as f:
        json.dump(new_meta, f, indent=2)
    # The pointer is written last, so it never names a version without metadata.
    with open(_current_pointer_path(name), 'w') as f:
        f.write(version_id)

    _append_training_history({k: v for k, v in new_meta.items() if k != 'baseline_stats'})
    _purge_old_versions(name)

    log_info(f"[{name}] Modèle v{version_id} sauvegardé → {new_model_path}")
    log_decision('MODEL_TRAINED', cycle_id, name, {
        'version_id': version_id, 'previous_version': previous_version,
        'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
        'history_kept': MODEL_HISTORY_COUNT
    })
    return model
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _compute_drift_score(baseline_stats: dict, current_baseline: pd.DataFrame, features: list) -> float:
    """Return the fraction of tested features whose mean drifted since training.

    For every feature present both in ``baseline_stats`` and in
    ``current_baseline``, the current mean is compared with the training mean. A
    feature counts as drifted when the gap exceeds two training standard
    deviations (z-score > 2). This is a mean-shift check, not a
    Kolmogorov-Smirnov test. Features whose training std is below 1e-9 (or NaN)
    are skipped.

    The current values are cleaned the same way as the training data (±inf and
    NaN replaced by 0), so a stray infinite value cannot force a retrain on its own.

    Args:
        baseline_stats: Per-feature dicts holding at least ``mean`` and ``std``.
        current_baseline: Current baseline rows.
        features: Features to test.

    Returns:
        ``drifted / tested`` in [0, 1]; 0.0 when nothing could be tested.
    """
    if not baseline_stats or current_baseline.empty:
        return 0.0
    drifted = 0
    tested = 0
    for feat in features:
        if feat not in baseline_stats or feat not in current_baseline.columns:
            continue
        stats = baseline_stats[feat]
        trained_std = stats.get('std', 0)
        # Written as "not >=" so that a NaN std is skipped as well.
        if not trained_std >= 1e-9:
            continue
        current = current_baseline[feat].replace([np.inf, -np.inf], np.nan).fillna(0)
        # z-score: gap between current and training mean, in training std units.
        z = abs(current.mean() - stats['mean']) / trained_std
        tested += 1
        if z > 2.0:
            drifted += 1
    return drifted / max(tested, 1)
|
||||
|
||||
|
||||
# Cache par modèle conservant le dernier état des features invalides.
|
||||
# Permet de supprimer les logs répétitifs : on ne loggue que si l'état a changé depuis le cycle précédent.
|
||||
_feature_warning_cache: dict = {}
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A7 — VALIDATION DE COMPLÉTUDE DES FEATURES
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def validate_features(df: pd.DataFrame, features: list, name: str, cycle_id: str):
    """Check that the features are present and non-constant in ``df``.

    Features listed for ``name`` in STRUCTURAL_EXCLUDED_FEATURES are dropped first,
    without any pipeline warning. The remaining ones are classified as:

    - missing: column absent from ``df``
    - zero: column always 0, or holding no value at all (only NaN / empty)
    - unique: a single distinct non-zero value
    - valid: more than one distinct value

    Messages are emitted only when the (missing, zero, unique) state for ``name``
    differs from the previous call, tracked in ``_feature_warning_cache``.

    Args:
        df: Data to inspect.
        features: Candidate feature columns.
        name: Model name (selects the structural exclusions and the cache entry).
        cycle_id: Identifier of the current cycle, used in decision records.

    Returns:
        The list of valid features, or None when the valid ratio is below
        MIN_VALID_FEATURE_RATIO.
    """
    structural = STRUCTURAL_EXCLUDED_FEATURES.get(name, [])
    active_features = [f for f in features if f not in structural]

    missing, zero_val, unique_val, valid = [], [], [], []
    for feat in active_features:
        if feat not in df.columns:
            missing.append(feat)
            continue
        column = df[feat]
        distinct = column.nunique()  # NaN values are not counted
        if distinct > 1:
            valid.append(feat)
        elif distinct == 0 or column.max() == 0:
            # distinct == 0 means the column holds only NaN (or no rows). It is
            # filled with 0 before modelling, so it is as uninformative as a
            # zero column.
            zero_val.append(feat)
        else:
            unique_val.append(feat)

    current_state = (frozenset(missing), frozenset(zero_val), frozenset(unique_val))
    state_changed = _feature_warning_cache.get(name) != current_state
    _feature_warning_cache[name] = current_state

    # Log only when the state changed (new problem or resolution). The structural
    # exclusions are static, so they are reported under the same condition.
    if state_changed:
        if structural:
            log_info(f"[{name}] Features exclues (structurelles / L4 indisponible) : {structural}")
        if missing:
            log_info(f"[{name}] Features absentes du schéma : {missing}")
        if zero_val:
            log_info(f"[{name}] Features à 0 (pipeline non-alimenté) : {zero_val}")
        if unique_val:
            log_info(f"[{name}] Features non-discriminantes (agrégat global) : {unique_val}")
        if missing or zero_val or unique_val:
            log_decision('FEATURE_WARNING', cycle_id, name, {
                'structural': structural, 'missing': missing,
                'zero': zero_val, 'unique_nonzero': unique_val,
                'valid_count': len(valid), 'total': len(active_features)
            })

    ratio = len(valid) / max(len(active_features), 1)
    if ratio < MIN_VALID_FEATURE_RATIO:
        log_info(f"[{name}] Ratio features valides insuffisant ({ratio:.0%} < {MIN_VALID_FEATURE_RATIO:.0%}) — cycle ignoré.")
        log_decision('SKIPPED_INVALID_FEATURES', cycle_id, name, {
            'valid_ratio': round(ratio, 3), 'threshold': MIN_VALID_FEATURE_RATIO
        })
        return None
    return valid
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A2 / A10 — SEUIL ADAPTATIF ET NORMALISATION DES SCORES
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def compute_adaptive_threshold(scores: np.ndarray) -> float:
    """A2: derive the effective anomaly threshold from the negative scores.

    Takes the ANOMALY_PERCENTILE-th percentile of the negative scores and returns
    whichever is lower: that value or the static ANOMALY_THRESHOLD. With no
    negative score at all, the static threshold is returned.
    """
    negatives = scores[scores < 0]
    if len(negatives) > 0:
        adaptive = float(np.percentile(negatives, ANOMALY_PERCENTILE))
        return min(adaptive, ANOMALY_THRESHOLD)
    return ANOMALY_THRESHOLD
|
||||
|
||||
|
||||
def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """A10: rescale the negative scores into [-1, 0]; other scores are unchanged.

    Returns a copy; ``scores`` itself is not modified. The copy is returned
    untouched when there is no negative score or when all negative scores are equal.

    Caution: the formula maps the MOST negative score (most anomalous) to 0 and
    the LEAST negative one to about -1. The original author documented this
    counter-intuitive result as intentional: ``anomaly_score`` is described as
    informational only, decisions being based on ``raw_anomaly_score``.
    """
    out = scores.copy()
    negative = scores < 0
    if not negative.any():
        return out
    neg_values = scores[negative]
    lowest, highest = neg_values.min(), neg_values.max()
    if lowest == highest:
        return out
    # The 1e-9 term keeps the denominator strictly positive.
    out[negative] = (neg_values - lowest) / (highest - lowest + 1e-9) * -1
    return out
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A4 — EXPLAINABILITÉ PAR SHAP
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _compute_shap_top_features(model, X: pd.DataFrame, features: list, n_top: int = 5) -> list:
    """Return, for each row of ``X``, its ``n_top`` most negative SHAP contributions.

    The most negative SHAP values are treated as the features most responsible
    for the anomaly.

    Args:
        model: Fitted model handed to ``shap.TreeExplainer``.
        X: Rows to explain; columns are expected to line up with ``features``.
        features: Feature names, paired positionally with each row's SHAP values.
        n_top: Number of features kept per row.

    Returns:
        One ``{feature: shap_value}`` dict per row, values rounded to 4 decimals.
        When SHAP is disabled, ``X`` is empty or the computation fails, one empty
        dict per row. Every row gets its own dict object, so a caller that
        mutates one entry cannot affect the others.
    """
    if not ENABLE_SHAP or X.empty:
        return [{} for _ in range(len(X))]
    try:
        explainer = _shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        result = []
        for sv in shap_values:
            # Ascending sort: the most negative contributions come first.
            pairs = sorted(zip(features, sv), key=lambda x: x[1])
            result.append({f: round(float(v), 4) for f, v in pairs[:n_top]})
        return result
    except Exception as e:
        # Explainability is best effort: log the failure and carry on without it.
        log_info(f"[SHAP] Erreur de calcul SHAP: {e}")
        return [{} for _ in range(len(X))]
|
||||
|
||||
|
||||
def _build_reason(name: str, row: pd.Series, shap_top: dict) -> str:
    """Build the ``reason`` text for one row.

    The displayed score is ``raw_anomaly_score`` (falling back to
    ``anomaly_score``, then 0), rounded to 3 decimals. When ``shap_top`` is
    non-empty the text lists its entries; otherwise it shows ``hit_velocity``
    and ``fuzzing_index``.
    """
    fallback_score = row.get('anomaly_score', 0)
    score = round(float(row.get('raw_anomaly_score', fallback_score)), 3)
    threat = row.get('threat_level', '')

    if not shap_top:
        vel = round(float(row.get('hit_velocity', 0)), 1)
        fuzz = round(float(row.get('fuzzing_index', 0)), 1)
        return f"[{name}] Score: {score} | Vel: {vel} req/s | Fuzzing: {fuzz} | Threat: {threat}"

    top_str = ' | '.join(f"{f}({v:+.3f})" for f, v in shap_top.items())
    return f"[{name}] Score: {score} | SHAP: {top_str} | Threat: {threat}"
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A8 — CLUSTERING COMPORTEMENTAL DES ANOMALIES (DBSCAN)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _cluster_anomalies(anomalies: pd.DataFrame, features: list) -> pd.DataFrame:
    """A8: label anomalies with a ``campaign_id`` using DBSCAN on scaled features.

    Works on a copy of ``anomalies``. ``campaign_id`` is -1 for an isolated row
    and >= 0 for a cluster (campaign). Every row gets -1 when there are fewer
    than CLUSTERING_MIN_SAMPLES rows or when clustering raises.
    """
    clustered = anomalies.copy()
    if len(clustered) < CLUSTERING_MIN_SAMPLES:
        clustered['campaign_id'] = -1
        return clustered
    try:
        # Same cleaning as elsewhere in the file: ±inf and NaN become 0.
        matrix = clustered[features].replace([np.inf, -np.inf], np.nan).fillna(0)
        scaled = StandardScaler().fit_transform(matrix)
        labels = DBSCAN(eps=0.5, min_samples=CLUSTERING_MIN_SAMPLES).fit_predict(scaled)
        clustered['campaign_id'] = labels
        # Count the distinct labels, leaving out the -1 "isolated" label.
        campaign_ids = set(labels)
        campaign_ids.discard(-1)
        n_campaigns = len(campaign_ids)
        if n_campaigns > 0:
            log_info(f"[DBSCAN] {n_campaigns} campagne(s) détectée(s) parmi {len(clustered)} anomalies.")
    except Exception as e:
        # Clustering is best effort: on failure every row is marked isolated.
        log_info(f"[DBSCAN] Erreur de clustering: {e}")
        clustered['campaign_id'] = -1
    return clustered
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# ANALYSE SEMI-SUPERVISÉE
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
# ── Trifurcation du trafic selon bot_name et Anubis ─────────────────────
|
||||
# 1. Bots connus (dict_bot_ip / dict_bot_ja4) → exclus du scoring IF
|
||||
known_bots = df[df['bot_name'] != ''].copy()
|
||||
rest = df[df['bot_name'] == ''].copy()
|
||||
|
||||
# 2. Bots Anubis ALLOW → bots légitimes, exclus du scoring IF
|
||||
anubis_allow = rest[rest['anubis_bot_action'] == 'ALLOW'].copy()
|
||||
|
||||
# 3. Tout le reste passe par l'IsolationForest pour un score réel :
|
||||
# - DENY : menaces identifiées par règles Anubis → IF donne le score de sévérité
|
||||
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
|
||||
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
|
||||
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
|
||||
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'human']
|
||||
|
||||
# A7 — Valider les features avant tout traitement
|
||||
valid_features = validate_features(df, features, name, cycle_id)
|
||||
if valid_features is None:
|
||||
return pd.DataFrame(), pd.DataFrame()
|
||||
|
||||
if len(human_baseline) < 500:
|
||||
log_info(f"[{name}] Données humaines insuffisantes ({len(human_baseline)} < 500).")
|
||||
log_decision('SKIPPED_LOW_DATA', cycle_id, name, {
|
||||
'human_count': len(human_baseline), 'unknown_count': len(unknown_traffic)
|
||||
})
|
||||
return pd.DataFrame(), pd.DataFrame()
|
||||
|
||||
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
|
||||
model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||
unknown_traffic = unknown_traffic.copy()
|
||||
|
||||
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||
raw_scores = model.decision_function(X_test)
|
||||
|
||||
# raw_anomaly_score : score brut IF pour comparaison au seuil et assignation du threat_level
|
||||
# anomaly_score : score normalisé [-1, 0] pour cohérence cross-modèles (A10)
|
||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||
unknown_traffic['anomaly_score'] = normalize_scores(raw_scores)
|
||||
unknown_traffic['model_name'] = name
|
||||
|
||||
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
|
||||
effective_threshold = compute_adaptive_threshold(raw_scores)
|
||||
log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")
|
||||
|
||||
# A6 — Pénaliser les IPs récurrentes sur le score BRUT avant comparaison au seuil
|
||||
if RECURRENCE_WEIGHT > 0:
|
||||
recurrences = unknown_traffic['src_ip'].map(recurrence_map).fillna(0)
|
||||
penalty = np.log1p(recurrences.values) * RECURRENCE_WEIGHT
|
||||
unknown_traffic['raw_anomaly_score'] = unknown_traffic['raw_anomaly_score'] - penalty
|
||||
|
||||
# Assigner threat_level à TOUTES les sessions scorées (pour ml_all_scores)
|
||||
unknown_traffic['threat_level'] = unknown_traffic['raw_anomaly_score'].apply(score_to_threat_level)
|
||||
unknown_traffic['recurrence'] = unknown_traffic['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
|
||||
unknown_traffic['campaign_id'] = -1
|
||||
|
||||
# Extraire les DENY (maintenant avec leur vrai score IF) et forcer leur threat_level
|
||||
deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
|
||||
unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'
|
||||
|
||||
# Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores
|
||||
all_scored = unknown_traffic.copy()
|
||||
|
||||
if not known_bots.empty:
|
||||
known_bots = known_bots.copy()
|
||||
known_bots['anomaly_score'] = 0.0
|
||||
known_bots['raw_anomaly_score'] = 0.0
|
||||
known_bots['threat_level'] = 'KNOWN_BOT'
|
||||
known_bots['model_name'] = name
|
||||
known_bots['campaign_id'] = -1
|
||||
known_bots['reason'] = '[Identification] Bot légitime: ' + known_bots['bot_name']
|
||||
known_bots['recurrence'] = known_bots['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
|
||||
for _, row in known_bots.iterrows():
|
||||
log_decision('KNOWN_BOT', cycle_id, name, {
|
||||
'src_ip': row.get('src_ip', ''), 'bot_name': row.get('bot_name', ''),
|
||||
'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
|
||||
'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
|
||||
'recurrence': int(row.get('recurrence', 1))
|
||||
})
|
||||
|
||||
# ── Anubis ALLOW : bots légitimes identifiés par règles Anubis ───────────
|
||||
if not anubis_allow.empty:
|
||||
anubis_allow = anubis_allow.copy()
|
||||
anubis_allow['anomaly_score'] = 0.0
|
||||
anubis_allow['raw_anomaly_score'] = 0.0
|
||||
anubis_allow['threat_level'] = 'KNOWN_BOT'
|
||||
anubis_allow['model_name'] = name
|
||||
anubis_allow['campaign_id'] = -1
|
||||
anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name']
|
||||
anubis_allow['recurrence'] = anubis_allow['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
|
||||
for _, row in anubis_allow.iterrows():
|
||||
log_decision('KNOWN_BOT', cycle_id, name, {
|
||||
'src_ip': row.get('src_ip', ''), 'bot_name': row.get('anubis_bot_name', ''),
|
||||
'anubis_bot_name': row.get('anubis_bot_name', ''),
|
||||
'anubis_bot_action': row.get('anubis_bot_action', ''),
|
||||
'anubis_bot_category': row.get('anubis_bot_category', ''),
|
||||
'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
|
||||
'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
|
||||
'recurrence': int(row.get('recurrence', 1)),
|
||||
})
|
||||
|
||||
# ── Anubis DENY : scorés par IF, toujours inclus dans les threats ────────
|
||||
# Extraits de unknown_traffic après scoring — ils ont leur vrai score IF.
|
||||
anubis_deny = unknown_traffic[deny_mask].copy()
|
||||
if not anubis_deny.empty:
|
||||
anubis_deny['reason'] = '[Anubis DENY] ' + anubis_deny['anubis_bot_name'].fillna('') + \
|
||||
' | ' + anubis_deny['raw_anomaly_score'].apply(lambda s: f'IF={s:.4f}')
|
||||
log_info(f"[{name}] Anubis DENY: {len(anubis_deny)} IP(s) scorées par IF "
|
||||
f"(score moyen: {anubis_deny['raw_anomaly_score'].mean():.4f}).")
|
||||
for _, row in anubis_deny.iterrows():
|
||||
log_decision('ANUBIS_DENY', cycle_id, name, {
|
||||
'src_ip': row.get('src_ip', ''), 'anubis_bot_name': row.get('anubis_bot_name', ''),
|
||||
'anubis_bot_action': row.get('anubis_bot_action', ''),
|
||||
'anubis_bot_category': row.get('anubis_bot_category', ''),
|
||||
'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
|
||||
'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
|
||||
'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
|
||||
'asn_domain': row.get('asn_domain', ''), 'country_code': row.get('country_code', ''),
|
||||
'recurrence': int(row.get('recurrence', 1)),
|
||||
})
|
||||
|
||||
# Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY
|
||||
# Les DENY sont toujours des threats, indépendamment du seuil IF
|
||||
non_deny_traffic = unknown_traffic[~deny_mask]
|
||||
anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
|
||||
if not anomalies.empty:
|
||||
log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
|
||||
anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
|
||||
|
||||
# A4 — Explainabilité SHAP : top features responsables de chaque anomalie
|
||||
X_anomalies = X_test.loc[anomalies.index]
|
||||
shap_tops = _compute_shap_top_features(model, X_anomalies, valid_features)
|
||||
anomalies['reason'] = [
|
||||
_build_reason(name, row, shap)
|
||||
for (_, row), shap in zip(anomalies.iterrows(), shap_tops)
|
||||
]
|
||||
|
||||
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
|
||||
if ENABLE_CLUSTERING:
|
||||
anomalies = _cluster_anomalies(anomalies, valid_features)
|
||||
|
||||
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
|
||||
for _, row in anomalies.iterrows():
|
||||
log_decision('ANOMALY', cycle_id, name, {
|
||||
'src_ip': row.get('src_ip', ''), 'anomaly_score': round(float(row.get('anomaly_score', 0)), 4),
|
||||
'raw_anomaly_score': round(float(row.get('raw_anomaly_score', 0)), 4),
|
||||
'threat_level': row.get('threat_level', ''), 'recurrence': int(row.get('recurrence', 1)),
|
||||
'hit_velocity': round(float(row.get('hit_velocity', 0)), 2),
|
||||
'fuzzing_index': round(float(row.get('fuzzing_index', 0)), 2),
|
||||
'post_ratio': round(float(row.get('post_ratio', 0)), 3),
|
||||
'asn_number': row.get('asn_number', ''), 'asn_org': row.get('asn_org', ''),
|
||||
'asn_detail': row.get('asn_detail', ''), 'asn_domain': row.get('asn_domain', ''),
|
||||
'country_code': row.get('country_code', ''), 'asn_label': row.get('asn_label', ''),
|
||||
'ja4': row.get('ja4', ''), 'host': row.get('host', ''),
|
||||
'correlated': int(row.get('correlated', 0)), 'campaign_id': int(row.get('campaign_id', -1)),
|
||||
'effective_threshold': round(effective_threshold, 4), 'reason': row.get('reason', '')
|
||||
})
|
||||
|
||||
threats = pd.concat([df for df in [
|
||||
anomalies if not anomalies.empty else None,
|
||||
known_bots if not known_bots.empty else None,
|
||||
anubis_allow if not anubis_allow.empty else None,
|
||||
anubis_deny if not anubis_deny.empty else None,
|
||||
] if df is not None], ignore_index=True)
|
||||
|
||||
# Inclure anubis_allow dans all_scored pour traçabilité dans ml_all_scores.
|
||||
# Ces IPs sont exclues de l'analyse IF mais doivent apparaître dans la table
|
||||
# de scores avec threat_level='KNOWN_BOT' et anomaly_score=0.0.
|
||||
if not anubis_allow.empty:
|
||||
all_scored = pd.concat([all_scored, anubis_allow], ignore_index=True)
|
||||
|
||||
return threats, all_scored
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A5 — DÉDUPLICATION INTER-CYCLES AVEC TTL
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _filter_recent_detections(client, all_anom: pd.DataFrame) -> pd.DataFrame:
    """
    A5: drop IPs already stored in ml_detected_anomalies during the last DEDUP_TTL_MIN minutes.

    An IP that was already stored is kept (re-inserted) only when its current
    score is more than 0.05 below the lowest ``anomaly_score`` stored for it in
    that time window, i.e. the detection got noticeably worse.

    Args:
        client: database client exposing ``query_df``.
        all_anom: candidate detections. Needs a ``src_ip`` column and a score
            column; ``raw_anomaly_score`` is used when present, otherwise
            ``anomaly_score``.

    Returns:
        ``all_anom`` unchanged when deduplication is disabled
        (``DEDUP_TTL_MIN <= 0``), when the input is empty, when nothing recent
        is stored, or when the lookup fails (best effort: the error is logged).
        Otherwise an independent copy holding only the rows to insert, so the
        caller can add columns without touching the original frame.
    """
    if DEDUP_TTL_MIN <= 0 or all_anom.empty:
        return all_anom
    try:
        recent_df = client.query_df(
            f"SELECT src_ip, min(anomaly_score) AS best_score "
            f"FROM {DB}.ml_detected_anomalies "
            f"WHERE detected_at > now() - INTERVAL {DEDUP_TTL_MIN} MINUTE "
            f"GROUP BY src_ip"
        )
        if recent_df.empty:
            return all_anom
        recent_map = dict(zip(recent_df['src_ip'], recent_df['best_score']))

        # Vectorised form of the former per-row check: one pass over the
        # columns instead of one Python call per row.
        score_col = 'raw_anomaly_score' if 'raw_anomaly_score' in all_anom.columns else 'anomaly_score'
        current_score = all_anom[score_col].astype(float)
        previous_best = all_anom['src_ip'].map(recent_map).astype(float)
        already_stored = all_anom['src_ip'].isin(recent_df['src_ip'])

        # New IPs always pass; known IPs pass only when the score dropped by
        # more than 0.05 (comparisons against NaN evaluate to False).
        mask = ~already_stored | (current_score < previous_best - 0.05)
        filtered = all_anom[mask].copy()

        skipped = len(all_anom) - len(filtered)
        if skipped > 0:
            log_info(f"[Dedup TTL={DEDUP_TTL_MIN}min] {skipped} IP(s) filtrée(s) (déjà détectées récemment).")
        return filtered
    except Exception as e:
        # Deliberate best effort: a failed lookup must not block the insertion.
        log_info(f"[Dedup] Erreur lors de la déduplication TTL : {e}")
        return all_anom
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A3 — ANALYSE MULTI-FENÊTRES : PRÉTRAITEMENT COMMUN
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Normalise les colonnes et remplit les valeurs manquantes (commun 1h et 24h)."""
|
||||
df.columns = [c.split('.')[-1] for c in df.columns]
|
||||
for col in ['src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
|
||||
'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label']:
|
||||
if col in df.columns:
|
||||
df[col] = df[col].fillna('').astype(str)
|
||||
df.fillna(0, inplace=True)
|
||||
|
||||
# ── Features numériques dérivées des labels Anubis (pour IsolationForest) ──
|
||||
# anubis_is_flagged : 1 si le trafic est marqué WEIGH/CHALLENGE par Anubis
|
||||
# → signal de suspicion modéré passé à l'IF (ALLOW/DENY sont exclus du pipeline)
|
||||
df['anubis_is_flagged'] = (
|
||||
(df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
|
||||
(~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
|
||||
).astype(int)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# CYCLE PRINCIPAL
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Number of cycles in a row whose traffic query failed; reset on the first success.
_consecutive_failures = 0


def fetch_and_analyze():
    """Run one detection cycle: fetch traffic, score it and store the results.

    Sequence:
      1. Query ``view_ai_features_1h``. On failure the failure counter is
         incremented (the service is marked unhealthy once it reaches
         ``MAX_FAILURES``) and the cycle stops.
      2. Preprocess the frame and log a ``CYCLE_START`` decision with row counts.
      3. Load the per-IP recurrence map (best effort, empty on failure).
      4. Score correlated and non-correlated traffic with their own feature
         lists, optionally repeating the analysis on the multi-window view.
      5. Insert every scored session into ``ml_all_scores``.
      6. Deduplicate threats per ``src_ip`` (lowest raw score wins), drop those
         stored recently, insert the rest into ``ml_detected_anomalies`` and
         log a ``CYCLE_END`` decision.

    Modifies the module globals ``_service_healthy`` and ``_consecutive_failures``.
    Returns nothing.
    """
    global _service_healthy, _consecutive_failures
    cycle_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_info('=== Lancement cycle IA ===')

    client = get_client()

    # ── Traffic fetch (1h window) ─────────────────────────────────────────────
    try:
        df = client.query_df(f'SELECT * FROM {DB}.view_ai_features_1h')
    except Exception as e:
        log_info(f'ERREUR REQUETE: {e}')
        _consecutive_failures += 1
        if _consecutive_failures >= MAX_FAILURES:
            _service_healthy = False
            # NOTE(review): source indentation was lost; this decision is assumed
            # to be logged only once the failure threshold is reached — confirm.
            log_decision('CONSECUTIVE_FAILURES', cycle_id, '', {'count': _consecutive_failures, 'error': str(e)})
        return

    _consecutive_failures = 0
    _service_healthy = True

    if df is None or df.empty:
        log_info('Aucun trafic trouvé.')
        return

    df = _preprocess_df(df)

    log_decision('CYCLE_START', cycle_id, '', {
        'total_rows': len(df),
        'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()),
        'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
        'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
        'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
        'anubis_deny_rows': int((df.get('anubis_bot_action', pd.Series()) == 'DENY').sum()),
        'anubis_weigh_rows': int((df.get('anubis_bot_action', pd.Series()) == 'WEIGH').sum()),
        'multiwindow': ENABLE_MULTIWINDOW,
    })

    # Recurrence is optional enrichment: on failure continue with an empty map,
    # but leave a trace instead of hiding the error.
    try:
        rec_df = client.query_df(f'SELECT src_ip, recurrence FROM {DB}.view_ip_recurrence')
        recurrence_map = dict(zip(rec_df['src_ip'], rec_df['recurrence']))
    except Exception as e:
        log_info(f'[Recurrence] Vue view_ip_recurrence inaccessible : {e} — récurrence ignorée.')
        recurrence_map = {}

    # ── Features per model (see DOCUMENTATION.md §4) ──────────────────────────
    # Features shared by both models (pure L7 HTTP, available for correlated=0 and 1)
    feats = [
        'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio',
        'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count',
        'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
        'modern_browser_score', 'ua_ch_mismatch', 'ip_id_zero_ratio',
        'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch',
        'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count',
        'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4',
        'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
        'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
        # B4-B7: pure L7 features (available for correlated=0 and 1)
        'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
        # Anubis: moderate suspicion signal (WEIGH/CHALLENGE) — bypassed for ALLOW/DENY
        'anubis_is_flagged',
        # HTTP: incomplete headers and plain-HTTP usage (available to both models)
        'missing_accept_enc_ratio', 'http_scheme_ratio',
    ]
    # Extra features for the "Complet" model (require TCP/TLS data)
    feats_complet = feats + [
        'tcp_jitter_variance', 'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
        # B1-B3, B8: TLS/TCP features (available for correlated=1 only)
        'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
        # TTL OS fingerprinting + TCP window scale (L4 only)
        'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
    ]

    # ── 1h window analysis ────────────────────────────────────────────────────
    anom_a, scored_a = run_semi_supervised_logic(df[df['correlated'] == 1].copy(), feats_complet, 'Complet', cycle_id, recurrence_map)
    anom_b, scored_b = run_semi_supervised_logic(df[df['correlated'] == 0].copy(), feats, 'Applicatif', cycle_id, recurrence_map)
    all_anom = pd.concat([anom_a, anom_b], ignore_index=True)
    all_scored = pd.concat([scored_a, scored_b], ignore_index=True)

    # ── A3: 24h window analysis (optional) ────────────────────────────────────
    if ENABLE_MULTIWINDOW:
        try:
            df_24h = client.query_df(f'SELECT * FROM {DB}.{MULTIWINDOW_VIEW}')
            if df_24h is not None and not df_24h.empty:
                df_24h = _preprocess_df(df_24h)
                log_info(f"[24h] {len(df_24h)} sessions dans la fenêtre 24h.")
                anom_c, scored_c = run_semi_supervised_logic(df_24h[df_24h['correlated'] == 1].copy(), feats_complet, 'Complet_24h', cycle_id, recurrence_map)
                anom_d, scored_d = run_semi_supervised_logic(df_24h[df_24h['correlated'] == 0].copy(), feats, 'Applicatif_24h', cycle_id, recurrence_map)
                all_anom_24h = pd.concat([anom_c, anom_d], ignore_index=True)
                all_scored_24h = pd.concat([scored_c, scored_d], ignore_index=True)
                # Merge: IPs present in both windows are reduced to their lowest
                # score by the intra-cycle deduplication further down.
                if not all_anom_24h.empty:
                    all_anom = pd.concat([all_anom, all_anom_24h], ignore_index=True)
                    log_info(f"[24h] Fusion 1h+24h : {len(all_anom)} entrées avant déduplication.")
                # NOTE(review): source indentation was lost; the 24h scores are
                # assumed to be merged whether or not the 24h window produced
                # threats — confirm.
                all_scored = pd.concat([all_scored, all_scored_24h], ignore_index=True)
            else:
                log_info(f"[24h] Vue {MULTIWINDOW_VIEW} vide — analyse mono-fenêtre.")
        except Exception as e:
            log_info(f"[24h] Vue {MULTIWINDOW_VIEW} inaccessible : {e} — analyse mono-fenêtre.")

    # ── Store every classification in ml_all_scores ───────────────────────────
    if not all_scored.empty:
        try:
            now = datetime.now().replace(microsecond=0)
            all_scored['detected_at'] = now
            all_scored['ja4'] = all_scored['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
            all_scores_cols = [
                'detected_at', 'window_start', 'src_ip', 'ja4', 'host', 'bot_name',
                'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
                'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
                'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
                'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id'
            ]
            # Only the columns actually present are sent to the table.
            scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
            client.insert_df(f'{DB}.ml_all_scores', scores_df)
            log_info(f'[ml_all_scores] {len(scores_df)} sessions scorées enregistrées.')
        except Exception as e:
            log_info(f'[ml_all_scores] ERREUR INSERTION: {e}')

    # Payload logged when a cycle ends without inserting any threat.
    empty_cycle_stats = {'inserted': 0, 'anomalies': 0, 'known_bots': 0, 'critical': 0, 'high': 0, 'dedup_ttl_min': DEDUP_TTL_MIN}

    if not all_anom.empty:
        # Intra-cycle deduplication: one row per IP, keeping the lowest raw score.
        all_anom = all_anom.sort_values('raw_anomaly_score', ascending=True).drop_duplicates(subset=['src_ip'], keep='first')
        log_info(f'Après déduplication intra-cycle : {len(all_anom)} IP uniques.')

        # A5 — inter-cycle deduplication with TTL
        all_anom = _filter_recent_detections(client, all_anom)

        if all_anom.empty:
            log_info('Toutes les anomalies filtrées par déduplication TTL.')
            log_decision('CYCLE_END', cycle_id, '', empty_cycle_stats)
            return

        # Work on an independent copy: the frame may be a filtered selection and
        # columns are added below.
        all_anom = all_anom.copy()
        all_anom['detected_at'] = datetime.now().replace(microsecond=0)
        fake_nav_col = 'is_fake_navigation'
        all_anom['is_headless'] = all_anom[fake_nav_col].astype(int) if fake_nav_col in all_anom.columns else 0

        cols = [
            'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'anomaly_score',
            'threat_level', 'model_name', 'recurrence',
            'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
            'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
            'tcp_jitter_variance', 'tcp_shared_count', 'true_window_size', 'window_mss_ratio',
            'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
            'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
            'modern_browser_score', 'is_headless', 'ua_ch_mismatch',
            'header_order_shared_count', 'ip_id_zero_ratio', 'request_size_variance',
            'multiplexing_efficiency', 'mss_mobile_mismatch',
            'correlated', 'reason', 'asset_ratio', 'direct_access_ratio', 'is_ua_rotating',
            'distinct_ja4_count', 'src_port_density', 'ja4_asn_concentration',
            'ja4_country_concentration', 'is_rare_ja4',
            'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
            'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
            'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
        ]

        try:
            final_df = all_anom[[c for c in cols if c in all_anom.columns]]
            client.insert_df(f'{DB}.ml_detected_anomalies', final_df)
            log_info(f'Succès: {len(final_df)} menaces enregistrées.')
            log_decision('CYCLE_END', cycle_id, '', {
                'inserted': len(final_df),
                'anomalies': int((final_df.get('bot_name', pd.Series()) == '').sum()),
                'known_bots': int((final_df.get('bot_name', pd.Series()) != '').sum()),
                'critical': int((final_df.get('threat_level', pd.Series()) == 'CRITICAL').sum()),
                'high': int((final_df.get('threat_level', pd.Series()) == 'HIGH').sum()),
                'dedup_ttl_min': DEDUP_TTL_MIN,
            })
        except Exception as e:
            log_info(f'ERREUR INSERTION: {e}')
    else:
        log_info('Aucune menace détectée.')
        log_decision('CYCLE_END', cycle_id, '', empty_cycle_stats)
|
||||
|
||||
if __name__ == '__main__':
    # Startup banner: echoes the effective configuration before the loop starts.
    banner = '*' * 65

    if ENABLE_SHAP:
        shap_status = "activé"
    elif not SHAP_AVAILABLE:
        shap_status = "désactivé (shap non installé)"
    else:
        shap_status = "désactivé"
    clustering_status = "activé" if ENABLE_CLUSTERING else "désactivé"

    log_info(banner)
    log_info(' DÉMARRAGE DU SERVICE BOT DETECTOR IA v12 (+ Anubis)')
    log_info(f' DB : {DB}')
    log_info(f' Contamination : {CONTAMINATION}')
    log_info(f' Seuil anomalie : {ANOMALY_THRESHOLD} (adaptatif percentile={ANOMALY_PERCENTILE})')
    log_info(f' Cycle : {CYCLE_INTERVAL}s | Fenêtre 1h | Multi-fenêtres : {ENABLE_MULTIWINDOW}')
    log_info(f' Retraining : toutes les {RETRAIN_INTERVAL_H}h | Drift threshold : {DRIFT_THRESHOLD:.0%}')
    log_info(f' Modèles : {MODEL_DIR}')
    log_info(f' SHAP : {shap_status}')
    log_info(f' Clustering : {clustering_status} | Dedup TTL : {DEDUP_TTL_MIN}min')
    log_info(f' Récurrence weight : {RECURRENCE_WEIGHT} | Min features ratio : {MIN_VALID_FEATURE_RATIO:.0%}')
    log_info(' Anubis : ALLOW→KNOWN_BOT (score=0), DENY→ANUBIS_DENY (score IF réel)')
    log_info(banner)

    startup_config = {
        'db': DB,
        'contamination': CONTAMINATION,
        'anomaly_threshold': ANOMALY_THRESHOLD,
        'cycle_interval': CYCLE_INTERVAL,
        'retrain_interval_h': RETRAIN_INTERVAL_H,
    }
    log_decision('SERVICE_START', 'boot', '', startup_config)

    # Endless service loop: an error in one cycle is logged and the next cycle
    # still runs after the usual pause.
    while True:
        try:
            fetch_and_analyze()
        except Exception as e:
            log_info(f"Erreur globale : {e}")
        time.sleep(CYCLE_INTERVAL)
|
||||
|
||||
6
services/bot-detector/bot_detector/requirements.txt
Normal file
6
services/bot-detector/bot_detector/requirements.txt
Normal file
@ -0,0 +1,6 @@
|
||||
clickhouse-connect==0.8.0
|
||||
pandas==2.2.0
|
||||
scikit-learn==1.4.0
|
||||
shap==0.44.1
|
||||
pyyaml>=6.0
|
||||
ja4-common @ file:///app/shared/ja4_common
|
||||
17
services/bot-detector/bot_detector/tests/conftest.py
Normal file
17
services/bot-detector/bot_detector/tests/conftest.py
Normal file
@ -0,0 +1,17 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
@pytest.fixture
def mock_ch_client():
    """Provide a MagicMock standing in for a ClickHouse client.

    ``query()`` returns an object whose ``result_rows`` is an empty list and
    ``command()`` returns None; any other attribute is an auto-created mock.
    """
    empty_result = MagicMock(result_rows=[])
    fake_client = MagicMock()
    fake_client.configure_mock(**{
        'query.return_value': empty_result,
        'command.return_value': None,
    })
    return fake_client
|
||||
|
||||
|
||||
@pytest.fixture
def mock_get_client(mock_ch_client):
    """Patch ``ja4_common.clickhouse.get_client`` to return the mock client.

    Opt-in fixture (not autouse). The patch is active while the test runs and
    is undone afterwards; the mock client itself is what the test receives.
    """
    target = "ja4_common.clickhouse.get_client"
    with patch(target, return_value=mock_ch_client):
        yield mock_ch_client
|
||||
166
services/bot-detector/bot_detector/tests/test_detector.py
Normal file
166
services/bot-detector/bot_detector/tests/test_detector.py
Normal file
@ -0,0 +1,166 @@
|
||||
import os
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
||||
def test_settings_from_env(monkeypatch):
    """ClickHouseSettings picks up CLICKHOUSE_HOST from the environment."""
    expected_host = "testhost"
    monkeypatch.setenv("CLICKHOUSE_HOST", expected_host)

    # Imported only after the environment variable has been set.
    from ja4_common.settings import ClickHouseSettings

    settings = ClickHouseSettings()
    assert settings.CLICKHOUSE_HOST == expected_host
|
||||
|
||||
|
||||
def test_feature_dataframe_validation():
    """A frame where only 1 of 4 features has data falls below the 0.5 validity ratio."""
    MIN_VALID_FEATURE_RATIO = 0.5
    sparse_features = {"f1": [1.0], "f2": [None], "f3": [None], "f4": [None]}
    frame = pd.DataFrame(sparse_features)

    # Share of non-null cells per column, then averaged over the columns.
    filled_share_per_column = frame.notna().mean()
    non_null_ratio = filled_share_per_column.mean()

    assert non_null_ratio < MIN_VALID_FEATURE_RATIO, "Should detect insufficient features"
|
||||
|
||||
|
||||
def test_anomaly_threshold():
    """Only the scores strictly below the threshold count as anomalies."""
    threshold = -0.1
    scores = np.array([-0.5, -0.3, 0.1, 0.2])

    below_threshold = scores < threshold
    flagged = scores[below_threshold]

    assert len(flagged) == 2, "Should detect 2 anomalies"
|
||||
|
||||
|
||||
def test_dedup_logic():
    """A repeated detection is skipped inside the TTL and accepted again after it."""
    from datetime import datetime, timedelta

    ttl_seconds = 60 * 60  # 60-minute TTL
    last_inserted = {}

    def should_insert(ip: str, now: datetime) -> bool:
        # Blocked while the previous insertion is younger than the TTL.
        previous = last_inserted.get(ip)
        if previous is not None and (now - previous).total_seconds() < ttl_seconds:
            return False
        last_inserted[ip] = now
        return True

    start = datetime(2024, 1, 1, 12, 0, 0)
    half_hour_later = start + timedelta(minutes=30)
    after_ttl = start + timedelta(minutes=61)

    assert should_insert("1.2.3.4", start) is True
    assert should_insert("1.2.3.4", half_hour_later) is False  # within TTL
    assert should_insert("1.2.3.4", after_ttl) is True  # past TTL
|
||||
|
||||
|
||||
def test_health_check():
    """Health check endpoint returns 200.

    Starts a throwaway HTTP server on an ephemeral loopback port, serves
    exactly one request from a background thread and checks the status code.
    The response, the thread and the server socket are always released, even
    when the request or the assertion fails.
    """
    import threading
    import urllib.request
    from http.server import HTTPServer, BaseHTTPRequestHandler

    class HealthHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_response(200)
            self.end_headers()

        def log_message(self, *args):
            # Keep the test output quiet.
            pass

    server = HTTPServer(("127.0.0.1", 0), HealthHandler)
    port = server.server_address[1]
    # Daemon thread: a request that never arrives cannot keep the process alive.
    worker = threading.Thread(target=server.handle_request, daemon=True)
    worker.start()
    try:
        with urllib.request.urlopen(f"http://127.0.0.1:{port}/", timeout=5) as resp:
            assert resp.status == 200
    finally:
        worker.join(timeout=5)
        server.server_close()
|
||||
|
||||
|
||||
def test_dedup_different_ips_are_independent():
    """Each IP has its own dedup entry: one IP never blocks another."""
    from datetime import datetime, timedelta

    ttl_seconds = 60 * 60  # 60-minute TTL
    last_inserted = {}

    def should_insert(ip: str, now: datetime) -> bool:
        # Blocked while the previous insertion of this IP is younger than the TTL.
        previous = last_inserted.get(ip)
        if previous is not None and (now - previous).total_seconds() < ttl_seconds:
            return False
        last_inserted[ip] = now
        return True

    start = datetime(2024, 1, 1, 12, 0, 0)
    half_hour_later = start + timedelta(minutes=30)

    assert should_insert("1.1.1.1", start) is True
    assert should_insert("2.2.2.2", start) is True  # Different IP, should be allowed
    assert should_insert("1.1.1.1", half_hour_later) is False  # Same IP within TTL
    assert should_insert("2.2.2.2", half_hour_later) is False  # Same IP within TTL
|
||||
|
||||
|
||||
def test_dedup_exact_ttl_boundary():
    """Dedup: the TTL comparison is strict, so the TTL has expired exactly at the boundary.

    An entry younger than the TTL blocks a new insertion. Once the elapsed
    time equals the TTL, ``elapsed < TTL`` no longer holds and the insertion
    is allowed again.
    """
    from datetime import datetime, timedelta
    DEDUP_TTL_MIN = 60
    dedup_cache = {}

    def should_insert(ip: str, now: datetime) -> bool:
        if ip in dedup_cache:
            if (now - dedup_cache[ip]).total_seconds() < DEDUP_TTL_MIN * 60:
                return False
        dedup_cache[ip] = now
        return True

    now = datetime(2024, 1, 1, 12, 0, 0)
    boundary = now + timedelta(minutes=DEDUP_TTL_MIN)

    assert should_insert("1.2.3.4", now) is True
    # One second before the boundary the entry is still inside the TTL.
    assert should_insert("1.2.3.4", boundary - timedelta(seconds=1)) is False
    # Exactly at 60 minutes the strict comparison (< not <=) lets it through.
    assert should_insert("1.2.3.4", boundary) is True
|
||||
|
||||
|
||||
def test_anomaly_threshold_no_anomalies():
    """Nothing is flagged when every score is at or above the threshold."""
    threshold = -0.1
    scores = np.array([0.0, 0.1, 0.5, 1.0])

    below_threshold = scores < threshold
    flagged = scores[below_threshold]

    assert len(flagged) == 0
|
||||
|
||||
|
||||
def test_anomaly_threshold_all_anomalies():
    """Every item is flagged when every score is below the threshold."""
    threshold = -0.1
    scores = np.array([-0.5, -0.3, -0.2, -0.15])

    below_threshold = scores < threshold
    flagged = scores[below_threshold]

    assert len(flagged) == 4
|
||||
|
||||
|
||||
def test_feature_dataframe_all_valid():
    """A frame whose features all carry data meets the 0.5 validity ratio."""
    MIN_VALID_FEATURE_RATIO = 0.5
    complete_features = {"f1": [1.0], "f2": [2.0], "f3": [3.0], "f4": [4.0]}
    frame = pd.DataFrame(complete_features)

    # Share of non-null cells per column, then averaged over the columns.
    filled_share_per_column = frame.notna().mean()
    non_null_ratio = filled_share_per_column.mean()

    assert non_null_ratio >= MIN_VALID_FEATURE_RATIO
|
||||
|
||||
|
||||
def test_health_check_returns_correct_status():
    """Health check endpoint answers 200 and its body is readable.

    Starts a throwaway HTTP server on an ephemeral loopback port, serves
    exactly one request from a background thread and checks status and body.
    The response, the thread and the server socket are always released, even
    when the request or an assertion fails.
    """
    import threading
    import urllib.request
    from http.server import HTTPServer, BaseHTTPRequestHandler

    class StatusHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status": "ok"}')

        def log_message(self, *args):
            # Keep the test output quiet.
            pass

    server = HTTPServer(("127.0.0.1", 0), StatusHandler)
    port = server.server_address[1]
    # Daemon thread: a request that never arrives cannot keep the process alive.
    worker = threading.Thread(target=server.handle_request, daemon=True)
    worker.start()
    try:
        with urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=5) as resp:
            assert resp.status == 200
            body = resp.read()
        assert b"ok" in body
    finally:
        worker.join(timeout=5)
        server.server_close()
|
||||
Reference in New Issue
Block a user