feat(bot-detector): add XGBoost supervised third voice (#10)

Triple-voice ensemble architecture:
- EIF (unsupervised, zero-day anomalies)
- Autoencoder (unsupervised, non-linear correlations)
- XGBoost (supervised, known patterns + SOC feedback)

XGBoost implementation:
- Trained on historical ml_all_scores labels from the last 7 days (NORMAL=0; HIGH/CRITICAL/ANUBIS_DENY/KNOWN_BOT=1)
- Weekly retraining (XGB_RETRAIN_INTERVAL_HOURS=168), min 100 labels required
- Score = predict_proba, combined via meta-learner: (1-β)*(EIF+AE) + β*xgb_prob
- Configurable via environment: XGB_WEIGHT (β=0.20), XGB_MIN_LABELS, XGB_RETRAIN_INTERVAL_HOURS
- Graceful fallback: if xgboost is unavailable or labels are insufficient, EIF+AE only
- ClickHouse: xgb_prob column added to ml_all_scores
- Tests: 4 new tests (availability, train/predict, meta-learner, save/load)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 02:45:57 +02:00
parent 57cf6c3828
commit 8d58f2b932
4 changed files with 230 additions and 1 deletions
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@ -60,6 +60,14 @@ try:
 except ImportError:
    TORCH_AVAILABLE = False

+# Supervised XGBoost (Osama et al. 2025, Chen & Guestrin 2016)
+# Third voice of the ensemble: classifier trained on historical labels + SOC feedback
+try:
+    import xgboost as xgb
+    XGB_AVAILABLE = True
+except ImportError:
+    XGB_AVAILABLE = False
+
 warnings.filterwarnings('ignore')

 # ═══════════════════════════════════════════════════════════════════════════════
@ -138,6 +146,11 @@ AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
 AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
 AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))

+# XGBoost — supervised third voice (Chen & Guestrin 2016, Osama et al. 2025)
+XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1)  # weight in the combined score (0 = disabled)
+XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100'))  # minimum number of historical labels required to train
+XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168'))  # retrain interval in hours (168 = weekly)
+
 # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
 # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
 STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -405,6 +418,113 @@ def _ae_model_path(name: str, version_id: str) -> str:
    return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')


+# ═══════════════════════════════════════════════════════════════════════════════
+# XGBOOST — Troisième voix supervisée (labels historiques + feedback SOC)
+# ═══════════════════════════════════════════════════════════════════════════════
+def _xgb_model_path(name: str) -> str:
+    """Return the path, under MODEL_DIR, of the serialized XGBoost model for *name*."""
+    filename = f'xgb_{name}.json'
+    return os.path.join(MODEL_DIR, filename)
+
+
+def _xgb_meta_path(name: str) -> str:
+    """Return the path, under MODEL_DIR, of the JSON metadata sidecar for model *name*."""
+    filename = f'xgb_{name}.meta.json'
+    return os.path.join(MODEL_DIR, filename)
+
+
+def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS,
+                     min_per_class: int = 10) -> tuple:
+    """Load historical labels from ml_all_scores for supervised training.
+
+    Positives (label=1): threat_level in HIGH, CRITICAL, ANUBIS_DENY, KNOWN_BOT.
+    Negatives (label=0): threat_level = NORMAL.
+    Rows are restricted to the last 7 days (window_start) and randomly
+    sampled, capped at 50000 rows. Rows with a non-numeric, NaN or infinite
+    value in any feature column are dropped.
+
+    Args:
+        client: object exposing ``query(sql)`` whose result has ``result_rows``.
+        features: names of the feature columns to select, in order.
+        min_labels: minimum number of usable rows required.
+        min_per_class: minimum number of rows required in EACH class. A
+            single-class (or nearly single-class) sample cannot train a
+            meaningful binary classifier, so it is rejected.
+
+    Returns:
+        ``(X, y)`` as numpy arrays, or ``(None, None)`` when the sample is
+        empty, too small, too unbalanced, or the query fails.
+    """
+    if not features:
+        # An empty column list would build invalid SQL ("SELECT , threat_level").
+        log_info("[XGB] Aucune feature fournie — chargement des labels ignoré.")
+        return None, None
+
+    # NOTE(review): column names are interpolated into the SQL text as-is;
+    # this assumes `features` comes from trusted configuration — confirm.
+    feature_cols = ', '.join(features)
+    try:
+        result = client.query(
+            f"SELECT {feature_cols}, threat_level "
+            f"FROM {DB}.ml_all_scores "
+            f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
+            f"AND window_start >= now() - INTERVAL 7 DAY "
+            f"ORDER BY rand() LIMIT 50000"
+        )
+        if not result.result_rows:
+            return None, None
+        cols = features + ['threat_level']
+        df = pd.DataFrame(result.result_rows, columns=cols)
+        df[features] = df[features].apply(pd.to_numeric, errors='coerce')
+        df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)
+        y = (df['threat_level'] != 'NORMAL').astype(int)
+        n_pos = int(y.sum())
+        n_neg = len(y) - n_pos
+        # Require enough rows overall AND enough rows of each class.
+        if len(y) < min_labels or n_pos < min_per_class or n_neg < min_per_class:
+            return None, None
+        return df[features].values, y.values
+    except Exception as exc:
+        # Best-effort loader: any failure simply disables supervised training.
+        log_info(f"[XGB] Erreur chargement labels : {exc}")
+        return None, None
+
+
+def _xgb_try_load(name: str, model_path: str, features: list, meta=None):
+    """Load the model at *model_path* if it is compatible with *features*, else return None."""
+    try:
+        model = xgb.XGBClassifier()
+        model.load_model(model_path)
+        n_model = model.get_booster().num_features()
+    except Exception as exc:
+        log_info(f"[XGB][{name}] Erreur chargement : {exc}")
+        return None
+    # A model trained on a different number of columns would make
+    # predict_proba fail on every cycle until the next scheduled retrain.
+    if n_model != len(features):
+        log_info(
+            f"[XGB][{name}] Modèle incompatible : {n_model} features attendues par le modèle, "
+            f"{len(features)} fournies — ignoré."
+        )
+        return None
+    # Same count but different names/order would silently mis-score; only
+    # checkable when the metadata recorded the training feature list.
+    saved = meta.get('features') if isinstance(meta, dict) else None
+    if saved is not None and list(saved) != list(features):
+        log_info(f"[XGB][{name}] Liste de features différente de celle de l'entraînement — modèle ignoré.")
+        return None
+    return model
+
+
+def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
+    """Load the persisted supervised XGBoost model, or train a new one.
+
+    Resolution order:
+      1. A persisted model younger than XGB_RETRAIN_INTERVAL_H hours and
+         compatible with *features* is reused.
+      2. Otherwise a new model is trained from ``_load_xgb_labels`` and
+         persisted together with a JSON metadata sidecar.
+      3. If labels are unavailable, a compatible persisted model of any age
+         is reused as a fallback.
+
+    Args:
+        name: model name, used for file names and log prefixes.
+        client: database client handed to ``_load_xgb_labels``.
+        features: feature column names, in the order used for scoring.
+        cycle_id: identifier passed to ``log_decision`` on training.
+
+    Returns:
+        An ``XGBClassifier``, or None when xgboost is unavailable, the
+        weight is zero, or no model can be loaded or trained.
+    """
+    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
+        return None
+
+    model_path = _xgb_model_path(name)
+    meta_path = _xgb_meta_path(name)
+
+    # Read the metadata sidecar once; it is used by both reuse paths.
+    meta = None
+    if os.path.exists(meta_path):
+        try:
+            with open(meta_path) as f:
+                meta = json.load(f)
+        except (OSError, ValueError) as exc:
+            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
+        if not isinstance(meta, dict):
+            meta = None
+
+    # 1. Reuse the persisted model when it is recent enough and compatible.
+    if meta is not None and os.path.exists(model_path):
+        age_h = None
+        try:
+            trained_at = datetime.fromisoformat(meta['trained_at'])
+            age_h = (datetime.now() - trained_at).total_seconds() / 3600
+        except (KeyError, TypeError, ValueError) as exc:
+            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
+        if age_h is not None and age_h < XGB_RETRAIN_INTERVAL_H:
+            model = _xgb_try_load(name, model_path, features, meta)
+            if model is not None:
+                log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
+                return model
+
+    # 2. Train a new model.
+    X, y = _load_xgb_labels(client, features)
+    if X is None:
+        # 3. No usable labels: fall back to a compatible older model.
+        if os.path.exists(model_path):
+            model = _xgb_try_load(name, model_path, features, meta)
+            if model is not None:
+                log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — réutilisation du modèle précédent.")
+                return model
+        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
+        return None
+
+    # Weight positives by the negative/positive ratio, never below 1.
+    scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
+    model = xgb.XGBClassifier(
+        n_estimators=200, max_depth=6, learning_rate=0.1,
+        scale_pos_weight=scale_pos, eval_metric='logloss',
+        random_state=42, n_jobs=-1,
+        tree_method='hist',
+    )
+    model.fit(X, y, verbose=False)
+
+    meta = {
+        'trained_at': datetime.now().isoformat(),
+        'n_labels': len(y), 'n_positive': int(y.sum()),
+        'n_negative': int((y == 0).sum()), 'n_features': len(features),
+        'scale_pos_weight': scale_pos, 'model_name': name,
+    }
+    # Persistence is best-effort: a write failure must not discard a model
+    # that was just trained successfully.
+    try:
+        model.save_model(model_path)
+        with open(meta_path, 'w') as f:
+            # The feature list is stored so later loads can detect a changed feature set.
+            json.dump({**meta, 'features': list(features)}, f, indent=2)
+    except Exception as exc:
+        log_info(f"[XGB][{name}] Sauvegarde échouée : {exc} — modèle utilisé en mémoire uniquement.")
+
+    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
+    log_decision('XGB_TRAINED', cycle_id, name, meta)
+    return model
+
+
 def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
    """Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.

@ -897,6 +1017,25 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    unknown_traffic['raw_anomaly_score'] = raw_scores
    unknown_traffic['model_name'] = name

+    # XGBoost supervisé — troisième voix (si labels historiques disponibles)
+    unknown_traffic['xgb_prob'] = 0.0
+    if XGB_AVAILABLE and XGB_WEIGHT > 0:
+        try:
+            xgb_client = get_client()
+            xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id)
+            if xgb_model is not None:
+                xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1]
+                unknown_traffic['xgb_prob'] = xgb_probs
+                # Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
+                # anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
+                α_xgb = XGB_WEIGHT
+                unknown_traffic['anomaly_score'] = (
+                    (1 - α_xgb) * unknown_traffic['anomaly_score'] + α_xgb * xgb_probs
+                )
+                log_info(f"[{name}] Score combiné EIF+AE+XGB (β={α_xgb}): xgb_mean={xgb_probs.mean():.4f}")
+        except Exception as exc:
+            log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
+
    # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
    effective_threshold = compute_adaptive_threshold(raw_scores)
    log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")
@ -923,6 +1062,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
        known_bots = known_bots.copy()
        known_bots['anomaly_score'] = 0.0
        known_bots['raw_anomaly_score'] = 0.0
+        known_bots['ae_recon_error'] = 0.0
+        known_bots['xgb_prob'] = 0.0
        known_bots['threat_level'] = 'KNOWN_BOT'
        known_bots['model_name'] = name
        known_bots['campaign_id'] = -1
@ -941,6 +1082,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
        anubis_allow = anubis_allow.copy()
        anubis_allow['anomaly_score'] = 0.0
        anubis_allow['raw_anomaly_score'] = 0.0
+        anubis_allow['ae_recon_error'] = 0.0
+        anubis_allow['xgb_prob'] = 0.0
        anubis_allow['threat_level'] = 'KNOWN_BOT'
        anubis_allow['model_name'] = name
        anubis_allow['campaign_id'] = -1
@ -1317,7 +1460,7 @@ def fetch_and_analyze():
                'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
                'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
                'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
-                'ae_recon_error'
+                'ae_recon_error', 'xgb_prob'
            ]
            scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
            client.insert_df(f'{DB}.ml_all_scores', scores_df)
--- a/services/bot-detector/bot_detector/requirements.txt
+++ b/services/bot-detector/bot_detector/requirements.txt
@ -6,5 +6,6 @@ scipy>=1.14
 hdbscan>=0.8.38
 isotree>=0.6.1
 torch>=2.0
+xgboost>=2.0
 pyyaml>=6.0
 ja4-common @ file:///app/shared/ja4_common
--- a/services/bot-detector/bot_detector/tests/test_detector.py
+++ b/services/bot-detector/bot_detector/tests/test_detector.py
@ -504,3 +504,86 @@ def test_ae_weight_combination():
    # Combined should be between min and max of components
    assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
    assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# XGBOOST TESTS
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def test_xgb_availability_flag():
+    """Check that probing for xgboost via import does not crash.
+
+    NOTE(review): this only mirrors the import guard locally; it does not
+    read the detector module's XGB_AVAILABLE flag, and the final assertion
+    holds whether or not xgboost is installed.
+    """
+    avail = True
+    try:
+        import xgboost
+    except ImportError:
+        avail = False
+    assert isinstance(avail, bool)
+
+
+def test_xgb_basic_train_and_predict():
+    """Fit a small XGBClassifier on synthetic labels and sanity-check its probabilities."""
+    try:
+        import xgboost as xgb
+    except ImportError:
+        pytest.skip("xgboost not installed")
+
+    n_samples, n_features = 500, 10
+    generator = np.random.default_rng(42)
+    X = generator.normal(0, 1, (n_samples, n_features))
+    first_feature = X[:, 0]
+    # Synthetic ground truth: a row is "anomalous" when feature 0 exceeds 1.5.
+    y = (first_feature > 1.5).astype(int)
+
+    clf = xgb.XGBClassifier(n_estimators=50, max_depth=3,
+                             eval_metric='logloss', random_state=42)
+    clf.fit(X, y, verbose=False)
+    probs = clf.predict_proba(X)[:, 1]
+
+    assert probs.shape == (n_samples,)
+    assert 0 <= probs.min() <= probs.max() <= 1
+
+    # Clearly-high rows should score above clearly-low rows, when both exist.
+    clearly_high = first_feature > 2.0
+    clearly_low = first_feature < -1.0
+    if not (clearly_high.any() and clearly_low.any()):
+        return
+    assert np.mean(probs[clearly_high]) > np.mean(probs[clearly_low]), \
+        "XGBoost should give higher prob to anomalous samples"
+
+
+def test_xgb_meta_learner_combination():
+    """Check the weighted blend (1-β)*score + β*xgb_prob against hand-computed values."""
+    beta = 0.20  # default XGB_WEIGHT
+    eif_ae_score = np.array([0.1, 0.5, 0.9])
+    xgb_prob = np.array([0.2, 0.6, 0.8])
+
+    blended = (1 - beta) * eif_ae_score + beta * xgb_prob
+    hand_computed = np.array([
+        0.1*0.8 + 0.2*0.2,
+        0.5*0.8 + 0.6*0.2,
+        0.9*0.8 + 0.8*0.2,
+    ])
+    np.testing.assert_allclose(blended, hand_computed, rtol=1e-7)
+    # A convex combination of values in [0, 1] must stay in [0, 1].
+    assert all(blended >= 0) and all(blended <= 1)
+
+
+def test_xgb_save_load_model():
+    """Round-trip a trained XGBClassifier through a JSON file and compare predictions."""
+    try:
+        import xgboost as xgb
+    except ImportError:
+        pytest.skip("xgboost not installed")
+    import tempfile
+
+    generator = np.random.default_rng(42)
+    X = generator.normal(0, 1, (200, 5))
+    y = (X[:, 0] > 1.0).astype(int)
+
+    trained = xgb.XGBClassifier(n_estimators=20, max_depth=3,
+                                 eval_metric='logloss', random_state=42)
+    trained.fit(X, y, verbose=False)
+    probs_before = trained.predict_proba(X)[:, 1]
+
+    # Reserve a .json path; the handle is closed before xgboost writes to it.
+    handle = tempfile.NamedTemporaryFile(suffix='.json', delete=False)
+    path = handle.name
+    handle.close()
+    try:
+        trained.save_model(path)
+        restored = xgb.XGBClassifier()
+        restored.load_model(path)
+        probs_after = restored.predict_proba(X)[:, 1]
+        np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
+    finally:
+        os.remove(path)