diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index b68e983..7bdafe3 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -60,6 +60,14 @@ try: except ImportError: TORCH_AVAILABLE = False +# XGBoost supervisé (Osama et al. 2025, Chen & Guestrin 2016) +# Troisième voix de l'ensemble : classificateur sur labels historiques + feedback SOC +try: + import xgboost as xgb + XGB_AVAILABLE = True +except ImportError: + XGB_AVAILABLE = False + warnings.filterwarnings('ignore') # ═══════════════════════════════════════════════════════════════════════════════ @@ -138,6 +146,11 @@ AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50')) AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16')) AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3')) +# XGBoost — Troisième voix supervisée (Chen & Guestrin 2016, Osama et al. 2025) +XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1) # pondération dans le score combiné (0 = désactivé) +XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100')) # nb minimum de labels historiques pour entraîner +XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168')) # retrain hebdomadaire + # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé) # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design. 
# ═══════════════════════════════════════════════════════════════════════════════
# XGBOOST — third, supervised voice (historical labels + SOC feedback)
# ═══════════════════════════════════════════════════════════════════════════════

# Minimum number of samples required in EACH class before a model is trained.
# A binary classifier fitted on a single class is either rejected by XGBoost or
# degenerate, so positives and negatives are checked symmetrically.
_XGB_MIN_PER_CLASS = 10


def _xgb_model_path(name: str) -> str:
    """Return the path of the persisted XGBoost model for detector ``name``."""
    return os.path.join(MODEL_DIR, f'xgb_{name}.json')


def _xgb_meta_path(name: str) -> str:
    """Return the path of the JSON metadata sidecar for detector ``name``."""
    return os.path.join(MODEL_DIR, f'xgb_{name}.meta.json')


def _xgb_features_match(meta: dict, features: list) -> bool:
    """Tell whether a persisted model was trained on exactly ``features``.

    Sidecars written by this version store the ordered feature list under
    ``'features'``. Older sidecars only carry ``'n_features'``, in which case
    only the count can be compared. A sidecar with neither key is accepted.
    """
    saved = meta.get('features')
    if saved is not None:
        # Order matters: the model is fitted on a positional numpy matrix.
        return list(saved) == list(features)
    n_saved = meta.get('n_features')
    return n_saved is None or n_saved == len(features)


def _build_xgb_training_set(rows, features: list, min_labels: int) -> tuple:
    """Turn raw ``(feature..., threat_level)`` rows into an ``(X, y)`` pair.

    Rows whose features are non-numeric, NaN or infinite are dropped.
    ``threat_level == 'NORMAL'`` becomes label 0, anything else label 1.

    Returns ``(None, None)`` when there are no rows, fewer than ``min_labels``
    usable rows, or fewer than ``_XGB_MIN_PER_CLASS`` samples in either class.
    """
    if not rows:
        return None, None
    df = pd.DataFrame(rows, columns=list(features) + ['threat_level'])
    df[features] = df[features].apply(pd.to_numeric, errors='coerce')
    # Only the feature columns are cleaned; the label column stays untouched.
    df[features] = df[features].replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=features)
    y = (df['threat_level'] != 'NORMAL').astype(int)
    n_pos = int(y.sum())
    n_neg = len(y) - n_pos
    if len(y) < min_labels or n_pos < _XGB_MIN_PER_CLASS or n_neg < _XGB_MIN_PER_CLASS:
        return None, None
    return df[features].values, y.values


def _load_xgb_labels(client, features: list, min_labels: "int | None" = None) -> tuple:
    """Load historical labels from ``ml_all_scores`` for supervised training.

    Positives: threat_level IN ('HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') → label=1
    Negatives: threat_level = 'NORMAL' → label=0

    Args:
        client: object exposing ``query(sql)`` whose result has ``result_rows``.
        features: column names selected from the table, in model input order.
        min_labels: minimum number of usable rows; ``None`` (default) resolves
            to ``XGB_MIN_LABELS`` at call time.

    Returns:
        ``(X, y)`` as numpy arrays, or ``(None, None)`` if the data is
        insufficient, single-class, or the query fails (failure is logged).

    NOTE(review): the query is not restricted to one ``model_name`` — confirm
    that training every detector on all rows is intended.
    """
    if min_labels is None:
        min_labels = XGB_MIN_LABELS
    if not features:
        # Nothing to select; avoids sending a malformed "SELECT , threat_level".
        return None, None
    feature_cols = ', '.join(features)
    try:
        result = client.query(
            f"SELECT {feature_cols}, threat_level "
            f"FROM {DB}.ml_all_scores "
            f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
            f"AND window_start >= now() - INTERVAL 7 DAY "
            f"ORDER BY rand() LIMIT 50000"
        )
        return _build_xgb_training_set(result.result_rows, features, min_labels)
    except Exception as exc:
        # Best-effort boundary: the supervised voice is optional.
        log_info(f"[XGB] Erreur chargement labels : {exc}")
        return None, None


def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    """Load the persisted supervised XGBoost model, or train a new one.

    Resolution order:
      1. Persisted model younger than ``XGB_RETRAIN_INTERVAL_H`` hours and
         trained on the same feature list → reloaded.
      2. Otherwise a new model is trained from ``_load_xgb_labels`` and
         persisted with a JSON sidecar.
      3. If labels are insufficient, a feature-compatible persisted model is
         reused regardless of age.

    Returns:
        An ``XGBClassifier``, or ``None`` when XGBoost is unavailable,
        disabled (``XGB_WEIGHT <= 0``), or no usable model exists.
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None

    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)
    has_model = os.path.exists(model_path)

    # Read the sidecar on its own so a parse error is not mistaken for a model error.
    cached_meta = None
    if has_model and os.path.exists(meta_path):
        try:
            with open(meta_path, encoding='utf-8') as f:
                cached_meta = json.load(f)
        except (OSError, ValueError) as exc:
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
        if not isinstance(cached_meta, dict):
            cached_meta = None

    # A model fitted on another feature list must never score the current matrix:
    # a different count fails at predict time, a different order scores silently wrong.
    features_ok = cached_meta is None or _xgb_features_match(cached_meta, features)

    if cached_meta is not None and not features_ok:
        log_info(f"[XGB][{name}] Features modifiées depuis l'entraînement — réentraînement.")
    elif cached_meta is not None:
        try:
            # Timestamps are naive local time, matching what is written below.
            trained_at = datetime.fromisoformat(cached_meta['trained_at'])
            age_h = (datetime.now() - trained_at).total_seconds() / 3600
            if age_h < XGB_RETRAIN_INTERVAL_H:
                model = xgb.XGBClassifier()
                model.load_model(model_path)
                log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {cached_meta.get('n_labels', '?')} labels).")
                return model
        except Exception as exc:
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")

    # Train a new model.
    X, y = _load_xgb_labels(client, features)
    if X is None:
        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
        # Fall back to an older model, but only a feature-compatible one.
        if has_model and features_ok:
            try:
                model = xgb.XGBClassifier()
                model.load_model(model_path)
                log_info(f"[XGB][{name}] Ancien modèle réutilisé faute de labels récents.")
                return model
            except Exception as exc:
                log_info(f"[XGB][{name}] Erreur chargement : {exc}")
        return None

    n_pos = int((y == 1).sum())
    n_neg = int((y == 0).sum())
    # Integer negative/positive ratio, floored at 1 (positives are never down-weighted).
    scale_pos = max(1, int(n_neg / max(n_pos, 1)))
    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        scale_pos_weight=scale_pos, eval_metric='logloss',
        random_state=42, n_jobs=-1,
        tree_method='hist',
    )
    model.fit(X, y, verbose=False)

    summary = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': n_pos,
        'n_negative': n_neg, 'n_features': len(features),
        'scale_pos_weight': scale_pos, 'model_name': name,
    }
    # The ordered feature list goes only to the on-disk sidecar; the decision
    # log receives the same payload as before.
    meta = {**summary, 'features': list(features)}
    try:
        os.makedirs(MODEL_DIR, exist_ok=True)
        model.save_model(model_path)
        with open(meta_path, 'w', encoding='utf-8') as f:
            json.dump(meta, f, indent=2)
    except Exception as exc:
        # Persistence is an optimisation: keep the trained model for this cycle.
        log_info(f"[XGB][{name}] Échec de sauvegarde : {exc} — modèle utilisé en mémoire uniquement.")

    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({n_pos} positifs), scale_pos_weight={scale_pos}")
    log_decision('XGB_TRAINED', cycle_id, name, summary)
    return model
+ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str): """Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire. @@ -897,6 +1017,25 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): unknown_traffic['raw_anomaly_score'] = raw_scores unknown_traffic['model_name'] = name + # XGBoost supervisé — troisième voix (si labels historiques disponibles) + unknown_traffic['xgb_prob'] = 0.0 + if XGB_AVAILABLE and XGB_WEIGHT > 0: + try: + xgb_client = get_client() + xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id) + if xgb_model is not None: + xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1] + unknown_traffic['xgb_prob'] = xgb_probs + # Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob + # anomaly_score déjà normalisé [0,1], xgb_prob est [0,1] + α_xgb = XGB_WEIGHT + unknown_traffic['anomaly_score'] = ( + (1 - α_xgb) * unknown_traffic['anomaly_score'] + α_xgb * xgb_probs + ) + log_info(f"[{name}] Score combiné EIF+AE+XGB (β={α_xgb}): xgb_mean={xgb_probs.mean():.4f}") + except Exception as exc: + log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.") + # A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD) effective_threshold = compute_adaptive_threshold(raw_scores) log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})") @@ -923,6 +1062,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): known_bots = known_bots.copy() known_bots['anomaly_score'] = 0.0 known_bots['raw_anomaly_score'] = 0.0 + known_bots['ae_recon_error'] = 0.0 + known_bots['xgb_prob'] = 0.0 known_bots['threat_level'] = 'KNOWN_BOT' known_bots['model_name'] = name known_bots['campaign_id'] = -1 @@ -941,6 +1082,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): anubis_allow = anubis_allow.copy() 
# ═══════════════════════════════════════════════════════════════════════════════
# XGBOOST TESTS
# ═══════════════════════════════════════════════════════════════════════════════

def test_xgb_availability_flag():
    """A guarded ``import xgboost`` resolves to a boolean without raising.

    NOTE(review): this mirrors the detector's import guard instead of reading
    the detector's own ``XGB_AVAILABLE`` constant, so it cannot fail on a
    regression there — consider asserting on the module attribute instead.
    """
    try:
        import xgboost  # noqa: F401 — only importability matters here
        avail = True
    except ImportError:
        avail = False
    assert isinstance(avail, bool)


def test_xgb_basic_train_and_predict():
    """XGBoost can train on labeled data and predict probabilities."""
    xgb = pytest.importorskip("xgboost", reason="xgboost not installed")

    rng = np.random.default_rng(42)
    n_features = 10
    n_samples = 500
    X = rng.normal(0, 1, (n_samples, n_features))
    # Label: anomaly if feature 0 > 1.5 (simple rule)
    y = (X[:, 0] > 1.5).astype(int)

    model = xgb.XGBClassifier(n_estimators=50, max_depth=3,
                              eval_metric='logloss', random_state=42)
    model.fit(X, y, verbose=False)

    probs = model.predict_proba(X)[:, 1]
    assert probs.shape == (n_samples,)
    assert 0 <= probs.min() <= probs.max() <= 1
    # High feature 0 should have higher probability
    high_mask = X[:, 0] > 2.0
    low_mask = X[:, 0] < -1.0
    if high_mask.any() and low_mask.any():
        assert np.mean(probs[high_mask]) > np.mean(probs[low_mask]), \
            "XGBoost should give higher prob to anomalous samples"


def test_xgb_meta_learner_combination():
    """Convex blend of the EIF+AE score and the XGBoost probability.

    Pins the formula ``(1 - beta) * score + beta * xgb_prob`` and checks the
    result stays within [0, 1] when both inputs do.
    """
    eif_ae_score = np.array([0.1, 0.5, 0.9])
    xgb_prob = np.array([0.2, 0.6, 0.8])
    beta = 0.20  # XGB_WEIGHT default
    combined = (1 - beta) * eif_ae_score + beta * xgb_prob
    expected = np.array([0.1*0.8 + 0.2*0.2, 0.5*0.8 + 0.6*0.2, 0.9*0.8 + 0.8*0.2])
    np.testing.assert_allclose(combined, expected, rtol=1e-7)
    assert np.all(combined >= 0) and np.all(combined <= 1)


def test_xgb_save_load_model(tmp_path):
    """A model saved as JSON and reloaded predicts the same probabilities."""
    xgb = pytest.importorskip("xgboost", reason="xgboost not installed")

    rng = np.random.default_rng(42)
    X = rng.normal(0, 1, (200, 5))
    y = (X[:, 0] > 1.0).astype(int)

    model = xgb.XGBClassifier(n_estimators=20, max_depth=3,
                              eval_metric='logloss', random_state=42)
    model.fit(X, y, verbose=False)
    probs_before = model.predict_proba(X)[:, 1]

    # tmp_path is cleaned up by pytest, so no manual os.remove is needed.
    path = str(tmp_path / 'model.json')
    model.save_model(path)
    model2 = xgb.XGBClassifier()
    model2.load_model(path)
    probs_after = model2.predict_proba(X)[:, 1]
    np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)