feat(bot-detector): add XGBoost supervised third voice (#10)

Triple-voice ensemble architecture:
- EIF (non-supervisé, anomalies zero-day)
- Autoencoder (non-supervisé, corrélations non-linéaires)
- XGBoost (supervisé, patterns connus + feedback SOC)

XGBoost implementation:
- Trained on ml_all_scores labels from the last 7 days (NORMAL=0; HIGH/CRITICAL/ANUBIS_DENY/KNOWN_BOT=1)
- Weekly retraining (XGB_RETRAIN_INTERVAL_HOURS=168), min 100 labels required (XGB_MIN_LABELS=100)
- Score = predict_proba, combined via meta-learner: (1-β)*(EIF+AE) + β*xgb_prob
- Configurable: XGB_WEIGHT (β=0.20), XGB_MIN_LABELS, XGB_RETRAIN_INTERVAL_HOURS
- Graceful fallback: if xgboost unavailable or labels insufficient, EIF+AE only
- ClickHouse: xgb_prob column added to ml_all_scores
- Tests: 4 new tests (availability, train/predict, meta-learner, save/load)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 02:45:57 +02:00
parent 57cf6c3828
commit 8d58f2b932
4 changed files with 230 additions and 1 deletions

View File

@ -60,6 +60,14 @@ try:
except ImportError:
TORCH_AVAILABLE = False
# XGBoost supervisé (Osama et al. 2025, Chen & Guestrin 2016)
# Troisième voix de l'ensemble : classificateur sur labels historiques + feedback SOC
try:
import xgboost as xgb
XGB_AVAILABLE = True
except ImportError:
XGB_AVAILABLE = False
warnings.filterwarnings('ignore')
# ═══════════════════════════════════════════════════════════════════════════════
@ -138,6 +146,11 @@ AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
# XGBoost — supervised third voice of the ensemble (Chen & Guestrin 2016, Osama et al. 2025)
XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1)  # weight of xgb_prob in the combined score (0 = XGBoost disabled)
XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100'))  # minimum number of historical labels required to train
XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168'))  # retrain interval in hours (default 168 h = weekly)
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -405,6 +418,113 @@ def _ae_model_path(name: str, version_id: str) -> str:
return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
# ═══════════════════════════════════════════════════════════════════════════════
# XGBOOST — Troisième voix supervisée (labels historiques + feedback SOC)
# ═══════════════════════════════════════════════════════════════════════════════
def _xgb_model_path(name: str) -> str:
    """Return the path of the serialized XGBoost model for model *name*.

    The file sits directly under ``MODEL_DIR`` and is named ``xgb_<name>.json``.
    """
    filename = f'xgb_{name}.json'
    return os.path.join(MODEL_DIR, filename)
def _xgb_meta_path(name: str) -> str:
    """Return the path of the training-metadata file for model *name*.

    The file sits directly under ``MODEL_DIR`` and is named
    ``xgb_<name>.meta.json``.
    """
    filename = f'xgb_{name}.meta.json'
    return os.path.join(MODEL_DIR, filename)
def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple:
    """Load historical labels from ``ml_all_scores`` for supervised training.

    Rows from the last 7 days whose ``threat_level`` is NORMAL, HIGH, CRITICAL,
    ANUBIS_DENY or KNOWN_BOT are sampled at random (at most 50 000 rows).
    ``NORMAL`` maps to label 0; every other selected level maps to label 1.
    Rows with a non-numeric, NaN or infinite feature value are dropped.

    Args:
        client: Database client exposing ``query(sql)``; the result must have
            a ``result_rows`` attribute.
        features: Column names to select. They are interpolated into the SQL
            text as-is, so they must be trusted identifiers.
        min_labels: Minimum number of usable rows required to train.

    Returns:
        ``(X, y)`` as NumPy arrays, or ``(None, None)`` when the query fails,
        returns no rows, or the usable sample is too small or lacks one of
        the two classes.
    """
    # NOTE(review): assumes every name in `features` exists as a column of
    # ml_all_scores — confirm against the table schema. If one is missing the
    # query raises and this function always returns (None, None).
    min_positives = 10
    feature_cols = ', '.join(features)
    try:
        result = client.query(
            f"SELECT {feature_cols}, threat_level "
            f"FROM {DB}.ml_all_scores "
            f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
            f"AND window_start >= now() - INTERVAL 7 DAY "
            f"ORDER BY rand() LIMIT 50000"
        )
        if not result.result_rows:
            return None, None

        cols = features + ['threat_level']
        df = pd.DataFrame(result.result_rows, columns=cols)
        df[features] = df[features].apply(pd.to_numeric, errors='coerce')
        df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)

        y = (df['threat_level'] != 'NORMAL').astype(int)
        n_positive = int(y.sum())
        n_negative = len(y) - n_positive
        # A binary classifier needs both classes: with no NORMAL rows the
        # label vector is all ones and XGBClassifier.fit rejects it.
        if n_positive < min_positives or n_negative == 0 or len(y) < min_labels:
            return None, None

        X = df[features].values
        return X, y.values
    except Exception as exc:
        # Best-effort: any failure here only disables the supervised voice.
        log_info(f"[XGB] Erreur chargement labels : {exc}")
        return None, None
def _xgb_load_if_compatible(name: str, model_path: str, n_features: int):
    """Load a saved XGBClassifier, or return None when it cannot be used.

    The model is rejected when the file cannot be loaded or when its booster
    was trained on a different number of features than the caller is about to
    score with (prediction would otherwise fail on every cycle).
    """
    try:
        model = xgb.XGBClassifier()
        model.load_model(model_path)
        saved_n_features = model.get_booster().num_features()
    except Exception as exc:
        log_info(f"[XGB][{name}] Erreur chargement : {exc}")
        return None
    if saved_n_features != n_features:
        # An equal count does not prove identical columns or order; this only
        # catches the shape mismatch that would make prediction raise.
        log_info(
            f"[XGB][{name}] Modèle incompatible "
            f"({saved_n_features} features vs {n_features} attendues) — ignoré."
        )
        return None
    return model


def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    """Load the persisted supervised XGBoost model or train a new one.

    Order of preference:
      1. a persisted model younger than ``XGB_RETRAIN_INTERVAL_H`` hours that
         matches the current feature count;
      2. a model freshly trained on labels from ``_load_xgb_labels``, which is
         then persisted together with a JSON metadata file;
      3. when labels are insufficient, whatever compatible model is still on
         disk, regardless of age.

    Args:
        name: Model name, used for file names and log prefixes.
        client: Database client forwarded to ``_load_xgb_labels``.
        features: Feature column names; their count must match the model.
        cycle_id: Identifier forwarded to ``log_decision`` after training.

    Returns:
        An ``XGBClassifier``, or ``None`` when XGBoost is unavailable,
        disabled (``XGB_WEIGHT <= 0``), or no usable model could be obtained.
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None

    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)

    # Reuse the persisted model while it is still within the retrain interval.
    if os.path.exists(model_path) and os.path.exists(meta_path):
        try:
            with open(meta_path) as f:
                meta = json.load(f)
            trained_at = datetime.fromisoformat(meta['trained_at'])
            age_h = (datetime.now() - trained_at).total_seconds() / 3600
        except Exception as exc:
            # Unreadable metadata is not fatal: fall through to retraining.
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")
        else:
            if age_h < XGB_RETRAIN_INTERVAL_H:
                model = _xgb_load_if_compatible(name, model_path, len(features))
                if model is not None:
                    log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
                    return model

    # Train a new model.
    X, y = _load_xgb_labels(client, features)
    if X is None:
        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
        # Fall back to an older model if one is on disk and still compatible.
        if os.path.exists(model_path):
            model = _xgb_load_if_compatible(name, model_path, len(features))
            if model is not None:
                log_info(f"[XGB][{name}] Réutilisation du modèle existant (périmé).")
            return model
        return None

    # Compensate class imbalance with the negative/positive ratio (at least 1).
    n_positive = int((y == 1).sum())
    n_negative = int((y == 0).sum())
    scale_pos = max(1, int(n_negative / max(n_positive, 1)))
    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        scale_pos_weight=scale_pos, eval_metric='logloss',
        random_state=42, n_jobs=-1,
        tree_method='hist',
    )
    model.fit(X, y, verbose=False)

    # Persist the model first, then the metadata used for the age check.
    model.save_model(model_path)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': int(y.sum()),
        'n_negative': n_negative, 'n_features': len(features),
        'scale_pos_weight': scale_pos, 'model_name': name,
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)

    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
    log_decision('XGB_TRAINED', cycle_id, name, meta)
    return model
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
@ -897,6 +1017,25 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
unknown_traffic['raw_anomaly_score'] = raw_scores
unknown_traffic['model_name'] = name
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
unknown_traffic['xgb_prob'] = 0.0
if XGB_AVAILABLE and XGB_WEIGHT > 0:
try:
xgb_client = get_client()
xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id)
if xgb_model is not None:
xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1]
unknown_traffic['xgb_prob'] = xgb_probs
# Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
# anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
α_xgb = XGB_WEIGHT
unknown_traffic['anomaly_score'] = (
(1 - α_xgb) * unknown_traffic['anomaly_score'] + α_xgb * xgb_probs
)
log_info(f"[{name}] Score combiné EIF+AE+XGB (β={α_xgb}): xgb_mean={xgb_probs.mean():.4f}")
except Exception as exc:
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
effective_threshold = compute_adaptive_threshold(raw_scores)
log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")
@ -923,6 +1062,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
known_bots = known_bots.copy()
known_bots['anomaly_score'] = 0.0
known_bots['raw_anomaly_score'] = 0.0
known_bots['ae_recon_error'] = 0.0
known_bots['xgb_prob'] = 0.0
known_bots['threat_level'] = 'KNOWN_BOT'
known_bots['model_name'] = name
known_bots['campaign_id'] = -1
@ -941,6 +1082,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
anubis_allow = anubis_allow.copy()
anubis_allow['anomaly_score'] = 0.0
anubis_allow['raw_anomaly_score'] = 0.0
anubis_allow['ae_recon_error'] = 0.0
anubis_allow['xgb_prob'] = 0.0
anubis_allow['threat_level'] = 'KNOWN_BOT'
anubis_allow['model_name'] = name
anubis_allow['campaign_id'] = -1
@ -1317,7 +1460,7 @@ def fetch_and_analyze():
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
'ae_recon_error'
'ae_recon_error', 'xgb_prob'
]
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
client.insert_df(f'{DB}.ml_all_scores', scores_df)

View File

@ -6,5 +6,6 @@ scipy>=1.14
hdbscan>=0.8.38
isotree>=0.6.1
torch>=2.0
xgboost>=2.0
pyyaml>=6.0
ja4-common @ file:///app/shared/ja4_common

View File

@ -504,3 +504,86 @@ def test_ae_weight_combination():
# Combined should be between min and max of components
assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
# ═══════════════════════════════════════════════════════════════════════════════
# XGBOOST TESTS
# ═══════════════════════════════════════════════════════════════════════════════
def test_xgb_availability_flag():
    """Probing for xgboost yields a boolean whether or not it is installed."""
    try:
        import xgboost  # noqa: F401 -- imported only to probe availability
    except ImportError:
        is_available = False
    else:
        is_available = True
    assert isinstance(is_available, bool)
def test_xgb_basic_train_and_predict():
    """An XGBClassifier fits a simple labelled set and returns valid probabilities."""
    try:
        import xgboost as xgb
    except ImportError:
        pytest.skip("xgboost not installed")

    n_samples, n_features = 500, 10
    generator = np.random.default_rng(42)
    X = generator.normal(0, 1, (n_samples, n_features))
    first_feature = X[:, 0]
    # Ground truth: a sample is anomalous when its first feature exceeds 1.5.
    y = (first_feature > 1.5).astype(int)

    classifier = xgb.XGBClassifier(
        n_estimators=50,
        max_depth=3,
        eval_metric='logloss',
        random_state=42,
    )
    classifier.fit(X, y, verbose=False)
    probs = classifier.predict_proba(X)[:, 1]

    assert probs.shape == (n_samples,)
    assert 0 <= probs.min() <= probs.max() <= 1

    # Clearly anomalous samples must score higher on average than clearly
    # normal ones (only checked when both groups are non-empty).
    clearly_high = first_feature > 2.0
    clearly_low = first_feature < -1.0
    if clearly_high.any() and clearly_low.any():
        assert np.mean(probs[clearly_high]) > np.mean(probs[clearly_low]), \
            "XGBoost should give higher prob to anomalous samples"
def test_xgb_meta_learner_combination():
    """The weighted blend of the EIF+AE score and the XGBoost probability is correct."""
    beta = 0.20  # XGB_WEIGHT default
    eif_ae_score = np.array([0.1, 0.5, 0.9])
    xgb_prob = np.array([0.2, 0.6, 0.8])

    blended = eif_ae_score * (1 - beta) + xgb_prob * beta

    expected = np.array([
        0.1*0.8 + 0.2*0.2,
        0.5*0.8 + 0.6*0.2,
        0.9*0.8 + 0.8*0.2,
    ])
    np.testing.assert_allclose(blended, expected, rtol=1e-7)
    # A convex combination of values in [0, 1] must stay in [0, 1].
    assert np.all(blended >= 0) and np.all(blended <= 1)
def test_xgb_save_load_model():
    """A model saved to JSON and reloaded predicts the same probabilities."""
    try:
        import xgboost as xgb
    except ImportError:
        pytest.skip("xgboost not installed")
    import tempfile

    generator = np.random.default_rng(42)
    X = generator.normal(0, 1, (200, 5))
    y = (X[:, 0] > 1.0).astype(int)

    original = xgb.XGBClassifier(
        n_estimators=20,
        max_depth=3,
        eval_metric='logloss',
        random_state=42,
    )
    original.fit(X, y, verbose=False)
    probs_before = original.predict_proba(X)[:, 1]

    # Reserve a closed temporary file that XGBoost can overwrite by path.
    fd, path = tempfile.mkstemp(suffix='.json')
    os.close(fd)
    try:
        original.save_model(path)
        restored = xgb.XGBClassifier()
        restored.load_model(path)
        probs_after = restored.predict_proba(X)[:, 1]
        np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
    finally:
        os.remove(path)