feat(bot-detector): add XGBoost supervised third voice (#10)
Triple-voice ensemble architecture: - EIF (non-supervisé, anomalies zero-day) - Autoencoder (non-supervisé, corrélations non-linéaires) - XGBoost (supervisé, patterns connus + feedback SOC) XGBoost implementation: - Trained on historical ml_all_scores labels (NORMAL=0, HIGH/CRITICAL/DENY/KNOWN=1) - Weekly retraining (XGB_RETRAIN_INTERVAL_H=168), min 100 labels required - Score = predict_proba, combined via meta-learner: (1-β)*(EIF+AE) + β*xgb_prob - Configurable: XGB_WEIGHT (β=0.20), XGB_MIN_LABELS, XGB_RETRAIN_INTERVAL_HOURS - Graceful fallback: if xgboost unavailable or labels insufficient, EIF+AE only - ClickHouse: xgb_prob column added to ml_all_scores - Tests: 4 new tests (availability, train/predict, meta-learner, save/load) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -60,6 +60,14 @@ try:
|
||||
except ImportError:
|
||||
TORCH_AVAILABLE = False
|
||||
|
||||
# XGBoost supervisé (Osama et al. 2025, Chen & Guestrin 2016)
|
||||
# Troisième voix de l'ensemble : classificateur sur labels historiques + feedback SOC
|
||||
try:
|
||||
import xgboost as xgb
|
||||
XGB_AVAILABLE = True
|
||||
except ImportError:
|
||||
XGB_AVAILABLE = False
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -138,6 +146,11 @@ AE_EPOCHS = int(os.getenv('AE_EPOCHS', '50'))
|
||||
AE_LATENT_DIM = int(os.getenv('AE_LATENT_DIM', '16'))
|
||||
AE_LEARNING_RATE = float(os.getenv('AE_LEARNING_RATE', '1e-3'))
|
||||
|
||||
# XGBoost — Troisième voix supervisée (Chen & Guestrin 2016, Osama et al. 2025)
|
||||
XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1) # weight of the XGBoost probability in the combined score (0 = disabled)
|
||||
XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100')) # minimum number of historical labels required to train
|
||||
XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168')) # retraining interval in hours (default 168 h = weekly)
|
||||
|
||||
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
|
||||
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
|
||||
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
|
||||
@ -405,6 +418,113 @@ def _ae_model_path(name: str, version_id: str) -> str:
|
||||
return os.path.join(MODEL_DIR, f'ae_{name}_{version_id}.pt')
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# XGBOOST — Troisième voix supervisée (labels historiques + feedback SOC)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
def _xgb_model_path(name: str) -> str:
    """Return the path of the serialized XGBoost model file for model *name*.

    The file lives in MODEL_DIR and is named ``xgb_<name>.json``.
    """
    filename = f'xgb_{name}.json'
    return os.path.join(MODEL_DIR, filename)
|
||||
|
||||
|
||||
def _xgb_meta_path(name: str) -> str:
    """Return the path of the JSON metadata sidecar for XGBoost model *name*.

    The file lives in MODEL_DIR and is named ``xgb_<name>.meta.json``.
    """
    filename = f'xgb_{name}.meta.json'
    return os.path.join(MODEL_DIR, filename)
|
||||
|
||||
|
||||
def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple:
    """Load historical labels from ml_all_scores for supervised training.

    Up to 50000 randomly sampled rows from the last 7 days are fetched.
    Labels are derived from ``threat_level``:

    * positives (label 1): 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT'
    * negatives (label 0): 'NORMAL'

    Rows whose feature values are non-numeric, NaN or infinite are dropped.

    Args:
        client: database client exposing ``query(sql)`` whose result has a
            ``result_rows`` attribute.
        features: names of the feature columns to select, in model order.
        min_labels: minimum number of usable rows required to train.

    Returns:
        ``(X, y)`` as numpy arrays, or ``(None, None)`` when the feature list
        is empty, the query fails, fewer than ``min_labels`` rows remain, or
        either class has fewer than 10 rows.
    """
    # A binary classifier needs examples of BOTH classes; a sample containing
    # only positives (or only negatives) would train a degenerate model.
    min_per_class = 10

    if not features:
        # An empty column list would generate invalid SQL ("SELECT , ...").
        return None, None

    feature_cols = ', '.join(features)
    try:
        result = client.query(
            f"SELECT {feature_cols}, threat_level "
            f"FROM {DB}.ml_all_scores "
            f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
            f"AND window_start >= now() - INTERVAL 7 DAY "
            f"ORDER BY rand() LIMIT 50000"
        )
        if not result.result_rows:
            return None, None

        cols = list(features) + ['threat_level']
        df = pd.DataFrame(result.result_rows, columns=cols)
        # Coerce to numeric, then drop every row with an unusable feature value.
        df[features] = df[features].apply(pd.to_numeric, errors='coerce')
        df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)

        y = (df['threat_level'] != 'NORMAL').astype(int)
        n_positive = int(y.sum())
        n_negative = len(y) - n_positive
        if (len(y) < min_labels
                or n_positive < min_per_class
                or n_negative < min_per_class):
            return None, None

        return df[features].values, y.values
    except Exception as exc:
        # Best-effort: any failure here simply disables the supervised voice.
        log_info(f"[XGB] Erreur chargement labels : {exc}")
        return None, None
|
||||
|
||||
|
||||
def _load_compatible_xgb(model_path: str, n_features: int):
    """Load the XGBoost model at *model_path* if it was trained on *n_features* columns.

    Returns the loaded XGBClassifier, or None when the stored booster expects a
    different number of features (its predict_proba would reject the input).
    Exceptions raised while reading the file propagate to the caller.
    """
    model = xgb.XGBClassifier()
    model.load_model(model_path)
    if model.get_booster().num_features() != n_features:
        return None
    return model


def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
    """Load or train the supervised XGBoost model for detector *name*.

    Resolution order:

    1. If a persisted model and its metadata exist, the model is younger than
       XGB_RETRAIN_INTERVAL_H hours and it matches the current feature count,
       it is reloaded.
    2. Otherwise labels are fetched with ``_load_xgb_labels`` and a new model
       is trained, saved next to a JSON metadata file, and reported through
       ``log_decision``.
    3. If labels are insufficient, a previously saved model is reused when it
       is readable and matches the current feature count.

    Args:
        name: model name, used to build the model and metadata file paths.
        client: database client forwarded to ``_load_xgb_labels``.
        features: feature column names, in the order used for prediction.
        cycle_id: identifier of the current cycle, forwarded to ``log_decision``.

    Returns:
        An ``XGBClassifier``, or None when XGBoost is unavailable, disabled
        (XGB_WEIGHT <= 0), or no usable model could be loaded or trained.
    """
    if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
        return None

    model_path = _xgb_model_path(name)
    meta_path = _xgb_meta_path(name)
    n_features = len(features)

    # Reuse the persisted model while it is recent enough.
    if os.path.exists(model_path) and os.path.exists(meta_path):
        try:
            with open(meta_path) as f:
                meta = json.load(f)
            trained_at = datetime.fromisoformat(meta['trained_at'])
            age_h = (datetime.now() - trained_at).total_seconds() / 3600
            if age_h < XGB_RETRAIN_INTERVAL_H:
                model = _load_compatible_xgb(model_path, n_features)
                if model is not None:
                    log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
                    return model
                # The feature set changed since training: a stale model would
                # fail on every prediction until the next scheduled retrain.
                log_info(f"[XGB][{name}] Modèle incompatible ({n_features} features attendues) — réentraînement.")
        except Exception as exc:
            log_info(f"[XGB][{name}] Erreur chargement : {exc}")

    # Train a new model.
    X, y = _load_xgb_labels(client, features)
    if X is None:
        log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
        # Try to fall back on an older model rather than losing the voice.
        if os.path.exists(model_path):
            try:
                model = _load_compatible_xgb(model_path, n_features)
            except Exception as exc:
                # Previously swallowed silently; keep best-effort but leave a trace.
                log_info(f"[XGB][{name}] Ancien modèle illisible : {exc}")
            else:
                if model is not None:
                    return model
                log_info(f"[XGB][{name}] Ancien modèle incompatible ({n_features} features attendues) — ignoré.")
        return None

    # Compensate class imbalance: ratio of negatives to positives, at least 1.
    scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
    model = xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1,
        scale_pos_weight=scale_pos, eval_metric='logloss',
        random_state=42, n_jobs=-1,
        tree_method='hist',
    )
    model.fit(X, y, verbose=False)

    model.save_model(model_path)
    meta = {
        'trained_at': datetime.now().isoformat(),
        'n_labels': len(y), 'n_positive': int(y.sum()),
        'n_negative': int((y == 0).sum()), 'n_features': len(features),
        'scale_pos_weight': scale_pos, 'model_name': name,
    }
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)

    log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
    log_decision('XGB_TRAINED', cycle_id, name, meta)
    return model
|
||||
|
||||
|
||||
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
||||
"""Charge le modèle IsolationForest existant ou en entraîne un nouveau si nécessaire.
|
||||
|
||||
@ -897,6 +1017,25 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
unknown_traffic['raw_anomaly_score'] = raw_scores
|
||||
unknown_traffic['model_name'] = name
|
||||
|
||||
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
|
||||
unknown_traffic['xgb_prob'] = 0.0
|
||||
if XGB_AVAILABLE and XGB_WEIGHT > 0:
|
||||
try:
|
||||
xgb_client = get_client()
|
||||
xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id)
|
||||
if xgb_model is not None:
|
||||
xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1]
|
||||
unknown_traffic['xgb_prob'] = xgb_probs
|
||||
# Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
|
||||
# anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
|
||||
α_xgb = XGB_WEIGHT
|
||||
unknown_traffic['anomaly_score'] = (
|
||||
(1 - α_xgb) * unknown_traffic['anomaly_score'] + α_xgb * xgb_probs
|
||||
)
|
||||
log_info(f"[{name}] Score combiné EIF+AE+XGB (β={α_xgb}): xgb_mean={xgb_probs.mean():.4f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
|
||||
|
||||
# A2 — Seuil adaptatif calculé sur les scores BRUTS (même échelle que ANOMALY_THRESHOLD)
|
||||
effective_threshold = compute_adaptive_threshold(raw_scores)
|
||||
log_info(f"[{name}] Seuil effectif : {effective_threshold:.4f} (statique={ANOMALY_THRESHOLD}, percentile={ANOMALY_PERCENTILE})")
|
||||
@ -923,6 +1062,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
known_bots = known_bots.copy()
|
||||
known_bots['anomaly_score'] = 0.0
|
||||
known_bots['raw_anomaly_score'] = 0.0
|
||||
known_bots['ae_recon_error'] = 0.0
|
||||
known_bots['xgb_prob'] = 0.0
|
||||
known_bots['threat_level'] = 'KNOWN_BOT'
|
||||
known_bots['model_name'] = name
|
||||
known_bots['campaign_id'] = -1
|
||||
@ -941,6 +1082,8 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
anubis_allow = anubis_allow.copy()
|
||||
anubis_allow['anomaly_score'] = 0.0
|
||||
anubis_allow['raw_anomaly_score'] = 0.0
|
||||
anubis_allow['ae_recon_error'] = 0.0
|
||||
anubis_allow['xgb_prob'] = 0.0
|
||||
anubis_allow['threat_level'] = 'KNOWN_BOT'
|
||||
anubis_allow['model_name'] = name
|
||||
anubis_allow['campaign_id'] = -1
|
||||
@ -1317,7 +1460,7 @@ def fetch_and_analyze():
|
||||
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
|
||||
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
|
||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'campaign_id',
|
||||
'ae_recon_error'
|
||||
'ae_recon_error', 'xgb_prob'
|
||||
]
|
||||
scores_df = all_scored[[c for c in all_scores_cols if c in all_scored.columns]]
|
||||
client.insert_df(f'{DB}.ml_all_scores', scores_df)
|
||||
|
||||
@ -6,5 +6,6 @@ scipy>=1.14
|
||||
hdbscan>=0.8.38
|
||||
isotree>=0.6.1
|
||||
torch>=2.0
|
||||
xgboost>=2.0
|
||||
pyyaml>=6.0
|
||||
ja4-common @ file:///app/shared/ja4_common
|
||||
|
||||
@ -504,3 +504,86 @@ def test_ae_weight_combination():
|
||||
# Combined should be between min and max of components
|
||||
assert all(combined >= np.minimum(eif_norm, ae_norm) - 1e-9)
|
||||
assert all(combined <= np.maximum(eif_norm, ae_norm) + 1e-9)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# XGBOOST TESTS
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def test_xgb_availability_flag():
    """Probing for the optional xgboost package must never raise."""
    avail = True
    try:
        import xgboost  # noqa: F401
    except ImportError:
        avail = False
    assert isinstance(avail, bool)
|
||||
|
||||
|
||||
def test_xgb_basic_train_and_predict():
    """Train a small XGBoost classifier on a synthetic rule and check its probabilities."""
    try:
        import xgboost as xgb
    except ImportError:
        pytest.skip("xgboost not installed")

    sample_count, feature_count = 500, 10
    generator = np.random.default_rng(42)
    data = generator.normal(0, 1, (sample_count, feature_count))
    # Ground truth: a sample is anomalous when its first feature exceeds 1.5.
    first_feature = data[:, 0]
    labels = (first_feature > 1.5).astype(int)

    clf = xgb.XGBClassifier(
        n_estimators=50,
        max_depth=3,
        eval_metric='logloss',
        random_state=42,
    )
    clf.fit(data, labels, verbose=False)

    positive_probs = clf.predict_proba(data)[:, 1]
    assert positive_probs.shape == (sample_count,)
    assert 0 <= positive_probs.min() <= positive_probs.max() <= 1

    # Samples far above the rule threshold must score higher on average than
    # samples far below it.
    clearly_high = first_feature > 2.0
    clearly_low = first_feature < -1.0
    if clearly_high.any() and clearly_low.any():
        assert np.mean(positive_probs[clearly_high]) > np.mean(positive_probs[clearly_low]), \
            "XGBoost should give higher prob to anomalous samples"
|
||||
|
||||
|
||||
def test_xgb_meta_learner_combination():
    """The blended score is the beta-weighted mean of the EIF+AE score and the XGBoost probability."""
    beta = 0.20  # XGB_WEIGHT default
    unsupervised = np.array([0.1, 0.5, 0.9])
    supervised = np.array([0.2, 0.6, 0.8])

    blended = (1 - beta) * unsupervised + beta * supervised

    reference = np.array([
        0.1 * 0.8 + 0.2 * 0.2,
        0.5 * 0.8 + 0.6 * 0.2,
        0.9 * 0.8 + 0.8 * 0.2,
    ])
    np.testing.assert_allclose(blended, reference, rtol=1e-7)
    # A convex combination of values in [0, 1] stays in [0, 1].
    assert all(blended >= 0) and all(blended <= 1)
|
||||
|
||||
|
||||
def test_xgb_save_load_model():
    """A model written with save_model and read back with load_model predicts identically."""
    try:
        import xgboost as xgb
    except ImportError:
        pytest.skip("xgboost not installed")
    import tempfile

    generator = np.random.default_rng(42)
    data = generator.normal(0, 1, (200, 5))
    labels = (data[:, 0] > 1.0).astype(int)

    original = xgb.XGBClassifier(
        n_estimators=20,
        max_depth=3,
        eval_metric='logloss',
        random_state=42,
    )
    original.fit(data, labels, verbose=False)
    probs_before = original.predict_proba(data)[:, 1]

    # Only the file name is needed; the handle is closed before XGBoost writes.
    with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as handle:
        model_file = handle.name
    try:
        original.save_model(model_file)
        restored = xgb.XGBClassifier()
        restored.load_model(model_file)
        probs_after = restored.predict_proba(data)[:, 1]
        np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
    finally:
        os.remove(model_file)
|
||||
|
||||
Reference in New Issue
Block a user