From c994ad4466470999f32e69e7063bf9cbf7d08ca8 Mon Sep 17 00:00:00 2001 From: toto Date: Thu, 9 Apr 2026 00:06:54 +0200 Subject: [PATCH] fix: XGB label query + SHAP isotree compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XGB: query was selecting features from ml_all_scores which doesn't store them. Now joins ml_all_scores (labels) with view_ai_features_1h (features). Dynamically discovers available columns to skip thesis §5 features not present in the view. Returns (model, features) tuple. SHAP: TreeExplainer doesn't support isotree. Fall back to permutation-based Explainer(model.decision_function, X_sample) for isotree. Verified: XGB trained on 50000 labels (18436 positives), triple-voice ensemble scoring active (EIF+AE+XGB), SHAP silent. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bot-detector/bot_detector/bot_detector.py | 92 +++++++++++++------ 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index 106249a..cca0e49 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -435,44 +435,61 @@ def _xgb_meta_path(name: str) -> str: def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple: - """Charge les labels historiques depuis ml_all_scores pour l'entraînement supervisé. + """Charge les labels historiques depuis ml_all_scores + view_ai_features_1h. + + Les labels (threat_level) viennent de ml_all_scores, les features de + view_ai_features_1h via une jointure sur (src_ip, ja4, host). + Les features absentes de la vue (ex: thesis §5 features) sont ignorées. Positifs : threat_level IN ('HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') → label=1 - Négatifs : threat_level = 'NORMAL' → label=0 - Retourne (X, y) ou (None, None) si insuffisant.
+ Négatifs : threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER') → label=0 + Retourne (X, y, usable_features) ou (None, None, None) si insuffisant. """ - feature_cols = ', '.join(features) try: + # Découvrir les colonnes disponibles dans la vue + cols_result = client.query( + f"SELECT name FROM system.columns " + f"WHERE database = '{DB}' AND table = 'view_ai_features_1h'" + ) + available_cols = {row[0] for row in cols_result.result_rows} if cols_result.result_rows else set() + usable_features = [f for f in features if f in available_cols] + if len(usable_features) < 10: + log_info(f"[XGB] Seulement {len(usable_features)} features disponibles dans view_ai_features_1h — insuffisant.") + return None, None, None + + feature_cols = ', '.join(f'f.{c}' for c in usable_features) result = client.query( - f"SELECT {feature_cols}, threat_level " - f"FROM {DB}.ml_all_scores " - f"WHERE threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') " - f"AND window_start >= now() - INTERVAL 7 DAY " + f"SELECT {feature_cols}, s.threat_level " + f"FROM {DB}.ml_all_scores AS s " + f"INNER JOIN {DB}.view_ai_features_1h AS f " + f" ON s.src_ip = f.src_ip AND s.ja4 = f.ja4 AND s.host = f.host " + f"WHERE s.threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') " + f"AND s.window_start >= now() - INTERVAL 7 DAY " f"ORDER BY rand() LIMIT 50000" ) if not result.result_rows: - return None, None - cols = features + ['threat_level'] + return None, None, None + cols = usable_features + ['threat_level'] df = pd.DataFrame(result.result_rows, columns=cols) - df[features] = df[features].apply(pd.to_numeric, errors='coerce') - df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features) + df[usable_features] = df[usable_features].apply(pd.to_numeric, errors='coerce') + df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=usable_features) y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int) 
if y.sum() < 10 or len(y) < min_labels: - return None, None - X = df[features].values - return X, y.values + return None, None, None + X = df[usable_features].values + return X, y.values, usable_features except Exception as exc: log_info(f"[XGB] Erreur chargement labels : {exc}") - return None, None + return None, None, None def load_or_train_xgb(name: str, client, features: list, cycle_id: str): """Charge ou entraîne le modèle XGBoost supervisé. - Retourne le modèle XGBClassifier ou None si indisponible. + Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible. """ if not XGB_AVAILABLE or XGB_WEIGHT <= 0: - return None + return None, None model_path = _xgb_model_path(name) meta_path = _xgb_meta_path(name) @@ -488,23 +505,25 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str): model = xgb.XGBClassifier() model.load_model(model_path) log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).") - return model + return model, meta.get('features', features) except Exception as exc: log_info(f"[XGB][{name}] Erreur chargement : {exc}") # Entraîner un nouveau modèle - X, y = _load_xgb_labels(client, features) + X, y, xgb_features = _load_xgb_labels(client, features) if X is None: log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.") # Tenter de réutiliser un modèle ancien - if os.path.exists(model_path): + if os.path.exists(model_path) and os.path.exists(meta_path): try: model = xgb.XGBClassifier() model.load_model(model_path) - return model + with open(meta_path) as f: + meta = json.load(f) + return model, meta.get('features', features) except Exception: pass - return None + return None, None scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1))) model = xgb.XGBClassifier( @@ -519,7 +538,8 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str): meta = { 'trained_at': 
datetime.now().isoformat(), 'n_labels': len(y), 'n_positive': int(y.sum()), - 'n_negative': int((y == 0).sum()), 'n_features': len(features), + 'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features), + 'features': xgb_features, 'scale_pos_weight': scale_pos, 'model_name': name, } with open(meta_path, 'w') as f: @@ -527,7 +547,7 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str): log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}") log_decision('XGB_TRAINED', cycle_id, name, meta) - return model + return model, xgb_features def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str): @@ -876,12 +896,21 @@ def _compute_shap_top_features(model, X: pd.DataFrame, features: list, n_top: in Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie). Retourne une liste de dicts {feature: shap_value} par ligne. + + Utilise TreeExplainer pour sklearn, et un échantillon Permutation pour isotree. 
""" if not ENABLE_SHAP or X.empty: return [{}] * len(X) try: - explainer = _shap.TreeExplainer(model) - shap_values = explainer.shap_values(X) + if EIF_AVAILABLE: + # isotree non supporté par TreeExplainer — utiliser un échantillon + Permutation + sample_size = min(100, len(X)) + X_sample = X.sample(n=sample_size, random_state=42) if len(X) > sample_size else X + explainer = _shap.Explainer(model.decision_function, X_sample) + shap_values = explainer(X).values + else: + explainer = _shap.TreeExplainer(model) + shap_values = explainer.shap_values(X) result = [] for sv in shap_values: # Features les plus négatives = les plus responsables de l'anomalie @@ -1056,9 +1085,12 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): if XGB_AVAILABLE and XGB_WEIGHT > 0: try: xgb_client = get_client() - xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id) - if xgb_model is not None: - xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1] + xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id) + if xgb_model is not None and xgb_feats is not None: + # XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue) + xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns] + X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0) + xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1] unknown_traffic['xgb_prob'] = xgb_probs # Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob # anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]