fix: XGB label query + SHAP isotree compatibility
XGB: the query was selecting features from ml_all_scores, which doesn't store them. It now joins ml_all_scores (labels) with view_ai_features_1h (features) and dynamically discovers the available columns so that thesis §5 features not present in the view are skipped. load_or_train_xgb now returns a (model, features) tuple.

SHAP: TreeExplainer doesn't support isotree. Fall back to the permutation-based Explainer(model.decision_function, X_sample) for isotree.

Verified: XGB trained on 50000 labels (18436 positives), triple-voice ensemble scoring active (EIF+AE+XGB), SHAP silent.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -435,44 +435,61 @@ def _xgb_meta_path(name: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple:
|
def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -> tuple:
|
||||||
"""Charge les labels historiques depuis ml_all_scores pour l'entraînement supervisé.
|
"""Charge les labels historiques depuis ml_all_scores + view_ai_features_1h.
|
||||||
|
|
||||||
|
Les labels (threat_level) viennent de ml_all_scores, les features de
|
||||||
|
view_ai_features_1h via une jointure sur (src_ip, ja4, host).
|
||||||
|
Les features absentes de la vue (ex: thesis §5 features) sont ignorées.
|
||||||
|
|
||||||
Positifs : threat_level IN ('HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') → label=1
|
Positifs : threat_level IN ('HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') → label=1
|
||||||
Négatifs : threat_level = 'NORMAL' → label=0
|
Négatifs : threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER') → label=0
|
||||||
Retourne (X, y) ou (None, None) si insuffisant.
|
Retourne (X, y, usable_features) ou (None, None, None) si insuffisant.
|
||||||
"""
|
"""
|
||||||
feature_cols = ', '.join(features)
|
|
||||||
try:
|
try:
|
||||||
|
# Découvrir les colonnes disponibles dans la vue
|
||||||
|
cols_result = client.query(
|
||||||
|
f"SELECT name FROM system.columns "
|
||||||
|
f"WHERE database = '{DB}' AND table = 'view_ai_features_1h'"
|
||||||
|
)
|
||||||
|
available_cols = {row[0] for row in cols_result.result_rows} if cols_result.result_rows else set()
|
||||||
|
usable_features = [f for f in features if f in available_cols]
|
||||||
|
if len(usable_features) < 10:
|
||||||
|
log_info(f"[XGB] Seulement {len(usable_features)} features disponibles dans view_ai_features_1h — insuffisant.")
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
|
feature_cols = ', '.join(f'f.{c}' for c in usable_features)
|
||||||
result = client.query(
|
result = client.query(
|
||||||
f"SELECT {feature_cols}, threat_level "
|
f"SELECT {feature_cols}, s.threat_level "
|
||||||
f"FROM {DB}.ml_all_scores "
|
f"FROM {DB}.ml_all_scores AS s "
|
||||||
f"WHERE threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
|
f"INNER JOIN {DB}.view_ai_features_1h AS f "
|
||||||
f"AND window_start >= now() - INTERVAL 7 DAY "
|
f" ON s.src_ip = f.src_ip AND s.ja4 = f.ja4 AND s.host = f.host "
|
||||||
|
f"WHERE s.threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
|
||||||
|
f"AND s.window_start >= now() - INTERVAL 7 DAY "
|
||||||
f"ORDER BY rand() LIMIT 50000"
|
f"ORDER BY rand() LIMIT 50000"
|
||||||
)
|
)
|
||||||
if not result.result_rows:
|
if not result.result_rows:
|
||||||
return None, None
|
return None, None, None
|
||||||
cols = features + ['threat_level']
|
cols = usable_features + ['threat_level']
|
||||||
df = pd.DataFrame(result.result_rows, columns=cols)
|
df = pd.DataFrame(result.result_rows, columns=cols)
|
||||||
df[features] = df[features].apply(pd.to_numeric, errors='coerce')
|
df[usable_features] = df[usable_features].apply(pd.to_numeric, errors='coerce')
|
||||||
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)
|
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=usable_features)
|
||||||
y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
|
y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
|
||||||
if y.sum() < 10 or len(y) < min_labels:
|
if y.sum() < 10 or len(y) < min_labels:
|
||||||
return None, None
|
return None, None, None
|
||||||
X = df[features].values
|
X = df[usable_features].values
|
||||||
return X, y.values
|
return X, y.values, usable_features
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log_info(f"[XGB] Erreur chargement labels : {exc}")
|
log_info(f"[XGB] Erreur chargement labels : {exc}")
|
||||||
return None, None
|
return None, None, None
|
||||||
|
|
||||||
|
|
||||||
def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
||||||
"""Charge ou entraîne le modèle XGBoost supervisé.
|
"""Charge ou entraîne le modèle XGBoost supervisé.
|
||||||
|
|
||||||
Retourne le modèle XGBClassifier ou None si indisponible.
|
Retourne (XGBClassifier, list[str] features) ou (None, None) si indisponible.
|
||||||
"""
|
"""
|
||||||
if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
|
if not XGB_AVAILABLE or XGB_WEIGHT <= 0:
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
model_path = _xgb_model_path(name)
|
model_path = _xgb_model_path(name)
|
||||||
meta_path = _xgb_meta_path(name)
|
meta_path = _xgb_meta_path(name)
|
||||||
@@ -488,23 +505,25 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
|||||||
model = xgb.XGBClassifier()
|
model = xgb.XGBClassifier()
|
||||||
model.load_model(model_path)
|
model.load_model(model_path)
|
||||||
log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
|
log_info(f"[XGB][{name}] Modèle rechargé ({age_h:.1f}h / {XGB_RETRAIN_INTERVAL_H}h, {meta.get('n_labels', '?')} labels).")
|
||||||
return model
|
return model, meta.get('features', features)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log_info(f"[XGB][{name}] Erreur chargement : {exc}")
|
log_info(f"[XGB][{name}] Erreur chargement : {exc}")
|
||||||
|
|
||||||
# Entraîner un nouveau modèle
|
# Entraîner un nouveau modèle
|
||||||
X, y = _load_xgb_labels(client, features)
|
X, y, xgb_features = _load_xgb_labels(client, features)
|
||||||
if X is None:
|
if X is None:
|
||||||
log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
|
log_info(f"[XGB][{name}] Labels insuffisants (< {XGB_MIN_LABELS}) — XGBoost désactivé ce cycle.")
|
||||||
# Tenter de réutiliser un modèle ancien
|
# Tenter de réutiliser un modèle ancien
|
||||||
if os.path.exists(model_path):
|
if os.path.exists(model_path) and os.path.exists(meta_path):
|
||||||
try:
|
try:
|
||||||
model = xgb.XGBClassifier()
|
model = xgb.XGBClassifier()
|
||||||
model.load_model(model_path)
|
model.load_model(model_path)
|
||||||
return model
|
with open(meta_path) as f:
|
||||||
|
meta = json.load(f)
|
||||||
|
return model, meta.get('features', features)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
scale_pos = max(1, int((y == 0).sum() / max((y == 1).sum(), 1)))
|
||||||
model = xgb.XGBClassifier(
|
model = xgb.XGBClassifier(
|
||||||
@@ -519,7 +538,8 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
|||||||
meta = {
|
meta = {
|
||||||
'trained_at': datetime.now().isoformat(),
|
'trained_at': datetime.now().isoformat(),
|
||||||
'n_labels': len(y), 'n_positive': int(y.sum()),
|
'n_labels': len(y), 'n_positive': int(y.sum()),
|
||||||
'n_negative': int((y == 0).sum()), 'n_features': len(features),
|
'n_negative': int((y == 0).sum()), 'n_features': len(xgb_features),
|
||||||
|
'features': xgb_features,
|
||||||
'scale_pos_weight': scale_pos, 'model_name': name,
|
'scale_pos_weight': scale_pos, 'model_name': name,
|
||||||
}
|
}
|
||||||
with open(meta_path, 'w') as f:
|
with open(meta_path, 'w') as f:
|
||||||
@@ -527,7 +547,7 @@ def load_or_train_xgb(name: str, client, features: list, cycle_id: str):
|
|||||||
|
|
||||||
log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
|
log_info(f"[XGB][{name}] Modèle entraîné : {len(y)} labels ({y.sum()} positifs), scale_pos_weight={scale_pos}")
|
||||||
log_decision('XGB_TRAINED', cycle_id, name, meta)
|
log_decision('XGB_TRAINED', cycle_id, name, meta)
|
||||||
return model
|
return model, xgb_features
|
||||||
|
|
||||||
|
|
||||||
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, cycle_id: str):
|
||||||
@@ -876,12 +896,21 @@ def _compute_shap_top_features(model, X: pd.DataFrame, features: list, n_top: in
|
|||||||
Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features
|
Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features
|
||||||
les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie).
|
les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie).
|
||||||
Retourne une liste de dicts {feature: shap_value} par ligne.
|
Retourne une liste de dicts {feature: shap_value} par ligne.
|
||||||
|
|
||||||
|
Utilise TreeExplainer pour sklearn, et un échantillon Permutation pour isotree.
|
||||||
"""
|
"""
|
||||||
if not ENABLE_SHAP or X.empty:
|
if not ENABLE_SHAP or X.empty:
|
||||||
return [{}] * len(X)
|
return [{}] * len(X)
|
||||||
try:
|
try:
|
||||||
explainer = _shap.TreeExplainer(model)
|
if EIF_AVAILABLE:
|
||||||
shap_values = explainer.shap_values(X)
|
# isotree non supporté par TreeExplainer — utiliser un échantillon + Permutation
|
||||||
|
sample_size = min(100, len(X))
|
||||||
|
X_sample = X.sample(n=sample_size, random_state=42) if len(X) > sample_size else X
|
||||||
|
explainer = _shap.Explainer(model.decision_function, X_sample)
|
||||||
|
shap_values = explainer(X).values
|
||||||
|
else:
|
||||||
|
explainer = _shap.TreeExplainer(model)
|
||||||
|
shap_values = explainer.shap_values(X)
|
||||||
result = []
|
result = []
|
||||||
for sv in shap_values:
|
for sv in shap_values:
|
||||||
# Features les plus négatives = les plus responsables de l'anomalie
|
# Features les plus négatives = les plus responsables de l'anomalie
|
||||||
@@ -1056,9 +1085,12 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
|||||||
if XGB_AVAILABLE and XGB_WEIGHT > 0:
|
if XGB_AVAILABLE and XGB_WEIGHT > 0:
|
||||||
try:
|
try:
|
||||||
xgb_client = get_client()
|
xgb_client = get_client()
|
||||||
xgb_model = load_or_train_xgb(name, xgb_client, valid_features, cycle_id)
|
xgb_model, xgb_feats = load_or_train_xgb(name, xgb_client, scoring_features, cycle_id)
|
||||||
if xgb_model is not None:
|
if xgb_model is not None and xgb_feats is not None:
|
||||||
xgb_probs = xgb_model.predict_proba(X_test.values)[:, 1]
|
# XGB peut utiliser un sous-ensemble de features (celles disponibles dans la vue)
|
||||||
|
xgb_cols = [f for f in xgb_feats if f in unknown_traffic.columns]
|
||||||
|
X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||||
|
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
|
||||||
unknown_traffic['xgb_prob'] = xgb_probs
|
unknown_traffic['xgb_prob'] = xgb_probs
|
||||||
# Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
|
# Méta-learner : combiner anomaly_score (EIF+AE) et xgb_prob
|
||||||
# anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
|
# anomaly_score déjà normalisé [0,1], xgb_prob est [0,1]
|
||||||
|
|||||||
Reference in New Issue
Block a user