fix: P0 audit bugs — bot-detector + dashboard + SQL
Bot-detector:
- B1.1: campaign_id and raw_anomaly_score now inserted into ml_detected_anomalies
- B1.4/B1.5: log_decision argument order fixed (cycle_id, name)
- B1.7: AE broadcast error — load_or_train_model now also returns the model's feature list; scoring
and clustering use the model's features instead of the current cycle's (prevents dimension mismatch)
- B1.8: Anubis ALLOW bots now get bot_name from anubis_bot_name
Dashboard:
- C1.1: XSS in ip_detail.html — {{ ip | tojson }} instead of raw string
- C1.2: Stored XSS via innerHTML — added escapeHtml() helper, all user-facing
formatters (fmtIP, fmtASN, fmtCountry, fmtJA4, fmtBotName, fmtLabel) sanitized
- C2.1: status filter now correctly filters http_version column
- C2.2: heatmap now uses toDayOfWeek() - 1 to match 0-indexed JS days
SQL:
- B1.3: view_ip_recurrence worst_score uses max() not min() (0=normal, 1=anomalous)
- B1.6: view_resource_cascade_1h joined into view_thesis_features_1h (§5.4)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -538,7 +538,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
|
||||
met à jour le fichier pointeur et purge les anciennes versions.
|
||||
|
||||
Retourne (IsolationForest, TrafficAutoEncoder|None).
|
||||
Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features).
|
||||
"""
|
||||
model_path, meta = _get_current_version(name)
|
||||
if model_path and meta:
|
||||
@ -575,7 +575,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
|
||||
return joblib.load(model_path), ae_loaded
|
||||
return joblib.load(model_path), ae_loaded, meta.get('features', features)
|
||||
elif not drift_forced:
|
||||
log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
|
||||
|
||||
@ -593,7 +593,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
log_info(f"[{name}] Élagage : {len(low_var_features)} feature(s) à variance < {PRUNE_VARIANCE_THRESHOLD} retirées : {low_var_features}")
|
||||
X = X.drop(columns=low_var_features)
|
||||
features = [f for f in features if f not in low_var_features]
|
||||
log_decision('FEATURE_PRUNED', name, '', {'pruned': low_var_features, 'remaining': len(features)})
|
||||
log_decision('FEATURE_PRUNED', cycle_id, name, {'pruned': low_var_features, 'remaining': len(features)})
|
||||
|
||||
# Validation split : réserver 20% pour évaluation offline
|
||||
val_size = max(1, int(len(X) * 0.2))
|
||||
@ -620,7 +620,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
VAL_ANOMALY_GATE = float(os.getenv('VAL_ANOMALY_GATE', '0.20'))
|
||||
if val_anomaly_rate > VAL_ANOMALY_GATE:
|
||||
log_info(f"[{name}] ⚠ REJET : val_anomaly_rate={val_anomaly_rate:.2%} > gate={VAL_ANOMALY_GATE:.0%} — baseline probablement contaminée.")
|
||||
log_decision('MODEL_REJECTED', name, '', {
|
||||
log_decision('MODEL_REJECTED', cycle_id, name, {
|
||||
'val_anomaly_rate': round(val_anomaly_rate, 4), 'gate': VAL_ANOMALY_GATE,
|
||||
'val_mean_score': round(val_mean_score, 4), 'version_id': version_id,
|
||||
})
|
||||
@ -635,7 +635,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
|
||||
except Exception:
|
||||
pass
|
||||
return joblib.load(model_path), ae_prev
|
||||
return joblib.load(model_path), ae_prev, meta.get('features', features)
|
||||
log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
|
||||
|
||||
# A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
|
||||
@ -694,7 +694,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
|
||||
'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
|
||||
'history_kept': MODEL_HISTORY_COUNT
|
||||
})
|
||||
return model, ae_model
|
||||
return model, ae_model, features
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
|
||||
@ -1006,10 +1006,12 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
log_info(f'[{name}] Features validées : {len(valid_features)}/{len(features)} ({", ".join(valid_features[:5])}{"…" if len(valid_features) > 5 else ""})')
|
||||
|
||||
# A1 — Dérive conceptuelle intégrée dans load_or_train_model
|
||||
model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||
model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id)
|
||||
# Utiliser les features du modèle (possiblement différentes après pruning/chargement)
|
||||
scoring_features = [f for f in model_features if f in unknown_traffic.columns]
|
||||
unknown_traffic = unknown_traffic.copy()
|
||||
|
||||
X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
|
||||
X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
|
||||
X_test = X_test.fillna(X_test.median())
|
||||
raw_scores = model.decision_function(X_test)
|
||||
log_info(f'[{name}] Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')
|
||||
@ -1135,6 +1137,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
anubis_allow['ae_recon_error'] = 0.0
|
||||
anubis_allow['xgb_prob'] = 0.0
|
||||
anubis_allow['threat_level'] = 'KNOWN_BOT'
|
||||
anubis_allow['bot_name'] = anubis_allow['anubis_bot_name']
|
||||
anubis_allow['model_name'] = name
|
||||
anubis_allow['campaign_id'] = -1
|
||||
anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name']
|
||||
@ -1189,7 +1192,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
|
||||
# A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
|
||||
if ENABLE_CLUSTERING:
|
||||
anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
|
||||
anomalies = _cluster_anomalies(anomalies, scoring_features, ae_model=ae_model)
|
||||
|
||||
anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
|
||||
for _, row in anomalies.iterrows():
|
||||
@ -1623,6 +1626,7 @@ def fetch_and_analyze():
|
||||
|
||||
cols = [
|
||||
'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score',
|
||||
'raw_anomaly_score', 'campaign_id',
|
||||
'threat_level', 'model_name', 'recurrence',
|
||||
'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
|
||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
|
||||
|
||||
Reference in New Issue
Block a user