From db306fb9da7dbe461584c7fabe8f7e4744615e15 Mon Sep 17 00:00:00 2001 From: toto Date: Wed, 8 Apr 2026 23:33:00 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20P0=20audit=20bugs=20=E2=80=94=20bot-dete?= =?UTF-8?q?ctor=20+=20dashboard=20+=20SQL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bot-detector: - B1.1: campaign_id and raw_anomaly_score now inserted into ml_detected_anomalies - B1.4/B1.5: log_decision argument order fixed (cycle_id, name) - B1.7: AE broadcast error — model now returns features list, scoring uses model's features instead of current cycle's (prevents dim mismatch) - B1.8: Anubis ALLOW bots now get bot_name from anubis_bot_name Dashboard: - C1.1: XSS in ip_detail.html — {{ ip | tojson }} instead of raw string - C1.2: Stored XSS via innerHTML — added escapeHtml() helper, all user-facing formatters (fmtIP, fmtASN, fmtCountry, fmtJA4, fmtJA4Full, fmtBotName, fmtLabel) sanitized - C2.1: traffic endpoint query parameter renamed from status to http_version to match the http_version column it filters - C2.2: heatmap toDayOfWeek() - 1 for 0-indexed JS days SQL: - B1.3: view_ip_recurrence worst_score uses max() not min() (0=normal, 1=anomalous) - B1.6: view_resource_cascade_1h joined into view_thesis_features_1h (§5.4) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bot-detector/bot_detector/bot_detector.py | 22 +++++++++++-------- services/dashboard/backend/routes/api.py | 10 ++++----- .../dashboard/backend/templates/base.html | 19 ++++++++++------ .../backend/templates/ip_detail.html | 2 +- shared/clickhouse/06_ml_tables.sql | 4 ++-- shared/clickhouse/12_thesis_features.sql | 13 ++++++++++- 6 files changed, 45 insertions(+), 25 deletions(-) diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index 5781381..6047856 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -538,7 +538,7 @@ def load_or_train_model(name: str, 
human_baseline: pd.DataFrame, features: list, dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque, met à jour le fichier pointeur et purge les anciennes versions. - Retourne (IsolationForest, TrafficAutoEncoder|None). + Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features). """ model_path, meta = _get_current_version(name) if model_path and meta: @@ -575,7 +575,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.") except Exception as exc: log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.") - return joblib.load(model_path), ae_loaded + return joblib.load(model_path), ae_loaded, meta.get('features', features) elif not drift_forced: log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.") @@ -593,7 +593,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, log_info(f"[{name}] Élagage : {len(low_var_features)} feature(s) à variance < {PRUNE_VARIANCE_THRESHOLD} retirées : {low_var_features}") X = X.drop(columns=low_var_features) features = [f for f in features if f not in low_var_features] - log_decision('FEATURE_PRUNED', name, '', {'pruned': low_var_features, 'remaining': len(features)}) + log_decision('FEATURE_PRUNED', cycle_id, name, {'pruned': low_var_features, 'remaining': len(features)}) # Validation split : réserver 20% pour évaluation offline val_size = max(1, int(len(X) * 0.2)) @@ -620,7 +620,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, VAL_ANOMALY_GATE = float(os.getenv('VAL_ANOMALY_GATE', '0.20')) if val_anomaly_rate > VAL_ANOMALY_GATE: log_info(f"[{name}] ⚠ REJET : val_anomaly_rate={val_anomaly_rate:.2%} > gate={VAL_ANOMALY_GATE:.0%} — baseline probablement contaminée.") - log_decision('MODEL_REJECTED', name, '', { + log_decision('MODEL_REJECTED', cycle_id, name, { 
'val_anomaly_rate': round(val_anomaly_rate, 4), 'gate': VAL_ANOMALY_GATE, 'val_mean_score': round(val_mean_score, 4), 'version_id': version_id, }) @@ -635,7 +635,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False)) except Exception: pass - return joblib.load(model_path), ae_prev + return joblib.load(model_path), ae_prev, meta.get('features', features) log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.") # A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection @@ -694,7 +694,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list, 'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H, 'history_kept': MODEL_HISTORY_COUNT }) - return model, ae_model + return model, ae_model, features # ═══════════════════════════════════════════════════════════════════════════════ # A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT) @@ -1006,10 +1006,12 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): log_info(f'[{name}] Features validées : {len(valid_features)}/{len(features)} ({", ".join(valid_features[:5])}{"…" if len(valid_features) > 5 else ""})') # A1 — Dérive conceptuelle intégrée dans load_or_train_model - model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id) + model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id) + # Utiliser les features du modèle (possiblement différentes après pruning/chargement) + scoring_features = [f for f in model_features if f in unknown_traffic.columns] unknown_traffic = unknown_traffic.copy() - X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan) + X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan) X_test = X_test.fillna(X_test.median()) 
raw_scores = model.decision_function(X_test) log_info(f'[{name}] Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})') @@ -1135,6 +1137,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): anubis_allow['ae_recon_error'] = 0.0 anubis_allow['xgb_prob'] = 0.0 anubis_allow['threat_level'] = 'KNOWN_BOT' + anubis_allow['bot_name'] = anubis_allow['anubis_bot_name'] anubis_allow['model_name'] = name anubis_allow['campaign_id'] = -1 anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name'] @@ -1189,7 +1192,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées if ENABLE_CLUSTERING: - anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model) + anomalies = _cluster_anomalies(anomalies, scoring_features, ae_model=ae_model) anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'}) for _, row in anomalies.iterrows(): @@ -1623,6 +1626,7 @@ def fetch_and_analyze(): cols = [ 'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score', + 'raw_anomaly_score', 'campaign_id', 'threat_level', 'model_name', 'recurrence', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label', 'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio', diff --git a/services/dashboard/backend/routes/api.py b/services/dashboard/backend/routes/api.py index 459c368..611b879 100644 --- a/services/dashboard/backend/routes/api.py +++ b/services/dashboard/backend/routes/api.py @@ -314,7 +314,7 @@ async def traffic( order: str = Query("DESC"), method: str | None = Query(None), host: str | None = Query(None), - status: str | None = Query(None), + http_version: str | None = Query(None), ) -> dict[str, Any]: sort = _validate_sort(sort, _TRAFFIC_SORT_COLS, "time") 
order = _validate_order(order) @@ -331,9 +331,9 @@ async def traffic( where_clauses.append("host LIKE {host:String}") params["host"] = f"%{host}%" - if status is not None: - where_clauses.append("http_version = {status:String}") - params["status"] = status + if http_version is not None: + where_clauses.append("http_version = {http_version:String}") + params["http_version"] = http_version where = " AND ".join(where_clauses) @@ -651,7 +651,7 @@ async def behavior() -> dict[str, Any]: async def heatmap() -> dict[str, Any]: try: cells = query( - f"SELECT toDayOfWeek(time) AS dow, toHour(time) AS hour, count() AS cnt " + f"SELECT toDayOfWeek(time) - 1 AS dow, toHour(time) AS hour, count() AS cnt " f"FROM {_DB_LOGS}.http_logs " "WHERE time >= now() - INTERVAL 7 DAY " "GROUP BY dow, hour ORDER BY dow, hour" diff --git a/services/dashboard/backend/templates/base.html b/services/dashboard/backend/templates/base.html index 84fd630..c18c893 100644 --- a/services/dashboard/backend/templates/base.html +++ b/services/dashboard/backend/templates/base.html @@ -119,10 +119,15 @@ }; return `${level}`; } + function escapeHtml(s) { + const d = document.createElement('div'); + d.textContent = s; + return d.innerHTML; + } function fmtIP(ip) { if (!ip) return ''; let s = String(ip).replace('::ffff:',''); - return `${s}`; + return `${escapeHtml(s)}`; } function fmtScore(v) { let n = parseFloat(v); @@ -134,24 +139,24 @@ // ── Navigation helpers ── function fmtASN(org) { if (!org) return ''; - return `${org}`; + return `${escapeHtml(org)}`; } function fmtCountry(cc) { if (!cc) return ''; const flags = {'FR':'🇫🇷','DE':'🇩🇪','NL':'🇳🇱','GB':'🇬🇧','ES':'🇪🇸','US':'🇺🇸','RU':'🇷🇺','IT':'🇮🇹','JP':'🇯🇵','CN':'🇨🇳','KR':'🇰🇷','BR':'🇧🇷','AU':'🇦🇺','CA':'🇨🇦','IN':'🇮🇳'}; - return `${flags[cc]||'🏳️'} ${cc}`; + return `${flags[cc]||'🏳️'} ${escapeHtml(cc)}`; } function fmtJA4(ja4) { if (!ja4) return ''; - return `${ja4.substring(0,20)}…`; + return `${escapeHtml(ja4.substring(0,20))}…`; } function fmtJA4Full(ja4) 
{ if (!ja4) return ''; - return `${ja4}`; + return `${escapeHtml(ja4)}`; } function fmtBotName(name) { if (!name) return ''; - return `${name}`; + return `${escapeHtml(name)}`; } function fmtThreatLink(level) { if (!level) return ''; @@ -160,7 +165,7 @@ function fmtLabel(label) { if (!label) return ''; const colors = {human:'text-green-400 bg-green-500/10',datacenter:'text-red-400 bg-red-500/10',hosting:'text-orange-400 bg-orange-500/10'}; - return `${label}`; + return `${escapeHtml(label)}`; } // ── ECharts helpers ── diff --git a/services/dashboard/backend/templates/ip_detail.html b/services/dashboard/backend/templates/ip_detail.html index c7a4562..6d6cafc 100644 --- a/services/dashboard/backend/templates/ip_detail.html +++ b/services/dashboard/backend/templates/ip_detail.html @@ -69,7 +69,7 @@ {% endblock %} {% block scripts %}