From db306fb9da7dbe461584c7fabe8f7e4744615e15 Mon Sep 17 00:00:00 2001
From: toto <antitbone@jefaismonrhum.sdv.fr>
Date: Wed, 8 Apr 2026 23:33:00 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20P0=20audit=20bugs=20=E2=80=94=20bot-dete?=
 =?UTF-8?q?ctor=20+=20dashboard=20+=20SQL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bot-detector:
- B1.1: campaign_id and raw_anomaly_score now inserted into ml_detected_anomalies
- B1.4/B1.5: log_decision argument order fixed (cycle_id, name)
- B1.7: AE broadcast error — load_or_train_model() now also returns the model's
  feature list; scoring and clustering use it instead of the current cycle's
  features (prevents dimension mismatch)
- B1.8: Anubis ALLOW bots now get bot_name from anubis_bot_name

Dashboard:
- C1.1: XSS in ip_detail.html — {{ ip | tojson }} instead of raw string
- C1.2: Stored XSS via innerHTML — added escapeHtml() helper; user-facing
  formatters (fmtIP, fmtASN, fmtCountry, fmtJA4, fmtJA4Full, fmtBotName,
  fmtLabel) now escape the text they display
- C2.1: traffic filter query parameter renamed from status to http_version,
  matching the http_version column it actually filters
- C2.2: heatmap uses toDayOfWeek() - 1 so the day index sent to the JS frontend
  is 0-based (0 = Monday … 6 = Sunday)

SQL:
- B1.3: view_ip_recurrence worst_score / worst_threat_level use max() / argMax()
  instead of min() / argMin() (0=normal, 1=anomalous)
- B1.6: view_resource_cascade_1h joined into view_thesis_features_1h (§5.4)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../bot-detector/bot_detector/bot_detector.py | 22 +++++++++++--------
 services/dashboard/backend/routes/api.py      | 10 ++++-----
 .../dashboard/backend/templates/base.html     | 19 ++++++++++------
 .../backend/templates/ip_detail.html          |  2 +-
 shared/clickhouse/06_ml_tables.sql            |  4 ++--
 shared/clickhouse/12_thesis_features.sql      | 13 ++++++++++-
 6 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py
index 5781381..6047856 100644
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@@ -538,7 +538,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
     dérive, entraîne un nouveau modèle sur ``human_baseline``, le sérialise sur disque,
     met à jour le fichier pointeur et purge les anciennes versions.
 
-    Retourne (IsolationForest, TrafficAutoEncoder|None).
+    Retourne (IsolationForest, TrafficAutoEncoder|None, list[str] features).
     """
     model_path, meta = _get_current_version(name)
     if model_path and meta:
@@ -575,7 +575,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
                         log_info(f"[{name}] Autoencoder v{meta['version_id']} rechargé.")
                     except Exception as exc:
                         log_info(f"[{name}] Erreur chargement AE : {exc} — AE désactivé ce cycle.")
-            return joblib.load(model_path), ae_loaded
+            return joblib.load(model_path), ae_loaded, meta.get('features', features)
         elif not drift_forced:
             log_info(f"[{name}] Modèle v{meta['version_id']} expiré ({age_h:.1f}h ≥ {RETRAIN_INTERVAL_H}h) — retraining.")
 
@@ -593,7 +593,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
         log_info(f"[{name}] Élagage : {len(low_var_features)} feature(s) à variance < {PRUNE_VARIANCE_THRESHOLD} retirées : {low_var_features}")
         X = X.drop(columns=low_var_features)
         features = [f for f in features if f not in low_var_features]
-        log_decision('FEATURE_PRUNED', name, '', {'pruned': low_var_features, 'remaining': len(features)})
+        log_decision('FEATURE_PRUNED', cycle_id, name, {'pruned': low_var_features, 'remaining': len(features)})
 
     # Validation split : réserver 20% pour évaluation offline
     val_size = max(1, int(len(X) * 0.2))
@@ -620,7 +620,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
     VAL_ANOMALY_GATE = float(os.getenv('VAL_ANOMALY_GATE', '0.20'))
     if val_anomaly_rate > VAL_ANOMALY_GATE:
         log_info(f"[{name}] ⚠ REJET : val_anomaly_rate={val_anomaly_rate:.2%} > gate={VAL_ANOMALY_GATE:.0%} — baseline probablement contaminée.")
-        log_decision('MODEL_REJECTED', name, '', {
+        log_decision('MODEL_REJECTED', cycle_id, name, {
             'val_anomaly_rate': round(val_anomaly_rate, 4), 'gate': VAL_ANOMALY_GATE,
             'val_mean_score': round(val_mean_score, 4), 'version_id': version_id,
         })
@@ -635,7 +635,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
                         ae_prev = TrafficAutoEncoder.load_state_dict(torch.load(ae_prev_path, weights_only=False))
                     except Exception:
                         pass
-            return joblib.load(model_path), ae_prev
+            return joblib.load(model_path), ae_prev, meta.get('features', features)
         log_info(f"[{name}] Aucun modèle précédent — utilisation du modèle rejeté par défaut.")
 
     # A1 — Sauvegarder les statistiques de distribution avec quantile digest pour drift detection
@@ -694,7 +694,7 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
         'human_samples': len(human_baseline), 'next_retrain_in_h': RETRAIN_INTERVAL_H,
         'history_kept': MODEL_HISTORY_COUNT
     })
-    return model, ae_model
+    return model, ae_model, features
 
 # ═══════════════════════════════════════════════════════════════════════════════
 # A1 — DÉTECTION DE DÉRIVE CONCEPTUELLE (CONCEPT DRIFT)
@@ -1006,10 +1006,12 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
     log_info(f'[{name}]   Features validées : {len(valid_features)}/{len(features)} ({", ".join(valid_features[:5])}{"…" if len(valid_features) > 5 else ""})')
 
     # A1 — Dérive conceptuelle intégrée dans load_or_train_model
-    model, ae_model = load_or_train_model(name, human_baseline, valid_features, cycle_id)
+    model, ae_model, model_features = load_or_train_model(name, human_baseline, valid_features, cycle_id)
+    # Utiliser les features du modèle (possiblement différentes après pruning/chargement)
+    scoring_features = [f for f in model_features if f in unknown_traffic.columns]
     unknown_traffic = unknown_traffic.copy()
 
-    X_test = unknown_traffic[valid_features].replace([np.inf, -np.inf], np.nan)
+    X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
     X_test = X_test.fillna(X_test.median())
     raw_scores = model.decision_function(X_test)
     log_info(f'[{name}]   Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')
@@ -1135,6 +1137,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
         anubis_allow['ae_recon_error'] = 0.0
         anubis_allow['xgb_prob'] = 0.0
         anubis_allow['threat_level'] = 'KNOWN_BOT'
+        anubis_allow['bot_name'] = anubis_allow['anubis_bot_name']
         anubis_allow['model_name'] = name
         anubis_allow['campaign_id'] = -1
         anubis_allow['reason'] = '[Anubis ALLOW] ' + anubis_allow['anubis_bot_name']
@@ -1189,7 +1192,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
 
         # A8 — Clustering DBSCAN pour identifier les campagnes coordonnées
         if ENABLE_CLUSTERING:
-            anomalies = _cluster_anomalies(anomalies, valid_features, ae_model=ae_model)
+            anomalies = _cluster_anomalies(anomalies, scoring_features, ae_model=ae_model)
 
         anomalies['ja4'] = anomalies['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
         for _, row in anomalies.iterrows():
@@ -1623,6 +1626,7 @@ def fetch_and_analyze():
 
         cols = [
             'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score',
+            'raw_anomaly_score', 'campaign_id',
             'threat_level', 'model_name', 'recurrence',
             'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
             'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
diff --git a/services/dashboard/backend/routes/api.py b/services/dashboard/backend/routes/api.py
index 459c368..611b879 100644
--- a/services/dashboard/backend/routes/api.py
+++ b/services/dashboard/backend/routes/api.py
@@ -314,7 +314,7 @@ async def traffic(
     order: str = Query("DESC"),
     method: str | None = Query(None),
     host: str | None = Query(None),
-    status: str | None = Query(None),
+    http_version: str | None = Query(None),
 ) -> dict[str, Any]:
     sort = _validate_sort(sort, _TRAFFIC_SORT_COLS, "time")
     order = _validate_order(order)
@@ -331,9 +331,9 @@ async def traffic(
         where_clauses.append("host LIKE {host:String}")
         params["host"] = f"%{host}%"
 
-    if status is not None:
-        where_clauses.append("http_version = {status:String}")
-        params["status"] = status
+    if http_version is not None:
+        where_clauses.append("http_version = {http_version:String}")
+        params["http_version"] = http_version
 
     where = " AND ".join(where_clauses)
 
@@ -651,7 +651,7 @@ async def behavior() -> dict[str, Any]:
 async def heatmap() -> dict[str, Any]:
     try:
         cells = query(
-            f"SELECT toDayOfWeek(time) AS dow, toHour(time) AS hour, count() AS cnt "
+            f"SELECT toDayOfWeek(time) - 1 AS dow, toHour(time) AS hour, count() AS cnt "
             f"FROM {_DB_LOGS}.http_logs "
             "WHERE time >= now() - INTERVAL 7 DAY "
             "GROUP BY dow, hour ORDER BY dow, hour"
diff --git a/services/dashboard/backend/templates/base.html b/services/dashboard/backend/templates/base.html
index 84fd630..c18c893 100644
--- a/services/dashboard/backend/templates/base.html
+++ b/services/dashboard/backend/templates/base.html
@@ -119,10 +119,15 @@
             };
             return `<span class="badge ${map[level]||'badge-normal'}">${level}</span>`;
         }
+        function escapeHtml(s) {
+            // Escape quotes too: callers also interpolate the result into attribute values (title="…"), where a textContent→innerHTML round-trip leaves quotes unescaped.
+            const map = {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
+            return String(s == null ? '' : s).replace(/[&<>"']/g, c => map[c]);
+        }
         function fmtIP(ip) {
             if (!ip) return '';
             let s = String(ip).replace('::ffff:','');
-            return `<a href="/ip/${encodeURIComponent(s)}" class="text-brand-500 hover:underline">${s}</a>`;
+            return `<a href="/ip/${encodeURIComponent(s)}" class="text-brand-500 hover:underline">${escapeHtml(s)}</a>`;
         }
         function fmtScore(v) {
             let n = parseFloat(v);
@@ -134,24 +139,24 @@
         // ── Navigation helpers ──
         function fmtASN(org) {
             if (!org) return '';
-            return `<a href="/detections?asn_org=${encodeURIComponent(org)}" class="text-blue-400 hover:underline cursor-pointer">${org}</a>`;
+            return `<a href="/detections?asn_org=${encodeURIComponent(org)}" class="text-blue-400 hover:underline cursor-pointer">${escapeHtml(org)}</a>`;
         }
         function fmtCountry(cc) {
             if (!cc) return '';
             const flags = {'FR':'🇫🇷','DE':'🇩🇪','NL':'🇳🇱','GB':'🇬🇧','ES':'🇪🇸','US':'🇺🇸','RU':'🇷🇺','IT':'🇮🇹','JP':'🇯🇵','CN':'🇨🇳','KR':'🇰🇷','BR':'🇧🇷','AU':'🇦🇺','CA':'🇨🇦','IN':'🇮🇳'};
-            return `<a href="/detections?country_code=${encodeURIComponent(cc)}" class="hover:underline cursor-pointer">${flags[cc]||'🏳️'} ${cc}</a>`;
+            return `<a href="/detections?country_code=${encodeURIComponent(cc)}" class="hover:underline cursor-pointer">${flags[cc]||'🏳️'} ${escapeHtml(cc)}</a>`;
         }
         function fmtJA4(ja4) {
             if (!ja4) return '';
-            return `<a href="/detections?ja4=${encodeURIComponent(ja4)}" class="text-purple-400 hover:underline cursor-pointer font-mono text-xs" title="${ja4}">${ja4.substring(0,20)}…</a>`;
+            return `<a href="/detections?ja4=${encodeURIComponent(ja4)}" class="text-purple-400 hover:underline cursor-pointer font-mono text-xs" title="${escapeHtml(ja4)}">${escapeHtml(ja4.substring(0,20))}…</a>`;
         }
         function fmtJA4Full(ja4) {
             if (!ja4) return '';
-            return `<a href="/detections?ja4=${encodeURIComponent(ja4)}" class="text-purple-400 hover:underline cursor-pointer font-mono text-xs">${ja4}</a>`;
+            return `<a href="/detections?ja4=${encodeURIComponent(ja4)}" class="text-purple-400 hover:underline cursor-pointer font-mono text-xs">${escapeHtml(ja4)}</a>`;
         }
         function fmtBotName(name) {
             if (!name) return '';
-            return `<a href="/detections?bot_name=${encodeURIComponent(name)}" class="text-cyan-400 hover:underline cursor-pointer">${name}</a>`;
+            return `<a href="/detections?bot_name=${encodeURIComponent(name)}" class="text-cyan-400 hover:underline cursor-pointer">${escapeHtml(name)}</a>`;
         }
         function fmtThreatLink(level) {
             if (!level) return '';
@@ -160,7 +165,7 @@
         function fmtLabel(label) {
             if (!label) return '';
             const colors = {human:'text-green-400 bg-green-500/10',datacenter:'text-red-400 bg-red-500/10',hosting:'text-orange-400 bg-orange-500/10'};
-            return `<span class="px-1.5 py-0.5 rounded text-xs ${colors[label]||'text-gray-400 bg-gray-500/10'}">${label}</span>`;
+            return `<span class="px-1.5 py-0.5 rounded text-xs ${colors[label]||'text-gray-400 bg-gray-500/10'}">${escapeHtml(label)}</span>`;
         }
 
         // ── ECharts helpers ──
diff --git a/services/dashboard/backend/templates/ip_detail.html b/services/dashboard/backend/templates/ip_detail.html
index c7a4562..6d6cafc 100644
--- a/services/dashboard/backend/templates/ip_detail.html
+++ b/services/dashboard/backend/templates/ip_detail.html
@@ -69,7 +69,7 @@
 {% endblock %}
 {% block scripts %}
 <script>
-const IP = "{{ ip }}";
+const IP = {{ ip | tojson }};
 let charts = {};
 function initChart(id) {
     const el = document.getElementById(id);
diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql
index 35420da..a7ce601 100644
--- a/shared/clickhouse/06_ml_tables.sql
+++ b/shared/clickhouse/06_ml_tables.sql
@@ -135,8 +135,8 @@ SELECT
     count()                             AS recurrence,
     min(detected_at)                    AS first_seen,
     max(detected_at)                    AS last_seen,
-    min(anomaly_score)                  AS worst_score,
-    argMin(threat_level, anomaly_score) AS worst_threat_level
+    max(anomaly_score)                  AS worst_score,
+    argMax(threat_level, anomaly_score) AS worst_threat_level
 FROM ja4_processing.ml_detected_anomalies
 -- Filtre temporel aligné sur le TTL de la table (30 jours)
 -- Évite de scanner les partitions expirées non encore supprimées par le TTL
diff --git a/shared/clickhouse/12_thesis_features.sql b/shared/clickhouse/12_thesis_features.sql
index d967125..3e78968 100644
--- a/shared/clickhouse/12_thesis_features.sql
+++ b/shared/clickhouse/12_thesis_features.sql
@@ -419,6 +419,7 @@ cross_domain_features AS (
 
 -- ── Jointure finale : features §5.1/§5.3 par (window, ip, ja4, host)
 --    enrichies des features §5.5/§5.8 par (window, ip)
+--    et des features §5.4 Resource Cascade par (window, ip, ja4, host)
 SELECT
     p.window_start,
     p.src_ip,
@@ -434,6 +435,11 @@ SELECT
     c.lag1_autocorrelation,
     c.benford_deviation,
     c.cadence_request_count,
+    -- §5.4 Resource Dependency Tree
+    coalesce(rc.doc_count, 0) AS doc_count,
+    coalesce(rc.asset_count, 0) AS asset_count,
+    coalesce(rc.root_to_first_asset_delay, -1.0) AS root_to_first_asset_delay,
+    coalesce(rc.asset_load_stddev, -1.0) AS asset_load_stddev,
     -- §5.5 Intra-Session JA4 Drift
     d.ja4_drift_ratio,
     d.ja4_distinct_in_session,
@@ -449,7 +455,12 @@ LEFT JOIN cadence_features c
     AND p.host = c.host
 LEFT JOIN cross_domain_features d
     ON p.window_start = d.window_start
-    AND p.src_ip = d.src_ip;
+    AND p.src_ip = d.src_ip
+LEFT JOIN ja4_processing.view_resource_cascade_1h rc
+    ON p.window_start = rc.window_start
+    AND p.src_ip = rc.src_ip
+    AND p.ja4 = rc.ja4
+    AND p.host = rc.host;
 
 
 -- =============================================================================