feat: HTTP/2 passive fingerprinting with individual SETTINGS fields

Complete implementation of HTTP/2 passive fingerprinting per thesis §2.5.3:

mod-reqin-log (C module):
- Replace connection-level filter with ap_hook_process_connection (APR_HOOK_FIRST) to capture H2 preface before mod_http2 takes over the connection
- AP_MODE_SPECULATIVE read of 512 bytes from c->input_filters
- Parse SETTINGS, WINDOW_UPDATE, PRIORITY flags, pseudo-header order
- Output individual SETTINGS params as separate JSON fields (IDs 1-6, 8)
- Read H2 notes from c1 (master connection) for mod_http2 secondary conns
- Fix header_order_signature JSON length bug (26→strlen)

ClickHouse schema:
- Add 8 new columns to http_logs: h2_has_priority, h2_header_table_size, h2_enable_push, h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size, h2_max_header_list_size, h2_enable_connect_protocol
- Use Int32/Int64 with DEFAULT -1 to distinguish absent vs zero
- Update mv_http_logs to extract individual fields via JSONHas/JSONExtractInt
- Migration 04_http2_fields.sql updated for existing deployments

Correlator:
- Accept both timestamp_ns and timestamp field names (backward compat)

Integration:
- Enable HTTP/2 in Apache: Protocols h2 http/1.1 in httpd-integration.conf

Validated end-to-end via Playwright: H2 curl traffic → mod-reqin-log → correlator → ClickHouse with all 12 H2 columns populated correctly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-11 02:33:45 +02:00
parent bd81331411
commit 85d3b95b7b
25 changed files with 649 additions and 160 deletions
--- a/services/dashboard/backend/routes/api.py
+++ b/services/dashboard/backend/routes/api.py
@ -5,6 +5,8 @@ from __future__ import annotations
 import json
 import logging
 import os
+import re
+from collections import defaultdict
 from pathlib import Path
 from typing import Any

@ -21,6 +23,34 @@ router = APIRouter(prefix="/api")
 _DB = safe_identifier(DB_PROCESSING)
 _DB_LOGS = safe_identifier(DB_LOGS)

+# Regexes used to extract SHAP/ExIFFI feature contributions from the `reason` field.
+# Expected format: "SHAP: feat1(+0.123) | feat2(-0.456)" or "ExIFFI: ..."
+# _SHAP_RE captures the text following the "SHAP:" / "ExIFFI:" prefix, stopping
+# before a "| Threat" segment or at the end of the string.
+# _FEAT_RE matches a single "name(value)" pair; the value is an optionally
+# signed number that must contain a decimal point.
+_SHAP_RE = re.compile(r"(?:SHAP|ExIFFI):\s*(.+?)(?:\s*\|\s*Threat|$)")
+_FEAT_RE = re.compile(r"(\w+)\(([+-]?\d+\.\d+)\)")
+
+
+def _aggregate_shap_importance(reasons: list[str]) -> list[dict]:
+    """Aggregate the SHAP/ExIFFI values extracted from ``reason`` strings.
+
+    Args:
+        reasons: Raw ``reason`` strings. Falsy entries and strings that do
+            not match ``_SHAP_RE`` are ignored.
+
+    Returns:
+        One dict per feature name with the keys ``name``, ``importance``
+        (mean of the absolute values, rounded to 4 decimals) and
+        ``occurrences`` (number of extracted values), sorted by decreasing
+        ``importance``. An empty list when no value could be extracted.
+    """
+    # Per-feature running sum of absolute values and number of values seen.
+    totals: dict[str, float] = defaultdict(float)
+    counts: dict[str, int] = defaultdict(int)
+    for reason in reasons:
+        # A falsy reason is searched as "" so it simply fails to match.
+        m = _SHAP_RE.search(reason or "")
+        if not m:
+            continue
+        # group(1) is the text after the "SHAP:" / "ExIFFI:" prefix.
+        for feat_match in _FEAT_RE.finditer(m.group(1)):
+            name = feat_match.group(1)
+            # The sign is discarded: only the magnitude is aggregated.
+            val = abs(float(feat_match.group(2)))
+            totals[name] += val
+            counts[name] += 1
+    if not totals:
+        return []
+    # Negated key => descending order of mean absolute value.
+    return sorted(
+        [{"name": k, "importance": round(totals[k] / counts[k], 4), "occurrences": counts[k]}
+         for k in totals],
+        key=lambda x: -x["importance"],
+    )
+
+
 # Whitelists for sort/order to prevent SQL injection
 _DETECTION_SORT_COLS = {
    "detected_at", "src_ip", "ja4", "host", "anomaly_score",
@ -500,7 +530,7 @@ async def features() -> dict[str, Any]:
    except Exception:
        pass

-    # Feature variance (importance proxy)
+    # Feature variance (importance proxy — fallback si SHAP indisponible)
    try:
        variance_rows = query(
            f"SELECT "
@ -523,6 +553,22 @@ async def features() -> dict[str, Any]:
    except Exception:
        pass

+    # SHAP/ExIFFI — importance réelle extraite des anomalies détectées
+    try:
+        reason_rows = query(
+            f"SELECT reason FROM {_DB}.ml_detected_anomalies "
+            "WHERE reason LIKE '%SHAP:%' OR reason LIKE '%ExIFFI:%' "
+            "ORDER BY detected_at DESC LIMIT 500"
+        )
+        if reason_rows:
+            shap_importance = _aggregate_shap_importance(
+                [r["reason"] for r in reason_rows]
+            )
+            if shap_importance:
+                result["shap_importance"] = shap_importance
+    except Exception:
+        logger.debug("SHAP importance extraction unavailable")
+
    return result


@ -846,11 +892,11 @@ async def classify_suggested() -> dict[str, Any]:
 # ---------------------------------------------------------------------------
 class ClassifyRequest(BaseModel):
    src_ip: str
-    classification: str  # bot | legitimate | suspicious
+    classification: str  # true_positive | false_positive | suspicious
    comment: str = ""


-_VALID_CLASSIFICATIONS = {"bot", "legitimate", "suspicious"}
+_VALID_CLASSIFICATIONS = {"true_positive", "false_positive", "suspicious"}
 _feedback_table_ensured = False


--- a/services/dashboard/backend/templates/classify.html
+++ b/services/dashboard/backend/templates/classify.html
@ -6,7 +6,7 @@
        <h4>Feedback analyste SOC</h4>
        <p>Classifiez les IPs pour entraîner le modèle XGBoost supervisé. Les labels sont utilisés au prochain cycle ML.</p>
        <p><strong>Workflow :</strong> 1. Consultez les IPs suggérées (non classifiées). 2. Classifiez-les. 3. Les labels alimentent XGBoost au prochain cycle.</p>
-        <p><strong>Bot :</strong> Confirme une IP malveillante. <strong>Légitime :</strong> Faux positif. <strong>Suspect :</strong> À surveiller.</p>
+        <p><strong>Vrai positif :</strong> Confirme un bot détecté. <strong>Faux positif :</strong> Trafic légitime mal détecté. <strong>Suspect :</strong> À surveiller.</p>
        <p class="doc-source">Source : soc_feedback → XGBoost training</p>
    </div></span>
 {% endblock %}
@ -15,8 +15,8 @@
    <!-- KPIs -->
    <div class="grid grid-cols-2 md:grid-cols-4 gap-3">
        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">Total classifiées</div><div class="text-xl font-bold text-brand-500" id="kpi-total">0</div></div>
-        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">🤖 Bots confirmés</div><div class="text-xl font-bold text-red-400" id="kpi-bots">0</div></div>
-        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">✅ Légitimes</div><div class="text-xl font-bold text-green-400" id="kpi-legit">0</div></div>
+        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">✅ Vrais positifs</div><div class="text-xl font-bold text-red-400" id="kpi-tp">0</div></div>
+        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">❌ Faux positifs</div><div class="text-xl font-bold text-green-400" id="kpi-fp">0</div></div>
        <div class="kpi-card"><div class="text-[11px] text-gray-500 mb-1">⚠️ Suspects</div><div class="text-xl font-bold text-yellow-400" id="kpi-suspect">0</div></div>
    </div>

@ -38,8 +38,8 @@
                <div>
                    <label class="block text-[11px] text-gray-500 mb-1">Classification</label>
                    <div class="grid grid-cols-3 gap-2">
-                        <button class="cls-type-btn px-3 py-2 rounded-lg text-sm font-medium transition-colors bg-red-500/20 text-red-400 border border-red-500/30 hover:bg-red-500/30" data-cls="bot">🤖 Bot</button>
-                        <button class="cls-type-btn px-3 py-2 rounded-lg text-sm font-medium transition-colors bg-green-500/20 text-green-400 border border-green-500/30 hover:bg-green-500/30" data-cls="legitimate">✅ Légitime</button>
+                        <button class="cls-type-btn px-3 py-2 rounded-lg text-sm font-medium transition-colors bg-red-500/20 text-red-400 border border-red-500/30 hover:bg-red-500/30" data-cls="true_positive">✅ Vrai positif</button>
+                        <button class="cls-type-btn px-3 py-2 rounded-lg text-sm font-medium transition-colors bg-green-500/20 text-green-400 border border-green-500/30 hover:bg-green-500/30" data-cls="false_positive">❌ Faux positif</button>
                        <button class="cls-type-btn px-3 py-2 rounded-lg text-sm font-medium transition-colors bg-yellow-500/20 text-yellow-400 border border-yellow-500/30 hover:bg-yellow-500/30" data-cls="suspicious">⚠️ Suspect</button>
                    </div>
                </div>
@ -114,7 +114,7 @@ document.querySelectorAll('.cls-type-btn').forEach(btn => {
        selectedCls = btn.dataset.cls;
        const sub = document.getElementById('cls-submit');
        sub.disabled = false;
-        sub.textContent = {bot:'🤖 Classifier comme Bot',legitimate:'✅ Classifier comme Légitime',suspicious:'⚠️ Classifier comme Suspect'}[selectedCls];
+        sub.textContent = {true_positive:'✅ Classifier comme Vrai positif',false_positive:'❌ Classifier comme Faux positif',suspicious:'⚠️ Classifier comme Suspect'}[selectedCls];
    };
 });

@ -158,13 +158,13 @@ async function loadAll() {
        const byType = {};
        (stats.stats||[]).forEach(r => { byType[r.classification] = r.cnt; });
        document.getElementById('kpi-total').textContent = fmtNum(stats.total||0);
-        document.getElementById('kpi-bots').textContent = fmtNum(byType.bot||0);
-        document.getElementById('kpi-legit').textContent = fmtNum(byType.legitimate||0);
+        document.getElementById('kpi-tp').textContent = fmtNum(byType.true_positive||0);
+        document.getElementById('kpi-fp').textContent = fmtNum(byType.false_positive||0);
        document.getElementById('kpi-suspect').textContent = fmtNum(byType.suspicious||0);

        // ── Distribution chart ──
-        const CLS_COLORS = {bot:'#ef4444',legitimate:'#22c55e',suspicious:'#eab308'};
-        const CLS_LABELS = {bot:'🤖 Bot',legitimate:'✅ Légitime',suspicious:'⚠️ Suspect'};
+        const CLS_COLORS = {true_positive:'#ef4444',false_positive:'#22c55e',suspicious:'#eab308'};
+        const CLS_LABELS = {true_positive:'✅ Vrai positif',false_positive:'❌ Faux positif',suspicious:'⚠️ Suspect'};
        if (stats.total > 0) {
            const el = document.getElementById('dist-chart');
            const ch = echarts.init(el);
@ -188,8 +188,8 @@ async function loadAll() {
            <td class="text-xs max-w-[100px] truncate">${row.asn_org ? fmtASN(row.asn_org) : ''}</td>
            <td>${fmtCountry(row.country_code)}</td>
            <td class="whitespace-nowrap">
-                <button onclick="quickClassify('${escapeHtml(row.src_ip)}','bot')" class="px-1.5 py-0.5 text-[10px] bg-red-500/20 text-red-400 rounded hover:bg-red-500/30" title="Bot">🤖</button>
-                <button onclick="quickClassify('${escapeHtml(row.src_ip)}','legitimate')" class="px-1.5 py-0.5 text-[10px] bg-green-500/20 text-green-400 rounded hover:bg-green-500/30" title="Légitime">✅</button>
+                <button onclick="quickClassify('${escapeHtml(row.src_ip)}','true_positive')" class="px-1.5 py-0.5 text-[10px] bg-red-500/20 text-red-400 rounded hover:bg-red-500/30" title="Vrai positif">✅</button>
+                <button onclick="quickClassify('${escapeHtml(row.src_ip)}','false_positive')" class="px-1.5 py-0.5 text-[10px] bg-green-500/20 text-green-400 rounded hover:bg-green-500/30" title="Faux positif">❌</button>
                <a href="/ip/${encodeURIComponent(row.src_ip)}" class="px-1.5 py-0.5 text-[10px] bg-gray-700 text-gray-300 rounded hover:bg-gray-600 inline-block" title="Détail">🔍</a>
            </td>
        </tr>`).join('') || '<tr><td colspan="8" class="text-center text-gray-500 py-4">Toutes les IPs ont été classifiées 🎉</td></tr>';
@ -198,7 +198,7 @@ async function loadAll() {
        document.getElementById('cls-history').innerHTML = (history.data||[]).map(row => `<tr onclick="window.location='/ip/${encodeURIComponent(row.src_ip)}'">
            <td class="text-xs text-gray-400">${(row.created_at||'').substring(0,16)}</td>
            <td class="whitespace-nowrap">${fmtIP(row.src_ip)}</td>
-            <td><span class="badge ${row.classification==='bot'?'badge-critical':row.classification==='legitimate'?'badge-low':'badge-medium'}">${escapeHtml(row.classification)}</span></td>
+            <td><span class="badge ${row.classification==='true_positive'?'badge-critical':row.classification==='false_positive'?'badge-low':'badge-medium'}">${escapeHtml(row.classification)}</span></td>
            <td class="text-xs max-w-[300px] truncate text-gray-400">${escapeHtml(row.comment||'')}</td>
        </tr>`).join('') || '<tr><td colspan="4" class="text-center text-gray-500 py-4">Aucune classification</td></tr>';

--- a/services/dashboard/backend/templates/features.html
+++ b/services/dashboard/backend/templates/features.html
@ -26,12 +26,12 @@
            <div class="section-body"><div id="chart-radar" style="height:360px"></div></div>
        </div>
        <div class="section-card">
-            <div class="section-header"><span class="section-title">Importance des features (Variance)
+            <div class="section-header"><span class="section-title" id="importance-title">Importance des features (SHAP/ExIFFI)
                <span class="relative inline-block"><button onclick="docToggle(this)" class="doc-btn">ⓘ</button><div class="doc-panel">
                    <h4>Feature importance</h4>
-                    <p>Variance inter-classe (ISP vs datacenter) de chaque feature. Les features à haute variance discriminent le mieux bots et humains.</p>
-                    <p><strong>Usage :</strong> Les features en tête sont les plus utiles pour le modèle EIF. Celles à variance nulle sont élaguées automatiquement.</p>
-                    <p class="doc-source">Source : view_ai_features_1h</p>
+                    <p>Importance moyenne des features issue de SHAP (XGBoost) ou ExIFFI (EIF). Chaque barre représente la contribution absolue moyenne d'une feature aux décisions d'anomalie récentes.</p>
+                    <p><strong>Fallback :</strong> Si aucune donnée SHAP/ExIFFI n'est disponible, la variance inter-classe (proxy statistique) est affichée à la place.</p>
+                    <p class="doc-source">Source : ml_detected_anomalies.reason (SHAP/ExIFFI) ou view_ai_features_1h (variance)</p>
                </div></span>
            </span></div>
            <div class="section-body"><div id="chart-importance" style="height:360px"></div></div>
@ -158,8 +158,16 @@ async function loadAll() {
            }));
        }

-        // ── Feature Importance (horizontal bar) ──
-        const fi = (feat.feature_importance || []).sort((a,b) => a.variance - b.variance);
+        // ── Feature Importance (horizontal bar) — SHAP/ExIFFI si disponible, variance sinon ──
+        const shapData = feat.shap_importance || [];
+        const varianceData = (feat.feature_importance || []).sort((a,b) => a.variance - b.variance);
+        const useShap = shapData.length > 0;
+        const fi = useShap
+            ? shapData.slice().sort((a,b) => a.importance - b.importance)
+            : varianceData;
+        const impLabel = useShap ? 'SHAP/ExIFFI (|valeur| moyenne)' : 'Variance';
+        document.getElementById('importance-title').childNodes[0].textContent =
+            useShap ? 'Importance des features (SHAP/ExIFFI) ' : 'Importance des features (Variance) ';
        const impChart = initChart('chart-importance');
        if (impChart && fi.length) {
            impChart.setOption(ecBase({
@ -175,12 +183,13 @@ async function loadAll() {
                    type:'value',
                    splitLine:{lineStyle:{color:EC_GRID, type:'dashed'}},
                    axisLabel:{color:EC_TEXT},
-                    name:'Variance', nameTextStyle:{color:EC_TEXT},
+                    name: impLabel, nameTextStyle:{color:EC_TEXT},
                },
                series:[{
-                    type:'bar', data: fi.map(f => f.variance), barWidth:'60%',
+                    type:'bar', data: fi.map(f => useShap ? f.importance : f.variance), barWidth:'60%',
                    itemStyle:{color: new echarts.graphic.LinearGradient(0,0,1,0,[
-                        {offset:0, color:'#6366f1'}, {offset:1, color:'#8b5cf6'}
+                        {offset:0, color: useShap ? '#f59e0b' : '#6366f1'},
+                        {offset:1, color: useShap ? '#ef4444' : '#8b5cf6'}
                    ])},
                    label:{show:true, position:'right', color:EC_TEXT, fontSize:10, formatter:p => p.value.toFixed(4)},
                }]