feat: HTTP/2 passive fingerprinting with individual SETTINGS fields

Complete implementation of HTTP/2 passive fingerprinting per thesis §2.5.3: mod-reqin-log (C module): - Replace connection-level filter with ap_hook_process_connection (APR_HOOK_FIRST) to capture H2 preface before mod_http2 takes over the connection - AP_MODE_SPECULATIVE read of 512 bytes from c->input_filters - Parse SETTINGS, WINDOW_UPDATE, PRIORITY flags, pseudo-header order - Output individual SETTINGS params as separate JSON fields (IDs 1-6, 8) - Read H2 notes from c1 (master connection) for mod_http2 secondary conns - Fix header_order_signature JSON length bug (26→strlen) ClickHouse schema: - Add 8 new columns to http_logs: h2_has_priority, h2_header_table_size, h2_enable_push, h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size, h2_max_header_list_size, h2_enable_connect_protocol - Use Int32/Int64 with DEFAULT -1 to distinguish absent vs zero - Update mv_http_logs to extract individual fields via JSONHas/JSONExtractInt - Migration 04_http2_fields.sql updated for existing deployments Correlator: - Accept both timestamp_ns and timestamp field names (backward compat) Integration: - Enable HTTP/2 in Apache: Protocols h2 http/1.1 in httpd-integration.conf Validated end-to-end via Playwright: H2 curl traffic → mod-reqin-log → correlator → ClickHouse with all 12 H2 columns populated correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-11 02:33:45 +02:00
parent bd81331411
commit 85d3b95b7b
25 changed files with 649 additions and 160 deletions
--- a/services/dashboard/backend/routes/api.py
+++ b/services/dashboard/backend/routes/api.py
@ -5,6 +5,8 @@ from __future__ import annotations
 import json
 import logging
 import os
+import re
+from collections import defaultdict
 from pathlib import Path
 from typing import Any

@ -21,6 +23,34 @@ router = APIRouter(prefix="/api")
 _DB = safe_identifier(DB_PROCESSING)
 _DB_LOGS = safe_identifier(DB_LOGS)

+# Regexes extracting SHAP/ExIFFI feature contributions from the reason field.
+# Expected format: "SHAP: feat1(+0.123) | feat2(-0.456)" or "ExIFFI: ..."
+_SHAP_RE = re.compile(r"(?:SHAP|ExIFFI):\s*(.+?)(?:\s*\|\s*Threat|$)")
+_FEAT_RE = re.compile(r"(\w+)\(([+-]?\d+\.\d+)\)")
+
+
+def _aggregate_shap_importance(reasons: list[str]) -> list[dict]:
+    """Rank features by mean absolute SHAP/ExIFFI value parsed from ``reasons``.
+
+    Falsy entries and entries that ``_SHAP_RE`` does not match are skipped.
+    Returns dicts with ``name``, ``importance`` (mean |value|, 4 decimals) and
+    ``occurrences``, sorted by descending importance; ``[]`` if nothing parsed.
+    """
+    sums: dict[str, float] = defaultdict(float)
+    hits: dict[str, int] = defaultdict(int)
+    for text in reasons:
+        section = _SHAP_RE.search(text or "")
+        if section is None:
+            continue
+        for feature, raw_value in _FEAT_RE.findall(section.group(1)):
+            sums[feature] += abs(float(raw_value))
+            hits[feature] += 1
+    ranking = [{"name": feature, "importance": round(total / hits[feature], 4),
+                "occurrences": hits[feature]} for feature, total in sums.items()]
+    ranking.sort(key=lambda entry: entry["importance"], reverse=True)
+    return ranking
+
+
 # Whitelists for sort/order to prevent SQL injection
 _DETECTION_SORT_COLS = {
    "detected_at", "src_ip", "ja4", "host", "anomaly_score",
@ -500,7 +530,7 @@ async def features() -> dict[str, Any]:
    except Exception:
        pass

-    # Feature variance (importance proxy)
+    # Feature variance (importance proxy — fallback si SHAP indisponible)
    try:
        variance_rows = query(
            f"SELECT "
@ -523,6 +553,22 @@ async def features() -> dict[str, Any]:
    except Exception:
        pass

+    # SHAP/ExIFFI — importance réelle extraite des anomalies détectées
+    try:
+        reason_rows = query(
+            f"SELECT reason FROM {_DB}.ml_detected_anomalies "
+            "WHERE reason LIKE '%SHAP:%' OR reason LIKE '%ExIFFI:%' "
+            "ORDER BY detected_at DESC LIMIT 500"
+        )
+        if reason_rows:
+            shap_importance = _aggregate_shap_importance(
+                [r["reason"] for r in reason_rows]
+            )
+            if shap_importance:
+                result["shap_importance"] = shap_importance
+    except Exception:
+        logger.debug("SHAP importance extraction unavailable")
+
    return result


@ -846,11 +892,11 @@ async def classify_suggested() -> dict[str, Any]:
 # ---------------------------------------------------------------------------
 class ClassifyRequest(BaseModel):
    src_ip: str
-    classification: str  # bot | legitimate | suspicious
+    classification: str  # true_positive | false_positive | suspicious
    comment: str = ""


-_VALID_CLASSIFICATIONS = {"bot", "legitimate", "suspicious"}
+_VALID_CLASSIFICATIONS = {"true_positive", "false_positive", "suspicious"}
 _feedback_table_ensured = False