feat: HTTP/2 passive fingerprinting with individual SETTINGS fields
Complete implementation of HTTP/2 passive fingerprinting per thesis §2.5.3: mod-reqin-log (C module): - Replace connection-level filter with ap_hook_process_connection (APR_HOOK_FIRST) to capture H2 preface before mod_http2 takes over the connection - AP_MODE_SPECULATIVE read of 512 bytes from c->input_filters - Parse SETTINGS, WINDOW_UPDATE, PRIORITY flags, pseudo-header order - Output individual SETTINGS params as separate JSON fields (IDs 1-6, 8) - Read H2 notes from c1 (master connection) for mod_http2 secondary conns - Fix header_order_signature JSON length bug (26→strlen) ClickHouse schema: - Add 8 new columns to http_logs: h2_has_priority, h2_header_table_size, h2_enable_push, h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size, h2_max_header_list_size, h2_enable_connect_protocol - Use Int32/Int64 with DEFAULT -1 to distinguish absent vs zero - Update mv_http_logs to extract individual fields via JSONHas/JSONExtractInt - Migration 04_http2_fields.sql updated for existing deployments Correlator: - Accept both timestamp_ns and timestamp field names (backward compat) Integration: - Enable HTTP/2 in Apache: Protocols h2 http/1.1 in httpd-integration.conf Validated end-to-end via Playwright: H2 curl traffic → mod-reqin-log → correlator → ClickHouse with all 12 H2 columns populated correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -5,6 +5,8 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
@ -21,6 +23,34 @@ router = APIRouter(prefix="/api")
|
||||
_DB = safe_identifier(DB_PROCESSING)
|
||||
_DB_LOGS = safe_identifier(DB_LOGS)
|
||||
|
||||
# Regexes used to extract the SHAP/ExIFFI features from the reason field
|
||||
# Format: "SHAP: feat1(+0.123) | feat2(-0.456)" or "ExIFFI: ..."
|
||||
_SHAP_RE = re.compile(r"(?:SHAP|ExIFFI):\s*(.+?)(?:\s*\|\s*Threat|$)")
|
||||
_FEAT_RE = re.compile(r"(\w+)\(([+-]?\d+\.\d+)\)")
|
||||
|
||||
|
||||
def _aggregate_shap_importance(reasons: list[str]) -> list[dict]:
    """Aggregate the SHAP/ExIFFI values parsed out of ``reason`` strings.

    Each reason is searched with ``_SHAP_RE``; the captured section is then
    scanned with ``_FEAT_RE`` for ``name(value)`` pairs, and the absolute
    value of every pair is accumulated per feature name.

    Args:
        reasons: Reason strings to parse. Falsy entries (``None``, ``""``)
            and entries without a SHAP/ExIFFI section are skipped.

    Returns:
        One dict per feature with the keys ``name``, ``importance`` (mean
        absolute value, rounded to 4 decimals) and ``occurrences``, ordered
        by decreasing importance. An empty list when nothing was parsed.
    """
    # feature name -> [running sum of absolute values, number of hits]
    stats: dict[str, list] = {}
    for text in reasons:
        section = _SHAP_RE.search(text or "")
        if section is None:
            continue
        for pair in _FEAT_RE.finditer(section.group(1)):
            feature, raw_value = pair.groups()
            entry = stats.setdefault(feature, [0.0, 0])
            entry[0] += abs(float(raw_value))
            entry[1] += 1

    ranked = [
        {"name": feature, "importance": round(total / hits, 4), "occurrences": hits}
        for feature, (total, hits) in stats.items()
    ]
    # A reversed sort stays stable, so equally important features keep
    # their first-seen order.
    ranked.sort(key=lambda item: item["importance"], reverse=True)
    return ranked
|
||||
|
||||
|
||||
# Whitelists for sort/order to prevent SQL injection
|
||||
_DETECTION_SORT_COLS = {
|
||||
"detected_at", "src_ip", "ja4", "host", "anomaly_score",
|
||||
@ -500,7 +530,7 @@ async def features() -> dict[str, Any]:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Feature variance (importance proxy)
|
||||
# Feature variance (importance proxy — fallback when SHAP is unavailable)
|
||||
try:
|
||||
variance_rows = query(
|
||||
f"SELECT "
|
||||
@ -523,6 +553,22 @@ async def features() -> dict[str, Any]:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# SHAP/ExIFFI — actual importance extracted from the detected anomalies
|
||||
try:
|
||||
reason_rows = query(
|
||||
f"SELECT reason FROM {_DB}.ml_detected_anomalies "
|
||||
"WHERE reason LIKE '%SHAP:%' OR reason LIKE '%ExIFFI:%' "
|
||||
"ORDER BY detected_at DESC LIMIT 500"
|
||||
)
|
||||
if reason_rows:
|
||||
shap_importance = _aggregate_shap_importance(
|
||||
[r["reason"] for r in reason_rows]
|
||||
)
|
||||
if shap_importance:
|
||||
result["shap_importance"] = shap_importance
|
||||
except Exception:
|
||||
logger.debug("SHAP importance extraction unavailable")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@ -846,11 +892,11 @@ async def classify_suggested() -> dict[str, Any]:
|
||||
# ---------------------------------------------------------------------------
|
||||
class ClassifyRequest(BaseModel):
    # Model carrying a classification verdict for one source IP.
    # NOTE(review): presumably the request body of a classification
    # endpoint — confirm against the route that consumes it.

    # Source IP the verdict applies to.
    src_ip: str
    # Verdict label. Declared once: the earlier duplicate declaration with
    # the outdated "bot | legitimate | suspicious" comment was removed.
    classification: str  # true_positive | false_positive | suspicious
    # Optional free-text note; defaults to an empty string.
    comment: str = ""
|
||||
|
||||
|
||||
# Classification labels, matching the values listed in the comment on
# ClassifyRequest.classification. The former {"bot", "legitimate",
# "suspicious"} assignment was dead code: it was overwritten immediately by
# this one.
_VALID_CLASSIFICATIONS = {"true_positive", "false_positive", "suspicious"}
# NOTE(review): presumably set to True once the feedback table has been
# created — confirm against the code that reads this flag.
_feedback_table_ensured = False
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user