feat(e2e): add distributed E2E test framework with parametric traffic generation

Add run-e2e-test.sh with CLI parameters (--hits, --http-ratio, --dns, --tls,
--src-ips, --keep-analysis, --up) for configurable traffic generation. Traffic
runs from VM endpoints with multiple source IPs (alias IPs on eth0) to produce
distinct sessions for the ML pipeline. Fix curl TLS flags (--tlsv1.2 instead
of --tls-v1-2), skip redundant local verification in distributed mode, and
fix dashboard is_available() cache that never retried after ClickHouse recovery.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-15 00:09:32 +02:00
parent 7894d39f1c
commit f88b739992
40 changed files with 2154 additions and 337 deletions

View File

@ -6,17 +6,43 @@ from typing import Any
import clickhouse_connect
from clickhouse_connect.driver.client import Client
from clickhouse_connect.driver.exceptions import DatabaseError
from backend.config import CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD
logger = logging.getLogger(__name__)
_client: Client | None = None
_available: bool | None = None # None = not tested yet
class ClickHouseUnavailable(Exception):
"""Raised when ClickHouse is not reachable."""
def is_available() -> bool:
    """Check ClickHouse connectivity, re-probing on every call.

    A client that was created successfully earlier is not trusted blindly:
    it is pinged so that a server that went down *after* the first
    connection is reported as unavailable instead of staying "up" until
    some unrelated query happens to fail.

    Returns:
        True when a client could be obtained and answered the ping,
        False otherwise.  On failure the cached client is dropped so the
        next call builds a fresh connection.
    """
    global _client, _available
    try:
        # Force re-creation if previously marked unavailable.
        if _available is False:
            _client = None
        if not get_client().ping():
            raise ConnectionError("ClickHouse ping failed")
    except Exception:
        # Connectivity probe: any failure simply means "not available".
        _available = False
        _client = None
        logger.warning("ClickHouse unavailable at %s:%s", CLICKHOUSE_HOST, CLICKHOUSE_PORT)
        return False
    _available = True
    return True
def get_client() -> Client:
"""Return a lazily-initialised ClickHouse client (singleton)."""
global _client
"""Return a lazily-initialised ClickHouse client (singleton).
Resets the singleton on connection failure so the next call retries.
"""
global _client, _available
if _client is None:
_client = clickhouse_connect.get_client(
host=CLICKHOUSE_HOST,
@ -25,9 +51,17 @@ def get_client() -> Client:
password=CLICKHOUSE_PASSWORD,
)
logger.info("Connected to ClickHouse at %s:%s", CLICKHOUSE_HOST, CLICKHOUSE_PORT)
_available = True
return _client
def _mark_unavailable() -> None:
    """Drop the cached client and flag ClickHouse as down."""
    global _available, _client
    _available, _client = False, None
def _normalise_value(v: Any) -> Any:
"""Convert ClickHouse-specific types to JSON-friendly Python types."""
if isinstance(v, (ipaddress.IPv4Address, ipaddress.IPv6Address)):
@ -41,26 +75,47 @@ def _normalise_value(v: Any) -> Any:
def query(sql: str, params: dict | None = None) -> list[dict[str, Any]]:
    """Execute *sql* and return a list of row-dicts.

    Args:
        sql: Statement to run.
        params: Optional bind parameters; ``None`` is sent as an empty dict.

    Returns:
        One dict per result row, keyed by column name, with every value
        passed through ``_normalise_value``.

    Raises:
        ClickHouseUnavailable: If obtaining the client or running the query
            raises ``DatabaseError``, ``ConnectionError`` or ``OSError``.
            The cached client is reset first so the next call reconnects.
    """
    try:
        client = get_client()
        result = client.query(sql, parameters=params or {})
        columns = result.column_names
        return [
            {col: _normalise_value(val) for col, val in zip(columns, row)}
            for row in result.result_rows
        ]
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc
def query_scalar(sql: str, params: dict | None = None) -> Any:
    """Execute *sql* and return the single scalar value.

    Args:
        sql: Statement to run.
        params: Optional bind parameters; ``None`` is sent as an empty dict.

    Returns:
        The first column of the first row, passed through
        ``_normalise_value``, or ``None`` when the result has no rows.

    Raises:
        ClickHouseUnavailable: If obtaining the client or running the query
            raises ``DatabaseError``, ``ConnectionError`` or ``OSError``.
            The cached client is reset first so the next call reconnects.
    """
    try:
        client = get_client()
        result = client.query(sql, parameters=params or {})
        if not result.result_rows:
            return None
        return _normalise_value(result.result_rows[0][0])
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc
def execute(sql: str, params: dict | None = None) -> None:
    """Execute a DDL / DML statement that returns no rows.

    The statement is issued exactly once, inside the guarded section, so a
    connection failure is always translated and a write is never repeated.

    Args:
        sql: Statement to run.
        params: Optional bind parameters; ``None`` is sent as an empty dict.

    Raises:
        ClickHouseUnavailable: If obtaining the client or running the command
            raises ``DatabaseError``, ``ConnectionError`` or ``OSError``.
            The cached client is reset first so the next call reconnects.
    """
    try:
        client = get_client()
        client.command(sql, parameters=params or {})
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc

View File

@ -4,15 +4,28 @@ from __future__ import annotations
import logging
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from backend.database import ClickHouseUnavailable, is_available
from backend.routes.api import router as api_router
from backend.routes.pages import router as pages_router
# Configure logging once at import time: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
# Template loader used by the ClickHouse-unavailable exception handler.
_templates = Jinja2Templates(directory="backend/templates")
# Request path -> page name. The exception handler renders "<page name>.html"
# and falls back to "overview" for paths that are not listed here.
_PAGE_MAP = {
"/": "overview", "/detections": "detections", "/scores": "scores",
"/traffic": "traffic", "/classify": "classify", "/features": "features",
"/models": "models", "/network": "network", "/campaigns": "campaigns",
"/tactics": "tactics", "/reflists": "reflists", "/fleet": "fleet",
"/health": "health", "/browsers": "browsers", "/fingerprints": "fingerprints",
}
app = FastAPI(title="JA4 SOC Dashboard", version="1.0.0")
# CORS — allow all origins for dashboard access
@ -24,6 +37,29 @@ app.add_middleware(
allow_headers=["*"],
)
@app.exception_handler(ClickHouseUnavailable)
async def ch_unavailable_handler(request: Request, exc: ClickHouseUnavailable):
    """Answer a ClickHouseUnavailable error with a 503 suited to the caller.

    Requests that accept JSON, or whose path starts with ``/api/``, get a
    JSON body.  Everything else gets the page template mapped to the path
    (``overview`` when the path is unknown) rendered with
    ``ch_available=False``.
    """
    url_path = request.url.path
    accept_header = request.headers.get("accept", "")
    wants_json = "application/json" in accept_header or url_path.startswith("/api/")

    if not wants_json:
        # HTML request: render the page in degraded mode.
        template_page = _PAGE_MAP.get(url_path, "overview")
        context = {"request": request, "active_page": template_page, "ch_available": False}
        return _templates.TemplateResponse(
            f"{template_page}.html",
            context,
            status_code=503,
        )

    # API request: machine-readable error body.
    payload = {"detail": "ClickHouse unavailable", "error": str(exc)}
    return JSONResponse(status_code=503, content=payload)
# Static assets
app.mount("/static", StaticFiles(directory="backend/static"), name="static")
@ -32,6 +68,7 @@ app.include_router(api_router)
app.include_router(pages_router)
@app.get("/health")
async def health():
    """Return a fixed ``{"status": "ok"}`` payload; ClickHouse is not consulted."""
    return {"status": "ok"}
@app.get("/api/healthcheck")
async def healthcheck():
    """Report service status together with ClickHouse reachability.

    Status is ``ok``/``up`` when ``is_available()`` is true, and
    ``degraded``/``down`` otherwise.
    """
    if is_available():
        return {"status": "ok", "clickhouse": "up"}
    return {"status": "degraded", "clickhouse": "down"}

View File

@ -13,6 +13,8 @@ from typing import Any
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from backend.database import ClickHouseUnavailable
from backend.config import DB_PROCESSING, DB_LOGS, safe_identifier
from backend.database import query, query_scalar, execute
@ -29,6 +31,17 @@ _SHAP_RE = re.compile(r"(?:SHAP|ExIFFI):\s*(.+?)(?:\s*\|\s*Threat|$)")
_FEAT_RE = re.compile(r"(\w+)\(([+-]?\d+\.\d+)\)")
def _ch_fallback(exc: Exception) -> None:
    """Translate *exc* into the exception the API layer should surface.

    This function always raises and never returns normally.

    Args:
        exc: The exception caught by an endpoint.

    Raises:
        ClickHouseUnavailable: If *exc* already is one, or if its message
            contains "connection", "refused" or "unavailable"
            (case-insensitive), which is treated as a connection-level
            failure.
        HTTPException: Status 500 carrying ``str(exc)`` for anything else.
    """
    if isinstance(exc, ClickHouseUnavailable):
        # Raise the object we were handed. A bare ``raise`` only works while
        # the caller is inside an ``except`` block and fails otherwise.
        raise exc
    # Detect connection-level errors from clickhouse_connect by message text.
    err_msg = str(exc).lower()
    if any(marker in err_msg for marker in ("connection", "refused", "unavailable")):
        raise ClickHouseUnavailable(str(exc)) from exc
    raise HTTPException(status_code=500, detail=str(exc)) from exc
def _aggregate_shap_importance(reasons: list[str]) -> list[dict]:
"""Agrège les valeurs SHAP/ExIFFI extraites des champs reason."""
totals: dict[str, float] = defaultdict(float)
@ -171,7 +184,7 @@ async def overview() -> dict[str, Any]:
}
except Exception as exc:
logger.exception("overview query failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -253,7 +266,7 @@ async def detections(
}
except Exception as exc:
logger.exception("detections query failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -330,7 +343,7 @@ async def scores(
}
except Exception as exc:
logger.exception("scores query failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -401,7 +414,7 @@ async def traffic(
}
except Exception as exc:
logger.exception("traffic query failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -470,7 +483,7 @@ async def ip_detail(ip: str) -> dict[str, Any]:
}
except Exception as exc:
logger.exception("ip detail query failed for %s", ip)
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -932,7 +945,7 @@ async def classify(body: ClassifyRequest) -> dict[str, Any]:
return {"status": "ok", "src_ip": body.src_ip, "classification": body.classification}
except Exception as exc:
logger.exception("classify insert failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -1403,7 +1416,7 @@ async def ja4_detail(fingerprint: str) -> dict[str, Any]:
}
except Exception as exc:
logger.exception("ja4 detail query failed for %s", fingerprint)
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -1526,7 +1539,7 @@ async def cluster_detail(cid: int) -> dict[str, Any]:
}
except Exception as exc:
logger.exception("cluster detail query failed for %s", cid)
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ═══════════════════════════════════════════════════════════════════════════════
@ -1545,9 +1558,9 @@ async def dictionaries_meta():
"ORDER BY name",
)
return {"dictionaries": rows}
except Exception as exc:
logger.exception("dictionaries meta query failed")
raise HTTPException(status_code=500, detail=str(exc))
except Exception:
logger.debug("dictionaries meta query failed — ClickHouse may be unavailable")
return {"dictionaries": []}
_REFLIST_SORT = {
@ -1640,7 +1653,7 @@ async def reflist(
return {"name": name, "total": total, "limit": limit, "offset": offset, "rows": rows}
except Exception as exc:
logger.exception("reflist query failed for %s", name)
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
@router.get("/reflist/{name}/stats")
@ -1695,34 +1708,48 @@ async def reflist_stats(name: str):
return {"name": name, "total": total, "breakdown": agg}
except Exception as exc:
logger.exception("reflist stats query failed for %s", name)
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
@router.get("/fleet")
async def fleet() -> dict[str, Any]:
    """Return JA4×ASN fleet detections (§5.2).

    Reads at most 100 rows from ``fleet_detections`` detected within the
    last 7 days, highest ``fleet_score`` first.  The query runs exactly once,
    inside the guarded section.

    Returns:
        ``{"fleets": rows}`` where ``rows`` is the list of row-dicts.

    Raises:
        ClickHouseUnavailable: When the database cannot be reached.
        HTTPException: Status 500 for any other query failure.
    """
    sql = (
        "SELECT detected_at, community_id, fleet_score, n_ips, ja4_set, asn_set, ip_sample "
        f"FROM {_DB}.fleet_detections "
        "WHERE detected_at >= now() - INTERVAL 7 DAY "
        "ORDER BY fleet_score DESC "
        "LIMIT 100"
    )
    try:
        rows = query(sql)
    except Exception as exc:
        logger.exception("fleet query failed")
        # _ch_fallback re-raises ClickHouseUnavailable as-is and maps every
        # other error to ClickHouseUnavailable or HTTP 500.
        _ch_fallback(exc)
        raise  # not reached: _ch_fallback always raises
    return {"fleets": rows}
@router.get("/health")
async def health_metrics() -> dict[str, Any]:
"""Métriques de santé du pipeline ML (Étape 9)."""
rows = query(
f"SELECT cycle_at, model_name, total_sessions, correlated_rate, anomaly_rate, "
f" critical_count, high_count, drift_rate, drift_alert, cycle_latency_ms, "
f" features_valid, features_total, baseline_size, meta_learner_active "
f"FROM {_DB}.ml_performance_metrics "
f"WHERE cycle_at >= now() - INTERVAL 7 DAY "
f"ORDER BY cycle_at DESC "
f"LIMIT 500"
)
try:
rows = query(
f"SELECT cycle_at, model_name, total_sessions, correlated_rate, anomaly_rate, "
f" critical_count, high_count, medium_count, low_count, "
f" known_bot_count, anubis_deny_count, legit_browser_count, "
f" drift_rate, drift_alert, cycle_latency_ms, "
f" features_valid, features_total, baseline_size, threshold, meta_learner_active "
f"FROM {_DB}.ml_performance_metrics "
f"WHERE cycle_at >= now() - INTERVAL 7 DAY "
f"ORDER BY cycle_at DESC "
f"LIMIT 500"
)
except ClickHouseUnavailable:
raise
except Exception as exc:
_ch_fallback(exc)
rows = []
# Statistiques de synthèse
if rows:
latest = {r['model_name']: r for r in rows}
@ -1895,9 +1922,9 @@ async def browser_sig_entries() -> dict[str, Any]:
f"ORDER BY browser_family"
)
return {"entries": rows, "total": len(rows), "source": "dict_csv", "readonly": True}
except Exception as exc:
logger.exception("browser_h2 entries fallback failed")
raise HTTPException(status_code=500, detail=str(exc))
except Exception:
logger.debug("browser_h2 entries fallback failed — ClickHouse may be unavailable")
return {"entries": [], "total": 0, "source": "unavailable"}
@router.post("/browser-signatures/entries", status_code=201)
@ -1932,7 +1959,7 @@ async def browser_sig_add(body: BrowserH2Entry) -> dict[str, Any]:
return {"status": "ok", "h2_fingerprint": body.h2_fingerprint.strip()}
except Exception as exc:
logger.exception("browser_h2_signatures insert failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
@router.delete("/browser-signatures/entries")
@ -1953,7 +1980,7 @@ async def browser_sig_delete(fingerprint: str = Query(...)) -> dict[str, Any]:
return {"status": "ok", "deleted": fingerprint.strip()}
except Exception as exc:
logger.exception("browser_h2_signatures delete failed")
raise HTTPException(status_code=500, detail=str(exc))
_ch_fallback(exc)
# ---------------------------------------------------------------------------
@ -2042,8 +2069,8 @@ async def fingerprint_discovery(
{"days": days, "min_hits": min_hits, "lim": limit},
)
except Exception as exc:
logger.exception("fingerprint-discovery query failed")
raise HTTPException(status_code=500, detail=str(exc))
logger.debug("fingerprint-discovery query failed — ClickHouse may be unavailable")
return {"profiles": [], "groups": [], "meta": {"total_ja4": 0, "total_groups": 0, "days": days, "min_hits": min_hits}}
# ── Regroupement par famille navigateur côté Python ──
groups: dict[str, dict[str, Any]] = {}

View File

@ -5,12 +5,14 @@ from __future__ import annotations
from fastapi import APIRouter, Request
from fastapi.templating import Jinja2Templates
from backend.database import is_available
router = APIRouter()
templates = Jinja2Templates(directory="backend/templates")
def _ctx(request: Request, page: str, **extra) -> dict:
    """Build the template context shared by every page.

    Args:
        request: Incoming request, exposed to the template as ``request``.
        page: Page name, exposed as ``active_page``.
        **extra: Additional context entries; they are merged last, so they
            override the base keys on a name clash.

    Returns:
        The context dict, including ``ch_available`` taken from
        ``is_available()`` at call time.
    """
    return {
        "request": request,
        "active_page": page,
        "ch_available": is_available(),
        **extra,
    }
@router.get("/")

View File

@ -219,6 +219,12 @@
{% block header_actions %}{% endblock %}
</div>
</header>
{% if not ch_available %}
<div id="ch-banner" class="bg-amber-900/60 border-b border-amber-600 px-4 py-2 text-amber-200 text-sm flex items-center gap-2">
<svg class="w-4 h-4 shrink-0" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" d="M12 9v3.75m-9.303 3.376c-.866 1.5.217 3.374 1.948 3.374h14.71c1.73 0 2.813-1.874 1.948-3.374L13.949 3.378c-.866-1.5-3.032-1.5-3.898 0L2.697 16.126zM12 15.75h.008v.008H12v-.008z"/></svg>
<span><strong>ClickHouse indisponible</strong> — Les données ne peuvent pas être chargées. Vérifiez la connexion au serveur.</span>
</div>
{% endif %}
<main class="px-3 py-4 lg:px-5 lg:py-5 xl:px-6">
{% block content %}{% endblock %}
</main>