feat(e2e): add distributed E2E test framework with parametric traffic generation
Add run-e2e-test.sh with CLI parameters (--hits, --http-ratio, --dns, --tls, --src-ips, --keep-analysis, --up) for configurable traffic generation. Traffic runs from VM endpoints with multiple source IPs (alias IPs on eth0) to produce distinct sessions for the ML pipeline. Fix curl TLS flags (--tlsv1.2 instead of --tls-v1-2), skip redundant local verification in distributed mode, and fix dashboard is_available() cache that never retried after ClickHouse recovery. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -6,17 +6,43 @@ from typing import Any
|
||||
|
||||
import clickhouse_connect
|
||||
from clickhouse_connect.driver.client import Client
|
||||
from clickhouse_connect.driver.exceptions import DatabaseError
|
||||
|
||||
from backend.config import CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_client: Client | None = None
|
||||
_available: bool | None = None # None = not tested yet
|
||||
|
||||
|
||||
class ClickHouseUnavailable(Exception):
    """Signal that the ClickHouse server could not be reached.

    The query helpers in this module raise it in place of the underlying
    driver or socket error.
    """
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Check ClickHouse connectivity with a live round-trip.

    A cached client alone proves nothing about the server's current state,
    so the client is pinged on every call. After a failure the cached client
    is discarded, which makes the next call build a fresh connection.

    Returns:
        True when the server answered the ping, False otherwise.
    """
    global _client, _available
    # Never reuse a client left over from a previously failed attempt.
    if _available is False:
        _client = None
    try:
        reachable = bool(get_client().ping())
    except Exception:
        # Boundary check: any driver or socket error simply means "down".
        reachable = False
    if reachable:
        _available = True
        return True
    _available = False
    _client = None
    logger.warning("ClickHouse unavailable at %s:%s", CLICKHOUSE_HOST, CLICKHOUSE_PORT)
    return False
|
||||
|
||||
|
||||
def get_client() -> Client:
|
||||
"""Return a lazily-initialised ClickHouse client (singleton)."""
|
||||
global _client
|
||||
"""Return a lazily-initialised ClickHouse client (singleton).
|
||||
|
||||
Resets the singleton on connection failure so the next call retries.
|
||||
"""
|
||||
global _client, _available
|
||||
if _client is None:
|
||||
_client = clickhouse_connect.get_client(
|
||||
host=CLICKHOUSE_HOST,
|
||||
@ -25,9 +51,17 @@ def get_client() -> Client:
|
||||
password=CLICKHOUSE_PASSWORD,
|
||||
)
|
||||
logger.info("Connected to ClickHouse at %s:%s", CLICKHOUSE_HOST, CLICKHOUSE_PORT)
|
||||
_available = True
|
||||
return _client
|
||||
|
||||
|
||||
def _mark_unavailable() -> None:
    """Drop the cached client and record ClickHouse as unreachable.

    Clearing ``_client`` forces the next ``get_client()`` call to build a
    fresh connection.
    """
    global _client, _available
    _client, _available = None, False
|
||||
|
||||
|
||||
def _normalise_value(v: Any) -> Any:
|
||||
"""Convert ClickHouse-specific types to JSON-friendly Python types."""
|
||||
if isinstance(v, (ipaddress.IPv4Address, ipaddress.IPv6Address)):
|
||||
@ -41,26 +75,47 @@ def _normalise_value(v: Any) -> Any:
|
||||
|
||||
|
||||
def query(sql: str, params: dict | None = None) -> list[dict[str, Any]]:
    """Run *sql* and return every result row as a ``{column: value}`` dict.

    Each value is passed through ``_normalise_value`` first.

    Raises:
        ClickHouseUnavailable: on ``DatabaseError``, ``ConnectionError`` or
            ``OSError``; the cached client is reset before raising.
    """
    try:
        result = get_client().query(sql, parameters=params or {})
        names = result.column_names
        return [
            {name: _normalise_value(cell) for name, cell in zip(names, record)}
            for record in result.result_rows
        ]
    # NOTE(review): DatabaseError may also cover server-side query errors,
    # which would then be reported as unavailability — confirm against the
    # driver's exception hierarchy.
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc
|
||||
|
||||
|
||||
def query_scalar(sql: str, params: dict | None = None) -> Any:
    """Run *sql* and return the first column of the first row.

    Returns ``None`` when the query yields no rows. The value is passed
    through ``_normalise_value``.

    Raises:
        ClickHouseUnavailable: on ``DatabaseError``, ``ConnectionError`` or
            ``OSError``; the cached client is reset before raising.
    """
    try:
        found = get_client().query(sql, parameters=params or {}).result_rows
        if not found:
            return None
        return _normalise_value(found[0][0])
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc
|
||||
|
||||
|
||||
def execute(sql: str, params: dict | None = None) -> None:
    """Run a DDL / DML statement whose result is discarded.

    Raises:
        ClickHouseUnavailable: on ``DatabaseError``, ``ConnectionError`` or
            ``OSError``; the cached client is reset before raising.
    """
    arguments = params or {}
    try:
        get_client().command(sql, parameters=arguments)
    except (DatabaseError, ConnectionError, OSError) as exc:
        _mark_unavailable()
        raise ClickHouseUnavailable(str(exc)) from exc
|
||||
|
||||
@ -4,15 +4,28 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from backend.database import ClickHouseUnavailable, is_available
|
||||
from backend.routes.api import router as api_router
|
||||
from backend.routes.pages import router as pages_router
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
||||
|
||||
_templates = Jinja2Templates(directory="backend/templates")
|
||||
|
||||
# Request path -> page name; the name selects the "<name>.html" template
# rendered when a page request fails because ClickHouse is unavailable.
_PAGE_MAP = {
    "/": "overview",
    "/detections": "detections",
    "/scores": "scores",
    "/traffic": "traffic",
    "/classify": "classify",
    "/features": "features",
    "/models": "models",
    "/network": "network",
    "/campaigns": "campaigns",
    "/tactics": "tactics",
    "/reflists": "reflists",
    "/fleet": "fleet",
    "/health": "health",
    "/browsers": "browsers",
    "/fingerprints": "fingerprints",
}
|
||||
|
||||
app = FastAPI(title="JA4 SOC Dashboard", version="1.0.0")
|
||||
|
||||
# CORS — allow all origins for dashboard access
|
||||
@ -24,6 +37,29 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.exception_handler(ClickHouseUnavailable)
async def ch_unavailable_handler(request: Request, exc: ClickHouseUnavailable):
    """Turn a ``ClickHouseUnavailable`` error into an HTTP 503 response.

    A JSON body is returned when the ``Accept`` header mentions
    ``application/json`` or the path starts with ``/api/``. Any other request
    gets its page template rendered with ``ch_available`` set to ``False``.
    """
    url_path = request.url.path
    wants_json = "application/json" in request.headers.get("accept", "")

    if not (wants_json or url_path.startswith("/api/")):
        # HTML request: paths missing from _PAGE_MAP fall back to the overview page.
        template_name = _PAGE_MAP.get(url_path, "overview")
        context = {"request": request, "active_page": template_name, "ch_available": False}
        return _templates.TemplateResponse(f"{template_name}.html", context, status_code=503)

    payload = {"detail": "ClickHouse unavailable", "error": str(exc)}
    return JSONResponse(status_code=503, content=payload)
|
||||
|
||||
|
||||
# Static assets
|
||||
app.mount("/static", StaticFiles(directory="backend/static"), name="static")
|
||||
|
||||
@ -32,6 +68,7 @@ app.include_router(api_router)
|
||||
app.include_router(pages_router)
|
||||
|
||||
|
||||
@app.get("/api/healthcheck")
async def healthcheck():
    """Report the service status together with ClickHouse reachability."""
    if is_available():
        return {"status": "ok", "clickhouse": "up"}
    return {"status": "degraded", "clickhouse": "down"}
|
||||
|
||||
@ -13,6 +13,8 @@ from typing import Any
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from backend.database import ClickHouseUnavailable
|
||||
|
||||
from backend.config import DB_PROCESSING, DB_LOGS, safe_identifier
|
||||
from backend.database import query, query_scalar, execute
|
||||
|
||||
@ -29,6 +31,17 @@ _SHAP_RE = re.compile(r"(?:SHAP|ExIFFI):\s*(.+?)(?:\s*\|\s*Threat|$)")
|
||||
_FEAT_RE = re.compile(r"(\w+)\(([+-]?\d+\.\d+)\)")
|
||||
|
||||
|
||||
def _ch_fallback(exc: Exception) -> None:
    """Translate an exception caught by a route handler into the error to raise.

    This function never returns normally; every path raises:

    * ``ClickHouseUnavailable`` and ``HTTPException`` are re-raised unchanged,
      so a deliberate HTTP status raised inside a handler's ``try`` block is
      not rewritten into a 500;
    * an error whose message mentions "connection", "refused" or
      "unavailable" is raised as ``ClickHouseUnavailable``;
    * anything else becomes an ``HTTPException`` with status 500.

    Args:
        exc: The exception caught by the caller.

    Raises:
        ClickHouseUnavailable: For connection-level failures.
        HTTPException: For every other error.
    """
    # Explicit re-raise: a bare ``raise`` would only work while the caller is
    # still inside its ``except`` clause.
    if isinstance(exc, (ClickHouseUnavailable, HTTPException)):
        raise exc
    message = str(exc)
    lowered = message.lower()
    # Heuristic detection of connection-level errors from clickhouse_connect.
    if any(marker in lowered for marker in ("connection", "refused", "unavailable")):
        raise ClickHouseUnavailable(message) from exc
    raise HTTPException(status_code=500, detail=message) from exc
|
||||
|
||||
|
||||
def _aggregate_shap_importance(reasons: list[str]) -> list[dict]:
|
||||
"""Agrège les valeurs SHAP/ExIFFI extraites des champs reason."""
|
||||
totals: dict[str, float] = defaultdict(float)
|
||||
@ -171,7 +184,7 @@ async def overview() -> dict[str, Any]:
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("overview query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -253,7 +266,7 @@ async def detections(
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("detections query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -330,7 +343,7 @@ async def scores(
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("scores query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -401,7 +414,7 @@ async def traffic(
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("traffic query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -470,7 +483,7 @@ async def ip_detail(ip: str) -> dict[str, Any]:
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("ip detail query failed for %s", ip)
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -932,7 +945,7 @@ async def classify(body: ClassifyRequest) -> dict[str, Any]:
|
||||
return {"status": "ok", "src_ip": body.src_ip, "classification": body.classification}
|
||||
except Exception as exc:
|
||||
logger.exception("classify insert failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -1403,7 +1416,7 @@ async def ja4_detail(fingerprint: str) -> dict[str, Any]:
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("ja4 detail query failed for %s", fingerprint)
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -1526,7 +1539,7 @@ async def cluster_detail(cid: int) -> dict[str, Any]:
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("cluster detail query failed for %s", cid)
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
@ -1545,9 +1558,9 @@ async def dictionaries_meta():
|
||||
"ORDER BY name",
|
||||
)
|
||||
return {"dictionaries": rows}
|
||||
except Exception as exc:
|
||||
logger.exception("dictionaries meta query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
except Exception:
|
||||
logger.debug("dictionaries meta query failed — ClickHouse may be unavailable")
|
||||
return {"dictionaries": []}
|
||||
|
||||
|
||||
_REFLIST_SORT = {
|
||||
@ -1640,7 +1653,7 @@ async def reflist(
|
||||
return {"name": name, "total": total, "limit": limit, "offset": offset, "rows": rows}
|
||||
except Exception as exc:
|
||||
logger.exception("reflist query failed for %s", name)
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
@router.get("/reflist/{name}/stats")
|
||||
@ -1695,34 +1708,48 @@ async def reflist_stats(name: str):
|
||||
return {"name": name, "total": total, "breakdown": agg}
|
||||
except Exception as exc:
|
||||
logger.exception("reflist stats query failed for %s", name)
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
@router.get("/fleet")
async def fleet() -> dict[str, Any]:
    """JA4×ASN fleet detections (§5.2).

    Returns the 100 highest-scoring fleet detections of the last 7 days
    under the ``fleets`` key.

    Raises:
        ClickHouseUnavailable: When ClickHouse cannot be reached.
        HTTPException: For any other query failure.
    """
    try:
        return {
            "fleets": query(
                f"SELECT detected_at, community_id, fleet_score, n_ips, ja4_set, asn_set, ip_sample "
                f"FROM {_DB}.fleet_detections "
                f"WHERE detected_at >= now() - INTERVAL 7 DAY "
                f"ORDER BY fleet_score DESC "
                f"LIMIT 100"
            )
        }
    except Exception as exc:
        logger.exception("fleet query failed")
        # _ch_fallback always raises, so no fallback value is needed here.
        _ch_fallback(exc)
|
||||
|
||||
|
||||
@router.get("/health")
|
||||
async def health_metrics() -> dict[str, Any]:
|
||||
"""Métriques de santé du pipeline ML (Étape 9)."""
|
||||
rows = query(
|
||||
f"SELECT cycle_at, model_name, total_sessions, correlated_rate, anomaly_rate, "
|
||||
f" critical_count, high_count, drift_rate, drift_alert, cycle_latency_ms, "
|
||||
f" features_valid, features_total, baseline_size, meta_learner_active "
|
||||
f"FROM {_DB}.ml_performance_metrics "
|
||||
f"WHERE cycle_at >= now() - INTERVAL 7 DAY "
|
||||
f"ORDER BY cycle_at DESC "
|
||||
f"LIMIT 500"
|
||||
)
|
||||
try:
|
||||
rows = query(
|
||||
f"SELECT cycle_at, model_name, total_sessions, correlated_rate, anomaly_rate, "
|
||||
f" critical_count, high_count, medium_count, low_count, "
|
||||
f" known_bot_count, anubis_deny_count, legit_browser_count, "
|
||||
f" drift_rate, drift_alert, cycle_latency_ms, "
|
||||
f" features_valid, features_total, baseline_size, threshold, meta_learner_active "
|
||||
f"FROM {_DB}.ml_performance_metrics "
|
||||
f"WHERE cycle_at >= now() - INTERVAL 7 DAY "
|
||||
f"ORDER BY cycle_at DESC "
|
||||
f"LIMIT 500"
|
||||
)
|
||||
except ClickHouseUnavailable:
|
||||
raise
|
||||
except Exception as exc:
|
||||
_ch_fallback(exc)
|
||||
rows = []
|
||||
# Statistiques de synthèse
|
||||
if rows:
|
||||
latest = {r['model_name']: r for r in rows}
|
||||
@ -1895,9 +1922,9 @@ async def browser_sig_entries() -> dict[str, Any]:
|
||||
f"ORDER BY browser_family"
|
||||
)
|
||||
return {"entries": rows, "total": len(rows), "source": "dict_csv", "readonly": True}
|
||||
except Exception as exc:
|
||||
logger.exception("browser_h2 entries fallback failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
except Exception:
|
||||
logger.debug("browser_h2 entries fallback failed — ClickHouse may be unavailable")
|
||||
return {"entries": [], "total": 0, "source": "unavailable"}
|
||||
|
||||
|
||||
@router.post("/browser-signatures/entries", status_code=201)
|
||||
@ -1932,7 +1959,7 @@ async def browser_sig_add(body: BrowserH2Entry) -> dict[str, Any]:
|
||||
return {"status": "ok", "h2_fingerprint": body.h2_fingerprint.strip()}
|
||||
except Exception as exc:
|
||||
logger.exception("browser_h2_signatures insert failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
@router.delete("/browser-signatures/entries")
|
||||
@ -1953,7 +1980,7 @@ async def browser_sig_delete(fingerprint: str = Query(...)) -> dict[str, Any]:
|
||||
return {"status": "ok", "deleted": fingerprint.strip()}
|
||||
except Exception as exc:
|
||||
logger.exception("browser_h2_signatures delete failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
_ch_fallback(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -2042,8 +2069,8 @@ async def fingerprint_discovery(
|
||||
{"days": days, "min_hits": min_hits, "lim": limit},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.exception("fingerprint-discovery query failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
logger.debug("fingerprint-discovery query failed — ClickHouse may be unavailable")
|
||||
return {"profiles": [], "groups": [], "meta": {"total_ja4": 0, "total_groups": 0, "days": days, "min_hits": min_hits}}
|
||||
|
||||
# ── Regroupement par famille navigateur côté Python ──
|
||||
groups: dict[str, dict[str, Any]] = {}
|
||||
|
||||
@ -5,12 +5,14 @@ from __future__ import annotations
|
||||
from fastapi import APIRouter, Request
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from backend.database import is_available
|
||||
|
||||
router = APIRouter()
|
||||
templates = Jinja2Templates(directory="backend/templates")
|
||||
|
||||
|
||||
def _ctx(request: Request, page: str, **extra) -> dict:
    """Build the template context shared by every page.

    The context carries the request, the active page name and the current
    ClickHouse availability flag. Keyword arguments in *extra* are merged
    last, so they override the base keys.
    """
    context = {
        "request": request,
        "active_page": page,
        "ch_available": is_available(),
    }
    context.update(extra)
    return context
|
||||
|
||||
|
||||
@router.get("/")
|
||||
|
||||
@ -219,6 +219,12 @@
|
||||
{% block header_actions %}{% endblock %}
|
||||
</div>
|
||||
</header>
|
||||
{% if not ch_available %}
|
||||
<div id="ch-banner" class="bg-amber-900/60 border-b border-amber-600 px-4 py-2 text-amber-200 text-sm flex items-center gap-2">
|
||||
<svg class="w-4 h-4 shrink-0" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" d="M12 9v3.75m-9.303 3.376c-.866 1.5.217 3.374 1.948 3.374h14.71c1.73 0 2.813-1.874 1.948-3.374L13.949 3.378c-.866-1.5-3.032-1.5-3.898 0L2.697 16.126zM12 15.75h.008v.008H12v-.008z"/></svg>
|
||||
<span><strong>ClickHouse indisponible</strong> — Les données ne peuvent pas être chargées. Vérifiez la connexion au serveur.</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
<main class="px-3 py-4 lg:px-5 lg:py-5 xl:px-6">
|
||||
{% block content %}{% endblock %}
|
||||
</main>
|
||||
|
||||
Reference in New Issue
Block a user