feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized
Services: - ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap) - logcorrelator: JA4 log correlation engine (Go, ClickHouse) - mod_reqin_log: Apache module (C, JSON request logging) - bot_detector: ML bot detection pipeline (Python) - dashboard: FastAPI/Streamlit analytics UI (Python) Shared libraries: - shared/go/ja4common: logger, config, shutdown, ipfilter (Go module) - shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package) - shared/clickhouse/: canonical SQL migrations (10 files) Build & packaging: - Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10) - go.work workspace linking sentinel, correlator, ja4common - Makefile with test-all, build-all, rpm-* targets Fixes applied: - go.work: 1.21 → 1.24.6 (required by sentinel) - correlator Dockerfiles: golang:1.21 → golang:1.24 - replace directives in go.mod for ja4common local path - pyproject.toml: setuptools.backends → setuptools.build_meta - Removed static libpcap linking (unavailable on Rocky 9) - Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32) - Rewrote corrupted test files (logger_test.go × 2) Test coverage: - correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%) - sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse) Documentation: - README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
551
services/dashboard/backend/routes/clustering.py
Normal file
551
services/dashboard/backend/routes/clustering.py
Normal file
@ -0,0 +1,551 @@
|
||||
"""
|
||||
Clustering d'IPs multi-métriques — WebGL / deck.gl backend.
|
||||
|
||||
- Calcul sur la TOTALITÉ des IPs (GROUP BY src_ip, ja4 sans LIMIT)
|
||||
- K-means++ vectorisé (numpy) + PCA-2D + enveloppes convexes (scipy)
|
||||
- Calcul en background thread + cache 30 min
|
||||
- Endpoints : /clusters, /status, /cluster/{id}/points, /cluster/{id}/ips
|
||||
"""
|
||||
import math
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
from collections import Counter
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
|
||||
from ..database import db
|
||||
from ..services.clustering_engine import (
|
||||
FEATURE_NAMES,
|
||||
build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
|
||||
name_cluster, risk_score_from_centroid, standardize,
|
||||
risk_to_gradient_color,
|
||||
)
|
||||
|
||||
# Module logger, and the router under which every endpoint of this file is
# mounted (prefix /api/clustering).
log = logging.getLogger(__name__)
router = APIRouter(prefix="/api/clustering", tags=["clustering"])

# ─── Global cache ─────────────────────────────────────────────────────────────
# Single process-wide slot holding the most recent clustering run. Every read
# and write in this module goes through _LOCK.
_CACHE: dict[str, Any] = {
    # idle | computing | ready | error
    "status": "idle",
    # message of the last failed run, None otherwise
    "error": None,
    # complete result dict (nodes, edges, stats, feature names)
    "result": None,
    # time.time() of the last successful update
    "ts": 0.0,
    # parameters (k, hours, sensitivity) of the last successful run
    "params": {},
    # cluster_idx → [(ip, ja4, pca_x, pca_y, risk)]
    "cluster_ips": {},
}
_CACHE_TTL = 1800  # seconds, i.e. 30 minutes
_LOCK = threading.Lock()
# One worker only: clustering jobs are executed strictly one after another.
_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")
|
||||
|
||||
# ─── Cluster colours ──────────────────────────────────────────────────────────
# No palette is defined in this module: each node's colour is derived from its
# risk score via risk_to_gradient_color() when the nodes are built.
|
||||
|
||||
|
||||
# ─── SQL : TOUTES les IPs sans LIMIT ─────────────────────────────────────────
|
||||
_SQL_ALL_IPS = """
|
||||
SELECT
|
||||
replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
|
||||
t.ja4,
|
||||
any(t.tcp_ttl_raw) AS ttl,
|
||||
any(t.tcp_win_raw) AS win,
|
||||
any(t.tcp_scale_raw) AS scale,
|
||||
any(t.tcp_mss_raw) AS mss,
|
||||
any(t.first_ua) AS ua,
|
||||
sum(t.hits) AS hits,
|
||||
|
||||
avg(abs(ml.anomaly_score)) AS avg_score,
|
||||
avg(ml.hit_velocity) AS avg_velocity,
|
||||
avg(ml.fuzzing_index) AS avg_fuzzing,
|
||||
avg(ml.is_headless) AS pct_headless,
|
||||
avg(ml.post_ratio) AS avg_post,
|
||||
avg(ml.ip_id_zero_ratio) AS ip_id_zero,
|
||||
avg(ml.temporal_entropy) AS entropy,
|
||||
avg(ml.modern_browser_score) AS browser_score,
|
||||
avg(ml.alpn_http_mismatch) AS alpn_mismatch,
|
||||
avg(ml.is_alpn_missing) AS alpn_missing,
|
||||
avg(ml.multiplexing_efficiency) AS h2_eff,
|
||||
avg(ml.header_order_confidence) AS hdr_conf,
|
||||
avg(ml.ua_ch_mismatch) AS ua_ch_mismatch,
|
||||
avg(ml.asset_ratio) AS asset_ratio,
|
||||
avg(ml.direct_access_ratio) AS direct_ratio,
|
||||
avg(ml.distinct_ja4_count) AS ja4_count,
|
||||
max(ml.is_ua_rotating) AS ua_rotating,
|
||||
|
||||
max(ml.threat_level) AS threat,
|
||||
any(ml.country_code) AS country,
|
||||
any(ml.asn_org) AS asn_org,
|
||||
|
||||
-- Features headers HTTP (depuis view_dashboard_entities)
|
||||
avg(ml.has_accept_language) AS hdr_accept_lang,
|
||||
any(vh.hdr_enc) AS hdr_has_encoding,
|
||||
any(vh.hdr_sec_fetch) AS hdr_has_sec_fetch,
|
||||
any(vh.hdr_count) AS hdr_count_raw,
|
||||
|
||||
-- Fingerprint HTTP Headers (depuis agg_header_fingerprint_1h + ml_detected_anomalies)
|
||||
-- header_order_shared_count : nb d'IPs partageant le même fingerprint
|
||||
-- → faible = fingerprint rare = comportement suspect
|
||||
avg(ml.header_order_shared_count) AS hfp_shared_count,
|
||||
-- distinct_header_orders : nb de fingerprints distincts émis par cette IP
|
||||
-- → élevé = rotation de fingerprint = comportement bot
|
||||
avg(ml.distinct_header_orders) AS hfp_distinct_orders,
|
||||
-- Cookie et Referer issus de la table dédiée aux empreintes
|
||||
any(hfp.hfp_cookie) AS hfp_cookie,
|
||||
any(hfp.hfp_referer) AS hfp_referer
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h t
|
||||
LEFT JOIN mabase_prod.ml_detected_anomalies ml
|
||||
ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
|
||||
AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
toIPv6(concat('::ffff:', toString(src_ip))) AS src_ip_v6,
|
||||
ja4,
|
||||
any(arrayExists(x -> x LIKE '%%Accept-Encoding%%', client_headers)) AS hdr_enc,
|
||||
any(arrayExists(x -> x LIKE '%%Sec-Fetch%%', client_headers)) AS hdr_sec_fetch,
|
||||
any(length(splitByChar(',', client_headers[1]))) AS hdr_count
|
||||
FROM mabase_prod.view_dashboard_entities
|
||||
WHERE length(client_headers) > 0
|
||||
AND log_date >= today() - 2
|
||||
GROUP BY src_ip_v6, ja4
|
||||
) vh ON t.src_ip = vh.src_ip_v6 AND t.ja4 = vh.ja4
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
src_ip,
|
||||
avg(has_cookie) AS hfp_cookie,
|
||||
avg(has_referer) AS hfp_referer
|
||||
FROM mabase_prod.agg_header_fingerprint_1h
|
||||
WHERE window_start >= now() - INTERVAL %(hours)s HOUR
|
||||
GROUP BY src_ip
|
||||
) hfp ON t.src_ip = hfp.src_ip
|
||||
WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
|
||||
AND t.tcp_ttl_raw > 0
|
||||
GROUP BY t.src_ip, t.ja4
|
||||
"""
|
||||
|
||||
_SQL_COLS = [
|
||||
"ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
|
||||
"avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
|
||||
"ip_id_zero", "entropy", "browser_score", "alpn_mismatch", "alpn_missing",
|
||||
"h2_eff", "hdr_conf", "ua_ch_mismatch", "asset_ratio", "direct_ratio",
|
||||
"ja4_count", "ua_rotating", "threat", "country", "asn_org",
|
||||
"hdr_accept_lang", "hdr_has_encoding", "hdr_has_sec_fetch", "hdr_count_raw",
|
||||
"hfp_shared_count", "hfp_distinct_orders", "hfp_cookie", "hfp_referer",
|
||||
]
|
||||
|
||||
|
||||
# ─── Worker de clustering (thread pool) ──────────────────────────────────────
|
||||
|
||||
def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
    """Run one clustering pass and publish the outcome in ``_CACHE``.

    Intended to be submitted to the clustering thread pool. The status is set
    to ``"computing"`` on entry; on success the result, the per-cluster member
    list, the timestamp and the parameters are stored and the status becomes
    ``"ready"``. Any exception raised during the computation is logged and
    recorded as status ``"error"`` with its message; it is not re-raised.

    Args:
        k: Base number of clusters.
        hours: Look-back window in hours, bound into ``_SQL_ALL_IPS``.
        sensitivity: Multiplier applied to ``k``; values above 1.0 produce
            more, finer clusters and values below 1.0 fewer, coarser ones.

    The effective cluster count is ``round(k * sensitivity)`` clamped to
    [4, 300]. K-means runs 3 initialisations, reduced to 1 when the effective
    count exceeds 60, to keep the run time bounded.
    """
    k_actual = max(4, min(300, round(k * sensitivity)))
    t0 = time.time()
    with _LOCK:
        _CACHE["status"] = "computing"
        _CACHE["error"] = None

    try:
        log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}")

        # ── 1. Load every (ip, ja4) row ───────────────────────────────────
        result = db.query(_SQL_ALL_IPS, {"hours": hours})
        rows: list[dict] = [
            {col: row[i] for i, col in enumerate(_SQL_COLS)}
            for row in result.result_rows
        ]

        n = len(rows)
        log.info(f"[clustering] {n} IPs chargées")
        if n < k_actual:
            raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)")

        # ── 2. Feature matrix ─────────────────────────────────────────────
        X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
        log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")

        # ── 3. Standardisation ────────────────────────────────────────────
        X64 = X.astype(np.float64)
        X_std, feat_mean, feat_std = standardize(X64)

        # ── 4. K-means++ on the standardised space ────────────────────────
        n_init = 1 if k_actual > 60 else 3
        km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=n_init, seed=42)
        log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")

        # Centroids mapped back to the original feature space for display:
        # c_orig = c_std * std + mean, then clipped to [0, 1].
        centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)

        # ── 5. 2-D projection of the non-standardised features, and hulls ─
        coords = pca_2d(X64)
        hulls = compute_hulls(coords, km.labels, k_actual)

        # ── 6. Group rows by cluster ──────────────────────────────────────
        cluster_rows: list[list[dict]] = [[] for _ in range(k_actual)]
        cluster_coords: list[list[list[float]]] = [[] for _ in range(k_actual)]
        cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)}
        # The risk score depends only on the cluster centroid, so it is
        # computed once per non-empty cluster instead of once per row.
        cluster_risk: dict[int, float] = {}

        for i, label in enumerate(km.labels):
            j = int(label)
            risk = cluster_risk.get(j)
            if risk is None:
                risk = float(risk_score_from_centroid(centroids_orig[j]))
                cluster_risk[j] = risk
            cluster_rows[j].append(rows[i])
            cluster_coords[j].append(coords[i].tolist())
            cluster_ips_map[j].append((
                rows[i]["ip"],
                rows[i]["ja4"],
                float(coords[i][0]),
                float(coords[i][1]),
                risk,
            ))

        # ── 7. One node per non-empty cluster ─────────────────────────────
        nodes = [
            _build_cluster_node(
                j,
                k_actual,
                cluster_rows[j],
                cluster_coords[j],
                centroids_orig[j],
                cluster_risk[j],
                hulls.get(j, []),
            )
            for j in range(k_actual)
            if cluster_rows[j]
        ]

        # ── 8. Nearest-neighbour edges between clusters ───────────────────
        edges = _build_knn_edges(nodes, centroids_orig)

        # ── 9. Publish result and per-cluster members ─────────────────────
        total_ips = sum(n_["ip_count"] for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
        elapsed = round(time.time() - t0, 2)

        result_dict = {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_clusters": len(nodes),
                "total_ips": total_ips,
                "total_hits": total_hits,
                "n_samples": n,
                "k": k_actual,
                "k_base": k,
                "sensitivity": sensitivity,
                "elapsed_s": elapsed,
            },
            "feature_names": FEATURE_NAMES,
        }

        with _LOCK:
            _CACHE["result"] = result_dict
            _CACHE["cluster_ips"] = cluster_ips_map
            _CACHE["status"] = "ready"
            _CACHE["ts"] = time.time()
            _CACHE["params"] = {"k": k, "hours": hours, "sensitivity": sensitivity}
            _CACHE["error"] = None

        log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")

    except Exception as e:
        # Worker-thread boundary: nothing above would observe the exception,
        # so it is logged and exposed through the cache instead.
        log.exception("[clustering] Erreur lors du calcul")
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"] = str(e)


def _top_values(values: list[str], n: int = 5) -> list[str]:
    """Return the non-empty values among the *n* most frequent, most common first."""
    return [v for v, _ in Counter(values).most_common(n) if v]


def _build_cluster_node(
    j: int,
    k_actual: int,
    members: list[dict],
    member_coords: list[list[float]],
    centroid: Any,
    risk: float,
    hull: Any,
) -> dict:
    """Build the node dict describing cluster *j* from its member rows.

    Args:
        j: Cluster index.
        k_actual: Effective cluster count of the run (part of the node id).
        members: Non-empty list of row dicts assigned to the cluster.
        member_coords: 2-D coordinates of the members, in the same order.
        centroid: De-standardised centroid, indexable by feature position.
        risk: Risk score already computed for this centroid.
        hull: Hull of the cluster as returned by ``compute_hulls``.
    """

    def avg_f(key: str) -> float:
        # Missing / NULL values count as 0 in the mean.
        return float(np.mean([float(r.get(key) or 0) for r in members]))

    mean_ttl = avg_f("ttl")
    mean_mss = avg_f("mss")
    mean_scale = avg_f("scale")
    mean_win = avg_f("win")

    raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
    label_name = name_cluster(centroid, raw_stats)
    color = risk_to_gradient_color(risk)

    # 2-D centroid = mean of the members' projected coordinates.
    cxy = np.mean(member_coords, axis=0).tolist() if member_coords else [0.5, 0.5]
    # Rows are (ip, ja4) pairs, so distinct IPs are counted through a set.
    ip_count = len({r["ip"] for r in members})
    hit_count = int(sum(float(r.get("hits") or 0) for r in members))

    threats = [str(r.get("threat") or "") for r in members if r.get("threat")]
    countries = [str(r.get("country") or "") for r in members if r.get("country")]
    orgs = [str(r.get("asn_org") or "") for r in members if r.get("asn_org")]

    radar = [
        {"feature": name, "value": round(float(centroid[i]), 4)}
        for i, name in enumerate(FEATURE_NAMES)
    ]

    radius = max(8, min(30, int(math.log1p(ip_count) * 2.2)))

    # Up to 8 distinct IPs, busiest first. De-duplicated because one IP can
    # appear in several rows (one per JA4).
    by_hits = sorted(members, key=lambda r: float(r.get("hits") or 0), reverse=True)
    sample_ips = list(dict.fromkeys(r["ip"] for r in by_hits))[:8]
    sample_ua = str(members[0].get("ua") or "")

    return {
        "id": f"c{j}_k{k_actual}",
        "cluster_idx": j,
        "label": label_name,
        "pca_x": round(cxy[0], 6),
        "pca_y": round(cxy[1], 6),
        "radius": radius,
        "color": color,
        "risk_score": round(risk, 4),

        "mean_ttl": round(mean_ttl, 1),
        "mean_mss": round(mean_mss, 0),
        "mean_scale": round(mean_scale, 1),
        "mean_win": round(mean_win, 0),
        "mean_velocity": round(avg_f("avg_velocity"), 3),
        "mean_fuzzing": round(avg_f("avg_fuzzing"), 3),
        "mean_headless": round(avg_f("pct_headless"), 3),
        "mean_post": round(avg_f("avg_post"), 3),
        "mean_asset": round(avg_f("asset_ratio"), 3),
        "mean_direct": round(avg_f("direct_ratio"), 3),
        "mean_alpn_mismatch": round(avg_f("alpn_mismatch"), 3),
        "mean_h2_eff": round(avg_f("h2_eff"), 3),
        "mean_hdr_conf": round(avg_f("hdr_conf"), 3),
        "mean_ua_ch": round(avg_f("ua_ch_mismatch"), 3),
        "mean_entropy": round(avg_f("entropy"), 3),
        "mean_ja4_diversity": round(avg_f("ja4_count"), 3),
        "mean_ip_id_zero": round(avg_f("ip_id_zero"), 3),
        "mean_browser_score": round(avg_f("browser_score"), 1),
        "mean_ua_rotating": round(avg_f("ua_rotating"), 3),

        "ip_count": ip_count,
        "hit_count": hit_count,
        "top_threat": _top_values(threats, 1)[0] if threats else "",
        "top_countries": _top_values(countries, 5),
        "top_orgs": _top_values(orgs, 5),
        "sample_ips": sample_ips,
        "sample_ua": sample_ua,
        "radar": radar,

        # Hull for the deck.gl PolygonLayer
        "hull": hull,
    }


def _build_knn_edges(nodes: list[dict], centroids: Any) -> list[dict]:
    """Link every node to its 2 nearest nodes (squared centroid distance), without duplicates."""
    edges: list[dict] = []
    seen: set[frozenset] = set()
    for i, ni in enumerate(nodes):
        ci = ni["cluster_idx"]
        dists = sorted(
            (
                (j, float(np.sum((centroids[ci] - centroids[nj["cluster_idx"]]) ** 2)))
                for j, nj in enumerate(nodes)
                if j != i
            ),
            key=lambda pair: pair[1],
        )
        for j_idx, d2 in dists[:2]:
            other_id = nodes[j_idx]["id"]
            # An edge is undirected: A→B and B→A share the same key.
            pair_key = frozenset([ni["id"], other_id])
            if pair_key in seen:
                continue
            seen.add(pair_key)
            edges.append({
                "id": f"e_{ni['id']}_{other_id}",
                "source": ni["id"],
                "target": other_id,
                "similarity": round(1.0 / (1.0 + math.sqrt(d2)), 3),
            })
    return edges
|
||||
|
||||
|
||||
def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None:
    """Submit a clustering job unless one is pending or the cache is still valid.

    A job is submitted when the cache is missing, in error, older than
    ``_CACHE_TTL``, or was computed with different parameters. Nothing is
    submitted while a previously submitted job is still queued or running.

    The whole decision, including the submission, happens under ``_LOCK``.
    The job only flips the status to "computing" once the worker thread picks
    it up, so the in-flight future is tracked in ``_CACHE["job"]``; otherwise
    calls landing in that gap (or after the status was reset) would queue
    duplicate jobs.

    Args:
        k: Base number of clusters requested.
        hours: Look-back window requested, in hours.
        sensitivity: Multiplier of ``k`` requested.
    """
    with _LOCK:
        pending = _CACHE.get("job")
        if pending is not None and not pending.done():
            return  # a job is already queued or running

        status = _CACHE["status"]
        if status == "computing":
            return  # already in progress

        params = _CACHE["params"]
        cache_stale = (time.time() - _CACHE["ts"]) > _CACHE_TTL
        params_changed = (
            params.get("k") != k
            or params.get("hours") != hours
            or params.get("sensitivity") != sensitivity
        )
        if status == "ready" and not cache_stale and not params_changed:
            return  # cache is fresh

        # submit() only enqueues the work, so holding the lock here cannot
        # deadlock with the job acquiring it later on the worker thread.
        _CACHE["job"] = _EXECUTOR.submit(_run_clustering_job, k, hours, sensitivity)
|
||||
|
||||
|
||||
# ─── Endpoints ────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/status")
async def get_status():
    """Return the current state of the clustering cache (polled by the frontend).

    ``age_s`` is the number of seconds since the last cache update, or
    ``None`` when no update has been recorded yet.
    """
    with _LOCK:
        last_update = _CACHE["ts"]
        if last_update:
            age = round(time.time() - last_update, 0)
        else:
            age = None
        return {
            "status": _CACHE["status"],
            "error": _CACHE["error"],
            "ts": last_update,
            "params": _CACHE["params"],
            "age_s": age,
        }
|
||||
|
||||
|
||||
@router.get("/clusters")
async def get_clusters(
    k: int = Query(20, ge=4, le=100, description="Nombre de clusters de base"),
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"),
    sensitivity: float = Query(1.0, ge=0.5, le=5.0, description="Sensibilité : multiplicateur de k (5.0 = granularité maximale)"),
    force: bool = Query(False, description="Forcer le recalcul"),
):
    """
    Multi-metric clustering over all IPs, served from the cache.

    The effective cluster count is round(k × sensitivity). The call never
    waits for a computation: it returns the cached result when one matches
    the requested parameters, and otherwise a small status payload
    ("computing" / "idle") after making sure a job has been triggered.

    Raises:
        HTTPException: 500 when the last computation failed.
    """
    if force:
        with _LOCK:
            # Do not overwrite "computing": the running job would be hidden
            # behind an "idle" status and a duplicate job would be queued.
            if _CACHE["status"] != "computing":
                _CACHE["status"] = "idle"
                _CACHE["ts"] = 0.0
                _CACHE["result"] = None
                _CACHE["cluster_ips"] = {}

    _maybe_trigger(k, hours, sensitivity)

    requested = {"k": k, "hours": hours, "sensitivity": sensitivity}
    with _LOCK:
        status = _CACHE["status"]
        result = _CACHE["result"]
        error = _CACHE["error"]
        cached_params = _CACHE["params"]

    if status == "computing":
        return {"status": "computing", "message": "Calcul en cours, réessayez dans quelques secondes"}

    if status == "error":
        raise HTTPException(status_code=500, detail=error or "Erreur inconnue")

    if result is None:
        return {"status": "idle", "message": "Calcul démarré, réessayez dans quelques secondes"}

    if cached_params != requested:
        # A job for the requested parameters has been triggered but has not
        # replaced the cache yet; the cached clustering answers a different
        # question and must not be served as "ready".
        return {"status": "computing", "message": "Calcul en cours, réessayez dans quelques secondes"}

    return {**result, "status": "ready"}
|
||||
|
||||
|
||||
@router.get("/cluster/{cluster_id}/points")
async def get_cluster_points(
    cluster_id: str,
    limit: int = Query(5000, ge=1, le=20000),
    offset: int = Query(0, ge=0),
):
    """
    Paginated projected coordinates and risk of the members of one cluster.

    Served entirely from the cache filled by the clustering job; used by the
    deck.gl ScatterplotLayer for drill-down.

    Raises:
        HTTPException: 404 when no ready cache is available, 400 when the
            cluster index cannot be parsed from ``cluster_id``.
    """
    with _LOCK:
        cache_ready = _CACHE["status"] == "ready"
        ips_map = _CACHE["cluster_ips"]

    if not cache_ready or not ips_map:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    # Ids look like "c{n}_k{k}": keep the part before the first underscore
    # and drop its leading character to get the cluster index.
    head = cluster_id.split("_")[0]
    try:
        idx = int(head[1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="cluster_id invalide (format: c{n}_k{k})")

    members = ips_map.get(idx, [])
    window = members[offset: offset + limit]

    points = []
    for ip, ja4, px, py, risk in window:
        points.append({
            "ip": ip,
            "ja4": ja4,
            "pca_x": round(px, 6),
            "pca_y": round(py, 6),
            "risk": round(risk, 3),
        })

    return {"points": points, "total": len(members), "offset": offset, "limit": limit}
|
||||
|
||||
|
||||
@router.get("/cluster/{cluster_id}/ips")
async def get_cluster_ips(
    cluster_id: str,
    limit: int = Query(100, ge=1, le=500),
    offset: int = Query(0, ge=0),
):
    """Detailed SQL rows for the IPs of one cluster (backward-compat with the old UI).

    The cluster members come from the cache; their details are re-queried
    over the same look-back window as the cached clustering run, so members
    found by a long-window run are not silently dropped.

    Raises:
        HTTPException: 404 when no ready cache is available, 400 when the
            cluster index cannot be parsed, 500 when the database query fails.
    """
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]
        # Window of the run that produced the cache; 24 h if not recorded.
        window_hours = int(_CACHE["params"].get("hours", 24))

    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError) as exc:
        raise HTTPException(status_code=400, detail="cluster_id invalide") from exc

    members = ips_map.get(idx, [])
    total = len(members)
    page = members[offset: offset + limit]
    if not page:
        return {"ips": [], "total": total, "cluster_id": cluster_id}

    # At most the first 200 members of the page are looked up. Duplicates
    # (one IP with several JA4s) are dropped, order preserved.
    page_ips = tuple(dict.fromkeys(str(m[0]) for m in page[:200]))

    # Values are bound by the driver instead of being spliced into the text.
    sql = """
    SELECT
        replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS src_ip,
        t.ja4,
        any(t.tcp_ttl_raw) AS ttl,
        any(t.tcp_win_raw) AS win,
        any(t.tcp_scale_raw) AS scale,
        any(t.tcp_mss_raw) AS mss,
        sum(t.hits) AS hits,
        any(t.first_ua) AS ua,
        round(avg(abs(ml.anomaly_score)), 3) AS avg_score,
        max(ml.threat_level) AS threat_level,
        any(ml.country_code) AS country_code,
        any(ml.asn_org) AS asn_org,
        round(avg(ml.fuzzing_index), 2) AS fuzzing,
        round(avg(ml.hit_velocity), 2) AS velocity
    FROM mabase_prod.agg_host_ip_ja4_1h t
    LEFT JOIN mabase_prod.ml_detected_anomalies ml
        ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
        AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
    WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
        AND replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') IN %(ips)s
    GROUP BY t.src_ip, t.ja4
    ORDER BY hits DESC
    """
    try:
        result = db.query(sql, {"hours": window_hours, "ips": page_ips})
    except Exception as e:
        log.exception("[clustering] Erreur SQL pour le cluster %s", cluster_id)
        raise HTTPException(status_code=500, detail=str(e)) from e

    ips = [
        {
            "ip": str(row[0] or ""),
            "ja4": str(row[1] or ""),
            "tcp_ttl": int(row[2] or 0),
            "tcp_win": int(row[3] or 0),
            "tcp_scale": int(row[4] or 0),
            "tcp_mss": int(row[5] or 0),
            "hits": int(row[6] or 0),
            "ua": str(row[7] or ""),
            "avg_score": float(row[8] or 0),
            "threat_level": str(row[9] or ""),
            "country_code": str(row[10] or ""),
            "asn_org": str(row[11] or ""),
            "fuzzing": float(row[12] or 0),
            "velocity": float(row[13] or 0),
        }
        for row in result.result_rows
    ]

    return {"ips": ips, "total": total, "cluster_id": cluster_id}
|
||||
Reference in New Issue
Block a user