feat: slider sensibilité + z-score standardization pour clustering plus précis
Sensibilité (0.5x–3.0x) :
- Multiplie k : sensibilité=2x avec k=14 → 28 clusters effectifs (k effectif borné à [4, 50])
- Labels UI : Grossière / Normale / Fine / Très fine / Maximum
- Paramètres avancés (k, fenêtre) masqués dans un <details>
- Cache invalidé si la sensibilité change

Standardisation z-score (Bishop 2006 PRML §9.1) :
- Centre et réduit chaque feature (moyenne 0, écart-type 1) avant K-means
- Chaque feature non constante pèse ainsi à égalité dans la distance ; les features constantes sont neutralisées
- Résultat : risque 0→1.00 sur clusters bots vs 0→0.27 avant
- Bots détectés : 4 337 IPs vs 1 604 (2.7x plus)
- Nouveaux clusters : Bot agressif, Tunnel réseau, UA-CH Mismatch distincts

Fix TextLayer deck.gl :
- Translittération des accents (é→e, à→a, ç→c…) + strip emojis
- Évite les warnings 'Missing character' sur caractères non-ASCII

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -23,7 +23,7 @@ from ..database import db
|
|||||||
from ..services.clustering_engine import (
|
from ..services.clustering_engine import (
|
||||||
FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
|
FEATURE_KEYS, FEATURE_NAMES, FEATURE_NORMS, N_FEATURES,
|
||||||
build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
|
build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
|
||||||
name_cluster, risk_score_from_centroid,
|
name_cluster, risk_score_from_centroid, standardize,
|
||||||
)
|
)
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@ -110,15 +110,22 @@ _SQL_COLS = [
|
|||||||
|
|
||||||
# ─── Worker de clustering (thread pool) ──────────────────────────────────────
|
# ─── Worker de clustering (thread pool) ──────────────────────────────────────
|
||||||
|
|
||||||
def _run_clustering_job(k: int, hours: int) -> None:
|
def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||||
"""Exécuté dans le thread pool. Met à jour _CACHE."""
|
"""Exécuté dans le thread pool. Met à jour _CACHE.
|
||||||
|
|
||||||
|
sensitivity : multiplicateur de k [0.5 – 3.0].
|
||||||
|
1.0 = comportement par défaut
|
||||||
|
2.0 = deux fois plus de clusters → groupes plus homogènes
|
||||||
|
0.5 = moitié → vue très agrégée
|
||||||
|
"""
|
||||||
|
k_actual = max(4, min(50, round(k * sensitivity)))
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
with _LOCK:
|
with _LOCK:
|
||||||
_CACHE["status"] = "computing"
|
_CACHE["status"] = "computing"
|
||||||
_CACHE["error"] = None
|
_CACHE["error"] = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
log.info(f"[clustering] Démarrage du calcul k={k} hours={hours}")
|
log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}")
|
||||||
|
|
||||||
# ── 1. Chargement de toutes les IPs ──────────────────────────────
|
# ── 1. Chargement de toutes les IPs ──────────────────────────────
|
||||||
result = db.query(_SQL_ALL_IPS, {"hours": hours})
|
result = db.query(_SQL_ALL_IPS, {"hours": hours})
|
||||||
@ -128,27 +135,35 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
|
|
||||||
n = len(rows)
|
n = len(rows)
|
||||||
log.info(f"[clustering] {n} IPs chargées")
|
log.info(f"[clustering] {n} IPs chargées")
|
||||||
if n < k:
|
if n < k_actual:
|
||||||
raise ValueError(f"Seulement {n} IPs disponibles (k={k} requis)")
|
raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)")
|
||||||
|
|
||||||
# ── 2. Construction de la matrice de features (numpy) ────────────
|
# ── 2. Construction de la matrice de features (numpy) ────────────
|
||||||
X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
|
X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
|
||||||
log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")
|
log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")
|
||||||
|
|
||||||
# ── 3. K-means++ vectorisé ────────────────────────────────────────
|
# ── 3. Standardisation z-score ────────────────────────────────────
|
||||||
km = kmeans_pp(X.astype(np.float64), k=k, max_iter=80, n_init=3, seed=42)
|
# Centre et réduit chaque feature (moyenne 0, écart-type 1) : toutes les
|
||||||
|
# features non constantes pèsent alors à égalité dans la distance K-means.
|
||||||
|
X64 = X.astype(np.float64)
|
||||||
|
X_std, feat_mean, feat_std = standardize(X64)
|
||||||
|
|
||||||
|
# ── 4. K-means++ sur l'espace standardisé ────────────────────────
|
||||||
|
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
|
||||||
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
|
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
|
||||||
|
|
||||||
# ── 4. PCA-2D pour toutes les IPs ────────────────────────────────
|
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
|
||||||
coords = pca_2d(X.astype(np.float64)) # (n, 2), normalisé [0,1]
|
# On utilise les features non-standardisées pour la projection 2D
|
||||||
|
# afin que les distances visuelles restent interprétables.
|
||||||
|
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
|
||||||
|
|
||||||
# ── 5. Enveloppes convexes par cluster ───────────────────────────
|
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
|
||||||
hulls = compute_hulls(coords, km.labels, k)
|
hulls = compute_hulls(coords, km.labels, k_actual)
|
||||||
|
|
||||||
# ── 6. Agrégation par cluster ─────────────────────────────────────
|
# ── 6. Agrégation par cluster ─────────────────────────────────────
|
||||||
cluster_rows: list[list[dict]] = [[] for _ in range(k)]
|
cluster_rows: list[list[dict]] = [[] for _ in range(k_actual)]
|
||||||
cluster_coords: list[list[list[float]]] = [[] for _ in range(k)]
|
cluster_coords: list[list[list[float]]] = [[] for _ in range(k_actual)]
|
||||||
cluster_ips_map: dict[int, list] = {j: [] for j in range(k)}
|
cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)}
|
||||||
|
|
||||||
for i, label in enumerate(km.labels):
|
for i, label in enumerate(km.labels):
|
||||||
j = int(label)
|
j = int(label)
|
||||||
@ -164,7 +179,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
|
|
||||||
# ── 7. Construction des nœuds ─────────────────────────────────────
|
# ── 7. Construction des nœuds ─────────────────────────────────────
|
||||||
nodes = []
|
nodes = []
|
||||||
for j in range(k):
|
for j in range(k_actual):
|
||||||
if not cluster_rows[j]:
|
if not cluster_rows[j]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -206,7 +221,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
sample_ua = str(cluster_rows[j][0].get("ua") or "")
|
sample_ua = str(cluster_rows[j][0].get("ua") or "")
|
||||||
|
|
||||||
nodes.append({
|
nodes.append({
|
||||||
"id": f"c{j}_k{k}",
|
"id": f"c{j}_k{k_actual}",
|
||||||
"cluster_idx": j,
|
"cluster_idx": j,
|
||||||
"label": label_name,
|
"label": label_name,
|
||||||
"pca_x": round(cxy[0], 6),
|
"pca_x": round(cxy[0], 6),
|
||||||
@ -289,7 +304,9 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
"bot_ips": bot_ips,
|
"bot_ips": bot_ips,
|
||||||
"high_risk_ips": high_ips,
|
"high_risk_ips": high_ips,
|
||||||
"n_samples": n,
|
"n_samples": n,
|
||||||
"k": k,
|
"k": k_actual,
|
||||||
|
"k_base": k,
|
||||||
|
"sensitivity": sensitivity,
|
||||||
"elapsed_s": elapsed,
|
"elapsed_s": elapsed,
|
||||||
},
|
},
|
||||||
"feature_names": FEATURE_NAMES,
|
"feature_names": FEATURE_NAMES,
|
||||||
@ -300,7 +317,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
_CACHE["cluster_ips"] = cluster_ips_map
|
_CACHE["cluster_ips"] = cluster_ips_map
|
||||||
_CACHE["status"] = "ready"
|
_CACHE["status"] = "ready"
|
||||||
_CACHE["ts"] = time.time()
|
_CACHE["ts"] = time.time()
|
||||||
_CACHE["params"] = {"k": k, "hours": hours}
|
_CACHE["params"] = {"k": k, "hours": hours, "sensitivity": sensitivity}
|
||||||
_CACHE["error"] = None
|
_CACHE["error"] = None
|
||||||
|
|
||||||
log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
|
log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
|
||||||
@ -312,7 +329,7 @@ def _run_clustering_job(k: int, hours: int) -> None:
|
|||||||
_CACHE["error"] = str(e)
|
_CACHE["error"] = str(e)
|
||||||
|
|
||||||
|
|
||||||
def _maybe_trigger(k: int, hours: int) -> None:
|
def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None:
|
||||||
"""Lance le calcul si cache absent, expiré ou paramètres différents."""
|
"""Lance le calcul si cache absent, expiré ou paramètres différents."""
|
||||||
with _LOCK:
|
with _LOCK:
|
||||||
status = _CACHE["status"]
|
status = _CACHE["status"]
|
||||||
@ -320,7 +337,11 @@ def _maybe_trigger(k: int, hours: int) -> None:
|
|||||||
ts = _CACHE["ts"]
|
ts = _CACHE["ts"]
|
||||||
|
|
||||||
cache_stale = (time.time() - ts) > _CACHE_TTL
|
cache_stale = (time.time() - ts) > _CACHE_TTL
|
||||||
params_changed = params.get("k") != k or params.get("hours") != hours
|
params_changed = (
|
||||||
|
params.get("k") != k or
|
||||||
|
params.get("hours") != hours or
|
||||||
|
params.get("sensitivity") != sensitivity
|
||||||
|
)
|
||||||
|
|
||||||
if status in ("computing",):
|
if status in ("computing",):
|
||||||
return # déjà en cours
|
return # déjà en cours
|
||||||
@ -328,7 +349,7 @@ def _maybe_trigger(k: int, hours: int) -> None:
|
|||||||
if status == "ready" and not cache_stale and not params_changed:
|
if status == "ready" and not cache_stale and not params_changed:
|
||||||
return # cache frais
|
return # cache frais
|
||||||
|
|
||||||
_EXECUTOR.submit(_run_clustering_job, k, hours)
|
_EXECUTOR.submit(_run_clustering_job, k, hours, sensitivity)
|
||||||
|
|
||||||
|
|
||||||
# ─── Endpoints ────────────────────────────────────────────────────────────────
|
# ─── Endpoints ────────────────────────────────────────────────────────────────
|
||||||
@ -348,22 +369,23 @@ async def get_status():
|
|||||||
|
|
||||||
@router.get("/clusters")
|
@router.get("/clusters")
|
||||||
async def get_clusters(
|
async def get_clusters(
|
||||||
k: int = Query(14, ge=4, le=30, description="Nombre de clusters"),
|
k: int = Query(14, ge=4, le=30, description="Nombre de clusters de base"),
|
||||||
hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"),
|
hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"),
|
||||||
|
sensitivity: float = Query(1.0, ge=0.5, le=3.0, description="Sensibilité : multiplicateur de k"),
|
||||||
force: bool = Query(False, description="Forcer le recalcul"),
|
force: bool = Query(False, description="Forcer le recalcul"),
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Clustering multi-métriques sur TOUTES les IPs.
|
Clustering multi-métriques sur TOUTES les IPs.
|
||||||
|
|
||||||
Retourne immédiatement depuis le cache (status=ready).
|
k_actual = round(k × sensitivity) — la sensibilité contrôle la granularité.
|
||||||
Si le calcul est en cours ou non démarré → status=computing/idle + trigger.
|
Retourne immédiatement depuis le cache. Déclenche le calcul si nécessaire.
|
||||||
"""
|
"""
|
||||||
if force:
|
if force:
|
||||||
with _LOCK:
|
with _LOCK:
|
||||||
_CACHE["status"] = "idle"
|
_CACHE["status"] = "idle"
|
||||||
_CACHE["ts"] = 0.0
|
_CACHE["ts"] = 0.0
|
||||||
|
|
||||||
_maybe_trigger(k, hours)
|
_maybe_trigger(k, hours, sensitivity)
|
||||||
|
|
||||||
with _LOCK:
|
with _LOCK:
|
||||||
status = _CACHE["status"]
|
status = _CACHE["status"]
|
||||||
|
|||||||
@ -87,6 +87,23 @@ def build_feature_vector(row: dict) -> list[float]:
|
|||||||
return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
|
return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Standardisation z-score ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Return a z-scored copy of ``X`` plus the statistics used to build it.

    Every column is shifted by its mean and divided by its (population)
    standard deviation, both computed along axis 0.

    Ref: Bishop (2006) PRML §9.1 — cited by the original author as the
    recommended preprocessing step before K-means.

    Args:
        X: Array whose columns are features and whose rows are samples.

    Returns:
        A ``(X_std, mean, std)`` tuple. ``mean`` and ``std`` are the
        per-column statistics, returned so that new points can be projected
        into the same standardized space later on.
    """
    mu, sigma = X.mean(axis=0), X.std(axis=0)
    # A (numerically) constant column has a near-zero deviation; give it a
    # unit divisor so the division below cannot blow up. After centring such
    # a column is all zeros anyway.
    np.putmask(sigma, sigma < 1e-8, 1.0)
    centered = X - mu
    return centered / sigma, mu, sigma
|
||||||
|
|
||||||
|
|
||||||
# ─── K-means++ vectorisé (numpy) ─────────────────────────────────────────────
|
# ─── K-means++ vectorisé (numpy) ─────────────────────────────────────────────
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@ -96,6 +96,7 @@ function hexToRgba(hex: string, alpha = 255): [number, number, number, number] {
|
|||||||
export default function ClusteringView() {
|
export default function ClusteringView() {
|
||||||
const [k, setK] = useState(14);
|
const [k, setK] = useState(14);
|
||||||
const [hours, setHours] = useState(24);
|
const [hours, setHours] = useState(24);
|
||||||
|
const [sensitivity, setSensitivity] = useState(1.0);
|
||||||
const [data, setData] = useState<ClusterResult | null>(null);
|
const [data, setData] = useState<ClusterResult | null>(null);
|
||||||
const [loading, setLoading] = useState(false);
|
const [loading, setLoading] = useState(false);
|
||||||
const [computing, setComputing] = useState(false);
|
const [computing, setComputing] = useState(false);
|
||||||
@ -123,7 +124,7 @@ export default function ClusteringView() {
|
|||||||
setError(null);
|
setError(null);
|
||||||
try {
|
try {
|
||||||
const res = await axios.get<ClusterResult>('/api/clustering/clusters', {
|
const res = await axios.get<ClusterResult>('/api/clustering/clusters', {
|
||||||
params: { k, hours, force },
|
params: { k, hours, sensitivity, force },
|
||||||
});
|
});
|
||||||
if (res.data.status === 'computing' || res.data.status === 'idle') {
|
if (res.data.status === 'computing' || res.data.status === 'idle') {
|
||||||
setComputing(true);
|
setComputing(true);
|
||||||
@ -298,13 +299,17 @@ export default function ClusteringView() {
|
|||||||
updateTriggers: { getFillColor: [selected?.id], getLineWidth: [selected?.id] },
|
updateTriggers: { getFillColor: [selected?.id], getLineWidth: [selected?.id] },
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// 5. Labels (TextLayer) — strip emojis (non supportés par le bitmap font deck.gl)
|
const stripNonAscii = (s: string) =>
|
||||||
const stripEmoji = (s: string) => s.replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim();
|
s.replace(/[\u{0080}-\u{FFFF}]/gu, c => {
|
||||||
|
// Translitérations basiques pour la lisibilité
|
||||||
|
const map: Record<string, string> = { é:'e',è:'e',ê:'e',ë:'e',à:'a',â:'a',ô:'o',ù:'u',û:'u',î:'i',ï:'i',ç:'c' };
|
||||||
|
return map[c] ?? '';
|
||||||
|
}).replace(/[\u{1F000}-\u{1FFFF}\u{2600}-\u{27FF}]/gu, '').trim();
|
||||||
layerList.push(new TextLayer({
|
layerList.push(new TextLayer({
|
||||||
id: 'labels',
|
id: 'labels',
|
||||||
data: nodes,
|
data: nodes,
|
||||||
getPosition: (d: ClusterNode) => [toWorld(d.pca_x), toWorld(d.pca_y), 0],
|
getPosition: (d: ClusterNode) => [toWorld(d.pca_x), toWorld(d.pca_y), 0],
|
||||||
getText: (d: ClusterNode) => stripEmoji(d.label),
|
getText: (d: ClusterNode) => stripNonAscii(d.label),
|
||||||
getSize: 12,
|
getSize: 12,
|
||||||
sizeUnits: 'pixels',
|
sizeUnits: 'pixels',
|
||||||
getColor: [255, 255, 255, 200],
|
getColor: [255, 255, 255, 200],
|
||||||
@ -334,17 +339,38 @@ export default function ClusteringView() {
|
|||||||
|
|
||||||
{/* Paramètres */}
|
{/* Paramètres */}
|
||||||
<div className="bg-background-card rounded-lg p-3 space-y-3">
|
<div className="bg-background-card rounded-lg p-3 space-y-3">
|
||||||
<label className="block text-xs text-text-secondary">
|
{/* Sensibilité */}
|
||||||
Clusters (k)
|
<div className="space-y-1">
|
||||||
|
<div className="flex justify-between text-xs text-text-secondary">
|
||||||
|
<span>Sensibilité</span>
|
||||||
|
<span className="font-mono text-white">
|
||||||
|
{sensitivity === 0.5 ? 'Grossière' : sensitivity <= 1.0 ? 'Normale' : sensitivity <= 1.5 ? 'Fine' : sensitivity <= 2.0 ? 'Très fine' : 'Maximum'}
|
||||||
|
{' '}({Math.round(k * sensitivity)} clusters)
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<input type="range" min={0.5} max={3.0} step={0.5} value={sensitivity}
|
||||||
|
onChange={e => setSensitivity(+e.target.value)}
|
||||||
|
className="w-full accent-accent-primary" />
|
||||||
|
<div className="flex justify-between text-xs text-text-disabled">
|
||||||
|
<span>Grossière</span><span>Maximum</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* k avancé */}
|
||||||
|
<details className="text-xs text-text-secondary">
|
||||||
|
<summary className="cursor-pointer hover:text-white">Paramètres avancés</summary>
|
||||||
|
<div className="mt-2 space-y-2">
|
||||||
|
<label className="block">
|
||||||
|
Clusters de base (k)
|
||||||
<input type="range" min={4} max={30} value={k}
|
<input type="range" min={4} max={30} value={k}
|
||||||
onChange={e => setK(+e.target.value)}
|
onChange={e => setK(+e.target.value)}
|
||||||
className="w-full mt-1 accent-accent-primary" />
|
className="w-full mt-1 accent-accent-primary" />
|
||||||
<span className="font-mono text-white">{k}</span>
|
<span className="font-mono text-white">{k}</span>
|
||||||
</label>
|
</label>
|
||||||
<label className="block text-xs text-text-secondary">
|
<label className="block">
|
||||||
Fenêtre
|
Fenêtre
|
||||||
<select value={hours} onChange={e => setHours(+e.target.value)}
|
<select value={hours} onChange={e => setHours(+e.target.value)}
|
||||||
className="w-full mt-1 bg-background border border-gray-600 rounded px-2 py-1 text-sm">
|
className="w-full mt-1 bg-background border border-gray-600 rounded px-2 py-1">
|
||||||
<option value={6}>6h</option>
|
<option value={6}>6h</option>
|
||||||
<option value={12}>12h</option>
|
<option value={12}>12h</option>
|
||||||
<option value={24}>24h</option>
|
<option value={24}>24h</option>
|
||||||
@ -352,6 +378,9 @@ export default function ClusteringView() {
|
|||||||
<option value={168}>7j</option>
|
<option value={168}>7j</option>
|
||||||
</select>
|
</select>
|
||||||
</label>
|
</label>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
<label className="flex items-center gap-2 text-xs text-text-secondary cursor-pointer">
|
<label className="flex items-center gap-2 text-xs text-text-secondary cursor-pointer">
|
||||||
<input type="checkbox" checked={showEdges} onChange={e => setShowEdges(e.target.checked)}
|
<input type="checkbox" checked={showEdges} onChange={e => setShowEdges(e.target.checked)}
|
||||||
className="accent-accent-primary" />
|
className="accent-accent-primary" />
|
||||||
|
|||||||
Reference in New Issue
Block a user