feat(clustering): ASN + country risk features, fix risk scoring and cluster labels

- Add country_risk() feature (index 21): lookup table of 30 countries (CN/RU/KP=1.0, US/DE=0.1)
- Add asn_cloud_score() feature (index 22): matches the ASN org name against 50+ keywords (cloud/CDN/VPN/crawler)
- N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: 'Source pays risqué' ≈0.35, cloud bots 0.23-0.28, healthy traffic ≈0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SOC Analyst
2026-03-19 10:15:21 +01:00
parent fc3392779b
commit f456c807db
2 changed files with 128 additions and 28 deletions

View File

@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
# Centroïdes dans l'espace original [0,1] pour affichage radar
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
# On utilise les features non-standardisées pour la projection 2D
# afin que les distances visuelles restent interprétables.
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
rows[i]["ja4"],
float(coords[i][0]),
float(coords[i][1]),
float(risk_score_from_centroid(km.centroids[j])),
float(risk_score_from_centroid(centroids_orig[j])),
))
# ── 7. Construction des nœuds ─────────────────────────────────────
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
mean_win = avg_f("win")
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
label_name = name_cluster(km.centroids[j], raw_stats)
risk = float(risk_score_from_centroid(km.centroids[j]))
label_name = name_cluster(centroids_orig[j], raw_stats)
risk = float(risk_score_from_centroid(centroids_orig[j]))
color = _risk_to_color(risk)
# Centroïde 2D = moyenne des coords du cluster
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
return [v for v, _ in Counter(lst).most_common(n) if v]
radar = [
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
for i, name in enumerate(FEATURE_NAMES)
]
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
ci = ni["cluster_idx"]
dists = sorted(
[(j, nj["cluster_idx"],
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
for j, nj in enumerate(nodes) if j != i],
key=lambda x: x[2]
)