feat(clustering): ASN + country risk features, fix risk scoring and cluster labels
- Add country_risk() feature (index 21): lookup table of 30 countries, CN/RU/KP=1.0, US/DE=0.1
- Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler)
- N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH', 'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: risky-country cluster 0.35, cloud bots 0.23-0.28, healthy 0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
|
||||
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
|
||||
|
||||
# Centroïdes dans l'espace original [0,1] pour affichage radar
|
||||
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
|
||||
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
|
||||
|
||||
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
|
||||
# On utilise les features non-standardisées pour la projection 2D
|
||||
# afin que les distances visuelles restent interprétables.
|
||||
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
|
||||
|
||||
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
|
||||
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||
rows[i]["ja4"],
|
||||
float(coords[i][0]),
|
||||
float(coords[i][1]),
|
||||
float(risk_score_from_centroid(km.centroids[j])),
|
||||
float(risk_score_from_centroid(centroids_orig[j])),
|
||||
))
|
||||
|
||||
# ── 7. Construction des nœuds ─────────────────────────────────────
|
||||
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||
mean_win = avg_f("win")
|
||||
|
||||
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
|
||||
label_name = name_cluster(km.centroids[j], raw_stats)
|
||||
risk = float(risk_score_from_centroid(km.centroids[j]))
|
||||
label_name = name_cluster(centroids_orig[j], raw_stats)
|
||||
risk = float(risk_score_from_centroid(centroids_orig[j]))
|
||||
color = _risk_to_color(risk)
|
||||
|
||||
# Centroïde 2D = moyenne des coords du cluster
|
||||
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||
return [v for v, _ in Counter(lst).most_common(n) if v]
|
||||
|
||||
radar = [
|
||||
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
|
||||
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
|
||||
for i, name in enumerate(FEATURE_NAMES)
|
||||
]
|
||||
|
||||
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
||||
ci = ni["cluster_idx"]
|
||||
dists = sorted(
|
||||
[(j, nj["cluster_idx"],
|
||||
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
|
||||
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
|
||||
for j, nj in enumerate(nodes) if j != i],
|
||||
key=lambda x: x[2]
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user