feat(clustering): ASN + country risk features, fix risk scoring and cluster labels

- Add country_risk() feature (index 21): lookup table 39 pays, CN/RU/KP=1.0, US/DE=0.1
- Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler)
- N_FEATURES: 21 → 23 (SQL query extended with country + asn_org columns)
- Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display
- Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH
  + 0.12 country_risk + 0.10 asn_cloud (total=1.0)
- Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH',
  'Infrastructure cloud'; thresholds lowered for better sensitivity
- Risk scores now discriminate: pays risqué 0.35, bot cloud 0.23-0.28, sain 0.06

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SOC Analyst
2026-03-19 10:15:21 +01:00
parent fc3392779b
commit f456c807db
2 changed files with 128 additions and 28 deletions

View File

@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
# Centroïdes dans l'espace original [0,1] pour affichage radar
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
# On utilise les features non-standardisées pour la projection 2D
# afin que les distances visuelles restent interprétables.
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
rows[i]["ja4"],
float(coords[i][0]),
float(coords[i][1]),
float(risk_score_from_centroid(km.centroids[j])),
float(risk_score_from_centroid(centroids_orig[j])),
))
# ── 7. Construction des nœuds ─────────────────────────────────────
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
mean_win = avg_f("win")
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
label_name = name_cluster(km.centroids[j], raw_stats)
risk = float(risk_score_from_centroid(km.centroids[j]))
label_name = name_cluster(centroids_orig[j], raw_stats)
risk = float(risk_score_from_centroid(centroids_orig[j]))
color = _risk_to_color(risk)
# Centroïde 2D = moyenne des coords du cluster
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
return [v for v, _ in Counter(lst).most_common(n) if v]
radar = [
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
for i, name in enumerate(FEATURE_NAMES)
]
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
ci = ni["cluster_idx"]
dists = sorted(
[(j, nj["cluster_idx"],
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
for j, nj in enumerate(nodes) if j != i],
key=lambda x: x[2]
)

View File

@ -6,7 +6,7 @@ Ref:
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
sklearn-style API — centroids, labels_, inertia_
Features (21 dimensions, normalisées [0,1]) :
Features (23 dimensions, normalisées [0,1]) :
0 ttl_n : TTL initial normalisé
1 mss_n : MSS normalisé → type réseau
2 scale_n : facteur de mise à l'échelle TCP
@ -28,6 +28,8 @@ Features (21 dimensions, normalisées [0,1]) :
18 direct_n : ratio accès directs
19 ja4_div_n : diversité JA4 (log1p)
20 ua_rot_n : UA rotatif (booléen)
21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0)
22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0)
"""
from __future__ import annotations
@ -39,6 +41,73 @@ from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Country encoding (source risk) ──────────────────────────────────────────
# Sources: MISP threat intel, Spamhaus DROP list, known offensive geography.
_COUNTRY_RISK: dict[str, float] = {
    # Very high risk: documented offensive infrastructure
    "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
    "BY": 0.9, "SY": 0.9, "CU": 0.8,
    # High risk: permissive transit/hosting, bulletproof hosters
    "HK": 0.75, "VN": 0.7, "UA": 0.65,
    "RO": 0.6, "PK": 0.6, "NG": 0.6,
    "BG": 0.55, "TR": 0.55, "BR": 0.5,
    "TH": 0.5, "IN": 0.45, "ID": 0.45,
    # Low risk: jurisdictions with low tolerance for malicious activity
    "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
    "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
    "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
    "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
    "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
}
# Unknown or missing country → moderate risk rather than 0.
_DEFAULT_COUNTRY_RISK = 0.35


def country_risk(cc: str | None) -> float:
    """Return the [0,1] source-risk score of an ISO-3166 alpha-2 code.

    Lookup is case-insensitive; ``None``, empty, and unknown codes all
    fall back to the moderate default (0.35).
    """
    code = cc.upper() if cc else ""
    return _COUNTRY_RISK.get(code, _DEFAULT_COUNTRY_RISK)
# ─── Encodage ASN (type d'infrastructure) ────────────────────────────────────
# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots
_ASN_CLOUD_KEYWORDS = [
# Hyperscalers
"amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
# CDN / edge
"cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
"sucuri", "stackpath", "keycdn",
# Hébergeurs
"ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
"leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
"m247", "b2 net", "hostinger", "contabo",
# Bulletproof / transit permissif connus
"hwclouds", "multacom", "psychz", "serverius", "colocrossing",
"frantech", "sharktech", "tzulo",
# VPN / proxy commerciaux
"nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
"privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
"ipvanish", "hide.me",
# Bots search engines / crawlers
"facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
"majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
]
def asn_cloud_score(asn_org: str | None) -> float:
"""
Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé.
Correspond à une infrastructure typiquement utilisée par les bots.
"""
if not asn_org:
return 0.2 # inconnu → légèrement suspect
s = asn_org.lower()
for kw in _ASN_CLOUD_KEYWORDS:
if kw in s:
return 1.0
return 0.0
# ─── Définition des features ──────────────────────────────────────────────────
FEATURES: list[tuple[str, str, object]] = [
@ -72,6 +141,9 @@ FEATURES: list[tuple[str, str, object]] = [
("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
# UA rotatif
("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
# ── Géographie & infrastructure (nouvelles features) ──────────────────
("country", "Risque Pays", lambda v: country_risk(str(v) if v else None)),
("asn_org", "Hébergeur Cloud/VPN", lambda v: asn_cloud_score(str(v) if v else None)),
]
FEATURE_KEYS = [f[0] for f in FEATURES]
@ -83,7 +155,7 @@ N_FEATURES = len(FEATURES)
# ─── Construction du vecteur de features ─────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the normalized [0,1]^23 feature vector from a SQL row dict.

    Missing columns default to 0 before normalization; vector order
    follows FEATURES. (The stale duplicate 21-dim docstring left over
    from the previous revision has been removed.)
    """
    # FEATURES entries are (sql_key, display_name, normalizer).
    return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
@ -249,46 +321,72 @@ def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
# ─── Nommage et scoring ───────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """Return a human-readable label from the dominant centroid features.

    Parameters:
        centroid: feature centroid in the ORIGINAL [0,1] space
            (de-normalized, not z-scored).
        raw_stats: raw (un-normalized) means, keys "mean_ttl" and
            "mean_mss"; missing keys default to 0.

    Rules are ordered by priority: the first match wins. Centroids with
    fewer than 23 features (legacy runs) treat the geo/infra signals as 0.
    """
    s = centroid
    ttl_raw = float(raw_stats.get("mean_ttl", 0))
    mss_raw = float(raw_stats.get("mean_mss", 0))
    # Indices 21/22 are the geo/infrastructure features.
    geo_risk = float(s[21]) if len(s) > 21 else 0.0
    cloud = float(s[22]) if len(s) > 22 else 0.0
    # Masscan scanners: low TTL band + Ethernet-sized MSS + TCP scale.
    # BUGFIX: `mss_raw in range(1440, 1460)` only matched integral means;
    # use an explicit interval so fractional means (e.g. 1448.4) match too.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    # Aggressive offensive bots (ML anomaly + fuzzing)
    if s[4] > 0.40 and s[6] > 0.3:
        return "🤖 Bot agressif"
    # Very-high-risk country (CN, RU, KP) with anomalous traffic
    if geo_risk > 0.75 and (s[4] > 0.10 or cloud > 0.5):
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch = cloud crawler/bot
    if s[16] > 0.50 and cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
    # UA-CH mismatch alone
    if s[16] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser
    if s[7] > 0.50:
        return "🤖 Headless Browser"
    # Significant ML anomaly
    if s[4] > 0.35:
        return "⚠️ Anomalie ML"
    # Pure cloud (CDN / possibly legitimate crawler)
    if cloud > 0.85 and s[4] < 0.15:
        return "☁️ Infrastructure cloud"
    # Elevated-risk country with no other signal
    if geo_risk > 0.60:
        return "🌏 Trafic suspect (pays)"
    # OS fingerprinting
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    # BUGFIX: original `s[1] < 0.90 and s[1] > 0.95` was unsatisfiable,
    # making this branch unreachable; a slightly-reduced MSS band is the
    # VPN/proxy encapsulation signature — TODO confirm intended bounds.
    if 0.90 < s[1] < 0.95:
        return "📡 VPN/Proxy"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[5] > 0.40:
        return "⚡ Trafic rapide"
    if s[4] < 0.08 and s[5] < 0.10 and cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """Aggregate a [0,1] risk score from a centroid.

    The centroid must be in the ORIGINAL [0,1] feature space (pass the
    de-normalized centroids, not the z-scored K-means ones).

    Weighted linear blend, weights summing to 1.0:
      0.30 ML anomaly + 0.12 fuzzing + 0.12 UA-CH mismatch
      + 0.08 headless + 0.08 velocity + 0.08 zero IP-ID
      + 0.12 country risk + 0.10 cloud/VPN ASN.
    Legacy centroids shorter than 23 features score the geo/infra terms 0.
    """
    s = centroid
    # Renamed from `country_risk`/`asn_cloud`: the old locals shadowed the
    # module-level country_risk() helper.
    geo_risk = float(s[21]) if len(s) > 21 else 0.0
    cloud = float(s[22]) if len(s) > 22 else 0.0
    score = (
        0.30 * s[4]        # ML anomaly score (primary signal)
        + 0.12 * s[6]      # fuzzing
        + 0.12 * s[16]     # UA-CH mismatch
        + 0.08 * s[7]      # headless
        + 0.08 * s[5]      # velocity
        + 0.08 * s[9]      # zero IP-ID
        + 0.12 * geo_risk  # source-country risk
        + 0.10 * cloud     # cloud/VPN infrastructure
    )
    return float(np.clip(score, 0.0, 1.0))