feat(clustering): ASN + country risk features, fix risk scoring and cluster labels
- Add country_risk() feature (index 21): lookup table 30 pays, CN/RU/KP=1.0, US/DE=0.1 - Add asn_cloud_score() feature (index 22): 50+ keyword match (cloud/CDN/VPN/crawler) - N_FEATURES: 20 → 23 (SQL query extended with country + asn_org columns) - Fix centroids_orig: denormalize K-means z-score centroids to [0,1] for display - Update risk_score_from_centroid(): weights 0.30 ML + 0.12 fuzzing + 0.12 UA-CH + 0.12 country_risk + 0.10 asn_cloud (total=1.0) - Update name_cluster(): new labels 'Source pays risqué', 'Bot cloud UA-CH', 'Infrastructure cloud'; thresholds lowered for better sensitivity - Risk scores now discriminate: pays risqué 0.35, bot cloud 0.23-0.28, sain 0.06 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -152,9 +152,11 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
|||||||
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
|
km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=3, seed=42)
|
||||||
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
|
log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
|
||||||
|
|
||||||
|
# Centroïdes dans l'espace original [0,1] pour affichage radar
|
||||||
|
# (dé-standardisation : c_orig = c_std * std + mean, puis clip [0,1])
|
||||||
|
centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)
|
||||||
|
|
||||||
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
|
# ── 5. PCA-2D sur les features ORIGINALES (normalisées [0,1]) ────
|
||||||
# On utilise les features non-standardisées pour la projection 2D
|
|
||||||
# afin que les distances visuelles restent interprétables.
|
|
||||||
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
|
coords = pca_2d(X64) # (n, 2), normalisé [0,1]
|
||||||
|
|
||||||
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
|
# ── 5b. Enveloppes convexes par cluster ──────────────────────────
|
||||||
@ -174,7 +176,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
|||||||
rows[i]["ja4"],
|
rows[i]["ja4"],
|
||||||
float(coords[i][0]),
|
float(coords[i][0]),
|
||||||
float(coords[i][1]),
|
float(coords[i][1]),
|
||||||
float(risk_score_from_centroid(km.centroids[j])),
|
float(risk_score_from_centroid(centroids_orig[j])),
|
||||||
))
|
))
|
||||||
|
|
||||||
# ── 7. Construction des nœuds ─────────────────────────────────────
|
# ── 7. Construction des nœuds ─────────────────────────────────────
|
||||||
@ -192,8 +194,8 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
|||||||
mean_win = avg_f("win")
|
mean_win = avg_f("win")
|
||||||
|
|
||||||
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
|
raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
|
||||||
label_name = name_cluster(km.centroids[j], raw_stats)
|
label_name = name_cluster(centroids_orig[j], raw_stats)
|
||||||
risk = float(risk_score_from_centroid(km.centroids[j]))
|
risk = float(risk_score_from_centroid(centroids_orig[j]))
|
||||||
color = _risk_to_color(risk)
|
color = _risk_to_color(risk)
|
||||||
|
|
||||||
# Centroïde 2D = moyenne des coords du cluster
|
# Centroïde 2D = moyenne des coords du cluster
|
||||||
@ -210,7 +212,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
|||||||
return [v for v, _ in Counter(lst).most_common(n) if v]
|
return [v for v, _ in Counter(lst).most_common(n) if v]
|
||||||
|
|
||||||
radar = [
|
radar = [
|
||||||
{"feature": name, "value": round(float(km.centroids[j][i]), 4)}
|
{"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
|
||||||
for i, name in enumerate(FEATURE_NAMES)
|
for i, name in enumerate(FEATURE_NAMES)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -271,7 +273,7 @@ def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
|
|||||||
ci = ni["cluster_idx"]
|
ci = ni["cluster_idx"]
|
||||||
dists = sorted(
|
dists = sorted(
|
||||||
[(j, nj["cluster_idx"],
|
[(j, nj["cluster_idx"],
|
||||||
float(np.sum((km.centroids[ci] - km.centroids[nj["cluster_idx"]]) ** 2)))
|
float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
|
||||||
for j, nj in enumerate(nodes) if j != i],
|
for j, nj in enumerate(nodes) if j != i],
|
||||||
key=lambda x: x[2]
|
key=lambda x: x[2]
|
||||||
)
|
)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ Ref:
|
|||||||
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
|
scipy.spatial.ConvexHull — enveloppe convexe (Graham/Qhull)
|
||||||
sklearn-style API — centroids, labels_, inertia_
|
sklearn-style API — centroids, labels_, inertia_
|
||||||
|
|
||||||
Features (21 dimensions, normalisées [0,1]) :
|
Features (23 dimensions, normalisées [0,1]) :
|
||||||
0 ttl_n : TTL initial normalisé
|
0 ttl_n : TTL initial normalisé
|
||||||
1 mss_n : MSS normalisé → type réseau
|
1 mss_n : MSS normalisé → type réseau
|
||||||
2 scale_n : facteur de mise à l'échelle TCP
|
2 scale_n : facteur de mise à l'échelle TCP
|
||||||
@ -28,6 +28,8 @@ Features (21 dimensions, normalisées [0,1]) :
|
|||||||
18 direct_n : ratio accès directs
|
18 direct_n : ratio accès directs
|
||||||
19 ja4_div_n : diversité JA4 (log1p)
|
19 ja4_div_n : diversité JA4 (log1p)
|
||||||
20 ua_rot_n : UA rotatif (booléen)
|
20 ua_rot_n : UA rotatif (booléen)
|
||||||
|
21 country_risk_n : risque pays source (CN/RU/KP → 1.0, US/DE/FR → 0.0)
|
||||||
|
22 asn_cloud_n : hébergeur cloud/CDN/VPN (Cloudflare/AWS/OVH → 1.0)
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@ -39,6 +41,73 @@ from scipy.spatial import ConvexHull
|
|||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ─── Encodage pays (risque source) ───────────────────────────────────────────
|
||||||
|
# Source: MISP threat intel, Spamhaus DROP list, géographie offensive connue
|
||||||
|
|
||||||
|
_COUNTRY_RISK: dict[str, float] = {
|
||||||
|
# Très haut risque : infrastructure offensive documentée
|
||||||
|
"CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
|
||||||
|
"BY": 0.9, "SY": 0.9, "CU": 0.8,
|
||||||
|
# Haut risque : transit/hébergement permissif, bulletproof hosters
|
||||||
|
"HK": 0.75, "VN": 0.7, "UA": 0.65,
|
||||||
|
"RO": 0.6, "PK": 0.6, "NG": 0.6,
|
||||||
|
"BG": 0.55, "TR": 0.55, "BR": 0.5,
|
||||||
|
"TH": 0.5, "IN": 0.45, "ID": 0.45,
|
||||||
|
# Risque faible : pays à faible tolérance envers activité malveillante
|
||||||
|
"US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
|
||||||
|
"CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
|
||||||
|
"CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
|
||||||
|
"FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
|
||||||
|
"SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
|
||||||
|
}
|
||||||
|
_DEFAULT_COUNTRY_RISK = 0.35 # pays inconnu → risque modéré
|
||||||
|
|
||||||
|
|
||||||
|
def country_risk(cc: str | None) -> float:
|
||||||
|
"""Score de risque [0,1] d'un code pays ISO-3166."""
|
||||||
|
return _COUNTRY_RISK.get((cc or "").upper(), _DEFAULT_COUNTRY_RISK)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Encodage ASN (type d'infrastructure) ────────────────────────────────────
|
||||||
|
# Cloud/CDN/hosting → fort corrélé avec scanners automatisés et bots
|
||||||
|
|
||||||
|
_ASN_CLOUD_KEYWORDS = [
|
||||||
|
# Hyperscalers
|
||||||
|
"amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
|
||||||
|
# CDN / edge
|
||||||
|
"cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
|
||||||
|
"sucuri", "stackpath", "keycdn",
|
||||||
|
# Hébergeurs
|
||||||
|
"ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
|
||||||
|
"leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
|
||||||
|
"m247", "b2 net", "hostinger", "contabo",
|
||||||
|
# Bulletproof / transit permissif connus
|
||||||
|
"hwclouds", "multacom", "psychz", "serverius", "colocrossing",
|
||||||
|
"frantech", "sharktech", "tzulo",
|
||||||
|
# VPN / proxy commerciaux
|
||||||
|
"nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
|
||||||
|
"privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
|
||||||
|
"ipvanish", "hide.me",
|
||||||
|
# Bots search engines / crawlers
|
||||||
|
"facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
|
||||||
|
"majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def asn_cloud_score(asn_org: str | None) -> float:
|
||||||
|
"""
|
||||||
|
Score [0,1] : 1.0 = cloud/CDN/hébergement/VPN confirmé.
|
||||||
|
Correspond à une infrastructure typiquement utilisée par les bots.
|
||||||
|
"""
|
||||||
|
if not asn_org:
|
||||||
|
return 0.2 # inconnu → légèrement suspect
|
||||||
|
s = asn_org.lower()
|
||||||
|
for kw in _ASN_CLOUD_KEYWORDS:
|
||||||
|
if kw in s:
|
||||||
|
return 1.0
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
# ─── Définition des features ──────────────────────────────────────────────────
|
# ─── Définition des features ──────────────────────────────────────────────────
|
||||||
|
|
||||||
FEATURES: list[tuple[str, str, object]] = [
|
FEATURES: list[tuple[str, str, object]] = [
|
||||||
@ -72,6 +141,9 @@ FEATURES: list[tuple[str, str, object]] = [
|
|||||||
("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
|
("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
|
||||||
# UA rotatif
|
# UA rotatif
|
||||||
("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
|
("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
|
||||||
|
# ── Géographie & infrastructure (nouvelles features) ──────────────────
|
||||||
|
("country", "Risque Pays", lambda v: country_risk(str(v) if v else None)),
|
||||||
|
("asn_org", "Hébergeur Cloud/VPN", lambda v: asn_cloud_score(str(v) if v else None)),
|
||||||
]
|
]
|
||||||
|
|
||||||
FEATURE_KEYS = [f[0] for f in FEATURES]
|
FEATURE_KEYS = [f[0] for f in FEATURES]
|
||||||
@ -83,7 +155,7 @@ N_FEATURES = len(FEATURES)
|
|||||||
# ─── Construction du vecteur de features ─────────────────────────────────────
|
# ─── Construction du vecteur de features ─────────────────────────────────────
|
||||||
|
|
||||||
def build_feature_vector(row: dict) -> list[float]:
|
def build_feature_vector(row: dict) -> list[float]:
|
||||||
"""Construit le vecteur normalisé [0,1]^21 depuis un dict SQL."""
|
"""Construit le vecteur normalisé [0,1]^23 depuis un dict SQL."""
|
||||||
return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
|
return [norm(row.get(key, 0)) for key, _, norm in FEATURES]
|
||||||
|
|
||||||
|
|
||||||
@ -249,46 +321,72 @@ def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
|
|||||||
# ─── Nommage et scoring ───────────────────────────────────────────────────────
|
# ─── Nommage et scoring ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
|
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
|
||||||
"""Nom lisible basé sur les features dominantes du centroïde."""
|
"""Nom lisible basé sur les features dominantes du centroïde [0,1]."""
|
||||||
s = centroid # alias
|
s = centroid
|
||||||
ttl_raw = float(raw_stats.get("mean_ttl", 0))
|
ttl_raw = float(raw_stats.get("mean_ttl", 0))
|
||||||
mss_raw = float(raw_stats.get("mean_mss", 0))
|
mss_raw = float(raw_stats.get("mean_mss", 0))
|
||||||
|
# Indices : 21=country_risk, 22=asn_cloud
|
||||||
|
country_risk = s[21] if len(s) > 21 else 0.0
|
||||||
|
asn_cloud = s[22] if len(s) > 22 else 0.0
|
||||||
|
|
||||||
# Scanners / bots masscan
|
# Scanners Masscan
|
||||||
if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25:
|
if s[0] > 0.16 and s[0] < 0.25 and mss_raw in range(1440, 1460) and s[2] > 0.25:
|
||||||
return "🤖 Masscan Scanner"
|
return "🤖 Masscan Scanner"
|
||||||
if s[4] > 0.70 and s[6] > 0.5:
|
# Bots offensifs agressifs (fuzzing + anomalie + pays risqué)
|
||||||
|
if s[4] > 0.40 and s[6] > 0.3:
|
||||||
return "🤖 Bot agressif"
|
return "🤖 Bot agressif"
|
||||||
if s[16] > 0.80:
|
# Pays à très haut risque (CN, RU, KP) avec trafic anormal
|
||||||
|
if country_risk > 0.75 and (s[4] > 0.10 or asn_cloud > 0.5):
|
||||||
|
return "🌏 Source pays risqué"
|
||||||
|
# Cloud + UA-CH mismatch = crawler/bot cloud
|
||||||
|
if s[16] > 0.50 and asn_cloud > 0.70:
|
||||||
|
return "☁️ Bot cloud UA-CH"
|
||||||
|
# UA-CH mismatch seul
|
||||||
|
if s[16] > 0.60:
|
||||||
return "🤖 UA-CH Mismatch"
|
return "🤖 UA-CH Mismatch"
|
||||||
if s[7] > 0.70:
|
# Headless browser
|
||||||
|
if s[7] > 0.50:
|
||||||
return "🤖 Headless Browser"
|
return "🤖 Headless Browser"
|
||||||
if s[4] > 0.50:
|
# Anomalie ML significative
|
||||||
return "⚠️ Anomalie ML haute"
|
if s[4] > 0.35:
|
||||||
|
return "⚠️ Anomalie ML"
|
||||||
|
# Cloud pur (CDN/crawler légitime ?)
|
||||||
|
if asn_cloud > 0.85 and s[4] < 0.15:
|
||||||
|
return "☁️ Infrastructure cloud"
|
||||||
|
# Pays à risque élevé sans autre signal
|
||||||
|
if country_risk > 0.60:
|
||||||
|
return "🌏 Trafic suspect (pays)"
|
||||||
|
# OS fingerprinting
|
||||||
if s[3] > 0.85 and ttl_raw > 120:
|
if s[3] > 0.85 and ttl_raw > 120:
|
||||||
return "🖥️ Windows"
|
return "🖥️ Windows"
|
||||||
if s[0] > 0.22 and s[0] < 0.28 and mss_raw > 1400:
|
if s[0] > 0.22 and s[0] < 0.28 and mss_raw > 1400:
|
||||||
return "🐧 Linux"
|
return "🐧 Linux"
|
||||||
if s[1] < 0.90 and s[1] > 0.95:
|
|
||||||
return "📡 VPN/Proxy"
|
|
||||||
if mss_raw < 1380 and mss_raw > 0:
|
if mss_raw < 1380 and mss_raw > 0:
|
||||||
return "🌐 Tunnel réseau"
|
return "🌐 Tunnel réseau"
|
||||||
if s[5] > 0.60:
|
if s[5] > 0.40:
|
||||||
return "⚡ Trafic rapide"
|
return "⚡ Trafic rapide"
|
||||||
if s[4] < 0.10 and s[5] < 0.10:
|
if s[4] < 0.08 and s[5] < 0.10 and asn_cloud < 0.30:
|
||||||
return "✅ Trafic sain"
|
return "✅ Trafic sain"
|
||||||
return "📊 Cluster mixte"
|
return "📊 Cluster mixte"
|
||||||
|
|
||||||
|
|
||||||
def risk_score_from_centroid(centroid: np.ndarray) -> float:
|
def risk_score_from_centroid(centroid: np.ndarray) -> float:
|
||||||
"""Score de risque [0,1] agrégé depuis le centroïde."""
|
"""
|
||||||
|
Score de risque [0,1] agrégé depuis le centroïde (espace original [0,1]).
|
||||||
|
Intègre pays et infrastructure cloud.
|
||||||
|
"""
|
||||||
s = centroid
|
s = centroid
|
||||||
|
country_risk = s[21] if len(s) > 21 else 0.0
|
||||||
|
asn_cloud = s[22] if len(s) > 22 else 0.0
|
||||||
|
|
||||||
return float(np.clip(
|
return float(np.clip(
|
||||||
0.40 * s[4] + # score ML
|
0.30 * s[4] + # score ML anomalie (principal)
|
||||||
0.15 * s[6] + # fuzzing
|
0.12 * s[6] + # fuzzing
|
||||||
0.15 * s[16] + # UA-CH mismatch
|
0.12 * s[16] + # UA-CH mismatch
|
||||||
0.10 * s[7] + # headless
|
0.08 * s[7] + # headless
|
||||||
0.10 * s[5] + # vélocité
|
0.08 * s[5] + # vélocité
|
||||||
0.10 * s[9], # IP-ID zéro
|
0.08 * s[9] + # IP-ID zéro
|
||||||
|
0.12 * country_risk + # risque pays source
|
||||||
|
0.10 * asn_cloud, # infrastructure cloud/VPN
|
||||||
0.0, 1.0
|
0.0, 1.0
|
||||||
))
|
))
|
||||||
|
|||||||
Reference in New Issue
Block a user