feat: clustering multi-métriques + TCP fingerprinting amélioré
- TCP fingerprinting: 20 signatures OS (p0f-style), scoring multi-signal
TTL/MSS/scale/fenêtre, détection Masscan 97% confiance, réseau path
(Ethernet/PPPoE/VPN/Tunnel), estimation hop-count
- Clustering IPs: K-means++ (Arthur & Vassilvitskii 2007) sur 21 features
TCP stack + anomalie ML + TLS/protocole + navigateur + temporel
PCA-2D par puissance itérative (Hotelling) pour positionnement
- Visualisation redesign: 2 vues lisibles
- Tableau de bord: grille de cartes groupées par niveau de risque
(Bots / Suspects / Légitimes), métriques clés + mini-barres
- Graphe de relations: ReactFlow avec nœuds-cartes en colonnes
par niveau de menace, arêtes colorées par similarité, légende
- Sidebar: RadarChart comportemental + toutes métriques + export CSV
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
328
backend/services/clustering_engine.py
Normal file
328
backend/services/clustering_engine.py
Normal file
@ -0,0 +1,328 @@
|
||||
"""
|
||||
Moteur de clustering K-means++ multi-métriques (pur Python).
|
||||
|
||||
Ref: Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
|
||||
Hotelling (1933) — PCA par puissance itérative (deflation)
|
||||
|
||||
Features (21 dimensions, normalisées [0,1]) :
|
||||
0 ttl_n : TTL initial normalisé (hops-count estimé)
|
||||
1 mss_n : MSS normalisé → type réseau (Ethernet/PPPoE/VPN)
|
||||
2 scale_n : facteur de mise à l'échelle TCP
|
||||
3 win_n : fenêtre TCP normalisée
|
||||
4 score_n : score anomalie ML (abs)
|
||||
5 velocity_n : vélocité de requêtes (log1p)
|
||||
6 fuzzing_n : index de fuzzing (log1p)
|
||||
7 headless_n : ratio sessions headless
|
||||
8 post_n : ratio POST/total
|
||||
9 ip_id_zero_n : ratio IP-ID=0 (Linux/spoofé)
|
||||
10 entropy_n : entropie temporelle
|
||||
11 browser_n : score navigateur moderne (normalisé max 50)
|
||||
12 alpn_n : mismatch ALPN/protocole
|
||||
13 alpn_absent_n : ratio ALPN absent
|
||||
14 h2_n : efficacité H2 multiplexing (log1p)
|
||||
15 hdr_conf_n : confiance ordre headers
|
||||
16 ua_ch_n : mismatch User-Agent-Client-Hints
|
||||
17 asset_n : ratio assets statiques
|
||||
18 direct_n : ratio accès directs
|
||||
19 ja4_div_n : diversité JA4 (log1p)
|
||||
20 ua_rot_n : UA rotatif (booléen)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import random
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# ─── Définition des features ──────────────────────────────────────────────────
|
||||
|
||||
# Each FEATURES entry is (SQL column key, human-readable label, normaliser).
# Every normaliser accepts a raw value (possibly None) and returns a float
# guaranteed to lie in [0, 1].


def _clamp01(x: float) -> float:
    """Clamp *x* into the closed interval [0, 1]."""
    return min(1.0, max(0.0, x))


def _linear(full_scale: float):
    """Build a normaliser computing ``v / full_scale`` clamped to [0, 1]."""
    def normalise(v) -> float:
        return _clamp01(float(v or 0) / full_scale)
    return normalise


def _log_scaled(full_scale: float):
    """Build a normaliser computing ``log1p(v) / log1p(full_scale)`` in [0, 1]."""
    log_cap = math.log1p(full_scale)

    def normalise(v) -> float:
        # Floor at 0 before the logarithm: log1p raises for inputs <= -1.
        return _clamp01(math.log1p(max(0.0, float(v or 0))) / log_cap)
    return normalise


def _abs_unit(v) -> float:
    """Normalise a signed score to its magnitude, capped at 1."""
    return _clamp01(abs(float(v or 0)))


def _flag(v) -> float:
    """Return 1.0 when *v* is strictly positive, else 0.0."""
    return 1.0 if float(v or 0) > 0 else 0.0


_unit = _linear(1.0)  # value is already a ratio; only clamping is needed

FEATURES = [
    # TCP stack
    ("ttl", "TTL Initial", _linear(255.0)),
    ("mss", "MSS Réseau", _linear(1460.0)),
    ("scale", "Scale TCP", _linear(14.0)),
    ("win", "Fenêtre TCP", _linear(65535.0)),
    # ML anomaly signals (the score is taken in absolute value)
    ("avg_score", "Score Anomalie", _abs_unit),
    ("avg_velocity", "Vélocité (rps)", _log_scaled(100)),
    ("avg_fuzzing", "Fuzzing", _log_scaled(300)),
    ("pct_headless", "Headless", _unit),
    ("avg_post", "Ratio POST", _unit),
    # IP-ID
    ("ip_id_zero", "IP-ID Zéro", _unit),
    # Temporal
    ("entropy", "Entropie Temporelle", _log_scaled(10)),
    # Browser
    ("browser_score", "Score Navigateur", _linear(50.0)),
    # TLS / protocol
    ("alpn_mismatch", "ALPN Mismatch", _unit),
    ("alpn_missing", "ALPN Absent", _unit),
    ("h2_eff", "H2 Multiplexing", _log_scaled(20)),
    ("hdr_conf", "Ordre Headers", _unit),
    ("ua_ch_mismatch", "UA-CH Mismatch", _unit),
    # HTTP behaviour
    ("asset_ratio", "Ratio Assets", _unit),
    ("direct_ratio", "Accès Direct", _unit),
    # JA4 diversity
    ("ja4_count", "Diversité JA4", _log_scaled(30)),
    # Rotating User-Agent (boolean)
    ("ua_rotating", "UA Rotatif", _flag),
]

# Column-wise views of FEATURES, kept in the same order as the table.
FEATURE_KEYS = [key for key, _label, _fn in FEATURES]
FEATURE_NAMES = [label for _key, label, _fn in FEATURES]
FEATURE_NORMS = [fn for _key, _label, fn in FEATURES]
N_FEATURES = len(FEATURES)
|
||||
|
||||
|
||||
# ─── Utilitaires vectoriels (pur Python) ──────────────────────────────────────
|
||||
|
||||
def _dist2(a: list[float], b: list[float]) -> float:
|
||||
return sum((x - y) ** 2 for x, y in zip(a, b))
|
||||
|
||||
def _mean_vec(vecs: list[list[float]]) -> list[float]:
    """Return the component-wise mean of *vecs* over the first N_FEATURES dimensions.

    An empty input yields the zero vector of length N_FEATURES.
    """
    if not vecs:
        return [0.0] * N_FEATURES

    count = len(vecs)
    centre: list[float] = []
    for dim in range(N_FEATURES):
        column_total = sum(vec[dim] for vec in vecs)
        centre.append(column_total / count)
    return centre
|
||||
|
||||
|
||||
# ─── Construction du vecteur de features ─────────────────────────────────────
|
||||
|
||||
def build_feature_vector(row: dict) -> list[float]:
    """Turn one row of SQL columns into its normalised feature vector.

    Each key in FEATURE_KEYS is looked up in *row* (missing keys give None)
    and passed through the matching normaliser from FEATURE_NORMS, so the
    result has one component per feature, in table order.
    """
    vector: list[float] = []
    for column, normalise in zip(FEATURE_KEYS, FEATURE_NORMS):
        vector.append(normalise(row.get(column)))
    return vector
|
||||
|
||||
|
||||
# ─── K-means++ ───────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class KMeansResult:
    """Outcome of a k-means run."""

    # Final cluster centres, one coordinate list per cluster.
    centroids: list[list[float]]
    # Cluster index assigned to each input point, in input order.
    labels: list[int]
    # Sum of squared distances from each point to its assigned centroid.
    inertia: float
    # Number of assignment/update iterations that were started.
    n_iter: int
|
||||
|
||||
|
||||
def _kmeans_pp_seed(points: list[list[float]], k: int, rng: random.Random) -> list[list[float]]:
    """Pick up to *k* initial centroids with k-means++ (D²-weighted) sampling."""
    centroids = [points[rng.randrange(len(points))][:]]
    for _ in range(k - 1):
        # Squared distance from every point to its nearest chosen centroid.
        d2 = [min(_dist2(p, c) for c in centroids) for p in points]
        total = sum(d2)
        if total == 0:
            # Every point coincides with a chosen centroid: no distinct seed is left.
            break
        threshold = rng.random() * total
        cumulative = 0.0
        for idx, d in enumerate(d2):
            cumulative += d
            if cumulative >= threshold:
                centroids.append(points[idx][:])
                break
        else:
            # Rounding kept the running sum below the threshold: draw uniformly.
            centroids.append(points[rng.randrange(len(points))][:])
    return centroids


def kmeans_pp(
    points: list[list[float]],
    k: int,
    max_iter: int = 60,
    seed: int = 42,
    n_init: int = 3,
) -> KMeansResult:
    """Cluster *points* with k-means using k-means++ seeding.

    The algorithm is restarted ``n_init`` times (at least once) from a single
    random stream seeded with *seed*, and the run with the lowest inertia is
    returned.

    Args:
        points: Feature vectors to cluster.
        k: Requested number of clusters; fewer centroids are produced when the
            data holds fewer distinct points.
        max_iter: Maximum number of assignment/update iterations per restart.
        seed: Seed of the pseudo-random generator, making the result reproducible.
        n_init: Number of restarts; values below 1 are treated as 1.

    Returns:
        The best KMeansResult. For empty *points* an empty result is returned
        (no centroids, no labels, zero inertia, zero iterations).

    Raises:
        ValueError: If *k* is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be >= 1")
    if not points:
        # Nothing to cluster: return an empty result instead of crashing in randrange.
        return KMeansResult(centroids=[], labels=[], inertia=0.0, n_iter=0)

    rng = random.Random(seed)
    best: KMeansResult | None = None

    for _attempt in range(max(1, n_init)):
        centroids = _kmeans_pp_seed(points, k, rng)

        labels: list[int] = [0] * len(points)
        n_iter = 0  # stays 0 when max_iter <= 0
        for iteration in range(max_iter):
            n_iter = iteration + 1

            # Assignment step: each point joins its nearest centroid.
            new_labels = [
                min(range(len(centroids)), key=lambda c: _dist2(p, centroids[c]))
                for p in points
            ]
            # Converged once an assignment repeats; the first pass never counts,
            # because the initial all-zero labels are only a placeholder.
            if iteration > 0 and new_labels == labels:
                break
            labels = new_labels

            # Update step: move each non-empty cluster's centroid to its mean.
            members: list[list[list[float]]] = [[] for _ in centroids]
            for point, label in zip(points, labels):
                members[label].append(point)
            for j, cluster in enumerate(members):
                if cluster:
                    centroids[j] = _mean_vec(cluster)

        inertia = sum(_dist2(p, centroids[label]) for p, label in zip(points, labels))
        if best is None or inertia < best.inertia:
            best = KMeansResult(
                centroids=centroids,
                labels=labels,
                inertia=inertia,
                n_iter=n_iter,
            )

    return best  # type: ignore[return-value]  # at least one restart always runs
|
||||
|
||||
|
||||
# ─── PCA 2D par puissance itérative ──────────────────────────────────────────
|
||||
|
||||
def pca_2d(points: list[list[float]]) -> list[tuple[float, float]]:
    """Project *points* onto their first two principal axes.

    The axes are found by power iteration on the centred data, with deflation
    between the first and the second axis. Each output coordinate is min-max
    rescaled to [0, 1] independently. An empty input gives an empty list.
    """
    if len(points) == 0:
        return []

    dims = range(N_FEATURES)

    # Centre the cloud on its mean vector.
    centre = _mean_vec(points)
    centred = [[pt[d] - centre[d] for d in dims] for pt in points]

    def project(rows: list[list[float]], axis: list[float]) -> list[float]:
        """Scalar projection of every row onto *axis*."""
        return [sum(r[d] * axis[d] for d in dims) for r in rows]

    def leading_axis(rows: list[list[float]], sweeps: int = 30) -> list[float]:
        """Approximate the dominant eigenvector of rowsᵀ·rows by power iteration."""
        axis = [1.0 / math.sqrt(N_FEATURES)] * N_FEATURES
        for _ in range(sweeps):
            scores = project(rows, axis)  # rows @ axis
            # rowsᵀ @ scores
            raw = [sum(rows[i][d] * scores[i] for i in range(len(rows))) for d in dims]
            length = math.sqrt(sum(x ** 2 for x in raw)) or 1e-10
            axis = [x / length for x in raw]
        return axis

    def rescale(values: list[float]) -> list[float]:
        """Min-max rescale; a constant series maps to zeros."""
        low = min(values)
        span = max(values) - low or 1e-10
        return [(v - low) / span for v in values]

    # First principal component.
    axis1 = leading_axis(centred)
    coords1 = project(centred, axis1)

    # Deflation: subtract each row's component along the first axis.
    residual = [
        [centred[i][d] - coords1[i] * axis1[d] for d in dims]
        for i in range(len(points))
    ]

    # Second principal component, computed on the deflated data.
    axis2 = leading_axis(residual)
    coords2 = project(residual, axis2)

    return list(zip(rescale(coords1), rescale(coords2)))
|
||||
|
||||
|
||||
# ─── Nommage automatique des clusters ────────────────────────────────────────
|
||||
|
||||
def name_cluster(centroid: list[float], raw_stats: dict | None = None) -> str:
    """Derive a readable cluster label from a normalised centroid.

    Rules are evaluated from the most to the least discriminating signal and
    the first match wins. *raw_stats* may carry un-normalised means under the
    keys ``mean_mss``, ``mean_ttl`` and ``mean_scale``; when ``mean_ttl`` is
    missing or falsy it is rebuilt from the centroid as ``centroid[0] * 255``.
    """
    anomaly = centroid[4]
    velocity = centroid[5]
    fuzzing = centroid[6]
    headless = centroid[7]
    post_ratio = centroid[8]
    alpn_mismatch = centroid[12]
    h2_eff = centroid[14]
    ua_ch_mismatch = centroid[16]
    ja4_diversity = centroid[19]
    ua_rotating = centroid[20]

    stats = raw_stats or {}
    mean_mss = stats.get("mean_mss", 0)
    mean_ttl = stats.get("mean_ttl", 0) or (centroid[0] * 255)
    mean_scale = stats.get("mean_scale", 0)

    # ── Strong, deterministic signals ──────────────────────────────────────

    # Masscan-like pattern: MSS around 1452, scale around 4, TTL below 60.
    masscan_mss = bool(mean_mss) and 1440 <= mean_mss <= 1460
    masscan_scale = bool(mean_scale) and 3 <= mean_scale <= 5
    if masscan_mss and masscan_scale and mean_ttl < 60:
        return "🤖 Masscan / Scanner IP"

    # Aggressive fuzzer.
    # NOTE(review): the original comment equated 0.35 with a raw fuzzing index
    # above 100; under the log1p(x) / log1p(300) normalisation defined in
    # FEATURES, 0.35 corresponds to a raw value of about 6.4 — confirm the
    # intended threshold.
    if fuzzing > 0.35:
        return "🤖 Bot Fuzzer / Scanner"

    # Rotating User-Agent combined with a Client-Hints mismatch.
    if ua_rotating > 0.5 and ua_ch_mismatch > 0.7:
        return "🤖 Bot UA Rotatif + CH Mismatch"

    # Strong Client-Hints mismatch on its own.
    if ua_ch_mismatch > 0.8:
        return "⚠️ Bot UA-CH Incohérent"

    # ── Moderate ML score refined by a behavioural signal ──────────────────
    if anomaly > 0.20:
        behavioural_rules = (
            (headless > 0.3, "⚠️ Navigateur Headless Suspect"),
            (velocity > 0.25, "⚠️ Bot Haute Vélocité"),
            (post_ratio > 0.4, "⚠️ Bot POST Automatisé"),
            (alpn_mismatch > 0.5 or h2_eff > 0.5, "⚠️ TLS/H2 Anormal"),
            (ua_ch_mismatch > 0.4, "⚠️ Anomalie UA-CH"),
        )
        for triggered, label in behavioural_rules:
            if triggered:
                return label
        return "⚠️ Anomalie ML Modérée"

    # ── Weak signals ───────────────────────────────────────────────────────
    if ua_ch_mismatch > 0.4:
        return "🔎 UA-CH Incohérent"
    if ja4_diversity > 0.5:
        return "🔄 Client Multi-Fingerprint"

    # ── Network / OS classification ────────────────────────────────────────

    # A low MSS points to a VPN or tunnel.
    if mean_mss and mean_mss < 1360:
        return "🌐 VPN / Tunnel"
    if mean_ttl < 70:
        return "🐧 Linux / Mobile"
    if mean_ttl > 110:
        return "🪟 Windows"
    return "✅ Trafic Légitime"
|
||||
|
||||
|
||||
def risk_score_from_centroid(centroid: list[float]) -> float:
    """Return a weighted risk score for a normalised centroid, capped at 1.0.

    The ML anomaly component is first stretched by dividing it by 0.5 (capped
    at 1.0) so that scores in the 0–0.5 range span the full 0–1 range; the
    other components are used as they are.
    """
    stretched_anomaly = min(1.0, centroid[4] / 0.5)

    # (weight, signal) pairs, accumulated in this exact order.
    weighted_signals = (
        (0.25, stretched_anomaly),  # ML anomaly score
        (0.20, centroid[16]),       # UA-CH mismatch
        (0.15, centroid[6]),        # fuzzing
        (0.12, centroid[20]),       # rotating User-Agent
        (0.10, centroid[7]),        # headless ratio
        (0.07, centroid[5]),        # request velocity
        (0.04, centroid[9]),        # IP-ID zero ratio
        (0.04, centroid[12]),       # ALPN mismatch
        (0.03, centroid[19]),       # JA4 diversity
        (0.03, centroid[8]),        # POST ratio
    )

    total = 0.0
    for weight, signal in weighted_signals:
        total += weight * signal
    return min(1.0, total)
|
||||
436
backend/services/tcp_fingerprint.py
Normal file
436
backend/services/tcp_fingerprint.py
Normal file
@ -0,0 +1,436 @@
|
||||
"""
|
||||
Service de fingerprinting OS par signature TCP — approche multi-signal inspirée de p0f.
|
||||
|
||||
Signaux utilisés (par ordre de poids) :
|
||||
1. TTL initial estimé (→ famille OS : Linux/Mac=64, Windows=128, Cisco/BSD=255)
|
||||
2. MSS (→ type de réseau : Ethernet=1460, PPPoE=1452, VPN=1380-1420)
|
||||
3. Taille de fenêtre (→ signature OS précise)
|
||||
4. Facteur d'échelle (→ affine la version du kernel/stack TCP)
|
||||
|
||||
Références :
|
||||
- p0f v3 (Michal Zalewski) — passive OS fingerprinting
|
||||
- Nmap OS detection (Gordon Lyon)
|
||||
- "OS Fingerprinting Revisited" (Beverly, 2004)
|
||||
- "Passive OS fingerprinting" (Orebaugh, Ramirez)
|
||||
- Recherche sur Masscan/ZMap : signatures SYN craftées connues
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ─── Constantes ───────────────────────────────────────────────────────────────
|
||||
|
||||
_INITIAL_TTLS = (64, 128, 255)
|
||||
|
||||
# MSS → type de chemin réseau (MTU - 40 octets d'en-têtes IP+TCP)
|
||||
_MSS_PATH: list[tuple[range, str]] = [
|
||||
(range(1461, 9001), "Ethernet/Jumbo"), # jumbo frames (CDN/datacenter)
|
||||
(range(1460, 1461), "Ethernet directe"), # MTU 1500 standard
|
||||
(range(1453, 1460), "Ethernet directe"), # légèrement réduit (padding)
|
||||
(range(1452, 1453), "PPPoE/DSL"), # MTU 1492
|
||||
(range(1436, 1452), "PPPoE/DSL ajusté"), # variations DSL
|
||||
(range(1420, 1436), "VPN léger"), # WireGuard / IPsec transport
|
||||
(range(1380, 1420), "VPN/Tunnel"), # OpenVPN / L2TP
|
||||
(range(1300, 1380), "VPN double ou mobile"),
|
||||
(range(0, 1300), "Lien bas débit / GPRS"),
|
||||
]
|
||||
|
||||
|
||||
# ─── Base de signatures OS ─────────────────────────────────────────────────────
|
||||
#
|
||||
# Format : chaque entrée est un dict avec :
|
||||
# ttl : int — TTL initial attendu (64 | 128 | 255)
|
||||
# win : set[int]|None — tailles de fenêtre attendues (None = ignorer)
|
||||
# scale : set[int]|None — facteurs d'échelle attendus (None = ignorer)
|
||||
# mss : set[int]|None — valeurs MSS attendues (None = ignorer)
|
||||
# name : str — libellé affiché
|
||||
# conf : float — poids de confiance de base (0–1)
|
||||
# bot : bool — outil de scan/bot connu
|
||||
|
||||
_SIGNATURES: list[dict] = [
|
||||
# ══════════════════════════════════════════════════════
|
||||
# OUTILS DE SCAN ET BOTS CONNUS (priorité maximale)
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# Masscan / scanner personnalisé avec stack Linux modifiée (PPPoE MSS=1452)
|
||||
# Pattern très présent dans les données : ~111k requêtes, UA spoofé macOS/Windows
|
||||
{
|
||||
"ttl": 64, "win": {5808}, "scale": {4}, "mss": {1452},
|
||||
"name": "Bot-Scanner/Masscan", "conf": 0.97, "bot": True,
|
||||
},
|
||||
# Masscan TTL=255 (mode direct, pas de hop)
|
||||
{
|
||||
"ttl": 255, "win": {1024}, "scale": {0}, "mss": None,
|
||||
"name": "Bot-ZMap/Masscan", "conf": 0.96, "bot": True,
|
||||
},
|
||||
# Mirai variant (petite fenêtre, pas de scale, TTL Linux)
|
||||
{
|
||||
"ttl": 64, "win": {1024, 2048}, "scale": {0}, "mss": {1460},
|
||||
"name": "Bot-Mirai", "conf": 0.92, "bot": True,
|
||||
},
|
||||
# Mirai variant (petite fenêtre Windows)
|
||||
{
|
||||
"ttl": 128, "win": {1024, 2048}, "scale": {0}, "mss": {1460},
|
||||
"name": "Bot-Mirai/Win", "conf": 0.92, "bot": True,
|
||||
},
|
||||
# Scapy / forge manuelle (fenêtre 8192 exactement + TTL 64 + pas de scale)
|
||||
{
|
||||
"ttl": 64, "win": {8192}, "scale": {0}, "mss": {1460},
|
||||
"name": "Bot-Scapy/Forge", "conf": 0.85, "bot": True,
|
||||
},
|
||||
# Nmap SYN scan (window=1024, MSS=1460, TTL=64 ou 128)
|
||||
{
|
||||
"ttl": 64, "win": {1}, "scale": None, "mss": None,
|
||||
"name": "Bot-ZMap", "conf": 0.95, "bot": True,
|
||||
},
|
||||
|
||||
# ══════════════════════════════════════════════════════
|
||||
# WINDOWS
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# Windows 10 / 11 — signature standard (LAN direct)
|
||||
{
|
||||
"ttl": 128, "win": {64240}, "scale": {8}, "mss": {1460},
|
||||
"name": "Windows 10/11", "conf": 0.93, "bot": False,
|
||||
},
|
||||
# Windows 10/11 — derrière VPN/proxy (MSS réduit)
|
||||
{
|
||||
"ttl": 128, "win": {64240}, "scale": {8}, "mss": {1380, 1400, 1412, 1420, 1440},
|
||||
"name": "Windows 10/11 (VPN)", "conf": 0.90, "bot": False,
|
||||
},
|
||||
# Windows Server 2019/2022 — grande fenêtre
|
||||
{
|
||||
"ttl": 128, "win": {65535, 131072}, "scale": {8, 9}, "mss": {1460},
|
||||
"name": "Windows Server", "conf": 0.88, "bot": False,
|
||||
},
|
||||
# Windows 7/8.1
|
||||
{
|
||||
"ttl": 128, "win": {8192, 65535}, "scale": {4, 8}, "mss": {1460},
|
||||
"name": "Windows 7/8", "conf": 0.83, "bot": False,
|
||||
},
|
||||
# Windows générique (TTL=128, scale=8, tout MSS)
|
||||
{
|
||||
"ttl": 128, "win": None, "scale": {8}, "mss": None,
|
||||
"name": "Windows", "conf": 0.70, "bot": False,
|
||||
},
|
||||
|
||||
# ══════════════════════════════════════════════════════
|
||||
# ANDROID (stack BBRv2 / CUBIC moderne)
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# Android 10+ — scale=9 ou 10, grande fenêtre (BBRv2)
|
||||
{
|
||||
"ttl": 64, "win": {65535, 131072, 42340, 35844}, "scale": {9, 10}, "mss": {1460},
|
||||
"name": "Android 10+", "conf": 0.82, "bot": False,
|
||||
},
|
||||
# Android via proxy TTL=128 (app Facebook, TikTok etc. passant par infra)
|
||||
{
|
||||
"ttl": 128, "win": {62727, 65535}, "scale": {7}, "mss": {1460},
|
||||
"name": "Android/App (proxy)", "conf": 0.75, "bot": False,
|
||||
},
|
||||
# Android derrière VPN (MSS réduit)
|
||||
{
|
||||
"ttl": 64, "win": {65535, 59640, 63940}, "scale": {8, 9, 10}, "mss": {1380, 1390, 1400, 1418, 1420},
|
||||
"name": "Android (VPN/mobile)", "conf": 0.78, "bot": False,
|
||||
},
|
||||
|
||||
# ══════════════════════════════════════════════════════
|
||||
# iOS / macOS
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# iOS 14+ / macOS Monterey+ — scale=6, win=65535 (signature XNU)
|
||||
{
|
||||
"ttl": 64, "win": {65535, 32768}, "scale": {6}, "mss": {1460},
|
||||
"name": "iOS/macOS", "conf": 0.87, "bot": False,
|
||||
},
|
||||
# macOS Sonoma+ / iOS 17+ (scale=9, fenêtre plus grande)
|
||||
{
|
||||
"ttl": 64, "win": {65535, 32768}, "scale": {9}, "mss": {1460},
|
||||
"name": "macOS Sonoma+/iOS 17+", "conf": 0.83, "bot": False,
|
||||
},
|
||||
# macOS derrière VPN (MSS réduit)
|
||||
{
|
||||
"ttl": 64, "win": {65535}, "scale": {6, 9}, "mss": {1380, 1400, 1412, 1436},
|
||||
"name": "iOS/macOS (VPN)", "conf": 0.80, "bot": False,
|
||||
},
|
||||
|
||||
# ══════════════════════════════════════════════════════
|
||||
# LINUX (desktop/serveur)
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# Linux 5.x+ — scale=7, win=64240 ou 65535 (kernel ≥ 4.19)
|
||||
{
|
||||
"ttl": 64, "win": {64240, 65320}, "scale": {7}, "mss": {1460},
|
||||
"name": "Linux 5.x+", "conf": 0.86, "bot": False,
|
||||
},
|
||||
# Linux 4.x / ChromeOS
|
||||
{
|
||||
"ttl": 64, "win": {29200, 65535, 43690, 32120}, "scale": {7}, "mss": {1460},
|
||||
"name": "Linux 4.x/ChromeOS", "conf": 0.83, "bot": False,
|
||||
},
|
||||
# Linux derrière VPN (MSS réduit)
|
||||
{
|
||||
"ttl": 64, "win": {64240, 65535, 42600}, "scale": {7}, "mss": {1380, 1400, 1420, 1436},
|
||||
"name": "Linux (VPN)", "conf": 0.80, "bot": False,
|
||||
},
|
||||
# Linux 2.6.x (ancien — win=5840/14600)
|
||||
{
|
||||
"ttl": 64, "win": {5840, 14600, 16384}, "scale": {4, 5}, "mss": {1460},
|
||||
"name": "Linux 2.6", "conf": 0.78, "bot": False,
|
||||
},
|
||||
|
||||
# ══════════════════════════════════════════════════════
|
||||
# BSD / ÉQUIPEMENTS RÉSEAU / CDN
|
||||
# ══════════════════════════════════════════════════════
|
||||
|
||||
# FreeBSD / OpenBSD (initial TTL=64)
|
||||
{
|
||||
"ttl": 64, "win": {65535}, "scale": {6}, "mss": {512, 1460},
|
||||
"name": "FreeBSD/OpenBSD", "conf": 0.74, "bot": False,
|
||||
},
|
||||
# Cisco IOS / équipements réseau (initial TTL=255, fenêtre petite)
|
||||
{
|
||||
"ttl": 255, "win": {4096, 4128, 8760}, "scale": {0, 1, 2}, "mss": {512, 1460},
|
||||
"name": "Cisco/Réseau", "conf": 0.87, "bot": False,
|
||||
},
|
||||
# CDN / Applebot (TTL=255, jumbo MSS, fenêtre élevée)
|
||||
{
|
||||
"ttl": 255, "win": {26883, 65535, 59640}, "scale": {7, 8}, "mss": {8373, 8365, 1460},
|
||||
"name": "CDN/Applebot (jumbo)", "conf": 0.85, "bot": False,
|
||||
},
|
||||
# BSD/Unix générique (TTL=255)
|
||||
{
|
||||
"ttl": 255, "win": None, "scale": {6, 7, 8}, "mss": {1460},
|
||||
"name": "BSD/Unix", "conf": 0.68, "bot": False,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ─── Data classes ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class OsFingerprint:
    """Result of a TCP-based OS fingerprint."""

    # Label of the matched signature, a TTL-only fallback label, or "Unknown".
    os_name: str
    # Estimated initial TTL (0 when the observed TTL is not positive).
    initial_ttl: int
    # Estimated number of hops (-1 when the observed TTL is not positive).
    hop_count: int
    # Confidence of the match, between 0 and 1.
    confidence: float
    # True when the matched signature belongs to a known scanning tool / bot.
    is_bot_tool: bool
    # Probable network path inferred from the MSS.
    network_path: str
|
||||
|
||||
|
||||
@dataclass
class SpoofResult:
    """Verdict of the TCP-versus-User-Agent consistency check."""

    # True when the TCP fingerprint contradicts the declared OS, or a scanning tool matched.
    is_spoof: bool
    # True when the verdict comes from a known scanning-tool signature.
    is_bot_tool: bool
    # Short human-readable explanation of the verdict.
    reason: str
|
||||
|
||||
|
||||
# ─── Fonctions utilitaires ─────────────────────────────────────────────────────
|
||||
|
||||
def _estimate_initial_ttl(observed_ttl: int) -> tuple[int, int]:
|
||||
"""Retourne (initial_ttl, hop_count).
|
||||
Cherche le TTL standard le plus bas >= observed_ttl.
|
||||
Rejette les hop counts > 45 (réseau légitimement long = peu probable).
|
||||
"""
|
||||
if observed_ttl <= 0:
|
||||
return 0, -1
|
||||
for initial in _INITIAL_TTLS:
|
||||
if observed_ttl <= initial:
|
||||
hop = initial - observed_ttl
|
||||
if hop <= 45:
|
||||
return initial, hop
|
||||
return 255, 255 - observed_ttl # TTL > 255 impossible, fallback
|
||||
|
||||
|
||||
def _infer_network_path(mss: int) -> str:
|
||||
"""Retourne le type de chemin réseau probable à partir du MSS."""
|
||||
if mss <= 0:
|
||||
return "Inconnu"
|
||||
for rng, label in _MSS_PATH:
|
||||
if mss in rng:
|
||||
return label
|
||||
return "Inconnu"
|
||||
|
||||
|
||||
def _os_family(os_name: str) -> str:
|
||||
"""Réduit un nom OS détaillé à sa famille pour comparaison avec l'UA."""
|
||||
n = os_name.lower()
|
||||
if "windows" in n:
|
||||
return "Windows"
|
||||
if "android" in n:
|
||||
return "Android"
|
||||
if "ios" in n or "macos" in n or "iphone" in n or "ipad" in n:
|
||||
return "Apple"
|
||||
if "linux" in n or "chromeos" in n:
|
||||
return "Linux"
|
||||
if "bsd" in n or "cisco" in n or "cdn" in n or "réseau" in n:
|
||||
return "Network"
|
||||
if "bot" in n or "scanner" in n or "mirai" in n or "zmap" in n:
|
||||
return "Bot"
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def _ua_os_family(declared_os: str) -> str:
|
||||
"""Réduit l'OS déclaré (UA) à sa famille."""
|
||||
mapping = {
|
||||
"Windows": "Windows",
|
||||
"Android": "Android",
|
||||
"iOS": "Apple",
|
||||
"macOS": "Apple",
|
||||
"Linux": "Linux",
|
||||
"ChromeOS": "Linux",
|
||||
"BSD": "Network",
|
||||
}
|
||||
return mapping.get(declared_os, "Unknown")
|
||||
|
||||
|
||||
# ─── Fonctions publiques ───────────────────────────────────────────────────────
|
||||
|
||||
def declared_os_from_ua(ua: str) -> str:
    """Infer the operating system declared by a User-Agent string.

    Matching is case-insensitive and the first rule that hits wins, in this
    order: Windows, Android, iOS, macOS, ChromeOS, Linux, BSD.

    Args:
        ua: Raw User-Agent header; None or an empty string is accepted.

    Returns:
        One of "Windows", "Android", "iOS", "macOS", "ChromeOS", "Linux",
        "BSD" or "Unknown".
    """
    import re  # function-scope import keeps this block self-contained

    lowered = (ua or "").lower()
    if not lowered:
        return "Unknown"
    if "windows nt" in lowered:
        return "Windows"
    # Android is tested before Linux because Android agents also contain "linux".
    if "android" in lowered:
        return "Android"
    # iOS is tested before macOS because iOS agents also contain "like mac os x".
    if "iphone" in lowered or "ipad" in lowered:
        return "iOS"
    if "mac os x" in lowered or "macos" in lowered:
        return "macOS"
    # "cros" must be a whole token: as a bare substring it also matches
    # "microsoft", which misclassified Microsoft agents as ChromeOS.
    if "chromeos" in lowered or re.search(r"\bcros\b", lowered):
        return "ChromeOS"
    if "linux" in lowered:
        return "Linux"
    if any(bsd in lowered for bsd in ("freebsd", "openbsd", "netbsd")):
        return "BSD"
    return "Unknown"
|
||||
|
||||
|
||||
def fingerprint_os(ttl: int, win: int, scale: int, mss: int) -> OsFingerprint:
    """Fingerprint the OS from TCP parameters with a weighted multi-signal score.

    Only signatures whose initial TTL equals the estimated one are scored.
    Each starts at 0.40 for the TTL match, then every field the signature
    constrains adds a bonus on a match or a smaller penalty on a mismatch:

    - MSS     +0.30 / -0.12
    - Window  +0.20 / -0.08
    - Scale   +0.10 / -0.04

    A hop count above 30 costs a further 0.05. The best-scoring signature is
    kept (the earliest one on ties) when its score reaches 0.38, and the
    confidence is that score multiplied by the signature's own ``conf``.
    Otherwise a low-confidence label based on the TTL alone is returned.
    """
    initial_ttl, hop_count = _estimate_initial_ttl(ttl)
    network_path = _infer_network_path(mss)

    # A non-positive observed TTL cannot be classified at all.
    if initial_ttl == 0:
        return OsFingerprint(
            os_name="Unknown", initial_ttl=0, hop_count=-1,
            confidence=0.0, is_bot_tool=False, network_path=network_path,
        )

    # (signature key, observed value, bonus on match, penalty on mismatch)
    observations = (
        ("mss", mss, 0.30, -0.12),
        ("win", win, 0.20, -0.08),
        ("scale", scale, 0.10, -0.04),
    )

    def rate(signature: dict) -> float:
        """Score one TTL-compatible signature against the observed values."""
        total = 0.40  # base score for the TTL match
        for key, value, bonus, penalty in observations:
            expected = signature[key]
            if expected is not None:  # None means the signature ignores this field
                total += bonus if value in expected else penalty
        if hop_count > 30:
            total -= 0.05  # unusually long path
        return total

    top_score = -1.0
    top_sig = None
    for candidate in _SIGNATURES:
        # The TTL acts as a strict filter.
        if candidate["ttl"] != initial_ttl:
            continue
        candidate_score = rate(candidate)
        if candidate_score > top_score:
            top_score, top_sig = candidate_score, candidate

    if top_sig and top_score >= 0.38:
        # Final confidence blends the match score with the signature's own weight.
        blended = top_score * top_sig["conf"]
        return OsFingerprint(
            os_name=top_sig["name"],
            initial_ttl=initial_ttl,
            hop_count=hop_count,
            confidence=round(min(max(blended, 0.0), 1.0), 2),
            is_bot_tool=top_sig["bot"],
            network_path=network_path,
        )

    # Fallback: classification from the TTL alone, with minimal confidence.
    ttl_only_label = {64: "Linux/macOS", 128: "Windows", 255: "Cisco/BSD"}
    return OsFingerprint(
        os_name=ttl_only_label.get(initial_ttl, "Unknown"),
        initial_ttl=initial_ttl,
        hop_count=hop_count,
        confidence=round(0.40 * 0.65, 2),
        is_bot_tool=False,
        network_path=network_path,
    )
|
||||
|
||||
|
||||
def detect_spoof(fp: OsFingerprint, declared_os: str) -> SpoofResult:
    """Detect inconsistencies between the TCP fingerprint and the UA-declared OS.

    Rules, applied in order:
      1. A known scanning tool is flagged as spoof/bot whatever the UA says.
      2. Confidence below 0.50, or an unknown OS on either side, is inconclusive.
      3. A network-equipment TCP stack with a UA from another known family is a spoof.
      4. An OS family that stays unknown after reduction is inconclusive.
      5. Differing OS families are a spoof; matching families are consistent.
    """
    def verdict(spoof: bool, reason: str, bot: bool = False) -> SpoofResult:
        return SpoofResult(is_spoof=spoof, is_bot_tool=bot, reason=reason)

    if fp.is_bot_tool:
        return verdict(True, f"Outil de scan détecté ({fp.os_name})", bot=True)

    inconclusive = (
        fp.confidence < 0.50
        or fp.os_name == "Unknown"
        or declared_os == "Unknown"
    )
    if inconclusive:
        return verdict(False, "Corrélation insuffisante")

    tcp_family = _os_family(fp.os_name)
    ua_family = _ua_os_family(declared_os)

    # A network-equipment stack is suspicious when the UA claims another known family.
    if tcp_family == "Network" and ua_family not in ("Network", "Unknown"):
        return verdict(True, f"Équipement réseau/CDN (TCP) vs {declared_os} (UA)")

    if "Unknown" in (tcp_family, ua_family):
        return verdict(False, "OS indéterminé")

    # Android app relayed through a proxy infrastructure: noted, not flagged.
    via_proxy = (
        declared_os == "Android"
        and tcp_family == "Windows"
        and "proxy" in fp.os_name.lower()
    )
    if via_proxy:
        return verdict(False, "App mobile via proxy infra")

    if tcp_family != ua_family:
        return verdict(True, f"TCP→{tcp_family} vs UA→{ua_family}")

    return verdict(False, "Cohérent")
|
||||
Reference in New Issue
Block a user