feat: clustering multi-métriques + TCP fingerprinting amélioré

- TCP fingerprinting: 20 signatures OS (p0f-style), scoring multi-signal
  TTL/MSS/scale/fenêtre, détection Masscan 97% confiance, réseau path
  (Ethernet/PPPoE/VPN/Tunnel), estimation hop-count

- Clustering IPs: K-means++ (Arthur & Vassilvitskii 2007) sur 21 features
  TCP stack + anomalie ML + TLS/protocole + navigateur + temporel
  PCA-2D par puissance itérative (Hotelling) pour positionnement

- Visualisation redesign: 2 vues lisibles
  - Tableau de bord: grille de cartes groupées par niveau de risque
    (Bots / Suspects / Légitimes), métriques clés + mini-barres
  - Graphe de relations: ReactFlow avec nœuds-cartes en colonnes
    par niveau de menace, arêtes colorées par similarité, légende
  - Sidebar: RadarChart comportemental + toutes métriques + export CSV

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SOC Analyst
2026-03-18 18:22:57 +01:00
parent c887846af5
commit e2db8ca84e
9 changed files with 2430 additions and 202 deletions

View File

@ -1,130 +1,95 @@
"""
Endpoints pour la détection du TCP spoofing (TTL / window size anormaux)
Endpoints pour la détection du TCP spoofing / fingerprinting OS
Règle de corrélation :
- TTL=0 ou tcp_window_size=0 → données TCP absentes (proxy/LB) → pas de corrélation possible
- TTL 55-65 → fingerprint Linux/Mac (initial TTL 64)
- TTL 120-135 → fingerprint Windows (initial TTL 128)
- TTL 110-120 → fingerprint Windows (initial TTL 128, quelques sauts)
- Toute autre valeur → OS indéterminé → pas de flag spoofing
- spoof_flag = True UNIQUEMENT si OS fingerprinting TCP possible ET incompatible avec l'UA
Approche multi-signal (p0f-style) :
- TTL initial estimé → famille OS (Linux/Mac=64, Windows=128, Cisco/BSD=255)
- MSS → type de réseau (Ethernet=1460, PPPoE=1452, VPN=1380-1420)
- Taille de fenêtre → signature OS précise
- Facteur d'échelle affine la version kernel/stack TCP
Détection bots : signatures connues (Masscan/ZMap/Mirai) identifiées par combinaison
win+scale+mss indépendamment de l'UA.
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..services.tcp_fingerprint import (
fingerprint_os,
detect_spoof,
declared_os_from_ua,
)
router = APIRouter(prefix="/api/tcp-spoofing", tags=["tcp_spoofing"])
# Plages TTL qui permettent une corrélation fiable
_TTL_LINUX = (range(52, 66), "Linux/Mac") # initial 64, 1-12 sauts
_TTL_WINDOWS = (range(110, 136), "Windows") # initial 128, 1-18 sauts
_TTL_CISCO = (range(240, 256), "Cisco/BSD") # initial 255
def _suspected_os(ttl: int) -> str:
"""Retourne l'OS probable à partir du TTL observé.
Retourne 'Unknown' si le TTL ne permet pas une corrélation fiable
(TTL=0 = pas de données TCP, ou hors plage connue).
"""
if ttl <= 0:
return "Unknown" # Pas de données TCP (proxy/CDN)
for rng, name in (_TTL_LINUX, _TTL_WINDOWS, _TTL_CISCO):
if ttl in rng:
return name
return "Unknown"
def _declared_os(ua: str) -> str:
ua = ua or ""
if "Windows" in ua:
return "Windows"
if "Mac OS X" in ua:
return "macOS"
if "Linux" in ua or "Android" in ua:
return "Linux/Android"
return "Unknown"
def _is_spoof(suspected_os: str, declared_os: str) -> bool:
"""Spoof confirmé uniquement si on a un fingerprint TCP fiable ET une incompatibilité d'OS."""
if suspected_os == "Unknown" or declared_os == "Unknown":
return False # Pas de corrélation possible
# Linux/Mac fingerprint TCP mais UA déclare Windows
if suspected_os == "Linux/Mac" and declared_os == "Windows":
return True
# Windows fingerprint TCP mais UA déclare Linux/Android ou macOS
if suspected_os == "Windows" and declared_os in ("Linux/Android", "macOS"):
return True
return False
@router.get("/overview")
async def get_tcp_spoofing_overview():
"""Statistiques globales : seules les entrées avec données TCP valides sont analysées."""
"""Statistiques globales avec fingerprinting multi-signal (TTL + MSS + fenêtre + scale)."""
try:
sql = """
SELECT
count() AS total_entries,
uniq(src_ip) AS unique_ips,
countIf(tcp_ttl = 0) AS no_tcp_data,
countIf(tcp_ttl > 0) AS with_tcp_data,
countIf(tcp_ttl BETWEEN 52 AND 65) AS linux_fingerprint,
countIf(tcp_ttl BETWEEN 110 AND 135) AS windows_fingerprint
FROM mabase_prod.view_tcp_spoofing_detected
count() AS total_entries,
uniq(src_ip) AS unique_ips,
countIf(tcp_ttl_raw = 0) AS no_tcp_data,
countIf(tcp_ttl_raw > 0) AS with_tcp_data,
countIf(tcp_ttl_raw > 0 AND tcp_ttl_raw <= 64) AS linux_mac_fp,
countIf(tcp_ttl_raw > 64 AND tcp_ttl_raw <= 128) AS windows_fp,
countIf(tcp_ttl_raw > 128) AS cisco_bsd_fp,
countIf(tcp_win_raw = 5808 AND tcp_mss_raw = 1452 AND tcp_scale_raw = 4) AS bot_scanner_fp
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
"""
result = db.query(sql)
row = result.result_rows[0]
total_entries = int(row[0])
unique_ips = int(row[1])
no_tcp_data = int(row[2])
with_tcp_data = int(row[3])
linux_fp = int(row[4])
windows_fp = int(row[5])
# Distribution TTL uniquement pour les entrées avec données TCP valides
# Distribution TTL (top 15)
ttl_sql = """
SELECT
tcp_ttl,
count() AS cnt,
uniq(src_ip) AS ips
FROM mabase_prod.view_tcp_spoofing_detected
WHERE tcp_ttl > 0
GROUP BY tcp_ttl
ORDER BY cnt DESC
LIMIT 15
SELECT tcp_ttl_raw AS ttl, count() AS cnt, uniq(src_ip) AS ips
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
GROUP BY ttl ORDER BY cnt DESC LIMIT 15
"""
ttl_res = db.query(ttl_sql)
ttl_distribution = [
{"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
for r in ttl_res.result_rows
]
# Distribution window_size pour entrées avec données TCP
# Distribution MSS — nouveau signal clé (top 12)
mss_sql = """
SELECT tcp_mss_raw AS mss, count() AS cnt, uniq(src_ip) AS ips
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_mss_raw > 0
GROUP BY mss ORDER BY cnt DESC LIMIT 12
"""
mss_res = db.query(mss_sql)
# Distribution fenêtre (top 10)
win_sql = """
SELECT
tcp_window_size,
count() AS cnt
FROM mabase_prod.view_tcp_spoofing_detected
WHERE tcp_ttl > 0
GROUP BY tcp_window_size
ORDER BY cnt DESC
LIMIT 10
SELECT tcp_win_raw AS win, count() AS cnt
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
GROUP BY win ORDER BY cnt DESC LIMIT 10
"""
win_res = db.query(win_sql)
window_size_distribution = [
{"window_size": int(r[0]), "count": int(r[1])}
for r in win_res.result_rows
]
return {
"total_entries": total_entries,
"unique_ips": unique_ips,
"no_tcp_data": no_tcp_data,
"with_tcp_data": with_tcp_data,
"linux_fingerprint": linux_fp,
"windows_fingerprint": windows_fp,
"ttl_distribution": ttl_distribution,
"window_size_distribution": window_size_distribution,
"total_entries": int(row[0]),
"unique_ips": int(row[1]),
"no_tcp_data": int(row[2]),
"with_tcp_data": int(row[3]),
"linux_mac_fingerprint": int(row[4]),
"windows_fingerprint": int(row[5]),
"cisco_bsd_fingerprint": int(row[6]),
"bot_scanner_fingerprint": int(row[7]),
"ttl_distribution": [
{"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
for r in ttl_res.result_rows
],
"mss_distribution": [
{"mss": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
for r in mss_res.result_rows
],
"window_size_distribution": [
{"window_size": int(r[0]), "count": int(r[1])}
for r in win_res.result_rows
],
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@ -134,54 +99,75 @@ async def get_tcp_spoofing_overview():
async def get_tcp_spoofing_list(
limit: int = Query(100, ge=1, le=1000),
offset: int = Query(0, ge=0),
spoof_only: bool = Query(False, description="Ne retourner que les vrais spoofs (TTL corrélable + OS mismatch)"),
spoof_only: bool = Query(False, description="Retourner uniquement les spoofs/bots confirmés"),
):
"""Liste des entrées avec données TCP valides (tcp_ttl > 0).
Entrées sans données TCP (TTL=0) exclues : pas de corrélation possible.
Si spoof_only=True, retourne uniquement les entrées avec fingerprint OS identifiable (Linux/Mac TTL 52-65).
"""Liste avec fingerprinting multi-signal (TTL + MSS + fenêtre + scale).
Inclut les champs enrichis : mss, win_scale, initial_ttl, hop_count, confidence, network_path, is_bot_tool.
"""
try:
# Filtre SQL : seules les entrées avec TTL valide, et si spoof_only les plages corrélables
if spoof_only:
# Seules les plages de TTL qui permettent une identification OS fiable
ttl_filter = "tcp_ttl BETWEEN 52 AND 65 OR tcp_ttl BETWEEN 110 AND 135 OR tcp_ttl BETWEEN 240 AND 255"
else:
ttl_filter = "tcp_ttl > 0"
count_sql = f"SELECT count() FROM mabase_prod.view_tcp_spoofing_detected WHERE {ttl_filter}"
count_sql = """
SELECT count() FROM (
SELECT src_ip, ja4
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
GROUP BY src_ip, ja4
)
"""
total = int(db.query(count_sql).result_rows[0][0])
sql = f"""
sql = """
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS src_ip,
ja4, tcp_ttl, tcp_window_size, first_ua
FROM mabase_prod.view_tcp_spoofing_detected
WHERE {ttl_filter}
ORDER BY tcp_ttl ASC
ja4,
any(tcp_ttl_raw) AS tcp_ttl,
any(tcp_win_raw) AS tcp_window_size,
any(tcp_scale_raw) AS tcp_win_scale,
any(tcp_mss_raw) AS tcp_mss,
any(first_ua) AS first_ua,
sum(hits) AS hits
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
GROUP BY src_ip, ja4
ORDER BY hits DESC
LIMIT %(limit)s OFFSET %(offset)s
"""
result = db.query(sql, {"limit": limit, "offset": offset})
items = []
for row in result.result_rows:
ip = str(row[0])
ja4 = str(row[1] or "")
ttl = int(row[2])
window_size = int(row[3])
ua = str(row[4] or "")
sus_os = _suspected_os(ttl)
dec_os = _declared_os(ua)
spoof_flag = _is_spoof(sus_os, dec_os)
if spoof_only and not spoof_flag:
ip = str(row[0])
ja4 = str(row[1] or "")
ttl = int(row[2] or 0)
win = int(row[3] or 0)
scale = int(row[4] or 0)
mss = int(row[5] or 0)
ua = str(row[6] or "")
hits = int(row[7] or 0)
fp = fingerprint_os(ttl, win, scale, mss)
dec_os = declared_os_from_ua(ua)
spoof_res = detect_spoof(fp, dec_os)
if spoof_only and not spoof_res.is_spoof:
continue
items.append({
"ip": ip,
"ja4": ja4,
"tcp_ttl": ttl,
"tcp_window_size": window_size,
"tcp_window_size": win,
"tcp_win_scale": scale,
"tcp_mss": mss,
"hits": hits,
"first_ua": ua,
"suspected_os": sus_os,
"suspected_os": fp.os_name,
"initial_ttl": fp.initial_ttl,
"hop_count": fp.hop_count,
"confidence": fp.confidence,
"network_path": fp.network_path,
"is_bot_tool": fp.is_bot_tool,
"declared_os": dec_os,
"spoof_flag": spoof_flag,
"spoof_flag": spoof_res.is_spoof,
"spoof_reason": spoof_res.reason,
})
return {"items": items, "total": total}
except Exception as e:
@ -190,29 +176,44 @@ async def get_tcp_spoofing_list(
@router.get("/matrix")
async def get_tcp_spoofing_matrix():
"""Matrice suspected_os × declared_os — uniquement entrées avec TTL valide."""
"""Matrice OS suspecté × OS déclaré avec fingerprinting multi-signal."""
try:
sql = """
SELECT tcp_ttl, first_ua
FROM mabase_prod.view_tcp_spoofing_detected
WHERE tcp_ttl > 0
SELECT
any(tcp_ttl_raw) AS ttl,
any(tcp_win_raw) AS win,
any(tcp_scale_raw) AS scale,
any(tcp_mss_raw) AS mss,
any(first_ua) AS ua,
count() AS cnt
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
GROUP BY src_ip, ja4
"""
result = db.query(sql)
counts: dict = {}
for row in result.result_rows:
ttl = int(row[0])
ua = str(row[1] or "")
sus_os = _suspected_os(ttl)
dec_os = _declared_os(ua)
key = (sus_os, dec_os)
counts[key] = counts.get(key, 0) + 1
ttl = int(row[0] or 0)
win = int(row[1] or 0)
scale = int(row[2] or 0)
mss = int(row[3] or 0)
ua = str(row[4] or "")
cnt = int(row[5] or 1)
fp = fingerprint_os(ttl, win, scale, mss)
dec_os = declared_os_from_ua(ua)
spoof_res = detect_spoof(fp, dec_os)
key = (fp.os_name, dec_os, spoof_res.is_spoof, fp.is_bot_tool)
counts[key] = counts.get(key, 0) + cnt
matrix = [
{
"suspected_os": k[0],
"declared_os": k[1],
"count": v,
"is_spoof": _is_spoof(k[0], k[1]),
"is_spoof": k[2],
"is_bot_tool": k[3],
}
for k, v in counts.items()
]