feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized

Services:
- ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap)
- logcorrelator: JA4 log correlation engine (Go, ClickHouse)
- mod_reqin_log: Apache module (C, JSON request logging)
- bot_detector: ML bot detection pipeline (Python)
- dashboard: FastAPI/Streamlit analytics UI (Python)

Shared libraries:
- shared/go/ja4common: logger, config, shutdown, ipfilter (Go module)
- shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package)
- shared/clickhouse/: canonical SQL migrations (10 files)

Build & packaging:
- Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10)
- go.work workspace linking sentinel, correlator, ja4common
- Makefile with test-all, build-all, rpm-* targets

Fixes applied:
- go.work: 1.21 → 1.24.6 (required by sentinel)
- correlator Dockerfiles: golang:1.21 → golang:1.24
- replace directives in go.mod for ja4common local path
- pyproject.toml: setuptools.backends → setuptools.build_meta
- Removed static libpcap linking (unavailable on Rocky 9)
- Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32)
- Rewrote corrupted test files (logger_test.go × 2)

Test coverage:
- correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%)
- sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse)

Documentation:
- README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-07 16:42:59 +02:00
commit d469e39da7
278 changed files with 1621301 additions and 0 deletions

View File

@ -0,0 +1,827 @@
"""
Endpoints pour l'analyse des fingerprints JA4 et User-Agents
Objectifs:
- Détecter le spoofing JA4 (fingerprint TLS qui prétend être un navigateur mais
dont les User-Agents, les headers HTTP ou les métriques comportementales trahissent
une origine bot/script)
- Construire une matrice JA4 × User-Agent pour visualiser les associations suspectes
- Analyser la distribution des User-Agents pour identifier les rotateurs et les bots
qui usurpent des UA de navigateurs légitimes
"""
import logging
import re

from fastapi import APIRouter, HTTPException, Query

from ..database import db
# Router collecting every endpoint of this module under the /api/fingerprints prefix.
router = APIRouter(prefix="/api/fingerprints", tags=["fingerprints"])
# ─── Helpers ──────────────────────────────────────────────────────────────────
# Patterns indiquant clairement un bot/script sans simulation de navigateur
_BOT_PATTERNS = re.compile(
r"bot|crawler|spider|scraper|python|curl|wget|go-http|java/|axios|"
r"libwww|httpclient|okhttp|requests|aiohttp|httpx|playwright|puppeteer|"
r"selenium|headless|phantomjs",
re.IGNORECASE,
)
# Navigateurs légitimes communs — un JA4 de type "browser" devrait venir avec ces UAs
_BROWSER_PATTERNS = re.compile(
r"mozilla|chrome|safari|firefox|edge|opera|trident",
re.IGNORECASE,
)
def _classify_ua(ua: str) -> str:
"""Retourne 'bot', 'browser', ou 'script'"""
if not ua:
return "empty"
if _BOT_PATTERNS.search(ua):
return "bot"
if _BROWSER_PATTERNS.search(ua):
return "browser"
return "script"
# =============================================================================
# ENDPOINT 1 — Détection de spoofing JA4
# =============================================================================
@router.get("/spoofing")
async def get_ja4_spoofing(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
    min_detections: int = Query(10, ge=1, description="Nombre minimum de détections"),
    limit: int = Query(50, ge=1, le=200),
):
    """Rank JA4 fingerprints by how likely they are to be spoofing a browser.

    Aggregates ``ml_detected_anomalies`` per JA4 over the last ``hours`` hours,
    keeps fingerprints with at least ``min_detections`` rows, and enriches each
    one with up to five User-Agents read from ``view_dashboard_user_agents``.

    Spoofing score (rounded, capped at 100):
    - ``ua_ch_mismatch_pct`` x 0.40
    - ``avg_browser_score`` x 0.25, only when one of the top UAs is a bot
    - ``sni_mismatch_pct`` x 0.10 and ``alpn_mismatch_pct`` x 0.05
    - ``rare_ja4_pct`` x 0.10 and ``ua_rotating_pct`` x 0.10
    - +10 when more than 30% of the detections have no ALPN

    Classification: ``spoofed_browser`` (score >= 60), ``known_bot``,
    ``legitimate_browser`` or ``suspicious``.

    Args:
        hours: Look-back window in hours.
        min_detections: Minimum number of detections for a JA4 to be listed.
        limit: Maximum number of JA4 fingerprints returned.

    Returns:
        A dict with ``items`` (sorted by score, then volume), ``total``,
        ``period_hours`` and a per-classification ``summary``.

    Raises:
        HTTPException: 500 when the main aggregation query or the
            post-processing fails.
    """
    try:
        # Per-JA4 aggregation carrying every spoofing indicator.
        query = """
            SELECT
                ja4,
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                -- Indicateurs de mismatch
                countIf(ua_ch_mismatch = true) AS ua_ch_mismatch_count,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                countIf(sni_host_mismatch = true) AS sni_mismatch_count,
                round(countIf(sni_host_mismatch = true) * 100.0 / count(), 2) AS sni_mismatch_pct,
                countIf(alpn_http_mismatch = true) AS alpn_mismatch_count,
                round(countIf(alpn_http_mismatch = true) * 100.0 / count(), 2) AS alpn_mismatch_pct,
                -- Indicateurs comportementaux
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_ja4_count,
                round(countIf(is_rare_ja4 = true) * 100.0 / count(), 2) AS rare_ja4_pct,
                countIf(is_ua_rotating = true) AS ua_rotating_count,
                round(countIf(is_ua_rotating = true) * 100.0 / count(), 2) AS ua_rotating_pct,
                -- Métriques TLS/TCP
                countIf(is_alpn_missing = true) AS alpn_missing_count,
                avg(distinct_ja4_count) AS avg_distinct_ja4_per_ip,
                -- Répartition threat levels
                countIf(threat_level = 'CRITICAL') AS critical_count,
                countIf(threat_level = 'HIGH') AS high_count,
                -- Botnet indicators
                avg(ja4_asn_concentration) AS avg_asn_concentration,
                avg(ja4_country_concentration) AS avg_country_concentration,
                argMax(threat_level, detected_at) AS last_threat_level
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING total_detections >= %(min_detections)s
            ORDER BY ua_ch_mismatch_pct DESC, total_detections DESC
            LIMIT %(limit)s
        """
        result = db.query(query, {
            "hours": hours,
            "min_detections": min_detections,
            "limit": limit,
        })
        rows = result.result_rows

        # Top User-Agents per JA4. Every returned JA4 is looked up (no silent
        # cap), otherwise the tail of the result would lack UA data and could
        # never be classified as known_bot / legitimate_browser.
        ja4_list = [str(r[0]) for r in rows if r[0]]
        ua_by_ja4: dict = {}
        if ja4_list:
            # Values are bound as parameters rather than interpolated into the
            # SQL text, so a quote or backslash in a stored JA4 cannot alter
            # the query. NOTE(review): relies on db.query rendering a Python
            # tuple as a SQL tuple literal — confirm against the db wrapper.
            ua_q = """
                SELECT ja4, groupArray(5)(ua) AS top_uas
                FROM (
                    SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
                    FROM view_dashboard_user_agents
                    WHERE ja4 IN %(ja4_list)s
                      AND hour >= now() - INTERVAL %(hours)s HOUR
                      AND ua != ''
                    GROUP BY ja4, ua
                    ORDER BY ja4, cnt DESC
                )
                GROUP BY ja4
            """
            try:
                ua_res = db.query(ua_q, {"ja4_list": tuple(ja4_list), "hours": hours})
                for ua_row in ua_res.result_rows:
                    if ua_row[1]:
                        ua_by_ja4[str(ua_row[0])] = list(ua_row[1])
            except Exception:
                # UA enrichment is best effort: the endpoint still answers
                # without it, but the failure must not go unnoticed.
                logging.getLogger(__name__).warning(
                    "JA4 spoofing: User-Agent lookup failed, continuing without UA data",
                    exc_info=True,
                )

        items = []
        for row in rows:
            ja4 = str(row[0])
            ua_ch_mismatch_pct = float(row[4] or 0)
            sni_mismatch_pct = float(row[6] or 0)
            alpn_mismatch_pct = float(row[8] or 0)
            avg_browser_score = float(row[9] or 0)
            rare_ja4_pct = float(row[11] or 0)
            ua_rotating_pct = float(row[13] or 0)
            alpn_missing_count = int(row[14] or 0)
            total = int(row[1] or 1)

            top_uas = ua_by_ja4.get(ja4, [])
            # Classify each UA once and reuse the result for flags and output.
            ua_types = [_classify_ua(u) for u in top_uas]
            has_bot_ua = "bot" in ua_types
            has_browser_ua = "browser" in ua_types

            # Spoofing confidence score [0-100]; weights are listed in the
            # docstring. The browser-score term only counts when a bot UA
            # contradicts the browser-like fingerprint.
            spoof_score = min(100, round(
                ua_ch_mismatch_pct * 0.40
                + (avg_browser_score * 25 / 100 if has_bot_ua else 0)
                + sni_mismatch_pct * 0.10
                + alpn_mismatch_pct * 0.05
                + rare_ja4_pct * 0.10
                + ua_rotating_pct * 0.10
                + (10 if alpn_missing_count > total * 0.3 else 0)
            ))

            if spoof_score >= 60:
                classification = "spoofed_browser"
            elif has_bot_ua and avg_browser_score < 30:
                classification = "known_bot"
            elif has_browser_ua and ua_ch_mismatch_pct < 10:
                classification = "legitimate_browser"
            else:
                classification = "suspicious"

            items.append({
                "ja4": ja4,
                "classification": classification,
                "spoofing_score": spoof_score,
                "total_detections": int(row[1] or 0),
                "unique_ips": int(row[2] or 0),
                "indicators": {
                    "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                    "sni_mismatch_pct": sni_mismatch_pct,
                    "alpn_mismatch_pct": alpn_mismatch_pct,
                    "avg_browser_score": round(avg_browser_score, 1),
                    "rare_ja4_pct": rare_ja4_pct,
                    "ua_rotating_pct": ua_rotating_pct,
                    "alpn_missing_count": alpn_missing_count,
                    "avg_asn_concentration": round(float(row[18] or 0), 3),
                    "avg_country_concentration": round(float(row[19] or 0), 3),
                },
                "top_user_agents": [
                    {"ua": u, "type": t} for u, t in zip(top_uas, ua_types)
                ],
                "threat_breakdown": {
                    "critical": int(row[16] or 0),
                    "high": int(row[17] or 0),
                    "last_level": str(row[20] or "LOW"),
                },
            })

        # Highest score first (spoofed_browser entries therefore lead), ties
        # broken by detection volume.
        items.sort(key=lambda x: (-x["spoofing_score"], -x["total_detections"]))

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "summary": {
                "spoofed_browser": sum(1 for i in items if i["classification"] == "spoofed_browser"),
                "known_bot": sum(1 for i in items if i["classification"] == "known_bot"),
                "suspicious": sum(1 for i in items if i["classification"] == "suspicious"),
                "legitimate_browser": sum(1 for i in items if i["classification"] == "legitimate_browser"),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 2 — Matrice JA4 × User-Agent
# =============================================================================
@router.get("/ja4-ua-matrix")
async def get_ja4_ua_matrix(
    hours: int = Query(24, ge=1, le=168),
    min_ips: int = Query(3, ge=1, description="Nombre minimum d'IPs pour inclure un JA4"),
    limit: int = Query(30, ge=1, le=100),
):
    """Build the JA4 x User-Agent matrix.

    For each JA4 seen from at least ``min_ips`` distinct IPs in
    ``ml_detected_anomalies`` during the last ``hours`` hours, returns:
    - its detection statistics (mismatch rate, browser score, threat level);
    - up to eight of its most requested User-Agents, read from
      ``view_dashboard_user_agents``, each classified bot / browser / script;
    - a ``ua_summary`` with the share of each UA class among the listed UAs;
    - ``is_spoofing_suspect``: browser score > 50, bot share > 30% and
      UA/Client-Hints mismatch > 20%.

    Args:
        hours: Look-back window in hours.
        min_ips: Minimum number of distinct source IPs for a JA4 to be listed.
        limit: Maximum number of JA4 fingerprints returned.

    Returns:
        A dict with ``items``, ``total`` and ``period_hours``.

    Raises:
        HTTPException: 500 when the statistics query or the post-processing
            fails.
    """
    try:
        # Per-JA4 statistics from ml_detected_anomalies.
        stats_query = """
            SELECT
                ja4,
                uniq(src_ip) AS unique_ips,
                count() AS total_detections,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_count,
                countIf(is_ua_rotating = true) AS rotating_count,
                argMax(threat_level, detected_at) AS last_threat
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
            ORDER BY ua_ch_mismatch_pct DESC, unique_ips DESC
            LIMIT %(limit)s
        """
        stats_res = db.query(stats_query, {"hours": hours, "min_ips": min_ips, "limit": limit})
        ja4_list = [str(r[0]) for r in stats_res.result_rows]
        if not ja4_list:
            return {"items": [], "total": 0, "period_hours": hours}

        # User-Agents per JA4. Values are bound as parameters instead of being
        # interpolated into the SQL text, so a quote or backslash in a stored
        # JA4 cannot alter the query. NOTE(review): relies on db.query
        # rendering a Python tuple as a SQL tuple literal — confirm.
        ua_query = """
            SELECT
                ja4,
                ua,
                sum(requests) AS cnt
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE ja4 IN %(ja4_list)s
              AND hour >= now() - INTERVAL %(hours)s HOUR
              AND ua != ''
            GROUP BY ja4, ua
            ORDER BY ja4, cnt DESC
        """
        ua_by_ja4: dict = {}
        try:
            ua_res = db.query(ua_query, {"ja4_list": tuple(ja4_list), "hours": hours})
            for row in ua_res.result_rows:
                bucket = ua_by_ja4.setdefault(str(row[0]), [])
                # Rows arrive ordered by descending count: keep the top 8.
                if len(bucket) < 8:
                    bucket.append({"ua": str(row[1]), "count": int(row[2] or 0)})
        except Exception:
            # UA enrichment is best effort: keep serving the statistics, but
            # leave a trace of the failure.
            logging.getLogger(__name__).warning(
                "JA4/UA matrix: User-Agent lookup failed, continuing without UA data",
                exc_info=True,
            )

        items = []
        for row in stats_res.result_rows:
            ja4 = str(row[0])
            unique_ips = int(row[1] or 0)
            ua_ch_mismatch_pct = float(row[3] or 0)
            avg_browser_score = float(row[4] or 0)

            top_uas = ua_by_ja4.get(ja4, [])
            # Percentages are relative to the listed (top 8) UAs only.
            ua_total = sum(u["count"] for u in top_uas) or 1

            classified_uas = [
                {
                    "ua": u["ua"],
                    "count": u["count"],
                    "pct": round(u["count"] * 100 / ua_total, 1),
                    "type": _classify_ua(u["ua"]),
                }
                for u in top_uas
            ]

            bot_pct = sum(u["pct"] for u in classified_uas if u["type"] == "bot")
            browser_pct = sum(u["pct"] for u in classified_uas if u["type"] == "browser")
            # Everything that is neither bot nor browser counts as script.
            # Summing the actual entries (instead of 100 - bot - browser)
            # yields 0 when no UA is known and cannot go negative.
            script_pct = sum(
                u["pct"] for u in classified_uas if u["type"] not in ("bot", "browser")
            )

            # Spoofing flag: the fingerprint looks like a browser (high
            # browser score) while the User-Agents are bots/scripts.
            is_spoofing = avg_browser_score > 50 and bot_pct > 30 and ua_ch_mismatch_pct > 20

            items.append({
                "ja4": ja4,
                "unique_ips": unique_ips,
                "total_detections": int(row[2] or 0),
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "avg_browser_score": round(avg_browser_score, 1),
                "rare_count": int(row[5] or 0),
                "rotating_count": int(row[6] or 0),
                "last_threat": str(row[7] or "LOW"),
                "user_agents": classified_uas,
                "ua_summary": {
                    "bot_pct": round(bot_pct, 1),
                    "browser_pct": round(browser_pct, 1),
                    "script_pct": round(script_pct, 1),
                    # Number of UAs listed above (at most 8).
                    "total_distinct": len(top_uas),
                },
                "is_spoofing_suspect": is_spoofing,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 3 — Analyse globale des User-Agents
# =============================================================================
@router.get("/ua-analysis")
async def get_ua_analysis(
    hours: int = Query(24, ge=1, le=168),
    limit: int = Query(50, ge=1, le=200),
):
    """Analyse the most frequent User-Agents of the last ``hours`` hours.

    For the ``limit`` User-Agents with the highest request volume in
    ``view_dashboard_user_agents``, reports:
    - their class (bot / browser / script);
    - how many distinct JA4 fingerprints they were seen with, plus up to
      three samples — more than 3 JA4 for one UA is flagged as suspicious;
    - risk flags derived from the above.

    Also reports the source IPs flagged ``is_ua_rotating`` in
    ``ml_detected_anomalies`` (count and up to ten samples).

    Args:
        hours: Look-back window in hours.
        limit: Maximum number of User-Agents returned.

    Returns:
        A dict with ``items``, ``total``, ``period_hours``,
        ``ua_rotating_stats`` and ``summary``.

    Raises:
        HTTPException: 500 when the main query or the post-processing fails.
    """
    log = logging.getLogger(__name__)
    try:
        # Top User-Agents overall. The figure exposed as "ip_count" is the sum
        # of the `requests` column, i.e. a request volume.
        ua_global_query = """
            SELECT
                ua,
                sum(requests) AS ip_count
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE hour >= now() - INTERVAL %(hours)s HOUR
              AND ua != ''
            GROUP BY ua
            ORDER BY ip_count DESC
            LIMIT %(limit)s
        """
        ua_global_res = db.query(ua_global_query, {"hours": hours, "limit": limit})
        top_uas = [str(r[0]) for r in ua_global_res.result_rows]

        # JA4 fingerprints per User-Agent.
        # User-Agents are attacker-controlled text: they are bound as a query
        # parameter, never spliced into the SQL (doubling quotes is not enough
        # because ClickHouse also honours backslash escapes in literals).
        # Every returned UA is looked up, so none is reported with a false
        # count of zero. NOTE(review): relies on db.query rendering a Python
        # tuple as a SQL tuple literal — confirm against the db wrapper.
        ja4_by_ua: dict = {}
        if top_uas:
            ja4_per_ua_query = """
                SELECT
                    ua,
                    uniq(ja4) AS unique_ja4s,
                    groupUniqArray(3)(ja4) AS sample_ja4s
                FROM view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE ua IN %(uas)s
                  AND hour >= now() - INTERVAL %(hours)s HOUR
                  AND ua != ''
                  AND ja4 != ''
                GROUP BY ua
            """
            try:
                ja4_res = db.query(ja4_per_ua_query, {"uas": tuple(top_uas), "hours": hours})
                for r in ja4_res.result_rows:
                    ja4_by_ua[str(r[0])] = {
                        "unique_ja4s": int(r[1] or 0),
                        "sample_ja4s": list(r[2] or []),
                    }
            except Exception:
                # Best effort: the UA list is still useful without JA4 data.
                log.warning(
                    "UA analysis: JA4-per-UA lookup failed, continuing without it",
                    exc_info=True,
                )

        # Source IPs flagged as rotating their User-Agent.
        rotating_query = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                avg(ua_ch_mismatch) AS avg_ua_ch_mismatch
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND is_ua_rotating = true
            GROUP BY clean_ip
            ORDER BY avg_ua_ch_mismatch DESC
        """
        rotating_ips: list = []
        try:
            rot_res = db.query(rotating_query, {"hours": hours})
            rotating_ips = [str(r[0]) for r in rot_res.result_rows]
        except Exception:
            # Best effort as well: report zero rotating IPs rather than fail.
            log.warning(
                "UA analysis: rotating-IP lookup failed, continuing without it",
                exc_info=True,
            )

        # Assemble the response.
        items = []
        for row in ua_global_res.result_rows:
            ua = str(row[0])
            ip_count = int(row[1] or 0)
            ua_type = _classify_ua(ua)
            ja4_info = ja4_by_ua.get(ua, {"unique_ja4s": 0, "sample_ja4s": []})

            # A UA seen with many JA4 is suspicious: a real browser usually
            # produces only one or two fingerprints.
            multi_ja4_flag = ja4_info["unique_ja4s"] > 3

            items.append({
                "user_agent": ua,
                "type": ua_type,
                "ip_count": ip_count,
                "unique_ja4_count": ja4_info["unique_ja4s"],
                "sample_ja4s": ja4_info["sample_ja4s"],
                "is_multi_ja4_suspect": multi_ja4_flag,
                "risk_flags": _build_ua_risk_flags(ua, ua_type, ja4_info["unique_ja4s"], ip_count),
            })

        ua_rotating_stats = {
            "rotating_ip_count": len(rotating_ips),
            "sample_rotating_ips": rotating_ips[:10],
        }

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "ua_rotating_stats": ua_rotating_stats,
            "summary": {
                "bot_count": sum(1 for i in items if i["type"] == "bot"),
                "browser_count": sum(1 for i in items if i["type"] == "browser"),
                "script_count": sum(1 for i in items if i["type"] == "script"),
                "multi_ja4_suspect_count": sum(1 for i in items if i["is_multi_ja4_suspect"]),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
def _build_ua_risk_flags(ua: str, ua_type: str, unique_ja4s: int, ip_count: int) -> list:
flags = []
if ua_type == "bot":
flags.append("ua_bot_signature")
elif ua_type == "script":
flags.append("ua_script_library")
if unique_ja4s > 5:
flags.append("ja4_rotation_suspect")
if unique_ja4s > 3 and ua_type == "browser":
flags.append("browser_ua_multi_fingerprint")
if ip_count > 100:
flags.append("high_volume")
return flags
# =============================================================================
# ENDPOINT 4 — JA4 d'un IP spécifique: analyse de cohérence UA/JA4
# =============================================================================
@router.get("/ip/{ip}/coherence")
async def get_ip_fingerprint_coherence(ip: str):
    """Assess whether one source IP is spoofing its TLS fingerprint.

    Reads every row of ``ml_detected_anomalies`` for ``ip`` plus the
    User-Agents recorded for it in ``view_dashboard_user_agents`` during the
    last 72 hours, then derives a spoofing score (rounded, capped at 100):
    - UA/Client-Hints mismatch rate x 0.40
    - average ``modern_browser_score`` x 0.20, only when a bot UA was seen
    - SNI mismatch rate x 0.10 and ALPN mismatch rate x 0.05
    - 5 points per distinct JA4 when more than two were used
    - +15 when any row is flagged ``is_ua_rotating``
    - +10 when more than half of the rows are flagged ``is_rare_ja4``

    Verdicts: ``high_confidence_spoofing`` (>= 70), ``suspicious_spoofing``
    (>= 40), ``known_bot_no_spoofing``, ``legitimate_browser`` or
    ``inconclusive``.

    Args:
        ip: Source IP address to analyse.

    Returns:
        A dict with the verdict, score, human-readable explanation,
        aggregated indicators, fingerprints, User-Agents and the most recent
        detection.

    Raises:
        HTTPException: 404 when the IP has no detection; 500 on any other
            failure.
    """
    try:
        # Detection rows for this IP, most recent first.
        ml_query = """
            SELECT
                ja4,
                ua_ch_mismatch,
                modern_browser_score,
                sni_host_mismatch,
                alpn_http_mismatch,
                is_alpn_missing,
                is_rare_ja4,
                is_ua_rotating,
                distinct_ja4_count,
                header_count,
                has_accept_language,
                has_cookie,
                has_referer,
                header_order_shared_count,
                detected_at,
                threat_level,
                window_mss_ratio,
                tcp_jitter_variance,
                multiplexing_efficiency
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
        """
        ml_res = db.query(ml_query, {"ip": ip})
        if not ml_res.result_rows:
            raise HTTPException(status_code=404, detail="IP non trouvée dans les détections")

        # User-Agents actually sent by this IP.
        ua_query = """
            SELECT ua, sum(requests) AS cnt
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE toString(src_ip) = %(ip)s
              AND hour >= now() - INTERVAL 72 HOUR
              AND ua != ''
            GROUP BY ua ORDER BY cnt DESC
        """
        ua_res = db.query(ua_query, {"ip": ip})
        top_uas = [
            {"ua": str(r[0]), "count": int(r[1] or 0), "type": _classify_ua(str(r[0]))}
            for r in ua_res.result_rows
        ]

        # Aggregate the indicators over all detection rows of this IP;
        # `latest` is the most recent one.
        rows = ml_res.result_rows
        latest = rows[0]
        total_rows = len(rows)

        ua_ch_mismatch_count = sum(1 for r in rows if r[1])
        sni_mismatch_count = sum(1 for r in rows if r[3])
        alpn_mismatch_count = sum(1 for r in rows if r[4])
        is_rare_count = sum(1 for r in rows if r[6])
        is_rotating = any(r[7] for r in rows)
        # Sorted so that the response is deterministic between calls.
        distinct_ja4s = sorted({str(r[0]) for r in rows if r[0]})
        # float(), not int(): truncating each score before averaging would
        # bias the mean downwards if the column holds fractional values.
        avg_browser_score = sum(float(r[2] or 0) for r in rows) / total_rows

        has_browser_ua = any(u["type"] == "browser" for u in top_uas)
        has_bot_ua = any(u["type"] == "bot" for u in top_uas)

        # Spoofing score — weights are listed in the docstring.
        spoof_score = min(100, round(
            (ua_ch_mismatch_count / total_rows * 100) * 0.40
            + (avg_browser_score * 0.20 if has_bot_ua else 0)
            + (sni_mismatch_count / total_rows * 100) * 0.10
            + (alpn_mismatch_count / total_rows * 100) * 0.05
            + (len(distinct_ja4s) * 5 if len(distinct_ja4s) > 2 else 0)
            + (15 if is_rotating else 0)
            + (10 if is_rare_count > total_rows * 0.5 else 0)
        ))

        if spoof_score >= 70:
            verdict = "high_confidence_spoofing"
        elif spoof_score >= 40:
            verdict = "suspicious_spoofing"
        elif has_bot_ua and avg_browser_score < 20:
            verdict = "known_bot_no_spoofing"
        elif has_browser_ua and spoof_score < 20:
            verdict = "legitimate_browser"
        else:
            verdict = "inconclusive"

        # Human-readable explanation of the signals that fired.
        explanation = []
        if ua_ch_mismatch_count > total_rows * 0.3:
            explanation.append(f"UA-Client-Hints mismatch sur {round(ua_ch_mismatch_count*100/total_rows)}% des requêtes")
        if has_bot_ua and avg_browser_score > 40:
            explanation.append(f"JA4 ressemble à un navigateur (score {round(avg_browser_score)}/100) mais UA est de type bot")
        if len(distinct_ja4s) > 2:
            explanation.append(f"{len(distinct_ja4s)} JA4 distincts utilisés → rotation de fingerprint")
        if is_rotating:
            explanation.append("is_ua_rotating détecté → rotation d'User-Agent confirmée")
        if sni_mismatch_count > 0:
            explanation.append(f"SNI ≠ Host header sur {sni_mismatch_count}/{total_rows} requêtes")
        if not explanation:
            explanation.append("Aucun indicateur de spoofing majeur détecté")

        return {
            "ip": ip,
            "verdict": verdict,
            "spoofing_score": spoof_score,
            "explanation": explanation,
            "indicators": {
                "ua_ch_mismatch_rate": round(ua_ch_mismatch_count / total_rows * 100, 1),
                "sni_mismatch_rate": round(sni_mismatch_count / total_rows * 100, 1),
                "alpn_mismatch_rate": round(alpn_mismatch_count / total_rows * 100, 1),
                "avg_browser_score": round(avg_browser_score, 1),
                "distinct_ja4_count": len(distinct_ja4s),
                "is_ua_rotating": is_rotating,
                "rare_ja4_rate": round(is_rare_count / total_rows * 100, 1),
            },
            "fingerprints": {
                "ja4_list": distinct_ja4s,
                "latest_ja4": str(latest[0] or ""),
            },
            "user_agents": top_uas,
            "latest_detection": {
                "detected_at": latest[14].isoformat() if latest[14] else "",
                "threat_level": str(latest[15] or "LOW"),
                "modern_browser_score": int(latest[2] or 0),
                "header_count": int(latest[9] or 0),
                "has_accept_language": bool(latest[10]),
                "has_cookie": bool(latest[11]),
                "has_referer": bool(latest[12]),
                "header_order_shared_count": int(latest[13] or 0),
            },
        }
    except HTTPException:
        # Let the 404 above through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 5 — JA4 légitimes (baseline / whitelist)
# =============================================================================
@router.get("/legitimate-ja4")
async def get_legitimate_ja4(
    hours: int = Query(168, ge=24, le=720, description="Fenêtre pour établir la baseline"),
    min_ips: int = Query(50, ge=5, description="Nombre minimum d'IPs pour qualifier un JA4 de légitime"),
):
    """List JA4 fingerprints that look legitimate (whitelist candidates).

    A JA4 from ``ml_detected_anomalies`` is kept when, over the last
    ``hours`` hours, it:
    - was seen from at least ``min_ips`` distinct source IPs;
    - has a UA/Client-Hints mismatch rate below 5%;
    - has an average ``modern_browser_score`` above 60;
    - has no row flagged ``is_rare_ja4``.

    Each entry carries a ``legitimacy_confidence`` (capped at 100) built from
    the mismatch rate (40 points), the browser score (x 0.40) and the IP
    count relative to ``min_ips`` (20 points).

    Returns:
        A dict with ``items`` (ordered by distinct IP count), ``total``,
        ``period_hours`` and a ``note`` reminding to review manually.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        baseline_sql = """
            SELECT
                ja4,
                uniq(src_ip) AS unique_ips,
                count() AS total_detections,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_count,
                round(countIf(threat_level = 'CRITICAL') * 100.0 / count(), 2) AS critical_pct,
                round(countIf(threat_level = 'HIGH') * 100.0 / count(), 2) AS high_pct
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
               AND ua_ch_mismatch_pct < 5.0
               AND avg_browser_score > 60
               AND rare_count = 0
            ORDER BY unique_ips DESC
        """
        baseline = db.query(baseline_sql, {"hours": hours, "min_ips": min_ips})

        candidates = []
        for rec in baseline.result_rows:
            ip_total = int(rec[1] or 0)
            mismatch_pct = float(rec[3] or 0)
            browser_score = float(rec[4] or 0)

            # Low mismatch, high browser score and a broad IP base each add
            # to the confidence.
            confidence = min(100, round(
                (1 - mismatch_pct / 100) * 40
                + browser_score * 0.40
                + min(ip_total / min_ips, 1) * 20
            ))

            candidates.append({
                "ja4": str(rec[0]),
                "unique_ips": ip_total,
                "total_detections": int(rec[2] or 0),
                "ua_ch_mismatch_pct": mismatch_pct,
                "avg_browser_score": round(browser_score, 1),
                "critical_pct": float(rec[6] or 0),
                "high_pct": float(rec[7] or 0),
                "legitimacy_confidence": confidence,
            })

        return {
            "items": candidates,
            "total": len(candidates),
            "period_hours": hours,
            "note": "Ces JA4 sont candidats à une whitelist. Vérifier manuellement avant de whitelister.",
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT — Corrélation JA4 × ASN / Pays (C5)
# Détecte les JA4 fortement concentrés sur un seul ASN ou pays
# → signal de botnet ciblé ou d'infrastructure de test/attaque partagée
# =============================================================================
@router.get("/asn-correlation")
async def get_ja4_asn_correlation(
    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
    limit: int = Query(50, ge=1, le=200),
):
    """Find JA4 fingerprints concentrated on a single ASN or country.

    Works on the last 24 hours of ``mabase_prod.agg_host_ip_ja4_1h``.
    For each JA4:
    - ``asn_concentration`` is the share of its IPs in its largest ASN;
    - ``country_concentration`` is the share of its IPs in its largest
      country.

    A JA4 is returned when it has at least ``min_ips`` IPs and either ratio
    reaches ``min_concentration``. Entries are labelled ``asn_monopoly``
    (ASN ratio >= 0.85, high risk), ``asn_dominant``, ``geo_targeted``
    (medium risk) or ``distributed`` (low risk).

    Args:
        min_concentration: Minimum ASN or country concentration, in [0, 1].
        min_ips: Minimum number of IPs for a JA4 to be considered.
        limit: Maximum number of JA4 fingerprints returned.

    Returns:
        A dict with ``items`` and ``total``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # ASN totals and country totals are aggregated independently (one
        # sub-query grouped by ASN, one grouped by country) and then joined
        # per JA4. Grouping both dimensions together would make the two
        # ratios identical and would split an ASN that spans several
        # countries into smaller pieces.
        sql = """
            SELECT
                ja4,
                unique_ips,
                unique_asns,
                unique_countries,
                top_asn_number,
                top_asn_name,
                dominant_country,
                total_hits,
                asn_concentration,
                country_concentration
            FROM (
                SELECT
                    a.ja4 AS ja4,
                    a.unique_ips AS unique_ips,
                    a.unique_asns AS unique_asns,
                    c.unique_countries AS unique_countries,
                    toString(a.top_asn) AS top_asn_number,
                    a.top_asn_name AS top_asn_name,
                    c.dominant_country AS dominant_country,
                    a.total_hits AS total_hits,
                    round(a.top_asn_ips / greatest(a.unique_ips, 1), 3) AS asn_concentration,
                    round(c.top_country_ips / greatest(c.country_ips_total, 1), 3) AS country_concentration
                FROM (
                    SELECT
                        ja4,
                        sum(asn_ips) AS unique_ips,
                        count() AS unique_asns,
                        argMax(src_asn, asn_ips) AS top_asn,
                        argMax(asn_name, asn_ips) AS top_asn_name,
                        max(asn_ips) AS top_asn_ips,
                        sum(asn_hits) AS total_hits
                    FROM (
                        SELECT
                            ja4,
                            src_asn,
                            any(src_as_name) AS asn_name,
                            uniq(src_ip) AS asn_ips,
                            sum(hits) AS asn_hits
                        FROM mabase_prod.agg_host_ip_ja4_1h
                        WHERE window_start >= now() - INTERVAL 24 HOUR
                          AND ja4 != ''
                        GROUP BY ja4, src_asn
                    )
                    GROUP BY ja4
                ) AS a
                INNER JOIN (
                    SELECT
                        ja4,
                        count() AS unique_countries,
                        argMax(src_country_code, country_ips) AS dominant_country,
                        max(country_ips) AS top_country_ips,
                        sum(country_ips) AS country_ips_total
                    FROM (
                        SELECT
                            ja4,
                            src_country_code,
                            uniq(src_ip) AS country_ips
                        FROM mabase_prod.agg_host_ip_ja4_1h
                        WHERE window_start >= now() - INTERVAL 24 HOUR
                          AND ja4 != ''
                        GROUP BY ja4, src_country_code
                    )
                    GROUP BY ja4
                ) AS c ON a.ja4 = c.ja4
            )
            WHERE unique_ips >= %(min_ips)s
              AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
            ORDER BY asn_concentration DESC, unique_ips DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"min_ips": min_ips, "min_conc": min_concentration, "limit": limit})

        items = []
        for row in result.result_rows:
            asn_concentration = float(row[8] or 0)
            country_concentration = float(row[9] or 0)

            # ASN concentration takes precedence over geographic
            # concentration when both exceed the threshold.
            if asn_concentration >= 0.85:
                corr_type, risk = "asn_monopoly", "high"
            elif asn_concentration >= min_concentration:
                corr_type, risk = "asn_dominant", "medium"
            elif country_concentration >= min_concentration:
                corr_type, risk = "geo_targeted", "medium"
            else:
                corr_type, risk = "distributed", "low"

            items.append({
                "ja4": str(row[0]),
                "unique_ips": int(row[1]),
                "unique_asns": int(row[2]),
                "unique_countries": int(row[3]),
                "top_asn_name": str(row[5] or ""),
                "top_asn_number": str(row[4] or ""),
                "dominant_country": str(row[6] or ""),
                "total_hits": int(row[7] or 0),
                "asn_concentration": asn_concentration,
                "country_concentration": country_concentration,
                "correlation_type": corr_type,
                "risk": risk,
            })

        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e