feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized

Services:
- ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap)
- logcorrelator: JA4 log correlation engine (Go, ClickHouse)
- mod_reqin_log: Apache module (C, JSON request logging)
- bot_detector: ML bot detection pipeline (Python)
- dashboard: FastAPI/Streamlit analytics UI (Python)

Shared libraries:
- shared/go/ja4common: logger, config, shutdown, ipfilter (Go module)
- shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package)
- shared/clickhouse/: canonical SQL migrations (10 files)

Build & packaging:
- Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10)
- go.work workspace linking sentinel, correlator, ja4common
- Makefile with test-all, build-all, rpm-* targets

Fixes applied:
- go.work: 1.21 → 1.24.6 (required by sentinel)
- correlator Dockerfiles: golang:1.21 → golang:1.24
- replace directives in go.mod for ja4common local path
- pyproject.toml: setuptools.backends → setuptools.build_meta
- Removed static libpcap linking (unavailable on Rocky 9)
- Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32)
- Rewrote corrupted test files (logger_test.go × 2)

Test coverage:
- correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%)
- sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse)

Documentation:
- README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-07 16:42:59 +02:00
commit d469e39da7
278 changed files with 1621301 additions and 0 deletions

View File

@ -0,0 +1 @@
# Backend package

View File

@ -0,0 +1,27 @@
"""
Configuration du Dashboard Bot Detector
"""
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Runtime settings for the Bot Detector dashboard backend."""

    # ClickHouse connection parameters
    CLICKHOUSE_HOST: str = "clickhouse"
    CLICKHOUSE_PORT: int = 8123
    CLICKHOUSE_DB: str = "mabase_prod"
    CLICKHOUSE_USER: str = "admin"
    CLICKHOUSE_PASSWORD: str = ""
    # API bind address and port
    API_HOST: str = "0.0.0.0"
    API_PORT: int = 8000
    # CORS: list of allowed origins
    CORS_ORIGINS: list = ["http://localhost:3000", "http://127.0.0.1:3000"]

    class Config:
        # A local ".env" file is declared as a settings source; names are case-sensitive
        env_file = ".env"
        case_sensitive = True


# Module-level settings instance, built once at import time
settings = Settings()

View File

@ -0,0 +1,7 @@
"""
ClickHouse connection — delegates to ja4_common shared client.
"""
from ja4_common.clickhouse import get_client as _get_client, ClickHouseClient
# Re-export for backward compatibility with existing route imports:
# `db` is the client returned by the shared ja4_common `get_client()` factory.
db: ClickHouseClient = _get_client()

View File

@ -0,0 +1,237 @@
"""
Bot Detector Dashboard - API Backend
FastAPI application pour servir le dashboard web
"""
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
import os
from .config import settings
from .database import db
from .routes import metrics, detections, variability, attributes, analysis, entities, incidents, audit, reputation, fingerprints
from .routes import bruteforce, tcp_spoofing, header_fingerprint, heatmap, botnets, rotation, ml_features, investigation_summary, search, clustering
# Logging configuration: INFO level, "timestamp - logger name - level - message" format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the startup/shutdown hooks and the static mount below
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifecycle hook.

    On startup, logs the ClickHouse target and pings it; a failed ping is
    logged and re-raised, which aborts startup. On shutdown, closes the
    shared `db` client.
    """
    # --- Startup ---
    logger.info("Démarrage du Bot Detector Dashboard API...")
    clickhouse_target = f"ClickHouse: {settings.CLICKHOUSE_HOST}:{settings.CLICKHOUSE_PORT}"
    logger.info(clickhouse_target)
    database_name = f"Database: {settings.CLICKHOUSE_DB}"
    logger.info(database_name)

    # Connectivity check: connect, then ping
    try:
        db.connect().ping()
    except Exception as exc:
        logger.error(f"Échec de connexion ClickHouse: {exc}")
        raise
    else:
        logger.info("Connexion ClickHouse établie avec succès")

    yield

    # --- Shutdown ---
    logger.info("Arrêt du Bot Detector Dashboard API...")
    db.close()
# FastAPI application setup: tag metadata (name + description) passed to
# FastAPI(openapi_tags=...) below.
# NOTE(review): tag names are matched literally (case-sensitive) against the
# `tags=[...]` declared on each router — verify every router uses the exact
# spelling listed here.
OPENAPI_TAGS = [
    {
        "name": "Metrics",
        "description": "Métriques globales : comptages, niveaux de menace, baseline et distribution des scores ML.",
    },
    {
        "name": "Detections",
        "description": "Liste paginée et filtrée des anomalies détectées par le modèle ML. Supporte tri, recherche texte et regroupement par IP.",
    },
    {
        "name": "investigation",
        "description": (
            "**Point d'entrée principal pour l'analyse d'une IP.** "
            "Agrège en un seul appel : score ML, brute-force, spoofing TCP, rotation JA4, persistance et timeline 24h. "
            "Retourne un `risk_score` heuristique de 0 à 100."
        ),
    },
    {
        "name": "Reputation",
        "description": "Réputation externe d'une IP via IP-API.com et IPinfo.io (sans clé API). Détecte proxies, VPN, Tor, hébergeurs.",
    },
    {
        "name": "Analysis",
        "description": "Analyses approfondies par IP : subnet, pays, empreintes JA4, user-agents, recommandation SOC et gestion des classifications.",
    },
    {
        "name": "Entities",
        "description": "Investigation par entité (IP, JA4, subnet, user-agent, host). Retourne détections associées, user-agents, chemins, paramètres et entités liées.",
    },
    {
        "name": "Incidents",
        "description": "Clusters d'incidents actifs regroupés par similarité comportementale. Permet la classification et le suivi des incidents.",
    },
    {
        "name": "Fingerprints",
        "description": "Analyse des empreintes JA4/TLS : spoofing, matrice JA4↔UA, user-agents suspects, cohérence par IP, JA4 légitimes et corrélation ASN.",
    },
    {
        "name": "Bruteforce",
        "description": "Détection des attaques brute-force : cibles, attaquants, timeline et détail par host.",
    },
    {
        "name": "TCP Spoofing",
        "description": "Détection du spoofing TCP/OS fingerprinting : vue d'ensemble, liste et matrice TTL×MSS.",
    },
    {
        "name": "Header Fingerprint",
        "description": "Clusters de fingerprints d'en-têtes HTTP suspects et IPs associées.",
    },
    {
        "name": "Heatmap",
        "description": "Heatmap horaire du trafic, top hosts et matrice activité/heure.",
    },
    {
        "name": "Botnets",
        "description": "Détection de botnets : spread JA4, distribution géographique par JA4, résumé global.",
    },
    {
        "name": "Rotation",
        "description": "Détection de la rotation JA4 (évasion de détection), menaces persistantes, historique JA4 par IP et score de sophistication.",
    },
    {
        "name": "ML Features",
        "description": "Données brutes du modèle ML : top anomalies, radar par IP, distribution des scores, tendances, features B et scatter plot.",
    },
    {
        "name": "Attributes",
        "description": "Listes des valeurs distinctes d'attributs (JA4, user-agents, ASN, pays…) avec comptages.",
    },
    {
        "name": "Variability",
        "description": "Variabilité comportementale : IPs par attribut, attributs par valeur, analyse des user-agents.",
    },
    {
        "name": "Clustering",
        "description": "Clustering K-Means des IPs sur les features ML. Statut du cache, clusters, points et IPs par cluster.",
    },
    {
        "name": "Search",
        "description": "Recherche rapide cross-entités (IP, JA4, host, user-agent, pays, ASN).",
    },
    {
        "name": "Audit",
        "description": "Journal d'audit SOC : création de logs, consultation filtrée, statistiques et activité par utilisateur.",
    },
]
# The FastAPI application: title/description/version and tag metadata feed the
# generated OpenAPI docs; `lifespan` runs the startup check and shutdown cleanup.
app = FastAPI(
    title="Bot Detector Dashboard API",
    description=(
        "API REST du **Bot Detector SOC Dashboard**.\n\n"
        "Permet d'interroger la base ClickHouse (`mabase_prod`) pour visualiser et analyser "
        "les détections de bots générées par le service `bot_detector_ai`.\n\n"
        "**Endpoint clé :** `GET /api/investigation/{ip}/summary` — synthèse complète en un appel.\n\n"
        "Documentation interactive : `/docs` (Swagger UI) · `/redoc` (ReDoc)"
    ),
    version="1.0.0",
    openapi_tags=OPENAPI_TAGS,
    lifespan=lifespan,
)
# CORS configuration: origins come from settings.CORS_ORIGINS; credentials are
# allowed, and every method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Route registration: each route module exposes a `router`; the registration
# order below is the same as the original one-call-per-module sequence.
for _route_module in (
    metrics,
    detections,
    variability,
    attributes,
    analysis,
    entities,
    incidents,
    audit,
    reputation,
    fingerprints,
    bruteforce,
    tcp_spoofing,
    header_fingerprint,
    heatmap,
    botnets,
    rotation,
    ml_features,
    investigation_summary,
    search,
    clustering,
):
    app.include_router(_route_module.router)
# Do not leave the loop variable behind as a module attribute
del _route_module
# Path to the frontend's index.html (used by serve_frontend and serve_spa):
# <this file's directory>/../frontend/dist/index.html
_FRONTEND_INDEX = os.path.join(os.path.dirname(__file__), "..", "frontend", "dist", "index.html")
# Root route: serves the built frontend
@app.get("/")
async def serve_frontend():
    """Serve the React application, or a JSON hint when the frontend is not built."""
    if not os.path.exists(_FRONTEND_INDEX):
        return {"message": "Dashboard API - Frontend non construit. Voir /docs pour l'API."}
    return FileResponse(_FRONTEND_INDEX)
# Serve static assets: mount <frontend>/dist/assets under /assets when the
# directory exists; a mount failure is only logged as a warning.
_assets_path = os.path.join(os.path.dirname(__file__), "..", "frontend", "dist", "assets")
if os.path.exists(_assets_path):
    try:
        app.mount("/assets", StaticFiles(directory=_assets_path), name="assets")
    except Exception as _e:
        logger.warning(f"Impossible de monter les assets statiques : {_e}")
# Health check
@app.get("/health")
async def health_check():
    """Health endpoint for the Docker health check.

    Returns:
        200 with ``{"status": "healthy", ...}`` when ClickHouse answers a ping;
        503 with ``{"status": "unhealthy", ...}`` (same JSON body as before)
        otherwise, so that probes which only look at the HTTP status code can
        actually detect the failure.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from fastapi.responses import JSONResponse

    try:
        db.connect().ping()
    except Exception as exc:
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "clickhouse": "disconnected", "error": str(exc)},
        )
    return {"status": "healthy", "clickhouse": "connected"}
# Catch-all route for SPA routing (React Router) - MUST BE REGISTERED LAST
# /api/* is excluded: those paths belong to the routers
@app.get("/{full_path:path}")
async def serve_spa(full_path: str):
    """Send every non-API path to index.html so the React router can handle it.

    Raises:
        HTTPException: 404 when the path starts with ``api/`` (no router matched it).
    """
    # Never answer API paths with the SPA shell
    if full_path.startswith("api/"):
        raise HTTPException(status_code=404, detail="API endpoint not found")

    if not os.path.exists(_FRONTEND_INDEX):
        return {"message": "Dashboard API - Frontend non construit"}
    return FileResponse(_FRONTEND_INDEX)
# Development entry point: start uvicorn with auto-reload on the configured host/port.
# NOTE(review): this module uses relative imports (`from .config import ...`),
# so the "main:app" import string presumably only works from a specific
# working directory/package layout — confirm how this is launched.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "main:app",
        host=settings.API_HOST,
        port=settings.API_PORT,
        reload=True
    )

View File

@ -0,0 +1,322 @@
"""
Modèles de données pour l'API
"""
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum
# String-valued enumeration of the four threat levels.
class ThreatLevel(str, Enum):
    CRITICAL = "CRITICAL"
    HIGH = "HIGH"
    MEDIUM = "MEDIUM"
    LOW = "LOW"
# ─────────────────────────────────────────────────────────────────────────────
# MÉTRIQUES
# ─────────────────────────────────────────────────────────────────────────────
# Aggregate counters: total detections, one count per threat level,
# known-bot and anomaly counts, and the number of unique IPs.
class MetricsSummary(BaseModel):
    total_detections: int
    critical_count: int
    high_count: int
    medium_count: int
    low_count: int
    known_bots_count: int
    anomalies_count: int
    unique_ips: int


# One time-series sample: a timestamp (`hour`) with a total and per-level counts.
class TimeSeriesPoint(BaseModel):
    hour: datetime
    total: int
    critical: int
    high: int
    medium: int
    low: int


# Metrics payload: summary counters, time series, and a name -> count distribution.
class MetricsResponse(BaseModel):
    summary: MetricsSummary
    timeseries: List[TimeSeriesPoint]
    threat_distribution: Dict[str, int]
# ─────────────────────────────────────────────────────────────────────────────
# DÉTECTIONS
# ─────────────────────────────────────────────────────────────────────────────
# One detection record. Fields without a default are required; the trailing
# fields (client_headers onward) are optional or default to empty strings.
class Detection(BaseModel):
    detected_at: datetime
    src_ip: str
    ja4: str
    host: str
    bot_name: str
    anomaly_score: float
    threat_level: str
    model_name: str
    recurrence: int
    asn_number: str
    asn_org: str
    asn_detail: str
    asn_domain: str
    country_code: str
    asn_label: str
    hits: int
    hit_velocity: float
    fuzzing_index: float
    post_ratio: float
    reason: str
    client_headers: str = ""
    asn_score: Optional[float] = None
    asn_rep_label: str = ""
    first_seen: Optional[datetime] = None
    last_seen: Optional[datetime] = None
    unique_ja4s: Optional[List[str]] = None
    unique_hosts: Optional[List[str]] = None
    anubis_bot_name: str = ""
    anubis_bot_action: str = ""
    anubis_bot_category: str = ""


# Paginated list of detections with paging metadata.
class DetectionsListResponse(BaseModel):
    items: List[Detection]
    total: int
    page: int
    page_size: int
    total_pages: int
# ─────────────────────────────────────────────────────────────────────────────
# VARIABILITÉ
# ─────────────────────────────────────────────────────────────────────────────
# One attribute value with its count and percentage, plus optional details.
class AttributeValue(BaseModel):
    value: str
    count: int
    percentage: float
    first_seen: Optional[datetime] = None
    last_seen: Optional[datetime] = None
    threat_levels: Optional[Dict[str, int]] = None
    unique_ips: Optional[int] = None
    primary_threat: Optional[str] = None


# Attribute values grouped by kind; every list defaults to empty.
class VariabilityAttributes(BaseModel):
    user_agents: List[AttributeValue] = Field(default_factory=list)
    ja4: List[AttributeValue] = Field(default_factory=list)
    countries: List[AttributeValue] = Field(default_factory=list)
    asns: List[AttributeValue] = Field(default_factory=list)
    hosts: List[AttributeValue] = Field(default_factory=list)
    threat_levels: List[AttributeValue] = Field(default_factory=list)
    model_names: List[AttributeValue] = Field(default_factory=list)


# A typed message attached to a variability response.
class Insight(BaseModel):
    type: str  # "warning", "info", "success"
    message: str


# Variability payload for one (type, value) pair.
class VariabilityResponse(BaseModel):
    type: str
    value: str
    total_detections: int
    unique_ips: int
    date_range: Dict[str, datetime]
    attributes: VariabilityAttributes
    insights: List[Insight] = Field(default_factory=list)
# ─────────────────────────────────────────────────────────────────────────────
# ATTRIBUTS UNIQUES
# ─────────────────────────────────────────────────────────────────────────────
# One distinct attribute value with its count.
class AttributeListItem(BaseModel):
    value: str
    count: int


# List of distinct values for one attribute type, with a total.
class AttributeListResponse(BaseModel):
    type: str
    items: List[AttributeListItem]
    total: int
# ─────────────────────────────────────────────────────────────────────────────
# USER-AGENTS
# ─────────────────────────────────────────────────────────────────────────────
# One user-agent value with count, percentage and optional first/last seen times.
class UserAgentValue(BaseModel):
    value: str
    count: int
    percentage: float
    first_seen: Optional[datetime] = None
    last_seen: Optional[datetime] = None


# User-agent list for one (type, value) pair; `total` and `showing` are separate counters.
class UserAgentsResponse(BaseModel):
    type: str
    value: str
    user_agents: List[UserAgentValue]
    total: int
    showing: int
# ─────────────────────────────────────────────────────────────────────────────
# CLASSIFICATIONS (SOC / ML)
# ─────────────────────────────────────────────────────────────────────────────
# String-valued enumeration of the three classification labels.
class ClassificationLabel(str, Enum):
    LEGITIMATE = "legitimate"
    SUSPICIOUS = "suspicious"
    MALICIOUS = "malicious"


# Fields shared by classification payloads. `ip` and `ja4` are both optional
# at the model level; `confidence` is constrained to [0.0, 1.0].
class ClassificationBase(BaseModel):
    ip: Optional[str] = None
    ja4: Optional[str] = None
    label: ClassificationLabel
    tags: List[str] = Field(default_factory=list)
    comment: str = ""
    confidence: float = Field(ge=0.0, le=1.0, default=0.5)
    analyst: str = "unknown"


class ClassificationCreate(ClassificationBase):
    """Payload for creating a classification"""
    features: dict = Field(default_factory=dict)


class Classification(ClassificationBase):
    """Full classification with metadata"""
    model_config = ConfigDict(from_attributes=True)
    created_at: datetime
    features: dict = Field(default_factory=dict)


# List of classifications with a total.
class ClassificationsListResponse(BaseModel):
    items: List[Classification]
    total: int
# ─────────────────────────────────────────────────────────────────────────────
# ANALYSIS (CORRELATION)
# ─────────────────────────────────────────────────────────────────────────────
class SubnetAnalysis(BaseModel):
    """Subnet/ASN analysis"""
    ip: str
    subnet: str
    ips_in_subnet: List[str]
    total_in_subnet: int
    asn_number: str
    asn_org: str
    total_in_asn: int
    alert: bool  # True when more than 10 IPs of the subnet


class CountryData(BaseModel):
    """Data for one country"""
    code: str
    name: str
    count: int
    percentage: float


class CountryAnalysis(BaseModel):
    """Country analysis"""
    top_countries: List[CountryData]
    baseline: dict  # Usual countries
    alert_country: Optional[str] = None  # Over-represented country


class JA4SubnetData(BaseModel):
    """Subnet for a JA4"""
    subnet: str
    count: int


class JA4Analysis(BaseModel):
    """JA4 analysis"""
    ja4: str
    shared_ips_count: int
    top_subnets: List[JA4SubnetData]
    other_ja4_for_ip: List[str]


class UserAgentData(BaseModel):
    """Data for one User-Agent"""
    value: str
    count: int
    percentage: float
    classification: str  # "normal", "bot", "script"


class UserAgentAnalysis(BaseModel):
    """User-Agent analysis"""
    ip_user_agents: List[UserAgentData]
    ja4_user_agents: List[UserAgentData]
    bot_percentage: float
    alert: bool  # True when > 20% bots/scripts


class CorrelationIndicators(BaseModel):
    """Correlation indicators"""
    subnet_ips_count: int
    asn_ips_count: int
    country_percentage: float
    ja4_shared_ips: int
    user_agents_count: int
    bot_ua_percentage: float


class ClassificationRecommendation(BaseModel):
    """Classification recommendation"""
    label: ClassificationLabel
    confidence: float
    indicators: CorrelationIndicators
    suggested_tags: List[str]
    reason: str
# ─────────────────────────────────────────────────────────────────────────────
# ENTITIES (UNIFIED VIEW)
# ─────────────────────────────────────────────────────────────────────────────
class EntityStats(BaseModel):
    """Statistics for one entity"""
    entity_type: str
    entity_value: str
    total_requests: int
    unique_ips: int
    first_seen: datetime
    last_seen: datetime


class EntityRelatedAttributes(BaseModel):
    """Attributes associated with an entity"""
    ips: List[str] = Field(default_factory=list)
    ja4s: List[str] = Field(default_factory=list)
    hosts: List[str] = Field(default_factory=list)
    asns: List[str] = Field(default_factory=list)
    countries: List[str] = Field(default_factory=list)


class EntityAttributeValue(BaseModel):
    """Attribute value with count and percentage (for entities)"""
    value: str
    count: int
    percentage: float


class EntityInvestigation(BaseModel):
    """Full investigation for one entity"""
    stats: EntityStats
    related: EntityRelatedAttributes
    user_agents: List[EntityAttributeValue] = Field(default_factory=list)
    client_headers: List[EntityAttributeValue] = Field(default_factory=list)
    paths: List[EntityAttributeValue] = Field(default_factory=list)
    query_params: List[EntityAttributeValue] = Field(default_factory=list)

View File

@ -0,0 +1 @@
# Routes package

View File

@ -0,0 +1,686 @@
"""
Endpoints pour l'analyse de corrélations et la classification SOC
"""
from collections import defaultdict
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
import ipaddress
import json
from ..database import db
from ..models import (
SubnetAnalysis, CountryAnalysis, CountryData, JA4Analysis, JA4SubnetData,
UserAgentAnalysis, UserAgentData, CorrelationIndicators,
ClassificationRecommendation, ClassificationLabel,
ClassificationCreate, Classification, ClassificationsListResponse
)
# Router for all /api/analysis endpoints. The tag is spelled exactly like the
# "Analysis" entry of the application's OpenAPI tag metadata: tag names are
# matched literally, and a lowercase "analysis" would show up in the docs as a
# separate tag without the description.
router = APIRouter(prefix="/api/analysis", tags=["Analysis"])
# ISO country code -> readable name (used by analyze_ip_country and analyze_country).
# Codes missing from this table are displayed as-is by those callers.
_COUNTRY_NAMES: dict[str, str] = {
    "CN": "China", "US": "United States", "DE": "Germany",
    "FR": "France", "RU": "Russia", "GB": "United Kingdom",
    "NL": "Netherlands", "IN": "India", "BR": "Brazil",
    "JP": "Japan", "KR": "South Korea", "IT": "Italy",
    "ES": "Spain", "CA": "Canada", "AU": "Australia"
}
# =============================================================================
# ANALYSE SUBNET / ASN
# =============================================================================
@router.get("/{ip}/subnet", response_model=SubnetAnalysis)
async def analyze_subnet(ip: str):
    """
    Analyse the IPs seen in the same /24 subnet and the same ASN as `ip`.

    Args:
        ip: IP address taken from the URL path.

    Returns:
        SubnetAnalysis with the subnet, the distinct IPs seen in it over the
        last 24 hours, the ASN number/organisation of the most recent
        detection for `ip` ("0"/"Unknown" when there is none), the number of
        unique IPs of that ASN over the last 24 hours, and `alert` set when
        more than 10 IPs share the subnet.

    Raises:
        HTTPException: 400 when `ip` is not a valid IP address,
            500 for any other failure.
    """
    try:
        # Validate up front so a malformed address is reported as a client
        # error (400) instead of surfacing as a generic 500.
        try:
            ipaddress.ip_address(ip)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Adresse IP invalide: {ip}") from None

        # Compute the /24 subnet (strict=False: host bits are allowed)
        # NOTE(review): the queries below use toIPv4(), so IPv6 input is
        # presumably unsupported and still ends up as a 500 — confirm.
        subnet = ipaddress.ip_network(f"{ip}/24", strict=False)
        subnet_str = str(subnet)

        # ASN information from the most recent detection for this IP
        asn_query = """
            SELECT asn_number, asn_org
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
        """
        asn_result = db.query(asn_query, {"ip": ip})
        if not asn_result.result_rows:
            # Fallback: default values when the IP has no detection
            asn_number = "0"
            asn_org = "Unknown"
        else:
            asn_number = str(asn_result.result_rows[0][0] or "0")
            asn_org = asn_result.result_rows[0][1] or "Unknown"

        # Distinct IPs of the same /24 subnet over the last 24 hours
        subnet_ips_query = """
            SELECT DISTINCT src_ip
            FROM ml_detected_anomalies
            WHERE toIPv4(src_ip) >= toIPv4(%(subnet_start)s)
              AND toIPv4(src_ip) <= toIPv4(%(subnet_end)s)
              AND detected_at >= now() - INTERVAL 24 HOUR
            ORDER BY src_ip
        """
        subnet_result = db.query(subnet_ips_query, {
            "subnet_start": str(subnet.network_address),
            "subnet_end": str(subnet.broadcast_address)
        })
        subnet_ips = [str(row[0]) for row in subnet_result.result_rows]

        # Unique IPs of the same ASN (skipped when the ASN is unknown)
        if asn_number != "0":
            asn_total_query = """
                SELECT uniq(src_ip)
                FROM ml_detected_anomalies
                WHERE asn_number = %(asn_number)s
                  AND detected_at >= now() - INTERVAL 24 HOUR
            """
            asn_total_result = db.query(asn_total_query, {"asn_number": asn_number})
            asn_total = asn_total_result.result_rows[0][0] if asn_total_result.result_rows else 0
        else:
            asn_total = 0

        return SubnetAnalysis(
            ip=ip,
            subnet=subnet_str,
            ips_in_subnet=subnet_ips,
            total_in_subnet=len(subnet_ips),
            asn_number=asn_number,
            asn_org=asn_org,
            total_in_asn=asn_total,
            alert=len(subnet_ips) > 10
        )
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) untouched
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{ip}/country", response_model=dict)
async def analyze_ip_country(ip: str):
    """
    Report the country of one IP and the country breakdown of its ASN.

    Returns:
        ``{"ip_country": {"code", "name"}, "asn_countries": [...]}`` where each
        ASN entry carries code, name, count and percentage (last 24 hours);
        ``{"ip_country": None, "asn_countries": []}`` when the IP has no detection.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Country and ASN from the most recent detection of this IP
        latest_sql = """
            SELECT country_code, asn_number
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
        """
        latest_rows = db.query(latest_sql, {"ip": ip}).result_rows
        if not latest_rows:
            return {"ip_country": None, "asn_countries": []}

        country_code = latest_rows[0][0]
        asn = latest_rows[0][1]

        # Per-country hit counts for the same ASN
        breakdown_sql = """
            SELECT
                country_code,
                count() AS count
            FROM ml_detected_anomalies
            WHERE asn_number = %(asn_number)s
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY country_code
            ORDER BY count DESC
        """
        breakdown_rows = db.query(breakdown_sql, {"asn_number": asn}).result_rows
        grand_total = sum(row[1] for row in breakdown_rows)

        asn_countries = []
        for row in breakdown_rows:
            code, hits = row[0], row[1]
            if grand_total > 0:
                share = round((hits / grand_total * 100), 2)
            else:
                share = 0.0
            asn_countries.append({
                "code": code,
                "name": _COUNTRY_NAMES.get(code, code),
                "count": hits,
                "percentage": share,
            })

        ip_country = {
            "code": country_code,
            "name": _COUNTRY_NAMES.get(country_code, country_code),
        }
        return {"ip_country": ip_country, "asn_countries": asn_countries}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE PAYS
# =============================================================================
@router.get("/country", response_model=CountryAnalysis)
async def analyze_country(days: int = Query(1, ge=1, le=30)):
    """
    Analyse the distribution of detections per country.

    Args:
        days: look-back window in days for the top-countries list.

    Returns:
        CountryAnalysis with the per-country shares over `days`, a 7-day
        baseline (code -> percentage), and the first country whose share
        exceeds both twice its baseline and 30%, if any.

    Raises:
        HTTPException: 500 on any failure.
    """
    def _share(hits, total):
        # Percentage rounded to 2 decimals; 0.0 when there is nothing to divide by
        return round((hits / total * 100), 2) if total > 0 else 0.0

    try:
        # Countries over the requested window
        window_sql = """
            SELECT
                country_code,
                count() AS count
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(days)s DAY
              AND country_code != '' AND country_code IS NOT NULL
            GROUP BY country_code
            ORDER BY count DESC
        """
        window_rows = db.query(window_sql, {"days": days}).result_rows
        window_total = sum(row[1] for row in window_rows)
        top_countries = []
        for row in window_rows:
            top_countries.append(CountryData(
                code=row[0],
                name=_COUNTRY_NAMES.get(row[0], row[0]),
                count=row[1],
                percentage=_share(row[1], window_total),
            ))

        # Baseline (last 7 days)
        reference_sql = """
            SELECT
                country_code,
                count() AS count
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 7 DAY
              AND country_code != '' AND country_code IS NOT NULL
            GROUP BY country_code
            ORDER BY count DESC
        """
        reference_rows = db.query(reference_sql).result_rows
        reference_total = sum(row[1] for row in reference_rows)
        baseline = {}
        for row in reference_rows:
            baseline[row[0]] = _share(row[1], reference_total)

        # Over-represented country: first match in ranking order
        alert_country = None
        for entry in top_countries:
            usual_share = baseline.get(entry.code, 0)
            if usual_share <= 0:
                continue
            if entry.percentage > usual_share * 2 and entry.percentage > 30:
                alert_country = entry.code
                break

        return CountryAnalysis(
            top_countries=top_countries,
            baseline=baseline,
            alert_country=alert_country
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE JA4
# =============================================================================
@router.get("/{ip}/ja4", response_model=JA4Analysis)
async def analyze_ja4(ip: str):
    """
    Analyse the JA4 fingerprint most recently seen for an IP.

    Returns:
        JA4Analysis with the fingerprint, the number of unique IPs sharing it
        over the last 24 hours, the ten /24 subnets with the most hits for it,
        and the other fingerprints recorded for the same IP. All fields are
        empty/zero when the IP has no non-empty JA4.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Most recent non-empty JA4 of this IP
        current_sql = """
            SELECT ja4
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
              AND ja4 != '' AND ja4 IS NOT NULL
            ORDER BY detected_at DESC
            LIMIT 1
        """
        current_rows = db.query(current_sql, {"ip": ip}).result_rows
        if not current_rows:
            return JA4Analysis(
                ja4="",
                shared_ips_count=0,
                top_subnets=[],
                other_ja4_for_ip=[]
            )
        fingerprint = current_rows[0][0]

        # Unique IPs presenting the same JA4
        sharing_sql = """
            SELECT uniq(src_ip)
            FROM ml_detected_anomalies
            WHERE ja4 = %(ja4)s
              AND detected_at >= now() - INTERVAL 24 HOUR
        """
        sharing_rows = db.query(sharing_sql, {"ja4": fingerprint}).result_rows
        if sharing_rows:
            shared_count = sharing_rows[0][0]
        else:
            shared_count = 0

        # Hits per source IP for this JA4 (folded into /24 subnets below)
        per_ip_sql = """
            SELECT
                src_ip,
                count() AS count
            FROM ml_detected_anomalies
            WHERE ja4 = %(ja4)s
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
            ORDER BY count DESC
        """
        per_ip_rows = db.query(per_ip_sql, {"ja4": fingerprint}).result_rows

        # Group by /24; addresses that are not four dot-separated parts are ignored
        hits_per_subnet = defaultdict(int)
        for row in per_ip_rows:
            octets = str(row[0]).split('.')
            if len(octets) != 4:
                continue
            hits_per_subnet[".".join(octets[:3]) + ".0/24"] += row[1]

        ranked = sorted(hits_per_subnet.items(), key=lambda item: item[1], reverse=True)
        top_subnets = [
            JA4SubnetData(subnet=name, count=hits)
            for name, hits in ranked[:10]
        ]

        # Other JA4 values recorded for this IP
        others_sql = """
            SELECT DISTINCT ja4
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
              AND ja4 != '' AND ja4 IS NOT NULL
              AND ja4 != %(current_ja4)s
        """
        others_rows = db.query(others_sql, {"ip": ip, "current_ja4": fingerprint}).result_rows
        other_fingerprints = []
        for row in others_rows:
            other_fingerprints.append(row[0])

        return JA4Analysis(
            ja4=fingerprint,
            shared_ips_count=shared_count,
            top_subnets=top_subnets,
            other_ja4_for_ip=other_fingerprints
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE USER-AGENTS
# =============================================================================
@router.get("/{ip}/user-agents", response_model=UserAgentAnalysis)
async def analyze_user_agents(ip: str):
    """
    Analyse the User-Agents sent by an IP over the last 24 hours.

    Returns:
        UserAgentAnalysis with one entry per User-Agent (count, share of all
        requests of the IP, keyword-based classification), the percentage of
        requests classified 'bot' or 'script', and `alert` when that
        percentage exceeds 20. `ja4_user_agents` currently mirrors
        `ip_user_agents`.

    Raises:
        HTTPException: 500 on any failure.
    """
    bot_markers = ('bot', 'crawler', 'spider', 'curl', 'wget', 'python', 'requests', 'scrapy')
    script_markers = ('python', 'java', 'php', 'ruby', 'perl', 'node')

    def classify_ua(ua: str) -> str:
        # Keyword classification; bot markers are checked before script markers
        lowered = ua.lower()
        for marker in bot_markers:
            if marker in lowered:
                return 'bot'
        for marker in script_markers:
            if marker in lowered:
                return 'script'
        if not ua.strip():
            return 'script'
        return 'normal'

    try:
        # Non-empty User-Agents of this IP (from http_logs)
        per_ua_sql = """
            SELECT
                header_user_agent AS ua,
                count() AS count
            FROM mabase_prod.http_logs
            WHERE src_ip = %(ip)s
              AND header_user_agent != '' AND header_user_agent IS NOT NULL
              AND time >= now() - INTERVAL 24 HOUR
            GROUP BY ua
            ORDER BY count DESC
        """
        # Real number of requests (denominator of the percentages)
        all_requests_sql = """
            SELECT count()
            FROM mabase_prod.http_logs
            WHERE src_ip = %(ip)s
              AND time >= now() - INTERVAL 24 HOUR
        """
        ua_rows = db.query(per_ua_sql, {"ip": ip}).result_rows
        total_rows = db.query(all_requests_sql, {"ip": ip}).result_rows

        # Fall back to the sum of the per-UA counts when the total is 0
        total_count = total_rows[0][0] if total_rows else 0
        if total_count == 0:
            total_count = sum(row[1] for row in ua_rows)

        ip_user_agents = []
        for row in ua_rows:
            if total_count > 0:
                share = round((row[1] / total_count * 100), 2)
            else:
                share = 0.0
            ip_user_agents.append(UserAgentData(
                value=row[0],
                count=row[1],
                percentage=share,
                classification=classify_ua(row[0]),
            ))

        # For now the JA4 user agents are the same list as the IP ones
        ja4_user_agents = ip_user_agents

        # Share of requests whose UA is classified bot or script
        automated_hits = sum(
            entry.count
            for entry in ip_user_agents
            if entry.classification in ['bot', 'script']
        )
        bot_percentage = (automated_hits / total_count * 100) if total_count > 0 else 0

        return UserAgentAnalysis(
            ip_user_agents=ip_user_agents,
            ja4_user_agents=ja4_user_agents,
            bot_percentage=bot_percentage,
            alert=bot_percentage > 20
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# RECOMMANDATION DE CLASSIFICATION
# =============================================================================
@router.get("/{ip}/recommendation", response_model=ClassificationRecommendation)
async def get_classification_recommendation(ip: str):
    """
    Build a classification recommendation for an IP from the correlation analyses.

    The subnet, country (1 day), JA4 and user-agent analyses are each run
    best-effort: one that fails is treated as absent. A score is accumulated
    from the indicators that fire (subnet > 10 IPs: 0.25, JA4 shared by > 50
    IPs: 0.25, bot/script UAs > 20%: 0.25, over-represented country: 0.15,
    hosting ASN keyword: 0.10). A score >= 0.7 gives "malicious", >= 0.4
    "suspicious", anything lower "legitimate".

    Raises:
        HTTPException: 500 on any failure outside the best-effort analyses.
    """
    async def _best_effort(pending):
        # Await one analysis; any failure yields None instead of propagating
        try:
            return await pending
        except Exception:
            return None

    try:
        # Analyses are awaited one after the other, in this order
        subnet_info = await _best_effort(analyze_subnet(ip))
        country_info = await _best_effort(analyze_country(1))
        ja4_info = await _best_effort(analyze_ja4(ip))
        ua_info = await _best_effort(analyze_user_agents(ip))

        # Indicators, with defaults for missing analyses
        indicators = CorrelationIndicators(
            subnet_ips_count=subnet_info.total_in_subnet if subnet_info else 0,
            asn_ips_count=subnet_info.total_in_asn if subnet_info else 0,
            country_percentage=0.0,
            ja4_shared_ips=ja4_info.shared_ips_count if ja4_info else 0,
            user_agents_count=len(ua_info.ja4_user_agents) if ua_info else 0,
            bot_ua_percentage=ua_info.bot_percentage if ua_info else 0.0
        )

        # Confidence score and its supporting evidence
        score = 0.0
        reasons = []
        tags = []

        if subnet_info and subnet_info.total_in_subnet > 10:
            # More than 10 IPs in the subnet
            score += 0.25
            reasons.append(f"{subnet_info.total_in_subnet} IPs du même subnet")
            tags.append("distributed")

        if ja4_info and ja4_info.shared_ips_count > 50:
            # JA4 shared by more than 50 IPs
            score += 0.25
            reasons.append(f"{ja4_info.shared_ips_count} IPs avec même JA4")
            tags.append("ja4-rotation")

        if ua_info and ua_info.bot_percentage > 20:
            # More than 20% bot/script user agents
            score += 0.25
            reasons.append(f"{ua_info.bot_percentage:.0f}% UAs bots/scripts")
            tags.append("bot-ua")

        if country_info and country_info.alert_country:
            # Over-represented country
            flagged = country_info.alert_country
            score += 0.15
            reasons.append(f"Pays {flagged} surreprésenté")
            tags.append(f"country-{flagged.lower()}")

        if subnet_info:
            # Hosting ASN: keyword match on the organisation name (adds no reason text)
            hosting_keywords = ["ovh", "amazon", "aws", "google", "azure", "digitalocean", "linode", "vultr", "china169", "chinamobile"]
            org_lower = (subnet_info.asn_org or "").lower()
            if any(keyword in org_lower for keyword in hosting_keywords):
                score += 0.10
                tags.append("hosting-asn")

        # Map the score to a label
        if score < 0.4:
            label = ClassificationLabel.LEGITIMATE
        elif score < 0.7:
            label = ClassificationLabel.SUSPICIOUS
        else:
            label = ClassificationLabel.MALICIOUS
            tags.append("campaign")

        if reasons:
            reason = " | ".join(reasons)
        else:
            reason = "Aucun indicateur fort"

        return ClassificationRecommendation(
            label=label,
            confidence=min(score, 1.0),
            indicators=indicators,
            suggested_tags=tags,
            reason=reason
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# CLASSIFICATIONS CRUD
# =============================================================================
@router.post("/classifications", response_model=Classification)
async def create_classification(data: ClassificationCreate):
    """
    Create a classification for an IP or a JA4.

    The row is inserted, then the most recent classification for the same
    entity is read back and returned. The lookup uses the IP when one is
    given, the JA4 otherwise.

    Args:
        data: classification payload; at least one of `ip` / `ja4` is required.

    Raises:
        HTTPException: 400 when neither `ip` nor `ja4` is provided,
            404 when the inserted row cannot be read back,
            500 for any other failure.
    """
    try:
        # Validation: either ip or ja4 must be provided
        if not data.ip and not data.ja4:
            raise HTTPException(status_code=400, detail="IP ou JA4 requis")

        query = """
            INSERT INTO mabase_prod.classifications
            (ip, ja4, label, tags, comment, confidence, features, analyst, created_at)
            VALUES
            (%(ip)s, %(ja4)s, %(label)s, %(tags)s, %(comment)s, %(confidence)s, %(features)s, %(analyst)s, now())
        """
        db.query(query, {
            "ip": data.ip or "",
            "ja4": data.ja4 or "",
            "label": data.label.value,
            "tags": data.tags,
            "comment": data.comment,
            "confidence": data.confidence,
            "features": json.dumps(data.features),
            "analyst": data.analyst
        })

        # Read back the classification that was just created.
        # The WHERE fragment is one of two fixed strings; the value itself is
        # passed as a bound parameter.
        where_clause = "ip = %(entity)s" if data.ip else "ja4 = %(entity)s"
        select_query = f"""
            SELECT ip, ja4, label, tags, comment, confidence, features, analyst, created_at
            FROM mabase_prod.classifications
            WHERE {where_clause}
            ORDER BY created_at DESC
            LIMIT 1
        """
        result = db.query(select_query, {"entity": data.ip or data.ja4})
        if not result.result_rows:
            raise HTTPException(status_code=404, detail="Classification non trouvée")

        row = result.result_rows[0]
        return Classification(
            ip=row[0] or None,
            ja4=row[1] or None,
            label=ClassificationLabel(row[2]),
            tags=row[3],
            comment=row[4],
            confidence=row[5],
            features=json.loads(row[6]) if row[6] else {},
            analyst=row[7],
            created_at=row[8]
        )
    except HTTPException:
        # Let the deliberate 400/404 through; without this clause the generic
        # handler below would turn them into 500 responses.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/classifications", response_model=ClassificationsListResponse)
async def list_classifications(
    ip: Optional[str] = Query(None, description="Filtrer par IP"),
    ja4: Optional[str] = Query(None, description="Filtrer par JA4"),
    label: Optional[str] = Query(None, description="Filtrer par label"),
    limit: int = Query(100, ge=1, le=1000)
):
    """
    List stored classifications, newest first.

    Each of ``ip``, ``ja4`` and ``label`` adds an equality filter when it is
    non-empty. ``limit`` caps the number of returned items; ``total`` is the
    count of all rows matching the same filters.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        conditions = ["1=1"]
        bind = {"limit": limit}
        # Column names and parameter names are identical for the three filters.
        for column, wanted in (("ip", ip), ("ja4", ja4), ("label", label)):
            if wanted:
                conditions.append(f"{column} = %({column})s")
                bind[column] = wanted
        where_clause = " AND ".join(conditions)

        query = f"""
            SELECT ip, ja4, label, tags, comment, confidence, features, analyst, created_at
            FROM mabase_prod.classifications
            WHERE {where_clause}
            ORDER BY created_at DESC
            LIMIT %(limit)s
        """
        page = db.query(query, bind)

        classifications = []
        for record in page.result_rows:
            raw_features = record[6]
            classifications.append(
                Classification(
                    # Empty strings are reported as None.
                    ip=record[0] or None,
                    ja4=record[1] or None,
                    label=ClassificationLabel(record[2]),
                    tags=record[3],
                    comment=record[4],
                    confidence=record[5],
                    features=json.loads(raw_features) if raw_features else {},
                    analyst=record[7],
                    created_at=record[8]
                )
            )

        # Total number of rows matching the same filters (ignores the limit).
        count_query = f"""
            SELECT count()
            FROM mabase_prod.classifications
            WHERE {where_clause}
        """
        counted = db.query(count_query, bind)
        if counted.result_rows:
            total = counted.result_rows[0][0]
        else:
            total = 0

        return ClassificationsListResponse(
            items=classifications,
            total=total
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/classifications/stats")
async def get_classification_stats():
    """
    Per-label classification statistics.

    Returns:
        ``{"stats": [...]}`` with, for each label, the row count, the number
        of distinct IPs and the average confidence, ordered by count
        descending.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        stats_query = """
            SELECT
            label,
            count() AS total,
            uniq(ip) AS unique_ips,
            avg(confidence) AS avg_confidence
            FROM mabase_prod.classifications
            GROUP BY label
            ORDER BY total DESC
        """
        rows = db.query(stats_query).result_rows

        stats = []
        for record in rows:
            mean_confidence = record[3]
            stats.append({
                "label": record[0],
                "total": record[1],
                "unique_ips": record[2],
                # A falsy average (NULL or 0) is reported as 0.0.
                "avg_confidence": float(mean_confidence) if mean_confidence else 0.0
            })
        return {"stats": stats}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,92 @@
"""
Endpoints pour la liste des attributs uniques
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..models import AttributeListResponse, AttributeListItem
# Router for the attribute-listing endpoints; every route is served under /api/attributes.
router = APIRouter(prefix="/api/attributes", tags=["attributes"])
@router.get("/{attr_type}", response_model=AttributeListResponse)
async def get_attributes(
    attr_type: str,
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum de résultats")
):
    """
    Return the distinct values of one attribute type with their counts.

    Values come from ``ml_detected_anomalies`` rows detected in the last
    24 hours, most frequent first, at most ``limit`` entries.

    Raises:
        HTTPException: 400 when ``attr_type`` is not a supported type,
            500 on any other failure.
    """
    try:
        # Supported attribute types and the column each one maps to. Only
        # these fixed column names are ever interpolated into the SQL below.
        type_column_map = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
            "threat_level": "threat_level",
            "model_name": "model_name",
            "asn_org": "asn_org"
        }
        column = type_column_map.get(attr_type)
        if column is None:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(type_column_map.keys())}"
            )

        if attr_type == "ip":
            # IP columns cannot be compared to '' directly, so the value is
            # converted to a string in a sub-query and filtered outside it.
            query = f"""
                SELECT value, count FROM (
                SELECT toString({column}) AS value, count() AS count
                FROM ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL 24 HOUR
                GROUP BY {column}
                )
                WHERE value != '' AND value IS NOT NULL
                ORDER BY count DESC
                LIMIT %(limit)s
            """
        else:
            # Base query, then a filter that drops empty / NULL values.
            base_query = f"""
                SELECT
                {column} AS value,
                count() AS count
                FROM ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL 24 HOUR
            """
            query = f"""
                {base_query}
                AND {column} != '' AND {column} IS NOT NULL
                GROUP BY value
                ORDER BY count DESC
                LIMIT %(limit)s
            """

        fetched = db.query(query, {"limit": limit})
        items = []
        for record in fetched.result_rows:
            items.append(AttributeListItem(value=str(record[0]), count=record[1]))

        return AttributeListResponse(
            type=attr_type,
            items=items,
            total=len(items)
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,238 @@
"""
Routes pour l'audit et les logs d'activité
"""
import logging
from fastapi import APIRouter, HTTPException, Query, Request
from typing import Optional
from datetime import datetime
from ..database import db
# Router for the audit endpoints; every route is served under /api/audit.
router = APIRouter(prefix="/api/audit", tags=["audit"])
# Module-level logger, used to report audit inserts that could not be written.
logger = logging.getLogger(__name__)
@router.post("/logs")
async def create_audit_log(
    request: Request,
    action: str,
    entity_type: Optional[str] = None,
    entity_id: Optional[str] = None,
    entity_count: Optional[int] = None,
    details: Optional[dict] = None,
    user: Optional[str] = "soc_user"
):
    """
    Create an audit log entry for a user action.

    The insert is best effort: if it fails (for example because the
    ``audit_logs`` table has not been created yet) a warning is logged and
    the caller still receives a success payload.

    Args:
        request: Incoming request, used to read the client IP.
        action: Name of the audited action.
        entity_type: Optional type of the entity concerned.
        entity_id: Optional identifier of the entity concerned.
        entity_count: Optional number of entities concerned.
        details: Optional free-form details, stored as a JSON string.
        user: Name of the acting user.

    Returns:
        A dict with ``status``, ``message``, ``action`` and the ISO-formatted
        ``timestamp`` used for the entry.

    Raises:
        HTTPException: 500 on any unexpected failure outside the insert.
    """
    import json  # local import: keeps this block self-contained

    try:
        # Client IP, when the transport exposes it.
        client_ip = request.client.host if request.client else "unknown"

        # Insert into ClickHouse.
        insert_query = """
            INSERT INTO mabase_prod.audit_logs
            (timestamp, user_name, action, entity_type, entity_id, entity_count, details, client_ip)
            VALUES
            (%(timestamp)s, %(user)s, %(action)s, %(entity_type)s, %(entity_id)s, %(entity_count)s, %(details)s, %(client_ip)s)
        """
        params = {
            'timestamp': datetime.now(),
            'user': user,
            'action': action,
            'entity_type': entity_type,
            'entity_id': entity_id,
            'entity_count': entity_count,
            # Serialise as JSON rather than str(dict): the Python repr uses
            # single quotes and cannot be parsed back by JSON consumers.
            'details': json.dumps(details) if details else '',
            'client_ip': client_ip
        }

        # Note: this requires the audit_logs table to exist
        # (see deploy_audit_logs_table.sql).
        try:
            db.query(insert_query, params)
        except Exception as e:
            # The table may not exist yet: log the problem but do not block
            # the caller.
            logger.warning("Could not insert audit log: %s", e)

        return {
            "status": "success",
            "message": "Audit log created",
            "action": action,
            "timestamp": params['timestamp'].isoformat()
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/logs")
async def get_audit_logs(
    hours: int = Query(24, ge=1, le=720, description="Fenêtre temporelle en heures"),
    user: Optional[str] = Query(None, description="Filtrer par utilisateur"),
    action: Optional[str] = Query(None, description="Filtrer par action"),
    entity_type: Optional[str] = Query(None, description="Filtrer par type d'entité"),
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum de résultats")
):
    """
    Return audit log entries from the last ``hours`` hours, newest first.

    ``user``, ``action`` and ``entity_type`` each add an equality filter when
    non-empty; ``limit`` caps the number of entries.

    Returns:
        ``{"items": [...], "total": <len(items)>, "period_hours": hours}``.
        When the audit table does not exist yet, an empty result carrying a
        ``warning`` key is returned instead of an error.

    Raises:
        HTTPException: 500 on any other failure.
    """
    try:
        where_clauses = ["timestamp >= now() - INTERVAL %(hours)s HOUR"]
        params = {"hours": hours, "limit": limit}
        if user:
            where_clauses.append("user_name = %(user)s")
            params["user"] = user
        if action:
            where_clauses.append("action = %(action)s")
            params["action"] = action
        if entity_type:
            where_clauses.append("entity_type = %(entity_type)s")
            params["entity_type"] = entity_type
        where_clause = " AND ".join(where_clauses)

        query = f"""
            SELECT
            timestamp,
            user_name,
            action,
            entity_type,
            entity_id,
            entity_count,
            details,
            client_ip
            FROM mabase_prod.audit_logs
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT %(limit)s
        """
        result = db.query(query, params)

        # NULL / empty columns are normalised to "" (or 0 for the count).
        logs = [
            {
                "timestamp": row[0].isoformat() if row[0] else "",
                "user_name": row[1] or "",
                "action": row[2] or "",
                "entity_type": row[3] or "",
                "entity_id": row[4] or "",
                "entity_count": row[5] or 0,
                "details": row[6] or "",
                "client_ip": row[7] or ""
            }
            for row in result.result_rows
        ]
        return {
            "items": logs,
            "total": len(logs),
            "period_hours": hours
        }
    except Exception as e:
        # If the table doesn't exist, return an empty result. The wording of
        # the server message differs between ClickHouse versions ("doesn't
        # exist" vs "does not exist"), so the UNKNOWN_TABLE error name is
        # accepted as well.
        message = str(e)
        table_missing = "UNKNOWN_TABLE" in message or (
            "Table" in message
            and ("doesn't exist" in message or "does not exist" in message)
        )
        if table_missing:
            return {
                "items": [],
                "total": 0,
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/stats")
async def get_audit_stats(
    hours: int = Query(24, ge=1, le=720)
):
    """
    Per-action audit statistics over the last ``hours`` hours.

    Returns:
        ``{"items": [...], "period_hours": hours}`` where each item holds the
        action, its occurrence count, the number of distinct users and the
        sum of ``entity_count``, ordered by count descending. When the audit
        table does not exist yet, an empty result carrying a ``warning`` key
        is returned instead of an error.

    Raises:
        HTTPException: 500 on any other failure.
    """
    try:
        query = """
            SELECT
            action,
            count() AS count,
            uniq(user_name) AS unique_users,
            sum(entity_count) AS total_entities
            FROM mabase_prod.audit_logs
            WHERE timestamp >= now() - INTERVAL %(hours)s HOUR
            GROUP BY action
            ORDER BY count DESC
        """
        result = db.query(query, {"hours": hours})

        stats = [
            {
                "action": row[0] or "",
                "count": row[1] or 0,
                "unique_users": row[2] or 0,
                "total_entities": row[3] or 0
            }
            for row in result.result_rows
        ]
        return {
            "items": stats,
            "period_hours": hours
        }
    except Exception as e:
        # A missing table is reported as an empty result. The server wording
        # differs between ClickHouse versions ("doesn't exist" vs "does not
        # exist"), so the UNKNOWN_TABLE error name is accepted as well.
        message = str(e)
        table_missing = "UNKNOWN_TABLE" in message or (
            "Table" in message
            and ("doesn't exist" in message or "does not exist" in message)
        )
        if table_missing:
            return {
                "items": [],
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/users/activity")
async def get_user_activity(
    hours: int = Query(24, ge=1, le=720)
):
    """
    Per-user activity over the last ``hours`` hours.

    Returns:
        ``{"items": [...], "period_hours": hours}`` where each item holds the
        user name, the number of actions, the number of distinct action types
        and the ISO timestamps of the first and last action, ordered by
        number of actions descending. When the audit table does not exist
        yet, an empty result carrying a ``warning`` key is returned instead
        of an error.

    Raises:
        HTTPException: 500 on any other failure.
    """
    try:
        query = """
            SELECT
            user_name,
            count() AS actions,
            uniq(action) AS action_types,
            min(timestamp) AS first_action,
            max(timestamp) AS last_action
            FROM mabase_prod.audit_logs
            WHERE timestamp >= now() - INTERVAL %(hours)s HOUR
            GROUP BY user_name
            ORDER BY actions DESC
        """
        result = db.query(query, {"hours": hours})

        users = [
            {
                "user_name": row[0] or "",
                "actions": row[1] or 0,
                "action_types": row[2] or 0,
                "first_action": row[3].isoformat() if row[3] else "",
                "last_action": row[4].isoformat() if row[4] else ""
            }
            for row in result.result_rows
        ]
        return {
            "items": users,
            "period_hours": hours
        }
    except Exception as e:
        # A missing table is reported as an empty result. The server wording
        # differs between ClickHouse versions ("doesn't exist" vs "does not
        # exist"), so the UNKNOWN_TABLE error name is accepted as well.
        message = str(e)
        table_missing = "UNKNOWN_TABLE" in message or (
            "Table" in message
            and ("doesn't exist" in message or "does not exist" in message)
        )
        if table_missing:
            return {
                "items": [],
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,105 @@
"""
Endpoints pour l'analyse des botnets via la propagation des fingerprints JA4
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
# Router for the botnet-analysis endpoints; every route is served under /api/botnets.
router = APIRouter(prefix="/api/botnets", tags=["botnets"])
def _botnet_class(unique_countries: int) -> str:
if unique_countries > 100:
return "global_botnet"
if unique_countries > 20:
return "regional_botnet"
return "concentrated"
@router.get("/ja4-spread")
async def get_ja4_spread():
    """Spread of JA4 fingerprints across countries and IPs.

    Reads every row of ``mabase_prod.view_host_ja4_anomalies`` (ordered by
    number of countries, descending) and adds two derived fields per row:
    ``distribution_score`` (countries divided by the square root of the IP
    count, rounded to 2 decimals) and ``botnet_class``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
            ja4,
            unique_ips,
            unique_countries,
            targeted_hosts
            FROM mabase_prod.view_host_ja4_anomalies
            ORDER BY unique_countries DESC
        """
        fetched = db.query(sql)

        items = []
        for record in fetched.result_rows:
            ip_total = int(record[1])
            country_total = int(record[2])
            # The 0.001 floor avoids a division by zero when there are no IPs.
            denominator = max(ip_total ** 0.5, 0.001)
            items.append({
                "ja4": str(record[0]),
                "unique_ips": ip_total,
                "unique_countries": country_total,
                "targeted_hosts": int(record[3]),
                "distribution_score": round(country_total / denominator, 2),
                "botnet_class": _botnet_class(country_total),
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ja4/{ja4}/countries")
async def get_ja4_countries(ja4: str, limit: int = Query(30, ge=1, le=200)):
    """Top countries for a given JA4, read from ``agg_host_ip_ja4_1h``.

    Returns at most ``limit`` countries ordered by distinct IP count
    (descending), each with its distinct IP count and total hits.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
            src_country_code AS country_code,
            uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
            sum(hits) AS hits
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE ja4 = %(ja4)s
            GROUP BY src_country_code
            ORDER BY unique_ips DESC
            LIMIT %(limit)s
        """
        fetched = db.query(sql, {"ja4": ja4, "limit": limit})

        items = []
        for record in fetched.result_rows:
            items.append({
                "country_code": str(record[0]),
                "unique_ips": int(record[1]),
                "hits": int(record[2]),
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/summary")
async def get_botnets_summary():
    """Global statistics about detected botnets.

    Aggregates ``mabase_prod.view_host_ja4_anomalies`` into a single row:
    the number of fingerprints seen in more than 100 countries, the IP total
    of fingerprints seen in more than 50 countries, and the JA4 with the most
    countries / the most IPs.

    Raises:
        HTTPException: 500 on any failure (including an empty result set).
    """
    try:
        # NOTE(review): the two aggregates use different country thresholds
        # (100 vs 50) — confirm this is intended.
        sql = """
            SELECT
            countIf(unique_countries > 100) AS total_global_botnets,
            sumIf(unique_ips, unique_countries > 50) AS total_ips_in_botnets,
            argMax(ja4, unique_countries) AS most_spread_ja4,
            argMax(ja4, unique_ips) AS most_ips_ja4
            FROM mabase_prod.view_host_ja4_anomalies
        """
        summary = db.query(sql).result_rows[0]
        payload = {}
        payload["total_global_botnets"] = int(summary[0])
        payload["total_ips_in_botnets"] = int(summary[1])
        payload["most_spread_ja4"] = str(summary[2])
        payload["most_ips_ja4"] = str(summary[3])
        return payload
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,141 @@
"""
Endpoints pour l'analyse des attaques par force brute sur les formulaires
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
# Router for the brute-force analysis endpoints; every route is served under /api/bruteforce.
router = APIRouter(prefix="/api/bruteforce", tags=["bruteforce"])
@router.get("/targets")
async def get_bruteforce_targets():
    """Hosts targeted by brute-force, ordered by total hits (descending).

    Each item carries the distinct IP count, total hits, total query
    parameter count, up to three JA4 values collected for the host and a
    derived ``attack_type``: ``"credential_stuffing"`` when the average
    number of parameters per hit exceeds 0.5, ``"enumeration"`` otherwise.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
            host,
            uniq(src_ip) AS unique_ips,
            sum(hits) AS total_hits,
            sum(query_params_count) AS total_params,
            groupArray(3)(ja4) AS top_ja4s
            FROM mabase_prod.view_form_bruteforce_detected
            GROUP BY host
            ORDER BY total_hits DESC
        """
        fetched = db.query(sql)

        items = []
        for record in fetched.result_rows:
            hit_total = int(record[2])
            param_total = int(record[3])
            # Guard on hit_total first so the ratio never divides by zero.
            if hit_total > 0 and param_total / hit_total > 0.5:
                kind = "credential_stuffing"
            else:
                kind = "enumeration"
            items.append({
                "host": str(record[0]),
                "unique_ips": int(record[1]),
                "total_hits": hit_total,
                "total_params": param_total,
                "attack_type": kind,
                "top_ja4s": [str(fp) for fp in (record[4] or [])],
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/attackers")
async def get_bruteforce_attackers(limit: int = Query(50, ge=1, le=500)):
    """Top attacking IPs, ordered by total hits (descending).

    Returns at most ``limit`` items, each with the IP (IPv4-mapped prefix
    removed), the number of distinct hosts targeted, total hits, total query
    parameter count and the JA4 of the row with the most hits.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
            uniq(host) AS distinct_hosts,
            sum(hits) AS total_hits,
            sum(query_params_count) AS total_params,
            argMax(ja4, hits) AS ja4
            FROM mabase_prod.view_form_bruteforce_detected
            GROUP BY src_ip
            ORDER BY total_hits DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"limit": limit})

        items = [
            {
                "ip": str(row[0]),
                "distinct_hosts": int(row[1]),
                "total_hits": int(row[2]),
                "total_params": int(row[3]),
                # A NULL ja4 becomes "" (as in get_host_attackers) instead of
                # the literal text "None".
                "ja4": str(row[4] or ""),
            }
            for row in result.result_rows
        ]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/timeline")
async def get_bruteforce_timeline():
    """Hits per hour over the last 72 hours, from ``agg_host_ip_ja4_1h``.

    Rows are grouped with ``toHour(window_start)``, i.e. by hour of the day,
    so windows from different days falling on the same hour share one bucket.

    Returns:
        ``{"hours": [...]}`` with, per bucket, the hour, the total hits and
        the distinct IP count, ordered by hour ascending.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
            toHour(window_start) AS hour,
            sum(hits) AS hits,
            uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS ips
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
            GROUP BY hour
            ORDER BY hour ASC
        """
        buckets = [
            {
                "hour": int(record[0]),
                "hits": int(record[1]),
                "ips": int(record[2]),
            }
            for record in db.query(sql).result_rows
        ]
        return {"hours": buckets}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/host/{host:path}/attackers")
async def get_host_attackers(host: str, limit: int = Query(20, ge=1, le=200)):
    """Top IPs attacking one host, with their JA4 and attack type.

    Returns at most ``limit`` items ordered by total hits (descending).
    ``attack_type`` is ``"credential_stuffing"`` when the average number of
    query parameters per hit exceeds 0.5, ``"enumeration"`` otherwise.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # max_hits_per_window is selected but not included in the response.
        sql = """
            SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
            sum(hits) AS total_hits,
            sum(query_params_count) AS total_params,
            argMax(ja4, hits) AS ja4,
            max(hits) AS max_hits_per_window
            FROM mabase_prod.view_form_bruteforce_detected
            WHERE host = %(host)s
            GROUP BY src_ip
            ORDER BY total_hits DESC
            LIMIT %(limit)s
        """
        fetched = db.query(sql, {"host": host, "limit": limit})

        items = []
        for record in fetched.result_rows:
            hit_total = int(record[1])
            param_total = int(record[2])
            # Guard on hit_total first so the ratio never divides by zero.
            if hit_total > 0 and param_total / hit_total > 0.5:
                kind = "credential_stuffing"
            else:
                kind = "enumeration"
            items.append({
                "ip": str(record[0]),
                "total_hits": hit_total,
                "total_params": param_total,
                "ja4": str(record[3] or ""),
                "attack_type": kind,
            })
        return {"host": host, "items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,551 @@
"""
Clustering d'IPs multi-métriques — WebGL / deck.gl backend.
- Calcul sur la TOTALITÉ des IPs (GROUP BY src_ip, ja4 sans LIMIT)
- K-means++ vectorisé (numpy) + PCA-2D + enveloppes convexes (scipy)
- Calcul en background thread + cache 30 min
- Endpoints : /clusters, /status, /cluster/{id}/points
"""
import math
import time
import logging
import threading
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import Any
import numpy as np
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..services.clustering_engine import (
FEATURE_NAMES,
build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
name_cluster, risk_score_from_centroid, standardize,
risk_to_gradient_color,
)
# Module-level logger used by the clustering worker.
log = logging.getLogger(__name__)
# Router for the clustering endpoints; every route is served under /api/clustering.
router = APIRouter(prefix="/api/clustering", tags=["clustering"])
# ─── Global cache ──────────────────────────────────────────────────────────────
# Module-level state shared by the request handlers and the clustering worker.
# The code in this module reads and writes it while holding _LOCK.
_CACHE: dict[str, Any] = {
    "status": "idle",    # idle | computing | ready | error
    "error": None,       # message of the last failed job, None otherwise
    "result": None,      # complete result dict
    "ts": 0.0,           # timestamp of the last update
    "params": {},        # parameters of the last completed job (k, hours, sensitivity)
    "cluster_ips": {},   # cluster_idx → [(ip, ja4, pca_x, pca_y, risk)]
}
_CACHE_TTL = 1800  # 30 minutes
_LOCK = threading.Lock()
# A single worker thread: submitted clustering jobs run one at a time.
_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")
# ─── Color palette (replaces the former threat-based logic) ──────────────────
# Colors are now assigned per cluster index to maximise visual distinction,
# independently of the risk level.
# NOTE(review): the worker below derives node colors from the risk score via
# risk_to_gradient_color — confirm whether this comment is still accurate.
# ─── SQL: ALL IPs, no LIMIT ──────────────────────────────────────────────────
# One result row per (src_ip, ja4) from agg_host_ip_ja4_1h over the last
# %(hours)s hours, enriched through LEFT JOINs with ml_detected_anomalies,
# view_dashboard_entities and agg_header_fingerprint_1h.
# The SELECT column order must stay in sync with _SQL_COLS: result rows are
# mapped to those names by position.
# The LIKE patterns use '%%' — presumably to escape '%' for the %(name)s
# parameter substitution; confirm against the database client.
# NOTE(review): the join on ml_detected_anomalies is not pre-aggregated, so a
# (src_ip, ja4) pair with several anomaly rows looks like it would be counted
# several times in sum(t.hits) — confirm.
_SQL_ALL_IPS = """
SELECT
replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
t.ja4,
any(t.tcp_ttl_raw) AS ttl,
any(t.tcp_win_raw) AS win,
any(t.tcp_scale_raw) AS scale,
any(t.tcp_mss_raw) AS mss,
any(t.first_ua) AS ua,
sum(t.hits) AS hits,
avg(abs(ml.anomaly_score)) AS avg_score,
avg(ml.hit_velocity) AS avg_velocity,
avg(ml.fuzzing_index) AS avg_fuzzing,
avg(ml.is_headless) AS pct_headless,
avg(ml.post_ratio) AS avg_post,
avg(ml.ip_id_zero_ratio) AS ip_id_zero,
avg(ml.temporal_entropy) AS entropy,
avg(ml.modern_browser_score) AS browser_score,
avg(ml.alpn_http_mismatch) AS alpn_mismatch,
avg(ml.is_alpn_missing) AS alpn_missing,
avg(ml.multiplexing_efficiency) AS h2_eff,
avg(ml.header_order_confidence) AS hdr_conf,
avg(ml.ua_ch_mismatch) AS ua_ch_mismatch,
avg(ml.asset_ratio) AS asset_ratio,
avg(ml.direct_access_ratio) AS direct_ratio,
avg(ml.distinct_ja4_count) AS ja4_count,
max(ml.is_ua_rotating) AS ua_rotating,
max(ml.threat_level) AS threat,
any(ml.country_code) AS country,
any(ml.asn_org) AS asn_org,
-- Features headers HTTP (depuis view_dashboard_entities)
avg(ml.has_accept_language) AS hdr_accept_lang,
any(vh.hdr_enc) AS hdr_has_encoding,
any(vh.hdr_sec_fetch) AS hdr_has_sec_fetch,
any(vh.hdr_count) AS hdr_count_raw,
-- Fingerprint HTTP Headers (depuis agg_header_fingerprint_1h + ml_detected_anomalies)
-- header_order_shared_count : nb d'IPs partageant le même fingerprint
-- → faible = fingerprint rare = comportement suspect
avg(ml.header_order_shared_count) AS hfp_shared_count,
-- distinct_header_orders : nb de fingerprints distincts émis par cette IP
-- → élevé = rotation de fingerprint = comportement bot
avg(ml.distinct_header_orders) AS hfp_distinct_orders,
-- Cookie et Referer issus de la table dédiée aux empreintes
any(hfp.hfp_cookie) AS hfp_cookie,
any(hfp.hfp_referer) AS hfp_referer
FROM mabase_prod.agg_host_ip_ja4_1h t
LEFT JOIN mabase_prod.ml_detected_anomalies ml
ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
LEFT JOIN (
SELECT
toIPv6(concat('::ffff:', toString(src_ip))) AS src_ip_v6,
ja4,
any(arrayExists(x -> x LIKE '%%Accept-Encoding%%', client_headers)) AS hdr_enc,
any(arrayExists(x -> x LIKE '%%Sec-Fetch%%', client_headers)) AS hdr_sec_fetch,
any(length(splitByChar(',', client_headers[1]))) AS hdr_count
FROM mabase_prod.view_dashboard_entities
WHERE length(client_headers) > 0
AND log_date >= today() - 2
GROUP BY src_ip_v6, ja4
) vh ON t.src_ip = vh.src_ip_v6 AND t.ja4 = vh.ja4
LEFT JOIN (
SELECT
src_ip,
avg(has_cookie) AS hfp_cookie,
avg(has_referer) AS hfp_referer
FROM mabase_prod.agg_header_fingerprint_1h
WHERE window_start >= now() - INTERVAL %(hours)s HOUR
GROUP BY src_ip
) hfp ON t.src_ip = hfp.src_ip
WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
AND t.tcp_ttl_raw > 0
GROUP BY t.src_ip, t.ja4
"""
# Names given to the columns of each _SQL_ALL_IPS result row. Rows are turned
# into dicts by position, so this list must follow the SELECT order of that
# query exactly.
_SQL_COLS = [
    "ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
    "avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
    "ip_id_zero", "entropy", "browser_score", "alpn_mismatch", "alpn_missing",
    "h2_eff", "hdr_conf", "ua_ch_mismatch", "asset_ratio", "direct_ratio",
    "ja4_count", "ua_rotating", "threat", "country", "asn_org",
    "hdr_accept_lang", "hdr_has_encoding", "hdr_has_sec_fetch", "hdr_count_raw",
    "hfp_shared_count", "hfp_distinct_orders", "hfp_cookie", "hfp_referer",
]
# ─── Clustering worker (thread pool) ─────────────────────────────────────────
def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
    """Run the clustering pipeline and publish the outcome in ``_CACHE``.

    Executed in the thread pool. Sets the cache status to ``"computing"`` on
    entry, then to ``"ready"`` (with the result, the per-cluster point lists,
    the timestamp and the parameters) on success, or to ``"error"`` (with the
    exception message) on failure. Exceptions never propagate to the caller.

    Args:
        k: Base number of clusters.
        hours: Look-back window in hours, bound into ``_SQL_ALL_IPS``.
        sensitivity: Multiplier applied to ``k``:
            0.5 = very aggregated view (k/2 clusters),
            1.0 = default behaviour,
            2.0 = twice as many clusters, hence more homogeneous groups,
            5.0 = maximum granularity (finest classification).

    The effective cluster count ``round(k * sensitivity)`` is clamped to
    [4, 300] to avoid excessive computation time, and ``n_init`` drops from 3
    to 1 when it exceeds 60 to stay fast.
    """
    k_actual = max(4, min(300, round(k * sensitivity)))
    t0 = time.time()

    def mean_of(members: list[dict], key: str) -> float:
        """Mean of one column over a cluster's rows (missing/NULL counts as 0)."""
        return float(np.mean([float(r.get(key) or 0) for r in members]))

    def topk(lst: list[str], n: int = 5) -> list[str]:
        """The ``n`` most frequent non-empty values, most common first."""
        return [v for v, _ in Counter(lst).most_common(n) if v]

    with _LOCK:
        _CACHE["status"] = "computing"
        _CACHE["error"] = None
    try:
        log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}")

        # ── 1. Load every (ip, ja4) row ──────────────────────────────────
        result = db.query(_SQL_ALL_IPS, {"hours": hours})
        rows: list[dict] = []
        for row in result.result_rows:
            rows.append({col: row[i] for i, col in enumerate(_SQL_COLS)})
        n = len(rows)
        log.info(f"[clustering] {n} IPs chargées")
        if n < k_actual:
            raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)")

        # ── 2. Feature matrix (numpy) ────────────────────────────────────
        X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
        log.info(f"[clustering] Matrice X: {X.shape} — {X.nbytes/1024/1024:.1f} MB")

        # ── 3. Z-score standardisation ───────────────────────────────────
        X64 = X.astype(np.float64)
        X_std, feat_mean, feat_std = standardize(X64)

        # ── 4. K-means++ on the standardised space ───────────────────────
        # n_init is reduced to 1 for large k (> 60) to limit computation time.
        n_init = 1 if k_actual > 60 else 3
        km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=n_init, seed=42)
        log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")

        # Centroids mapped back to the original feature space for the radar
        # display: c_orig = c_std * std + mean, then clipped to [0, 1].
        centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)

        # ── 5. 2-D PCA on the ORIGINAL (non-standardised) features ───────
        coords = pca_2d(X64)

        # ── 5b. Convex hull per cluster ──────────────────────────────────
        hulls = compute_hulls(coords, km.labels, k_actual)

        # ── 6. Group rows by cluster ─────────────────────────────────────
        cluster_rows: list[list[dict]] = [[] for _ in range(k_actual)]
        cluster_coords: list[list[list[float]]] = [[] for _ in range(k_actual)]
        cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)}
        # The risk score depends only on the cluster centroid, so it is
        # computed once per non-empty cluster instead of once per point.
        cluster_risk: dict[int, float] = {}
        for i, label in enumerate(km.labels):
            j = int(label)
            if j not in cluster_risk:
                cluster_risk[j] = float(risk_score_from_centroid(centroids_orig[j]))
            cluster_rows[j].append(rows[i])
            cluster_coords[j].append(coords[i].tolist())
            cluster_ips_map[j].append((
                rows[i]["ip"],
                rows[i]["ja4"],
                float(coords[i][0]),
                float(coords[i][1]),
                cluster_risk[j],
            ))

        # ── 7. Build one node per non-empty cluster ──────────────────────
        nodes = []
        for j in range(k_actual):
            members = cluster_rows[j]
            if not members:
                continue

            mean_ttl = mean_of(members, "ttl")
            mean_mss = mean_of(members, "mss")
            mean_scale = mean_of(members, "scale")
            mean_win = mean_of(members, "win")
            raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
            label_name = name_cluster(centroids_orig[j], raw_stats)
            risk = cluster_risk[j]
            color = risk_to_gradient_color(risk)

            # 2-D centroid = mean of the cluster's PCA coordinates.
            cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5]

            # Several rows can share an IP (one row per ja4): count distinct IPs.
            ip_count = len({r["ip"] for r in members})
            hit_count = int(sum(float(r.get("hits") or 0) for r in members))
            threats = [str(r.get("threat") or "") for r in members if r.get("threat")]
            countries = [str(r.get("country") or "") for r in members if r.get("country")]
            orgs = [str(r.get("asn_org") or "") for r in members if r.get("asn_org")]

            radar = [
                {"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
                for i, name in enumerate(FEATURE_NAMES)
            ]
            # Display radius grows logarithmically with the IP count, within [8, 30].
            radius = max(8, min(30, int(math.log1p(ip_count) * 2.2)))

            # Sample: the 8 rows with the most hits.
            sample_rows = sorted(members, key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
            sample_ips = [r["ip"] for r in sample_rows]
            sample_ua = str(members[0].get("ua") or "")

            nodes.append({
                "id": f"c{j}_k{k_actual}",
                "cluster_idx": j,
                "label": label_name,
                "pca_x": round(cxy[0], 6),
                "pca_y": round(cxy[1], 6),
                "radius": radius,
                "color": color,
                "risk_score": round(risk, 4),
                "mean_ttl": round(mean_ttl, 1),
                "mean_mss": round(mean_mss, 0),
                "mean_scale": round(mean_scale, 1),
                "mean_win": round(mean_win, 0),
                "mean_velocity": round(mean_of(members, "avg_velocity"), 3),
                "mean_fuzzing": round(mean_of(members, "avg_fuzzing"), 3),
                "mean_headless": round(mean_of(members, "pct_headless"), 3),
                "mean_post": round(mean_of(members, "avg_post"), 3),
                "mean_asset": round(mean_of(members, "asset_ratio"), 3),
                "mean_direct": round(mean_of(members, "direct_ratio"), 3),
                "mean_alpn_mismatch": round(mean_of(members, "alpn_mismatch"), 3),
                "mean_h2_eff": round(mean_of(members, "h2_eff"), 3),
                "mean_hdr_conf": round(mean_of(members, "hdr_conf"), 3),
                "mean_ua_ch": round(mean_of(members, "ua_ch_mismatch"), 3),
                "mean_entropy": round(mean_of(members, "entropy"), 3),
                "mean_ja4_diversity": round(mean_of(members, "ja4_count"), 3),
                "mean_ip_id_zero": round(mean_of(members, "ip_id_zero"), 3),
                "mean_browser_score": round(mean_of(members, "browser_score"), 1),
                "mean_ua_rotating": round(mean_of(members, "ua_rotating"), 3),
                "ip_count": ip_count,
                "hit_count": hit_count,
                "top_threat": topk(threats, 1)[0] if threats else "",
                "top_countries": topk(countries, 5),
                "top_orgs": topk(orgs, 5),
                "sample_ips": sample_ips,
                "sample_ua": sample_ua,
                "radar": radar,
                # Hull for the deck.gl PolygonLayer.
                "hull": hulls.get(j, []),
            })

        # ── 8. k-NN edges between clusters ───────────────────────────────
        # Each node is linked to its 2 nearest nodes (squared Euclidean
        # distance between centroids); undirected duplicates are dropped.
        edges = []
        seen: set[frozenset] = set()
        for i, ni in enumerate(nodes):
            ci = ni["cluster_idx"]
            dists = sorted(
                [(j, nj["cluster_idx"],
                  float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
                 for j, nj in enumerate(nodes) if j != i],
                key=lambda x: x[2]
            )
            for j_idx, _cj, d2 in dists[:2]:
                key = frozenset([ni["id"], nodes[j_idx]["id"]])
                if key in seen:
                    continue
                seen.add(key)
                edges.append({
                    "id": f"e_{ni['id']}_{nodes[j_idx]['id']}",
                    "source": ni["id"],
                    "target": nodes[j_idx]["id"],
                    "similarity": round(1.0 / (1.0 + math.sqrt(d2)), 3),
                })

        # ── 9. Store the result and the per-cluster point cache ──────────
        total_ips = sum(n_["ip_count"] for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
        elapsed = round(time.time() - t0, 2)
        result_dict = {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_clusters": len(nodes),
                "total_ips": total_ips,
                "total_hits": total_hits,
                "n_samples": n,
                "k": k_actual,
                "k_base": k,
                "sensitivity": sensitivity,
                "elapsed_s": elapsed,
            },
            "feature_names": FEATURE_NAMES,
        }
        with _LOCK:
            _CACHE["result"] = result_dict
            _CACHE["cluster_ips"] = cluster_ips_map
            _CACHE["status"] = "ready"
            _CACHE["ts"] = time.time()
            _CACHE["params"] = {"k": k, "hours": hours, "sensitivity": sensitivity}
            _CACHE["error"] = None
        log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
    except Exception as e:
        # Top-level boundary of the worker: record the failure for the
        # endpoints instead of letting it vanish inside the thread pool.
        log.exception("[clustering] Erreur lors du calcul")
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"] = str(e)
def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None:
    """Start a clustering job if the cache is missing, expired or was built
    with different parameters.

    Nothing is submitted when a job is already computing, when a previously
    submitted job has not finished yet, or when the cache is ready, younger
    than ``_CACHE_TTL`` and matches ``k`` / ``hours`` / ``sensitivity``.

    The decision and the submission happen under ``_LOCK``. The cache status
    only becomes ``"computing"`` once the worker actually starts, so the
    pending future is tracked here to keep concurrent requests from enqueuing
    the same work twice in that window.
    """
    with _LOCK:
        status = _CACHE["status"]
        params = _CACHE["params"]
        cache_stale = (time.time() - _CACHE["ts"]) > _CACHE_TTL
        params_changed = (
            params.get("k") != k or
            params.get("hours") != hours or
            params.get("sensitivity") != sensitivity
        )
        if status == "computing":
            return  # already running
        if status == "ready" and not cache_stale and not params_changed:
            return  # fresh cache

        # A job may be queued or running even though the status does not say
        # "computing" yet (not started, or status reset by a forced refresh).
        pending = _CACHE.get("pending_job")
        if pending is not None and not pending.done():
            return

        # submit() does not wait for the job, so holding the lock is safe:
        # the worker acquires it only after this block releases it.
        _CACHE["pending_job"] = _EXECUTOR.submit(
            _run_clustering_job, k, hours, sensitivity
        )
# ─── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/status")
async def get_status():
    """State of the current computation (polled by the frontend).

    Returns the cache ``status``, ``error``, ``ts`` and ``params``, plus
    ``age_s``: the rounded number of seconds since ``ts``, or ``None`` when
    no timestamp has been recorded yet.
    """
    with _LOCK:
        last_update = _CACHE["ts"]
        if last_update:
            age = round(time.time() - last_update, 0)
        else:
            age = None
        snapshot = {
            "status": _CACHE["status"],
            "error": _CACHE["error"],
            "ts": last_update,
            "params": _CACHE["params"],
            "age_s": age,
        }
        return snapshot
@router.get("/clusters")
async def get_clusters(
    k: int = Query(20, ge=4, le=100, description="Nombre de clusters de base"),
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"),
    sensitivity: float = Query(1.0, ge=0.5, le=5.0, description="Sensibilité : multiplicateur de k (5.0 = granularité maximale)"),
    force: bool = Query(False, description="Forcer le recalcul"),
):
    """
    Multi-metric clustering over ALL IPs.

    k_actual = round(k × sensitivity) — sensitivity controls the granularity.
    Answers immediately from the cache and triggers a computation when needed.
    With ``force`` the cached result is discarded first.

    Returns:
        The cached result with ``status: "ready"``, or a short status/message
        payload while no result is available.

    Raises:
        HTTPException: 500 when the cache status is ``"error"``.
    """
    if force:
        # Reset the cache so the trigger below sees it as missing.
        with _LOCK:
            _CACHE.update(status="idle", ts=0.0, result=None, cluster_ips={})

    _maybe_trigger(k, hours, sensitivity)

    with _LOCK:
        snapshot = (_CACHE["status"], _CACHE["result"], _CACHE["error"])
    status, result, error = snapshot

    if status == "computing":
        return {"status": "computing", "message": "Calcul en cours, réessayez dans quelques secondes"}
    if status == "error":
        raise HTTPException(status_code=500, detail=error or "Erreur inconnue")
    if result is None:
        return {"status": "idle", "message": "Calcul démarré, réessayez dans quelques secondes"}

    payload = dict(result)
    payload["status"] = "ready"
    return payload
@router.get("/cluster/{cluster_id}/points")
async def get_cluster_points(
    cluster_id: str,
    limit: int = Query(5000, ge=1, le=20000),
    offset: int = Query(0, ge=0),
):
    """
    PCA coordinates and metadata for the IPs of one cluster (paginated).

    Used by the deck.gl ScatterplotLayer (drill-down or deep zoom).

    Args:
        cluster_id: Identifier whose first ``_``-separated token is one
            prefix character followed by the integer cluster index.
        limit: Maximum number of points returned.
        offset: Index of the first member returned.

    Raises:
        HTTPException: 404 when the cache is not ready or holds no members,
            400 when the cluster index cannot be parsed from ``cluster_id``.
    """
    with _LOCK:
        cache_status = _CACHE["status"]
        members_by_cluster = _CACHE["cluster_ips"]

    cache_usable = cache_status == "ready" and bool(members_by_cluster)
    if not cache_usable:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    try:
        head_token = cluster_id.split("_")[0]
        cluster_index = int(head_token[1:])  # drop the one-character prefix
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="cluster_id invalide (format: c{n}_k{k})")

    cluster_members = members_by_cluster.get(cluster_index, [])
    window = cluster_members[offset: offset + limit]

    points = []
    for member in window:
        points.append({
            "ip": member[0],
            "ja4": member[1],
            "pca_x": round(member[2], 6),
            "pca_y": round(member[3], 6),
            "risk": round(member[4], 3),
        })

    return {
        "points": points,
        "total": len(cluster_members),
        "offset": offset,
        "limit": limit,
    }
@router.get("/cluster/{cluster_id}/ips")
async def get_cluster_ips(
    cluster_id: str,
    limit: int = Query(100, ge=1, le=500),
    offset: int = Query(0, ge=0),
):
    """
    IPs of one cluster enriched with SQL details (backward compatible with the old UI).

    Args:
        cluster_id: Identifier whose first ``_``-separated token is one
            prefix character followed by the integer cluster index.
        limit: Page size (number of cluster members looked up).
        offset: Index of the first cluster member of the page.

    Returns:
        ``{"ips": [...], "total": <members in cluster>, "cluster_id": ...}``.
        ``ips`` holds one entry per (IP, JA4) pair returned by the query.

    Raises:
        HTTPException: 404 when the cache is not ready or empty, 400 when
            ``cluster_id`` cannot be parsed, 500 when the database query fails.
    """
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]

    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="cluster_id invalide")

    members = ips_map.get(idx, [])
    total = len(members)
    page = members[offset: offset + limit]
    if not page:
        return {"ips": [], "total": total, "cluster_id": cluster_id}

    # The whole page is looked up: `limit` is already capped at 500 by the
    # Query validator, so no extra silent truncation is applied here.
    # The IP list is passed as a bound parameter instead of being spliced
    # into the SQL text, so quoting is handled by the driver.
    page_ips = tuple(str(member[0]) for member in page)

    sql = """
        SELECT
            replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS src_ip,
            t.ja4,
            any(t.tcp_ttl_raw) AS ttl,
            any(t.tcp_win_raw) AS win,
            any(t.tcp_scale_raw) AS scale,
            any(t.tcp_mss_raw) AS mss,
            sum(t.hits) AS hits,
            any(t.first_ua) AS ua,
            round(avg(abs(ml.anomaly_score)), 3) AS avg_score,
            max(ml.threat_level) AS threat_level,
            any(ml.country_code) AS country_code,
            any(ml.asn_org) AS asn_org,
            round(avg(ml.fuzzing_index), 2) AS fuzzing,
            round(avg(ml.hit_velocity), 2) AS velocity
        FROM mabase_prod.agg_host_ip_ja4_1h t
        LEFT JOIN mabase_prod.ml_detected_anomalies ml
            ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
            AND ml.detected_at >= now() - INTERVAL 24 HOUR
        WHERE t.window_start >= now() - INTERVAL 24 HOUR
            AND replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') IN %(ips)s
        GROUP BY t.src_ip, t.ja4
        ORDER BY hits DESC
    """
    try:
        result = db.query(sql, {"ips": page_ips})
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # NULLs (e.g. from the LEFT JOIN) are normalised to empty strings / zeros.
    ips = [
        {
            "ip": str(row[0] or ""),
            "ja4": str(row[1] or ""),
            "tcp_ttl": int(row[2] or 0),
            "tcp_win": int(row[3] or 0),
            "tcp_scale": int(row[4] or 0),
            "tcp_mss": int(row[5] or 0),
            "hits": int(row[6] or 0),
            "ua": str(row[7] or ""),
            "avg_score": float(row[8] or 0),
            "threat_level": str(row[9] or ""),
            "country_code": str(row[10] or ""),
            "asn_org": str(row[11] or ""),
            "fuzzing": float(row[12] or 0),
            "velocity": float(row[13] or 0),
        }
        for row in result.result_rows
    ]
    return {"ips": ips, "total": total, "cluster_id": cluster_id}

View File

@ -0,0 +1,450 @@
"""
Endpoints pour la liste des détections
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
from ..database import db
from ..models import DetectionsListResponse, Detection
router = APIRouter(prefix="/api/detections", tags=["detections"])
# Mapping label ASN → score float (0 = très suspect, 1 = légitime)
_ASN_LABEL_SCORES: dict[str, float] = {
'human': 0.9, 'bot': 0.05, 'proxy': 0.25, 'vpn': 0.3,
'tor': 0.1, 'datacenter': 0.4, 'scanner': 0.05, 'malicious': 0.05,
}
def _label_to_score(label: str) -> float | None:
"""Convertit un label de réputation ASN en score numérique."""
if not label:
return None
return _ASN_LABEL_SCORES.get(label.lower(), 0.5)
def _build_detection_where(
    threat_level: Optional[str],
    model_name: Optional[str],
    country_code: Optional[str],
    asn_number: Optional[str],
    search: Optional[str],
    score_type: Optional[str],
) -> tuple:
    """Build the WHERE clause and bound parameters shared by all detection queries.

    Only fixed SQL fragments end up in the clause text; every user-supplied
    value is passed through the returned parameter dict.

    Returns:
        A ``(where_clause, params)`` tuple.
    """
    clauses = ["detected_at >= now() - INTERVAL 24 HOUR"]
    params = {}

    if threat_level:
        clauses.append("threat_level = %(threat_level)s")
        params["threat_level"] = threat_level
    if model_name:
        clauses.append("model_name = %(model_name)s")
        params["model_name"] = model_name
    if country_code:
        clauses.append("country_code = %(country_code)s")
        params["country_code"] = country_code.upper()
    if asn_number:
        clauses.append("asn_number = %(asn_number)s")
        params["asn_number"] = asn_number
    if search:
        clauses.append(
            "(ilike(toString(src_ip), %(search)s) OR ilike(ja4, %(search)s) OR ilike(host, %(search)s))"
        )
        params["search"] = f"%{search}%"
    if score_type:
        score_filters = {
            "BOT": "threat_level = 'KNOWN_BOT'",
            "REGLE": "threat_level = 'ANUBIS_DENY'",
            "BOT_REGLE": "threat_level IN ('KNOWN_BOT', 'ANUBIS_DENY')",
            "SCORE": "threat_level NOT IN ('KNOWN_BOT', 'ANUBIS_DENY')",
        }
        # Unknown score types are ignored (no filter added).
        score_clause = score_filters.get(score_type.upper())
        if score_clause:
            clauses.append(score_clause)

    return " AND ".join(clauses), params


def _detection_from_grouped_row(row) -> Detection:
    """Map one row of the grouped-by-IP query to a ``Detection``.

    Column order: src_ip, first_seen, last_seen, detection_count, unique_ja4s,
    unique_hosts, anomaly_score, threat_level_best, model_name_best,
    country_code, asn_number, asn_org, hit_velocity, hits, asn_label,
    asn_rep_label, anubis_bot_name, anubis_bot_action, anubis_bot_category.
    """
    ja4s = list(row[4]) if row[4] else []
    hosts = list(row[5]) if row[5] else []
    return Detection(
        detected_at=row[1],
        src_ip=str(row[0]),
        ja4=ja4s[0] if ja4s else "",
        host=hosts[0] if hosts else "",
        bot_name="",
        anomaly_score=float(row[6]) if row[6] else 0.0,
        threat_level=row[7] or "LOW",
        model_name=row[8] or "",
        recurrence=int(row[3] or 0),
        asn_number=str(row[10]) if row[10] else "",
        asn_org=row[11] or "",
        asn_detail="",
        asn_domain="",
        country_code=row[9] or "",
        asn_label=row[14] or "",
        hits=int(row[13] or 0),
        hit_velocity=float(row[12]) if row[12] else 0.0,
        fuzzing_index=0.0,
        post_ratio=0.0,
        reason="",
        asn_rep_label=row[15] or "",
        asn_score=_label_to_score(row[15] or ""),
        first_seen=row[1],
        last_seen=row[2],
        unique_ja4s=ja4s,
        unique_hosts=hosts,
        anubis_bot_name=row[16] or "",
        anubis_bot_action=row[17] or "",
        anubis_bot_category=row[18] or "",
    )


def _detection_from_row(row) -> Detection:
    """Map one row of the per-detection query to a ``Detection``."""
    return Detection(
        detected_at=row[0],
        src_ip=str(row[1]),
        ja4=row[2] or "",
        host=row[3] or "",
        bot_name=row[4] or "",
        anomaly_score=float(row[5]) if row[5] else 0.0,
        threat_level=row[6] or "LOW",
        model_name=row[7] or "",
        recurrence=row[8] or 0,
        asn_number=str(row[9]) if row[9] else "",
        asn_org=row[10] or "",
        asn_detail=row[11] or "",
        asn_domain=row[12] or "",
        country_code=row[13] or "",
        asn_label=row[14] or "",
        hits=row[15] or 0,
        hit_velocity=float(row[16]) if row[16] else 0.0,
        fuzzing_index=float(row[17]) if row[17] else 0.0,
        post_ratio=float(row[18]) if row[18] else 0.0,
        reason=row[19] or "",
        asn_rep_label=row[20] or "",
        asn_score=_label_to_score(row[20] or ""),
        anubis_bot_name=row[21] or "",
        anubis_bot_action=row[22] or "",
        anubis_bot_category=row[23] or "",
    )


@router.get("", response_model=DetectionsListResponse, summary="Liste paginée des détections")
async def get_detections(
    page: int = Query(1, ge=1, description="Numéro de page"),
    page_size: int = Query(25, ge=1, le=100, description="Nombre de lignes par page"),
    threat_level: Optional[str] = Query(None, description="Filtrer par niveau de menace"),
    model_name: Optional[str] = Query(None, description="Filtrer par modèle"),
    country_code: Optional[str] = Query(None, description="Filtrer par pays"),
    asn_number: Optional[str] = Query(None, description="Filtrer par ASN"),
    search: Optional[str] = Query(None, description="Recherche texte (IP, JA4, Host)"),
    sort_by: str = Query("detected_at", description="Trier par"),
    sort_order: str = Query("DESC", description="Ordre (ASC/DESC)"),
    group_by_ip: bool = Query(False, description="Grouper par IP (first_seen/last_seen agrégés)"),
    score_type: Optional[str] = Query(None, description="Filtrer par type de score: BOT, REGLE, BOT_REGLE, SCORE")
):
    """
    Return the detections of the last 24 hours, paginated and filtered.

    With ``group_by_ip`` the result holds one aggregated entry per source IP
    (first_seen / last_seen computed by the database) and ``total`` counts
    distinct IPs; otherwise each detection row is returned individually.

    ``sort_by`` is checked against a fixed list of column names per mode and
    falls back to a default, and ``sort_order`` is normalised to ASC/DESC, so
    only known identifiers are interpolated into the SQL text.

    Raises:
        HTTPException: 500 on any failure while querying or mapping rows.
    """
    try:
        where_clause, params = _build_detection_where(
            threat_level, model_name, country_code, asn_number, search, score_type
        )
        offset = (page - 1) * page_size
        sort_order = "DESC" if sort_order.upper() == "DESC" else "ASC"

        # ── Grouped-by-IP mode (first_seen / last_seen from the DB) ─────────
        if group_by_ip:
            valid_sort_grouped = ["anomaly_score", "hits", "hit_velocity", "first_seen", "last_seen", "src_ip", "detected_at"]
            grouped_sort = sort_by if sort_by in valid_sort_grouped else "last_seen"
            # detected_at → last_seen (max(detected_at) in the GROUP BY)
            if grouped_sort == "detected_at":
                grouped_sort = "last_seen"
            # The outer query exposes min_score under the alias anomaly_score;
            # sort on the inner column name.
            outer_sort = "min_score" if grouped_sort == "anomaly_score" else grouped_sort

            # Only the distinct-IP count is needed in this mode; the plain
            # row count is not queried.
            count_ip_query = f"""
                SELECT uniq(src_ip)
                FROM ml_detected_anomalies
                WHERE {where_clause}
            """
            cr = db.query(count_ip_query, params)
            total = cr.result_rows[0][0] if cr.result_rows else 0

            grouped_query = f"""
                SELECT
                    ip_data.src_ip,
                    ip_data.first_seen,
                    ip_data.last_seen,
                    ip_data.detection_count,
                    ip_data.unique_ja4s,
                    ip_data.unique_hosts,
                    ip_data.min_score AS anomaly_score,
                    ip_data.threat_level_best,
                    ip_data.model_name_best,
                    ip_data.country_code,
                    ip_data.asn_number,
                    ip_data.asn_org,
                    ip_data.hit_velocity,
                    ip_data.hits,
                    ip_data.asn_label,
                    ar.label AS asn_rep_label,
                    ip_data.anubis_bot_name_best,
                    ip_data.anubis_bot_action_best,
                    ip_data.anubis_bot_category_best
                FROM (
                    SELECT
                        src_ip,
                        min(detected_at) AS first_seen,
                        max(detected_at) AS last_seen,
                        count() AS detection_count,
                        groupUniqArray(5)(ja4) AS unique_ja4s,
                        groupUniqArray(5)(host) AS unique_hosts,
                        min(anomaly_score) AS min_score,
                        argMin(threat_level, anomaly_score) AS threat_level_best,
                        argMin(model_name, anomaly_score) AS model_name_best,
                        any(country_code) AS country_code,
                        any(asn_number) AS asn_number,
                        any(asn_org) AS asn_org,
                        max(hit_velocity) AS hit_velocity,
                        sum(hits) AS hits,
                        any(asn_label) AS asn_label,
                        argMin(anubis_bot_name, anomaly_score) AS anubis_bot_name_best,
                        argMin(anubis_bot_action, anomaly_score) AS anubis_bot_action_best,
                        argMin(anubis_bot_category, anomaly_score) AS anubis_bot_category_best
                    FROM ml_detected_anomalies
                    WHERE {where_clause}
                    GROUP BY src_ip
                ) ip_data
                LEFT JOIN mabase_prod.asn_reputation ar
                    ON ar.src_asn = toUInt32OrZero(ip_data.asn_number)
                ORDER BY {outer_sort} {sort_order}
                LIMIT %(limit)s OFFSET %(offset)s
            """
            params["limit"] = page_size
            params["offset"] = offset
            gresult = db.query(grouped_query, params)
            detections = [_detection_from_grouped_row(row) for row in gresult.result_rows]

            total_pages = (total + page_size - 1) // page_size
            return DetectionsListResponse(
                items=detections, total=total, page=page,
                page_size=page_size, total_pages=total_pages
            )

        # ── Individual mode (original behaviour) ────────────────────────────
        count_query = f"""
            SELECT count()
            FROM ml_detected_anomalies
            WHERE {where_clause}
        """
        count_result = db.query(count_query, params)
        total = count_result.result_rows[0][0] if count_result.result_rows else 0

        # Sort column validation
        valid_sort_columns = [
            "detected_at", "src_ip", "threat_level", "anomaly_score",
            "asn_number", "country_code", "hits", "hit_velocity"
        ]
        if sort_by not in valid_sort_columns:
            sort_by = "detected_at"

        main_query = f"""
            SELECT
                detected_at,
                src_ip,
                ja4,
                host,
                bot_name,
                anomaly_score,
                threat_level,
                model_name,
                recurrence,
                asn_number,
                asn_org,
                asn_detail,
                asn_domain,
                country_code,
                asn_label,
                hits,
                hit_velocity,
                fuzzing_index,
                post_ratio,
                reason,
                ar.label AS asn_rep_label,
                anubis_bot_name,
                anubis_bot_action,
                anubis_bot_category
            FROM ml_detected_anomalies
            LEFT JOIN mabase_prod.asn_reputation ar ON ar.src_asn = toUInt32OrZero(asn_number)
            WHERE {where_clause}
            ORDER BY {sort_by} {sort_order}
            LIMIT %(limit)s OFFSET %(offset)s
        """
        params["limit"] = page_size
        params["offset"] = offset
        result = db.query(main_query, params)
        detections = [_detection_from_row(row) for row in result.result_rows]

        total_pages = (total + page_size - 1) // page_size
        return DetectionsListResponse(
            items=detections,
            total=total,
            page=page,
            page_size=page_size,
            total_pages=total_pages
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la récupération des détections: {str(e)}")
@router.get("/{detection_id}")
async def get_detection_details(detection_id: str):
    """
    Return the details of the most recent detection for a source IP.

    ``detection_id`` is matched against the ``src_ip`` column; the latest row
    (by ``detected_at``) is returned as a nested dict.

    Raises:
        HTTPException: 404 when no row matches, 500 on any other failure.
    """
    try:
        query = """
            SELECT
                detected_at,
                src_ip,
                ja4,
                host,
                bot_name,
                anomaly_score,
                threat_level,
                model_name,
                recurrence,
                asn_number,
                asn_org,
                asn_detail,
                asn_domain,
                country_code,
                asn_label,
                hits,
                hit_velocity,
                fuzzing_index,
                post_ratio,
                port_exhaustion_ratio,
                orphan_ratio,
                tcp_jitter_variance,
                tcp_shared_count,
                true_window_size,
                window_mss_ratio,
                alpn_http_mismatch,
                is_alpn_missing,
                sni_host_mismatch,
                header_count,
                has_accept_language,
                has_cookie,
                has_referer,
                modern_browser_score,
                ua_ch_mismatch,
                header_order_shared_count,
                ip_id_zero_ratio,
                request_size_variance,
                multiplexing_efficiency,
                mss_mobile_mismatch,
                correlated,
                reason,
                asset_ratio,
                direct_access_ratio,
                is_ua_rotating,
                distinct_ja4_count,
                src_port_density,
                ja4_asn_concentration,
                ja4_country_concentration,
                is_rare_ja4
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
        """
        outcome = db.query(query, {"ip": detection_id})
        if not outcome.result_rows:
            raise HTTPException(status_code=404, detail="Détection non trouvée")

        record = outcome.result_rows[0]

        # Small accessors keyed by the column position in the SELECT above.
        def text(pos, fallback=""):
            return record[pos] or fallback

        def number(pos):
            return record[pos] or 0

        def real(pos):
            return float(record[pos]) if record[pos] else 0.0

        def flag(pos):
            return bool(record[pos]) if record[pos] is not None else False

        asn_info = {
            "number": str(record[9]) if record[9] else "",
            "org": text(10),
            "detail": text(11),
            "domain": text(12),
            "label": text(14),
        }
        metrics = {
            "hits": number(15),
            "hit_velocity": real(16),
            "fuzzing_index": real(17),
            "post_ratio": real(18),
            "port_exhaustion_ratio": real(19),
            "orphan_ratio": real(20),
        }
        tcp_info = {
            "jitter_variance": real(21),
            "shared_count": number(22),
            "true_window_size": number(23),
            "window_mss_ratio": real(24),
        }
        tls_info = {
            "alpn_http_mismatch": flag(25),
            "is_alpn_missing": flag(26),
            "sni_host_mismatch": flag(27),
        }
        header_info = {
            "count": number(28),
            "has_accept_language": flag(29),
            "has_cookie": flag(30),
            "has_referer": flag(31),
            "modern_browser_score": number(32),
            "ua_ch_mismatch": flag(33),
            "header_order_shared_count": number(34),
        }
        behavior = {
            "ip_id_zero_ratio": real(35),
            "request_size_variance": real(36),
            "multiplexing_efficiency": real(37),
            "mss_mobile_mismatch": flag(38),
            "correlated": flag(39),
        }
        advanced = {
            "asset_ratio": real(41),
            "direct_access_ratio": real(42),
            "is_ua_rotating": flag(43),
            "distinct_ja4_count": number(44),
            "src_port_density": real(45),
            "ja4_asn_concentration": real(46),
            "ja4_country_concentration": real(47),
            "is_rare_ja4": flag(48),
        }

        return {
            "detected_at": record[0],
            "src_ip": str(record[1]),
            "ja4": text(2),
            "host": text(3),
            "bot_name": text(4),
            "anomaly_score": real(5),
            "threat_level": text(6, "LOW"),
            "model_name": text(7),
            "recurrence": number(8),
            "asn": asn_info,
            "country": {
                "code": text(13),
            },
            "metrics": metrics,
            "tcp": tcp_info,
            "tls": tls_info,
            "headers": header_info,
            "behavior": behavior,
            "advanced": advanced,
            "reason": text(40),
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,509 @@
"""
Routes pour l'investigation d'entités (IP, JA4, User-Agent, Client-Header, Host, Path, Query-Param)
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
from ..database import db
from ..models import (
EntityInvestigation,
EntityStats,
EntityRelatedAttributes,
EntityAttributeValue
)
router = APIRouter(prefix="/api/entities", tags=["Entities"])
# Set of valid entity types accepted by the /{entity_type}/... routes
VALID_ENTITY_TYPES = frozenset({
'ip', 'ja4', 'user_agent', 'client_header', 'host', 'path', 'query_param'
})
def get_entity_stats(entity_type: str, entity_value: str, hours: int = 24) -> Optional[EntityStats]:
    """
    Fetch aggregated statistics for one entity.

    Sums requests and unique IPs and takes the min/max ``log_date`` over the
    rows of ``view_dashboard_entities`` matching the entity within the time
    window.

    Returns:
        An ``EntityStats`` instance, or ``None`` when the query returns no row.
    """
    query = """
        SELECT
            entity_type,
            entity_value,
            sum(requests) as total_requests,
            sum(unique_ips) as unique_ips,
            min(log_date) as first_seen,
            max(log_date) as last_seen
        FROM mabase_prod.view_dashboard_entities
        WHERE entity_type = %(entity_type)s
            AND entity_value = %(entity_value)s
            AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
        GROUP BY entity_type, entity_value
    """
    bindings = {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    }
    rows = db.query(query, bindings).result_rows
    if not rows:
        return None

    # Field names in the same order as the SELECT columns.
    field_names = (
        "entity_type",
        "entity_value",
        "total_requests",
        "unique_ips",
        "first_seen",
        "last_seen",
    )
    return EntityStats(**dict(zip(field_names, rows[0])))
def get_related_attributes(entity_type: str, entity_value: str, hours: int = 24) -> EntityRelatedAttributes:
    """
    Fetch the attributes associated with an entity.

    Runs one query made of five scalar subqueries (IPs, JA4s, hosts, ASNs,
    countries) and drops falsy values from each resulting list.

    Returns:
        An ``EntityRelatedAttributes``; all lists are empty when the query
        returns nothing or only empty columns.
    """
    # One scalar subquery per attribute, all sharing the same entity filter
    query = """
        SELECT
            (SELECT groupUniqArray(toString(src_ip)) FROM mabase_prod.view_dashboard_entities WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)) as ips,
            (SELECT groupUniqArray(ja4) FROM mabase_prod.view_dashboard_entities WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR) AND ja4 != '') as ja4s,
            (SELECT groupUniqArray(host) FROM mabase_prod.view_dashboard_entities WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR) AND host != '') as hosts,
            (SELECT groupUniqArrayArray(asns) FROM mabase_prod.view_dashboard_entities WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR) AND notEmpty(asns)) as asns,
            (SELECT groupUniqArrayArray(countries) FROM mabase_prod.view_dashboard_entities WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR) AND notEmpty(countries)) as countries
    """
    bindings = {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    }
    rows = db.query(query, bindings).result_rows

    first_row = rows[0] if rows else ()
    if not any(first_row):
        return EntityRelatedAttributes(ips=[], ja4s=[], hosts=[], asns=[], countries=[])

    def truthy_items(column):
        # A NULL/empty column yields an empty list; falsy members are dropped.
        return [item for item in (column or []) if item]

    return EntityRelatedAttributes(
        ips=[str(item) for item in truthy_items(first_row[0])],
        ja4s=truthy_items(first_row[1]),
        hosts=truthy_items(first_row[2]),
        asns=truthy_items(first_row[3]),
        countries=truthy_items(first_row[4])
    )
def get_array_values(entity_type: str, entity_value: str, array_field: str, hours: int = 24) -> List[EntityAttributeValue]:
    """
    Expand an Array column (user_agents, client_headers, ...) and count its values.

    Args:
        entity_type: Entity type used to filter ``view_dashboard_entities``.
        entity_value: Entity value used to filter ``view_dashboard_entities``.
        array_field: Name of the Array column to expand. It is interpolated
            into the SQL text (column names cannot be bound as parameters),
            so it must be a plain identifier.
        hours: Time window in hours.

    Returns:
        One ``EntityAttributeValue`` per distinct value, ordered by
        descending count, with its share of all expanded values in percent.

    Raises:
        ValueError: If ``array_field`` is not a plain SQL identifier.
    """
    import re

    # Guard the only piece of the query that is spliced in as raw text.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", array_field or ""):
        raise ValueError(f"Invalid array field name: {array_field!r}")

    query = f"""
        SELECT
            value,
            count() as count,
            round(count * 100.0 / sum(count) OVER (), 2) as percentage
        FROM (
            SELECT
                arrayJoin({array_field}) as value
            FROM mabase_prod.view_dashboard_entities
            WHERE entity_type = %(entity_type)s
                AND entity_value = %(entity_value)s
                AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
                AND notEmpty({array_field})
        )
        GROUP BY value
        ORDER BY count DESC
    """
    result = db.query(query, {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    })
    return [
        EntityAttributeValue(
            value=row[0],
            count=row[1],
            percentage=row[2]
        )
        for row in result.result_rows
    ]
@router.get("/subnet/{subnet:path}")
async def get_subnet_investigation(
    subnet: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return every IP of a /24 subnet together with its statistics.

    Detections come from ml_detected_anomalies; user-agent counts come from
    view_dashboard_entities. Matching is done on the first three dotted
    octets of the subnet address.

    Args:
        subnet: Subnet in dotted notation, optionally with a ``/prefix``
            suffix (e.g. ``192.168.1.0/24``).
        hours: Time window in hours for the detection queries.

    Returns:
        ``{"stats": {...}, "ips": [...]}``.

    Raises:
        HTTPException: 400 when the subnet does not start with three numeric
            octets, 404 when no IP of the subnet has detections, 500 on any
            other failure.
    """
    # Base address without the prefix length (192.168.1.0/24 -> 192.168.1.0)
    subnet_ip = subnet.split('/', 1)[0]
    # First three octets are used for filtering (e.g. 141.98.11)
    subnet_parts = subnet_ip.split('.')[:3]
    octets_ok = len(subnet_parts) == 3 and all(
        part.isascii() and part.isdigit() and int(part) <= 255
        for part in subnet_parts
    )
    if not octets_ok:
        # Reject malformed input explicitly instead of failing with an
        # IndexError that would surface as a 500.
        raise HTTPException(
            status_code=400,
            detail="Subnet invalide (format attendu: a.b.c.0/24)"
        )
    # Kept as strings: they are compared with splitByChar() string results.
    subnet_prefix, subnet_mask, subnet_third = subnet_parts

    try:
        # Subnet-wide stats — ml_detected_anomalies + view_dashboard_entities for UAs
        stats_query = """
            WITH cleaned_ips AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    ja4,
                    host,
                    country_code,
                    asn_number
                FROM ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
            ),
            subnet_filter AS (
                SELECT *
                FROM cleaned_ips
                WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
                    AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
                    AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
            ),
            -- Récupérer les user-agents depuis view_dashboard_entities
            ua_data AS (
                SELECT
                    entity_value AS ip,
                    arrayJoin(user_agents) AS user_agent
                FROM view_dashboard_entities
                WHERE entity_type = 'ip'
                    AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
                    AND splitByChar('.', entity_value)[1] = %(subnet_prefix)s
                    AND splitByChar('.', entity_value)[2] = %(subnet_mask)s
                    AND splitByChar('.', entity_value)[3] = %(subnet_third)s
            )
            SELECT
                %(subnet)s AS subnet,
                uniq(clean_ip) AS total_ips,
                count() AS total_detections,
                uniq(ja4) AS unique_ja4,
                (SELECT uniq(user_agent) FROM ua_data) AS unique_ua,
                uniq(host) AS unique_hosts,
                argMax(country_code, detected_at) AS primary_country,
                argMax(asn_number, detected_at) AS primary_asn,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM subnet_filter
        """
        stats_result = db.query(stats_query, {
            "subnet": subnet,
            "subnet_prefix": subnet_prefix,
            "subnet_mask": subnet_mask,
            "subnet_third": subnet_third,
            "hours": hours
        })
        # total_ips == 0 means no detection matched the subnet.
        if not stats_result.result_rows or stats_result.result_rows[0][1] == 0:
            raise HTTPException(status_code=404, detail="Subnet non trouvé")

        stats_row = stats_result.result_rows[0]
        stats = {
            "subnet": subnet,
            "total_ips": stats_row[1] or 0,
            "total_detections": stats_row[2] or 0,
            "unique_ja4": stats_row[3] or 0,
            "unique_ua": stats_row[4] or 0,
            "unique_hosts": stats_row[5] or 0,
            "primary_country": stats_row[6] or "XX",
            "primary_asn": str(stats_row[7]) if stats_row[7] else "?",
            "first_seen": stats_row[8].isoformat() if stats_row[8] else "",
            "last_seen": stats_row[9].isoformat() if stats_row[9] else ""
        }

        # Per-IP details — two separate queries merged in Python
        ips_query = """
            WITH cleaned_ips AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    ja4,
                    country_code,
                    asn_number,
                    threat_level,
                    anomaly_score
                FROM ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
            ),
            subnet_filter AS (
                SELECT *
                FROM cleaned_ips
                WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
                    AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
                    AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
            )
            SELECT
                clean_ip AS ip,
                count() AS total_detections,
                uniq(ja4) AS unique_ja4,
                argMax(country_code, detected_at) AS primary_country,
                argMax(asn_number, detected_at) AS primary_asn,
                argMax(threat_level, detected_at) AS threat_level,
                avg(anomaly_score) AS avg_score,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM subnet_filter
            GROUP BY ip
            ORDER BY total_detections DESC
        """
        ips_result = db.query(ips_query, {
            "subnet_prefix": subnet_prefix,
            "subnet_mask": subnet_mask,
            "subnet_third": subnet_third,
            "hours": hours
        })

        # IPs found above drive the user-agent lookup
        ip_list = [str(row[0]) for row in ips_result.result_rows]

        unique_ua_dict = {}
        if ip_list:
            # The IP list is bound as a parameter rather than concatenated
            # into the SQL text.
            # NOTE(review): this lookup uses a fixed 30-day window and does
            # not follow `hours` — confirm this is intended.
            ua_query = """
                SELECT
                    entity_value AS ip,
                    uniq(arrayJoin(user_agents)) AS unique_ua
                FROM view_dashboard_entities
                PREWHERE entity_type = 'ip'
                WHERE entity_value IN %(ips)s
                    AND log_date >= today() - INTERVAL 30 DAY
                GROUP BY entity_value
            """
            ua_result = db.query(ua_query, {"ips": tuple(ip_list)})
            unique_ua_dict = {row[0]: row[1] for row in ua_result.result_rows}

        # Merge detection rows with their user-agent counts
        ips = [
            {
                "ip": str(row[0]),
                "total_detections": row[1],
                "unique_ja4": row[2],
                "unique_ua": unique_ua_dict.get(row[0], 0),
                "primary_country": row[3] or "XX",
                "primary_asn": str(row[4]) if row[4] else "?",
                "threat_level": row[5] or "LOW",
                "avg_score": abs(row[6] or 0),
                "first_seen": row[7].isoformat() if row[7] else "",
                "last_seen": row[8].isoformat() if row[8] else ""
            }
            for row in ips_result.result_rows
        ]
        return {
            "stats": stats,
            "ips": ips
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/types")
async def get_entity_types():
    """
    Return the list of supported entity types with a short description each.

    NOTE: This route MUST be declared before /{entity_type}/... so that it is
    not shadowed.
    """
    descriptions = {
        "ip": "Adresse IP source",
        "ja4": "Fingerprint JA4 TLS",
        "user_agent": "User-Agent HTTP",
        "client_header": "Client Header",
        "host": "Host HTTP",
        "path": "Path URL",
        "query_param": "Query Param"
    }
    type_names = sorted(VALID_ENTITY_TYPES)
    return {"entity_types": type_names, "descriptions": descriptions}
def _ensure_valid_entity_type(entity_type: str, list_supported: bool = False) -> None:
    """Raise HTTP 400 when ``entity_type`` is not in ``VALID_ENTITY_TYPES``.

    With ``list_supported`` the error detail also lists the accepted types
    (sorted, so the message is deterministic).
    """
    if entity_type in VALID_ENTITY_TYPES:
        return
    detail = "Type d'entité invalide"
    if list_supported:
        detail = f"Type d'entité invalide. Types supportés: {', '.join(sorted(VALID_ENTITY_TYPES))}"
    raise HTTPException(status_code=400, detail=detail)


# NOTE: routes are matched in declaration order and the ``:path`` converter
# also matches "/" characters. The sub-resource routes below must therefore be
# declared BEFORE the generic /{entity_type}/{entity_value:path} route,
# otherwise the generic route captures e.g. ".../related" as part of
# entity_value and these handlers are never reached.

@router.get("/{entity_type}/{entity_value:path}/related")
async def get_entity_related(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return only the attributes associated with an entity.

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported.
    """
    _ensure_valid_entity_type(entity_type, list_supported=True)
    related = get_related_attributes(entity_type, entity_value, hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "hours": hours,
        "related": related
    }


@router.get("/{entity_type}/{entity_value:path}/user_agents")
async def get_entity_user_agents(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the User-Agents associated with an entity.

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported.
    """
    _ensure_valid_entity_type(entity_type)
    user_agents = get_array_values(entity_type, entity_value, 'user_agents', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "user_agents": user_agents,
        "total": len(user_agents)
    }


@router.get("/{entity_type}/{entity_value:path}/client_headers")
async def get_entity_client_headers(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Client-Headers associated with an entity.

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported.
    """
    _ensure_valid_entity_type(entity_type)
    client_headers = get_array_values(entity_type, entity_value, 'client_headers', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "client_headers": client_headers,
        "total": len(client_headers)
    }


@router.get("/{entity_type}/{entity_value:path}/paths")
async def get_entity_paths(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Paths associated with an entity.

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported.
    """
    _ensure_valid_entity_type(entity_type)
    paths = get_array_values(entity_type, entity_value, 'paths', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "paths": paths,
        "total": len(paths)
    }


@router.get("/{entity_type}/{entity_value:path}/query_params")
async def get_entity_query_params(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Query-Params associated with an entity.

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported.
    """
    _ensure_valid_entity_type(entity_type)
    query_params = get_array_values(entity_type, entity_value, 'query_params', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "query_params": query_params,
        "total": len(query_params)
    }


# Generic catch-all route: declared last (see the ordering note above).
@router.get("/{entity_type}/{entity_value:path}", response_model=EntityInvestigation)
async def get_entity_investigation(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720, description="Fenêtre temporelle en heures")
):
    """
    Full investigation for a given entity.

    - **entity_type**: Entity type (ip, ja4, user_agent, client_header, host, path, query_param)
    - **entity_value**: Entity value
    - **hours**: Time window (default: 24h)

    Returns:
    - General stats
    - Related attributes (IPs, JA4, Hosts, ASNs, Countries)
    - User-Agents
    - Client-Headers
    - Paths
    - Query-Params

    Raises:
        HTTPException: 400 when ``entity_type`` is not supported, 404 when no
            stats exist for the entity in the time window.
    """
    _ensure_valid_entity_type(entity_type, list_supported=True)

    # General stats; their absence means the entity is unknown in the window
    stats = get_entity_stats(entity_type, entity_value, hours)
    if not stats:
        raise HTTPException(status_code=404, detail="Entité non trouvée")

    related = get_related_attributes(entity_type, entity_value, hours)
    user_agents = get_array_values(entity_type, entity_value, 'user_agents', hours)
    client_headers = get_array_values(entity_type, entity_value, 'client_headers', hours)
    paths = get_array_values(entity_type, entity_value, 'paths', hours)
    query_params = get_array_values(entity_type, entity_value, 'query_params', hours)

    return EntityInvestigation(
        stats=stats,
        related=related,
        user_agents=user_agents,
        client_headers=client_headers,
        paths=paths,
        query_params=query_params
    )

View File

@ -0,0 +1,827 @@
"""
Endpoints pour l'analyse des fingerprints JA4 et User-Agents
Objectifs:
- Détecter le spoofing JA4 (fingerprint TLS qui prétend être un navigateur mais
dont les User-Agents, les headers HTTP ou les métriques comportementales trahissent
une origine bot/script)
- Construire une matrice JA4 × User-Agent pour visualiser les associations suspectes
- Analyser la distribution des User-Agents pour identifier les rotateurs et les bots
qui usurpent des UA de navigateurs légitimes
"""
from fastapi import APIRouter, HTTPException, Query
import re
from ..database import db
router = APIRouter(prefix="/api/fingerprints", tags=["fingerprints"])
# ─── Helpers ──────────────────────────────────────────────────────────────────
# Patterns indiquant clairement un bot/script sans simulation de navigateur
_BOT_PATTERNS = re.compile(
r"bot|crawler|spider|scraper|python|curl|wget|go-http|java/|axios|"
r"libwww|httpclient|okhttp|requests|aiohttp|httpx|playwright|puppeteer|"
r"selenium|headless|phantomjs",
re.IGNORECASE,
)
# Navigateurs légitimes communs — un JA4 de type "browser" devrait venir avec ces UAs
_BROWSER_PATTERNS = re.compile(
r"mozilla|chrome|safari|firefox|edge|opera|trident",
re.IGNORECASE,
)
def _classify_ua(ua: str) -> str:
"""Retourne 'bot', 'browser', ou 'script'"""
if not ua:
return "empty"
if _BOT_PATTERNS.search(ua):
return "bot"
if _BROWSER_PATTERNS.search(ua):
return "browser"
return "script"
# =============================================================================
# ENDPOINT 1 — Détection de spoofing JA4
# =============================================================================
@router.get("/spoofing")
async def get_ja4_spoofing(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
    min_detections: int = Query(10, ge=1, description="Nombre minimum de détections"),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Rank JA4 fingerprints by how likely they are to be spoofing a browser.

    Detections from ``ml_detected_anomalies`` are aggregated per JA4 over the
    last ``hours`` hours (only JA4s with at least ``min_detections`` rows are
    kept), then enriched with up to five User-Agents per JA4 read from
    ``view_dashboard_user_agents``.

    Spoofing score (capped at 100):
    - UA / Client-Hints mismatch rate ........ up to 40
    - average modern_browser_score ........... up to 25, only when one of the
      top UAs is classified as a bot
    - SNI mismatch rate ...................... up to 10
    - ALPN mismatch rate ..................... up to 5
    - rare-JA4 rate .......................... up to 10
    - UA-rotating rate ....................... up to 10
    - flat 10 when ALPN is missing on more than 30% of detections

    Classification: ``spoofed_browser`` (score >= 60), ``known_bot`` (bot UA
    and browser score < 30), ``legitimate_browser`` (browser UA and UA/CH
    mismatch < 10%), otherwise ``suspicious``.

    Returns:
        dict with ``items`` (sorted by score, then detections, descending),
        ``total``, ``period_hours`` and a per-classification ``summary``.

    Raises:
        HTTPException: 500 on any failure of the main query or processing.
    """
    try:
        # Per-JA4 aggregation of every spoofing indicator.
        query = """
            SELECT
                ja4,
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                -- Indicateurs de mismatch
                countIf(ua_ch_mismatch = true) AS ua_ch_mismatch_count,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                countIf(sni_host_mismatch = true) AS sni_mismatch_count,
                round(countIf(sni_host_mismatch = true) * 100.0 / count(), 2) AS sni_mismatch_pct,
                countIf(alpn_http_mismatch = true) AS alpn_mismatch_count,
                round(countIf(alpn_http_mismatch = true) * 100.0 / count(), 2) AS alpn_mismatch_pct,
                -- Indicateurs comportementaux
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_ja4_count,
                round(countIf(is_rare_ja4 = true) * 100.0 / count(), 2) AS rare_ja4_pct,
                countIf(is_ua_rotating = true) AS ua_rotating_count,
                round(countIf(is_ua_rotating = true) * 100.0 / count(), 2) AS ua_rotating_pct,
                -- Métriques TLS/TCP
                countIf(is_alpn_missing = true) AS alpn_missing_count,
                avg(distinct_ja4_count) AS avg_distinct_ja4_per_ip,
                -- Répartition threat levels
                countIf(threat_level = 'CRITICAL') AS critical_count,
                countIf(threat_level = 'HIGH') AS high_count,
                -- Botnet indicators
                avg(ja4_asn_concentration) AS avg_asn_concentration,
                avg(ja4_country_concentration) AS avg_country_concentration,
                argMax(threat_level, detected_at) AS last_threat_level
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING total_detections >= %(min_detections)s
            ORDER BY ua_ch_mismatch_pct DESC, total_detections DESC
            LIMIT %(limit)s
        """
        result = db.query(query, {
            "hours": hours,
            "min_detections": min_detections,
            "limit": limit,
        })

        # Top User-Agents per JA4 from view_dashboard_user_agents.
        ja4_list = [str(r[0]) for r in result.result_rows if r[0]]
        ua_by_ja4: dict = {}
        if ja4_list:
            # JA4 values are bound as a parameter instead of being spliced
            # into the SQL text, so a stored value containing a quote or a
            # backslash cannot alter the statement.
            ua_q = """
                SELECT ja4, groupArray(5)(ua) AS top_uas
                FROM (
                    SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
                    FROM view_dashboard_user_agents
                    WHERE ja4 IN %(ja4s)s
                      AND hour >= now() - INTERVAL %(hours)s HOUR
                      AND ua != ''
                    GROUP BY ja4, ua
                    ORDER BY ja4, cnt DESC
                )
                GROUP BY ja4
            """
            try:
                # Only the first 100 JA4s are looked up to bound query size.
                ua_res = db.query(ua_q, {"ja4s": ja4_list[:100], "hours": hours})
                for ua_row in ua_res.result_rows:
                    if ua_row[1]:
                        ua_by_ja4[str(ua_row[0])] = list(ua_row[1])
            except Exception:
                # Best effort: UA enrichment is optional; without it the
                # items are still returned, just with no top_user_agents.
                pass

        items = []
        for row in result.result_rows:
            ja4 = str(row[0])
            ua_ch_mismatch_pct = float(row[4] or 0)
            sni_mismatch_pct = float(row[6] or 0)
            alpn_mismatch_pct = float(row[8] or 0)
            avg_browser_score = float(row[9] or 0)
            rare_ja4_pct = float(row[11] or 0)
            ua_rotating_pct = float(row[13] or 0)
            alpn_missing_count = int(row[14] or 0)
            total = int(row[1] or 1)

            # Classify each UA once; reused for the score and the payload.
            classified_uas = [
                {"ua": u, "type": _classify_ua(u)} for u in ua_by_ja4.get(ja4, [])
            ]
            has_bot_ua = any(c["type"] == "bot" for c in classified_uas)
            has_browser_ua = any(c["type"] == "browser" for c in classified_uas)

            # Spoofing confidence score in [0, 100]; see the docstring for
            # the weight of each term.
            spoof_score = min(100, round(
                ua_ch_mismatch_pct * 0.40
                + (avg_browser_score * 25 / 100 if has_bot_ua else 0)
                + sni_mismatch_pct * 0.10
                + alpn_mismatch_pct * 0.05
                + rare_ja4_pct * 0.10
                + ua_rotating_pct * 0.10
                + (10 if alpn_missing_count > total * 0.3 else 0)
            ))

            if spoof_score >= 60:
                classification = "spoofed_browser"
            elif has_bot_ua and avg_browser_score < 30:
                classification = "known_bot"
            elif has_browser_ua and ua_ch_mismatch_pct < 10:
                classification = "legitimate_browser"
            else:
                classification = "suspicious"

            items.append({
                "ja4": ja4,
                "classification": classification,
                "spoofing_score": spoof_score,
                "total_detections": int(row[1] or 0),
                "unique_ips": int(row[2] or 0),
                "indicators": {
                    "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                    "sni_mismatch_pct": sni_mismatch_pct,
                    "alpn_mismatch_pct": alpn_mismatch_pct,
                    "avg_browser_score": round(avg_browser_score, 1),
                    "rare_ja4_pct": rare_ja4_pct,
                    "ua_rotating_pct": ua_rotating_pct,
                    "alpn_missing_count": alpn_missing_count,
                    "avg_asn_concentration": round(float(row[18] or 0), 3),
                    "avg_country_concentration": round(float(row[19] or 0), 3),
                },
                "top_user_agents": classified_uas,
                "threat_breakdown": {
                    "critical": int(row[16] or 0),
                    "high": int(row[17] or 0),
                    "last_level": str(row[20] or "LOW"),
                },
            })

        # Highest spoofing score first; ties broken by detection volume.
        items.sort(key=lambda x: (-x["spoofing_score"], -x["total_detections"]))

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "summary": {
                "spoofed_browser": sum(1 for i in items if i["classification"] == "spoofed_browser"),
                "known_bot": sum(1 for i in items if i["classification"] == "known_bot"),
                "suspicious": sum(1 for i in items if i["classification"] == "suspicious"),
                "legitimate_browser": sum(1 for i in items if i["classification"] == "legitimate_browser"),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT 2 — Matrice JA4 × User-Agent
# =============================================================================
@router.get("/ja4-ua-matrix")
async def get_ja4_ua_matrix(
    hours: int = Query(24, ge=1, le=168),
    min_ips: int = Query(3, ge=1, description="Nombre minimum d'IPs pour inclure un JA4"),
    limit: int = Query(30, ge=1, le=100),
):
    """
    Build the JA4 x User-Agent matrix.

    For each JA4 seen from at least ``min_ips`` distinct IPs during the last
    ``hours`` hours, returns:
    - detection statistics from ``ml_detected_anomalies``
    - up to 8 associated User-Agents (from ``view_dashboard_user_agents``),
      each with its request count, share and bot / browser / script type
    - a ``ua_summary`` with the bot / browser / script shares
    - ``is_spoofing_suspect``: average browser score > 50, bot UA share > 30%
      and UA/Client-Hints mismatch rate > 20%

    Shares are computed over the (at most 8) UAs kept for the JA4.

    Raises:
        HTTPException: 500 on any failure of the stats query or processing.
    """
    try:
        # Per-JA4 statistics from ml_detected_anomalies.
        stats_query = """
            SELECT
                ja4,
                uniq(src_ip) AS unique_ips,
                count() AS total_detections,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_count,
                countIf(is_ua_rotating = true) AS rotating_count,
                argMax(threat_level, detected_at) AS last_threat
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
            ORDER BY ua_ch_mismatch_pct DESC, unique_ips DESC
            LIMIT %(limit)s
        """
        stats_res = db.query(stats_query, {"hours": hours, "min_ips": min_ips, "limit": limit})
        ja4_list = [str(r[0]) for r in stats_res.result_rows]
        if not ja4_list:
            return {"items": [], "total": 0, "period_hours": hours}

        # User-Agents per JA4. The JA4 list is bound as a parameter rather
        # than spliced into the SQL text, so stored values cannot alter the
        # statement.
        ua_query = """
            SELECT
                ja4,
                ua,
                sum(requests) AS cnt
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE ja4 IN %(ja4s)s
              AND hour >= now() - INTERVAL %(hours)s HOUR
              AND ua != ''
            GROUP BY ja4, ua
            ORDER BY ja4, cnt DESC
        """
        ua_by_ja4: dict = {}
        try:
            ua_res = db.query(ua_query, {"ja4s": ja4_list, "hours": hours})
            for row in ua_res.result_rows:
                bucket = ua_by_ja4.setdefault(str(row[0]), [])
                # Rows arrive ordered by count, so the first 8 are the top 8.
                if len(bucket) < 8:
                    bucket.append({"ua": str(row[1]), "count": int(row[2] or 0)})
        except Exception:
            # Best effort: without UA data the matrix is still returned,
            # with empty user_agents lists.
            pass

        items = []
        for row in stats_res.result_rows:
            ja4 = str(row[0])
            unique_ips = int(row[1] or 0)
            ua_ch_mismatch_pct = float(row[3] or 0)
            avg_browser_score = float(row[4] or 0)

            top_uas = ua_by_ja4.get(ja4, [])
            ua_total = sum(u["count"] for u in top_uas) or 1  # avoid /0
            classified_uas = [
                {
                    "ua": u["ua"],
                    "count": u["count"],
                    "pct": round(u["count"] * 100 / ua_total, 1),
                    "type": _classify_ua(u["ua"]),
                }
                for u in top_uas
            ]
            bot_pct = sum(u["pct"] for u in classified_uas if u["type"] == "bot")
            browser_pct = sum(u["pct"] for u in classified_uas if u["type"] == "browser")
            # Summed from the remaining UAs rather than derived as
            # 100 - bot - browser: the subtraction reported 100% "script"
            # when no UA was known and could go slightly negative because
            # the per-UA shares are already rounded.
            script_pct = sum(
                u["pct"] for u in classified_uas if u["type"] not in ("bot", "browser")
            )

            # Spoofing flag: the JA4 looks like a browser (high browser
            # score) while a significant share of its UAs are bots.
            is_spoofing = avg_browser_score > 50 and bot_pct > 30 and ua_ch_mismatch_pct > 20

            items.append({
                "ja4": ja4,
                "unique_ips": unique_ips,
                "total_detections": int(row[2] or 0),
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "avg_browser_score": round(avg_browser_score, 1),
                "rare_count": int(row[5] or 0),
                "rotating_count": int(row[6] or 0),
                "last_threat": str(row[7] or "LOW"),
                "user_agents": classified_uas,
                "ua_summary": {
                    "bot_pct": round(bot_pct, 1),
                    "browser_pct": round(browser_pct, 1),
                    "script_pct": round(script_pct, 1),
                    "total_distinct": len(top_uas),
                },
                "is_spoofing_suspect": is_spoofing,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT 3 — Analyse globale des User-Agents
# =============================================================================
@router.get("/ua-analysis")
async def get_ua_analysis(
    hours: int = Query(24, ge=1, le=168),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Global analysis of the User-Agents seen over the last ``hours`` hours.

    For the ``limit`` most requested User-Agents, returns:
    - the UA type (bot / browser / script)
    - the number of distinct JA4 fingerprints seen with it and up to three
      samples (looked up for the first 50 UAs only)
    - ``is_multi_ja4_suspect`` when more than 3 distinct JA4s share the UA
    - risk flags from ``_build_ua_risk_flags``

    Also returns the IPs flagged ``is_ua_rotating`` in
    ``ml_detected_anomalies`` (count plus a sample of 10).

    Note: the ``ip_count`` field is populated from ``sum(requests)``, i.e. it
    is a request volume; the key name is kept for API compatibility.

    Raises:
        HTTPException: 500 on any failure of the main query or processing.
    """
    try:
        # Most requested UAs from view_dashboard_user_agents.
        ua_global_query = """
            SELECT
                ua,
                sum(requests) AS ip_count
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE hour >= now() - INTERVAL %(hours)s HOUR
              AND ua != ''
            GROUP BY ua
            ORDER BY ip_count DESC
            LIMIT %(limit)s
        """
        ua_global_res = db.query(ua_global_query, {"hours": hours, "limit": limit})
        top_uas = [str(r[0]) for r in ua_global_res.result_rows]

        # JA4 fingerprints per UA. User-Agent strings are attacker-controlled
        # request headers, so they are bound as a parameter: doubling single
        # quotes by hand does not neutralise backslash escapes.
        ja4_by_ua: dict = {}
        if top_uas:
            ja4_per_ua_query = """
                SELECT
                    ua,
                    uniq(ja4) AS unique_ja4s,
                    groupUniqArray(3)(ja4) AS sample_ja4s
                FROM view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE ua IN %(uas)s
                  AND hour >= now() - INTERVAL %(hours)s HOUR
                  AND ua != ''
                  AND ja4 != ''
                GROUP BY ua
            """
            try:
                ja4_res = db.query(ja4_per_ua_query, {"uas": top_uas[:50], "hours": hours})
                for r in ja4_res.result_rows:
                    ja4_by_ua[str(r[0])] = {
                        "unique_ja4s": int(r[1] or 0),
                        "sample_ja4s": list(r[2] or []),
                    }
            except Exception:
                # Best effort: UAs without JA4 data fall back to zero counts.
                pass

        # IPs flagged is_ua_rotating in ml_detected_anomalies.
        rotating_query = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                avg(ua_ch_mismatch) AS avg_ua_ch_mismatch
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND is_ua_rotating = true
            GROUP BY clean_ip
            ORDER BY avg_ua_ch_mismatch DESC
        """
        rotating_ips: list = []
        try:
            rot_res = db.query(rotating_query, {"hours": hours})
            rotating_ips = [str(r[0]) for r in rot_res.result_rows]
        except Exception:
            # Best effort: rotation statistics are reported as empty.
            pass

        items = []
        for row in ua_global_res.result_rows:
            ua = str(row[0])
            ip_count = int(row[1] or 0)
            ua_type = _classify_ua(ua)
            ja4_info = ja4_by_ua.get(ua, {"unique_ja4s": 0, "sample_ja4s": []})
            # One UA spread over many JA4s is suspicious: the threshold used
            # here is more than 3 distinct fingerprints.
            multi_ja4_flag = ja4_info["unique_ja4s"] > 3
            items.append({
                "user_agent": ua,
                "type": ua_type,
                "ip_count": ip_count,
                "unique_ja4_count": ja4_info["unique_ja4s"],
                "sample_ja4s": ja4_info["sample_ja4s"],
                "is_multi_ja4_suspect": multi_ja4_flag,
                "risk_flags": _build_ua_risk_flags(ua, ua_type, ja4_info["unique_ja4s"], ip_count),
            })

        ua_rotating_stats = {
            "rotating_ip_count": len(rotating_ips),
            "sample_rotating_ips": rotating_ips[:10],
        }

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "ua_rotating_stats": ua_rotating_stats,
            "summary": {
                "bot_count": sum(1 for i in items if i["type"] == "bot"),
                "browser_count": sum(1 for i in items if i["type"] == "browser"),
                "script_count": sum(1 for i in items if i["type"] == "script"),
                "multi_ja4_suspect_count": sum(1 for i in items if i["is_multi_ja4_suspect"]),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
def _build_ua_risk_flags(ua: str, ua_type: str, unique_ja4s: int, ip_count: int) -> list:
flags = []
if ua_type == "bot":
flags.append("ua_bot_signature")
elif ua_type == "script":
flags.append("ua_script_library")
if unique_ja4s > 5:
flags.append("ja4_rotation_suspect")
if unique_ja4s > 3 and ua_type == "browser":
flags.append("browser_ua_multi_fingerprint")
if ip_count > 100:
flags.append("high_volume")
return flags
# =============================================================================
# ENDPOINT 4 — JA4 d'un IP spécifique: analyse de cohérence UA/JA4
# =============================================================================
@router.get("/ip/{ip}/coherence")
async def get_ip_fingerprint_coherence(ip: str):
    """
    Assess whether the JA4 fingerprints and User-Agents of one IP are coherent.

    Reads every ``ml_detected_anomalies`` row for the IP (newest first) and the
    User-Agents seen for it in ``view_dashboard_user_agents`` over the last
    72 hours, then derives a spoofing score (capped at 100) from:
    - UA / Client-Hints mismatch rate (x 0.40)
    - average modern_browser_score (x 0.20), only when a bot UA was seen
    - SNI mismatch rate (x 0.10) and ALPN mismatch rate (x 0.05)
    - 5 points per distinct JA4 when more than two were used
    - 15 points when any row has is_ua_rotating set
    - 10 points when more than half of the rows are flagged is_rare_ja4

    Verdicts: ``high_confidence_spoofing`` (>= 70), ``suspicious_spoofing``
    (>= 40), ``known_bot_no_spoofing``, ``legitimate_browser``, otherwise
    ``inconclusive``.

    Raises:
        HTTPException: 404 when the IP has no detection row; 500 on any
            other failure.
    """
    try:
        # NOTE(review): other queries in this module strip a '::ffff:' prefix
        # from src_ip before using it; both lookups below match the caller's
        # value as-is -- confirm this finds IPv4 clients.
        ml_query = """
            SELECT
                ja4,
                ua_ch_mismatch,
                modern_browser_score,
                sni_host_mismatch,
                alpn_http_mismatch,
                is_alpn_missing,
                is_rare_ja4,
                is_ua_rotating,
                distinct_ja4_count,
                header_count,
                has_accept_language,
                has_cookie,
                has_referer,
                header_order_shared_count,
                detected_at,
                threat_level,
                window_mss_ratio,
                tcp_jitter_variance,
                multiplexing_efficiency
            FROM ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
        """
        ml_res = db.query(ml_query, {"ip": ip})
        detections = ml_res.result_rows
        if not detections:
            raise HTTPException(status_code=404, detail="IP non trouvée dans les détections")

        # User-Agents actually observed for the IP.
        ua_query = """
            SELECT ua, sum(requests) AS cnt
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE toString(src_ip) = %(ip)s
              AND hour >= now() - INTERVAL 72 HOUR
              AND ua != ''
            GROUP BY ua ORDER BY cnt DESC
        """
        ua_res = db.query(ua_query, {"ip": ip})
        top_uas = []
        for ua_row in ua_res.result_rows:
            ua_text = str(ua_row[0])
            top_uas.append({
                "ua": ua_text,
                "count": int(ua_row[1] or 0),
                "type": _classify_ua(ua_text),
            })

        # Rows are ordered newest first, so index 0 is the latest detection.
        latest = detections[0]
        total_rows = len(detections)

        def _flagged(column: int) -> int:
            """Count the detection rows whose `column` value is truthy."""
            return sum(1 for det in detections if det[column])

        ua_ch_hits = _flagged(1)
        sni_hits = _flagged(3)
        alpn_hits = _flagged(4)
        rare_hits = _flagged(6)
        is_rotating = any(det[7] for det in detections)
        distinct_ja4s = {str(det[0]) for det in detections if det[0]}
        ja4_variants = len(distinct_ja4s)
        avg_browser_score = sum(int(det[2] or 0) for det in detections) / total_rows

        ua_ch_rate = ua_ch_hits / total_rows * 100
        sni_rate = sni_hits / total_rows * 100
        alpn_rate = alpn_hits / total_rows * 100

        seen_types = {entry["type"] for entry in top_uas}
        has_browser_ua = "browser" in seen_types
        has_bot_ua = "bot" in seen_types

        # Weighted spoofing score, capped at 100.
        browser_term = avg_browser_score * 0.20 if has_bot_ua else 0
        ja4_term = ja4_variants * 5 if ja4_variants > 2 else 0
        rotation_term = 15 if is_rotating else 0
        rarity_term = 10 if rare_hits > total_rows * 0.5 else 0
        spoof_score = min(100, round(
            ua_ch_rate * 0.40
            + browser_term
            + sni_rate * 0.10
            + alpn_rate * 0.05
            + ja4_term
            + rotation_term
            + rarity_term
        ))

        if spoof_score >= 70:
            verdict = "high_confidence_spoofing"
        elif spoof_score >= 40:
            verdict = "suspicious_spoofing"
        elif has_bot_ua and avg_browser_score < 20:
            verdict = "known_bot_no_spoofing"
        elif has_browser_ua and spoof_score < 20:
            verdict = "legitimate_browser"
        else:
            verdict = "inconclusive"

        # Human-readable reasons, one entry per indicator that fired.
        reasons = (
            (
                ua_ch_hits > total_rows * 0.3,
                f"UA-Client-Hints mismatch sur {round(ua_ch_hits*100/total_rows)}% des requêtes",
            ),
            (
                has_bot_ua and avg_browser_score > 40,
                f"JA4 ressemble à un navigateur (score {round(avg_browser_score)}/100) mais UA est de type bot",
            ),
            (
                ja4_variants > 2,
                f"{ja4_variants} JA4 distincts utilisés → rotation de fingerprint",
            ),
            (
                is_rotating,
                "is_ua_rotating détecté → rotation d'User-Agent confirmée",
            ),
            (
                sni_hits > 0,
                f"SNI ≠ Host header sur {sni_hits}/{total_rows} requêtes",
            ),
        )
        explanation = [text for fired, text in reasons if fired]
        if not explanation:
            explanation.append("Aucun indicateur de spoofing majeur détecté")

        return {
            "ip": ip,
            "verdict": verdict,
            "spoofing_score": spoof_score,
            "explanation": explanation,
            "indicators": {
                "ua_ch_mismatch_rate": round(ua_ch_rate, 1),
                "sni_mismatch_rate": round(sni_rate, 1),
                "alpn_mismatch_rate": round(alpn_rate, 1),
                "avg_browser_score": round(avg_browser_score, 1),
                "distinct_ja4_count": ja4_variants,
                "is_ua_rotating": is_rotating,
                "rare_ja4_rate": round(rare_hits / total_rows * 100, 1),
            },
            "fingerprints": {
                "ja4_list": list(distinct_ja4s),
                "latest_ja4": str(latest[0] or ""),
            },
            "user_agents": top_uas,
            "latest_detection": {
                "detected_at": latest[14].isoformat() if latest[14] else "",
                "threat_level": str(latest[15] or "LOW"),
                "modern_browser_score": int(latest[2] or 0),
                "header_count": int(latest[9] or 0),
                "has_accept_language": bool(latest[10]),
                "has_cookie": bool(latest[11]),
                "has_referer": bool(latest[12]),
                "header_order_shared_count": int(latest[13] or 0),
            },
        }
    except HTTPException:
        # Let the 404 above through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT 5 — JA4 légitimes (baseline / whitelist)
# =============================================================================
@router.get("/legitimate-ja4")
async def get_legitimate_ja4(
    hours: int = Query(168, ge=24, le=720, description="Fenêtre pour établir la baseline"),
    min_ips: int = Query(50, ge=5, description="Nombre minimum d'IPs pour qualifier un JA4 de légitime"),
):
    """
    Build a baseline of JA4 fingerprints that look legitimate.

    A JA4 is kept when, over the last ``hours`` hours of
    ``ml_detected_anomalies``:
    - it was seen from at least ``min_ips`` distinct IPs
    - its UA / Client-Hints mismatch rate is below 5%
    - its average modern_browser_score is above 60
    - none of its rows is flagged is_rare_ja4

    Each item carries a ``legitimacy_confidence`` in [0, 100]: 40 points for
    a low mismatch rate, 0.40 x the browser score, and up to 20 points for
    the IP count relative to ``min_ips``.

    The result is meant as a list of whitelist candidates to review manually.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        query = """
            SELECT
                ja4,
                uniq(src_ip) AS unique_ips,
                count() AS total_detections,
                round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                avg(modern_browser_score) AS avg_browser_score,
                countIf(is_rare_ja4 = true) AS rare_count,
                round(countIf(threat_level = 'CRITICAL') * 100.0 / count(), 2) AS critical_pct,
                round(countIf(threat_level = 'HIGH') * 100.0 / count(), 2) AS high_pct
            FROM ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
               AND ua_ch_mismatch_pct < 5.0
               AND avg_browser_score > 60
               AND rare_count = 0
            ORDER BY unique_ips DESC
        """
        result = db.query(query, {"hours": hours, "min_ips": min_ips})

        items = []
        for record in result.result_rows:
            ip_total = int(record[1] or 0)
            mismatch_pct = float(record[3] or 0)
            browser_score = float(record[4] or 0)
            confidence = min(100, round(
                (1 - mismatch_pct / 100) * 40
                + browser_score * 0.40
                + min(ip_total / min_ips, 1) * 20
            ))
            items.append({
                "ja4": str(record[0]),
                "unique_ips": ip_total,
                "total_detections": int(record[2] or 0),
                "ua_ch_mismatch_pct": mismatch_pct,
                "avg_browser_score": round(browser_score, 1),
                "critical_pct": float(record[6] or 0),
                "high_pct": float(record[7] or 0),
                "legitimacy_confidence": confidence,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "note": "Ces JA4 sont candidats à une whitelist. Vérifier manuellement avant de whitelister.",
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT — Corrélation JA4 × ASN / Pays (C5)
# Détecte les JA4 fortement concentrés sur un seul ASN ou pays
# → signal de botnet ciblé ou d'infrastructure de test/attaque partagée
# =============================================================================
@router.get("/asn-correlation")
async def get_ja4_asn_correlation(
    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
    limit: int = Query(50, ge=1, le=200),
):
    """
    List the JA4 fingerprints concentrated on a single ASN or country.

    Reads the last 24 hours of ``mabase_prod.agg_host_ip_ja4_1h`` (the window
    is fixed in the SQL). Rows are first grouped per (ja4, ASN, country) to
    count IPs, then per JA4 to derive the concentration ratios. A JA4 is
    returned when it has at least ``min_ips`` IPs and one of the two ratios
    reaches ``min_concentration``.

    ``correlation_type`` / ``risk``:
    - ``asn_monopoly`` / high    -- ASN concentration >= 0.85
    - ``asn_dominant`` / medium  -- ASN concentration >= ``min_concentration``
    - ``geo_targeted`` / medium  -- country concentration >= ``min_concentration``
    - ``distributed`` / low      -- otherwise

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # NOTE(review): in the inner query `ips_per_combo` and `country_ips`
        # are the same expression (uniq(src_ip) per ja4/ASN/country group), so
        # asn_concentration and country_concentration come out equal and the
        # "geo_targeted" branch below looks unreachable -- confirm whether
        # per-ASN and per-country totals were intended.
        sql = """
            SELECT
                ja4,
                sum(ips_per_combo) AS unique_ips,
                uniq(src_asn) AS unique_asns,
                uniq(src_country_code) AS unique_countries,
                toString(argMax(src_asn, ips_per_combo)) AS top_asn_number,
                argMax(asn_name, ips_per_combo) AS top_asn_name,
                argMax(src_country_code, country_ips) AS dominant_country,
                sum(total_hits) AS total_hits,
                round(max(ips_per_combo) / greatest(sum(ips_per_combo), 1), 3) AS asn_concentration,
                round(max(country_ips) / greatest(sum(ips_per_combo), 1), 3) AS country_concentration
            FROM (
                SELECT
                    ja4,
                    src_asn,
                    src_country_code,
                    any(src_as_name) AS asn_name,
                    uniq(src_ip) AS ips_per_combo,
                    uniq(src_ip) AS country_ips,
                    sum(hits) AS total_hits
                FROM mabase_prod.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR
                  AND ja4 != ''
                GROUP BY ja4, src_asn, src_country_code
            )
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
               AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
            ORDER BY asn_concentration DESC, unique_ips DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"min_ips": min_ips, "min_conc": min_concentration, "limit": limit})

        items = []
        for record in result.result_rows:
            asn_ratio = float(record[8] or 0)
            country_ratio = float(record[9] or 0)

            # First matching rule wins, from the most to the least severe.
            corr_type, risk = "distributed", "low"
            if asn_ratio >= 0.85:
                corr_type, risk = "asn_monopoly", "high"
            elif asn_ratio >= min_concentration:
                corr_type, risk = "asn_dominant", "medium"
            elif country_ratio >= min_concentration:
                corr_type, risk = "geo_targeted", "medium"

            items.append({
                "ja4": str(record[0]),
                "unique_ips": int(record[1]),
                "unique_asns": int(record[2]),
                "unique_countries": int(record[3]),
                "top_asn_name": str(record[5] or ""),
                "top_asn_number": str(record[4] or ""),
                "dominant_country": str(record[6] or ""),
                "total_hits": int(record[7] or 0),
                "asn_concentration": asn_ratio,
                "country_concentration": country_ratio,
                "correlation_type": corr_type,
                "risk": risk,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,101 @@
"""
Endpoints pour l'analyse des empreintes d'en-têtes HTTP
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
router = APIRouter(prefix="/api/headers", tags=["header_fingerprint"])
@router.get("/clusters")
async def get_header_clusters(limit: int = Query(50, ge=1, le=200)):
    """
    Return header-fingerprint clusters grouped by ``header_order_hash``.

    Reads ``mabase_prod.agg_header_fingerprint_1h`` and returns the ``limit``
    clusters with the most distinct IPs, plus the total number of distinct
    hashes in the table.

    Classification of a cluster:
    - ``legitimate``     -- average browser score >= 90 and UA/CH mismatch < 5%
    - ``bot_suspicious`` -- UA/CH mismatch > 50%
    - ``mixed``          -- otherwise

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
                header_order_hash AS hash,
                uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
                avg(modern_browser_score) AS avg_browser_score,
                sum(ua_ch_mismatch) AS ua_ch_mismatch_count,
                round(sum(ua_ch_mismatch) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
                groupArray(5)(sec_fetch_mode) AS top_sec_fetch_modes,
                round(sum(has_cookie) * 100.0 / count(), 2) AS has_cookie_pct,
                round(sum(has_referer) * 100.0 / count(), 2) AS has_referer_pct
            FROM mabase_prod.agg_header_fingerprint_1h
            GROUP BY header_order_hash
            ORDER BY unique_ips DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"limit": limit})

        total_sql = """
            SELECT uniq(header_order_hash)
            FROM mabase_prod.agg_header_fingerprint_1h
        """
        total_clusters = int(db.query(total_sql).result_rows[0][0])

        clusters = []
        for row in result.result_rows:
            h = str(row[0])
            unique_ips = int(row[1])
            avg_browser_score = float(row[2] or 0)
            ua_ch_mismatch_cnt = int(row[3])
            ua_ch_mismatch_pct = float(row[4] or 0)
            # De-duplicate while keeping the order returned by the query:
            # going through a set made the order depend on string hashing,
            # so the same data could serialise differently between processes.
            top_modes = list(dict.fromkeys(str(m) for m in (row[5] or [])))
            has_cookie_pct = float(row[6] or 0)
            has_referer_pct = float(row[7] or 0)

            if avg_browser_score >= 90 and ua_ch_mismatch_pct < 5:
                classification = "legitimate"
            elif ua_ch_mismatch_pct > 50:
                classification = "bot_suspicious"
            else:
                classification = "mixed"

            clusters.append({
                "hash": h,
                "unique_ips": unique_ips,
                "avg_browser_score": round(avg_browser_score, 2),
                "ua_ch_mismatch_count": ua_ch_mismatch_cnt,
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "top_sec_fetch_modes": top_modes,
                "has_cookie_pct": has_cookie_pct,
                "has_referer_pct": has_referer_pct,
                "classification": classification,
            })
        return {"clusters": clusters, "total_clusters": total_clusters}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/cluster/{hash}/ips")
async def get_cluster_ips(hash: str, limit: int = Query(50, ge=1, le=500)):
    """
    List the IPs that belong to one header-fingerprint cluster.

    Args:
        hash: The ``header_order_hash`` identifying the cluster.
        limit: Maximum number of IPs returned.

    Returns:
        dict with ``items`` (one entry per IP, ordered by browser score
        descending) and ``total`` (number of items returned).

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                any(modern_browser_score) AS browser_score,
                any(ua_ch_mismatch) AS ua_ch_mismatch,
                any(sec_fetch_mode) AS sec_fetch_mode,
                any(sec_fetch_dest) AS sec_fetch_dest
            FROM mabase_prod.agg_header_fingerprint_1h
            WHERE header_order_hash = %(hash)s
            GROUP BY src_ip
            ORDER BY browser_score DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"hash": hash, "limit": limit})
        # NULL / missing values are normalised to 0 or "" for the JSON payload.
        items = [
            {
                "ip": str(record[0]),
                "browser_score": int(record[1] or 0),
                "ua_ch_mismatch": int(record[2] or 0),
                "sec_fetch_mode": str(record[3] or ""),
                "sec_fetch_dest": str(record[4] or ""),
            }
            for record in result.result_rows
        ]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,144 @@
"""
Endpoints pour la heatmap temporelle (hits par heure / hôte)
"""
from collections import defaultdict
from fastapi import APIRouter, HTTPException, Query
from ..database import db
router = APIRouter(prefix="/api/heatmap", tags=["heatmap"])
@router.get("/hourly")
async def get_heatmap_hourly():
    """
    Return hits aggregated by hour of day over the last 72 hours.

    Rows of ``mabase_prod.agg_host_ip_ja4_1h`` are grouped on
    ``toHour(window_start)``, so the same hour of different days falls into
    one bucket.

    Returns:
        dict with ``hours``: a list ordered by hour, each entry holding
        ``hour``, ``hits``, ``unique_ips`` and ``max_rps``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        sql = """
            SELECT
                toHour(window_start) AS hour,
                sum(hits) AS hits,
                uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
                max(max_requests_per_sec) AS max_rps
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
            GROUP BY hour
            ORDER BY hour ASC
        """
        result = db.query(sql)
        hours = []
        for record in result.result_rows:
            hour_of_day, hit_total, ip_total, peak_rps = record[0], record[1], record[2], record[3]
            hours.append({
                "hour": int(hour_of_day),
                "hits": int(hit_total),
                "unique_ips": int(ip_total),
                "max_rps": int(peak_rps),
            })
        return {"hours": hours}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/top-hosts")
async def get_heatmap_top_hosts(limit: int = Query(20, ge=1, le=100)):
    """
    Return the most targeted hosts with their hits per hour of day.

    Both queries read the last 72 hours of
    ``mabase_prod.agg_host_ip_ja4_1h``. The first ranks hosts by total hits
    and keeps ``limit`` of them; the second fills, for each kept host, a
    24-slot list indexed by ``toHour(window_start)``.

    Returns:
        dict with ``items``: one entry per host (ranked by total hits) with
        ``host``, ``total_hits``, ``unique_ips``, ``unique_ja4s`` and
        ``hourly_hits``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Overall statistics per host.
        agg_sql = """
            SELECT
                host,
                sum(hits) AS total_hits,
                uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
                uniq(ja4) AS unique_ja4s
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
            GROUP BY host
            ORDER BY total_hits DESC
            LIMIT %(limit)s
        """
        agg_res = db.query(agg_sql, {"limit": limit})

        top_hosts = []
        host_stats = {}
        for record in agg_res.result_rows:
            name = str(record[0])
            top_hosts.append(name)
            host_stats[name] = {
                "host": name,
                "total_hits": int(record[1]),
                "unique_ips": int(record[2]),
                "unique_ja4s": int(record[3]),
            }
        if not top_hosts:
            return {"items": []}

        # Hour-of-day breakdown for the selected hosts only.
        hourly_sql = """
            SELECT
                host,
                toHour(window_start) AS hour,
                sum(hits) AS hits
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
              AND host IN %(hosts)s
            GROUP BY host, hour
        """
        hourly_res = db.query(hourly_sql, {"hosts": top_hosts})

        hourly_map: dict = {}
        for record in hourly_res.result_rows:
            slots = hourly_map.setdefault(str(record[0]), [0] * 24)
            slots[int(record[1])] += int(record[2])

        items = []
        for name in top_hosts:
            entry = dict(host_stats[name])
            # Hosts without any hourly row get an all-zero profile.
            entry["hourly_hits"] = hourly_map.setdefault(name, [0] * 24)
            items.append(entry)
        return {"items": items}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/matrix")
async def get_heatmap_matrix():
    """
    Return a top-15 hosts x 24 hours matrix of hits over the last 72 hours.

    Hosts are ranked by total hits in ``mabase_prod.agg_host_ip_ja4_1h``;
    each matrix row is a 24-slot list indexed by ``toHour(window_start)``,
    so the same hour of different days is summed into one cell.

    Returns:
        dict with ``hosts`` (ranked host names) and ``matrix`` (one row of
        24 hit counts per host, in the same order).

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # LIMIT 15 enforces the documented top-15: without it every host of
        # the window was returned and passed to the IN list below.
        top_sql = """
            SELECT host, sum(hits) AS total_hits
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
            GROUP BY host
            ORDER BY total_hits DESC
            LIMIT 15
        """
        top_res = db.query(top_sql)
        top_hosts = [str(r[0]) for r in top_res.result_rows]
        if not top_hosts:
            return {"hosts": [], "matrix": []}

        cell_sql = """
            SELECT
                host,
                toHour(window_start) AS hour,
                sum(hits) AS hits
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 72 HOUR
              AND host IN %(hosts)s
            GROUP BY host, hour
        """
        cell_res = db.query(cell_sql, {"hosts": top_hosts})

        # Hosts with no cell row still get an all-zero line via the default.
        matrix_map: dict = defaultdict(lambda: [0] * 24)
        for row in cell_res.result_rows:
            h = str(row[0])
            hour = int(row[1])
            hits = int(row[2])
            matrix_map[h][hour] += hits

        matrix = [matrix_map[h] for h in top_hosts]
        return {"hosts": top_hosts, "matrix": matrix}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,266 @@
"""
Routes pour la gestion des incidents clusterisés
"""
import hashlib
from fastapi import APIRouter, HTTPException, Query
from typing import List, Optional
from datetime import datetime
from ..database import db
router = APIRouter(prefix="/api/incidents", tags=["incidents"])
@router.get("/clusters")
async def get_incident_clusters(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle en heures"),
    min_severity: str = Query("LOW", description="Niveau de sévérité minimum"),
    limit: int = Query(20, ge=1, le=100, description="Nombre maximum de clusters")
):
    """Return automatically clustered incidents, grouped by /24 subnet.

    Detections from ``ml_detected_anomalies`` within the last ``hours`` hours
    are grouped by the first three dotted components of the source address.
    Each cluster is enriched with a sample IP, its most recent JA4 / country /
    ASN / threat level, a best-effort primary user agent and a trend computed
    against the preceding window of the same duration.

    Args:
        hours: Size of the time window, in hours.
        min_severity: Lowest severity to keep (LOW < MEDIUM < HIGH < CRITICAL).
            Unknown values are treated as LOW, i.e. no filtering.
        limit: Maximum number of clusters fetched from the database. The
            severity filter is applied afterwards, so fewer may be returned.

    Returns:
        ``{"items": [...], "total": <len(items)>, "period_hours": hours}``.

    Raises:
        HTTPException: 500 when the main cluster query or the assembly fails.
            UA and trend enrichment failures are logged and do not fail the
            request.
    """
    import logging

    log = logging.getLogger(__name__)
    severity_rank = {"LOW": 0, "MEDIUM": 1, "HIGH": 2, "CRITICAL": 3}
    # When this coroutine is awaited directly (not dispatched by FastAPI) and
    # min_severity is omitted, the argument is the Query(...) default object
    # rather than a string; treat that case as "no minimum".
    if isinstance(min_severity, str):
        min_rank = severity_rank.get(min_severity.strip().upper(), 0)
    else:
        min_rank = 0

    try:
        # Cluster by /24 subnet and keep one sample IP per cluster.
        # src_ip is rendered as a string and any '::ffff:' prefix (IPv4-mapped
        # IPv6 form) is stripped before the dotted components are split.
        cluster_query = """
WITH cleaned_ips AS (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
detected_at,
ja4,
country_code,
asn_number,
threat_level,
anomaly_score
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
),
subnet_groups AS (
SELECT
concat(
splitByChar('.', clean_ip)[1], '.',
splitByChar('.', clean_ip)[2], '.',
splitByChar('.', clean_ip)[3], '.0/24'
) AS subnet,
count() AS total_detections,
uniq(clean_ip) AS unique_ips,
min(detected_at) AS first_seen,
max(detected_at) AS last_seen,
argMax(ja4, detected_at) AS ja4,
argMax(country_code, detected_at) AS country_code,
argMax(asn_number, detected_at) AS asn_number,
argMax(threat_level, detected_at) AS threat_level,
avg(anomaly_score) AS avg_score,
argMax(clean_ip, detected_at) AS sample_ip
FROM cleaned_ips
GROUP BY subnet
HAVING total_detections >= 2
)
SELECT
subnet,
total_detections,
unique_ips,
first_seen,
last_seen,
ja4,
country_code,
asn_number,
threat_level,
avg_score,
sample_ip
FROM subnet_groups
ORDER BY avg_score ASC, total_detections DESC
LIMIT %(limit)s
"""
        result = db.query(cluster_query, {"hours": hours, "limit": limit})

        # Collect sample IPs so the UA lookup can be done in a single query.
        sample_ips = [str(row[10]) for row in result.result_rows if row[10]]

        # Primary user agent per sample IP (best-effort enrichment).
        ua_by_ip: dict = {}
        if sample_ips:
            # The IP list is bound as a query parameter instead of being
            # interpolated into the SQL text.
            ua_query = """
SELECT entity_value, arrayElement(user_agents, 1) AS top_ua
FROM view_dashboard_entities
WHERE entity_type = 'ip'
AND entity_value IN %(ips)s
AND notEmpty(user_agents)
GROUP BY entity_value, top_ua
ORDER BY entity_value
"""
            try:
                ua_result = db.query(ua_query, {"ips": sample_ips[:50]})
                for ua_row in ua_result.result_rows:
                    ua_key = str(ua_row[0])
                    # Keep the first non-empty UA seen for each IP.
                    if ua_key not in ua_by_ip and ua_row[1]:
                        ua_by_ip[ua_key] = str(ua_row[1])
            except Exception as exc:
                log.warning("Incident UA enrichment failed: %s", exc)

        # Trend: current window vs. the previous window of the same duration.
        # The CTE is bounded to the two windows so the whole table is not read.
        trend_query = """
WITH cleaned AS (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
detected_at,
concat(
splitByChar('.', clean_ip)[1], '.',
splitByChar('.', clean_ip)[2], '.',
splitByChar('.', clean_ip)[3], '.0/24'
) AS subnet
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours2)s HOUR
),
current_window AS (
SELECT subnet, count() AS cnt
FROM cleaned
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
GROUP BY subnet
),
prev_window AS (
SELECT subnet, count() AS cnt
FROM cleaned
WHERE detected_at >= now() - INTERVAL %(hours2)s HOUR
AND detected_at < now() - INTERVAL %(hours)s HOUR
GROUP BY subnet
)
SELECT c.subnet, c.cnt AS current_cnt, p.cnt AS prev_cnt
FROM current_window c
LEFT JOIN prev_window p ON c.subnet = p.subnet
"""
        trend_by_subnet: dict = {}
        try:
            trend_result = db.query(trend_query, {"hours": hours, "hours2": hours * 2})
            for tr in trend_result.result_rows:
                subnet_key = tr[0]
                curr = tr[1] or 0
                prev = tr[2] or 0
                if prev == 0:
                    # Nothing in the previous window: flag the cluster as new.
                    trend_by_subnet[subnet_key] = ("new", 100)
                else:
                    pct = round(((curr - prev) / prev) * 100)
                    trend_by_subnet[subnet_key] = ("up" if pct >= 0 else "down", abs(pct))
        except Exception as exc:
            # Clusters fall back to ("stable", 0) when the trend is unavailable.
            log.warning("Incident trend computation failed: %s", exc)

        clusters = []
        for row in result.result_rows:
            subnet = row[0]
            threat_level = row[8] or 'LOW'
            unique_ips = row[2] or 1
            avg_score = abs(row[9] or 0)
            sample_ip = row[10] if row[10] else subnet.split('/')[0]

            # Only the most recent threat level is available per cluster, so
            # these "counts" are 0/1 flags.
            critical_count = 1 if threat_level == 'CRITICAL' else 0
            high_count = 1 if threat_level == 'HIGH' else 0

            risk_score = min(100, round(
                (critical_count * 30) +
                (high_count * 20) +
                (unique_ips * 5) +
                (avg_score * 100)
            ))

            if critical_count > 0 or risk_score >= 80:
                severity = "CRITICAL"
            elif high_count > (row[1] or 1) * 0.3 or risk_score >= 60:
                severity = "HIGH"
            elif high_count > 0 or risk_score >= 40:
                severity = "MEDIUM"
            else:
                severity = "LOW"

            # Apply the caller's minimum severity (previously ignored).
            if severity_rank[severity] < min_rank:
                continue

            trend_dir, trend_pct = trend_by_subnet.get(subnet, ("stable", 0))
            primary_ua = ua_by_ip.get(str(sample_ip), "")

            clusters.append({
                "id": f"INC-{hashlib.md5(subnet.encode()).hexdigest()[:8].upper()}",
                "score": risk_score,
                "severity": severity,
                "total_detections": row[1],
                "unique_ips": row[2],
                "subnet": subnet,
                "sample_ip": sample_ip,
                "ja4": row[5] or "",
                "primary_ua": primary_ua,
                "primary_target": row[3].strftime('%H:%M') if row[3] else "Unknown",
                "countries": [{"code": row[6] or "XX", "percentage": 100}],
                "asn": str(row[7]) if row[7] else "",
                "first_seen": row[3].isoformat() if row[3] else "",
                "last_seen": row[4].isoformat() if row[4] else "",
                "trend": trend_dir,
                "trend_percentage": trend_pct,
            })

        return {
            "items": clusters,
            "total": len(clusters),
            "period_hours": hours
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{cluster_id}")
async def get_incident_details(cluster_id: str):
    """Placeholder for per-incident details.

    Not implemented yet: the endpoint always answers 501 and points callers
    to the cluster listing endpoint.

    Raises:
        HTTPException: always, with status 501.
    """
    not_implemented = HTTPException(
        status_code=501,
        detail="Détails par incident non encore implémentés. Utilisez /api/incidents/clusters pour la liste."
    )
    raise not_implemented
@router.post("/{cluster_id}/classify")
async def classify_incident(
    cluster_id: str,
    label: str,
    tags: Optional[List[str]] = None,
    comment: str = ""
):
    """Placeholder for quick incident classification.

    Not implemented yet: the endpoint always answers 501 and points callers
    to the per-IP classification endpoint. The parameters are accepted but
    not used.

    Args:
        cluster_id: Identifier of the incident cluster.
        label: Classification label.
        tags: Optional list of tags; annotated as Optional because it
            defaults to None.
        comment: Free-form comment.

    Raises:
        HTTPException: always, with status 501.
    """
    raise HTTPException(
        status_code=501,
        detail="Classification par incident non encore implémentée. Utilisez /api/analysis/{ip}/classify."
    )
@router.get("")
async def list_incidents(
    status: str = Query("active", description="Statut des incidents"),
    severity: Optional[str] = Query(None, description="Filtrer par sévérité (LOW/MEDIUM/HIGH/CRITICAL)"),
    hours: int = Query(24, ge=1, le=168)
):
    """List incidents, optionally filtered by exact severity.

    Delegates to ``get_incident_clusters`` (up to 100 clusters) and applies the
    ``severity`` filter on the returned items.

    Args:
        status: Accepted for API compatibility; not applied by this handler.
        severity: When given, only clusters whose severity equals this value
            (case-insensitive) are kept.
        hours: Size of the time window, in hours.

    Returns:
        ``{"items": [...], "total": <len(items)>, "period_hours": hours}``.

    Raises:
        HTTPException: the error raised by the delegate is propagated as-is;
            any other failure becomes a 500.
    """
    try:
        # Pass min_severity explicitly: on a direct call an omitted argument
        # would be the Query(...) default object, not the string "LOW".
        result = await get_incident_clusters(hours=hours, min_severity="LOW", limit=100)
        items = result["items"]
        if severity:
            sev_upper = severity.upper()
            items = [c for c in items if c.get("severity") == sev_upper]
        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }
    except HTTPException:
        # Do not re-wrap the delegate's error ("Erreur: 500: Erreur: ...").
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,185 @@
"""
Endpoint d'investigation enrichie pour une IP donnée.
Agrège en une seule requête les données provenant de toutes les sources :
ml_detected_anomalies, view_form_bruteforce_detected, view_tcp_spoofing_detected,
agg_host_ip_ja4_1h (rotation JA4), view_ip_recurrence, view_ai_features_1h.
"""
from fastapi import APIRouter, HTTPException
from ..database import db
from ..services.tcp_fingerprint import fingerprint_os, detect_spoof, declared_os_from_ua
router = APIRouter(prefix="/api/investigation", tags=["investigation"])
@router.get(
    "/{ip}/summary",
    summary="Synthèse complète d'une IP",
    response_description="Score de risque 0-100, détections ML, brute-force, spoofing TCP, rotation JA4, persistance et timeline 24h",
)
async def get_ip_full_summary(ip: str):
    """Build a full investigation summary for one IP in a single call.

    The IP is normalised by removing any ``::ffff:`` prefix, so both
    ``::ffff:x.x.x.x`` and ``x.x.x.x`` are accepted. Six queries are run (ML
    detections, brute force, TCP fingerprint, JA4 rotation, persistence,
    24h timeline) and combined into a heuristic risk score capped at 100.

    Args:
        ip: Address to investigate.

    Returns:
        Dict with keys ``ip``, ``risk_score``, ``ml``, ``bruteforce``,
        ``tcp_spoofing``, ``ja4_rotation``, ``persistence``, ``timeline_24h``.

    Raises:
        HTTPException: 500 when any query or computation fails.
    """
    clean_ip = ip.replace("::ffff:", "").strip()
    try:
        # ── 1. ML score / features ─────────────────────────────────────────────
        ml_sql = """
SELECT
max(abs(anomaly_score)) AS max_score,
any(threat_level) AS threat_level,
any(bot_name) AS bot_name,
count() AS total_detections,
uniq(host) AS distinct_hosts,
uniq(ja4) AS distinct_ja4
FROM mabase_prod.ml_detected_anomalies
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
"""
        ml_res = db.query(ml_sql, {"ip": clean_ip})
        ml_row = ml_res.result_rows[0] if ml_res.result_rows else None
        ml_data = {
            "max_score": round(float(ml_row[0] or 0), 2) if ml_row else 0,
            "threat_level": str(ml_row[1] or "") if ml_row else "",
            # The bot_name column is exposed under the "attack_type" key.
            "attack_type": str(ml_row[2] or "") if ml_row else "",
            "total_detections": int(ml_row[3] or 0) if ml_row else 0,
            "distinct_hosts": int(ml_row[4] or 0) if ml_row else 0,
            "distinct_ja4": int(ml_row[5] or 0) if ml_row else 0,
        }

        # ── 2. Brute force ─────────────────────────────────────────────────────
        bf_sql = """
SELECT
uniq(host) AS hosts_attacked,
sum(hits) AS total_hits,
sum(query_params_count) AS total_params,
groupArray(3)(host) AS top_hosts
FROM mabase_prod.view_form_bruteforce_detected
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
"""
        bf_res = db.query(bf_sql, {"ip": clean_ip})
        bf_row = bf_res.result_rows[0] if bf_res.result_rows else None
        bf_data = {
            "active": bool(bf_row and int(bf_row[1] or 0) > 0),
            "hosts_attacked": int(bf_row[0] or 0) if bf_row else 0,
            "total_hits": int(bf_row[1] or 0) if bf_row else 0,
            "total_params": int(bf_row[2] or 0) if bf_row else 0,
            "top_hosts": [str(h) for h in (bf_row[3] or [])] if bf_row else [],
        }

        # ── 3. TCP spoofing — multi-signal fingerprinting ─────────────────────
        tcp_sql = """
SELECT
any(tcp_ttl_raw) AS ttl,
any(tcp_win_raw) AS win,
any(tcp_scale_raw) AS scale,
any(tcp_mss_raw) AS mss,
any(first_ua) AS ua
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
AND window_start >= now() - INTERVAL 24 HOUR
AND tcp_ttl_raw > 0
LIMIT 1
"""
        tcp_res = db.query(tcp_sql, {"ip": clean_ip})
        tcp_data = {"detected": False, "tcp_ttl": None, "suspected_os": None}
        tcp_row = tcp_res.result_rows[0] if tcp_res.result_rows else None
        ttl = int(tcp_row[0] or 0) if tcp_row else 0
        # An aggregate without GROUP BY can yield a row of default values even
        # when no record matched. The query only selects rows with
        # tcp_ttl_raw > 0, so a zero TTL means "no data": skip fingerprinting
        # instead of analysing zeros.
        if ttl > 0:
            win = int(tcp_row[1] or 0)
            scale = int(tcp_row[2] or 0)
            mss = int(tcp_row[3] or 0)
            ua = str(tcp_row[4] or "")
            fp = fingerprint_os(ttl, win, scale, mss)
            dec_os = declared_os_from_ua(ua)
            spoof_res = detect_spoof(fp, dec_os)
            tcp_data = {
                "detected": spoof_res.is_spoof,
                "tcp_ttl": ttl,
                "tcp_mss": mss,
                "tcp_win_scale": scale,
                "initial_ttl": fp.initial_ttl,
                "hop_count": fp.hop_count,
                "suspected_os": fp.os_name,
                "declared_os": dec_os,
                "confidence": fp.confidence,
                "network_path": fp.network_path,
                "is_bot_tool": fp.is_bot_tool,
                "spoof_reason": spoof_res.reason,
            }

        # ── 4. JA4 rotation ────────────────────────────────────────────────────
        rot_sql = """
SELECT distinct_ja4_count, total_hits
FROM mabase_prod.view_host_ip_ja4_rotation
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
LIMIT 1
"""
        rot_res = db.query(rot_sql, {"ip": clean_ip})
        rot_data = {"rotating": False, "distinct_ja4_count": 0}
        if rot_res.result_rows:
            row = rot_res.result_rows[0]
            cnt = int(row[0] or 0)
            rot_data = {"rotating": cnt > 1, "distinct_ja4_count": cnt, "total_hits": int(row[1] or 0)}

        # ── 5. Persistence ─────────────────────────────────────────────────────
        pers_sql = """
SELECT recurrence, worst_score, worst_threat_level, first_seen, last_seen
FROM mabase_prod.view_ip_recurrence
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
LIMIT 1
"""
        pers_res = db.query(pers_sql, {"ip": clean_ip})
        pers_data = {"persistent": False, "recurrence": 0}
        if pers_res.result_rows:
            row = pers_res.result_rows[0]
            pers_data = {
                "persistent": True,
                "recurrence": int(row[0] or 0),
                "worst_score": round(float(row[1] or 0), 2),
                "worst_threat_level": str(row[2] or ""),
                "first_seen": str(row[3]),
                "last_seen": str(row[4]),
            }

        # ── 6. 24h timeline ────────────────────────────────────────────────────
        tl_sql = """
SELECT
toHour(window_start) AS hour,
sum(hits) AS hits,
groupUniqArray(3)(ja4) AS ja4s
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
AND window_start >= now() - INTERVAL 24 HOUR
GROUP BY hour
ORDER BY hour ASC
"""
        tl_res = db.query(tl_sql, {"ip": clean_ip})
        timeline = [
            {"hour": int(r[0]), "hits": int(r[1]), "ja4s": [str(j) for j in (r[2] or [])]}
            for r in tl_res.result_rows
        ]

        # ── Global risk score (heuristic) ──────────────────────────────────────
        risk = 0
        risk += min(50, ml_data["max_score"] * 50)
        if bf_data["active"]:
            risk += 20
        if tcp_data["detected"]:
            # A known scanning tool weighs more than a plain OS mismatch.
            risk += 30 if tcp_data.get("is_bot_tool") else 15
        if rot_data["rotating"]:
            risk += min(15, rot_data["distinct_ja4_count"] * 3)
        if pers_data["persistent"]:
            risk += min(10, pers_data["recurrence"] * 2)
        risk = min(100, round(risk))

        return {
            "ip": clean_ip,
            "risk_score": risk,
            "ml": ml_data,
            "bruteforce": bf_data,
            "tcp_spoofing": tcp_data,
            "ja4_rotation": rot_data,
            "persistence": pers_data,
            "timeline_24h": timeline,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,175 @@
"""
Endpoints pour les métriques du dashboard
"""
from fastapi import APIRouter, HTTPException
from ..database import db
from ..models import MetricsResponse, MetricsSummary, TimeSeriesPoint
router = APIRouter(prefix="/api/metrics", tags=["metrics"])
@router.get("", response_model=MetricsResponse, summary="Métriques globales du dashboard")
async def get_metrics():
    """Return the global dashboard metrics for the last 24 hours.

    Combines a summary row (totals per threat level, known bots, anomalies,
    unique IPs), an hourly time series and the per-level distribution derived
    from the summary.

    Returns:
        MetricsResponse with ``summary``, ``timeseries`` and
        ``threat_distribution``.

    Raises:
        HTTPException: 404 when the summary query yields no row; 500 on any
            other failure.
    """
    try:
        # Metrics summary
        summary_query = """
SELECT
count() AS total_detections,
countIf(threat_level = 'CRITICAL') AS critical_count,
countIf(threat_level = 'HIGH') AS high_count,
countIf(threat_level = 'MEDIUM') AS medium_count,
countIf(threat_level = 'LOW') AS low_count,
countIf(bot_name != '') AS known_bots_count,
countIf(bot_name = '') AS anomalies_count,
uniq(src_ip) AS unique_ips
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 24 HOUR
"""
        summary_result = db.query(summary_query)
        summary_row = summary_result.result_rows[0] if summary_result.result_rows else None
        if not summary_row:
            raise HTTPException(status_code=404, detail="Aucune donnée disponible")

        summary = MetricsSummary(
            total_detections=summary_row[0],
            critical_count=summary_row[1],
            high_count=summary_row[2],
            medium_count=summary_row[3],
            low_count=summary_row[4],
            known_bots_count=summary_row[5],
            anomalies_count=summary_row[6],
            unique_ips=summary_row[7]
        )

        # Time series (per hour)
        timeseries_query = """
SELECT
toStartOfHour(detected_at) AS hour,
count() AS total,
countIf(threat_level = 'CRITICAL') AS critical,
countIf(threat_level = 'HIGH') AS high,
countIf(threat_level = 'MEDIUM') AS medium,
countIf(threat_level = 'LOW') AS low
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 24 HOUR
GROUP BY hour
ORDER BY hour
"""
        timeseries_result = db.query(timeseries_query)
        timeseries = [
            TimeSeriesPoint(
                hour=row[0],
                total=row[1],
                critical=row[2],
                high=row[3],
                medium=row[4],
                low=row[5]
            )
            for row in timeseries_result.result_rows
        ]

        # Distribution per threat level
        threat_distribution = {
            "CRITICAL": summary.critical_count,
            "HIGH": summary.high_count,
            "MEDIUM": summary.medium_count,
            "LOW": summary.low_count
        }

        return MetricsResponse(
            summary=summary,
            timeseries=timeseries,
            threat_distribution=threat_distribution
        )
    except HTTPException:
        # Keep the intended status (e.g. the 404 above) instead of turning it
        # into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la récupération des métriques: {str(e)}")
@router.get("/threats")
async def get_threat_distribution():
    """Return the detection breakdown per threat level over the last 24 hours.

    Returns:
        ``{"items": [{"threat_level", "count", "percentage"}, ...]}`` ordered
        by descending count.

    Raises:
        HTTPException: 500 when the query fails.
    """
    try:
        distribution_sql = """
SELECT
threat_level,
count() AS count,
round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 24 HOUR
GROUP BY threat_level
ORDER BY count DESC
"""
        rows = db.query(distribution_sql).result_rows
        breakdown = []
        for level, occurrences, share in ((r[0], r[1], r[2]) for r in rows):
            breakdown.append(
                {"threat_level": level, "count": occurrences, "percentage": share}
            )
        return {"items": breakdown}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/baseline")
async def get_metrics_baseline():
    """Compare the last 24 hours with the 24 hours before them.

    Returns:
        Dict with ``total_detections``, ``unique_ips`` and ``critical_alerts``;
        each holds ``today``, ``yesterday`` and ``pct_change``.

    Raises:
        HTTPException: 500 when the query or the computation fails.
    """
    def pct_change(today: int, yesterday: int) -> float:
        # With an empty baseline, any activity counts as +100 %, none as 0 %.
        if yesterday != 0:
            return round((today - yesterday) / yesterday * 100, 1)
        return 100.0 if today > 0 else 0.0

    def _comparison(today: int, yesterday: int) -> dict:
        return {
            "today": today,
            "yesterday": yesterday,
            "pct_change": pct_change(today, yesterday),
        }

    try:
        baseline_sql = """
SELECT
countIf(detected_at >= now() - INTERVAL 24 HOUR) AS today_total,
countIf(detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_total,
uniqIf(src_ip, detected_at >= now() - INTERVAL 24 HOUR) AS today_ips,
uniqIf(src_ip, detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_ips,
countIf(threat_level = 'CRITICAL' AND detected_at >= now() - INTERVAL 24 HOUR) AS today_critical,
countIf(threat_level = 'CRITICAL' AND detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_critical
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 48 HOUR
"""
        rows = db.query(baseline_sql).result_rows
        first = rows[0] if rows else None
        # Columns come in (today, yesterday) pairs; a missing row means zeros.
        if first:
            counts = [int(first[i] or 0) for i in range(6)]
        else:
            counts = [0] * 6
        return {
            "total_detections": _comparison(counts[0], counts[1]),
            "unique_ips": _comparison(counts[2], counts[3]),
            "critical_alerts": _comparison(counts[4], counts[5]),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur baseline: {str(e)}")

View File

@ -0,0 +1,425 @@
"""
Endpoints pour les features ML / IA (scores d'anomalies, radar, scatter)
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
router = APIRouter(prefix="/api/ml", tags=["ml_features"])
def _attack_type(fuzzing_index: float, hit_velocity: float,
is_fake_nav: int, ua_ch_mismatch: int) -> str:
if fuzzing_index > 50:
return "brute_force"
if hit_velocity > 1.0:
return "flood"
if is_fake_nav:
return "scraper"
if ua_ch_mismatch:
return "spoofing"
return "scanner"
@router.get("/top-anomalies")
async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
    """Return the top anomalous IPs over the last 24 hours.

    Queries agg_host_ip_ja4_1h directly, LEFT JOINed with
    agg_header_fingerprint_1h, instead of going through view_ai_features_1h
    (the original note says this avoids the view's window functions).
    Rows are ordered by descending fuzzing index.

    Args:
        limit: Maximum number of IPs returned.

    Returns:
        ``{"items": [...]}`` with one dict per IP.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    def _to_item(rec) -> dict:
        fuzz = float(rec[4] or 0)
        speed = float(rec[5] or 0)
        mismatch = int(rec[12] or 0)
        return {
            "ip": str(rec[0]),
            "ja4": str(rec[1]),
            "host": str(rec[2]),
            "hits": int(rec[3] or 0),
            "fuzzing_index": fuzz,
            "hit_velocity": speed,
            "head_ratio": float(rec[6] or 0),
            "sec_fetch_absence": float(rec[7] or 0),
            "tls12_ratio": float(rec[8] or 0),
            "generic_accept_ratio": float(rec[9] or 0),
            "country": str(rec[10] or ""),
            "asn_name": str(rec[11] or ""),
            "ua_ch_mismatch": mismatch,
            "browser_score": int(rec[13] or 0),
            "asn_label": str(rec[14] or ""),
            "bot_name": str(rec[15] or ""),
            # The fake-navigation signal is not selected here, hence the 0.
            "attack_type": _attack_type(fuzz, speed, 0, mismatch),
        }

    try:
        anomalies_sql = """
SELECT
replaceRegexpAll(toString(a.src_ip), '^::ffff:', '') AS ip,
any(a.ja4) AS ja4,
any(a.host) AS host,
sum(a.hits) AS hits,
round(uniqMerge(a.uniq_query_params)
/ greatest(uniqMerge(a.uniq_paths), 1), 4) AS fuzzing_index,
round(sum(a.hits)
/ greatest(dateDiff('second', min(a.first_seen), max(a.last_seen)), 1), 2) AS hit_velocity,
round(sum(a.count_head) / greatest(sum(a.hits), 1), 4) AS head_ratio,
round(sum(a.count_no_sec_fetch) / greatest(sum(a.hits), 1), 4) AS sec_fetch_absence,
round(sum(a.tls12_count) / greatest(sum(a.hits), 1), 4) AS tls12_ratio,
round(sum(a.count_generic_accept) / greatest(sum(a.hits), 1), 4) AS generic_accept_ratio,
any(a.src_country_code) AS country,
any(a.src_as_name) AS asn_name,
max(h.ua_ch_mismatch) AS ua_ch_mismatch,
max(h.modern_browser_score) AS browser_score,
dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(any(a.src_asn)), 'unknown') AS asn_label,
coalesce(
nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(any(a.ja4)), ''), ''),
''
) AS bot_name
FROM mabase_prod.agg_host_ip_ja4_1h a
LEFT JOIN mabase_prod.agg_header_fingerprint_1h h
ON a.src_ip = h.src_ip AND a.window_start = h.window_start
WHERE a.window_start >= now() - INTERVAL 24 HOUR
GROUP BY a.src_ip
ORDER BY fuzzing_index DESC
LIMIT %(limit)s
"""
        fetched = db.query(anomalies_sql, {"limit": limit})
        return {"items": [_to_item(rec) for rec in fetched.result_rows]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ip/{ip}/radar")
async def get_ip_radar(ip: str):
    """Return the radar scores (8 anomaly dimensions) for one IP.

    Averages the features of view_ai_features_1h over the last 24 hours and
    scales them to a 0-100 range.

    Args:
        ip: Address to look up. A ``::ffff:`` prefix is removed before the
            lookup, matching what the SQL filter does on the stored address.

    Returns:
        Dict with ``ip`` (as received) and eight ``*_score`` floats.

    Raises:
        HTTPException: 404 when no feature row exists for the IP in the
            window; 500 on any other failure.
    """
    import math

    def _f(v) -> float:
        # NULL and NaN averages are reported as 0 so the payload stays
        # JSON-serialisable.
        if v is None:
            return 0.0
        value = float(v)
        return 0.0 if math.isnan(value) else value

    lookup_ip = ip.replace("::ffff:", "").strip()
    try:
        # count() is selected because an aggregate without GROUP BY can return
        # a row even when nothing matched; the row count is the only reliable
        # "not found" signal.
        sql = """
SELECT
avg(fuzzing_index) AS fuzzing_index,
avg(hit_velocity) AS hit_velocity,
avg(is_fake_navigation) AS is_fake_navigation,
avg(ua_ch_mismatch) AS ua_ch_mismatch,
avg(sni_host_mismatch) AS sni_host_mismatch,
avg(orphan_ratio) AS orphan_ratio,
avg(path_diversity_ratio) AS path_diversity_ratio,
avg(anomalous_payload_ratio) AS anomalous_payload_ratio,
count() AS matched_rows
FROM mabase_prod.view_ai_features_1h
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
AND window_start >= now() - INTERVAL 24 HOUR
"""
        result = db.query(sql, {"ip": lookup_ip})
        row = result.result_rows[0] if result.result_rows else None
        if row is None or int(row[8] or 0) == 0:
            raise HTTPException(status_code=404, detail="IP not found")

        return {
            "ip": ip,
            "fuzzing_score": min(100.0, _f(row[0])),
            "velocity_score": min(100.0, _f(row[1]) * 100),
            "fake_nav_score": _f(row[2]) * 100,
            "ua_mismatch_score": _f(row[3]) * 100,
            "sni_mismatch_score": _f(row[4]) * 100,
            "orphan_score": min(100.0, _f(row[5]) * 100),
            # High path diversity means low repetition, hence the inversion.
            "path_repetition_score": max(0.0, 100 - _f(row[6]) * 100),
            "payload_anomaly_score": min(100.0, _f(row[7]) * 100),
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/score-distribution")
async def get_score_distribution():
    """Return the distribution of all ML scores from ml_all_scores (3 days).

    A single query grouped by (threat_level, model_name) feeds both the
    per-model breakdown and the global totals, so the table is scanned once.

    Returns:
        ``{"by_model": {model: [{"threat_level", "total", "avg_score",
        "min_score"}, ...]}, "totals": {...}}``. ``grand_total`` is the real
        row count (0 when the table is empty); percentages are 0.0 in that
        case.

    Raises:
        HTTPException: 500 when the query or the aggregation fails.
    """
    try:
        # Single scan: one row per (threat_level, model_name); the global
        # totals are summed in Python from the conditional counts.
        sql = """
SELECT
threat_level,
model_name,
count() AS total,
round(avg(anomaly_score), 4) AS avg_score,
round(min(anomaly_score), 4) AS min_score,
countIf(threat_level = 'NORMAL') AS normal_count,
countIf(threat_level NOT IN ('NORMAL','KNOWN_BOT')) AS anomaly_count,
countIf(threat_level = 'KNOWN_BOT') AS bot_count
FROM mabase_prod.ml_all_scores
WHERE detected_at >= now() - INTERVAL 3 DAY
GROUP BY threat_level, model_name
ORDER BY model_name, total DESC
"""
        result = db.query(sql)

        by_model: dict = {}
        grand_total = 0
        total_normal = total_anomaly = total_bot = 0
        for row in result.result_rows:
            level = str(row[0])
            model = str(row[1])
            total = int(row[2])
            grand_total += total
            total_normal += int(row[5] or 0)
            total_anomaly += int(row[6] or 0)
            total_bot += int(row[7] or 0)
            by_model.setdefault(model, []).append({
                "threat_level": level,
                "total": total,
                "avg_score": float(row[3] or 0),
                "min_score": float(row[4] or 0),
            })

        # Only the divisor is clamped; the reported grand_total stays truthful
        # (it used to read 1 when there was no data at all).
        denominator = max(grand_total, 1)
        return {
            "by_model": by_model,
            "totals": {
                "normal": total_normal,
                "anomaly": total_anomaly,
                "known_bot": total_bot,
                "grand_total": grand_total,
                "normal_pct": round(total_normal / denominator * 100, 1),
                "anomaly_pct": round(total_anomaly / denominator * 100, 1),
                "bot_pct": round(total_bot / denominator * 100, 1),
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/score-trends")
async def get_score_trends(hours: int = Query(72, ge=1, le=168)):
    """Return the hourly evolution of ML scores from ml_all_scores.

    For every (hour, model) bucket the response carries the normal / anomaly /
    known-bot counts and the average anomaly score of the anomalous rows.

    Args:
        hours: Size of the time window, in hours.

    Returns:
        ``{"points": [...], "hours": hours}`` ordered by hour then model.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    import math

    try:
        sql = """
SELECT
toStartOfHour(window_start) AS hour,
model_name,
countIf(threat_level = 'NORMAL') AS normal_count,
countIf(threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')) AS anomaly_count,
countIf(threat_level = 'KNOWN_BOT') AS bot_count,
round(avgIf(anomaly_score, threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')), 4) AS avg_anomaly_score
FROM mabase_prod.ml_all_scores
WHERE window_start >= now() - INTERVAL %(hours)s HOUR
GROUP BY hour, model_name
ORDER BY hour ASC, model_name
"""
        result = db.query(sql, {"hours": hours})

        points = []
        for row in result.result_rows:
            avg_score = float(row[5] or 0)
            # A bucket without any anomalous row averages to NaN, which is
            # truthy (so `or 0` does not catch it) and is rejected by strict
            # JSON encoders. Report 0.0 instead.
            if math.isnan(avg_score):
                avg_score = 0.0
            points.append({
                "hour": str(row[0]),
                "model": str(row[1]),
                "normal_count": int(row[2] or 0),
                "anomaly_count": int(row[3] or 0),
                "bot_count": int(row[4] or 0),
                "avg_anomaly_score": avg_score,
            })
        return {"points": points, "hours": hours}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/b-features")
async def get_b_features(limit: int = Query(50, ge=1, le=200)):
    """Return aggregated B-features (pure HTTP signals) for suspicious IPs.

    Source: agg_host_ip_ja4_1h over the last 24 hours, one row per source IP.
    Only IPs exceeding at least one ratio threshold are kept, ordered by the
    sum of head, sec-fetch-absence, generic-accept and missing-accept-encoding
    ratios.

    Args:
        limit: Maximum number of IPs returned.

    Returns:
        ``{"items": [...], "total": <len(items)>}``.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    # Ratio columns, in SELECT order, starting at column index 5.
    ratio_fields = (
        "head_ratio",
        "sec_fetch_absence",
        "tls12_ratio",
        "generic_accept_ratio",
        "http10_ratio",
        "missing_accept_enc_ratio",
        "http_scheme_ratio",
    )
    try:
        features_sql = """
SELECT ip, ja4, country, asn_name, total_hits AS hits,
head_ratio, sec_fetch_absence, tls12_ratio, generic_accept_ratio, http10_ratio,
missing_accept_enc_ratio, http_scheme_ratio
FROM (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
any(ja4) AS ja4,
any(src_country_code) AS country,
any(src_as_name) AS asn_name,
sum(hits) AS total_hits,
round(sum(count_head) / greatest(sum(hits),1), 4) AS head_ratio,
round(sum(count_no_sec_fetch) / greatest(sum(hits),1), 4) AS sec_fetch_absence,
round(sum(tls12_count) / greatest(sum(hits),1), 4) AS tls12_ratio,
round(sum(count_generic_accept) / greatest(sum(hits),1), 4) AS generic_accept_ratio,
round(sum(count_http10) / greatest(sum(hits),1), 4) AS http10_ratio,
round(sum(count_no_accept_enc) / greatest(sum(hits),1), 4) AS missing_accept_enc_ratio,
round(sum(count_http_scheme) / greatest(sum(hits),1), 4) AS http_scheme_ratio
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY src_ip
)
WHERE sec_fetch_absence > 0.5 OR generic_accept_ratio > 0.3
OR head_ratio > 0.1 OR tls12_ratio > 0.5 OR missing_accept_enc_ratio > 0.3
ORDER BY (head_ratio + sec_fetch_absence + generic_accept_ratio + missing_accept_enc_ratio) DESC
LIMIT %(limit)s
"""
        fetched = db.query(features_sql, {"limit": limit})
        items = []
        for rec in fetched.result_rows:
            entry = {
                "ip": str(rec[0]),
                "ja4": str(rec[1] or ""),
                "country": str(rec[2] or ""),
                "asn_name": str(rec[3] or ""),
                "hits": int(rec[4] or 0),
            }
            for column, field in enumerate(ratio_fields, start=5):
                entry[field] = float(rec[column] or 0)
            items.append(entry)
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/campaigns")
async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Query(20, ge=1, le=100)):
    """Return anomaly groups: DBSCAN campaigns, or subnet clusters as fallback.

    Rows with ``campaign_id >= 0`` are grouped per campaign. When that yields
    nothing, detections with MEDIUM/HIGH/CRITICAL threat level are grouped by
    /24 subnet and subnets with at least 3 distinct IPs are reported instead.

    Args:
        hours: Size of the time window, in hours.
        limit: Maximum number of groups returned by each query.

    Returns:
        ``{"campaigns": [...], "total": <len>, "dbscan_active": bool,
        "hours": hours}``.

    Raises:
        HTTPException: 500 when a query or the row conversion fails.
    """
    bind = {"hours": hours, "limit": limit}
    try:
        # Step 1: real campaigns (campaign_id >= 0).
        campaign_sql = """
SELECT
campaign_id,
count() AS total_detections,
uniq(src_ip) AS unique_ips,
any(threat_level) AS dominant_threat,
groupUniqArray(3)(threat_level) AS threat_levels,
groupUniqArray(3)(bot_name) AS bot_names,
min(detected_at) AS first_seen,
max(detected_at) AS last_seen
FROM mabase_prod.ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
AND campaign_id >= 0
GROUP BY campaign_id
ORDER BY total_detections DESC
LIMIT %(limit)s
"""
        campaign_rows = db.query(campaign_sql, bind).result_rows
        campaigns = [
            {
                "id": f"C{rec[0]}",
                "campaign_id": int(rec[0]),
                "total_detections": int(rec[1]),
                "unique_ips": int(rec[2]),
                "dominant_threat": str(rec[3] or ""),
                "threat_levels": list(rec[4] or []),
                "bot_names": list(rec[5] or []),
                "first_seen": str(rec[6]),
                "last_seen": str(rec[7]),
                "source": "dbscan",
            }
            for rec in campaign_rows
        ]

        # Step 2: subnet-based fallback when no campaign was found.
        if not campaigns:
            subnet_sql = """
SELECT
IPv4CIDRToRange(toIPv4(replaceRegexpAll(toString(src_ip),'^::ffff:','')), 24).1 AS subnet,
count() AS total_detections,
uniq(src_ip) AS unique_ips,
groupArray(3)(threat_level) AS threat_levels,
any(bot_name) AS bot_name,
any(ja4) AS sample_ja4,
min(detected_at) AS first_seen,
max(detected_at) AS last_seen
FROM mabase_prod.ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
AND threat_level IN ('HIGH','CRITICAL','MEDIUM')
GROUP BY subnet
HAVING unique_ips >= 3
ORDER BY total_detections DESC
LIMIT %(limit)s
"""
            subnet_rows = db.query(subnet_sql, bind).result_rows
            for position, rec in enumerate(subnet_rows, start=1):
                campaigns.append({
                    "id": f"S{position:03d}",
                    # -1 marks a synthetic cluster, not a DBSCAN campaign.
                    "campaign_id": -1,
                    "subnet": str(rec[0]) + "/24",
                    "total_detections": int(rec[1]),
                    "unique_ips": int(rec[2]),
                    "dominant_threat": str((rec[3] or [""])[0]),
                    "threat_levels": list(rec[3] or []),
                    "bot_names": [str(rec[4] or "")],
                    "sample_ja4": str(rec[5] or ""),
                    "first_seen": str(rec[6]),
                    "last_seen": str(rec[7]),
                    "source": "subnet_cluster",
                })

        return {
            "campaigns": campaigns,
            "total": len(campaigns),
            "dbscan_active": any(entry["campaign_id"] >= 0 for entry in campaigns),
            "hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/scatter")
async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
    """Return scatter-plot points (fuzzing_index x hit_velocity) per IP.

    Reads agg_host_ip_ja4_1h directly over the last 24 hours rather than
    view_ai_features_1h. The query also selects head_ratio and correlated,
    which are not included in the response.

    Args:
        limit: Maximum number of points returned.

    Returns:
        ``{"points": [...]}`` ordered by descending fuzzing index.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    def _to_point(rec) -> dict:
        fuzz = float(rec[2] or 0)
        speed = float(rec[3] or 0)
        return {
            "ip": str(rec[0]),
            "ja4": str(rec[1]),
            "fuzzing_index": fuzz,
            "hit_velocity": speed,
            "hits": int(rec[4] or 0),
            # Neither fake-nav nor UA-mismatch signals are available here.
            "attack_type": _attack_type(fuzz, speed, 0, 0),
        }

    try:
        scatter_sql = """
SELECT
ip,
ja4,
round(fuzzing_index, 4) AS fuzzing_index,
round(total_hits / greatest(dateDiff('second', min_first, max_last), 1), 2) AS hit_velocity,
total_hits AS hits,
round(total_count_head / greatest(total_hits, 1), 4) AS head_ratio,
correlated
FROM (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
any(ja4) AS ja4,
uniqMerge(uniq_query_params) / greatest(uniqMerge(uniq_paths), 1) AS fuzzing_index,
sum(hits) AS total_hits,
min(first_seen) AS min_first,
max(last_seen) AS max_last,
sum(count_head) AS total_count_head,
max(correlated_raw) AS correlated
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY src_ip
)
ORDER BY fuzzing_index DESC
LIMIT %(limit)s
"""
        fetched = db.query(scatter_sql, {"limit": limit})
        return {"points": [_to_point(rec) for rec in fetched.result_rows]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,125 @@
"""
Routes pour la réputation IP (bases de données publiques)
"""
from fastapi import APIRouter, HTTPException, Path
from typing import Dict, Any
import re
from ..services.reputation_ip import get_reputation_service
router = APIRouter(prefix="/api/reputation", tags=["Reputation"])
# Pattern de validation d'IP (IPv4)
IP_PATTERN = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
def is_valid_ipv4(ip: str) -> bool:
    """Return True when *ip* is a dotted-quad IPv4 address.

    The string must consist of exactly four parts separated by dots, each
    made of 1 to 3 ASCII digits with a value between 0 and 255. Leading zeros
    are accepted (e.g. ``"01.2.3.4"``), as before.

    Unlike a ``^...$`` regex match, this rejects a trailing newline
    (``"1.2.3.4\\n"``), and non-ASCII digit characters are refused even though
    ``int()`` would parse them.
    """
    parts = ip.split('.')
    if len(parts) != 4:
        return False
    for part in parts:
        # isascii() + isdigit() restricts the part to the characters 0-9.
        if not (1 <= len(part) <= 3 and part.isascii() and part.isdigit()):
            return False
        if int(part) > 255:
            return False
    return True
@router.get("/ip/{ip_address}", summary="Réputation complète d'une IP")
async def get_ip_reputation(
    ip_address: str = Path(..., description="Adresse IP à vérifier")
) -> Dict[str, Any]:
    """Fetch the reputation of an IPv4 address from public databases.

    Sources used (no API key), per the original documentation:
    - IP-API.com: geolocation + proxy/hosting detection
    - IPinfo.io: ASN + organisation

    The result is whatever the reputation service returns. The original
    documentation describes it as a dict with ``ip``, ``timestamp``,
    ``sources`` and an ``aggregated`` section (is_proxy, is_hosting, is_vpn,
    is_tor, threat_score 0-100, threat_level clean/low/medium/high/critical,
    country, asn, asn_org, org, warnings).

    Args:
        ip_address: IPv4 address to check, in ``x.x.x.x`` form.

    Raises:
        HTTPException: 400 when the address is not a valid IPv4 string;
            500 when the reputation lookup fails.
    """
    # Reject anything that is not a dotted-quad IPv4 address.
    address_ok = is_valid_ipv4(ip_address)
    if not address_ok:
        raise HTTPException(
            status_code=400,
            detail=f"Adresse IP invalide: {ip_address}. Format attendu: x.x.x.x"
        )

    try:
        # Obtain the reputation service and query its sources.
        service = get_reputation_service()
        return await service.get_reputation(ip_address)
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Erreur lors de la vérification de réputation: {str(e)}"
        )
@router.get("/ip/{ip_address}/summary", summary="Réputation simplifiée d'une IP")
async def get_ip_reputation_summary(
    ip_address: str = Path(..., description="Adresse IP à vérifier")
) -> Dict[str, Any]:
    """
    Return a condensed reputation summary for an IPv4 address.

    Only the essential fields of the ``aggregated`` section of the full
    reputation result are returned, which makes this endpoint convenient for
    quick display in tables.

    Raises:
        HTTPException: 400 when ``ip_address`` is not a valid IPv4 address,
            500 when the reputation lookup raises.
    """
    if not is_valid_ipv4(ip_address):
        raise HTTPException(
            status_code=400,
            detail=f"Adresse IP invalide: {ip_address}"
        )

    try:
        service = get_reputation_service()
        report = await service.get_reputation(ip_address)

        # Everything exposed here comes from the aggregated section.
        agg = report.get('aggregated', {})
        summary: Dict[str, Any] = {'ip': ip_address}
        summary['threat_level'] = agg.get('threat_level', 'unknown')
        summary['threat_score'] = agg.get('threat_score', 0)
        summary['is_proxy'] = agg.get('is_proxy', False)
        summary['is_hosting'] = agg.get('is_hosting', False)
        for passthrough in ('country', 'country_code', 'asn', 'org'):
            summary[passthrough] = agg.get(passthrough)
        summary['warnings_count'] = len(agg.get('warnings', []))
        return summary
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Erreur: {str(e)}"
        )

View File

@ -0,0 +1,217 @@
"""
Endpoints pour la détection de la rotation de fingerprints JA4 et des menaces persistantes
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
router = APIRouter(prefix="/api/rotation", tags=["rotation"])
@router.get("/ja4-rotators")
async def get_ja4_rotators(limit: int = Query(50, ge=1, le=500)):
    """List the IPs that use the most distinct JA4 fingerprints.

    Rows come from ``mabase_prod.view_host_ip_ja4_rotation`` ordered by
    ``distinct_ja4_count`` descending. Each item carries an ``evasion_score``
    of 15 points per distinct JA4, capped at 100.

    Args:
        limit: Maximum number of IPs to return (1-500).

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                distinct_ja4_count,
                total_hits
            FROM mabase_prod.view_host_ip_ja4_rotation
            ORDER BY distinct_ja4_count DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows

        def to_item(row):
            ja4_count = int(row[1])
            return {
                "ip": str(row[0]),
                "distinct_ja4_count": ja4_count,
                "total_hits": int(row[2]),
                "evasion_score": min(100, ja4_count * 15),
            }

        items = [to_item(row) for row in rows]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/persistent-threats")
async def get_persistent_threats(limit: int = Query(100, ge=1, le=1000)):
    """List persistent threats ordered by persistence score.

    Rows come from ``mabase_prod.view_ip_recurrence``. The persistence score
    is ``recurrence * 20 + worst_score * 50`` capped at 100; it is computed
    in SQL for ordering and again in Python for the response.

    Args:
        limit: Maximum number of IPs to return (1-1000).

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                recurrence,
                worst_score,
                worst_threat_level,
                first_seen,
                last_seen
            FROM mabase_prod.view_ip_recurrence
            ORDER BY (least(100, recurrence * 20 + worst_score * 50)) DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows

        def to_item(row):
            times_seen = int(row[1])
            score = float(row[2] or 0)
            return {
                "ip": str(row[0]),
                "recurrence": times_seen,
                "worst_score": score,
                "worst_threat_level": str(row[3] or ""),
                "first_seen": str(row[4]),
                "last_seen": str(row[5]),
                "persistence_score": min(100, times_seen * 20 + score * 50),
            }

        items = [to_item(row) for row in rows]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ip/{ip}/ja4-history")
async def get_ip_ja4_history(ip: str):
    """Return the history of JA4 fingerprints used by a given IP.

    Aggregates ``mabase_prod.agg_host_ip_ja4_1h`` per JA4 for the requested
    IP (compared after stripping a leading ``::ffff:`` prefix from the stored
    address), ordered by hit count descending.

    Args:
        ip: IP address to look up.

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                ja4,
                sum(hits) AS hits,
                min(window_start) AS first_seen,
                max(window_start) AS last_seen
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
            GROUP BY ja4
            ORDER BY hits DESC
        """
        history = []
        for row in db.query(sql, {"ip": ip}).result_rows:
            entry = {
                "ja4": str(row[0]),
                "hits": int(row[1]),
                "first_seen": str(row[2]),
                "last_seen": str(row[3]),
            }
            history.append(entry)
        return {"ip": ip, "ja4_history": history, "total": len(history)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/sophistication")
async def get_sophistication(limit: int = Query(50, ge=1, le=500)):
    """Rank IPs by adversary sophistication score.

    The score is computed entirely in a single SQL query joining JA4
    rotation, anomaly recurrence and brute-force hits, so no per-row scoring
    is done in Python:

        least(100, distinct_ja4_count * 10
                   + recurrence * 20
                   + least(30, log(bruteforce_hits + 1) * 5))

    Each item is then labelled with a tier: ``APT-like`` (> 80),
    ``Advanced`` (> 50), ``Automated`` (> 20) or ``Basic``.

    Args:
        limit: Maximum number of IPs to return (1-500).

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    # Checked from the highest threshold down; first match wins.
    tier_thresholds = (
        (80, "APT-like"),
        (50, "Advanced"),
        (20, "Automated"),
    )
    try:
        sql = """
            SELECT
                r.ip,
                r.distinct_ja4_count,
                coalesce(rec.recurrence, 0) AS recurrence,
                coalesce(bf.bruteforce_hits, 0) AS bruteforce_hits,
                round(least(100.0,
                    r.distinct_ja4_count * 10
                    + coalesce(rec.recurrence, 0) * 20
                    + least(30.0, log(coalesce(bf.bruteforce_hits, 0) + 1) * 5)
                ), 1) AS sophistication_score
            FROM (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    distinct_ja4_count
                FROM mabase_prod.view_host_ip_ja4_rotation
            ) r
            LEFT JOIN (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    count() AS recurrence
                FROM mabase_prod.ml_detected_anomalies FINAL
                GROUP BY ip
            ) rec ON r.ip = rec.ip
            LEFT JOIN (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    sum(hits) AS bruteforce_hits
                FROM mabase_prod.view_form_bruteforce_detected
                GROUP BY ip
            ) bf ON r.ip = bf.ip
            ORDER BY sophistication_score DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows
        items = []
        for row in rows:
            score = float(row[4] or 0)
            tier = "Basic"
            for threshold, label in tier_thresholds:
                if score > threshold:
                    tier = label
                    break
            items.append({
                "ip": str(row[0]),
                "ja4_rotation_count": int(row[1] or 0),
                "recurrence": int(row[2] or 0),
                "bruteforce_hits": int(row[3] or 0),
                "sophistication_score": score,
                "tier": tier,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/proactive-hunt")
async def get_proactive_hunt(
    min_recurrence: int = Query(2, ge=1, description="Récurrence minimale"),
    min_days: int = Query(2, ge=0, description="Jours d'activité minimum"),
    limit: int = Query(50, ge=1, le=500),
):
    """List IPs flying under the radar: recurrent but below the usual detection threshold.

    Selects rows of ``mabase_prod.view_ip_recurrence`` whose recurrence is at
    least ``min_recurrence``, whose ``abs(worst_score)`` is below 0.5 and
    whose activity spans at least ``min_days`` days, ordered by recurrence
    descending then score ascending.

    Each item gets a ``risk_assessment`` derived from the ratio
    ``recurrence / (abs(worst_score) + 0.1)``: above 10 the IP is labelled
    "Évadeur potentiel", otherwise "Persistant modéré".

    Args:
        min_recurrence: Minimum recurrence count.
        min_days: Minimum number of days between first and last sighting.
        limit: Maximum number of IPs to return (1-500).

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                recurrence,
                worst_score,
                worst_threat_level,
                first_seen,
                last_seen,
                dateDiff('day', first_seen, last_seen) AS days_active
            FROM mabase_prod.view_ip_recurrence
            WHERE recurrence >= %(min_recurrence)s
              AND abs(worst_score) < 0.5
              AND dateDiff('day', first_seen, last_seen) >= %(min_days)s
            ORDER BY recurrence DESC, worst_score ASC
            LIMIT %(limit)s
        """
        result = db.query(sql, {
            "min_recurrence": min_recurrence,
            "min_days": min_days,
            "limit": limit,
        })
        items = []
        for row in result.result_rows:
            recurrence = int(row[1])
            worst_score = float(row[2] or 0)
            days_active = int(row[6] or 0)
            # Use the score's magnitude, consistent with the SQL filter on
            # abs(worst_score): a negative score must not flip the sign of the
            # ratio, and a score of exactly -0.1 must not divide by zero.
            ratio = recurrence / (abs(worst_score) + 0.1)
            risk = "Évadeur potentiel" if ratio > 10 else "Persistant modéré"
            items.append({
                "ip": str(row[0]),
                "recurrence": recurrence,
                "worst_score": round(worst_score, 4),
                "worst_threat_level": str(row[3] or ""),
                "first_seen": str(row[4]),
                "last_seen": str(row[5]),
                "days_active": days_active,
                "risk_assessment": risk,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,125 @@
"""
Endpoint de recherche globale rapide — utilisé par la barre Cmd+K
"""
from fastapi import APIRouter, Query
from ..database import db
router = APIRouter(prefix="/api/search", tags=["search"])
IP_RE = r"^(\d{1,3}\.){0,3}\d{1,3}$"
@router.get("/quick")
async def quick_search(q: str = Query(..., min_length=1, max_length=100)):
    """
    Unified quick search over IPs, JA4 fingerprints, hosts and ASNs.

    Every category is searched with a case-insensitive substring match on
    ``ml_detected_anomalies`` rows detected during the last 24 hours, and
    returns at most 5 results, ordered by hit count descending.

    Args:
        q: Search text (1-100 characters). Surrounding whitespace is ignored.

    Returns:
        ``{"query": <stripped q>, "results": [...]}`` where each result has
        ``type``, ``value``, ``label``, ``meta`` and ``url`` keys (IP results
        also carry ``investigation_url``). A query that is blank after
        stripping yields an empty result list.
    """
    q = q.strip()
    results = []
    # ``min_length`` is validated before stripping, so a whitespace-only query
    # gets here empty; without this guard the pattern would be "%%" and match
    # every row of every category.
    if not q:
        return {"query": q, "results": results}

    # Shared parameters: substring pattern and the documented per-category cap.
    params = {"p": f"%{q}%", "limit": 5}

    # ── IPs ──────────────────────────────────────────────────────────────────
    ip_rows = db.query(
        """
        SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
            count() AS hits,
            max(detected_at) AS last_seen,
            any(threat_level) AS threat_level
        FROM ml_detected_anomalies
        WHERE ilike(toString(src_ip), %(p)s)
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY clean_ip
        ORDER BY hits DESC
        LIMIT %(limit)s
        """,
        params,
    )
    for r in ip_rows.result_rows:
        ip = str(r[0])
        results.append({
            "type": "ip",
            "value": ip,
            "label": ip,
            "meta": f"{r[1]} détections · {r[3]}",
            "url": f"/detections/ip/{ip}",
            "investigation_url": f"/investigation/{ip}",
        })

    # ── JA4 fingerprints ─────────────────────────────────────────────────────
    ja4_rows = db.query(
        """
        SELECT
            ja4,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM ml_detected_anomalies
        WHERE ilike(ja4, %(p)s)
          AND ja4 != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY ja4
        ORDER BY hits DESC
        LIMIT %(limit)s
        """,
        params,
    )
    for r in ja4_rows.result_rows:
        results.append({
            "type": "ja4",
            "value": str(r[0]),
            "label": str(r[0]),
            "meta": f"{r[1]} détections · {r[2]} IPs",
            "url": f"/investigation/ja4/{r[0]}",
        })

    # ── Hosts ─────────────────────────────────────────────────────────────────
    host_rows = db.query(
        """
        SELECT
            host,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM ml_detected_anomalies
        WHERE ilike(host, %(p)s)
          AND host != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY host
        ORDER BY hits DESC
        LIMIT %(limit)s
        """,
        params,
    )
    for r in host_rows.result_rows:
        results.append({
            "type": "host",
            "value": str(r[0]),
            "label": str(r[0]),
            "meta": f"{r[1]} hits · {r[2]} IPs",
            "url": f"/detections?search={r[0]}",
        })

    # ── ASN ───────────────────────────────────────────────────────────────────
    asn_rows = db.query(
        """
        SELECT
            asn_org,
            asn_number,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM ml_detected_anomalies
        WHERE (ilike(asn_org, %(p)s) OR ilike(asn_number, %(p)s))
          AND asn_org != '' AND asn_number != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY asn_org, asn_number
        ORDER BY hits DESC
        LIMIT %(limit)s
        """,
        params,
    )
    for r in asn_rows.result_rows:
        results.append({
            "type": "asn",
            "value": str(r[1]),
            "label": f"AS{r[1]}{r[0]}",
            "meta": f"{r[2]} hits · {r[3]} IPs",
            "url": f"/detections?asn={r[1]}",
        })

    return {"query": q, "results": results}

View File

@ -0,0 +1,223 @@
"""
Endpoints pour la détection du TCP spoofing / fingerprinting OS
Approche multi-signal (p0f-style) :
- TTL initial estimé → famille OS (Linux/Mac=64, Windows=128, Cisco/BSD=255)
- MSS → type de réseau (Ethernet=1460, PPPoE=1452, VPN=1380-1420)
- Taille de fenêtre → signature OS précise
- Facteur d'échelle → affine la version kernel/stack TCP
Détection bots : signatures connues (Masscan/ZMap/Mirai) identifiées par combinaison
win+scale+mss indépendamment de l'UA.
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..services.tcp_fingerprint import (
fingerprint_os,
detect_spoof,
declared_os_from_ua,
)
router = APIRouter(prefix="/api/tcp-spoofing", tags=["tcp_spoofing"])
@router.get("/overview")
async def get_tcp_spoofing_overview():
    """Global statistics from multi-signal fingerprinting (TTL + MSS + window + scale).

    All figures are computed over ``mabase_prod.agg_host_ip_ja4_1h`` rows of
    the last 24 hours:

    - global counters bucketed by raw TTL (<= 64, 65-128, > 128) plus the
      number of rows matching the win=5808 / mss=1452 / scale=4 combination,
      reported as ``bot_scanner_fingerprint``;
    - the 15 most frequent TTL values;
    - the 12 most frequent MSS values;
    - the 10 most frequent window sizes.

    Raises:
        HTTPException: 500 when a query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                count() AS total_entries,
                uniq(src_ip) AS unique_ips,
                countIf(tcp_ttl_raw = 0) AS no_tcp_data,
                countIf(tcp_ttl_raw > 0) AS with_tcp_data,
                countIf(tcp_ttl_raw > 0 AND tcp_ttl_raw <= 64) AS linux_mac_fp,
                countIf(tcp_ttl_raw > 64 AND tcp_ttl_raw <= 128) AS windows_fp,
                countIf(tcp_ttl_raw > 128) AS cisco_bsd_fp,
                countIf(tcp_win_raw = 5808 AND tcp_mss_raw = 1452 AND tcp_scale_raw = 4) AS bot_scanner_fp
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR
        """
        result = db.query(sql)
        # An aggregate without GROUP BY always yields exactly one row.
        row = result.result_rows[0]

        # TTL distribution (top 15)
        ttl_sql = """
            SELECT tcp_ttl_raw AS ttl, count() AS cnt, uniq(src_ip) AS ips
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY ttl ORDER BY cnt DESC
            LIMIT 15
        """
        ttl_res = db.query(ttl_sql)

        # MSS distribution (top 12)
        mss_sql = """
            SELECT tcp_mss_raw AS mss, count() AS cnt, uniq(src_ip) AS ips
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_mss_raw > 0
            GROUP BY mss ORDER BY cnt DESC
            LIMIT 12
        """
        mss_res = db.query(mss_sql)

        # Window size distribution (top 10)
        win_sql = """
            SELECT tcp_win_raw AS win, count() AS cnt
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY win ORDER BY cnt DESC
            LIMIT 10
        """
        win_res = db.query(win_sql)

        return {
            "total_entries": int(row[0]),
            "unique_ips": int(row[1]),
            "no_tcp_data": int(row[2]),
            "with_tcp_data": int(row[3]),
            "linux_mac_fingerprint": int(row[4]),
            "windows_fingerprint": int(row[5]),
            "cisco_bsd_fingerprint": int(row[6]),
            "bot_scanner_fingerprint": int(row[7]),
            "ttl_distribution": [
                {"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
                for r in ttl_res.result_rows
            ],
            "mss_distribution": [
                {"mss": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
                for r in mss_res.result_rows
            ],
            "window_size_distribution": [
                {"window_size": int(r[0]), "count": int(r[1])}
                for r in win_res.result_rows
            ],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/list")
async def get_tcp_spoofing_list(
    limit: int = Query(100, ge=1, le=1000),
    offset: int = Query(0, ge=0),
    spoof_only: bool = Query(False, description="Retourner uniquement les spoofs/bots confirmés"),
):
    """Paginated list of (IP, JA4) pairs with multi-signal TCP fingerprinting.

    For each pair seen in the last 24 hours with TCP data, the raw TTL,
    window size, window scale and MSS are passed to ``fingerprint_os``; the
    OS declared by the User-Agent comes from ``declared_os_from_ua`` and the
    two are compared by ``detect_spoof``. Items include the enriched fields
    ``initial_ttl``, ``hop_count``, ``confidence``, ``network_path`` and
    ``is_bot_tool``.

    NOTE(review): ``spoof_only`` is applied in Python after LIMIT/OFFSET, so
    a page may hold fewer than ``limit`` items and ``total`` counts all
    pairs, not only spoofed ones.

    Args:
        limit: Page size (1-1000).
        offset: Number of pairs to skip.
        spoof_only: When true, keep only rows flagged by ``detect_spoof``.

    Raises:
        HTTPException: 500 when a query or row conversion fails.
    """
    try:
        count_sql = """
            SELECT count() FROM (
                SELECT src_ip, ja4
                FROM mabase_prod.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
                GROUP BY src_ip, ja4
            )
        """
        total = int(db.query(count_sql).result_rows[0][0])

        sql = """
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS src_ip,
                ja4,
                any(tcp_ttl_raw) AS tcp_ttl,
                any(tcp_win_raw) AS tcp_window_size,
                any(tcp_scale_raw) AS tcp_win_scale,
                any(tcp_mss_raw) AS tcp_mss,
                any(first_ua) AS first_ua,
                sum(hits) AS hits
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY src_ip, ja4
            ORDER BY hits DESC
            LIMIT %(limit)s OFFSET %(offset)s
        """
        page = db.query(sql, {"limit": limit, "offset": offset}).result_rows

        items = []
        for row in page:
            # Coerce every column first; NULL-ish values fall back to 0 / "".
            entry = {
                "ip": str(row[0]),
                "ja4": str(row[1] or ""),
                "tcp_ttl": int(row[2] or 0),
                "tcp_window_size": int(row[3] or 0),
                "tcp_win_scale": int(row[4] or 0),
                "tcp_mss": int(row[5] or 0),
            }
            user_agent = str(row[6] or "")
            entry["hits"] = int(row[7] or 0)
            entry["first_ua"] = user_agent

            fingerprint = fingerprint_os(
                entry["tcp_ttl"],
                entry["tcp_window_size"],
                entry["tcp_win_scale"],
                entry["tcp_mss"],
            )
            declared = declared_os_from_ua(user_agent)
            verdict = detect_spoof(fingerprint, declared)
            if spoof_only and not verdict.is_spoof:
                continue

            entry.update({
                "suspected_os": fingerprint.os_name,
                "initial_ttl": fingerprint.initial_ttl,
                "hop_count": fingerprint.hop_count,
                "confidence": fingerprint.confidence,
                "network_path": fingerprint.network_path,
                "is_bot_tool": fingerprint.is_bot_tool,
                "declared_os": declared,
                "spoof_flag": verdict.is_spoof,
                "spoof_reason": verdict.reason,
            })
            items.append(entry)
        return {"items": items, "total": total}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/matrix")
async def get_tcp_spoofing_matrix():
    """Suspected-OS × declared-OS matrix from multi-signal fingerprinting.

    Each (IP, JA4) pair of the last 24 hours with TCP data is fingerprinted
    with ``fingerprint_os`` and compared with the OS declared by its
    User-Agent. Row counts are accumulated per
    (suspected OS, declared OS, is_spoof, is_bot_tool) combination and the
    cells are returned sorted by count descending.

    Raises:
        HTTPException: 500 when the query or row conversion fails.
    """
    try:
        sql = """
            SELECT
                any(tcp_ttl_raw) AS ttl,
                any(tcp_win_raw) AS win,
                any(tcp_scale_raw) AS scale,
                any(tcp_mss_raw) AS mss,
                any(first_ua) AS ua,
                count() AS cnt
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY src_ip, ja4
        """
        tally: dict = {}
        for row in db.query(sql).result_rows:
            signals = [int(row[i] or 0) for i in range(4)]  # ttl, win, scale, mss
            user_agent = str(row[4] or "")
            weight = int(row[5] or 1)

            fingerprint = fingerprint_os(*signals)
            declared = declared_os_from_ua(user_agent)
            verdict = detect_spoof(fingerprint, declared)

            cell = (fingerprint.os_name, declared, verdict.is_spoof, fingerprint.is_bot_tool)
            tally.setdefault(cell, 0)
            tally[cell] += weight

        cells = []
        for (suspected, declared, is_spoof, is_bot_tool), count in tally.items():
            cells.append({
                "suspected_os": suspected,
                "declared_os": declared,
                "count": count,
                "is_spoof": is_spoof,
                "is_bot_tool": is_bot_tool,
            })
        matrix = sorted(cells, key=lambda cell: cell["count"], reverse=True)
        return {"matrix": matrix}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,706 @@
"""
Endpoints pour la variabilité des attributs
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional
from ..database import db
from ..models import (
VariabilityResponse, VariabilityAttributes, AttributeValue, Insight,
UserAgentsResponse, UserAgentValue
)
router = APIRouter(prefix="/api/variability", tags=["variability"])
# =============================================================================
# ROUTES SPÉCIFIQUES (doivent être avant les routes génériques)
# =============================================================================
@router.get("/{attr_type}/{value:path}/ips", response_model=dict)
async def get_associated_ips(
    attr_type: str,
    value: str,
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'IPs")
):
    """
    Return the IPs associated with an attribute value over the last 24 hours.

    ``attr_type`` selects the ``ml_detected_anomalies`` column to filter on
    (ip, ja4, country, asn or host). The column name always comes from a
    fixed whitelist, so interpolating it into the SQL text is safe; the
    value itself is passed as a bound parameter.

    Returns:
        Dict with the listed IPs (hit count and share of the listed hits),
        the total number of distinct IPs and the number actually returned.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type``, 500 on any
            other failure.
    """
    # Attribute type -> column of ml_detected_anomalies.
    type_column_map = {
        "ip": "src_ip",
        "ja4": "ja4",
        "country": "country_code",
        "asn": "asn_number",
        "host": "host",
    }
    try:
        column = type_column_map.get(attr_type)
        if column is None:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(type_column_map.keys())}"
            )

        query = f"""
            SELECT src_ip, count() AS hit_count
            FROM ml_detected_anomalies
            WHERE {column} = %(value)s
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
            ORDER BY hit_count DESC
            LIMIT %(limit)s
        """
        rows = db.query(query, {"value": value, "limit": limit}).result_rows

        # Percentages are relative to the hits of the returned rows; fall back
        # to 1 to avoid dividing by zero when nothing matched.
        total_hits = sum(row[1] for row in rows) or 1
        ips = []
        for row in rows:
            share = round(row[1] * 100.0 / total_hits, 2)
            ips.append({"ip": str(row[0]), "count": row[1], "percentage": share})

        # Count every distinct IP, not only the ones listed above.
        count_query = f"""
            SELECT uniq(src_ip) AS total
            FROM ml_detected_anomalies
            WHERE {column} = %(value)s
              AND detected_at >= now() - INTERVAL 24 HOUR
        """
        count_rows = db.query(count_query, {"value": value}).result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "ips": ips,
            "total": total,
            "showing": len(ips)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{attr_type}/{value:path}/attributes", response_model=dict)
async def get_associated_attributes(
    attr_type: str,
    value: str,
    target_attr: str = Query(..., description="Type d'attribut à récupérer (user_agents, ja4, countries, asns, hosts)"),
    limit: int = Query(50, ge=1, le=500, description="Nombre maximum de résultats")
):
    """
    Return the values of one attribute associated with another attribute value
    (for example the User-Agents seen for a country) over the last 24 hours.

    ``attr_type`` / ``value`` select the rows, ``target_attr`` selects what
    is listed. Column names always come from fixed whitelists, so
    interpolating them into the SQL text is safe; ``value`` and ``limit``
    are bound parameters.

    ``user_agents`` is served from ``view_dashboard_user_agents`` (columns:
    src_ip, ja4, hour, log_date, user_agents, requests); every other target
    is served from ``ml_detected_anomalies``.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type`` or
            ``target_attr``, 500 on any other failure.
    """
    # Attribute type -> filter column of ml_detected_anomalies.
    type_column_map = {
        "ip": "src_ip",
        "ja4": "ja4",
        "country": "country_code",
        "asn": "asn_number",
        "host": "host",
    }
    # Target attribute -> listed column; None marks the user-agent special case.
    target_column_map = {
        "user_agents": None,
        "ja4": "ja4",
        "countries": "country_code",
        "asns": "asn_number",
        "hosts": "host",
    }
    try:
        if attr_type not in type_column_map:
            raise HTTPException(status_code=400, detail=f"Type '{attr_type}' invalide")
        if target_attr not in target_column_map:
            raise HTTPException(
                status_code=400,
                detail=f"Attribut cible invalide. Supportés: {', '.join(target_column_map.keys())}"
            )

        column = type_column_map[attr_type]
        target_column = target_column_map[target_attr]
        bound = {"value": value, "limit": limit}

        if target_column is None:
            # The user-agent view can be filtered directly by IP or JA4; for
            # country/asn/host, pivot through the IPs known to
            # ml_detected_anomalies.
            direct_filters = {
                "ip": "toString(src_ip) = %(value)s",
                "ja4": "ja4 = %(value)s",
            }
            if attr_type in direct_filters:
                ua_where = direct_filters[attr_type]
            else:
                ua_where = f"""toString(src_ip) IN (
                    SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                    FROM ml_detected_anomalies
                    WHERE {column} = %(value)s AND detected_at >= now() - INTERVAL 24 HOUR
                )"""
            ua_q = f"""
                SELECT ua AS value, sum(requests) AS count,
                       round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage
                FROM view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE {ua_where}
                  AND hour >= now() - INTERVAL 24 HOUR AND ua != ''
                GROUP BY value ORDER BY count DESC LIMIT %(limit)s
            """
            items = []
            for r in db.query(ua_q, bound).result_rows:
                pct = round(float(r[2]), 2) if r[2] else 0.0
                items.append({"value": str(r[0]), "count": r[1], "percentage": pct})
            # No separate count query here: total is the number of rows returned.
            return {
                "type": attr_type,
                "value": value,
                "target": target_attr,
                "items": items,
                "total": len(items),
                "showing": len(items),
            }

        query = f"""
            SELECT
                {target_column} AS value,
                count() AS count,
                round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage
            FROM ml_detected_anomalies
            WHERE {column} = %(value)s
              AND {target_column} != '' AND {target_column} IS NOT NULL
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY value
            ORDER BY count DESC
            LIMIT %(limit)s
        """
        items = []
        for row in db.query(query, bound).result_rows:
            pct = round(float(row[2]), 2) if row[2] else 0.0
            items.append({"value": str(row[0]), "count": row[1], "percentage": pct})

        # Count every distinct target value, not only the ones listed above.
        count_query = f"""
            SELECT uniq({target_column}) AS total
            FROM ml_detected_anomalies
            WHERE {column} = %(value)s
              AND {target_column} != '' AND {target_column} IS NOT NULL
              AND detected_at >= now() - INTERVAL 24 HOUR
        """
        count_rows = db.query(count_query, {"value": value}).result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "target": target_attr,
            "items": items,
            "total": total,
            "showing": len(items)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{attr_type}/{value:path}/user_agents", response_model=UserAgentsResponse)
async def get_user_agents(
    attr_type: str,
    value: str,
    limit: int = Query(100, ge=1, le=500, description="Nombre maximum de user-agents")
):
    """
    Return the User-Agents associated with an attribute value (IP, JA4,
    country, ASN or host) over the last 24 hours.

    Data is read from ``view_dashboard_user_agents`` (columns: src_ip, ja4,
    hour, log_date, user_agents, requests). The view is filtered directly
    for ``ip`` and ``ja4``; for the other types the matching IPs are first
    looked up in ``ml_detected_anomalies``.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type``, 500 on any
            other failure.
    """
    # Attribute type -> column of ml_detected_anomalies.
    type_column_map = {
        "ip": "src_ip",
        "ja4": "ja4",
        "country": "country_code",
        "asn": "asn_number",
        "host": "host",
    }
    try:
        if attr_type not in type_column_map:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(type_column_map.keys())}"
            )

        column = type_column_map[attr_type]
        params: dict = {"value": value, "limit": limit}

        direct_filters = {
            "ip": "toString(src_ip) = %(value)s",
            "ja4": "ja4 = %(value)s",
        }
        if attr_type in direct_filters:
            where = direct_filters[attr_type]
        else:
            # country / asn / host: pivot via ml_detected_anomalies to the
            # known IPs, then filter the view by src_ip.
            where = f"""toString(src_ip) IN (
                SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                FROM ml_detected_anomalies
                WHERE {column} = %(value)s
                  AND detected_at >= now() - INTERVAL 24 HOUR
            )"""

        query = f"""
            SELECT
                ua AS user_agent,
                sum(requests) AS count,
                round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
                min(log_date) AS first_seen,
                max(log_date) AS last_seen
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE {where}
              AND hour >= now() - INTERVAL 24 HOUR
              AND ua != ''
            GROUP BY user_agent
            ORDER BY count DESC
            LIMIT %(limit)s
        """
        result = db.query(query, params)

        count_query = f"""
            SELECT uniqExact(ua) AS total
            FROM view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE {where}
              AND hour >= now() - INTERVAL 24 HOUR
              AND ua != ''
        """
        count_result = db.query(count_query, params)

        user_agents = []
        for row in result.result_rows:
            width = len(row)
            user_agents.append(UserAgentValue(
                value=str(row[0]),
                count=row[1] or 0,
                percentage=round(float(row[2]), 2) if row[2] else 0.0,
                first_seen=row[3] if width > 3 and row[3] else None,
                last_seen=row[4] if width > 4 and row[4] else None,
            ))

        count_rows = count_result.result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "user_agents": user_agents,
            "total": total,
            "showing": len(user_agents)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ROUTE GÉNÉRIQUE (doit être en dernier)
# =============================================================================
def get_attribute_value(row, count_idx: int, percentage_idx: int,
                        first_seen_idx: Optional[int] = None,
                        last_seen_idx: Optional[int] = None,
                        threat_idx: Optional[int] = None,
                        unique_ips_idx: Optional[int] = None) -> AttributeValue:
    """Build an ``AttributeValue`` from one ClickHouse result row.

    ``row[0]`` is the attribute value; the remaining fields are read at the
    given column indexes. Optional indexes that are ``None`` or lie beyond
    the end of the row produce ``None`` for the corresponding field.

    Args:
        row: Result row (sequence of column values).
        count_idx: Index of the count column.
        percentage_idx: Index of the percentage column.
        first_seen_idx: Index of the first-seen column, if any.
        last_seen_idx: Index of the last-seen column, if any.
        threat_idx: Index of the threat-levels string column, if any.
        unique_ips_idx: Index of the unique-IPs column, if any.
    """
    def cell(idx: Optional[int]):
        # Optional column: absent index or too-short row yields None.
        if idx is None or len(row) <= idx:
            return None
        return row[idx]

    raw_percentage = row[percentage_idx]
    threat_raw = cell(threat_idx)

    # An empty threat string is treated like a missing column.
    if threat_raw:
        threat_levels = _parse_threat_levels(threat_raw)
    else:
        threat_levels = None

    return AttributeValue(
        value=str(row[0]),
        count=row[count_idx] or 0,
        percentage=round(float(raw_percentage), 2) if raw_percentage else 0.0,
        first_seen=cell(first_seen_idx),
        last_seen=cell(last_seen_idx),
        threat_levels=threat_levels,
        unique_ips=cell(unique_ips_idx),
        primary_threat=_get_primary_threat(threat_raw) if threat_raw else None
    )
def _parse_threat_levels(threat_str: str) -> dict:
"""Parse une chaîne de type 'CRITICAL:5,HIGH:10' en dict"""
if not threat_str:
return {}
result = {}
for part in str(threat_str).split(','):
if ':' in part:
level, count = part.strip().split(':')
result[level.strip()] = int(count.strip())
return result
def _get_primary_threat(threat_str: str) -> str:
"""Retourne le niveau de menace principal"""
if not threat_str:
return ""
levels_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
for level in levels_order:
if level in str(threat_str):
return level
return ""
def _generate_insights(attr_type: str, value: str, attributes: VariabilityAttributes,
                       total_detections: int, unique_ips: int) -> list:
    """Generate insights from the variability data of an attribute value.

    Args:
        attr_type: Type of the investigated attribute (e.g. ``"ip"``).
        value: Investigated value. Currently unused; kept for interface
            compatibility.
        attributes: Variability attributes; the ``user_agents``, ``ja4``,
            ``asns``, ``countries`` and ``threat_levels`` lists are read.
        total_detections: Total number of detections, used as the
            denominator for the threat-level percentages.
        unique_ips: Number of distinct IPs associated with the value.

    Returns:
        List of ``Insight`` objects, possibly empty.
    """
    insights = []

    # User-Agent insights
    if len(attributes.user_agents) > 1:
        insights.append(Insight(
            type="warning",
            message=f"{len(attributes.user_agents)} User-Agents différents → Possible rotation/obfuscation"
        ))

    # JA4 insights
    if len(attributes.ja4) > 1:
        insights.append(Insight(
            type="warning",
            message=f"{len(attributes.ja4)} JA4 fingerprints différents → Possible rotation de fingerprint"
        ))

    # IP insights (only when the selection itself is not an IP)
    if attr_type != "ip" and unique_ips > 10:
        insights.append(Insight(
            type="info",
            message=f"{unique_ips} IPs différentes associées → Possible infrastructure distribuée"
        ))

    # ASN insights: flag a single ASN whose value matches a hosting/cloud keyword
    if len(attributes.asns) == 1 and attributes.asns[0].value:
        hosting_keywords = ["ovh", "amazon", "aws", "google", "azure", "digitalocean", "linode", "vultr"]
        asn_value = (attributes.asns[0].value or "").lower()
        if any(kw in asn_value for kw in hosting_keywords):
            insights.append(Insight(
                type="warning",
                message="ASN de type hosting/cloud → Souvent utilisé pour des bots"
            ))

    # Country insights
    if len(attributes.countries) > 5:
        insights.append(Insight(
            type="info",
            message=f"Présent dans {len(attributes.countries)} pays → Distribution géographique large"
        ))

    # Threat level insights
    critical_count = 0
    high_count = 0
    for tl in attributes.threat_levels:
        if tl.value == "CRITICAL":
            critical_count = tl.count
        elif tl.value == "HIGH":
            high_count = tl.count

    # Percentages are meaningless (and would divide by zero) without detections.
    if total_detections > 0:
        if critical_count > total_detections * 0.3:
            insights.append(Insight(
                type="warning",
                message=f"{round(critical_count * 100 / total_detections)}% de détections CRITICAL → Menace sévère"
            ))
        elif high_count > total_detections * 0.5:
            insights.append(Insight(
                type="info",
                message=f"{round(high_count * 100 / total_detections)}% de détections HIGH → Activité suspecte"
            ))

    return insights
@router.get("/{attr_type}/{value:path}", response_model=VariabilityResponse)
async def get_variability(attr_type: str, value: str):
    """
    Return the variability of the attributes associated with a given value.

    Reads the last 24 hours of ``ml_detected_anomalies`` rows matching the
    value and reports the distribution of User-Agents, JA4 fingerprints,
    countries, ASNs, hosts, threat levels and model names, together with the
    insights derived from those distributions.

    Args:
        attr_type: one of ip, ja4, country, asn, host, user_agent.
        value: the value to investigate.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type``, 404 when no
            detection matches, 500 for any other failure.
    """
    try:
        # Attribute type -> ClickHouse column. Only these whitelisted column
        # names are ever interpolated into the SQL text; the user-supplied
        # value always travels as a bound parameter.
        type_column_map = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
            "user_agent": "header_user_agent"
        }
        if attr_type not in type_column_map:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(type_column_map.keys())}"
            )
        column = type_column_map[attr_type]

        # Main sub-query: every detection for this value over the last 24h.
        # user_agent is selected as an empty constant so the projection does
        # not depend on a header_user_agent column.
        # NOTE(review): the "user_agent" type still filters on
        # header_user_agent in the WHERE clause — confirm that column exists
        # on ml_detected_anomalies.
        base_query = f"""
            SELECT *
            FROM (
                SELECT
                    detected_at,
                    src_ip,
                    ja4,
                    host,
                    '' AS user_agent,
                    country_code,
                    asn_number,
                    asn_org,
                    threat_level,
                    model_name,
                    anomaly_score
                FROM ml_detected_anomalies
                WHERE {column} = %(value)s
                AND detected_at >= now() - INTERVAL 24 HOUR
            )
        """

        # Global statistics
        stats_query = f"""
            SELECT
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM ml_detected_anomalies
            WHERE {column} = %(value)s
            AND detected_at >= now() - INTERVAL 24 HOUR
        """
        stats_result = db.query(stats_query, {"value": value})
        if not stats_result.result_rows or stats_result.result_rows[0][0] == 0:
            raise HTTPException(status_code=404, detail="Aucune donnée trouvée")
        stats_row = stats_result.result_rows[0]
        total_detections = stats_row[0]
        unique_ips = stats_row[1]
        first_seen = stats_row[2]
        last_seen = stats_row[3]

        # User-Agents.
        # For ip / ja4 the counts come straight from http_logs so that they
        # are exact per request (view_dashboard_user_agents deduplicates per
        # hour, which under-counts hits). Both types share the same query and
        # differ only by their filter expression.
        http_logs_filters = {
            "ip": "src_ip = toIPv4(%(value)s)",
            "ja4": "ja4 = %(value)s",
        }
        if attr_type in http_logs_filters:
            _ua_logs_where = http_logs_filters[attr_type]
            ua_query_simple = f"""
                SELECT
                    header_user_agent AS user_agent,
                    count() AS count,
                    round(count() * 100.0 / (
                        SELECT count() FROM mabase_prod.http_logs
                        WHERE {_ua_logs_where} AND time >= now() - INTERVAL 24 HOUR
                    ), 2) AS percentage,
                    min(time) AS first_seen,
                    max(time) AS last_seen
                FROM mabase_prod.http_logs
                WHERE {_ua_logs_where}
                AND time >= now() - INTERVAL 24 HOUR
                AND header_user_agent != '' AND header_user_agent IS NOT NULL
                GROUP BY user_agent
                ORDER BY count DESC
            """
        else:
            # country / asn / host / user_agent: pivot through
            # ml_detected_anomalies to get the IPs, then read the UA view.
            _ua_where = f"""toString(src_ip) IN (
                SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                FROM ml_detected_anomalies
                WHERE {column} = %(value)s AND detected_at >= now() - INTERVAL 24 HOUR
            )"""
            ua_query_simple = f"""
                SELECT
                    ua AS user_agent,
                    sum(requests) AS count,
                    round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
                    min(log_date) AS first_seen,
                    max(log_date) AS last_seen
                FROM view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE {_ua_where}
                AND hour >= now() - INTERVAL 24 HOUR
                AND ua != ''
                GROUP BY user_agent
                ORDER BY count DESC
            """
        ua_result = db.query(ua_query_simple, {"value": value})
        user_agents = [get_attribute_value(row, 1, 2, 3, 4) for row in ua_result.result_rows]

        # JA4 fingerprints
        ja4_query = f"""
            SELECT
                ja4,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM ({base_query})
            WHERE ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            ORDER BY count DESC
        """
        ja4_result = db.query(ja4_query, {"value": value})
        ja4s = [get_attribute_value(row, 1, 2, 3, 4) for row in ja4_result.result_rows]

        # Countries
        country_query = f"""
            SELECT
                country_code,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE country_code != '' AND country_code IS NOT NULL
            GROUP BY country_code
            ORDER BY count DESC
        """
        country_result = db.query(country_query, {"value": value})
        countries = [get_attribute_value(row, 1, 2) for row in country_result.result_rows]

        # ASN (row layout differs: display label, number, count, percentage)
        asn_query = f"""
            SELECT
                concat('AS', toString(asn_number), ' - ', asn_org) AS asn_display,
                asn_number,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE asn_number != '' AND asn_number IS NOT NULL AND asn_number != '0'
            GROUP BY asn_display, asn_number
            ORDER BY count DESC
        """
        asn_result = db.query(asn_query, {"value": value})
        asns = [
            AttributeValue(
                value=str(row[0]),
                count=row[2] or 0,
                percentage=round(float(row[3]), 2) if row[3] else 0.0
            )
            for row in asn_result.result_rows
        ]

        # Hosts
        host_query = f"""
            SELECT
                host,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE host != '' AND host IS NOT NULL
            GROUP BY host
            ORDER BY count DESC
        """
        host_result = db.query(host_query, {"value": value})
        hosts = [get_attribute_value(row, 1, 2) for row in host_result.result_rows]

        # Threat levels, ordered from most to least severe
        threat_query = f"""
            SELECT
                threat_level,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE threat_level != '' AND threat_level IS NOT NULL
            GROUP BY threat_level
            ORDER BY
                CASE threat_level
                    WHEN 'CRITICAL' THEN 1
                    WHEN 'HIGH' THEN 2
                    WHEN 'MEDIUM' THEN 3
                    WHEN 'LOW' THEN 4
                    ELSE 5
                END
        """
        threat_result = db.query(threat_query, {"value": value})
        threat_levels = [get_attribute_value(row, 1, 2) for row in threat_result.result_rows]

        # Model names
        model_query = f"""
            SELECT
                model_name,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE model_name != '' AND model_name IS NOT NULL
            GROUP BY model_name
            ORDER BY count DESC
        """
        model_result = db.query(model_query, {"value": value})
        model_names = [get_attribute_value(row, 1, 2) for row in model_result.result_rows]

        # Build the response
        attributes = VariabilityAttributes(
            user_agents=user_agents,
            ja4=ja4s,
            countries=countries,
            asns=asns,
            hosts=hosts,
            threat_levels=threat_levels,
            model_names=model_names
        )
        insights = _generate_insights(attr_type, value, attributes, total_detections, unique_ips)
        return VariabilityResponse(
            type=attr_type,
            value=value,
            total_detections=total_detections,
            unique_ips=unique_ips,
            date_range={
                "first_seen": first_seen,
                "last_seen": last_seen
            },
            attributes=attributes,
            insights=insights
        )
    except HTTPException:
        # Deliberate 4xx responses raised above must pass through untouched.
        raise
    except Exception as e:
        # Keep the original cause attached for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e

View File

@ -0,0 +1,493 @@
"""
Multi-metric K-means++ clustering engine (vectorised numpy + scipy).

References:
    Arthur & Vassilvitskii (2007) — k-means++: The Advantages of Careful Seeding
    scipy.spatial.ConvexHull — convex hull (Qhull)
    sklearn-style API — centroids, labels_, inertia_

Features (30 dimensions, normalised to [0,1]); the index is the position in
the FEATURES list and in every centroid vector:
    0  ttl_n             : normalised initial TTL
    1  mss_n             : normalised MSS → network type
    2  scale_n           : TCP window scale factor
    3  win_n             : normalised TCP window
    4  velocity_n        : request velocity (log1p)
    5  fuzzing_n         : fuzzing index (log1p)
    6  headless_n        : ratio of headless sessions
    7  post_n            : POST / total ratio
    8  ip_id_zero_n      : ratio of IP-ID=0 (Linux/spoofed)
    9  entropy_n         : temporal entropy
    10 browser_n         : modern-browser score
    11 alpn_n            : ALPN/protocol mismatch
    12 alpn_absent_n     : ratio of missing ALPN
    13 h2_n              : H2 multiplexing efficiency (log1p)
    14 hdr_conf_n        : header-order confidence
    15 ua_ch_n           : User-Agent / Client-Hints mismatch
    16 asset_n           : static assets ratio
    17 direct_n          : direct access ratio
    18 ja4_div_n         : JA4 diversity (log1p)
    19 ua_rot_n          : rotating UA (boolean)
    20 country_risk_n    : source country risk (CN/RU/KP → 1.0, US/DE/FR → 0.1)
    21 asn_cloud_n       : cloud/CDN/VPN hoster (Cloudflare/AWS/OVH → 1.0)
    22 hdr_accept_lang_n : Accept-Language header present (0 = absent = bot-like)
    23 hdr_encoding_n    : Accept-Encoding header present (0 = absent = bot-like)
    24 hdr_sec_fetch_n   : Sec-Fetch-* headers present (1 = real browser)
    25 hdr_count_n       : normalised number of HTTP headers (3 = bot, 15 = browser)
    26 hfp_popular_n     : popularity of the header fingerprint (log-normalised);
                           rare fingerprint = suspect, very popular = legitimate browser
    27 hfp_rotating_n    : fingerprint rotation (distinct_header_orders);
                           several distinct fingerprints → rotating bot
    28 hfp_cookie_n      : Cookie header present (real user engagement)
    29 hfp_referer_n     : Referer header present (normal HTTP navigation)

The former ML anomaly-score feature (previously index 4) is no longer part of
the vector; every index after it moved down by one.
"""
from __future__ import annotations
import math
import logging
import numpy as np
from dataclasses import dataclass, field
from scipy.spatial import ConvexHull
log = logging.getLogger(__name__)
# ─── Country encoding (source risk) ──────────────────────────────────────────
# Stated source: MISP threat intel, Spamhaus DROP list, known offensive geography.
# Maps an upper-case two-letter country code to a risk score in [0, 1].
_COUNTRY_RISK: dict[str, float] = {
    # Very high risk: documented offensive infrastructure
    "CN": 1.0, "RU": 1.0, "KP": 1.0, "IR": 1.0,
    "BY": 0.9, "SY": 0.9, "CU": 0.8,
    # High risk: permissive transit/hosting, bulletproof hosters
    "HK": 0.75, "VN": 0.7, "UA": 0.65,
    "RO": 0.6, "PK": 0.6, "NG": 0.6,
    "BG": 0.55, "TR": 0.55, "BR": 0.5,
    "TH": 0.5, "IN": 0.45, "ID": 0.45,
    # Low risk: countries with little tolerance for malicious activity
    "US": 0.1, "DE": 0.1, "FR": 0.1, "GB": 0.1,
    "CA": 0.1, "JP": 0.1, "AU": 0.1, "NL": 0.15,
    "CH": 0.1, "SE": 0.1, "NO": 0.1, "DK": 0.1,
    "FI": 0.1, "AT": 0.1, "BE": 0.1, "IT": 0.15,
    "SG": 0.3, "TW": 0.2, "KR": 0.2, "RS": 0.4,
}
_DEFAULT_COUNTRY_RISK = 0.35  # country not in the table → moderate risk
def country_risk(cc: str | None) -> float:
    """Return the [0, 1] risk score for a country code (case-insensitive).

    A missing/empty code, or a code absent from the risk table, yields the
    default moderate risk.
    """
    code = cc.upper() if cc else ""
    if code in _COUNTRY_RISK:
        return _COUNTRY_RISK[code]
    return _DEFAULT_COUNTRY_RISK
# ─── ASN encoding (infrastructure type) ──────────────────────────────────────
# Cloud/CDN/hosting → strongly correlated with automated scanners and bots.
# Entries are lower-case substrings matched against the lower-cased ASN
# organisation name; trailing spaces (e.g. "pia ") are part of the pattern.
_ASN_CLOUD_KEYWORDS = [
    # Hyperscalers
    "amazon", "aws", "google", "microsoft", "azure", "alibaba", "tencent", "huawei",
    # CDN / edge
    "cloudflare", "akamai", "fastly", "cloudfront", "incapsula", "imperva",
    "sucuri", "stackpath", "keycdn",
    # Hosting providers
    "ovh", "hetzner", "digitalocean", "vultr", "linode", "akamai-linode",
    "leaseweb", "choopa", "packet", "equinix", "serverius", "combahton",
    "m247", "b2 net", "hostinger", "contabo",
    # Known bulletproof / permissive transit
    "hwclouds", "multacom", "psychz", "serverius", "colocrossing",
    "frantech", "sharktech", "tzulo",
    # Commercial VPN / proxy
    "nordvpn", "expressvpn", "mullvad", "protonvpn", "surfshark",
    "privateinternetaccess", "pia ", "cyberghost", "hotspot shield",
    "ipvanish", "hide.me",
    # Search-engine bots / crawlers
    "facebook", "meta ", "twitter", "linkedin", "semrush", "ahrefs",
    "majestic", "moz ", "babbar", "sistrix", "criteo", "peer39",
]
def asn_cloud_score(asn_org: str | None) -> float:
    """
    Score an ASN organisation name by infrastructure type.

    Returns 1.0 when the lower-cased name contains any cloud/CDN/hosting/VPN
    keyword, 0.0 when it contains none, and 0.2 (slightly suspect) when the
    name is missing or empty.
    """
    if not asn_org:
        return 0.2
    lowered = asn_org.lower()
    matched = any(keyword in lowered for keyword in _ASN_CLOUD_KEYWORDS)
    return 1.0 if matched else 0.0
# ─── Feature definitions ──────────────────────────────────────────────────────
# Each entry is (row key, display label, normaliser). The normaliser maps the
# raw value read from the row (None is treated as 0) to a value capped at 1.0.
# The position of an entry in this list is the feature's index in every vector.
FEATURES: list[tuple[str, str, object]] = [
    # TCP stack
    ("ttl", "TTL Initial", lambda v: min(1.0, (v or 0) / 255.0)),
    ("mss", "MSS Réseau", lambda v: min(1.0, (v or 0) / 1460.0)),
    ("scale", "Scale TCP", lambda v: min(1.0, (v or 0) / 14.0)),
    ("win", "Fenêtre TCP", lambda v: min(1.0, (v or 0) / 65535.0)),
    # Request velocity and fuzzing, both log-scaled
    ("avg_velocity", "Vélocité (rps)", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(100))),
    ("avg_fuzzing", "Fuzzing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(300))),
    ("pct_headless", "Headless", lambda v: min(1.0, float(v or 0))),
    ("avg_post", "Ratio POST", lambda v: min(1.0, float(v or 0))),
    # IP-ID
    ("ip_id_zero", "IP-ID Zéro", lambda v: min(1.0, float(v or 0))),
    # Temporal
    ("entropy", "Entropie Temporelle", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))),
    # Browser
    ("browser_score", "Score Navigateur", lambda v: min(1.0, float(v or 0) / 50.0)),
    # TLS / protocol
    ("alpn_mismatch", "ALPN Mismatch", lambda v: min(1.0, float(v or 0))),
    ("alpn_missing", "ALPN Absent", lambda v: min(1.0, float(v or 0))),
    ("h2_eff", "H2 Multiplexing", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(20))),
    ("hdr_conf", "Ordre Headers", lambda v: min(1.0, float(v or 0))),
    ("ua_ch_mismatch","UA-CH Mismatch", lambda v: min(1.0, float(v or 0))),
    # HTTP behaviour
    ("asset_ratio", "Ratio Assets", lambda v: min(1.0, float(v or 0))),
    ("direct_ratio", "Accès Direct", lambda v: min(1.0, float(v or 0))),
    # JA4 diversity
    ("ja4_count", "Diversité JA4", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(30))),
    # Rotating UA (binary)
    ("ua_rotating", "UA Rotatif", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
    # ── Geography & infrastructure ─────────────────────────────────────────
    ("country", "Risque Pays", lambda v: country_risk(str(v) if v else None)),
    ("asn_org", "Hébergeur Cloud/VPN", lambda v: asn_cloud_score(str(v) if v else None)),
    # ── HTTP headers (presence / request profile) ──────────────────────────
    # Missing Accept-Language or Accept-Encoding = strong bot signal (simple bots omit them)
    # Sec-Fetch-* = exclusive to real browsers (fetch metadata)
    ("hdr_accept_lang", "Accept-Language", lambda v: min(1.0, float(v or 0))),
    ("hdr_has_encoding", "Accept-Encoding", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
    ("hdr_has_sec_fetch", "Sec-Fetch Headers", lambda v: 1.0 if float(v or 0) > 0 else 0.0),
    ("hdr_count_raw", "Nb Headers", lambda v: min(1.0, float(v or 0) / 20.0)),
    # ── HTTP header fingerprint (agg_header_fingerprint_1h) ────────────────
    # header_order_shared_count: number of IPs sharing this fingerprint;
    # high → popular → legitimate browser (log1p / log1p(500000))
    ("hfp_shared_count", "FP Popularité", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(500_000))),
    # distinct_header_orders: number of distinct fingerprints for this IP;
    # high → fingerprint rotation → bot (log1p / log1p(10))
    ("hfp_distinct_orders", "FP Rotation", lambda v: min(1.0, math.log1p(float(v or 0)) / math.log1p(10))),
    # Cookie and Referer: signals of legitimate navigation
    ("hfp_cookie", "Cookie Présent", lambda v: min(1.0, float(v or 0))),
    ("hfp_referer", "Referer Présent", lambda v: min(1.0, float(v or 0))),
]
# Parallel views over FEATURES, in the same order.
FEATURE_KEYS = [f[0] for f in FEATURES]
FEATURE_NAMES = [f[1] for f in FEATURES]
FEATURE_NORMS = [f[2] for f in FEATURES]
N_FEATURES = len(FEATURES)
# ─── Feature vector construction ─────────────────────────────────────────────
def build_feature_vector(row: dict) -> list[float]:
    """Build the normalised feature vector for one SQL row dict.

    The result has one entry per item of FEATURES, in the same order; a key
    missing from ``row`` is normalised as if its value were 0.
    """
    vector = []
    for key, _label, normalise in FEATURES:
        vector.append(normalise(row.get(key, 0)))
    return vector
# ─── Z-score standardisation ──────────────────────────────────────────────────
def standardize(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Centre every column of X and scale it by its standard deviation.

    Ref: Bishop (2006) PRML §9.1 — preprocessing recommended for K-means.

    Returns:
        (X_std, mean, std) so that new points can be projected with the same
        transform. Columns whose deviation is below 1e-8 get std = 1.0.
    """
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    # Constant columns would divide by zero; leave them merely centred.
    np.putmask(sigma, sigma < 1e-8, 1.0)
    scaled = (X - mu) / sigma
    return scaled, mu, sigma
# ─── Vectorised K-means++ (numpy) ────────────────────────────────────────────
@dataclass
class KMeansResult:
    """Outcome of one K-means run, as produced by ``kmeans_pp``."""
    centroids: np.ndarray  # (k, n_features)
    labels: np.ndarray  # (n_points,) int32 — cluster index of each point
    inertia: float  # sum over points of the squared distance to the nearest centroid
    n_iter: int  # number of iterations executed by the retained run
def kmeans_pp(X: np.ndarray, k: int, max_iter: int = 60, n_init: int = 3,
              seed: int = 42) -> KMeansResult:
    """
    K-means with k-means++ seeding, vectorised with numpy.

    Runs ``n_init`` independent initialisations (all drawn from one generator
    seeded with ``seed``) and keeps the run with the lowest inertia.

    Args:
        X: (n_points, n_features) data matrix.
        k: number of clusters, >= 1.
        max_iter: maximum number of Lloyd iterations per run, >= 1.
        n_init: number of independent runs, >= 1.
        seed: seed of the random generator.

    Returns:
        The best KMeansResult; its labels are always the nearest-centroid
        assignment for the returned centroids.

    Raises:
        ValueError: if X has no rows, or k, max_iter or n_init is < 1.
    """
    n, _ = X.shape
    if n == 0:
        raise ValueError("kmeans_pp requires at least one point")
    if k < 1:
        raise ValueError("k must be >= 1")
    if max_iter < 1:
        raise ValueError("max_iter must be >= 1")
    if n_init < 1:
        raise ValueError("n_init must be >= 1")

    rng = np.random.default_rng(seed)
    best: KMeansResult | None = None
    for _ in range(n_init):
        # ── K-means++ seeding ────────────────────────────────────────────
        centers = [X[rng.integers(n)].copy()]
        for _ in range(k - 1):
            D = _min_sq_dist(X, np.array(centers))
            # Probabilities must be non-negative (float error, duplicate points)
            D = np.clip(D, 0.0, None)
            total = D.sum()
            if total < 1e-12:
                # Every point coincides with a chosen centre — uniform draw
                centers.append(X[rng.integers(n)].copy())
            else:
                probs = D / total
                centers.append(X[rng.choice(n, p=probs)].copy())
        centers_arr = np.array(centers)  # (k, d)

        # ── Lloyd iterations ─────────────────────────────────────────────
        labels = np.zeros(n, dtype=np.int32)
        n_iter = 0
        for it in range(max_iter):
            n_iter = it + 1
            dists = _sq_dists(X, centers_arr)  # (n, k)
            new_labels = np.argmin(dists, axis=1).astype(np.int32)
            if it > 0 and np.all(new_labels == labels):
                break  # converged
            labels = new_labels
            # An empty cluster keeps its previous centroid.
            for j in range(k):
                mask = labels == j
                if mask.any():
                    centers_arr[j] = X[mask].mean(axis=0)

        # Final assignment against the final centroids. After convergence
        # this reproduces `labels`; when max_iter is exhausted it keeps the
        # labels coherent with the centroids and the inertia returned.
        final_dists = _sq_dists(X, centers_arr)
        labels = np.argmin(final_dists, axis=1).astype(np.int32)
        inertia = float(np.sum(np.min(final_dists, axis=1)))

        result = KMeansResult(centers_arr, labels, inertia, n_iter)
        if best is None or inertia < best.inertia:
            best = result
    return best  # type: ignore[return-value]
def _sq_dists(X: np.ndarray, C: np.ndarray) -> np.ndarray:
"""Distance² entre chaque point de X et chaque centroïde de C. O(n·k·d)."""
# ||x - c||² = ||x||² + ||c||² - 2·x·cᵀ
X2 = np.sum(X ** 2, axis=1, keepdims=True) # (n, 1)
C2 = np.sum(C ** 2, axis=1, keepdims=True).T # (1, k)
return X2 + C2 - 2.0 * X @ C.T # (n, k)
def _min_sq_dist(X: np.ndarray, C: np.ndarray) -> np.ndarray:
    """For every row of X, the smallest squared distance to any row of C."""
    all_dists = _sq_dists(X, C)
    return all_dists.min(axis=1)
# ─── 2-D PCA (numpy) ─────────────────────────────────────────────────────────
def pca_2d(X: np.ndarray) -> np.ndarray:
    """
    Project X on its first two principal directions.

    The directions are found by power iteration (fixed seed 0) with Hotelling
    deflation between the first and the second. Each output column is then
    min-max rescaled to [0, 1]; a column with zero range is left at 0.
    """
    centred = X - X.mean(axis=0)
    generator = np.random.default_rng(0)
    n_features = centred.shape[1]

    # First component
    axis1 = _power_iter(centred, generator.standard_normal(n_features))
    scores1 = centred @ axis1

    # Remove the first component, then find the second one in the residual
    residual = centred - np.outer(scores1, axis1)
    axis2 = _power_iter(residual, generator.standard_normal(n_features))
    scores2 = residual @ axis2

    coords = np.column_stack([scores1, scores2])
    low = coords.min(axis=0)
    span = coords.max(axis=0) - low
    span[span == 0] = 1.0  # avoid 0/0 on a degenerate axis
    return (coords - low) / span
def _power_iter(X: np.ndarray, v: np.ndarray, n_iter: int = 30) -> np.ndarray:
"""Power iteration : trouve le premier vecteur propre de XᵀX."""
for _ in range(n_iter):
v = X.T @ (X @ v)
norm = np.linalg.norm(v)
if norm < 1e-12:
break
v /= norm
return v
# ─── Convex hull per cluster ─────────────────────────────────────────────────
def compute_hulls(coords_2d: np.ndarray, labels: np.ndarray,
                  k: int, min_pts: int = 4) -> dict[int, list[list[float]]]:
    """
    Compute the outline of every cluster in the 2-D projection.

    Args:
        coords_2d: (n_points, 2) coordinates.
        labels: (n_points,) cluster index of each point.
        k: number of clusters; indices 0..k-1 are examined.
        min_pts: minimum number of points required to attempt a convex hull.

    Returns:
        {cluster_idx: [[x, y], ...]} where every non-empty polygon is closed
        (its first point is repeated at the end). A cluster with fewer than
        ``min_pts`` points gets a square centred on its mean; an empty cluster
        has no entry; a cluster whose hull cannot be computed maps to [].
    """
    hulls: dict[int, list[list[float]]] = {}
    for j in range(k):
        pts = coords_2d[labels == j]
        if len(pts) == 0:
            continue
        if len(pts) < min_pts:
            # Too few points for a hull: square of half-side r around the mean
            cx, cy = (float(c) for c in pts.mean(axis=0))
            r = max(0.01, float(pts.std(axis=0).max()))
            hulls[j] = [
                [cx - r, cy - r], [cx + r, cy - r],
                [cx + r, cy + r], [cx - r, cy + r],
                [cx - r, cy - r],  # close the ring, like the hull branch
            ]
            continue
        try:
            hull = ConvexHull(pts)
        except Exception:
            # Best effort: a hull that cannot be built yields an empty outline.
            hulls[j] = []
            continue
        ring = pts[hull.vertices].tolist()
        ring.append(ring[0])  # close the polygon
        hulls[j] = ring
    return hulls
# ─── Naming and scoring ───────────────────────────────────────────────────────
def name_cluster(centroid: np.ndarray, raw_stats: dict) -> str:
    """Return a human-readable name from the dominant features of a centroid.

    Rules are evaluated top to bottom and the first match wins.

    Args:
        centroid: centroid in the normalised [0,1] feature space, indexed
            like FEATURES. Indices 20 and above are optional and fall back to
            neutral defaults when the vector is shorter.
        raw_stats: un-normalised cluster statistics; ``mean_ttl`` and
            ``mean_mss`` are read, a missing or None value counting as 0.
    """
    s = centroid
    n = len(s)
    ttl_raw = float(raw_stats.get("mean_ttl") or 0)
    mss_raw = float(raw_stats.get("mean_mss") or 0)
    country_risk_v = s[20] if n > 20 else 0.0
    asn_cloud = s[21] if n > 21 else 0.0
    accept_lang = s[22] if n > 22 else 1.0
    accept_enc = s[23] if n > 23 else 1.0
    sec_fetch = s[24] if n > 24 else 0.0
    hdr_count = s[25] if n > 25 else 0.5
    hfp_popular = s[26] if n > 26 else 0.5
    hfp_rotating = s[27] if n > 27 else 0.0

    # Pure scanner: no browser header, very few headers
    if accept_lang < 0.15 and accept_enc < 0.15 and hdr_count < 0.25:
        return "🤖 Scanner pur (no headers)"
    # Rotating fingerprint: bot that keeps changing its header profile
    if hfp_rotating > 0.6:
        return "🔄 Bot fingerprint tournant"
    # Very rare fingerprint: one-off hand-made bot
    if hfp_popular < 0.15:
        return "🕵️ Fingerprint rare suspect"
    # Masscan scanners. mss_raw is a mean and usually not integral, so the
    # window is an interval comparison rather than a range() membership test.
    if 0.16 < s[0] < 0.25 and 1440 <= mss_raw < 1460 and s[2] > 0.25:
        return "🤖 Masscan Scanner"
    # Aggressive offensive bots (high velocity and fuzzing)
    if s[4] > 0.40 and s[5] > 0.3:
        return "🤖 Bot agressif"
    # Bot imitating a browser without the real browser headers
    if s[15] > 0.40 and sec_fetch < 0.2 and accept_lang < 0.3:
        return "🤖 Bot UA simulé"
    # Very high-risk country on cloud infrastructure
    if country_risk_v > 0.75 and asn_cloud > 0.5:
        return "🌏 Source pays risqué"
    # Cloud + UA-CH mismatch
    if s[15] > 0.50 and asn_cloud > 0.70:
        return "☁️ Bot cloud UA-CH"
    if s[15] > 0.60:
        return "🤖 UA-CH Mismatch"
    # Headless browser (Puppeteer/Playwright): headless but sends Sec-Fetch
    if s[6] > 0.50 and sec_fetch > 0.5:
        return "🤖 Headless Browser"
    if s[6] > 0.50:
        return "🤖 Headless (no Sec-Fetch)"
    # Pure cloud (possibly a legitimate CDN/crawler)
    if asn_cloud > 0.85:
        return "☁️ Infrastructure cloud"
    # High-risk country with no other signal
    if country_risk_v > 0.60:
        return "🌏 Trafic suspect (pays)"
    # Legitimate browser: every positive signal, including a popular fingerprint
    if (accept_lang > 0.7 and accept_enc > 0.7 and sec_fetch > 0.5
            and hdr_count > 0.5 and hfp_popular > 0.5):
        return "🌐 Navigateur légitime"
    # OS fingerprinting
    if s[3] > 0.85 and ttl_raw > 120:
        return "🖥️ Windows"
    if 0.22 < s[0] < 0.28 and mss_raw > 1400:
        return "🐧 Linux"
    if 0 < mss_raw < 1380:
        return "🌐 Tunnel réseau"
    if s[4] > 0.40:
        return "⚡ Trafic rapide"
    if s[4] < 0.10 and asn_cloud < 0.30:
        return "✅ Trafic sain"
    return "📊 Cluster mixte"
def risk_score_from_centroid(centroid: np.ndarray) -> float:
    """
    Weighted risk score in [0,1] computed from a centroid in the normalised
    feature space.

    Uses the 30-feature layout (no ML anomaly score); the weights sum to 1.0.
    Features from index 20 upward are optional: when the vector is too short
    a neutral default is substituted.
    """
    s = centroid
    n = len(s)

    def optional(idx: int, default: float):
        # Feature value when present, otherwise its neutral default.
        return s[idx] if n > idx else default

    # (weight, signal) pairs, accumulated in this exact order.
    weighted_signals = (
        (0.14, s[5]),                       # fuzzing
        (0.17, s[15]),                      # UA-CH mismatch (strong browser-impersonation signal)
        (0.10, s[6]),                       # headless
        (0.09, s[4]),                       # velocity (rps)
        (0.07, s[8]),                       # IP-ID zero
        (0.09, optional(20, 0.0)),          # source country risk
        (0.06, optional(21, 0.0)),          # cloud/VPN infrastructure
        (0.04, 1.0 - optional(22, 1.0)),    # missing Accept-Language
        (0.04, 1.0 - optional(23, 1.0)),    # missing Accept-Encoding
        (0.04, 1.0 - optional(24, 0.0)),    # missing Sec-Fetch
        (0.04, 1.0 - optional(25, 0.5)),    # very few headers
        (0.06, 1.0 - optional(26, 0.5)),    # rare fingerprint = suspect
        (0.06, optional(27, 0.0)),          # fingerprint rotation = bot
    )
    first_weight, first_signal = weighted_signals[0]
    total = first_weight * first_signal
    for weight, signal in weighted_signals[1:]:
        total = total + weight * signal
    return float(np.clip(total, 0.0, 1.0))
# ─── Gradient de couleur basé sur le score de non-humanité ──────────────────
# Le score [0,1] est mappé sur un dégradé HSL traversant tout le spectre :
# bleu (humain) → cyan → vert → jaune-vert → jaune → orange → rouge (bot pur)
# Hue : 220° (bleu froid) → 0° (rouge vif) en passant par tout l'arc chromatique.
def _hsl_to_hex(h: float, s: float, l: float) -> str:
"""Convertit HSL (h:0-360, s:0-100, l:0-100) en chaîne '#rrggbb'."""
s /= 100.0
l /= 100.0
c = (1.0 - abs(2.0 * l - 1.0)) * s
x = c * (1.0 - abs((h / 60.0) % 2.0 - 1.0))
m = l - c / 2.0
if h < 60: r, g, b = c, x, 0.0
elif h < 120: r, g, b = x, c, 0.0
elif h < 180: r, g, b = 0.0, c, x
elif h < 240: r, g, b = 0.0, x, c
elif h < 300: r, g, b = x, 0.0, c
else: r, g, b = c, 0.0, x
ri, gi, bi = int((r + m) * 255), int((g + m) * 255), int((b + m) * 255)
return f"#{ri:02x}{gi:02x}{bi:02x}"
def risk_to_gradient_color(risk: float) -> str:
    """
    Map a non-humanity score in [0,1] on a continuous HSL gradient.

        risk = 0.0  → hue 220° (cold blue — legitimate human traffic)
        risk = 0.25 → hue 165° (cyan-green — slight suspicious signal)
        risk = 0.50 → hue 110° (green-yellow — mixed behaviour)
        risk = 0.75 → hue 55°  (yellow-orange — probable bot)
        risk = 1.0  → hue 0°   (bright red — confirmed bot)

    Scores outside [0,1] are clipped. Saturation rises from 70% to 90% and
    lightness drops from 58% to 48% as the risk grows.
    """
    level = float(np.clip(risk, 0.0, 1.0))
    return _hsl_to_hex(
        (1.0 - level) * 220.0,   # hue: 220° → 0°
        70.0 + level * 20.0,     # saturation: 70% → 90%
        58.0 - level * 10.0,     # lightness: 58% → 48% (darker = more alarming)
    )

View File

@ -0,0 +1,312 @@
"""
Services de réputation IP - Bases de données publiques sans clé API
"""
import httpx
from typing import Optional, Dict, Any
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
# Timeout pour les requêtes HTTP
HTTP_TIMEOUT = 10.0
class IPReputationService:
"""
Service de réputation IP utilisant des bases de données publiques gratuites
"""
def __init__(self):
self.http_client = httpx.AsyncClient(timeout=HTTP_TIMEOUT)
# Sources de réputation (sans clé API)
self.sources = {
'ip_api': 'http://ip-api.com/json/{ip}',
'ipinfo': 'https://ipinfo.io/{ip}/json',
}
async def get_reputation(self, ip: str) -> Dict[str, Any]:
"""
Récupère la réputation d'une IP depuis toutes les sources disponibles
Args:
ip: Adresse IP à vérifier
Returns:
Dict avec les informations de réputation agrégées
"""
results = {
'ip': ip,
'timestamp': datetime.utcnow().isoformat(),
'sources': {},
'aggregated': {
'is_proxy': False,
'is_hosting': False,
'is_vpn': False,
'is_tor': False,
'threat_score': 0,
'threat_level': 'unknown',
'country': None,
'asn': None,
'org': None,
'warnings': []
}
}
# Interroge chaque source
for source_name, url_template in self.sources.items():
try:
url = url_template.format(ip=ip)
response = await self.http_client.get(url)
if response.status_code == 200:
data = response.json()
results['sources'][source_name] = self._parse_source_data(source_name, data)
else:
logger.warning(f"Source {source_name} returned status {response.status_code}")
results['sources'][source_name] = {'error': f'Status {response.status_code}'}
except httpx.TimeoutException:
logger.warning(f"Timeout for source {source_name}")
results['sources'][source_name] = {'error': 'Timeout'}
except Exception as e:
logger.error(f"Error fetching from {source_name}: {str(e)}")
results['sources'][source_name] = {'error': str(e)}
# Agrège les résultats
results['aggregated'] = self._aggregate_results(results['sources'])
return results
def _parse_source_data(self, source: str, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Parse les données d'une source spécifique
"""
if source == 'ip_api':
return self._parse_ip_api(data)
elif source == 'ipinfo':
return self._parse_ipinfo(data)
return data
def _parse_ip_api(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Parse les données de IP-API.com
Response example:
{
"status": "success",
"country": "France",
"countryCode": "FR",
"region": "IDF",
"regionName": "Île-de-France",
"city": "Paris",
"zip": "75001",
"lat": 48.8534,
"lon": 2.3488,
"timezone": "Europe/Paris",
"isp": "OVH SAS",
"org": "OVH SAS",
"as": "AS16276 OVH SAS",
"asname": "OVH",
"mobile": false,
"proxy": false,
"hosting": true,
"query": "51.15.0.1"
}
"""
if data.get('status') != 'success':
return {'error': data.get('message', 'Unknown error')}
# Extraire l'ASN
asn_full = data.get('as', '')
asn_number = None
asn_org = None
if asn_full:
parts = asn_full.split(' ', 1)
if len(parts) >= 1:
asn_number = parts[0].replace('AS', '')
if len(parts) >= 2:
asn_org = parts[1]
return {
'country': data.get('country'),
'country_code': data.get('countryCode'),
'region': data.get('regionName'),
'city': data.get('city'),
'isp': data.get('isp'),
'org': data.get('org'),
'asn': asn_number,
'asn_org': asn_org,
'is_proxy': data.get('proxy', False),
'is_hosting': data.get('hosting', False),
'is_mobile': data.get('mobile', False),
'timezone': data.get('timezone'),
'lat': data.get('lat'),
'lon': data.get('lon'),
'query': data.get('query')
}
def _parse_ipinfo(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Parse les données de IPinfo.io
Response example:
{
"ip": "51.15.0.1",
"city": "Paris",
"region": "Île-de-France",
"country": "FR",
"loc": "48.8534,2.3488",
"org": "AS16276 OVH SAS",
"postal": "75001",
"timezone": "Europe/Paris",
"readme": "https://ipinfo.io/missingauth"
}
"""
# Extraire l'ASN
org_full = data.get('org', '')
asn_number = None
asn_org = None
if org_full:
parts = org_full.split(' ', 1)
if len(parts) >= 1:
asn_number = parts[0].replace('AS', '')
if len(parts) >= 2:
asn_org = parts[1]
# Extraire lat/lon
loc = data.get('loc', '')
lat = None
lon = None
if loc:
coords = loc.split(',')
if len(coords) == 2:
lat = float(coords[0])
lon = float(coords[1])
return {
'ip': data.get('ip'),
'city': data.get('city'),
'region': data.get('region'),
'country': data.get('country'),
'postal': data.get('postal'),
'timezone': data.get('timezone'),
'asn': asn_number,
'asn_org': asn_org,
'org': data.get('org'),
'lat': lat,
'lon': lon
}
def _aggregate_results(self, sources: Dict[str, Any]) -> Dict[str, Any]:
"""
Agrège les résultats de toutes les sources
Logique d'agrégation:
- is_proxy: true si au moins une source le détecte
- is_hosting: true si au moins une source le détecte
- threat_score: basé sur les détections proxy/hosting/vpn/tor
- threat_level: low/medium/high/critical basé sur le score
"""
aggregated = {
'is_proxy': False,
'is_hosting': False,
'is_vpn': False,
'is_tor': False,
'threat_score': 0,
'threat_level': 'unknown',
'country': None,
'country_code': None,
'asn': None,
'asn_org': None,
'org': None,
'city': None,
'warnings': []
}
threat_score = 0
for source_name, source_data in sources.items():
if 'error' in source_data:
continue
# Proxy detection
if source_data.get('is_proxy'):
aggregated['is_proxy'] = True
threat_score += 30
aggregated['warnings'].append(f'{source_name}: Proxy détecté')
# Hosting detection
if source_data.get('is_hosting'):
aggregated['is_hosting'] = True
threat_score += 20
aggregated['warnings'].append(f'{source_name}: Hébergement cloud/datacenter')
# VPN detection (si disponible)
if source_data.get('is_vpn'):
aggregated['is_vpn'] = True
threat_score += 40
aggregated['warnings'].append(f'{source_name}: VPN détecté')
# Tor detection (si disponible)
if source_data.get('is_tor'):
aggregated['is_tor'] = True
threat_score += 50
aggregated['warnings'].append(f'{source_name}: Exit node Tor détecté')
# Infos géographiques (prend la première disponible)
if not aggregated['country'] and source_data.get('country'):
aggregated['country'] = source_data.get('country')
if not aggregated['country_code'] and source_data.get('country_code'):
aggregated['country_code'] = source_data.get('country_code')
# ASN (prend la première disponible)
if not aggregated['asn'] and source_data.get('asn'):
aggregated['asn'] = source_data.get('asn')
if not aggregated['asn_org'] and source_data.get('asn_org'):
aggregated['asn_org'] = source_data.get('asn_org')
# Organisation/ISP
if not aggregated['org'] and source_data.get('org'):
aggregated['org'] = source_data.get('org')
# Ville
if not aggregated['city'] and source_data.get('city'):
aggregated['city'] = source_data.get('city')
# Calcul du niveau de menace
aggregated['threat_score'] = min(100, threat_score)
if threat_score >= 80:
aggregated['threat_level'] = 'critical'
elif threat_score >= 60:
aggregated['threat_level'] = 'high'
elif threat_score >= 40:
aggregated['threat_level'] = 'medium'
elif threat_score >= 20:
aggregated['threat_level'] = 'low'
else:
aggregated['threat_level'] = 'clean'
return aggregated
async def close(self):
    """Close the HTTP client held in ``self.http_client``."""
    await self.http_client.aclose()
# Module-level slot holding the shared service instance (created lazily)
_reputation_service: Optional[IPReputationService] = None


def get_reputation_service() -> IPReputationService:
    """Return the shared IPReputationService, building it on first call."""
    global _reputation_service
    if _reputation_service is not None:
        return _reputation_service
    _reputation_service = IPReputationService()
    return _reputation_service

View File

@ -0,0 +1,436 @@
"""
Service de fingerprinting OS par signature TCP — approche multi-signal inspirée de p0f.
Signaux utilisés (par ordre de poids) :
1. TTL initial estimé (→ famille OS : Linux/Mac=64, Windows=128, Cisco/BSD=255)
2. MSS (→ type de réseau : Ethernet=1460, PPPoE=1452, VPN=1380-1420)
3. Taille de fenêtre (→ signature OS précise)
4. Facteur d'échelle (→ affine la version du kernel/stack TCP)
Références :
- p0f v3 (Michal Zalewski) — passive OS fingerprinting
- Nmap OS detection (Gordon Lyon)
- "OS Fingerprinting Revisited" (Beverly, 2004)
- "Passive OS fingerprinting" (Orebaugh, Ramirez)
- Recherche sur Masscan/ZMap : signatures SYN craftées connues
"""
from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Optional
# ─── Constants ────────────────────────────────────────────────────────────────
# Candidate initial TTL values, in ascending order
_INITIAL_TTLS = (64, 128, 255)
# MSS → network path type (MTU minus 40 bytes of IP+TCP headers)
_MSS_PATH: list[tuple[range, str]] = [
    (range(1461, 9001), "Ethernet/Jumbo"),  # jumbo frames (CDN/datacenter)
    (range(1460, 1461), "Ethernet directe"),  # standard MTU 1500
    (range(1453, 1460), "Ethernet directe"),  # slightly reduced (padding)
    (range(1452, 1453), "PPPoE/DSL"),  # MTU 1492
    (range(1436, 1452), "PPPoE/DSL ajusté"),  # DSL variations
    (range(1420, 1436), "VPN léger"),  # WireGuard / IPsec transport
    (range(1380, 1420), "VPN/Tunnel"),  # OpenVPN / L2TP
    (range(1300, 1380), "VPN double ou mobile"),
    (range(0, 1300), "Lien bas débit / GPRS"),
]
# ─── OS signature database ────────────────────────────────────────────────────
#
# Format: each entry is a dict with:
#   ttl   : int            — expected initial TTL (64 | 128 | 255)
#   win   : set[int]|None  — expected window sizes (None = ignore)
#   scale : set[int]|None  — expected window-scale factors (None = ignore)
#   mss   : set[int]|None  — expected MSS values (None = ignore)
#   name  : str            — displayed label
#   conf  : float          — base confidence weight (0–1)
#   bot   : bool           — known scanning tool / bot
#
# Entry order matters: on equal scores the earlier entry wins in fingerprint_os.
_SIGNATURES: list[dict] = [
    # ══════════════════════════════════════════════════════
    # KNOWN SCANNING TOOLS AND BOTS (highest priority)
    # ══════════════════════════════════════════════════════
    # Masscan / custom scanner with a modified Linux stack (PPPoE MSS=1452)
    # Pattern very common in the data: ~111k requests, spoofed macOS/Windows UA
    {
        "ttl": 64, "win": {5808}, "scale": {4}, "mss": {1452},
        "name": "Bot-Scanner/Masscan", "conf": 0.97, "bot": True,
    },
    # Masscan TTL=255 (direct mode, no hop)
    {
        "ttl": 255, "win": {1024}, "scale": {0}, "mss": None,
        "name": "Bot-ZMap/Masscan", "conf": 0.96, "bot": True,
    },
    # Mirai variant (small window, no scale, Linux TTL)
    {
        "ttl": 64, "win": {1024, 2048}, "scale": {0}, "mss": {1460},
        "name": "Bot-Mirai", "conf": 0.92, "bot": True,
    },
    # Mirai variant (small window, Windows TTL)
    {
        "ttl": 128, "win": {1024, 2048}, "scale": {0}, "mss": {1460},
        "name": "Bot-Mirai/Win", "conf": 0.92, "bot": True,
    },
    # Scapy / hand-crafted packets (window exactly 8192 + TTL 64 + no scale)
    {
        "ttl": 64, "win": {8192}, "scale": {0}, "mss": {1460},
        "name": "Bot-Scapy/Forge", "conf": 0.85, "bot": True,
    },
    # Window of exactly 1 with TTL 64, labelled ZMap; scale and MSS are ignored.
    # NOTE(review): the previous comment described an "Nmap SYN scan
    # (window=1024, MSS=1460, TTL=64 or 128)", which does not match this
    # entry — confirm which one was intended.
    {
        "ttl": 64, "win": {1}, "scale": None, "mss": None,
        "name": "Bot-ZMap", "conf": 0.95, "bot": True,
    },
    # ══════════════════════════════════════════════════════
    # WINDOWS
    # ══════════════════════════════════════════════════════
    # Windows 10 / 11 — standard signature (direct LAN)
    {
        "ttl": 128, "win": {64240}, "scale": {8}, "mss": {1460},
        "name": "Windows 10/11", "conf": 0.93, "bot": False,
    },
    # Windows 10/11 — behind a VPN/proxy (reduced MSS)
    {
        "ttl": 128, "win": {64240}, "scale": {8}, "mss": {1380, 1400, 1412, 1420, 1440},
        "name": "Windows 10/11 (VPN)", "conf": 0.90, "bot": False,
    },
    # Windows Server 2019/2022 — large window
    {
        "ttl": 128, "win": {65535, 131072}, "scale": {8, 9}, "mss": {1460},
        "name": "Windows Server", "conf": 0.88, "bot": False,
    },
    # Windows 7/8.1
    {
        "ttl": 128, "win": {8192, 65535}, "scale": {4, 8}, "mss": {1460},
        "name": "Windows 7/8", "conf": 0.83, "bot": False,
    },
    # Generic Windows (TTL=128, scale=8, any MSS)
    {
        "ttl": 128, "win": None, "scale": {8}, "mss": None,
        "name": "Windows", "conf": 0.70, "bot": False,
    },
    # ══════════════════════════════════════════════════════
    # ANDROID (modern BBRv2 / CUBIC stack)
    # ══════════════════════════════════════════════════════
    # Android 10+ — scale=9 or 10, large window (BBRv2)
    {
        "ttl": 64, "win": {65535, 131072, 42340, 35844}, "scale": {9, 10}, "mss": {1460},
        "name": "Android 10+", "conf": 0.82, "bot": False,
    },
    # Android through a TTL=128 proxy (Facebook, TikTok apps etc. going through infra)
    {
        "ttl": 128, "win": {62727, 65535}, "scale": {7}, "mss": {1460},
        "name": "Android/App (proxy)", "conf": 0.75, "bot": False,
    },
    # Android behind a VPN (reduced MSS)
    {
        "ttl": 64, "win": {65535, 59640, 63940}, "scale": {8, 9, 10}, "mss": {1380, 1390, 1400, 1418, 1420},
        "name": "Android (VPN/mobile)", "conf": 0.78, "bot": False,
    },
    # ══════════════════════════════════════════════════════
    # iOS / macOS
    # ══════════════════════════════════════════════════════
    # iOS 14+ / macOS Monterey+ — scale=6, win=65535 (XNU signature)
    {
        "ttl": 64, "win": {65535, 32768}, "scale": {6}, "mss": {1460},
        "name": "iOS/macOS", "conf": 0.87, "bot": False,
    },
    # macOS Sonoma+ / iOS 17+ (scale=9, larger window)
    {
        "ttl": 64, "win": {65535, 32768}, "scale": {9}, "mss": {1460},
        "name": "macOS Sonoma+/iOS 17+", "conf": 0.83, "bot": False,
    },
    # macOS behind a VPN (reduced MSS)
    {
        "ttl": 64, "win": {65535}, "scale": {6, 9}, "mss": {1380, 1400, 1412, 1436},
        "name": "iOS/macOS (VPN)", "conf": 0.80, "bot": False,
    },
    # ══════════════════════════════════════════════════════
    # LINUX (desktop/server)
    # ══════════════════════════════════════════════════════
    # Linux 5.x+ — scale=7, win=64240 or 65535 (kernel ≥ 4.19)
    # NOTE(review): the set below holds 65320, not 65535 — confirm the value.
    {
        "ttl": 64, "win": {64240, 65320}, "scale": {7}, "mss": {1460},
        "name": "Linux 5.x+", "conf": 0.86, "bot": False,
    },
    # Linux 4.x / ChromeOS
    {
        "ttl": 64, "win": {29200, 65535, 43690, 32120}, "scale": {7}, "mss": {1460},
        "name": "Linux 4.x/ChromeOS", "conf": 0.83, "bot": False,
    },
    # Linux behind a VPN (reduced MSS)
    {
        "ttl": 64, "win": {64240, 65535, 42600}, "scale": {7}, "mss": {1380, 1400, 1420, 1436},
        "name": "Linux (VPN)", "conf": 0.80, "bot": False,
    },
    # Linux 2.6.x (old — win=5840/14600)
    {
        "ttl": 64, "win": {5840, 14600, 16384}, "scale": {4, 5}, "mss": {1460},
        "name": "Linux 2.6", "conf": 0.78, "bot": False,
    },
    # ══════════════════════════════════════════════════════
    # BSD / NETWORK EQUIPMENT / CDN
    # ══════════════════════════════════════════════════════
    # FreeBSD / OpenBSD (initial TTL=64)
    {
        "ttl": 64, "win": {65535}, "scale": {6}, "mss": {512, 1460},
        "name": "FreeBSD/OpenBSD", "conf": 0.74, "bot": False,
    },
    # Cisco IOS / network equipment (initial TTL=255, small window)
    {
        "ttl": 255, "win": {4096, 4128, 8760}, "scale": {0, 1, 2}, "mss": {512, 1460},
        "name": "Cisco/Réseau", "conf": 0.87, "bot": False,
    },
    # CDN / Applebot (TTL=255, jumbo MSS, large window)
    {
        "ttl": 255, "win": {26883, 65535, 59640}, "scale": {7, 8}, "mss": {8373, 8365, 1460},
        "name": "CDN/Applebot (jumbo)", "conf": 0.85, "bot": False,
    },
    # Generic BSD/Unix (TTL=255)
    {
        "ttl": 255, "win": None, "scale": {6, 7, 8}, "mss": {1460},
        "name": "BSD/Unix", "conf": 0.68, "bot": False,
    },
]
# ─── Data classes ──────────────────────────────────────────────────────────────
@dataclass
class OsFingerprint:
    """Outcome of a TCP-signature OS fingerprint, as built by ``fingerprint_os``."""
    os_name: str  # matched signature label, a TTL-only fallback label, or "Unknown"
    initial_ttl: int  # estimated initial TTL; 0 when it could not be estimated
    hop_count: int  # initial_ttl minus the observed TTL; -1 when unknown
    confidence: float  # score in [0, 1], rounded to two decimals
    is_bot_tool: bool  # True when the matched signature is flagged as a bot/scanner
    network_path: str  # network path label inferred from the MSS
@dataclass
class SpoofResult:
    """Verdict produced by ``detect_spoof`` when comparing TCP and User-Agent OS."""
    is_spoof: bool  # True when the TCP fingerprint contradicts the declared OS
    is_bot_tool: bool  # True when the fingerprint matched a known scanning tool
    reason: str  # short explanation of the verdict
# ─── Fonctions utilitaires ─────────────────────────────────────────────────────
def _estimate_initial_ttl(observed_ttl: int) -> tuple[int, int]:
    """Estimate the sender's initial TTL and the hop count from an observed TTL.

    Each standard initial TTL in ``_INITIAL_TTLS`` is tried in order; the first
    one that is >= ``observed_ttl`` and implies at most 45 hops is kept.

    Args:
        observed_ttl: TTL read from the received packet.

    Returns:
        ``(initial_ttl, hop_count)``. ``(0, -1)`` when ``observed_ttl`` is
        outside 1..255. When no candidate is within 45 hops, the fallback is
        ``(255, 255 - observed_ttl)``.
    """
    # The IPv4 TTL is an 8-bit field: anything outside 1..255 is invalid input.
    # Without the upper bound, values above 255 produced a negative hop count.
    if not 0 < observed_ttl <= 255:
        return 0, -1
    for initial in _INITIAL_TTLS:
        hop = initial - observed_ttl
        if 0 <= hop <= 45:
            return initial, hop
    # No standard initial TTL within 45 hops: assume the largest one
    return 255, 255 - observed_ttl
def _infer_network_path(mss: int) -> str:
    """Map an MSS value to the label of the first ``_MSS_PATH`` range containing it.

    Non-positive values, and values covered by no range, yield "Inconnu".
    """
    if mss > 0:
        matches = (label for span, label in _MSS_PATH if mss in span)
        return next(matches, "Inconnu")
    return "Inconnu"
def _os_family(os_name: str) -> str:
"""Réduit un nom OS détaillé à sa famille pour comparaison avec l'UA."""
n = os_name.lower()
if "windows" in n:
return "Windows"
if "android" in n:
return "Android"
if "ios" in n or "macos" in n or "iphone" in n or "ipad" in n:
return "Apple"
if "linux" in n or "chromeos" in n:
return "Linux"
if "bsd" in n or "cisco" in n or "cdn" in n or "réseau" in n:
return "Network"
if "bot" in n or "scanner" in n or "mirai" in n or "zmap" in n:
return "Bot"
return "Unknown"
def _ua_os_family(declared_os: str) -> str:
"""Réduit l'OS déclaré (UA) à sa famille."""
mapping = {
"Windows": "Windows",
"Android": "Android",
"iOS": "Apple",
"macOS": "Apple",
"Linux": "Linux",
"ChromeOS": "Linux",
"BSD": "Network",
}
return mapping.get(declared_os, "Unknown")
# ─── Fonctions publiques ───────────────────────────────────────────────────────
def declared_os_from_ua(ua: str) -> str:
    """Infer the operating system a User-Agent string claims to run on.

    Matching is case-insensitive and ordered: Android is tested before Linux,
    and iPhone/iPad before macOS, because those User-Agents also contain the
    later marker.

    Args:
        ua: Raw User-Agent header value; ``None`` or empty is accepted.

    Returns:
        One of "Windows", "Android", "iOS", "macOS", "ChromeOS", "Linux",
        "BSD" or "Unknown".
    """
    ul = (ua or "").lower()
    if not ul:
        return "Unknown"
    if "windows nt" in ul:
        return "Windows"
    if "android" in ul:
        return "Android"
    if "iphone" in ul or "ipad" in ul:
        return "iOS"
    if "mac os x" in ul or "macos" in ul:
        return "macOS"
    # "cros" must be a whole token: as a bare substring it also matches
    # "microsoft" (e.g. "Microsoft-CryptoAPI/10.0"), which is not ChromeOS.
    if "chromeos" in ul or re.search(r"\bcros\b", ul):
        return "ChromeOS"
    if "linux" in ul:
        return "Linux"
    if "freebsd" in ul or "openbsd" in ul or "netbsd" in ul:
        return "BSD"
    return "Unknown"
def fingerprint_os(ttl: int, win: int, scale: int, mss: int) -> OsFingerprint:
    """Fingerprint the sender's OS from TCP parameters using weighted scoring.

    Signal weights:
      - initial TTL  40 %  (strict filter: only same-TTL signatures are scored)
      - MSS          30 %
      - TCP window   20 %
      - window scale 10 %

    A signature field set to None is ignored; a mismatch on a constrained
    field costs a smaller penalty (-0.12 / -0.08 / -0.04). The best-scoring
    signature is kept (the earliest one on ties) when its score reaches 0.38;
    otherwise a low-confidence, TTL-only label is returned.
    """
    initial_ttl, hop_count = _estimate_initial_ttl(ttl)
    network_path = _infer_network_path(mss)

    if initial_ttl == 0:
        return OsFingerprint(
            os_name="Unknown",
            initial_ttl=0,
            hop_count=-1,
            confidence=0.0,
            is_bot_tool=False,
            network_path=network_path,
        )

    # (signature key, observed value, bonus on match, penalty on mismatch),
    # listed in the order the contributions are summed
    signals = (
        ("mss", mss, 0.30, -0.12),
        ("win", win, 0.20, -0.08),
        ("scale", scale, 0.10, -0.04),
    )
    long_path = hop_count > 30  # unusually long route: small penalty

    top_score: float = -1.0
    top_sig: Optional[dict] = None
    for candidate in _SIGNATURES:
        if candidate["ttl"] != initial_ttl:
            continue
        total: float = 0.40  # base score for the TTL match
        for key, observed, bonus, penalty in signals:
            expected = candidate[key]
            if expected is None:
                continue
            total += bonus if observed in expected else penalty
        if long_path:
            total -= 0.05
        # Strict comparison keeps the earliest signature on equal scores
        if total > top_score:
            top_score, top_sig = total, candidate

    if not top_sig or top_score < 0.38:
        # Fallback: classification from the TTL alone (minimal confidence)
        ttl_only_labels = {64: "Linux/macOS", 128: "Windows", 255: "Cisco/BSD"}
        return OsFingerprint(
            os_name=ttl_only_labels.get(initial_ttl, "Unknown"),
            initial_ttl=initial_ttl,
            hop_count=hop_count,
            confidence=round(0.40 * 0.65, 2),
            is_bot_tool=False,
            network_path=network_path,
        )

    # Final confidence = match score weighted by the signature's own confidence
    weighted = top_score * top_sig["conf"]
    clamped = min(max(weighted, 0.0), 1.0)
    return OsFingerprint(
        os_name=top_sig["name"],
        initial_ttl=initial_ttl,
        hop_count=hop_count,
        confidence=round(clamped, 2),
        is_bot_tool=top_sig["bot"],
        network_path=network_path,
    )
def detect_spoof(fp: OsFingerprint, declared_os: str) -> SpoofResult:
    """Compare the TCP fingerprint with the OS declared by the User-Agent.

    Rules, applied in order:
      1. Known scanning tool → spoof/bot, whatever the UA says.
      2. Confidence below 0.50, or either side "Unknown" → not determinable.
      3. Network-equipment fingerprint with a non-network UA family → spoof.
      4. Either family unresolved → not determinable.
      5. Android UA over a Windows-family "proxy" signature → tolerated.
      6. Families differ → spoof; otherwise coherent.
    """

    def verdict(spoofed: bool, why: str, *, bot: bool = False) -> SpoofResult:
        return SpoofResult(is_spoof=spoofed, is_bot_tool=bot, reason=why)

    if fp.is_bot_tool:
        return verdict(True, f"Outil de scan détecté ({fp.os_name})", bot=True)

    if fp.confidence < 0.50 or "Unknown" in (fp.os_name, declared_os):
        return verdict(False, "Corrélation insuffisante")

    tcp_family = _os_family(fp.os_name)
    ua_family = _ua_os_family(declared_os)

    # A network-equipment stack behind a UA that declares another OS family is suspicious
    if tcp_family == "Network" and ua_family not in ("Network", "Unknown"):
        return verdict(True, f"Équipement réseau/CDN (TCP) vs {declared_os} (UA)")

    if "Unknown" in (tcp_family, ua_family):
        return verdict(False, "OS indéterminé")

    # Mobile app routed through proxy infrastructure: noted, not flagged.
    # NOTE(review): this needs a signature name containing both "windows" and
    # "proxy"; none of the names in _SIGNATURES does — confirm intent.
    through_proxy = (
        declared_os == "Android"
        and tcp_family == "Windows"
        and "proxy" in fp.os_name.lower()
    )
    if through_proxy:
        return verdict(False, "App mobile via proxy infra")

    if tcp_family == ua_family:
        return verdict(False, "Cohérent")
    return verdict(True, f"TCP→{tcp_family} vs UA→{ua_family}")

View File

@ -0,0 +1,18 @@
import pytest
from unittest.mock import MagicMock, patch
from fastapi.testclient import TestClient
@pytest.fixture
def mock_db():
    """Provide a MagicMock database whose ``query`` returns no rows by default."""
    empty_result = MagicMock(result_rows=[])
    fake_db = MagicMock()
    fake_db.query.return_value = empty_result
    return fake_db
@pytest.fixture
def client(mock_db):
    """Yield ``(test_client, mock_db)`` with ``backend.database.db`` patched.

    The patch stays active for the whole lifetime of the TestClient.
    """
    with patch("backend.database.db", mock_db):
        # Imported only once the patch is active — presumably so the app
        # picks up the mock instead of a real connection; verify against
        # how backend.main binds ``db``.
        from backend.main import app
        with TestClient(app) as c:
            yield c, mock_db

View File

@ -0,0 +1,10 @@
def test_audit_log_post(client):
    """Smoke test: POST /api/audit/logs answers 200, 422 or 404."""
    http, _db = client
    response = http.post("/api/audit/logs?action=test_action&user=testuser")
    allowed = (200, 422, 404)
    assert response.status_code in allowed
def test_audit_log_get(client):
    """Smoke test: GET /api/audit/logs answers 200 or 404."""
    http, _db = client
    response = http.get("/api/audit/logs?hours=1")
    allowed = (200, 404)
    assert response.status_code in allowed

View File

@ -0,0 +1,70 @@
"""Tests for the detections routes and helper functions."""
import pytest
def test_detections_list_endpoint(client):
    """Smoke test: GET /api/detections answers with one of the accepted statuses."""
    http, db = client
    db.query.return_value.result_rows = [(50,)]  # row served for the count query
    response = http.get("/api/detections")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed
def test_detections_list_with_filters(client):
    """Smoke test: GET /api/detections accepts filter query parameters."""
    http, db = client
    db.query.return_value.result_rows = [(0,)]
    response = http.get("/api/detections?threat_level=CRITICAL&page=1&page_size=10")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed
def test_detections_pagination(client):
    """Smoke test: GET /api/detections accepts pagination parameters."""
    http, db = client
    db.query.return_value.result_rows = [(0,)]
    response = http.get("/api/detections?page=2&page_size=10")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed
def test_label_to_score_known_labels():
    """_label_to_score maps each recognised label to its fixed score."""
    from backend.routes.detections import _label_to_score

    expected_scores = {
        "human": 0.9,
        "bot": 0.05,
        "tor": 0.1,
        "proxy": 0.25,
    }
    for label, score in expected_scores.items():
        assert _label_to_score(label) == pytest.approx(score)
def test_label_to_score_unknown_label():
    """An unrecognised label scores the neutral value 0.5."""
    from backend.routes.detections import _label_to_score

    score = _label_to_score("unknown_label")
    assert score == pytest.approx(0.5)
def test_label_to_score_empty_string():
    """An empty label yields None rather than a score."""
    from backend.routes.detections import _label_to_score

    score = _label_to_score("")
    assert score is None
def test_label_to_score_case_insensitive():
    """Label casing does not affect the score."""
    from backend.routes.detections import _label_to_score

    for variant, canonical in (("HUMAN", "human"), ("Bot", "bot")):
        assert _label_to_score(variant) == _label_to_score(canonical)
def test_detections_search_filter(client):
    """Smoke test: GET /api/detections accepts a free-text search filter."""
    http, db = client
    db.query.return_value.result_rows = [(0,)]
    response = http.get("/api/detections?search=1.2.3")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed
def test_detections_group_by_ip(client):
    """Smoke test: GET /api/detections accepts the group_by_ip mode."""
    http, db = client
    db.query.return_value.result_rows = [(0,)]
    response = http.get("/api/detections?group_by_ip=true")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed

View File

@ -0,0 +1,26 @@
def test_health_returns_200(client):
    """GET /health answers with HTTP 200."""
    http, _db = client
    response = http.get("/health")
    assert response.status_code == 200
def test_health_endpoint_body(client):
    """Health endpoint returns 200 and, when its body is JSON, exposes 'status'.

    A non-JSON (e.g. plain-text) body is accepted as-is.
    """
    c, _ = client
    resp = c.get("/health")
    assert resp.status_code == 200
    # Only the decode step is guarded. The previous ``except Exception: pass``
    # also swallowed the AssertionError below, so the check could never fail.
    try:
        data = resp.json()
    except ValueError:
        return  # Non-JSON health check body is also acceptable
    assert "status" in data
def test_health_db_not_required(client):
    """GET /health still answers 200 when every DB query raises."""
    http, db = client
    db.query.side_effect = Exception("DB down")
    response = http.get("/health")
    # A failing database must not turn the health check into an error
    assert response.status_code == 200

View File

@ -0,0 +1,34 @@
def test_metrics_endpoint(client):
    """Smoke test: GET /api/metrics/top-ips answers 200, 404 or 422."""
    http, db = client
    sample_row = ("1.2.3.4", "t1234567890abc", "UA/5.0", "FR", 100)
    db.query.return_value.result_rows = [sample_row]
    response = http.get("/api/metrics/top-ips?hours=1&limit=10")
    # The endpoint may not exist in all versions, hence 404/422 are tolerated
    allowed = (200, 404, 422)
    assert response.status_code in allowed
def test_metrics_main_endpoint(client):
    """Smoke test: GET /api/metrics answers with an accepted status when the DB has a summary row."""
    http, db = client
    # Summary row: total, critical, high, medium, low, known_bots, anomalies, unique_ips
    summary_row = (100, 5, 10, 20, 65, 15, 85, 50)
    db.query.return_value.result_rows = [summary_row]
    response = http.get("/api/metrics")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed
def test_metrics_main_no_data(client):
    """GET /api/metrics answers 404 or 500 when the DB returns no rows."""
    http, db = client
    db.query.return_value.result_rows = []
    response = http.get("/api/metrics")
    allowed = (404, 500)
    assert response.status_code in allowed
def test_threats_endpoint(client):
    """Smoke test: GET /api/metrics/threats answers with an accepted status."""
    http, db = client
    per_level_counts = [("CRITICAL", 5), ("HIGH", 10)]
    db.query.return_value.result_rows = per_level_counts
    response = http.get("/api/metrics/threats")
    allowed = (200, 404, 422, 500)
    assert response.status_code in allowed

View File

@ -0,0 +1,25 @@
import pytest
# Sample addresses (individual IPs, not CIDR ranges) that is_private_ip must
# classify as private: IPv4/IPv6 loopback, IPv4 private blocks, IPv4
# link-local and an IPv6 address in fc00::/7.
PRIVATE_RANGES = [
    "127.0.0.1", "10.0.0.1", "192.168.1.1", "172.16.0.1",
    "169.254.0.1", "::1", "fc00::1"
]
def is_private_ip(ip: str) -> bool:
    """Tell whether *ip* is a private, loopback or link-local address.

    Strings that do not parse as an IP address are treated as private.
    """
    import ipaddress

    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        return True
    flags = (parsed.is_private, parsed.is_loopback, parsed.is_link_local)
    return any(flags)
def test_private_ips_rejected():
    """Every sample address in PRIVATE_RANGES is classified as private."""
    for candidate in PRIVATE_RANGES:
        flagged = is_private_ip(candidate)
        assert flagged, f"{candidate} should be private"
def test_public_ip_accepted():
    """Well-known public resolver addresses are not classified as private."""
    for public_ip in ("8.8.8.8", "1.1.1.1"):
        assert not is_private_ip(public_ip)