feat(dashboard): rebuild SOC dashboard + fix ClickHouse SQL

Complete rewrite of the SOC dashboard using FastAPI + Jinja2 + htmx + Chart.js + Tailwind CSS.
Replaces the old React/Vite frontend with server-rendered templates.

Dashboard pages:
- Overview: KPIs, timeline chart, threat distribution, top IPs
- Detections: paginated/filterable anomaly table
- Scores: ml_all_scores with AE error & XGB prob columns
- Traffic: HTTP logs with method/host filters
- IP Investigation: full deep-dive (scores, features, HTTP logs, classify)
- Classification: SOC feedback form + history
- Features: AI + thesis feature stats
- Models: scoring stats + model metadata

API: 9 JSON endpoints with parameterized queries, sort whitelists

SQL fixes:
- 05_aggregation_tables: add deduplicate_merge_projection_mode
- 11_views: fix nested aggregate (argMax inside sum)
- 12_thesis_features: remove invalid 'let' bindings, fix groupArrayIf type

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 03:21:05 +02:00
parent 228ad7026a
commit b735bab5a5
120 changed files with 1444 additions and 24933 deletions

View File

@ -1 +0,0 @@
"""Package des routes FastAPI de l'API bot-detector."""

View File

@ -1,688 +0,0 @@
"""
Endpoints pour l'analyse de corrélations et la classification SOC
"""
from collections import defaultdict
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
import ipaddress
import json
from ..database import db
from ..models import (
SubnetAnalysis, CountryAnalysis, CountryData, JA4Analysis, JA4SubnetData,
UserAgentAnalysis, UserAgentData, CorrelationIndicators,
ClassificationRecommendation, ClassificationLabel,
ClassificationCreate, Classification, ClassificationsListResponse
)
from ..config import settings
router = APIRouter(prefix="/api/analysis", tags=["analysis"])
# ISO country code -> human-readable name, shared by analyze_ip_country and
# analyze_country. Both look codes up with .get(code, code), so a code missing
# from this table is simply displayed as-is.
_COUNTRY_NAMES: dict[str, str] = {
    "CN": "China",
    "US": "United States",
    "DE": "Germany",
    "FR": "France",
    "RU": "Russia",
    "GB": "United Kingdom",
    "NL": "Netherlands",
    "IN": "India",
    "BR": "Brazil",
    "JP": "Japan",
    "KR": "South Korea",
    "IT": "Italy",
    "ES": "Spain",
    "CA": "Canada",
    "AU": "Australia",
}
# =============================================================================
# ANALYSE SUBNET / ASN
# =============================================================================
@router.get("/{ip}/subnet", response_model=SubnetAnalysis)
async def analyze_subnet(ip: str):
    """
    Analyze the IPs that share this IP's /24 subnet and its ASN.

    Args:
        ip: IP address taken from the URL path.

    Returns:
        SubnetAnalysis holding the distinct source IPs detected inside the
        same /24 during the last 24 hours, the ASN number/organisation of the
        most recent detection for ``ip`` ("0"/"Unknown" when there is none),
        the number of distinct IPs detected for that ASN over the same
        window, and ``alert`` set when more than 10 subnet IPs were found.

    Raises:
        HTTPException: 400 when ``ip`` is not a valid IP address, 500 when a
            query or the response construction fails.
    """
    # Validate before touching the database: a malformed address is a client
    # error and must not be reported as a 500 by the generic handler below.
    try:
        ipaddress.ip_address(ip)
        # strict=False masks the host bits so any address yields its network.
        subnet = ipaddress.ip_network(f"{ip}/24", strict=False)
    except ValueError as exc:
        raise HTTPException(
            status_code=400, detail=f"Adresse IP invalide: {ip}"
        ) from exc

    try:
        subnet_str = str(subnet)

        # ASN of the most recent detection for this IP.
        asn_query = f"""
            SELECT asn_number, asn_org
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
            """
        asn_result = db.query(asn_query, {"ip": ip})
        if not asn_result.result_rows:
            # No detection for this IP: fall back to placeholder ASN values.
            asn_number = "0"
            asn_org = "Unknown"
        else:
            asn_number = str(asn_result.result_rows[0][0] or "0")
            asn_org = asn_result.result_rows[0][1] or "Unknown"

        # Distinct IPs detected in the same /24 over the last 24 hours.
        # NOTE(review): the range filter uses toIPv4(), so an IPv6 ``ip``
        # presumably fails in ClickHouse and ends up as a 500 — confirm.
        subnet_ips_query = f"""
            SELECT DISTINCT src_ip
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE toIPv4(src_ip) >= toIPv4(%(subnet_start)s)
            AND toIPv4(src_ip) <= toIPv4(%(subnet_end)s)
            AND detected_at >= now() - INTERVAL 24 HOUR
            ORDER BY src_ip
            """
        subnet_result = db.query(subnet_ips_query, {
            "subnet_start": str(subnet.network_address),
            "subnet_end": str(subnet.broadcast_address)
        })
        subnet_ips = [str(row[0]) for row in subnet_result.result_rows]

        # Distinct IPs of the same ASN; skipped for the "0" placeholder.
        if asn_number != "0":
            asn_total_query = f"""
                SELECT uniq(src_ip)
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE asn_number = %(asn_number)s
                AND detected_at >= now() - INTERVAL 24 HOUR
                """
            asn_total_result = db.query(asn_total_query, {"asn_number": asn_number})
            asn_total = asn_total_result.result_rows[0][0] if asn_total_result.result_rows else 0
        else:
            asn_total = 0

        return SubnetAnalysis(
            ip=ip,
            subnet=subnet_str,
            ips_in_subnet=subnet_ips,
            total_in_subnet=len(subnet_ips),
            asn_number=asn_number,
            asn_org=asn_org,
            total_in_asn=asn_total,
            alert=len(subnet_ips) > 10
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{ip}/country", response_model=dict)
async def analyze_ip_country(ip: str):
    """
    Report the country of one IP and the country breakdown of its ASN.

    The most recent detection for ``ip`` supplies its country code and ASN;
    detections of that ASN over the last 24 hours are then counted per
    country.

    Returns:
        A dict with ``ip_country`` (``code`` and ``name``) and
        ``asn_countries`` (a list of ``code``/``name``/``count``/
        ``percentage`` entries). When the IP has no detection,
        ``ip_country`` is ``None`` and ``asn_countries`` is empty.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        anomalies = f"{settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies"

        # Latest detection gives the IP's own country and its ASN.
        latest = db.query(
            f"""
            SELECT country_code, asn_number
            FROM {anomalies}
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
            """,
            {"ip": ip},
        )
        if not latest.result_rows:
            return {"ip_country": None, "asn_countries": []}

        newest = latest.result_rows[0]
        own_code = newest[0]
        asn = newest[1]

        # Per-country detection counts for the same ASN.
        breakdown = db.query(
            f"""
            SELECT
            country_code,
            count() AS count
            FROM {anomalies}
            WHERE asn_number = %(asn_number)s
            AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY country_code
            ORDER BY count DESC
            """,
            {"asn_number": asn},
        )

        grand_total = sum(entry[1] for entry in breakdown.result_rows)
        per_country = []
        for entry in breakdown.result_rows:
            code, hits = entry[0], entry[1]
            if grand_total > 0:
                share = round((hits / grand_total * 100), 2)
            else:
                share = 0.0
            per_country.append({
                "code": code,
                "name": _COUNTRY_NAMES.get(code, code),
                "count": hits,
                "percentage": share,
            })

        own_country = {
            "code": own_code,
            "name": _COUNTRY_NAMES.get(own_code, own_code),
        }
        return {"ip_country": own_country, "asn_countries": per_country}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE PAYS
# =============================================================================
@router.get("/country", response_model=CountryAnalysis)
async def analyze_country(days: int = Query(1, ge=1, le=30)):
    """
    Analyze how detections are distributed across countries.

    Args:
        days: Size of the analysed window in days (1 to 30).

    Returns:
        CountryAnalysis with the per-country counts and percentages for the
        window, the 7-day baseline percentages, and ``alert_country``: the
        first country (by descending count) whose share exceeds both 30 % and
        twice its non-zero baseline share, or ``None``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        anomalies = f"{settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies"

        # Distribution over the requested window.
        window = db.query(
            f"""
            SELECT
            country_code,
            count() AS count
            FROM {anomalies}
            WHERE detected_at >= now() - INTERVAL %(days)s DAY
            AND country_code != '' AND country_code IS NOT NULL
            GROUP BY country_code
            ORDER BY count DESC
            """,
            {"days": days},
        )
        window_total = sum(entry[1] for entry in window.result_rows)

        top_countries = []
        for entry in window.result_rows:
            code, hits = entry[0], entry[1]
            share = round((hits / window_total * 100), 2) if window_total > 0 else 0.0
            top_countries.append(
                CountryData(
                    code=code,
                    name=_COUNTRY_NAMES.get(code, code),
                    count=hits,
                    percentage=share,
                )
            )

        # Reference distribution over the last 7 days.
        reference = db.query(
            f"""
            SELECT
            country_code,
            count() AS count
            FROM {anomalies}
            WHERE detected_at >= now() - INTERVAL 7 DAY
            AND country_code != '' AND country_code IS NOT NULL
            GROUP BY country_code
            ORDER BY count DESC
            """
        )
        reference_total = sum(entry[1] for entry in reference.result_rows)
        baseline = {}
        for entry in reference.result_rows:
            if reference_total > 0:
                baseline[entry[0]] = round((entry[1] / reference_total * 100), 2)
            else:
                baseline[entry[0]] = 0.0

        # First over-represented country wins.
        alert_country = None
        for candidate in top_countries:
            usual_share = baseline.get(candidate.code, 0)
            if not usual_share > 0:
                continue
            if candidate.percentage > usual_share * 2 and candidate.percentage > 30:
                alert_country = candidate.code
                break

        return CountryAnalysis(
            top_countries=top_countries,
            baseline=baseline,
            alert_country=alert_country,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE JA4
# =============================================================================
@router.get("/{ip}/ja4", response_model=JA4Analysis)
async def analyze_ja4(ip: str):
    """
    Analyze the JA4 fingerprint most recently seen for an IP.

    Returns:
        JA4Analysis with the fingerprint, the number of distinct IPs that
        used it in the last 24 hours, the ten /24 subnets with the most
        detections for it, and the other fingerprints recorded for the same
        IP. When no fingerprint is known for the IP, an empty JA4Analysis
        (``ja4=""``) is returned.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        anomalies = f"{settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies"

        # Most recent non-empty fingerprint for the IP.
        latest = db.query(
            f"""
            SELECT ja4
            FROM {anomalies}
            WHERE src_ip = %(ip)s
            AND ja4 != '' AND ja4 IS NOT NULL
            ORDER BY detected_at DESC
            LIMIT 1
            """,
            {"ip": ip},
        )
        if not latest.result_rows:
            return JA4Analysis(
                ja4="",
                shared_ips_count=0,
                top_subnets=[],
                other_ja4_for_ip=[],
            )
        fingerprint = latest.result_rows[0][0]

        # How many distinct IPs share the fingerprint.
        sharing = db.query(
            f"""
            SELECT uniq(src_ip)
            FROM {anomalies}
            WHERE ja4 = %(ja4)s
            AND detected_at >= now() - INTERVAL 24 HOUR
            """,
            {"ja4": fingerprint},
        )
        shared_total = sharing.result_rows[0][0] if sharing.result_rows else 0

        # Per-IP detection counts, folded into /24 buckets below.
        per_ip = db.query(
            f"""
            SELECT
            src_ip,
            count() AS count
            FROM {anomalies}
            WHERE ja4 = %(ja4)s
            AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
            ORDER BY count DESC
            """,
            {"ja4": fingerprint},
        )
        per_subnet = defaultdict(int)
        for entry in per_ip.result_rows:
            octets = str(entry[0]).split('.')
            if len(octets) != 4:
                # Anything that is not a dotted quad is left out of the buckets.
                continue
            bucket = f"{octets[0]}.{octets[1]}.{octets[2]}.0/24"
            per_subnet[bucket] += entry[1]

        ranked = sorted(per_subnet.items(), key=lambda pair: pair[1], reverse=True)
        top_subnets = []
        for bucket, hits in ranked[:10]:
            top_subnets.append(JA4SubnetData(subnet=bucket, count=hits))

        # Other fingerprints recorded for the same IP.
        others = db.query(
            f"""
            SELECT DISTINCT ja4
            FROM {anomalies}
            WHERE src_ip = %(ip)s
            AND ja4 != '' AND ja4 IS NOT NULL
            AND ja4 != %(current_ja4)s
            """,
            {"ip": ip, "current_ja4": fingerprint},
        )
        other_fingerprints = [entry[0] for entry in others.result_rows]

        return JA4Analysis(
            ja4=fingerprint,
            shared_ips_count=shared_total,
            top_subnets=top_subnets,
            other_ja4_for_ip=other_fingerprints,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ANALYSE USER-AGENTS
# =============================================================================
@router.get("/{ip}/user-agents", response_model=UserAgentAnalysis)
async def analyze_user_agents(ip: str):
    """
    Analyze the User-Agents sent by an IP over the last 24 hours.

    Returns:
        UserAgentAnalysis with one entry per distinct non-empty User-Agent
        (count, share of all the IP's requests, classification), the same
        list repeated as ``ja4_user_agents``, the percentage of requests
        whose User-Agent is classified 'bot' or 'script', and ``alert`` set
        when that percentage exceeds 20.

    Raises:
        HTTPException: 500 on any failure.
    """
    # Substrings that mark a User-Agent as an automated client.
    bot_markers = ('bot', 'crawler', 'spider', 'curl', 'wget', 'python', 'requests', 'scrapy')
    # 'python' is deliberately absent here: the bot markers already catch it,
    # so listing it again could never match.
    script_markers = ('java', 'php', 'ruby', 'perl', 'node')

    def classify_ua(ua: str) -> str:
        """Classify a User-Agent as 'bot', 'script' or 'normal'."""
        # Guard first so a missing or blank value never reaches .lower();
        # such values count as scripted traffic.
        if not ua or ua.strip() == '':
            return 'script'
        ua_lower = ua.lower()
        if any(marker in ua_lower for marker in bot_markers):
            return 'bot'
        if any(marker in ua_lower for marker in script_markers):
            return 'script'
        return 'normal'

    try:
        # Distinct non-empty User-Agents for this IP (from http_logs).
        ip_ua_query = f"""
            SELECT
            header_user_agent AS ua,
            count() AS count
            FROM {settings.CLICKHOUSE_DB_LOGS}.http_logs
            WHERE src_ip = %(ip)s
            AND header_user_agent != '' AND header_user_agent IS NOT NULL
            AND time >= now() - INTERVAL 24 HOUR
            GROUP BY ua
            ORDER BY count DESC
            """
        # Count of ALL requests, including those without a User-Agent, so the
        # percentages are relative to the IP's real traffic.
        ip_total_query = f"""
            SELECT count()
            FROM {settings.CLICKHOUSE_DB_LOGS}.http_logs
            WHERE src_ip = %(ip)s
            AND time >= now() - INTERVAL 24 HOUR
            """
        ip_ua_result = db.query(ip_ua_query, {"ip": ip})
        ip_total_result = db.query(ip_total_query, {"ip": ip})

        total_count = ip_total_result.result_rows[0][0] if ip_total_result.result_rows else 0
        if total_count == 0:
            # Fall back to the sum of the per-UA counts.
            total_count = sum(row[1] for row in ip_ua_result.result_rows)

        ip_user_agents = [
            UserAgentData(
                value=row[0],
                count=row[1],
                percentage=round((row[1] / total_count * 100), 2) if total_count > 0 else 0.0,
                classification=classify_ua(row[0])
            )
            for row in ip_ua_result.result_rows
        ]

        # The JA4-level list is not computed separately yet: reuse the IP list.
        ja4_user_agents = ip_user_agents

        # Share of requests sent with a bot/script User-Agent.
        bot_count = sum(ua.count for ua in ip_user_agents if ua.classification in ['bot', 'script'])
        bot_percentage = (bot_count / total_count * 100) if total_count > 0 else 0

        return UserAgentAnalysis(
            ip_user_agents=ip_user_agents,
            ja4_user_agents=ja4_user_agents,
            bot_percentage=bot_percentage,
            alert=bot_percentage > 20
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# RECOMMANDATION DE CLASSIFICATION
# =============================================================================
@router.get("/{ip}/recommendation", response_model=ClassificationRecommendation)
async def get_classification_recommendation(ip: str):
    """
    Build a classification recommendation from the correlation analyses.

    Runs the subnet, country, JA4 and User-Agent analyses; an analysis that
    fails is ignored rather than aborting the request. Each triggered signal
    adds a fixed weight to the confidence score and may contribute a reason
    and a tag. The label is MALICIOUS from 0.7, SUSPICIOUS from 0.4 and
    LEGITIMATE below that.

    Returns:
        ClassificationRecommendation with label, confidence (capped at 1.0),
        the raw indicators, the suggested tags and a " | "-joined reason.

    Raises:
        HTTPException: 500 on any failure outside the individual analyses.
    """

    async def _or_none(pending):
        """Await one analysis; a failure just means that signal is missing."""
        try:
            return await pending
        except Exception:
            return None

    try:
        subnet_info = await _or_none(analyze_subnet(ip))
        country_info = await _or_none(analyze_country(1))
        ja4_info = await _or_none(analyze_ja4(ip))
        ua_info = await _or_none(analyze_user_agents(ip))

        # Raw indicators; a missing analysis contributes zeros.
        if subnet_info:
            subnet_total, asn_total = subnet_info.total_in_subnet, subnet_info.total_in_asn
        else:
            subnet_total, asn_total = 0, 0
        if ua_info:
            ua_total, bot_share = len(ua_info.ja4_user_agents), ua_info.bot_percentage
        else:
            ua_total, bot_share = 0, 0.0
        indicators = CorrelationIndicators(
            subnet_ips_count=subnet_total,
            asn_ips_count=asn_total,
            country_percentage=0.0,
            ja4_shared_ips=ja4_info.shared_ips_count if ja4_info else 0,
            user_agents_count=ua_total,
            bot_ua_percentage=bot_share,
        )

        confidence = 0.0
        findings = []
        labels = []

        # More than 10 IPs in the same subnet.
        if subnet_info and subnet_info.total_in_subnet > 10:
            confidence += 0.25
            findings.append(f"{subnet_info.total_in_subnet} IPs du même subnet")
            labels.append("distributed")

        # Fingerprint shared by more than 50 IPs.
        if ja4_info and ja4_info.shared_ips_count > 50:
            confidence += 0.25
            findings.append(f"{ja4_info.shared_ips_count} IPs avec même JA4")
            labels.append("ja4-rotation")

        # More than 20 % bot/script User-Agents.
        if ua_info and ua_info.bot_percentage > 20:
            confidence += 0.25
            findings.append(f"{ua_info.bot_percentage:.0f}% UAs bots/scripts")
            labels.append("bot-ua")

        # An over-represented country was flagged.
        if country_info and country_info.alert_country:
            confidence += 0.15
            findings.append(f"Pays {country_info.alert_country} surreprésenté")
            labels.append(f"country-{country_info.alert_country.lower()}")

        # ASN organisation name matches a hosting-provider keyword.
        if subnet_info:
            hosting_keywords = ["ovh", "amazon", "aws", "google", "azure", "digitalocean", "linode", "vultr", "china169", "chinamobile"]
            organisation = (subnet_info.asn_org or "").lower()
            if any(keyword in organisation for keyword in hosting_keywords):
                confidence += 0.10
                labels.append("hosting-asn")

        # Map the score onto a label.
        if confidence < 0.4:
            label = ClassificationLabel.LEGITIMATE
        elif confidence < 0.7:
            label = ClassificationLabel.SUSPICIOUS
        else:
            label = ClassificationLabel.MALICIOUS
            labels.append("campaign")

        if findings:
            reason = " | ".join(findings)
        else:
            reason = "Aucun indicateur fort"

        return ClassificationRecommendation(
            label=label,
            confidence=min(confidence, 1.0),
            indicators=indicators,
            suggested_tags=labels,
            reason=reason,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# CLASSIFICATIONS CRUD
# =============================================================================
@router.post("/classifications", response_model=Classification)
async def create_classification(data: ClassificationCreate):
    """
    Create a classification for an IP or a JA4 fingerprint.

    Inserts the classification, then reads back the most recent row for the
    same entity and returns it.

    Args:
        data: Classification payload; at least one of ``ip`` / ``ja4`` must
            be set.

    Returns:
        The Classification read back after the insert.

    Raises:
        HTTPException: 400 when neither ``ip`` nor ``ja4`` is provided, 404
            when the row cannot be read back, 500 on any other failure.
    """
    try:
        # Validation: an IP or a JA4 is required.
        if not data.ip and not data.ja4:
            raise HTTPException(status_code=400, detail="IP ou JA4 requis")

        query = f"""
            INSERT INTO {settings.CLICKHOUSE_DB_PROCESSING}.classifications
            (ip, ja4, label, tags, comment, confidence, features, analyst, created_at)
            VALUES
            (%(ip)s, %(ja4)s, %(label)s, %(tags)s, %(comment)s, %(confidence)s, %(features)s, %(analyst)s, now())
            """
        db.query(query, {
            "ip": data.ip or "",
            "ja4": data.ja4 or "",
            "label": data.label.value,
            "tags": data.tags,
            "comment": data.comment,
            "confidence": data.confidence,
            "features": json.dumps(data.features),
            "analyst": data.analyst
        })

        # Read back the newest classification for the entity. The column name
        # comes from a fixed pair of literals, never from user input.
        where_clause = "ip = %(entity)s" if data.ip else "ja4 = %(entity)s"
        select_query = f"""
            SELECT ip, ja4, label, tags, comment, confidence, features, analyst, created_at
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.classifications
            WHERE {where_clause}
            ORDER BY created_at DESC
            LIMIT 1
            """
        result = db.query(select_query, {"entity": data.ip or data.ja4})
        if not result.result_rows:
            raise HTTPException(status_code=404, detail="Classification non trouvée")

        row = result.result_rows[0]
        return Classification(
            ip=row[0] or None,
            ja4=row[1] or None,
            label=ClassificationLabel(row[2]),
            tags=row[3],
            comment=row[4],
            confidence=row[5],
            features=json.loads(row[6]) if row[6] else {},
            analyst=row[7],
            created_at=row[8]
        )
    except HTTPException:
        # Let the deliberate 400/404 above through; without this clause the
        # generic handler below would re-wrap them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/classifications", response_model=ClassificationsListResponse)
async def list_classifications(
    ip: Optional[str] = Query(None, description="Filtrer par IP"),
    ja4: Optional[str] = Query(None, description="Filtrer par JA4"),
    label: Optional[str] = Query(None, description="Filtrer par label"),
    limit: int = Query(100, ge=1, le=1000)
):
    """
    List classifications, newest first.

    Args:
        ip: Optional exact-match filter on the IP.
        ja4: Optional exact-match filter on the JA4 fingerprint.
        label: Optional exact-match filter on the label.
        limit: Maximum number of rows returned (1 to 1000).

    Returns:
        ClassificationsListResponse with the matching items (at most
        ``limit``) and the total number of rows matching the filters.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        where_clauses = ["1=1"]
        params = {"limit": limit}
        # Each provided filter adds one equality predicate and its parameter.
        for column, wanted in (("ip", ip), ("ja4", ja4), ("label", label)):
            if wanted:
                where_clauses.append(f"{column} = %({column})s")
                params[column] = wanted
        where_clause = " AND ".join(where_clauses)
        table = f"{settings.CLICKHOUSE_DB_PROCESSING}.classifications"

        page = db.query(
            f"""
            SELECT ip, ja4, label, tags, comment, confidence, features, analyst, created_at
            FROM {table}
            WHERE {where_clause}
            ORDER BY created_at DESC
            LIMIT %(limit)s
            """,
            params,
        )
        items = []
        for record in page.result_rows:
            items.append(
                Classification(
                    ip=record[0] or None,
                    ja4=record[1] or None,
                    label=ClassificationLabel(record[2]),
                    tags=record[3],
                    comment=record[4],
                    confidence=record[5],
                    features=json.loads(record[6]) if record[6] else {},
                    analyst=record[7],
                    created_at=record[8],
                )
            )

        # Total number of rows matching the same filters (no LIMIT).
        counted = db.query(
            f"""
            SELECT count()
            FROM {table}
            WHERE {where_clause}
            """,
            params,
        )
        total = counted.result_rows[0][0] if counted.result_rows else 0

        return ClassificationsListResponse(items=items, total=total)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/classifications/stats")
async def get_classification_stats():
    """
    Return per-label classification statistics.

    Returns:
        ``{"stats": [...]}`` with, for each label, the number of
        classifications, the number of distinct IPs and the average
        confidence (0.0 when the average is missing or zero), ordered by
        descending total.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        aggregated = db.query(
            f"""
            SELECT
            label,
            count() AS total,
            uniq(ip) AS unique_ips,
            avg(confidence) AS avg_confidence
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.classifications
            GROUP BY label
            ORDER BY total DESC
            """
        )
        stats = []
        for record in aggregated.result_rows:
            mean_confidence = float(record[3]) if record[3] else 0.0
            stats.append({
                "label": record[0],
                "total": record[1],
                "unique_ips": record[2],
                "avg_confidence": mean_confidence,
            })
        return {"stats": stats}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -0,0 +1,507 @@
"""JSON API endpoints for the JA4 SOC Dashboard."""
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from typing import Any
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from backend.config import DB_PROCESSING, DB_LOGS, safe_identifier
from backend.database import query, query_scalar, execute
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api")
# Pre-validate DB identifiers at import time
_DB = safe_identifier(DB_PROCESSING)
_DB_LOGS = safe_identifier(DB_LOGS)
# Sort columns and directions are interpolated into ORDER BY clauses, so only
# the names listed here are ever accepted (guards against SQL injection).
_DETECTION_SORT_COLS = {
    "detected_at",
    "src_ip",
    "ja4",
    "host",
    "anomaly_score",
    "threat_level",
    "recurrence",
    "hits",
    "hit_velocity",
    "fuzzing_index",
    "post_ratio",
    "campaign_id",
}
_SCORE_SORT_COLS = {
    "detected_at",
    "window_start",
    "src_ip",
    "ja4",
    "host",
    "anomaly_score",
    "raw_anomaly_score",
    "threat_level",
    "hits",
    "hit_velocity",
    "xgb_prob",
    "ae_recon_error",
}
_TRAFFIC_SORT_COLS = {
    "time",
    "src_ip",
    "method",
    "host",
    "path",
    "http_version",
    "header_user_agent",
    "ja4",
    "src_country_code",
}
_ORDER_VALUES = {"ASC", "DESC"}
def _validate_sort(value: str, whitelist: set[str], default: str) -> str:
    """Return *value* when it is a whitelisted column name, else *default*."""
    if value in whitelist:
        return value
    return default
def _validate_order(value: str) -> str:
    """Return the upper-cased sort direction if it is allowed, else "DESC"."""
    direction = value.upper()
    if direction in _ORDER_VALUES:
        return direction
    return "DESC"
# ---------------------------------------------------------------------------
# GET /api/overview
# ---------------------------------------------------------------------------
@router.get("/overview")
async def overview() -> dict[str, Any]:
    """Return the last-24-hours KPIs shown on the overview page.

    Returns:
        A dict with the detection, scoring and HTTP traffic counts, the
        number of distinct detected IPs, the CRITICAL and HIGH counts taken
        from the threat-level distribution, the distribution itself, the ten
        IPs with the most detections, an hourly detection timeline (hours
        rendered as strings) and the per-model scoring counts.

    Raises:
        HTTPException: 500 when any query fails.
    """
    anomalies = f"{_DB}.ml_detected_anomalies"
    all_scores = f"{_DB}.ml_all_scores"
    last_day = "WHERE detected_at >= now() - INTERVAL 1 DAY"
    try:
        detection_total = query_scalar(f"SELECT count() FROM {anomalies} {last_day}") or 0
        scored_total = query_scalar(f"SELECT count() FROM {all_scores} {last_day}") or 0

        threat_distribution = query(
            "SELECT threat_level, count() AS cnt "
            f"FROM {all_scores} {last_day} "
            "GROUP BY threat_level"
        )
        # Index the distribution by level to pick out CRITICAL / HIGH.
        by_level = dict((entry["threat_level"], entry["cnt"]) for entry in threat_distribution)

        distinct_ips = query_scalar(f"SELECT uniq(src_ip) FROM {anomalies} {last_day}") or 0

        busiest_ips = query(
            "SELECT toString(src_ip) AS src_ip, count() AS cnt, "
            "max(anomaly_score) AS worst_score, "
            "any(threat_level) AS threat_level, "
            "any(asn_org) AS asn_org, any(country_code) AS country_code "
            f"FROM {anomalies} {last_day} "
            "GROUP BY src_ip ORDER BY cnt DESC LIMIT 10"
        )
        hourly = query(
            "SELECT toStartOfHour(detected_at) AS hour, count() AS cnt "
            f"FROM {anomalies} {last_day} "
            "GROUP BY hour ORDER BY hour"
        )
        traffic_total = query_scalar(
            f"SELECT count() FROM {_DB_LOGS}.http_logs "
            "WHERE time >= now() - INTERVAL 1 DAY"
        ) or 0
        per_model = query(
            "SELECT model_name, count() AS scored "
            f"FROM {all_scores} {last_day} "
            "GROUP BY model_name"
        )

        timeline = []
        for bucket in hourly:
            timeline.append({"hour": str(bucket["hour"]), "cnt": bucket["cnt"]})

        return {
            "detections_24h": detection_total,
            "scored_24h": scored_total,
            "traffic_24h": traffic_total,
            "unique_ips": distinct_ips,
            "critical_count": by_level.get("CRITICAL", 0),
            "high_count": by_level.get("HIGH", 0),
            "threat_distribution": threat_distribution,
            "top_ips": busiest_ips,
            "timeline": timeline,
            "models": per_model,
        }
    except Exception as exc:
        logger.exception("overview query failed")
        raise HTTPException(status_code=500, detail=str(exc))
# ---------------------------------------------------------------------------
# GET /api/detections
# ---------------------------------------------------------------------------
@router.get("/detections")
async def detections(
    page: int = Query(1, ge=1),
    per_page: int = Query(50, ge=1, le=500),
    sort: str = Query("detected_at"),
    order: str = Query("DESC"),
    threat_level: str | None = Query(None),
    search: str | None = Query(None),
) -> dict[str, Any]:
    """Return one page of detected anomalies from the last 30 days.

    Args:
        page: 1-based page number.
        per_page: Page size (1 to 500).
        sort: Sort column; falls back to ``detected_at`` when not whitelisted.
        order: Sort direction; falls back to ``DESC`` when not ASC/DESC.
        threat_level: Optional exact-match filter.
        search: Optional substring matched against the source IP or host.

    Returns:
        A dict with ``data`` (the rows), ``total``, ``page``, ``per_page`` and
        ``pages`` (at least 1).

    Raises:
        HTTPException: 500 when a query fails.
    """
    sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at")
    order = _validate_order(order)

    conditions = ["detected_at >= now() - INTERVAL 30 DAY"]
    bindings: dict[str, Any] = {}
    if threat_level:
        conditions.append("threat_level = {tl:String}")
        bindings["tl"] = threat_level
    if search:
        conditions.append(
            "(toString(src_ip) LIKE {search:String} OR host LIKE {search:String})"
        )
        bindings["search"] = f"%{search}%"
    predicate = " AND ".join(conditions)
    table = f"{_DB}.ml_detected_anomalies"

    try:
        matched = query_scalar(
            f"SELECT count() FROM {table} WHERE {predicate}",
            bindings,
        ) or 0
        page_rows = query(
            "SELECT *, toString(src_ip) AS src_ip_str "
            f"FROM {table} "
            f"WHERE {predicate} ORDER BY {sort} {order} "
            "LIMIT {lim:UInt32} OFFSET {off:UInt32}",
            dict(bindings, lim=per_page, off=(page - 1) * per_page),
        )
        # Ceiling division, with a floor of one page for empty results.
        page_count = max(1, (matched + per_page - 1) // per_page)
        return {
            "data": page_rows,
            "total": matched,
            "page": page,
            "per_page": per_page,
            "pages": page_count,
        }
    except Exception as exc:
        logger.exception("detections query failed")
        raise HTTPException(status_code=500, detail=str(exc))
# ---------------------------------------------------------------------------
# GET /api/scores
# ---------------------------------------------------------------------------
@router.get("/scores")
async def scores(
    page: int = Query(1, ge=1),
    per_page: int = Query(50, ge=1, le=500),
    sort: str = Query("detected_at"),
    order: str = Query("DESC"),
    threat_level: str | None = Query(None),
    search: str | None = Query(None),
) -> dict[str, Any]:
    """Return one page of ml_all_scores rows from the last 3 days.

    Args:
        page: 1-based page number.
        per_page: Page size (1 to 500).
        sort: Sort column; falls back to ``detected_at`` when not whitelisted.
        order: Sort direction; falls back to ``DESC`` when not ASC/DESC.
        threat_level: Optional exact-match filter.
        search: Optional substring matched against the source IP or host.

    Returns:
        A dict with ``data`` (the rows), ``total``, ``page``, ``per_page`` and
        ``pages`` (at least 1).

    Raises:
        HTTPException: 500 when a query fails.
    """
    sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at")
    order = _validate_order(order)

    conditions = ["detected_at >= now() - INTERVAL 3 DAY"]
    bindings: dict[str, Any] = {}
    if threat_level:
        conditions.append("threat_level = {tl:String}")
        bindings["tl"] = threat_level
    if search:
        conditions.append(
            "(toString(src_ip) LIKE {search:String} OR host LIKE {search:String})"
        )
        bindings["search"] = f"%{search}%"
    predicate = " AND ".join(conditions)
    table = f"{_DB}.ml_all_scores"

    try:
        matched = query_scalar(
            f"SELECT count() FROM {table} WHERE {predicate}",
            bindings,
        ) or 0
        page_rows = query(
            "SELECT *, toString(src_ip) AS src_ip_str "
            f"FROM {table} "
            f"WHERE {predicate} ORDER BY {sort} {order} "
            "LIMIT {lim:UInt32} OFFSET {off:UInt32}",
            dict(bindings, lim=per_page, off=(page - 1) * per_page),
        )
        # Ceiling division, with a floor of one page for empty results.
        page_count = max(1, (matched + per_page - 1) // per_page)
        return {
            "data": page_rows,
            "total": matched,
            "page": page,
            "per_page": per_page,
            "pages": page_count,
        }
    except Exception as exc:
        logger.exception("scores query failed")
        raise HTTPException(status_code=500, detail=str(exc))
# ---------------------------------------------------------------------------
# GET /api/traffic
# ---------------------------------------------------------------------------
@router.get("/traffic")
async def traffic(
    page: int = Query(1, ge=1),
    per_page: int = Query(50, ge=1, le=500),
    sort: str = Query("time"),
    order: str = Query("DESC"),
    method: str | None = Query(None),
    host: str | None = Query(None),
    status: str | None = Query(None),
) -> dict[str, Any]:
    """Return one page of HTTP log rows from the last day.

    Args:
        page: 1-based page number.
        per_page: Page size (1 to 500).
        sort: Sort column; falls back to ``time`` when not whitelisted.
        order: Sort direction; falls back to ``DESC`` when not ASC/DESC.
        method: Optional exact-match filter on the HTTP method.
        host: Optional substring filter on the host.
        status: Optional exact-match filter, applied to ``http_version``.

    Returns:
        A dict with ``data`` (the rows), ``total``, ``page``, ``per_page`` and
        ``pages`` (at least 1).

    Raises:
        HTTPException: 500 when a query fails.
    """
    sort = _validate_sort(sort, _TRAFFIC_SORT_COLS, "time")
    order = _validate_order(order)

    conditions = ["time >= now() - INTERVAL 1 DAY"]
    bindings: dict[str, Any] = {}
    if method:
        conditions.append("method = {method:String}")
        bindings["method"] = method
    if host:
        conditions.append("host LIKE {host:String}")
        bindings["host"] = f"%{host}%"
    if status is not None:
        # NOTE(review): a parameter named "status" filtering http_version
        # looks unintended — confirm against the http_logs schema.
        conditions.append("http_version = {status:String}")
        bindings["status"] = status
    predicate = " AND ".join(conditions)
    table = f"{_DB_LOGS}.http_logs"

    try:
        matched = query_scalar(
            f"SELECT count() FROM {table} WHERE {predicate}",
            bindings,
        ) or 0
        page_rows = query(
            "SELECT time, toString(src_ip) AS src_ip, method, host, path, "
            "http_version, header_user_agent, ja4, src_country_code "
            f"FROM {table} "
            f"WHERE {predicate} ORDER BY {sort} {order} "
            "LIMIT {lim:UInt32} OFFSET {off:UInt32}",
            dict(bindings, lim=per_page, off=(page - 1) * per_page),
        )
        # Ceiling division, with a floor of one page for empty results.
        page_count = max(1, (matched + per_page - 1) // per_page)
        return {
            "data": page_rows,
            "total": matched,
            "page": page,
            "per_page": per_page,
            "pages": page_count,
        }
    except Exception as exc:
        logger.exception("traffic query failed")
        raise HTTPException(status_code=500, detail=str(exc))
# ---------------------------------------------------------------------------
# GET /api/ip/{ip}
# ---------------------------------------------------------------------------
@router.get("/ip/{ip}")
async def ip_detail(ip: str) -> dict[str, Any]:
    """Return everything known about one source IP.

    Args:
        ip: IP address from the URL path; an ``::ffff:`` prefix is removed
            before it is used in the queries.

    Returns:
        A dict with the original ``ip`` string, its detections (30 days), its
        scores (3 days), its 100 most recent HTTP log rows (1 day) and the
        rows of the AI-features and recurrence views. The two view lookups
        are best-effort: if one fails its entry is an empty list.

    Raises:
        HTTPException: 500 when one of the three main queries fails.
    """
    # IPv4-mapped addresses are queried by their plain form.
    bindings = {"ip": ip.replace("::ffff:", "")}
    try:
        anomaly_rows = query(
            "SELECT *, toString(src_ip) AS src_ip_str "
            f"FROM {_DB}.ml_detected_anomalies "
            "WHERE src_ip = toIPv6({ip:String}) "
            "AND detected_at >= now() - INTERVAL 30 DAY "
            "ORDER BY detected_at DESC",
            bindings,
        )
        score_rows = query(
            "SELECT *, toString(src_ip) AS src_ip_str "
            f"FROM {_DB}.ml_all_scores "
            "WHERE src_ip = toIPv6({ip:String}) "
            "AND detected_at >= now() - INTERVAL 3 DAY "
            "ORDER BY detected_at DESC",
            bindings,
        )
        log_rows = query(
            "SELECT time, method, host, path, http_version, header_user_agent, ja4 "
            f"FROM {_DB_LOGS}.http_logs "
            "WHERE src_ip = toIPv4OrZero({ip:String}) "
            "AND time >= now() - INTERVAL 1 DAY "
            "ORDER BY time DESC LIMIT 100",
            bindings,
        )

        # Optional view: a failure is logged at debug level and ignored.
        feature_rows: list[dict] = []
        try:
            feature_rows = query(
                f"SELECT * FROM {_DB}.view_ai_features_1h "
                "WHERE src_ip = toIPv6({ip:String}) LIMIT 1",
                bindings,
            )
        except Exception:
            logger.debug("view_ai_features_1h unavailable for %s", ip)

        # Optional view: same best-effort handling.
        recurrence_rows: list[dict] = []
        try:
            recurrence_rows = query(
                f"SELECT * FROM {_DB}.view_ip_recurrence "
                "WHERE src_ip = toIPv6({ip:String})",
                bindings,
            )
        except Exception:
            logger.debug("view_ip_recurrence unavailable for %s", ip)

        payload: dict[str, Any] = {"ip": ip}
        payload["detections"] = anomaly_rows
        payload["scores"] = score_rows
        payload["http_logs"] = log_rows
        payload["ai_features"] = feature_rows
        payload["recurrence"] = recurrence_rows
        return payload
    except Exception as exc:
        logger.exception("ip detail query failed for %s", ip)
        raise HTTPException(status_code=500, detail=str(exc))
# ---------------------------------------------------------------------------
# GET /api/features
# ---------------------------------------------------------------------------
@router.get("/features")
async def features() -> dict[str, Any]:
    """Return aggregate statistics of the AI and thesis feature views.

    Returns:
        A dict with ``ai_features`` and ``thesis_features``; each holds the
        row count and the averages of hits, hit velocity, fuzzing index and
        POST ratio for its view, or stays an empty dict when the view cannot
        be queried or returns nothing.
    """
    result: dict[str, Any] = {"ai_features": {}, "thesis_features": {}}
    aggregates = (
        "SELECT count() AS total, "
        "avg(hits) AS avg_hits, "
        "avg(hit_velocity) AS avg_hit_velocity, "
        "avg(fuzzing_index) AS avg_fuzzing_index, "
        "avg(post_ratio) AS avg_post_ratio "
    )
    # (result key, view name, debug message when the view is unavailable)
    sources = (
        ("ai_features", "view_ai_features_1h", "view_ai_features_1h not available"),
        ("thesis_features", "view_thesis_features_1h", "view_thesis_features_1h not available"),
    )
    for key, view, unavailable in sources:
        try:
            rows = query(aggregates + f"FROM {_DB}.{view}")
            if rows:
                result[key] = rows[0]
        except Exception:
            # Best-effort: a missing view leaves its entry empty.
            logger.debug(unavailable)
    return result
# ---------------------------------------------------------------------------
# GET /api/models
# ---------------------------------------------------------------------------
# Directory scanned for model metadata files (*.json).
_MODEL_DIR = Path("/data/models")


@router.get("/models")
async def models() -> dict[str, Any]:
    """Return model metadata files and recent scoring statistics.

    Returns:
        A dict with ``models`` (the parsed content of every ``*.json`` file
        in the model directory, in sorted path order; unreadable files are
        skipped with a warning) and ``scoring_stats`` (per-model row count
        and first/last detection time over the last 7 days, or an empty list
        when the query fails).
    """
    metadata: list[dict[str, Any]] = []
    if _MODEL_DIR.is_dir():
        for meta_path in sorted(_MODEL_DIR.glob("*.json")):
            try:
                parsed = json.loads(meta_path.read_text())
                metadata.append(parsed)
            except Exception:
                logger.warning("Could not read model metadata %s", meta_path)

    # Scoring activity per model comes from ClickHouse; best-effort.
    recent_stats: list[dict] = []
    try:
        recent_stats = query(
            "SELECT model_name, count() AS scored, "
            "min(detected_at) AS first_seen, max(detected_at) AS last_seen "
            f"FROM {_DB}.ml_all_scores "
            "WHERE detected_at >= now() - INTERVAL 7 DAY "
            "GROUP BY model_name"
        )
    except Exception:
        logger.debug("could not fetch model scoring stats")

    return {"models": metadata, "scoring_stats": recent_stats}
# ---------------------------------------------------------------------------
# POST /api/classify — SOC analyst feedback
# ---------------------------------------------------------------------------
class ClassifyRequest(BaseModel):
    """Request body of POST /api/classify: one analyst verdict for one source IP."""

    # Source IP the verdict applies to; the handler inserts it via toIPv6().
    src_ip: str
    # Verdict; the handler accepts only: bot | legitimate | suspicious
    classification: str
    # Optional free-text note; defaults to an empty string.
    comment: str = ""
_VALID_CLASSIFICATIONS = {"bot", "legitimate", "suspicious"}
_feedback_table_ensured = False
def _ensure_feedback_table() -> None:
    """Create the ``soc_feedback`` table on first use.

    The DDL is issued at most once per process: after a successful ``execute``
    the module-level flag is set and later calls do nothing. If ``execute``
    raises, the flag stays unset and the next call tries again.
    """
    global _feedback_table_ensured
    if not _feedback_table_ensured:
        ddl = (
            f"CREATE TABLE IF NOT EXISTS {_DB}.soc_feedback ("
            " created_at DateTime DEFAULT now(),"
            " src_ip IPv6,"
            " classification LowCardinality(String),"
            " comment String"
            ") ENGINE = MergeTree() ORDER BY (src_ip, created_at)"
        )
        execute(ddl)
        _feedback_table_ensured = True
@router.post("/classify")
async def classify(body: ClassifyRequest) -> dict[str, Any]:
    """Store one SOC analyst verdict for a source IP.

    Args:
        body: the IP, its label (one of ``_VALID_CLASSIFICATIONS``) and an
            optional comment.

    Returns:
        ``{"status": "ok", "src_ip": ..., "classification": ...}`` on success.

    Raises:
        HTTPException: 422 when the label or the IP address is invalid,
            500 when the table creation or the insert fails.
    """
    # Local import keeps this block self-contained.
    import ipaddress

    if body.classification not in _VALID_CLASSIFICATIONS:
        raise HTTPException(
            status_code=422,
            # sorted() gives a stable message; a raw set prints in arbitrary order.
            detail=f"classification must be one of {sorted(_VALID_CLASSIFICATIONS)}",
        )

    # Reject malformed addresses here: otherwise toIPv6() fails inside
    # ClickHouse and a client mistake is reported as a 500 server error.
    try:
        ipaddress.ip_address(body.src_ip)
    except ValueError:
        raise HTTPException(
            status_code=422,
            detail=f"src_ip is not a valid IP address: {body.src_ip!r}",
        ) from None

    try:
        _ensure_feedback_table()
        execute(
            f"INSERT INTO {_DB}.soc_feedback (src_ip, classification, comment) VALUES "
            "(toIPv6({ip:String}), {cls:String}, {cmt:String})",
            {"ip": body.src_ip, "cls": body.classification, "cmt": body.comment},
        )
        return {"status": "ok", "src_ip": body.src_ip, "classification": body.classification}
    except Exception as exc:
        logger.exception("classify insert failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# ---------------------------------------------------------------------------
# GET /api/classifications — recent SOC feedback
# ---------------------------------------------------------------------------
@router.get("/classifications")
async def classifications() -> dict[str, Any]:
    """Return the 50 most recent SOC feedback rows, newest first.

    Any failure (table creation or query) is logged and answered with an
    empty list instead of an error status.
    """
    try:
        _ensure_feedback_table()
        sql = (
            f"SELECT created_at, toString(src_ip) AS src_ip, classification, comment "
            f"FROM {_DB}.soc_feedback "
            "ORDER BY created_at DESC LIMIT 50"
        )
        recent = query(sql)
    except Exception:
        logger.exception("classifications query failed")
        return {"data": []}
    return {"data": recent}

View File

@ -1,93 +0,0 @@
"""
Endpoints pour la liste des attributs uniques
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..models import AttributeListResponse, AttributeListItem
from ..config import settings
router = APIRouter(prefix="/api/attributes", tags=["attributes"])
@router.get("/{attr_type}", response_model=AttributeListResponse)
async def get_attributes(
    attr_type: str,
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum de résultats")
):
    """
    List the distinct values of one attribute type, most frequent first.

    Counts come from ml_detected_anomalies over the last 24 hours. An unknown
    ``attr_type`` is answered with HTTP 400; any other failure with HTTP 500.
    """
    try:
        # Public attribute type → column it is read from
        columns_by_type = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
            "threat_level": "threat_level",
            "model_name": "model_name",
            "asn_org": "asn_org"
        }
        column = columns_by_type.get(attr_type)
        if column is None:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(columns_by_type.keys())}"
            )

        if attr_type == "ip":
            # IP-typed columns cannot be compared with '': convert to string
            # in a sub-query and filter on the converted value.
            sql = f"""
            SELECT value, count FROM (
            SELECT toString({column}) AS value, count() AS count
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY {column}
            )
            WHERE value != '' AND value IS NOT NULL
            ORDER BY count DESC
            LIMIT %(limit)s
            """
        else:
            # Base query, completed below with the empty/NULL exclusion filter
            base_query = f"""
            SELECT
            {column} AS value,
            count() AS count
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 24 HOUR
            """
            sql = f"""
            {base_query}
            AND {column} != '' AND {column} IS NOT NULL
            GROUP BY value
            ORDER BY count DESC
            LIMIT %(limit)s
            """

        fetched = db.query(sql, {"limit": limit})
        items = []
        for row in fetched.result_rows:
            items.append(AttributeListItem(value=str(row[0]), count=row[1]))

        return AttributeListResponse(
            type=attr_type,
            items=items,
            total=len(items)
        )
    except HTTPException:
        # Let the 400 above through untouched
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -1,239 +0,0 @@
"""
Routes pour l'audit et les logs d'activité
"""
import logging
from fastapi import APIRouter, HTTPException, Query, Request
from typing import Optional
from datetime import datetime
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/audit", tags=["audit"])
logger = logging.getLogger(__name__)
@router.post("/logs")
async def create_audit_log(
    request: Request,
    action: str,
    entity_type: Optional[str] = None,
    entity_id: Optional[str] = None,
    entity_count: Optional[int] = None,
    details: Optional[dict] = None,
    user: Optional[str] = "soc_user"
):
    """
    Record an audit log entry for a user action.

    The insert is best-effort: if it fails (for instance because the
    audit_logs table has not been created yet) the failure is logged as a
    warning and the caller still receives a success payload.

    Returns:
        A dict with ``status``, ``message``, the ``action`` and the ISO
        ``timestamp`` used for the entry.

    Raises:
        HTTPException: 500 on any unexpected error outside the insert itself.
    """
    # Local import keeps this block self-contained.
    import json

    try:
        # Client IP as seen by the server
        client_ip = request.client.host if request.client else "unknown"

        insert_query = f"""
        INSERT INTO {settings.CLICKHOUSE_DB_PROCESSING}.audit_logs
        (timestamp, user_name, action, entity_type, entity_id, entity_count, details, client_ip)
        VALUES
        (%(timestamp)s, %(user)s, %(action)s, %(entity_type)s, %(entity_id)s, %(entity_count)s, %(details)s, %(client_ip)s)
        """
        params = {
            'timestamp': datetime.now(),
            'user': user,
            'action': action,
            'entity_type': entity_type,
            'entity_id': entity_id,
            'entity_count': entity_count,
            # Store real JSON rather than a Python repr so the column can be
            # parsed back. default=str is deliberate: it mirrors the old
            # str() behaviour of never failing on an unusual value.
            'details': json.dumps(details, ensure_ascii=False, default=str) if details else '',
            'client_ip': client_ip
        }

        # Note: This requires the audit_logs table to exist
        # See deploy_audit_logs_table.sql
        try:
            db.query(insert_query, params)
        except Exception as e:
            # The table may not exist yet: log it but do not block the caller
            logger.warning("Could not insert audit log: %s", e)

        return {
            "status": "success",
            "message": "Audit log created",
            "action": action,
            "timestamp": params['timestamp'].isoformat()
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
@router.get("/logs")
async def get_audit_logs(
    hours: int = Query(24, ge=1, le=720, description="Fenêtre temporelle en heures"),
    user: Optional[str] = Query(None, description="Filtrer par utilisateur"),
    action: Optional[str] = Query(None, description="Filtrer par action"),
    entity_type: Optional[str] = Query(None, description="Filtrer par type d'entité"),
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum de résultats")
):
    """
    Return audit log entries, newest first, with optional filters.

    Returns:
        ``{"items": [...], "total": n, "period_hours": hours}``; when the
        audit_logs table does not exist yet, an empty result carrying a
        ``warning`` key is returned instead of an error.

    Raises:
        HTTPException: 500 on any other query failure.
    """
    try:
        where_clauses = ["timestamp >= now() - INTERVAL %(hours)s HOUR"]
        params = {"hours": hours, "limit": limit}

        # Each optional filter adds one bound condition
        if user:
            where_clauses.append("user_name = %(user)s")
            params["user"] = user
        if action:
            where_clauses.append("action = %(action)s")
            params["action"] = action
        if entity_type:
            where_clauses.append("entity_type = %(entity_type)s")
            params["entity_type"] = entity_type

        where_clause = " AND ".join(where_clauses)
        query = f"""
        SELECT
        timestamp,
        user_name,
        action,
        entity_type,
        entity_id,
        entity_count,
        details,
        client_ip
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.audit_logs
        WHERE {where_clause}
        ORDER BY timestamp DESC
        LIMIT %(limit)s
        """
        result = db.query(query, params)

        logs = []
        for row in result.result_rows:
            logs.append({
                "timestamp": row[0].isoformat() if row[0] else "",
                "user_name": row[1] or "",
                "action": row[2] or "",
                "entity_type": row[3] or "",
                "entity_id": row[4] or "",
                "entity_count": row[5] or 0,
                "details": row[6] or "",
                "client_ip": row[7] or ""
            })

        return {
            "items": logs,
            "total": len(logs),
            "period_hours": hours
        }
    except Exception as e:
        # If the table doesn't exist, return an empty result. The server
        # wording varies ("doesn't exist" / "does not exist"), so the
        # UNKNOWN_TABLE error-code name is accepted as well.
        msg = str(e)
        table_missing = "UNKNOWN_TABLE" in msg or (
            "Table" in msg and ("doesn't exist" in msg or "does not exist" in msg)
        )
        if table_missing:
            return {
                "items": [],
                "total": 0,
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {msg}") from e
@router.get("/stats")
async def get_audit_stats(
    hours: int = Query(24, ge=1, le=720)
):
    """
    Return per-action audit statistics over the last ``hours`` hours.

    Returns:
        ``{"items": [...], "period_hours": hours}`` with one item per action
        (count, distinct users, summed entity_count), most frequent first.
        When the audit_logs table does not exist yet, an empty result carrying
        a ``warning`` key is returned instead of an error.

    Raises:
        HTTPException: 500 on any other query failure.
    """
    try:
        query = f"""
        SELECT
        action,
        count() AS count,
        uniq(user_name) AS unique_users,
        sum(entity_count) AS total_entities
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.audit_logs
        WHERE timestamp >= now() - INTERVAL %(hours)s HOUR
        GROUP BY action
        ORDER BY count DESC
        """
        result = db.query(query, {"hours": hours})

        stats = []
        for row in result.result_rows:
            stats.append({
                "action": row[0] or "",
                "count": row[1] or 0,
                "unique_users": row[2] or 0,
                "total_entities": row[3] or 0
            })

        return {
            "items": stats,
            "period_hours": hours
        }
    except Exception as e:
        # The server wording varies ("doesn't exist" / "does not exist"), so
        # the UNKNOWN_TABLE error-code name is accepted as well.
        msg = str(e)
        table_missing = "UNKNOWN_TABLE" in msg or (
            "Table" in msg and ("doesn't exist" in msg or "does not exist" in msg)
        )
        if table_missing:
            return {
                "items": [],
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {msg}") from e
@router.get("/users/activity")
async def get_user_activity(
    hours: int = Query(24, ge=1, le=720)
):
    """
    Return per-user activity over the last ``hours`` hours.

    Returns:
        ``{"items": [...], "period_hours": hours}`` with one item per user
        (action count, distinct action types, first and last action as ISO
        strings), most active first. When the audit_logs table does not exist
        yet, an empty result carrying a ``warning`` key is returned instead of
        an error.

    Raises:
        HTTPException: 500 on any other query failure.
    """
    try:
        query = f"""
        SELECT
        user_name,
        count() AS actions,
        uniq(action) AS action_types,
        min(timestamp) AS first_action,
        max(timestamp) AS last_action
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.audit_logs
        WHERE timestamp >= now() - INTERVAL %(hours)s HOUR
        GROUP BY user_name
        ORDER BY actions DESC
        """
        result = db.query(query, {"hours": hours})

        users = []
        for row in result.result_rows:
            users.append({
                "user_name": row[0] or "",
                "actions": row[1] or 0,
                "action_types": row[2] or 0,
                "first_action": row[3].isoformat() if row[3] else "",
                "last_action": row[4].isoformat() if row[4] else ""
            })

        return {
            "items": users,
            "period_hours": hours
        }
    except Exception as e:
        # The server wording varies ("doesn't exist" / "does not exist"), so
        # the UNKNOWN_TABLE error-code name is accepted as well.
        msg = str(e)
        table_missing = "UNKNOWN_TABLE" in msg or (
            "Table" in msg and ("doesn't exist" in msg or "does not exist" in msg)
        )
        if table_missing:
            return {
                "items": [],
                "period_hours": hours,
                "warning": "Audit logs table not created yet"
            }
        raise HTTPException(status_code=500, detail=f"Erreur: {msg}") from e

View File

@ -1,107 +0,0 @@
"""
Endpoints pour l'analyse des botnets via la propagation des fingerprints JA4
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/botnets", tags=["botnets"])
def _botnet_class(unique_countries: int) -> str:
"""Classifie un JA4 selon sa dispersion géographique."""
if unique_countries > 100:
return "global_botnet"
if unique_countries > 20:
return "regional_botnet"
return "concentrated"
@router.get("/ja4-spread")
async def get_ja4_spread():
    """Spread of JA4 fingerprints across countries and IPs.

    Every row of view_host_ja4_anomalies is returned, widest country spread
    first, enriched with a distribution score (countries divided by the square
    root of the IP count) and a botnet class. Any failure becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        ja4,
        unique_ips,
        unique_countries,
        targeted_hosts
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ja4_anomalies
        ORDER BY unique_countries DESC
        """
        fetched = db.query(sql)

        items = []
        for row in fetched.result_rows:
            fingerprint = str(row[0])
            ip_total = int(row[1])
            country_total = int(row[2])
            host_total = int(row[3])
            # The 0.001 floor avoids a division by zero when no IP is counted
            denominator = max(ip_total ** 0.5, 0.001)
            items.append({
                "ja4": fingerprint,
                "unique_ips": ip_total,
                "unique_countries": country_total,
                "targeted_hosts": host_total,
                "distribution_score": round(country_total / denominator, 2),
                "botnet_class": _botnet_class(country_total),
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ja4/{ja4}/countries")
async def get_ja4_countries(ja4: str, limit: int = Query(30, ge=1, le=200)):
    """Top countries for one JA4 fingerprint, read from agg_host_ip_ja4_1h.

    Countries are ordered by distinct IP count (the ``::ffff:`` prefix is
    stripped before counting). Any failure becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        src_country_code AS country_code,
        uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
        sum(hits) AS hits
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE ja4 = %(ja4)s
        GROUP BY src_country_code
        ORDER BY unique_ips DESC
        LIMIT %(limit)s
        """
        fetched = db.query(sql, {"ja4": ja4, "limit": limit})

        items = []
        for row in fetched.result_rows:
            items.append({
                "country_code": str(row[0]),
                "unique_ips": int(row[1]),
                "hits": int(row[2]),
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/summary")
async def get_botnets_summary():
    """Global statistics about detected botnets.

    Reads a single aggregate row from view_host_ja4_anomalies: the number of
    fingerprints seen in more than 100 countries, the IP total of fingerprints
    seen in more than 50 countries, and the fingerprints with the widest
    country spread and the most IPs. Any failure becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        countIf(unique_countries > 100) AS total_global_botnets,
        sumIf(unique_ips, unique_countries > 50) AS total_ips_in_botnets,
        argMax(ja4, unique_countries) AS most_spread_ja4,
        argMax(ja4, unique_ips) AS most_ips_ja4
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ja4_anomalies
        """
        summary = db.query(sql).result_rows[0]
        payload = {
            "total_global_botnets": int(summary[0]),
            "total_ips_in_botnets": int(summary[1]),
            "most_spread_ja4": str(summary[2]),
            "most_ips_ja4": str(summary[3]),
        }
        return payload
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,142 +0,0 @@
"""
Endpoints pour l'analyse des attaques par force brute sur les formulaires
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/bruteforce", tags=["bruteforce"])
@router.get("/targets")
async def get_bruteforce_targets():
    """Hosts targeted by brute force, ordered by total_hits descending.

    Each host is tagged ``credential_stuffing`` when its query-parameter total
    exceeds half of its hit total, ``enumeration`` otherwise. Any failure
    becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        host,
        uniq(src_ip) AS unique_ips,
        sum(hits) AS total_hits,
        sum(query_params_count) AS total_params,
        groupArray(3)(ja4) AS top_ja4s
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_form_bruteforce_detected
        GROUP BY host
        ORDER BY total_hits DESC
        """
        fetched = db.query(sql)

        items = []
        for row in fetched.result_rows:
            target = str(row[0])
            ip_total = int(row[1])
            hit_total = int(row[2])
            param_total = int(row[3])
            fingerprints = [str(j) for j in (row[4] or [])]

            # hit_total is tested first so the ratio is never computed on zero
            if hit_total > 0 and param_total / hit_total > 0.5:
                kind = "credential_stuffing"
            else:
                kind = "enumeration"

            items.append({
                "host": target,
                "unique_ips": ip_total,
                "total_hits": hit_total,
                "total_params": param_total,
                "attack_type": kind,
                "top_ja4s": fingerprints,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/attackers")
async def get_bruteforce_attackers(limit: int = Query(50, ge=1, le=500)):
    """Top attacking IPs, ordered by total_hits descending.

    Reads view_form_bruteforce_detected grouped by source IP; the ``::ffff:``
    prefix is stripped from the returned address. Any failure becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
        uniq(host) AS distinct_hosts,
        sum(hits) AS total_hits,
        sum(query_params_count) AS total_params,
        argMax(ja4, hits) AS ja4
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_form_bruteforce_detected
        GROUP BY src_ip
        ORDER BY total_hits DESC
        LIMIT %(limit)s
        """
        fetched = db.query(sql, {"limit": limit})

        items = [
            {
                "ip": str(row[0]),
                "distinct_hosts": int(row[1]),
                "total_hits": int(row[2]),
                "total_params": int(row[3]),
                "ja4": str(row[4]),
            }
            for row in fetched.result_rows
        ]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/timeline")
async def get_bruteforce_timeline():
    """Hits and distinct IPs per hour of day over the last 72 hours.

    Rows of agg_host_ip_ja4_1h are bucketed with ``toHour(window_start)``, so
    the same hour of different days falls into one bucket and the result is
    ordered by that hour value, not chronologically. Any failure becomes
    HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        toHour(window_start) AS hour,
        sum(hits) AS hits,
        uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS ips
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
        GROUP BY hour
        ORDER BY hour ASC
        """
        fetched = db.query(sql)

        buckets = [
            {
                "hour": int(row[0]),
                "hits": int(row[1]),
                "ips": int(row[2]),
            }
            for row in fetched.result_rows
        ]
        return {"hours": buckets}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/host/{host:path}/attackers")
async def get_host_attackers(host: str, limit: int = Query(20, ge=1, le=200)):
    """Top IPs attacking one specific host, with JA4 and attack type.

    An IP is tagged ``credential_stuffing`` when its query-parameter total
    exceeds half of its hit total, ``enumeration`` otherwise. Any failure
    becomes HTTP 500.
    """
    try:
        sql = f"""
        SELECT
        replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
        sum(hits) AS total_hits,
        sum(query_params_count) AS total_params,
        argMax(ja4, hits) AS ja4,
        max(hits) AS max_hits_per_window
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_form_bruteforce_detected
        WHERE host = %(host)s
        GROUP BY src_ip
        ORDER BY total_hits DESC
        LIMIT %(limit)s
        """
        fetched = db.query(sql, {"host": host, "limit": limit})

        items = []
        for row in fetched.result_rows:
            hit_total = int(row[1])
            param_total = int(row[2])
            # hit_total is tested first so the ratio is never computed on zero
            if hit_total > 0 and param_total / hit_total > 0.5:
                kind = "credential_stuffing"
            else:
                kind = "enumeration"
            items.append({
                "ip": str(row[0]),
                "total_hits": hit_total,
                "total_params": param_total,
                "ja4": str(row[3] or ""),
                "attack_type": kind,
            })
        return {"host": host, "items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,554 +0,0 @@
"""
Clustering d'IPs multi-métriques — WebGL / deck.gl backend.
- Calcul sur la TOTALITÉ des IPs (GROUP BY src_ip, ja4 sans LIMIT)
- K-means++ vectorisé (numpy) + PCA-2D + enveloppes convexes (scipy)
- Calcul en background thread + cache 30 min
- Endpoints : /clusters, /status, /cluster/{id}/points
"""
import math
import time
import logging
import threading
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import Any
import numpy as np
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..services.clustering_engine import (
FEATURE_NAMES,
build_feature_vector, kmeans_pp, pca_2d, compute_hulls,
name_cluster, risk_score_from_centroid, standardize,
risk_to_gradient_color,
)
from ..config import settings
log = logging.getLogger(__name__)
router = APIRouter(prefix="/api/clustering", tags=["clustering"])
# ─── Global cache ──────────────────────────────────────────────────────────────
# State shared between the request handlers and the background clustering
# job. Every access in this module happens while holding _LOCK.
_CACHE: dict[str, Any] = {
    "status": "idle", # idle | computing | ready | error
    "error": None, # message of the last failed run (str(exception)), else None
    "result": None, # full result dict
    "ts": 0.0, # timestamp of the last update
    "params": {}, # k / hours / sensitivity of the run that produced "result"
    "cluster_ips": {}, # cluster_idx → [(ip, ja4, pca_x, pca_y, risk)]
}
_CACHE_TTL = 1800 # 30 minutes
_LOCK = threading.Lock()
# Single worker: clustering jobs submitted to this pool run one at a time.
_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="clustering")
# ─── Palette de couleurs (remplace l'ancienne logique menace) ─────────────────
# Les couleurs sont désormais attribuées par index de cluster pour maximiser
# la distinction visuelle, indépendamment du niveau de risque.
# ─── SQL: ALL IPs, no LIMIT ──────────────────────────────────────────────────
# One output row per (src_ip, ja4) pair seen in agg_host_ip_ja4_1h during the
# last %(hours)s hours, enriched with averaged ML features and header data.
# The '%%' sequences are literal percent signs escaped for the %(name)s
# parameter style used by this query.
# NOTE(review): the LEFT JOIN on ml_detected_anomalies can presumably match
# several rows per (src_ip, ja4); if so, sum(t.hits) counts each t row once
# per match. Confirm whether "hits" is inflated by this fan-out.
_SQL_ALL_IPS = f"""
SELECT
replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS ip,
t.ja4,
any(t.tcp_ttl_raw) AS ttl,
any(t.tcp_win_raw) AS win,
any(t.tcp_scale_raw) AS scale,
any(t.tcp_mss_raw) AS mss,
any(t.first_ua) AS ua,
sum(t.hits) AS hits,
avg(abs(ml.anomaly_score)) AS avg_score,
avg(ml.hit_velocity) AS avg_velocity,
avg(ml.fuzzing_index) AS avg_fuzzing,
avg(ml.is_headless) AS pct_headless,
avg(ml.post_ratio) AS avg_post,
avg(ml.ip_id_zero_ratio) AS ip_id_zero,
avg(ml.temporal_entropy) AS entropy,
avg(ml.modern_browser_score) AS browser_score,
avg(ml.alpn_http_mismatch) AS alpn_mismatch,
avg(ml.is_alpn_missing) AS alpn_missing,
avg(ml.multiplexing_efficiency) AS h2_eff,
avg(ml.header_order_confidence) AS hdr_conf,
avg(ml.ua_ch_mismatch) AS ua_ch_mismatch,
avg(ml.asset_ratio) AS asset_ratio,
avg(ml.direct_access_ratio) AS direct_ratio,
avg(ml.distinct_ja4_count) AS ja4_count,
max(ml.is_ua_rotating) AS ua_rotating,
max(ml.threat_level) AS threat,
any(ml.country_code) AS country,
any(ml.asn_org) AS asn_org,
-- Features headers HTTP (depuis view_dashboard_entities)
avg(ml.has_accept_language) AS hdr_accept_lang,
any(vh.hdr_enc) AS hdr_has_encoding,
any(vh.hdr_sec_fetch) AS hdr_has_sec_fetch,
any(vh.hdr_count) AS hdr_count_raw,
-- Fingerprint HTTP Headers (depuis agg_header_fingerprint_1h + ml_detected_anomalies)
-- header_order_shared_count : nb d'IPs partageant le même fingerprint
-- → faible = fingerprint rare = comportement suspect
avg(ml.header_order_shared_count) AS hfp_shared_count,
-- distinct_header_orders : nb de fingerprints distincts émis par cette IP
-- → élevé = rotation de fingerprint = comportement bot
avg(ml.distinct_header_orders) AS hfp_distinct_orders,
-- Cookie et Referer issus de la table dédiée aux empreintes
any(hfp.hfp_cookie) AS hfp_cookie,
any(hfp.hfp_referer) AS hfp_referer
FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h t
LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies ml
ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
LEFT JOIN (
SELECT
toIPv6(concat('::ffff:', toString(src_ip))) AS src_ip_v6,
ja4,
any(arrayExists(x -> x LIKE '%%Accept-Encoding%%', client_headers)) AS hdr_enc,
any(arrayExists(x -> x LIKE '%%Sec-Fetch%%', client_headers)) AS hdr_sec_fetch,
any(length(splitByChar(',', client_headers[1]))) AS hdr_count
FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
WHERE length(client_headers) > 0
AND log_date >= today() - 2
GROUP BY src_ip_v6, ja4
) vh ON t.src_ip = vh.src_ip_v6 AND t.ja4 = vh.ja4
LEFT JOIN (
SELECT
src_ip,
avg(has_cookie) AS hfp_cookie,
avg(has_referer) AS hfp_referer
FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h
WHERE window_start >= now() - INTERVAL %(hours)s HOUR
GROUP BY src_ip
) hfp ON t.src_ip = hfp.src_ip
WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
AND t.tcp_ttl_raw > 0
GROUP BY t.src_ip, t.ja4
"""
# Column names in the exact order of the SELECT list above. Result rows are
# turned into dicts by position, so the two must be kept in sync.
_SQL_COLS = [
    "ip", "ja4", "ttl", "win", "scale", "mss", "ua", "hits",
    "avg_score", "avg_velocity", "avg_fuzzing", "pct_headless", "avg_post",
    "ip_id_zero", "entropy", "browser_score", "alpn_mismatch", "alpn_missing",
    "h2_eff", "hdr_conf", "ua_ch_mismatch", "asset_ratio", "direct_ratio",
    "ja4_count", "ua_rotating", "threat", "country", "asn_org",
    "hdr_accept_lang", "hdr_has_encoding", "hdr_has_sec_fetch", "hdr_count_raw",
    "hfp_shared_count", "hfp_distinct_orders", "hfp_cookie", "hfp_referer",
]
# ─── Worker de clustering (thread pool) ──────────────────────────────────────
def _run_clustering_job(k: int, hours: int, sensitivity: float = 1.0) -> None:
    """Run one clustering pass in the thread pool and publish it in ``_CACHE``.

    Steps: load every (ip, ja4) row, build and standardize the feature matrix,
    run K-means++, project with PCA-2D, compute convex hulls, then build the
    cluster nodes and the nearest-neighbour edges between clusters.

    Args:
        k: base number of clusters.
        hours: time window, passed to the SQL query as ``hours``.
        sensitivity: multiplier applied to ``k``, expected in [0.5, 5.0].
            0.5 = very aggregated view (k/2 clusters), 1.0 = default,
            2.0 = twice as many clusters (more homogeneous groups),
            5.0 = maximum granularity.

    The effective cluster count ``round(k * sensitivity)`` is clamped to
    [4, 300] to bound computation time, and ``n_init`` drops to 1 above 60
    clusters to stay fast.

    Nothing is returned and nothing is raised: on success ``_CACHE`` receives
    the result with status "ready"; on any exception the error is logged and
    ``_CACHE`` gets status "error" plus the exception message.
    """
    k_actual = max(4, min(300, round(k * sensitivity)))
    t0 = time.time()
    with _LOCK:
        _CACHE["status"] = "computing"
        _CACHE["error"] = None
    try:
        log.info(f"[clustering] Démarrage k={k_actual} (base={k}×sens={sensitivity}) hours={hours}")

        # ── 1. Load all IPs ──────────────────────────────────────────────
        result = db.query(_SQL_ALL_IPS, {"hours": hours})
        rows: list[dict] = []
        for row in result.result_rows:
            rows.append({col: row[i] for i, col in enumerate(_SQL_COLS)})
        n = len(rows)
        log.info(f"[clustering] {n} IPs chargées")
        if n < k_actual:
            raise ValueError(f"Seulement {n} IPs disponibles (k={k_actual} requis)")

        # ── 2. Build the feature matrix (numpy) ──────────────────────────
        X = np.array([build_feature_vector(r) for r in rows], dtype=np.float32)
        log.info(f"[clustering] Matrice X: {X.shape}{X.nbytes/1024/1024:.1f} MB")

        # ── 3. Z-score standardization ───────────────────────────────────
        # Normalizes by variance: discriminating features (high std)
        # contribute more than near-constant ones.
        X64 = X.astype(np.float64)
        X_std, feat_mean, feat_std = standardize(X64)

        # ── 4. K-means++ in the standardized space ───────────────────────
        # n_init reduced to 1 for large k (> 60) to limit computation time
        n_init = 1 if k_actual > 60 else 3
        km = kmeans_pp(X_std, k=k_actual, max_iter=80, n_init=n_init, seed=42)
        log.info(f"[clustering] K-means: {km.n_iter} iters, inertia={km.inertia:.2f}")
        # Centroids back in the original [0,1] space for the radar display
        # (de-standardization: c_orig = c_std * std + mean, then clip to [0,1])
        centroids_orig = np.clip(km.centroids * feat_std + feat_mean, 0.0, 1.0)

        # ── 5. PCA-2D on the ORIGINAL features (normalized [0,1]) ────────
        coords = pca_2d(X64)  # (n, 2), normalized [0,1]

        # ── 5b. Convex hull per cluster ──────────────────────────────────
        hulls = compute_hulls(coords, km.labels, k_actual)

        # ── 6. Aggregation per cluster ───────────────────────────────────
        cluster_rows: list[list[dict]] = [[] for _ in range(k_actual)]
        cluster_coords: list[list[list[float]]] = [[] for _ in range(k_actual)]
        cluster_ips_map: dict[int, list] = {j: [] for j in range(k_actual)}
        # The risk score depends only on the cluster centroid, so it is
        # computed once per non-empty cluster instead of once per IP.
        cluster_risk: dict[int, float] = {}
        for i, label in enumerate(km.labels):
            j = int(label)
            if j not in cluster_risk:
                cluster_risk[j] = float(risk_score_from_centroid(centroids_orig[j]))
            cluster_rows[j].append(rows[i])
            cluster_coords[j].append(coords[i].tolist())
            cluster_ips_map[j].append((
                rows[i]["ip"],
                rows[i]["ja4"],
                float(coords[i][0]),
                float(coords[i][1]),
                cluster_risk[j],
            ))

        def topk(lst: list[str], n: int = 5) -> list[str]:
            """Return the n most frequent values of a list (empty values excluded)."""
            return [v for v, _ in Counter(lst).most_common(n) if v]

        # ── 7. Build the nodes ───────────────────────────────────────────
        nodes = []
        for j in range(k_actual):
            if not cluster_rows[j]:
                continue

            # crows is bound as a default so the helper keeps this
            # iteration's rows.
            def avg_f(key: str, crows: list[dict] = cluster_rows[j]) -> float:
                """Return the float mean of a numeric field over the cluster rows."""
                return float(np.mean([float(r.get(key) or 0) for r in crows]))

            mean_ttl = avg_f("ttl")
            mean_mss = avg_f("mss")
            mean_scale = avg_f("scale")
            mean_win = avg_f("win")
            raw_stats = {"mean_ttl": mean_ttl, "mean_mss": mean_mss, "mean_scale": mean_scale}
            label_name = name_cluster(centroids_orig[j], raw_stats)
            # Non-empty clusters were all scored in step 6
            risk = cluster_risk[j]
            color = risk_to_gradient_color(risk)
            # 2D centroid = mean of the cluster's coordinates
            cxy = np.mean(cluster_coords[j], axis=0).tolist() if cluster_coords[j] else [0.5, 0.5]
            ip_set = list({r["ip"] for r in cluster_rows[j]})
            ip_count = len(ip_set)
            hit_count = int(sum(float(r.get("hits") or 0) for r in cluster_rows[j]))
            threats = [str(r.get("threat") or "") for r in cluster_rows[j] if r.get("threat")]
            countries = [str(r.get("country") or "") for r in cluster_rows[j] if r.get("country")]
            orgs = [str(r.get("asn_org") or "") for r in cluster_rows[j] if r.get("asn_org")]
            radar = [
                {"feature": name, "value": round(float(centroids_orig[j][i]), 4)}
                for i, name in enumerate(FEATURE_NAMES)
            ]
            radius = max(8, min(30, int(math.log1p(ip_count) * 2.2)))
            sample_rows = sorted(cluster_rows[j], key=lambda r: float(r.get("hits") or 0), reverse=True)[:8]
            sample_ips = [r["ip"] for r in sample_rows]
            sample_ua = str(cluster_rows[j][0].get("ua") or "")
            nodes.append({
                "id": f"c{j}_k{k_actual}",
                "cluster_idx": j,
                "label": label_name,
                "pca_x": round(cxy[0], 6),
                "pca_y": round(cxy[1], 6),
                "radius": radius,
                "color": color,
                "risk_score": round(risk, 4),
                "mean_ttl": round(mean_ttl, 1),
                "mean_mss": round(mean_mss, 0),
                "mean_scale": round(mean_scale, 1),
                "mean_win": round(mean_win, 0),
                "mean_velocity": round(avg_f("avg_velocity"), 3),
                "mean_fuzzing": round(avg_f("avg_fuzzing"), 3),
                "mean_headless": round(avg_f("pct_headless"), 3),
                "mean_post": round(avg_f("avg_post"), 3),
                "mean_asset": round(avg_f("asset_ratio"), 3),
                "mean_direct": round(avg_f("direct_ratio"), 3),
                "mean_alpn_mismatch": round(avg_f("alpn_mismatch"), 3),
                "mean_h2_eff": round(avg_f("h2_eff"), 3),
                "mean_hdr_conf": round(avg_f("hdr_conf"), 3),
                "mean_ua_ch": round(avg_f("ua_ch_mismatch"), 3),
                "mean_entropy": round(avg_f("entropy"), 3),
                "mean_ja4_diversity": round(avg_f("ja4_count"), 3),
                "mean_ip_id_zero": round(avg_f("ip_id_zero"), 3),
                "mean_browser_score": round(avg_f("browser_score"), 1),
                "mean_ua_rotating": round(avg_f("ua_rotating"), 3),
                "ip_count": ip_count,
                "hit_count": hit_count,
                "top_threat": topk(threats, 1)[0] if threats else "",
                "top_countries": topk(countries, 5),
                "top_orgs": topk(orgs, 5),
                "sample_ips": sample_ips,
                "sample_ua": sample_ua,
                "radar": radar,
                # Hull for the deck.gl PolygonLayer
                "hull": hulls.get(j, []),
            })

        # ── 8. k-NN edges between clusters ───────────────────────────────
        # Each node is linked to its 2 nearest nodes (squared distance between
        # centroids); `seen` prevents emitting the same pair twice.
        edges = []
        seen: set[frozenset] = set()
        for i, ni in enumerate(nodes):
            ci = ni["cluster_idx"]
            dists = sorted(
                [(j, nj["cluster_idx"],
                  float(np.sum((centroids_orig[ci] - centroids_orig[nj["cluster_idx"]]) ** 2)))
                 for j, nj in enumerate(nodes) if j != i],
                key=lambda x: x[2]
            )
            for j_idx, cj, d2 in dists[:2]:
                key = frozenset([ni["id"], nodes[j_idx]["id"]])
                if key in seen:
                    continue
                seen.add(key)
                edges.append({
                    "id": f"e_{ni['id']}_{nodes[j_idx]['id']}",
                    "source": ni["id"],
                    "target": nodes[j_idx]["id"],
                    "similarity": round(1.0 / (1.0 + math.sqrt(d2)), 3),
                })

        # ── 9. Store the result + IP cache ───────────────────────────────
        total_ips = sum(n_["ip_count"] for n_ in nodes)
        total_hits = sum(n_["hit_count"] for n_ in nodes)
        elapsed = round(time.time() - t0, 2)
        result_dict = {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_clusters": len(nodes),
                "total_ips": total_ips,
                "total_hits": total_hits,
                "n_samples": n,
                "k": k_actual,
                "k_base": k,
                "sensitivity": sensitivity,
                "elapsed_s": elapsed,
            },
            "feature_names": FEATURE_NAMES,
        }
        with _LOCK:
            _CACHE["result"] = result_dict
            _CACHE["cluster_ips"] = cluster_ips_map
            _CACHE["status"] = "ready"
            _CACHE["ts"] = time.time()
            _CACHE["params"] = {"k": k, "hours": hours, "sensitivity": sensitivity}
            _CACHE["error"] = None
        log.info(f"[clustering] Terminé en {elapsed}s — {total_ips} IPs, {len(nodes)} clusters")
    except Exception as e:
        log.exception("[clustering] Erreur lors du calcul")
        with _LOCK:
            _CACHE["status"] = "error"
            _CACHE["error"] = str(e)
# Future of the most recently submitted clustering job (None before the first
# submission). Read and written only while holding _LOCK.
_PENDING_JOB: Any = None


def _maybe_trigger(k: int, hours: int, sensitivity: float) -> None:
    """Start a clustering job if the cache is missing, expired or was built
    with different parameters.

    Nothing is submitted when a job is already running or queued, or when the
    cache is "ready", younger than ``_CACHE_TTL`` and matches the requested
    parameters.

    Args:
        k: base number of clusters.
        hours: time window in hours.
        sensitivity: multiplier applied to ``k`` by the job.
    """
    global _PENDING_JOB
    with _LOCK:
        status = _CACHE["status"]
        params = _CACHE["params"]
        ts = _CACHE["ts"]
        cache_stale = (time.time() - ts) > _CACHE_TTL
        params_changed = (
            params.get("k") != k or
            params.get("hours") != hours or
            params.get("sensitivity") != sensitivity
        )
        if status in ("computing",):
            return  # already running
        # The status only becomes "computing" once the worker thread starts,
        # so also check the last submitted future: requests arriving in that
        # gap would otherwise each enqueue their own duplicate job.
        if _PENDING_JOB is not None and not _PENDING_JOB.done():
            return  # already queued or running
        if status == "ready" and not cache_stale and not params_changed:
            return  # cache is fresh
        _PENDING_JOB = _EXECUTOR.submit(_run_clustering_job, k, hours, sensitivity)
# ─── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/status")
async def get_status():
    """State of the clustering computation (polled by the frontend).

    ``age_s`` is the rounded number of seconds since the last cache update,
    or None while no update has been recorded.
    """
    with _LOCK:
        last_update = _CACHE["ts"]
        if last_update:
            age = round(time.time() - last_update, 0)
        else:
            age = None
        return {
            "status": _CACHE["status"],
            "error": _CACHE["error"],
            "ts": last_update,
            "params": _CACHE["params"],
            "age_s": age,
        }
@router.get("/clusters")
async def get_clusters(
    k: int = Query(20, ge=4, le=100, description="Nombre de clusters de base"),
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle (heures)"),
    sensitivity: float = Query(1.0, ge=0.5, le=5.0, description="Sensibilité : multiplicateur de k (5.0 = granularité maximale)"),
    force: bool = Query(False, description="Forcer le recalcul"),
):
    """
    Multi-metric clustering over ALL IPs.

    k_actual = round(k × sensitivity): the sensitivity controls granularity.
    Answers immediately from the cache and triggers a computation when one is
    needed; ``force`` resets the cache first. While no result is available the
    response only carries a status and a message; a failed computation is
    reported as HTTP 500.
    """
    if force:
        # Drop the cached result so that the trigger below always recomputes
        with _LOCK:
            _CACHE.update({
                "status": "idle",
                "ts": 0.0,
                "result": None,
                "cluster_ips": {},
            })

    _maybe_trigger(k, hours, sensitivity)

    with _LOCK:
        status, result, error = _CACHE["status"], _CACHE["result"], _CACHE["error"]

    if status == "computing":
        return {"status": "computing", "message": "Calcul en cours, réessayez dans quelques secondes"}
    if status == "error":
        raise HTTPException(status_code=500, detail=error or "Erreur inconnue")
    if result is None:
        return {"status": "idle", "message": "Calcul démarré, réessayez dans quelques secondes"}

    payload = dict(result)
    payload["status"] = "ready"
    return payload
@router.get("/cluster/{cluster_id}/points")
async def get_cluster_points(
    cluster_id: str,
    limit: int = Query(5000, ge=1, le=20000),
    offset: int = Query(0, ge=0),
):
    """
    PCA coordinates + metadata for the IPs of one cluster, paginated.

    Used by the deck.gl ScatterplotLayer (drill-down or deep zoom). The
    cluster index is the integer that follows the first character of the part
    of ``cluster_id`` before the first underscore (format: c{n}_k{k}).
    Answers 404 while the cache is not ready and 400 on an unparsable id.
    """
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]

    cache_usable = status == "ready" and bool(ips_map)
    if not cache_usable:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    head = cluster_id.split("_")[0]
    try:
        idx = int(head[1:])
    except (ValueError, IndexError):
        raise HTTPException(status_code=400, detail="cluster_id invalide (format: c{n}_k{k})")

    members = ips_map.get(idx, [])
    points = []
    for ip, ja4, x, y, risk in members[offset: offset + limit]:
        points.append({
            "ip": ip,
            "ja4": ja4,
            "pca_x": round(x, 6),
            "pca_y": round(y, 6),
            "risk": round(risk, 3),
        })
    return {"points": points, "total": len(members), "offset": offset, "limit": limit}
@router.get("/cluster/{cluster_id}/ips")
async def get_cluster_ips(
    cluster_id: str,
    limit: int = Query(100, ge=1, le=500),
    offset: int = Query(0, ge=0),
):
    """
    Return the IPs of one cluster enriched with SQL details.

    Kept for backward compatibility with the previous UI. The cluster
    members come from the in-memory clustering cache; the requested page is
    then enriched from ``agg_host_ip_ja4_1h`` left-joined with
    ``ml_detected_anomalies`` over the last 24 hours.

    Raises:
        HTTPException 404: the cache is not ready or holds no cluster members.
        HTTPException 400: no integer index can be parsed from ``cluster_id``.
        HTTPException 500: the ClickHouse query failed.
    """
    # Snapshot the shared cache under the lock, then work on the snapshot.
    with _LOCK:
        status = _CACHE["status"]
        ips_map = _CACHE["cluster_ips"]
    if status != "ready" or not ips_map:
        raise HTTPException(status_code=404, detail="Cache absent — appelez /clusters d'abord")

    try:
        idx = int(cluster_id.split("_")[0][1:])
    except (ValueError, IndexError) as exc:
        raise HTTPException(status_code=400, detail="cluster_id invalide") from exc

    members = ips_map.get(idx, [])
    total = len(members)
    page = members[offset: offset + limit]
    if not page:
        return {"ips": [], "total": total, "cluster_id": cluster_id}

    # Use the whole page: it is already bounded by ``limit`` (<= 500). The
    # previous extra cut at 200 entries silently dropped IPs whenever the
    # caller asked for more than 200 per page.
    # Quotes are stripped because the values are inlined as SQL literals.
    safe_ips = [m[0].replace("'", "") for m in page]
    ip_filter = ", ".join(f"'{ip}'" for ip in safe_ips)

    sql = f"""
        SELECT
            replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') AS src_ip,
            t.ja4,
            any(t.tcp_ttl_raw) AS ttl,
            any(t.tcp_win_raw) AS win,
            any(t.tcp_scale_raw) AS scale,
            any(t.tcp_mss_raw) AS mss,
            sum(t.hits) AS hits,
            any(t.first_ua) AS ua,
            round(avg(abs(ml.anomaly_score)), 3) AS avg_score,
            max(ml.threat_level) AS threat_level,
            any(ml.country_code) AS country_code,
            any(ml.asn_org) AS asn_org,
            round(avg(ml.fuzzing_index), 2) AS fuzzing,
            round(avg(ml.hit_velocity), 2) AS velocity
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h t
        LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies ml
            ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
            AND ml.detected_at >= now() - INTERVAL 24 HOUR
        WHERE t.window_start >= now() - INTERVAL 24 HOUR
            AND replaceRegexpAll(toString(t.src_ip), '^::ffff:', '') IN ({ip_filter})
        GROUP BY t.src_ip, t.ja4
        ORDER BY hits DESC
    """
    try:
        result = db.query(sql)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # NULL / missing values are normalised to "" or 0 for the JSON payload.
    ips = [
        {
            "ip": str(row[0] or ""),
            "ja4": str(row[1] or ""),
            "tcp_ttl": int(row[2] or 0),
            "tcp_win": int(row[3] or 0),
            "tcp_scale": int(row[4] or 0),
            "tcp_mss": int(row[5] or 0),
            "hits": int(row[6] or 0),
            "ua": str(row[7] or ""),
            "avg_score": float(row[8] or 0),
            "threat_level": str(row[9] or ""),
            "country_code": str(row[10] or ""),
            "asn_org": str(row[11] or ""),
            "fuzzing": float(row[12] or 0),
            "velocity": float(row[13] or 0),
        }
        for row in result.result_rows
    ]
    return {"ips": ips, "total": total, "cluster_id": cluster_id}

View File

@ -1,451 +0,0 @@
"""
Endpoints pour la liste des détections
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
from ..database import db
from ..models import DetectionsListResponse, Detection
from ..config import settings
router = APIRouter(prefix="/api/detections", tags=["detections"])
# Mapping label ASN → score float (0 = très suspect, 1 = légitime)
_ASN_LABEL_SCORES: dict[str, float] = {
'human': 0.9, 'bot': 0.05, 'proxy': 0.25, 'vpn': 0.3,
'tor': 0.1, 'datacenter': 0.4, 'scanner': 0.05, 'malicious': 0.05,
}
def _label_to_score(label: str) -> float | None:
"""Convertit un label de réputation ASN en score numérique."""
if not label:
return None
return _ASN_LABEL_SCORES.get(label.lower(), 0.5)
@router.get("", response_model=DetectionsListResponse, summary="Liste paginée des détections")
async def get_detections(
    page: int = Query(1, ge=1, description="Numéro de page"),
    page_size: int = Query(25, ge=1, le=100, description="Nombre de lignes par page"),
    threat_level: Optional[str] = Query(None, description="Filtrer par niveau de menace"),
    model_name: Optional[str] = Query(None, description="Filtrer par modèle"),
    country_code: Optional[str] = Query(None, description="Filtrer par pays"),
    asn_number: Optional[str] = Query(None, description="Filtrer par ASN"),
    search: Optional[str] = Query(None, description="Recherche texte (IP, JA4, Host)"),
    sort_by: str = Query("detected_at", description="Trier par"),
    sort_order: str = Query("DESC", description="Ordre (ASC/DESC)"),
    group_by_ip: bool = Query(False, description="Grouper par IP (first_seen/last_seen agrégés)"),
    score_type: Optional[str] = Query(None, description="Filtrer par type de score: BOT, REGLE, BOT_REGLE, SCORE")
):
    """
    Return a paginated, filterable list of detections from the last 24 hours.

    Two modes:
    - default: one item per row of ``ml_detected_anomalies``;
    - ``group_by_ip``: one item per source IP, aggregated in SQL (first/last
      seen, detection count, up to 5 distinct JA4s and hosts, and the fields
      of the row with the lowest ``anomaly_score``).

    ``sort_by`` is checked against a per-mode whitelist and ``sort_order`` is
    normalised to ASC/DESC before being interpolated into the SQL; all other
    user-supplied values are passed as bound parameters. An unrecognised
    ``score_type`` is ignored.

    Raises:
        HTTPException 500: any failure while querying or building the response.
    """
    try:
        # ── Filters ─────────────────────────────────────────────────────────
        where_clauses = ["detected_at >= now() - INTERVAL 24 HOUR"]
        params = {}
        if threat_level:
            where_clauses.append("threat_level = %(threat_level)s")
            params["threat_level"] = threat_level
        if model_name:
            where_clauses.append("model_name = %(model_name)s")
            params["model_name"] = model_name
        if country_code:
            where_clauses.append("country_code = %(country_code)s")
            params["country_code"] = country_code.upper()
        if asn_number:
            where_clauses.append("asn_number = %(asn_number)s")
            params["asn_number"] = asn_number
        if search:
            where_clauses.append(
                "(ilike(toString(src_ip), %(search)s) OR ilike(ja4, %(search)s) OR ilike(host, %(search)s))"
            )
            params["search"] = f"%{search}%"
        if score_type:
            # Fixed SQL fragments keyed by score type (no user text is inlined).
            score_type_filters = {
                "BOT": "threat_level = 'KNOWN_BOT'",
                "REGLE": "threat_level = 'ANUBIS_DENY'",
                "BOT_REGLE": "threat_level IN ('KNOWN_BOT', 'ANUBIS_DENY')",
                "SCORE": "threat_level NOT IN ('KNOWN_BOT', 'ANUBIS_DENY')",
            }
            score_filter = score_type_filters.get(score_type.upper())
            if score_filter:
                where_clauses.append(score_filter)
        where_clause = " AND ".join(where_clauses)

        offset = (page - 1) * page_size
        sort_order = "DESC" if sort_order.upper() == "DESC" else "ASC"

        # ── Grouped-by-IP mode (first_seen / last_seen computed by the DB) ──
        if group_by_ip:
            valid_sort_grouped = ["anomaly_score", "hits", "hit_velocity", "first_seen", "last_seen", "src_ip", "detected_at"]
            grouped_sort = sort_by if sort_by in valid_sort_grouped else "last_seen"
            # detected_at → last_seen (max(detected_at) in the GROUP BY)
            if grouped_sort == "detected_at":
                grouped_sort = "last_seen"
            # In the outer query, min_score is exposed as anomaly_score — sort on the source column
            outer_sort = "min_score" if grouped_sort == "anomaly_score" else grouped_sort

            # Only the distinct-IP count is needed here; the row-level count
            # used by the individual mode is not executed in grouped mode.
            count_ip_query = f"""
                SELECT uniq(src_ip)
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE {where_clause}
            """
            cr = db.query(count_ip_query, params)
            total = cr.result_rows[0][0] if cr.result_rows else 0

            grouped_query = f"""
                SELECT
                    ip_data.src_ip,
                    ip_data.first_seen,
                    ip_data.last_seen,
                    ip_data.detection_count,
                    ip_data.unique_ja4s,
                    ip_data.unique_hosts,
                    ip_data.min_score AS anomaly_score,
                    ip_data.threat_level_best,
                    ip_data.model_name_best,
                    ip_data.country_code,
                    ip_data.asn_number,
                    ip_data.asn_org,
                    ip_data.hit_velocity,
                    ip_data.hits,
                    ip_data.asn_label,
                    ar.label AS asn_rep_label,
                    ip_data.anubis_bot_name_best,
                    ip_data.anubis_bot_action_best,
                    ip_data.anubis_bot_category_best
                FROM (
                    SELECT
                        src_ip,
                        min(detected_at) AS first_seen,
                        max(detected_at) AS last_seen,
                        count() AS detection_count,
                        groupUniqArray(5)(ja4) AS unique_ja4s,
                        groupUniqArray(5)(host) AS unique_hosts,
                        min(anomaly_score) AS min_score,
                        argMin(threat_level, anomaly_score) AS threat_level_best,
                        argMin(model_name, anomaly_score) AS model_name_best,
                        any(country_code) AS country_code,
                        any(asn_number) AS asn_number,
                        any(asn_org) AS asn_org,
                        max(hit_velocity) AS hit_velocity,
                        sum(hits) AS hits,
                        any(asn_label) AS asn_label,
                        argMin(anubis_bot_name, anomaly_score) AS anubis_bot_name_best,
                        argMin(anubis_bot_action, anomaly_score) AS anubis_bot_action_best,
                        argMin(anubis_bot_category, anomaly_score) AS anubis_bot_category_best
                    FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                    WHERE {where_clause}
                    GROUP BY src_ip
                ) ip_data
                LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.asn_reputation ar
                    ON ar.src_asn = toUInt32OrZero(ip_data.asn_number)
                ORDER BY {outer_sort} {sort_order}
                LIMIT %(limit)s OFFSET %(offset)s
            """
            params["limit"] = page_size
            params["offset"] = offset
            gresult = db.query(grouped_query, params)

            detections = []
            for row in gresult.result_rows:
                # row: src_ip, first_seen, last_seen, detection_count, unique_ja4s, unique_hosts,
                #      anomaly_score, threat_level_best, model_name_best, country_code, asn_number,
                #      asn_org, hit_velocity, hits, asn_label, asn_rep_label,
                #      anubis_bot_name, anubis_bot_action, anubis_bot_category
                ja4s = list(row[4]) if row[4] else []
                hosts = list(row[5]) if row[5] else []
                detections.append(Detection(
                    detected_at=row[1],
                    src_ip=str(row[0]),
                    ja4=ja4s[0] if ja4s else "",
                    host=hosts[0] if hosts else "",
                    bot_name="",
                    anomaly_score=float(row[6]) if row[6] else 0.0,
                    threat_level=row[7] or "LOW",
                    model_name=row[8] or "",
                    recurrence=int(row[3] or 0),
                    asn_number=str(row[10]) if row[10] else "",
                    asn_org=row[11] or "",
                    asn_detail="",
                    asn_domain="",
                    country_code=row[9] or "",
                    asn_label=row[14] or "",
                    hits=int(row[13] or 0),
                    hit_velocity=float(row[12]) if row[12] else 0.0,
                    fuzzing_index=0.0,
                    post_ratio=0.0,
                    reason="",
                    asn_rep_label=row[15] or "",
                    asn_score=_label_to_score(row[15] or ""),
                    first_seen=row[1],
                    last_seen=row[2],
                    unique_ja4s=ja4s,
                    unique_hosts=hosts,
                    anubis_bot_name=row[16] or "",
                    anubis_bot_action=row[17] or "",
                    anubis_bot_category=row[18] or "",
                ))

            total_pages = (total + page_size - 1) // page_size
            return DetectionsListResponse(
                items=detections, total=total, page=page,
                page_size=page_size, total_pages=total_pages
            )

        # ── Individual mode (original behaviour) ────────────────────────────
        count_query = f"""
            SELECT count()
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {where_clause}
        """
        count_result = db.query(count_query, params)
        total = count_result.result_rows[0][0] if count_result.result_rows else 0

        # Sort column whitelist: anything else falls back to detected_at.
        valid_sort_columns = [
            "detected_at", "src_ip", "threat_level", "anomaly_score",
            "asn_number", "country_code", "hits", "hit_velocity"
        ]
        if sort_by not in valid_sort_columns:
            sort_by = "detected_at"

        main_query = f"""
            SELECT
                detected_at,
                src_ip,
                ja4,
                host,
                bot_name,
                anomaly_score,
                threat_level,
                model_name,
                recurrence,
                asn_number,
                asn_org,
                asn_detail,
                asn_domain,
                country_code,
                asn_label,
                hits,
                hit_velocity,
                fuzzing_index,
                post_ratio,
                reason,
                ar.label AS asn_rep_label,
                anubis_bot_name,
                anubis_bot_action,
                anubis_bot_category
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.asn_reputation ar ON ar.src_asn = toUInt32OrZero(asn_number)
            WHERE {where_clause}
            ORDER BY {sort_by} {sort_order}
            LIMIT %(limit)s OFFSET %(offset)s
        """
        params["limit"] = page_size
        params["offset"] = offset
        result = db.query(main_query, params)

        detections = [
            Detection(
                detected_at=row[0],
                src_ip=str(row[1]),
                ja4=row[2] or "",
                host=row[3] or "",
                bot_name=row[4] or "",
                anomaly_score=float(row[5]) if row[5] else 0.0,
                threat_level=row[6] or "LOW",
                model_name=row[7] or "",
                recurrence=row[8] or 0,
                asn_number=str(row[9]) if row[9] else "",
                asn_org=row[10] or "",
                asn_detail=row[11] or "",
                asn_domain=row[12] or "",
                country_code=row[13] or "",
                asn_label=row[14] or "",
                hits=row[15] or 0,
                hit_velocity=float(row[16]) if row[16] else 0.0,
                fuzzing_index=float(row[17]) if row[17] else 0.0,
                post_ratio=float(row[18]) if row[18] else 0.0,
                reason=row[19] or "",
                asn_rep_label=row[20] or "",
                asn_score=_label_to_score(row[20] or ""),
                anubis_bot_name=row[21] or "",
                anubis_bot_action=row[22] or "",
                anubis_bot_category=row[23] or "",
            )
            for row in result.result_rows
        ]

        total_pages = (total + page_size - 1) // page_size
        return DetectionsListResponse(
            items=detections,
            total=total,
            page=page,
            page_size=page_size,
            total_pages=total_pages
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la récupération des détections: {str(e)}") from e
@router.get("/{detection_id}")
async def get_detection_details(detection_id: str):
    """
    Return the details of the most recent detection for a given identifier.

    ``detection_id`` is matched against the ``src_ip`` column; only the
    latest row (by ``detected_at``) is returned, reshaped into nested
    sections (asn, country, metrics, tcp, tls, headers, behavior, advanced).

    Raises:
        HTTPException 404: no row matches ``detection_id``.
        HTTPException 500: any other failure while querying or formatting.
    """
    def as_float(value):
        # Falsy values (NULL, 0) become 0.0.
        return float(value) if value else 0.0

    def as_flag(value):
        # NULL becomes False; anything else is coerced with bool().
        return False if value is None else bool(value)

    try:
        sql = f"""
            SELECT
                detected_at,
                src_ip,
                ja4,
                host,
                bot_name,
                anomaly_score,
                threat_level,
                model_name,
                recurrence,
                asn_number,
                asn_org,
                asn_detail,
                asn_domain,
                country_code,
                asn_label,
                hits,
                hit_velocity,
                fuzzing_index,
                post_ratio,
                port_exhaustion_ratio,
                orphan_ratio,
                tcp_jitter_variance,
                tcp_shared_count,
                true_window_size,
                window_mss_ratio,
                alpn_http_mismatch,
                is_alpn_missing,
                sni_host_mismatch,
                header_count,
                has_accept_language,
                has_cookie,
                has_referer,
                modern_browser_score,
                ua_ch_mismatch,
                header_order_shared_count,
                ip_id_zero_ratio,
                request_size_variance,
                multiplexing_efficiency,
                mss_mobile_mismatch,
                correlated,
                reason,
                asset_ratio,
                direct_access_ratio,
                is_ua_rotating,
                distinct_ja4_count,
                src_port_density,
                ja4_asn_concentration,
                ja4_country_concentration,
                is_rare_ja4
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE src_ip = %(ip)s
            ORDER BY detected_at DESC
            LIMIT 1
        """
        rows = db.query(sql, {"ip": detection_id}).result_rows
        if not rows:
            raise HTTPException(status_code=404, detail="Détection non trouvée")

        # Positional indices below follow the SELECT list order above.
        col = rows[0]

        asn_section = {
            "number": str(col[9]) if col[9] else "",
            "org": col[10] or "",
            "detail": col[11] or "",
            "domain": col[12] or "",
            "label": col[14] or ""
        }
        metrics_section = {
            "hits": col[15] or 0,
            "hit_velocity": as_float(col[16]),
            "fuzzing_index": as_float(col[17]),
            "post_ratio": as_float(col[18]),
            "port_exhaustion_ratio": as_float(col[19]),
            "orphan_ratio": as_float(col[20]),
        }
        tcp_section = {
            "jitter_variance": as_float(col[21]),
            "shared_count": col[22] or 0,
            "true_window_size": col[23] or 0,
            "window_mss_ratio": as_float(col[24]),
        }
        tls_section = {
            "alpn_http_mismatch": as_flag(col[25]),
            "is_alpn_missing": as_flag(col[26]),
            "sni_host_mismatch": as_flag(col[27]),
        }
        headers_section = {
            "count": col[28] or 0,
            "has_accept_language": as_flag(col[29]),
            "has_cookie": as_flag(col[30]),
            "has_referer": as_flag(col[31]),
            "modern_browser_score": col[32] or 0,
            "ua_ch_mismatch": as_flag(col[33]),
            "header_order_shared_count": col[34] or 0,
        }
        behavior_section = {
            "ip_id_zero_ratio": as_float(col[35]),
            "request_size_variance": as_float(col[36]),
            "multiplexing_efficiency": as_float(col[37]),
            "mss_mobile_mismatch": as_flag(col[38]),
            "correlated": as_flag(col[39]),
        }
        # Index 40 is `reason`; the "advanced" columns start at 41.
        advanced_section = {
            "asset_ratio": as_float(col[41]),
            "direct_access_ratio": as_float(col[42]),
            "is_ua_rotating": as_flag(col[43]),
            "distinct_ja4_count": col[44] or 0,
            "src_port_density": as_float(col[45]),
            "ja4_asn_concentration": as_float(col[46]),
            "ja4_country_concentration": as_float(col[47]),
            "is_rare_ja4": as_flag(col[48]),
        }

        return {
            "detected_at": col[0],
            "src_ip": str(col[1]),
            "ja4": col[2] or "",
            "host": col[3] or "",
            "bot_name": col[4] or "",
            "anomaly_score": as_float(col[5]),
            "threat_level": col[6] or "LOW",
            "model_name": col[7] or "",
            "recurrence": col[8] or 0,
            "asn": asn_section,
            "country": {
                "code": col[13] or "",
            },
            "metrics": metrics_section,
            "tcp": tcp_section,
            "tls": tls_section,
            "headers": headers_section,
            "behavior": behavior_section,
            "advanced": advanced_section,
            "reason": col[40] or ""
        }
    except HTTPException:
        # Let the 404 above through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -1,510 +0,0 @@
"""
Routes pour l'investigation d'entités (IP, JA4, User-Agent, Client-Header, Host, Path, Query-Param)
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
from ..database import db
from ..models import (
EntityInvestigation,
EntityStats,
EntityRelatedAttributes,
EntityAttributeValue
)
from ..config import settings
router = APIRouter(prefix="/api/entities", tags=["Entities"])
# Set of valid entity types; the entity routes in this module check the
# `entity_type` path parameter against it.
VALID_ENTITY_TYPES = frozenset({
    'ip', 'ja4', 'user_agent', 'client_header', 'host', 'path', 'query_param'
})
def get_entity_stats(entity_type: str, entity_value: str, hours: int = 24) -> Optional[EntityStats]:
    """
    Fetch aggregated statistics for one entity.

    Sums ``requests`` and ``unique_ips`` and takes the min/max ``log_date``
    from ``view_dashboard_entities`` for the given type/value over the last
    ``hours`` hours.

    Returns:
        An ``EntityStats``, or None when the query returns no row.
    """
    bind_params = {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    }
    sql = f"""
        SELECT
            entity_type,
            entity_value,
            sum(requests) as total_requests,
            sum(unique_ips) as unique_ips,
            min(log_date) as first_seen,
            max(log_date) as last_seen
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
        WHERE entity_type = %(entity_type)s
            AND entity_value = %(entity_value)s
            AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
        GROUP BY entity_type, entity_value
    """
    rows = db.query(sql, bind_params).result_rows
    if not rows:
        return None

    # Column order follows the SELECT list.
    first = rows[0]
    return EntityStats(
        entity_type=first[0],
        entity_value=first[1],
        total_requests=first[2],
        unique_ips=first[3],
        first_seen=first[4],
        last_seen=first[5]
    )
def get_related_attributes(entity_type: str, entity_value: str, hours: int = 24) -> EntityRelatedAttributes:
    """
    Collect the attributes seen together with an entity.

    Runs one query made of five scalar sub-selects over
    ``view_dashboard_entities`` (distinct IPs, JA4s, hosts, ASNs, countries)
    restricted to the entity and the last ``hours`` hours. Falsy values are
    dropped from every list. Returns empty lists when nothing comes back.
    """
    # FROM/WHERE fragment shared by the five sub-selects.
    scope = (
        f"FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities "
        "WHERE entity_type = %(entity_type)s AND entity_value = %(entity_value)s "
        "AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)"
    )
    sql = f"""
        SELECT
            (SELECT groupUniqArray(toString(src_ip)) {scope}) as ips,
            (SELECT groupUniqArray(ja4) {scope} AND ja4 != '') as ja4s,
            (SELECT groupUniqArray(host) {scope} AND host != '') as hosts,
            (SELECT groupUniqArrayArray(asns) {scope} AND notEmpty(asns)) as asns,
            (SELECT groupUniqArrayArray(countries) {scope} AND notEmpty(countries)) as countries
    """
    rows = db.query(sql, {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    }).result_rows

    # No row, or a row whose five arrays are all empty → nothing related.
    if not rows or not any(rows[0]):
        return EntityRelatedAttributes(ips=[], ja4s=[], hosts=[], asns=[], countries=[])

    def keep_truthy(values):
        return [item for item in (values or []) if item]

    ip_col, ja4_col, host_col, asn_col, country_col = rows[0][:5]
    return EntityRelatedAttributes(
        ips=[str(ip) for ip in (ip_col or []) if ip],
        ja4s=keep_truthy(ja4_col),
        hosts=keep_truthy(host_col),
        asns=keep_truthy(asn_col),
        countries=keep_truthy(country_col)
    )
def get_array_values(entity_type: str, entity_value: str, array_field: str, hours: int = 24) -> List[EntityAttributeValue]:
    """
    Explode an Array column of an entity and return per-value counts.

    ``array_field`` names the Array column of ``view_dashboard_entities`` to
    expand (the callers in this module pass ``user_agents``,
    ``client_headers``, ``paths`` and ``query_params``). Each distinct value
    is returned with its occurrence count and its percentage of the total,
    ordered by descending count.

    Raises:
        ValueError: ``array_field`` is not a plain identifier. The column
            name cannot be sent as a bound parameter, so it is interpolated
            into the SQL text and must never carry arbitrary input.
    """
    # Identifier check: rejects quotes, spaces, parentheses, comment markers…
    if not (array_field.isascii() and array_field.isidentifier()):
        raise ValueError(f"Invalid array field name: {array_field!r}")

    query = f"""
        SELECT
            value,
            count() as count,
            round(count * 100.0 / sum(count) OVER (), 2) as percentage
        FROM (
            SELECT
                arrayJoin({array_field}) as value
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
            WHERE entity_type = %(entity_type)s
                AND entity_value = %(entity_value)s
                AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
                AND notEmpty({array_field})
        )
        GROUP BY value
        ORDER BY count DESC
    """
    result = db.query(query, {
        'entity_type': entity_type,
        'entity_value': entity_value,
        'hours': hours
    })
    return [
        EntityAttributeValue(
            value=row[0],
            count=row[1],
            percentage=row[2]
        )
        for row in result.result_rows
    ]
@router.get("/subnet/{subnet:path}")
async def get_subnet_investigation(
    subnet: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return every IP of a /24 subnet together with its statistics.

    Detections come from ``ml_detected_anomalies``; user-agent counts come
    from ``view_dashboard_entities``. Only the first three octets of
    ``subnet`` are used for matching, whatever mask suffix is supplied.

    Returns:
        ``{"stats": {...}, "ips": [...]}`` — subnet-level aggregates and one
        entry per IP, ordered by descending detection count.

    Raises:
        HTTPException 400: ``subnet`` does not start with three valid octets.
        HTTPException 404: no detection for this subnet in the time window.
        HTTPException 500: any other failure.
    """
    # Keep the address part (e.g. 192.168.1.0/24 -> 192.168.1.0) and validate
    # the first three octets up front: a malformed value used to raise
    # IndexError further down and surface as a 500.
    base_ip = subnet.split('/', 1)[0]
    octets = base_ip.split('.')[:3]
    if len(octets) < 3 or not all(
        o.isascii() and o.isdigit() and int(o) <= 255 for o in octets
    ):
        raise HTTPException(status_code=400, detail="Subnet invalide (format attendu: a.b.c.0/24)")
    first_octet, second_octet, third_octet = octets

    # Bound parameters shared by the subnet-filtered queries. The key names
    # are the ones used in the SQL text below.
    subnet_params = {
        "subnet_prefix": first_octet,
        "subnet_mask": second_octet,
        "subnet_third": third_octet,
        "hours": hours
    }

    try:
        # Subnet-wide stats: ml_detected_anomalies + view_dashboard_entities for UAs
        stats_query = f"""
            WITH cleaned_ips AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    ja4,
                    host,
                    country_code,
                    asn_number
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
            ),
            subnet_filter AS (
                SELECT *
                FROM cleaned_ips
                WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
                    AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
                    AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
            ),
            -- Récupérer les user-agents depuis view_dashboard_entities
            ua_data AS (
                SELECT
                    entity_value AS ip,
                    arrayJoin(user_agents) AS user_agent
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
                WHERE entity_type = 'ip'
                    AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
                    AND splitByChar('.', entity_value)[1] = %(subnet_prefix)s
                    AND splitByChar('.', entity_value)[2] = %(subnet_mask)s
                    AND splitByChar('.', entity_value)[3] = %(subnet_third)s
            )
            SELECT
                %(subnet)s AS subnet,
                uniq(clean_ip) AS total_ips,
                count() AS total_detections,
                uniq(ja4) AS unique_ja4,
                (SELECT uniq(user_agent) FROM ua_data) AS unique_ua,
                uniq(host) AS unique_hosts,
                argMax(country_code, detected_at) AS primary_country,
                argMax(asn_number, detected_at) AS primary_asn,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM subnet_filter
        """
        stats_result = db.query(stats_query, {"subnet": subnet, **subnet_params})

        # total_ips == 0 means no detection matched the subnet.
        if not stats_result.result_rows or stats_result.result_rows[0][1] == 0:
            raise HTTPException(status_code=404, detail="Subnet non trouvé")

        stats_row = stats_result.result_rows[0]
        stats = {
            "subnet": subnet,
            "total_ips": stats_row[1] or 0,
            "total_detections": stats_row[2] or 0,
            "unique_ja4": stats_row[3] or 0,
            "unique_ua": stats_row[4] or 0,
            "unique_hosts": stats_row[5] or 0,
            "primary_country": stats_row[6] or "XX",
            "primary_asn": str(stats_row[7]) if stats_row[7] else "?",
            "first_seen": stats_row[8].isoformat() if stats_row[8] else "",
            "last_seen": stats_row[9].isoformat() if stats_row[9] else ""
        }

        # Per-IP details: two separate queries merged in Python.
        ips_query = f"""
            WITH cleaned_ips AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    ja4,
                    country_code,
                    asn_number,
                    threat_level,
                    anomaly_score
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
            ),
            subnet_filter AS (
                SELECT *
                FROM cleaned_ips
                WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
                    AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
                    AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
            )
            SELECT
                clean_ip AS ip,
                count() AS total_detections,
                uniq(ja4) AS unique_ja4,
                argMax(country_code, detected_at) AS primary_country,
                argMax(asn_number, detected_at) AS primary_asn,
                argMax(threat_level, detected_at) AS threat_level,
                avg(anomaly_score) AS avg_score,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM subnet_filter
            GROUP BY ip
            ORDER BY total_detections DESC
        """
        ips_result = db.query(ips_query, subnet_params)

        # IPs found above drive the user-agent lookup.
        ip_list = [str(row[0]) for row in ips_result.result_rows]

        unique_ua_dict = {}
        if ip_list:
            # The values are IP strings produced by the previous query and
            # are inlined as literals in the IN clause.
            ip_values = ', '.join(f"'{ip}'" for ip in ip_list)
            # Same time window as the rest of the endpoint (this used to be
            # a fixed 30 days, so per-IP counts ignored ``hours``).
            ua_query = f"""
                SELECT
                    entity_value AS ip,
                    uniq(arrayJoin(user_agents)) AS unique_ua
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
                PREWHERE entity_type = 'ip'
                WHERE entity_value IN ({ip_values})
                    AND log_date >= toDate(now() - INTERVAL %(hours)s HOUR)
                GROUP BY entity_value
            """
            ua_result = db.query(ua_query, {"hours": hours})
            unique_ua_dict = {row[0]: row[1] for row in ua_result.result_rows}

        # Merge detections with the per-IP user-agent counts (0 when absent).
        ips = [
            {
                "ip": str(row[0]),
                "total_detections": row[1],
                "unique_ja4": row[2],
                "unique_ua": unique_ua_dict.get(row[0], 0),
                "primary_country": row[3] or "XX",
                "primary_asn": str(row[4]) if row[4] else "?",
                "threat_level": row[5] or "LOW",
                "avg_score": abs(row[6] or 0),
                "first_seen": row[7].isoformat() if row[7] else "",
                "last_seen": row[8].isoformat() if row[8] else ""
            }
            for row in ips_result.result_rows
        ]

        return {
            "stats": stats,
            "ips": ips
        }
    except HTTPException:
        # Let the 404 above through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
@router.get("/types")
async def get_entity_types():
    """
    List the supported entity types with a short description for each.

    NOTE: this route must stay declared before the /{entity_type}/... routes
    so that they cannot mask it.
    """
    descriptions = {
        "ip": "Adresse IP source",
        "ja4": "Fingerprint JA4 TLS",
        "user_agent": "User-Agent HTTP",
        "client_header": "Client Header",
        "host": "Host HTTP",
        "path": "Path URL",
        "query_param": "Query Param"
    }
    # Sorted so the response order is stable (the source is a frozenset).
    supported = sorted(VALID_ENTITY_TYPES)
    return {"entity_types": supported, "descriptions": descriptions}
def _ensure_valid_entity_type(entity_type: str, *, list_supported: bool = False) -> None:
    """Raise HTTP 400 unless ``entity_type`` is one of VALID_ENTITY_TYPES."""
    if entity_type in VALID_ENTITY_TYPES:
        return
    detail = "Type d'entité invalide"
    if list_supported:
        # Sorted: iterating a frozenset directly gives an unstable order.
        detail = f"Type d'entité invalide. Types supportés: {', '.join(sorted(VALID_ENTITY_TYPES))}"
    raise HTTPException(status_code=400, detail=detail)


# NOTE: route order matters. ``{entity_value:path}`` matches any remainder of
# the URL, slashes included, so the suffixed routes below MUST be registered
# before the catch-all investigation route at the end of this section;
# otherwise ".../related", ".../user_agents", etc. are swallowed by it.
# Side effect: an entity value that itself ends with one of these suffixes is
# routed to the matching sub-resource.

@router.get("/{entity_type}/{entity_value:path}/related")
async def get_entity_related(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return only the attributes associated with an entity.

    Raises:
        HTTPException 400: unsupported ``entity_type``.
    """
    _ensure_valid_entity_type(entity_type, list_supported=True)
    related = get_related_attributes(entity_type, entity_value, hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "hours": hours,
        "related": related
    }


@router.get("/{entity_type}/{entity_value:path}/user_agents")
async def get_entity_user_agents(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the User-Agents associated with an entity.

    Raises:
        HTTPException 400: unsupported ``entity_type``.
    """
    _ensure_valid_entity_type(entity_type)
    user_agents = get_array_values(entity_type, entity_value, 'user_agents', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "user_agents": user_agents,
        "total": len(user_agents)
    }


@router.get("/{entity_type}/{entity_value:path}/client_headers")
async def get_entity_client_headers(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Client-Headers associated with an entity.

    Raises:
        HTTPException 400: unsupported ``entity_type``.
    """
    _ensure_valid_entity_type(entity_type)
    client_headers = get_array_values(entity_type, entity_value, 'client_headers', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "client_headers": client_headers,
        "total": len(client_headers)
    }


@router.get("/{entity_type}/{entity_value:path}/paths")
async def get_entity_paths(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Paths associated with an entity.

    Raises:
        HTTPException 400: unsupported ``entity_type``.
    """
    _ensure_valid_entity_type(entity_type)
    paths = get_array_values(entity_type, entity_value, 'paths', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "paths": paths,
        "total": len(paths)
    }


@router.get("/{entity_type}/{entity_value:path}/query_params")
async def get_entity_query_params(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720)
):
    """
    Return the Query-Params associated with an entity.

    Raises:
        HTTPException 400: unsupported ``entity_type``.
    """
    _ensure_valid_entity_type(entity_type)
    query_params = get_array_values(entity_type, entity_value, 'query_params', hours)
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "query_params": query_params,
        "total": len(query_params)
    }


# Catch-all: keep this route LAST among the /{entity_type}/... routes.
@router.get("/{entity_type}/{entity_value:path}", response_model=EntityInvestigation)
async def get_entity_investigation(
    entity_type: str,
    entity_value: str,
    hours: int = Query(default=24, ge=1, le=720, description="Fenêtre temporelle en heures")
):
    """
    Full investigation for a given entity.

    - **entity_type**: entity type (ip, ja4, user_agent, client_header, host, path, query_param)
    - **entity_value**: entity value
    - **hours**: time window (default: 24h)

    Returns:
    - General stats
    - Related attributes (IPs, JA4, Hosts, ASNs, Countries)
    - User-Agents
    - Client-Headers
    - Paths
    - Query-Params

    Raises:
        HTTPException 400: unsupported ``entity_type``.
        HTTPException 404: no statistics found for the entity.
    """
    _ensure_valid_entity_type(entity_type, list_supported=True)

    # General stats; their absence means the entity is unknown in the window.
    stats = get_entity_stats(entity_type, entity_value, hours)
    if not stats:
        raise HTTPException(status_code=404, detail="Entité non trouvée")

    related = get_related_attributes(entity_type, entity_value, hours)
    user_agents = get_array_values(entity_type, entity_value, 'user_agents', hours)
    client_headers = get_array_values(entity_type, entity_value, 'client_headers', hours)
    paths = get_array_values(entity_type, entity_value, 'paths', hours)
    query_params = get_array_values(entity_type, entity_value, 'query_params', hours)

    return EntityInvestigation(
        stats=stats,
        related=related,
        user_agents=user_agents,
        client_headers=client_headers,
        paths=paths,
        query_params=query_params
    )

View File

@ -1,829 +0,0 @@
"""
Endpoints pour l'analyse des fingerprints JA4 et User-Agents
Objectifs:
- Détecter le spoofing JA4 (fingerprint TLS qui prétend être un navigateur mais
dont les User-Agents, les headers HTTP ou les métriques comportementales trahissent
une origine bot/script)
- Construire une matrice JA4 × User-Agent pour visualiser les associations suspectes
- Analyser la distribution des User-Agents pour identifier les rotateurs et les bots
qui usurpent des UA de navigateurs légitimes
"""
from fastapi import APIRouter, HTTPException, Query
import re
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/fingerprints", tags=["fingerprints"])
# ─── Helpers ──────────────────────────────────────────────────────────────────
# Patterns indiquant clairement un bot/script sans simulation de navigateur
_BOT_PATTERNS = re.compile(
r"bot|crawler|spider|scraper|python|curl|wget|go-http|java/|axios|"
r"libwww|httpclient|okhttp|requests|aiohttp|httpx|playwright|puppeteer|"
r"selenium|headless|phantomjs",
re.IGNORECASE,
)
# Navigateurs légitimes communs — un JA4 de type "browser" devrait venir avec ces UAs
_BROWSER_PATTERNS = re.compile(
r"mozilla|chrome|safari|firefox|edge|opera|trident",
re.IGNORECASE,
)
def _classify_ua(ua: str) -> str:
"""Retourne 'bot', 'browser', ou 'script'"""
if not ua:
return "empty"
if _BOT_PATTERNS.search(ua):
return "bot"
if _BROWSER_PATTERNS.search(ua):
return "browser"
return "script"
# =============================================================================
# ENDPOINT 1 — Détection de spoofing JA4
# =============================================================================
@router.get("/spoofing")
async def get_ja4_spoofing(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle"),
    min_detections: int = Query(10, ge=1, description="Nombre minimum de détections"),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Identify JA4 fingerprints suspected of spoofing a browser.

    A JA4 is treated as suspicious when:
    - it shows a high rate of ua_ch_mismatch (User-Agent header != Client Hints)
    - its modern_browser_score is high while its associated UAs are bots/scripts
    - it shows a high rate of sni_host_mismatch or alpn_http_mismatch
    - is_rare_ja4 is true on a significant share of its detections

    Args:
        hours: Look-back window, in hours, applied to ``detected_at``.
        min_detections: Minimum number of detections for a JA4 to be listed.
        limit: Maximum number of JA4 rows fetched.

    Returns:
        A dict with ``items`` (one entry per JA4, carrying a spoofing score in
        [0, 100], sorted by descending score then detection volume),
        ``total``, ``period_hours`` and a per-classification ``summary``.

    Raises:
        HTTPException: 500 when the main aggregation query or the response
            assembly fails. A failure of the secondary User-Agent lookup is
            logged and tolerated (items then carry no UA information).
    """
    import logging  # function-scope import so the block stays self-contained

    try:
        # Per-JA4 aggregation of every spoofing indicator.
        # Rows are read positionally below, so the column order is significant.
        query = f"""
        SELECT
            ja4,
            count() AS total_detections,
            uniq(src_ip) AS unique_ips,
            -- Indicateurs de mismatch
            countIf(ua_ch_mismatch = true) AS ua_ch_mismatch_count,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            countIf(sni_host_mismatch = true) AS sni_mismatch_count,
            round(countIf(sni_host_mismatch = true) * 100.0 / count(), 2) AS sni_mismatch_pct,
            countIf(alpn_http_mismatch = true) AS alpn_mismatch_count,
            round(countIf(alpn_http_mismatch = true) * 100.0 / count(), 2) AS alpn_mismatch_pct,
            -- Indicateurs comportementaux
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_ja4_count,
            round(countIf(is_rare_ja4 = true) * 100.0 / count(), 2) AS rare_ja4_pct,
            countIf(is_ua_rotating = true) AS ua_rotating_count,
            round(countIf(is_ua_rotating = true) * 100.0 / count(), 2) AS ua_rotating_pct,
            -- Métriques TLS/TCP
            countIf(is_alpn_missing = true) AS alpn_missing_count,
            avg(distinct_ja4_count) AS avg_distinct_ja4_per_ip,
            -- Répartition threat levels
            countIf(threat_level = 'CRITICAL') AS critical_count,
            countIf(threat_level = 'HIGH') AS high_count,
            -- Botnet indicators
            avg(ja4_asn_concentration) AS avg_asn_concentration,
            avg(ja4_country_concentration) AS avg_country_concentration,
            argMax(threat_level, detected_at) AS last_threat_level
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING total_detections >= %(min_detections)s
        ORDER BY ua_ch_mismatch_pct DESC, total_detections DESC
        LIMIT %(limit)s
        """
        result = db.query(query, {
            "hours": hours,
            "min_detections": min_detections,
            "limit": limit,
        })

        # Top User-Agents per JA4, taken from view_dashboard_user_agents.
        ja4_list = [str(r[0]) for r in result.result_rows if r[0]]
        ua_by_ja4: dict = {}
        if ja4_list:
            # JA4 values and the window are bound as parameters instead of
            # being spliced into the SQL text, so a value containing a quote
            # can neither break nor alter the statement.
            ua_q = f"""
            SELECT ja4, groupArray(5)(ua) AS top_uas
            FROM (
                SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
                WHERE ja4 IN %(ja4_list)s
                  AND hour >= now() - INTERVAL %(hours)s HOUR
                  AND ua != ''
                GROUP BY ja4, ua
                ORDER BY ja4, cnt DESC
            )
            GROUP BY ja4
            """
            try:
                # Only the first 100 JA4s are looked up; any beyond that are
                # reported without User-Agent details.
                ua_res = db.query(ua_q, {"ja4_list": ja4_list[:100], "hours": hours})
                for ua_row in ua_res.result_rows:
                    if ua_row[1]:
                        ua_by_ja4[str(ua_row[0])] = list(ua_row[1])
            except Exception:
                # Best effort: UA enrichment is optional, but the failure is
                # now visible in the logs instead of being dropped silently.
                logging.getLogger(__name__).warning(
                    "JA4 spoofing: user-agent lookup failed; continuing without UA data",
                    exc_info=True,
                )

        items = []
        for row in result.result_rows:
            ja4 = str(row[0])
            ua_ch_mismatch_pct = float(row[4] or 0)
            sni_mismatch_pct = float(row[6] or 0)
            alpn_mismatch_pct = float(row[8] or 0)
            avg_browser_score = float(row[9] or 0)
            rare_ja4_pct = float(row[11] or 0)
            ua_rotating_pct = float(row[13] or 0)
            alpn_missing_count = int(row[14] or 0)
            total = int(row[1] or 1)  # 1 avoids a zero threshold below

            top_uas = ua_by_ja4.get(ja4, [])
            ua_classes = [_classify_ua(u) for u in top_uas]
            has_bot_ua = "bot" in ua_classes
            has_browser_ua = "browser" in ua_classes

            # Spoofing confidence score, capped at 100:
            #   UA/Client-Hints mismatch rate ........ weight 0.40
            #   browser score when a bot UA is seen .. up to 25 points
            #   SNI mismatch rate .................... weight 0.10
            #   ALPN mismatch rate ................... weight 0.05
            #   rare-JA4 rate ........................ weight 0.10
            #   UA-rotation rate ..................... weight 0.10
            #   ALPN missing on >30% of detections ... flat 10 points
            spoof_score = min(100, round(
                ua_ch_mismatch_pct * 0.40
                + (avg_browser_score * 25 / 100 if has_bot_ua else 0)
                + sni_mismatch_pct * 0.10
                + alpn_mismatch_pct * 0.05
                + rare_ja4_pct * 0.10
                + ua_rotating_pct * 0.10
                + (10 if alpn_missing_count > total * 0.3 else 0)
            ))

            # JA4 classification; the first matching rule wins.
            if spoof_score >= 60:
                classification = "spoofed_browser"
            elif has_bot_ua and avg_browser_score < 30:
                classification = "known_bot"
            elif has_browser_ua and ua_ch_mismatch_pct < 10:
                classification = "legitimate_browser"
            else:
                classification = "suspicious"

            items.append({
                "ja4": ja4,
                "classification": classification,
                "spoofing_score": spoof_score,
                "total_detections": int(row[1] or 0),
                "unique_ips": int(row[2] or 0),
                "indicators": {
                    "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                    "sni_mismatch_pct": sni_mismatch_pct,
                    "alpn_mismatch_pct": alpn_mismatch_pct,
                    "avg_browser_score": round(avg_browser_score, 1),
                    "rare_ja4_pct": rare_ja4_pct,
                    "ua_rotating_pct": ua_rotating_pct,
                    "alpn_missing_count": alpn_missing_count,
                    "avg_asn_concentration": round(float(row[18] or 0), 3),
                    "avg_country_concentration": round(float(row[19] or 0), 3),
                },
                "top_user_agents": [
                    {"ua": u, "type": ua_type}
                    for u, ua_type in zip(top_uas, ua_classes)
                ],
                "threat_breakdown": {
                    "critical": int(row[16] or 0),
                    "high": int(row[17] or 0),
                    "last_level": str(row[20] or "LOW"),
                },
            })

        # Highest spoofing score first (so spoofed_browser entries lead),
        # ties broken by detection volume.
        items.sort(key=lambda x: (-x["spoofing_score"], -x["total_detections"]))

        def _count(label: str) -> int:
            return sum(1 for i in items if i["classification"] == label)

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "summary": {
                "spoofed_browser": _count("spoofed_browser"),
                "known_bot": _count("known_bot"),
                "suspicious": _count("suspicious"),
                "legitimate_browser": _count("legitimate_browser"),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 2 — Matrice JA4 × User-Agent
# =============================================================================
@router.get("/ja4-ua-matrix")
async def get_ja4_ua_matrix(
    hours: int = Query(24, ge=1, le=168),
    min_ips: int = Query(3, ge=1, description="Nombre minimum d'IPs pour inclure un JA4"),
    limit: int = Query(30, ge=1, le=100),
):
    """
    JA4 x User-Agent matrix.

    For each JA4:
    - top associated User-Agents (from view_dashboard_user_agents, at most 8)
    - ua_ch_mismatch rate
    - UA classification (bot / browser / script)
    - spoofing flag when the browser score is high but the UAs are bots

    Args:
        hours: Look-back window, in hours.
        min_ips: Minimum number of distinct source IPs for a JA4 to be listed.
        limit: Maximum number of JA4 rows fetched.

    Returns:
        A dict with ``items``, ``total`` and ``period_hours``.

    Raises:
        HTTPException: 500 when the JA4 statistics query or the response
            assembly fails. A failure of the User-Agent lookup is logged and
            tolerated.
    """
    import logging  # function-scope import so the block stays self-contained

    try:
        # JA4 statistics from ml_detected_anomalies (read positionally below).
        stats_query = f"""
        SELECT
            ja4,
            uniq(src_ip) AS unique_ips,
            count() AS total_detections,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_count,
            countIf(is_ua_rotating = true) AS rotating_count,
            argMax(threat_level, detected_at) AS last_threat
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
        ORDER BY ua_ch_mismatch_pct DESC, unique_ips DESC
        LIMIT %(limit)s
        """
        stats_res = db.query(stats_query, {"hours": hours, "min_ips": min_ips, "limit": limit})
        ja4_list = [str(r[0]) for r in stats_res.result_rows]
        if not ja4_list:
            return {"items": [], "total": 0, "period_hours": hours}

        # User-Agents per JA4. The JA4 list and the window are bound as
        # parameters rather than interpolated into the SQL text.
        ua_query = f"""
        SELECT
            ja4,
            ua,
            sum(requests) AS cnt
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE ja4 IN %(ja4_list)s
          AND hour >= now() - INTERVAL %(hours)s HOUR
          AND ua != ''
        GROUP BY ja4, ua
        ORDER BY ja4, cnt DESC
        """
        ua_by_ja4: dict = {}
        try:
            ua_res = db.query(ua_query, {"ja4_list": ja4_list, "hours": hours})
            for row in ua_res.result_rows:
                bucket = ua_by_ja4.setdefault(str(row[0]), [])
                # Rows arrive ordered by descending count: keep the top 8.
                if len(bucket) < 8:
                    bucket.append({"ua": str(row[1]), "count": int(row[2] or 0)})
        except Exception:
            # Best effort: the matrix is still returned without UA details,
            # but the failure is logged instead of being dropped silently.
            logging.getLogger(__name__).warning(
                "JA4/UA matrix: user-agent lookup failed; continuing without UA data",
                exc_info=True,
            )

        items = []
        for row in stats_res.result_rows:
            ja4 = str(row[0])
            unique_ips = int(row[1] or 0)
            ua_ch_mismatch_pct = float(row[3] or 0)
            avg_browser_score = float(row[4] or 0)

            top_uas = ua_by_ja4.get(ja4, [])
            # Percentages are relative to the retained (top 8) UAs only.
            ua_total = sum(u["count"] for u in top_uas) or 1
            classified_uas = [
                {
                    "ua": u["ua"],
                    "count": u["count"],
                    "pct": round(u["count"] * 100 / ua_total, 1),
                    "type": _classify_ua(u["ua"]),
                }
                for u in top_uas
            ]

            def _share(ua_type: str) -> float:
                return sum(u["pct"] for u in classified_uas if u["type"] == ua_type)

            bot_pct = _share("bot")
            browser_pct = _share("browser")
            # Summed like the other shares, so a JA4 with no UA data reports
            # 0% scripts rather than a misleading 100%.
            script_pct = _share("script")

            # Spoofing flag: the JA4 looks like a browser (high browser score)
            # while a large share of its UAs are bots and UA/CH often mismatch.
            is_spoofing = avg_browser_score > 50 and bot_pct > 30 and ua_ch_mismatch_pct > 20

            items.append({
                "ja4": ja4,
                "unique_ips": unique_ips,
                "total_detections": int(row[2] or 0),
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "avg_browser_score": round(avg_browser_score, 1),
                "rare_count": int(row[5] or 0),
                "rotating_count": int(row[6] or 0),
                "last_threat": str(row[7] or "LOW"),
                "user_agents": classified_uas,
                "ua_summary": {
                    "bot_pct": round(bot_pct, 1),
                    "browser_pct": round(browser_pct, 1),
                    "script_pct": round(script_pct, 1),
                    "total_distinct": len(top_uas),
                },
                "is_spoofing_suspect": is_spoofing,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 3 — Analyse globale des User-Agents
# =============================================================================
@router.get("/ua-analysis")
async def get_ua_analysis(
    hours: int = Query(24, ge=1, le=168),
    limit: int = Query(50, ge=1, le=200),
):
    """
    Global analysis of the User-Agents seen in the dashboard data.

    Highlights:
    - bot/script User-Agents
    - User-Agents associated with many distinct JA4 fingerprints (rotation)
    - source IPs flagged with is_ua_rotating

    Args:
        hours: Look-back window, in hours.
        limit: Maximum number of User-Agents returned.

    Returns:
        A dict with ``items`` (one entry per User-Agent), ``total``,
        ``period_hours``, ``ua_rotating_stats`` and a ``summary`` by UA type.

    Raises:
        HTTPException: 500 when the top-UA query or the response assembly
            fails. Failures of the two enrichment queries (JA4 per UA,
            rotating IPs) are logged and tolerated.
    """
    import logging  # function-scope import so the block stays self-contained

    log = logging.getLogger(__name__)
    try:
        # Top User-Agents overall. This must be an f-string: without the
        # prefix the database placeholder was sent to the server literally.
        # NOTE(review): the column is named ip_count but is sum(requests);
        # the name is kept because it is part of the response contract.
        ua_global_query = f"""
        SELECT
            ua,
            sum(requests) AS ip_count
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE hour >= now() - INTERVAL %(hours)s HOUR
          AND ua != ''
        GROUP BY ua
        ORDER BY ip_count DESC
        LIMIT %(limit)s
        """
        ua_global_res = db.query(ua_global_query, {"hours": hours, "limit": limit})
        top_uas = [str(r[0]) for r in ua_global_res.result_rows]

        # JA4 fingerprints per User-Agent (first 50 UAs only). User-Agent
        # strings are attacker-controlled, so they are bound as a parameter
        # instead of being quoted by hand into the SQL text.
        ja4_by_ua: dict = {}
        if top_uas:
            ja4_per_ua_query = f"""
            SELECT
                ua,
                uniq(ja4) AS unique_ja4s,
                groupUniqArray(3)(ja4) AS sample_ja4s
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE ua IN %(uas)s
              AND hour >= now() - INTERVAL %(hours)s HOUR
              AND ua != ''
              AND ja4 != ''
            GROUP BY ua
            """
            try:
                ja4_res = db.query(ja4_per_ua_query, {"uas": top_uas[:50], "hours": hours})
                for r in ja4_res.result_rows:
                    ja4_by_ua[str(r[0])] = {
                        "unique_ja4s": int(r[1] or 0),
                        "sample_ja4s": list(r[2] or []),
                    }
            except Exception:
                # Best effort: items fall back to zero JA4s, failure is logged.
                log.warning("UA analysis: JA4-per-UA lookup failed", exc_info=True)

        # Source IPs flagged is_ua_rotating in ml_detected_anomalies.
        rotating_query = f"""
        SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
            avg(ua_ch_mismatch) AS avg_ua_ch_mismatch
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND is_ua_rotating = true
        GROUP BY clean_ip
        ORDER BY avg_ua_ch_mismatch DESC
        """
        rotating_ips: list = []
        try:
            rot_res = db.query(rotating_query, {"hours": hours})
            rotating_ips = [str(r[0]) for r in rot_res.result_rows]
        except Exception:
            # Best effort: rotation stats are reported as empty, failure logged.
            log.warning("UA analysis: rotating-IP lookup failed", exc_info=True)

        # Build the response.
        no_ja4_info = {"unique_ja4s": 0, "sample_ja4s": []}
        items = []
        for row in ua_global_res.result_rows:
            ua = str(row[0])
            ip_count = int(row[1] or 0)
            ua_type = _classify_ua(ua)
            ja4_info = ja4_by_ua.get(ua, no_ja4_info)
            unique_ja4s = ja4_info["unique_ja4s"]
            items.append({
                "user_agent": ua,
                "type": ua_type,
                "ip_count": ip_count,
                "unique_ja4_count": unique_ja4s,
                "sample_ja4s": ja4_info["sample_ja4s"],
                # A UA seen with many JA4s is suspect: a real browser
                # usually shows only one or two fingerprints.
                "is_multi_ja4_suspect": unique_ja4s > 3,
                "risk_flags": _build_ua_risk_flags(ua, ua_type, unique_ja4s, ip_count),
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "ua_rotating_stats": {
                "rotating_ip_count": len(rotating_ips),
                "sample_rotating_ips": rotating_ips[:10],
            },
            "summary": {
                "bot_count": sum(1 for i in items if i["type"] == "bot"),
                "browser_count": sum(1 for i in items if i["type"] == "browser"),
                "script_count": sum(1 for i in items if i["type"] == "script"),
                "multi_ja4_suspect_count": sum(1 for i in items if i["is_multi_ja4_suspect"]),
            },
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
def _build_ua_risk_flags(ua: str, ua_type: str, unique_ja4s: int, ip_count: int) -> list:
"""Construit la liste des indicateurs de risque pour un User-Agent."""
flags = []
if ua_type == "bot":
flags.append("ua_bot_signature")
elif ua_type == "script":
flags.append("ua_script_library")
if unique_ja4s > 5:
flags.append("ja4_rotation_suspect")
if unique_ja4s > 3 and ua_type == "browser":
flags.append("browser_ua_multi_fingerprint")
if ip_count > 100:
flags.append("high_volume")
return flags
# =============================================================================
# ENDPOINT 4 — JA4 d'un IP spécifique: analyse de cohérence UA/JA4
# =============================================================================
@router.get("/ip/{ip}/coherence")
async def get_ip_fingerprint_coherence(ip: str):
    """
    Analyse JA4 / User-Agent coherence for a single IP.

    Answers the question "was this IP spoofing its fingerprint?" with a score
    built from:
    - ua_ch_mismatch (User-Agent vs Client Hints)
    - modern_browser_score versus the actual UA type
    - number of distinct JA4s used (rotation)
    - sni_host_mismatch and alpn_http_mismatch
    - is_ua_rotating and is_rare_ja4

    Args:
        ip: Source IP, compared as-is against ``src_ip``.

    Returns:
        A dict with the ``verdict``, the ``spoofing_score`` (0-100), a
        human-readable ``explanation`` list, aggregated ``indicators``, the
        ``fingerprints`` seen, the ``user_agents`` of the last 72 hours and
        the ``latest_detection`` details.

    Raises:
        HTTPException: 404 when the IP has no detection row, 500 on any
            other failure.
    """
    try:
        # All detection rows for the IP, most recent first (read positionally).
        ml_query = f"""
        SELECT
            ja4,
            ua_ch_mismatch,
            modern_browser_score,
            sni_host_mismatch,
            alpn_http_mismatch,
            is_alpn_missing,
            is_rare_ja4,
            is_ua_rotating,
            distinct_ja4_count,
            header_count,
            has_accept_language,
            has_cookie,
            has_referer,
            header_order_shared_count,
            detected_at,
            threat_level,
            window_mss_ratio,
            tcp_jitter_variance,
            multiplexing_efficiency
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE src_ip = %(ip)s
        ORDER BY detected_at DESC
        """
        ml_res = db.query(ml_query, {"ip": ip})
        if not ml_res.result_rows:
            raise HTTPException(status_code=404, detail="IP non trouvée dans les détections")

        # Actual User-Agents of the IP. This must be an f-string: without the
        # prefix the database placeholder reached the server literally and
        # the whole endpoint failed with a 500.
        ua_query = f"""
        SELECT ua, sum(requests) AS cnt
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
        ARRAY JOIN user_agents AS ua
        WHERE toString(src_ip) = %(ip)s
          AND hour >= now() - INTERVAL 72 HOUR
          AND ua != ''
        GROUP BY ua ORDER BY cnt DESC
        """
        ua_res = db.query(ua_query, {"ip": ip})
        top_uas = [
            {"ua": str(r[0]), "count": int(r[1] or 0), "type": _classify_ua(str(r[0]))}
            for r in ua_res.result_rows
        ]

        # Aggregate the indicators over every detection row.
        rows = ml_res.result_rows
        latest = rows[0]
        total_rows = len(rows)  # >= 1, guaranteed by the 404 guard above
        ua_ch_mismatch_count = sum(1 for r in rows if r[1])
        sni_mismatch_count = sum(1 for r in rows if r[3])
        alpn_mismatch_count = sum(1 for r in rows if r[4])
        is_rare_count = sum(1 for r in rows if r[6])
        is_rotating = any(r[7] for r in rows)
        distinct_ja4s = {str(r[0]) for r in rows if r[0]}
        avg_browser_score = sum(int(r[2] or 0) for r in rows) / total_rows

        # User-Agent analysis.
        has_browser_ua = any(u["type"] == "browser" for u in top_uas)
        has_bot_ua = any(u["type"] == "bot" for u in top_uas)

        ua_ch_rate = ua_ch_mismatch_count / total_rows * 100
        sni_rate = sni_mismatch_count / total_rows * 100
        alpn_rate = alpn_mismatch_count / total_rows * 100

        # Spoofing score, capped at 100: weighted mismatch rates plus flat
        # bonuses for JA4 rotation (>2 JA4s), UA rotation and mostly-rare JA4s.
        spoof_score = min(100, round(
            ua_ch_rate * 0.40
            + (avg_browser_score * 0.20 if has_bot_ua else 0)
            + sni_rate * 0.10
            + alpn_rate * 0.05
            + (len(distinct_ja4s) * 5 if len(distinct_ja4s) > 2 else 0)
            + (15 if is_rotating else 0)
            + (10 if is_rare_count > total_rows * 0.5 else 0)
        ))

        # Verdict; the first matching rule wins.
        if spoof_score >= 70:
            verdict = "high_confidence_spoofing"
        elif spoof_score >= 40:
            verdict = "suspicious_spoofing"
        elif has_bot_ua and avg_browser_score < 20:
            verdict = "known_bot_no_spoofing"
        elif has_browser_ua and spoof_score < 20:
            verdict = "legitimate_browser"
        else:
            verdict = "inconclusive"

        # Human-readable explanation (messages are user-facing, kept as-is).
        explanation = []
        if ua_ch_mismatch_count > total_rows * 0.3:
            explanation.append(f"UA-Client-Hints mismatch sur {round(ua_ch_mismatch_count*100/total_rows)}% des requêtes")
        if has_bot_ua and avg_browser_score > 40:
            explanation.append(f"JA4 ressemble à un navigateur (score {round(avg_browser_score)}/100) mais UA est de type bot")
        if len(distinct_ja4s) > 2:
            explanation.append(f"{len(distinct_ja4s)} JA4 distincts utilisés → rotation de fingerprint")
        if is_rotating:
            explanation.append("is_ua_rotating détecté → rotation d'User-Agent confirmée")
        if sni_mismatch_count > 0:
            explanation.append(f"SNI ≠ Host header sur {sni_mismatch_count}/{total_rows} requêtes")
        if not explanation:
            explanation.append("Aucun indicateur de spoofing majeur détecté")

        return {
            "ip": ip,
            "verdict": verdict,
            "spoofing_score": spoof_score,
            "explanation": explanation,
            "indicators": {
                "ua_ch_mismatch_rate": round(ua_ch_rate, 1),
                "sni_mismatch_rate": round(sni_rate, 1),
                "alpn_mismatch_rate": round(alpn_rate, 1),
                "avg_browser_score": round(avg_browser_score, 1),
                "distinct_ja4_count": len(distinct_ja4s),
                "is_ua_rotating": is_rotating,
                "rare_ja4_rate": round(is_rare_count / total_rows * 100, 1),
            },
            "fingerprints": {
                # Sorted so the response is stable from one call to the next.
                "ja4_list": sorted(distinct_ja4s),
                "latest_ja4": str(latest[0] or ""),
            },
            "user_agents": top_uas,
            "latest_detection": {
                "detected_at": latest[14].isoformat() if latest[14] else "",
                "threat_level": str(latest[15] or "LOW"),
                "modern_browser_score": int(latest[2] or 0),
                "header_count": int(latest[9] or 0),
                "has_accept_language": bool(latest[10]),
                "has_cookie": bool(latest[11]),
                "has_referer": bool(latest[12]),
                "header_order_shared_count": int(latest[13] or 0),
            },
        }
    except HTTPException:
        # Let the deliberate 404 through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e
# =============================================================================
# ENDPOINT 5 — JA4 légitimes (baseline / whitelist)
# =============================================================================
@router.get("/legitimate-ja4")
async def get_legitimate_ja4(
    hours: int = Query(168, ge=24, le=720, description="Fenêtre pour établir la baseline"),
    min_ips: int = Query(50, ge=5, description="Nombre minimum d'IPs pour qualifier un JA4 de légitime"),
):
    """
    Build a baseline of legitimate-looking JA4 fingerprints.

    A JA4 qualifies when, over the window:
    - it is used by at least ``min_ips`` distinct IPs
    - its ua_ch_mismatch rate is below 5%
    - its average modern_browser_score is above 60
    - none of its detections is flagged is_rare_ja4

    The result is meant as whitelist candidates to reduce false positives.

    Returns:
        A dict with ``items`` (ordered by descending IP count), ``total``,
        ``period_hours`` and an advisory ``note``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        baseline_sql = f"""
        SELECT
            ja4,
            uniq(src_ip) AS unique_ips,
            count() AS total_detections,
            round(countIf(ua_ch_mismatch = true) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            avg(modern_browser_score) AS avg_browser_score,
            countIf(is_rare_ja4 = true) AS rare_count,
            round(countIf(threat_level = 'CRITICAL') * 100.0 / count(), 2) AS critical_pct,
            round(countIf(threat_level = 'HIGH') * 100.0 / count(), 2) AS high_pct
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND ja4 != '' AND ja4 IS NOT NULL
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
           AND ua_ch_mismatch_pct < 5.0
           AND avg_browser_score > 60
           AND rare_count = 0
        ORDER BY unique_ips DESC
        """
        baseline = db.query(baseline_sql, {"hours": hours, "min_ips": min_ips})

        items = []
        for record in baseline.result_rows:
            ip_total = int(record[1] or 0)
            mismatch_pct = float(record[3] or 0)
            browser_score = float(record[4] or 0)
            # Confidence out of 100: up to 40 points for a low mismatch rate,
            # 0.40 x the browser score, and up to 20 points for IP volume
            # relative to min_ips.
            confidence = min(100, round(
                (1 - mismatch_pct / 100) * 40
                + browser_score * 0.40
                + min(ip_total / min_ips, 1) * 20
            ))
            items.append({
                "ja4": str(record[0]),
                "unique_ips": ip_total,
                "total_detections": int(record[2] or 0),
                "ua_ch_mismatch_pct": mismatch_pct,
                "avg_browser_score": round(browser_score, 1),
                "critical_pct": float(record[6] or 0),
                "high_pct": float(record[7] or 0),
                "legitimacy_confidence": confidence,
            })

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
            "note": "Ces JA4 sont candidats à une whitelist. Vérifier manuellement avant de whitelister.",
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT — Corrélation JA4 × ASN / Pays (C5)
# Détecte les JA4 fortement concentrés sur un seul ASN ou pays
# → signal de botnet ciblé ou d'infrastructure de test/attaque partagée
# =============================================================================
@router.get("/asn-correlation")
async def get_ja4_asn_correlation(
    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
    limit: int = Query(50, ge=1, le=200),
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle en heures"),
):
    """
    Identify JA4 fingerprints heavily concentrated on one ASN or country.

    A JA4 with asn_concentration >= 0.7 means that at least 70% of the IPs
    using the fingerprint fall in its largest group, which hints at shared
    bot infrastructure or a suspicious datacenter.

    Args:
        min_concentration: Minimum ASN or country concentration to be listed.
        min_ips: Minimum number of IPs per JA4.
        limit: Maximum number of JA4 rows returned.
        hours: Look-back window, in hours (previously fixed at 24, which
            remains the default).

    Returns:
        A dict with ``items`` (one entry per JA4 with its concentration
        figures, ``correlation_type`` and ``risk``) and ``total``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Two passes: the inner query counts IPs per
        # (ja4, src_asn, src_country_code) group, the outer query folds those
        # groups per JA4 and derives the concentration ratios.
        #
        # NOTE(review): ips_per_combo and country_ips are the same expression
        # over the same group, so asn_concentration and country_concentration
        # are computed from identical inputs. A true per-ASN / per-country
        # concentration would need separate groupings. Left unchanged here
        # because the table schema is not visible; confirm and fix upstream.
        # NOTE(review): unique_ips is a sum of per-group distinct counts, so
        # an IP present in several groups would be counted more than once.
        sql = f"""
        SELECT
            ja4,
            sum(ips_per_combo) AS unique_ips,
            uniq(src_asn) AS unique_asns,
            uniq(src_country_code) AS unique_countries,
            toString(argMax(src_asn, ips_per_combo)) AS top_asn_number,
            argMax(asn_name, ips_per_combo) AS top_asn_name,
            argMax(src_country_code, country_ips) AS dominant_country,
            sum(total_hits) AS total_hits,
            round(max(ips_per_combo) / greatest(sum(ips_per_combo), 1), 3) AS asn_concentration,
            round(max(country_ips) / greatest(sum(ips_per_combo), 1), 3) AS country_concentration
        FROM (
            SELECT
                ja4,
                src_asn,
                src_country_code,
                any(src_as_name) AS asn_name,
                uniq(src_ip) AS ips_per_combo,
                uniq(src_ip) AS country_ips,
                sum(hits) AS total_hits
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL %(hours)s HOUR
              AND ja4 != ''
            GROUP BY ja4, src_asn, src_country_code
        )
        GROUP BY ja4
        HAVING unique_ips >= %(min_ips)s
           AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
        ORDER BY asn_concentration DESC, unique_ips DESC
        LIMIT %(limit)s
        """
        result = db.query(sql, {
            "min_ips": min_ips,
            "min_conc": min_concentration,
            "limit": limit,
            "hours": hours,
        })

        items = []
        for row in result.result_rows:
            asn_concentration = float(row[8] or 0)
            country_concentration = float(row[9] or 0)

            # Correlation type; the first matching rule wins.
            if asn_concentration >= 0.85:
                corr_type, risk = "asn_monopoly", "high"
            elif asn_concentration >= min_concentration:
                corr_type, risk = "asn_dominant", "medium"
            elif country_concentration >= min_concentration:
                corr_type, risk = "geo_targeted", "medium"
            else:
                corr_type, risk = "distributed", "low"

            items.append({
                "ja4": str(row[0]),
                "unique_ips": int(row[1]),
                "unique_asns": int(row[2]),
                "unique_countries": int(row[3]),
                "top_asn_name": str(row[5] or ""),
                "top_asn_number": str(row[4] or ""),
                "dominant_country": str(row[6] or ""),
                "total_hits": int(row[7] or 0),
                "asn_concentration": asn_concentration,
                "country_concentration": country_concentration,
                "correlation_type": corr_type,
                "risk": risk,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e

View File

@ -1,102 +0,0 @@
"""
Endpoints pour l'analyse des empreintes d'en-têtes HTTP
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
# Router collecting the HTTP header-fingerprint endpoints; every route
# declared below is served under the /api/headers prefix.
router = APIRouter(prefix="/api/headers", tags=["header_fingerprint"])
@router.get("/clusters")
async def get_header_clusters(limit: int = Query(50, ge=1, le=200)):
    """Return header-fingerprint clusters grouped by header_order_hash.

    Args:
        limit: Maximum number of clusters returned (largest IP count first).

    Returns:
        A dict with ``clusters`` (one entry per hash, with a
        ``classification`` of "legitimate", "bot_suspicious" or "mixed") and
        ``total_clusters`` (number of distinct hashes in the table).

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Per-hash aggregation; rows are read positionally below.
        sql = f"""
        SELECT
            header_order_hash AS hash,
            uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
            avg(modern_browser_score) AS avg_browser_score,
            sum(ua_ch_mismatch) AS ua_ch_mismatch_count,
            round(sum(ua_ch_mismatch) * 100.0 / count(), 2) AS ua_ch_mismatch_pct,
            groupArray(5)(sec_fetch_mode) AS top_sec_fetch_modes,
            round(sum(has_cookie) * 100.0 / count(), 2) AS has_cookie_pct,
            round(sum(has_referer) * 100.0 / count(), 2) AS has_referer_pct
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h
        GROUP BY header_order_hash
        ORDER BY unique_ips DESC
        LIMIT %(limit)s
        """
        result = db.query(sql, {"limit": limit})

        total_sql = f"""
        SELECT uniq(header_order_hash)
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h
        """
        total_clusters = int(db.query(total_sql).result_rows[0][0])

        clusters = []
        for row in result.result_rows:
            avg_browser_score = float(row[2] or 0)
            ua_ch_mismatch_pct = float(row[4] or 0)
            # De-duplicate while keeping first-seen order: list(set(...))
            # returned the modes in an order that could differ between
            # processes, making the response unstable.
            top_modes = list(dict.fromkeys(str(m) for m in (row[5] or [])))

            if avg_browser_score >= 90 and ua_ch_mismatch_pct < 5:
                classification = "legitimate"
            elif ua_ch_mismatch_pct > 50:
                classification = "bot_suspicious"
            else:
                classification = "mixed"

            clusters.append({
                "hash": str(row[0]),
                "unique_ips": int(row[1]),
                "avg_browser_score": round(avg_browser_score, 2),
                "ua_ch_mismatch_count": int(row[3]),
                "ua_ch_mismatch_pct": ua_ch_mismatch_pct,
                "top_sec_fetch_modes": top_modes,
                "has_cookie_pct": float(row[6] or 0),
                "has_referer_pct": float(row[7] or 0),
                "classification": classification,
            })
        return {"clusters": clusters, "total_clusters": total_clusters}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/cluster/{hash}/ips")
async def get_cluster_ips(hash: str, limit: int = Query(50, ge=1, le=500)):
    """List the IPs that belong to one header-fingerprint cluster.

    Args:
        hash: The header_order_hash identifying the cluster (the name shadows
            the builtin but is part of the route contract).
        limit: Maximum number of IPs returned (highest browser score first).

    Returns:
        A dict with ``items`` and ``total``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        members_sql = f"""
        SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
            any(modern_browser_score) AS browser_score,
            any(ua_ch_mismatch) AS ua_ch_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode,
            any(sec_fetch_dest) AS sec_fetch_dest
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h
        WHERE header_order_hash = %(hash)s
        GROUP BY src_ip
        ORDER BY browser_score DESC
        LIMIT %(limit)s
        """
        members = db.query(members_sql, {"hash": hash, "limit": limit})
        items = [
            {
                "ip": str(record[0]),
                "browser_score": int(record[1] or 0),
                "ua_ch_mismatch": int(record[2] or 0),
                "sec_fetch_mode": str(record[3] or ""),
                "sec_fetch_dest": str(record[4] or ""),
            }
            for record in members.result_rows
        ]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,145 +0,0 @@
"""
Endpoints pour la heatmap temporelle (hits par heure / hôte)
"""
from collections import defaultdict
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
# Router collecting the temporal heatmap endpoints; every route declared
# below is served under the /api/heatmap prefix.
router = APIRouter(prefix="/api/heatmap", tags=["heatmap"])
@router.get("/hourly")
async def get_heatmap_hourly():
    """Return hits aggregated by hour of day over the last 72 hours.

    Rows are grouped on ``toHour(window_start)``, so each bucket merges the
    same clock hour across the days covered by the window.

    Returns:
        A dict with ``hours``: a list of buckets in ascending hour order, each
        with ``hour``, ``hits``, ``unique_ips`` and ``max_rps``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        hourly_sql = f"""
        SELECT
            toHour(window_start) AS hour,
            sum(hits) AS hits,
            uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
            max(max_requests_per_sec) AS max_rps
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
        GROUP BY hour
        ORDER BY hour ASC
        """
        field_names = ("hour", "hits", "unique_ips", "max_rps")
        buckets = []
        for record in db.query(hourly_sql).result_rows:
            buckets.append(
                {name: int(record[pos]) for pos, name in enumerate(field_names)}
            )
        return {"hours": buckets}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/top-hosts")
async def get_heatmap_top_hosts(limit: int = Query(20, ge=1, le=100)):
    """Return the most-hit hosts with their hits per hour of day.

    Both queries cover the last 72 hours; the hourly breakdown has 24 slots
    (one per clock hour), each summing the matching hours of the window.

    Args:
        limit: Maximum number of hosts returned (highest total hits first).

    Returns:
        A dict with ``items``: one entry per host with ``total_hits``,
        ``unique_ips``, ``unique_ja4s`` and a 24-element ``hourly_hits`` list.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Overall statistics per host.
        agg_sql = f"""
        SELECT
            host,
            sum(hits) AS total_hits,
            uniq(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS unique_ips,
            uniq(ja4) AS unique_ja4s
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
        GROUP BY host
        ORDER BY total_hits DESC
        LIMIT %(limit)s
        """
        agg_res = db.query(agg_sql, {"limit": limit})

        top_hosts = []
        host_stats = {}
        for record in agg_res.result_rows:
            name = str(record[0])
            top_hosts.append(name)
            host_stats[name] = {
                "host": name,
                "total_hits": int(record[1]),
                "unique_ips": int(record[2]),
                "unique_ja4s": int(record[3]),
            }
        if not top_hosts:
            return {"items": []}

        # Per-host breakdown by hour of day.
        hourly_sql = f"""
        SELECT
            host,
            toHour(window_start) AS hour,
            sum(hits) AS hits
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
          AND host IN %(hosts)s
        GROUP BY host, hour
        """
        hourly_res = db.query(hourly_sql, {"hosts": top_hosts})

        per_host_hours: dict = {}
        for record in hourly_res.result_rows:
            slots = per_host_hours.setdefault(str(record[0]), [0] * 24)
            slots[int(record[1])] += int(record[2])

        # Hosts without any hourly row get an all-zero breakdown.
        items = [
            {**host_stats[name], "hourly_hits": per_host_hours.get(name, [0] * 24)}
            for name in top_hosts
        ]
        return {"items": items}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/matrix")
async def get_heatmap_matrix():
    """Return the top-15 hosts x 24 hours matrix (sum of hits) over 72 hours.

    Returns:
        A dict with ``hosts`` (at most 15 host names, highest total hits
        first) and ``matrix`` (one 24-element list of hits per host, indexed
        by hour of day, in the same order as ``hosts``).

    Raises:
        HTTPException: 500 on any failure.
    """
    top_n = 15  # size of the matrix advertised by this endpoint
    try:
        # The LIMIT was missing: the endpoint is documented as a top-15
        # matrix but previously returned every host in the window.
        top_sql = f"""
        SELECT host, sum(hits) AS total_hits
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
        GROUP BY host
        ORDER BY total_hits DESC
        LIMIT %(top_n)s
        """
        top_res = db.query(top_sql, {"top_n": top_n})
        top_hosts = [str(r[0]) for r in top_res.result_rows]
        if not top_hosts:
            return {"hosts": [], "matrix": []}

        cell_sql = f"""
        SELECT
            host,
            toHour(window_start) AS hour,
            sum(hits) AS hits
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 72 HOUR
          AND host IN %(hosts)s
        GROUP BY host, hour
        """
        cell_res = db.query(cell_sql, {"hosts": top_hosts})

        # One 24-slot row per host, created on first access so that hosts
        # without any cell still yield an all-zero row.
        matrix_map: dict = defaultdict(lambda: [0] * 24)
        for row in cell_res.result_rows:
            matrix_map[str(row[0])][int(row[1])] += int(row[2])

        matrix = [matrix_map[h] for h in top_hosts]
        return {"hosts": top_hosts, "matrix": matrix}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

View File

@ -1,267 +0,0 @@
"""
Routes pour la gestion des incidents clusterisés
"""
import hashlib
from fastapi import APIRouter, HTTPException, Query
from typing import List, Optional
from datetime import datetime
from ..database import db
from ..config import settings
# Router collecting the clustered-incident endpoints; every route declared
# below is served under the /api/incidents prefix.
router = APIRouter(prefix="/api/incidents", tags=["incidents"])
@router.get("/clusters")
async def get_incident_clusters(
    hours: int = Query(24, ge=1, le=168, description="Fenêtre temporelle en heures"),
    min_severity: str = Query("LOW", description="Niveau de sévérité minimum"),
    limit: int = Query(20, ge=1, le=100, description="Nombre maximum de clusters")
):
    """Return automatically clustered incidents, one cluster per /24 subnet.

    Detections from ``ml_detected_anomalies`` within the last ``hours`` hours
    are grouped by /24 subnet (at least two detections per subnet). Each
    cluster is enriched, best-effort, with the first user agent recorded for
    its sample IP and with a trend computed against the previous window of
    the same duration.

    Args:
        hours: Size of the look-back window in hours.
        min_severity: Lowest severity to return (LOW/MEDIUM/HIGH/CRITICAL).
            The filter is applied after the SQL ``LIMIT``, so fewer than
            ``limit`` clusters may come back. Unknown values disable the
            filter.
        limit: Maximum number of clusters fetched from the database.

    Returns:
        ``{"items": [...], "total": <len(items)>, "period_hours": hours}``.

    Raises:
        HTTPException: 500 when the main cluster query or the result
            assembly fails.
    """
    import logging  # local import: keeps this block self-contained

    logger = logging.getLogger(__name__)

    severity_rank = {"LOW": 0, "MEDIUM": 1, "HIGH": 2, "CRITICAL": 3}
    # When this coroutine is awaited directly (not through FastAPI), an
    # omitted argument is the Query(...) default object rather than a string.
    if isinstance(min_severity, str):
        min_rank = severity_rank.get(min_severity.upper(), 0)
    else:
        min_rank = 0

    try:
        # Cluster by /24 subnet, keeping the most recent IP as a sample.
        # src_ip is rendered as text and the '::ffff:' prefix of IPv4-mapped
        # addresses is stripped before the dotted form is split into octets.
        cluster_query = f"""
            WITH cleaned_ips AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    ja4,
                    country_code,
                    asn_number,
                    threat_level,
                    anomaly_score
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
            ),
            subnet_groups AS (
                SELECT
                    concat(
                        splitByChar('.', clean_ip)[1], '.',
                        splitByChar('.', clean_ip)[2], '.',
                        splitByChar('.', clean_ip)[3], '.0/24'
                    ) AS subnet,
                    count() AS total_detections,
                    uniq(clean_ip) AS unique_ips,
                    min(detected_at) AS first_seen,
                    max(detected_at) AS last_seen,
                    argMax(ja4, detected_at) AS ja4,
                    argMax(country_code, detected_at) AS country_code,
                    argMax(asn_number, detected_at) AS asn_number,
                    argMax(threat_level, detected_at) AS threat_level,
                    avg(anomaly_score) AS avg_score,
                    argMax(clean_ip, detected_at) AS sample_ip
                FROM cleaned_ips
                GROUP BY subnet
                HAVING total_detections >= 2
            )
            SELECT
                subnet,
                total_detections,
                unique_ips,
                first_seen,
                last_seen,
                ja4,
                country_code,
                asn_number,
                threat_level,
                avg_score,
                sample_ip
            FROM subnet_groups
            ORDER BY avg_score ASC, total_detections DESC
            LIMIT %(limit)s
        """
        result = db.query(cluster_query, {"hours": hours, "limit": limit})

        # Collect sample IPs so the user-agent lookup is a single query.
        sample_ips = [str(row[10]) for row in result.result_rows if row[10]]

        # First recorded user agent per sample IP, from view_dashboard_entities.
        ua_by_ip: dict = {}
        if sample_ips:
            # The IP list is bound as a parameter instead of being spliced
            # into the SQL text.
            ua_query = f"""
                SELECT entity_value, arrayElement(user_agents, 1) AS top_ua
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_entities
                WHERE entity_type = 'ip'
                  AND entity_value IN %(ips)s
                  AND notEmpty(user_agents)
                GROUP BY entity_value, top_ua
                ORDER BY entity_value
            """
            try:
                ua_result = db.query(ua_query, {"ips": sample_ips[:50]})
                for ua_row in ua_result.result_rows:
                    ip_key = str(ua_row[0])
                    # Keep the first non-empty UA seen for each IP.
                    if ip_key not in ua_by_ip and ua_row[1]:
                        ua_by_ip[ip_key] = str(ua_row[1])
            except Exception:
                # UA enrichment is best-effort: report the failure, keep going.
                logger.warning("Incident UA enrichment failed", exc_info=True)

        # Trend: detections in the current window vs. the previous window of
        # the same duration. The CTE only reads the 2 x hours range that the
        # two windows cover.
        trend_query = f"""
            WITH cleaned AS (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
                    detected_at,
                    concat(
                        splitByChar('.', clean_ip)[1], '.',
                        splitByChar('.', clean_ip)[2], '.',
                        splitByChar('.', clean_ip)[3], '.0/24'
                    ) AS subnet
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours2)s HOUR
            ),
            current_window AS (
                SELECT subnet, count() AS cnt
                FROM cleaned
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
                GROUP BY subnet
            ),
            prev_window AS (
                SELECT subnet, count() AS cnt
                FROM cleaned
                WHERE detected_at >= now() - INTERVAL %(hours2)s HOUR
                  AND detected_at < now() - INTERVAL %(hours)s HOUR
                GROUP BY subnet
            )
            SELECT c.subnet, c.cnt AS current_cnt, p.cnt AS prev_cnt
            FROM current_window c
            LEFT JOIN prev_window p ON c.subnet = p.subnet
        """
        trend_by_subnet: dict = {}
        try:
            trend_result = db.query(trend_query, {"hours": hours, "hours2": hours * 2})
            for tr in trend_result.result_rows:
                subnet_key = tr[0]
                curr = tr[1] or 0
                prev = tr[2] or 0
                if prev == 0:
                    # No detection in the previous window: flag as new.
                    trend_by_subnet[subnet_key] = ("new", 100)
                else:
                    pct = round(((curr - prev) / prev) * 100)
                    trend_by_subnet[subnet_key] = ("up" if pct >= 0 else "down", abs(pct))
        except Exception:
            # Trend enrichment is best-effort; clusters fall back to "stable".
            logger.warning("Incident trend computation failed", exc_info=True)

        clusters = []
        for row in result.result_rows:
            subnet = row[0]
            threat_level = row[8] or 'LOW'
            unique_ips = row[2] or 1
            avg_score = abs(row[9] or 0)
            sample_ip = row[10] if row[10] else subnet.split('/')[0]

            # Heuristic risk score, capped at 100.
            critical_count = 1 if threat_level == 'CRITICAL' else 0
            high_count = 1 if threat_level == 'HIGH' else 0
            risk_score = min(100, round(
                (critical_count * 30) +
                (high_count * 20) +
                (unique_ips * 5) +
                (avg_score * 100)
            ))

            if critical_count > 0 or risk_score >= 80:
                severity = "CRITICAL"
            elif high_count > (row[1] or 1) * 0.3 or risk_score >= 60:
                severity = "HIGH"
            elif high_count > 0 or risk_score >= 40:
                severity = "MEDIUM"
            else:
                severity = "LOW"

            # Honour the documented min_severity filter.
            if severity_rank[severity] < min_rank:
                continue

            trend_dir, trend_pct = trend_by_subnet.get(subnet, ("stable", 0))
            primary_ua = ua_by_ip.get(sample_ip, "")

            clusters.append({
                "id": f"INC-{hashlib.md5(subnet.encode()).hexdigest()[:8].upper()}",
                "score": risk_score,
                "severity": severity,
                "total_detections": row[1],
                "unique_ips": row[2],
                "subnet": subnet,
                "sample_ip": sample_ip,
                "ja4": row[5] or "",
                "primary_ua": primary_ua,
                # NOTE(review): this carries the HH:MM of first_seen, not a
                # target host; looks like a placeholder, confirm with the
                # consumer before changing it.
                "primary_target": row[3].strftime('%H:%M') if row[3] else "Unknown",
                "countries": [{"code": row[6] or "XX", "percentage": 100}],
                "asn": str(row[7]) if row[7] else "",
                "first_seen": row[3].isoformat() if row[3] else "",
                "last_seen": row[4].isoformat() if row[4] else "",
                "trend": trend_dir,
                "trend_percentage": trend_pct,
            })

        return {
            "items": clusters,
            "total": len(clusters),
            "period_hours": hours
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{cluster_id}")
async def get_incident_details(cluster_id: str):
    """Placeholder for per-incident details.

    Not implemented yet: every call answers 501 and points the client to the
    cluster listing endpoint.

    Raises:
        HTTPException: always, with status 501.
    """
    not_implemented = "Détails par incident non encore implémentés. Utilisez /api/incidents/clusters pour la liste."
    raise HTTPException(status_code=501, detail=not_implemented)
@router.post("/{cluster_id}/classify")
async def classify_incident(
    cluster_id: str,
    label: str,
    tags: List[str] = None,
    comment: str = ""
):
    """Placeholder for quick incident classification.

    Not implemented yet: every call answers 501 and points the client to the
    per-IP classification endpoint. None of the arguments is read.

    Raises:
        HTTPException: always, with status 501.
    """
    not_implemented = "Classification par incident non encore implémentée. Utilisez /api/analysis/{ip}/classify."
    raise HTTPException(status_code=501, detail=not_implemented)
@router.get("")
async def list_incidents(
    status: str = Query("active", description="Statut des incidents"),
    severity: Optional[str] = Query(None, description="Filtrer par sévérité (LOW/MEDIUM/HIGH/CRITICAL)"),
    hours: int = Query(24, ge=1, le=168)
):
    """List incidents, optionally restricted to one severity.

    Delegates to ``get_incident_clusters`` (up to 100 clusters) and applies
    the ``severity`` filter on the returned items.

    Args:
        status: Accepted for API compatibility; not used by this handler.
        severity: Exact severity to keep, case-insensitive. ``None`` keeps
            every item.
        hours: Size of the look-back window in hours.

    Returns:
        ``{"items": [...], "total": <len(items)>, "period_hours": hours}``.

    Raises:
        HTTPException: errors raised by the delegate are propagated as-is;
            any other failure becomes a 500.
    """
    try:
        # Every argument is passed explicitly: calling the handler directly
        # bypasses FastAPI, so an omitted parameter would be the Query(...)
        # default object instead of a plain value.
        result = await get_incident_clusters(hours=hours, min_severity="LOW", limit=100)
        items = result["items"]

        if severity:
            sev_upper = severity.upper()
            items = [c for c in items if c.get("severity") == sev_upper]

        return {
            "items": items,
            "total": len(items),
            "period_hours": hours,
        }
    except HTTPException:
        # Keep the delegate's status code and message instead of re-wrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")

View File

@ -1,186 +0,0 @@
"""
Endpoint d'investigation enrichie pour une IP donnée.
Agrège en une seule requête les données provenant de toutes les sources :
ml_detected_anomalies, view_form_bruteforce_detected, view_tcp_spoofing_detected,
agg_host_ip_ja4_1h (rotation JA4), view_ip_recurrence, view_ai_features_1h.
"""
from fastapi import APIRouter, HTTPException
from ..database import db
from ..services.tcp_fingerprint import fingerprint_os, detect_spoof, declared_os_from_ua
from ..config import settings
router = APIRouter(prefix="/api/investigation", tags=["investigation"])
@router.get(
    "/{ip}/summary",
    summary="Synthèse complète d'une IP",
    response_description="Score de risque 0-100, détections ML, brute-force, spoofing TCP, rotation JA4, persistance et timeline 24h",
)
async def get_ip_full_summary(ip: str):
    """Return the full investigation summary for one IPv4 address.

    Runs six queries (ML detections, form brute force, TCP fingerprint,
    JA4 rotation, recurrence, 24h timeline) and folds them into a heuristic
    0-100 risk score.

    Args:
        ip: IPv4 address, either plain (``x.x.x.x``) or IPv4-mapped
            (``::ffff:x.x.x.x``).

    Returns:
        A dict with the keys ``ip``, ``risk_score``, ``ml``, ``bruteforce``,
        ``tcp_spoofing``, ``ja4_rotation``, ``persistence`` and
        ``timeline_24h``.

    Raises:
        HTTPException: 400 when ``ip`` is not a valid IPv4 address, 500 on
            any query or processing failure.
    """
    import ipaddress  # local import: keeps this block self-contained

    clean_ip = ip.replace("::ffff:", "").strip()
    # Every query below feeds the value to toIPv4(); reject anything that is
    # not IPv4 up front so bad input is a client error, not a database error.
    try:
        ipaddress.IPv4Address(clean_ip)
    except ValueError:
        raise HTTPException(status_code=400, detail=f"Adresse IPv4 invalide: {ip}") from None

    try:
        # ── 1. ML score / features ────────────────────────────────────────────
        ml_sql = f"""
            SELECT
                max(abs(anomaly_score)) AS max_score,
                any(threat_level) AS threat_level,
                any(bot_name) AS bot_name,
                count() AS total_detections,
                uniq(host) AS distinct_hosts,
                uniq(ja4) AS distinct_ja4
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
        """
        ml_res = db.query(ml_sql, {"ip": clean_ip})
        ml_row = ml_res.result_rows[0] if ml_res.result_rows else None
        ml_data = {
            "max_score": round(float(ml_row[0] or 0), 2) if ml_row else 0,
            "threat_level": str(ml_row[1] or "") if ml_row else "",
            # The "attack_type" key is filled from the bot_name column.
            "attack_type": str(ml_row[2] or "") if ml_row else "",
            "total_detections": int(ml_row[3] or 0) if ml_row else 0,
            "distinct_hosts": int(ml_row[4] or 0) if ml_row else 0,
            "distinct_ja4": int(ml_row[5] or 0) if ml_row else 0,
        }

        # ── 2. Brute force ────────────────────────────────────────────────────
        bf_sql = f"""
            SELECT
                uniq(host) AS hosts_attacked,
                sum(hits) AS total_hits,
                sum(query_params_count) AS total_params,
                groupArray(3)(host) AS top_hosts
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_form_bruteforce_detected
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
        """
        bf_res = db.query(bf_sql, {"ip": clean_ip})
        bf_row = bf_res.result_rows[0] if bf_res.result_rows else None
        bf_data = {
            "active": bool(bf_row and int(bf_row[1] or 0) > 0),
            "hosts_attacked": int(bf_row[0] or 0) if bf_row else 0,
            "total_hits": int(bf_row[1] or 0) if bf_row else 0,
            "total_params": int(bf_row[2] or 0) if bf_row else 0,
            "top_hosts": [str(h) for h in (bf_row[3] or [])] if bf_row else [],
        }

        # ── 3. TCP spoofing — multi-signal fingerprinting ─────────────────────
        tcp_sql = f"""
            SELECT
                any(tcp_ttl_raw) AS ttl,
                any(tcp_win_raw) AS win,
                any(tcp_scale_raw) AS scale,
                any(tcp_mss_raw) AS mss,
                any(first_ua) AS ua
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
              AND window_start >= now() - INTERVAL 24 HOUR
              AND tcp_ttl_raw > 0
            LIMIT 1
        """
        tcp_res = db.query(tcp_sql, {"ip": clean_ip})
        tcp_data = {"detected": False, "tcp_ttl": None, "suspected_os": None}
        if tcp_res.result_rows:
            r = tcp_res.result_rows[0]
            ttl = int(r[0] or 0)
            # An aggregate query without GROUP BY yields one all-default row
            # even when nothing matched. The WHERE clause only admits
            # ttl > 0, so ttl == 0 means "no TCP data": skip fingerprinting
            # rather than analysing zeros.
            if ttl > 0:
                win = int(r[1] or 0)
                scale = int(r[2] or 0)
                mss = int(r[3] or 0)
                ua = str(r[4] or "")
                fp = fingerprint_os(ttl, win, scale, mss)
                dec_os = declared_os_from_ua(ua)
                spoof_res = detect_spoof(fp, dec_os)
                tcp_data = {
                    "detected": spoof_res.is_spoof,
                    "tcp_ttl": ttl,
                    "tcp_mss": mss,
                    "tcp_win_scale": scale,
                    "initial_ttl": fp.initial_ttl,
                    "hop_count": fp.hop_count,
                    "suspected_os": fp.os_name,
                    "declared_os": dec_os,
                    "confidence": fp.confidence,
                    "network_path": fp.network_path,
                    "is_bot_tool": fp.is_bot_tool,
                    "spoof_reason": spoof_res.reason,
                }

        # ── 4. JA4 rotation ───────────────────────────────────────────────────
        rot_sql = f"""
            SELECT distinct_ja4_count, total_hits
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ip_ja4_rotation
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
            LIMIT 1
        """
        rot_res = db.query(rot_sql, {"ip": clean_ip})
        rot_data = {"rotating": False, "distinct_ja4_count": 0}
        if rot_res.result_rows:
            row = rot_res.result_rows[0]
            cnt = int(row[0] or 0)
            rot_data = {"rotating": cnt > 1, "distinct_ja4_count": cnt, "total_hits": int(row[1] or 0)}

        # ── 5. Persistence ────────────────────────────────────────────────────
        pers_sql = f"""
            SELECT recurrence, worst_score, worst_threat_level, first_seen, last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ip_recurrence
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
            LIMIT 1
        """
        pers_res = db.query(pers_sql, {"ip": clean_ip})
        pers_data = {"persistent": False, "recurrence": 0}
        if pers_res.result_rows:
            row = pers_res.result_rows[0]
            pers_data = {
                "persistent": True,
                "recurrence": int(row[0] or 0),
                "worst_score": round(float(row[1] or 0), 2),
                "worst_threat_level": str(row[2] or ""),
                "first_seen": str(row[3]),
                "last_seen": str(row[4]),
            }

        # ── 6. 24h timeline ───────────────────────────────────────────────────
        tl_sql = f"""
            SELECT
                toHour(window_start) AS hour,
                sum(hits) AS hits,
                groupUniqArray(3)(ja4) AS ja4s
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
              AND window_start >= now() - INTERVAL 24 HOUR
            GROUP BY hour
            ORDER BY hour ASC
        """
        tl_res = db.query(tl_sql, {"ip": clean_ip})
        timeline = [
            {"hour": int(r[0]), "hits": int(r[1]), "ja4s": [str(j) for j in (r[2] or [])]}
            for r in tl_res.result_rows
        ]

        # ── Global risk score (heuristic) ─────────────────────────────────────
        # Each signal contributes a capped amount; the sum is clamped to 100.
        risk = 0
        risk += min(50, ml_data["max_score"] * 50)
        if bf_data["active"]:
            risk += 20
        if tcp_data["detected"]:
            if tcp_data.get("is_bot_tool"):
                risk += 30  # known scanning tool
            else:
                risk += 15  # OS spoofing
        if rot_data["rotating"]:
            risk += min(15, rot_data["distinct_ja4_count"] * 3)
        if pers_data["persistent"]:
            risk += min(10, pers_data["recurrence"] * 2)
        risk = min(100, round(risk))

        return {
            "ip": clean_ip,
            "risk_score": risk,
            "ml": ml_data,
            "bruteforce": bf_data,
            "tcp_spoofing": tcp_data,
            "ja4_rotation": rot_data,
            "persistence": pers_data,
            "timeline_24h": timeline,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,177 +0,0 @@
"""
Endpoints pour les métriques du dashboard
"""
from fastapi import APIRouter, HTTPException
from ..database import db
from ..models import MetricsResponse, MetricsSummary, TimeSeriesPoint
from ..config import settings
router = APIRouter(prefix="/api/metrics", tags=["metrics"])
@router.get("", response_model=MetricsResponse, summary="Métriques globales du dashboard")
async def get_metrics():
    """Return the global dashboard metrics for the last 24 hours.

    Combines a summary row (totals per threat level, known bots, unique IPs)
    with an hourly time series, both read from ``ml_detected_anomalies``.

    Returns:
        MetricsResponse with ``summary``, ``timeseries`` and
        ``threat_distribution`` (derived from the summary counts).

    Raises:
        HTTPException: 404 when the summary query returns no row, 500 on any
            other failure.
    """
    try:
        # Summary counters
        summary_query = f"""
            SELECT
                count() AS total_detections,
                countIf(threat_level = 'CRITICAL') AS critical_count,
                countIf(threat_level = 'HIGH') AS high_count,
                countIf(threat_level = 'MEDIUM') AS medium_count,
                countIf(threat_level = 'LOW') AS low_count,
                countIf(bot_name != '') AS known_bots_count,
                countIf(bot_name = '') AS anomalies_count,
                uniq(src_ip) AS unique_ips
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 24 HOUR
        """
        summary_result = db.query(summary_query)
        summary_row = summary_result.result_rows[0] if summary_result.result_rows else None
        if not summary_row:
            raise HTTPException(status_code=404, detail="Aucune donnée disponible")

        summary = MetricsSummary(
            total_detections=summary_row[0],
            critical_count=summary_row[1],
            high_count=summary_row[2],
            medium_count=summary_row[3],
            low_count=summary_row[4],
            known_bots_count=summary_row[5],
            anomalies_count=summary_row[6],
            unique_ips=summary_row[7]
        )

        # Hourly time series
        timeseries_query = f"""
            SELECT
                toStartOfHour(detected_at) AS hour,
                count() AS total,
                countIf(threat_level = 'CRITICAL') AS critical,
                countIf(threat_level = 'HIGH') AS high,
                countIf(threat_level = 'MEDIUM') AS medium,
                countIf(threat_level = 'LOW') AS low
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY hour
            ORDER BY hour
        """
        timeseries_result = db.query(timeseries_query)
        timeseries = [
            TimeSeriesPoint(
                hour=row[0],
                total=row[1],
                critical=row[2],
                high=row[3],
                medium=row[4],
                low=row[5]
            )
            for row in timeseries_result.result_rows
        ]

        # Distribution per threat level, reusing the summary counters
        threat_distribution = {
            "CRITICAL": summary.critical_count,
            "HIGH": summary.high_count,
            "MEDIUM": summary.medium_count,
            "LOW": summary.low_count
        }

        return MetricsResponse(
            summary=summary,
            timeseries=timeseries,
            threat_distribution=threat_distribution
        )
    except HTTPException:
        # Let the 404 above reach the client instead of becoming a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la récupération des métriques: {str(e)}")
@router.get("/threats")
async def get_threat_distribution():
    """Return the number and share of detections per threat level (last 24 hours).

    Returns:
        ``{"items": [{"threat_level", "count", "percentage"}, ...]}`` sorted
        by descending count.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        query = f"""
            SELECT
                threat_level,
                count() AS count,
                round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY threat_level
            ORDER BY count DESC
        """
        items = []
        for record in db.query(query).result_rows:
            items.append({
                "threat_level": record[0],
                "count": record[1],
                "percentage": record[2],
            })
        return {"items": items}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/baseline")
async def get_metrics_baseline():
    """Compare the last 24 hours with the 24 hours before them.

    A single query counts total detections, unique IPs and CRITICAL
    detections for both periods.

    Returns:
        A dict with the keys ``total_detections``, ``unique_ips`` and
        ``critical_alerts``; each maps to ``today``, ``yesterday`` and
        ``pct_change``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        query = f"""
            SELECT
                countIf(detected_at >= now() - INTERVAL 24 HOUR) AS today_total,
                countIf(detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_total,
                uniqIf(src_ip, detected_at >= now() - INTERVAL 24 HOUR) AS today_ips,
                uniqIf(src_ip, detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_ips,
                countIf(threat_level = 'CRITICAL' AND detected_at >= now() - INTERVAL 24 HOUR) AS today_critical,
                countIf(threat_level = 'CRITICAL' AND detected_at >= now() - INTERVAL 48 HOUR AND detected_at < now() - INTERVAL 24 HOUR) AS yesterday_critical
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL 48 HOUR
        """
        rows = db.query(query).result_rows
        record = rows[0] if rows else None

        def column(index: int) -> int:
            """Read one counter from the result row; 0 when missing or null."""
            if not record:
                return 0
            return int(record[index] or 0)

        def variation(current: int, previous: int) -> float:
            """Percentage change from ``previous`` to ``current``.

            With ``previous == 0`` the result is 100.0 when ``current`` is
            positive and 0.0 otherwise.
            """
            if previous != 0:
                return round((current - previous) / previous * 100, 1)
            return 100.0 if current > 0 else 0.0

        def comparison(current: int, previous: int) -> dict:
            """Build one today/yesterday/pct_change entry."""
            return {
                "today": current,
                "yesterday": previous,
                "pct_change": variation(current, previous),
            }

        return {
            "total_detections": comparison(column(0), column(1)),
            "unique_ips": comparison(column(2), column(3)),
            "critical_alerts": comparison(column(4), column(5)),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur baseline: {str(e)}")

View File

@ -1,428 +0,0 @@
"""
Endpoints pour les features ML / IA (scores d'anomalies, radar, scatter)
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/ml", tags=["ml_features"])
def _attack_type(fuzzing_index: float, hit_velocity: float,
is_fake_nav: int, ua_ch_mismatch: int) -> str:
"""Déduit le type d'attaque depuis les métriques comportementales."""
if fuzzing_index > 50:
return "brute_force"
if hit_velocity > 1.0:
return "flood"
if is_fake_nav:
return "scraper"
if ua_ch_mismatch:
return "spoofing"
return "scanner"
@router.get("/top-anomalies")
async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
    """Return the top IPs of the last 24 hours ranked by fuzzing index.

    Reads ``agg_host_ip_ja4_1h`` directly, LEFT JOINed with
    ``agg_header_fingerprint_1h``, instead of going through
    ``view_ai_features_1h``.

    Args:
        limit: Maximum number of IPs returned.

    Returns:
        ``{"items": [...]}`` with one dict per IP; ``attack_type`` is derived
        from the fuzzing index, the hit velocity and the UA/client-hint
        mismatch flag.

    Raises:
        HTTPException: 500 on any failure.
    """
    def _to_item(record) -> dict:
        """Convert one result row into the response item."""
        fuzz = float(record[4] or 0)
        speed = float(record[5] or 0)
        mismatch = int(record[12] or 0)
        return {
            "ip": str(record[0]),
            "ja4": str(record[1]),
            "host": str(record[2]),
            "hits": int(record[3] or 0),
            "fuzzing_index": fuzz,
            "hit_velocity": speed,
            "head_ratio": float(record[6] or 0),
            "sec_fetch_absence": float(record[7] or 0),
            "tls12_ratio": float(record[8] or 0),
            "generic_accept_ratio": float(record[9] or 0),
            "country": str(record[10] or ""),
            "asn_name": str(record[11] or ""),
            "ua_ch_mismatch": mismatch,
            "browser_score": int(record[13] or 0),
            "asn_label": str(record[14] or ""),
            "bot_name": str(record[15] or ""),
            # The fake-navigation flag is not available here, hence the 0.
            "attack_type": _attack_type(fuzz, speed, 0, mismatch),
        }

    try:
        sql = f"""
            SELECT
                replaceRegexpAll(toString(a.src_ip), '^::ffff:', '') AS ip,
                any(a.ja4) AS ja4,
                any(a.host) AS host,
                sum(a.hits) AS hits,
                round(uniqMerge(a.uniq_query_params)
                / greatest(uniqMerge(a.uniq_paths), 1), 4) AS fuzzing_index,
                round(sum(a.hits)
                / greatest(dateDiff('second', min(a.first_seen), max(a.last_seen)), 1), 2) AS hit_velocity,
                round(sum(a.count_head) / greatest(sum(a.hits), 1), 4) AS head_ratio,
                round(sum(a.count_no_sec_fetch) / greatest(sum(a.hits), 1), 4) AS sec_fetch_absence,
                round(sum(a.tls12_count) / greatest(sum(a.hits), 1), 4) AS tls12_ratio,
                round(sum(a.count_generic_accept) / greatest(sum(a.hits), 1), 4) AS generic_accept_ratio,
                any(a.src_country_code) AS country,
                any(a.src_as_name) AS asn_name,
                max(h.ua_ch_mismatch) AS ua_ch_mismatch,
                max(h.modern_browser_score) AS browser_score,
                dictGetOrDefault('{settings.CLICKHOUSE_DB_PROCESSING}.dict_asn_reputation', 'label', toUInt64(any(a.src_asn)), 'unknown') AS asn_label,
                coalesce(
                nullIf(dictGetOrDefault('{settings.CLICKHOUSE_DB_PROCESSING}.dict_bot_ja4', 'bot_name', tuple(any(a.ja4)), ''), ''),
                ''
                ) AS bot_name
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h a
            LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h h
            ON a.src_ip = h.src_ip AND a.window_start = h.window_start
            WHERE a.window_start >= now() - INTERVAL 24 HOUR
            GROUP BY a.src_ip
            ORDER BY fuzzing_index DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows
        return {"items": [_to_item(record) for record in rows]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ip/{ip}/radar")
async def get_ip_radar(ip: str):
    """Return the 8-dimension anomaly radar scores for one IP (last 24 hours).

    Averages eight behavioural features from ``view_ai_features_1h`` and
    scales them into scores.

    Args:
        ip: IPv4 address; an ``::ffff:`` prefix is stripped before querying.

    Returns:
        A dict with ``ip`` (as received) and eight ``*_score`` floats.

    Raises:
        HTTPException: 404 when no feature value exists for the IP, 500 on
            any other failure.
    """
    import math  # local import: keeps this block self-contained

    def _has_value(v) -> bool:
        """True when ``v`` is an actual number (neither None nor NaN/inf)."""
        return v is not None and math.isfinite(float(v))

    def _f(v) -> float:
        """Convert a nullable value to float (None, NaN or falsy -> 0.0)."""
        return float(v) if v and _has_value(v) else 0.0

    try:
        sql = f"""
            SELECT
                avg(fuzzing_index) AS fuzzing_index,
                avg(hit_velocity) AS hit_velocity,
                avg(is_fake_navigation) AS is_fake_navigation,
                avg(ua_ch_mismatch) AS ua_ch_mismatch,
                avg(sni_host_mismatch) AS sni_host_mismatch,
                avg(orphan_ratio) AS orphan_ratio,
                avg(path_diversity_ratio) AS path_diversity_ratio,
                avg(anomalous_payload_ratio) AS anomalous_payload_ratio
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ai_features_1h
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
              AND window_start >= now() - INTERVAL 24 HOUR
        """
        # Same normalisation as the investigation endpoint, so both the plain
        # and the IPv4-mapped spelling reach toIPv4() in dotted form.
        clean_ip = ip.replace("::ffff:", "").strip()
        result = db.query(sql, {"ip": clean_ip})
        row = result.result_rows[0] if result.result_rows else None

        # An aggregate query without GROUP BY returns a row even when nothing
        # matched, and avg() over no rows is NaN. Treat "no usable value" as
        # not found so NaN never reaches the JSON encoder.
        if row is None or not any(_has_value(v) for v in row):
            raise HTTPException(status_code=404, detail="IP not found")

        return {
            "ip": ip,
            "fuzzing_score": min(100.0, _f(row[0])),
            "velocity_score": min(100.0, _f(row[1]) * 100),
            "fake_nav_score": _f(row[2]) * 100,
            "ua_mismatch_score": _f(row[3]) * 100,
            "sni_mismatch_score": _f(row[4]) * 100,
            "orphan_score": min(100.0, _f(row[5]) * 100),
            # Inverted on purpose: low path diversity means high repetition.
            "path_repetition_score": max(0.0, 100 - _f(row[6]) * 100),
            "payload_anomaly_score": min(100.0, _f(row[7]) * 100),
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/score-distribution")
async def get_score_distribution():
    """Return the distribution of all ML scores from ``ml_all_scores`` (last 3 days).

    One query groups the scores by (threat level, model); global totals are
    accumulated in Python from the per-group conditional counts.

    Returns:
        ``{"by_model": {model: [{threat_level, total, avg_score, min_score}]},
        "totals": {...}}``. ``grand_total`` is the real number of scored rows
        (0 when the table is empty) and the ``*_pct`` values are 0.0 in that
        case.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        # Single scan: plain GROUP BY on (threat_level, model_name).
        sql = f"""
            SELECT
                threat_level,
                model_name,
                count() AS total,
                round(avg(anomaly_score), 4) AS avg_score,
                round(min(anomaly_score), 4) AS min_score,
                countIf(threat_level = 'NORMAL') AS normal_count,
                countIf(threat_level NOT IN ('NORMAL','KNOWN_BOT')) AS anomaly_count,
                countIf(threat_level = 'KNOWN_BOT') AS bot_count
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_all_scores
            WHERE detected_at >= now() - INTERVAL 3 DAY
            GROUP BY threat_level, model_name
            ORDER BY model_name, total DESC
        """
        result = db.query(sql)

        by_model: dict = {}
        grand_total = 0
        total_normal = total_anomaly = total_bot = 0
        for row in result.result_rows:
            level = str(row[0])
            model = str(row[1])
            total = int(row[2])
            grand_total += total
            total_normal += int(row[5] or 0)
            total_anomaly += int(row[6] or 0)
            total_bot += int(row[7] or 0)
            by_model.setdefault(model, []).append({
                "threat_level": level,
                "total": total,
                "avg_score": float(row[3] or 0),
                "min_score": float(row[4] or 0),
            })

        # Clamp only the divisor: the reported grand_total must stay 0 when
        # there is no data.
        denominator = max(grand_total, 1)
        return {
            "by_model": by_model,
            "totals": {
                "normal": total_normal,
                "anomaly": total_anomaly,
                "known_bot": total_bot,
                "grand_total": grand_total,
                "normal_pct": round(total_normal / denominator * 100, 1),
                "anomaly_pct": round(total_anomaly / denominator * 100, 1),
                "bot_pct": round(total_bot / denominator * 100, 1),
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/score-trends")
async def get_score_trends(hours: int = Query(72, ge=1, le=168)):
    """Return the hourly evolution of ML scores from ``ml_all_scores``.

    Args:
        hours: Size of the look-back window in hours.

    Returns:
        ``{"points": [...], "hours": hours}`` with one point per
        (hour, model): counts of normal / anomalous / known-bot rows and the
        mean anomaly score of the anomalous rows (0.0 when the bucket has
        none).

    Raises:
        HTTPException: 500 on any failure.
    """
    import math  # local import: keeps this block self-contained

    try:
        sql = f"""
            SELECT
                toStartOfHour(window_start) AS hour,
                model_name,
                countIf(threat_level = 'NORMAL') AS normal_count,
                countIf(threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')) AS anomaly_count,
                countIf(threat_level = 'KNOWN_BOT') AS bot_count,
                round(avgIf(anomaly_score, threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')), 4) AS avg_anomaly_score
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_all_scores
            WHERE window_start >= now() - INTERVAL %(hours)s HOUR
            GROUP BY hour, model_name
            ORDER BY hour ASC, model_name
        """
        result = db.query(sql, {"hours": hours})

        points = []
        for row in result.result_rows:
            # avgIf over a bucket with no anomalous row is NaN; NaN is truthy,
            # so "or 0" does not catch it and it would break JSON encoding.
            avg_score = float(row[5] or 0)
            if not math.isfinite(avg_score):
                avg_score = 0.0
            points.append({
                "hour": str(row[0]),
                "model": str(row[1]),
                "normal_count": int(row[2] or 0),
                "anomaly_count": int(row[3] or 0),
                "bot_count": int(row[4] or 0),
                "avg_anomaly_score": avg_score,
            })
        return {"points": points, "hours": hours}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/b-features")
async def get_b_features(limit: int = Query(50, ge=1, le=200)):
    """Return the aggregated B-features (pure HTTP signals) per IP (last 24 hours).

    Source: ``agg_host_ip_ja4_1h``. Only IPs exceeding at least one ratio
    threshold in the outer WHERE clause are kept; they are ordered by the sum
    of four ratios (head, sec-fetch absence, generic accept, missing
    accept-encoding).

    Args:
        limit: Maximum number of IPs returned.

    Returns:
        ``{"items": [...], "total": <len(items)>}``.

    Raises:
        HTTPException: 500 on any failure.
    """
    # Column positions 5..11 of the result, in SELECT order.
    ratio_keys = (
        "head_ratio",
        "sec_fetch_absence",
        "tls12_ratio",
        "generic_accept_ratio",
        "http10_ratio",
        "missing_accept_enc_ratio",
        "http_scheme_ratio",
    )
    try:
        sql = f"""
            SELECT ip, ja4, country, asn_name, total_hits AS hits,
            head_ratio, sec_fetch_absence, tls12_ratio, generic_accept_ratio, http10_ratio,
            missing_accept_enc_ratio, http_scheme_ratio
            FROM (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    any(ja4) AS ja4,
                    any(src_country_code) AS country,
                    any(src_as_name) AS asn_name,
                    sum(hits) AS total_hits,
                    round(sum(count_head) / greatest(sum(hits),1), 4) AS head_ratio,
                    round(sum(count_no_sec_fetch) / greatest(sum(hits),1), 4) AS sec_fetch_absence,
                    round(sum(tls12_count) / greatest(sum(hits),1), 4) AS tls12_ratio,
                    round(sum(count_generic_accept) / greatest(sum(hits),1), 4) AS generic_accept_ratio,
                    round(sum(count_http10) / greatest(sum(hits),1), 4) AS http10_ratio,
                    round(sum(count_no_accept_enc) / greatest(sum(hits),1), 4) AS missing_accept_enc_ratio,
                    round(sum(count_http_scheme) / greatest(sum(hits),1), 4) AS http_scheme_ratio
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR
                GROUP BY src_ip
            )
            WHERE sec_fetch_absence > 0.5 OR generic_accept_ratio > 0.3
            OR head_ratio > 0.1 OR tls12_ratio > 0.5 OR missing_accept_enc_ratio > 0.3
            ORDER BY (head_ratio + sec_fetch_absence + generic_accept_ratio + missing_accept_enc_ratio) DESC
            LIMIT %(limit)s
        """
        items = []
        for record in db.query(sql, {"limit": limit}).result_rows:
            entry = {
                "ip": str(record[0]),
                "ja4": str(record[1] or ""),
                "country": str(record[2] or ""),
                "asn_name": str(record[3] or ""),
                "hits": int(record[4] or 0),
            }
            for offset, key in enumerate(ratio_keys, start=5):
                entry[key] = float(record[offset] or 0)
            items.append(entry)
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/campaigns")
async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Query(20, ge=1, le=100)):
    """Return anomaly groups: stored campaigns first, /24 subnet clusters as fallback.

    Campaigns are rows of ``ml_detected_anomalies`` with ``campaign_id >= 0``
    grouped by campaign id. When none exists in the window, detections with
    threat level MEDIUM/HIGH/CRITICAL are grouped by /24 subnet and subnets
    with at least three distinct IPs are returned instead.

    Args:
        hours: Size of the look-back window in hours.
        limit: Maximum number of groups returned by each query.

    Returns:
        ``{"campaigns": [...], "total": n, "dbscan_active": bool, "hours": hours}``.
        Fallback entries carry ``campaign_id == -1`` and
        ``source == "subnet_cluster"``.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        params = {"hours": hours, "limit": limit}

        # Step 1: campaigns already labelled in the detections table.
        campaign_sql = f"""
            SELECT
                campaign_id,
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                any(threat_level) AS dominant_threat,
                groupUniqArray(3)(threat_level) AS threat_levels,
                groupUniqArray(3)(bot_name) AS bot_names,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND campaign_id >= 0
            GROUP BY campaign_id
            ORDER BY total_detections DESC
            LIMIT %(limit)s
        """
        campaigns = [
            {
                "id": f"C{rec[0]}",
                "campaign_id": int(rec[0]),
                "total_detections": int(rec[1]),
                "unique_ips": int(rec[2]),
                "dominant_threat": str(rec[3] or ""),
                "threat_levels": list(rec[4] or []),
                "bot_names": list(rec[5] or []),
                "first_seen": str(rec[6]),
                "last_seen": str(rec[7]),
                "source": "dbscan",
            }
            for rec in db.query(campaign_sql, params).result_rows
        ]

        # Step 2: fall back to subnet clustering when no campaign was found.
        if len(campaigns) == 0:
            subnet_sql = f"""
                SELECT
                    IPv4CIDRToRange(toIPv4(replaceRegexpAll(toString(src_ip),'^::ffff:','')), 24).1 AS subnet,
                    count() AS total_detections,
                    uniq(src_ip) AS unique_ips,
                    groupArray(3)(threat_level) AS threat_levels,
                    any(bot_name) AS bot_name,
                    any(ja4) AS sample_ja4,
                    min(detected_at) AS first_seen,
                    max(detected_at) AS last_seen
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
                  AND threat_level IN ('HIGH','CRITICAL','MEDIUM')
                GROUP BY subnet
                HAVING unique_ips >= 3
                ORDER BY total_detections DESC
                LIMIT %(limit)s
            """
            fallback_rows = db.query(subnet_sql, params).result_rows
            for position, rec in enumerate(fallback_rows, start=1):
                levels = rec[3]
                campaigns.append({
                    "id": f"S{position:03d}",
                    "campaign_id": -1,
                    "subnet": str(rec[0]) + "/24",
                    "total_detections": int(rec[1]),
                    "unique_ips": int(rec[2]),
                    # First collected threat level stands in for the dominant one.
                    "dominant_threat": str((levels or [""])[0]),
                    "threat_levels": list(levels or []),
                    "bot_names": [str(rec[4] or "")],
                    "sample_ja4": str(rec[5] or ""),
                    "first_seen": str(rec[6]),
                    "last_seen": str(rec[7]),
                    "source": "subnet_cluster",
                })

        # True only when at least one real campaign id is present.
        has_real_campaign = any(entry["campaign_id"] >= 0 for entry in campaigns)
        return {
            "campaigns": campaigns,
            "total": len(campaigns),
            "dbscan_active": has_real_campaign,
            "hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/scatter")
async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
    """Return scatter-plot points (fuzzing index x hit velocity) per IP (last 24 hours).

    Reads ``agg_host_ip_ja4_1h`` directly instead of ``view_ai_features_1h``.
    The query also selects ``head_ratio`` and ``correlated``, which are not
    included in the response.

    Args:
        limit: Maximum number of points returned.

    Returns:
        ``{"points": [{ip, ja4, fuzzing_index, hit_velocity, hits, attack_type}]}``.

    Raises:
        HTTPException: 500 on any failure.
    """
    def _to_point(record) -> dict:
        """Convert one result row into a scatter point."""
        fuzz = float(record[2] or 0)
        speed = float(record[3] or 0)
        return {
            "ip": str(record[0]),
            "ja4": str(record[1]),
            "fuzzing_index": fuzz,
            "hit_velocity": speed,
            "hits": int(record[4] or 0),
            # Only the two plotted metrics feed the label here.
            "attack_type": _attack_type(fuzz, speed, 0, 0),
        }

    try:
        sql = f"""
            SELECT
                ip,
                ja4,
                round(fuzzing_index, 4) AS fuzzing_index,
                round(total_hits / greatest(dateDiff('second', min_first, max_last), 1), 2) AS hit_velocity,
                total_hits AS hits,
                round(total_count_head / greatest(total_hits, 1), 4) AS head_ratio,
                correlated
            FROM (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    any(ja4) AS ja4,
                    uniqMerge(uniq_query_params) / greatest(uniqMerge(uniq_paths), 1) AS fuzzing_index,
                    sum(hits) AS total_hits,
                    min(first_seen) AS min_first,
                    max(last_seen) AS max_last,
                    sum(count_head) AS total_count_head,
                    max(correlated_raw) AS correlated
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR
                GROUP BY src_ip
            )
            ORDER BY fuzzing_index DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows
        return {"points": [_to_point(record) for record in rows]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,53 @@
"""HTML page routes served via Jinja2 templates."""
from __future__ import annotations
from fastapi import APIRouter, Request
from fastapi.templating import Jinja2Templates
router = APIRouter()
templates = Jinja2Templates(directory="backend/templates")
def _ctx(request: Request, page: str, **extra) -> dict:
return {"request": request, "active_page": page, **extra}
@router.get("/")
async def overview(request: Request):
    """Render ``overview.html`` with ``overview`` as the active page."""
    context = _ctx(request, "overview")
    return templates.TemplateResponse("overview.html", context)
@router.get("/detections")
async def detections(request: Request):
    """Render ``detections.html`` with ``detections`` as the active page."""
    context = _ctx(request, "detections")
    return templates.TemplateResponse("detections.html", context)
@router.get("/scores")
async def scores(request: Request):
    """Render ``scores.html`` with ``scores`` as the active page."""
    context = _ctx(request, "scores")
    return templates.TemplateResponse("scores.html", context)
@router.get("/traffic")
async def traffic(request: Request):
    """Render ``traffic.html`` with ``traffic`` as the active page."""
    context = _ctx(request, "traffic")
    return templates.TemplateResponse("traffic.html", context)
@router.get("/ip/{ip}")
async def ip_detail(request: Request, ip: str):
    """Render ``ip_detail.html``, exposing the path's ``ip`` to the template."""
    context = _ctx(request, "ip_detail", ip=ip)
    return templates.TemplateResponse("ip_detail.html", context)
@router.get("/classify")
async def classify(request: Request):
    """Render ``classify.html`` with ``classify`` as the active page."""
    context = _ctx(request, "classify")
    return templates.TemplateResponse("classify.html", context)
@router.get("/features")
async def features(request: Request):
    """Render ``features.html`` with ``features`` as the active page."""
    context = _ctx(request, "features")
    return templates.TemplateResponse("features.html", context)
@router.get("/models")
async def models(request: Request):
    """Render ``models.html`` with ``models`` as the active page."""
    context = _ctx(request, "models")
    return templates.TemplateResponse("models.html", context)

View File

@ -1,125 +0,0 @@
"""
Routes pour la réputation IP (bases de données publiques)
"""
from fastapi import APIRouter, HTTPException, Path
from typing import Dict, Any
import re
from ..services.reputation_ip import get_reputation_service
router = APIRouter(prefix="/api/reputation", tags=["Reputation"])
# Dotted-quad shape check for IPv4. re.ASCII restricts \d to 0-9 so that
# non-ASCII digits (which int() would happily convert) are rejected.
IP_PATTERN = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$', re.ASCII)


def is_valid_ipv4(ip: str) -> bool:
    """Return True when *ip* is a dotted-quad IPv4 address with octets in 0-255.

    Leading zeros inside an octet (``"01.2.3.4"``) are accepted, as before.
    """
    # fullmatch() instead of match(): with match(), the pattern's '$' also
    # matches just before a trailing newline, so "1.2.3.4\n" used to pass.
    if not IP_PATTERN.fullmatch(ip):
        return False
    # The pattern guarantees 1-3 ASCII digits per octet, so int() cannot fail
    # and the value cannot be negative; only the upper bound needs checking.
    return all(int(octet) <= 255 for octet in ip.split('.'))
@router.get("/ip/{ip_address}", summary="Réputation complète d'une IP")
async def get_ip_reputation(
    ip_address: str = Path(..., description="Adresse IP à vérifier")
) -> Dict[str, Any]:
    """Return the reputation of an IPv4 address as reported by the reputation service.

    The previous docstring describes the sources as public, key-less databases
    (IP-API.com for geolocation and proxy/hosting detection, IPinfo.io for
    ASN/organisation) and the payload as containing ``ip``, ``timestamp``,
    ``sources`` and an ``aggregated`` section (``is_proxy``, ``is_hosting``,
    ``is_vpn``, ``is_tor``, ``threat_score`` 0-100, ``threat_level``,
    ``country``, ``asn``, ``asn_org``, ``org``, ``warnings``). This handler
    returns whatever the service produces without reshaping it.

    Raises:
        HTTPException: 400 when ``ip_address`` is not a valid IPv4 address,
            500 when the reputation lookup fails.
    """
    if is_valid_ipv4(ip_address):
        try:
            service = get_reputation_service()
            return await service.get_reputation(ip_address)
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Erreur lors de la vérification de réputation: {str(e)}"
            )

    # Anything that is not a dotted-quad IPv4 address is rejected up front.
    raise HTTPException(
        status_code=400,
        detail=f"Adresse IP invalide: {ip_address}. Format attendu: x.x.x.x"
    )
@router.get("/ip/{ip_address}/summary", summary="Réputation simplifiée d'une IP")
async def get_ip_reputation_summary(
    ip_address: str = Path(..., description="Adresse IP à vérifier")
) -> Dict[str, Any]:
    """Return a condensed view of an IP's reputation (essential fields only).

    Intended for quick display in tables: only a handful of fields from the
    service's ``aggregated`` section are kept, plus the number of warnings.

    Raises:
        HTTPException: 400 when ``ip_address`` is not a valid IPv4 address,
            500 when the reputation lookup fails.
    """
    if is_valid_ipv4(ip_address):
        try:
            service = get_reputation_service()
            report = await service.get_reputation(ip_address)
            agg = report.get('aggregated', {})
            warnings = agg.get('warnings', [])
            return {
                'ip': ip_address,
                'threat_level': agg.get('threat_level', 'unknown'),
                'threat_score': agg.get('threat_score', 0),
                'is_proxy': agg.get('is_proxy', False),
                'is_hosting': agg.get('is_hosting', False),
                'country': agg.get('country'),
                'country_code': agg.get('country_code'),
                'asn': agg.get('asn'),
                'org': agg.get('org'),
                'warnings_count': len(warnings)
            }
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Erreur: {str(e)}"
            )

    raise HTTPException(
        status_code=400,
        detail=f"Adresse IP invalide: {ip_address}"
    )

View File

@ -1,219 +0,0 @@
"""
Endpoints pour la détection de la rotation de fingerprints JA4 et des menaces persistantes
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/rotation", tags=["rotation"])
@router.get("/ja4-rotators")
async def get_ja4_rotators(limit: int = Query(50, ge=1, le=500)):
    """List the IPs that rotate through the most distinct JA4 fingerprints.

    Returns:
        ``{"items": [...], "total": n}``; each item has ``ip``,
        ``distinct_ja4_count``, ``total_hits`` and an ``evasion_score``
        (15 points per distinct JA4, capped at 100).

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                distinct_ja4_count,
                total_hits
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ip_ja4_rotation
            ORDER BY distinct_ja4_count DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows

        def _item(record):
            ja4_count = int(record[1])
            return {
                "ip": str(record[0]),
                "distinct_ja4_count": ja4_count,
                "total_hits": int(record[2]),
                "evasion_score": min(100, ja4_count * 15),
            }

        items = [_item(record) for record in rows]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/persistent-threats")
async def get_persistent_threats(limit: int = Query(100, ge=1, le=1000)):
    """List persistent threats ordered by persistence score.

    The score is ``recurrence * 20 + worst_score * 50`` capped at 100; it is
    computed in SQL for ordering and again in Python for the response.

    Returns:
        ``{"items": [...], "total": n}``; each item has ``ip``, ``recurrence``,
        ``worst_score``, ``worst_threat_level``, ``first_seen``, ``last_seen``
        and ``persistence_score``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                recurrence,
                worst_score,
                worst_threat_level,
                first_seen,
                last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ip_recurrence
            ORDER BY (least(100, recurrence * 20 + worst_score * 50)) DESC
            LIMIT %(limit)s
        """
        rows = db.query(sql, {"limit": limit}).result_rows

        def _item(record):
            seen_count = int(record[1])
            peak = float(record[2] or 0)
            return {
                "ip": str(record[0]),
                "recurrence": seen_count,
                "worst_score": peak,
                "worst_threat_level": str(record[3] or ""),
                "first_seen": str(record[4]),
                "last_seen": str(record[5]),
                "persistence_score": min(100, seen_count * 20 + peak * 50),
            }

        items = [_item(record) for record in rows]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/ip/{ip}/ja4-history")
async def get_ip_ja4_history(ip: str):
    """Return the JA4 fingerprints used by one IP, most-hit first.

    ``ip`` is converted in SQL with ``toIPv4`` and mapped to IPv6 before being
    compared to ``src_ip``.

    Returns:
        ``{"ip": ip, "ja4_history": [...], "total": n}``; each entry has
        ``ja4``, ``hits``, ``first_seen`` and ``last_seen``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                ja4,
                sum(hits) AS hits,
                min(window_start) AS first_seen,
                max(window_start) AS last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE src_ip = IPv4MappedToIPv6(toIPv4(%(ip)s))
            GROUP BY ja4
            ORDER BY hits DESC
        """
        history = []
        for record in db.query(sql, {"ip": ip}).result_rows:
            history.append({
                "ja4": str(record[0]),
                "hits": int(record[1]),
                "first_seen": str(record[2]),
                "last_seen": str(record[3]),
            })
        return {"ip": ip, "ja4_history": history, "total": len(history)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/sophistication")
async def get_sophistication(limit: int = Query(50, ge=1, le=500)):
    """Rank IPs by adversary sophistication (JA4 rotation + recurrence + brute force).

    The score is computed entirely in one SQL query joining the three views, so
    Python only labels the rows that come back. Tiers: above 80 "APT-like",
    above 50 "Advanced", above 20 "Automated", otherwise "Basic".

    Returns:
        ``{"items": [...], "total": n}``; each item has ``ip``,
        ``ja4_rotation_count``, ``recurrence``, ``bruteforce_hits``,
        ``sophistication_score`` and ``tier``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    # Strict lower bounds, checked from the highest tier down.
    tier_floors = ((80, "APT-like"), (50, "Advanced"), (20, "Automated"))
    try:
        sql = f"""
            SELECT
                r.ip,
                r.distinct_ja4_count,
                coalesce(rec.recurrence, 0) AS recurrence,
                coalesce(bf.bruteforce_hits, 0) AS bruteforce_hits,
                round(least(100.0,
                    r.distinct_ja4_count * 10
                    + coalesce(rec.recurrence, 0) * 20
                    + least(30.0, log(coalesce(bf.bruteforce_hits, 0) + 1) * 5)
                ), 1) AS sophistication_score
            FROM (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    distinct_ja4_count
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ip_ja4_rotation
            ) r
            LEFT JOIN (
                -- Utilise view_ip_recurrence (pré-agrégée) au lieu de ml_detected_anomalies FINAL
                -- FINAL force une déduplication complète du ReplacingMergeTree — très coûteux
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    recurrence
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ip_recurrence
            ) rec ON r.ip = rec.ip
            LEFT JOIN (
                SELECT
                    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                    sum(hits) AS bruteforce_hits
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_form_bruteforce_detected
                GROUP BY ip
            ) bf ON r.ip = bf.ip
            ORDER BY sophistication_score DESC
            LIMIT %(limit)s
        """
        items = []
        for record in db.query(sql, {"limit": limit}).result_rows:
            score = float(record[4] or 0)
            tier = next(
                (label for floor, label in tier_floors if score > floor),
                "Basic",
            )
            items.append({
                "ip": str(record[0]),
                "ja4_rotation_count": int(record[1] or 0),
                "recurrence": int(record[2] or 0),
                "bruteforce_hits": int(record[3] or 0),
                "sophistication_score": score,
                "tier": tier,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/proactive-hunt")
async def get_proactive_hunt(
    min_recurrence: int = Query(2, ge=1, description="Récurrence minimale"),
    min_days: int = Query(2, ge=0, description="Jours d'activité minimum"),
    limit: int = Query(50, ge=1, le=500),
):
    """List IPs flying under the radar: recurrent, yet below the usual detection threshold.

    Keeps rows of ``view_ip_recurrence`` whose recurrence is at least
    ``min_recurrence``, whose ``abs(worst_score)`` is below 0.5 and whose
    first/last sightings are at least ``min_days`` days apart.

    Returns:
        ``{"items": [...], "total": n}``; each item has ``ip``, ``recurrence``,
        ``worst_score`` (rounded to 4 decimals), ``worst_threat_level``,
        ``first_seen``, ``last_seen``, ``days_active`` and ``risk_assessment``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                recurrence,
                worst_score,
                worst_threat_level,
                first_seen,
                last_seen,
                dateDiff('day', first_seen, last_seen) AS days_active
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ip_recurrence
            WHERE recurrence >= %(min_recurrence)s
              AND abs(worst_score) < 0.5
              AND dateDiff('day', first_seen, last_seen) >= %(min_days)s
            ORDER BY recurrence DESC, worst_score ASC
            LIMIT %(limit)s
        """
        result = db.query(sql, {
            "min_recurrence": min_recurrence,
            "min_days": min_days,
            "limit": limit,
        })
        items = []
        for row in result.result_rows:
            recurrence = int(row[1])
            worst_score = float(row[2] or 0)
            days_active = int(row[6] or 0)
            # The SQL filter works on abs(worst_score), so scores may be
            # negative. Use the same magnitude here: with the raw value a score
            # of exactly -0.1 divided by zero (failing the whole request) and
            # any score below -0.1 flipped the ratio's sign.
            ratio = recurrence / (abs(worst_score) + 0.1)
            risk = "Évadeur potentiel" if ratio > 10 else "Persistant modéré"
            items.append({
                "ip": str(row[0]),
                "recurrence": recurrence,
                "worst_score": round(worst_score, 4),
                "worst_threat_level": str(row[3] or ""),
                "first_seen": str(row[4]),
                "last_seen": str(row[5]),
                "days_active": days_active,
                "risk_assessment": risk,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,126 +0,0 @@
"""
Endpoint de recherche globale rapide — utilisé par la barre Cmd+K
"""
from fastapi import APIRouter, Query
from ..database import db
from ..config import settings
router = APIRouter(prefix="/api/search", tags=["search"])
IP_RE = r"^(\d{1,3}\.){0,3}\d{1,3}$"
@router.get("/quick")
async def quick_search(q: str = Query(..., min_length=1, max_length=100)):
    """Run a unified quick search over IPs, JA4 fingerprints, hosts and ASNs.

    Each category is searched with a case-insensitive substring match
    (``ilike`` on ``%q%``) over the last 24 hours of ``ml_detected_anomalies``
    and contributes at most 5 results, ordered by hit count. ``%`` and ``_``
    inside ``q`` are not escaped and therefore act as LIKE wildcards.

    Returns:
        ``{"query": q, "results": [...]}``; every result has ``type``,
        ``value``, ``label``, ``meta`` and ``url`` (IP results also carry
        ``investigation_url``).
    """
    q = q.strip()
    pattern = f"%{q}%"
    results = []

    # -- IPs -----------------------------------------------------------------
    # LIMIT 5 enforces the documented "up to 5 results per category"; without
    # it a short query such as "1" returned every matching IP.
    ip_rows = db.query(
        f"""
        SELECT
            replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
            count() AS hits,
            max(detected_at) AS last_seen,
            any(threat_level) AS threat_level
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE ilike(toString(src_ip), %(p)s)
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY clean_ip
        ORDER BY hits DESC
        LIMIT 5
        """,
        {"p": pattern},
    )
    for r in ip_rows.result_rows:
        ip = str(r[0])
        results.append({
            "type": "ip",
            "value": ip,
            "label": ip,
            "meta": f"{r[1]} détections · {r[3]}",
            "url": f"/detections/ip/{ip}",
            "investigation_url": f"/investigation/{ip}",
        })

    # -- JA4 fingerprints ----------------------------------------------------
    ja4_rows = db.query(
        f"""
        SELECT
            ja4,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE ilike(ja4, %(p)s)
          AND ja4 != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY ja4
        ORDER BY hits DESC
        LIMIT 5
        """,
        {"p": pattern},
    )
    for r in ja4_rows.result_rows:
        results.append({
            "type": "ja4",
            "value": str(r[0]),
            "label": str(r[0]),
            "meta": f"{r[1]} détections · {r[2]} IPs",
            "url": f"/investigation/ja4/{r[0]}",
        })

    # -- Hosts ---------------------------------------------------------------
    host_rows = db.query(
        f"""
        SELECT
            host,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE ilike(host, %(p)s)
          AND host != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY host
        ORDER BY hits DESC
        LIMIT 5
        """,
        {"p": pattern},
    )
    for r in host_rows.result_rows:
        results.append({
            "type": "host",
            "value": str(r[0]),
            "label": str(r[0]),
            "meta": f"{r[1]} hits · {r[2]} IPs",
            "url": f"/detections?search={r[0]}",
        })

    # -- ASN -----------------------------------------------------------------
    asn_rows = db.query(
        f"""
        SELECT
            asn_org,
            asn_number,
            count() AS hits,
            uniq(src_ip) AS unique_ips
        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE (ilike(asn_org, %(p)s) OR ilike(asn_number, %(p)s))
          AND asn_org != '' AND asn_number != ''
          AND detected_at >= now() - INTERVAL 24 HOUR
        GROUP BY asn_org, asn_number
        ORDER BY hits DESC
        LIMIT 5
        """,
        {"p": pattern},
    )
    for r in asn_rows.result_rows:
        results.append({
            "type": "asn",
            "value": str(r[1]),
            "label": f"AS{r[1]}{r[0]}",
            "meta": f"{r[2]} hits · {r[3]} IPs",
            "url": f"/detections?asn={r[1]}",
        })

    return {"query": q, "results": results}

View File

@ -1,224 +0,0 @@
"""
Endpoints pour la détection du TCP spoofing / fingerprinting OS
Approche multi-signal (p0f-style) :
- TTL initial estimé → famille OS (Linux/Mac=64, Windows=128, Cisco/BSD=255)
- MSS → type de réseau (Ethernet=1460, PPPoE=1452, VPN=1380-1420)
- Taille de fenêtre → signature OS précise
- Facteur d'échelle → affine la version kernel/stack TCP
Détection bots : signatures connues (Masscan/ZMap/Mirai) identifiées par combinaison
win+scale+mss indépendamment de l'UA.
"""
from fastapi import APIRouter, HTTPException, Query
from ..database import db
from ..services.tcp_fingerprint import (
fingerprint_os,
detect_spoof,
declared_os_from_ua,
)
from ..config import settings
router = APIRouter(prefix="/api/tcp-spoofing", tags=["tcp_spoofing"])
@router.get("/overview")
async def get_tcp_spoofing_overview():
    """Return global TCP fingerprint statistics for the last 24 hours.

    Combines one summary query over ``agg_host_ip_ja4_1h`` (row/IP counts, TTL
    buckets, and rows matching the win=5808 / mss=1452 / scale=4 combination)
    with three bounded distributions: top 15 TTL values, top 12 MSS values and
    top 10 window sizes.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                count() AS total_entries,
                uniq(src_ip) AS unique_ips,
                countIf(tcp_ttl_raw = 0) AS no_tcp_data,
                countIf(tcp_ttl_raw > 0) AS with_tcp_data,
                countIf(tcp_ttl_raw > 0 AND tcp_ttl_raw <= 64) AS linux_mac_fp,
                countIf(tcp_ttl_raw > 64 AND tcp_ttl_raw <= 128) AS windows_fp,
                countIf(tcp_ttl_raw > 128) AS cisco_bsd_fp,
                countIf(tcp_win_raw = 5808 AND tcp_mss_raw = 1452 AND tcp_scale_raw = 4) AS bot_scanner_fp
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR
        """
        result = db.query(sql)
        row = result.result_rows[0]

        # TTL distribution (top 15). The LIMIT was missing, so the endpoint
        # returned every distinct value instead of the documented top N.
        ttl_sql = f"""
            SELECT tcp_ttl_raw AS ttl, count() AS cnt, uniq(src_ip) AS ips
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY ttl ORDER BY cnt DESC
            LIMIT 15
        """
        ttl_res = db.query(ttl_sql)

        # MSS distribution (top 12).
        mss_sql = f"""
            SELECT tcp_mss_raw AS mss, count() AS cnt, uniq(src_ip) AS ips
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_mss_raw > 0
            GROUP BY mss ORDER BY cnt DESC
            LIMIT 12
        """
        mss_res = db.query(mss_sql)

        # Window-size distribution (top 10).
        win_sql = f"""
            SELECT tcp_win_raw AS win, count() AS cnt
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY win ORDER BY cnt DESC
            LIMIT 10
        """
        win_res = db.query(win_sql)

        return {
            "total_entries": int(row[0]),
            "unique_ips": int(row[1]),
            "no_tcp_data": int(row[2]),
            "with_tcp_data": int(row[3]),
            "linux_mac_fingerprint": int(row[4]),
            "windows_fingerprint": int(row[5]),
            "cisco_bsd_fingerprint": int(row[6]),
            "bot_scanner_fingerprint": int(row[7]),
            "ttl_distribution": [
                {"ttl": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
                for r in ttl_res.result_rows
            ],
            "mss_distribution": [
                {"mss": int(r[0]), "count": int(r[1]), "ips": int(r[2])}
                for r in mss_res.result_rows
            ],
            "window_size_distribution": [
                {"window_size": int(r[0]), "count": int(r[1])}
                for r in win_res.result_rows
            ],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/list")
async def get_tcp_spoofing_list(
    limit: int = Query(100, ge=1, le=1000),
    offset: int = Query(0, ge=0),
    spoof_only: bool = Query(False, description="Retourner uniquement les spoofs/bots confirmés"),
):
    """List (IP, JA4) pairs with their multi-signal TCP fingerprint.

    Each row of the last 24 hours with TCP data is fingerprinted from TTL,
    window size, window scale and MSS, then compared with the OS declared by
    the first User-Agent seen.

    ``spoof_only`` is applied after the SQL page is fetched, so a page can hold
    fewer than ``limit`` items; ``total`` always counts every (IP, JA4) pair,
    whatever the value of ``spoof_only``.

    Returns:
        ``{"items": [...], "total": n}``; items include the raw TCP values and
        the enriched fields ``suspected_os``, ``initial_ttl``, ``hop_count``,
        ``confidence``, ``network_path``, ``is_bot_tool``, ``declared_os``,
        ``spoof_flag`` and ``spoof_reason``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        count_sql = f"""
            SELECT count() FROM (
                SELECT src_ip, ja4
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
                GROUP BY src_ip, ja4
            )
        """
        total = int(db.query(count_sql).result_rows[0][0])

        sql = f"""
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS src_ip,
                ja4,
                any(tcp_ttl_raw) AS tcp_ttl,
                any(tcp_win_raw) AS tcp_window_size,
                any(tcp_scale_raw) AS tcp_win_scale,
                any(tcp_mss_raw) AS tcp_mss,
                any(first_ua) AS first_ua,
                sum(hits) AS hits
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY src_ip, ja4
            ORDER BY hits DESC
            LIMIT %(limit)s OFFSET %(offset)s
        """
        page = db.query(sql, {"limit": limit, "offset": offset}).result_rows

        items = []
        for record in page:
            address = str(record[0])
            fingerprint_ja4 = str(record[1] or "")
            ttl = int(record[2] or 0)
            window = int(record[3] or 0)
            win_scale = int(record[4] or 0)
            mss = int(record[5] or 0)
            user_agent = str(record[6] or "")
            hit_count = int(record[7] or 0)

            os_guess = fingerprint_os(ttl, window, win_scale, mss)
            declared = declared_os_from_ua(user_agent)
            verdict = detect_spoof(os_guess, declared)

            # Keep everything unless the caller asked for confirmed spoofs only.
            if not spoof_only or verdict.is_spoof:
                items.append({
                    "ip": address,
                    "ja4": fingerprint_ja4,
                    "tcp_ttl": ttl,
                    "tcp_window_size": window,
                    "tcp_win_scale": win_scale,
                    "tcp_mss": mss,
                    "hits": hit_count,
                    "first_ua": user_agent,
                    "suspected_os": os_guess.os_name,
                    "initial_ttl": os_guess.initial_ttl,
                    "hop_count": os_guess.hop_count,
                    "confidence": os_guess.confidence,
                    "network_path": os_guess.network_path,
                    "is_bot_tool": os_guess.is_bot_tool,
                    "declared_os": declared,
                    "spoof_flag": verdict.is_spoof,
                    "spoof_reason": verdict.reason,
                })
        return {"items": items, "total": total}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/matrix")
async def get_tcp_spoofing_matrix():
    """Build the suspected-OS x declared-OS matrix from multi-signal fingerprints.

    Every (IP, JA4) pair of the last 24 hours with TCP data is fingerprinted;
    row counts are then summed per (suspected OS, declared OS, is_spoof,
    is_bot_tool) combination and returned largest first.

    Returns:
        ``{"matrix": [...]}``; each cell has ``suspected_os``, ``declared_os``,
        ``count``, ``is_spoof`` and ``is_bot_tool``.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        sql = f"""
            SELECT
                any(tcp_ttl_raw) AS ttl,
                any(tcp_win_raw) AS win,
                any(tcp_scale_raw) AS scale,
                any(tcp_mss_raw) AS mss,
                any(first_ua) AS ua,
                count() AS cnt
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR AND tcp_ttl_raw > 0
            GROUP BY src_ip, ja4
        """
        tally: dict = {}
        for record in db.query(sql).result_rows:
            ttl = int(record[0] or 0)
            window = int(record[1] or 0)
            win_scale = int(record[2] or 0)
            mss = int(record[3] or 0)
            user_agent = str(record[4] or "")
            weight = int(record[5] or 1)

            os_guess = fingerprint_os(ttl, window, win_scale, mss)
            declared = declared_os_from_ua(user_agent)
            verdict = detect_spoof(os_guess, declared)

            cell = (os_guess.os_name, declared, verdict.is_spoof, os_guess.is_bot_tool)
            if cell in tally:
                tally[cell] += weight
            else:
                tally[cell] = weight

        matrix = sorted(
            (
                {
                    "suspected_os": suspected,
                    "declared_os": declared,
                    "count": total,
                    "is_spoof": spoofed,
                    "is_bot_tool": bot_tool,
                }
                for (suspected, declared, spoofed, bot_tool), total in tally.items()
            ),
            key=lambda entry: entry["count"],
            reverse=True,
        )
        return {"matrix": matrix}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -1,707 +0,0 @@
"""
Endpoints pour la variabilité des attributs
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional
from ..database import db
from ..models import (
VariabilityResponse, VariabilityAttributes, AttributeValue, Insight,
UserAgentsResponse, UserAgentValue
)
from ..config import settings
router = APIRouter(prefix="/api/variability", tags=["variability"])
# =============================================================================
# ROUTES SPÉCIFIQUES (doivent être avant les routes génériques)
# =============================================================================
@router.get("/{attr_type}/{value:path}/ips", response_model=dict)
async def get_associated_ips(
    attr_type: str,
    value: str,
    limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'IPs")
):
    """Return the source IPs associated with an attribute value over the last 24 hours.

    ``attr_type`` selects the filtered column (ip, ja4, country, asn, host).
    Percentages are relative to the hits of the returned rows only, while
    ``total`` is the distinct-IP count for the whole match.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type``, 500 on any other
            failure.
    """
    try:
        # Attribute type -> column of ml_detected_anomalies.
        columns_by_type = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
        }
        column = columns_by_type.get(attr_type)
        if column is None:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(columns_by_type.keys())}"
            )

        query = f"""
            SELECT src_ip, count() AS hit_count
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {column} = %(value)s
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
            ORDER BY hit_count DESC
            LIMIT %(limit)s
        """
        rows = db.query(query, {"value": value, "limit": limit}).result_rows

        # "or 1" keeps the percentage division safe when nothing matched.
        shown_hits = sum(record[1] for record in rows) or 1
        ips = []
        for record in rows:
            ips.append({
                "ip": str(record[0]),
                "count": record[1],
                "percentage": round(record[1] * 100.0 / shown_hits, 2),
            })

        # Distinct-IP total, independent of the LIMIT above.
        count_query = f"""
            SELECT uniq(src_ip) AS total
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {column} = %(value)s
              AND detected_at >= now() - INTERVAL 24 HOUR
        """
        count_rows = db.query(count_query, {"value": value}).result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "ips": ips,
            "total": total,
            "showing": len(ips)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{attr_type}/{value:path}/attributes", response_model=dict)
async def get_associated_attributes(
    attr_type: str,
    value: str,
    target_attr: str = Query(..., description="Type d'attribut à récupérer (user_agents, ja4, countries, asns, hosts)"),
    limit: int = Query(50, ge=1, le=500, description="Nombre maximum de résultats")
):
    """Return the values of one attribute associated with another (e.g. User-Agents for a country).

    ``attr_type``/``value`` select the rows; ``target_attr`` names what to
    list. User-Agents are read from ``view_dashboard_user_agents``; every
    other target comes from ``ml_detected_anomalies``. All queries cover the
    last 24 hours.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type`` or ``target_attr``,
            500 on any other failure.
    """
    try:
        # Attribute type -> filter column.
        source_columns = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
        }
        # Target attribute -> listed column; None marks the User-Agent case,
        # which is served from view_dashboard_user_agents below.
        target_columns = {
            "user_agents": None,
            "ja4": "ja4",
            "countries": "country_code",
            "asns": "asn_number",
            "hosts": "host",
        }

        if attr_type not in source_columns:
            raise HTTPException(status_code=400, detail=f"Type '{attr_type}' invalide")
        if target_attr not in target_columns:
            raise HTTPException(
                status_code=400,
                detail=f"Attribut cible invalide. Supportés: {', '.join(target_columns.keys())}"
            )

        column = source_columns[attr_type]
        target_column = target_columns[target_attr]
        query_params = {"value": value, "limit": limit}

        if target_column is None:
            # View columns: src_ip, ja4, hour, log_date, user_agents, requests.
            if attr_type == "ip":
                ua_where = "toString(src_ip) = %(value)s"
            elif attr_type == "ja4":
                ua_where = "ja4 = %(value)s"
            else:
                # country/asn/host: resolve to IPs through ml_detected_anomalies.
                ua_where = f"""toString(src_ip) IN (
                    SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                    FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                    WHERE {column} = %(value)s AND detected_at >= now() - INTERVAL 24 HOUR
                )"""
            ua_q = f"""
                SELECT ua AS value, sum(requests) AS count,
                    round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE {ua_where}
                  AND hour >= now() - INTERVAL 24 HOUR AND ua != ''
                GROUP BY value ORDER BY count DESC LIMIT %(limit)s
            """
            ua_items = []
            for record in db.query(ua_q, query_params).result_rows:
                ua_items.append({
                    "value": str(record[0]),
                    "count": record[1],
                    "percentage": round(float(record[2]), 2) if record[2] else 0.0,
                })
            return {
                "type": attr_type,
                "value": value,
                "target": target_attr,
                "items": ua_items,
                "total": len(ua_items),
                "showing": len(ua_items),
            }

        query = f"""
            SELECT
                {target_column} AS value,
                count() AS count,
                round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {column} = %(value)s
              AND {target_column} != '' AND {target_column} IS NOT NULL
              AND detected_at >= now() - INTERVAL 24 HOUR
            GROUP BY value
            ORDER BY count DESC
            LIMIT %(limit)s
        """
        items = []
        for record in db.query(query, query_params).result_rows:
            items.append({
                "value": str(record[0]),
                "count": record[1],
                "percentage": round(float(record[2]), 2) if record[2] else 0.0
            })

        # Distinct total, independent of the LIMIT above.
        count_query = f"""
            SELECT uniq({target_column}) AS total
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {column} = %(value)s
              AND {target_column} != '' AND {target_column} IS NOT NULL
              AND detected_at >= now() - INTERVAL 24 HOUR
        """
        count_rows = db.query(count_query, {"value": value}).result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "target": target_attr,
            "items": items,
            "total": total,
            "showing": len(items)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
@router.get("/{attr_type}/{value:path}/user_agents", response_model=UserAgentsResponse)
async def get_user_agents(
    attr_type: str,
    value: str,
    limit: int = Query(100, ge=1, le=500, description="Nombre maximum de user-agents")
):
    """Return the User-Agents associated with an attribute (IP, JA4, country, ASN, host).

    Data comes from ``view_dashboard_user_agents`` over the last 24 hours. For
    country/ASN/host the matching IPs are first resolved through
    ``ml_detected_anomalies`` and the view is then filtered by ``src_ip``.

    Raises:
        HTTPException: 400 for an unsupported ``attr_type``, 500 on any other
            failure.
    """
    try:
        # Attribute type -> column of ml_detected_anomalies.
        columns_by_type = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
        }
        if attr_type not in columns_by_type:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(columns_by_type.keys())}"
            )

        # View columns: src_ip, ja4, hour, log_date, user_agents, requests.
        params: dict = {"value": value, "limit": limit}
        if attr_type == "ip":
            where = "toString(src_ip) = %(value)s"
        elif attr_type == "ja4":
            where = "ja4 = %(value)s"
        else:
            ml_col = columns_by_type[attr_type]
            where = f"""toString(src_ip) IN (
                SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE {ml_col} = %(value)s
                  AND detected_at >= now() - INTERVAL 24 HOUR
            )"""

        query = f"""
            SELECT
                ua AS user_agent,
                sum(requests) AS count,
                round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
                min(log_date) AS first_seen,
                max(log_date) AS last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE {where}
              AND hour >= now() - INTERVAL 24 HOUR
              AND ua != ''
            GROUP BY user_agent
            ORDER BY count DESC
            LIMIT %(limit)s
        """
        result = db.query(query, params)

        count_query = f"""
            SELECT uniqExact(ua) AS total
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
            ARRAY JOIN user_agents AS ua
            WHERE {where}
              AND hour >= now() - INTERVAL 24 HOUR
              AND ua != ''
        """
        count_result = db.query(count_query, params)

        user_agents = []
        for record in result.result_rows:
            width = len(record)
            user_agents.append(UserAgentValue(
                value=str(record[0]),
                count=record[1] or 0,
                percentage=round(float(record[2]), 2) if record[2] else 0.0,
                first_seen=record[3] if width > 3 and record[3] else None,
                last_seen=record[4] if width > 4 and record[4] else None,
            ))

        count_rows = count_result.result_rows
        total = count_rows[0][0] if count_rows else 0

        return {
            "type": attr_type,
            "value": value,
            "user_agents": user_agents,
            "total": total,
            "showing": len(user_agents)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ROUTE GÉNÉRIQUE (doit être en dernier)
# =============================================================================
def get_attribute_value(row, count_idx: int, percentage_idx: int,
                        first_seen_idx: Optional[int] = None,
                        last_seen_idx: Optional[int] = None,
                        threat_idx: Optional[int] = None,
                        unique_ips_idx: Optional[int] = None) -> AttributeValue:
    """Build an ``AttributeValue`` from one ClickHouse result row.

    ``row[0]`` is the value itself; the ``*_idx`` arguments give the positions
    of the other fields. An optional index that is ``None`` or beyond the end
    of the row yields ``None`` for the corresponding field.
    """
    def _cell(idx):
        # Optional column lookup: None when not requested or not present.
        if idx is None or len(row) <= idx:
            return None
        return row[idx]

    raw_pct = row[percentage_idx]
    raw_threat = _cell(threat_idx)
    has_threat = bool(raw_threat)

    return AttributeValue(
        value=str(row[0]),
        count=row[count_idx] or 0,
        percentage=round(float(raw_pct), 2) if raw_pct else 0.0,
        first_seen=_cell(first_seen_idx),
        last_seen=_cell(last_seen_idx),
        threat_levels=_parse_threat_levels(raw_threat) if has_threat else None,
        unique_ips=_cell(unique_ips_idx),
        primary_threat=_get_primary_threat(raw_threat) if has_threat else None
    )
def _parse_threat_levels(threat_str: str) -> dict:
"""Parse une chaîne de type 'CRITICAL:5,HIGH:10' en dict"""
if not threat_str:
return {}
result = {}
for part in str(threat_str).split(','):
if ':' in part:
level, count = part.strip().split(':')
result[level.strip()] = int(count.strip())
return result
def _get_primary_threat(threat_str: str) -> str:
"""Retourne le niveau de menace principal"""
if not threat_str:
return ""
levels_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
for level in levels_order:
if level in str(threat_str):
return level
return ""
def _generate_insights(attr_type: str, value: str, attributes: VariabilityAttributes,
total_detections: int, unique_ips: int) -> list:
"""Génère des insights basés sur les données de variabilité"""
insights = []
# User-Agent insights
if len(attributes.user_agents) > 1:
insights.append(Insight(
type="warning",
message=f"{len(attributes.user_agents)} User-Agents différents → Possible rotation/obfuscation"
))
# JA4 insights
if len(attributes.ja4) > 1:
insights.append(Insight(
type="warning",
message=f"{len(attributes.ja4)} JA4 fingerprints différents → Possible rotation de fingerprint"
))
# IP insights (pour les sélections non-IP)
if attr_type != "ip" and unique_ips > 10:
insights.append(Insight(
type="info",
message=f"{unique_ips} IPs différentes associées → Possible infrastructure distribuée"
))
# ASN insights
if len(attributes.asns) == 1 and attributes.asns[0].value:
asn_label_lower = ""
if attr_type == "asn":
asn_label_lower = value.lower()
# Vérifier si c'est un ASN de hosting/cloud
hosting_keywords = ["ovh", "amazon", "aws", "google", "azure", "digitalocean", "linode", "vultr"]
if any(kw in (attributes.asns[0].value or "").lower() for kw in hosting_keywords):
insights.append(Insight(
type="warning",
message="ASN de type hosting/cloud → Souvent utilisé pour des bots"
))
# Country insights
if len(attributes.countries) > 5:
insights.append(Insight(
type="info",
message=f"Présent dans {len(attributes.countries)} pays → Distribution géographique large"
))
# Threat level insights
critical_count = 0
high_count = 0
for tl in attributes.threat_levels:
if tl.value == "CRITICAL":
critical_count = tl.count
elif tl.value == "HIGH":
high_count = tl.count
if critical_count > total_detections * 0.3:
insights.append(Insight(
type="warning",
message=f"{round(critical_count * 100 / total_detections)}% de détections CRITICAL → Menace sévère"
))
elif high_count > total_detections * 0.5:
insights.append(Insight(
type="info",
message=f"{round(high_count * 100 / total_detections)}% de détections HIGH → Activité suspecte"
))
return insights
@router.get("/{attr_type}/{value:path}", response_model=VariabilityResponse)
async def get_variability(attr_type: str, value: str):
    """
    Return the variability of the attributes associated with a value.

    attr_type: ip, ja4, country, asn, host, user_agent
    value: the value to investigate

    Every query is restricted to the last 24 hours. The response carries
    global stats (total detections, unique IPs, first/last seen), one
    distribution per attribute (User-Agents, JA4, countries, ASNs, hosts,
    threat levels, model names) and the insights derived from them.

    Raises:
        HTTPException 400: ``attr_type`` is not one of the supported types.
        HTTPException 404: no detection matches the value in the window.
        HTTPException 500: any other error; the detail carries its message.
    """
    try:
        # Whitelist mapping attribute types to ClickHouse columns. Only these
        # literals are ever interpolated into SQL; the user-supplied value is
        # always passed as the bound parameter %(value)s.
        type_column_map = {
            "ip": "src_ip",
            "ja4": "ja4",
            "country": "country_code",
            "asn": "asn_number",
            "host": "host",
            "user_agent": "header_user_agent"
        }
        if attr_type not in type_column_map:
            raise HTTPException(
                status_code=400,
                detail=f"Type invalide. Types supportés: {', '.join(type_column_map.keys())}"
            )
        column = type_column_map[attr_type]
        params = {"value": value}

        # Main query: every detection for this value. user_agent is selected
        # as an empty constant instead of reading header_user_agent from the
        # anomalies table (original note: the column may not exist there).
        base_query = f"""
            SELECT *
            FROM (
                SELECT
                    detected_at,
                    src_ip,
                    ja4,
                    host,
                    '' AS user_agent,
                    country_code,
                    asn_number,
                    asn_org,
                    threat_level,
                    model_name,
                    anomaly_score
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE {column} = %(value)s
                AND detected_at >= now() - INTERVAL 24 HOUR
            )
        """

        # Global stats
        stats_query = f"""
            SELECT
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE {column} = %(value)s
            AND detected_at >= now() - INTERVAL 24 HOUR
        """
        stats_result = db.query(stats_query, params)
        if not stats_result.result_rows or stats_result.result_rows[0][0] == 0:
            raise HTTPException(status_code=404, detail="Aucune donnée trouvée")
        total_detections, unique_ips, first_seen, last_seen = stats_result.result_rows[0][:4]

        # User-Agents. For ip / ja4 they are read from http_logs to get exact
        # per-request counts (original note: view_dashboard_user_agents
        # deduplicates per hour, which under-counts hits).
        http_logs_filters = {
            "ip": "src_ip = toIPv4(%(value)s)",
            "ja4": "ja4 = %(value)s",
        }
        logs_where = http_logs_filters.get(attr_type)
        if logs_where is not None:
            ua_query = f"""
                SELECT
                    header_user_agent AS user_agent,
                    count() AS count,
                    round(count() * 100.0 / (
                        SELECT count() FROM {settings.CLICKHOUSE_DB_LOGS}.http_logs
                        WHERE {logs_where} AND time >= now() - INTERVAL 24 HOUR
                    ), 2) AS percentage,
                    min(time) AS first_seen,
                    max(time) AS last_seen
                FROM {settings.CLICKHOUSE_DB_LOGS}.http_logs
                WHERE {logs_where}
                AND time >= now() - INTERVAL 24 HOUR
                AND header_user_agent != '' AND header_user_agent IS NOT NULL
                GROUP BY user_agent
                ORDER BY count DESC
            """
        else:
            # Every other type (country, asn, host, user_agent): pivot through
            # ml_detected_anomalies to get the matching IPs, then read their
            # User-Agents from the dashboard view.
            ip_filter = f"""toString(src_ip) IN (
                SELECT DISTINCT replaceRegexpAll(toString(src_ip), '^::ffff:', '')
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
                WHERE {column} = %(value)s AND detected_at >= now() - INTERVAL 24 HOUR
            )"""
            ua_query = f"""
                SELECT
                    ua AS user_agent,
                    sum(requests) AS count,
                    round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
                    min(log_date) AS first_seen,
                    max(log_date) AS last_seen
                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
                ARRAY JOIN user_agents AS ua
                WHERE {ip_filter}
                AND hour >= now() - INTERVAL 24 HOUR
                AND ua != ''
                GROUP BY user_agent
                ORDER BY count DESC
            """
        ua_result = db.query(ua_query, params)
        user_agents = [get_attribute_value(row, 1, 2, 3, 4) for row in ua_result.result_rows]

        def simple_distribution(col: str) -> list:
            """Count/percentage breakdown of one base_query column, most frequent first.

            ``col`` is always a hard-coded column name from this function,
            never user input.
            """
            query = f"""
                SELECT
                    {col},
                    count() AS count,
                    round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
                FROM ({base_query})
                WHERE {col} != '' AND {col} IS NOT NULL
                GROUP BY {col}
                ORDER BY count DESC
            """
            return [get_attribute_value(row, 1, 2) for row in db.query(query, params).result_rows]

        # JA4 fingerprints (also carries first/last seen)
        ja4_query = f"""
            SELECT
                ja4,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM ({base_query})
            WHERE ja4 != '' AND ja4 IS NOT NULL
            GROUP BY ja4
            ORDER BY count DESC
        """
        ja4_result = db.query(ja4_query, params)
        ja4s = [get_attribute_value(row, 1, 2, 3, 4) for row in ja4_result.result_rows]

        # Countries
        countries = simple_distribution("country_code")

        # ASN: the displayed value is "AS<number> - <org>"; rows are built by
        # hand because count/percentage sit at indexes 2 and 3 here.
        asn_query = f"""
            SELECT
                concat('AS', toString(asn_number), ' - ', asn_org) AS asn_display,
                asn_number,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE asn_number != '' AND asn_number IS NOT NULL AND asn_number != '0'
            GROUP BY asn_display, asn_number
            ORDER BY count DESC
        """
        asn_result = db.query(asn_query, params)
        asns = [
            AttributeValue(
                value=str(row[0]),
                count=row[2] or 0,
                percentage=round(float(row[3]), 2) if row[3] else 0.0
            )
            for row in asn_result.result_rows
        ]

        # Hosts
        hosts = simple_distribution("host")

        # Threat levels, ordered by severity rather than by frequency
        threat_query = f"""
            SELECT
                threat_level,
                count() AS count,
                round(count() * 100.0 / (SELECT count() FROM ({base_query})), 2) AS percentage
            FROM ({base_query})
            WHERE threat_level != '' AND threat_level IS NOT NULL
            GROUP BY threat_level
            ORDER BY
                CASE threat_level
                    WHEN 'CRITICAL' THEN 1
                    WHEN 'HIGH' THEN 2
                    WHEN 'MEDIUM' THEN 3
                    WHEN 'LOW' THEN 4
                    ELSE 5
                END
        """
        threat_result = db.query(threat_query, params)
        threat_levels = [get_attribute_value(row, 1, 2) for row in threat_result.result_rows]

        # Model names
        model_names = simple_distribution("model_name")

        # Build the response
        attributes = VariabilityAttributes(
            user_agents=user_agents,
            ja4=ja4s,
            countries=countries,
            asns=asns,
            hosts=hosts,
            threat_levels=threat_levels,
            model_names=model_names
        )

        # Generate the insights
        insights = _generate_insights(attr_type, value, attributes, total_detections, unique_ips)

        return VariabilityResponse(
            type=attr_type,
            value=value,
            total_detections=total_detections,
            unique_ips=unique_ips,
            date_range={
                "first_seen": first_seen,
                "last_seen": last_seen
            },
            attributes=attributes,
            insights=insights
        )
    except HTTPException:
        # Deliberate 400/404 responses raised above pass through untouched.
        raise
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise HTTPException(status_code=500, detail=f"Erreur: {e}") from e