diff --git a/services/dashboard/backend/routes/rotation.py b/services/dashboard/backend/routes/rotation.py
index fd16052..c35642a 100644
--- a/services/dashboard/backend/routes/rotation.py
+++ b/services/dashboard/backend/routes/rotation.py
@@ -126,11 +126,12 @@ async def get_sophistication(limit: int = Query(50, ge=1, le=500)):
                 FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_host_ip_ja4_rotation
             ) r
             LEFT JOIN (
+                -- Utilise view_ip_recurrence (pré-agrégée) au lieu de ml_detected_anomalies FINAL
+                -- FINAL force une déduplication complète du ReplacingMergeTree — très coûteux
                 SELECT
                     replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
-                    count() AS recurrence
-                FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies FINAL
-                GROUP BY ip
+                    recurrence
+                FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ip_recurrence
             ) rec ON r.ip = rec.ip
             LEFT JOIN (
                 SELECT
diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql
index 134359e..d0087ec 100644
--- a/shared/clickhouse/06_ml_tables.sql
+++ b/shared/clickhouse/06_ml_tables.sql
@@ -112,15 +112,27 @@ SETTINGS
 
 
 -- -----------------------------------------------------------------------------
--- view_ip_recurrence — recurrence aggregation over ml_detected_anomalies
+-- view_ip_recurrence — per-IP recurrence of detections
+--
+-- Aggregates ml_detected_anomalies (ORDER BY src_ip) to build the
+-- recurrence profile of each detected IP.
+--
+-- Optimization: with PARTITION BY toYYYYMMDD(detected_at) (added in P1),
+-- each GROUP BY src_ip benefits from partition pruning when the view is
+-- filtered by date upstream (routes usually filter on at most 30 days).
+-- NOTE(review): ORDER BY (src_ip) is expected to co-locate the rows of one IP
+-- so GROUP BY src_ip reads them contiguously — confirm against the table DDL.
 -- -----------------------------------------------------------------------------
 CREATE OR REPLACE VIEW ja4_processing.view_ip_recurrence AS
 SELECT
     src_ip,
-    count() AS recurrence,
-    min(detected_at) AS first_seen,
-    max(detected_at) AS last_seen,
-    min(anomaly_score) AS worst_score,
+    count()                             AS recurrence,
+    min(detected_at)                    AS first_seen,
+    max(detected_at)                    AS last_seen,
+    min(anomaly_score)                  AS worst_score,
     argMin(threat_level, anomaly_score) AS worst_threat_level
 FROM ja4_processing.ml_detected_anomalies
+-- Time filter aligned with the table's TTL (30 days)
+-- Avoids scanning expired partitions that the TTL has not yet removed
+WHERE detected_at >= now() - INTERVAL 30 DAY
 GROUP BY src_ip;