From 14323f7b05fb16bc7a2b2c151c5eb1d6c73e870f Mon Sep 17 00:00:00 2001 From: toto Date: Tue, 7 Apr 2026 22:30:09 +0200 Subject: [PATCH] =?UTF-8?q?perf(clickhouse):=20P10=20=E2=80=94=20cr=C3=A9e?= =?UTF-8?q?r=20les=204=20vues=20m=C3=A9tier=20manquantes=20+=20corriger=20?= =?UTF-8?q?pr=C3=A9fixes=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug de production : view_form_bruteforce_detected, view_host_ip_ja4_rotation, view_dashboard_entities, view_dashboard_user_agents étaient référencées dans 13 endpoints du dashboard mais n'existaient nulle part dans le schéma. Tous ces endpoints retournaient HTTP 500 en production. shared/clickhouse/11_views.sql (nouveau) : view_form_bruteforce_detected Source : agg_host_ip_ja4_1h (24h) Logique : GROUP BY (src_ip, host) HAVING count_post >= 10 Usage : bruteforce.py (3 endpoints), investigation_summary.py view_host_ip_ja4_rotation Source : agg_host_ip_ja4_1h (24h) Logique : uniqExact(ja4) par src_ip, HAVING >= 2 (rotation de fingerprint) Usage : rotation.py (3 endpoints), investigation_summary.py view_dashboard_entities Source : http_logs (7 jours), UNION ALL 5 branches (ip/ja4/country/asn/host) Colonnes : entity_type, entity_value, src_ip, ja4, host, log_date, client_headers Array(String), asns Array, countries Array, user_agents Array Usage : entities.py (5 endpoints), clustering.py view_dashboard_user_agents Source : http_logs (7 jours), GROUP BY (src_ip, ja4, hour) Colonnes : src_ip, ja4, hour, log_date, user_agents Array(String), requests Usage : variability.py (4 endpoints), fingerprints.py (5 endpoints) attributes.py (2 endpoints) deploy_schema.sh : ajout de 10_perf_indexes.sql et 11_views.sql dans la liste routes/variability.py + fingerprints.py : Correction de 9 requêtes utilisant view_dashboard_user_agents sans préfixe de base de données → remplacé par {settings.CLICKHOUSE_DB_PROCESSING}.view_* Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- 
.../dashboard/backend/routes/fingerprints.py | 12 +-
.../dashboard/backend/routes/variability.py | 8 +-
shared/clickhouse/11_views.sql | 213 ++++++++++++++++++
shared/clickhouse/deploy_schema.sh | 2 +
4 files changed, 225 insertions(+), 10 deletions(-)
create mode 100644 shared/clickhouse/11_views.sql

diff --git a/services/dashboard/backend/routes/fingerprints.py b/services/dashboard/backend/routes/fingerprints.py
index 93baba2..db0449a 100644
--- a/services/dashboard/backend/routes/fingerprints.py
+++ b/services/dashboard/backend/routes/fingerprints.py
@@ -127,7 +127,7 @@ async def get_ja4_spoofing(
         SELECT ja4, groupArray(5)(ua) AS top_uas
         FROM (
             SELECT ja4, arrayJoin(user_agents) AS ua, sum(requests) AS cnt
-            FROM view_dashboard_user_agents
+            FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
             WHERE ja4 IN ({ja4_sql})
               AND hour >= now() - INTERVAL {hours} HOUR
               AND ua != ''
@@ -287,7 +287,7 @@ async def get_ja4_ua_matrix(
             ja4,
             ua,
             sum(requests) AS cnt
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE ja4 IN ({ja4_sql})
           AND hour >= now() - INTERVAL {hours} HOUR
@@ -388,7 +388,7 @@ async def get_ua_analysis(
         SELECT
             ua,
             sum(requests) AS ip_count
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE hour >= now() - INTERVAL %(hours)s HOUR
           AND ua != ''
@@ -407,7 +407,7 @@ async def get_ua_analysis(
             ua,
             uniq(ja4) AS unique_ja4s,
             groupUniqArray(3)(ja4) AS sample_ja4s
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE ua IN ({ua_sql})
           AND hour >= now() - INTERVAL {hours} HOUR
@@ -557,7 +557,7 @@ async def get_ip_fingerprint_coherence(ip: str):
     # User-agents réels depuis view_dashboard_user_agents
-    ua_query = """
+    ua_query = f"""
         SELECT ua, sum(requests) AS cnt
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE toString(src_ip) = %(ip)s
           AND hour >= now() - INTERVAL 72 HOUR
diff --git a/services/dashboard/backend/routes/variability.py b/services/dashboard/backend/routes/variability.py
index cad381c..e090f9b 100644
--- a/services/dashboard/backend/routes/variability.py
+++ b/services/dashboard/backend/routes/variability.py
@@ -145,7 +145,7 @@ async def get_associated_attributes(
     ua_q = f"""
         SELECT ua AS value, sum(requests) AS count,
             round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE {ua_where} AND hour >= now() - INTERVAL 24 HOUR
           AND ua != ''
@@ -261,7 +261,7 @@ async def get_user_agents(
             round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
             min(log_date) AS first_seen,
             max(log_date) AS last_seen
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE {where}
           AND hour >= now() - INTERVAL 24 HOUR
@@ -274,7 +274,7 @@ async def get_user_agents(
 
     count_query = f"""
         SELECT uniqExact(ua) AS total
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE {where}
           AND hour >= now() - INTERVAL 24 HOUR
@@ -556,7 +556,7 @@ async def get_variability(attr_type: str, value: str):
             round(sum(requests) * 100.0 / sum(sum(requests)) OVER (), 2) AS percentage,
             min(log_date) AS first_seen,
             max(log_date) AS last_seen
-        FROM view_dashboard_user_agents
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_dashboard_user_agents
         ARRAY JOIN user_agents AS ua
         WHERE {_ua_where}
           AND hour >= now() - INTERVAL 24 HOUR
diff --git a/shared/clickhouse/11_views.sql b/shared/clickhouse/11_views.sql
new file mode 100644
index 0000000..9f1f5b7
--- /dev/null
+++ 
b/shared/clickhouse/11_views.sql
@@ -0,0 +1,213 @@
+-- =============================================================================
+-- 11_views.sql — Dashboard business views
+--
+-- This file creates the views that the dashboard references but that were
+-- missing from the shared schema. They aggregate agg_host_ip_ja4_1h and
+-- http_logs to expose business-level perspectives to the FastAPI endpoints.
+--
+-- Views created:
+--   view_form_bruteforce_detected — IP/host pairs with a high volume of POST requests
+--   view_host_ip_ja4_rotation     — IPs that change JA4 fingerprint (evasion)
+--   view_dashboard_entities       — IP/JA4/country/ASN/host pivot for investigation
+--   view_dashboard_user_agents    — User-Agents aggregated per IP/JA4/hour
+-- NOTE: view_dashboard_summary is NOT created by this file.
+-- =============================================================================
+
+
+-- -----------------------------------------------------------------------------
+-- view_form_bruteforce_detected
+--
+-- Flags IP/host pairs whose traffic looks like form brute-forcing:
+--   - high POST volume towards one host (>= 10 POST summed over the window)
+--   - sliding 24h window over agg_host_ip_ja4_1h
+--
+-- Columns used by bruteforce.py and investigation_summary.py:
+--   src_ip, host, ja4, hits, query_params_count
+-- -----------------------------------------------------------------------------
+CREATE OR REPLACE VIEW ja4_processing.view_form_bruteforce_detected AS
+SELECT
+    src_ip,
+    host,
+    -- JA4 of the stored row with the most hits; agg.* forces the raw columns, not the aliases below
+    argMax(agg.ja4, agg.hits) AS ja4,
+    -- Total number of requests (all methods)
+    sum(hits) AS hits,
+    -- Proxy for form submissions: number of POST requests
+    -- (count_post is expected to be a SimpleAggregateFunction(sum) in agg_host_ip_ja4_1h — confirm)
+    sum(count_post) AS query_params_count
+FROM ja4_processing.agg_host_ip_ja4_1h AS agg
+WHERE window_start >= now() - INTERVAL 24 HOUR
+GROUP BY src_ip, host
+-- Threshold: at least 10 POST requests to qualify as brute-force activity
+HAVING query_params_count >= 10;
+
+
+-- -----------------------------------------------------------------------------
+-- view_host_ip_ja4_rotation
+--
+-- Flags IPs that change JA4 fingerprint (TLS ClientHello rotation),
+-- an indicator of detection evasion by bot tooling.
+--
+-- Columns used by rotation.py and investigation_summary.py:
+--   src_ip, distinct_ja4_count, total_hits
+-- -----------------------------------------------------------------------------
+CREATE OR REPLACE VIEW ja4_processing.view_host_ip_ja4_rotation AS
+SELECT
+    src_ip,
+    -- Number of distinct JA4 values emitted by this IP over the 24h window
+    uniqExact(ja4) AS distinct_ja4_count,
+    sum(hits) AS total_hits,
+    -- Earliest / latest hourly window in which the IP was seen (context)
+    min(window_start) AS first_seen,
+    max(window_start) AS last_seen
+FROM ja4_processing.agg_host_ip_ja4_1h
+WHERE window_start >= now() - INTERVAL 24 HOUR
+  AND ja4 != ''
+GROUP BY src_ip
+-- Rotation = at least 2 distinct JA4 values
+HAVING distinct_ja4_count >= 2
+ORDER BY distinct_ja4_count DESC;
+
+
+-- -----------------------------------------------------------------------------
+-- view_dashboard_user_agents
+--
+-- User-Agents aggregated per IP, JA4 and hour.
+-- Queried by variability.py and attributes.py with ARRAY JOIN user_agents.
+--
+-- Columns:
+--   src_ip      — IPv4 (the ::ffff: prefix is stripped before conversion)
+--   ja4         — TLS fingerprint
+--   hour        — start of the hour (toStartOfHour(time))
+--   log_date    — date column passed through from http_logs
+--   user_agents — Array(String) of distinct UAs for that hour (capped at 100)
+--   requests    — number of requests in that hour that carry a non-empty User-Agent
+-- -----------------------------------------------------------------------------
+CREATE OR REPLACE VIEW ja4_processing.view_dashboard_user_agents AS
+SELECT
+    -- IPv4 normalisation: strip the ::ffff: prefix of IPv4-mapped addresses, then parse with toIPv4OrZero
+    toIPv4OrZero(replaceRegexpAll(toString(src_ip), '^::ffff:', '')) AS src_ip,
+    ja4,
+    toStartOfHour(time) AS hour,
+    log_date,
+    -- Collect distinct UAs (at most 100, to avoid huge arrays)
+    groupUniqArray(100)(header_user_agent) AS user_agents,
+    count() AS requests
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+  AND header_user_agent != ''
+GROUP BY src_ip, ja4, toStartOfHour(time), log_date;
+
+
+-- -----------------------------------------------------------------------------
+-- view_dashboard_entities
+--
+-- Pivot view for navigating between entities (IP <-> JA4 <-> country <-> ASN <-> host).
+-- For each entity (entity_type + entity_value) it exposes the associated data:
+-- src_ip, ja4, host (presumably grouped on the Python side), asns, countries,
+-- user_agents and client_headers (an array, consumed by clustering.py).
+--
+-- UNION ALL structure: one branch per entity type, each reading http_logs (7 days).
+-- NOTE(review): the original note says ClickHouse prunes the other branches for
+-- WHERE entity_type = 'ip' (constant computed column) — confirm with EXPLAIN.
+--
+-- Columns:
+--   entity_type    — 'ip' | 'ja4' | 'country' | 'asn' | 'host'
+--   entity_value   — value of the entity (e.g. '1.2.3.4', 't13d...', 'FR', ...)
+--   src_ip         — source IP as stored in http_logs (said to be IPv6 — TODO confirm)
+--   ja4            — JA4 fingerprint
+--   host           — HTTP virtual host
+--   log_date       — date of the request
+--   client_headers — array obtained with splitByChar(',', client_headers)
+--   asns           — single-element Array(String) holding the source ASN
+--   countries      — single-element array holding the source country code
+--   user_agents    — single-element array holding the User-Agent
+-- -----------------------------------------------------------------------------
+CREATE OR REPLACE VIEW ja4_processing.view_dashboard_entities AS
+
+-- IP perspective: entity_value = source address with the ::ffff: prefix stripped
+SELECT
+    'ip' AS entity_type,
+    replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS entity_value,
+    src_ip,
+    ja4,
+    host,
+    log_date,
+    splitByChar(',', client_headers) AS client_headers,
+    [toString(src_asn)] AS asns,
+    [src_country_code] AS countries,
+    [header_user_agent] AS user_agents
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+
+UNION ALL
+
+-- JA4 perspective: entity_value = JA4 TLS fingerprint (rows with an empty ja4 are skipped)
+SELECT
+    'ja4' AS entity_type,
+    ja4 AS entity_value,
+    src_ip,
+    ja4,
+    host,
+    log_date,
+    splitByChar(',', client_headers) AS client_headers,
+    [toString(src_asn)] AS asns,
+    [src_country_code] AS countries,
+    [header_user_agent] AS user_agents
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+  AND ja4 != ''
+
+UNION ALL
+
+-- Country perspective: entity_value = country code (e.g. 'FR', 'US'); empty codes are skipped
+SELECT
+    'country' AS entity_type,
+    src_country_code AS entity_value,
+    src_ip,
+    ja4,
+    host,
+    log_date,
+    splitByChar(',', client_headers) AS client_headers,
+    [toString(src_asn)] AS asns,
+    [src_country_code] AS countries,
+    [header_user_agent] AS user_agents
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+  AND src_country_code != ''
+
+UNION ALL
+
+-- ASN perspective: entity_value = ASN number as a string (e.g. '15169'); src_asn <= 0 is skipped
+SELECT
+    'asn' AS entity_type,
+    toString(src_asn) AS entity_value,
+    src_ip,
+    ja4,
+    host,
+    log_date,
+    splitByChar(',', client_headers) AS client_headers,
+    [toString(src_asn)] AS asns,
+    [src_country_code] AS countries,
+    [header_user_agent] AS user_agents
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+  AND src_asn > 0
+
+UNION ALL
+
+-- Host perspective: entity_value = HTTP virtual host (e.g. 'api.example.com'); empty hosts are skipped
+SELECT
+    'host' AS entity_type,
+    host AS entity_value,
+    src_ip,
+    ja4,
+    host,
+    log_date,
+    splitByChar(',', client_headers) AS client_headers,
+    [toString(src_asn)] AS asns,
+    [src_country_code] AS countries,
+    [header_user_agent] AS user_agents
+FROM ja4_logs.http_logs
+WHERE time >= now() - INTERVAL 7 DAY
+  AND host != '';
diff --git a/shared/clickhouse/deploy_schema.sh b/shared/clickhouse/deploy_schema.sh
index 090c99f..42ecf28 100755
--- a/shared/clickhouse/deploy_schema.sh
+++ b/shared/clickhouse/deploy_schema.sh
@@ -45,6 +45,8 @@ SQL_FILES=(
     07_ai_features_view.sql
     08_users.sql
     09_audit_table.sql
+    10_perf_indexes.sql
+    11_views.sql
 )
 
 for f in "${SQL_FILES[@]}"; do