From 2f2c5e03bb6a108fc128f8920ac7b860b636312f Mon Sep 17 00:00:00 2001 From: toto Date: Fri, 10 Apr 2026 00:48:05 +0200 Subject: [PATCH] fix(sql): contournement bug scope ClickHouse 24.8 dans view_ai_features_1h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Restructure 07_ai_features_view.sql : single anonymous inner subquery avec aliases explicites sur toutes les colonnes (a.xxx AS xxx, h.xxx AS xxx, h2.xxx AS xxx) pour résoudre l'ambiguïté PARTITION BY src_ip dans l'outer SELECT - Supprime les CTEs multiples (h2_agg, enriched) qui déclenchaient le bug - Fix migration 04_http2_fields.sql : ordre DEFAULT avant CODEC (syntax ClickHouse) - make init-stack : 0 erreur sur 13 fichiers SQL Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sql/migrations/04_http2_fields.sql | 6 +- shared/clickhouse/07_ai_features_view.sql | 400 ++++++++++-------- 2 files changed, 232 insertions(+), 174 deletions(-) diff --git a/services/correlator/sql/migrations/04_http2_fields.sql b/services/correlator/sql/migrations/04_http2_fields.sql index 53b657f..b002c45 100644 --- a/services/correlator/sql/migrations/04_http2_fields.sql +++ b/services/correlator/sql/migrations/04_http2_fields.sql @@ -13,13 +13,13 @@ -- clickhouse-client --multiquery < 04_http2_fields.sql ALTER TABLE ja4_logs.http_logs - ADD COLUMN IF NOT EXISTS `h2_fingerprint` String CODEC(ZSTD(3)) DEFAULT ''; + ADD COLUMN IF NOT EXISTS `h2_fingerprint` String DEFAULT '' CODEC(ZSTD(3)); ALTER TABLE ja4_logs.http_logs - ADD COLUMN IF NOT EXISTS `h2_settings_fp` String CODEC(ZSTD(3)) DEFAULT ''; + ADD COLUMN IF NOT EXISTS `h2_settings_fp` String DEFAULT '' CODEC(ZSTD(3)); ALTER TABLE ja4_logs.http_logs - ADD COLUMN IF NOT EXISTS `h2_window_update` UInt32 DEFAULT 0; + ADD COLUMN IF NOT EXISTS `h2_window_update` UInt32 DEFAULT 0; ALTER TABLE ja4_logs.http_logs ADD COLUMN IF NOT EXISTS `h2_pseudo_order` LowCardinality(String) DEFAULT ''; diff --git a/shared/clickhouse/07_ai_features_view.sql b/shared/clickhouse/07_ai_features_view.sql index e3e7fba..6e45864 100644 --- a/shared/clickhouse/07_ai_features_view.sql +++ b/shared/clickhouse/07_ai_features_view.sql @@ -1,219 +1,277 @@ -- ============================================================================= -- 07_ai_features_view.sql — AI feature view with full Anubis enrichment --- Source: bot_detector/anubis/view_ai_features_anubis.sql --- Includes combined UA+IP priority logic and Anubis bot_name/action/category. -- §2 : Features HTTP/2 (dict_browser_h2, cohérence H2↔JA4, pseudo-headers) -- §3 : Score de cohérence de fingerprint cross-layer +-- Note: single anonymous inner subquery with explicit aliases (fix ClickHouse 24.8 +-- scope bug where PARTITION BY src_ip fails when multiple JOIN sources expose src_ip) -- ============================================================================= CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS -WITH - --- §2 — Agrégation des fingerprints HTTP/2 par (heure, src_ip) --- Lecture directe depuis http_logs pour les colonnes ajoutées à l'étape 1 -h2_agg AS ( +WITH base_data AS ( SELECT - toStartOfHour(time) AS window_start, - toIPv6(src_ip) AS src_ip, - anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp, - anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord - FROM ja4_logs.http_logs - WHERE time >= now() - INTERVAL 24 HOUR - AND (h2_fingerprint != '' OR h2_pseudo_order != '') - GROUP BY window_start, src_ip -), - -base_data AS ( - SELECT - a.window_start, a.src_ip, a.ja4, a.host, - toString(a.src_asn) AS asn_number, - a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain, - a.src_country_code AS country_code, - dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, + window_start, src_ip, ja4, host, + toString(src_asn) AS asn_number, + src_as_name AS asn_org, src_org AS asn_detail, src_domain AS asn_domain, + src_country_code AS country_code, + dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(src_asn), 'unknown') AS asn_label, COALESCE( - nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(ja4), ''), ''), '' ) AS bot_name, - dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family, + dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AS browser_family, -- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed) COALESCE( - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_name, COALESCE( - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_action, COALESCE( - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_category, - a.hits AS hits, - sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits, - a.correlated AS correlated, - a.tcp_jitter_variance AS tcp_jitter_variance, - a.true_window_size AS true_window_size, - a.window_mss_ratio AS window_mss_ratio, - a.max_keepalives AS max_keepalives, - h.header_order_hash AS header_order_hash, h.header_count AS header_count, - h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, - h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, - h.has_sec_ch_ua AS has_sec_ch_ua, - h.ua_ch_mismatch AS ua_ch_mismatch, - (a.count_post / (a.hits + 1)) AS post_ratio, - (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index, - (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity, - (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio, - (a.orphan_count / (a.hits + 1)) AS orphan_ratio, - (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, - (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency, - IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, - a.request_size_variance AS request_size_variance, - IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch, - IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing, - IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch, - IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation, - count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count, - count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count, - (a.count_assets / (a.hits + 1)) AS asset_ratio, - (a.count_no_referer / (a.hits + 1)) AS direct_access_ratio, - IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating, - uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count, - ((a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS src_port_density, - (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration, - (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration, - IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4, - (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence, - uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders, - (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio, - a.url_depth_variance AS url_depth_variance, - (a.count_anomalous_payload / (a.hits + 1)) AS anomalous_payload_ratio, - a.uniq_ja3_val AS uniq_ja3_per_row, - sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv, - a.tls12_count / (a.hits + 1) AS tls12_ratio, - a.count_head / (a.hits + 1) AS head_ratio, - a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate, - a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio, - a.count_http10 / (a.hits + 1) AS http10_ratio, - a.ip_df_variance AS ip_df_variance, - a.avg_ttl_val AS avg_ttl, - sqrt(a.ttl_variance_val) AS ttl_std, - IF(a.count_correlated_val > 0, a.count_no_wscale_val / a.count_correlated_val, 0) AS no_window_scale_ratio, - a.count_no_accept_enc_val / (a.hits + 1) AS missing_accept_enc_ratio, - a.count_http_scheme_val / (a.hits + 1) AS http_scheme_ratio, + hits AS hits, + sum(hits) OVER (PARTITION BY src_ip) AS total_ip_hits, + correlated AS correlated, + tcp_jitter_variance AS tcp_jitter_variance, + true_window_size AS true_window_size, + window_mss_ratio AS window_mss_ratio, + max_keepalives AS max_keepalives, + header_order_hash AS header_order_hash, header_count AS header_count, + has_accept_language AS has_accept_language, has_cookie AS has_cookie, + has_referer AS has_referer, modern_browser_score AS modern_browser_score, + has_sec_ch_ua AS has_sec_ch_ua, + ua_ch_mismatch AS ua_ch_mismatch, + (count_post / (hits + 1)) AS post_ratio, + (uniq_query_params / (uniq_paths + 1)) AS fuzzing_index, + (hits / (dateDiff('second', first_seen, last_seen) + 1)) AS hit_velocity, + (unique_src_ports / (hits + 1)) AS port_exhaustion_ratio, + (orphan_count / (hits + 1)) AS orphan_ratio, + (ip_id_zero_count / (hits + 1)) AS ip_id_zero_ratio, + (hits / (unique_conn_id + 1)) AS multiplexing_efficiency, + IF(mss_1460_count > (hits * 0.8) AND has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, + request_size_variance AS request_size_variance, + IF(tls_alpn = 'h2' AND http_version != '2', 1, 0) AS alpn_http_mismatch, + IF(length(tls_alpn) = 0 OR tls_alpn = '00', 1, 0) AS is_alpn_missing, + IF(length(tls_sni) > 0 AND tls_sni != host, 1, 0) AS sni_host_mismatch, + IF(sec_fetch_mode = 'navigate' AND sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation, + count() OVER (PARTITION BY tcp_fingerprint) AS tcp_shared_count, + count() OVER (PARTITION BY header_order_hash) AS header_order_shared_count, + (count_assets / (hits + 1)) AS asset_ratio, + (count_no_referer / (hits + 1)) AS direct_access_ratio, + IF(unique_ua > 2, 1, 0) AS is_ua_rotating, + uniqExact(ja4) OVER (PARTITION BY src_ip) AS distinct_ja4_count, + ((hits / (unique_src_ports + 1)) / (dateDiff('second', first_seen, last_seen) + 1)) AS src_port_density, + (sum(hits) OVER (PARTITION BY ja4, src_asn) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_asn_concentration, + (sum(hits) OVER (PARTITION BY ja4, src_country_code) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_country_concentration, + IF(sum(hits) OVER (PARTITION BY ja4) < 100, 1, 0) AS is_rare_ja4, + (count() OVER (PARTITION BY header_order_hash, ja4) / (count() OVER (PARTITION BY ja4) + 1)) AS header_order_confidence, + uniqExact(header_order_hash) OVER (PARTITION BY src_ip) AS distinct_header_orders, + (uniq_paths / (hits + 1)) AS path_diversity_ratio, + url_depth_variance AS url_depth_variance, + (count_anomalous_payload / (hits + 1)) AS anomalous_payload_ratio, + uniq_ja3_val AS uniq_ja3_per_row, + sqrt(tcp_jitter_variance) / greatest(avg_syn_ms_val, 1) AS syn_timing_cv, + tls12_count / (hits + 1) AS tls12_ratio, + count_head / (hits + 1) AS head_ratio, + count_no_sec_fetch / (hits + 1) AS sec_fetch_absence_rate, + count_generic_accept / (hits + 1) AS generic_accept_ratio, + count_http10 / (hits + 1) AS http10_ratio, + ip_df_variance AS ip_df_variance, + avg_ttl_val AS avg_ttl, + sqrt(ttl_variance_val) AS ttl_std, + IF(count_correlated_val > 0, count_no_wscale_val / count_correlated_val, 0) AS no_window_scale_ratio, + count_no_accept_enc_val / (hits + 1) AS missing_accept_enc_ratio, + count_http_scheme_val / (hits + 1) AS http_scheme_ratio, -- P1 : nouvelles features de détection - IF(a.count_xff_val > 0, 1, 0) AS has_xff, - a.count_unusual_ct_val / greatest(a.count_post, 1) AS unusual_content_type_ratio, - a.count_non_std_port_val / (a.hits + 1) AS non_standard_port_ratio, - a.count_login_post_val / greatest(a.count_post, 1) AS login_post_concentration, - h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, + IF(count_xff_val > 0, 1, 0) AS has_xff, + count_unusual_ct_val / greatest(count_post, 1) AS unusual_content_type_ratio, + count_non_std_port_val / (hits + 1) AS non_standard_port_ratio, + count_login_post_val / greatest(count_post, 1) AS login_post_concentration, + sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, -- §2 — Features HTTP/2 (fingerprint SETTINGS, cohérence H2↔JA4, pseudo-headers) - -- h2_settings_known : le fingerprint H2 est dans dict_browser_h2 IF( - COALESCE(h2.h2_fp, '') != '' AND - dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', - tuple(COALESCE(h2.h2_fp, '')), '') != '', + h2_fp != '' AND + dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') != '', 1, 0 ) AS h2_settings_known, - -- h2_pseudo_order_match : l'ordre des pseudo-headers correspond à la famille JA4 déclarée CASE - WHEN COALESCE(h2.h2_pseudo_ord, '') = '' THEN 0 + WHEN h2_pseudo_ord = '' THEN 0 WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', - tuple(a.ja4), '') IN ('Chromium', 'Chrome', 'Edge', 'Safari') - AND h2.h2_pseudo_ord = 'm,a,s,p' THEN 1 + tuple(ja4), '') IN ('Chromium', 'Chrome', 'Edge', 'Safari') + AND h2_pseudo_ord = 'm,a,s,p' THEN 1 WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', - tuple(a.ja4), '') = 'Firefox' - AND h2.h2_pseudo_ord = 'm,p,s,a' THEN 1 + tuple(ja4), '') = 'Firefox' + AND h2_pseudo_ord = 'm,p,s,a' THEN 1 ELSE 0 END AS h2_pseudo_order_match, - -- h2_ja4_coherence : la famille navigateur H2 correspond à la famille JA4 IF( - COALESCE(h2.h2_fp, '') != '' AND - dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', - tuple(COALESCE(h2.h2_fp, '')), '') = - dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', - tuple(a.ja4), '') AND - dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', - tuple(a.ja4), '') != '', + h2_fp != '' AND + dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = + dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AND + dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') != '', 1, 0 ) AS h2_ja4_coherence, - -- h2_settings_rare : fingerprint H2 non reconnu (potentiellement suspect) IF( - COALESCE(h2.h2_fp, '') != '' AND - dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', - tuple(COALESCE(h2.h2_fp, '')), '') = '', + h2_fp != '' AND + dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = '', 1, 0 ) AS h2_settings_rare FROM ( + -- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8 + -- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip) SELECT - window_start, src_ip, ja4, host, src_asn, - any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name, - any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua, - sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths, - uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post, - min(first_seen) AS first_seen, max(last_seen) AS last_seen, - any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance, - varPopMerge(total_ip_length_var) AS request_size_variance, - any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size, - IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio, - any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni, - max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports, - uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives, - sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count, - sum(mss_1460_count) AS mss_1460_count, - sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer, - uniqMerge(uniq_ua) AS unique_ua, - varPopMerge(url_depth_variance) AS url_depth_variance, - sum(count_anomalous_payload) AS count_anomalous_payload, - uniqMerge(uniq_ja3) AS uniq_ja3_val, - avgMerge(avg_syn_ms) AS avg_syn_ms_val, - sum(tls12_count) AS tls12_count, - sum(count_head) AS count_head, - sum(count_no_sec_fetch) AS count_no_sec_fetch, - sum(count_generic_accept) AS count_generic_accept, - sum(count_http10) AS count_http10, - varPopMerge(ip_df_var) AS ip_df_variance, - avgIfMerge(avg_ttl) AS avg_ttl_val, - varPopIfMerge(ttl_var) AS ttl_variance_val, - sum(count_no_wscale) AS count_no_wscale_val, - sum(count_correlated) AS count_correlated_val, - sum(count_no_accept_enc) AS count_no_accept_enc_val, - sum(count_http_scheme) AS count_http_scheme_val, - -- P1 : nouvelles features de détection - sum(count_xff) AS count_xff_val, - sum(count_unusual_ct) AS count_unusual_ct_val, - sum(count_non_std_port) AS count_non_std_port_val, - sum(count_login_post) AS count_login_post_val - FROM ja4_processing.agg_host_ip_ja4_1h - WHERE window_start >= now() - INTERVAL 24 HOUR - GROUP BY window_start, src_ip, ja4, host, src_asn - ) a - LEFT JOIN ( - SELECT - window_start, src_ip, any(header_order_hash) AS header_order_hash, - max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, - max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, - max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, - max(ua_ch_mismatch) AS ua_ch_mismatch, - max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch, - any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest - FROM ja4_processing.agg_header_fingerprint_1h - WHERE window_start >= now() - INTERVAL 24 HOUR - GROUP BY window_start, src_ip - ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start - LEFT JOIN h2_agg h2 ON h2.src_ip = a.src_ip AND h2.window_start = a.window_start + a.window_start AS window_start, + a.src_ip AS src_ip, + a.ja4 AS ja4, + a.host AS host, + a.src_asn AS src_asn, + a.src_country_code AS src_country_code, + a.src_as_name AS src_as_name, + a.src_org AS src_org, + a.src_domain AS src_domain, + a.first_ua AS first_ua, + a.hits AS hits, + a.uniq_paths AS uniq_paths, + a.uniq_query_params AS uniq_query_params, + a.count_post AS count_post, + a.first_seen AS first_seen, + a.last_seen AS last_seen, + a.tcp_fingerprint AS tcp_fingerprint, + a.tcp_jitter_variance AS tcp_jitter_variance, + a.request_size_variance AS request_size_variance, + a.true_window_size AS true_window_size, + a.window_mss_ratio AS window_mss_ratio, + a.http_version AS http_version, + a.tls_alpn AS tls_alpn, + a.tls_sni AS tls_sni, + a.correlated AS correlated, + a.unique_src_ports AS unique_src_ports, + a.unique_conn_id AS unique_conn_id, + a.max_keepalives AS max_keepalives, + a.orphan_count AS orphan_count, + a.ip_id_zero_count AS ip_id_zero_count, + a.mss_1460_count AS mss_1460_count, + a.count_assets AS count_assets, + a.count_no_referer AS count_no_referer, + a.unique_ua AS unique_ua, + a.url_depth_variance AS url_depth_variance, + a.count_anomalous_payload AS count_anomalous_payload, + a.uniq_ja3_val AS uniq_ja3_val, + a.avg_syn_ms_val AS avg_syn_ms_val, + a.tls12_count AS tls12_count, + a.count_head AS count_head, + a.count_no_sec_fetch AS count_no_sec_fetch, + a.count_generic_accept AS count_generic_accept, + a.count_http10 AS count_http10, + a.ip_df_variance AS ip_df_variance, + a.avg_ttl_val AS avg_ttl_val, + a.ttl_variance_val AS ttl_variance_val, + a.count_no_wscale_val AS count_no_wscale_val, + a.count_correlated_val AS count_correlated_val, + a.count_no_accept_enc_val AS count_no_accept_enc_val, + a.count_http_scheme_val AS count_http_scheme_val, + a.count_xff_val AS count_xff_val, + a.count_unusual_ct_val AS count_unusual_ct_val, + a.count_non_std_port_val AS count_non_std_port_val, + a.count_login_post_val AS count_login_post_val, + -- colonnes de agg_header_fingerprint_1h + h.header_order_hash AS header_order_hash, + h.header_count AS header_count, + h.has_accept_language AS has_accept_language, + h.has_cookie AS has_cookie, + h.has_referer AS has_referer, + h.modern_browser_score AS modern_browser_score, + h.has_sec_ch_ua AS has_sec_ch_ua, + h.ua_ch_mismatch AS ua_ch_mismatch, + h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, + h.sec_fetch_mode AS sec_fetch_mode, + h.sec_fetch_dest AS sec_fetch_dest, + -- colonnes HTTP/2 (defaut vide si pas de trafic H2) + COALESCE(h2.h2_fp, '') AS h2_fp, + COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord + FROM ( + SELECT + window_start, src_ip, ja4, host, src_asn, + any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name, + any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua, + sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths, + uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post, + min(first_seen) AS first_seen, max(last_seen) AS last_seen, + any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance, + varPopMerge(total_ip_length_var) AS request_size_variance, + any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size, + IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio, + any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni, + max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports, + uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives, + sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count, + sum(mss_1460_count) AS mss_1460_count, + sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer, + uniqMerge(uniq_ua) AS unique_ua, + varPopMerge(url_depth_variance) AS url_depth_variance, + sum(count_anomalous_payload) AS count_anomalous_payload, + uniqMerge(uniq_ja3) AS uniq_ja3_val, + avgMerge(avg_syn_ms) AS avg_syn_ms_val, + sum(tls12_count) AS tls12_count, + sum(count_head) AS count_head, + sum(count_no_sec_fetch) AS count_no_sec_fetch, + sum(count_generic_accept) AS count_generic_accept, + sum(count_http10) AS count_http10, + varPopMerge(ip_df_var) AS ip_df_variance, + avgIfMerge(avg_ttl) AS avg_ttl_val, + varPopIfMerge(ttl_var) AS ttl_variance_val, + sum(count_no_wscale) AS count_no_wscale_val, + sum(count_correlated) AS count_correlated_val, + sum(count_no_accept_enc) AS count_no_accept_enc_val, + sum(count_http_scheme) AS count_http_scheme_val, + -- P1 : nouvelles features de détection + sum(count_xff) AS count_xff_val, + sum(count_unusual_ct) AS count_unusual_ct_val, + sum(count_non_std_port) AS count_non_std_port_val, + sum(count_login_post) AS count_login_post_val + FROM ja4_processing.agg_host_ip_ja4_1h + WHERE window_start >= now() - INTERVAL 24 HOUR + GROUP BY window_start, src_ip, ja4, host, src_asn + ) a + LEFT JOIN ( + SELECT + window_start, src_ip, any(header_order_hash) AS header_order_hash, + max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, + max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, + max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, + max(ua_ch_mismatch) AS ua_ch_mismatch, + max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch, + any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest + FROM ja4_processing.agg_header_fingerprint_1h + WHERE window_start >= now() - INTERVAL 24 HOUR + GROUP BY window_start, src_ip + ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start + -- §2 — LEFT JOIN fingerprints HTTP/2 depuis ja4_logs.http_logs + LEFT JOIN ( + SELECT + toStartOfHour(time) AS h2_window, + toIPv6(src_ip) AS h2_ip, + anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp, + anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord + FROM ja4_logs.http_logs + WHERE time >= now() - INTERVAL 24 HOUR + AND (h2_fingerprint != '' OR h2_pseudo_order != '') + GROUP BY toStartOfHour(time), toIPv6(src_ip) + ) h2 ON h2.h2_ip = a.src_ip AND h2.h2_window = a.window_start + ) ) SELECT *, -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy, sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio, -- §3 — Score de cohérence de fingerprint cross-layer [0.0, 1.0] - -- Combine : famille navigateur connue, cohérence H2↔JA4, cohérence TLS, - -- présence Accept-Language, et absence de mismatch UA/CH. toFloat32( CASE WHEN browser_family != '' THEN 0.25 ELSE 0.0 END + COALESCE(h2_ja4_coherence, 0) * 0.20