-- ============================================================================= -- 07_ai_features_view.sql — AI feature view with full Anubis enrichment -- §2 : Features HTTP/2 (dict_browser_h2, cohérence H2↔JA4, pseudo-headers) -- §3 : Score de cohérence de fingerprint cross-layer -- Note: single anonymous inner subquery with explicit aliases (fix ClickHouse 24.8 -- scope bug where PARTITION BY src_ip fails when multiple JOIN sources expose src_ip) -- ============================================================================= CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS WITH base_data AS ( SELECT window_start, src_ip, ja4, host, toString(src_asn) AS asn_number, src_as_name AS asn_org, src_org AS asn_detail, src_domain AS asn_domain, src_country_code AS country_code, dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(src_asn), 'unknown') AS asn_label, COALESCE( nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', src_ip, ''), ''), nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(ja4), ''), ''), '' ) AS bot_name, dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AS browser_family, -- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed) COALESCE( nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', src_ip, ''), ''), nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_name, COALESCE( nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', src_ip, ''), ''), nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_action, COALESCE( nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', src_ip, ''), ''), nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(src_asn), ''), ''), '' ) AS anubis_bot_category, hits AS hits, sum(hits) OVER (PARTITION BY src_ip) AS total_ip_hits, correlated AS correlated, tcp_jitter_variance AS tcp_jitter_variance, true_window_size AS true_window_size, window_mss_ratio AS window_mss_ratio, max_keepalives AS max_keepalives, header_order_hash AS header_order_hash, header_count AS header_count, has_accept_language AS has_accept_language, has_cookie AS has_cookie, has_referer AS has_referer, modern_browser_score AS modern_browser_score, has_sec_ch_ua AS has_sec_ch_ua, ua_ch_mismatch AS ua_ch_mismatch, (count_post / (hits + 1)) AS post_ratio, (uniq_query_params / (uniq_paths + 1)) AS fuzzing_index, (hits / (dateDiff('second', first_seen, last_seen) + 1)) AS hit_velocity, (unique_src_ports / (hits + 1)) AS port_exhaustion_ratio, (orphan_count / (hits + 1)) AS orphan_ratio, (ip_id_zero_count / (hits + 1)) AS ip_id_zero_ratio, (hits / (unique_conn_id + 1)) AS multiplexing_efficiency, IF(mss_1460_count > (hits * 0.8) AND has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch, request_size_variance AS request_size_variance, IF(tls_alpn = 'h2' AND http_version != '2', 1, 0) AS alpn_http_mismatch, IF(length(tls_alpn) = 0 OR tls_alpn = '00', 1, 0) AS is_alpn_missing, IF(length(tls_sni) > 0 AND tls_sni != host, 1, 0) AS sni_host_mismatch, IF(sec_fetch_mode = 'navigate' AND sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation, count() OVER (PARTITION BY tcp_fingerprint) AS tcp_shared_count, count() OVER (PARTITION BY header_order_hash) AS header_order_shared_count, (count_assets / (hits + 1)) AS asset_ratio, (count_no_referer / (hits + 1)) AS direct_access_ratio, IF(unique_ua > 2, 1, 0) AS is_ua_rotating, uniqExact(ja4) OVER (PARTITION BY src_ip) AS distinct_ja4_count, ((hits / (unique_src_ports + 1)) / (dateDiff('second', first_seen, last_seen) + 1)) AS src_port_density, (sum(hits) OVER (PARTITION BY ja4, src_asn) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_asn_concentration, (sum(hits) OVER (PARTITION BY ja4, src_country_code) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_country_concentration, IF(sum(hits) OVER (PARTITION BY ja4) < 100, 1, 0) AS is_rare_ja4, (count() OVER (PARTITION BY header_order_hash, ja4) / (count() OVER (PARTITION BY ja4) + 1)) AS header_order_confidence, uniqExact(header_order_hash) OVER (PARTITION BY src_ip) AS distinct_header_orders, (uniq_paths / (hits + 1)) AS path_diversity_ratio, url_depth_variance AS url_depth_variance, (count_anomalous_payload / (hits + 1)) AS anomalous_payload_ratio, uniq_ja3_val AS uniq_ja3_per_row, sqrt(tcp_jitter_variance) / greatest(avg_syn_ms_val, 1) AS syn_timing_cv, tls12_count / (hits + 1) AS tls12_ratio, count_head / (hits + 1) AS head_ratio, count_no_sec_fetch / (hits + 1) AS sec_fetch_absence_rate, count_generic_accept / (hits + 1) AS generic_accept_ratio, count_http10 / (hits + 1) AS http10_ratio, ip_df_variance AS ip_df_variance, avg_ttl_val AS avg_ttl, sqrt(ttl_variance_val) AS ttl_std, IF(count_correlated_val > 0, count_no_wscale_val / count_correlated_val, 0) AS no_window_scale_ratio, count_no_accept_enc_val / (hits + 1) AS missing_accept_enc_ratio, count_http_scheme_val / (hits + 1) AS http_scheme_ratio, -- P1 : nouvelles features de détection IF(count_xff_val > 0, 1, 0) AS has_xff, count_unusual_ct_val / greatest(count_post, 1) AS unusual_content_type_ratio, count_non_std_port_val / (hits + 1) AS non_standard_port_ratio, count_login_post_val / greatest(count_post, 1) AS login_post_concentration, sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, -- §2 — Features HTTP/2 (fingerprint SETTINGS, cohérence H2↔JA4, pseudo-headers) IF( h2_fp != '' AND dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') != '', 1, 0 ) AS h2_settings_known, CASE WHEN h2_pseudo_ord = '' THEN 0 WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') IN ('Chromium', 'Chrome', 'Edge', 'Safari') AND h2_pseudo_ord = 'm,a,s,p' THEN 1 WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') = 'Firefox' AND h2_pseudo_ord = 'm,p,s,a' THEN 1 ELSE 0 END AS h2_pseudo_order_match, IF( h2_fp != '' AND dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AND dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') != '', 1, 0 ) AS h2_ja4_coherence, IF( h2_fp != '' AND dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = '', 1, 0 ) AS h2_settings_rare, -- §4 — Famille identifiée par le dictionnaire H2 (browser_matcher) dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') AS h2_dict_family, -- §4 — Valeur brute WINDOW_UPDATE H2 (signal de famille le plus fiable) h2_wu_val AS h2_window_update_value, -- §4 — Signaux atomiques H2 pour le browser_matcher et le vecteur ML toUInt8(h2_wu_val BETWEEN 15663000 AND 15664000) AS h2_window_chrome, toUInt8(h2_wu_val BETWEEN 12517000 AND 12518000) AS h2_window_firefox, toUInt8(h2_wu_val BETWEEN 10485700 AND 10485820) AS h2_window_safari, toUInt8(h2_wu_val = 0 AND h2_fp != '') AS h2_window_absent, -- Chrome et Safari partagent l'ordre m,a,s,p — utiliser WU pour distinguer toUInt8(h2_pseudo_ord = 'm,a,s,p') AS h2_order_chromesafari, toUInt8(h2_pseudo_ord = 'm,p,s,a') AS h2_order_firefox, -- Présence de PRIORITY frames (3e champ de h2_fp, != '0' → Firefox ancien) toUInt8( h2_fp != '' AND length(splitByChar('|', h2_fp)) >= 3 AND arrayElement(splitByChar('|', h2_fp), 3) NOT IN ('', '0') ) AS h2_priority_present, -- Valeur brute du pseudo-header order (pour le matcher Python) h2_pseudo_ord AS h2_pseudo_ord_raw, -- Colonnes SETTINGS individuelles — passthrough pour _d1_h2_settings (browser_matcher) h2_header_table_size AS h2_header_table_size, h2_enable_push AS h2_enable_push, h2_max_concurrent_streams AS h2_max_concurrent_streams, h2_initial_window_size AS h2_initial_window_size, h2_max_frame_size AS h2_max_frame_size, h2_max_header_list_size AS h2_max_header_list_size, h2_enable_connect_protocol AS h2_enable_connect_protocol FROM ( -- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8 -- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip) SELECT a.window_start AS window_start, a.src_ip AS src_ip, a.ja4 AS ja4, a.host AS host, a.src_asn AS src_asn, a.src_country_code AS src_country_code, a.src_as_name AS src_as_name, a.src_org AS src_org, a.src_domain AS src_domain, a.first_ua AS first_ua, a.hits AS hits, a.uniq_paths AS uniq_paths, a.uniq_query_params AS uniq_query_params, a.count_post AS count_post, a.first_seen AS first_seen, a.last_seen AS last_seen, a.tcp_fingerprint AS tcp_fingerprint, a.tcp_jitter_variance AS tcp_jitter_variance, a.request_size_variance AS request_size_variance, a.true_window_size AS true_window_size, a.window_mss_ratio AS window_mss_ratio, a.http_version AS http_version, a.tls_alpn AS tls_alpn, a.tls_sni AS tls_sni, a.correlated AS correlated, a.unique_src_ports AS unique_src_ports, a.unique_conn_id AS unique_conn_id, a.max_keepalives AS max_keepalives, a.orphan_count AS orphan_count, a.ip_id_zero_count AS ip_id_zero_count, a.mss_1460_count AS mss_1460_count, a.count_assets AS count_assets, a.count_no_referer AS count_no_referer, a.unique_ua AS unique_ua, a.url_depth_variance AS url_depth_variance, a.count_anomalous_payload AS count_anomalous_payload, a.uniq_ja3_val AS uniq_ja3_val, a.avg_syn_ms_val AS avg_syn_ms_val, a.tls12_count AS tls12_count, a.count_head AS count_head, a.count_no_sec_fetch AS count_no_sec_fetch, a.count_generic_accept AS count_generic_accept, a.count_http10 AS count_http10, a.ip_df_variance AS ip_df_variance, a.avg_ttl_val AS avg_ttl_val, a.ttl_variance_val AS ttl_variance_val, a.count_no_wscale_val AS count_no_wscale_val, a.count_correlated_val AS count_correlated_val, a.count_no_accept_enc_val AS count_no_accept_enc_val, a.count_http_scheme_val AS count_http_scheme_val, a.count_xff_val AS count_xff_val, a.count_unusual_ct_val AS count_unusual_ct_val, a.count_non_std_port_val AS count_non_std_port_val, a.count_login_post_val AS count_login_post_val, -- colonnes de agg_header_fingerprint_1h h.header_order_hash AS header_order_hash, h.header_count AS header_count, h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie, h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, h.has_sec_ch_ua AS has_sec_ch_ua, h.ua_ch_mismatch AS ua_ch_mismatch, h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch, h.sec_fetch_mode AS sec_fetch_mode, h.sec_fetch_dest AS sec_fetch_dest, -- colonnes HTTP/2 (defaut vide/0 si pas de trafic H2) COALESCE(h2.h2_fp, '') AS h2_fp, COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord, COALESCE(h2.h2_wu_val, 0) AS h2_wu_val, -- SETTINGS individuels : NULL → -1 quand absent du preface COALESCE(h2.h2_set_1, -1) AS h2_header_table_size, COALESCE(h2.h2_set_2, -1) AS h2_enable_push, COALESCE(h2.h2_set_3, -1) AS h2_max_concurrent_streams, COALESCE(h2.h2_set_4, -1) AS h2_initial_window_size, COALESCE(h2.h2_set_5, -1) AS h2_max_frame_size, COALESCE(h2.h2_set_6, -1) AS h2_max_header_list_size, COALESCE(h2.h2_set_8, -1) AS h2_enable_connect_protocol FROM ( SELECT window_start, src_ip, ja4, host, src_asn, any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name, any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua, sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths, uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post, min(first_seen) AS first_seen, max(last_seen) AS last_seen, any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance, varPopMerge(total_ip_length_var) AS request_size_variance, any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size, IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio, any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni, max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports, uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives, sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count, sum(mss_1460_count) AS mss_1460_count, sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer, uniqMerge(uniq_ua) AS unique_ua, varPopMerge(url_depth_variance) AS url_depth_variance, sum(count_anomalous_payload) AS count_anomalous_payload, uniqMerge(uniq_ja3) AS uniq_ja3_val, avgMerge(avg_syn_ms) AS avg_syn_ms_val, sum(tls12_count) AS tls12_count, sum(count_head) AS count_head, sum(count_no_sec_fetch) AS count_no_sec_fetch, sum(count_generic_accept) AS count_generic_accept, sum(count_http10) AS count_http10, varPopMerge(ip_df_var) AS ip_df_variance, avgIfMerge(avg_ttl) AS avg_ttl_val, varPopIfMerge(ttl_var) AS ttl_variance_val, sum(count_no_wscale) AS count_no_wscale_val, sum(count_correlated) AS count_correlated_val, sum(count_no_accept_enc) AS count_no_accept_enc_val, sum(count_http_scheme) AS count_http_scheme_val, -- P1 : nouvelles features de détection sum(count_xff) AS count_xff_val, sum(count_unusual_ct) AS count_unusual_ct_val, sum(count_non_std_port) AS count_non_std_port_val, sum(count_login_post) AS count_login_post_val FROM ja4_processing.agg_host_ip_ja4_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip, ja4, host, src_asn ) a LEFT JOIN ( SELECT window_start, src_ip, any(header_order_hash) AS header_order_hash, max(header_count) AS header_count, max(has_accept_language) AS has_accept_language, max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua, max(ua_ch_mismatch) AS ua_ch_mismatch, max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start -- §2 — LEFT JOIN fingerprints HTTP/2 depuis ja4_logs.http_logs LEFT JOIN ( SELECT toStartOfHour(time) AS h2_window, toIPv6(src_ip) AS h2_ip, anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp, anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord, anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val, -- Colonnes SETTINGS individuelles (ID 1–6, 8) : -1 = absent du preface client toInt32(anyIf(toNullable(h2_header_table_size), h2_header_table_size >= 0)) AS h2_set_1, toInt32(anyIf(toNullable(h2_enable_push), h2_enable_push >= 0)) AS h2_set_2, toInt32(anyIf(toNullable(h2_max_concurrent_streams), h2_max_concurrent_streams >= 0)) AS h2_set_3, toInt64(anyIf(toNullable(h2_initial_window_size), h2_initial_window_size >= 0)) AS h2_set_4, toInt32(anyIf(toNullable(h2_max_frame_size), h2_max_frame_size >= 0)) AS h2_set_5, toInt32(anyIf(toNullable(h2_max_header_list_size), h2_max_header_list_size >= 0)) AS h2_set_6, toInt32(anyIf(toNullable(h2_enable_connect_protocol), h2_enable_connect_protocol >= 0)) AS h2_set_8 FROM ja4_logs.http_logs WHERE time >= now() - INTERVAL 24 HOUR AND (h2_fingerprint != '' OR h2_pseudo_order != '') GROUP BY toStartOfHour(time), toIPv6(src_ip) ) h2 ON h2.h2_ip = a.src_ip AND h2.h2_window = a.window_start ) ) SELECT *, -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy, sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio, -- §4 — Incohérence TLS↔H2 : JA4 identifie une famille mais H2 WINDOW_UPDATE en contredit une autre toUInt8(CASE WHEN browser_family IN ('Chromium', 'Chrome', 'Edge') AND h2_window_update_value BETWEEN 12517000 AND 12518000 THEN 1 -- Chrome JA4 / Firefox H2 WHEN browser_family IN ('Chromium', 'Chrome', 'Edge') AND h2_window_update_value BETWEEN 10485700 AND 10485820 THEN 1 -- Chrome JA4 / Safari H2 WHEN browser_family = 'Firefox' AND h2_window_update_value BETWEEN 15663000 AND 15664000 THEN 1 -- Firefox JA4 / Chrome H2 WHEN browser_family != '' AND h2_window_update_value = 0 AND h2_settings_known > 0 THEN 1 -- Navigateur JA4 / pas de WU (outil) ELSE 0 END) AS tls_h2_family_mismatch, -- §3 — Score de cohérence de fingerprint cross-layer [0.0, 1.0] toFloat32( CASE WHEN browser_family != '' THEN 0.25 ELSE 0.0 END + COALESCE(h2_ja4_coherence, 0) * 0.20 + (1 - COALESCE(alpn_http_mismatch, 0)) * 0.15 + (1 - COALESCE(sni_host_mismatch, 0)) * 0.10 + COALESCE(has_accept_language, 0) * 0.15 + (1 - COALESCE(ua_ch_mismatch, 0)) * 0.15 ) AS fingerprint_coherence_score FROM base_data;