-- =============================================================================
-- 07_ai_features_view.sql — AI feature view with full Anubis enrichment
-- Source: bot_detector/anubis/view_ai_features_anubis.sql
-- Includes combined UA+IP priority logic and Anubis bot_name/action/category.
-- =============================================================================
--
-- Layout of the statement:
--   * subquery `a`  : agg_host_ip_ja4_1h rolled up per
--                     (window_start, src_ip, ja4, host, src_asn), last 24 hours.
--   * subquery `h`  : agg_header_fingerprint_1h rolled up per
--                     (window_start, src_ip), last 24 hours.
--   * CTE base_data : `a` LEFT JOIN `h`, plus dictionary enrichment, per-row
--                     ratios and window-function features.
--   * final SELECT  : every base_data column plus two per-src_ip window features.
CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS
WITH base_data AS (
    WITH
        -- Query-local expression aliases (not output columns). They hold the
        -- Anubis lookups that the priority ladder and the three anubis_bot_*
        -- columns share, so the ladder is written exactly once.
        dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) AS anubis_ua_has_ip,
        dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) AS anubis_ua_bot_name,
        dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') AS anubis_ip_bot_name,
        dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') AS anubis_asn_bot_name,
        dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') AS anubis_country_bot_name,

        -- Anubis: combined UA+IP priority logic > UA only > IP only > ASN > Country.
        -- Yields the label of the first tier that matches, or '' when none does.
        CASE
            -- UA rule that also requires an IP: both dictionaries must match
            -- and agree on rule_id.
            WHEN anubis_ua_has_ip = '1'
                AND anubis_ua_bot_name != ''
                AND anubis_ip_bot_name != ''
                AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
                    = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
                THEN 'ua_ip'
            -- UA rule with no IP requirement.
            WHEN anubis_ua_has_ip = '0'
                AND anubis_ua_bot_name != ''
                THEN 'ua'
            -- IP rule with no UA requirement.
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
                AND anubis_ip_bot_name != ''
                THEN 'ip'
            WHEN anubis_asn_bot_name != ''
                THEN 'asn'
            WHEN anubis_country_bot_name != ''
                THEN 'country'
            ELSE ''
        END AS anubis_match_tier
    SELECT
        -- Row identity and ASN / geo attributes.
        a.window_start,
        a.src_ip,
        a.ja4,
        a.host,
        toString(a.src_asn) AS asn_number,
        a.src_as_name AS asn_org,
        a.src_org AS asn_detail,
        a.src_domain AS asn_domain,
        a.src_country_code AS country_code,
        dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,

        -- First non-empty bot name: IP dictionary first, then JA4 dictionary,
        -- otherwise ''.
        COALESCE(
            nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
            ''
        ) AS bot_name,

        -- Anubis enrichment: each column reads its attribute from the
        -- dictionary selected by anubis_match_tier ('ua_ip' and 'ua' both read
        -- from the UA dictionary).
        CASE
            WHEN anubis_match_tier IN ('ua_ip', 'ua') THEN anubis_ua_bot_name
            WHEN anubis_match_tier = 'ip' THEN anubis_ip_bot_name
            WHEN anubis_match_tier = 'asn' THEN anubis_asn_bot_name
            WHEN anubis_match_tier = 'country' THEN anubis_country_bot_name
            ELSE ''
        END AS anubis_bot_name,
        CASE
            WHEN anubis_match_tier IN ('ua_ip', 'ua')
                THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua)
            WHEN anubis_match_tier = 'ip'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', a.src_ip, '')
            WHEN anubis_match_tier = 'asn'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', toUInt32(a.src_asn), '')
            WHEN anubis_match_tier = 'country'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', a.src_country_code, '')
            ELSE ''
        END AS anubis_bot_action,
        CASE
            WHEN anubis_match_tier IN ('ua_ip', 'ua')
                THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua)
            WHEN anubis_match_tier = 'ip'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', a.src_ip, '')
            WHEN anubis_match_tier = 'asn'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', toUInt32(a.src_asn), '')
            WHEN anubis_match_tier = 'country'
                THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', a.src_country_code, '')
            ELSE ''
        END AS anubis_bot_category,

        -- Volume and TCP-level values carried over from `a`.
        a.hits AS hits,
        -- Partitioned by src_ip only (no window_start), so this spans every
        -- row the view holds for the IP.
        sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
        a.correlated AS correlated,
        a.tcp_jitter_variance AS tcp_jitter_variance,
        a.true_window_size AS true_window_size,
        a.window_mss_ratio AS window_mss_ratio,
        a.max_keepalives AS max_keepalives,

        -- Header-fingerprint values from `h`.
        -- NOTE(review): this is a LEFT JOIN; under ClickHouse's default
        -- join_use_nulls = 0, rows with no match in `h` get type-default h.*
        -- values rather than NULL, so they would all land in one
        -- header_order_hash partition below — confirm that is intended.
        h.header_order_hash AS header_order_hash,
        h.header_count AS header_count,
        h.has_accept_language AS has_accept_language,
        h.has_cookie AS has_cookie,
        h.has_referer AS has_referer,
        h.modern_browser_score AS modern_browser_score,
        h.ua_ch_mismatch AS ua_ch_mismatch,

        -- Per-row ratios. Denominators are offset by 1, which keeps them
        -- non-zero for non-negative counters.
        (a.count_post / (a.hits + 1)) AS post_ratio,
        (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index,
        (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity,
        (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio,
        (a.orphan_count / (a.hits + 1)) AS orphan_ratio,
        (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio,
        (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency,

        -- 0/1 mismatch flags.
        IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
        a.request_size_variance AS request_size_variance,
        IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
        IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
        IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
        IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,

        -- Number of rows sharing the same TCP fingerprint / header order.
        count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
        count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,

        (a.count_assets / (a.hits + 1)) AS asset_ratio,
        (a.count_no_referer / (a.hits + 1)) AS direct_access_ratio,
        IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
        uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
        ((a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS src_port_density,

        -- Share of a JA4's total hits that falls in this row's ASN / country.
        (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration,
        (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration,
        IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,

        -- Share of a first_ua's rows that use this row's header order.
        (count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence,
        uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,

        (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio,
        a.url_depth_variance AS url_depth_variance,
        (a.count_anomalous_payload / (a.hits + 1)) AS anomalous_payload_ratio,
        a.uniq_ja3_val AS uniq_ja3_per_row,
        -- Standard deviation over mean; greatest(..., 1) floors the divisor at 1.
        sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
        a.tls12_count / (a.hits + 1) AS tls12_ratio,
        a.count_head / (a.hits + 1) AS head_ratio,
        a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
        a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
        a.count_http10 / (a.hits + 1) AS http10_ratio,
        a.ip_df_variance AS ip_df_variance,
        a.avg_ttl_val AS avg_ttl,
        sqrt(a.ttl_variance_val) AS ttl_std,
        -- Divides by count_correlated_val rather than hits; 0 when that count is 0.
        IF(a.count_correlated_val > 0, a.count_no_wscale_val / a.count_correlated_val, 0) AS no_window_scale_ratio,
        a.count_no_accept_enc_val / (a.hits + 1) AS missing_accept_enc_ratio,
        a.count_http_scheme_val / (a.hits + 1) AS http_scheme_ratio
    FROM (
        -- `a`: finalises the aggregate states (-Merge) and sums the counters of
        -- agg_host_ip_ja4_1h per (window_start, src_ip, ja4, host, src_asn).
        SELECT
            window_start,
            src_ip,
            ja4,
            host,
            src_asn,
            any(src_country_code) AS src_country_code,
            any(src_as_name) AS src_as_name,
            any(src_org) AS src_org,
            any(src_domain) AS src_domain,
            any(first_ua) AS first_ua,
            sum(hits) AS hits,
            uniqMerge(uniq_paths) AS uniq_paths,
            uniqMerge(uniq_query_params) AS uniq_query_params,
            sum(count_post) AS count_post,
            min(first_seen) AS first_seen,
            max(last_seen) AS last_seen,
            any(tcp_fp_raw) AS tcp_fingerprint,
            varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
            varPopMerge(total_ip_length_var) AS request_size_variance,
            -- Raw window multiplied by 2^scale.
            any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
            IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
            any(http_ver_raw) AS http_version,
            any(tls_alpn_raw) AS tls_alpn,
            any(tls_sni_raw) AS tls_sni,
            max(correlated_raw) AS correlated,
            uniqMerge(unique_src_ports) AS unique_src_ports,
            uniqMerge(unique_conn_id) AS unique_conn_id,
            max(max_keepalives) AS max_keepalives,
            sum(orphan_count) AS orphan_count,
            sum(ip_id_zero_count) AS ip_id_zero_count,
            sum(mss_1460_count) AS mss_1460_count,
            sum(count_assets) AS count_assets,
            sum(count_no_referer) AS count_no_referer,
            uniqMerge(uniq_ua) AS unique_ua,
            varPopMerge(url_depth_variance) AS url_depth_variance,
            sum(count_anomalous_payload) AS count_anomalous_payload,
            uniqMerge(uniq_ja3) AS uniq_ja3_val,
            avgMerge(avg_syn_ms) AS avg_syn_ms_val,
            sum(tls12_count) AS tls12_count,
            sum(count_head) AS count_head,
            sum(count_no_sec_fetch) AS count_no_sec_fetch,
            sum(count_generic_accept) AS count_generic_accept,
            sum(count_http10) AS count_http10,
            varPopMerge(ip_df_var) AS ip_df_variance,
            avgIfMerge(avg_ttl) AS avg_ttl_val,
            varPopIfMerge(ttl_var) AS ttl_variance_val,
            sum(count_no_wscale) AS count_no_wscale_val,
            sum(count_correlated) AS count_correlated_val,
            sum(count_no_accept_enc) AS count_no_accept_enc_val,
            sum(count_http_scheme) AS count_http_scheme_val
        FROM mabase_prod.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY
            window_start,
            src_ip,
            ja4,
            host,
            src_asn
    ) AS a
    LEFT JOIN (
        -- `h`: header-fingerprint attributes per (window_start, src_ip).
        SELECT
            window_start,
            src_ip,
            any(header_order_hash) AS header_order_hash,
            max(header_count) AS header_count,
            max(has_accept_language) AS has_accept_language,
            max(has_cookie) AS has_cookie,
            max(has_referer) AS has_referer,
            max(modern_browser_score) AS modern_browser_score,
            max(ua_ch_mismatch) AS ua_ch_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode,
            any(sec_fetch_dest) AS sec_fetch_dest
        FROM mabase_prod.agg_header_fingerprint_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY
            window_start,
            src_ip
    ) AS h
        ON a.src_ip = h.src_ip
        AND a.window_start = h.window_start
)
SELECT
    *,
    -- Entropy of how an IP's hits are spread over its rows:
    -- -sum(p * log2(p + 0.000001)) with p = hits / (total_ip_hits + 1).
    -- The 0.000001 offset keeps log2 away from 0.
    -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy,
    -- Sum of per-row JA3 counts for the IP, divided by its distinct JA4 count
    -- (floored at 1).
    sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
FROM base_data;