Files
ja4-platform/shared/clickhouse/07_ai_features_view.sql
toto f704541f83 feat(h2): direct per-parameter SETTINGS comparison in browser_matcher
- Rewrote _d1_h2_settings() with 3-signal weighted formula:
  direct_score×0.60 + dict_match×0.30 + ja4_coherence×0.10
  when individual SETTINGS cols are available in the DataFrame
- Added _H2_SETTINGS_COLS dict (IDs 1,2,3,4,5,6,8 → column names)
- Fallback to dict_match×0.80 + ja4_coherence×0.20 for backward compat
- Fix view_ai_features_1h: pass 7 individual SETTINGS columns through
  base_data CTE (h2_header_table_size, h2_enable_push,
  h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size,
  h2_max_header_list_size, h2_enable_connect_protocol)
- Remove non-existent h2_dict_confidence reference from view SQL
  (dict_browser_h2 only exposes browser_family attribute)
- Add 7 new pytest cases: exact match, one wrong setting, forbidden key
  penalty, unknown fingerprint with correct settings, fallback path,
  CDN proxy neutralisation, full Chrome simulation
- 53/53 bot-detector tests pass
- Update thesis §3.9.2: document direct comparison algorithm + fallback

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-11 03:05:36 +02:00

343 lines
21 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- =============================================================================
-- 07_ai_features_view.sql — AI feature view with full Anubis enrichment
-- §2 : Features HTTP/2 (dict_browser_h2, cohérence H2↔JA4, pseudo-headers)
-- §3 : Score de cohérence de fingerprint cross-layer
-- Note: single anonymous inner subquery with explicit aliases (fix ClickHouse 24.8
-- scope bug where PARTITION BY src_ip fails when multiple JOIN sources expose src_ip)
-- =============================================================================
CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS
WITH base_data AS (
SELECT
window_start, src_ip, ja4, host,
toString(src_asn) AS asn_number,
src_as_name AS asn_org, src_org AS asn_detail, src_domain AS asn_domain,
src_country_code AS country_code,
dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(src_asn), 'unknown') AS asn_label,
COALESCE(
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', src_ip, ''), ''),
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(ja4), ''), ''),
''
) AS bot_name,
dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AS browser_family,
-- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed)
COALESCE(
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', src_ip, ''), ''),
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(src_asn), ''), ''),
''
) AS anubis_bot_name,
COALESCE(
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', src_ip, ''), ''),
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(src_asn), ''), ''),
''
) AS anubis_bot_action,
COALESCE(
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', src_ip, ''), ''),
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(src_asn), ''), ''),
''
) AS anubis_bot_category,
hits AS hits,
sum(hits) OVER (PARTITION BY src_ip) AS total_ip_hits,
correlated AS correlated,
tcp_jitter_variance AS tcp_jitter_variance,
true_window_size AS true_window_size,
window_mss_ratio AS window_mss_ratio,
max_keepalives AS max_keepalives,
header_order_hash AS header_order_hash, header_count AS header_count,
has_accept_language AS has_accept_language, has_cookie AS has_cookie,
has_referer AS has_referer, modern_browser_score AS modern_browser_score,
has_sec_ch_ua AS has_sec_ch_ua,
ua_ch_mismatch AS ua_ch_mismatch,
(count_post / (hits + 1)) AS post_ratio,
(uniq_query_params / (uniq_paths + 1)) AS fuzzing_index,
(hits / (dateDiff('second', first_seen, last_seen) + 1)) AS hit_velocity,
(unique_src_ports / (hits + 1)) AS port_exhaustion_ratio,
(orphan_count / (hits + 1)) AS orphan_ratio,
(ip_id_zero_count / (hits + 1)) AS ip_id_zero_ratio,
(hits / (unique_conn_id + 1)) AS multiplexing_efficiency,
IF(mss_1460_count > (hits * 0.8) AND has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch,
request_size_variance AS request_size_variance,
IF(tls_alpn = 'h2' AND http_version != '2', 1, 0) AS alpn_http_mismatch,
IF(length(tls_alpn) = 0 OR tls_alpn = '00', 1, 0) AS is_alpn_missing,
IF(length(tls_sni) > 0 AND tls_sni != host, 1, 0) AS sni_host_mismatch,
IF(sec_fetch_mode = 'navigate' AND sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,
count() OVER (PARTITION BY tcp_fingerprint) AS tcp_shared_count,
count() OVER (PARTITION BY header_order_hash) AS header_order_shared_count,
(count_assets / (hits + 1)) AS asset_ratio,
(count_no_referer / (hits + 1)) AS direct_access_ratio,
IF(unique_ua > 2, 1, 0) AS is_ua_rotating,
uniqExact(ja4) OVER (PARTITION BY src_ip) AS distinct_ja4_count,
((hits / (unique_src_ports + 1)) / (dateDiff('second', first_seen, last_seen) + 1)) AS src_port_density,
(sum(hits) OVER (PARTITION BY ja4, src_asn) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_asn_concentration,
(sum(hits) OVER (PARTITION BY ja4, src_country_code) / (sum(hits) OVER (PARTITION BY ja4) + 1)) AS ja4_country_concentration,
IF(sum(hits) OVER (PARTITION BY ja4) < 100, 1, 0) AS is_rare_ja4,
(count() OVER (PARTITION BY header_order_hash, ja4) / (count() OVER (PARTITION BY ja4) + 1)) AS header_order_confidence,
uniqExact(header_order_hash) OVER (PARTITION BY src_ip) AS distinct_header_orders,
(uniq_paths / (hits + 1)) AS path_diversity_ratio,
url_depth_variance AS url_depth_variance,
(count_anomalous_payload / (hits + 1)) AS anomalous_payload_ratio,
uniq_ja3_val AS uniq_ja3_per_row,
sqrt(tcp_jitter_variance) / greatest(avg_syn_ms_val, 1) AS syn_timing_cv,
tls12_count / (hits + 1) AS tls12_ratio,
count_head / (hits + 1) AS head_ratio,
count_no_sec_fetch / (hits + 1) AS sec_fetch_absence_rate,
count_generic_accept / (hits + 1) AS generic_accept_ratio,
count_http10 / (hits + 1) AS http10_ratio,
ip_df_variance AS ip_df_variance,
avg_ttl_val AS avg_ttl,
sqrt(ttl_variance_val) AS ttl_std,
IF(count_correlated_val > 0, count_no_wscale_val / count_correlated_val, 0) AS no_window_scale_ratio,
count_no_accept_enc_val / (hits + 1) AS missing_accept_enc_ratio,
count_http_scheme_val / (hits + 1) AS http_scheme_ratio,
-- P1 : nouvelles features de détection
IF(count_xff_val > 0, 1, 0) AS has_xff,
count_unusual_ct_val / greatest(count_post, 1) AS unusual_content_type_ratio,
count_non_std_port_val / (hits + 1) AS non_standard_port_ratio,
count_login_post_val / greatest(count_post, 1) AS login_post_concentration,
sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch,
-- §2 — Features HTTP/2 (fingerprint SETTINGS, cohérence H2↔JA4, pseudo-headers)
IF(
h2_fp != '' AND
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') != '',
1, 0
) AS h2_settings_known,
CASE
WHEN h2_pseudo_ord = '' THEN 0
WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
tuple(ja4), '') IN ('Chromium', 'Chrome', 'Edge', 'Safari')
AND h2_pseudo_ord = 'm,a,s,p' THEN 1
WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
tuple(ja4), '') = 'Firefox'
AND h2_pseudo_ord = 'm,p,s,a' THEN 1
ELSE 0
END AS h2_pseudo_order_match,
IF(
h2_fp != '' AND
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') =
dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') AND
dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(ja4), '') != '',
1, 0
) AS h2_ja4_coherence,
IF(
h2_fp != '' AND
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family', tuple(h2_fp), '') = '',
1, 0
) AS h2_settings_rare,
-- §4 — Famille identifiée par le dictionnaire H2 (browser_matcher)
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
tuple(h2_fp), '') AS h2_dict_family,
-- §4 — Valeur brute WINDOW_UPDATE H2 (signal de famille le plus fiable)
h2_wu_val AS h2_window_update_value,
-- §4 — Signaux atomiques H2 pour le browser_matcher et le vecteur ML
toUInt8(h2_wu_val BETWEEN 15663000 AND 15664000) AS h2_window_chrome,
toUInt8(h2_wu_val BETWEEN 12517000 AND 12518000) AS h2_window_firefox,
toUInt8(h2_wu_val BETWEEN 10485700 AND 10485820) AS h2_window_safari,
toUInt8(h2_wu_val = 0 AND h2_fp != '') AS h2_window_absent,
-- Chrome et Safari partagent l'ordre m,a,s,p — utiliser WU pour distinguer
toUInt8(h2_pseudo_ord = 'm,a,s,p') AS h2_order_chromesafari,
toUInt8(h2_pseudo_ord = 'm,p,s,a') AS h2_order_firefox,
-- Présence de PRIORITY frames (3e champ de h2_fp, != '0' → Firefox ancien)
toUInt8(
h2_fp != ''
AND length(splitByChar('|', h2_fp)) >= 3
AND arrayElement(splitByChar('|', h2_fp), 3) NOT IN ('', '0')
) AS h2_priority_present,
-- Valeur brute du pseudo-header order (pour le matcher Python)
h2_pseudo_ord AS h2_pseudo_ord_raw,
-- Colonnes SETTINGS individuelles — passthrough pour _d1_h2_settings (browser_matcher)
h2_header_table_size AS h2_header_table_size,
h2_enable_push AS h2_enable_push,
h2_max_concurrent_streams AS h2_max_concurrent_streams,
h2_initial_window_size AS h2_initial_window_size,
h2_max_frame_size AS h2_max_frame_size,
h2_max_header_list_size AS h2_max_header_list_size,
h2_enable_connect_protocol AS h2_enable_connect_protocol
FROM (
-- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8
-- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip)
SELECT
a.window_start AS window_start,
a.src_ip AS src_ip,
a.ja4 AS ja4,
a.host AS host,
a.src_asn AS src_asn,
a.src_country_code AS src_country_code,
a.src_as_name AS src_as_name,
a.src_org AS src_org,
a.src_domain AS src_domain,
a.first_ua AS first_ua,
a.hits AS hits,
a.uniq_paths AS uniq_paths,
a.uniq_query_params AS uniq_query_params,
a.count_post AS count_post,
a.first_seen AS first_seen,
a.last_seen AS last_seen,
a.tcp_fingerprint AS tcp_fingerprint,
a.tcp_jitter_variance AS tcp_jitter_variance,
a.request_size_variance AS request_size_variance,
a.true_window_size AS true_window_size,
a.window_mss_ratio AS window_mss_ratio,
a.http_version AS http_version,
a.tls_alpn AS tls_alpn,
a.tls_sni AS tls_sni,
a.correlated AS correlated,
a.unique_src_ports AS unique_src_ports,
a.unique_conn_id AS unique_conn_id,
a.max_keepalives AS max_keepalives,
a.orphan_count AS orphan_count,
a.ip_id_zero_count AS ip_id_zero_count,
a.mss_1460_count AS mss_1460_count,
a.count_assets AS count_assets,
a.count_no_referer AS count_no_referer,
a.unique_ua AS unique_ua,
a.url_depth_variance AS url_depth_variance,
a.count_anomalous_payload AS count_anomalous_payload,
a.uniq_ja3_val AS uniq_ja3_val,
a.avg_syn_ms_val AS avg_syn_ms_val,
a.tls12_count AS tls12_count,
a.count_head AS count_head,
a.count_no_sec_fetch AS count_no_sec_fetch,
a.count_generic_accept AS count_generic_accept,
a.count_http10 AS count_http10,
a.ip_df_variance AS ip_df_variance,
a.avg_ttl_val AS avg_ttl_val,
a.ttl_variance_val AS ttl_variance_val,
a.count_no_wscale_val AS count_no_wscale_val,
a.count_correlated_val AS count_correlated_val,
a.count_no_accept_enc_val AS count_no_accept_enc_val,
a.count_http_scheme_val AS count_http_scheme_val,
a.count_xff_val AS count_xff_val,
a.count_unusual_ct_val AS count_unusual_ct_val,
a.count_non_std_port_val AS count_non_std_port_val,
a.count_login_post_val AS count_login_post_val,
-- colonnes de agg_header_fingerprint_1h
h.header_order_hash AS header_order_hash,
h.header_count AS header_count,
h.has_accept_language AS has_accept_language,
h.has_cookie AS has_cookie,
h.has_referer AS has_referer,
h.modern_browser_score AS modern_browser_score,
h.has_sec_ch_ua AS has_sec_ch_ua,
h.ua_ch_mismatch AS ua_ch_mismatch,
h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch,
h.sec_fetch_mode AS sec_fetch_mode,
h.sec_fetch_dest AS sec_fetch_dest,
-- colonnes HTTP/2 (defaut vide/0 si pas de trafic H2)
COALESCE(h2.h2_fp, '') AS h2_fp,
COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord,
COALESCE(h2.h2_wu_val, 0) AS h2_wu_val,
-- SETTINGS individuels : NULL → -1 quand absent du preface
COALESCE(h2.h2_set_1, -1) AS h2_header_table_size,
COALESCE(h2.h2_set_2, -1) AS h2_enable_push,
COALESCE(h2.h2_set_3, -1) AS h2_max_concurrent_streams,
COALESCE(h2.h2_set_4, -1) AS h2_initial_window_size,
COALESCE(h2.h2_set_5, -1) AS h2_max_frame_size,
COALESCE(h2.h2_set_6, -1) AS h2_max_header_list_size,
COALESCE(h2.h2_set_8, -1) AS h2_enable_connect_protocol
FROM (
SELECT
window_start, src_ip, ja4, host, src_asn,
any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name,
any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua,
sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths,
uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post,
min(first_seen) AS first_seen, max(last_seen) AS last_seen,
any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
varPopMerge(total_ip_length_var) AS request_size_variance,
any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni,
max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports,
uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives,
sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count,
sum(mss_1460_count) AS mss_1460_count,
sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer,
uniqMerge(uniq_ua) AS unique_ua,
varPopMerge(url_depth_variance) AS url_depth_variance,
sum(count_anomalous_payload) AS count_anomalous_payload,
uniqMerge(uniq_ja3) AS uniq_ja3_val,
avgMerge(avg_syn_ms) AS avg_syn_ms_val,
sum(tls12_count) AS tls12_count,
sum(count_head) AS count_head,
sum(count_no_sec_fetch) AS count_no_sec_fetch,
sum(count_generic_accept) AS count_generic_accept,
sum(count_http10) AS count_http10,
varPopMerge(ip_df_var) AS ip_df_variance,
avgIfMerge(avg_ttl) AS avg_ttl_val,
varPopIfMerge(ttl_var) AS ttl_variance_val,
sum(count_no_wscale) AS count_no_wscale_val,
sum(count_correlated) AS count_correlated_val,
sum(count_no_accept_enc) AS count_no_accept_enc_val,
sum(count_http_scheme) AS count_http_scheme_val,
-- P1 : nouvelles features de détection
sum(count_xff) AS count_xff_val,
sum(count_unusual_ct) AS count_unusual_ct_val,
sum(count_non_std_port) AS count_non_std_port_val,
sum(count_login_post) AS count_login_post_val
FROM ja4_processing.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY window_start, src_ip, ja4, host, src_asn
) a
LEFT JOIN (
SELECT
window_start, src_ip, any(header_order_hash) AS header_order_hash,
max(header_count) AS header_count, max(has_accept_language) AS has_accept_language,
max(has_cookie) AS has_cookie, max(has_referer) AS has_referer,
max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua,
max(ua_ch_mismatch) AS ua_ch_mismatch,
max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch,
any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest
FROM ja4_processing.agg_header_fingerprint_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY window_start, src_ip
) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start
-- §2 — LEFT JOIN fingerprints HTTP/2 depuis ja4_logs.http_logs
LEFT JOIN (
SELECT
toStartOfHour(time) AS h2_window,
toIPv6(src_ip) AS h2_ip,
anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp,
anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord,
anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val,
-- Colonnes SETTINGS individuelles (ID 16, 8) : -1 = absent du preface client
toInt32(anyIf(toNullable(h2_header_table_size), h2_header_table_size >= 0)) AS h2_set_1,
toInt32(anyIf(toNullable(h2_enable_push), h2_enable_push >= 0)) AS h2_set_2,
toInt32(anyIf(toNullable(h2_max_concurrent_streams), h2_max_concurrent_streams >= 0)) AS h2_set_3,
toInt64(anyIf(toNullable(h2_initial_window_size), h2_initial_window_size >= 0)) AS h2_set_4,
toInt32(anyIf(toNullable(h2_max_frame_size), h2_max_frame_size >= 0)) AS h2_set_5,
toInt32(anyIf(toNullable(h2_max_header_list_size), h2_max_header_list_size >= 0)) AS h2_set_6,
toInt32(anyIf(toNullable(h2_enable_connect_protocol), h2_enable_connect_protocol >= 0)) AS h2_set_8
FROM ja4_logs.http_logs
WHERE time >= now() - INTERVAL 24 HOUR
AND (h2_fingerprint != '' OR h2_pseudo_order != '')
GROUP BY toStartOfHour(time), toIPv6(src_ip)
) h2 ON h2.h2_ip = a.src_ip AND h2.h2_window = a.window_start
)
)
SELECT
*,
-(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy,
sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio,
-- §4 — Incohérence TLS↔H2 : JA4 identifie une famille mais H2 WINDOW_UPDATE en contredit une autre
toUInt8(CASE
WHEN browser_family IN ('Chromium', 'Chrome', 'Edge')
AND h2_window_update_value BETWEEN 12517000 AND 12518000 THEN 1 -- Chrome JA4 / Firefox H2
WHEN browser_family IN ('Chromium', 'Chrome', 'Edge')
AND h2_window_update_value BETWEEN 10485700 AND 10485820 THEN 1 -- Chrome JA4 / Safari H2
WHEN browser_family = 'Firefox'
AND h2_window_update_value BETWEEN 15663000 AND 15664000 THEN 1 -- Firefox JA4 / Chrome H2
WHEN browser_family != '' AND h2_window_update_value = 0
AND h2_settings_known > 0 THEN 1 -- Navigateur JA4 / pas de WU (outil)
ELSE 0
END) AS tls_h2_family_mismatch,
-- §3 — Score de cohérence de fingerprint cross-layer [0.0, 1.0]
toFloat32(
CASE WHEN browser_family != '' THEN 0.25 ELSE 0.0 END
+ COALESCE(h2_ja4_coherence, 0) * 0.20
+ (1 - COALESCE(alpn_http_mismatch, 0)) * 0.15
+ (1 - COALESCE(sni_host_mismatch, 0)) * 0.10
+ COALESCE(has_accept_language, 0) * 0.15
+ (1 - COALESCE(ua_ch_mismatch, 0)) * 0.15
) AS fingerprint_coherence_score
FROM base_data;