feat(h2): direct per-parameter SETTINGS comparison in browser_matcher

- Rewrote _d1_h2_settings() with a 3-signal weighted formula: direct_score×0.60 + dict_match×0.30 + ja4_coherence×0.10, used when the individual SETTINGS columns are available in the DataFrame
- Added _H2_SETTINGS_COLS dict (IDs 1, 2, 3, 4, 5, 6, 8 → column names)
- Fallback to dict_match×0.80 + ja4_coherence×0.20 for backward compatibility
- Fix view_ai_features_1h: pass the 7 individual SETTINGS columns through the base_data CTE (h2_header_table_size, h2_enable_push, h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size, h2_max_header_list_size, h2_enable_connect_protocol)
- Remove the non-existent h2_dict_confidence reference from the view SQL (dict_browser_h2 only exposes the browser_family attribute)
- Add 7 new pytest cases: exact match, one wrong setting, forbidden key penalty, unknown fingerprint with correct settings, fallback path, CDN proxy neutralisation, full Chrome simulation
- 53/53 bot-detector tests pass
- Update thesis §3.9.2: document the direct comparison algorithm + fallback

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -126,8 +126,6 @@ WITH base_data AS (
|
||||
-- §4 — Famille identifiée par le dictionnaire H2 (browser_matcher)
|
||||
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
|
||||
tuple(h2_fp), '') AS h2_dict_family,
|
||||
dictGetOrDefault('ja4_processing.dict_browser_h2', 'confidence',
|
||||
tuple(h2_fp), toFloat32(0.0)) AS h2_dict_confidence,
|
||||
-- §4 — Valeur brute WINDOW_UPDATE H2 (signal de famille le plus fiable)
|
||||
h2_wu_val AS h2_window_update_value,
|
||||
-- §4 — Signaux atomiques H2 pour le browser_matcher et le vecteur ML
|
||||
@@ -145,7 +143,15 @@ WITH base_data AS (
|
||||
AND arrayElement(splitByChar('|', h2_fp), 3) NOT IN ('', '0')
|
||||
) AS h2_priority_present,
|
||||
-- Valeur brute du pseudo-header order (pour le matcher Python)
|
||||
h2_pseudo_ord AS h2_pseudo_ord_raw
|
||||
h2_pseudo_ord AS h2_pseudo_ord_raw,
|
||||
-- Colonnes SETTINGS individuelles — passthrough pour _d1_h2_settings (browser_matcher)
|
||||
h2_header_table_size AS h2_header_table_size,
|
||||
h2_enable_push AS h2_enable_push,
|
||||
h2_max_concurrent_streams AS h2_max_concurrent_streams,
|
||||
h2_initial_window_size AS h2_initial_window_size,
|
||||
h2_max_frame_size AS h2_max_frame_size,
|
||||
h2_max_header_list_size AS h2_max_header_list_size,
|
||||
h2_enable_connect_protocol AS h2_enable_connect_protocol
|
||||
FROM (
|
||||
-- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8
|
||||
-- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip)
|
||||
@@ -219,7 +225,15 @@ WITH base_data AS (
|
||||
-- colonnes HTTP/2 (defaut vide/0 si pas de trafic H2)
|
||||
COALESCE(h2.h2_fp, '') AS h2_fp,
|
||||
COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord,
|
||||
COALESCE(h2.h2_wu_val, 0) AS h2_wu_val
|
||||
COALESCE(h2.h2_wu_val, 0) AS h2_wu_val,
|
||||
-- SETTINGS individuels : NULL → -1 quand absent du preface
|
||||
COALESCE(h2.h2_set_1, -1) AS h2_header_table_size,
|
||||
COALESCE(h2.h2_set_2, -1) AS h2_enable_push,
|
||||
COALESCE(h2.h2_set_3, -1) AS h2_max_concurrent_streams,
|
||||
COALESCE(h2.h2_set_4, -1) AS h2_initial_window_size,
|
||||
COALESCE(h2.h2_set_5, -1) AS h2_max_frame_size,
|
||||
COALESCE(h2.h2_set_6, -1) AS h2_max_header_list_size,
|
||||
COALESCE(h2.h2_set_8, -1) AS h2_enable_connect_protocol
|
||||
FROM (
|
||||
SELECT
|
||||
window_start, src_ip, ja4, host, src_asn,
|
||||
@@ -284,7 +298,15 @@ WITH base_data AS (
|
||||
toIPv6(src_ip) AS h2_ip,
|
||||
anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp,
|
||||
anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord,
|
||||
anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val
|
||||
anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val,
|
||||
-- Colonnes SETTINGS individuelles (ID 1–6, 8) : -1 = absent du preface client
|
||||
toInt32(anyIf(toNullable(h2_header_table_size), h2_header_table_size >= 0)) AS h2_set_1,
|
||||
toInt32(anyIf(toNullable(h2_enable_push), h2_enable_push >= 0)) AS h2_set_2,
|
||||
toInt32(anyIf(toNullable(h2_max_concurrent_streams), h2_max_concurrent_streams >= 0)) AS h2_set_3,
|
||||
toInt64(anyIf(toNullable(h2_initial_window_size), h2_initial_window_size >= 0)) AS h2_set_4,
|
||||
toInt32(anyIf(toNullable(h2_max_frame_size), h2_max_frame_size >= 0)) AS h2_set_5,
|
||||
toInt32(anyIf(toNullable(h2_max_header_list_size), h2_max_header_list_size >= 0)) AS h2_set_6,
|
||||
toInt32(anyIf(toNullable(h2_enable_connect_protocol), h2_enable_connect_protocol >= 0)) AS h2_set_8
|
||||
FROM ja4_logs.http_logs
|
||||
WHERE time >= now() - INTERVAL 24 HOUR
|
||||
AND (h2_fingerprint != '' OR h2_pseudo_order != '')
|
||||
|
||||
Reference in New Issue
Block a user