feat(h2): direct per-parameter SETTINGS comparison in browser_matcher

- Rewrote _d1_h2_settings() with 3-signal weighted formula:
  direct_score×0.60 + dict_match×0.30 + ja4_coherence×0.10
  when individual SETTINGS cols are available in the DataFrame
- Added _H2_SETTINGS_COLS dict (IDs 1,2,3,4,5,6,8 → column names)
- Fallback to dict_match×0.80 + ja4_coherence×0.20 for backward compat
- Fix view_ai_features_1h: pass 7 individual SETTINGS columns through
  base_data CTE (h2_header_table_size, h2_enable_push,
  h2_max_concurrent_streams, h2_initial_window_size, h2_max_frame_size,
  h2_max_header_list_size, h2_enable_connect_protocol)
- Remove non-existent h2_dict_confidence reference from view SQL
  (dict_browser_h2 only exposes browser_family attribute)
- Add 7 new pytest cases: exact match, one wrong setting, forbidden key
  penalty, unknown fingerprint with correct settings, fallback path,
  CDN proxy neutralisation, full Chrome simulation
- 53/53 bot-detector tests pass
- Update thesis §3.9.2: document direct comparison algorithm + fallback

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-11 03:05:36 +02:00
parent 95e87149aa
commit f704541f83
4 changed files with 259 additions and 45 deletions

View File

@@ -126,8 +126,6 @@ WITH base_data AS (
-- §4 — Famille identifiée par le dictionnaire H2 (browser_matcher)
dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
tuple(h2_fp), '') AS h2_dict_family,
dictGetOrDefault('ja4_processing.dict_browser_h2', 'confidence',
tuple(h2_fp), toFloat32(0.0)) AS h2_dict_confidence,
-- §4 — Valeur brute WINDOW_UPDATE H2 (signal de famille le plus fiable)
h2_wu_val AS h2_window_update_value,
-- §4 — Signaux atomiques H2 pour le browser_matcher et le vecteur ML
@@ -145,7 +143,15 @@ WITH base_data AS (
AND arrayElement(splitByChar('|', h2_fp), 3) NOT IN ('', '0')
) AS h2_priority_present,
-- Valeur brute du pseudo-header order (pour le matcher Python)
h2_pseudo_ord AS h2_pseudo_ord_raw
h2_pseudo_ord AS h2_pseudo_ord_raw,
-- Colonnes SETTINGS individuelles — passthrough pour _d1_h2_settings (browser_matcher)
h2_header_table_size AS h2_header_table_size,
h2_enable_push AS h2_enable_push,
h2_max_concurrent_streams AS h2_max_concurrent_streams,
h2_initial_window_size AS h2_initial_window_size,
h2_max_frame_size AS h2_max_frame_size,
h2_max_header_list_size AS h2_max_header_list_size,
h2_enable_connect_protocol AS h2_enable_connect_protocol
FROM (
-- Jointure unique avec aliases explicites (contournement bug scope ClickHouse 24.8
-- où PARTITION BY src_ip échoue quand plusieurs sources de JOIN exposent src_ip)
@@ -219,7 +225,15 @@ WITH base_data AS (
-- colonnes HTTP/2 (defaut vide/0 si pas de trafic H2)
COALESCE(h2.h2_fp, '') AS h2_fp,
COALESCE(h2.h2_pseudo_ord, '') AS h2_pseudo_ord,
COALESCE(h2.h2_wu_val, 0) AS h2_wu_val
COALESCE(h2.h2_wu_val, 0) AS h2_wu_val,
-- SETTINGS individuels : NULL → -1 quand absent du preface
COALESCE(h2.h2_set_1, -1) AS h2_header_table_size,
COALESCE(h2.h2_set_2, -1) AS h2_enable_push,
COALESCE(h2.h2_set_3, -1) AS h2_max_concurrent_streams,
COALESCE(h2.h2_set_4, -1) AS h2_initial_window_size,
COALESCE(h2.h2_set_5, -1) AS h2_max_frame_size,
COALESCE(h2.h2_set_6, -1) AS h2_max_header_list_size,
COALESCE(h2.h2_set_8, -1) AS h2_enable_connect_protocol
FROM (
SELECT
window_start, src_ip, ja4, host, src_asn,
@@ -284,7 +298,15 @@ WITH base_data AS (
toIPv6(src_ip) AS h2_ip,
anyIf(h2_fingerprint, h2_fingerprint != '') AS h2_fp,
anyIf(h2_pseudo_order, h2_pseudo_order != '') AS h2_pseudo_ord,
anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val
anyIf(h2_window_update, h2_window_update > 0) AS h2_wu_val,
-- Colonnes SETTINGS individuelles (ID 1-6, 8) : -1 = absent du preface client
toInt32(anyIf(toNullable(h2_header_table_size), h2_header_table_size >= 0)) AS h2_set_1,
toInt32(anyIf(toNullable(h2_enable_push), h2_enable_push >= 0)) AS h2_set_2,
toInt32(anyIf(toNullable(h2_max_concurrent_streams), h2_max_concurrent_streams >= 0)) AS h2_set_3,
toInt64(anyIf(toNullable(h2_initial_window_size), h2_initial_window_size >= 0)) AS h2_set_4,
toInt32(anyIf(toNullable(h2_max_frame_size), h2_max_frame_size >= 0)) AS h2_set_5,
toInt32(anyIf(toNullable(h2_max_header_list_size), h2_max_header_list_size >= 0)) AS h2_set_6,
toInt32(anyIf(toNullable(h2_enable_connect_protocol), h2_enable_connect_protocol >= 0)) AS h2_set_8
FROM ja4_logs.http_logs
WHERE time >= now() - INTERVAL 24 HOUR
AND (h2_fingerprint != '' OR h2_pseudo_order != '')