From d098de1a6666879bd216054b01e99e939fdf19ad Mon Sep 17 00:00:00 2001 From: toto Date: Fri, 10 Apr 2026 15:15:20 +0200 Subject: [PATCH] fix(bot-detector): neutralize H2 dimensions behind proxy (X-Forwarded-For) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When has_xff=1, the H2 connection is terminated by the reverse proxy/CDN, so client H2 fingerprints are lost. Previously only D1 (h2_settings) was neutralized; D2 (window_update), D3 (pseudo_order), and D4 (priority) still penalized proxied traffic — a real Chrome behind Cloudflare scored 0.0 on 3 dimensions (45% of total weight). Now all 4 H2 dimensions return 0.5 (neutral) when has_xff>0, and non-browser H2 detection is also disabled behind proxies. Tests: 10/10 passed including 3 new XFF-specific cases. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bot_detector/browser_matcher.py | 44 +++++++++++---- .../tests/test_browser_matcher.py | 53 +++++++++++++++++-- 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/services/bot-detector/bot_detector/browser_matcher.py b/services/bot-detector/bot_detector/browser_matcher.py index 0ba7946..3b0a79f 100644 --- a/services/bot-detector/bot_detector/browser_matcher.py +++ b/services/bot-detector/bot_detector/browser_matcher.py @@ -72,8 +72,10 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series: Score 1.0 si la valeur correspond à la famille attendue. Score 0.0 si absent (= outil non-navigateur) ou mauvaise famille. + Neutre (0.5) si proxy CDN (has_xff) car le H2 client est perdu. """ sig = BROWSER_SIGNATURES[family] + has_xff = _col(df, "has_xff") wu = _col(df, "h2_window_update_value") tol = sig["h2_window_update_tolerance"] expected = sig["h2_window_update"] @@ -81,11 +83,14 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series: match = ((wu - expected).abs() <= tol).astype(float) absent = (wu == 0).astype(float) - # 0 si absent, score sinon - return pd.Series( + base = pd.Series( np.where(absent > 0, 0.0, match.values), index=df.index, ) + return pd.Series( + np.where(has_xff > 0, 0.5, base.values), + index=df.index, + ) def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series: @@ -93,20 +98,25 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series: Correspondance exacte → 1.0. Mauvais ordre mais non vide → 0.0. - Données H2 absentes → neutre 0.5. + Données H2 absentes ou proxy CDN (has_xff) → neutre 0.5. """ sig = BROWSER_SIGNATURES[family] expected = sig["pseudo_header_order"] + has_xff = _col(df, "has_xff") # Colonne h2_pseudo_ord_raw si disponible, sinon reconstruire depuis signaux atomiques if "h2_pseudo_ord_raw" in df.columns: h2_order = df["h2_pseudo_ord_raw"].fillna("").astype(str) absent = h2_order == "" match = (h2_order == expected).astype(float) - return pd.Series( + base = pd.Series( np.where(absent, 0.5, match.values), index=df.index, ) + return pd.Series( + np.where(has_xff > 0, 0.5, base.values), + index=df.index, + ) # Fallback sur les signaux atomiques précalculés en SQL if family == "Chrome": @@ -114,16 +124,21 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series: h2_order_cs = _col(df, "h2_order_chromesafari") wu = _col(df, "h2_window_update_value") chrome_wu = ((wu - 15663105).abs() <= 1000).astype(float) - return (h2_order_cs * chrome_wu).clip(0, 1) + base = (h2_order_cs * chrome_wu).clip(0, 1) elif family == "Firefox": - return _col(df, "h2_order_firefox") + base = _col(df, "h2_order_firefox") elif family == "Safari": h2_order_cs = _col(df, "h2_order_chromesafari") wu = _col(df, "h2_window_update_value") safari_wu = ((wu - 10485760).abs() <= 1000).astype(float) - return (h2_order_cs * safari_wu).clip(0, 1) + base = (h2_order_cs * safari_wu).clip(0, 1) + else: + base = pd.Series(0.5, index=df.index, dtype=float) - return pd.Series(0.5, index=df.index, dtype=float) + return pd.Series( + np.where(has_xff > 0, 0.5, base.values), + index=df.index, + ) def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series: @@ -132,9 +147,11 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series: Firefox attendait des PRIORITY frames (versions anciennes). Chrome et Safari n'en envoient pas. Score 1.0 si présence/absence correspond à l'attendu. + Neutre (0.5) si proxy CDN (has_xff) ou pas de données H2. """ sig = BROWSER_SIGNATURES[family] expected_present = sig["h2_priority_frames_expected"] + has_xff = _col(df, "has_xff") if "h2_priority_present" not in df.columns: return pd.Series(0.5, index=df.index, dtype=float) @@ -147,10 +164,14 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series: wu = _col(df, "h2_window_update_value") no_h2 = ((h2_known == 0) & (wu == 0)).astype(float) - return pd.Series( + base = pd.Series( np.where(no_h2 > 0, 0.5, match.values), index=df.index, ) + return pd.Series( + np.where(has_xff > 0, 0.5, base.values), + index=df.index, + ) def _d5_http_headers(df: pd.DataFrame, family: str) -> pd.Series: @@ -250,7 +271,9 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series: """Détecte les signatures d'outils non-navigateur (curl, httpx, go). Retourne un masque booléen. + Désactivé derrière un proxy CDN (has_xff) : le H2 observé est celui du proxy. """ + has_xff = _col(df, "has_xff") wu = _col(df, "h2_window_update_value") # go net/http a une valeur de WINDOW_UPDATE très distincte @@ -260,7 +283,8 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series: h2_present = _col(df, "h2_settings_known").astype(bool) | (wu > 0) curl_sig = (wu == 0) & h2_present - return (go_sig | curl_sig).astype(bool) + # Ignorer les signaux H2 non-browser derrière un proxy + return ((go_sig | curl_sig) & (has_xff == 0)).astype(bool) # ───────────────────────────────────────────────────────────────────────────── diff --git a/services/bot-detector/bot_detector/tests/test_browser_matcher.py b/services/bot-detector/bot_detector/tests/test_browser_matcher.py index 6b47557..cd40d45 100644 --- a/services/bot-detector/bot_detector/tests/test_browser_matcher.py +++ b/services/bot-detector/bot_detector/tests/test_browser_matcher.py @@ -249,16 +249,61 @@ def test_tls_h2_mismatch_detection(): def test_cdn_proxy_neutralizes_h2_settings(): - """Session derrière un CDN (has_xff=1) : dimension H2 SETTINGS neutralisée (0.5).""" + """Session derrière un CDN (has_xff=1) : toutes les dimensions H2 neutralisées.""" df = _chrome_session( has_xff=1, # proxy CDN détecté h2_dict_family="", # le proxy a terminé la connexion H2 — fingerprint perdu h2_settings_known=0, + h2_window_update_value=0, # pas de H2 du client ) result = run_browser_matcher(df) - # Le score peut être inférieur mais la session ne devrait pas être NON_BROWSER - # (le proxy ne rend pas le client malveillant) - assert result.loc[0, "bm_non_browser"] is False or result.loc[0, "bm_non_browser"] == False + # Le score ne devrait pas être NON_BROWSER + assert result.loc[0, "bm_non_browser"] == False + # Le score devrait rester significatif grâce aux dimensions HTTP headers + TLS + JA4 + # (H2 dimensions scores 0.5 × 4 = neutre, pas pénalisant) + score = result.loc[0, "bm_score"] + assert score > 0.2, f"Score trop bas derrière proxy: {score}" + + +def test_cdn_proxy_score_higher_than_no_proxy_without_h2(): + """Un vrai Chrome derrière CDN doit scorer mieux qu'un curl sans XFF. + + Derrière un proxy, les 4 dimensions H2 valent 0.5 (neutre) au lieu de 0.0, + donc le score total est plus élevé qu'un outil qui n'a pas de H2 du tout. + """ + # Chrome derrière CDN : H2 perdu mais headers HTTP + TLS corrects + df_proxy = _chrome_session( + has_xff=1, + h2_dict_family="", + h2_settings_known=0, + h2_window_update_value=0, + ) + # curl sans proxy : pas de H2 et pas de bons headers + df_curl = _curl_session() + + r_proxy = run_browser_matcher(df_proxy) + r_curl = run_browser_matcher(df_curl) + + proxy_score = r_proxy.loc[0, "bm_score"] + curl_score = r_curl.loc[0, "bm_score"] + assert proxy_score > curl_score, ( + f"Chrome via CDN ({proxy_score}) devrait scorer > curl ({curl_score})" + ) + + +def test_cdn_proxy_go_not_flagged_non_browser(): + """go net/http derrière un proxy CDN ne devrait pas être marqué NON_BROWSER. + + Le H2 WINDOW_UPDATE observé vient du proxy, pas du client. + """ + df = _curl_session( + has_xff=1, + h2_window_update_value=1073676289, # signature go, mais provient du proxy + h2_window_absent=0, + h2_settings_known=1, + ) + result = run_browser_matcher(df) + assert result.loc[0, "bm_non_browser"] == False def test_non_browser_go_net_http():