fix(bot-detector): neutralize H2 dimensions behind proxy (X-Forwarded-For)

When has_xff>0, the request arrived through a reverse proxy/CDN that
terminated the client's H2 connection, so the client's H2 fingerprints are
lost. Previously only D1 (h2_settings) was neutralized; D2 (window_update),
D3 (pseudo_order), and D4 (priority) still penalized proxied traffic — a real
Chrome behind Cloudflare scored 0.0 on 3 dimensions (45% of total weight).

Now all 4 H2 dimensions return 0.5 (neutral) when has_xff>0, and
non-browser H2 detection is also disabled behind proxies.

Tests: 10/10 passed including 3 new XFF-specific cases.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-10 15:15:20 +02:00
parent 261205028d
commit d098de1a66
2 changed files with 83 additions and 14 deletions

View File

@ -249,16 +249,61 @@ def test_tls_h2_mismatch_detection():
def test_cdn_proxy_neutralizes_h2_settings():
    """Check that all H2 dimensions are neutralized for a session behind a CDN.

    With ``has_xff=1`` the proxy has terminated the client's H2 connection, so
    the session carries no client H2 fingerprint. Such a session must not be
    flagged as non-browser and must keep a score above 0.2.
    """
    df = _chrome_session(
        has_xff=1,  # CDN proxy detected
        h2_dict_family="",  # the proxy terminated the H2 connection: fingerprint lost
        h2_settings_known=0,
        h2_window_update_value=0,  # no H2 data from the client
    )
    result = run_browser_matcher(df)

    # The score may be lower, but the session must not be NON_BROWSER: sitting
    # behind a proxy does not make the client malicious.
    # NOTE(review): `== False` rather than `is False` looks deliberate --
    # presumably the cell is a NumPy boolean, for which an identity check
    # against the `False` singleton would not match. Confirm before changing.
    assert result.loc[0, "bm_non_browser"] == False  # noqa: E712

    # The score should stay meaningful thanks to the HTTP headers + TLS + JA4
    # dimensions (the 4 H2 dimensions each score 0.5: neutral, not penalizing).
    score = result.loc[0, "bm_score"]
    assert score > 0.2, f"Score trop bas derrière proxy: {score}"
def test_cdn_proxy_score_higher_than_no_proxy_without_h2():
    """Check that a genuine Chrome behind a CDN outscores curl without XFF.

    Behind a proxy the 4 H2 dimensions are worth 0.5 (neutral) instead of 0.0,
    so the total score is higher than that of a tool with no H2 at all.
    """
    # Chrome behind a CDN: H2 is lost, but HTTP headers and TLS are correct.
    proxied_chrome = _chrome_session(
        has_xff=1,
        h2_dict_family="",
        h2_settings_known=0,
        h2_window_update_value=0,
    )
    # curl without a proxy: no H2 and no good headers.
    direct_curl = _curl_session()

    proxy_score = run_browser_matcher(proxied_chrome).loc[0, "bm_score"]
    curl_score = run_browser_matcher(direct_curl).loc[0, "bm_score"]

    assert proxy_score > curl_score, (
        f"Chrome via CDN ({proxy_score}) devrait scorer > curl ({curl_score})"
    )
def test_cdn_proxy_go_not_flagged_non_browser():
    """Check that go net/http behind a CDN proxy is not marked NON_BROWSER.

    The observed H2 WINDOW_UPDATE comes from the proxy, not from the client.
    """
    session_overrides = {
        "has_xff": 1,
        # go signature, but it originates from the proxy
        "h2_window_update_value": 1073676289,
        "h2_window_absent": 0,
        "h2_settings_known": 1,
    }
    matched = run_browser_matcher(_curl_session(**session_overrides))

    flagged = matched.loc[0, "bm_non_browser"]
    # NOTE(review): equality (not identity) is kept on purpose -- presumably
    # the cell is a NumPy boolean rather than the `False` singleton; confirm.
    assert flagged == False  # noqa: E712
def test_non_browser_go_net_http():