fix(bot-detector): neutralize H2 dimensions behind proxy (X-Forwarded-For)

When has_xff>0, the H2 connection is terminated by the reverse proxy/CDN,
so the client's H2 fingerprint is lost. Previously only D1 (h2_settings) was
neutralized; D2 (window_update), D3 (pseudo_order), and D4 (priority)
still penalized proxied traffic — a real Chrome behind Cloudflare scored
0.0 on 3 dimensions (45% of total weight).

Now all 4 H2 dimensions return 0.5 (neutral) when has_xff>0, and
non-browser H2 detection is also disabled behind proxies.

Tests: 10/10 passed, including 3 XFF-specific cases (2 new, 1 updated).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-10 15:15:20 +02:00
parent 261205028d
commit d098de1a66
2 changed files with 83 additions and 14 deletions

View File

@ -72,8 +72,10 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series:
Score 1.0 si la valeur correspond à la famille attendue.
Score 0.0 si absent (= outil non-navigateur) ou mauvaise famille.
Neutre (0.5) si proxy CDN (has_xff) car le H2 client est perdu.
"""
sig = BROWSER_SIGNATURES[family]
has_xff = _col(df, "has_xff")
wu = _col(df, "h2_window_update_value")
tol = sig["h2_window_update_tolerance"]
expected = sig["h2_window_update"]
@ -81,11 +83,14 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series:
match = ((wu - expected).abs() <= tol).astype(float)
absent = (wu == 0).astype(float)
# 0 si absent, score sinon
return pd.Series(
base = pd.Series(
np.where(absent > 0, 0.0, match.values),
index=df.index,
)
return pd.Series(
np.where(has_xff > 0, 0.5, base.values),
index=df.index,
)
def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
@ -93,20 +98,25 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
Correspondance exacte → 1.0.
Mauvais ordre mais non vide → 0.0.
Données H2 absentes → neutre 0.5.
Données H2 absentes ou proxy CDN (has_xff) → neutre 0.5.
"""
sig = BROWSER_SIGNATURES[family]
expected = sig["pseudo_header_order"]
has_xff = _col(df, "has_xff")
# Colonne h2_pseudo_ord_raw si disponible, sinon reconstruire depuis signaux atomiques
if "h2_pseudo_ord_raw" in df.columns:
h2_order = df["h2_pseudo_ord_raw"].fillna("").astype(str)
absent = h2_order == ""
match = (h2_order == expected).astype(float)
return pd.Series(
base = pd.Series(
np.where(absent, 0.5, match.values),
index=df.index,
)
return pd.Series(
np.where(has_xff > 0, 0.5, base.values),
index=df.index,
)
# Fallback sur les signaux atomiques précalculés en SQL
if family == "Chrome":
@ -114,16 +124,21 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
h2_order_cs = _col(df, "h2_order_chromesafari")
wu = _col(df, "h2_window_update_value")
chrome_wu = ((wu - 15663105).abs() <= 1000).astype(float)
return (h2_order_cs * chrome_wu).clip(0, 1)
base = (h2_order_cs * chrome_wu).clip(0, 1)
elif family == "Firefox":
return _col(df, "h2_order_firefox")
base = _col(df, "h2_order_firefox")
elif family == "Safari":
h2_order_cs = _col(df, "h2_order_chromesafari")
wu = _col(df, "h2_window_update_value")
safari_wu = ((wu - 10485760).abs() <= 1000).astype(float)
return (h2_order_cs * safari_wu).clip(0, 1)
base = (h2_order_cs * safari_wu).clip(0, 1)
else:
base = pd.Series(0.5, index=df.index, dtype=float)
return pd.Series(0.5, index=df.index, dtype=float)
return pd.Series(
np.where(has_xff > 0, 0.5, base.values),
index=df.index,
)
def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
@ -132,9 +147,11 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
Firefox attendait des PRIORITY frames (versions anciennes).
Chrome et Safari n'en envoient pas.
Score 1.0 si présence/absence correspond à l'attendu.
Neutre (0.5) si proxy CDN (has_xff) ou pas de données H2.
"""
sig = BROWSER_SIGNATURES[family]
expected_present = sig["h2_priority_frames_expected"]
has_xff = _col(df, "has_xff")
if "h2_priority_present" not in df.columns:
return pd.Series(0.5, index=df.index, dtype=float)
@ -147,10 +164,14 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
wu = _col(df, "h2_window_update_value")
no_h2 = ((h2_known == 0) & (wu == 0)).astype(float)
return pd.Series(
base = pd.Series(
np.where(no_h2 > 0, 0.5, match.values),
index=df.index,
)
return pd.Series(
np.where(has_xff > 0, 0.5, base.values),
index=df.index,
)
def _d5_http_headers(df: pd.DataFrame, family: str) -> pd.Series:
@ -250,7 +271,9 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series:
"""Détecte les signatures d'outils non-navigateur (curl, httpx, go).
Retourne un masque booléen.
Désactivé derrière un proxy CDN (has_xff) : le H2 observé est celui du proxy.
"""
has_xff = _col(df, "has_xff")
wu = _col(df, "h2_window_update_value")
# go net/http a une valeur de WINDOW_UPDATE très distincte
@ -260,7 +283,8 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series:
h2_present = _col(df, "h2_settings_known").astype(bool) | (wu > 0)
curl_sig = (wu == 0) & h2_present
return (go_sig | curl_sig).astype(bool)
# Ignorer les signaux H2 non-browser derrière un proxy
return ((go_sig | curl_sig) & (has_xff == 0)).astype(bool)
# ─────────────────────────────────────────────────────────────────────────────

View File

@ -249,16 +249,61 @@ def test_tls_h2_mismatch_detection():
def test_cdn_proxy_neutralizes_h2_settings():
"""Session derrière un CDN (has_xff=1) : dimension H2 SETTINGS neutralisée (0.5)."""
"""Session derrière un CDN (has_xff=1) : toutes les dimensions H2 neutralisées."""
df = _chrome_session(
has_xff=1, # proxy CDN détecté
h2_dict_family="", # le proxy a terminé la connexion H2 — fingerprint perdu
h2_settings_known=0,
h2_window_update_value=0, # pas de H2 du client
)
result = run_browser_matcher(df)
# Le score peut être inférieur mais la session ne devrait pas être NON_BROWSER
# (le proxy ne rend pas le client malveillant)
assert result.loc[0, "bm_non_browser"] is False or result.loc[0, "bm_non_browser"] == False
# Le score ne devrait pas être NON_BROWSER
assert result.loc[0, "bm_non_browser"] == False
# Le score devrait rester significatif grâce aux dimensions HTTP headers + TLS + JA4
# (H2 dimensions scores 0.5 × 4 = neutre, pas pénalisant)
score = result.loc[0, "bm_score"]
assert score > 0.2, f"Score trop bas derrière proxy: {score}"
def test_cdn_proxy_score_higher_than_no_proxy_without_h2():
    """A real Chrome behind a CDN must outscore a curl client without XFF.

    Behind a proxy the four H2 dimensions are worth 0.5 (neutral) instead
    of 0.0, so the total score is higher than that of a tool that has no
    H2 at all.
    """
    # Chrome behind a CDN: H2 fingerprint lost, HTTP headers + TLS still correct.
    chrome_via_cdn = _chrome_session(
        has_xff=1,
        h2_dict_family="",
        h2_settings_known=0,
        h2_window_update_value=0,
    )
    # curl without a proxy: no H2 and no browser-like headers.
    plain_curl = _curl_session()

    proxy_score = run_browser_matcher(chrome_via_cdn).loc[0, "bm_score"]
    curl_score = run_browser_matcher(plain_curl).loc[0, "bm_score"]

    assert proxy_score > curl_score, (
        f"Chrome via CDN ({proxy_score}) devrait scorer > curl ({curl_score})"
    )
def test_cdn_proxy_go_not_flagged_non_browser():
    """go net/http behind a CDN proxy should not be flagged NON_BROWSER.

    The observed H2 WINDOW_UPDATE comes from the proxy, not from the client.
    """
    proxied_session = _curl_session(
        has_xff=1,
        h2_window_update_value=1073676289,  # go signature, but emitted by the proxy
        h2_window_absent=0,
        h2_settings_known=1,
    )
    flagged = run_browser_matcher(proxied_session).loc[0, "bm_non_browser"]
    # Equality rather than identity: presumably the cell may hold a numpy
    # bool, for which `is False` would not hold — verify against the matcher.
    assert flagged == False  # noqa: E712
def test_non_browser_go_net_http():