fix(bot-detector): neutralize H2 dimensions behind proxy (X-Forwarded-For)
When has_xff=1, the H2 connection is terminated by the reverse proxy/CDN, so client H2 fingerprints are lost. Previously only D1 (h2_settings) was neutralized; D2 (window_update), D3 (pseudo_order), and D4 (priority) still penalized proxied traffic — a real Chrome behind Cloudflare scored 0.0 on 3 dimensions (45% of total weight).

Now all 4 H2 dimensions return 0.5 (neutral) when has_xff>0, and non-browser H2 detection is also disabled behind proxies.

Tests: 10/10 passed including 3 new XFF-specific cases.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -72,8 +72,10 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
|
|
||||||
Score 1.0 si la valeur correspond à la famille attendue.
|
Score 1.0 si la valeur correspond à la famille attendue.
|
||||||
Score 0.0 si absent (= outil non-navigateur) ou mauvaise famille.
|
Score 0.0 si absent (= outil non-navigateur) ou mauvaise famille.
|
||||||
|
Neutre (0.5) si proxy CDN (has_xff) car le H2 client est perdu.
|
||||||
"""
|
"""
|
||||||
sig = BROWSER_SIGNATURES[family]
|
sig = BROWSER_SIGNATURES[family]
|
||||||
|
has_xff = _col(df, "has_xff")
|
||||||
wu = _col(df, "h2_window_update_value")
|
wu = _col(df, "h2_window_update_value")
|
||||||
tol = sig["h2_window_update_tolerance"]
|
tol = sig["h2_window_update_tolerance"]
|
||||||
expected = sig["h2_window_update"]
|
expected = sig["h2_window_update"]
|
||||||
@ -81,11 +83,14 @@ def _d2_h2_window(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
match = ((wu - expected).abs() <= tol).astype(float)
|
match = ((wu - expected).abs() <= tol).astype(float)
|
||||||
absent = (wu == 0).astype(float)
|
absent = (wu == 0).astype(float)
|
||||||
|
|
||||||
# 0 si absent, score sinon
|
base = pd.Series(
|
||||||
return pd.Series(
|
|
||||||
np.where(absent > 0, 0.0, match.values),
|
np.where(absent > 0, 0.0, match.values),
|
||||||
index=df.index,
|
index=df.index,
|
||||||
)
|
)
|
||||||
|
return pd.Series(
|
||||||
|
np.where(has_xff > 0, 0.5, base.values),
|
||||||
|
index=df.index,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
|
def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
|
||||||
@ -93,20 +98,25 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
|
|
||||||
Correspondance exacte → 1.0.
|
Correspondance exacte → 1.0.
|
||||||
Mauvais ordre mais non vide → 0.0.
|
Mauvais ordre mais non vide → 0.0.
|
||||||
Données H2 absentes → neutre 0.5.
|
Données H2 absentes ou proxy CDN (has_xff) → neutre 0.5.
|
||||||
"""
|
"""
|
||||||
sig = BROWSER_SIGNATURES[family]
|
sig = BROWSER_SIGNATURES[family]
|
||||||
expected = sig["pseudo_header_order"]
|
expected = sig["pseudo_header_order"]
|
||||||
|
has_xff = _col(df, "has_xff")
|
||||||
|
|
||||||
# Colonne h2_pseudo_ord_raw si disponible, sinon reconstruire depuis signaux atomiques
|
# Colonne h2_pseudo_ord_raw si disponible, sinon reconstruire depuis signaux atomiques
|
||||||
if "h2_pseudo_ord_raw" in df.columns:
|
if "h2_pseudo_ord_raw" in df.columns:
|
||||||
h2_order = df["h2_pseudo_ord_raw"].fillna("").astype(str)
|
h2_order = df["h2_pseudo_ord_raw"].fillna("").astype(str)
|
||||||
absent = h2_order == ""
|
absent = h2_order == ""
|
||||||
match = (h2_order == expected).astype(float)
|
match = (h2_order == expected).astype(float)
|
||||||
return pd.Series(
|
base = pd.Series(
|
||||||
np.where(absent, 0.5, match.values),
|
np.where(absent, 0.5, match.values),
|
||||||
index=df.index,
|
index=df.index,
|
||||||
)
|
)
|
||||||
|
return pd.Series(
|
||||||
|
np.where(has_xff > 0, 0.5, base.values),
|
||||||
|
index=df.index,
|
||||||
|
)
|
||||||
|
|
||||||
# Fallback sur les signaux atomiques précalculés en SQL
|
# Fallback sur les signaux atomiques précalculés en SQL
|
||||||
if family == "Chrome":
|
if family == "Chrome":
|
||||||
@ -114,16 +124,21 @@ def _d3_pseudo_order(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
h2_order_cs = _col(df, "h2_order_chromesafari")
|
h2_order_cs = _col(df, "h2_order_chromesafari")
|
||||||
wu = _col(df, "h2_window_update_value")
|
wu = _col(df, "h2_window_update_value")
|
||||||
chrome_wu = ((wu - 15663105).abs() <= 1000).astype(float)
|
chrome_wu = ((wu - 15663105).abs() <= 1000).astype(float)
|
||||||
return (h2_order_cs * chrome_wu).clip(0, 1)
|
base = (h2_order_cs * chrome_wu).clip(0, 1)
|
||||||
elif family == "Firefox":
|
elif family == "Firefox":
|
||||||
return _col(df, "h2_order_firefox")
|
base = _col(df, "h2_order_firefox")
|
||||||
elif family == "Safari":
|
elif family == "Safari":
|
||||||
h2_order_cs = _col(df, "h2_order_chromesafari")
|
h2_order_cs = _col(df, "h2_order_chromesafari")
|
||||||
wu = _col(df, "h2_window_update_value")
|
wu = _col(df, "h2_window_update_value")
|
||||||
safari_wu = ((wu - 10485760).abs() <= 1000).astype(float)
|
safari_wu = ((wu - 10485760).abs() <= 1000).astype(float)
|
||||||
return (h2_order_cs * safari_wu).clip(0, 1)
|
base = (h2_order_cs * safari_wu).clip(0, 1)
|
||||||
|
else:
|
||||||
|
base = pd.Series(0.5, index=df.index, dtype=float)
|
||||||
|
|
||||||
return pd.Series(0.5, index=df.index, dtype=float)
|
return pd.Series(
|
||||||
|
np.where(has_xff > 0, 0.5, base.values),
|
||||||
|
index=df.index,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
|
def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
|
||||||
@ -132,9 +147,11 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
Firefox attendait des PRIORITY frames (versions anciennes).
|
Firefox attendait des PRIORITY frames (versions anciennes).
|
||||||
Chrome et Safari n'en envoient pas.
|
Chrome et Safari n'en envoient pas.
|
||||||
Score 1.0 si présence/absence correspond à l'attendu.
|
Score 1.0 si présence/absence correspond à l'attendu.
|
||||||
|
Neutre (0.5) si proxy CDN (has_xff) ou pas de données H2.
|
||||||
"""
|
"""
|
||||||
sig = BROWSER_SIGNATURES[family]
|
sig = BROWSER_SIGNATURES[family]
|
||||||
expected_present = sig["h2_priority_frames_expected"]
|
expected_present = sig["h2_priority_frames_expected"]
|
||||||
|
has_xff = _col(df, "has_xff")
|
||||||
|
|
||||||
if "h2_priority_present" not in df.columns:
|
if "h2_priority_present" not in df.columns:
|
||||||
return pd.Series(0.5, index=df.index, dtype=float)
|
return pd.Series(0.5, index=df.index, dtype=float)
|
||||||
@ -147,10 +164,14 @@ def _d4_h2_priority(df: pd.DataFrame, family: str) -> pd.Series:
|
|||||||
wu = _col(df, "h2_window_update_value")
|
wu = _col(df, "h2_window_update_value")
|
||||||
no_h2 = ((h2_known == 0) & (wu == 0)).astype(float)
|
no_h2 = ((h2_known == 0) & (wu == 0)).astype(float)
|
||||||
|
|
||||||
return pd.Series(
|
base = pd.Series(
|
||||||
np.where(no_h2 > 0, 0.5, match.values),
|
np.where(no_h2 > 0, 0.5, match.values),
|
||||||
index=df.index,
|
index=df.index,
|
||||||
)
|
)
|
||||||
|
return pd.Series(
|
||||||
|
np.where(has_xff > 0, 0.5, base.values),
|
||||||
|
index=df.index,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _d5_http_headers(df: pd.DataFrame, family: str) -> pd.Series:
|
def _d5_http_headers(df: pd.DataFrame, family: str) -> pd.Series:
|
||||||
@ -250,7 +271,9 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series:
|
|||||||
"""Détecte les signatures d'outils non-navigateur (curl, httpx, go).
|
"""Détecte les signatures d'outils non-navigateur (curl, httpx, go).
|
||||||
|
|
||||||
Retourne un masque booléen.
|
Retourne un masque booléen.
|
||||||
|
Désactivé derrière un proxy CDN (has_xff) : le H2 observé est celui du proxy.
|
||||||
"""
|
"""
|
||||||
|
has_xff = _col(df, "has_xff")
|
||||||
wu = _col(df, "h2_window_update_value")
|
wu = _col(df, "h2_window_update_value")
|
||||||
|
|
||||||
# go net/http a une valeur de WINDOW_UPDATE très distincte
|
# go net/http a une valeur de WINDOW_UPDATE très distincte
|
||||||
@ -260,7 +283,8 @@ def _matches_non_browser(df: pd.DataFrame) -> pd.Series:
|
|||||||
h2_present = _col(df, "h2_settings_known").astype(bool) | (wu > 0)
|
h2_present = _col(df, "h2_settings_known").astype(bool) | (wu > 0)
|
||||||
curl_sig = (wu == 0) & h2_present
|
curl_sig = (wu == 0) & h2_present
|
||||||
|
|
||||||
return (go_sig | curl_sig).astype(bool)
|
# Ignorer les signaux H2 non-browser derrière un proxy
|
||||||
|
return ((go_sig | curl_sig) & (has_xff == 0)).astype(bool)
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@ -249,16 +249,61 @@ def test_tls_h2_mismatch_detection():
|
|||||||
|
|
||||||
|
|
||||||
def test_cdn_proxy_neutralizes_h2_settings():
|
def test_cdn_proxy_neutralizes_h2_settings():
|
||||||
"""Session derrière un CDN (has_xff=1) : dimension H2 SETTINGS neutralisée (0.5)."""
|
"""Session derrière un CDN (has_xff=1) : toutes les dimensions H2 neutralisées."""
|
||||||
df = _chrome_session(
|
df = _chrome_session(
|
||||||
has_xff=1, # proxy CDN détecté
|
has_xff=1, # proxy CDN détecté
|
||||||
h2_dict_family="", # le proxy a terminé la connexion H2 — fingerprint perdu
|
h2_dict_family="", # le proxy a terminé la connexion H2 — fingerprint perdu
|
||||||
h2_settings_known=0,
|
h2_settings_known=0,
|
||||||
|
h2_window_update_value=0, # pas de H2 du client
|
||||||
)
|
)
|
||||||
result = run_browser_matcher(df)
|
result = run_browser_matcher(df)
|
||||||
# Le score peut être inférieur mais la session ne devrait pas être NON_BROWSER
|
# Le score ne devrait pas être NON_BROWSER
|
||||||
# (le proxy ne rend pas le client malveillant)
|
assert result.loc[0, "bm_non_browser"] == False
|
||||||
assert result.loc[0, "bm_non_browser"] is False or result.loc[0, "bm_non_browser"] == False
|
# Le score devrait rester significatif grâce aux dimensions HTTP headers + TLS + JA4
|
||||||
|
# (H2 dimensions scores 0.5 × 4 = neutre, pas pénalisant)
|
||||||
|
score = result.loc[0, "bm_score"]
|
||||||
|
assert score > 0.2, f"Score trop bas derrière proxy: {score}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_cdn_proxy_score_higher_than_no_proxy_without_h2():
|
||||||
|
"""Un vrai Chrome derrière CDN doit scorer mieux qu'un curl sans XFF.
|
||||||
|
|
||||||
|
Derrière un proxy, les 4 dimensions H2 valent 0.5 (neutre) au lieu de 0.0,
|
||||||
|
donc le score total est plus élevé qu'un outil qui n'a pas de H2 du tout.
|
||||||
|
"""
|
||||||
|
# Chrome derrière CDN : H2 perdu mais headers HTTP + TLS corrects
|
||||||
|
df_proxy = _chrome_session(
|
||||||
|
has_xff=1,
|
||||||
|
h2_dict_family="",
|
||||||
|
h2_settings_known=0,
|
||||||
|
h2_window_update_value=0,
|
||||||
|
)
|
||||||
|
# curl sans proxy : pas de H2 et pas de bons headers
|
||||||
|
df_curl = _curl_session()
|
||||||
|
|
||||||
|
r_proxy = run_browser_matcher(df_proxy)
|
||||||
|
r_curl = run_browser_matcher(df_curl)
|
||||||
|
|
||||||
|
proxy_score = r_proxy.loc[0, "bm_score"]
|
||||||
|
curl_score = r_curl.loc[0, "bm_score"]
|
||||||
|
assert proxy_score > curl_score, (
|
||||||
|
f"Chrome via CDN ({proxy_score}) devrait scorer > curl ({curl_score})"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_cdn_proxy_go_not_flagged_non_browser():
|
||||||
|
"""go net/http derrière un proxy CDN ne devrait pas être marqué NON_BROWSER.
|
||||||
|
|
||||||
|
Le H2 WINDOW_UPDATE observé vient du proxy, pas du client.
|
||||||
|
"""
|
||||||
|
df = _curl_session(
|
||||||
|
has_xff=1,
|
||||||
|
h2_window_update_value=1073676289, # signature go, mais provient du proxy
|
||||||
|
h2_window_absent=0,
|
||||||
|
h2_settings_known=1,
|
||||||
|
)
|
||||||
|
result = run_browser_matcher(df)
|
||||||
|
assert result.loc[0, "bm_non_browser"] == False
|
||||||
|
|
||||||
|
|
||||||
def test_non_browser_go_net_http():
|
def test_non_browser_go_net_http():
|
||||||
|
|||||||
Reference in New Issue
Block a user