refactor: suppression dépendance User-Agent de la détection navigateur
Changements SQL :
- modern_browser_score : sec-ch-ua→100, Sec-Fetch→70 (plus de UA fallback)
- Ajout has_sec_ch_ua (UInt8) dans agg_header_fingerprint_1h et ml_all_scores
- mss_mobile_mismatch utilise has_sec_ch_ua au lieu de modern_browser_score
- header_order_confidence : PARTITION BY ja4 au lieu de first_ua
- sec_ch_mobile_mismatch : comparaison Client Hints interne (sans UA)
- Migration 03_remove_ua_browser_detection.sql

Changements Python :
- browser.py Axe 3 : Client Hints + Sec-Fetch + is_fake_navigation (PAS de UA)
- Pondération axes : ja4_known 0.30, tls_coherence 0.20 (signaux TLS renforcés)
- preprocessing.py : has_sec_ch_ua ajouté aux features et binary_features

Fichiers modifiés : 8 SQL/Python + 1 migration, 36/36 tests passent.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -1,8 +1,8 @@
|
||||
"""A9 — Identification multifactorielle des navigateurs.
|
||||
|
||||
5 axes indépendants combinés pour identifier les navigateurs légitimes.
|
||||
Chaque axe produit un score [0,1]. La combinaison pondérée produit
|
||||
browser_confidence [0,1], remplaçant l'ancien système binaire JA4-only.
|
||||
Aucun axe ne repose sur le User-Agent (trivialement falsifiable).
|
||||
Signaux principaux : JA4 (TLS), Client Hints, Sec-Fetch-*, comportement.
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
@ -22,10 +22,11 @@ _BROWSER_JA4_PROFILES = {
|
||||
}
|
||||
|
||||
# Pondération des 5 axes pour le score browser_confidence.
|
||||
# Favorise les signaux TLS (difficiles à falsifier) sur HTTP.
|
||||
_AXIS_WEIGHTS = {
|
||||
'ja4_known': 0.25, # Axe 1 — Signature JA4 dans dict_browser_ja4
|
||||
'ja4_known': 0.30, # Axe 1 — Signature JA4 dans dict_browser_ja4 (TLS fingerprint)
|
||||
'ja4_struct': 0.15, # Axe 2 — Structure JA4 (TLS1.3, h2, nb ciphers/ext)
|
||||
'http_modern': 0.25, # Axe 3 — Headers HTTP modernes (sec-ch-ua, Sec-Fetch-*)
|
||||
'http_modern': 0.20, # Axe 3 — Client Hints + Sec-Fetch-* (PAS de User-Agent)
|
||||
'nav_behavior': 0.15, # Axe 4 — Comportement de navigation (assets, referers)
|
||||
'tls_coherence': 0.20, # Axe 5 — Cohérence TLS/TCP (pas de mismatch)
|
||||
}
|
||||
@ -80,18 +81,21 @@ def _compute_browser_axes(df: pd.DataFrame) -> pd.DataFrame:
|
||||
is_tls13 * 0.35 + is_h2h3 * 0.25 + ciphers_ok * 0.20 + exts_ok * 0.20
|
||||
)
|
||||
|
||||
# ── Axe 3 — Headers HTTP modernes ──
|
||||
mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0)
|
||||
# ── Axe 3 — Signaux HTTP sans User-Agent ──
|
||||
# Client Hints (sec-ch-ua) = signal fort, nécessite un vrai moteur Chromium.
|
||||
# Sec-Fetch-* = signal moyen, envoyé automatiquement par les navigateurs modernes.
|
||||
# Le User-Agent n'est PAS utilisé (trivialement falsifiable par n'importe quel bot).
|
||||
scu = df.get('has_sec_ch_ua', pd.Series(0, index=df.index)).fillna(0)
|
||||
hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0)
|
||||
sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1)
|
||||
gar = df.get('generic_accept_ratio', pd.Series(1, index=df.index)).fillna(1)
|
||||
uam = df.get('ua_ch_mismatch', pd.Series(0, index=df.index)).fillna(0)
|
||||
ifn = df.get('is_fake_navigation', pd.Series(0, index=df.index)).fillna(0)
|
||||
axes['axis_http_modern'] = (
|
||||
(mbs >= 50).astype(float) * 0.35
|
||||
+ (hal > 0).astype(float) * 0.20
|
||||
+ (sfa < 0.3).astype(float) * 0.25
|
||||
+ (gar < 0.3).astype(float) * 0.10
|
||||
+ (uam == 0).astype(float) * 0.10
|
||||
(scu > 0).astype(float) * 0.30
|
||||
+ (sfa < 0.2).astype(float) * 0.30
|
||||
+ (hal > 0).astype(float) * 0.15
|
||||
+ (gar < 0.3).astype(float) * 0.15
|
||||
+ (ifn == 0).astype(float) * 0.10
|
||||
)
|
||||
|
||||
# ── Axe 4 — Comportement de navigation ──
|
||||
|
||||
@ -20,7 +20,7 @@ FEATURES = [
|
||||
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio',
|
||||
'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count',
|
||||
'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
|
||||
'modern_browser_score', 'ua_ch_mismatch', 'ip_id_zero_ratio',
|
||||
'modern_browser_score', 'ua_ch_mismatch', 'has_sec_ch_ua', 'ip_id_zero_ratio',
|
||||
'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch',
|
||||
'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count',
|
||||
'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4',
|
||||
@ -103,7 +103,8 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
|
||||
# Imputation intelligente
|
||||
binary_features = {
|
||||
'has_accept_language', 'has_cookie', 'has_referer', 'ua_ch_mismatch',
|
||||
'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
|
||||
'ua_ch_mismatch',
|
||||
'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
|
||||
'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
|
||||
'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
|
||||
|
||||
@ -606,23 +606,23 @@ def test_browser_consistency_score_range():
|
||||
"""browser_consistency_score is in [0, 5] and sums 5 binary signals."""
|
||||
df = pd.DataFrame({
|
||||
'browser_family': ['Chromium', '', 'Firefox', 'Safari'],
|
||||
'modern_browser_score': [100, 0, 80, 50],
|
||||
'has_sec_ch_ua': [1, 0, 1, 0],
|
||||
'has_accept_language': [1, 0, 1, 1],
|
||||
'has_cookie': [1, 0, 1, 0],
|
||||
'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6],
|
||||
})
|
||||
is_known = (df['browser_family'] != '').astype(int)
|
||||
mbs_ok = (df['modern_browser_score'] >= 50).astype(int)
|
||||
scu_ok = (df['has_sec_ch_ua'] > 0).astype(int)
|
||||
hal_ok = (df['has_accept_language'] > 0).astype(int)
|
||||
hck_ok = (df['has_cookie'] > 0).astype(int)
|
||||
sfa_ok = (df['sec_fetch_absence_rate'] < 0.5).astype(int)
|
||||
bcs = is_known + mbs_ok + hal_ok + hck_ok + sfa_ok
|
||||
bcs = is_known + scu_ok + hal_ok + hck_ok + sfa_ok
|
||||
|
||||
assert bcs.min() >= 0 and bcs.max() <= 5
|
||||
assert bcs.iloc[0] == 5, "Chromium with all signals should score 5"
|
||||
assert bcs.iloc[1] == 0, "Empty browser with no signals should score 0"
|
||||
assert bcs.iloc[2] == 5, "Firefox with all signals should score 5"
|
||||
assert bcs.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
|
||||
assert bcs.iloc[3] == 2, "Safari without CH/cookie and high sec_fetch_absence should score 2"
|
||||
|
||||
|
||||
def test_legitimate_browser_classification_threshold():
|
||||
|
||||
Reference in New Issue
Block a user