feat(bot-detector): Browser Signature Detection engine (parallel mode)
Étape A — browser_signatures.py
Données pures : BROWSER_SIGNATURES (Chrome/Firefox/Safari), NON_BROWSER_SIGNATURES
(curl/httpx/go), BROWSER_THRESHOLDS, DIMENSION_WEIGHTS. Valeurs H2 extraites
des captures réelles (format Akamai avec virgules, et non points-virgules).
Étape B — browser_matcher.py
Moteur vectorisé 7 dimensions (H2 SETTINGS 0.30, WINDOW_UPDATE 0.15,
pseudo-header order 0.15, H2 PRIORITY 0.10, HTTP headers 0.15, TLS 0.10,
JA4 dict 0.05). run_browser_matcher(df) ajoute bm_family/bm_score/bm_decision.
CDN edge case : dimension H2 neutralisée (0.5) si has_xff=1.
BROWSER_MATCHER_REPLACE=false par défaut (mode DUAL_MODE logging uniquement).
Étape C — 06_browser_signature_detection.sql (migration)
Crée browser_h2_signatures (table MergeTree avec 12 fingerprints de référence).
Recrée dict_browser_h2 depuis la table avec champ confidence (remplace CSV).
Étape D — 07_ai_features_view.sql
+h2_wu_val dans le JOIN http_logs, +h2_window_update_value, +h2_dict_family,
+h2_dict_confidence, +h2_window_{chrome,firefox,safari,absent},
+h2_order_{chromesafari,firefox}, +h2_priority_present, +h2_pseudo_ord_raw,
+tls_h2_family_mismatch (détection incohérence famille JA4 vs famille H2).
Étape E — preprocessing.py + pipeline.py
preprocessing.py: appelle run_browser_matcher() après compute_browser_axes(),
ajoute 7 nouvelles features binaires H2 à FEATURES et binary_features.
pipeline.py: appelle log_dual_mode_comparison() après la classification A9.
BROWSER_MATCHER_REPLACE=true active le remplacement du bypass.
Étape F — test_browser_matcher.py
8 tests : Chrome/Firefox/Safari full match, curl rejeté, httpcloak partiel,
TLS↔H2 mismatch, CDN proxy neutralisation, go net/http rejeté.
Tous 8 PASSED (+ 36 tests existants inchangés).
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
272
services/bot-detector/bot_detector/tests/test_browser_matcher.py
Normal file
272
services/bot-detector/bot_detector/tests/test_browser_matcher.py
Normal file
@ -0,0 +1,272 @@
|
||||
"""Tests unitaires pour browser_matcher.py.
|
||||
|
||||
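Vérifie les cas de base suivants (le fichier contient 8 tests ; les cas « CDN proxy » et « go net/http » ne sont pas listés ici) :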
|
||||
1. Chrome full match → famille Chrome, score ≥ 0.72 → LEGITIMATE_BROWSER
|
||||
2. Firefox full match → famille Firefox, score ≥ 0.68 → LEGITIMATE_BROWSER
|
||||
3. Safari full match → famille Safari, score ≥ 0.68 → LEGITIMATE_BROWSER
|
||||
4. curl rejeté → NON_BROWSER (pas de bypass)
|
||||
5. httpcloak partial → score < seuil, PARTIAL
|
||||
6. TLS↔H2 mismatch → tls_h2_family_mismatch calculé correctement
|
||||
|
||||
Les tests sont auto-contenus : ils construisent des DataFrames pandas
|
||||
représentant des sessions fictives sans interagir avec ClickHouse.
|
||||
"""
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Les importations du module principal sont testées directement.
|
||||
from bot_detector.browser_signatures import BROWSER_SIGNATURES, BROWSER_THRESHOLDS
|
||||
from bot_detector.browser_matcher import (
|
||||
run_browser_matcher,
|
||||
_compute_family_score,
|
||||
_matches_non_browser,
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Helpers
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _chrome_session(**overrides: object) -> pd.DataFrame:
    """Build a one-row DataFrame describing a perfect Chrome session.

    The defaults below form a complete Chrome fingerprint. Any keyword
    argument replaces (or adds) the column of the same name.

    Args:
        **overrides: Column values that take precedence over the defaults.

    Returns:
        A single-row DataFrame with one column per key.
    """
    base = {
        "src_ip": "1.2.3.4",
        "ja4": "t13d1517h2_8daaf6152771_e5627efa2ab1",
        "browser_family": "Chromium",
        "h2_dict_family": "Chrome",
        "h2_dict_confidence": 1.0,
        "h2_window_update_value": 15663105,
        "h2_order_chromesafari": 1,
        "h2_order_firefox": 0,
        "h2_window_chrome": 1,
        "h2_window_firefox": 0,
        "h2_window_safari": 0,
        "h2_window_absent": 0,
        "h2_priority_present": 0,
        "h2_settings_known": 1,
        "h2_ja4_coherence": 1,
        "h2_settings_rare": 0,
        "has_accept_language": 1,
        "has_sec_ch_ua": 1,
        "sec_fetch_absence_rate": 0.0,
        "modern_browser_score": 1.0,
        "has_xff": 0,
        "tls12_ratio": 0.0,
    }
    base.update(overrides)
    return pd.DataFrame([base])
|
||||
|
||||
|
||||
def _firefox_session(**overrides: object) -> pd.DataFrame:
    """Build a one-row DataFrame describing a perfect Firefox session.

    Any keyword argument replaces (or adds) the column of the same name.

    Args:
        **overrides: Column values that take precedence over the defaults.

    Returns:
        A single-row DataFrame with one column per key.
    """
    base = {
        "src_ip": "2.3.4.5",
        "ja4": "t13d1516h2_8daaf6152771_e5627efa2ab2",
        "browser_family": "Firefox",
        "h2_dict_family": "Firefox",
        "h2_dict_confidence": 1.0,
        "h2_window_update_value": 12517377,
        "h2_order_chromesafari": 0,
        "h2_order_firefox": 1,
        "h2_window_chrome": 0,
        "h2_window_firefox": 1,
        "h2_window_safari": 0,
        "h2_window_absent": 0,
        "h2_priority_present": 0,
        "h2_settings_known": 1,
        "h2_ja4_coherence": 1,
        "h2_settings_rare": 0,
        "has_accept_language": 1,
        "has_sec_ch_ua": 0,  # Firefox does not send Sec-CH-UA
        "sec_fetch_absence_rate": 0.0,
        "modern_browser_score": 1.0,
        "has_xff": 0,
        "tls12_ratio": 0.0,
    }
    base.update(overrides)
    return pd.DataFrame([base])
|
||||
|
||||
|
||||
def _safari_session(**overrides: object) -> pd.DataFrame:
    """Build a one-row DataFrame describing a perfect Safari session.

    Any keyword argument replaces (or adds) the column of the same name.

    Args:
        **overrides: Column values that take precedence over the defaults.

    Returns:
        A single-row DataFrame with one column per key.
    """
    base = {
        "src_ip": "3.4.5.6",
        "ja4": "t13d1516h2_8daaf6152771_e5627efa2ab3",
        "browser_family": "Safari",
        "h2_dict_family": "Safari",
        "h2_dict_confidence": 1.0,
        "h2_window_update_value": 10485760,
        "h2_order_chromesafari": 1,  # m,a,s,p order is shared with Chrome
        "h2_order_firefox": 0,
        "h2_window_chrome": 0,
        "h2_window_firefox": 0,
        "h2_window_safari": 1,
        "h2_window_absent": 0,
        "h2_priority_present": 0,
        "h2_settings_known": 1,
        "h2_ja4_coherence": 1,
        "h2_settings_rare": 0,
        "has_accept_language": 1,
        "has_sec_ch_ua": 0,
        "sec_fetch_absence_rate": 1.0,  # Safari does not send Sec-Fetch-*
        "modern_browser_score": 0.8,
        "has_xff": 0,
        "tls12_ratio": 0.0,
    }
    base.update(overrides)
    return pd.DataFrame([base])
|
||||
|
||||
|
||||
def _curl_session(**overrides: object) -> pd.DataFrame:
    """Build a one-row DataFrame describing a curl session.

    The defaults model a tool signature: WINDOW_UPDATE is absent (value 0,
    ``h2_window_absent`` = 1). Any keyword argument replaces (or adds) the
    column of the same name.

    Args:
        **overrides: Column values that take precedence over the defaults.

    Returns:
        A single-row DataFrame with one column per key.
    """
    base = {
        "src_ip": "4.5.6.7",
        "ja4": "t13d0606h2_915bc7144b37_curl",
        "browser_family": "",
        "h2_dict_family": "",
        "h2_dict_confidence": 0.0,
        "h2_window_update_value": 0,
        "h2_order_chromesafari": 0,
        "h2_order_firefox": 0,
        "h2_window_chrome": 0,
        "h2_window_firefox": 0,
        "h2_window_safari": 0,
        "h2_window_absent": 1,
        "h2_priority_present": 0,
        "h2_settings_known": 1,  # curl sends SETTINGS but no WINDOW_UPDATE
        "h2_ja4_coherence": 0,
        "h2_settings_rare": 1,
        "has_accept_language": 0,
        "has_sec_ch_ua": 0,
        "sec_fetch_absence_rate": 1.0,
        "modern_browser_score": 0.1,
        "has_xff": 0,
        "tls12_ratio": 0.1,
    }
    base.update(overrides)
    return pd.DataFrame([base])
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Tests
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_chrome_full_match():
    """Perfect Chrome fingerprint: score >= 0.72 and LEGITIMATE_BROWSER."""
    df = _chrome_session()
    result = run_browser_matcher(df)
    assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER"
    assert result.loc[0, "bm_family"] == "Chrome"
    assert result.loc[0, "bm_score"] >= 0.72
|
||||
|
||||
|
||||
def test_firefox_full_match():
    """Perfect Firefox fingerprint: score >= 0.68 and LEGITIMATE_BROWSER."""
    df = _firefox_session()
    result = run_browser_matcher(df)
    assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER"
    assert result.loc[0, "bm_family"] == "Firefox"
    assert result.loc[0, "bm_score"] >= 0.68
|
||||
|
||||
|
||||
def test_safari_full_match():
    """Safari fingerprint is accepted as LEGITIMATE_BROWSER.

    The fixture has no Sec-Fetch headers, WINDOW_UPDATE = 10485760 and an
    H2 dictionary family of "Safari"; the score must be >= 0.68.
    """
    df = _safari_session()
    result = run_browser_matcher(df)
    assert result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER"
    assert result.loc[0, "bm_family"] == "Safari"
    assert result.loc[0, "bm_score"] >= 0.68
|
||||
|
||||
|
||||
def test_curl_rejected():
    """curl (H2 present, WINDOW_UPDATE absent) is NON_BROWSER: no bypass."""
    df = _curl_session()
    result = run_browser_matcher(df)
    assert result.loc[0, "bm_decision"] == "NON_BROWSER"
    assert result.loc[0, "bm_family"] == ""
    # Equality, not identity: a value read from a DataFrame may be a
    # numpy.bool_, which is never the `True` singleton. The former
    # `is True or == True` reduced to this single comparison.
    assert result.loc[0, "bm_non_browser"] == True  # noqa: E712
|
||||
|
||||
|
||||
def test_httpcloak_partial():
    """httpcloak: Chrome SETTINGS but a non-Chrome pseudo-header order.

    The session keeps Chrome's H2 dictionary family and WINDOW_UPDATE value,
    but its pseudo-header order, JA4 family and Sec-CH-UA presence do not
    match Chrome. The score must stay below the Chrome threshold and the
    decision must not be LEGITIMATE_BROWSER.
    """
    df = _chrome_session(
        h2_dict_family="Chrome",
        h2_window_update_value=15663105,  # Chrome WINDOW_UPDATE value
        h2_order_chromesafari=0,  # wrong pseudo-header order (not m,a,s,p)
        h2_order_firefox=1,  # looks like Firefox here
        browser_family="",  # unknown JA4
        has_sec_ch_ua=0,  # Sec-CH-UA absent (a required header is missing)
    )
    result = run_browser_matcher(df)
    # Must be below the Chrome threshold.
    assert result.loc[0, "bm_score"] < BROWSER_THRESHOLDS["Chrome"]
    # Any decision other than LEGITIMATE_BROWSER is acceptable here.
    assert result.loc[0, "bm_decision"] in ("PARTIAL", "UNKNOWN", "NON_BROWSER")
|
||||
|
||||
|
||||
def test_tls_h2_mismatch_detection():
    """JA4 says Chrome while every H2 signal says Firefox.

    The SQL column ``tls_h2_family_mismatch`` is computed in the view, so it
    is not available here; this test checks the Python-side scoring instead.
    The Firefox family score must exceed the Chrome one, and the session
    must not end up classified as a legitimate Chrome browser.
    """
    chrome_ja4_firefox_wu = {
        "browser_family": "Chromium",  # JA4 says Chrome
        "h2_window_update_value": 12517377,  # Firefox WINDOW_UPDATE value
        "h2_window_chrome": 0,
        "h2_window_firefox": 1,
        "h2_window_safari": 0,
        "h2_window_absent": 0,
        "h2_order_chromesafari": 0,
        "h2_order_firefox": 1,
        "h2_dict_family": "Firefox",  # H2 dictionary confirms Firefox
        "h2_dict_confidence": 1.0,
        "h2_settings_known": 1,
        "h2_ja4_coherence": 0,  # JA4 vs H2 inconsistency
        "has_accept_language": 1,
        "has_sec_ch_ua": 1,
        "sec_fetch_absence_rate": 0.0,
        "has_xff": 0,
        "tls12_ratio": 0.0,
    }
    df = pd.DataFrame([chrome_ja4_firefox_wu])
    # Run the matcher before the per-family scores, as the original test did.
    result = run_browser_matcher(df)

    chrome_score = _compute_family_score(df, "Chrome").iloc[0]
    firefox_score = _compute_family_score(df, "Firefox").iloc[0]

    # H2 coherence should make Firefox score higher than Chrome.
    assert firefox_score > chrome_score, (
        f"Firefox score ({firefox_score:.3f}) devrait dépasser Chrome ({chrome_score:.3f})"
    )
    # Mismatch detection: a legitimate verdict must not name Chrome.
    if result.loc[0, "bm_decision"] == "LEGITIMATE_BROWSER":
        assert result.loc[0, "bm_family"] != "Chrome", (
            "Une session avec JA4=Chrome mais H2=Firefox ne devrait pas être classée Chrome légitime"
        )
|
||||
|
||||
|
||||
def test_cdn_proxy_neutralizes_h2_settings():
    """A session behind a CDN (has_xff=1) is not flagged as non-browser.

    The fixture drops the H2 dictionary family and marks SETTINGS as
    unknown, as if a proxy had terminated the H2 connection. Only the
    ``bm_non_browser`` flag is asserted; the neutral 0.5 weight of the H2
    SETTINGS dimension is not checked directly.
    """
    df = _chrome_session(
        has_xff=1,  # CDN proxy detected
        h2_dict_family="",  # proxy terminated the H2 connection: fingerprint lost
        h2_settings_known=0,
    )
    result = run_browser_matcher(df)
    # The score may be lower, but a proxy does not make the client malicious.
    # Equality, not identity: a value read from a DataFrame may be a
    # numpy.bool_, which is never the `False` singleton.
    assert result.loc[0, "bm_non_browser"] == False  # noqa: E712
|
||||
|
||||
|
||||
def test_non_browser_go_net_http():
    """go net/http (WINDOW_UPDATE = 1073676289) is flagged as non-browser."""
    df = _curl_session(
        h2_window_update_value=1073676289,
        h2_window_absent=0,
        h2_settings_known=1,
    )
    result = run_browser_matcher(df)
    # Equality, not identity: a value read from a DataFrame may be a
    # numpy.bool_, which is never the `True` singleton.
    assert result.loc[0, "bm_non_browser"] == True  # noqa: E712
|
||||
Reference in New Issue
Block a user