feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency

Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5]
  combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language,
  cookies, Sec-Fetch-* presence
- Post-scoring: sessions with known browser JA4 + consistency >= 4/5 + NORMAL/LOW
  threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: inconsistent behavior (known JA4 but low consistency) stays
  in normal anomaly scoring — prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints,
  legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 15:46:22 +02:00
parent 7d09c614c3
commit 9a48fb9d29
4 changed files with 215 additions and 7 deletions

View File

@ -587,3 +587,113 @@ def test_xgb_save_load_model():
np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
finally:
os.remove(path)
# ═══════════════════════════════════════════════════════════════════════════════
# Tests for legitimate browser classification (A9)
# ═══════════════════════════════════════════════════════════════════════════════
def test_is_known_browser_feature():
    """Check that is_known_browser is 1 for a non-empty browser_family, else 0.

    The feature is derived inline in this test (non-empty string -> 1,
    empty string -> 0); no production feature builder is called here.
    """
    families = ['Chromium', 'Firefox', '', '', 'Safari']
    frame = pd.DataFrame({'browser_family': families})

    # An empty family string means no known browser was matched.
    frame['is_known_browser'] = frame['browser_family'].ne('').astype(int)

    expected_flags = [1, 1, 0, 0, 1]
    assert frame['is_known_browser'].tolist() == expected_flags
def test_browser_consistency_score_range():
    """Check that browser_consistency_score sums five binary signals into [0, 5].

    The five signals computed inline are: non-empty browser_family,
    modern_browser_score >= 50, has_accept_language > 0, has_cookie > 0,
    and sec_fetch_absence_rate < 0.5.
    """
    frame = pd.DataFrame({
        'browser_family': ['Chromium', '', 'Firefox', 'Safari'],
        'modern_browser_score': [100, 0, 80, 50],
        'has_accept_language': [1, 0, 1, 1],
        'has_cookie': [1, 0, 1, 0],
        'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6],
    })

    # Each entry is a boolean Series; one point is awarded per satisfied signal.
    signals = [
        frame['browser_family'] != '',
        frame['modern_browser_score'] >= 50,
        frame['has_accept_language'] > 0,
        frame['has_cookie'] > 0,
        frame['sec_fetch_absence_rate'] < 0.5,
    ]
    score = sum(flag.astype(int) for flag in signals)

    assert score.min() >= 0 and score.max() <= 5
    assert score.iloc[0] == 5, "Chromium with all signals should score 5"
    assert score.iloc[1] == 0, "Empty browser with no signals should score 0"
    assert score.iloc[2] == 5, "Firefox with all signals should score 5"
    assert score.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
def test_legitimate_browser_classification_threshold():
    """Check the three conditions required for a LEGITIMATE_BROWSER label.

    A session is relabelled only when it has a non-empty browser_family,
    a consistency score at or above the threshold, and a NORMAL or LOW
    threat level; otherwise its threat level is kept unchanged.
    """
    BROWSER_LEGIT_MIN_CONSISTENCY = 4
    eligible_levels = ('NORMAL', 'LOW')

    def classify(family, consistency, level):
        # Mirrors the gate under test: all three conditions must hold.
        if family == '':
            return level
        if consistency < BROWSER_LEGIT_MIN_CONSISTENCY:
            return level
        if level not in eligible_levels:
            return level
        return 'LEGITIMATE_BROWSER'

    # (browser_family, consistency score, threat_level, expected label)
    cases = [
        ('Chromium', 5, 'NORMAL', 'LEGITIMATE_BROWSER'),
        ('Chromium', 5, 'MEDIUM', 'MEDIUM'),            # threat too high
        ('Firefox', 4, 'LOW', 'LEGITIMATE_BROWSER'),
        ('Firefox', 3, 'NORMAL', 'NORMAL'),             # consistency too low
        ('', 5, 'NORMAL', 'NORMAL'),                    # no browser (edge case)
        ('Chromium', 5, 'ANUBIS_DENY', 'ANUBIS_DENY'),  # Anubis DENY is kept
    ]

    observed = [classify(family, consistency, level)
                for family, consistency, level, _ in cases]
    expected = [label for *_, label in cases]
    assert observed == expected
def test_legitimate_browser_excluded_from_anomalies():
    """Check that LEGITIMATE_BROWSER and ANUBIS_DENY rows never count as anomalies.

    A row is an anomaly only when its raw score is strictly below the
    threshold and its threat level is neither of the two excluded labels.
    """
    levels = pd.Series(
        ['CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY']
    )
    scores = pd.Series([-0.5, -0.2, -0.08, 0.1, 0.05, -0.15])
    cutoff = -0.05

    # ANUBIS_DENY scores below the cutoff here, so it is rejected by the
    # label exclusion alone, not by its score.
    excluded = levels.isin(['LEGITIMATE_BROWSER', 'ANUBIS_DENY'])
    flagged = ~excluded & (scores < cutoff)

    assert flagged.tolist() == [True, True, True, False, False, False]
def test_browser_spoofing_detection():
    """Check that a known browser JA4 with low consistency is never LEGITIMATE_BROWSER.

    The spoofed session is evaluated under NORMAL and LOW threat levels as
    well as MEDIUM. With MEDIUM alone the threat-level condition would
    reject the session regardless of its consistency score, so the
    consistency threshold would not actually be exercised. A positive
    control confirms that the same session passes once it is consistent.
    """
    BROWSER_LEGIT_MIN_CONSISTENCY = 4

    def is_legit(family, consistency, level):
        # Same three-part gate as the classification threshold test.
        return (
            family != '' and
            consistency >= BROWSER_LEGIT_MIN_CONSISTENCY and
            level in ('NORMAL', 'LOW')
        )

    # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language,
    # high sec_fetch_absence.
    spoofed_bf = 'Chromium'
    spoofed_bcs = 1  # only is_known_browser=1, all other signals fail

    for spoofed_tl in ('NORMAL', 'LOW', 'MEDIUM'):
        assert not is_legit(spoofed_bf, spoofed_bcs, spoofed_tl), \
            "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER"

    # Positive control: same family and an eligible threat level, with the
    # consistency score at the threshold. Shows the score is what decides.
    assert is_legit(spoofed_bf, BROWSER_LEGIT_MIN_CONSISTENCY, 'NORMAL'), \
        "Consistent known browser with NORMAL threat should be LEGITIMATE_BROWSER"
def test_xgb_labels_treat_legitimate_browser_as_normal():
    """Check that LEGITIMATE_BROWSER maps to the negative (non-threat) label.

    NORMAL and LEGITIMATE_BROWSER become 0; every other threat level in
    the sample becomes 1.
    """
    # NOTE(review): per the original author's comment, the XGBoost training
    # query selects only these threat levels (no LOW/MEDIUM) — not verifiable
    # from this file; confirm against the query.
    levels = pd.Series([
        'NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'KNOWN_BOT', 'ANUBIS_DENY',
    ])
    negative_levels = {'NORMAL', 'LEGITIMATE_BROWSER'}

    labels = levels.map(lambda level: 0 if level in negative_levels else 1)

    assert labels.tolist() == [0, 0, 1, 1, 1, 1]