feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency

Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5], combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language, cookies, Sec-Fetch-* presence
- Post-scoring: sessions with a known browser JA4 + consistency >= 4/5 + NORMAL/LOW threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: inconsistent behavior (known JA4 but low consistency) stays in normal anomaly scoring — prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints, legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, and exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 15:46:22 +02:00
parent 7d09c614c3
commit 9a48fb9d29
4 changed files with 215 additions and 7 deletions
--- a/services/bot-detector/bot_detector/tests/test_detector.py
+++ b/services/bot-detector/bot_detector/tests/test_detector.py
@ -587,3 +587,113 @@ def test_xgb_save_load_model():
        np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
    finally:
        os.remove(path)
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Tests for legitimate browser classification (A9)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def test_is_known_browser_feature():
+    """A non-empty browser_family yields is_known_browser=1; an empty one yields 0."""
+    families = ['Chromium', 'Firefox', '', '', 'Safari']
+    frame = pd.DataFrame({'browser_family': families})
+
+    # The flag is the integer form of "browser_family is not the empty string".
+    known_flag = frame['browser_family'].ne('').astype(int)
+    frame['is_known_browser'] = known_flag
+
+    expected_flags = [1, 1, 0, 0, 1]
+    assert frame['is_known_browser'].tolist() == expected_flags
+
+
+def test_browser_consistency_score_range():
+    """The consistency score is the sum of five binary signals, so it lies in [0, 5]."""
+    sessions = pd.DataFrame(
+        [
+            # family,   modern score, accept-language, cookie, sec-fetch absence
+            ('Chromium', 100,         1,               1,      0.0),
+            ('',         0,           0,               0,      1.0),
+            ('Firefox',  80,          1,               1,      0.1),
+            ('Safari',   50,          1,               0,      0.6),
+        ],
+        columns=[
+            'browser_family',
+            'modern_browser_score',
+            'has_accept_language',
+            'has_cookie',
+            'sec_fetch_absence_rate',
+        ],
+    )
+
+    # One boolean column per signal; each contributes 0 or 1 to the score.
+    signals = pd.DataFrame({
+        'known_browser': sessions['browser_family'] != '',
+        'modern_enough': sessions['modern_browser_score'] >= 50,
+        'accept_language': sessions['has_accept_language'] > 0,
+        'cookie': sessions['has_cookie'] > 0,
+        'sec_fetch_present': sessions['sec_fetch_absence_rate'] < 0.5,
+    })
+    score = signals.astype(int).sum(axis=1)
+
+    assert score.between(0, 5).all()
+    assert score.iloc[0] == 5, "Chromium with all signals should score 5"
+    assert score.iloc[1] == 0, "Empty browser with no signals should score 0"
+    assert score.iloc[2] == 5, "Firefox with all signals should score 5"
+    assert score.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
+
+
+def test_legitimate_browser_classification_threshold():
+    """Reclassification needs a browser family, enough consistency, and a NORMAL/LOW threat."""
+    BROWSER_LEGIT_MIN_CONSISTENCY = 4
+    reclassifiable_levels = ('NORMAL', 'LOW')
+
+    def classify(family, consistency, threat_level):
+        """Return 'LEGITIMATE_BROWSER' when all three gates pass, else the input threat level."""
+        if family == '':
+            return threat_level
+        if consistency < BROWSER_LEGIT_MIN_CONSISTENCY:
+            return threat_level
+        if threat_level not in reclassifiable_levels:
+            return threat_level
+        return 'LEGITIMATE_BROWSER'
+
+    # Each case: ((browser_family, consistency, threat_level), expected classification)
+    cases = [
+        (('Chromium', 5, 'NORMAL'), 'LEGITIMATE_BROWSER'),
+        (('Chromium', 5, 'MEDIUM'), 'MEDIUM'),            # threat too high
+        (('Firefox', 4, 'LOW'), 'LEGITIMATE_BROWSER'),
+        (('Firefox', 3, 'NORMAL'), 'NORMAL'),             # consistency too low
+        (('', 5, 'NORMAL'), 'NORMAL'),                    # no browser family (edge case)
+        (('Chromium', 5, 'ANUBIS_DENY'), 'ANUBIS_DENY'),  # Anubis DENY is kept
+    ]
+
+    outcomes = [classify(*session) for session, _ in cases]
+    expected = [wanted for _, wanted in cases]
+    assert outcomes == expected
+
+
+def test_legitimate_browser_excluded_from_anomalies():
+    """Sessions classified as LEGITIMATE_BROWSER should not appear in anomaly list.
+
+    The fixture contains two LEGITIMATE_BROWSER rows: one scoring above the
+    threshold and one scoring below it. Only the second row exercises the
+    label-based exclusion; without it, dropping the LEGITIMATE_BROWSER filter
+    from the mask would leave the expected result unchanged.
+    """
+    threat_levels = pd.Series([
+        'CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL',
+        'LEGITIMATE_BROWSER', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY',
+    ])
+    raw_scores = pd.Series([
+        -0.5, -0.2, -0.08, 0.1,
+        0.05, -0.3, -0.15,
+    ])
+    threshold = -0.05
+
+    # Anomalies = below threshold AND not LEGITIMATE_BROWSER AND not ANUBIS_DENY
+    non_browser = threat_levels != 'LEGITIMATE_BROWSER'
+    non_deny = threat_levels != 'ANUBIS_DENY'
+    below_threshold = raw_scores < threshold
+    anomaly_mask = non_browser & non_deny & below_threshold
+
+    assert anomaly_mask.tolist() == [True, True, True, False, False, False, False]
+
+    # Row 5 is below the threshold, so only its label keeps it out of the list.
+    assert below_threshold.iloc[5] and not anomaly_mask.iloc[5]
+
+
+def test_browser_spoofing_detection():
+    """Inconsistent browser behavior (known JA4 but low consistency) stays in normal scoring.
+
+    The check runs for the threat levels that would otherwise allow
+    reclassification ('NORMAL', 'LOW') as well as for 'MEDIUM'. With 'MEDIUM'
+    alone the threat-level gate already rejects the session, so the low
+    consistency score would never be the deciding factor.
+    """
+    BROWSER_LEGIT_MIN_CONSISTENCY = 4
+
+    # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language, high sec_fetch_absence
+    spoofed_bf = 'Chromium'
+    spoofed_bcs = 1  # only is_known_browser=1, all others fail
+
+    for spoofed_tl in ('NORMAL', 'LOW', 'MEDIUM'):
+        is_legit = (
+            spoofed_bf != '' and
+            spoofed_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
+            spoofed_tl in ('NORMAL', 'LOW')
+        )
+        assert not is_legit, "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER"
+
+
+def test_xgb_labels_treat_legitimate_browser_as_normal():
+    """LEGITIMATE_BROWSER must map to the negative (non-threat) label, like NORMAL."""
+    negative_levels = ('NORMAL', 'LEGITIMATE_BROWSER')
+
+    # Per the original author's note, the XGBoost query only selects these
+    # threat levels (LOW/MEDIUM are not part of it).
+    observed_levels = pd.Series(
+        ['NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'KNOWN_BOT', 'ANUBIS_DENY']
+    )
+
+    # Label 1 = threat, 0 = non-threat.
+    labels = observed_levels.map(lambda level: int(level not in negative_levels))
+
+    expected_labels = [0, 0, 1, 1, 1, 1]
+    assert labels.tolist() == expected_labels