feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency
Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5] combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language, cookies, Sec-Fetch-* presence
- Post-scoring: sessions with known browser JA4 + consistency >= 4/5 + NORMAL/LOW threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: inconsistent behavior (known JA4 but low consistency) stays in normal anomaly scoring — prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints, legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -587,3 +587,113 @@ def test_xgb_save_load_model():
|
||||
np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
|
||||
finally:
|
||||
os.remove(path)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Tests pour la classification des navigateurs légitimes (A9)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def test_is_known_browser_feature():
    """is_known_browser is 1 when browser_family is non-empty, 0 otherwise."""
    df = pd.DataFrame({
        'browser_family': ['Chromium', 'Firefox', '', '', 'Safari'],
    })

    # An empty family string is treated as "no known browser".
    df['is_known_browser'] = (df['browser_family'] != '').astype(int)

    assert df['is_known_browser'].tolist() == [1, 1, 0, 0, 1]
|
||||
|
||||
|
||||
def test_browser_consistency_score_range():
    """browser_consistency_score is in [0, 5] and sums 5 binary signals.

    The five signals recomputed here are: known browser family, modern
    browser score >= 50, Accept-Language present, cookie present, and a
    Sec-Fetch absence rate below 0.5.
    """
    df = pd.DataFrame({
        'browser_family': ['Chromium', '', 'Firefox', 'Safari'],
        'modern_browser_score': [100, 0, 80, 50],
        'has_accept_language': [1, 0, 1, 1],
        'has_cookie': [1, 0, 1, 0],
        'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6],
    })

    # Each signal is reduced to 0/1 so the sum is bounded by the signal count.
    is_known = (df['browser_family'] != '').astype(int)
    mbs_ok = (df['modern_browser_score'] >= 50).astype(int)
    hal_ok = (df['has_accept_language'] > 0).astype(int)
    hck_ok = (df['has_cookie'] > 0).astype(int)
    sfa_ok = (df['sec_fetch_absence_rate'] < 0.5).astype(int)
    bcs = is_known + mbs_ok + hal_ok + hck_ok + sfa_ok

    # Check every row, not just the extremes, so a failure names the range.
    assert bcs.between(0, 5).all(), "consistency score must stay within [0, 5]"
    assert bcs.iloc[0] == 5, "Chromium with all signals should score 5"
    assert bcs.iloc[1] == 0, "Empty browser with no signals should score 0"
    assert bcs.iloc[2] == 5, "Firefox with all signals should score 5"
    assert bcs.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
|
||||
|
||||
|
||||
def test_legitimate_browser_classification_threshold():
    """LEGITIMATE_BROWSER requires browser_family + consistency >= threshold + NORMAL/LOW threat.

    Any session failing one of the three conditions keeps its original
    threat level.
    """
    BROWSER_LEGIT_MIN_CONSISTENCY = 4

    # Each case carries its own expected outcome so inputs and expectations
    # cannot drift apart:
    # (browser_family, consistency score, threat_level, expected classification)
    cases = [
        ('Chromium', 5, 'NORMAL', 'LEGITIMATE_BROWSER'),
        ('Chromium', 5, 'MEDIUM', 'MEDIUM'),            # threat too high
        ('Firefox', 4, 'LOW', 'LEGITIMATE_BROWSER'),
        ('Firefox', 3, 'NORMAL', 'NORMAL'),             # consistency too low
        # No browser family: a score of 5 is not reachable without one,
        # but the edge case must still keep its threat level.
        ('', 5, 'NORMAL', 'NORMAL'),
        ('Chromium', 5, 'ANUBIS_DENY', 'ANUBIS_DENY'),  # Anubis DENY is kept
    ]

    for bf, bcs, tl, expected in cases:
        is_legit = (
            bf != '' and
            bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
            tl in ('NORMAL', 'LOW')
        )
        result = 'LEGITIMATE_BROWSER' if is_legit else tl
        assert result == expected, (
            f"browser_family={bf!r}, consistency={bcs}, threat_level={tl!r}: "
            f"expected {expected!r}, got {result!r}"
        )
|
||||
|
||||
|
||||
def test_legitimate_browser_excluded_from_anomalies():
    """Sessions classified as LEGITIMATE_BROWSER should not appear in anomaly list.

    The fixture includes a LEGITIMATE_BROWSER session whose raw score is
    below the threshold, so the exclusion mask (and not merely the score)
    is what keeps it out of the anomaly list.
    """
    threat_levels = pd.Series([
        'CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL',
        'LEGITIMATE_BROWSER', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY',
    ])
    raw_scores = pd.Series([-0.5, -0.2, -0.08, 0.1, 0.05, -0.10, -0.15])
    threshold = -0.05

    # Anomalies = below threshold AND not LEGITIMATE_BROWSER AND not ANUBIS_DENY
    non_browser = threat_levels != 'LEGITIMATE_BROWSER'
    non_deny = threat_levels != 'ANUBIS_DENY'
    below_threshold = raw_scores < threshold
    anomaly_mask = non_browser & non_deny & below_threshold

    # Guard the fixture itself: rows 5 and 6 would be anomalies by score alone.
    assert below_threshold.iloc[5] and below_threshold.iloc[6]

    assert anomaly_mask.tolist() == [True, True, True, False, False, False, False]
|
||||
|
||||
|
||||
def test_browser_spoofing_detection():
    """Inconsistent browser behavior (known JA4 but low consistency) stays in normal scoring.

    The check runs for NORMAL and LOW threat levels as well as MEDIUM:
    with MEDIUM alone the threat-level gate already rejects the session,
    so the consistency gate would never be the deciding condition.
    """
    BROWSER_LEGIT_MIN_CONSISTENCY = 4

    # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language,
    # high sec_fetch_absence.
    spoofed_bf = 'Chromium'
    spoofed_bcs = 1  # only is_known_browser=1, all others fail

    for spoofed_tl in ('NORMAL', 'LOW', 'MEDIUM'):
        is_legit = (
            spoofed_bf != '' and
            spoofed_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
            spoofed_tl in ('NORMAL', 'LOW')
        )
        assert not is_legit, "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER"
|
||||
|
||||
|
||||
def test_xgb_labels_treat_legitimate_browser_as_normal():
    """XGBoost training should treat LEGITIMATE_BROWSER as negative (non-threat) label."""
    # The XGBoost query only selects these threat_levels (not LOW/MEDIUM)
    threat_levels = pd.Series([
        'NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'KNOWN_BOT', 'ANUBIS_DENY',
    ])

    # Label is 0 for the two non-threat levels and 1 for everything else.
    y = (~threat_levels.isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)

    assert y.tolist() == [0, 0, 1, 1, 1, 1]
|
||||
|
||||
Reference in New Issue
Block a user