feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency

Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5]
  combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language,
  cookies, Sec-Fetch-* presence
- Post-scoring: sessions with known browser JA4 + consistency >= 4/5 + NORMAL/LOW
  threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: inconsistent behavior (known JA4 but low consistency) stays
  in normal anomaly scoring — prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints,
  legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 15:46:22 +02:00
parent 7d09c614c3
commit 9a48fb9d29
4 changed files with 215 additions and 7 deletions

View File

@ -151,6 +151,11 @@ XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1) # pondération dans le sc
XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100')) # nb minimum de labels historiques pour entraîner
XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168')) # retrain hebdomadaire
# A9 — Classification navigateurs légitimes (JA4 + cohérence comportementale)
# Seuil minimal de browser_consistency_score [0..5] pour classer LEGITIMATE_BROWSER.
# 4/5 = navigateur reconnu + ≥3 signaux comportementaux (sec-ch-ua, cookies, Accept-Language, Sec-Fetch).
BROWSER_LEGIT_MIN_CONSISTENCY = int(os.getenv('BROWSER_LEGIT_MIN_CONSISTENCY', '4'))
# Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
# Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -441,7 +446,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
result = client.query(
f"SELECT {feature_cols}, threat_level "
f"FROM {DB}.ml_all_scores "
f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
f"WHERE threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
f"AND window_start >= now() - INTERVAL 7 DAY "
f"ORDER BY rand() LIMIT 50000"
)
@ -451,7 +456,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
df = pd.DataFrame(result.result_rows, columns=cols)
df[features] = df[features].apply(pd.to_numeric, errors='coerce')
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)
y = (df['threat_level'] != 'NORMAL').astype(int)
y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
if y.sum() < 10 or len(y) < min_labels:
return None, None
X = df[features].values
@ -1055,6 +1060,36 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'
# ── Classification des navigateurs légitimes ─────────────────────────────
# Principe : un cluster/session est classé LEGITIMATE_BROWSER si et seulement si :
# 1. Le JA4 correspond à un navigateur connu (browser_family != '')
# 2. Le comportement est cohérent avec un vrai navigateur (browser_consistency_score >= 4)
# 3. Le score IF ne signale pas de menace (threat_level in NORMAL/LOW)
# 4. Ce n'est pas un DENY Anubis
# Cela réduit les faux positifs sur les vrais utilisateurs tout en détectant
# le spoofing de JA4 (navigateur déclaré mais comportement incohérent).
_bf = unknown_traffic.get('browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str)
_bcs = unknown_traffic.get('browser_consistency_score', pd.Series(0, index=unknown_traffic.index)).fillna(0)
browser_legit_mask = (
(_bf != '') & # JA4 navigateur reconnu
(_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY) & # comportement cohérent
(unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & # pas de menace IF
(~deny_mask) # pas un DENY Anubis
)
if browser_legit_mask.any():
unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER'
unknown_traffic.loc[browser_legit_mask, 'reason'] = (
'[Navigateur légitime] ' + _bf[browser_legit_mask] +
' (cohérence=' + _bcs[browser_legit_mask].astype(int).astype(str) + '/5)'
)
n_legit = browser_legit_mask.sum()
families = _bf[browser_legit_mask].value_counts().to_dict()
log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}")
log_decision('LEGITIMATE_BROWSER', cycle_id, name, {
'count': int(n_legit), 'families': families,
'mean_consistency': round(float(_bcs[browser_legit_mask].mean()), 2),
})
# Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores
all_scored = unknown_traffic.copy()
@ -1120,9 +1155,10 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
'recurrence': int(row.get('recurrence', 1)),
})
# Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY
# Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY et non-navigateur légitime
# Les DENY sont toujours des threats, indépendamment du seuil IF
non_deny_traffic = unknown_traffic[~deny_mask]
# Les LEGITIMATE_BROWSER sont exclus des anomalies (navigateurs confirmés)
non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')]
anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
if not anomalies.empty:
log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
@ -1261,6 +1297,36 @@ def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
if col in df.columns:
df[col] = df[col].fillna('').astype(str)
# ── Feature dérivée : navigateur légitime connu (dict_browser_ja4) ──
# is_known_browser : 1 si le JA4 correspond à un navigateur dans dict_browser_ja4.
# Signal fort de légitimité passé à l'IF pour réduire les faux positifs sur les
# vrais navigateurs. Le modèle apprend que browser_family ≠ '' corrèle avec le
# comportement humain normal (navigation, assets, referer, cookies).
df['is_known_browser'] = (
df.get('browser_family', pd.Series('', index=df.index)).fillna('').astype(str) != ''
).astype(int)
# ── browser_consistency_score : [0..5] — cohérence comportementale navigateur ──
# Combine 5 signaux binaires attendus d'un vrai navigateur :
# 1. browser_family reconnu (JA4 dans dict_browser_ja4)
# 2. modern_browser_score ≥ 50 (sec-ch-ua / UA modernes)
# 3. Accept-Language présent
# 4. Cookies présents (session active)
# 5. Sec-Fetch-* présent (absence faible)
# Un score ≥ 4 indique un navigateur cohérent. Un score bas avec
# is_known_browser=1 signale un possible spoofing de JA4.
_mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0)
_hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0)
_hck = df.get('has_cookie', pd.Series(0, index=df.index)).fillna(0)
_sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1)
df['browser_consistency_score'] = (
df['is_known_browser']
+ (_mbs >= 50).astype(int)
+ (_hal > 0).astype(int)
+ (_hck > 0).astype(int)
+ (_sfa < 0.5).astype(int)
)
# ── Features numériques dérivées des labels Anubis (pour IsolationForest) ──
# anubis_is_flagged : 1 si le trafic est marqué WEIGH/CHALLENGE par Anubis
# → signal de suspicion modéré passé à l'IF (ALLOW/DENY sont exclus du pipeline)
@ -1399,6 +1465,8 @@ def fetch_and_analyze():
'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
# Anubis : signal de suspicion modéré (WEIGH/CHALLENGE) — bypass pour ALLOW/DENY
'anubis_is_flagged',
# Browser : signaux de navigateur légitime (dict_browser_ja4 + cohérence comportementale)
'is_known_browser', 'browser_consistency_score',
# HTTP : header incomplet et usage HTTP plain (disponibles pour les deux modèles)
'missing_accept_enc_ratio', 'http_scheme_ratio',
# ── Thèse §5 : features avancées (optionnelles — ignorées si indisponibles) ──
@ -1456,6 +1524,7 @@ def fetch_and_analyze():
all_scored['ja4'] = all_scored['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
all_scores_cols = [
'detected_at', 'window_start', 'src_ip', 'ja4', 'host', 'bot_name',
'browser_family',
'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
@ -1485,7 +1554,7 @@ def fetch_and_analyze():
all_anom['is_headless'] = all_anom[fake_nav_col].astype(int) if fake_nav_col in all_anom.columns else 0
cols = [
'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'anomaly_score',
'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score',
'threat_level', 'model_name', 'recurrence',
'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
@ -1534,6 +1603,7 @@ if __name__ == '__main__':
log_info(f' Clustering : {"activé" if ENABLE_CLUSTERING else "désactivé"} | Dedup TTL : {DEDUP_TTL_MIN}min')
log_info(f' Récurrence weight : {RECURRENCE_WEIGHT} | Min features ratio : {MIN_VALID_FEATURE_RATIO:.0%}')
log_info(f' Anubis : ALLOW→KNOWN_BOT (score=0), DENY→ANUBIS_DENY (score IF réel)')
log_info(f' Browser : LEGITIMATE_BROWSER si consistency>={BROWSER_LEGIT_MIN_CONSISTENCY}/5 + JA4 reconnu')
log_info('*' * 65)
log_decision('SERVICE_START', 'boot', '', {
'db': DB, 'contamination': CONTAMINATION, 'anomaly_threshold': ANOMALY_THRESHOLD,

View File

@ -587,3 +587,113 @@ def test_xgb_save_load_model():
np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
finally:
os.remove(path)
# ═══════════════════════════════════════════════════════════════════════════════
# Tests pour la classification des navigateurs légitimes (A9)
# ═══════════════════════════════════════════════════════════════════════════════
def test_is_known_browser_feature():
    """is_known_browser is 1 when browser_family is a non-empty string, 0 otherwise.

    The flag is derived with the same normalization the preprocessing step
    applies (missing values become '' before the comparison), so a null
    browser_family must count as "unknown browser" rather than as a match.
    """
    df = pd.DataFrame({
        'browser_family': ['Chromium', 'Firefox', '', None, float('nan'), 'Safari'],
    })
    # A bare `!= ''` would evaluate to True for None/NaN and wrongly flag
    # those rows as known browsers; normalize missing values first.
    normalized = df['browser_family'].fillna('').astype(str)
    df['is_known_browser'] = (normalized != '').astype(int)
    assert df['is_known_browser'].tolist() == [1, 1, 0, 0, 0, 1]
def test_browser_consistency_score_range():
    """The consistency score is the sum of five 0/1 signals, hence bounded by [0, 5]."""
    sessions = pd.DataFrame({
        'browser_family': ['Chromium', '', 'Firefox', 'Safari'],
        'modern_browser_score': [100, 0, 80, 50],
        'has_accept_language': [1, 0, 1, 1],
        'has_cookie': [1, 0, 1, 0],
        'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6],
    })

    # One 0/1 column per signal; the score is the row-wise total.
    signals = pd.concat(
        [
            sessions['browser_family'].ne('').astype(int),
            sessions['modern_browser_score'].ge(50).astype(int),
            sessions['has_accept_language'].gt(0).astype(int),
            sessions['has_cookie'].gt(0).astype(int),
            sessions['sec_fetch_absence_rate'].lt(0.5).astype(int),
        ],
        axis=1,
    )
    score = signals.sum(axis=1)

    assert 0 <= score.min() and score.max() <= 5
    assert score.iloc[0] == 5, "Chromium with all signals should score 5"
    assert score.iloc[1] == 0, "Empty browser with no signals should score 0"
    assert score.iloc[2] == 5, "Firefox with all signals should score 5"
    assert score.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
def test_legitimate_browser_classification_threshold():
    """A session becomes LEGITIMATE_BROWSER only with a browser family, a
    consistency score at or above the threshold, and a NORMAL/LOW threat level."""
    min_consistency = 4
    benign_levels = ('NORMAL', 'LOW')

    # Each case: (browser_family, consistency score, threat_level, expected result)
    cases = [
        ('Chromium', 5, 'NORMAL', 'LEGITIMATE_BROWSER'),
        ('Chromium', 5, 'MEDIUM', 'MEDIUM'),            # threat level too high
        ('Firefox', 4, 'LOW', 'LEGITIMATE_BROWSER'),
        ('Firefox', 3, 'NORMAL', 'NORMAL'),             # consistency below threshold
        ('', 5, 'NORMAL', 'NORMAL'),                    # no browser family (edge case)
        ('Chromium', 5, 'ANUBIS_DENY', 'ANUBIS_DENY'),  # Anubis DENY is kept as-is
    ]

    def classify(family, score, level):
        # Any failing condition leaves the original threat level untouched.
        if family == '' or score < min_consistency or level not in benign_levels:
            return level
        return 'LEGITIMATE_BROWSER'

    outcomes = [classify(family, score, level) for family, score, level, _ in cases]
    assert outcomes == [expected for *_, expected in cases]
def test_legitimate_browser_excluded_from_anomalies():
    """Sessions classified as LEGITIMATE_BROWSER must not appear in the anomaly list.

    The LEGITIMATE_BROWSER and ANUBIS_DENY rows are both given raw scores
    below the threshold, so only the explicit threat-level exclusions can keep
    them out. With a score above the threshold the test would pass even if
    the exclusion were removed.
    """
    threat_levels = pd.Series(['CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY'])
    raw_scores = pd.Series([-0.5, -0.2, -0.08, 0.1, -0.3, -0.15])
    threshold = -0.05

    below_threshold = raw_scores < threshold
    # Precondition: on score alone, the browser and DENY rows would be flagged.
    assert below_threshold.tolist() == [True, True, True, False, True, True]

    # Anomalies = below threshold AND not LEGITIMATE_BROWSER AND not ANUBIS_DENY
    non_browser = threat_levels != 'LEGITIMATE_BROWSER'
    non_deny = threat_levels != 'ANUBIS_DENY'
    anomaly_mask = non_browser & non_deny & below_threshold
    assert anomaly_mask.tolist() == [True, True, True, False, False, False]
def test_browser_spoofing_detection():
    """A known JA4 with inconsistent behavior is never promoted to LEGITIMATE_BROWSER.

    The spoofed session is checked at NORMAL and LOW threat levels as well as
    MEDIUM. At NORMAL/LOW the low consistency score is the only failing
    condition, so the test fails if the consistency check is dropped.
    """
    BROWSER_LEGIT_MIN_CONSISTENCY = 4
    # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language, high sec_fetch_absence
    spoofed_bf = 'Chromium'
    spoofed_bcs = 1  # only is_known_browser=1, all other signals fail
    for spoofed_tl in ('NORMAL', 'LOW', 'MEDIUM'):
        is_legit = (
            spoofed_bf != '' and
            spoofed_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
            spoofed_tl in ('NORMAL', 'LOW')
        )
        assert not is_legit, "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER"
def test_xgb_labels_treat_legitimate_browser_as_normal():
    """LEGITIMATE_BROWSER is a negative (non-threat) label for XGBoost training, like NORMAL."""
    # Threat levels mapped to the binary label expected for each
    # (LOW/MEDIUM are intentionally not part of this set).
    expected_label = {
        'NORMAL': 0,
        'LEGITIMATE_BROWSER': 0,
        'HIGH': 1,
        'CRITICAL': 1,
        'KNOWN_BOT': 1,
        'ANUBIS_DENY': 1,
    }
    levels = pd.Series(list(expected_label))
    is_benign = levels.isin(['NORMAL', 'LEGITIMATE_BROWSER'])
    labels = (~is_benign).astype(int)
    assert labels.tolist() == list(expected_label.values())

View File

@ -26,13 +26,13 @@ _DETECTION_SORT_COLS = {
"detected_at", "src_ip", "ja4", "host", "anomaly_score",
"threat_level", "recurrence", "hits", "hit_velocity",
"fuzzing_index", "post_ratio", "campaign_id",
"asn_org", "country_code", "bot_name",
"asn_org", "country_code", "bot_name", "browser_family",
}
_SCORE_SORT_COLS = {
"detected_at", "window_start", "src_ip", "ja4", "host",
"anomaly_score", "raw_anomaly_score", "threat_level",
"hits", "hit_velocity", "xgb_prob", "ae_recon_error",
"asn_org", "country_code",
"asn_org", "country_code", "browser_family",
}
_TRAFFIC_SORT_COLS = {
"time", "src_ip", "method", "host", "path", "http_version",
@ -111,6 +111,20 @@ async def overview() -> dict[str, Any]:
"GROUP BY model_name"
)
browser_stats = query(
f"SELECT browser_family, count() AS cnt "
f"FROM {_DB}.ml_all_scores "
"WHERE detected_at >= now() - INTERVAL 1 DAY "
"AND browser_family != '' "
"GROUP BY browser_family ORDER BY cnt DESC"
)
legitimate_browsers = query_scalar(
f"SELECT count() FROM {_DB}.ml_all_scores "
"WHERE detected_at >= now() - INTERVAL 1 DAY "
"AND threat_level = 'LEGITIMATE_BROWSER'"
) or 0
return {
"detections_24h": detections_24h,
"scored_24h": scored_24h,
@ -118,6 +132,8 @@ async def overview() -> dict[str, Any]:
"unique_ips": unique_ips,
"critical_count": critical_count,
"high_count": high_count,
"legitimate_browsers": legitimate_browsers,
"browser_stats": browser_stats,
"threat_distribution": threat_distribution,
"top_ips": top_ips,
"timeline": [{"hour": str(r["hour"]), "cnt": r["cnt"]} for r in timeline],
@ -143,6 +159,7 @@ async def detections(
country_code: str | None = Query(None),
ja4: str | None = Query(None),
bot_name: str | None = Query(None),
browser_family: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -177,6 +194,10 @@ async def detections(
where_clauses.append("bot_name = {bn:String}")
params["bn"] = bot_name
if browser_family:
where_clauses.append("browser_family = {bf:String}")
params["bf"] = browser_family
where = " AND ".join(where_clauses)
try:
@ -219,6 +240,7 @@ async def scores(
asn_org: str | None = Query(None),
country_code: str | None = Query(None),
ja4: str | None = Query(None),
browser_family: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -249,6 +271,10 @@ async def scores(
where_clauses.append("ja4 = {ja4:String}")
params["ja4"] = ja4
if browser_family:
where_clauses.append("browser_family = {bf:String}")
params["bf"] = browser_family
where = " AND ".join(where_clauses)
try:

View File

@ -24,6 +24,7 @@
CREATE TABLE IF NOT EXISTS ja4_processing.ml_detected_anomalies
(
detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String,
browser_family LowCardinality(String) DEFAULT '',
anomaly_score Float32, threat_level String, model_name String, recurrence UInt32,
asn_number String, asn_org String, asn_detail String, asn_domain String,
country_code String, asn_label String,
@ -80,6 +81,7 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
ja4 String,
host String,
bot_name String,
browser_family LowCardinality(String) DEFAULT '',
anomaly_score Float32,
raw_anomaly_score Float32,
threat_level String,