feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency

Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5], combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language, cookies, Sec-Fetch-* presence
- Post-scoring: sessions with a known browser JA4 + consistency >= 4/5 (default; configurable via BROWSER_LEGIT_MIN_CONSISTENCY) + NORMAL/LOW threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: inconsistent behavior (known JA4 but low consistency) stays in normal anomaly scoring — prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as a non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints, legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 15:46:22 +02:00
parent 7d09c614c3
commit 9a48fb9d29
4 changed files with 215 additions and 7 deletions
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@ -151,6 +151,11 @@ XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1)  # pondération dans le sc
 XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100'))  # nb minimum de labels historiques pour entraîner
 XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168'))  # retrain hebdomadaire

+# A9 — Classification navigateurs légitimes (JA4 + cohérence comportementale)
+# Seuil minimal de browser_consistency_score [0..5] pour classer LEGITIMATE_BROWSER.
+# 4/5 = navigateur reconnu + ≥3 signaux comportementaux (sec-ch-ua, cookies, Accept-Language, Sec-Fetch).
+BROWSER_LEGIT_MIN_CONSISTENCY = int(os.getenv('BROWSER_LEGIT_MIN_CONSISTENCY', '4'))
+
 # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé)
 # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design.
 STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = {
@ -441,7 +446,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
        result = client.query(
            f"SELECT {feature_cols}, threat_level "
            f"FROM {DB}.ml_all_scores "
-            f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
+            f"WHERE threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') "
            f"AND window_start >= now() - INTERVAL 7 DAY "
            f"ORDER BY rand() LIMIT 50000"
        )
@ -451,7 +456,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) -
        df = pd.DataFrame(result.result_rows, columns=cols)
        df[features] = df[features].apply(pd.to_numeric, errors='coerce')
        df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features)
-        y = (df['threat_level'] != 'NORMAL').astype(int)
+        y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
        if y.sum() < 10 or len(y) < min_labels:
            return None, None
        X = df[features].values
@ -1055,6 +1060,36 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY'
    unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY'

+    # ── Classification des navigateurs légitimes ─────────────────────────────
+    # Principe : un cluster/session est classé LEGITIMATE_BROWSER si et seulement si :
+    #   1. Le JA4 correspond à un navigateur connu (browser_family != '')
+    #   2. Le comportement est cohérent avec un vrai navigateur (browser_consistency_score >= 4)
+    #   3. Le score IF ne signale pas de menace (threat_level in NORMAL/LOW)
+    #   4. Ce n'est pas un DENY Anubis
+    # Cela réduit les faux positifs sur les vrais utilisateurs tout en détectant
+    # le spoofing de JA4 (navigateur déclaré mais comportement incohérent).
+    _bf = unknown_traffic.get('browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str)
+    _bcs = unknown_traffic.get('browser_consistency_score', pd.Series(0, index=unknown_traffic.index)).fillna(0)
+    browser_legit_mask = (
+        (_bf != '') &                                                    # JA4 navigateur reconnu
+        (_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY) &                        # comportement cohérent
+        (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) &      # pas de menace IF
+        (~deny_mask)                                                     # pas un DENY Anubis
+    )
+    if browser_legit_mask.any():
+        unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER'
+        unknown_traffic.loc[browser_legit_mask, 'reason'] = (
+            '[Navigateur légitime] ' + _bf[browser_legit_mask] +
+            ' (cohérence=' + _bcs[browser_legit_mask].astype(int).astype(str) + '/5)'
+        )
+        n_legit = browser_legit_mask.sum()
+        families = _bf[browser_legit_mask].value_counts().to_dict()
+        log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}")
+        log_decision('LEGITIMATE_BROWSER', cycle_id, name, {
+            'count': int(n_legit), 'families': families,
+            'mean_consistency': round(float(_bcs[browser_legit_mask].mean()), 2),
+        })
+
    # Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores
    all_scored = unknown_traffic.copy()

@ -1120,9 +1155,10 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
                'recurrence': int(row.get('recurrence', 1)),
            })

-    # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY
+    # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY et non-navigateur légitime
    # Les DENY sont toujours des threats, indépendamment du seuil IF
-    non_deny_traffic = unknown_traffic[~deny_mask]
+    # Les LEGITIMATE_BROWSER sont exclus des anomalies (navigateurs confirmés)
+    non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')]
    anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy()
    if not anomalies.empty:
        log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
@ -1261,6 +1297,36 @@ def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

+    # ── Feature dérivée : navigateur légitime connu (dict_browser_ja4) ──
+    # is_known_browser : 1 si le JA4 correspond à un navigateur dans dict_browser_ja4.
+    # Signal fort de légitimité passé à l'IF pour réduire les faux positifs sur les
+    # vrais navigateurs. Le modèle apprend que browser_family ≠ '' corrèle avec le
+    # comportement humain normal (navigation, assets, referer, cookies).
+    df['is_known_browser'] = (
+        df.get('browser_family', pd.Series('', index=df.index)).fillna('').astype(str) != ''
+    ).astype(int)
+
+    # ── browser_consistency_score : [0..5] — cohérence comportementale navigateur ──
+    # Combine 5 signaux binaires attendus d'un vrai navigateur :
+    #   1. browser_family reconnu (JA4 dans dict_browser_ja4)
+    #   2. modern_browser_score ≥ 50 (sec-ch-ua / UA modernes)
+    #   3. Accept-Language présent
+    #   4. Cookies présents (session active)
+    #   5. Sec-Fetch-* présent (absence faible)
+    # Un score ≥ 4 indique un navigateur cohérent. Un score bas avec
+    # is_known_browser=1 signale un possible spoofing de JA4.
+    _mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0)
+    _hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0)
+    _hck = df.get('has_cookie', pd.Series(0, index=df.index)).fillna(0)
+    _sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1)
+    df['browser_consistency_score'] = (
+        df['is_known_browser']
+        + (_mbs >= 50).astype(int)
+        + (_hal > 0).astype(int)
+        + (_hck > 0).astype(int)
+        + (_sfa < 0.5).astype(int)
+    )
+
    # ── Features numériques dérivées des labels Anubis (pour IsolationForest) ──
    # anubis_is_flagged : 1 si le trafic est marqué WEIGH/CHALLENGE par Anubis
    #   → signal de suspicion modéré passé à l'IF (ALLOW/DENY sont exclus du pipeline)
@ -1399,6 +1465,8 @@ def fetch_and_analyze():
        'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
        # Anubis : signal de suspicion modéré (WEIGH/CHALLENGE) — bypass pour ALLOW/DENY
        'anubis_is_flagged',
+        # Browser : signaux de navigateur légitime (dict_browser_ja4 + cohérence comportementale)
+        'is_known_browser', 'browser_consistency_score',
        # HTTP : header incomplet et usage HTTP plain (disponibles pour les deux modèles)
        'missing_accept_enc_ratio', 'http_scheme_ratio',
        # ── Thèse §5 : features avancées (optionnelles — ignorées si indisponibles) ──
@ -1456,6 +1524,7 @@ def fetch_and_analyze():
            all_scored['ja4'] = all_scored['ja4'].replace({'': 'HTTP_CLEAR_TEXT'})
            all_scores_cols = [
                'detected_at', 'window_start', 'src_ip', 'ja4', 'host', 'bot_name',
+                'browser_family',
                'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category',
                'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name',
                'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label',
@ -1485,7 +1554,7 @@ def fetch_and_analyze():
        all_anom['is_headless'] = all_anom[fake_nav_col].astype(int) if fake_nav_col in all_anom.columns else 0

        cols = [
-            'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'anomaly_score',
+            'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score',
            'threat_level', 'model_name', 'recurrence',
            'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label',
            'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio',
@ -1534,6 +1603,7 @@ if __name__ == '__main__':
    log_info(f' Clustering : {"activé" if ENABLE_CLUSTERING else "désactivé"} | Dedup TTL : {DEDUP_TTL_MIN}min')
    log_info(f' Récurrence weight : {RECURRENCE_WEIGHT} | Min features ratio : {MIN_VALID_FEATURE_RATIO:.0%}')
    log_info(f' Anubis : ALLOW→KNOWN_BOT (score=0), DENY→ANUBIS_DENY (score IF réel)')
+    log_info(f' Browser : LEGITIMATE_BROWSER si consistency>={BROWSER_LEGIT_MIN_CONSISTENCY}/5 + JA4 reconnu')
    log_info('*' * 65)
    log_decision('SERVICE_START', 'boot', '', {
        'db': DB, 'contamination': CONTAMINATION, 'anomaly_threshold': ANOMALY_THRESHOLD,
--- a/services/bot-detector/bot_detector/tests/test_detector.py
+++ b/services/bot-detector/bot_detector/tests/test_detector.py
@ -587,3 +587,113 @@ def test_xgb_save_load_model():
        np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5)
    finally:
        os.remove(path)
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Tests pour la classification des navigateurs légitimes (A9)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def test_is_known_browser_feature():
+    """is_known_browser=1 when browser_family is set, 0 otherwise.
+
+    NOTE(review): the expression under test is written inline here rather
+    than obtained by calling the detector, so this pins the intended rule
+    only. The derivation in bot_detector.py additionally applies
+    ``fillna('').astype(str)`` before the comparison; that guard (missing
+    column / NaN values) is not exercised by this test.
+    """
+    df = pd.DataFrame({
+        'browser_family': ['Chromium', 'Firefox', '', '', 'Safari'],
+    })
+    # Non-empty family string -> 1, empty string -> 0.
+    df['is_known_browser'] = (df['browser_family'] != '').astype(int)
+    assert df['is_known_browser'].tolist() == [1, 1, 0, 0, 1]
+
+
+def test_browser_consistency_score_range():
+    """browser_consistency_score is in [0, 5] and sums 5 binary signals.
+
+    Each fixture column feeds one binary signal; the score is their sum.
+    The Safari row sits exactly on the ``modern_browser_score >= 50``
+    boundary (counts as 1) and above the ``sec_fetch_absence_rate < 0.5``
+    cut-off (counts as 0).
+
+    NOTE(review): the formula is re-implemented inline, so this test does
+    not call the detector's own feature derivation.
+    """
+    df = pd.DataFrame({
+        'browser_family':       ['Chromium', '',       'Firefox', 'Safari'],
+        'modern_browser_score': [100,        0,        80,        50],
+        'has_accept_language':  [1,          0,        1,         1],
+        'has_cookie':           [1,          0,        1,         0],
+        'sec_fetch_absence_rate': [0.0,      1.0,      0.1,       0.6],
+    })
+    # One 0/1 indicator per signal, in the same order as the score definition.
+    is_known = (df['browser_family'] != '').astype(int)
+    mbs_ok = (df['modern_browser_score'] >= 50).astype(int)
+    hal_ok = (df['has_accept_language'] > 0).astype(int)
+    hck_ok = (df['has_cookie'] > 0).astype(int)
+    sfa_ok = (df['sec_fetch_absence_rate'] < 0.5).astype(int)  # low absence rate = headers present
+    bcs = is_known + mbs_ok + hal_ok + hck_ok + sfa_ok
+
+    # Range check first, then the exact per-row totals.
+    assert bcs.min() >= 0 and bcs.max() <= 5
+    assert bcs.iloc[0] == 5, "Chromium with all signals should score 5"
+    assert bcs.iloc[1] == 0, "Empty browser with no signals should score 0"
+    assert bcs.iloc[2] == 5, "Firefox with all signals should score 5"
+    assert bcs.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3"
+
+
+def test_legitimate_browser_classification_threshold():
+    """LEGITIMATE_BROWSER requires browser_family + consistency >= threshold + NORMAL/LOW threat.
+
+    Each non-matching fixture row violates exactly one of the three
+    conditions, so every condition is checked in isolation.
+
+    NOTE(review): the threshold is redefined locally and the predicate is a
+    scalar re-statement of the rule; neither is imported from the detector.
+    """
+    # Local copy of the default threshold (4 out of 5 signals).
+    BROWSER_LEGIT_MIN_CONSISTENCY = 4
+
+    sessions = [
+        # (browser_family, bcs, threat_level) → expected classification
+        ('Chromium', 5, 'NORMAL'),      # → LEGITIMATE_BROWSER
+        ('Chromium', 5, 'MEDIUM'),      # threat too high → keep MEDIUM
+        ('Firefox',  4, 'LOW'),         # → LEGITIMATE_BROWSER
+        ('Firefox',  3, 'NORMAL'),      # consistency too low → keep NORMAL
+        ('',         5, 'NORMAL'),      # no browser → keep NORMAL (can't be 5 without browser, but edge case)
+        ('Chromium', 5, 'ANUBIS_DENY'), # Anubis DENY → keep ANUBIS_DENY
+    ]
+    results = []
+    for bf, bcs, tl in sessions:
+        # All three conditions must hold; otherwise the original level is kept.
+        is_legit = (
+            bf != '' and
+            bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
+            tl in ('NORMAL', 'LOW')
+        )
+        results.append('LEGITIMATE_BROWSER' if is_legit else tl)
+
+    # Expected outcomes, in the same order as `sessions`.
+    assert results == [
+        'LEGITIMATE_BROWSER',
+        'MEDIUM',
+        'LEGITIMATE_BROWSER',
+        'NORMAL',
+        'NORMAL',
+        'ANUBIS_DENY',
+    ]
+
+
+def test_legitimate_browser_excluded_from_anomalies():
+    """Sessions classified as LEGITIMATE_BROWSER should not appear in anomaly list.
+
+    The LEGITIMATE_BROWSER fixture row deliberately carries a raw score
+    *below* the threshold: only the explicit exclusion keeps it out of the
+    anomaly mask. With an above-threshold score the assertion would hold
+    even if the exclusion were removed, and the test would prove nothing.
+    The ANUBIS_DENY row is below the threshold for the same reason.
+    """
+    threat_levels = pd.Series(['CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY'])
+    raw_scores    = pd.Series([-0.5,      -0.2,   -0.08,    0.1,      -0.30,                 -0.15])
+    threshold = -0.05
+
+    # Anomalies = below threshold AND not LEGITIMATE_BROWSER AND not ANUBIS_DENY
+    non_browser = threat_levels != 'LEGITIMATE_BROWSER'
+    non_deny = threat_levels != 'ANUBIS_DENY'
+    below_threshold = raw_scores < threshold
+    anomaly_mask = non_browser & non_deny & below_threshold
+
+    # Guard the fixture itself: the excluded rows must be below the threshold,
+    # otherwise the exclusion terms are never the deciding factor.
+    assert below_threshold[~non_browser].all() and below_threshold[~non_deny].all()
+    assert anomaly_mask.tolist() == [True, True, True, False, False, False]
+
+
+def test_browser_spoofing_detection():
+    """Inconsistent browser behavior (known JA4 but low consistency) stays in normal scoring.
+
+    The fixture uses a NORMAL threat level on purpose, so the low
+    consistency score is the only condition that blocks the
+    LEGITIMATE_BROWSER reclassification. With a MEDIUM level the test
+    would still pass if the consistency check were removed.
+    """
+    BROWSER_LEGIT_MIN_CONSISTENCY = 4
+
+    # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language, high sec_fetch_absence
+    spoofed_bf = 'Chromium'
+    spoofed_bcs = 1  # only is_known_browser=1, all others fail
+    spoofed_tl = 'NORMAL'  # threat level alone would allow reclassification
+
+    is_legit = (
+        spoofed_bf != '' and
+        spoofed_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and
+        spoofed_tl in ('NORMAL', 'LOW')
+    )
+    assert not is_legit, "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER"
+
+
+def test_xgb_labels_treat_legitimate_browser_as_normal():
+    """XGBoost training should treat LEGITIMATE_BROWSER as negative (non-threat) label.
+
+    Mirrors the label expression used when loading XGBoost training labels:
+    NORMAL and LEGITIMATE_BROWSER map to 0, every other selected level to 1.
+
+    NOTE(review): the expression is duplicated here rather than exercised
+    through the label-loading function.
+    """
+    # The XGBoost query only selects these threat_levels (not LOW/MEDIUM)
+    threat_levels = pd.Series([
+        'NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'KNOWN_BOT', 'ANUBIS_DENY'
+    ])
+    # Negative class = membership in the two benign levels, inverted to get 1 for threats.
+    y = (~threat_levels.isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int)
+    assert y.tolist() == [0, 0, 1, 1, 1, 1]
--- a/services/dashboard/backend/routes/api.py
+++ b/services/dashboard/backend/routes/api.py
@ -26,13 +26,13 @@ _DETECTION_SORT_COLS = {
    "detected_at", "src_ip", "ja4", "host", "anomaly_score",
    "threat_level", "recurrence", "hits", "hit_velocity",
    "fuzzing_index", "post_ratio", "campaign_id",
-    "asn_org", "country_code", "bot_name",
+    "asn_org", "country_code", "bot_name", "browser_family",
 }
 _SCORE_SORT_COLS = {
    "detected_at", "window_start", "src_ip", "ja4", "host",
    "anomaly_score", "raw_anomaly_score", "threat_level",
    "hits", "hit_velocity", "xgb_prob", "ae_recon_error",
-    "asn_org", "country_code",
+    "asn_org", "country_code", "browser_family",
 }
 _TRAFFIC_SORT_COLS = {
    "time", "src_ip", "method", "host", "path", "http_version",
@ -111,6 +111,20 @@ async def overview() -> dict[str, Any]:
            "GROUP BY model_name"
        )

+        browser_stats = query(
+            f"SELECT browser_family, count() AS cnt "
+            f"FROM {_DB}.ml_all_scores "
+            "WHERE detected_at >= now() - INTERVAL 1 DAY "
+            "AND browser_family != '' "
+            "GROUP BY browser_family ORDER BY cnt DESC"
+        )
+
+        legitimate_browsers = query_scalar(
+            f"SELECT count() FROM {_DB}.ml_all_scores "
+            "WHERE detected_at >= now() - INTERVAL 1 DAY "
+            "AND threat_level = 'LEGITIMATE_BROWSER'"
+        ) or 0
+
        return {
            "detections_24h": detections_24h,
            "scored_24h": scored_24h,
@ -118,6 +132,8 @@ async def overview() -> dict[str, Any]:
            "unique_ips": unique_ips,
            "critical_count": critical_count,
            "high_count": high_count,
+            "legitimate_browsers": legitimate_browsers,
+            "browser_stats": browser_stats,
            "threat_distribution": threat_distribution,
            "top_ips": top_ips,
            "timeline": [{"hour": str(r["hour"]), "cnt": r["cnt"]} for r in timeline],
@ -143,6 +159,7 @@ async def detections(
    country_code: str | None = Query(None),
    ja4: str | None = Query(None),
    bot_name: str | None = Query(None),
+    browser_family: str | None = Query(None),
 ) -> dict[str, Any]:
    sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at")
    order = _validate_order(order)
@ -177,6 +194,10 @@ async def detections(
        where_clauses.append("bot_name = {bn:String}")
        params["bn"] = bot_name

+    if browser_family:
+        where_clauses.append("browser_family = {bf:String}")
+        params["bf"] = browser_family
+
    where = " AND ".join(where_clauses)

    try:
@ -219,6 +240,7 @@ async def scores(
    asn_org: str | None = Query(None),
    country_code: str | None = Query(None),
    ja4: str | None = Query(None),
+    browser_family: str | None = Query(None),
 ) -> dict[str, Any]:
    sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at")
    order = _validate_order(order)
@ -249,6 +271,10 @@ async def scores(
        where_clauses.append("ja4 = {ja4:String}")
        params["ja4"] = ja4

+    if browser_family:
+        where_clauses.append("browser_family = {bf:String}")
+        params["bf"] = browser_family
+
    where = " AND ".join(where_clauses)

    try:
--- a/shared/clickhouse/06_ml_tables.sql
+++ b/shared/clickhouse/06_ml_tables.sql
@ -24,6 +24,7 @@
 CREATE TABLE IF NOT EXISTS ja4_processing.ml_detected_anomalies
 (
    detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String,
+    browser_family LowCardinality(String) DEFAULT '',
    anomaly_score Float32, threat_level String, model_name String, recurrence UInt32,
    asn_number String, asn_org String, asn_detail String, asn_domain String,
    country_code String, asn_label String,
@ -80,6 +81,7 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores
    ja4               String,
    host              String,
    bot_name          String,
+    browser_family    LowCardinality(String) DEFAULT '',
    anomaly_score     Float32,
    raw_anomaly_score Float32,
    threat_level      String,