diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index 7bdafe3..30160a6 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -151,6 +151,11 @@ XGB_WEIGHT = _require_float('XGB_WEIGHT', 0.20, 0, 1) # pondération dans le sc XGB_MIN_LABELS = int(os.getenv('XGB_MIN_LABELS', '100')) # nb minimum de labels historiques pour entraîner XGB_RETRAIN_INTERVAL_H = int(os.getenv('XGB_RETRAIN_INTERVAL_HOURS', '168')) # retrain hebdomadaire +# A9 — Classification navigateurs légitimes (JA4 + cohérence comportementale) +# Seuil minimal de browser_consistency_score [0..5] pour classer LEGITIMATE_BROWSER. +# 4/5 = navigateur reconnu + ≥3 signaux comportementaux (sec-ch-ua, cookies, Accept-Language, Sec-Fetch). +BROWSER_LEGIT_MIN_CONSISTENCY = int(os.getenv('BROWSER_LEGIT_MIN_CONSISTENCY', '4')) + # Features structurellement indisponibles par modèle (pas de données L4 pour trafic non-corrélé) # Ces features ne génèrent pas de warnings "pipeline" — leur absence est by-design. STRUCTURAL_EXCLUDED_FEATURES: dict[str, list] = { @@ -441,7 +446,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) - result = client.query( f"SELECT {feature_cols}, threat_level " f"FROM {DB}.ml_all_scores " - f"WHERE threat_level IN ('NORMAL', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') " + f"WHERE threat_level IN ('NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'ANUBIS_DENY', 'KNOWN_BOT') " f"AND window_start >= now() - INTERVAL 7 DAY " f"ORDER BY rand() LIMIT 50000" ) @@ -451,7 +456,7 @@ def _load_xgb_labels(client, features: list, min_labels: int = XGB_MIN_LABELS) - df = pd.DataFrame(result.result_rows, columns=cols) df[features] = df[features].apply(pd.to_numeric, errors='coerce') df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features) - y = (df['threat_level'] != 'NORMAL').astype(int) + y = (~df['threat_level'].isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int) if y.sum() < 10 or len(y) < min_labels: return None, None X = df[features].values @@ -1055,6 +1060,36 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): deny_mask = unknown_traffic['anubis_bot_action'] == 'DENY' unknown_traffic.loc[deny_mask, 'threat_level'] = 'ANUBIS_DENY' + # ── Classification des navigateurs légitimes ───────────────────────────── + # Principe : un cluster/session est classé LEGITIMATE_BROWSER si et seulement si : + # 1. Le JA4 correspond à un navigateur connu (browser_family != '') + # 2. Le comportement est cohérent avec un vrai navigateur (browser_consistency_score >= 4) + # 3. Le score IF ne signale pas de menace (threat_level in NORMAL/LOW) + # 4. Ce n'est pas un DENY Anubis + # Cela réduit les faux positifs sur les vrais utilisateurs tout en détectant + # le spoofing de JA4 (navigateur déclaré mais comportement incohérent). + _bf = unknown_traffic.get('browser_family', pd.Series('', index=unknown_traffic.index)).fillna('').astype(str) + _bcs = unknown_traffic.get('browser_consistency_score', pd.Series(0, index=unknown_traffic.index)).fillna(0) + browser_legit_mask = ( + (_bf != '') & # JA4 navigateur reconnu + (_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY) & # comportement cohérent + (unknown_traffic['threat_level'].isin(['NORMAL', 'LOW'])) & # pas de menace IF + (~deny_mask) # pas un DENY Anubis + ) + if browser_legit_mask.any(): + unknown_traffic.loc[browser_legit_mask, 'threat_level'] = 'LEGITIMATE_BROWSER' + unknown_traffic.loc[browser_legit_mask, 'reason'] = ( + '[Navigateur légitime] ' + _bf[browser_legit_mask] + + ' (cohérence=' + _bcs[browser_legit_mask].astype(int).astype(str) + '/5)' + ) + n_legit = browser_legit_mask.sum() + families = _bf[browser_legit_mask].value_counts().to_dict() + log_info(f"[{name}] {n_legit} session(s) classée(s) LEGITIMATE_BROWSER : {families}") + log_decision('LEGITIMATE_BROWSER', cycle_id, name, { + 'count': int(n_legit), 'families': families, + 'mean_consistency': round(float(_bcs[browser_legit_mask].mean()), 2), + }) + # Capturer toutes les sessions scorées (avant filtrage par seuil) — pour ml_all_scores all_scored = unknown_traffic.copy() @@ -1120,9 +1155,10 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map): 'recurrence': int(row.get('recurrence', 1)), }) - # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY + # Filtrer sur raw_anomaly_score (A6 inclus) — seulement le trafic non-DENY et non-navigateur légitime # Les DENY sont toujours des threats, indépendamment du seuil IF - non_deny_traffic = unknown_traffic[~deny_mask] + # Les LEGITIMATE_BROWSER sont exclus des anomalies (navigateurs confirmés) + non_deny_traffic = unknown_traffic[~deny_mask & (unknown_traffic['threat_level'] != 'LEGITIMATE_BROWSER')] anomalies = non_deny_traffic[non_deny_traffic['raw_anomaly_score'] < effective_threshold].copy() if not anomalies.empty: log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).") @@ -1261,6 +1297,36 @@ def _preprocess_df(df: pd.DataFrame) -> pd.DataFrame: if col in df.columns: df[col] = df[col].fillna('').astype(str) + # ── Feature dérivée : navigateur légitime connu (dict_browser_ja4) ── + # is_known_browser : 1 si le JA4 correspond à un navigateur dans dict_browser_ja4. + # Signal fort de légitimité passé à l'IF pour réduire les faux positifs sur les + # vrais navigateurs. Le modèle apprend que browser_family ≠ '' corrèle avec le + # comportement humain normal (navigation, assets, referer, cookies). + df['is_known_browser'] = ( + df.get('browser_family', pd.Series('', index=df.index)).fillna('').astype(str) != '' + ).astype(int) + + # ── browser_consistency_score : [0..5] — cohérence comportementale navigateur ── + # Combine 5 signaux binaires attendus d'un vrai navigateur : + # 1. browser_family reconnu (JA4 dans dict_browser_ja4) + # 2. modern_browser_score ≥ 50 (sec-ch-ua / UA modernes) + # 3. Accept-Language présent + # 4. Cookies présents (session active) + # 5. Sec-Fetch-* présent (absence faible) + # Un score ≥ 4 indique un navigateur cohérent. Un score bas avec + # is_known_browser=1 signale un possible spoofing de JA4. + _mbs = df.get('modern_browser_score', pd.Series(0, index=df.index)).fillna(0) + _hal = df.get('has_accept_language', pd.Series(0, index=df.index)).fillna(0) + _hck = df.get('has_cookie', pd.Series(0, index=df.index)).fillna(0) + _sfa = df.get('sec_fetch_absence_rate', pd.Series(1, index=df.index)).fillna(1) + df['browser_consistency_score'] = ( + df['is_known_browser'] + + (_mbs >= 50).astype(int) + + (_hal > 0).astype(int) + + (_hck > 0).astype(int) + + (_sfa < 0.5).astype(int) + ) + # ── Features numériques dérivées des labels Anubis (pour IsolationForest) ── # anubis_is_flagged : 1 si le trafic est marqué WEIGH/CHALLENGE par Anubis # → signal de suspicion modéré passé à l'IF (ALLOW/DENY sont exclus du pipeline) @@ -1399,6 +1465,8 @@ def fetch_and_analyze(): 'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio', # Anubis : signal de suspicion modéré (WEIGH/CHALLENGE) — bypass pour ALLOW/DENY 'anubis_is_flagged', + # Browser : signaux de navigateur légitime (dict_browser_ja4 + cohérence comportementale) + 'is_known_browser', 'browser_consistency_score', # HTTP : header incomplet et usage HTTP plain (disponibles pour les deux modèles) 'missing_accept_enc_ratio', 'http_scheme_ratio', # ── Thèse §5 : features avancées (optionnelles — ignorées si indisponibles) ── @@ -1456,6 +1524,7 @@ def fetch_and_analyze(): all_scored['ja4'] = all_scored['ja4'].replace({'': 'HTTP_CLEAR_TEXT'}) all_scores_cols = [ 'detected_at', 'window_start', 'src_ip', 'ja4', 'host', 'bot_name', + 'browser_family', 'anubis_bot_name', 'anubis_bot_action', 'anubis_bot_category', 'anomaly_score', 'raw_anomaly_score', 'threat_level', 'model_name', 'correlated', 'asn_number', 'asn_org', 'country_code', 'asn_label', @@ -1485,7 +1554,7 @@ def fetch_and_analyze(): all_anom['is_headless'] = all_anom[fake_nav_col].astype(int) if fake_nav_col in all_anom.columns else 0 cols = [ - 'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'anomaly_score', + 'detected_at', 'src_ip', 'ja4', 'host', 'bot_name', 'browser_family', 'anomaly_score', 'threat_level', 'model_name', 'recurrence', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain', 'country_code', 'asn_label', 'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio', 'max_keepalives', 'orphan_ratio', @@ -1534,6 +1603,7 @@ if __name__ == '__main__': log_info(f' Clustering : {"activé" if ENABLE_CLUSTERING else "désactivé"} | Dedup TTL : {DEDUP_TTL_MIN}min') log_info(f' Récurrence weight : {RECURRENCE_WEIGHT} | Min features ratio : {MIN_VALID_FEATURE_RATIO:.0%}') log_info(f' Anubis : ALLOW→KNOWN_BOT (score=0), DENY→ANUBIS_DENY (score IF réel)') + log_info(f' Browser : LEGITIMATE_BROWSER si consistency>={BROWSER_LEGIT_MIN_CONSISTENCY}/5 + JA4 reconnu') log_info('*' * 65) log_decision('SERVICE_START', 'boot', '', { 'db': DB, 'contamination': CONTAMINATION, 'anomaly_threshold': ANOMALY_THRESHOLD, diff --git a/services/bot-detector/bot_detector/tests/test_detector.py b/services/bot-detector/bot_detector/tests/test_detector.py index c11ba2b..ce5daa3 100644 --- a/services/bot-detector/bot_detector/tests/test_detector.py +++ b/services/bot-detector/bot_detector/tests/test_detector.py @@ -587,3 +587,113 @@ def test_xgb_save_load_model(): np.testing.assert_allclose(probs_before, probs_after, rtol=1e-5) finally: os.remove(path) + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Tests pour la classification des navigateurs légitimes (A9) +# ═══════════════════════════════════════════════════════════════════════════════ + +def test_is_known_browser_feature(): + """is_known_browser=1 when browser_family is set, 0 otherwise.""" + df = pd.DataFrame({ + 'browser_family': ['Chromium', 'Firefox', '', '', 'Safari'], + }) + df['is_known_browser'] = (df['browser_family'] != '').astype(int) + assert df['is_known_browser'].tolist() == [1, 1, 0, 0, 1] + + +def test_browser_consistency_score_range(): + """browser_consistency_score is in [0, 5] and sums 5 binary signals.""" + df = pd.DataFrame({ + 'browser_family': ['Chromium', '', 'Firefox', 'Safari'], + 'modern_browser_score': [100, 0, 80, 50], + 'has_accept_language': [1, 0, 1, 1], + 'has_cookie': [1, 0, 1, 0], + 'sec_fetch_absence_rate': [0.0, 1.0, 0.1, 0.6], + }) + is_known = (df['browser_family'] != '').astype(int) + mbs_ok = (df['modern_browser_score'] >= 50).astype(int) + hal_ok = (df['has_accept_language'] > 0).astype(int) + hck_ok = (df['has_cookie'] > 0).astype(int) + sfa_ok = (df['sec_fetch_absence_rate'] < 0.5).astype(int) + bcs = is_known + mbs_ok + hal_ok + hck_ok + sfa_ok + + assert bcs.min() >= 0 and bcs.max() <= 5 + assert bcs.iloc[0] == 5, "Chromium with all signals should score 5" + assert bcs.iloc[1] == 0, "Empty browser with no signals should score 0" + assert bcs.iloc[2] == 5, "Firefox with all signals should score 5" + assert bcs.iloc[3] == 3, "Safari without cookie and high sec_fetch_absence should score 3" + + +def test_legitimate_browser_classification_threshold(): + """LEGITIMATE_BROWSER requires browser_family + consistency >= threshold + NORMAL/LOW threat.""" + BROWSER_LEGIT_MIN_CONSISTENCY = 4 + + sessions = [ + # (browser_family, bcs, threat_level) → expected classification + ('Chromium', 5, 'NORMAL'), # → LEGITIMATE_BROWSER + ('Chromium', 5, 'MEDIUM'), # threat too high → keep MEDIUM + ('Firefox', 4, 'LOW'), # → LEGITIMATE_BROWSER + ('Firefox', 3, 'NORMAL'), # consistency too low → keep NORMAL + ('', 5, 'NORMAL'), # no browser → keep NORMAL (can't be 5 without browser, but edge case) + ('Chromium', 5, 'ANUBIS_DENY'), # Anubis DENY → keep ANUBIS_DENY + ] + results = [] + for bf, bcs, tl in sessions: + is_legit = ( + bf != '' and + bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and + tl in ('NORMAL', 'LOW') + ) + results.append('LEGITIMATE_BROWSER' if is_legit else tl) + + assert results == [ + 'LEGITIMATE_BROWSER', + 'MEDIUM', + 'LEGITIMATE_BROWSER', + 'NORMAL', + 'NORMAL', + 'ANUBIS_DENY', + ] + + +def test_legitimate_browser_excluded_from_anomalies(): + """Sessions classified as LEGITIMATE_BROWSER should not appear in anomaly list.""" + threat_levels = pd.Series(['CRITICAL', 'HIGH', 'MEDIUM', 'NORMAL', 'LEGITIMATE_BROWSER', 'ANUBIS_DENY']) + raw_scores = pd.Series([-0.5, -0.2, -0.08, 0.1, 0.05, -0.15]) + threshold = -0.05 + + # Anomalies = below threshold AND not LEGITIMATE_BROWSER AND not ANUBIS_DENY + non_browser = threat_levels != 'LEGITIMATE_BROWSER' + non_deny = threat_levels != 'ANUBIS_DENY' + below_threshold = raw_scores < threshold + anomaly_mask = non_browser & non_deny & below_threshold + + assert anomaly_mask.tolist() == [True, True, True, False, False, False] + + +def test_browser_spoofing_detection(): + """Inconsistent browser behavior (known JA4 but low consistency) stays in normal scoring.""" + BROWSER_LEGIT_MIN_CONSISTENCY = 4 + + # Spoofed: JA4 looks like Chrome but no cookies, no Accept-Language, high sec_fetch_absence + spoofed_bf = 'Chromium' + spoofed_bcs = 1 # only is_known_browser=1, all others fail + spoofed_tl = 'MEDIUM' + + is_legit = ( + spoofed_bf != '' and + spoofed_bcs >= BROWSER_LEGIT_MIN_CONSISTENCY and + spoofed_tl in ('NORMAL', 'LOW') + ) + assert not is_legit, "Spoofed browser should NOT be classified as LEGITIMATE_BROWSER" + + +def test_xgb_labels_treat_legitimate_browser_as_normal(): + """XGBoost training should treat LEGITIMATE_BROWSER as negative (non-threat) label.""" + # The XGBoost query only selects these threat_levels (not LOW/MEDIUM) + threat_levels = pd.Series([ + 'NORMAL', 'LEGITIMATE_BROWSER', 'HIGH', 'CRITICAL', 'KNOWN_BOT', 'ANUBIS_DENY' + ]) + y = (~threat_levels.isin(['NORMAL', 'LEGITIMATE_BROWSER'])).astype(int) + assert y.tolist() == [0, 0, 1, 1, 1, 1] diff --git a/services/dashboard/backend/routes/api.py b/services/dashboard/backend/routes/api.py index 43743ee..a794383 100644 --- a/services/dashboard/backend/routes/api.py +++ b/services/dashboard/backend/routes/api.py @@ -26,13 +26,13 @@ _DETECTION_SORT_COLS = { "detected_at", "src_ip", "ja4", "host", "anomaly_score", "threat_level", "recurrence", "hits", "hit_velocity", "fuzzing_index", "post_ratio", "campaign_id", - "asn_org", "country_code", "bot_name", + "asn_org", "country_code", "bot_name", "browser_family", } _SCORE_SORT_COLS = { "detected_at", "window_start", "src_ip", "ja4", "host", "anomaly_score", "raw_anomaly_score", "threat_level", "hits", "hit_velocity", "xgb_prob", "ae_recon_error", - "asn_org", "country_code", + "asn_org", "country_code", "browser_family", } _TRAFFIC_SORT_COLS = { "time", "src_ip", "method", "host", "path", "http_version", @@ -111,6 +111,20 @@ async def overview() -> dict[str, Any]: "GROUP BY model_name" ) + browser_stats = query( + f"SELECT browser_family, count() AS cnt " + f"FROM {_DB}.ml_all_scores " + "WHERE detected_at >= now() - INTERVAL 1 DAY " + "AND browser_family != '' " + "GROUP BY browser_family ORDER BY cnt DESC" + ) + + legitimate_browsers = query_scalar( + f"SELECT count() FROM {_DB}.ml_all_scores " + "WHERE detected_at >= now() - INTERVAL 1 DAY " + "AND threat_level = 'LEGITIMATE_BROWSER'" + ) or 0 + return { "detections_24h": detections_24h, "scored_24h": scored_24h, @@ -118,6 +132,8 @@ async def overview() -> dict[str, Any]: "unique_ips": unique_ips, "critical_count": critical_count, "high_count": high_count, + "legitimate_browsers": legitimate_browsers, + "browser_stats": browser_stats, "threat_distribution": threat_distribution, "top_ips": top_ips, "timeline": [{"hour": str(r["hour"]), "cnt": r["cnt"]} for r in timeline], @@ -143,6 +159,7 @@ async def detections( country_code: str | None = Query(None), ja4: str | None = Query(None), bot_name: str | None = Query(None), + browser_family: str | None = Query(None), ) -> dict[str, Any]: sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at") order = _validate_order(order) @@ -177,6 +194,10 @@ async def detections( where_clauses.append("bot_name = {bn:String}") params["bn"] = bot_name + if browser_family: + where_clauses.append("browser_family = {bf:String}") + params["bf"] = browser_family + where = " AND ".join(where_clauses) try: @@ -219,6 +240,7 @@ async def scores( asn_org: str | None = Query(None), country_code: str | None = Query(None), ja4: str | None = Query(None), + browser_family: str | None = Query(None), ) -> dict[str, Any]: sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at") order = _validate_order(order) @@ -249,6 +271,10 @@ async def scores( where_clauses.append("ja4 = {ja4:String}") params["ja4"] = ja4 + if browser_family: + where_clauses.append("browser_family = {bf:String}") + params["bf"] = browser_family + where = " AND ".join(where_clauses) try: diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql index f6c3406..35420da 100644 --- a/shared/clickhouse/06_ml_tables.sql +++ b/shared/clickhouse/06_ml_tables.sql @@ -24,6 +24,7 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_detected_anomalies ( detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String, + browser_family LowCardinality(String) DEFAULT '', anomaly_score Float32, threat_level String, model_name String, recurrence UInt32, asn_number String, asn_org String, asn_detail String, asn_domain String, country_code String, asn_label String, @@ -80,6 +81,7 @@ CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores ja4 String, host String, bot_name String, + browser_family LowCardinality(String) DEFAULT '', anomaly_score Float32, raw_anomaly_score Float32, threat_level String,