Files
ja4-platform/services/bot-detector/bot_detector/preprocessing.py
toto e52cdcc01f feat(bot-detector): Browser Signature Detection engine (parallel mode)
Étape A — browser_signatures.py
  Données pures : BROWSER_SIGNATURES (Chrome/Firefox/Safari), NON_BROWSER_SIGNATURES
  (curl/httpx/go), BROWSER_THRESHOLDS, DIMENSION_WEIGHTS. Valeurs H2 extraites
  des captures réelles (format Akamai avec virgules, non semicolons).

Étape B — browser_matcher.py
  Moteur vectorisé 7 dimensions (H2 SETTINGS 0.30, WINDOW_UPDATE 0.15,
  pseudo-header order 0.15, H2 PRIORITY 0.10, HTTP headers 0.15, TLS 0.10,
  JA4 dict 0.05). run_browser_matcher(df) ajoute bm_family/bm_score/bm_decision.
  CDN edge case : dimension H2 neutralisée (0.5) si has_xff=1.
  BROWSER_MATCHER_REPLACE=false par défaut (mode DUAL_MODE logging uniquement).

Étape C — 06_browser_signature_detection.sql (migration)
  Crée browser_h2_signatures (table MergeTree avec 12 fingerprints de référence).
  Recrée dict_browser_h2 depuis la table avec champ confidence (remplace CSV).

Étape D — 07_ai_features_view.sql
  +h2_wu_val dans le JOIN http_logs, +h2_window_update_value, +h2_dict_family,
  +h2_dict_confidence, +h2_window_{chrome,firefox,safari,absent},
  +h2_order_{chromesafari,firefox}, +h2_priority_present, +h2_pseudo_ord_raw,
  +tls_h2_family_mismatch (détection incohérence famille JA4 vs famille H2).

Étape E — preprocessing.py + pipeline.py
  preprocessing.py: appelle run_browser_matcher() après compute_browser_axes(),
  ajoute 7 nouvelles features binaires H2 à FEATURES et binary_features.
  pipeline.py: appelle log_dual_mode_comparison() après la classification A9.
  BROWSER_MATCHER_REPLACE=true active le remplacement du bypass.

Étape F — test_browser_matcher.py
  8 tests : Chrome/Firefox/Safari full match, curl rejeté, httpcloak partiel,
  TLS↔H2 mismatch, CDN proxy neutralisation, go net/http rejeté.
  Tous 8 PASSED (+ 36 tests existants inchangés).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-10 13:52:57 +02:00

144 lines
7.6 KiB
Python

"""Prétraitement des données et listes de features.
Normalise les colonnes, enrichit via l'identification multifactorielle des
navigateurs, et définit les listes de features pour chaque modèle.
"""
import pandas as pd
import numpy as np
from .config import BROWSER_CONFIDENCE_THRESHOLD
from .log import log_info
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
from .browser_matcher import run_browser_matcher, log_dual_mode_comparison, BROWSER_MATCHER_ENABLED
# ═══════════════════════════════════════════════════════════════════════════════
# LISTES DE FEATURES PAR MODÈLE
# ═══════════════════════════════════════════════════════════════════════════════
# Common features (pure L7 HTTP, available whether correlated=0 or 1).
# Order matters: this list fixes the column order of the model input matrix.
FEATURES = [
    'hits',
    'hit_velocity',
    'fuzzing_index',
    'post_ratio',
    'port_exhaustion_ratio',
    'orphan_ratio',
    'max_keepalives',
    'tcp_shared_count',
    'header_order_shared_count',
    'header_count',
    'has_accept_language',
    'has_cookie',
    'has_referer',
    'modern_browser_score',
    'ua_ch_mismatch',
    'has_sec_ch_ua',
    'ip_id_zero_ratio',
    'request_size_variance',
    'multiplexing_efficiency',
    'mss_mobile_mismatch',
    'asset_ratio',
    'direct_access_ratio',
    'is_ua_rotating',
    'distinct_ja4_count',
    'src_port_density',
    'ja4_asn_concentration',
    'ja4_country_concentration',
    'is_rare_ja4',
    'header_order_confidence',
    'distinct_header_orders',
    'temporal_entropy',
    'path_diversity_ratio',
    'url_depth_variance',
    'anomalous_payload_ratio',

    # B4-B7: pure L7 features
    'head_ratio',
    'sec_fetch_absence_rate',
    'generic_accept_ratio',
    'http10_ratio',

    # Anubis
    'anubis_is_flagged',

    # Multi-factor browser identification
    'is_known_browser',
    'browser_consistency_score',
    'browser_confidence',
    'axis_ja4_known',
    'axis_ja4_struct',
    'axis_http_modern',
    'axis_nav_behavior',
    'axis_tls_coherence',
    'axis_h2_coherence',

    # HTTP
    'missing_accept_enc_ratio',
    'http_scheme_ratio',

    # Thesis §5
    'path_transition_entropy',
    'cadence_cv',
    'burst_ratio',
    'pause_ratio',
    'lag1_autocorrelation',
    'benford_deviation',
    'host_diversity',
    'host_sweep_speed',
    'host_coverage_uniformity',

    # §5.8b — cross-domain Jaccard similarity (paths shared between hosts)
    'cross_domain_path_similarity',

    # P0+P1: under-used features (SQL already present or added)
    'is_fake_navigation',
    'true_window_size',
    'window_mss_ratio',

    # P1: new detection features
    'has_xff',
    'unusual_content_type_ratio',
    'non_standard_port_ratio',
    'login_post_concentration',
    'sec_ch_mobile_mismatch',

    # §2 — HTTP/2 features (SETTINGS fingerprint, H2<->JA4 coherence)
    'h2_settings_known',
    'h2_pseudo_order_match',
    'h2_ja4_coherence',
    'h2_settings_rare',

    # §4 — atomic H2 signals for the browser_matcher
    #      (family 4: cross-layer coherence)
    'tls_h2_family_mismatch',
    'h2_window_chrome',
    'h2_window_firefox',
    'h2_window_safari',
    'h2_window_absent',
    'h2_order_chromesafari',
    'h2_order_firefox',

    # §3 — cross-layer fingerprint coherence score
    'fingerprint_coherence_score',
]
# Extra features for the "Complet" model (TCP/TLS data required).
# The common FEATURES come first, followed by the TCP/TLS-only columns.
FEATURES_COMPLET = [
    *FEATURES,

    'tcp_jitter_variance',
    'alpn_http_mismatch',
    'is_alpn_missing',
    'sni_host_mismatch',

    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio',
    'syn_timing_cv',
    'tls12_ratio',
    'ip_df_variance',

    # OS TTL fingerprinting + TCP window scale
    'avg_ttl',
    'ttl_std',
    'no_window_scale_ratio',

    # §5.5 — intra-session JA4 drift
    'ja4_drift_ratio',
]
# ═══════════════════════════════════════════════════════════════════════════════
# PRÉTRAITEMENT
# ═══════════════════════════════════════════════════════════════════════════════
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise columns, derive browser features and impute missing values.

    Shared by the 1h and 24h pipelines. The steps, in order:

    1. Strip any dotted prefix from column names (``a.b.col`` -> ``col``).
    2. Cast the known identifier/label columns to ``str`` (missing -> ``''``).
    3. Compute the multi-factor browser axes and the inferred browser family.
    4. When ``BROWSER_MATCHER_ENABLED`` is truthy, run the browser signature
       matcher on the frame.
    5. Derive the backward-compatible ``is_known_browser`` and
       ``browser_consistency_score`` columns, and ``anubis_is_flagged``.
    6. Impute: binary features get ``-1`` for missing values; the listed
       numeric dtypes get +/-inf replaced and missing values filled with the
       column median.

    Args:
        df: Raw feature frame. It is modified in place (columns renamed and
            added); callers that need the original must pass a copy.

    Returns:
        The preprocessed frame. This is the object returned by
        ``run_browser_matcher`` when the matcher is enabled, otherwise ``df``.
    """
    # Keep only the last dotted component of every column name.
    df.columns = [c.split('.')[-1] for c in df.columns]

    for col in ['src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
                'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
                'country_code', 'asn_label']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # -- A9 — Multi-factor browser identification ---------------------------
    browser_axes = _compute_browser_axes(df)
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))
    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)
    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']:
        df[ax] = browser_axes[ax]

    # -- A9b — Browser Signature Matcher (parallel to browser_confidence) ---
    # In DUAL_MODE (BROWSER_MATCHER_REPLACE=false) the bm_* columns are added
    # for logging only — the bypass stays driven by browser_confidence until
    # the matcher is fully validated.
    if BROWSER_MATCHER_ENABLED:
        df = run_browser_matcher(df)

    # Backward compatibility: legacy columns derived from the axes.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # Vectorised thresholds; NaN compares False and therefore contributes 0,
    # exactly like the former per-row lambda.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is set and the
    # action is something other than ALLOW / DENY / empty.
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
        (~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Imputation: binary features use -1 as an explicit "unknown" marker
    # instead of a median, which would be meaningless for 0/1 flags.
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
        'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
        'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
        # §2 — binary HTTP/2 features
        'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare',
        # §4 — binary atomic H2 signals
        'tls_h2_family_mismatch',
        'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent',
        'h2_order_chromesafari', 'h2_order_firefox', 'h2_priority_present',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')
    for col in df.columns:
        if col in binary_features:
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Drop +/-inf BEFORE taking the median: a median computed on the
            # raw column can itself be +/-inf, which fillna would then write
            # straight back into the column.
            finite = df[col].replace([np.inf, -np.inf], np.nan)
            # NOTE: a column with no finite value has a NaN median and is
            # therefore left as NaN (unchanged from previous behaviour).
            df[col] = finite.fillna(finite.median())
    return df