Files
ja4-platform/services/bot-detector/bot_detector/preprocessing.py
toto 14db3d9040 refactor: suppression dépendance User-Agent de la détection navigateur
Changements SQL :
- modern_browser_score : sec-ch-ua→100, Sec-Fetch→70 (plus de UA fallback)
- Ajout has_sec_ch_ua (UInt8) dans agg_header_fingerprint_1h et ml_all_scores
- mss_mobile_mismatch utilise has_sec_ch_ua au lieu de modern_browser_score
- header_order_confidence : PARTITION BY ja4 au lieu de first_ua
- sec_ch_mobile_mismatch : comparaison Client Hints interne (sans UA)
- Migration 03_remove_ua_browser_detection.sql

Changements Python :
- browser.py Axe 3 : Client Hints + Sec-Fetch + is_fake_navigation (PAS de UA)
- Pondération axes : ja4_known 0.30, tls_coherence 0.20 (signaux TLS renforcés)
- preprocessing.py : has_sec_ch_ua ajouté aux features et binary_features

Fichiers modifiés : 8 SQL/Python + 1 migration, 36/36 tests passent.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-09 23:06:01 +02:00

119 lines
6.1 KiB
Python

"""Prétraitement des données et listes de features.
Normalise les colonnes, enrichit via l'identification multifactorielle des
navigateurs, et définit les listes de features pour chaque modèle.
"""
import pandas as pd
import numpy as np
from .config import BROWSER_CONFIDENCE_THRESHOLD
from .log import log_info
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
# ═══════════════════════════════════════════════════════════════════════════════
# LISTES DE FEATURES PAR MODÈLE
# ═══════════════════════════════════════════════════════════════════════════════
# Common features (pure L7 HTTP, available for correlated=0 and 1).
# NOTE: the order of the names is preserved exactly; only layout changed.
FEATURES = [
    # Volume / connection behaviour
    'hits',
    'hit_velocity',
    'fuzzing_index',
    'post_ratio',
    'port_exhaustion_ratio',
    'orphan_ratio',
    'max_keepalives',
    'tcp_shared_count',
    'header_order_shared_count',
    # Header presence and browser hints
    'header_count',
    'has_accept_language',
    'has_cookie',
    'has_referer',
    'modern_browser_score',
    'ua_ch_mismatch',
    'has_sec_ch_ua',
    'ip_id_zero_ratio',
    'request_size_variance',
    'multiplexing_efficiency',
    'mss_mobile_mismatch',
    'asset_ratio',
    'direct_access_ratio',
    'is_ua_rotating',
    'distinct_ja4_count',
    'src_port_density',
    'ja4_asn_concentration',
    'ja4_country_concentration',
    'is_rare_ja4',
    'header_order_confidence',
    'distinct_header_orders',
    'temporal_entropy',
    'path_diversity_ratio',
    'url_depth_variance',
    'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio',
    'sec_fetch_absence_rate',
    'generic_accept_ratio',
    'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multi-factor browser identification
    'is_known_browser',
    'browser_consistency_score',
    'browser_confidence',
    'axis_ja4_known',
    'axis_ja4_struct',
    'axis_http_modern',
    'axis_nav_behavior',
    'axis_tls_coherence',
    # HTTP
    'missing_accept_enc_ratio',
    'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv',
    'burst_ratio',
    'pause_ratio',
    'lag1_autocorrelation',
    'benford_deviation',
    'host_diversity',
    'host_sweep_speed',
    'host_coverage_uniformity',
    # P0+P1: under-used features (existing or newly added SQL)
    'is_fake_navigation',
    'true_window_size',
    'window_mss_ratio',
    # P1: new detection features
    'has_xff',
    'unusual_content_type_ratio',
    'non_standard_port_ratio',
    'login_post_concentration',
    'sec_ch_mobile_mismatch',
]

# Extra features for the "Complet" (full) model (requires TCP/TLS data).
# Built as a new list: the common features first, then the additions.
FEATURES_COMPLET = [
    *FEATURES,
    'tcp_jitter_variance',
    'alpn_http_mismatch',
    'is_alpn_missing',
    'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio',
    'syn_timing_cv',
    'tls12_ratio',
    'ip_df_variance',
    # OS TTL fingerprinting + TCP window scale
    'avg_ttl',
    'ttl_std',
    'no_window_scale_ratio',
    # §5.5 — intra-session JA4 drift
    'ja4_drift_ratio',
]
# ═══════════════════════════════════════════════════════════════════════════════
# PRÉTRAITEMENT
# ═══════════════════════════════════════════════════════════════════════════════
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize columns, add browser-derived features and impute missing values.

    Shared by the 1h and 24h pipelines.

    Steps, in order:
      1. Rename every column to the part after its last ``.``.
      2. Cast the known identifier/label columns to ``str`` (NaN -> ``''``).
      3. Add the multi-factor browser identification columns
         (``inferred_browser_family``, ``browser_confidence``, the five
         ``axis_*`` columns, ``is_known_browser``, ``browser_consistency_score``).
      4. Add ``anubis_is_flagged``.
      5. Impute: binary features get ``-1`` for missing values; the listed
         numeric dtypes get ±inf turned into NaN, then NaN filled with the
         median of the column's finite values.

    Args:
        df: Raw feature frame. It is modified **in place**.

    Returns:
        The same ``df`` object, after the transformations above. A numeric
        column without any finite value keeps its NaNs (no median exists).
    """
    # Keep only the last dotted component of each name
    # (presumably drops a table/alias prefix — confirm against the SQL source).
    df.columns = [c.split('.')[-1] for c in df.columns]

    text_columns = [
        'src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
        'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
        'country_code', 'asn_label',
    ]
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multi-factor browser identification ──────────────────────────────
    browser_axes = _compute_browser_axes(df)
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))
    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)
    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence']:
        df[ax] = browser_axes[ax]

    # Backward compatibility: legacy columns derived from the axes.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # One point per satisfied criterion; axes are thresholded at 0.5.
    # Vectorized comparisons replace the former row-wise ``apply`` calls.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is present and
    # the action is neither 'ALLOW', 'DENY' nor empty.
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
        (~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Imputation.
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
        'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
        'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')
    for col in df.columns:
        if col in binary_features:
            # -1 marks "missing" (presumably to keep it distinct from 0/1).
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Drop ±inf BEFORE taking the median: otherwise the median itself
            # can be ±inf and the NaNs would be filled back with infinity.
            finite_values = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = finite_values.fillna(finite_values.median())
    return df