# ja4-platform/services/bot-detector/bot_detector/preprocessing.py

"""Prétraitement des données et listes de features.

Normalise les colonnes, enrichit via l'identification multifactorielle des
navigateurs, et définit les listes de features pour chaque modèle.
"""
import pandas as pd
import numpy as np

from .config import BROWSER_CONFIDENCE_THRESHOLD
from .log import log_info
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family


# ═══════════════════════════════════════════════════════════════════════════════
# LISTES DE FEATURES PAR MODÈLE
# ═══════════════════════════════════════════════════════════════════════════════

# Common features (pure L7 HTTP, available for correlated=0 and 1).
# NOTE(review): the list order presumably defines the model's input column
# order — confirm with the training/inference code before reordering entries.
FEATURES = [
    'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio',
    'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count',
    'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
    'modern_browser_score', 'ua_ch_mismatch', 'has_sec_ch_ua', 'ip_id_zero_ratio',
    'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch',
    'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count',
    'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4',
    'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
    'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multi-factor browser identification (columns added by preprocess_df)
    'is_known_browser', 'browser_consistency_score', 'browser_confidence',
    'axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
    'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence',
    # HTTP
    'missing_accept_enc_ratio', 'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv', 'burst_ratio', 'pause_ratio',
    'lag1_autocorrelation', 'benford_deviation',
    'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
    # §5.8b — Cross-domain Jaccard similarity (paths shared between hosts)
    'cross_domain_path_similarity',
    # P0+P1: under-used features (existing or newly added SQL)
    'is_fake_navigation',
    'true_window_size', 'window_mss_ratio',
    # P1: new detection features
    'has_xff', 'unusual_content_type_ratio', 'non_standard_port_ratio',
    'login_post_concentration', 'sec_ch_mobile_mismatch',
    # §2 — HTTP/2 features (SETTINGS fingerprint, H2↔JA4 coherence)
    'h2_settings_known', 'h2_pseudo_order_match',
    'h2_ja4_coherence', 'h2_settings_rare',
    # §3 — Cross-layer fingerprint coherence score
    'fingerprint_coherence_score',
]

# Additional features for the "Complet" (full) model, which requires TCP/TLS
# data. The list is FEATURES followed by the entries below.
FEATURES_COMPLET = FEATURES + [
    'tcp_jitter_variance', 'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
    # TTL-based OS fingerprinting + TCP window scale
    'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
    # §5.5 — Intra-session JA4 drift
    'ja4_drift_ratio',
]


# ═══════════════════════════════════════════════════════════════════════════════
# PRÉTRAITEMENT
# ═══════════════════════════════════════════════════════════════════════════════

def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise columns, add browser-identification features, impute gaps.

    Shared by the 1h and 24h pipelines. The frame is modified **in place**
    and the same object is returned.

    Steps, in order:

    1. Strip any dotted prefix from column names (``t.hits`` -> ``hits``).
    2. Cast the identifier/label columns that are present to ``str``, with
       missing values becoming ``''``.
    3. Add the multi-factor browser identification columns
       (``inferred_browser_family``, ``browser_confidence``, the six
       ``axis_*`` columns) plus the backward-compatible
       ``is_known_browser`` and ``browser_consistency_score``.
    4. Derive ``anubis_is_flagged``.
    5. Impute: binary flags get ``-1`` for missing values; the listed
       float/int dtypes have ``±inf`` turned into NaN and NaN filled with
       the column median.

    Args:
        df: Raw feature frame. Column names may carry a ``prefix.`` that is
            removed; all names must be strings.

    Returns:
        The same ``DataFrame`` object, after the in-place transformations.
        A numeric column with no finite value at all has no median and
        therefore keeps its NaN values.
    """
    # Keep only the last dotted component of each column name.
    df.columns = [c.split('.')[-1] for c in df.columns]

    # Identifier / label columns are handled as plain strings; '' means "absent".
    for col in ['src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
                'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
                'country_code', 'asn_label']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multi-factor browser identification ─────────────────────────────
    # NOTE(review): browser_axes is assumed to be indexable by axis name and to
    # yield Series aligned on df.index — confirm against bot_detector.browser.
    browser_axes = _compute_browser_axes(df)
    # Fall back to an all-empty Series when the frame has no 'ja4' column.
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))

    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)

    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ['axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence']:
        df[ax] = browser_axes[ax]

    # Backward compatibility with the older single-signal columns.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # Sum of five signals: the JA4-known axis clipped to [0, 1], three axes
    # thresholded at 0.5, and whether a browser family could be inferred.
    # The vectorised comparison maps NaN to 0, exactly like the per-row test.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — Anubis named the client
    # but its action is something other than ALLOW, DENY or empty.
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', pd.Series('', index=df.index)) != '') &
        (~df.get('anubis_bot_action', pd.Series('', index=df.index)).isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Imputation
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
        'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
        'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
        # §2 — binary HTTP/2 features
        'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare',
    }
    for col in df.columns:
        if col in binary_features:
            # Missing flags become -1, presumably so "unknown" stays
            # distinguishable from a real 0 or 1.
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32'):
            # Remove infinities BEFORE taking the median: otherwise they skew
            # the fill value, and a median of ±inf would put the infinities
            # straight back into the cells that were just cleared.
            finite = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = finite.fillna(finite.median())

    return df