ja4ebpf: - Refactor BPF TC capture with improved SYN offset handling and TCP option parsing - Enhance TLS uprobe SSL hooking for better key extraction - Add ClickHouse writer improvements for HTTP log materialized views - Update RPM spec for Rocky Linux 8/9/10, fix systemd service - Simplify loader with cleaner bpf2go integration bot-detector: - Add H2 SETTINGS per-parameter comparison in browser_matcher - Enhance browser signatures and scoring pipeline - Improve preprocessing and cycle detection infra: - Multi-distro Vagrantfile (centos8, rocky9, rocky10) with per-distro provisioning - New Makefile targets: vm-up-all, test-vm-matrix, test-vm-centos8/rocky10 - Add debug helpers and run-test-from-host.sh for host-driven VM testing - Update run-tests-vm.sh for cross-distro compatibility - Remove accidental binary blob (\004) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
154 lines
8.2 KiB
Python
154 lines
8.2 KiB
Python
"""Prétraitement des données et listes de features.
|
|
|
|
Normalise les colonnes, enrichit via l'identification multifactorielle des
|
|
navigateurs, et définit les listes de features pour chaque modèle.
|
|
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from .config import BROWSER_CONFIDENCE_THRESHOLD
|
|
from .log import log_info
|
|
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
|
|
from .browser_matcher import run_browser_matcher, log_dual_mode_comparison, BROWSER_MATCHER_ENABLED
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
# FEATURE LISTS PER MODEL
# ═══════════════════════════════════════════════════════════════════════════════

# Common features (pure L7 HTTP, available for both correlated=0 and correlated=1).
# Order is preserved exactly: consumers may rely on column position.
FEATURES = [
    'hits',
    'hit_velocity',
    'fuzzing_index',
    'post_ratio',
    'port_exhaustion_ratio',
    'orphan_ratio',
    'max_keepalives',
    'tcp_shared_count',
    'header_order_shared_count',
    'header_count',
    'has_accept_language',
    'has_cookie',
    'has_referer',
    'modern_browser_score',
    'ua_ch_mismatch',
    'has_sec_ch_ua',
    'ip_id_zero_ratio',
    'request_size_variance',
    'multiplexing_efficiency',
    'mss_mobile_mismatch',
    'asset_ratio',
    'direct_access_ratio',
    'is_ua_rotating',
    'distinct_ja4_count',
    'src_port_density',
    'ja4_asn_concentration',
    'ja4_country_concentration',
    'is_rare_ja4',
    'header_order_confidence',
    'distinct_header_orders',
    'temporal_entropy',
    'path_diversity_ratio',
    'url_depth_variance',
    'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio',
    'sec_fetch_absence_rate',
    'generic_accept_ratio',
    'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multi-factor browser identification
    'is_known_browser',
    'browser_consistency_score',
    'browser_confidence',
    'axis_ja4_known',
    'axis_ja4_struct',
    'axis_http_modern',
    'axis_nav_behavior',
    'axis_tls_coherence',
    'axis_h2_coherence',
    # HTTP
    'missing_accept_enc_ratio',
    'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv',
    'burst_ratio',
    'pause_ratio',
    'lag1_autocorrelation',
    'benford_deviation',
    'host_diversity',
    'host_sweep_speed',
    'host_coverage_uniformity',
    # §5.8b — Cross-domain Jaccard similarity (paths shared between hosts)
    'cross_domain_path_similarity',
    # §5.4 — Resource Dependency Tree (asset loading cascade)
    'root_to_first_asset_delay',
    'asset_load_stddev',
    # P0+P1: under-used features (existing or newly added SQL)
    'is_fake_navigation',
    'true_window_size',
    'window_mss_ratio',
    # P1: new detection features
    'has_xff',
    'unusual_content_type_ratio',
    'non_standard_port_ratio',
    'login_post_concentration',
    'sec_ch_mobile_mismatch',
    # §2 — HTTP/2 features (SETTINGS fingerprint, H2<->JA4 coherence)
    'h2_settings_known',
    'h2_pseudo_order_match',
    'h2_ja4_coherence',
    'h2_settings_rare',
    # §4 — Atomic H2 signals for the browser_matcher
    #      (Family 4: cross-layer coherence)
    'tls_h2_family_mismatch',
    'h2_window_chrome',
    'h2_window_firefox',
    'h2_window_safari',
    'h2_window_absent',
    'h2_order_chromesafari',
    'h2_order_firefox',
    # §3 — Cross-layer fingerprint coherence score
    'fingerprint_coherence_score',
    # §3.9.4 — Browser matcher scores (passive H2)
    'browser_match_chrome',
    'browser_match_firefox',
    'browser_match_safari',
    'browser_match_max',
]

# Additional features for the "Complet" (full) model (TCP/TLS data required).
# Built as a new list so that FEATURES itself is never aliased or mutated.
FEATURES_COMPLET = [
    *FEATURES,
    'tcp_jitter_variance',
    'alpn_http_mismatch',
    'is_alpn_missing',
    'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio',
    'syn_timing_cv',
    'tls12_ratio',
    'ip_df_variance',
    # OS fingerprinting via TTL + TCP window scale
    'avg_ttl',
    'ttl_std',
    'no_window_scale_ratio',
    # §5.5 — Intra-session JA4 drift
    'ja4_drift_ratio',
]
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
# PREPROCESSING
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize columns, add browser-identification features and impute gaps.

    Shared by the 1h and 24h pipelines. Steps, in order:

    1. Reduce every column name to its last dot-separated component.
    2. Coerce a fixed set of identifier/label columns to ``str`` (NaN -> '').
    3. Compute the browser axes, the inferred browser family and the
       ``browser_confidence`` / ``axis_*`` columns.
    4. Run the browser signature matcher, or add neutral default columns
       when it is disabled.
    5. Derive the backward-compatible ``is_known_browser`` and
       ``browser_consistency_score`` columns and ``anubis_is_flagged``.
    6. Impute missing values: -1 for binary features, the column median
       for the listed float/int dtypes (after turning +/-inf into NaN).

    Args:
        df: Raw feature frame. Its column labels are rewritten and new
            columns are assigned on it, so the caller's object is modified.

    Returns:
        The preprocessed frame: whatever ``run_browser_matcher`` returns
        when the matcher is enabled, otherwise ``df`` itself.
    """
    # Keep only the trailing component of dotted column names.
    df.columns = [c.split('.')[-1] for c in df.columns]

    text_columns = (
        'src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
        'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
        'country_code', 'asn_label',
    )
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multi-factor browser identification ─────────────────────────────
    browser_axes = _compute_browser_axes(df)
    # Fall back to an all-empty series when the frame has no 'ja4' column.
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))

    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)

    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ('axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence'):
        df[ax] = browser_axes[ax]

    # ── A9b — Browser Signature Matcher (parallel to browser_confidence) ─────
    # Per the original note: in DUAL_MODE (BROWSER_MATCHER_REPLACE=false) the
    # bm_* columns are added for logging only; the bypass stays driven by
    # browser_confidence until the matcher is fully validated.
    if BROWSER_MATCHER_ENABLED:
        df = run_browser_matcher(df)
    else:
        # Neutral defaults when the matcher is disabled.
        for col in ('browser_match_chrome', 'browser_match_firefox',
                    'browser_match_safari', 'browser_match_max'):
            df[col] = 0.0
        df['browser_family_detected'] = ''

    # Backward compatibility
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # Sum of five indicators: clipped JA4-known axis, three axes thresholded
    # at 0.5 (NaN compares False, i.e. contributes 0), and a non-empty
    # inferred family.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is present and
    # the action is something other than ALLOW / DENY / empty.
    anubis_name = df.get('anubis_bot_name', pd.Series('', index=df.index))
    anubis_action = df.get('anubis_bot_action', pd.Series('', index=df.index))
    df['anubis_is_flagged'] = (
        (anubis_name != '') & (~anubis_action.isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Smart imputation
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
        'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
        'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
        # §2 — Binary HTTP/2 features
        'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare',
        # §4 — Binary atomic H2 signals
        'tls_h2_family_mismatch',
        'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent',
        'h2_order_chromesafari', 'h2_order_firefox', 'h2_priority_present',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')
    for col in df.columns:
        if col in binary_features:
            # -1 marks "unknown" for binary flags.
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Replace +/-inf BEFORE taking the median: a median computed on
            # the raw column can itself be inf and would write inf back into
            # the gaps. A column that is entirely NaN stays NaN (its median
            # is undefined).
            finite = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = finite.fillna(finite.median())

    return df
|