Étape A — browser_signatures.py
Données pures : BROWSER_SIGNATURES (Chrome/Firefox/Safari), NON_BROWSER_SIGNATURES
(curl/httpx/go), BROWSER_THRESHOLDS, DIMENSION_WEIGHTS. Valeurs H2 extraites
des captures réelles (format Akamai avec des virgules, et non des points-virgules).
Étape B — browser_matcher.py
Moteur vectorisé 7 dimensions (H2 SETTINGS 0.30, WINDOW_UPDATE 0.15,
pseudo-header order 0.15, H2 PRIORITY 0.10, HTTP headers 0.15, TLS 0.10,
JA4 dict 0.05). run_browser_matcher(df) ajoute bm_family/bm_score/bm_decision.
CDN edge case : dimension H2 neutralisée (0.5) si has_xff=1.
BROWSER_MATCHER_REPLACE=false par défaut (mode DUAL_MODE logging uniquement).
Étape C — 06_browser_signature_detection.sql (migration)
Crée browser_h2_signatures (table MergeTree avec 12 fingerprints de référence).
Recrée dict_browser_h2 depuis la table avec champ confidence (remplace CSV).
Étape D — 07_ai_features_view.sql
+h2_wu_val dans le JOIN http_logs, +h2_window_update_value, +h2_dict_family,
+h2_dict_confidence, +h2_window_{chrome,firefox,safari,absent},
+h2_order_{chromesafari,firefox}, +h2_priority_present, +h2_pseudo_ord_raw,
+tls_h2_family_mismatch (détection incohérence famille JA4 vs famille H2).
Étape E — preprocessing.py + pipeline.py
preprocessing.py : appelle run_browser_matcher() après _compute_browser_axes(),
ajoute 7 nouvelles features binaires H2 à FEATURES et 8 à binary_features
(les 7 mêmes, plus h2_priority_present).
pipeline.py: appelle log_dual_mode_comparison() après la classification A9.
BROWSER_MATCHER_REPLACE=true active le remplacement du bypass.
Étape F — test_browser_matcher.py
8 tests : Chrome/Firefox/Safari full match, curl rejeté, httpcloak partiel,
TLS↔H2 mismatch, CDN proxy neutralisation, go net/http rejeté.
Tous 8 PASSED (+ 36 tests existants inchangés).
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
144 lines
7.6 KiB
Python
144 lines
7.6 KiB
Python
"""Data preprocessing and feature lists.

Normalises the columns, enriches the rows through multi-factor browser
identification, and defines the feature lists used by each model.
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from .config import BROWSER_CONFIDENCE_THRESHOLD
|
|
from .log import log_info
|
|
from .browser import _compute_browser_axes, _parse_ja4_columns, _infer_browser_family
|
|
from .browser_matcher import run_browser_matcher, log_dual_mode_comparison, BROWSER_MATCHER_ENABLED
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
# FEATURE LISTS PER MODEL
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Common features (pure L7 HTTP, available for both correlated=0 and correlated=1).
# Order matters to any consumer that builds a feature matrix from this list,
# so entries are kept in their original order.
FEATURES = [
    'hits', 'hit_velocity', 'fuzzing_index', 'post_ratio', 'port_exhaustion_ratio',
    'orphan_ratio', 'max_keepalives', 'tcp_shared_count', 'header_order_shared_count',
    'header_count', 'has_accept_language', 'has_cookie', 'has_referer',
    'modern_browser_score', 'ua_ch_mismatch', 'has_sec_ch_ua', 'ip_id_zero_ratio',
    'request_size_variance', 'multiplexing_efficiency', 'mss_mobile_mismatch',
    'asset_ratio', 'direct_access_ratio', 'is_ua_rotating', 'distinct_ja4_count',
    'src_port_density', 'ja4_asn_concentration', 'ja4_country_concentration', 'is_rare_ja4',
    'header_order_confidence', 'distinct_header_orders', 'temporal_entropy',
    'path_diversity_ratio', 'url_depth_variance', 'anomalous_payload_ratio',
    # B4-B7: pure L7 features
    'head_ratio', 'sec_fetch_absence_rate', 'generic_accept_ratio', 'http10_ratio',
    # Anubis
    'anubis_is_flagged',
    # Multi-factor browser identification
    'is_known_browser', 'browser_consistency_score', 'browser_confidence',
    'axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
    'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence',
    # HTTP
    'missing_accept_enc_ratio', 'http_scheme_ratio',
    # Thesis §5
    'path_transition_entropy',
    'cadence_cv', 'burst_ratio', 'pause_ratio',
    'lag1_autocorrelation', 'benford_deviation',
    'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
    # §5.8b — Cross-domain Jaccard similarity (paths shared between hosts)
    'cross_domain_path_similarity',
    # P0+P1: under-used features (existing or newly added SQL)
    'is_fake_navigation',
    'true_window_size', 'window_mss_ratio',
    # P1: new detection features
    'has_xff', 'unusual_content_type_ratio', 'non_standard_port_ratio',
    'login_post_concentration', 'sec_ch_mobile_mismatch',
    # §2 — HTTP/2 features (SETTINGS fingerprint, H2↔JA4 coherence)
    'h2_settings_known', 'h2_pseudo_order_match',
    'h2_ja4_coherence', 'h2_settings_rare',
    # §4 — Atomic H2 signals for the browser_matcher (Family 4: cross-layer coherence)
    'tls_h2_family_mismatch',
    'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent',
    'h2_order_chromesafari', 'h2_order_firefox',
    # §3 — Cross-layer fingerprint coherence score
    'fingerprint_coherence_score',
]
|
|
|
|
# Extra features for the "Complet" (complete) model, which requires TCP/TLS data.
# Built as a new list on top of FEATURES, so FEATURES itself is left untouched.
FEATURES_COMPLET = FEATURES + [
    'tcp_jitter_variance', 'alpn_http_mismatch', 'is_alpn_missing', 'sni_host_mismatch',
    # B1-B3, B8: TLS/TCP features
    'ja3_diversity_ratio', 'syn_timing_cv', 'tls12_ratio', 'ip_df_variance',
    # OS TTL fingerprinting + TCP window scale
    'avg_ttl', 'ttl_std', 'no_window_scale_ratio',
    # §5.5 — Intra-session JA4 drift
    'ja4_drift_ratio',
]
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
# PREPROCESSING
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise columns, add browser-identification features and impute gaps.

    Shared by the 1h and 24h pipelines.

    Steps, in order:

    1. Strip any dotted prefix from the column names (``x.src_ip`` -> ``src_ip``).
    2. Cast the known identifier/label columns to ``str`` (missing -> ``''``).
    3. A9 — compute the browser axes, parse JA4 and infer the browser family,
       then copy ``browser_confidence`` and the six ``axis_*`` columns onto
       the frame.
    4. A9b — when ``BROWSER_MATCHER_ENABLED`` is truthy, pass the frame through
       ``run_browser_matcher``.
    5. Derive the backward-compatible ``is_known_browser`` and
       ``browser_consistency_score`` columns, and ``anubis_is_flagged``.
    6. Impute: binary features get ``-1`` for missing values; numeric columns
       get ±inf turned into NaN and NaN replaced by the column median.

    Args:
        df: Raw feature frame. It is modified in place (column names and
            values are overwritten).

    Returns:
        The enriched frame. This is whatever ``run_browser_matcher`` returned
        when the matcher is enabled, otherwise the input frame itself.
    """
    # Keep only the last dotted component of each column name.
    df.columns = [c.split('.')[-1] for c in df.columns]

    text_columns = (
        'src_ip', 'ja4', 'host', 'bot_name', 'anubis_bot_name', 'anubis_bot_action',
        'anubis_bot_category', 'asn_number', 'asn_org', 'asn_detail', 'asn_domain',
        'country_code', 'asn_label',
    )
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)

    # ── A9 — Multi-factor browser identification ──────────────────────────────
    browser_axes = _compute_browser_axes(df)
    # Fall back to an all-empty series when the frame has no 'ja4' column.
    ja4_parsed = _parse_ja4_columns(df.get('ja4', pd.Series('', index=df.index)))

    df['inferred_browser_family'] = _infer_browser_family(df, ja4_parsed, browser_axes)

    df['browser_confidence'] = browser_axes['browser_confidence']
    for ax in ('axis_ja4_known', 'axis_ja4_struct', 'axis_http_modern',
               'axis_nav_behavior', 'axis_tls_coherence', 'axis_h2_coherence'):
        df[ax] = browser_axes[ax]

    # ── A9b — Browser Signature Matcher (runs alongside browser_confidence) ───
    # In DUAL_MODE (BROWSER_MATCHER_REPLACE=false) the bm_* columns are added
    # for logging only — the bypass stays driven by browser_confidence until
    # validation is complete.
    if BROWSER_MATCHER_ENABLED:
        df = run_browser_matcher(df)

    # Backward compatibility: legacy columns derived from the new axes.
    df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
    # Score = clipped JA4-known axis + one point per axis at or above 0.5
    # + one point when a browser family could be inferred.
    # The vectorised comparison treats NaN as "below threshold", exactly like
    # the per-row `1 if x >= 0.5 else 0` it replaces.
    df['browser_consistency_score'] = (
        browser_axes['axis_ja4_known'].clip(0, 1)
        + (browser_axes['axis_http_modern'] >= 0.5).astype(int)
        + (browser_axes['axis_nav_behavior'] >= 0.5).astype(int)
        + (browser_axes['axis_tls_coherence'] >= 0.5).astype(int)
        + (df['inferred_browser_family'] != '').astype(int)
    ).astype(int)

    # anubis_is_flagged: moderate suspicion signal — a bot name is present and
    # the action is something other than ALLOW, DENY or empty.
    empty_text = pd.Series('', index=df.index)
    df['anubis_is_flagged'] = (
        (df.get('anubis_bot_name', empty_text) != '')
        & (~df.get('anubis_bot_action', empty_text).isin(['ALLOW', 'DENY', '']))
    ).astype(int)

    # Imputation: binary features use -1 as an explicit "unknown" marker so it
    # cannot be confused with a real 0/1 value.
    binary_features = {
        'has_accept_language', 'has_cookie', 'has_referer', 'has_sec_ch_ua',
        'ua_ch_mismatch',
        'is_ua_rotating', 'is_alpn_missing', 'sni_host_mismatch', 'alpn_http_mismatch',
        'mss_mobile_mismatch', 'anubis_is_flagged', 'is_rare_ja4',
        'is_fake_navigation', 'has_xff', 'sec_ch_mobile_mismatch',
        # §2 — Binary HTTP/2 features
        'h2_settings_known', 'h2_pseudo_order_match', 'h2_ja4_coherence', 'h2_settings_rare',
        # §4 — Binary atomic H2 signals
        'tls_h2_family_mismatch',
        'h2_window_chrome', 'h2_window_firefox', 'h2_window_safari', 'h2_window_absent',
        'h2_order_chromesafari', 'h2_order_firefox', 'h2_priority_present',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32', 'uint64', 'uint32')

    for col in df.columns:
        if col in binary_features:
            df[col] = df[col].fillna(-1)
        elif df[col].dtype in numeric_dtypes:
            # Drop ±inf BEFORE taking the median: a median computed on the raw
            # column can itself be ±inf, which would put infinities back in.
            cleaned = df[col].replace([np.inf, -np.inf], np.nan)
            median = cleaned.median()
            # A column with no finite value has no median; it stays all-NaN.
            df[col] = cleaned if pd.isna(median) else cleaned.fillna(median)

    return df
|