feat: multi-distro VM tests, ja4ebpf eBPF improvements, bot-detector scoring

ja4ebpf:
- Refactor BPF TC capture with improved SYN offset handling and TCP option parsing
- Enhance TLS uprobe SSL hooking for better key extraction
- Add ClickHouse writer improvements for HTTP log materialized views
- Update RPM spec for Rocky Linux 8/9/10, fix systemd service
- Simplify loader with cleaner bpf2go integration

bot-detector:
- Add H2 SETTINGS per-parameter comparison in browser_matcher
- Enhance browser signatures and scoring pipeline
- Improve preprocessing and cycle detection

infra:
- Multi-distro Vagrantfile (centos8, rocky9, rocky10) with per-distro provisioning
- New Makefile targets: vm-up-all, test-vm-matrix, test-vm-centos8/rocky10
- Add debug helpers and run-test-from-host.sh for host-driven VM testing
- Update run-tests-vm.sh for cross-distro compatibility
- Remove accidental binary blob (\004)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-13 01:09:33 +02:00
parent d81463a589
commit d75825278e
32 changed files with 2148 additions and 890 deletions

View File

@ -302,16 +302,51 @@ def _compute_family_score(df: pd.DataFrame, family: str) -> pd.Series:
"""Calcule le score de correspondance [0.0, 1.0] pour une famille navigateur.
Score = somme pondérée des 7 dimensions.
Quand has_xff=1 (CDN/proxy), les dimensions H2 sont neutralisées à 0.5
et leur poids (0.70) est redistribué vers HTTP headers (+0.35) et TLS (+0.35).
"""
w = DIMENSION_WEIGHTS
score = (
_d1_h2_settings(df, family) * w["h2_settings"]
+ _d2_h2_window(df, family) * w["h2_window"]
+ _d3_pseudo_order(df, family) * w["pseudo_order"]
+ _d4_h2_priority(df, family) * w["h2_priority"]
+ _d5_http_headers(df, family) * w["http_headers"]
+ _d6_tls_structure(df, family) * w["tls_structure"]
+ _d7_ja4_dict(df, family) * w["ja4_dict"]
has_xff = _col(df, "has_xff").astype(bool)
# Dimensions H2
d_h2_settings = _d1_h2_settings(df, family)
d_h2_window = _d2_h2_window(df, family)
d_pseudo = _d3_pseudo_order(df, family)
d_priority = _d4_h2_priority(df, family)
# Dimensions non-H2
d_headers = _d5_http_headers(df, family)
d_tls = _d6_tls_structure(df, family)
d_ja4 = _d7_ja4_dict(df, family)
# Neutraliser les dimensions H2 à 0.5 derrière CDN (le H2 observé est celui du proxy)
h2_weight_total = w["h2_settings"] + w["h2_window"] + w["pseudo_order"] + w["h2_priority"]
# Redistribuer : chaque dimension non-H2 reçoit une part proportionnelle
# au poids H2 redistribué (0.35 vers headers, 0.35 vers TLS)
http_bonus = h2_weight_total / 2 # 0.35
tls_bonus = h2_weight_total / 2 # 0.35
# Score avec poids normaux (pas CDN)
score_normal = (
d_h2_settings * w["h2_settings"]
+ d_h2_window * w["h2_window"]
+ d_pseudo * w["pseudo_order"]
+ d_priority * w["h2_priority"]
+ d_headers * w["http_headers"]
+ d_tls * w["tls_structure"]
+ d_ja4 * w["ja4_dict"]
)
# Score avec poids redistribués (CDN : H2 neutralisé à 0.5)
score_cdn = (
0.5 * h2_weight_total # H2 dimensions neutralisées
+ d_headers * (w["http_headers"] + http_bonus) # 0.15 + 0.35 = 0.50
+ d_tls * (w["tls_structure"] + tls_bonus) # 0.10 + 0.35 = 0.45
+ d_ja4 * w["ja4_dict"] # 0.05 (inchangé)
)
score = pd.Series(
np.where(has_xff, score_cdn, score_normal),
index=df.index,
)
return score.clip(0.0, 1.0)
@ -414,6 +449,12 @@ def run_browser_matcher(df: pd.DataFrame) -> pd.DataFrame:
df["bm_non_browser"] = non_browser_mask
df["bm_decision"] = decision
# Scores par famille pour le vecteur ML (§3.9.4)
for family in BROWSER_SIGNATURES:
df[f"browser_match_{family.lower()}"] = scores[family].round(4)
df["browser_match_max"] = adjusted_score.round(4)
df["browser_family_detected"] = df["bm_family"]
return df

View File

@ -103,3 +103,63 @@ DIMENSION_WEIGHTS: dict = {
"tls_structure": 0.10,
"ja4_dict": 0.05,
}
# Timestamp (epoch seconds, as returned by time.time()) of the last signature
# reload from ClickHouse. Starts at 0.0 so the first call to
# reload_signatures_from_clickhouse() is never throttled.
_last_signature_reload: float = 0.0
# Minimum delay, in seconds, between two reloads.
_SIGNATURE_RELOAD_INTERVAL: float = 86400.0 # 24 hours
def reload_signatures_from_clickhouse(client) -> bool:
    """§3.9.5: Reload H2 browser signatures from ClickHouse.

    Queries the active rows of ``ja4_processing.browser_h2_signatures`` and
    merges them into the module-level ``BROWSER_SIGNATURES`` mapping: each row
    adds its family, or replaces the existing entry with the same family name.

    Meant to be called once per cycle; the query is only issued when at least
    ``_SIGNATURE_RELOAD_INTERVAL`` seconds have elapsed since the last reload
    that returned rows. An empty result or a failure leaves the timestamp
    untouched, so the next call retries.

    Rows are validated individually and staged before being merged, so one
    malformed row is skipped (and logged) without discarding the valid ones
    and without leaving ``BROWSER_SIGNATURES`` half-updated.

    Args:
        client: Object exposing ``query_df(sql)``; the result is expected to
            behave like a pandas DataFrame (``empty``, ``iterrows()``).

    Returns:
        True when at least one signature was merged, False otherwise
        (throttled, empty result, every row invalid, or query failure).
    """
    import json
    import logging
    import time as _time

    global _last_signature_reload

    logger = logging.getLogger(__name__)

    now = _time.time()
    if now - _last_signature_reload < _SIGNATURE_RELOAD_INTERVAL:
        return False

    try:
        df = client.query_df(
            "SELECT * FROM ja4_processing.browser_h2_signatures WHERE is_active = 1"
        )
        if df is None or df.empty:
            return False

        # Stage every valid row first; BROWSER_SIGNATURES is only touched once
        # the whole result set has been walked.
        staged = {}
        for _, row in df.iterrows():
            family = str(row.get('family', ''))
            if not family:
                continue
            try:
                settings = json.loads(str(row.get('h2_settings_json', '{}')))
                forbidden = json.loads(str(row.get('h2_settings_forbidden', '[]')))
                tls = json.loads(str(row.get('tls_json', '{}')))
                headers_req = json.loads(str(row.get('headers_required', '[]')))
                headers_forbid = json.loads(str(row.get('headers_forbidden', '[]')))
                # The numeric conversions live inside the per-row guard: a
                # non-numeric key, a NaN cell or a JSON value of the wrong
                # shape must only invalidate this row, not the whole reload.
                signature = {
                    "h2_settings_exact": {int(k): int(v) for k, v in settings.items()},
                    "h2_settings_forbidden_keys": [int(x) for x in forbidden],
                    "h2_window_update": int(row.get('h2_window_update', 0)),
                    "h2_window_update_tolerance": int(row.get('h2_window_update_tolerance', 1000)),
                    "h2_priority_frames_expected": bool(row.get('h2_priority_expected', 0)),
                    "pseudo_header_order": str(row.get('pseudo_header_order', '')),
                    "tls": tls,
                    "headers_required": headers_req,
                    "headers_forbidden": headers_forbid,
                }
            except (ValueError, TypeError, AttributeError) as exc:
                # json.JSONDecodeError is a subclass of ValueError.
                logger.warning(
                    "Skipping malformed H2 signature row for family %r: %s", family, exc
                )
                continue
            staged[family] = signature

        BROWSER_SIGNATURES.update(staged)
        _last_signature_reload = now
        return bool(staged)
    except Exception as exc:
        # Best-effort reload: never propagate to the caller, but do not hide
        # the failure either.
        logger.warning("H2 signature reload from ClickHouse failed: %s", exc)
        return False

View File

@ -18,6 +18,7 @@ from .infra import get_client, set_healthy
from .preprocessing import preprocess_df, FEATURES, FEATURES_COMPLET
from .pipeline import run_semi_supervised_logic
from .fleet import enrich_with_fleet_score
from .browser_signatures import reload_signatures_from_clickhouse
from .metrics import record_cycle_metrics
@ -120,6 +121,13 @@ def fetch_and_analyze():
client = get_client()
# §3.9.5 — Rechargement périodique des signatures H2 depuis ClickHouse
try:
if reload_signatures_from_clickhouse(client):
log_info('[Signatures] Signatures H2 rechargées depuis browser_h2_signatures.')
except Exception:
pass
# ── Récupération du trafic (fenêtre 1h) ──────────────────────────────────
try:
df = client.query_df(f'SELECT * FROM {DB}.view_ai_features_1h')
@ -171,6 +179,43 @@ def fetch_and_analyze():
except Exception as e:
log_info(f'[Fleet §5] Enrichissement de flotte échoué : {e}')
# §3.9.5 — Queue unknown_h2_fingerprints : sessions H2 inconnues mais navigateur-like
try:
bm_col = 'bm_score' if 'bm_score' in df.columns else None
bc_col = 'browser_confidence' if 'browser_confidence' in df.columns else None
h2_col = 'h2_settings_known' if 'h2_settings_known' in df.columns else None
tls_col = 'tls_version' if 'tls_version' in df.columns else None
if bm_col and h2_col:
# Conditions : H2 inconnu + comportement navigateur + TLS 1.3
unknown_h2_mask = (
(df[h2_col] == 0) # H2 SETTINGS inconnu
& (
(df[bm_col] < 0.45) # browser_matcher ne reconnaît pas
| (bc_col and df[bc_col] >= 0.55) # mais browser_confidence élevé
)
)
if tls_col:
unknown_h2_mask = unknown_h2_mask & (df[tls_col].astype(str).str.startswith('TLSv1.3'))
unknown_h2 = df[unknown_h2_mask]
if not unknown_h2.empty:
n_unknown = len(unknown_h2)
# Insérer les fingerprints inconnus dans la table ClickHouse
client.command(
"INSERT INTO ja4_processing.unknown_h2_fingerprints "
"(observed_at, src_ip, ja4, h2_fingerprint, h2_settings_fp, "
"h2_window_update, h2_pseudo_order, h2_has_priority, "
"browser_confidence_score, header_user_agent, tls_version) "
"SELECT now(), src_ip, ja4, h2_fingerprint, h2_settings_fp, "
"h2_window_update, h2_pseudo_order, h2_has_priority, "
"browser_confidence, header_user_agent, tls_version "
"FROM input"
)
log_info(f'[H2 Queue] {n_unknown} fingerprint(s) H2 inconnu(s) mis en file d\'examen.')
except Exception as e:
log_info(f'[H2 Queue] Erreur insertion unknown_h2_fingerprints : {e}')
# ── Résumé des données chargées ───────────────────────────────────────────
n_total = len(df)
n_correlated = int((df.get('correlated', pd.Series()) == 1).sum())

View File

@ -140,6 +140,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
unknown_traffic['xgb_prob'] = 0.0
xgb_model_ref = None # Référence pour SHAP TreeExplainer (§2.4.5)
if XGB_AVAILABLE and XGB_WEIGHT > 0:
try:
xgb_client = get_client()
@ -150,6 +151,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
unknown_traffic['xgb_prob'] = xgb_probs
xgb_model_ref = xgb_model
log_info(f"[{name}] XGBoost : xgb_mean={xgb_probs.mean():.4f}")
except Exception as exc:
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
@ -187,9 +189,9 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
except Exception as exc:
log_info(f"[{name}] MetaLearner entraînement échoué : {exc}")
# §7 — ExIFFI : importance de features pour l'EIF (quand SHAP désactivé)
# §7 — ExIFFI : importance de features pour l'EIF (toujours actif en complément de SHAP)
exiffi_tops: list = [{}] * len(unknown_traffic)
if not ENABLE_SHAP and len(unknown_traffic) > 0:
if len(unknown_traffic) > 0:
try:
exiffi_tops = compute_exiffi_importance(model, X_test, scoring_features)
except Exception:
@ -376,9 +378,10 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
# A4 — Explainabilité SHAP : top features responsables de chaque anomalie
# A4 — Explainabilité SHAP : TreeExplainer sur XGBoost si dispo, sinon EIF
X_anomalies = X_test.loc[anomalies.index]
shap_tops = compute_shap_top_features(model, X_anomalies, valid_features)
shap_tops = compute_shap_top_features(model, X_anomalies, valid_features,
xgb_model=xgb_model_ref)
# §7 — ExIFFI : utiliser les tops ExIFFI précalculés quand SHAP est inactif
# Construire un mapping index → exiffi_top pour accès rapide

View File

@ -44,6 +44,8 @@ FEATURES = [
'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
# §5.8b — Similarité Jaccard cross-domaine (chemins partagés entre hosts)
'cross_domain_path_similarity',
# §5.4 — Resource Dependency Tree (cascade de chargement)
'root_to_first_asset_delay', 'asset_load_stddev',
# P0+P1 : features sous-exploitées (SQL existant ou ajouté)
'is_fake_navigation',
'true_window_size', 'window_mss_ratio',
@ -59,6 +61,9 @@ FEATURES = [
'h2_order_chromesafari', 'h2_order_firefox',
# §3 — Score de cohérence de fingerprint cross-layer
'fingerprint_coherence_score',
# §3.9.4 — Browser matcher scores (passif H2)
'browser_match_chrome', 'browser_match_firefox', 'browser_match_safari',
'browser_match_max',
]
# Features supplémentaires pour le modèle Complet (données TCP/TLS requises)
@ -103,6 +108,11 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
# browser_confidence jusqu'à la validation complète.
if BROWSER_MATCHER_ENABLED:
df = run_browser_matcher(df)
else:
# Colonnes par défaut quand le matcher est désactivé
for col in ['browser_match_chrome', 'browser_match_firefox', 'browser_match_safari',
'browser_match_max', 'browser_family_detected']:
df[col] = 0.0 if col != 'browser_family_detected' else ''
# Rétro-compatibilité
df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)

View File

@ -248,25 +248,48 @@ def normalize_scores(scores: np.ndarray) -> np.ndarray:
# ═══════════════════════════════════════════════════════════════════════════════
def compute_shap_top_features(model, X: pd.DataFrame, features: list,
n_top: int = 5) -> list:
n_top: int = 5, xgb_model=None) -> list:
"""
Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features
les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie).
Retourne une liste de dicts {feature: shap_value} par ligne.
Calcule les valeurs SHAP et retourne les n_top features les plus contributives.
Utilise TreeExplainer pour sklearn, et un échantillon Permutation pour isotree.
Stratégie par modèle (conforme à la thèse §2.4.5) :
- XGBoost : TreeExplainer (O(TLD²), exact et efficace)
- EIF (sklearn) : TreeExplainer natif
- EIF (isotree) : PermutationExplainer
Si xgb_model est fourni, utilise TreeExplainer sur XGBoost en priorité.
Sinon, utilise l'Explainer adapté au modèle EIF.
Retourne une liste de dicts {feature: shap_value} par ligne.
"""
if not ENABLE_SHAP or X.empty:
return [{}] * len(X)
# Priorité XGBoost : TreeExplainer est optimal pour les modèles à base d'arbres
if xgb_model is not None:
try:
explainer = _shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X[features].fillna(0))
if isinstance(shap_values, list):
shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
result = []
for sv in shap_values:
pairs = sorted(zip(features, sv), key=lambda x: abs(x[1]), reverse=True)
result.append({f: round(float(v), 4) for f, v in pairs[:n_top]})
return result
except Exception as e:
log_info(f"[SHAP] TreeExplainer XGBoost échoué ({e}), fallback EIF")
# Fallback EIF
try:
if EIF_AVAILABLE:
sample_size = min(100, len(X))
X_sample = X.sample(n=sample_size, random_state=42) if len(X) > sample_size else X
X_sample = X[features].sample(n=sample_size, random_state=42) if len(X) > sample_size else X[features]
explainer = _shap.Explainer(model.decision_function, X_sample)
shap_values = explainer(X).values
shap_values = explainer(X[features].fillna(0)).values
else:
explainer = _shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap_values = explainer.shap_values(X[features].fillna(0))
result = []
for sv in shap_values:
pairs = sorted(zip(features, sv), key=lambda x: x[1])