feat: multi-distro VM tests, ja4ebpf eBPF improvements, bot-detector scoring
ja4ebpf:
- Refactor BPF TC capture with improved SYN offset handling and TCP option parsing
- Enhance TLS uprobe SSL hooking for better key extraction
- Add ClickHouse writer improvements for HTTP log materialized views
- Update RPM spec for Rocky Linux 8/9/10, fix systemd service
- Simplify loader with cleaner bpf2go integration

bot-detector:
- Add H2 SETTINGS per-parameter comparison in browser_matcher
- Enhance browser signatures and scoring pipeline
- Improve preprocessing and cycle detection

infra:
- Multi-distro Vagrantfile (centos8, rocky9, rocky10) with per-distro provisioning
- New Makefile targets: vm-up-all, test-vm-matrix, test-vm-centos8/rocky10
- Add debug helpers and run-test-from-host.sh for host-driven VM testing
- Update run-tests-vm.sh for cross-distro compatibility
- Remove accidental binary blob (\004)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -302,16 +302,51 @@ def _compute_family_score(df: pd.DataFrame, family: str) -> pd.Series:
|
||||
"""Calcule le score de correspondance [0.0, 1.0] pour une famille navigateur.
|
||||
|
||||
Score = somme pondérée des 7 dimensions.
|
||||
Quand has_xff=1 (CDN/proxy), les dimensions H2 sont neutralisées à 0.5
|
||||
et leur poids (0.70) est redistribué vers HTTP headers (+0.35) et TLS (+0.35).
|
||||
"""
|
||||
w = DIMENSION_WEIGHTS
|
||||
score = (
|
||||
_d1_h2_settings(df, family) * w["h2_settings"]
|
||||
+ _d2_h2_window(df, family) * w["h2_window"]
|
||||
+ _d3_pseudo_order(df, family) * w["pseudo_order"]
|
||||
+ _d4_h2_priority(df, family) * w["h2_priority"]
|
||||
+ _d5_http_headers(df, family) * w["http_headers"]
|
||||
+ _d6_tls_structure(df, family) * w["tls_structure"]
|
||||
+ _d7_ja4_dict(df, family) * w["ja4_dict"]
|
||||
has_xff = _col(df, "has_xff").astype(bool)
|
||||
|
||||
# Dimensions H2
|
||||
d_h2_settings = _d1_h2_settings(df, family)
|
||||
d_h2_window = _d2_h2_window(df, family)
|
||||
d_pseudo = _d3_pseudo_order(df, family)
|
||||
d_priority = _d4_h2_priority(df, family)
|
||||
# Dimensions non-H2
|
||||
d_headers = _d5_http_headers(df, family)
|
||||
d_tls = _d6_tls_structure(df, family)
|
||||
d_ja4 = _d7_ja4_dict(df, family)
|
||||
|
||||
# Neutraliser les dimensions H2 à 0.5 derrière CDN (le H2 observé est celui du proxy)
|
||||
h2_weight_total = w["h2_settings"] + w["h2_window"] + w["pseudo_order"] + w["h2_priority"]
|
||||
# Redistribuer : chaque dimension non-H2 reçoit une part proportionnelle
|
||||
# au poids H2 redistribué (0.35 vers headers, 0.35 vers TLS)
|
||||
http_bonus = h2_weight_total / 2 # 0.35
|
||||
tls_bonus = h2_weight_total / 2 # 0.35
|
||||
|
||||
# Score avec poids normaux (pas CDN)
|
||||
score_normal = (
|
||||
d_h2_settings * w["h2_settings"]
|
||||
+ d_h2_window * w["h2_window"]
|
||||
+ d_pseudo * w["pseudo_order"]
|
||||
+ d_priority * w["h2_priority"]
|
||||
+ d_headers * w["http_headers"]
|
||||
+ d_tls * w["tls_structure"]
|
||||
+ d_ja4 * w["ja4_dict"]
|
||||
)
|
||||
|
||||
# Score avec poids redistribués (CDN : H2 neutralisé à 0.5)
|
||||
score_cdn = (
|
||||
0.5 * h2_weight_total # H2 dimensions neutralisées
|
||||
+ d_headers * (w["http_headers"] + http_bonus) # 0.15 + 0.35 = 0.50
|
||||
+ d_tls * (w["tls_structure"] + tls_bonus) # 0.10 + 0.35 = 0.45
|
||||
+ d_ja4 * w["ja4_dict"] # 0.05 (inchangé)
|
||||
)
|
||||
|
||||
score = pd.Series(
|
||||
np.where(has_xff, score_cdn, score_normal),
|
||||
index=df.index,
|
||||
)
|
||||
return score.clip(0.0, 1.0)
|
||||
|
||||
@ -414,6 +449,12 @@ def run_browser_matcher(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df["bm_non_browser"] = non_browser_mask
|
||||
df["bm_decision"] = decision
|
||||
|
||||
# Scores par famille pour le vecteur ML (§3.9.4)
|
||||
for family in BROWSER_SIGNATURES:
|
||||
df[f"browser_match_{family.lower()}"] = scores[family].round(4)
|
||||
df["browser_match_max"] = adjusted_score.round(4)
|
||||
df["browser_family_detected"] = df["bm_family"]
|
||||
|
||||
return df
|
||||
|
||||
|
||||
|
||||
@ -103,3 +103,63 @@ DIMENSION_WEIGHTS: dict = {
|
||||
"tls_structure": 0.10,
|
||||
"ja4_dict": 0.05,
|
||||
}
|
||||
|
||||
# Timestamp du dernier rechargement des signatures depuis ClickHouse.
|
||||
_last_signature_reload: float = 0.0
|
||||
_SIGNATURE_RELOAD_INTERVAL: float = 86400.0 # 24 heures
|
||||
|
||||
|
||||
def reload_signatures_from_clickhouse(client) -> bool:
    """§3.9.5: Reload H2 signatures from ja4_processing.browser_h2_signatures.

    Merges the dynamic signatures stored in ClickHouse into BROWSER_SIGNATURES:
    every active row either adds a new family or replaces the existing entry
    with the same family name. The function is cheap to call on every cycle;
    the query is only issued when at least _SIGNATURE_RELOAD_INTERVAL seconds
    have elapsed since the last completed reload pass.

    A row that cannot be converted (invalid JSON, non-numeric value, unexpected
    JSON shape) is logged and skipped; it no longer aborts the rows after it.

    Args:
        client: object exposing ``query_df(sql)`` that returns a DataFrame.

    Returns:
        True when at least one signature was loaded. False when the interval
        has not elapsed, the table returned no rows, no row was valid, or the
        query failed (the failure is logged, not raised).
    """
    import logging
    import time as _time

    global _last_signature_reload

    now = _time.time()
    if now - _last_signature_reload < _SIGNATURE_RELOAD_INTERVAL:
        return False

    logger = logging.getLogger(__name__)
    try:
        df = client.query_df(
            "SELECT * FROM ja4_processing.browser_h2_signatures WHERE is_active = 1"
        )
        if df is None or df.empty:
            # The timestamp is not updated here, so the query is retried on
            # the next call.
            return False

        loaded = 0
        for _, row in df.iterrows():
            family = str(row.get('family', ''))
            if not family:
                continue
            try:
                # Build the whole entry before touching BROWSER_SIGNATURES so
                # a bad row never leaves a half-written signature behind.
                signature = _signature_from_row(row)
            except (ValueError, TypeError, AttributeError, OverflowError) as exc:
                logger.warning(
                    "[Signatures] Signature H2 ignorée pour %s : %s", family, exc
                )
                continue
            BROWSER_SIGNATURES[family] = signature
            loaded += 1

        _last_signature_reload = now
        return loaded > 0
    except Exception as exc:
        # Best-effort: the caller keeps running with the signatures it already
        # has, but the failure is made visible instead of being swallowed.
        logger.warning(
            "[Signatures] Rechargement des signatures H2 échoué : %s", exc
        )
        return False


def _signature_from_row(row) -> dict:
    """Convert one browser_h2_signatures row into a BROWSER_SIGNATURES entry.

    Raises ValueError (including json.JSONDecodeError), TypeError,
    AttributeError or OverflowError when a column cannot be decoded or
    converted; the caller treats any of these as "skip this row".
    """
    import json

    settings = json.loads(str(row.get('h2_settings_json', '{}')))
    forbidden = json.loads(str(row.get('h2_settings_forbidden', '[]')))
    tls = json.loads(str(row.get('tls_json', '{}')))
    headers_req = json.loads(str(row.get('headers_required', '[]')))
    headers_forbid = json.loads(str(row.get('headers_forbidden', '[]')))

    return {
        "h2_settings_exact": {int(k): int(v) for k, v in settings.items()},
        "h2_settings_forbidden_keys": [int(x) for x in forbidden],
        "h2_window_update": int(row.get('h2_window_update', 0)),
        "h2_window_update_tolerance": int(row.get('h2_window_update_tolerance', 1000)),
        "h2_priority_frames_expected": bool(row.get('h2_priority_expected', 0)),
        "pseudo_header_order": str(row.get('pseudo_header_order', '')),
        "tls": tls,
        "headers_required": headers_req,
        "headers_forbidden": headers_forbid,
    }
|
||||
|
||||
@ -18,6 +18,7 @@ from .infra import get_client, set_healthy
|
||||
from .preprocessing import preprocess_df, FEATURES, FEATURES_COMPLET
|
||||
from .pipeline import run_semi_supervised_logic
|
||||
from .fleet import enrich_with_fleet_score
|
||||
from .browser_signatures import reload_signatures_from_clickhouse
|
||||
from .metrics import record_cycle_metrics
|
||||
|
||||
|
||||
@ -120,6 +121,13 @@ def fetch_and_analyze():
|
||||
|
||||
client = get_client()
|
||||
|
||||
# §3.9.5 — Rechargement périodique des signatures H2 depuis ClickHouse
|
||||
try:
|
||||
if reload_signatures_from_clickhouse(client):
|
||||
log_info('[Signatures] Signatures H2 rechargées depuis browser_h2_signatures.')
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ── Récupération du trafic (fenêtre 1h) ──────────────────────────────────
|
||||
try:
|
||||
df = client.query_df(f'SELECT * FROM {DB}.view_ai_features_1h')
|
||||
@ -171,6 +179,43 @@ def fetch_and_analyze():
|
||||
except Exception as e:
|
||||
log_info(f'[Fleet §5] Enrichissement de flotte échoué : {e}')
|
||||
|
||||
# §3.9.5 — Queue unknown_h2_fingerprints : sessions H2 inconnues mais navigateur-like
|
||||
try:
|
||||
bm_col = 'bm_score' if 'bm_score' in df.columns else None
|
||||
bc_col = 'browser_confidence' if 'browser_confidence' in df.columns else None
|
||||
h2_col = 'h2_settings_known' if 'h2_settings_known' in df.columns else None
|
||||
tls_col = 'tls_version' if 'tls_version' in df.columns else None
|
||||
|
||||
if bm_col and h2_col:
|
||||
# Conditions : H2 inconnu + comportement navigateur + TLS 1.3
|
||||
unknown_h2_mask = (
|
||||
(df[h2_col] == 0) # H2 SETTINGS inconnu
|
||||
& (
|
||||
(df[bm_col] < 0.45) # browser_matcher ne reconnaît pas
|
||||
| (bc_col and df[bc_col] >= 0.55) # mais browser_confidence élevé
|
||||
)
|
||||
)
|
||||
if tls_col:
|
||||
unknown_h2_mask = unknown_h2_mask & (df[tls_col].astype(str).str.startswith('TLSv1.3'))
|
||||
|
||||
unknown_h2 = df[unknown_h2_mask]
|
||||
if not unknown_h2.empty:
|
||||
n_unknown = len(unknown_h2)
|
||||
# Insérer les fingerprints inconnus dans la table ClickHouse
|
||||
client.command(
|
||||
"INSERT INTO ja4_processing.unknown_h2_fingerprints "
|
||||
"(observed_at, src_ip, ja4, h2_fingerprint, h2_settings_fp, "
|
||||
"h2_window_update, h2_pseudo_order, h2_has_priority, "
|
||||
"browser_confidence_score, header_user_agent, tls_version) "
|
||||
"SELECT now(), src_ip, ja4, h2_fingerprint, h2_settings_fp, "
|
||||
"h2_window_update, h2_pseudo_order, h2_has_priority, "
|
||||
"browser_confidence, header_user_agent, tls_version "
|
||||
"FROM input"
|
||||
)
|
||||
log_info(f'[H2 Queue] {n_unknown} fingerprint(s) H2 inconnu(s) mis en file d\'examen.')
|
||||
except Exception as e:
|
||||
log_info(f'[H2 Queue] Erreur insertion unknown_h2_fingerprints : {e}')
|
||||
|
||||
# ── Résumé des données chargées ───────────────────────────────────────────
|
||||
n_total = len(df)
|
||||
n_correlated = int((df.get('correlated', pd.Series()) == 1).sum())
|
||||
|
||||
@ -140,6 +140,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
|
||||
# XGBoost supervisé — troisième voix (si labels historiques disponibles)
|
||||
unknown_traffic['xgb_prob'] = 0.0
|
||||
xgb_model_ref = None # Référence pour SHAP TreeExplainer (§2.4.5)
|
||||
if XGB_AVAILABLE and XGB_WEIGHT > 0:
|
||||
try:
|
||||
xgb_client = get_client()
|
||||
@ -150,6 +151,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
X_xgb = unknown_traffic[xgb_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
|
||||
xgb_probs = xgb_model.predict_proba(X_xgb.values)[:, 1]
|
||||
unknown_traffic['xgb_prob'] = xgb_probs
|
||||
xgb_model_ref = xgb_model
|
||||
log_info(f"[{name}] XGBoost : xgb_mean={xgb_probs.mean():.4f}")
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] XGBoost scoring échoué : {exc} — EIF+AE seuls.")
|
||||
@ -187,9 +189,9 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
except Exception as exc:
|
||||
log_info(f"[{name}] MetaLearner entraînement échoué : {exc}")
|
||||
|
||||
# §7 — ExIFFI : importance de features pour l'EIF (quand SHAP désactivé)
|
||||
# §7 — ExIFFI : importance de features pour l'EIF (toujours actif en complément de SHAP)
|
||||
exiffi_tops: list = [{}] * len(unknown_traffic)
|
||||
if not ENABLE_SHAP and len(unknown_traffic) > 0:
|
||||
if len(unknown_traffic) > 0:
|
||||
try:
|
||||
exiffi_tops = compute_exiffi_importance(model, X_test, scoring_features)
|
||||
except Exception:
|
||||
@ -376,9 +378,10 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
log_info(f"[{name}] ALERT: {len(anomalies)} anomalies détectées (seuil={effective_threshold:.4f}).")
|
||||
anomalies['recurrence'] = anomalies['src_ip'].map(recurrence_map).fillna(0).astype(int) + 1
|
||||
|
||||
# A4 — Explainabilité SHAP : top features responsables de chaque anomalie
|
||||
# A4 — Explainabilité SHAP : TreeExplainer sur XGBoost si dispo, sinon EIF
|
||||
X_anomalies = X_test.loc[anomalies.index]
|
||||
shap_tops = compute_shap_top_features(model, X_anomalies, valid_features)
|
||||
shap_tops = compute_shap_top_features(model, X_anomalies, valid_features,
|
||||
xgb_model=xgb_model_ref)
|
||||
|
||||
# §7 — ExIFFI : utiliser les tops ExIFFI précalculés quand SHAP est inactif
|
||||
# Construire un mapping index → exiffi_top pour accès rapide
|
||||
|
||||
@ -44,6 +44,8 @@ FEATURES = [
|
||||
'host_diversity', 'host_sweep_speed', 'host_coverage_uniformity',
|
||||
# §5.8b — Similarité Jaccard cross-domaine (chemins partagés entre hosts)
|
||||
'cross_domain_path_similarity',
|
||||
# §5.4 — Resource Dependency Tree (cascade de chargement)
|
||||
'root_to_first_asset_delay', 'asset_load_stddev',
|
||||
# P0+P1 : features sous-exploitées (SQL existant ou ajouté)
|
||||
'is_fake_navigation',
|
||||
'true_window_size', 'window_mss_ratio',
|
||||
@ -59,6 +61,9 @@ FEATURES = [
|
||||
'h2_order_chromesafari', 'h2_order_firefox',
|
||||
# §3 — Score de cohérence de fingerprint cross-layer
|
||||
'fingerprint_coherence_score',
|
||||
# §3.9.4 — Browser matcher scores (passif H2)
|
||||
'browser_match_chrome', 'browser_match_firefox', 'browser_match_safari',
|
||||
'browser_match_max',
|
||||
]
|
||||
|
||||
# Features supplémentaires pour le modèle Complet (données TCP/TLS requises)
|
||||
@ -103,6 +108,11 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# browser_confidence jusqu'à la validation complète.
|
||||
if BROWSER_MATCHER_ENABLED:
|
||||
df = run_browser_matcher(df)
|
||||
else:
|
||||
# Colonnes par défaut quand le matcher est désactivé
|
||||
for col in ['browser_match_chrome', 'browser_match_firefox', 'browser_match_safari',
|
||||
'browser_match_max', 'browser_family_detected']:
|
||||
df[col] = 0.0 if col != 'browser_family_detected' else ''
|
||||
|
||||
# Rétro-compatibilité
|
||||
df['is_known_browser'] = browser_axes['axis_ja4_known'].astype(int)
|
||||
|
||||
@ -248,25 +248,48 @@ def normalize_scores(scores: np.ndarray) -> np.ndarray:
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def compute_shap_top_features(model, X: pd.DataFrame, features: list,
|
||||
n_top: int = 5) -> list:
|
||||
n_top: int = 5, xgb_model=None) -> list:
|
||||
"""
|
||||
Calcule les valeurs SHAP pour chaque ligne de X et retourne les n_top features
|
||||
les plus contributives (valeur SHAP la plus négative = plus responsable de l'anomalie).
|
||||
Retourne une liste de dicts {feature: shap_value} par ligne.
|
||||
Calcule les valeurs SHAP et retourne les n_top features les plus contributives.
|
||||
|
||||
Utilise TreeExplainer pour sklearn, et un échantillon Permutation pour isotree.
|
||||
Stratégie par modèle (conforme à la thèse §2.4.5) :
|
||||
- XGBoost : TreeExplainer (O(TLD²), exact et efficace)
|
||||
- EIF (sklearn) : TreeExplainer natif
|
||||
- EIF (isotree) : PermutationExplainer
|
||||
|
||||
Si xgb_model est fourni, utilise TreeExplainer sur XGBoost en priorité.
|
||||
Sinon, utilise l'Explainer adapté au modèle EIF.
|
||||
|
||||
Retourne une liste de dicts {feature: shap_value} par ligne.
|
||||
"""
|
||||
if not ENABLE_SHAP or X.empty:
|
||||
return [{}] * len(X)
|
||||
|
||||
# Priorité XGBoost : TreeExplainer est optimal pour les modèles à base d'arbres
|
||||
if xgb_model is not None:
|
||||
try:
|
||||
explainer = _shap.TreeExplainer(xgb_model)
|
||||
shap_values = explainer.shap_values(X[features].fillna(0))
|
||||
if isinstance(shap_values, list):
|
||||
shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
|
||||
result = []
|
||||
for sv in shap_values:
|
||||
pairs = sorted(zip(features, sv), key=lambda x: abs(x[1]), reverse=True)
|
||||
result.append({f: round(float(v), 4) for f, v in pairs[:n_top]})
|
||||
return result
|
||||
except Exception as e:
|
||||
log_info(f"[SHAP] TreeExplainer XGBoost échoué ({e}), fallback EIF")
|
||||
|
||||
# Fallback EIF
|
||||
try:
|
||||
if EIF_AVAILABLE:
|
||||
sample_size = min(100, len(X))
|
||||
X_sample = X.sample(n=sample_size, random_state=42) if len(X) > sample_size else X
|
||||
X_sample = X[features].sample(n=sample_size, random_state=42) if len(X) > sample_size else X[features]
|
||||
explainer = _shap.Explainer(model.decision_function, X_sample)
|
||||
shap_values = explainer(X).values
|
||||
shap_values = explainer(X[features].fillna(0)).values
|
||||
else:
|
||||
explainer = _shap.TreeExplainer(model)
|
||||
shap_values = explainer.shap_values(X)
|
||||
shap_values = explainer.shap_values(X[features].fillna(0))
|
||||
result = []
|
||||
for sv in shap_values:
|
||||
pairs = sorted(zip(features, sv), key=lambda x: x[1])
|
||||
|
||||
Reference in New Issue
Block a user