feat: multi-distro VM tests, ja4ebpf eBPF improvements, bot-detector scoring

ja4ebpf:
- Refactor BPF TC capture with improved SYN offset handling and TCP option parsing
- Enhance TLS uprobe SSL hooking for better key extraction
- Improve the ClickHouse writer for HTTP log materialized views
- Update RPM spec for Rocky Linux 8/9/10, fix systemd service
- Simplify loader with cleaner bpf2go integration

bot-detector:
- Add H2 SETTINGS per-parameter comparison in browser_matcher
- Enhance browser signatures and scoring pipeline
- Improve preprocessing and cycle detection

infra:
- Multi-distro Vagrantfile (centos8, rocky9, rocky10) with per-distro provisioning
- New Makefile targets: vm-up-all, test-vm-matrix, test-vm-centos8, test-vm-rocky10
- Add debug helpers and run-test-from-host.sh for host-driven VM testing
- Update run-tests-vm.sh for cross-distro compatibility
- Remove accidental binary blob (\004)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-04-13 01:09:33 +02:00
parent d81463a589
commit d75825278e
32 changed files with 2148 additions and 890 deletions

View File

@ -18,6 +18,7 @@ from .infra import get_client, set_healthy
from .preprocessing import preprocess_df, FEATURES, FEATURES_COMPLET
from .pipeline import run_semi_supervised_logic
from .fleet import enrich_with_fleet_score
from .browser_signatures import reload_signatures_from_clickhouse
from .metrics import record_cycle_metrics
@ -120,6 +121,13 @@ def fetch_and_analyze():
client = get_client()
# §3.9.5 — Rechargement périodique des signatures H2 depuis ClickHouse
try:
if reload_signatures_from_clickhouse(client):
log_info('[Signatures] Signatures H2 rechargées depuis browser_h2_signatures.')
except Exception:
pass
# ── Récupération du trafic (fenêtre 1h) ──────────────────────────────────
try:
df = client.query_df(f'SELECT * FROM {DB}.view_ai_features_1h')
@ -171,6 +179,43 @@ def fetch_and_analyze():
except Exception as e:
log_info(f'[Fleet §5] Enrichissement de flotte échoué : {e}')
# §3.9.5 — Queue unknown_h2_fingerprints : sessions H2 inconnues mais navigateur-like
try:
bm_col = 'bm_score' if 'bm_score' in df.columns else None
bc_col = 'browser_confidence' if 'browser_confidence' in df.columns else None
h2_col = 'h2_settings_known' if 'h2_settings_known' in df.columns else None
tls_col = 'tls_version' if 'tls_version' in df.columns else None
if bm_col and h2_col:
# Conditions : H2 inconnu + comportement navigateur + TLS 1.3
unknown_h2_mask = (
(df[h2_col] == 0) # H2 SETTINGS inconnu
& (
(df[bm_col] < 0.45) # browser_matcher ne reconnaît pas
| (bc_col and df[bc_col] >= 0.55) # mais browser_confidence élevé
)
)
if tls_col:
unknown_h2_mask = unknown_h2_mask & (df[tls_col].astype(str).str.startswith('TLSv1.3'))
unknown_h2 = df[unknown_h2_mask]
if not unknown_h2.empty:
n_unknown = len(unknown_h2)
# Insérer les fingerprints inconnus dans la table ClickHouse
client.command(
"INSERT INTO ja4_processing.unknown_h2_fingerprints "
"(observed_at, src_ip, ja4, h2_fingerprint, h2_settings_fp, "
"h2_window_update, h2_pseudo_order, h2_has_priority, "
"browser_confidence_score, header_user_agent, tls_version) "
"SELECT now(), src_ip, ja4, h2_fingerprint, h2_settings_fp, "
"h2_window_update, h2_pseudo_order, h2_has_priority, "
"browser_confidence, header_user_agent, tls_version "
"FROM input"
)
log_info(f'[H2 Queue] {n_unknown} fingerprint(s) H2 inconnu(s) mis en file d\'examen.')
except Exception as e:
log_info(f'[H2 Queue] Erreur insertion unknown_h2_fingerprints : {e}')
# ── Résumé des données chargées ───────────────────────────────────────────
n_total = len(df)
n_correlated = int((df.get('correlated', pd.Series()) == 1).sum())