feat: pipeline L7 HTTP complet + infrastructure tests VM

Correctifs pipeline L7 (uprobe SSL_read) :
- uprobe_ssl.c : ssl_set_fd ne retourne plus tôt quand fd_conn_map est
  vide (accept4 non disponible en Docker). Sauvegarde ssl_ptr→{fd,0,0}
  pour permettre le fallback /proc côté Go.
- main.go : consumeSSLEvents reécrit avec routeur magic-bytes complet :
  * HTTP/2 preface → extraction SETTINGS + conversion correlation.HTTP2Settings
  * HTTP/1.x requête → method, path, query, headers, header_order_sig
  * HTTP/1.x réponse → status_code
  * Fallback /proc/<tgid>/fd/<fd> quand src_ip=0 (accept4 absent)
- writer/clickhouse.go : export header_order_signature ajouté

Nouveaux packages :
- internal/parser/http1.go : parseur HTTP/1.x (IsHTTP1Request,
  ParseHTTP1Request, IsHTTP1Response, ParseHTTP1Response)
- internal/parser/http1_test.go : 11 tests unitaires (28 total passent)
- internal/procutil/proc_lookup.go : résolution fd→IP via /proc avec cache
  TTL 5s (FDCache). Supporte /proc/PID/net/tcp et tcp6, IPv4-mappé IPv6.

Infrastructure tests VM (tests/vm/) :
- Vagrantfile : VM Rocky Linux 9 KVM, 4 CPU / 4 GB RAM
- provision.sh : installation toolchain eBPF + Go + Docker + nginx
- run-tests-vm.sh : suite de test complète dans la VM (L3/L4+TLS+L7)
- README.md : guide d'installation et d'utilisation
- Makefile : cibles vm-up, vm-down, vm-ssh, test-vm-nginx, test-vm-all,
  vm-rebuild-ja4ebpf

Corrections stack Docker :
- Dockerfiles nginx/apache/nginx-varnish/hitch-varnish : suppression des
  références à shared/go/ja4common/ (répertoire supprimé)
- clickhouse-init.sh : restauré depuis git, seed anubis_ua_rules obsolète
  supprimé (table REGEXP_TREE supprimée du schéma)
- traffic-gen : ajout HTTP/1.0 (http.client) et HTTP/2 (httpx)
- verify_db.py : script de vérification 35 checks (L3/L4/TLS/L7/corrélation)
- run-stack-tests.sh : phase 6 verify_db ajoutée

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-12 02:37:00 +02:00
parent 9734e21fe3
commit f85a10b012
21 changed files with 1868 additions and 74 deletions

View File

@ -1,6 +1,8 @@
FROM python:3.12-alpine
# No extra deps needed — stdlib only (urllib, ssl, concurrent.futures)
# httpx[http2] pour les scénarios HTTP/2 explicites
RUN pip install --no-cache-dir "httpx[http2]"
WORKDIR /app
COPY *.py .

View File

@ -8,18 +8,19 @@ Simulates varied web traffic including:
- Multiple HTTP methods (GET, POST, PUT, DELETE, HEAD, OPTIONS, PATCH)
- Varied paths, query strings, form data, JSON payloads
- Both HTTP (port 80) and HTTPS (port 443)
- HTTP/1.0, HTTP/1.1, HTTP/2.0 (via httpx[http2])
- Different Accept/Language/Encoding headers
- Cookie / Referer / X-Forwarded-For always set — ensures src_ip diversity
in ClickHouse via mod_remoteip (r->useragent_ip updated from XFF)
- Multiple SSL contexts to vary TLS ClientHello parameters
Usage:
python generate_traffic.py [--host platform] [--http-port 80] [--https-port 443]
[--requests 500] [--workers 10] [--scenario all]
[--requests 500] [--workers 10]
"""
import argparse
import concurrent.futures
import http.client
import json
import random
import ssl
@ -435,6 +436,45 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
label="options-cors",
))
# --- HTTP/1.0 explicite sur HTTP (port 80) ---
# http.client permet de forcer le protocole HTTP/1.0 via _http_vsn
h10_count = max(10, int(count * 0.05))
for _ in range(h10_count):
ua = random.choice(BROWSERS + BOTS)
path = random.choice(["/", "/health", "/index.html", "/robots.txt"])
scenarios.append(RequestScenario(
method="GET",
url=f"{base_http}{path}",
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS + BOT_IPS)),
label="http10-plain",
))
# --- HTTP/1.0 explicite sur HTTPS ---
for _ in range(max(5, int(count * 0.03))):
ua = random.choice(BROWSERS + BOTS)
_, ssl_ctx = random.choice(SSL_CONTEXTS)
scenarios.append(RequestScenario(
method="GET",
url=f"{base_https}/health",
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
ssl_ctx=ssl_ctx,
label="http10-tls",
))
# --- HTTP/2 explicite (httpx[http2]) ---
h2_count = max(20, int(count * 0.10))
for _ in range(h2_count):
ua = random.choice(BROWSERS)
path = random.choice(PATHS)
qs = random.choice(QUERY_PARAMS)
scenarios.append(RequestScenario(
method=random.choice(["GET", "GET", "GET", "POST"]),
url=f"{base_https}{path}{qs}",
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
body=json.dumps({"h2": True}).encode() if random.random() < 0.2 else None,
label="http2-explicit",
))
# Fill remaining with browser HTTPS GETs
while len(scenarios) < count:
ua = random.choice(BROWSERS)
@ -457,8 +497,78 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
stats = {"ok": 0, "err": 0, "by_label": {}}
def _send_http10(scenario: RequestScenario) -> dict:
"""Envoie une requête en HTTP/1.0 pur via http.client."""
t0 = time.monotonic()
try:
from urllib.parse import urlparse
parsed = urlparse(scenario.url)
host = parsed.hostname
port = parsed.port or (443 if parsed.scheme == "https" else 80)
path = parsed.path or "/"
if parsed.query:
path += "?" + parsed.query
if parsed.scheme == "https":
ctx = scenario.ssl_ctx or ssl.create_default_context()
if hasattr(ctx, "check_hostname"):
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
conn = http.client.HTTPSConnection(host, port, timeout=5, context=ctx)
else:
conn = http.client.HTTPConnection(host, port, timeout=5)
# Forcer HTTP/1.0
conn._http_vsn = 10
conn._http_vsn_str = "HTTP/1.0"
hdrs = {k: v for k, v in scenario.headers.items()
if k.lower() not in ("connection",)}
conn.request(scenario.method, path, body=scenario.body, headers=hdrs)
resp = conn.getresponse()
resp.read(4096)
return {"ok": True, "status": resp.status, "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000)}
except Exception as e:
return {"ok": False, "error": str(e)[:80], "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000)}
finally:
try:
conn.close()
except Exception:
pass
def _send_http2(scenario: RequestScenario) -> dict:
"""Envoie une requête HTTP/2 via httpx (négociation ALPN h2)."""
t0 = time.monotonic()
try:
import httpx
with httpx.Client(http2=True, verify=False, timeout=5.0) as client:
hdrs = {k: v for k, v in scenario.headers.items()
if k.lower() not in ("connection", "content-length")}
resp = client.request(
method=scenario.method,
url=scenario.url,
headers=hdrs,
content=scenario.body,
)
return {"ok": True, "status": resp.status_code, "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000),
"http_version": resp.http_version}
except Exception as e:
return {"ok": False, "error": str(e)[:80], "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000)}
def send_request(scenario: RequestScenario) -> dict:
"""Send a single request, return result dict."""
"""Dispatcher : HTTP/1.0, HTTP/2, ou HTTP/1.1 selon le label."""
if scenario.label.startswith("http10"):
return _send_http10(scenario)
if scenario.label == "http2-explicit":
return _send_http2(scenario)
# HTTP/1.1 via urllib (chemin existant)
t0 = time.monotonic()
try:
req = urllib.request.Request(
@ -469,11 +579,10 @@ def send_request(scenario: RequestScenario) -> dict:
)
ctx = scenario.ssl_ctx
with urllib.request.urlopen(req, context=ctx, timeout=5) as resp:
_ = resp.read(4096) # consume partial body
_ = resp.read(4096)
return {"ok": True, "status": resp.status, "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000)}
except urllib.error.HTTPError as e:
# HTTP errors (4xx/5xx) are still valid responses — Apache served them
return {"ok": True, "status": e.code, "label": scenario.label,
"ms": int((time.monotonic() - t0) * 1000)}
except Exception as e:

View File

@ -0,0 +1,268 @@
#!/usr/bin/env python3
"""
verify_db.py — Vérification exhaustive des données dans ClickHouse après génération de trafic
Vérifie que toutes les couches de données attendues sont présentes :
- L3/L4 : TTL, MSS, window_size, df_bit
- TLS/L5 : ja4, sni, alpn, version
- L7 HTTP : method, path, status_code, duration_ms, header_order_signature
- Corrélation : correlated=1 (L3+L7), correlated=0 (L7 seul)
- Keep-alives : requêtes multiplexées sur une même connexion TCP
- HTTP/2 : tls_alpn='h2' ou h2_settings_count > 0
- HTTP plain : lignes sans TLS (port 80)
Usage:
python verify_db.py [--host clickhouse] [--port 9000]
[--db-logs ja4_logs] [--db-processing ja4_processing]
[--min-rows 10]
"""
import argparse
import sys
import time
# ---------------------------------------------------------------------------
# Client ClickHouse léger (HTTP interface port 8123)
# ---------------------------------------------------------------------------
import urllib.parse
import urllib.request
import json
def ch_query(host: str, port: int, query: str) -> list:
"""Envoie une requête SELECT à ClickHouse via HTTP (port 8123) et retourne les lignes."""
url = f"http://{host}:{port}/?query={urllib.parse.quote(query + ' FORMAT JSON')}"
try:
with urllib.request.urlopen(url, timeout=10) as resp:
data = json.loads(resp.read())
return data.get("data", [])
except Exception as e:
return [{"__error__": str(e)}]
def ch_scalar(host: str, port: int, query: str) -> str:
"""Retourne la première colonne de la première ligne."""
rows = ch_query(host, port, query)
if not rows or "__error__" in rows[0]:
return str(rows[0].get("__error__", "?"))
return str(list(rows[0].values())[0])
# ---------------------------------------------------------------------------
# Checks
# ---------------------------------------------------------------------------
CHECK_OK = ""
CHECK_FAIL = ""
CHECK_WARN = "⚠️ "
results: list = []
def check(name: str, query: str, host: str, port: int, expect_nonzero: bool = True,
min_val: int = 1, warn_only: bool = False) -> bool:
val = ch_scalar(host, port, query)
try:
n = int(float(val))
except ValueError:
n = -1
ok = n >= min_val if expect_nonzero else n == 0
icon = CHECK_OK if ok else (CHECK_WARN if warn_only else CHECK_FAIL)
results.append((icon, name, val))
return ok
def run_checks(host: str, http_port: int, db_logs: str, db_processing: str, min_rows: int):
print(f"\n{'='*65}")
print(f" Vérification ClickHouse — {db_logs} / {db_processing}")
print(f"{'='*65}\n")
# ------------------------------------------------------------------
# 1. Tables brutes
# ------------------------------------------------------------------
print("── 1. Tables brutes ─────────────────────────────────────────")
check("http_logs_raw : lignes totales",
f"SELECT count() FROM {db_logs}.http_logs_raw", host, http_port, min_val=min_rows)
check("http_logs : lignes après MV",
f"SELECT count() FROM {db_logs}.http_logs", host, http_port, min_val=min_rows)
# ------------------------------------------------------------------
# 2. Métadonnées L3/L4
# ------------------------------------------------------------------
print("\n── 2. Métadonnées L3/L4 ─────────────────────────────────────")
check("ip_meta_ttl > 0 (TTL capturé)",
f"SELECT count() FROM {db_logs}.http_logs WHERE ip_meta_ttl > 0",
host, http_port, min_val=1)
check("tcp_meta_mss > 0 (MSS capturé)",
f"SELECT count() FROM {db_logs}.http_logs WHERE tcp_meta_mss > 0",
host, http_port, min_val=1)
check("tcp_meta_window_size > 0 (window_size capturé)",
f"SELECT count() FROM {db_logs}.http_logs WHERE tcp_meta_window_size > 0",
host, http_port, min_val=1)
check("src_port renseigné (> 1000)",
f"SELECT count() FROM {db_logs}.http_logs WHERE src_port > 1000",
host, http_port, min_val=1)
# ------------------------------------------------------------------
# 3. TLS / JA4
# ------------------------------------------------------------------
print("\n── 3. TLS / JA4 ─────────────────────────────────────────────")
check("ja4 renseigné (fingerprint TLS)",
f"SELECT count() FROM {db_logs}.http_logs WHERE ja4 != ''",
host, http_port, min_val=1)
check("tls_sni renseigné (SNI extrait)",
f"SELECT count() FROM {db_logs}.http_logs WHERE tls_sni != ''",
host, http_port, min_val=1)
check("tls_version TLSv1.2 présente",
f"SELECT count() FROM {db_logs}.http_logs WHERE tls_version = 'TLSv1.2'",
host, http_port, min_val=1, warn_only=True)
check("tls_version TLSv1.3 présente",
f"SELECT count() FROM {db_logs}.http_logs WHERE tls_version = 'TLSv1.3'",
host, http_port, min_val=1)
check("tls_alpn h2 (HTTP/2 négocié)",
f"SELECT count() FROM {db_logs}.http_logs WHERE tls_alpn = 'h2'",
host, http_port, min_val=1, warn_only=True)
# ja4 format : t12d... ou t13d... selon la version TLS
check("ja4 TLS1.2 (commence par t12)",
f"SELECT count() FROM {db_logs}.http_logs WHERE startsWith(ja4, 't12')",
host, http_port, min_val=1, warn_only=True)
check("ja4 TLS1.3 (commence par t13)",
f"SELECT count() FROM {db_logs}.http_logs WHERE startsWith(ja4, 't13')",
host, http_port, min_val=1)
# ------------------------------------------------------------------
# 4. Couche L7 HTTP
# ------------------------------------------------------------------
print("\n── 4. Couche L7 HTTP ────────────────────────────────────────")
for method in ("GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS", "PATCH"):
check(f"méthode {method} présente",
f"SELECT count() FROM {db_logs}.http_logs WHERE method = '{method}'",
host, http_port, min_val=1)
check("path renseigné",
f"SELECT count() FROM {db_logs}.http_logs WHERE path != ''",
host, http_port, min_val=1)
check("status_code 200 présent",
f"SELECT count() FROM {db_logs}.http_logs WHERE status_code = 200",
host, http_port, min_val=1)
check("status_code 404 présent (requêtes de test)",
f"SELECT count() FROM {db_logs}.http_logs WHERE status_code IN (400,404,405,500)",
host, http_port, min_val=1, warn_only=True)
check("duration_ms > 0 (latence mesurée)",
f"SELECT count() FROM {db_logs}.http_logs WHERE duration_ms > 0",
host, http_port, min_val=1)
check("header_order_signature renseigné",
f"SELECT count() FROM {db_logs}.http_logs WHERE header_order_signature != ''",
host, http_port, min_val=1)
# ------------------------------------------------------------------
# 5. Corrélation L3+L7
# ------------------------------------------------------------------
print("\n── 5. Corrélation L3 ↔ L7 ───────────────────────────────────")
check("correlated=1 (lignes L3+L7 fusionnées)",
f"SELECT count() FROM {db_logs}.http_logs WHERE correlated = 1",
host, http_port, min_val=1)
check("correlated=0 (HTTP sans corr. réseau, port 80)",
f"SELECT count() FROM {db_logs}.http_logs WHERE correlated = 0",
host, http_port, min_val=1, warn_only=True)
pct_corr = ch_scalar(host, http_port,
f"SELECT round(100*countIf(correlated=1)/count(), 1) FROM {db_logs}.http_logs")
results.append((CHECK_OK, f"taux corrélation ({pct_corr}%)", pct_corr))
# ------------------------------------------------------------------
# 6. Keep-alives (TCP multiplexing)
# ------------------------------------------------------------------
print("\n── 6. Keep-alives ────────────────────────────────────────────")
check("keepalives > 1 (connexions multiplexées)",
f"SELECT count() FROM {db_logs}.http_logs WHERE keepalives > 1",
host, http_port, min_val=1, warn_only=True)
max_ka = ch_scalar(host, http_port,
f"SELECT max(keepalives) FROM {db_logs}.http_logs")
results.append((CHECK_OK, f"max keepalives ({max_ka})", max_ka))
# ------------------------------------------------------------------
# 7. Diversité des IPs sources
# ------------------------------------------------------------------
print("\n── 7. Diversité sources ─────────────────────────────────────")
check("IPs sources distinctes >= 5",
f"SELECT uniqExact(src_ip) FROM {db_logs}.http_logs",
host, http_port, min_val=5, warn_only=True)
# ------------------------------------------------------------------
# 8. Tables de traitement (ja4_processing)
# ------------------------------------------------------------------
print("\n── 8. Tables processing ─────────────────────────────────────")
for tbl in ("agg_host_ip_ja4_1h", "agg_header_fingerprint_1h",
"agg_ip_behavior_1h", "agg_request_timing_1h"):
check(f"{tbl} peuplée",
f"SELECT count() FROM {db_processing}.{tbl}",
host, http_port, min_val=1, warn_only=True)
# ------------------------------------------------------------------
# Résumé
# ------------------------------------------------------------------
print(f"\n{'='*65}")
fails = 0
warns = 0
for icon, name, val in results:
print(f" {icon} {name:<50s} {val}")
if icon == CHECK_FAIL:
fails += 1
elif icon == CHECK_WARN:
warns += 1
total = len(results)
passed = total - fails - warns
print(f"\n{'='*65}")
print(f" Résultat : {passed} OK | {warns} WARN | {fails} FAIL (total {total})")
print(f"{'='*65}\n")
return fails == 0
def main():
parser = argparse.ArgumentParser(description="Vérification ClickHouse post-trafic")
parser.add_argument("--host", default="clickhouse", help="Hôte ClickHouse")
parser.add_argument("--port", type=int, default=8123, help="Port HTTP ClickHouse (8123)")
parser.add_argument("--db-logs", default="ja4_logs", help="Base de données logs")
parser.add_argument("--db-processing", default="ja4_processing", help="Base processing")
parser.add_argument("--min-rows", type=int, default=10, help="Minimum de lignes attendues")
parser.add_argument("--wait", type=int, default=5,
help="Attendre N secondes avant de vérifier (flush MV)")
args = parser.parse_args()
if args.wait > 0:
print(f"[verify_db] Attente {args.wait}s pour le flush des Materialized Views...")
time.sleep(args.wait)
ok = run_checks(args.host, args.port, args.db_logs, args.db_processing, args.min_rows)
sys.exit(0 if ok else 1)
if __name__ == "__main__":
main()