Files
ja4-platform/services/ja4ebpf/internal/writer/clickhouse.go
Jacquin Antoine a02423fd18 feat: maximize data completeness across L3/L4/TLS/HTTP layers and add E2E test infra
Add SSL_write uprobe for HTTP response capture, HPACK decoder for HTTP/2
header extraction, and AcceptCache for reliable SSL/TC session correlation.
Populate all ClickHouse fields including tcp_meta_options, ip_meta_total_length,
syn_to_clienthello_ms, client_headers, TLS cipher suites/extensions, and
h2_enable_connect_protocol. Increase BPF capture buffers (HTTP 512B, TLS 1024B).
Add distributed E2E testing infrastructure with multi-VM Vagrant setup.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-15 03:34:33 +02:00

554 lines
18 KiB
Go

// Package writer gère l'écriture asynchrone par batch des sessions
// corrélées dans ClickHouse.
package writer
import (
"context"
"encoding/json"
"fmt"
"log"
"strings"
"time"
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
"github.com/antitbone/ja4/ja4ebpf/internal/correlation"
)
// ClickHouseWriter writes correlated sessions to ja4_logs.http_logs_raw
// using asynchronous batch inserts: Write enqueues sessions on a buffered
// channel and the goroutine launched by Start consumes it in batches.
type ClickHouseWriter struct {
conn driver.Conn // native ClickHouse connection
ch chan *correlation.SessionState // input channel of sessions awaiting insertion
batchSz int // number of sessions that triggers an insert
flush time.Duration // interval of the forced periodic flush
}
// sessionRecord is the flat JSON representation of a session stored in
// http_logs_raw. The JSON field names match exactly the keys expected by the
// mv_http_logs materialized view (MV), so tags must not be renamed without
// updating the MV. Pointer fields combined with omitempty are left out of the
// JSON when nil, which lets a present zero value (e.g. TTL 0) be told apart
// from an absent one.
type sessionRecord struct {
Time time.Time `json:"time"`
SrcIP string `json:"src_ip"`
SrcPort int `json:"src_port"`
DstIP string `json:"dst_ip"`
DstPort int `json:"dst_port"`
// IP metadata (key names expected by the MV)
IPMetaDF *bool `json:"ip_meta_df,omitempty"`
IPMetaID *uint16 `json:"ip_meta_id,omitempty"`
IPMetaTTL *uint8 `json:"ip_meta_ttl,omitempty"`
IPMetaTotalLength *uint16 `json:"ip_meta_total_length,omitempty"`
// TCP metadata (key names expected by the MV)
TCPMetaWindowSize *uint16 `json:"tcp_meta_window_size,omitempty"`
TCPMetaWindowScale *uint8 `json:"tcp_meta_window_scale,omitempty"`
TCPMetaMSS *uint16 `json:"tcp_meta_mss,omitempty"`
TCPMetaOptions string `json:"tcp_meta_options,omitempty"`
// TLS (key names expected by the MV)
JA4Hash string `json:"ja4,omitempty"`
JA3Raw string `json:"ja3,omitempty"`
JA3Hash string `json:"ja3_hash,omitempty"`
TLSSNI string `json:"tls_sni,omitempty"`
TLSALPN string `json:"tls_alpn,omitempty"`
TLSVersion string `json:"tls_version,omitempty"`
TLSCipherSuites string `json:"tls_cipher_suites,omitempty"`
TLSExtensions string `json:"tls_extensions,omitempty"`
TLSSupportedGroups string `json:"tls_supported_groups,omitempty"`
// HTTP
Method string `json:"method,omitempty"`
Path string `json:"path,omitempty"`
Host string `json:"host,omitempty"`
QueryString string `json:"query_string,omitempty"`
Scheme string `json:"scheme,omitempty"`
HTTPVersion string `json:"http_version,omitempty"`
StatusCode *int `json:"status_code,omitempty"`
ResponseSize *int64 `json:"response_size,omitempty"`
DurationMS *float64 `json:"duration_ms,omitempty"`
KeepAlives int `json:"keepalives,omitempty"`
HeaderOrderSig string `json:"header_order_signature,omitempty"`
HeadersRaw string `json:"headers_raw,omitempty"`
HeaderUserAgent string `json:"header_User-Agent,omitempty"`
HeaderAccept string `json:"header_Accept,omitempty"`
HeaderAcceptEnc string `json:"header_Accept-Encoding,omitempty"`
HeaderAcceptLang string `json:"header_Accept-Language,omitempty"`
HeaderContentType string `json:"header_Content-Type,omitempty"`
HeaderXReqID string `json:"header_X-Request-Id,omitempty"`
HeaderXTraceID string `json:"header_X-Trace-Id,omitempty"`
HeaderXForwarded string `json:"header_X-Forwarded-For,omitempty"`
HeaderSecCHUA string `json:"header_Sec-CH-UA,omitempty"`
HeaderSecCHUAMobile string `json:"header_Sec-CH-UA-Mobile,omitempty"`
HeaderSecCHUAPlat string `json:"header_Sec-CH-UA-Platform,omitempty"`
HeaderSecFetchDest string `json:"header_Sec-Fetch-Dest,omitempty"`
HeaderSecFetchMode string `json:"header_Sec-Fetch-Mode,omitempty"`
HeaderSecFetchSite string `json:"header_Sec-Fetch-Site,omitempty"`
// Passive HTTP/2 fingerprinting
H2Fingerprint string `json:"h2_fingerprint,omitempty"`
H2SettingsFP string `json:"h2_settings_fp,omitempty"`
H2WindowUpdate uint32 `json:"h2_window_update,omitempty"`
H2PseudoOrder string `json:"h2_pseudo_order,omitempty"`
H2HasPriority uint8 `json:"h2_has_priority,omitempty"`
H2HeaderTableSize *int32 `json:"h2_header_table_size,omitempty"`
H2EnablePush *int32 `json:"h2_enable_push,omitempty"`
H2MaxConcurrentStreams *int32 `json:"h2_max_concurrent_streams,omitempty"`
H2InitialWindowSize *int64 `json:"h2_initial_window_size,omitempty"`
H2MaxFrameSize *int32 `json:"h2_max_frame_size,omitempty"`
H2MaxHeaderListSize *int32 `json:"h2_max_header_list_size,omitempty"`
H2EnableConnectProtocol *int32 `json:"h2_enable_connect_protocol,omitempty"`
// Correlation fields
ClientHeaders string `json:"client_headers,omitempty"`
SynToClientHelloMs *int32 `json:"syn_to_clienthello_ms,omitempty"`
}
// NewClickHouseWriter validates its arguments, opens a native ClickHouse
// connection from dsn and verifies it with a ping bounded to 5 seconds.
//
// batchSize is the number of sessions accumulated before an insert is
// triggered. Values below 1 are clamped to 1 (flush on every session); a
// negative value would otherwise panic when Start allocates the batch slice.
//
// flushInterval is the period of the forced flush and must be strictly
// positive, because Start feeds it to time.NewTicker, which panics on a
// non-positive duration.
//
// It returns an error when flushInterval is invalid, when the DSN cannot be
// parsed, or when the connection cannot be opened or pinged. On a ping
// failure the connection is closed before returning.
func NewClickHouseWriter(dsn string, batchSize int, flushInterval time.Duration) (*ClickHouseWriter, error) {
	if flushInterval <= 0 {
		return nil, fmt.Errorf("intervalle de flush invalide: %v", flushInterval)
	}
	if batchSize < 1 {
		batchSize = 1
	}

	opts, err := clickhouse.ParseDSN(dsn)
	if err != nil {
		return nil, fmt.Errorf("analyse DSN ClickHouse: %w", err)
	}
	conn, err := clickhouse.Open(opts)
	if err != nil {
		return nil, fmt.Errorf("connexion ClickHouse: %w", err)
	}

	// Check the connection with a time-limited ping so that a wrong or
	// unreachable endpoint is reported at construction time.
	pingCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := conn.Ping(pingCtx); err != nil {
		// Best-effort close: the ping error is the one worth reporting.
		_ = conn.Close()
		return nil, fmt.Errorf("ping ClickHouse: %w", err)
	}

	return &ClickHouseWriter{
		conn:    conn,
		ch:      make(chan *correlation.SessionState, 8192),
		batchSz: batchSize,
		flush:   flushInterval,
	}, nil
}
// Start launches the goroutine that consumes the session channel and inserts
// the sessions into ClickHouse in batches. A batch is flushed when it reaches
// the configured batch size or when the periodic ticker fires.
//
// The goroutine stops when ctx is cancelled or when the channel is closed.
// On cancellation it first drains the sessions still buffered in the channel
// (they would otherwise be lost) and flushes them; this shutdown work runs on
// a context detached from ctx and is bounded to 5 seconds. The ClickHouse
// connection is closed when the goroutine exits.
//
// Insert errors are logged and the failing batch is dropped, so a failing
// insert never blocks the pipeline.
func (w *ClickHouseWriter) Start(ctx context.Context) {
	// shutdownTimeout bounds the final drain and flush. They cannot use ctx,
	// which may already be cancelled at that point.
	const shutdownTimeout = 5 * time.Second

	go func() {
		batch := make([]*correlation.SessionState, 0, w.batchSz)
		ticker := time.NewTicker(w.flush)
		defer ticker.Stop()
		defer w.conn.Close()

		// flushPending inserts the pending batch, if any, then resets it while
		// keeping its backing array. reason only qualifies the log message.
		flushPending := func(flushCtx context.Context, reason string) {
			if len(batch) == 0 {
				return
			}
			if err := w.flushBatch(flushCtx, batch); err != nil {
				log.Printf("[writer] erreur flush %s: %v", reason, err)
			}
			batch = batch[:0]
		}

		for {
			select {
			case s, ok := <-w.ch:
				if !ok {
					// Channel closed: flush what is left on a detached context,
					// since ctx may have been cancelled concurrently.
					finalCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
					flushPending(finalCtx, "final")
					cancel()
					return
				}
				batch = append(batch, s)
				if len(batch) >= w.batchSz {
					flushPending(ctx, "batch")
				}

			case <-ticker.C:
				flushPending(ctx, "périodique")

			case <-ctx.Done():
				stopCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
				defer cancel()
				// Drain the sessions already queued, without blocking, so that
				// they are written instead of being silently discarded. The
				// loop is bounded by stopCtx in case producers keep writing.
			drain:
				for stopCtx.Err() == nil {
					select {
					case s, ok := <-w.ch:
						if !ok {
							break drain
						}
						batch = append(batch, s)
						if len(batch) >= w.batchSz {
							flushPending(stopCtx, "arrêt")
						}
					default:
						break drain
					}
				}
				flushPending(stopCtx, "arrêt")
				return
			}
		}
	}()
}
// Write enqueues a session for insertion without ever blocking the caller.
// When the channel buffer is full, the session is dropped and a warning is
// logged with the session's source address and port.
func (w *ClickHouseWriter) Write(s *correlation.SessionState) {
	select {
	case w.ch <- s:
		return
	default:
		// Buffer full: fall through to the drop path below.
	}
	log.Printf("[writer] canal plein, session abandonnée: src=%v:%d", s.Key.SrcIP, s.Key.SrcPort)
}
// flushBatch inserts a batch of sessions into ja4_logs.http_logs_raw.
// Each session is converted with sessionToRecord, serialised to JSON and
// appended as one row of the raw_json column; the rows are then sent in a
// single insert.
//
// An empty batch is a no-op. If serialising or appending a row fails, the
// prepared batch is aborted so that the connection it holds is released, and
// the error is returned; no row of the batch is inserted in that case.
func (w *ClickHouseWriter) flushBatch(ctx context.Context, batch []*correlation.SessionState) error {
	if len(batch) == 0 {
		return nil
	}

	b, err := w.conn.PrepareBatch(ctx, "INSERT INTO ja4_logs.http_logs_raw (raw_json)")
	if err != nil {
		return fmt.Errorf("préparation batch ClickHouse: %w", err)
	}

	for _, s := range batch {
		jsonBytes, err := json.Marshal(sessionToRecord(s))
		if err != nil {
			// Best-effort abort: the serialisation error is the one reported.
			_ = b.Abort()
			return fmt.Errorf("sérialisation session JSON: %w", err)
		}
		if err := b.Append(string(jsonBytes)); err != nil {
			// Best-effort abort: the append error is the one reported.
			_ = b.Abort()
			return fmt.Errorf("ajout ligne au batch: %w", err)
		}
	}

	if err := b.Send(); err != nil {
		return fmt.Errorf("envoi batch ClickHouse (%d lignes): %w", len(batch), err)
	}
	return nil
}
// sessionToRecord flattens a correlated SessionState into the sessionRecord
// that is serialised to JSON for http_logs_raw.
//
// The source address comes from the session key; destination and IP/TCP
// metadata come from s.L3L4 when present; TLS fields from s.TLS when present;
// HTTP and HTTP/2 fields from the last request of s.Requests. Optional values
// are exposed as pointers into the session so that absent ones are omitted
// from the JSON.
//
// Host resolution: the Host of the last HTTP request wins when it is
// non-empty; otherwise the TLS SNI is used as a fallback.
func sessionToRecord(s *correlation.SessionState) sessionRecord {
	// Addresses are rendered from their first four bytes as dotted IPv4.
	srcIP := fmt.Sprintf("%d.%d.%d.%d",
		s.Key.SrcIP[0], s.Key.SrcIP[1], s.Key.SrcIP[2], s.Key.SrcIP[3])
	rec := sessionRecord{
		Time:       s.FirstSeen,
		SrcIP:      srcIP,
		SrcPort:    int(s.Key.SrcPort),
		KeepAlives: len(s.Requests),
	}

	// dst_port fallback for TLS sessions without L3/L4 data: 443 for HTTPS.
	if s.L3L4 == nil && s.TLS != nil {
		rec.DstPort = 443
	}

	// IP/TCP metadata fields.
	if s.L3L4 != nil {
		rec.DstIP = fmt.Sprintf("%d.%d.%d.%d",
			s.L3L4.DstIP[0], s.L3L4.DstIP[1], s.L3L4.DstIP[2], s.L3L4.DstIP[3])
		rec.DstPort = int(s.L3L4.DstPort)
		rec.IPMetaDF = &s.L3L4.DFBit
		rec.IPMetaID = &s.L3L4.IPID
		rec.IPMetaTTL = &s.L3L4.TTL
		if s.L3L4.IPTotalLength > 0 {
			rec.IPMetaTotalLength = &s.L3L4.IPTotalLength
		}
		rec.TCPMetaWindowSize = &s.L3L4.WindowSize
		// WindowScale 0xFF means "absent" (C-side convention): leave it out.
		if s.L3L4.WindowScale != 0xFF {
			rec.TCPMetaWindowScale = &s.L3L4.WindowScale
		}
		// MSS 0 means "absent": leave it out.
		if s.L3L4.MSS > 0 {
			rec.TCPMetaMSS = &s.L3L4.MSS
		}
		// TCP options rendered as short names (MSS, WS, SACK, TS, ...).
		if len(s.L3L4.TCPOptionsRaw) > 0 {
			rec.TCPMetaOptions = formatTCPOptions(s.L3L4.TCPOptionsRaw)
		}
	}

	// TLS fields.
	if s.TLS != nil {
		rec.JA4Hash = s.TLS.JA4Hash
		rec.JA3Raw = s.TLS.JA3Raw
		rec.JA3Hash = s.TLS.JA3Hash
		rec.TLSSNI = s.TLS.SNI
		rec.TLSALPN = strings.Join(s.TLS.ALPN, ",")
		rec.TLSVersion = formatTLSVersion(s.TLS.TLSVersion)
		// Cipher suites: dash-separated list of 4-digit hex values.
		if len(s.TLS.CipherSuites) > 0 {
			parts := make([]string, len(s.TLS.CipherSuites))
			for i, cs := range s.TLS.CipherSuites {
				parts[i] = fmt.Sprintf("%04x", cs)
			}
			rec.TLSCipherSuites = strings.Join(parts, "-")
		}
		// Extensions: dash-separated list of 4-digit hex IDs.
		if len(s.TLS.Extensions) > 0 {
			parts := make([]string, len(s.TLS.Extensions))
			for i, ext := range s.TLS.Extensions {
				parts[i] = fmt.Sprintf("%04x", ext)
			}
			rec.TLSExtensions = strings.Join(parts, "-")
		}
		// Supported groups are not populated yet; the raw ClientHello is
		// referenced here only as a placeholder for a future extraction.
		_ = s.TLS.ClientHelloRaw
		// Host fallback: use the TLS SNI. It is replaced further down only if
		// the last HTTP request carries a non-empty Host.
		if rec.Host == "" && s.TLS.SNI != "" {
			rec.Host = s.TLS.SNI
		}
		// Scheme deduced from the presence of a TLS SNI.
		if s.TLS.SNI != "" {
			rec.Scheme = "https"
		}
	}

	// syn_to_clienthello_ms: delay between the SYN and the ClientHello.
	if s.L3L4 != nil && s.TLS != nil && !s.L3L4.SYNTimestamp.IsZero() && !s.TLS.Timestamp.IsZero() {
		delta := int32(s.TLS.Timestamp.Sub(s.L3L4.SYNTimestamp).Milliseconds())
		rec.SynToClientHelloMs = &delta
	}

	// HTTP fields (taken from the last request of the session).
	if len(s.Requests) > 0 {
		last := &s.Requests[len(s.Requests)-1]
		rec.Method = last.Method
		rec.Path = last.Path
		rec.QueryString = last.QueryString
		// Only override the host when the request actually carried one, so
		// that the TLS SNI fallback set above is not erased by an empty Host.
		if last.Host != "" {
			rec.Host = last.Host
			if s.TLS != nil {
				rec.Scheme = "https"
			}
		}
		rec.HTTPVersion = last.HTTPVersion
		rec.StatusCode = &last.StatusCode
		rec.ResponseSize = &last.ResponseSize
		rec.DurationMS = &last.DurationMS
		rec.HeaderOrderSig = last.HeaderOrderSig

		// Individual HTTP headers (HTTP/1.1: Title-Case keys, HTTP/2: lowercase keys).
		if last.HeaderKV != nil {
			rec.HeaderUserAgent = headerVal(last.HeaderKV, "User-Agent", "user-agent")
			rec.HeaderAccept = headerVal(last.HeaderKV, "Accept", "accept")
			rec.HeaderAcceptEnc = headerVal(last.HeaderKV, "Accept-Encoding", "accept-encoding")
			rec.HeaderAcceptLang = headerVal(last.HeaderKV, "Accept-Language", "accept-language")
			rec.HeaderContentType = headerVal(last.HeaderKV, "Content-Type", "content-type")
			rec.HeaderXReqID = headerVal(last.HeaderKV, "X-Request-Id", "x-request-id")
			rec.HeaderXTraceID = headerVal(last.HeaderKV, "X-Trace-Id", "x-trace-id")
			rec.HeaderXForwarded = headerVal(last.HeaderKV, "X-Forwarded-For", "x-forwarded-for")
			rec.HeaderSecCHUA = headerVal(last.HeaderKV, "Sec-CH-UA", "sec-ch-ua")
			rec.HeaderSecCHUAMobile = headerVal(last.HeaderKV, "Sec-CH-UA-Mobile", "sec-ch-ua-mobile")
			rec.HeaderSecCHUAPlat = headerVal(last.HeaderKV, "Sec-CH-UA-Platform", "sec-ch-ua-platform")
			rec.HeaderSecFetchDest = headerVal(last.HeaderKV, "Sec-Fetch-Dest", "sec-fetch-dest")
			rec.HeaderSecFetchMode = headerVal(last.HeaderKV, "Sec-Fetch-Mode", "sec-fetch-mode")
			rec.HeaderSecFetchSite = headerVal(last.HeaderKV, "Sec-Fetch-Site", "sec-fetch-site")
			// client_headers: JSON object of the captured headers.
			rec.ClientHeaders = buildClientHeaders(last.HeaderKV)
		}

		// headers_raw: header names in their original order, joined with ";".
		if len(last.HeaderOrder) > 0 {
			rec.HeadersRaw = strings.Join(last.HeaderOrder, ";")
		}

		// Passive HTTP/2 fields.
		if last.HTTP2Settings != nil {
			h2 := last.HTTP2Settings
			rec.H2WindowUpdate = h2.WindowUpdateIncrement
			// Pseudo-header order in short notation, e.g. "m,a,s,p".
			if len(h2.PseudoHeaderOrder) > 0 {
				rec.H2PseudoOrder = pseudoOrderToShort(h2.PseudoHeaderOrder)
			}
			// Individual SETTINGS parameters. The pointers are always set here.
			// NOTE(review): the fingerprint builders treat negative values as
			// "absent"; confirm whether such values should be left nil instead.
			rec.H2HeaderTableSize = &h2.HeaderTableSize
			rec.H2EnablePush = &h2.EnablePush
			rec.H2MaxConcurrentStreams = &h2.MaxConcurrentStreams
			h2InitWin := int64(h2.InitialWindowSize)
			rec.H2InitialWindowSize = &h2InitWin
			rec.H2MaxFrameSize = &h2.MaxFrameSize
			rec.H2MaxHeaderListSize = &h2.MaxHeaderListSize
			rec.H2EnableConnectProtocol = &h2.EnableConnectProtocol
			// Composite fingerprints.
			rec.H2Fingerprint = buildH2Fingerprint(h2)
			rec.H2SettingsFP = buildH2SettingsFP(h2)
		}
	}
	return rec
}
// pseudoOrderToShort converts a list of HTTP/2 pseudo-header names into the
// short comma-separated notation used in fingerprints.
// Example: [":method", ":authority", ":scheme", ":path"] → "m,a,s,p".
// Names other than the four known pseudo-headers are rendered as "?".
// An empty or nil list yields "".
func pseudoOrderToShort(headers []string) string {
	// Guard the empty case: the capacity computed below would be -1 and make
	// would panic.
	if len(headers) == 0 {
		return ""
	}
	// One letter per header plus one comma between consecutive headers.
	short := make([]byte, 0, len(headers)*2-1)
	for i, h := range headers {
		if i > 0 {
			short = append(short, ',')
		}
		switch h {
		case ":method":
			short = append(short, 'm')
		case ":authority":
			short = append(short, 'a')
		case ":scheme":
			short = append(short, 's')
		case ":path":
			short = append(short, 'p')
		default:
			short = append(short, '?')
		}
	}
	return string(short)
}
// buildH2Fingerprint builds the composite HTTP/2 fingerprint string.
// Layout: SETTINGS pairs | WINDOW_UPDATE value | PRIORITY flag | pseudo-header order.
//
// SETTINGS pairs are written as "id:value" separated by commas. Settings 1, 2
// and 4 are always written; settings 3, 5 and 6 only when their value is >= 0.
// The WINDOW_UPDATE section is empty when the increment is 0, the PRIORITY
// section is always "0" (not captured currently), and the last section is
// empty when no pseudo-header order is known.
//
// NOTE(review): settings 1 and 2 are not guarded by the ">= 0" test applied to
// the others. Confirm whether they can also carry a negative "absent" value.
func buildH2Fingerprint(h2 *correlation.HTTP2Settings) string {
	var fp strings.Builder

	// SETTINGS section.
	fmt.Fprintf(&fp, "1:%d,2:%d", h2.HeaderTableSize, h2.EnablePush)
	if h2.MaxConcurrentStreams >= 0 {
		fmt.Fprintf(&fp, ",3:%d", h2.MaxConcurrentStreams)
	}
	fmt.Fprintf(&fp, ",4:%d", h2.InitialWindowSize)
	if h2.MaxFrameSize >= 0 {
		fmt.Fprintf(&fp, ",5:%d", h2.MaxFrameSize)
	}
	if h2.MaxHeaderListSize >= 0 {
		fmt.Fprintf(&fp, ",6:%d", h2.MaxHeaderListSize)
	}

	// WINDOW_UPDATE section.
	fp.WriteByte('|')
	if h2.WindowUpdateIncrement > 0 {
		fmt.Fprintf(&fp, "%d", h2.WindowUpdateIncrement)
	}

	// PRIORITY section (not captured currently) followed by the separator
	// that opens the pseudo-header order section.
	fp.WriteString("|0|")
	if len(h2.PseudoHeaderOrder) > 0 {
		fp.WriteString(pseudoOrderToShort(h2.PseudoHeaderOrder))
	}
	return fp.String()
}
// buildH2SettingsFP builds the raw SETTINGS string made of "id:value" entries
// joined by commas. Only settings 3 (max concurrent streams), 4 (initial
// window size) and 2 (enable push) are written, in that order, and each one
// only when its value is >= 0. The result is "" when none qualifies.
func buildH2SettingsFP(h2 *correlation.HTTP2Settings) string {
	entries := make([]string, 0, 3)
	if v := h2.MaxConcurrentStreams; v >= 0 {
		entries = append(entries, fmt.Sprintf("3:%d", v))
	}
	if v := h2.InitialWindowSize; v >= 0 {
		entries = append(entries, fmt.Sprintf("4:%d", v))
	}
	if v := h2.EnablePush; v >= 0 {
		entries = append(entries, fmt.Sprintf("2:%d", v))
	}
	return strings.Join(entries, ",")
}
// formatTLSVersion converts a numeric TLS protocol version into a readable
// name: 0x0301..0x0304 map to "TLSv1.0".."TLSv1.3". Any other value yields "".
func formatTLSVersion(v uint16) string {
	const first, last = 0x0301, 0x0304
	if v < first || v > last {
		return ""
	}
	// The four known versions are contiguous, so the offset from the first
	// one indexes the name table directly.
	names := [...]string{"TLSv1.0", "TLSv1.1", "TLSv1.2", "TLSv1.3"}
	return names[v-first]
}
// headerVal looks a header up in kv under two possible spellings of its name:
// HTTP/1.1 captures use Title-Case keys (e.g. "User-Agent") while HTTP/2
// captures use lowercase keys (e.g. "user-agent"). The Title-Case value wins
// when it is non-empty; otherwise the lowercase value (possibly "") is returned.
func headerVal(kv map[string]string, titleKey, lowerKey string) string {
	val := kv[titleKey]
	if val == "" {
		val = kv[lowerKey]
	}
	return val
}
// buildClientHeaders serialises the captured headers as a JSON object for the
// client_headers column. encoding/json writes map keys in sorted order, so the
// output is deterministic. An empty or nil map, or a serialisation failure,
// yields "".
func buildClientHeaders(kv map[string]string) string {
	if len(kv) > 0 {
		if encoded, err := json.Marshal(kv); err == nil {
			return string(encoded)
		}
	}
	return ""
}
// formatTCPOptions converts raw TCP option bytes into a readable,
// comma-separated list of short names, in the order they appear:
// NOP (kind 1), MSS (2), WS (3), SACK (4, SACK permitted), SACKb (5, SACK
// blocks) and TS (8). Other kinds are skipped without being named.
//
// Parsing stops at the End of Option List (kind 0). The function always
// terminates, even on malformed input: a truncated option, or a
// variable-length option whose length byte is 0, advances by one byte.
// An empty or nil input yields "".
func formatTCPOptions(opts []byte) string {
	if len(opts) == 0 {
		return ""
	}
	var names []string
	i := 0

	// lengthStep returns how far to advance past the option at i using its
	// length byte. It never returns 0, so the scan cannot stall.
	lengthStep := func() int {
		if i+1 < len(opts) && opts[i+1] > 0 {
			return int(opts[i+1])
		}
		return 1
	}

scan:
	for i < len(opts) {
		switch opts[i] {
		case 0: // End of Option List: nothing meaningful follows.
			// A bare break would only leave the switch and loop forever.
			break scan
		case 1: // NOP (single byte, no length)
			names = append(names, "NOP")
			i++
		case 2: // MSS (4 bytes)
			names = append(names, "MSS")
			if i+3 < len(opts) {
				i += 4
			} else {
				i++
			}
		case 3: // Window Scale (3 bytes)
			names = append(names, "WS")
			if i+2 < len(opts) {
				i += 3
			} else {
				i++
			}
		case 4: // SACK Permitted (2 bytes)
			names = append(names, "SACK")
			i += 2
		case 5: // SACK blocks (variable length)
			names = append(names, "SACKb")
			i += lengthStep()
		case 8: // Timestamp (10 bytes)
			names = append(names, "TS")
			if i+9 < len(opts) {
				i += 10
			} else {
				i++
			}
		default: // Unknown kind: skip it using its length byte.
			i += lengthStep()
		}
	}
	return strings.Join(names, ",")
}