From eed376d74954bbbdc59a198c7d5a34e97a478a89 Mon Sep 17 00:00:00 2001 From: toto Date: Tue, 3 Mar 2026 14:38:45 +0100 Subject: [PATCH] docs: update ClickHouse schema with new table structure - http_logs_raw: partition by toDate(ingest_time), order by ingest_time - http_logs: explicit columns (no DEFAULT), extracted by MV - mv_http_logs: full SELECT with JSONExtract* + coalesce for all fields - Add 17 HTTP header fields (User-Agent, Accept, Sec-CH-UA, etc.) - New ORDER BY: (time, src_ip, dst_ip, ja4) - architecture.yml: match new schema with MV query details Co-authored-by: Qwen-Coder --- README.md | 153 +++++++++++++++++++++++++++++++------------ architecture.yml | 164 ++++++++++++++++++++++++++++++----------------- 2 files changed, 215 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index b3c2252..7ec5bd1 100644 --- a/README.md +++ b/README.md @@ -207,63 +207,72 @@ DROP TABLE IF EXISTS mabase_prod.http_logs_raw; CREATE TABLE mabase_prod.http_logs_raw ( - raw_json String, + raw_json String, ingest_time DateTime DEFAULT now() ) ENGINE = MergeTree -ORDER BY tuple() -TTL ingest_time + INTERVAL 1 DAY -SETTINGS ttl_only_drop_parts = 1; +PARTITION BY toDate(ingest_time) +ORDER BY ingest_time; -- 3. 
Table parsée DROP TABLE IF EXISTS mabase_prod.http_logs; CREATE TABLE mabase_prod.http_logs ( - raw_json String, + time DateTime, + log_date Date DEFAULT toDate(time), - -- champs de base - time_str String DEFAULT JSONExtractString(raw_json, 'time'), - timestamp_str String DEFAULT JSONExtractString(raw_json, 'timestamp'), - time DateTime DEFAULT parseDateTimeBestEffort(time_str), - log_date Date DEFAULT toDate(time), + src_ip IPv4, + src_port UInt16, + dst_ip IPv4, + dst_port UInt16, - src_ip IPv4 DEFAULT toIPv4(JSONExtractString(raw_json, 'src_ip')), - src_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'src_port')), - dst_ip IPv4 DEFAULT toIPv4(JSONExtractString(raw_json, 'dst_ip')), - dst_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'dst_port')), + method LowCardinality(String), + scheme LowCardinality(String), + host LowCardinality(String), + path String, + query String, + http_version LowCardinality(String), + orphan_side LowCardinality(String), - correlated UInt8 DEFAULT JSONExtractBool(raw_json, 'correlated'), - keepalives UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'keepalives')), - method LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'method'), - scheme LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'scheme'), - host LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'host'), - path String DEFAULT JSONExtractString(raw_json, 'path'), - query String DEFAULT JSONExtractString(raw_json, 'query'), - http_version LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'http_version'), - orphan_side LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'orphan_side'), + correlated UInt8, + keepalives UInt16, + a_timestamp UInt64, + b_timestamp UInt64, + conn_id String, - a_timestamp UInt64 DEFAULT JSONExtractUInt(raw_json, 'a_timestamp'), - b_timestamp UInt64 DEFAULT JSONExtractUInt(raw_json, 'b_timestamp'), - conn_id String DEFAULT JSONExtractString(raw_json, 'conn_id'), - ip_meta_df UInt8 DEFAULT 
JSONExtractBool(raw_json, 'ip_meta_df'), - ip_meta_id UInt32 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_id'), - ip_meta_total_length UInt32 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_total_length'), - ip_meta_ttl UInt8 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_ttl'), - tcp_meta_options LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tcp_meta_options'), - tcp_meta_window_size UInt32 DEFAULT JSONExtractUInt(raw_json, 'tcp_meta_window_size'), - syn_to_clienthello_ms Int32 DEFAULT toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms')), - tls_version LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_version'), - tls_sni LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_sni'), - ja3 String DEFAULT JSONExtractString(raw_json, 'ja3'), - ja3_hash String DEFAULT JSONExtractString(raw_json, 'ja3_hash'), - ja4 String DEFAULT JSONExtractString(raw_json, 'ja4'), + ip_meta_df UInt8, + ip_meta_id UInt32, + ip_meta_total_length UInt32, + ip_meta_ttl UInt8, + tcp_meta_options LowCardinality(String), + tcp_meta_window_size UInt32, + syn_to_clienthello_ms Int32, - extra JSON DEFAULT raw_json + tls_version LowCardinality(String), + tls_sni LowCardinality(String), + ja3 String, + ja3_hash String, + ja4 String, + + header_user_agent String, + header_accept String, + header_accept_encoding String, + header_accept_language String, + header_x_request_id String, + header_x_trace_id String, + header_x_forwarded_for String, + + header_sec_ch_ua String, + header_sec_ch_ua_mobile String, + header_sec_ch_ua_platform String, + header_sec_fetch_dest String, + header_sec_fetch_mode String, + header_sec_fetch_site String ) ENGINE = MergeTree -PARTITION BY toYYYYMM(log_date) -ORDER BY (log_date, dst_ip, src_ip, time); +PARTITION BY log_date +ORDER BY (time, src_ip, dst_ip, ja4); -- 4. 
Vue matérialisée (RAW → logs) DROP VIEW IF EXISTS mabase_prod.mv_http_logs; @@ -271,7 +280,67 @@ DROP VIEW IF EXISTS mabase_prod.mv_http_logs; CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs TO mabase_prod.http_logs AS -SELECT raw_json +SELECT + -- 1. Time: a missing or unparseable 'time' becomes the zero DateTime (Unix epoch) instead of failing the INSERT + parseDateTimeBestEffortOrZero( + JSONExtractString(raw_json, 'time') + ) AS time, + toDate(time) AS log_date, + + -- 2. Network L3/L4: a missing or malformed address becomes 0.0.0.0 + toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port, + toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, + + -- 3. Basic HTTP + coalesce(JSONExtractString(raw_json, 'method'), '') AS method, + coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, + coalesce(JSONExtractString(raw_json, 'host'), '') AS host, + coalesce(JSONExtractString(raw_json, 'path'), '') AS path, + coalesce(JSONExtractString(raw_json, 'query'), '') AS query, + coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version, + coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side, + + -- 4. Connection / correlation + toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives, + coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp, + coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp, + coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id, + + -- 5. 
IP/TCP + toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0) AS ip_meta_id, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0) AS ip_meta_total_length, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0) AS ip_meta_ttl, + coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options, + coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0) AS tcp_meta_window_size, + toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms, + + -- 6. TLS / JA3/JA4 + coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version, + coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni, + coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3, + coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash, + coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4, + + -- 7. Headers HTTP + coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent, + coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept, + coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding, + coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language, + coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id, + coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id, + coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for, + + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua, + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile, + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform, + coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest, + 
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, + coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site + FROM mabase_prod.http_logs_raw; ``` diff --git a/architecture.yml b/architecture.yml index 303155d..f36952b 100644 --- a/architecture.yml +++ b/architecture.yml @@ -404,19 +404,17 @@ clickhouse_schema: database: mabase_prod description: > La table ClickHouse est gérée en dehors du service. Deux tables sont utilisées : - http_logs_raw (table d'ingestion avec TTL 1 jour) et http_logs (table enrichie - avec extraction des champs via des colonnes matérialisées). Une vue matérialisée - transfère automatiquement les données de RAW vers parsée. + http_logs_raw (table d'ingestion partitionnée par jour) et http_logs (table parsée + avec extraction explicite des champs). Une vue matérialisée transfère automatiquement + les données de RAW vers parsée. tables: - name: http_logs_raw description: > - Table d'ingestion brute avec TTL. Une seule colonne raw_json contient le log - corrélé complet sérialisé en JSON. TTL de 1 jour pour limiter le stockage. + Table d'ingestion brute. Une seule colonne raw_json contient le log corrélé + complet sérialisé en JSON. Partitionnée par jour ; aucun TTL n'est défini sur cette table. engine: MergeTree - order_by: tuple() - ttl: ingest_time + INTERVAL 1 DAY - settings: - ttl_only_drop_parts: 1 + partition_by: toDate(ingest_time) + order_by: ingest_time columns: - name: raw_json type: String @@ -429,122 +427,168 @@ clickhouse_schema: - name: http_logs description: > - Table enrichie avec extraction des champs du JSON brut via des expressions DEFAULT. - Partitionnée par mois, optimisée pour les requêtes analytiques. + Table parsée avec tous les champs extraits explicitement par la vue matérialisée. + Partitionnée par log_date, optimisée pour les requêtes analytiques. 
engine: MergeTree - partition_by: toYYYYMM(log_date) - order_by: (log_date, dst_ip, src_ip, time) + partition_by: log_date + order_by: (time, src_ip, dst_ip, ja4) columns: - - name: raw_json - type: String - - name: time_str - type: String - default: JSONExtractString(raw_json, 'time') - - name: timestamp_str - type: String - default: JSONExtractString(raw_json, 'timestamp') - name: time type: DateTime - default: parseDateTimeBestEffort(time_str) - name: log_date type: Date default: toDate(time) - name: src_ip type: IPv4 - default: toIPv4(JSONExtractString(raw_json, 'src_ip')) - name: src_port type: UInt16 - default: toUInt16(JSONExtractUInt(raw_json, 'src_port')) - name: dst_ip type: IPv4 - default: toIPv4(JSONExtractString(raw_json, 'dst_ip')) - name: dst_port type: UInt16 - default: toUInt16(JSONExtractUInt(raw_json, 'dst_port')) - - name: correlated - type: UInt8 - default: JSONExtractBool(raw_json, 'correlated') - - name: keepalives - type: UInt16 - default: toUInt16(JSONExtractUInt(raw_json, 'keepalives')) - name: method type: LowCardinality(String) - default: JSONExtractString(raw_json, 'method') - name: scheme type: LowCardinality(String) - default: JSONExtractString(raw_json, 'scheme') - name: host type: LowCardinality(String) - default: JSONExtractString(raw_json, 'host') - name: path type: String - default: JSONExtractString(raw_json, 'path') - name: query type: String - default: JSONExtractString(raw_json, 'query') - name: http_version type: LowCardinality(String) - default: JSONExtractString(raw_json, 'http_version') - name: orphan_side type: LowCardinality(String) - default: JSONExtractString(raw_json, 'orphan_side') + - name: correlated + type: UInt8 + - name: keepalives + type: UInt16 - name: a_timestamp type: UInt64 - default: JSONExtractUInt(raw_json, 'a_timestamp') - name: b_timestamp type: UInt64 - default: JSONExtractUInt(raw_json, 'b_timestamp') - name: conn_id type: String - default: JSONExtractString(raw_json, 'conn_id') - name: ip_meta_df 
type: UInt8 - default: JSONExtractBool(raw_json, 'ip_meta_df') - name: ip_meta_id type: UInt32 - default: JSONExtractUInt(raw_json, 'ip_meta_id') - name: ip_meta_total_length type: UInt32 - default: JSONExtractUInt(raw_json, 'ip_meta_total_length') - name: ip_meta_ttl type: UInt8 - default: JSONExtractUInt(raw_json, 'ip_meta_ttl') - name: tcp_meta_options type: LowCardinality(String) - default: JSONExtractString(raw_json, 'tcp_meta_options') - name: tcp_meta_window_size type: UInt32 - default: JSONExtractUInt(raw_json, 'tcp_meta_window_size') - name: syn_to_clienthello_ms type: Int32 - default: toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms')) - name: tls_version type: LowCardinality(String) - default: JSONExtractString(raw_json, 'tls_version') - name: tls_sni type: LowCardinality(String) - default: JSONExtractString(raw_json, 'tls_sni') - name: ja3 type: String - default: JSONExtractString(raw_json, 'ja3') - name: ja3_hash type: String - default: JSONExtractString(raw_json, 'ja3_hash') - name: ja4 type: String - default: JSONExtractString(raw_json, 'ja4') - - name: extra - type: JSON - default: raw_json + - name: header_user_agent + type: String + - name: header_accept + type: String + - name: header_accept_encoding + type: String + - name: header_accept_language + type: String + - name: header_x_request_id + type: String + - name: header_x_trace_id + type: String + - name: header_x_forwarded_for + type: String + - name: header_sec_ch_ua + type: String + - name: header_sec_ch_ua_mobile + type: String + - name: header_sec_ch_ua_platform + type: String + - name: header_sec_fetch_dest + type: String + - name: header_sec_fetch_mode + type: String + - name: header_sec_fetch_site + type: String - name: mv_http_logs type: materialized_view description: > - Vue matérialisée qui transfère automatiquement les données de http_logs_raw - vers http_logs lors de chaque INSERT. 
+ Vue matérialisée qui transfère les données de http_logs_raw vers http_logs + en extrayant tous les champs du JSON via JSONExtract* ; les valeurs absentes ou invalides + retombent sur des valeurs par défaut (coalesce, parseDateTimeBestEffortOrZero, toIPv4OrDefault). target: mabase_prod.http_logs query: | - SELECT raw_json FROM mabase_prod.http_logs_raw + SELECT + -- 1. Time: a missing or unparseable 'time' becomes the zero DateTime (Unix epoch) instead of failing the INSERT + parseDateTimeBestEffortOrZero( + JSONExtractString(raw_json, 'time') + ) AS time, + toDate(time) AS log_date, + + -- 2. Network L3/L4: a missing or malformed address becomes 0.0.0.0 + toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port, + toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, + + -- 3. Basic HTTP + coalesce(JSONExtractString(raw_json, 'method'), '') AS method, + coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, + coalesce(JSONExtractString(raw_json, 'host'), '') AS host, + coalesce(JSONExtractString(raw_json, 'path'), '') AS path, + coalesce(JSONExtractString(raw_json, 'query'), '') AS query, + coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version, + coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side, + + -- 4. Connection / correlation + toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated, + toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives, + coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp, + coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp, + coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id, + + -- 5. 
IP/TCP + toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0) AS ip_meta_id, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0) AS ip_meta_total_length, + coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0) AS ip_meta_ttl, + coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options, + coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0) AS tcp_meta_window_size, + toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms, + + -- 6. TLS / JA3/JA4 + coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version, + coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni, + coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3, + coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash, + coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4, + + -- 7. Headers HTTP + coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent, + coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept, + coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding, + coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language, + coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id, + coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id, + coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for, + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua, + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile, + coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform, + coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest, + 
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, + coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site + FROM mabase_prod.http_logs_raw; users: - name: data_writer