docs: update ClickHouse schema with new table structure
Some checks failed
Build and Test / test (push) Has been cancelled
Build and Test / build (push) Has been cancelled
Build and Test / docker (push) Has been cancelled

- http_logs_raw: partition by toDate(ingest_time), order by ingest_time
- http_logs: explicit columns (no DEFAULT except log_date), extracted by MV
- mv_http_logs: full SELECT with JSONExtract* + coalesce for all fields
- Add 13 HTTP header fields (User-Agent, Accept, Sec-CH-UA, etc.)
- New ORDER BY: (time, src_ip, dst_ip, ja4)
- architecture.yml: match new schema with MV query details

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
toto
2026-03-03 14:38:45 +01:00
parent 60cd8d87e4
commit eed376d749
2 changed files with 215 additions and 102 deletions

151
README.md
View File

@ -211,59 +211,68 @@ CREATE TABLE mabase_prod.http_logs_raw
ingest_time DateTime DEFAULT now() ingest_time DateTime DEFAULT now()
) )
ENGINE = MergeTree ENGINE = MergeTree
ORDER BY tuple() PARTITION BY toDate(ingest_time)
TTL ingest_time + INTERVAL 1 DAY ORDER BY ingest_time;
SETTINGS ttl_only_drop_parts = 1;
-- 3. Table parsée -- 3. Table parsée
DROP TABLE IF EXISTS mabase_prod.http_logs; DROP TABLE IF EXISTS mabase_prod.http_logs;
CREATE TABLE mabase_prod.http_logs CREATE TABLE mabase_prod.http_logs
( (
raw_json String, time DateTime,
-- champs de base
time_str String DEFAULT JSONExtractString(raw_json, 'time'),
timestamp_str String DEFAULT JSONExtractString(raw_json, 'timestamp'),
time DateTime DEFAULT parseDateTimeBestEffort(time_str),
log_date Date DEFAULT toDate(time), log_date Date DEFAULT toDate(time),
src_ip IPv4 DEFAULT toIPv4(JSONExtractString(raw_json, 'src_ip')), src_ip IPv4,
src_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'src_port')), src_port UInt16,
dst_ip IPv4 DEFAULT toIPv4(JSONExtractString(raw_json, 'dst_ip')), dst_ip IPv4,
dst_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'dst_port')), dst_port UInt16,
correlated UInt8 DEFAULT JSONExtractBool(raw_json, 'correlated'), method LowCardinality(String),
keepalives UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'keepalives')), scheme LowCardinality(String),
method LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'method'), host LowCardinality(String),
scheme LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'scheme'), path String,
host LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'host'), query String,
path String DEFAULT JSONExtractString(raw_json, 'path'), http_version LowCardinality(String),
query String DEFAULT JSONExtractString(raw_json, 'query'), orphan_side LowCardinality(String),
http_version LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'http_version'),
orphan_side LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'orphan_side'),
a_timestamp UInt64 DEFAULT JSONExtractUInt(raw_json, 'a_timestamp'), correlated UInt8,
b_timestamp UInt64 DEFAULT JSONExtractUInt(raw_json, 'b_timestamp'), keepalives UInt16,
conn_id String DEFAULT JSONExtractString(raw_json, 'conn_id'), a_timestamp UInt64,
ip_meta_df UInt8 DEFAULT JSONExtractBool(raw_json, 'ip_meta_df'), b_timestamp UInt64,
ip_meta_id UInt32 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_id'), conn_id String,
ip_meta_total_length UInt32 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_total_length'),
ip_meta_ttl UInt8 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_ttl'),
tcp_meta_options LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tcp_meta_options'),
tcp_meta_window_size UInt32 DEFAULT JSONExtractUInt(raw_json, 'tcp_meta_window_size'),
syn_to_clienthello_ms Int32 DEFAULT toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms')),
tls_version LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_version'),
tls_sni LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_sni'),
ja3 String DEFAULT JSONExtractString(raw_json, 'ja3'),
ja3_hash String DEFAULT JSONExtractString(raw_json, 'ja3_hash'),
ja4 String DEFAULT JSONExtractString(raw_json, 'ja4'),
extra JSON DEFAULT raw_json ip_meta_df UInt8,
ip_meta_id UInt32,
ip_meta_total_length UInt32,
ip_meta_ttl UInt8,
tcp_meta_options LowCardinality(String),
tcp_meta_window_size UInt32,
syn_to_clienthello_ms Int32,
tls_version LowCardinality(String),
tls_sni LowCardinality(String),
ja3 String,
ja3_hash String,
ja4 String,
header_user_agent String,
header_accept String,
header_accept_encoding String,
header_accept_language String,
header_x_request_id String,
header_x_trace_id String,
header_x_forwarded_for String,
header_sec_ch_ua String,
header_sec_ch_ua_mobile String,
header_sec_ch_ua_platform String,
header_sec_fetch_dest String,
header_sec_fetch_mode String,
header_sec_fetch_site String
) )
ENGINE = MergeTree ENGINE = MergeTree
PARTITION BY toYYYYMM(log_date) PARTITION BY log_date
ORDER BY (log_date, dst_ip, src_ip, time); ORDER BY (time, src_ip, dst_ip, ja4);
-- 4. Vue matérialisée (RAW → logs) -- 4. Vue matérialisée (RAW → logs)
DROP VIEW IF EXISTS mabase_prod.mv_http_logs; DROP VIEW IF EXISTS mabase_prod.mv_http_logs;
@ -271,7 +280,67 @@ DROP VIEW IF EXISTS mabase_prod.mv_http_logs;
CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs
TO mabase_prod.http_logs TO mabase_prod.http_logs
AS AS
SELECT raw_json SELECT
-- Projects every http_logs column out of the raw_json string of http_logs_raw.
-- NOTE: JSONExtractString/UInt/Int/Bool return '' / 0 (never NULL) when a key is
-- missing, so coalesce() around them never selects its fallback. Fields whose
-- converter throws on '' (time, src_ip, dst_ip) therefore use the *OrZero /
-- *OrDefault variants; the other coalesce() wrappers are harmless no-ops.
-- 1. Time
-- Missing or unparsable 'time' yields 1970-01-01 00:00:00 instead of failing the insert.
parseDateTimeBestEffortOrZero(JSONExtractString(raw_json, 'time')) AS time,
toDate(time) AS log_date,
-- 2. Network L3/L4
-- Missing or malformed addresses yield 0.0.0.0 instead of raising.
toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
-- 3. Basic HTTP
coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
-- 4. Connection / correlation
toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,
-- 5. IP/TCP
toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0) AS ip_meta_id,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0) AS ip_meta_total_length,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0) AS ip_meta_ttl,
coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0) AS tcp_meta_window_size,
toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,
-- 6. TLS / JA3/JA4
coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,
-- 7. HTTP headers
coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site
FROM mabase_prod.http_logs_raw; FROM mabase_prod.http_logs_raw;
``` ```

View File

@ -404,19 +404,17 @@ clickhouse_schema:
database: mabase_prod database: mabase_prod
description: > description: >
La table ClickHouse est gérée en dehors du service. Deux tables sont utilisées : La table ClickHouse est gérée en dehors du service. Deux tables sont utilisées :
http_logs_raw (table d'ingestion avec TTL 1 jour) et http_logs (table enrichie http_logs_raw (table d'ingestion partitionnée par jour) et http_logs (table parsée
avec extraction des champs via des colonnes matérialisées). Une vue matérialisée avec extraction explicite des champs). Une vue matérialisée transfère automatiquement
transfère automatiquement les données de RAW vers parsée. les données de RAW vers parsée.
tables: tables:
- name: http_logs_raw - name: http_logs_raw
description: > description: >
Table d'ingestion brute avec TTL. Une seule colonne raw_json contient le log Table d'ingestion brute. Une seule colonne raw_json contient le log corrélé
corrélé complet sérialisé en JSON. TTL de 1 jour pour limiter le stockage. complet sérialisé en JSON. Partitionnée par jour (toDate(ingest_time)) ; le schéma ne définit plus de TTL.
engine: MergeTree engine: MergeTree
order_by: tuple() partition_by: toDate(ingest_time)
ttl: ingest_time + INTERVAL 1 DAY order_by: ingest_time
settings:
ttl_only_drop_parts: 1
columns: columns:
- name: raw_json - name: raw_json
type: String type: String
@ -429,122 +427,168 @@ clickhouse_schema:
- name: http_logs - name: http_logs
description: > description: >
Table enrichie avec extraction des champs du JSON brut via des expressions DEFAULT. Table parsée avec tous les champs extraits explicitement par la vue matérialisée.
Partitionnée par mois, optimisée pour les requêtes analytiques. Partitionnée par log_date, optimisée pour les requêtes analytiques.
engine: MergeTree engine: MergeTree
partition_by: toYYYYMM(log_date) partition_by: log_date
order_by: (log_date, dst_ip, src_ip, time) order_by: (time, src_ip, dst_ip, ja4)
columns: columns:
- name: raw_json
type: String
- name: time_str
type: String
default: JSONExtractString(raw_json, 'time')
- name: timestamp_str
type: String
default: JSONExtractString(raw_json, 'timestamp')
- name: time - name: time
type: DateTime type: DateTime
default: parseDateTimeBestEffort(time_str)
- name: log_date - name: log_date
type: Date type: Date
default: toDate(time) default: toDate(time)
- name: src_ip - name: src_ip
type: IPv4 type: IPv4
default: toIPv4(JSONExtractString(raw_json, 'src_ip'))
- name: src_port - name: src_port
type: UInt16 type: UInt16
default: toUInt16(JSONExtractUInt(raw_json, 'src_port'))
- name: dst_ip - name: dst_ip
type: IPv4 type: IPv4
default: toIPv4(JSONExtractString(raw_json, 'dst_ip'))
- name: dst_port - name: dst_port
type: UInt16 type: UInt16
default: toUInt16(JSONExtractUInt(raw_json, 'dst_port'))
- name: correlated
type: UInt8
default: JSONExtractBool(raw_json, 'correlated')
- name: keepalives
type: UInt16
default: toUInt16(JSONExtractUInt(raw_json, 'keepalives'))
- name: method - name: method
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'method')
- name: scheme - name: scheme
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'scheme')
- name: host - name: host
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'host')
- name: path - name: path
type: String type: String
default: JSONExtractString(raw_json, 'path')
- name: query - name: query
type: String type: String
default: JSONExtractString(raw_json, 'query')
- name: http_version - name: http_version
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'http_version')
- name: orphan_side - name: orphan_side
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'orphan_side') - name: correlated
type: UInt8
- name: keepalives
type: UInt16
- name: a_timestamp - name: a_timestamp
type: UInt64 type: UInt64
default: JSONExtractUInt(raw_json, 'a_timestamp')
- name: b_timestamp - name: b_timestamp
type: UInt64 type: UInt64
default: JSONExtractUInt(raw_json, 'b_timestamp')
- name: conn_id - name: conn_id
type: String type: String
default: JSONExtractString(raw_json, 'conn_id')
- name: ip_meta_df - name: ip_meta_df
type: UInt8 type: UInt8
default: JSONExtractBool(raw_json, 'ip_meta_df')
- name: ip_meta_id - name: ip_meta_id
type: UInt32 type: UInt32
default: JSONExtractUInt(raw_json, 'ip_meta_id')
- name: ip_meta_total_length - name: ip_meta_total_length
type: UInt32 type: UInt32
default: JSONExtractUInt(raw_json, 'ip_meta_total_length')
- name: ip_meta_ttl - name: ip_meta_ttl
type: UInt8 type: UInt8
default: JSONExtractUInt(raw_json, 'ip_meta_ttl')
- name: tcp_meta_options - name: tcp_meta_options
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'tcp_meta_options')
- name: tcp_meta_window_size - name: tcp_meta_window_size
type: UInt32 type: UInt32
default: JSONExtractUInt(raw_json, 'tcp_meta_window_size')
- name: syn_to_clienthello_ms - name: syn_to_clienthello_ms
type: Int32 type: Int32
default: toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'))
- name: tls_version - name: tls_version
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'tls_version')
- name: tls_sni - name: tls_sni
type: LowCardinality(String) type: LowCardinality(String)
default: JSONExtractString(raw_json, 'tls_sni')
- name: ja3 - name: ja3
type: String type: String
default: JSONExtractString(raw_json, 'ja3')
- name: ja3_hash - name: ja3_hash
type: String type: String
default: JSONExtractString(raw_json, 'ja3_hash')
- name: ja4 - name: ja4
type: String type: String
default: JSONExtractString(raw_json, 'ja4') - name: header_user_agent
- name: extra type: String
type: JSON - name: header_accept
default: raw_json type: String
- name: header_accept_encoding
type: String
- name: header_accept_language
type: String
- name: header_x_request_id
type: String
- name: header_x_trace_id
type: String
- name: header_x_forwarded_for
type: String
- name: header_sec_ch_ua
type: String
- name: header_sec_ch_ua_mobile
type: String
- name: header_sec_ch_ua_platform
type: String
- name: header_sec_fetch_dest
type: String
- name: header_sec_fetch_mode
type: String
- name: header_sec_fetch_site
type: String
- name: mv_http_logs - name: mv_http_logs
type: materialized_view type: materialized_view
description: > description: >
Vue matérialisée qui transfère automatiquement les données de http_logs_raw Vue matérialisée qui transfère les données de http_logs_raw vers http_logs
vers http_logs lors de chaque INSERT. en extrayant tous les champs du JSON via JSONExtract* et coalesce pour les
valeurs par défaut.
target: mabase_prod.http_logs target: mabase_prod.http_logs
# JSONExtract* return '' / 0 (never NULL) for missing keys, so time and IP fields
# use the *OrZero / *OrDefault converters instead of a coalesce() fallback that
# would never be selected.
query: | query: |
SELECT raw_json FROM mabase_prod.http_logs_raw SELECT
-- 1. Temps
parseDateTimeBestEffortOrZero(JSONExtractString(raw_json, 'time')) AS time,
toDate(time) AS log_date,
-- 2. Réseau L3/L4
toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
-- 3. HTTP de base
coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
-- 4. Connexion / corrélation
toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,
-- 5. IP/TCP
toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0) AS ip_meta_id,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0) AS ip_meta_total_length,
coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0) AS ip_meta_ttl,
coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0) AS tcp_meta_window_size,
toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,
-- 6. TLS / JA3/JA4
coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,
-- 7. Headers HTTP
coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site
FROM mabase_prod.http_logs_raw;
users: users:
- name: data_writer - name: data_writer