docs: update ClickHouse schema with new table structure
Some checks failed
Build and Test / test (push) Has been cancelled
Build and Test / build (push) Has been cancelled
Build and Test / docker (push) Has been cancelled

- http_logs_raw: partition by toDate(ingest_time), order by ingest_time
- http_logs: explicit columns (no DEFAULT), extracted by MV
- mv_http_logs: full SELECT with JSONExtract* + coalesce for all fields
- Add 13 HTTP header fields (User-Agent, Accept, Sec-CH-UA, etc.)
- New ORDER BY: (time, src_ip, dst_ip, ja4)
- architecture.yml: match new schema with MV query details

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
toto
2026-03-03 14:38:45 +01:00
parent 60cd8d87e4
commit eed376d749
2 changed files with 215 additions and 102 deletions

153
README.md
View File

@ -207,63 +207,72 @@ DROP TABLE IF EXISTS mabase_prod.http_logs_raw;
-- Landing table: one row per ingested JSON log line, stored verbatim.
-- The materialized view mv_http_logs reads from this table and writes the
-- parsed columns into http_logs.
CREATE TABLE mabase_prod.http_logs_raw
(
    -- Unparsed JSON document exactly as received.
    raw_json    String,
    -- Server-side arrival time; filled automatically when the writer omits it.
    ingest_time DateTime DEFAULT now()
)
ENGINE = MergeTree
-- One partition per ingestion day, sorted by arrival time.
PARTITION BY toDate(ingest_time)
ORDER BY ingest_time
-- NOTE(review): the 1-day retention is carried over from the previous
-- definition of this table. With daily partitions and ttl_only_drop_parts = 1
-- expired data is removed by dropping whole parts. Confirm that raw rows are
-- still meant to expire after one day; drop the two clauses below otherwise.
TTL ingest_time + INTERVAL 1 DAY
SETTINGS ttl_only_drop_parts = 1;
-- 3. Table parsée
DROP TABLE IF EXISTS mabase_prod.http_logs;
-- Parsed HTTP log table, populated by the materialized view mv_http_logs.
-- Every column is written explicitly by the view's SELECT, so no column
-- depends on raw_json (which is not stored here).
CREATE TABLE mabase_prod.http_logs
(
    -- Time
    time                      DateTime,
    -- Falls back to the day of `time` when a writer does not supply it.
    log_date                  Date DEFAULT toDate(time),

    -- Network L3/L4
    src_ip                    IPv4,
    src_port                  UInt16,
    dst_ip                    IPv4,
    dst_port                  UInt16,

    -- Basic HTTP fields
    method                    LowCardinality(String),
    scheme                    LowCardinality(String),
    host                      LowCardinality(String),
    path                      String,
    query                     String,
    http_version              LowCardinality(String),
    orphan_side               LowCardinality(String),

    -- Connection / correlation
    correlated                UInt8,
    keepalives                UInt16,
    a_timestamp               UInt64,
    b_timestamp               UInt64,
    conn_id                   String,

    -- IP / TCP metadata
    ip_meta_df                UInt8,
    ip_meta_id                UInt32,
    ip_meta_total_length      UInt32,
    ip_meta_ttl               UInt8,
    tcp_meta_options          LowCardinality(String),
    tcp_meta_window_size      UInt32,
    syn_to_clienthello_ms     Int32,

    -- TLS / JA3 / JA4
    tls_version               LowCardinality(String),
    tls_sni                   LowCardinality(String),
    ja3                       String,
    ja3_hash                  String,
    ja4                       String,

    -- HTTP request headers
    header_user_agent         String,
    header_accept             String,
    header_accept_encoding    String,
    header_accept_language    String,
    header_x_request_id       String,
    header_x_trace_id         String,
    header_x_forwarded_for    String,
    header_sec_ch_ua          String,
    header_sec_ch_ua_mobile   String,
    header_sec_ch_ua_platform String,
    header_sec_fetch_dest     String,
    header_sec_fetch_mode     String,
    header_sec_fetch_site     String
)
ENGINE = MergeTree
-- One partition per calendar day of the log record.
PARTITION BY log_date
ORDER BY (time, src_ip, dst_ip, ja4);
-- 4. Vue matérialisée (RAW → logs)
DROP VIEW IF EXISTS mabase_prod.mv_http_logs;
@ -271,7 +280,67 @@ DROP VIEW IF EXISTS mabase_prod.mv_http_logs;
-- Materialized view: parses every row inserted into http_logs_raw and writes
-- the extracted columns into http_logs.
--
-- JSONExtractString / JSONExtractUInt / JSONExtractInt / JSONExtractBool
-- return an empty string or 0 (never NULL) when a key is missing or has the
-- wrong type, so wrapping them in coalesce() has no effect. Missing values are
-- therefore handled by the conversion functions themselves:
--   * parseDateTimeBestEffortOrZero -> 1970-01-01 00:00:00 on missing/bad time
--   * toIPv4OrDefault               -> 0.0.0.0 on missing/bad address
-- This keeps a single malformed record from failing the whole insert.
CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs
TO mabase_prod.http_logs
AS
SELECT
    -- 1. Time
    parseDateTimeBestEffortOrZero(JSONExtractString(raw_json, 'time')) AS time,
    toDate(time) AS log_date,

    -- 2. Network L3/L4
    toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip,
    toUInt16(JSONExtractUInt(raw_json, 'src_port')) AS src_port,
    toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip,
    toUInt16(JSONExtractUInt(raw_json, 'dst_port')) AS dst_port,

    -- 3. Basic HTTP fields
    JSONExtractString(raw_json, 'method') AS method,
    JSONExtractString(raw_json, 'scheme') AS scheme,
    JSONExtractString(raw_json, 'host') AS host,
    JSONExtractString(raw_json, 'path') AS path,
    JSONExtractString(raw_json, 'query') AS query,
    JSONExtractString(raw_json, 'http_version') AS http_version,
    JSONExtractString(raw_json, 'orphan_side') AS orphan_side,

    -- 4. Connection / correlation
    toUInt8(JSONExtractBool(raw_json, 'correlated')) AS correlated,
    toUInt16(JSONExtractUInt(raw_json, 'keepalives')) AS keepalives,
    JSONExtractUInt(raw_json, 'a_timestamp') AS a_timestamp,
    JSONExtractUInt(raw_json, 'b_timestamp') AS b_timestamp,
    JSONExtractString(raw_json, 'conn_id') AS conn_id,

    -- 5. IP / TCP metadata
    toUInt8(JSONExtractBool(raw_json, 'ip_meta_df')) AS ip_meta_df,
    JSONExtractUInt(raw_json, 'ip_meta_id') AS ip_meta_id,
    JSONExtractUInt(raw_json, 'ip_meta_total_length') AS ip_meta_total_length,
    JSONExtractUInt(raw_json, 'ip_meta_ttl') AS ip_meta_ttl,
    JSONExtractString(raw_json, 'tcp_meta_options') AS tcp_meta_options,
    JSONExtractUInt(raw_json, 'tcp_meta_window_size') AS tcp_meta_window_size,
    toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms')) AS syn_to_clienthello_ms,

    -- 6. TLS / JA3 / JA4
    JSONExtractString(raw_json, 'tls_version') AS tls_version,
    JSONExtractString(raw_json, 'tls_sni') AS tls_sni,
    JSONExtractString(raw_json, 'ja3') AS ja3,
    JSONExtractString(raw_json, 'ja3_hash') AS ja3_hash,
    JSONExtractString(raw_json, 'ja4') AS ja4,

    -- 7. HTTP headers (JSON keys keep the original header casing)
    JSONExtractString(raw_json, 'header_User-Agent') AS header_user_agent,
    JSONExtractString(raw_json, 'header_Accept') AS header_accept,
    JSONExtractString(raw_json, 'header_Accept-Encoding') AS header_accept_encoding,
    JSONExtractString(raw_json, 'header_Accept-Language') AS header_accept_language,
    JSONExtractString(raw_json, 'header_X-Request-Id') AS header_x_request_id,
    JSONExtractString(raw_json, 'header_X-Trace-Id') AS header_x_trace_id,
    JSONExtractString(raw_json, 'header_X-Forwarded-For') AS header_x_forwarded_for,
    JSONExtractString(raw_json, 'header_Sec-CH-UA') AS header_sec_ch_ua,
    JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile') AS header_sec_ch_ua_mobile,
    JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform') AS header_sec_ch_ua_platform,
    JSONExtractString(raw_json, 'header_Sec-Fetch-Dest') AS header_sec_fetch_dest,
    JSONExtractString(raw_json, 'header_Sec-Fetch-Mode') AS header_sec_fetch_mode,
    JSONExtractString(raw_json, 'header_Sec-Fetch-Site') AS header_sec_fetch_site
FROM mabase_prod.http_logs_raw;
```