docs: update ClickHouse schema (http_logs_raw + http_logs)
- README.md: documenter les deux tables (raw + enrichie) - architecture.yml: décrire le schema complet avec colonnes matérialisées - Table http_logs_raw: ingestion JSON brut (colonne raw_json unique) - Table http_logs: extraction des champs via DEFAULT JSONExtract* Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
85
README.md
85
README.md
@ -194,18 +194,81 @@ Tous les champs des sources A et B sont fusionnés au même niveau. Les champs d
|
||||
|
||||
## Schema ClickHouse
|
||||
|
||||
Le service utilise deux tables ClickHouse :
|
||||
|
||||
### Table brute (`http_logs_raw`)
|
||||
|
||||
Table d'ingestion qui stocke le log corrélé brut au format JSON :
|
||||
|
||||
```sql
|
||||
-- Flat correlated-log table: connection tuple as typed columns, everything
-- else in a single JSON column.
-- NOTE(review): this statement sits under the `http_logs_raw` heading but
-- defines a different table; presumably it is the previous schema kept by the
-- diff view -- confirm before relying on it.
CREATE TABLE correlated_logs_http_network
(
    timestamp   DateTime64(9),  -- precision 9 = nanoseconds
    src_ip      String,
    src_port    UInt32,
    dst_ip      String,
    dst_port    UInt32,
    correlated  UInt8,          -- presumably a 0/1 flag; verify against the writer
    orphan_side String,
    fields      JSON            -- remaining log fields
)
ENGINE = MergeTree
ORDER BY (timestamp, src_ip, src_port);
|
||||
-- Ingestion table: each row holds one correlated log as an unparsed JSON
-- string; no field is extracted at this stage.
CREATE TABLE http_logs_raw (
    raw_json String  -- full serialized log document
)
ENGINE = MergeTree()
ORDER BY tuple();    -- empty sorting key: rows are not sorted on any column
|
||||
```
|
||||
|
||||
**Format d'insertion :** Le service envoie chaque log corrélé sérialisé en JSON dans la colonne `raw_json` :
|
||||
|
||||
```sql
|
||||
-- Example insert: the whole correlated log is passed as one escaped JSON
-- string in `raw_json` (the trailing `...` stands for the elided fields).
INSERT INTO http_logs_raw (raw_json)
FORMAT JSONEachRow
{"raw_json":"{\"timestamp\":\"2024-01-01T12:00:00Z\",\"src_ip\":\"192.168.1.1\",\"correlated\":true,...}"}
|
||||
```
|
||||
|
||||
### Table enrichie (`http_logs`)
|
||||
|
||||
Table dont les colonnes sont extraites de `raw_json` via des expressions `DEFAULT JSONExtract*` pour l'analyse :
|
||||
|
||||
```sql
|
||||
-- Enriched table: every typed column is derived from `raw_json` by its
-- DEFAULT expression, so an insert only has to supply `raw_json`.
--
-- Parsing functions use their non-throwing variants (`...OrZero`,
-- `...OrDefault`): an exception raised while evaluating a DEFAULT expression
-- fails the insert, and a log with a missing `time` or address should still
-- be stored.
CREATE TABLE http_logs
(
    raw_json String,

    -- base fields
    time_str      String   DEFAULT JSONExtractString(raw_json, 'time'),
    timestamp_str String   DEFAULT JSONExtractString(raw_json, 'timestamp'),
    -- A missing or unparsable `time` yields the zero DateTime (1970-01-01)
    -- instead of raising; such rows land in the 197001 partition.
    time          DateTime DEFAULT parseDateTimeBestEffortOrZero(time_str),
    log_date      Date     DEFAULT toDate(time),

    -- An empty or invalid address becomes 0.0.0.0 instead of raising.
    src_ip   IPv4   DEFAULT toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')),
    src_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'src_port')),
    dst_ip   IPv4   DEFAULT toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')),
    dst_port UInt16 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'dst_port')),

    correlated   UInt8                  DEFAULT JSONExtractBool(raw_json, 'correlated'),
    keepalives   UInt16                 DEFAULT toUInt16(JSONExtractUInt(raw_json, 'keepalives')),
    method       LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'method'),
    scheme       LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'scheme'),
    host         LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'host'),
    path         String                 DEFAULT JSONExtractString(raw_json, 'path'),
    query        String                 DEFAULT JSONExtractString(raw_json, 'query'),
    http_version LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'http_version'),
    orphan_side  LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'orphan_side'),

    -- fields that are almost always present
    a_timestamp           UInt64                 DEFAULT JSONExtractUInt(raw_json, 'a_timestamp'),
    b_timestamp           UInt64                 DEFAULT JSONExtractUInt(raw_json, 'b_timestamp'),
    conn_id               String                 DEFAULT JSONExtractString(raw_json, 'conn_id'),
    ip_meta_df            UInt8                  DEFAULT JSONExtractBool(raw_json, 'ip_meta_df'),
    ip_meta_id            UInt32                 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_id'),
    ip_meta_total_length  UInt32                 DEFAULT JSONExtractUInt(raw_json, 'ip_meta_total_length'),
    ip_meta_ttl           UInt8                  DEFAULT JSONExtractUInt(raw_json, 'ip_meta_ttl'),
    tcp_meta_options      LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tcp_meta_options'),
    tcp_meta_window_size  UInt32                 DEFAULT JSONExtractUInt(raw_json, 'tcp_meta_window_size'),
    syn_to_clienthello_ms Int32                  DEFAULT toInt32(JSONExtractInt(raw_json, 'syn_to_clienthello_ms')),
    tls_version           LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_version'),
    tls_sni               LowCardinality(String) DEFAULT JSONExtractString(raw_json, 'tls_sni'),
    ja3                   String                 DEFAULT JSONExtractString(raw_json, 'ja3'),
    ja3_hash              String                 DEFAULT JSONExtractString(raw_json, 'ja3_hash'),
    ja4                   String                 DEFAULT JSONExtractString(raw_json, 'ja4'),

    -- Catch-all for fields without a dedicated column (dynamic headers etc.).
    -- The default is the whole `raw_json` document, so it also carries the
    -- fields already extracted above.
    extra JSON DEFAULT raw_json
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(log_date)
ORDER BY (log_date, dst_ip, src_ip, time);
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Reference in New Issue
Block a user