schema sql

This commit is contained in:
toto
2026-03-12 11:21:11 +01:00
parent 20ebe7240e
commit 79d86eba23
2 changed files with 127 additions and 136 deletions

View File

@ -8,3 +8,14 @@ CREATE TABLE mabase_prod.ref_bot_networks (
last_update DateTime
) ENGINE = ReplacingMergeTree(last_update)
ORDER BY (network, bot_name);
-- Table reading the bot IP list from the CSV file 'bot_ip.csv'.
-- Exposes a single String column, ip.
CREATE TABLE mabase_prod.bot_ip
(
    ip String
)
ENGINE = File(CSV, 'bot_ip.csv');
-- Table reading the JA4 signature list from the CSV file 'bot_ja4.csv'.
-- Exposes a single String column, ja4.
CREATE TABLE mabase_prod.bot_ja4
(
    ja4 String
)
ENGINE = File(CSV, 'bot_ja4.csv');

View File

@ -1,28 +1,21 @@
-- ============================================================================
-- SCRIPT DE DÉPLOIEMENT DES VUES DE DÉTECTION DE BOTS & SPAM (CLICKHOUSE)
-- SCRIPT DE DÉPLOIEMENT DES VUES DE DÉTECTION (CLICKHOUSE) - MABASE_PROD
-- ============================================================================
-- ----------------------------------------------------------------------------
-- 1. STRICT CLEANUP
-- Removes the objects listed below when they exist (IF EXISTS makes each
-- statement a no-op otherwise). The detection views are dropped first, then
-- each materialized view is dropped before the aggregation table it feeds.
-- ----------------------------------------------------------------------------
DROP TABLE IF EXISTS mabase_prod.ml_detected_anomalies;
DROP VIEW IF EXISTS mabase_prod.view_ai_features_1h;
DROP VIEW IF EXISTS mabase_prod.view_host_ip_ja4_rotation;
DROP VIEW IF EXISTS mabase_prod.view_host_ja4_anomalies;
DROP VIEW IF EXISTS mabase_prod.view_form_bruteforce_detected;
DROP VIEW IF EXISTS mabase_prod.view_alpn_mismatch_detected;
DROP VIEW IF EXISTS mabase_prod.view_tcp_spoofing_detected;
-- Materialized view, then its target table (host / IP / JA4 aggregation).
DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h;
DROP TABLE IF EXISTS mabase_prod.agg_host_ip_ja4_1h;
-- Materialized view, then its target table (header fingerprint aggregation).
DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h;
DROP TABLE IF EXISTS mabase_prod.agg_header_fingerprint_1h;
-- ----------------------------------------------------------------------------
-- 2. TABLES D'AGRÉGATION ET VUES MATÉRIALISÉES (TEMPS RÉEL)
-- 1. TABLES D'AGRÉGATION ET VUES MATÉRIALISÉES
-- ----------------------------------------------------------------------------
CREATE TABLE mabase_prod.agg_host_ip_ja4_1h (
@ -44,41 +37,52 @@ CREATE TABLE mabase_prod.agg_host_ip_ja4_1h (
tcp_mss SimpleAggregateFunction(any, UInt32),
tcp_ttl SimpleAggregateFunction(any, UInt32),
http_version SimpleAggregateFunction(any, String),
first_ua SimpleAggregateFunction(any, String)
tls_alpn SimpleAggregateFunction(any, String),
tls_sni SimpleAggregateFunction(any, String),
first_ua SimpleAggregateFunction(any, String),
correlated SimpleAggregateFunction(max, UInt8),
unique_src_ports AggregateFunction(uniq, UInt16),
max_keepalives SimpleAggregateFunction(max, UInt32),
orphan_count SimpleAggregateFunction(sum, UInt64)
) ENGINE = AggregatingMergeTree()
ORDER BY (window_start, src_ip, ja4, host)
TTL window_start + INTERVAL 7 DAY;
CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h
TO mabase_prod.agg_host_ip_ja4_1h AS
CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h TO mabase_prod.agg_host_ip_ja4_1h AS
SELECT
toStartOfHour(time) AS window_start,
src_ip,
ja4,
host,
min(time) AS first_seen,
max(time) AS last_seen,
toStartOfHour(src.time) AS window_start,
src.src_ip, src.ja4, src.host,
min(src.time) AS first_seen, max(src.time) AS last_seen,
count() AS hits,
sum(IF(method = 'POST', 1, 0)) AS count_post,
uniqState(path) AS uniq_paths,
uniqState(query) AS uniq_query_params,
any(src_country_code) AS src_country_code,
any(toString(cityHash64(concat(toString(tcp_meta_window_size), toString(tcp_meta_mss), toString(tcp_meta_window_scale), tcp_meta_options)))) AS tcp_fingerprint,
varPopState(toFloat64(syn_to_clienthello_ms)) AS tcp_jitter_variance,
any(tcp_meta_window_size) AS tcp_window_size,
any(tcp_meta_window_scale) AS tcp_window_scale,
any(tcp_meta_mss) AS tcp_mss,
any(ip_meta_ttl) AS tcp_ttl,
any(http_version) AS http_version,
any(header_user_agent) AS first_ua
FROM mabase_prod.http_logs
GROUP BY window_start, src_ip, ja4, host;
sum(IF(src.method = 'POST', 1, 0)) AS count_post,
uniqState(src.path) AS uniq_paths,
uniqState(src.query) AS uniq_query_params,
any(src.src_country_code) AS src_country_code,
any(toString(cityHash64(concat(toString(src.tcp_meta_window_size), toString(src.tcp_meta_mss), toString(src.tcp_meta_window_scale), src.tcp_meta_options)))) AS tcp_fingerprint,
varPopState(toFloat64(src.syn_to_clienthello_ms)) AS tcp_jitter_variance,
any(src.tcp_meta_window_size) AS tcp_window_size,
any(src.tcp_meta_window_scale) AS tcp_window_scale,
any(src.tcp_meta_mss) AS tcp_mss,
any(src.ip_meta_ttl) AS tcp_ttl,
any(src.http_version) AS http_version,
any(src.tls_alpn) AS tls_alpn,
any(src.tls_sni) AS tls_sni,
any(src.header_user_agent) AS first_ua,
max(toUInt8(src.correlated)) AS correlated,
uniqState(toUInt16(src.src_port)) AS unique_src_ports,
max(toUInt32(src.keepalives)) AS max_keepalives,
sum(IF(src.orphan_side = 'A' OR toUInt8(src.correlated) = 0, 1, 0)) AS orphan_count
FROM mabase_prod.http_logs AS src
GROUP BY window_start, src.src_ip, src.ja4, src.host;
CREATE TABLE mabase_prod.agg_header_fingerprint_1h (
window_start DateTime,
src_ip String,
header_order_hash SimpleAggregateFunction(any, String),
header_count SimpleAggregateFunction(max, UInt16),
has_accept_language SimpleAggregateFunction(max, UInt8),
has_cookie SimpleAggregateFunction(max, UInt8),
has_referer SimpleAggregateFunction(max, UInt8), -- NOUVEAU (JA4H_a)
modern_browser_score SimpleAggregateFunction(max, UInt8),
sec_fetch_mode SimpleAggregateFunction(any, String),
sec_fetch_dest SimpleAggregateFunction(any, String),
@ -87,22 +91,24 @@ CREATE TABLE mabase_prod.agg_header_fingerprint_1h (
ORDER BY (window_start, src_ip)
TTL window_start + INTERVAL 7 DAY;
CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h
TO mabase_prod.agg_header_fingerprint_1h AS
CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h TO mabase_prod.agg_header_fingerprint_1h AS
SELECT
toStartOfHour(time) AS window_start,
src_ip,
any(toString(cityHash64(client_headers))) AS header_order_hash,
max(toUInt8(if(length(header_sec_ch_ua) > 0, 100, if(length(header_user_agent) > 0, 50, 0)))) AS modern_browser_score,
any(header_sec_fetch_mode) AS sec_fetch_mode,
any(header_sec_fetch_dest) AS sec_fetch_dest,
sum(IF(header_sec_fetch_site = 'none', 1, 0)) AS count_site_none
FROM mabase_prod.http_logs
GROUP BY window_start, src_ip;
toStartOfHour(src.time) AS window_start,
src.src_ip,
any(toString(cityHash64(src.client_headers))) AS header_order_hash,
max(toUInt16(length(src.client_headers) - length(replaceAll(src.client_headers, ',', '')) + 1)) AS header_count,
max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language,
max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie,
max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer, -- NOUVEAU (JA4H_a)
max(toUInt8(if(length(src.header_sec_ch_ua) > 0, 100, if(length(src.header_user_agent) > 0, 50, 0)))) AS modern_browser_score,
any(src.header_sec_fetch_mode) AS sec_fetch_mode,
any(src.header_sec_fetch_dest) AS sec_fetch_dest,
sum(IF(src.header_sec_fetch_site = 'none', 1, 0)) AS count_site_none
FROM mabase_prod.http_logs AS src
GROUP BY window_start, src.src_ip;
-- ----------------------------------------------------------------------------
-- 3. TABLE DE DESTINATION POUR LE MACHINE LEARNING
-- 2. TABLE DES ANOMALIES ÉLARGIE POUR GRAFANA
-- ----------------------------------------------------------------------------
CREATE TABLE mabase_prod.ml_detected_anomalies (
detected_at DateTime,
@ -110,52 +116,78 @@ CREATE TABLE mabase_prod.ml_detected_anomalies (
ja4 String,
host String,
anomaly_score Float32,
-- Dimensions Applicatives et Comportementales
hits UInt64,
hit_velocity Float32,
fuzzing_index Float32,
post_ratio Float32,
site_none_ratio Float32,
-- Dimensions Réseau / TCP
port_exhaustion_ratio Float32,
max_keepalives UInt32,
orphan_ratio Float32,
tcp_jitter_variance Float32,
tcp_shared_count UInt32,
true_window_size UInt64,
window_mss_ratio Float32,
-- Dimensions TLS / Contextuelles
alpn_http_mismatch UInt8,
is_alpn_missing UInt8,
sni_host_mismatch UInt8,
-- Dimensions JA4H (Headers)
header_count UInt16,
has_accept_language UInt8,
has_cookie UInt8,
has_referer UInt8,
modern_browser_score UInt8,
is_headless UInt8,
header_order_shared_count UInt32, -- NOUVEAU (JA4H_b)
reason String
) ENGINE = MergeTree()
ORDER BY (detected_at, src_ip, ja4)
TTL detected_at + INTERVAL 30 DAY;
-- ----------------------------------------------------------------------------
-- 4. VUE DE FEATURE ENGINEERING POUR L'ISOLATION FOREST (RÉSOLUE)
-- 3. VUE DE FEATURE ENGINEERING POUR L'IA (SUR 24H)
-- ----------------------------------------------------------------------------
-- Utilisation de sous-requêtes agrégées (GROUP BY explicite) avant la jointure
-- pour éviter les erreurs d'état et le produit cartésien.
CREATE VIEW mabase_prod.view_ai_features_1h AS
CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS
SELECT
a.src_ip,
a.ja4,
a.host,
a.hits,
a.uniq_paths,
a.uniq_query_params,
a.count_post,
-- Indicateur de Corrélation L4/L7
IF(length(a.ja4) > 0 AND length(a.tcp_fingerprint) > 0, 1, 0) AS correlated,
a.src_ip, a.ja4, a.host, a.hits, a.uniq_paths, a.uniq_query_params, a.count_post,
a.correlated AS correlated,
-- DIMENSIONS COMPORTEMENTALES
(a.count_post / (a.hits + 1)) AS post_ratio,
(a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index,
(a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity,
(a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio,
(a.orphan_count / (a.hits + 1)) AS orphan_ratio,
a.max_keepalives AS max_keepalives,
-- DIMENSIONS TCP / L4
COALESCE(a.tcp_jitter_variance, 0) AS tcp_jitter_variance,
count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
a.tcp_window_size * exp2(a.tcp_window_scale) AS true_window_size,
IF(a.tcp_mss > 0, a.tcp_window_size / a.tcp_mss, 0) AS window_mss_ratio,
-- DIMENSIONS TLS / L5 (Mismatch)
IF(substring(a.ja4, 10, 2) = 'h2' AND a.http_version!= '2', 1, 0) AS alpn_http_mismatch,
IF(substring(a.ja4, 10, 2) = '00', 1, 0) AS is_alpn_missing,
IF(a.tls_alpn = 'h2' AND a.http_version!= '2', 1, 0) AS alpn_http_mismatch,
IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
IF(length(a.tls_sni) > 0 AND a.tls_sni!= a.host, 1, 0) AS sni_host_mismatch,
-- DIMENSIONS HTTP / L7
COALESCE(h.header_count, 0) AS header_count,
COALESCE(h.has_accept_language, 0) AS has_accept_language,
COALESCE(h.has_cookie, 0) AS has_cookie,
COALESCE(h.has_referer, 0) AS has_referer,
COALESCE(h.modern_browser_score, 0) AS modern_browser_score,
IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest!= 'document', 1, 0) AS is_fake_navigation,
(h.count_site_none / (a.hits + 1)) AS site_none_ratio
(h.count_site_none / (a.hits + 1)) AS site_none_ratio,
-- JA4H_b : Regroupement par ordre de header pour détecter les botnets
count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count
FROM (
-- Consolidation des logs d'hôtes (Résolution du GROUP BY manquant)
SELECT
window_start, src_ip, ja4, host,
sum(hits) AS hits,
@ -169,83 +201,31 @@ FROM (
any(tcp_window_size) AS tcp_window_size,
any(tcp_window_scale) AS tcp_window_scale,
any(tcp_mss) AS tcp_mss,
any(http_version) AS http_version
any(http_version) AS http_version,
any(tls_alpn) AS tls_alpn,
any(tls_sni) AS tls_sni,
max(correlated) AS correlated,
uniqMerge(unique_src_ports) AS unique_src_ports,
max(max_keepalives) AS max_keepalives,
sum(orphan_count) AS orphan_count
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 2 HOUR)
WHERE window_start >= toStartOfHour(now() - INTERVAL 24 HOUR)
GROUP BY window_start, src_ip, ja4, host
) a
LEFT JOIN (
-- Consolidation des en-têtes
SELECT
window_start, src_ip,
any(header_order_hash) AS header_order_hash,
max(header_count) AS header_count,
max(has_accept_language) AS has_accept_language,
max(has_cookie) AS has_cookie,
max(has_referer) AS has_referer,
max(modern_browser_score) AS modern_browser_score,
any(sec_fetch_mode) AS sec_fetch_mode,
any(sec_fetch_dest) AS sec_fetch_dest,
sum(count_site_none) AS count_site_none
FROM mabase_prod.agg_header_fingerprint_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 2 HOUR)
WHERE window_start >= toStartOfHour(now() - INTERVAL 24 HOUR)
GROUP BY window_start, src_ip
) h
ON a.src_ip = h.src_ip AND a.window_start = h.window_start;
-- ----------------------------------------------------------------------------
-- 5. VUES DE DÉTECTION HEURISTIQUES STATIQUES (RÉSOLUES)
-- ----------------------------------------------------------------------------
-- JA4 rotation per source IP.
-- Looks at aggregation rows whose window_start is at or after the start of the
-- hour that contained "one hour ago", and keeps the source IPs that showed at
-- least 5 distinct JA4 values and more than 100 hits in total.
CREATE VIEW mabase_prod.view_host_ip_ja4_rotation AS
SELECT
    src_ip,
    uniqExact(ja4) AS distinct_ja4_count,
    sum(hits)      AS total_hits
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR)
GROUP BY src_ip
HAVING total_hits > 100
   AND distinct_ja4_count >= 5;
-- JA4 fingerprints shared across many clients.
-- Over the same recent window (start of the hour containing "one hour ago"
-- onwards), keeps each JA4 seen from at least 20 distinct source IPs and
-- against at least 3 distinct hosts; also reports the distinct country count.
CREATE VIEW mabase_prod.view_host_ja4_anomalies AS
SELECT
    ja4,
    uniqExact(src_ip)           AS unique_ips,
    uniqExact(src_country_code) AS unique_countries,
    uniqExact(host)             AS targeted_hosts
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR)
GROUP BY ja4
HAVING targeted_hosts >= 3
   AND unique_ips >= 20;
-- Possible form brute force per (src_ip, ja4, host).
-- The explicit GROUP BY lets the stored uniq states be merged with uniqMerge.
-- A group is kept when it has at least 10 distinct query strings and at
-- least 20 hits in the recent window.
CREATE VIEW mabase_prod.view_form_bruteforce_detected AS
SELECT
    src_ip,
    ja4,
    host,
    sum(hits)                    AS hits,
    uniqMerge(uniq_query_params) AS query_params_count
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR)
GROUP BY
    src_ip,
    ja4,
    host
HAVING hits >= 20
   AND query_params_count >= 10;
-- ALPN / HTTP version mismatch per (src_ip, ja4, host).
-- Only rows whose JA4 string carries 'h2' or 'h3' at characters 10-11 are
-- considered; a group is kept when the sampled http_version is '1.1' and the
-- group has at least 10 hits in the recent window.
CREATE VIEW mabase_prod.view_alpn_mismatch_detected AS
SELECT
    src_ip,
    ja4,
    host,
    sum(hits)         AS hits,
    any(http_version) AS http_version
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE substring(ja4, 10, 2) IN ('h2', 'h3')
  AND window_start >= toStartOfHour(now() - INTERVAL 1 HOUR)
GROUP BY
    src_ip,
    ja4,
    host
HAVING hits >= 10
   AND http_version = '1.1';
-- Possible TCP / User-Agent inconsistency per (src_ip, ja4).
-- Keeps groups whose sampled TTL is 64 or lower while the sampled User-Agent
-- contains 'Windows' or 'iPhone' (case-insensitive match via ILIKE).
CREATE VIEW mabase_prod.view_tcp_spoofing_detected AS
SELECT
    src_ip,
    ja4,
    any(tcp_ttl)         AS tcp_ttl,
    any(tcp_window_size) AS tcp_window_size,
    any(first_ua)        AS first_ua
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= toStartOfHour(now() - INTERVAL 1 HOUR)
GROUP BY
    src_ip,
    ja4
HAVING (first_ua ILIKE '%Windows%' OR first_ua ILIKE '%iPhone%')
   AND tcp_ttl <= 64;
-- Match header aggregates to host aggregates on the same source IP AND the
-- same hourly window. Comparing a.window_start with itself is always true and
-- would pair each host row with the header rows of every hour in the range.
ON a.src_ip = h.src_ip AND a.window_start = h.window_start;