feat: port v14 schema fixes, migration, MV verifier, thesis from ja4/
deploy_views.sql (v13 → v14): - CRITICAL: ml_detected_anomalies ORDER BY (src_ip) → (src_ip, ja4, host, model_name) ReplacingMergeTree was collapsing all detections to 1 row per IP on merge - Add PARTITION BY toDate + ttl_only_drop_parts on all 4 data tables - ml_all_scores TTL 3d → 7d; ml_detected_anomalies TTL 30d → 7d - agg_host_ip_ja4_1h + agg_header_fingerprint_1h: add partition + TTL 7d - view_ip_recurrence: add WHERE detected_at >= now() - 7 DAY (was full scan) - Remove dead views: summary/timeseries/threat_dist/variability - Add view_dashboard_entities (fixes HTTP 500 in clustering/incidents/fingerprints) - Add view_dashboard_user_agents (fixes HTTP 500 in fingerprints/metrics) - Add view_ai_features_24h (enables ENABLE_MULTIWINDOW in bot_detector) - Mark max_requests_per_sec as DEPRECATED (always 0) New files: - correlator/sql/migrations/01_ttl_adjustments.sql: ALTER TABLE migration - tests/integration/verify_mvs.py: MV pipeline verification assertions - docs/THESIS_HTTP_Traffic_Detection.md: detection techniques thesis All DB references use ja4_processing/ja4_logs (no mabase_prod). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -1,9 +1,17 @@
|
||||
-- ============================================================================
|
||||
-- ARCHITECTURE DE DÉTECTION INTÉGRALE (v13 - bot_detector v11 + ml_all_scores)
|
||||
-- ARCHITECTURE DE DÉTECTION INTÉGRALE (v14 - corrections structurelles)
|
||||
-- Base : ja4_processing | Fenêtre : 24h | Dédoublonnage par (src_ip, ja4, host, model_name)
|
||||
-- Modifications v11 : ajout campaign_id, raw_anomaly_score dans ml_detected_anomalies
|
||||
-- correction view_dashboard_variability (header_user_agent → reason)
|
||||
-- Modifications v12 : ajout table ml_all_scores (toutes les classifications, sans seuil)
|
||||
-- v11 : ajout campaign_id, raw_anomaly_score dans ml_detected_anomalies
|
||||
-- v12 : ajout table ml_all_scores (toutes les classifications, sans seuil)
|
||||
-- v13 : partition + TTL sur agg_host_ip_ja4_1h et agg_header_fingerprint_1h
|
||||
-- v14 : correction ORDER BY ml_detected_anomalies (était (src_ip) → multi-colonnes)
|
||||
-- colonne max_requests_per_sec marquée DEPRECATED (toujours 0, conservée pour compatibilité)
|
||||
-- rétention ml_all_scores portée à 7 jours
|
||||
-- partition + ttl_only_drop_parts sur ml_detected_anomalies et ml_all_scores
|
||||
-- filtre 7 jours sur view_ip_recurrence
|
||||
-- suppression vues dashboard inutilisées (summary/timeseries/threat_dist/variability)
|
||||
-- ajout views manquantes : view_dashboard_entities, view_dashboard_user_agents,
|
||||
-- view_ai_features_24h
|
||||
-- ============================================================================
|
||||
|
||||
-- 1. NETTOYAGE COMPLET
|
||||
@ -14,6 +22,15 @@ DROP DICTIONARY IF EXISTS ja4_processing.dict_asn_reputation;
|
||||
DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_ip_recurrence;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_ai_features_1h;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_ai_features_24h;
|
||||
-- Vues dashboard inutilisées (non référencées par les routes Python)
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_summary;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_timeseries;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_threat_dist;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_variability;
|
||||
-- Vues dashboard manquantes (recréées plus bas)
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_entities;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_user_agents;
|
||||
-- Suppression des anciennes vues heuristiques
|
||||
DROP VIEW IF EXISTS ja4_processing.view_host_ip_ja4_rotation;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_host_ja4_anomalies;
|
||||
@ -79,7 +96,7 @@ CREATE TABLE ja4_processing.agg_host_ip_ja4_1h
|
||||
count_assets SimpleAggregateFunction(sum, UInt64),
|
||||
count_no_referer SimpleAggregateFunction(sum, UInt64),
|
||||
uniq_ua AggregateFunction(uniq, String),
|
||||
max_requests_per_sec SimpleAggregateFunction(max, UInt32),
|
||||
max_requests_per_sec SimpleAggregateFunction(max, UInt32), -- DEPRECATED: toujours 0, conservé pour compatibilité
|
||||
url_depth_variance AggregateFunction(varPop, Float64),
|
||||
count_anomalous_payload SimpleAggregateFunction(sum, UInt64),
|
||||
-- B features (ajoutées v14)
|
||||
@ -93,7 +110,12 @@ CREATE TABLE ja4_processing.agg_host_ip_ja4_1h
|
||||
ip_df_var AggregateFunction(varPop, Float64) -- B8: variance bit DF
|
||||
)
|
||||
ENGINE = AggregatingMergeTree()
|
||||
ORDER BY (window_start, src_ip, ja4, host);
|
||||
PARTITION BY toDate(window_start)
|
||||
ORDER BY (window_start, src_ip, ja4, host)
|
||||
TTL window_start + INTERVAL 7 DAY
|
||||
SETTINGS
|
||||
index_granularity = 8192,
|
||||
ttl_only_drop_parts = 1;
|
||||
|
||||
-- ============================================================================
|
||||
-- 4. VUE MATÉRIALISÉE → agg_host_ip_ja4_1h
|
||||
@ -156,7 +178,12 @@ CREATE TABLE ja4_processing.agg_header_fingerprint_1h
|
||||
sec_fetch_dest SimpleAggregateFunction(any, String)
|
||||
)
|
||||
ENGINE = AggregatingMergeTree()
|
||||
ORDER BY (window_start, src_ip);
|
||||
PARTITION BY toDate(window_start)
|
||||
ORDER BY (window_start, src_ip)
|
||||
TTL window_start + INTERVAL 7 DAY
|
||||
SETTINGS
|
||||
index_granularity = 8192,
|
||||
ttl_only_drop_parts = 1;
|
||||
|
||||
CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h
|
||||
TO ja4_processing.agg_header_fingerprint_1h AS
|
||||
@ -198,8 +225,12 @@ CREATE TABLE ja4_processing.ml_detected_anomalies
|
||||
raw_anomaly_score Float32 DEFAULT 0
|
||||
)
|
||||
ENGINE = ReplacingMergeTree(detected_at)
|
||||
ORDER BY (src_ip)
|
||||
TTL detected_at + INTERVAL 30 DAY;
|
||||
PARTITION BY toDate(detected_at)
|
||||
ORDER BY (src_ip, ja4, host, model_name)
|
||||
TTL detected_at + INTERVAL 7 DAY
|
||||
SETTINGS
|
||||
index_granularity = 8192,
|
||||
ttl_only_drop_parts = 1;
|
||||
|
||||
-- ============================================================================
|
||||
-- 6b. TABLE DE TOUTES LES CLASSIFICATIONS (sans seuil, pour observabilité)
|
||||
@ -228,9 +259,12 @@ CREATE TABLE ja4_processing.ml_all_scores
|
||||
campaign_id Int32
|
||||
)
|
||||
ENGINE = ReplacingMergeTree(detected_at)
|
||||
PARTITION BY toDate(window_start)
|
||||
ORDER BY (window_start, src_ip, ja4, host, model_name)
|
||||
TTL window_start + INTERVAL 3 DAY
|
||||
SETTINGS index_granularity = 8192;
|
||||
TTL window_start + INTERVAL 7 DAY
|
||||
SETTINGS
|
||||
index_granularity = 8192,
|
||||
ttl_only_drop_parts = 1;
|
||||
|
||||
-- ============================================================================
|
||||
-- 7. VUE DE RÉCURRENCE
|
||||
@ -238,7 +272,9 @@ SETTINGS index_granularity = 8192;
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_ip_recurrence AS
|
||||
SELECT src_ip, count() AS recurrence, min(detected_at) AS first_seen, max(detected_at) AS last_seen,
|
||||
min(anomaly_score) AS worst_score, argMin(threat_level, anomaly_score) AS worst_threat_level
|
||||
FROM ja4_processing.ml_detected_anomalies GROUP BY src_ip;
|
||||
FROM ja4_processing.ml_detected_anomalies
|
||||
WHERE detected_at >= now() - INTERVAL 7 DAY
|
||||
GROUP BY src_ip;
|
||||
|
||||
-- ============================================================================
|
||||
-- 8. VUE IA PRINCIPALE (Avec CTE pour Entropie Temporelle)
|
||||
@ -351,61 +387,140 @@ FROM base_data;
|
||||
-- VUES POUR LE DASHBOARD WEB
|
||||
-- ============================================================================
|
||||
|
||||
-- Vue pour les métriques globales du dashboard
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_summary AS
|
||||
-- Vue entités : données brutes par IP/JA4 pour clustering et incidents
|
||||
-- Colonnes utilisées : entity_type, entity_value, user_agents, client_headers, src_ip, ja4, log_date
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_entities AS
|
||||
SELECT
|
||||
count() AS total_detections,
|
||||
countIf(threat_level = 'CRITICAL') AS critical_count,
|
||||
countIf(threat_level = 'HIGH') AS high_count,
|
||||
countIf(threat_level = 'MEDIUM') AS medium_count,
|
||||
countIf(threat_level = 'LOW') AS low_count,
|
||||
countIf(bot_name != '') AS known_bots_count,
|
||||
countIf(bot_name = '') AS anomalies_count,
|
||||
uniq(src_ip) AS unique_ips
|
||||
FROM ja4_processing.ml_detected_anomalies
|
||||
WHERE detected_at >= now() - INTERVAL 24 HOUR;
|
||||
|
||||
-- Vue pour la série temporelle (par heure)
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_timeseries AS
|
||||
SELECT
|
||||
toStartOfHour(detected_at) AS hour,
|
||||
count() AS total,
|
||||
countIf(threat_level = 'CRITICAL') AS critical,
|
||||
countIf(threat_level = 'HIGH') AS high,
|
||||
countIf(threat_level = 'MEDIUM') AS medium,
|
||||
countIf(threat_level = 'LOW') AS low
|
||||
FROM ja4_processing.ml_detected_anomalies
|
||||
WHERE detected_at >= now() - INTERVAL 24 HOUR
|
||||
GROUP BY hour
|
||||
ORDER BY hour;
|
||||
|
||||
-- Vue pour la distribution des menaces
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_threat_dist AS
|
||||
SELECT
|
||||
threat_level,
|
||||
count() AS count,
|
||||
round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage
|
||||
FROM ja4_processing.ml_detected_anomalies
|
||||
WHERE detected_at >= now() - INTERVAL 24 HOUR
|
||||
GROUP BY threat_level
|
||||
ORDER BY count DESC;
|
||||
|
||||
-- Vue pour la variabilité (utilisée par l'API)
|
||||
-- Note v12 : header_user_agent n'existe pas dans ml_detected_anomalies → remplacé par reason
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_variability AS
|
||||
SELECT
|
||||
detected_at,
|
||||
'ip' AS entity_type,
|
||||
toString(src_ip) AS entity_value,
|
||||
src_ip,
|
||||
ja4,
|
||||
host,
|
||||
reason AS sample_reason,
|
||||
country_code,
|
||||
asn_number,
|
||||
asn_org,
|
||||
threat_level,
|
||||
model_name,
|
||||
anomaly_score,
|
||||
campaign_id,
|
||||
raw_anomaly_score
|
||||
FROM ja4_processing.ml_detected_anomalies
|
||||
WHERE detected_at >= now() - INTERVAL 24 HOUR;
|
||||
log_date,
|
||||
groupUniqArray(100)(header_user_agent) AS user_agents,
|
||||
splitByChar(',', any(client_headers)) AS client_headers
|
||||
FROM ja4_logs.http_logs
|
||||
WHERE log_date >= today() - 7
|
||||
AND header_user_agent != ''
|
||||
GROUP BY src_ip, ja4, log_date;
|
||||
|
||||
-- Vue user-agents agrégés par heure pour les routes fingerprints/metrics
|
||||
-- Colonnes utilisées : src_ip, ja4, hour, log_date, user_agents (Array), requests
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_user_agents AS
|
||||
SELECT
|
||||
src_ip,
|
||||
ja4,
|
||||
toStartOfHour(time) AS hour,
|
||||
log_date,
|
||||
groupUniqArray(50)(header_user_agent) AS user_agents,
|
||||
count() AS requests
|
||||
FROM ja4_logs.http_logs
|
||||
WHERE log_date >= today() - 7
|
||||
AND header_user_agent != ''
|
||||
GROUP BY src_ip, ja4, hour, log_date;
|
||||
|
||||
-- ============================================================================
|
||||
-- VUE IA 24H (fenêtre glissante journalière — détection d'attaquants lents)
|
||||
-- Même structure que view_ai_features_1h mais agrège toutes les fenêtres horaires
|
||||
-- en une seule ligne par (src_ip, ja4, host) sur 24h.
|
||||
-- Utilisée par bot_detector en mode ENABLE_MULTIWINDOW=true.
|
||||
-- ============================================================================
|
||||
CREATE OR REPLACE VIEW ja4_processing.view_ai_features_24h AS
|
||||
WITH base_data AS (
|
||||
SELECT
|
||||
toStartOfDay(a.first_seen) AS window_start,
|
||||
a.src_ip, a.ja4, a.host,
|
||||
toString(a.src_asn) AS asn_number, a.src_as_name AS asn_org,
|
||||
a.src_org AS asn_detail, a.src_domain AS asn_domain, a.src_country_code AS country_code,
|
||||
dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,
|
||||
COALESCE(
|
||||
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
|
||||
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
|
||||
''
|
||||
) AS bot_name,
|
||||
a.hits AS hits,
|
||||
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
|
||||
a.correlated AS correlated, a.tcp_jitter_variance AS tcp_jitter_variance,
|
||||
a.true_window_size AS true_window_size, a.window_mss_ratio AS window_mss_ratio, a.max_keepalives AS max_keepalives,
|
||||
h.header_order_hash AS header_order_hash, h.header_count AS header_count,
|
||||
h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie,
|
||||
h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score, h.ua_ch_mismatch AS ua_ch_mismatch,
|
||||
(a.count_post / (a.hits + 1)) AS post_ratio, (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index,
|
||||
(a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity,
|
||||
(a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio, (a.orphan_count / (a.hits + 1)) AS orphan_ratio,
|
||||
(a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio, (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency,
|
||||
IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
|
||||
a.request_size_variance AS request_size_variance,
|
||||
IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
|
||||
IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
|
||||
IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
|
||||
IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,
|
||||
count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
|
||||
count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
|
||||
(a.count_assets / (a.hits + 1)) AS asset_ratio, (a.count_no_referer / (a.hits + 1)) AS direct_access_ratio,
|
||||
IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating, uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
|
||||
((a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS src_port_density,
|
||||
(sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration,
|
||||
(sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration,
|
||||
IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
|
||||
(count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence,
|
||||
uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
|
||||
(a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio,
|
||||
a.url_depth_variance AS url_depth_variance,
|
||||
(a.count_anomalous_payload / (a.hits + 1)) AS anomalous_payload_ratio,
|
||||
a.uniq_ja3_val AS uniq_ja3_per_row,
|
||||
sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
|
||||
a.tls12_count / (a.hits + 1) AS tls12_ratio,
|
||||
a.count_head / (a.hits + 1) AS head_ratio,
|
||||
a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
|
||||
a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
|
||||
a.count_http10 / (a.hits + 1) AS http10_ratio,
|
||||
a.ip_df_variance AS ip_df_variance
|
||||
FROM (
|
||||
SELECT
|
||||
src_ip, ja4, host, src_asn,
|
||||
any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name,
|
||||
any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua,
|
||||
sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths,
|
||||
uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post,
|
||||
min(first_seen) AS first_seen, max(last_seen) AS last_seen,
|
||||
any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
|
||||
varPopMerge(total_ip_length_var) AS request_size_variance,
|
||||
any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
|
||||
IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
|
||||
any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni,
|
||||
max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports,
|
||||
uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives,
|
||||
sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count,
|
||||
sum(mss_1460_count) AS mss_1460_count,
|
||||
sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer, uniqMerge(uniq_ua) AS unique_ua,
|
||||
varPopMerge(url_depth_variance) AS url_depth_variance,
|
||||
sum(count_anomalous_payload) AS count_anomalous_payload,
|
||||
uniqMerge(uniq_ja3) AS uniq_ja3_val,
|
||||
avgMerge(avg_syn_ms) AS avg_syn_ms_val,
|
||||
sum(tls12_count) AS tls12_count,
|
||||
sum(count_head) AS count_head,
|
||||
sum(count_no_sec_fetch) AS count_no_sec_fetch,
|
||||
sum(count_generic_accept) AS count_generic_accept,
|
||||
sum(count_http10) AS count_http10,
|
||||
varPopMerge(ip_df_var) AS ip_df_variance
|
||||
FROM ja4_processing.agg_host_ip_ja4_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR
|
||||
GROUP BY src_ip, ja4, host, src_asn
|
||||
) a
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
src_ip, any(header_order_hash) AS header_order_hash,
|
||||
max(header_count) AS header_count, max(has_accept_language) AS has_accept_language,
|
||||
max(has_cookie) AS has_cookie, max(has_referer) AS has_referer,
|
||||
max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch,
|
||||
any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest
|
||||
FROM ja4_processing.agg_header_fingerprint_1h
|
||||
WHERE window_start >= now() - INTERVAL 24 HOUR
|
||||
GROUP BY src_ip
|
||||
) h ON a.src_ip = h.src_ip
|
||||
)
|
||||
SELECT
|
||||
*,
|
||||
-(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy,
|
||||
sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
|
||||
FROM base_data;
|
||||
|
||||
93
services/correlator/sql/migrations/01_ttl_adjustments.sql
Normal file
93
services/correlator/sql/migrations/01_ttl_adjustments.sql
Normal file
@ -0,0 +1,93 @@
|
||||
-- =============================================================================
|
||||
-- 01_ttl_adjustments.sql — Ajustement des TTL de rétention des logs
|
||||
--
|
||||
-- Objectif :
|
||||
-- - http_logs : conserver 30 jours (était 7 jours)
|
||||
-- - http_logs_raw : supprimer au plus vite (2 heures, était 1 jour)
|
||||
--
|
||||
-- Usage (déploiements existants) :
|
||||
-- clickhouse-client --multiquery < sql/migrations/01_ttl_adjustments.sql
|
||||
--
|
||||
-- IMPORTANT — http_logs_raw (partition) :
|
||||
-- La modification du TTL ci-dessous s'applique immédiatement aux nouvelles
|
||||
-- insertions. Toutefois, la PARTITION BY (toDate → toStartOfHour) ne peut
|
||||
-- PAS être modifiée via ALTER TABLE — elle est fixée à la création.
|
||||
--
|
||||
-- Pour adopter la partition horaire sur un déploiement existant, procéder
|
||||
-- manuellement :
|
||||
-- 1. CREATE TABLE ja4_logs.http_logs_raw_new ( ... identique à init.sql ... )
|
||||
-- 2. Attendre que http_logs_raw soit vide (ou accepter la perte des raw non traités)
|
||||
-- 3. DROP VIEW IF EXISTS ja4_logs.mv_http_logs
|
||||
-- 4. DROP TABLE ja4_logs.http_logs_raw
|
||||
-- 5. RENAME TABLE ja4_logs.http_logs_raw_new TO ja4_logs.http_logs_raw
|
||||
-- 6. Recréer mv_http_logs (cf. init.sql)
|
||||
--
|
||||
-- Si la migration complète n'est pas possible, la modification du TTL seul
|
||||
-- (ci-dessous) ramène la suppression à : fin de la partition quotidienne + 2h,
|
||||
-- soit au plus 26h au lieu de 48h — amélioration immédiate sans downtime.
|
||||
-- =============================================================================
|
||||
|
||||
-- -----------------------------------------------------------------------------
|
||||
-- 1. http_logs — rétention étendue à 30 jours
|
||||
-- -----------------------------------------------------------------------------
|
||||
ALTER TABLE ja4_logs.http_logs
|
||||
MODIFY TTL log_date + INTERVAL 30 DAY;
|
||||
|
||||
-- -----------------------------------------------------------------------------
|
||||
-- 2. http_logs_raw — TTL réduit à 2 heures
|
||||
-- (la MV traite les données de façon synchrone à l'INSERT, donc les raw
|
||||
-- sont redondants dès leur insertion)
|
||||
-- -----------------------------------------------------------------------------
|
||||
ALTER TABLE ja4_logs.http_logs_raw
|
||||
MODIFY TTL ingest_time + INTERVAL 2 HOUR;
|
||||
|
||||
-- -----------------------------------------------------------------------------
|
||||
-- Vérifications post-migration
|
||||
-- -----------------------------------------------------------------------------
|
||||
-- SELECT engine_full FROM system.tables
|
||||
-- WHERE database IN ('ja4_logs', 'ja4_processing') AND name IN ('http_logs', 'http_logs_raw');
|
||||
|
||||
-- =============================================================================
|
||||
-- Ajustements TTL des tables de MVs (bot_detector) — 7 jours max
|
||||
-- =============================================================================
|
||||
|
||||
-- agg_host_ip_ja4_1h — TTL 7 jours (était : aucun)
|
||||
-- Note : PARTITION BY ne peut pas être ajouté via ALTER TABLE.
|
||||
-- Pour la partition quotidienne (recommandée), recréer la table via deploy_views.sql.
|
||||
-- Sans partition alignée sur le TTL, ClickHouse doit réécrire les parts lors des merges pour en retirer les lignes expirées (moins efficace que la suppression de parts entières).
|
||||
ALTER TABLE ja4_processing.agg_host_ip_ja4_1h
|
||||
MODIFY TTL window_start + INTERVAL 7 DAY;
|
||||
|
||||
-- agg_header_fingerprint_1h — TTL 7 jours (était : aucun)
|
||||
ALTER TABLE ja4_processing.agg_header_fingerprint_1h
|
||||
MODIFY TTL window_start + INTERVAL 7 DAY;
|
||||
|
||||
-- ml_detected_anomalies — CRITICAL: widen the sorting key from (src_ip) to
-- (src_ip, ja4, host, model_name). With ReplacingMergeTree the sorting key is the
-- deduplication key, so ORDER BY (src_ip) kept a single row per IP after merges.
--
-- ALTER TABLE ... MODIFY ORDER BY cannot do this: ClickHouse only accepts sorting-key
-- additions that reference columns added by ADD COLUMN in the same ALTER, and ja4,
-- host and model_name already exist. The statement fails, and under --multiquery that
-- aborts every statement after it. The table is therefore rebuilt and swapped in,
-- which also gives it the daily partitioning and part-level TTL drops that
-- deploy_views.sql uses for this table.
--
-- NOTE(review): rows written to the old table between the INSERT ... SELECT and the
-- RENAME are not copied — pause the writer (presumably bot_detector) while this runs.
-- Detections already collapsed by the old key cannot be recovered.

-- Clear leftovers from an interrupted previous run so the script can be replayed.
DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies_v14;
DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies_v13;

-- Same columns as the live table (cloned by CREATE ... AS), new storage definition.
CREATE TABLE ja4_processing.ml_detected_anomalies_v14
AS ja4_processing.ml_detected_anomalies
ENGINE = ReplacingMergeTree(detected_at)
PARTITION BY toDate(detected_at)
ORDER BY (src_ip, ja4, host, model_name)
TTL detected_at + INTERVAL 7 DAY
SETTINGS
    index_granularity = 8192,
    ttl_only_drop_parts = 1;

-- SELECT * is deliberate: the target's column list was cloned from the source.
-- Rows older than the 7-day TTL would expire immediately, so they are not copied.
INSERT INTO ja4_processing.ml_detected_anomalies_v14
SELECT *
FROM ja4_processing.ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 7 DAY;

-- Swap the rebuilt table in under the original name, then discard the old one.
RENAME TABLE
    ja4_processing.ml_detected_anomalies TO ja4_processing.ml_detected_anomalies_v13,
    ja4_processing.ml_detected_anomalies_v14 TO ja4_processing.ml_detected_anomalies;

DROP TABLE ja4_processing.ml_detected_anomalies_v13;
|
||||
|
||||
-- ml_detected_anomalies — TTL 7 jours (était 30 jours ; déjà ramené à 7 jours sur certains déploiements)
|
||||
ALTER TABLE ja4_processing.ml_detected_anomalies
|
||||
MODIFY TTL detected_at + INTERVAL 7 DAY;
|
||||
|
||||
-- ml_all_scores — TTL 7 jours (était 3 jours)
|
||||
-- Note : PARTITION BY ne peut pas être ajouté via ALTER TABLE.
|
||||
-- Pour la partition quotidienne (recommandée), recréer la table via deploy_views.sql.
|
||||
ALTER TABLE ja4_processing.ml_all_scores
|
||||
MODIFY TTL window_start + INTERVAL 7 DAY;
|
||||
|
||||
-- =============================================================================
|
||||
-- Vues dashboard — suppression des vues inutilisées, ajout des vues manquantes
|
||||
-- Ces vues n'ont pas d'état persistant : les DROP/CREATE sont idempotents.
|
||||
-- =============================================================================
|
||||
-- Supprimer les vues inutilisées (non référencées par les routes Python)
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_summary;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_timeseries;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_threat_dist;
|
||||
DROP VIEW IF EXISTS ja4_processing.view_dashboard_variability;
|
||||
-- Les nouvelles vues (view_dashboard_entities, view_dashboard_user_agents,
|
||||
-- view_ai_features_24h) sont créées par deploy_views.sql (section finale).
|
||||
-- Pour les créer sur un déploiement existant, rejouer cette section de deploy_views.sql.
|
||||
Reference in New Issue
Block a user