- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...) - Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuck, OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries) - Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs) - Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views - Add /api/browsers dashboard endpoint - Fix CSV quoting for fields containing commas (User-Agent strings) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
534 lines
32 KiB
SQL
534 lines
32 KiB
SQL
-- ============================================================================
|
|
-- ARCHITECTURE DE DÉTECTION INTÉGRALE (v14 - corrections structurelles)
|
|
-- Base : ja4_processing | Fenêtre : 24h | Dédoublonnage par src_ip
|
|
-- v11 : ajout campaign_id, raw_anomaly_score dans ml_detected_anomalies
|
|
-- v12 : ajout table ml_all_scores (toutes les classifications, sans seuil)
|
|
-- v13 : partition + TTL sur agg_host_ip_ja4_1h et agg_header_fingerprint_1h
|
|
-- v14 : correction ORDER BY ml_detected_anomalies (était (src_ip) → multi-colonnes)
|
|
--       colonne morte max_requests_per_sec marquée DEPRECATED (toujours 0, conservée pour compatibilité)
|
|
-- rétention ml_all_scores portée à 7 jours
|
|
-- partition + ttl_only_drop_parts sur ml_detected_anomalies et ml_all_scores
|
|
-- filtre 7 jours sur view_ip_recurrence
|
|
-- suppression vues dashboard inutilisées (summary/timeseries/threat_dist/variability)
|
|
-- ajout views manquantes : view_dashboard_entities, view_dashboard_user_agents,
|
|
-- view_ai_features_24h
|
|
-- ============================================================================
|
|
|
|
-- 1. FULL CLEANUP
-- Objects are dropped dependents-first: read-side views, then materialized
-- views, then their target tables, then the ML result tables, and finally the
-- dictionaries. The feature views call dictGetOrDefault on the dictionaries and
-- view_ip_recurrence reads ml_detected_anomalies, so dropping those first can be
-- refused on servers that enforce table-dependency checks.

-- Read-side views.
DROP VIEW IF EXISTS ja4_processing.view_ip_recurrence;
DROP VIEW IF EXISTS ja4_processing.view_ai_features_1h;
DROP VIEW IF EXISTS ja4_processing.view_ai_features_24h;

-- Unused dashboard views (not referenced by the Python routes).
DROP VIEW IF EXISTS ja4_processing.view_dashboard_summary;
DROP VIEW IF EXISTS ja4_processing.view_dashboard_timeseries;
DROP VIEW IF EXISTS ja4_processing.view_dashboard_threat_dist;
DROP VIEW IF EXISTS ja4_processing.view_dashboard_variability;

-- Dashboard views that are recreated further down in this script.
DROP VIEW IF EXISTS ja4_processing.view_dashboard_entities;
DROP VIEW IF EXISTS ja4_processing.view_dashboard_user_agents;

-- Legacy heuristic views.
DROP VIEW IF EXISTS ja4_processing.view_host_ip_ja4_rotation;
DROP VIEW IF EXISTS ja4_processing.view_host_ja4_anomalies;
DROP VIEW IF EXISTS ja4_processing.view_form_bruteforce_detected;
DROP VIEW IF EXISTS ja4_processing.view_alpn_mismatch_detected;
DROP VIEW IF EXISTS ja4_processing.view_tcp_spoofing_detected;

-- Materialized views first, then the aggregate tables they write into.
DROP VIEW IF EXISTS ja4_processing.mv_agg_host_ip_ja4_1h;
DROP TABLE IF EXISTS ja4_processing.agg_host_ip_ja4_1h;
DROP VIEW IF EXISTS ja4_processing.mv_agg_header_fingerprint_1h;
DROP TABLE IF EXISTS ja4_processing.agg_header_fingerprint_1h;

-- ML result tables.
DROP TABLE IF EXISTS ja4_processing.ml_all_scores;
DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies;

-- Reputation dictionaries, last because the feature views looked them up.
DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ip;
DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ja4;
DROP DICTIONARY IF EXISTS ja4_processing.dict_browser_ja4;
DROP DICTIONARY IF EXISTS ja4_processing.dict_asn_reputation;
|
|
|
|
-- ============================================================================
|
|
-- 2. DICTIONNAIRES DE RÉPUTATION EN RAM
|
|
-- ============================================================================
|
|
-- Maps an IP prefix to a bot name. Backed by bot_ip.csv in user_files,
-- stored as an IP trie, with a fixed 300 s refresh lifetime.
CREATE DICTIONARY ja4_processing.dict_bot_ip
(
    prefix   String,
    bot_name String
)
PRIMARY KEY prefix
SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ip.csv' format 'CSV'))
LAYOUT(IP_TRIE())
LIFETIME(MIN 300 MAX 300);
|
|
|
|
-- Maps a JA4 fingerprint string to a bot name. Backed by bot_ja4.csv in
-- user_files; the String key requires the complex-key hashed layout.
CREATE DICTIONARY ja4_processing.dict_bot_ja4
(
    ja4      String,
    bot_name String
)
PRIMARY KEY ja4
SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ja4.csv' format 'CSV'))
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 300);
|
|
|
|
-- Maps an ASN number to a reputation label. Backed by asn_reputation.csv in
-- user_files, hashed on the numeric key, with a fixed 300 s refresh lifetime.
CREATE DICTIONARY ja4_processing.dict_asn_reputation
(
    src_asn UInt64,
    label   String
)
PRIMARY KEY src_asn
SOURCE(FILE(path '/var/lib/clickhouse/user_files/asn_reputation.csv' format 'CSV'))
LAYOUT(HASHED())
LIFETIME(MIN 300 MAX 300);
|
|
|
|
-- Maps a JA4 fingerprint string to browser metadata (family, TLS library,
-- context). Backed by browser_ja4.csv in user_files.
CREATE DICTIONARY ja4_processing.dict_browser_ja4
(
    ja4            String,
    browser_family String,
    tls_library    String,
    context        String
)
PRIMARY KEY ja4
SOURCE(FILE(path '/var/lib/clickhouse/user_files/browser_ja4.csv' format 'CSV'))
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 300);
|
|
|
|
-- ============================================================================
|
|
-- 3. TABLE D'AGRÉGATION COMPORTEMENTALE (L4 / L5 / L7)
|
|
-- ============================================================================
|
|
-- Hourly behavioural aggregates, one logical row per
-- (window_start, src_ip, ja4, host, src_asn), fed by mv_agg_host_ip_ja4_1h.
-- SimpleAggregateFunction columns are combined in place on merge;
-- AggregateFunction columns hold partial states and must be read with the
-- matching -Merge combinator.
CREATE TABLE ja4_processing.agg_host_ip_ja4_1h
(
    -- Grouping keys
    window_start            DateTime,
    src_ip                  IPv6,
    ja4                     String,
    host                    String,
    src_asn                 UInt32,

    -- Source network metadata (an arbitrary value per group)
    src_country_code        SimpleAggregateFunction(any, String),
    src_as_name             SimpleAggregateFunction(any, String),
    src_org                 SimpleAggregateFunction(any, String),
    src_domain              SimpleAggregateFunction(any, String),

    -- Volume and timing
    first_seen              SimpleAggregateFunction(min, DateTime),
    last_seen               SimpleAggregateFunction(max, DateTime),
    hits                    SimpleAggregateFunction(sum, UInt64),
    count_post              SimpleAggregateFunction(sum, UInt64),
    uniq_paths              AggregateFunction(uniq, String),
    uniq_query_params       AggregateFunction(uniq, String),

    -- TCP / TLS / HTTP raw samples
    tcp_fp_raw              SimpleAggregateFunction(any, String),
    tcp_jitter_variance     AggregateFunction(varPop, Float64),
    tcp_win_raw             SimpleAggregateFunction(any, UInt32),
    tcp_scale_raw           SimpleAggregateFunction(any, UInt32),
    tcp_mss_raw             SimpleAggregateFunction(any, UInt32),
    tcp_ttl_raw             SimpleAggregateFunction(any, UInt32),
    http_ver_raw            SimpleAggregateFunction(any, String),
    tls_alpn_raw            SimpleAggregateFunction(any, String),
    tls_sni_raw             SimpleAggregateFunction(any, String),
    first_ua                SimpleAggregateFunction(any, String),
    correlated_raw          SimpleAggregateFunction(max, UInt8),

    -- Connection behaviour
    unique_src_ports        AggregateFunction(uniq, UInt16),
    unique_conn_id          AggregateFunction(uniq, String),
    max_keepalives          SimpleAggregateFunction(max, UInt32),
    orphan_count            SimpleAggregateFunction(sum, UInt64),
    ip_id_zero_count        SimpleAggregateFunction(sum, UInt64),
    total_ip_length_var     AggregateFunction(varPop, Float64),
    mss_1460_count          SimpleAggregateFunction(sum, UInt64),

    -- Navigation behaviour
    count_assets            SimpleAggregateFunction(sum, UInt64),
    count_no_referer        SimpleAggregateFunction(sum, UInt64),
    uniq_ua                 AggregateFunction(uniq, String),
    max_requests_per_sec    SimpleAggregateFunction(max, UInt32),  -- DEPRECATED: always 0, kept for compatibility
    url_depth_variance      AggregateFunction(varPop, Float64),
    count_anomalous_payload SimpleAggregateFunction(sum, UInt64),

    -- B features (added in v14)
    uniq_ja3                AggregateFunction(uniq, String),       -- B1: JA3/JA4 diversity
    avg_syn_ms              AggregateFunction(avg, Float64),       -- B2: mean SYN timing (used for the CV)
    tls12_count             SimpleAggregateFunction(sum, UInt64),  -- B3: TLS 1.2 ratio
    count_head              SimpleAggregateFunction(sum, UInt64),  -- B4: HEAD request ratio
    count_no_sec_fetch      SimpleAggregateFunction(sum, UInt64),  -- B5: missing Sec-Fetch-*
    count_generic_accept    SimpleAggregateFunction(sum, UInt64),  -- B6: generic Accept header
    count_http10            SimpleAggregateFunction(sum, UInt64),  -- B7: HTTP/1.0 ratio
    ip_df_var               AggregateFunction(varPop, Float64)     -- B8: DF-bit variance
)
ENGINE = AggregatingMergeTree()
PARTITION BY toDate(window_start)
-- src_asn is part of the sorting key because the materialized view and the
-- feature views group by it. AggregatingMergeTree collapses rows that share the
-- sorting key, so without src_asn here two rows differing only by ASN would be
-- merged (keeping an arbitrary ASN) and results would depend on merge timing.
ORDER BY (window_start, src_ip, ja4, host, src_asn)
TTL window_start + INTERVAL 7 DAY
SETTINGS
    index_granularity = 8192,
    ttl_only_drop_parts = 1;
|
|
|
|
-- ============================================================================
|
|
-- 4. VUE MATÉRIALISÉE → agg_host_ip_ja4_1h
|
|
-- ============================================================================
|
|
-- Populates agg_host_ip_ja4_1h from ja4_logs.http_logs: every inserted batch is
-- rolled up per hour, source IP, JA4, host and ASN. Output column names must
-- match the target table because the view writes through a TO clause.
CREATE MATERIALIZED VIEW ja4_processing.mv_agg_host_ip_ja4_1h
TO ja4_processing.agg_host_ip_ja4_1h
AS
SELECT
    -- Grouping keys
    toStartOfHour(src.time) AS window_start,
    toIPv6(src.src_ip) AS src_ip,
    src.ja4,
    src.host,
    src.src_asn,

    -- Source network metadata
    any(src.src_country_code) AS src_country_code,
    any(src.src_as_name) AS src_as_name,
    any(src.src_org) AS src_org,
    any(src.src_domain) AS src_domain,

    -- Volume and timing
    min(src.time) AS first_seen,
    max(src.time) AS last_seen,
    count() AS hits,
    countIf(src.method = 'POST') AS count_post,
    uniqState(src.path) AS uniq_paths,
    uniqState(src.query) AS uniq_query_params,

    -- TCP fingerprint: hash of window size, MSS, window scale and option string
    any(
        toString(
            cityHash64(
                concat(
                    toString(src.tcp_meta_window_size),
                    toString(src.tcp_meta_mss),
                    toString(src.tcp_meta_window_scale),
                    src.tcp_meta_options
                )
            )
        )
    ) AS tcp_fp_raw,
    varPopState(toFloat64(src.syn_to_clienthello_ms)) AS tcp_jitter_variance,
    any(src.tcp_meta_window_size) AS tcp_win_raw,
    any(src.tcp_meta_window_scale) AS tcp_scale_raw,
    any(src.tcp_meta_mss) AS tcp_mss_raw,
    any(src.ip_meta_ttl) AS tcp_ttl_raw,
    any(src.http_version) AS http_ver_raw,
    any(src.tls_alpn) AS tls_alpn_raw,
    any(src.tls_sni) AS tls_sni_raw,
    any(src.header_user_agent) AS first_ua,
    max(toUInt8(src.correlated)) AS correlated_raw,

    -- Connection behaviour
    uniqState(toUInt16(src.src_port)) AS unique_src_ports,
    uniqState(src.conn_id) AS unique_conn_id,
    max(toUInt32(src.keepalives)) AS max_keepalives,
    countIf(src.orphan_side = 'A' OR src.correlated = 0) AS orphan_count,
    countIf(src.ip_meta_id = 0) AS ip_id_zero_count,
    varPopState(toFloat64(src.ip_meta_total_length)) AS total_ip_length_var,
    countIf(src.tcp_meta_mss = 1460) AS mss_1460_count,

    -- Navigation behaviour
    countIf(match(src.path, '(?i)\.(png|jpg|jpeg|gif|css|js|ico|woff2|svg|eot)$')) AS count_assets,
    countIf(position(src.client_headers, 'Referer') = 0) AS count_no_referer,
    uniqState(src.header_user_agent) AS uniq_ua,
    0 AS max_requests_per_sec,  -- deprecated column, always written as 0
    -- Doubling every '/' lengthens the path by exactly the number of slashes,
    -- so the length difference is the URL depth.
    varPopState(toFloat64(length(replaceAll(src.path, '/', '//')) - length(src.path))) AS url_depth_variance,
    countIf(src.ip_meta_total_length < 60 OR src.ip_meta_total_length > 1500) AS count_anomalous_payload,

    -- B features
    uniqState(src.ja3) AS uniq_ja3,
    avgState(toFloat64(src.syn_to_clienthello_ms)) AS avg_syn_ms,
    countIf(src.tls_version = '1.2') AS tls12_count,
    countIf(src.method = 'HEAD') AS count_head,
    countIf(length(src.header_sec_fetch_site) = 0) AS count_no_sec_fetch,
    countIf(length(src.header_accept) < 5) AS count_generic_accept,
    countIf(src.http_version = 'HTTP/1.0') AS count_http10,
    varPopState(toFloat64(src.ip_meta_df)) AS ip_df_var
FROM ja4_logs.http_logs AS src
GROUP BY window_start, src_ip, ja4, host, src_asn;
|
|
|
|
-- ============================================================================
|
|
-- 5. TABLE D'AGRÉGATION DES HEADERS (L7)
|
|
-- ============================================================================
|
|
-- Hourly HTTP-header fingerprint per source IP, fed by
-- mv_agg_header_fingerprint_1h. All value columns are SimpleAggregateFunction,
-- so they can be re-aggregated with the plain any/max functions.
CREATE TABLE ja4_processing.agg_header_fingerprint_1h
(
    -- Grouping keys
    window_start         DateTime,
    src_ip               IPv6,

    -- Header shape
    header_order_hash    SimpleAggregateFunction(any, String),
    header_count         SimpleAggregateFunction(max, UInt16),

    -- Presence flags (1 as soon as one request in the window had the header)
    has_accept_language  SimpleAggregateFunction(max, UInt8),
    has_cookie           SimpleAggregateFunction(max, UInt8),
    has_referer          SimpleAggregateFunction(max, UInt8),

    -- Browser plausibility signals
    modern_browser_score SimpleAggregateFunction(max, UInt8),
    ua_ch_mismatch       SimpleAggregateFunction(max, UInt8),
    sec_fetch_mode       SimpleAggregateFunction(any, String),
    sec_fetch_dest       SimpleAggregateFunction(any, String)
)
ENGINE = AggregatingMergeTree()
PARTITION BY toDate(window_start)
ORDER BY (window_start, src_ip)
TTL window_start + INTERVAL 7 DAY
SETTINGS
    index_granularity = 8192,
    ttl_only_drop_parts = 1;
|
|
|
|
-- Populates agg_header_fingerprint_1h from ja4_logs.http_logs, one row per hour
-- and source IP for each inserted batch.
CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h
TO ja4_processing.agg_header_fingerprint_1h
AS
SELECT
    toStartOfHour(src.time) AS window_start,
    toIPv6(src.src_ip) AS src_ip,

    -- Hash of the raw client_headers string.
    any(toString(cityHash64(src.client_headers))) AS header_order_hash,

    -- Number of commas in client_headers plus one.
    max(
        toUInt16(
            length(src.client_headers)
            - length(replaceAll(src.client_headers, ',', ''))
            + 1
        )
    ) AS header_count,

    -- Substring presence checks on client_headers.
    max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language,
    max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie,
    max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer,

    -- 100 when Sec-CH-UA is present, 50 when only a User-Agent is present, else 0.
    max(
        toUInt8(
            if(
                length(src.header_sec_ch_ua) > 0,
                100,
                if(length(src.header_user_agent) > 0, 50, 0)
            )
        )
    ) AS modern_browser_score,

    -- User-Agent platform not confirmed by Sec-CH-UA-Platform.
    -- NOTE(review): a request carrying no Sec-CH-UA-Platform header at all also
    -- satisfies this condition - confirm that is intended.
    max(
        toUInt8(
            if(
                (
                    position(src.header_user_agent, 'Windows') > 0
                    AND position(src.header_sec_ch_ua_platform, 'Windows') == 0
                )
                OR (
                    position(src.header_user_agent, 'iPhone') > 0
                    AND position(src.header_sec_ch_ua_platform, 'iOS') == 0
                ),
                1,
                0
            )
        )
    ) AS ua_ch_mismatch,

    any(src.header_sec_fetch_mode) AS sec_fetch_mode,
    any(src.header_sec_fetch_dest) AS sec_fetch_dest
FROM ja4_logs.http_logs AS src
GROUP BY window_start, src.src_ip;
|
|
|
|
-- ============================================================================
|
|
-- 6. TABLE DE RÉSULTATS ML — MENACES UNIQUEMENT (scores < seuil)
|
|
-- ============================================================================
|
|
-- ML detections (threats only). ReplacingMergeTree versioned on detected_at:
-- within a daily partition, rows sharing (src_ip, ja4, host, model_name) are
-- collapsed to the one with the highest detected_at when parts merge.
CREATE TABLE ja4_processing.ml_detected_anomalies
(
    -- Identity
    detected_at               DateTime,
    src_ip                    IPv6,
    ja4                       String,
    host                      String,
    bot_name                  String,

    -- Verdict
    anomaly_score             Float32,
    threat_level              String,
    model_name                String,
    recurrence                UInt32,

    -- Network origin
    asn_number                String,
    asn_org                   String,
    asn_detail                String,
    asn_domain                String,
    country_code              String,
    asn_label                 String,

    -- Traffic features
    hits                      UInt64,
    hit_velocity              Float32,
    fuzzing_index             Float32,
    post_ratio                Float32,
    port_exhaustion_ratio     Float32,
    max_keepalives            UInt32,
    orphan_ratio              Float32,
    tcp_jitter_variance       Float32,
    tcp_shared_count          UInt32,
    true_window_size          UInt64,
    window_mss_ratio          Float32,
    alpn_http_mismatch        UInt8,
    is_alpn_missing           UInt8,
    sni_host_mismatch         UInt8,

    -- Header features
    header_count              UInt16,
    has_accept_language       UInt8,
    has_cookie                UInt8,
    has_referer               UInt8,
    modern_browser_score      UInt8,
    is_headless               UInt8,
    ua_ch_mismatch            UInt8,
    header_order_shared_count UInt32,
    ip_id_zero_ratio          Float32,
    request_size_variance     Float32,
    multiplexing_efficiency   Float32,
    mss_mobile_mismatch       UInt8,
    correlated                UInt8,
    reason                    String,

    -- Behavioural features
    asset_ratio               Float32,
    direct_access_ratio       Float32,
    is_ua_rotating            UInt8,
    distinct_ja4_count        UInt32,
    src_port_density          Float32,
    ja4_asn_concentration     Float32,
    ja4_country_concentration Float32,
    is_rare_ja4               UInt8,
    header_order_confidence   Float32,
    distinct_header_orders    UInt32,
    temporal_entropy          Float32,
    path_diversity_ratio      Float32,
    url_depth_variance        Float32,
    anomalous_payload_ratio   Float32,

    -- Columns added in v11 (bot_detector v11)
    campaign_id               Int32 DEFAULT -1,
    raw_anomaly_score         Float32 DEFAULT 0
)
ENGINE = ReplacingMergeTree(detected_at)
PARTITION BY toDate(detected_at)
ORDER BY (src_ip, ja4, host, model_name)
TTL detected_at + INTERVAL 7 DAY
SETTINGS
    index_granularity = 8192,
    ttl_only_drop_parts = 1;
|
|
|
|
-- ============================================================================
|
|
-- 6b. TABLE DE TOUTES LES CLASSIFICATIONS (sans seuil, pour observabilité)
|
|
-- ============================================================================
|
|
-- Every classification produced by the models, with no score threshold, kept
-- for observability. Rows are versioned on detected_at and deduplicated per
-- (window_start, src_ip, ja4, host, model_name) when parts merge.
CREATE TABLE ja4_processing.ml_all_scores
(
    -- Identity
    detected_at       DateTime,
    window_start      DateTime,
    src_ip            IPv6,
    ja4               String,
    host              String,
    bot_name          String,

    -- Verdict
    anomaly_score     Float32,
    raw_anomaly_score Float32,
    threat_level      String,
    model_name        String,
    correlated        UInt8,

    -- Network origin
    asn_number        String,
    asn_org           String,
    country_code      String,
    asn_label         String,

    -- Headline features
    hits              UInt64,
    hit_velocity      Float32,
    fuzzing_index     Float32,
    post_ratio        Float32,
    campaign_id       Int32
)
ENGINE = ReplacingMergeTree(detected_at)
PARTITION BY toDate(window_start)
ORDER BY (window_start, src_ip, ja4, host, model_name)
TTL window_start + INTERVAL 7 DAY
SETTINGS
    index_granularity = 8192,
    ttl_only_drop_parts = 1;
|
|
|
|
-- ============================================================================
|
|
-- 7. VUE DE RÉCURRENCE
|
|
-- ============================================================================
|
|
-- Per-IP recurrence over the last 7 days of detections: number of detection
-- rows, first/last detection time, and the lowest anomaly_score together with
-- the threat level recorded on that lowest-scoring row.
-- NOTE(review): the source table is a ReplacingMergeTree read without FINAL, so
-- duplicates that have not been merged yet are counted - confirm that is intended.
CREATE OR REPLACE VIEW ja4_processing.view_ip_recurrence AS
SELECT
    src_ip,
    count() AS recurrence,
    min(detected_at) AS first_seen,
    max(detected_at) AS last_seen,
    min(anomaly_score) AS worst_score,
    argMin(threat_level, anomaly_score) AS worst_threat_level
FROM ja4_processing.ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL 7 DAY
GROUP BY
    src_ip;
|
|
|
|
-- ============================================================================
|
|
-- 8. VUE IA PRINCIPALE (Avec CTE pour Entropie Temporelle)
|
|
-- ============================================================================
|
|
-- Main ML feature view: one row per (window_start, src_ip, ja4, host, src_asn)
-- over the hourly windows of the last 24 hours, enriched with the per-IP header
-- fingerprint of the same hour and with dictionary lookups.
-- Window functions have no window_start in their PARTITION BY, so they span
-- every hourly row returned for the 24 h range.
CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS
WITH
    -- Finalised behavioural aggregates (partial states merged per hourly key).
    agg AS (
        SELECT
            window_start,
            src_ip,
            ja4,
            host,
            src_asn,
            any(src_country_code) AS src_country_code,
            any(src_as_name) AS src_as_name,
            any(src_org) AS src_org,
            any(src_domain) AS src_domain,
            any(first_ua) AS first_ua,
            sum(hits) AS hits,
            uniqMerge(uniq_paths) AS uniq_paths,
            uniqMerge(uniq_query_params) AS uniq_query_params,
            sum(count_post) AS count_post,
            min(first_seen) AS first_seen,
            max(last_seen) AS last_seen,
            any(tcp_fp_raw) AS tcp_fingerprint,
            varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
            varPopMerge(total_ip_length_var) AS request_size_variance,
            any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
            IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
            any(http_ver_raw) AS http_version,
            any(tls_alpn_raw) AS tls_alpn,
            any(tls_sni_raw) AS tls_sni,
            max(correlated_raw) AS correlated,
            uniqMerge(unique_src_ports) AS unique_src_ports,
            uniqMerge(unique_conn_id) AS unique_conn_id,
            max(max_keepalives) AS max_keepalives,
            sum(orphan_count) AS orphan_count,
            sum(ip_id_zero_count) AS ip_id_zero_count,
            sum(mss_1460_count) AS mss_1460_count,
            sum(count_assets) AS count_assets,
            sum(count_no_referer) AS count_no_referer,
            uniqMerge(uniq_ua) AS unique_ua,
            varPopMerge(url_depth_variance) AS url_depth_variance,
            sum(count_anomalous_payload) AS count_anomalous_payload,
            -- B feature aggregates
            uniqMerge(uniq_ja3) AS uniq_ja3_val,
            avgMerge(avg_syn_ms) AS avg_syn_ms_val,
            sum(tls12_count) AS tls12_count,
            sum(count_head) AS count_head,
            sum(count_no_sec_fetch) AS count_no_sec_fetch,
            sum(count_generic_accept) AS count_generic_accept,
            sum(count_http10) AS count_http10,
            varPopMerge(ip_df_var) AS ip_df_variance
        FROM ja4_processing.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY window_start, src_ip, ja4, host, src_asn
    ),

    -- Finalised header fingerprint per (window_start, src_ip).
    hdr AS (
        SELECT
            window_start,
            src_ip,
            any(header_order_hash) AS header_order_hash,
            max(header_count) AS header_count,
            max(has_accept_language) AS has_accept_language,
            max(has_cookie) AS has_cookie,
            max(has_referer) AS has_referer,
            max(modern_browser_score) AS modern_browser_score,
            max(ua_ch_mismatch) AS ua_ch_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode,
            any(sec_fetch_dest) AS sec_fetch_dest
        FROM ja4_processing.agg_header_fingerprint_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY window_start, src_ip
    ),

    -- Feature engineering on top of the two aggregates.
    base_data AS (
        SELECT
            a.window_start,
            a.src_ip,
            a.ja4,
            a.host,

            -- Network origin and reputation
            toString(a.src_asn) AS asn_number,
            a.src_as_name AS asn_org,
            a.src_org AS asn_detail,
            a.src_domain AS asn_domain,
            a.src_country_code AS country_code,
            dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,
            -- Bot name: the IP-based match wins over the JA4-based match; '' when neither matches.
            COALESCE(
                nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
                nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
                ''
            ) AS bot_name,
            dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family,

            -- Raw pass-through values
            a.hits AS hits,
            sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
            a.correlated AS correlated,
            a.tcp_jitter_variance AS tcp_jitter_variance,
            a.true_window_size AS true_window_size,
            a.window_mss_ratio AS window_mss_ratio,
            a.max_keepalives AS max_keepalives,
            h.header_order_hash AS header_order_hash,
            h.header_count AS header_count,
            h.has_accept_language AS has_accept_language,
            h.has_cookie AS has_cookie,
            h.has_referer AS has_referer,
            h.modern_browser_score AS modern_browser_score,
            h.ua_ch_mismatch AS ua_ch_mismatch,

            -- Ratios: the "+ 1" in each denominator avoids division by zero.
            a.count_post / (a.hits + 1) AS post_ratio,
            a.uniq_query_params / (a.uniq_paths + 1) AS fuzzing_index,
            a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS hit_velocity,
            a.unique_src_ports / (a.hits + 1) AS port_exhaustion_ratio,
            a.orphan_count / (a.hits + 1) AS orphan_ratio,
            a.ip_id_zero_count / (a.hits + 1) AS ip_id_zero_ratio,
            a.hits / (a.unique_conn_id + 1) AS multiplexing_efficiency,
            IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
            a.request_size_variance AS request_size_variance,

            -- Protocol consistency flags
            -- NOTE(review): this compares http_version with '2' while the ingestion
            -- view compares the same source field with 'HTTP/1.0' - confirm the
            -- actual value format.
            IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
            IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
            IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
            IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,

            -- Population-level sharing signals
            count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
            count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
            a.count_assets / (a.hits + 1) AS asset_ratio,
            a.count_no_referer / (a.hits + 1) AS direct_access_ratio,
            IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
            uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
            (a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS src_port_density,
            sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn)
                / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_asn_concentration,
            sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code)
                / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_country_concentration,
            IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
            count() OVER (PARTITION BY h.header_order_hash, a.first_ua)
                / (count() OVER (PARTITION BY a.first_ua) + 1) AS header_order_confidence,
            uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
            a.uniq_paths / (a.hits + 1) AS path_diversity_ratio,
            a.url_depth_variance AS url_depth_variance,
            a.count_anomalous_payload / (a.hits + 1) AS anomalous_payload_ratio,

            -- B features: TLS/TCP (available only when correlated = 1)
            a.uniq_ja3_val AS uniq_ja3_per_row,
            sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,  -- B2
            a.tls12_count / (a.hits + 1) AS tls12_ratio,                                   -- B3

            -- B features: pure HTTP (available for correlated = 0 and 1)
            a.count_head / (a.hits + 1) AS head_ratio,                          -- B4
            a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,      -- B5
            a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,      -- B6
            a.count_http10 / (a.hits + 1) AS http10_ratio,                      -- B7
            a.ip_df_variance AS ip_df_variance                                  -- B8
        FROM agg AS a
        LEFT JOIN hdr AS h
            ON a.src_ip = h.src_ip
            AND a.window_start = h.window_start
    )
SELECT
    *,
    -- Entropy of the share of the IP's hits carried by each of its rows.
    -(
        sum(
            (hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)
        ) OVER (PARTITION BY src_ip)
    ) AS temporal_entropy,
    -- B1: JA3/JA4 diversity ratio per src_ip (signal: bots rotating JA3 over few JA4).
    sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
FROM base_data;
|
|
|
|
-- ============================================================================
|
|
-- VUES POUR LE DASHBOARD WEB
|
|
-- ============================================================================
|
|
|
|
-- Entity view: raw per-IP/JA4 data used for clustering and incidents.
-- Columns consumed: entity_type, entity_value, user_agents, client_headers, src_ip, ja4, log_date
-- Covers the last 7 days and skips requests with an empty User-Agent.
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_entities AS
SELECT
    'ip' AS entity_type,
    toString(src_ip) AS entity_value,
    src_ip,
    ja4,
    log_date,
    -- At most 100 distinct User-Agent strings per group.
    groupUniqArray(100)(header_user_agent) AS user_agents,
    -- One arbitrary client_headers sample, split on commas.
    splitByChar(',', any(client_headers)) AS client_headers
FROM ja4_logs.http_logs
WHERE header_user_agent != ''
  AND log_date >= today() - 7
GROUP BY
    src_ip,
    ja4,
    log_date;
|
|
|
|
-- User-Agents aggregated per hour for the fingerprints/metrics routes.
-- Columns consumed: src_ip, ja4, hour, log_date, user_agents (Array), requests
-- Covers the last 7 days and skips requests with an empty User-Agent.
CREATE OR REPLACE VIEW ja4_processing.view_dashboard_user_agents AS
SELECT
    src_ip,
    ja4,
    toStartOfHour(time) AS hour,
    log_date,
    -- At most 50 distinct User-Agent strings per group.
    groupUniqArray(50)(header_user_agent) AS user_agents,
    count() AS requests
FROM ja4_logs.http_logs
WHERE header_user_agent != ''
  AND log_date >= today() - 7
GROUP BY
    src_ip,
    ja4,
    hour,
    log_date;
|
|
|
|
-- ============================================================================
|
|
-- VUE IA 24H (fenêtre glissante journalière — détection d'attaquants lents)
|
|
-- Même structure que view_ai_features_1h mais agrège toutes les fenêtres horaires
|
|
-- en une seule ligne par (src_ip, ja4, host) sur 24h.
|
|
-- Utilisée par bot_detector en mode ENABLE_MULTIWINDOW=true.
|
|
-- ============================================================================
|
|
-- 24-hour ML feature view: same feature set as view_ai_features_1h, but all
-- hourly windows of the last 24 hours are merged into a single row per
-- (src_ip, ja4, host, src_asn). window_start is the start of the day of the
-- group's first_seen, and header fingerprints are joined on src_ip only.
CREATE OR REPLACE VIEW ja4_processing.view_ai_features_24h AS
WITH
    -- Behavioural aggregates merged across every hourly window in range.
    agg AS (
        SELECT
            src_ip,
            ja4,
            host,
            src_asn,
            any(src_country_code) AS src_country_code,
            any(src_as_name) AS src_as_name,
            any(src_org) AS src_org,
            any(src_domain) AS src_domain,
            any(first_ua) AS first_ua,
            sum(hits) AS hits,
            uniqMerge(uniq_paths) AS uniq_paths,
            uniqMerge(uniq_query_params) AS uniq_query_params,
            sum(count_post) AS count_post,
            min(first_seen) AS first_seen,
            max(last_seen) AS last_seen,
            any(tcp_fp_raw) AS tcp_fingerprint,
            varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
            varPopMerge(total_ip_length_var) AS request_size_variance,
            any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
            IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
            any(http_ver_raw) AS http_version,
            any(tls_alpn_raw) AS tls_alpn,
            any(tls_sni_raw) AS tls_sni,
            max(correlated_raw) AS correlated,
            uniqMerge(unique_src_ports) AS unique_src_ports,
            uniqMerge(unique_conn_id) AS unique_conn_id,
            max(max_keepalives) AS max_keepalives,
            sum(orphan_count) AS orphan_count,
            sum(ip_id_zero_count) AS ip_id_zero_count,
            sum(mss_1460_count) AS mss_1460_count,
            sum(count_assets) AS count_assets,
            sum(count_no_referer) AS count_no_referer,
            uniqMerge(uniq_ua) AS unique_ua,
            varPopMerge(url_depth_variance) AS url_depth_variance,
            sum(count_anomalous_payload) AS count_anomalous_payload,
            -- B feature aggregates
            uniqMerge(uniq_ja3) AS uniq_ja3_val,
            avgMerge(avg_syn_ms) AS avg_syn_ms_val,
            sum(tls12_count) AS tls12_count,
            sum(count_head) AS count_head,
            sum(count_no_sec_fetch) AS count_no_sec_fetch,
            sum(count_generic_accept) AS count_generic_accept,
            sum(count_http10) AS count_http10,
            varPopMerge(ip_df_var) AS ip_df_variance
        FROM ja4_processing.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY src_ip, ja4, host, src_asn
    ),

    -- Header fingerprint merged per src_ip across every hourly window in range.
    hdr AS (
        SELECT
            src_ip,
            any(header_order_hash) AS header_order_hash,
            max(header_count) AS header_count,
            max(has_accept_language) AS has_accept_language,
            max(has_cookie) AS has_cookie,
            max(has_referer) AS has_referer,
            max(modern_browser_score) AS modern_browser_score,
            max(ua_ch_mismatch) AS ua_ch_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode,
            any(sec_fetch_dest) AS sec_fetch_dest
        FROM ja4_processing.agg_header_fingerprint_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY src_ip
    ),

    -- Feature engineering on top of the two aggregates.
    base_data AS (
        SELECT
            toStartOfDay(a.first_seen) AS window_start,
            a.src_ip,
            a.ja4,
            a.host,

            -- Network origin and reputation
            toString(a.src_asn) AS asn_number,
            a.src_as_name AS asn_org,
            a.src_org AS asn_detail,
            a.src_domain AS asn_domain,
            a.src_country_code AS country_code,
            dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,
            -- Bot name: the IP-based match wins over the JA4-based match; '' when neither matches.
            COALESCE(
                nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
                nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
                ''
            ) AS bot_name,
            dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family,

            -- Raw pass-through values
            a.hits AS hits,
            sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
            a.correlated AS correlated,
            a.tcp_jitter_variance AS tcp_jitter_variance,
            a.true_window_size AS true_window_size,
            a.window_mss_ratio AS window_mss_ratio,
            a.max_keepalives AS max_keepalives,
            h.header_order_hash AS header_order_hash,
            h.header_count AS header_count,
            h.has_accept_language AS has_accept_language,
            h.has_cookie AS has_cookie,
            h.has_referer AS has_referer,
            h.modern_browser_score AS modern_browser_score,
            h.ua_ch_mismatch AS ua_ch_mismatch,

            -- Ratios: the "+ 1" in each denominator avoids division by zero.
            a.count_post / (a.hits + 1) AS post_ratio,
            a.uniq_query_params / (a.uniq_paths + 1) AS fuzzing_index,
            a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS hit_velocity,
            a.unique_src_ports / (a.hits + 1) AS port_exhaustion_ratio,
            a.orphan_count / (a.hits + 1) AS orphan_ratio,
            a.ip_id_zero_count / (a.hits + 1) AS ip_id_zero_ratio,
            a.hits / (a.unique_conn_id + 1) AS multiplexing_efficiency,
            IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
            a.request_size_variance AS request_size_variance,

            -- Protocol consistency flags
            -- NOTE(review): this compares http_version with '2' while the ingestion
            -- view compares the same source field with 'HTTP/1.0' - confirm the
            -- actual value format.
            IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
            IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
            IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
            IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,

            -- Population-level sharing signals
            count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
            count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
            a.count_assets / (a.hits + 1) AS asset_ratio,
            a.count_no_referer / (a.hits + 1) AS direct_access_ratio,
            IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
            uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
            (a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS src_port_density,
            sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn)
                / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_asn_concentration,
            sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code)
                / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_country_concentration,
            IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
            count() OVER (PARTITION BY h.header_order_hash, a.first_ua)
                / (count() OVER (PARTITION BY a.first_ua) + 1) AS header_order_confidence,
            uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
            a.uniq_paths / (a.hits + 1) AS path_diversity_ratio,
            a.url_depth_variance AS url_depth_variance,
            a.count_anomalous_payload / (a.hits + 1) AS anomalous_payload_ratio,

            -- B features
            a.uniq_ja3_val AS uniq_ja3_per_row,
            sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
            a.tls12_count / (a.hits + 1) AS tls12_ratio,
            a.count_head / (a.hits + 1) AS head_ratio,
            a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
            a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
            a.count_http10 / (a.hits + 1) AS http10_ratio,
            a.ip_df_variance AS ip_df_variance
        FROM agg AS a
        LEFT JOIN hdr AS h
            ON a.src_ip = h.src_ip
    )
SELECT
    *,
    -- Entropy of the share of the IP's hits carried by each of its rows.
    -(
        sum(
            (hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)
        ) OVER (PARTITION BY src_ip)
    ) AS temporal_entropy,
    -- JA3/JA4 diversity ratio per src_ip.
    sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
FROM base_data;
|