Étape 2 — Fingerprinting HTTP/2 dans le pipeline ML : - Ajout du dictionnaire dict_browser_h2 (11 familles de navigateurs) dans 05_aggregation_tables.sql - Ajout du CTE h2_agg et 4 features HTTP/2 dans 07_ai_features_view.sql : h2_settings_known, h2_pseudo_order_match, h2_ja4_coherence, h2_settings_rare - Calcul du fingerprint_coherence_score (5 axes pondérés) dans la vue - Ajout du 6e axe axis_h2_coherence dans browser.py (poids rééquilibrés) - browser_h2.csv : 11 fingerprints Akamai → famille navigateur Étape 3 — Pré-filtre de cohérence sur la baseline humaine : - pipeline.py exclut les sessions avec fingerprint_coherence_score < seuil de la baseline d'entraînement - FINGERPRINT_COHERENCE_THRESHOLD configurable via env (défaut 0.25) - Log des sessions exclues pour analyse SOC Étape 4 — Détection de drift améliorée : - scoring.py : passage de 5 à 9 quantiles (p5…p95) - Ajout de la divergence KL en complément du test KS - Détection de drift adversarial (≥80% des features dérivent dans la même direction) - Split temporel strict pour la validation Étape 5 — Graphe bipartite JA4×ASN (§5.2) : - fleet.py : détection de flottes via NetworkX + Louvain (imports optionnels) - enrich_with_fleet_score() : ajout fleet_score + fleet_campaign_flag au DataFrame - cycle.py : appel après preprocess_df avec log du nombre de sessions en flotte - SQL migration 05_fleet_metrics_tables.sql : table fleet_detections (TTL 7j) - Dashboard : /fleet + /api/fleet (communautés détectées) + template fleet.html Étape 6 — Cross-domain Jaccard §5.8 : - 12_thesis_features.sql : CTE jaccard_paths → cross_domain_path_similarity - Signal : même chemins (/admin, /wp-login) sur plusieurs hosts = scanner Étape 7 — ExIFFI + erreurs AE par feature : - scoring.py : compute_exiffi_importance() par permutation, compute_ae_feature_errors() - pipeline.py : calcul ExIFFI sur X_test, mapping index → dict pour anomalies - build_reason() enrichi avec exiffi_top quand SHAP inactif Étape 8 — Méta-learner pour la 
pondération de l'ensemble : - scoring.py : classe MetaLearner (LogisticRegression, fallback poids fixes <1000 labels) - Collecte des labels depuis le cycle courant (known_bots, légitimes, Anubis) - pipeline.py : remplacement des poids fixes par MetaLearner.predict() Étape 9 — Métriques de performance et monitoring : - metrics.py : record_cycle_metrics() — taux anomalie, drift, corrélation, latence - SQL migration 05_fleet_metrics_tables.sql : table ml_performance_metrics (TTL 90j) - Dashboard : /health + /api/health + template health.html - cycle.py : appel record_cycle_metrics en fin de cycle (Complet + Applicatif) Tests : 36/36 bot-detector tests passent Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
226 lines
13 KiB
SQL
226 lines
13 KiB
SQL
-- =============================================================================
-- 07_ai_features_view.sql — AI feature view with full Anubis enrichment
-- Source: bot_detector/anubis/view_ai_features_anubis.sql
-- Includes combined UA+IP priority logic and Anubis bot_name/action/category.
-- §2: HTTP/2 features (dict_browser_h2, H2↔JA4 coherence, pseudo-headers)
-- §3: Cross-layer fingerprint coherence score
-- =============================================================================

CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS
WITH

-- §2 — HTTP/2 fingerprint roll-up: one row per (hour bucket, source IP).
-- Reads ja4_logs.http_logs directly for the H2 columns added in step 1.
-- For each group, anyIf keeps one non-empty fingerprint and one non-empty
-- pseudo-header order among the rows seen during the last 24 hours.
h2_agg AS (
    SELECT
        toStartOfHour(time)                            AS window_start,
        toIPv6(src_ip)                                 AS src_ip,
        anyIf(h2_fingerprint, h2_fingerprint != '')    AS h2_fp,
        anyIf(h2_pseudo_order, h2_pseudo_order != '')  AS h2_pseudo_ord
    FROM ja4_logs.http_logs
    WHERE
        -- Skip rows that carry no HTTP/2 signal at all.
        NOT (h2_fingerprint = '' AND h2_pseudo_order = '')
        AND time >= now() - INTERVAL 24 HOUR
    GROUP BY
        window_start,
        src_ip
),

-- base_data: one feature row per (window_start, src_ip, ja4, host, src_asn)
-- over the last 24 hours. Built from the hourly aggregate
-- ja4_processing.agg_host_ip_ja4_1h (alias a), left-joined on
-- (src_ip, window_start) with the header aggregate
-- ja4_processing.agg_header_fingerprint_1h (alias h) and with h2_agg (alias h2).
-- Column order is significant: the final SELECT of the view exposes it via `*`.
base_data AS (
    SELECT
        -- Identity and network context
        a.window_start, a.src_ip, a.ja4, a.host,
        toString(a.src_asn) AS asn_number,
        a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain,
        a.src_country_code AS country_code,
        dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,

        -- Known-bot name: the IP dictionary wins over the JA4 dictionary; '' when neither matches.
        COALESCE(
            nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
            ''
        ) AS bot_name,
        dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family,

        -- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed)
        COALESCE(
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''),
            ''
        ) AS anubis_bot_name,
        COALESCE(
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''),
            ''
        ) AS anubis_bot_action,
        COALESCE(
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''),
            ''
        ) AS anubis_bot_category,

        -- Volume and TCP-level signals
        a.hits AS hits,
        sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
        a.correlated AS correlated,
        a.tcp_jitter_variance AS tcp_jitter_variance,
        a.true_window_size AS true_window_size,
        a.window_mss_ratio AS window_mss_ratio,
        a.max_keepalives AS max_keepalives,

        -- Header fingerprint signals (from h)
        h.header_order_hash AS header_order_hash, h.header_count AS header_count,
        h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie,
        h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score,
        h.has_sec_ch_ua AS has_sec_ch_ua,
        h.ua_ch_mismatch AS ua_ch_mismatch,

        -- Ratios use a "+ 1" denominator so that a zero count never divides by zero.
        (a.count_post / (a.hits + 1)) AS post_ratio,
        (a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index,
        (a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity,
        (a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio,
        (a.orphan_count / (a.hits + 1)) AS orphan_ratio,
        (a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio,
        (a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency,
        IF(a.mss_1460_count > (a.hits * 0.8) AND h.has_sec_ch_ua > 0, 1, 0) AS mss_mobile_mismatch,
        a.request_size_variance AS request_size_variance,

        -- Cross-layer mismatch flags (TLS vs HTTP)
        IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
        IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
        IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
        IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,

        -- Sharing / concentration signals; the window functions span the whole 24h result set.
        count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
        count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
        (a.count_assets / (a.hits + 1)) AS asset_ratio,
        (a.count_no_referer / (a.hits + 1)) AS direct_access_ratio,
        IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
        uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
        ((a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS src_port_density,
        (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration,
        (sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration,
        IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
        (count() OVER (PARTITION BY h.header_order_hash, a.ja4) / (count() OVER (PARTITION BY a.ja4) + 1)) AS header_order_confidence,
        uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
        (a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio,
        a.url_depth_variance AS url_depth_variance,
        (a.count_anomalous_payload / (a.hits + 1)) AS anomalous_payload_ratio,
        a.uniq_ja3_val AS uniq_ja3_per_row,
        sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
        a.tls12_count / (a.hits + 1) AS tls12_ratio,
        a.count_head / (a.hits + 1) AS head_ratio,
        a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
        a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
        a.count_http10 / (a.hits + 1) AS http10_ratio,
        a.ip_df_variance AS ip_df_variance,
        a.avg_ttl_val AS avg_ttl,
        sqrt(a.ttl_variance_val) AS ttl_std,
        IF(a.count_correlated_val > 0, a.count_no_wscale_val / a.count_correlated_val, 0) AS no_window_scale_ratio,
        a.count_no_accept_enc_val / (a.hits + 1) AS missing_accept_enc_ratio,
        a.count_http_scheme_val / (a.hits + 1) AS http_scheme_ratio,

        -- P1: new detection features
        IF(a.count_xff_val > 0, 1, 0) AS has_xff,
        a.count_unusual_ct_val / greatest(a.count_post, 1) AS unusual_content_type_ratio,
        a.count_non_std_port_val / (a.hits + 1) AS non_standard_port_ratio,
        a.count_login_post_val / greatest(a.count_post, 1) AS login_post_concentration,
        h.sec_ch_mobile_mismatch AS sec_ch_mobile_mismatch,

        -- §2 — HTTP/2 features (SETTINGS fingerprint, H2↔JA4 coherence, pseudo-headers)

        -- h2_settings_known: 1 when a non-empty H2 fingerprint resolves to a
        -- browser family in dict_browser_h2.
        IF(
            COALESCE(h2.h2_fp, '') != '' AND
            dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
                             tuple(COALESCE(h2.h2_fp, '')), '') != '',
            1, 0
        ) AS h2_settings_known,

        -- h2_pseudo_order_match: 1 when the observed pseudo-header order is the
        -- one expected for the browser family resolved from JA4; 0 when no order
        -- was captured or when it differs.
        -- Orders use the Akamai notation (m=:method, a=:authority, s=:scheme,
        -- p=:path): Chromium-based browsers m,a,s,p; Safari m,s,p,a; Firefox
        -- m,p,a,s. The order m,p,s,a is intentionally NOT accepted for Firefox:
        -- it is the order commonly reported for nghttp2/curl clients, so accepting
        -- it would score a curl client spoofing a Firefox JA4 as coherent.
        -- NOTE(review): confirm these orders against the fingerprints shipped in
        -- browser_h2.csv; newer Safari releases may require an additional order.
        CASE
            WHEN COALESCE(h2.h2_pseudo_ord, '') = '' THEN 0
            WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
                                  tuple(a.ja4), '') IN ('Chromium', 'Chrome', 'Edge')
                 AND h2.h2_pseudo_ord = 'm,a,s,p' THEN 1
            WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
                                  tuple(a.ja4), '') = 'Safari'
                 AND h2.h2_pseudo_ord = 'm,s,p,a' THEN 1
            WHEN dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
                                  tuple(a.ja4), '') = 'Firefox'
                 AND h2.h2_pseudo_ord = 'm,p,a,s' THEN 1
            ELSE 0
        END AS h2_pseudo_order_match,

        -- h2_ja4_coherence: 1 when the browser family resolved from the H2
        -- fingerprint equals the (non-empty) family resolved from JA4.
        IF(
            COALESCE(h2.h2_fp, '') != '' AND
            dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
                             tuple(COALESCE(h2.h2_fp, '')), '') =
            dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
                             tuple(a.ja4), '') AND
            dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family',
                             tuple(a.ja4), '') != '',
            1, 0
        ) AS h2_ja4_coherence,

        -- h2_settings_rare: 1 when an H2 fingerprint is present but unknown to
        -- dict_browser_h2 (potentially suspect).
        IF(
            COALESCE(h2.h2_fp, '') != '' AND
            dictGetOrDefault('ja4_processing.dict_browser_h2', 'browser_family',
                             tuple(COALESCE(h2.h2_fp, '')), '') = '',
            1, 0
        ) AS h2_settings_rare

    FROM (
        -- a: finalises the aggregate states of agg_host_ip_ja4_1h for the last 24h.
        SELECT
            window_start, src_ip, ja4, host, src_asn,
            any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name,
            any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua,
            sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths,
            uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post,
            min(first_seen) AS first_seen, max(last_seen) AS last_seen,
            any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
            varPopMerge(total_ip_length_var) AS request_size_variance,
            any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
            IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
            any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni,
            max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports,
            uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives,
            sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count,
            sum(mss_1460_count) AS mss_1460_count,
            sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer,
            uniqMerge(uniq_ua) AS unique_ua,
            varPopMerge(url_depth_variance) AS url_depth_variance,
            sum(count_anomalous_payload) AS count_anomalous_payload,
            uniqMerge(uniq_ja3) AS uniq_ja3_val,
            avgMerge(avg_syn_ms) AS avg_syn_ms_val,
            sum(tls12_count) AS tls12_count,
            sum(count_head) AS count_head,
            sum(count_no_sec_fetch) AS count_no_sec_fetch,
            sum(count_generic_accept) AS count_generic_accept,
            sum(count_http10) AS count_http10,
            varPopMerge(ip_df_var) AS ip_df_variance,
            avgIfMerge(avg_ttl) AS avg_ttl_val,
            varPopIfMerge(ttl_var) AS ttl_variance_val,
            sum(count_no_wscale) AS count_no_wscale_val,
            sum(count_correlated) AS count_correlated_val,
            sum(count_no_accept_enc) AS count_no_accept_enc_val,
            sum(count_http_scheme) AS count_http_scheme_val,
            -- P1: new detection features
            sum(count_xff) AS count_xff_val,
            sum(count_unusual_ct) AS count_unusual_ct_val,
            sum(count_non_std_port) AS count_non_std_port_val,
            sum(count_login_post) AS count_login_post_val
        FROM ja4_processing.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY window_start, src_ip, ja4, host, src_asn
    ) a
    LEFT JOIN (
        -- h: header fingerprint signals per (window_start, src_ip) for the last 24h.
        SELECT
            window_start, src_ip, any(header_order_hash) AS header_order_hash,
            max(header_count) AS header_count, max(has_accept_language) AS has_accept_language,
            max(has_cookie) AS has_cookie, max(has_referer) AS has_referer,
            max(modern_browser_score) AS modern_browser_score, max(has_sec_ch_ua) AS has_sec_ch_ua,
            max(ua_ch_mismatch) AS ua_ch_mismatch,
            max(sec_ch_mobile_mismatch) AS sec_ch_mobile_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest
        FROM ja4_processing.agg_header_fingerprint_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY window_start, src_ip
    ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start
    LEFT JOIN h2_agg h2 ON h2.src_ip = a.src_ip AND h2.window_start = a.window_start
)

-- Final projection: every base_data column plus three derived features.
SELECT
    *,

    -- temporal_entropy: negated sum, over all rows of the same src_ip, of
    -- p * log2(p + 1e-6) where p = hits / (total_ip_hits + 1). The small
    -- epsilon keeps log2 defined when p is 0.
    -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy,

    -- ja3_diversity_ratio: per-IP sum of the per-row JA3 cardinalities divided
    -- by the number of distinct JA4 values of that IP (floored at 1).
    sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio,

    -- §3 — Cross-layer fingerprint coherence score in [0.0, 1.0].
    -- Weighted sum (weights total 1.00) of: known browser family (0.25),
    -- H2↔JA4 coherence (0.20), no ALPN/HTTP mismatch (0.15), no SNI/Host
    -- mismatch (0.10), Accept-Language present (0.15), no UA/CH mismatch (0.15).
    -- has_accept_language and ua_ch_mismatch come from max() aggregates, so they
    -- are reduced to 0/1 with "> 0" before weighting; this keeps the score inside
    -- the documented range even if an upstream value is greater than 1.
    -- NOTE(review): the other inputs are 0/1 IF() flags computed in base_data.
    toFloat32(
          CASE WHEN browser_family != '' THEN 0.25 ELSE 0.0 END
        + COALESCE(h2_ja4_coherence, 0) * 0.20
        + (1 - COALESCE(alpn_http_mismatch, 0)) * 0.15
        + (1 - COALESCE(sni_host_mismatch, 0)) * 0.10
        + IF(COALESCE(has_accept_language, 0) > 0, 1, 0) * 0.15
        + IF(COALESCE(ua_ch_mismatch, 0) > 0, 0, 1) * 0.15
    ) AS fingerprint_coherence_score
FROM base_data;