feat(clustering): ajout features headers HTTP (Accept-Language, Accept-Encoding, Sec-Fetch, nb headers)
- SQL : LEFT JOIN sur view_dashboard_entities pour extraire la présence des headers Accept-Encoding et Sec-Fetch-*, ainsi que le nombre de headers par src_ip/ja4 (via une sous-requête)
- SQL : ajout de avg(ml.has_accept_language) depuis ml_detected_anomalies
- FEATURES : 23 → 27 dimensions
  [23] Accept-Language (0 = absent = bot-like)
  [24] Accept-Encoding (0 = absent = bot-like)
  [25] Sec-Fetch-* (1 = vrai navigateur)
  [26] Nb Headers (normalisé /20 ; 3 = bot, 15 = browser)
- risk_score_from_centroid() : poids réajustés (somme = 1.0), 4 nouveaux termes liés aux headers : absence Accept-Language ×0.05, absence Accept-Encoding ×0.05, absence Sec-Fetch ×0.04, peu de headers ×0.04
- name_cluster() : nouveaux labels 'Bot UA simulé' (ua_ch mismatch + sec_fetch absent), 'Scanner pur (no headers)', 'Navigateur légitime' et 'Headless (no Sec-Fetch)'
- Fix : %% dans les LIKE ClickHouse (échappement des paramètres Python %-format)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -89,11 +89,29 @@ SELECT
|
||||
|
||||
max(ml.threat_level) AS threat,
|
||||
any(ml.country_code) AS country,
|
||||
any(ml.asn_org) AS asn_org
|
||||
any(ml.asn_org) AS asn_org,
|
||||
|
||||
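-- HTTP header features: Accept-Language comes from ml_detected_anomalies (ml); encoding, Sec-Fetch and header count come from the view_dashboard_entities subquery (vh)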
|
||||
avg(ml.has_accept_language) AS hdr_accept_lang,
|
||||
any(vh.hdr_enc) AS hdr_has_encoding,
|
||||
any(vh.hdr_sec_fetch) AS hdr_has_sec_fetch,
|
||||
any(vh.hdr_count) AS hdr_count_raw
|
||||
FROM mabase_prod.agg_host_ip_ja4_1h t
|
||||
LEFT JOIN mabase_prod.ml_detected_anomalies ml
|
||||
ON t.src_ip = ml.src_ip AND t.ja4 = ml.ja4
|
||||
AND ml.detected_at >= now() - INTERVAL %(hours)s HOUR
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
toIPv6(concat('::ffff:', toString(src_ip))) AS src_ip_v6,
|
||||
ja4,
|
||||
any(arrayExists(x -> x LIKE '%%Accept-Encoding%%', client_headers)) AS hdr_enc,
|
||||
any(arrayExists(x -> x LIKE '%%Sec-Fetch%%', client_headers)) AS hdr_sec_fetch,
|
||||
any(length(splitByChar(',', client_headers[1]))) AS hdr_count
|
||||
FROM mabase_prod.view_dashboard_entities
|
||||
WHERE length(client_headers) > 0
|
||||
AND log_date >= today() - 2
|
||||
GROUP BY src_ip_v6, ja4
|
||||
) vh ON t.src_ip = vh.src_ip_v6 AND t.ja4 = vh.ja4
|
||||
WHERE t.window_start >= now() - INTERVAL %(hours)s HOUR
|
||||
AND t.tcp_ttl_raw > 0
|
||||
GROUP BY t.src_ip, t.ja4
|
||||
@ -105,6 +123,7 @@ _SQL_COLS = [
|
||||
"ip_id_zero", "entropy", "browser_score", "alpn_mismatch", "alpn_missing",
|
||||
"h2_eff", "hdr_conf", "ua_ch_mismatch", "asset_ratio", "direct_ratio",
|
||||
"ja4_count", "ua_rotating", "threat", "country", "asn_org",
|
||||
"hdr_accept_lang", "hdr_has_encoding", "hdr_has_sec_fetch", "hdr_count_raw",
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user