From 8180f4af0459c2f07f7a442959f2cb5c1e3cda3b Mon Sep 17 00:00:00 2001 From: toto Date: Thu, 9 Apr 2026 15:25:33 +0200 Subject: [PATCH] refactor(anubis): simplify to IP/CIDR + ASN only, remove UA and Country rules - Remove UA regex extraction (extract_ua_regex, _extract_ua_from_all/any) - Remove Country rule collection from parse_bot_policies_inline - Simplify fetch_rules.py: collect_all_rules returns (ip_rules, asn_rules) - Remove insert_ua_rules and insert_country_rules functions - reload_dicts now only reloads dict_anubis_ip + dict_anubis_asn - Simplify CASE blocks in 04_mv_http_logs.sql, 07_ai_features_view.sql, view_ai_features_anubis.sql, mv_http_logs.sql: IP > ASN (was 5-level UA+IP > UA > IP > ASN > Country cascade) - Remove dict_anubis_country + dict_anubis_ua from 03_anubis_tables.sql (UA table kept as stub for REGEXP_TREE catch-all compatibility) - Remove anubis_country_rules table from schema - Remove Anubis UA and Country tabs from dashboard reflists page - Remove anubis_ua_rules/country_rules from API reflist queries - deploy_schema.sql simplified from 339 to 122 lines - 764 lines removed across 9 files Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bot-detector/anubis/deploy_schema.sql | 267 ++---------------- services/bot-detector/anubis/fetch_rules.py | 225 ++------------- services/bot-detector/anubis/mv_http_logs.sql | 75 ++--- .../anubis/view_ai_features_anubis.sql | 74 ++--- services/dashboard/backend/routes/api.py | 14 - .../dashboard/backend/templates/reflists.html | 8 +- shared/clickhouse/03_anubis_tables.sql | 86 ++---- shared/clickhouse/04_mv_http_logs.sql | 75 ++--- shared/clickhouse/07_ai_features_view.sql | 74 ++--- 9 files changed, 136 insertions(+), 762 deletions(-) diff --git a/services/bot-detector/anubis/deploy_schema.sql b/services/bot-detector/anubis/deploy_schema.sql index c89320b..ddd52ed 100644 --- a/services/bot-detector/anubis/deploy_schema.sql +++ b/services/bot-detector/anubis/deploy_schema.sql @@ -1,30 +1,19 @@ -- ============================================================================ -- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML --- Architecture : --- anubis_ua_rules (table) → dict_anubis_ua (REGEXP_TREE) +-- Architecture simplifiée (IP/CIDR et ASN uniquement) : +-- anubis_ua_rules (table stub) → dict_anubis_ua (REGEXP_TREE, catch-all) -- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE) --- http_logs : +anubis_bot_name, +anubis_bot_action --- mv_http_logs : reconstruit avec enrichissement Anubis +-- anubis_asn_rules (table) → dict_anubis_asn (FLAT) +-- http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category +-- mv_http_logs : enrichissement Anubis (IP > ASN) -- view_ai_features_1h : +anubis_bot_name, +anubis_bot_action (via dictGet) --- ml_detected_anomalies : +anubis_bot_name, +anubis_bot_action --- ml_all_scores : +anubis_bot_name, +anubis_bot_action +-- ml_detected_anomalies / ml_all_scores : colonnes Anubis -- ============================================================================ -- ---------------------------------------------------------------------------- --- 1. TABLE SOURCE — règles User-Agent (pour dictionnaire REGEXP_TREE) --- --- Format attendu par ClickHouse regexp_tree (v23.5+) : --- id UInt64 : identifiant unique --- parent_id UInt64 : 0 = racine, sinon id du parent (héritage d'attributs) --- regexp String : expression régulière (re2/vectorscan) --- keys Array(String) : noms des attributs, ex. ['bot_name', 'action'] --- values Array(String) : valeurs correspondantes --- --- Hiérarchie utilisée pour la priorité : --- Règles génériques DENY (parent_id=0) → enfants ALLOW spécifiques --- Exemple : ai-crawlers-training (parent) → openai-gptbot (enfant) --- Quand l'UA correspond à enfant ET parent, c'est le nom de l'enfant qui --- est retourné (l'enfant hérite ET surcharge les attributs du parent). +-- 1. TABLE SOURCE — règles User-Agent (stub REGEXP_TREE) +-- REGEXP_TREE nécessite ≥1 règle ; le catch-all est injecté à l'init. +-- Cette table n'est PAS peuplée par fetch_rules.py. -- ---------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules ( @@ -39,12 +28,7 @@ ORDER BY id; -- ---------------------------------------------------------------------------- -- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE) --- --- Colonnes requises par dict_anubis_ip et mv_http_logs : --- rule_id : identifiant de règle, croisé avec dict_anubis_ua pour --- la logique UA+IP (même rule_id → match combiné) --- has_ua : 1 si la règle possède aussi une regex UA (croisement nécessaire) --- category : catégorie Anubis (bots, crawlers, clients, policies…) +-- Peuplée par fetch_rules.py depuis les fichiers YAML Anubis. -- ---------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules ( @@ -59,29 +43,21 @@ ENGINE = ReplacingMergeTree() ORDER BY prefix; -- ---------------------------------------------------------------------------- --- 3. DICTIONNAIRE UA — REGEXP_TREE --- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent) --- --- Le PRIMARY KEY est 'regexp' (String) — requis par ClickHouse 26.x. --- Connexion interne (HOST localhost PORT 9000) pour éviter deadlock HTTP. --- Remplacer 'admin' et le mot de passe par les credentials ClickHouse. +-- 3. TABLE SOURCE — règles ASN (pour dictionnaire Flat) +-- Peuplée par fetch_rules.py depuis botPolicies.yaml. -- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua; -CREATE DICTIONARY ja4_processing.dict_anubis_ua +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules ( - regexp String, - bot_name String, - action String + asn UInt32, + bot_name LowCardinality(String), + action LowCardinality(String), + category LowCardinality(String) ) -PRIMARY KEY regexp -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules')) -LAYOUT(REGEXP_TREE) -LIFETIME(MIN 300 MAX 600); +ENGINE = ReplacingMergeTree() +ORDER BY asn; -- ---------------------------------------------------------------------------- --- 4. DICTIONNAIRE IP — IP_TRIE --- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') --- Connexion interne (HOST localhost PORT 9000) — même raison que dict_anubis_ua. +-- 4. DICTIONNAIRE IP — IP_TRIE (actif) -- ---------------------------------------------------------------------------- DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip; CREATE DICTIONARY ja4_processing.dict_anubis_ip @@ -99,36 +75,7 @@ LAYOUT(IP_TRIE()) LIFETIME(MIN 300 MAX 600); -- ---------------------------------------------------------------------------- --- 5. TABLE SOURCE — règles ASN (pour dictionnaire Flat) --- Alimentée par botPolicies.yaml via fetch_rules.py → insert_asn_rules() --- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules -( - asn UInt32, - bot_name LowCardinality(String), - action LowCardinality(String), - category LowCardinality(String) -) -ENGINE = ReplacingMergeTree() -ORDER BY asn; - --- ---------------------------------------------------------------------------- --- 6. TABLE SOURCE — règles pays ISO-3166 (pour dictionnaire Flat) --- Alimentée par botPolicies.yaml via fetch_rules.py → insert_country_rules() --- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules -( - country_code LowCardinality(String), - bot_name LowCardinality(String), - action LowCardinality(String), - category LowCardinality(String) -) -ENGINE = ReplacingMergeTree() -ORDER BY country_code; - --- ---------------------------------------------------------------------------- --- 7. DICTIONNAIRE ASN — Flat --- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '') +-- 5. DICTIONNAIRE ASN — Flat (actif) -- ---------------------------------------------------------------------------- DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn; CREATE DICTIONARY ja4_processing.dict_anubis_asn @@ -144,176 +91,19 @@ LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); -- ---------------------------------------------------------------------------- --- 8. DICTIONNAIRE PAYS — Flat --- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '') --- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country; -CREATE DICTIONARY ja4_processing.dict_anubis_country -( - country_code String, - bot_name String, - action String, - category String -) -PRIMARY KEY country_code -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules')) -LAYOUT(FLAT()) -LIFETIME(MIN 300 MAX 600); - --- ---------------------------------------------------------------------------- --- 9. AJOUT DES COLONNES ANUBIS dans http_logs --- Idempotent : ne plante pas si déjà présentes +-- 6. AJOUT DES COLONNES ANUBIS dans http_logs (idempotent) -- ---------------------------------------------------------------------------- ALTER TABLE ja4_logs.http_logs ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; --- ---------------------------------------------------------------------------- --- 10. RECONSTRUCTION DE mv_http_logs avec enrichissement Anubis --- Logique de priorisation : --- 1. UA regex (plus informatif — identifie le bot précis) --- 2. IP/CIDR (fallback — identifie le réseau cloud) --- ---------------------------------------------------------------------------- -DROP VIEW IF EXISTS ja4_logs.mv_http_logs; - -CREATE MATERIALIZED VIEW ja4_logs.mv_http_logs -TO ja4_logs.http_logs -( - `time` DateTime, - `log_date` Date, - `src_ip` IPv4, - `src_port` UInt16, - `src_asn` UInt32, - `src_country_code` String, - `dst_ip` IPv4, - `dst_port` UInt16, - `src_as_name` String, - `src_org` String, - `src_domain` String, - `method` String, - `scheme` String, - `host` String, - `path` String, - `query` String, - `http_version` String, - `orphan_side` String, - `correlated` UInt8, - `keepalives` UInt16, - `a_timestamp` UInt64, - `b_timestamp` UInt64, - `conn_id` String, - `ip_meta_df` UInt8, - `ip_meta_id` UInt16, - `ip_meta_total_length` UInt16, - `ip_meta_ttl` UInt8, - `tcp_meta_options` String, - `tcp_meta_window_size` UInt32, - `tcp_meta_mss` UInt16, - `tcp_meta_window_scale` UInt8, - `syn_to_clienthello_ms` Int32, - `tls_version` String, - `tls_sni` String, - `tls_alpn` String, - `ja3` String, - `ja3_hash` String, - `ja4` String, - `client_headers` String, - `header_user_agent` String, - `header_accept` String, - `header_accept_encoding` String, - `header_accept_language` String, - `header_content_type` String, - `header_x_request_id` String, - `header_x_trace_id` String, - `header_x_forwarded_for` String, - `header_sec_ch_ua` String, - `header_sec_ch_ua_mobile` String, - `header_sec_ch_ua_platform` String, - `header_sec_fetch_dest` String, - `header_sec_fetch_mode` String, - `header_sec_fetch_site` String, - `anubis_bot_name` String, - `anubis_bot_action` String -) -AS SELECT - parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time, - toDate(time) AS log_date, - toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0')) AS src_ip, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port, - dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'asn', toIPv6(src_ip), toUInt32(0)) AS src_asn, - dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', toIPv6(src_ip), '') AS src_country_code, - toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, - dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'name', toIPv6(src_ip), '') AS src_as_name, - '' AS src_org, - '' AS src_domain, - coalesce(JSONExtractString(raw_json, 'method'), '') AS method, - coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, - coalesce(JSONExtractString(raw_json, 'host'), '') AS host, - coalesce(JSONExtractString(raw_json, 'path'), '') AS path, - coalesce(JSONExtractString(raw_json, 'query'), '') AS query, - coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version, - coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side, - toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives, - coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp, - coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp, - coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id, - toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0)) AS ip_meta_id, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0)) AS ip_meta_total_length, - toUInt8(coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0)) AS ip_meta_ttl, - coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options, - toUInt32(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0)) AS tcp_meta_window_size, - toUInt16(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_mss'), 0)) AS tcp_meta_mss, - toUInt8(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_scale'), 0)) AS tcp_meta_window_scale, - toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms, - coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version, - coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni, - coalesce(JSONExtractString(raw_json, 'tls_alpn'), '') AS tls_alpn, - coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3, - coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash, - coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4, - coalesce(JSONExtractString(raw_json, 'client_headers'), '') AS client_headers, - coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent, - coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept, - coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding, - coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language, - coalesce(JSONExtractString(raw_json, 'header_Content-Type'), '') AS header_content_type, - coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id, - coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id, - coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for, - coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua, - coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile, - coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform, - coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest, - coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, - coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site, - -- ── Enrichissement Anubis ──────────────────────────────────────────────── - -- Priorité : UA regex > IP/CIDR (UA identifie précisément le bot) - COALESCE( - nullIf(dictGet('ja4_processing.dict_anubis_ua', 'bot_name', - coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', - toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''), - '' - ) AS anubis_bot_name, - COALESCE( - nullIf(dictGet('ja4_processing.dict_anubis_ua', 'action', - coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''), - nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', - toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''), - '' - ) AS anubis_bot_action -FROM ja4_logs.http_logs_raw; - -- ============================================================================ -- INTÉGRATION ML — Propagation Anubis vers le pipeline bot_detector -- ============================================================================ -- ---------------------------------------------------------------------------- --- 11. COLONNES ANUBIS dans ml_detected_anomalies +-- 7. COLONNES ANUBIS dans ml_detected_anomalies -- ---------------------------------------------------------------------------- ALTER TABLE ja4_processing.ml_detected_anomalies ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', @@ -321,7 +111,7 @@ ALTER TABLE ja4_processing.ml_detected_anomalies ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; -- ---------------------------------------------------------------------------- --- 12. COLONNES ANUBIS dans ml_all_scores +-- 8. COLONNES ANUBIS dans ml_all_scores -- ---------------------------------------------------------------------------- ALTER TABLE ja4_processing.ml_all_scores ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', @@ -329,11 +119,6 @@ ALTER TABLE ja4_processing.ml_all_scores ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; -- ---------------------------------------------------------------------------- --- 13. VIEW view_ai_features_1h — Enrichissement Anubis --- Ajoute anubis_bot_name et anubis_bot_action via dictGet. --- Priorité : UA regex (first_ua → dict_anubis_ua) > IP/CIDR (src_ip → dict_anubis_ip) --- Voir le fichier complet dans /tmp/update_view_ai_features.sql ou recréer --- avec CREATE OR REPLACE VIEW après avoir appliqué les étapes précédentes. +-- 9. VIEW view_ai_features_1h — Enrichissement Anubis +-- Voir view_ai_features_anubis.sql pour le CREATE OR REPLACE VIEW complet. -- ---------------------------------------------------------------------------- --- NOTE : Exécuter le contenu de /tmp/update_view_ai_features.sql ici (trop long). --- Ou lancer depuis le repo : psql -f bot_detector/anubis/view_ai_features_anubis.sql diff --git a/services/bot-detector/anubis/fetch_rules.py b/services/bot-detector/anubis/fetch_rules.py index 17c4722..5837b03 100644 --- a/services/bot-detector/anubis/fetch_rules.py +++ b/services/bot-detector/anubis/fetch_rules.py @@ -18,7 +18,6 @@ Variables d'environnement : import json import os -import re import sys import urllib.request import urllib.error @@ -57,15 +56,9 @@ DIRECTORIES = [ ("data/bots", "bots"), # Catch-alls larges (ai-robots-txt, ai-catchall) — LAST ] -# Fichier de politique principal (règles ASN + pays inline) +# Fichier de politique principal (règles ASN inline) BOT_POLICIES_PATH = "data/botPolicies.yaml" -# UA_PARENT_OVERRIDE : mapping nom_règle → nom_parent pour forcer la hiérarchie REGEXP_TREE. -# Conservé vide intentionnellement : l'ordre de chargement (spécifique avant catch-all) -# garantit la priorité sans hiérarchie parent_id explicite. -# Populer ce dict si une règle doit hériter d'une autre via parent_id dans REGEXP_TREE. -UA_PARENT_OVERRIDE: dict[str, str] = {} - # ────────────────────────────────────────────────────────────────────────────── # HTTP helpers @@ -111,69 +104,6 @@ def list_yaml_files(api_path: str) -> list[str]: return [f["download_url"] for f in files] -# ────────────────────────────────────────────────────────────────────────────── -# Extraction des patterns UA depuis les expressions CEL-like -# ────────────────────────────────────────────────────────────────────────────── - -def _extract_ua_from_all(conditions: list) -> str | None: - """Extrait une regex UA depuis une expression 'all' (ex: yandexbot userAgent.matches).""" - for cond in conditions: - if not isinstance(cond, str): - continue - m = re.search(r'userAgent\.matches\("(.+?)"\)', cond) - if m: - return m.group(1).replace("\\\\", "\\") - return None - - -def _extract_ua_from_any(conditions: list) -> str | None: - """ - Extrait une regex UA depuis une expression 'any' avec userAgent.contains(...) - Exemple : aggressive-brazilian-scrapers.yaml - Retourne une regex en OR : MSIE|Trident|... - """ - patterns = [] - for cond in conditions: - if not isinstance(cond, str): - continue - m = re.search(r'userAgent\.contains\("(.+?)"\)', cond) - if m: - patterns.append(re.escape(m.group(1))) - if patterns: - return "|".join(patterns) - return None - - -def extract_ua_regex(rule: dict) -> str | None: - """Extrait la regex User-Agent depuis toutes les formes possibles.""" - # Forme directe - if ua := rule.get("user_agent_regex"): - return ua.strip() - - expr = rule.get("expression") - if not expr: - return None - - # Expression scalaire (CEL string) - if isinstance(expr, str): - m = re.search(r'userAgent\.matches\("(.+?)"\)', expr) - if m: - return m.group(1).replace("\\\\", "\\") - m = re.search(r'userAgent\.contains\("(.+?)"\)', expr) - if m: - return re.escape(m.group(1)) - return None - - # Expression structurée dict - if isinstance(expr, dict): - if ua := _extract_ua_from_all(expr.get("all", [])): - return ua - if ua := _extract_ua_from_any(expr.get("any", [])): - return ua - - return None - - # ────────────────────────────────────────────────────────────────────────────── # Parse des fichiers YAML # ────────────────────────────────────────────────────────────────────────────── @@ -182,27 +112,22 @@ def parse_file( url: str, category: str, ua_name_to_id: dict, - ua_id_counter_ref: list, # [int] — compteur mutable partagé entre appels - rule_id_counter_ref: list, # [int] — idem + ua_id_counter_ref: list, + rule_id_counter_ref: list, ) -> tuple[list[dict], list[dict]]: """ Parse un fichier YAML Anubis. - Retourne (ua_rules, ip_rules). - - Note : ua_name_to_id est maintenu pour supporter la hiérarchie parent_id dans - REGEXP_TREE (via UA_PARENT_OVERRIDE). Tant que UA_PARENT_OVERRIDE est vide, - parent_id vaut toujours 0 et ua_name_to_id n'est pas consulté en pratique. + Retourne ([], ip_rules) — les règles UA ne sont plus collectées. """ data = fetch_yaml_url(url) if not data or not isinstance(data, list): return [], [] - ua_rules, ip_rules = [], [] + ip_rules = [] for rule in data: if not isinstance(rule, dict): continue - # Ignorer les imports (références à d'autres fichiers) if "import" in rule: continue @@ -212,58 +137,34 @@ def parse_file( continue remote_addrs = [str(c).strip() for c in rule.get("remote_addresses", []) if c] - has_ip = bool(remote_addrs) rule_id = rule_id_counter_ref[0] rule_id_counter_ref[0] += 1 - # ── User-Agent regex ───────────────────────────────────────────────── - ua_regex = extract_ua_regex(rule) - if ua_regex: - parent_name = UA_PARENT_OVERRIDE.get(name) - parent_id = ua_name_to_id.get(parent_name, 0) if parent_name else 0 - - uid = ua_id_counter_ref[0] - ua_id_counter_ref[0] += 1 - ua_name_to_id[name] = uid - - ua_rules.append({ - "id": uid, - "parent_id": parent_id, - "regexp": ua_regex, - "bot_name": name, - "action": action, - "has_ip": "1" if has_ip else "0", - "rule_id": str(rule_id), - "category": category, - }) - # ── IP/CIDR ranges ─────────────────────────────────────────────────── - has_ua = bool(ua_regex) for cidr in remote_addrs: ip_rules.append({ "prefix": cidr, "bot_name": name, "action": action, "rule_id": rule_id, - "has_ua": 1 if has_ua else 0, + "has_ua": 0, "category": category, }) - return ua_rules, ip_rules + return [], ip_rules def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]: """ - Parse botPolicies.yaml pour les règles inline avec geoip.countries et asns.match. - Retourne (asn_rules, country_rules). + Parse botPolicies.yaml pour les règles ASN inline. + Retourne (asn_rules, []) — les règles pays ne sont plus collectées. """ data = fetch_yaml_url(url) if not data or not isinstance(data, dict): return [], [] - asn_rules: list[dict] = [] - country_rules: list[dict] = [] + asn_rules: list[dict] = [] for rule in data.get("bots", []): if not isinstance(rule, dict): @@ -287,31 +188,19 @@ def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]: "category": "policies", }) - # Country rules - geoip = rule.get("geoip", {}) - if isinstance(geoip, dict): - for cc in geoip.get("countries", []): - country_rules.append({ - "country_code": str(cc).upper(), - "bot_name": name, - "action": action, - "category": "policies", - }) - - return asn_rules, country_rules + return asn_rules, [] # ────────────────────────────────────────────────────────────────────────────── # Collecte de toutes les règles # ────────────────────────────────────────────────────────────────────────────── -def collect_all_rules() -> tuple[list, list, list, list]: - """Retourne (ua_rules, ip_rules, asn_rules, country_rules).""" +def collect_all_rules() -> tuple[list, list]: + """Retourne (ip_rules, asn_rules). Les règles UA et pays ne sont plus collectées.""" ua_name_to_id: dict[str, int] = {} ua_id_counter_ref: list[int] = [1] rule_id_counter: list[int] = [1] - all_ua: list[dict] = [] all_ip: list[dict] = [] for api_path, category in DIRECTORIES: @@ -319,16 +208,15 @@ def collect_all_rules() -> tuple[list, list, list, list]: file_urls = list_yaml_files(api_path) print(f" {len(file_urls)} fichiers trouvés") for url in file_urls: - ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter) - all_ua.extend(ua) + _ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter) all_ip.extend(ip) - # Règles ASN + pays depuis botPolicies.yaml + # Règles ASN depuis botPolicies.yaml print(f"[INFO] Lecture de botPolicies.yaml…") policies_url = f"{GITHUB_RAW}/{BOT_POLICIES_PATH}" - asn_rules, country_rules = parse_bot_policies_inline(policies_url) + asn_rules, _country_rules = parse_bot_policies_inline(policies_url) - return all_ua, all_ip, asn_rules, country_rules + return all_ip, asn_rules # ────────────────────────────────────────────────────────────────────────────── @@ -348,30 +236,6 @@ def get_ch_client(): DB_PROC = os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing")) -def insert_ua_rules(client, rules: list[dict]) -> None: - """Tronque et remplace la table anubis_ua_rules avec les règles User-Agent fournies. - - Le format cible est REGEXP_TREE (colonnes id, parent_id, regexp, keys[], values[]). - """ - if not rules: - print("[INFO] Aucune règle UA.") - return - client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_ua_rules") - # REGEXP_TREE format : id, parent_id, regexp, keys[], values[] - # keys = ['bot_name', 'action', 'has_ip', 'rule_id', 'category'] - data = [ - [ - r["id"], r["parent_id"], r["regexp"], - ["bot_name", "action", "has_ip", "rule_id", "category"], - [r["bot_name"], r["action"], r["has_ip"], r["rule_id"], r["category"]], - ] - for r in rules - ] - client.insert(f"{DB_PROC}.anubis_ua_rules", data, - column_names=["id", "parent_id", "regexp", "keys", "values"]) - print(f"[OK] {len(rules)} règles UA insérées.") - - def insert_ip_rules(client, rules: list[dict]) -> None: """Tronque et remplace la table anubis_ip_rules avec les règles CIDR/IP fournies.""" if not rules: @@ -400,25 +264,11 @@ def insert_asn_rules(client, rules: list[dict]) -> None: print(f"[OK] {len(rules)} règles ASN insérées.") -def insert_country_rules(client, rules: list[dict]) -> None: - """Tronque et remplace la table anubis_country_rules avec les règles pays fournies.""" - if not rules: - print("[INFO] Aucune règle pays.") - return - client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_country_rules") - data = [[r["country_code"], r["bot_name"], r["action"], r["category"]] for r in rules] - client.insert(f"{DB_PROC}.anubis_country_rules", data, - column_names=["country_code", "bot_name", "action", "category"]) - print(f"[OK] {len(rules)} règles pays insérées.") - - def reload_dicts(client) -> None: - """Recharge les quatre dictionnaires ClickHouse Anubis après mise à jour des tables sources.""" + """Recharge les dictionnaires ClickHouse Anubis IP et ASN après mise à jour.""" dicts = [ - f"{DB_PROC}.dict_anubis_ua", f"{DB_PROC}.dict_anubis_ip", f"{DB_PROC}.dict_anubis_asn", - f"{DB_PROC}.dict_anubis_country", ] for d in dicts: try: @@ -432,27 +282,14 @@ def reload_dicts(client) -> None: # Rapport # ────────────────────────────────────────────────────────────────────────────── -def print_summary(ua_rules, ip_rules, asn_rules, country_rules): - """Affiche un résumé lisible des règles collectées (UA, IP, ASN, pays) sur la sortie standard.""" - print("\n── Règles UA ──") - by_cat: dict[str, list] = {} - for r in ua_rules: - by_cat.setdefault(r["category"], []).append(r) - for cat, rules in sorted(by_cat.items()): - print(f" [{cat}] {len(rules)} règle(s)") - for r in rules[:5]: - has = " [+IP]" if r["has_ip"] == "1" else "" - par = f" [parent={r['parent_id']}]" if r["parent_id"] else "" - print(f" [{r['action']:9s}] {r['bot_name']}{has}{par}: {r['regexp'][:50]}") - if len(rules) > 5: - print(f" … et {len(rules) - 5} autres") - +def print_summary(ip_rules, asn_rules): + """Affiche un résumé lisible des règles collectées (IP, ASN) sur la sortie standard.""" print(f"\n── Règles IP : {len(ip_rules)} CIDRs ──") by_bot: dict[str, list] = {} for r in ip_rules: by_bot.setdefault(r["bot_name"], []).append(r) for bot, rs in sorted(by_bot.items())[:15]: - print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']}, has_ua={rs[0]['has_ua']})") + print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']})") if len(by_bot) > 15: print(f" … et {len(by_bot) - 15} autres bots") @@ -461,38 +298,30 @@ def print_summary(ua_rules, ip_rules, asn_rules, country_rules): for r in asn_rules: print(f" [{r['action']:9s}] ASN {r['asn']}: {r['bot_name']}") - if country_rules: - print(f"\n── Règles pays : {len(country_rules)} ──") - for r in country_rules: - print(f" [{r['action']:9s}] {r['country_code']}: {r['bot_name']}") - # ────────────────────────────────────────────────────────────────────────────── # Main # ────────────────────────────────────────────────────────────────────────────── def main() -> None: - """Point d'entrée principal : collecte les règles Anubis et les charge dans ClickHouse.""" - print("[INFO] Collecte des règles Anubis depuis GitHub…") - ua_rules, ip_rules, asn_rules, country_rules = collect_all_rules() + """Point d'entrée principal : collecte les règles Anubis IP/CIDR et ASN, puis les charge dans ClickHouse.""" + print("[INFO] Collecte des règles Anubis depuis GitHub (IP/CIDR + ASN uniquement)…") + ip_rules, asn_rules = collect_all_rules() - total = len(ua_rules) + len(ip_rules) + len(asn_rules) + len(country_rules) - print(f"\n[INFO] {len(ua_rules)} règles UA, {len(ip_rules)} CIDRs IP, " - f"{len(asn_rules)} ASN, {len(country_rules)} pays (total={total})") + total = len(ip_rules) + len(asn_rules) + print(f"\n[INFO] {len(ip_rules)} CIDRs IP, {len(asn_rules)} ASN (total={total})") if total == 0: print("[ERREUR] Aucune règle récupérée.", file=sys.stderr) sys.exit(1) - print_summary(ua_rules, ip_rules, asn_rules, country_rules) + print_summary(ip_rules, asn_rules) print("\n[INFO] Connexion à ClickHouse…") client = get_ch_client() - insert_ua_rules(client, ua_rules) insert_ip_rules(client, ip_rules) insert_asn_rules(client, asn_rules) - insert_country_rules(client, country_rules) reload_dicts(client) print("\n[OK] Règles Anubis chargées avec succès.") diff --git a/services/bot-detector/anubis/mv_http_logs.sql b/services/bot-detector/anubis/mv_http_logs.sql index ac46fda..c1e811d 100644 --- a/services/bot-detector/anubis/mv_http_logs.sql +++ b/services/bot-detector/anubis/mv_http_logs.sql @@ -61,66 +61,23 @@ SELECT coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site, - -- Anubis enrichment : logique de correspondance combinée UA+IP - -- Priorité : (1) UA+IP [même rule_id] > (2) UA seul > (3) IP seul > (4) ASN > (5) Pays - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') - ELSE '' - END AS anubis_bot_name, + -- Anubis enrichment: IP/CIDR > ASN (simplified — UA and Country rules removed) + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, ''), ''), + '' + ) AS anubis_bot_name, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '') - ELSE '' - END AS anubis_bot_action, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, ''), ''), + '' + ) AS anubis_bot_action, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '') - ELSE '' - END AS anubis_bot_category + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, ''), ''), + '' + ) AS anubis_bot_category FROM ja4_logs.http_logs_raw diff --git a/services/bot-detector/anubis/view_ai_features_anubis.sql b/services/bot-detector/anubis/view_ai_features_anubis.sql index abc93cb..11bcdc4 100644 --- a/services/bot-detector/anubis/view_ai_features_anubis.sql +++ b/services/bot-detector/anubis/view_ai_features_anubis.sql @@ -12,64 +12,22 @@ WITH base_data AS ( nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), '' ) AS bot_name, - -- Anubis : logique combinée UA+IP (même rule_id) > UA seul > IP seul > ASN > Pays - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') - ELSE '' - END AS anubis_bot_name, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '') - ELSE '' - END AS anubis_bot_action, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '') - ELSE '' - END AS anubis_bot_category, + -- Anubis : IP/CIDR > ASN (simplifié — UA et Pays retirés) + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_name, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_action, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_category, a.hits AS hits, sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits, a.correlated AS correlated, diff --git a/services/dashboard/backend/routes/api.py b/services/dashboard/backend/routes/api.py index 06eba26..ea86a83 100644 --- a/services/dashboard/backend/routes/api.py +++ b/services/dashboard/backend/routes/api.py @@ -1499,10 +1499,8 @@ _REFLIST_SORT = { "browser_ja4": {"ja4", "browser_family", "tls_library"}, "asn_reputation": {"src_asn", "label"}, "iplocate_asn": {"asn", "country_code", "name", "network"}, - "anubis_ua_rules": {"id", "regexp", "bot_name", "action", "category"}, "anubis_ip_rules": {"prefix", "bot_name", "action", "category"}, "anubis_asn_rules": {"asn", "bot_name", "action", "category"}, - "anubis_country_rules": {"country_code", "bot_name", "action", "category"}, } _REFLIST_SEARCH_COLS: dict[str, list[str]] = { @@ -1511,10 +1509,8 @@ _REFLIST_SEARCH_COLS: dict[str, list[str]] = { "browser_ja4": ["ja4", "browser_family", "tls_library", "context"], "asn_reputation": ["toString(src_asn)", "label"], "iplocate_asn": ["network", "toString(asn)", "country_code", "name"], - "anubis_ua_rules": ["regexp", "bot_name", "action", "category"], "anubis_ip_rules": ["prefix", "bot_name", "action", "category"], "anubis_asn_rules": ["toString(asn)", "bot_name", "action", "category"], - "anubis_country_rules": ["country_code", "bot_name", "action", "category"], } @@ -1532,22 +1528,12 @@ _REFLIST_QUERIES: dict[str, str] = { f"SELECT network, asn, country_code, name " f"FROM dictionary('{_DB}.dict_iplocate_asn')" ), - "anubis_ua_rules": ( - f"SELECT id, parent_id, regexp, " - f" arrayElement(values, indexOf(keys, 'bot_name')) AS bot_name, " - f" arrayElement(values, indexOf(keys, 'action')) AS action, " - f" arrayElement(values, indexOf(keys, 'category')) AS category " - f"FROM {_DB}.anubis_ua_rules" - ), "anubis_ip_rules": ( f"SELECT prefix, bot_name, action, category FROM {_DB}.anubis_ip_rules" ), "anubis_asn_rules": ( f"SELECT asn, bot_name, action, category FROM {_DB}.anubis_asn_rules" ), - "anubis_country_rules": ( - f"SELECT country_code, bot_name, action, category FROM {_DB}.anubis_country_rules" - ), } diff --git a/services/dashboard/backend/templates/reflists.html b/services/dashboard/backend/templates/reflists.html index c465775..71f039c 100644 --- a/services/dashboard/backend/templates/reflists.html +++ b/services/dashboard/backend/templates/reflists.html @@ -17,7 +17,7 @@ Listes de référence browser_ja4Fingerprints JA4 navigateurs → famille, lib TLS asn_reputationASN → label de réputation (isp, datacenter, cdn…) iplocate_asnIP/CIDR → ASN, pays, nom (géolocalisation) - anubis_*Règles Anubis : UA, IP, ASN, pays (filtrage crawlers) + anubis_*Règles Anubis : IP/CIDR et ASN (filtrage crawlers) @@ -55,10 +55,8 @@ Listes de référence - - @@ -141,10 +139,8 @@ const LIST_DOC = { browser_ja4:'Fingerprints JA4 de navigateurs légitimes avec famille (Chromium, Firefox, Safari…), librairie TLS et contexte. Source : dict_browser_ja4.', asn_reputation:'Réputation de chaque ASN (isp, datacenter, cdn, hosting, education…). Source : dict_asn_reputation. Utilisé pour la baseline ISP humaine.', iplocate_asn:'Géolocalisation IP→ASN : réseau, numéro ASN, code pays, nom opérateur. Source : dict_iplocate_asn (IP_TRIE, ~714K entrées).', - anubis_ua_rules:'Règles Anubis de détection par User-Agent (REGEXP_TREE). Chaque règle associe un pattern regex à un bot_name et une action (ALLOW/DENY/WEIGH).', anubis_ip_rules:'Règles Anubis de détection par plage IP (IP_TRIE). Associe des CIDR à des bots connus avec action de filtrage.', anubis_asn_rules:'Règles Anubis par ASN : certains ASN sont associés à des botnets ou services automatisés connus.', - anubis_country_rules:'Règles Anubis par pays : politique de filtrage par code pays (ex: bloquer le trafic de certaines régions).', }; // Colonnes par liste @@ -154,10 +150,8 @@ const COLUMNS = { browser_ja4:[{k:'ja4',f:'JA4'},{k:'browser_family',f:'Famille'},{k:'tls_library',f:'Lib TLS'},{k:'context',f:'Contexte'}], asn_reputation:[{k:'src_asn',f:'ASN'},{k:'label',f:'Réputation'}], iplocate_asn:[{k:'network',f:'Réseau'},{k:'asn',f:'ASN'},{k:'country_code',f:'Pays'},{k:'name',f:'Nom'}], - anubis_ua_rules:[{k:'id',f:'ID'},{k:'regexp',f:'Regex'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}], anubis_ip_rules:[{k:'prefix',f:'IP/CIDR'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}], anubis_asn_rules:[{k:'asn',f:'ASN'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}], - anubis_country_rules:[{k:'country_code',f:'Pays'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}], }; // Tab click diff --git a/shared/clickhouse/03_anubis_tables.sql b/shared/clickhouse/03_anubis_tables.sql index 3b2313a..d2b706d 100644 --- a/shared/clickhouse/03_anubis_tables.sql +++ b/shared/clickhouse/03_anubis_tables.sql @@ -1,10 +1,14 @@ -- ============================================================================= -- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries --- Items 1–8 from bot_detector/anubis/deploy_schema.sql +-- Only IP/CIDR and ASN rules are populated by fetch_rules.py. +-- UA and Country dictionaries are kept as stubs (required by MV references) +-- but are never populated with real data. -- ============================================================================= -- ----------------------------------------------------------------------------- --- 1. TABLE SOURCE — User-Agent rules (for REGEXP_TREE dictionary) +-- 1. TABLE SOURCE — User-Agent rules (REGEXP_TREE stub) +-- REGEXP_TREE requires ≥1 rule; the catch-all is seeded at init time. +-- This table is NOT populated by fetch_rules.py. -- ----------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules ( @@ -20,6 +24,7 @@ ORDER BY id; -- ----------------------------------------------------------------------------- -- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary) +-- Populated by fetch_rules.py from Anubis GitHub data. -- ----------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules ( @@ -35,30 +40,23 @@ ORDER BY prefix; -- ----------------------------------------------------------------------------- --- 3. DICTIONARY — UA REGEXP_TREE --- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent) --- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. +-- 3. TABLE SOURCE — ASN rules (for Flat dictionary) +-- Populated by fetch_rules.py from botPolicies.yaml. -- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua; -CREATE DICTIONARY ja4_processing.dict_anubis_ua +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules ( - regexp String, - bot_name String, - action String, - has_ip String, - rule_id String, - category String + asn UInt32, + bot_name LowCardinality(String), + action LowCardinality(String), + category LowCardinality(String) ) -PRIMARY KEY regexp -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules')) -LAYOUT(REGEXP_TREE) -LIFETIME(MIN 300 MAX 600); +ENGINE = ReplacingMergeTree() +ORDER BY asn; -- ----------------------------------------------------------------------------- --- 4. DICTIONARY — IP IP_TRIE +-- 4. DICTIONARY — IP IP_TRIE (active) -- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') --- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. -- ----------------------------------------------------------------------------- DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip; CREATE DICTIONARY ja4_processing.dict_anubis_ip @@ -77,37 +75,8 @@ LIFETIME(MIN 300 MAX 600); -- ----------------------------------------------------------------------------- --- 5. TABLE SOURCE — ASN rules (for Flat dictionary) --- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules -( - asn UInt32, - bot_name LowCardinality(String), - action LowCardinality(String), - category LowCardinality(String) -) -ENGINE = ReplacingMergeTree() -ORDER BY asn; - - --- ----------------------------------------------------------------------------- --- 6. TABLE SOURCE — Country rules (for Flat dictionary) --- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules -( - country_code LowCardinality(String), - bot_name LowCardinality(String), - action LowCardinality(String), - category LowCardinality(String) -) -ENGINE = ReplacingMergeTree() -ORDER BY country_code; - - --- ----------------------------------------------------------------------------- --- 7. DICTIONARY — ASN Flat +-- 5. DICTIONARY — ASN Flat (active) -- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '') --- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. -- ----------------------------------------------------------------------------- DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn; CREATE DICTIONARY ja4_processing.dict_anubis_asn @@ -121,22 +90,3 @@ PRIMARY KEY asn SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_asn_rules')) LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); - - --- ----------------------------------------------------------------------------- --- 8. DICTIONARY — Country COMPLEX_KEY_HASHED --- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '') --- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. --- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country; -CREATE DICTIONARY ja4_processing.dict_anubis_country -( - country_code String, - bot_name String, - action String, - category String -) -PRIMARY KEY country_code -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules')) -LAYOUT(COMPLEX_KEY_HASHED()) -LIFETIME(MIN 300 MAX 600); diff --git a/shared/clickhouse/04_mv_http_logs.sql b/shared/clickhouse/04_mv_http_logs.sql index 64bf887..47eabfb 100644 --- a/shared/clickhouse/04_mv_http_logs.sql +++ b/shared/clickhouse/04_mv_http_logs.sql @@ -175,66 +175,23 @@ SELECT coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site, - -- Anubis enrichment: combined UA+IP priority logic - -- Priority: (1) UA+IP [same rule_id] > (2) UA only > (3) IP only > (4) ASN > (5) Country - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') - ELSE '' - END AS anubis_bot_name, + -- Anubis enrichment: IP/CIDR > ASN (simplified — UA and Country rules removed) + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, ''), ''), + '' + ) AS anubis_bot_name, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '') - ELSE '' - END AS anubis_bot_action, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, ''), ''), + '' + ) AS anubis_bot_action, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '') - ELSE '' - END AS anubis_bot_category + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, ''), ''), + '' + ) AS anubis_bot_category FROM ja4_logs.http_logs_raw; diff --git a/shared/clickhouse/07_ai_features_view.sql b/shared/clickhouse/07_ai_features_view.sql index 5456de8..8b9aebf 100644 --- a/shared/clickhouse/07_ai_features_view.sql +++ b/shared/clickhouse/07_ai_features_view.sql @@ -18,64 +18,22 @@ WITH base_data AS ( '' ) AS bot_name, dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family, - -- Anubis: combined UA+IP priority logic > UA only > IP only > ASN > Country - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') - ELSE '' - END AS anubis_bot_name, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '') - ELSE '' - END AS anubis_bot_action, - CASE - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '') - ELSE '' - END AS anubis_bot_category, + -- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed) + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_name, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_action, + COALESCE( + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''), + '' + ) AS anubis_bot_category, a.hits AS hits, sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits, a.correlated AS correlated,