refactor(anubis): simplify to IP/CIDR + ASN only, remove UA and Country rules
- Remove UA regex extraction (extract_ua_regex, _extract_ua_from_all/any)
- Remove Country rule collection from parse_bot_policies_inline
- Simplify fetch_rules.py: collect_all_rules returns (ip_rules, asn_rules)
- Remove insert_ua_rules and insert_country_rules functions
- reload_dicts now only reloads dict_anubis_ip + dict_anubis_asn
- Simplify CASE blocks in 04_mv_http_logs.sql, 07_ai_features_view.sql, view_ai_features_anubis.sql, mv_http_logs.sql: IP > ASN (was 5-level UA+IP > UA > IP > ASN > Country cascade)
- Remove dict_anubis_country + dict_anubis_ua from 03_anubis_tables.sql (UA table kept as stub for REGEXP_TREE catch-all compatibility)
- Remove anubis_country_rules table from schema
- Remove Anubis UA and Country tabs from dashboard reflists page
- Remove anubis_ua_rules/country_rules from API reflist queries
- deploy_schema.sql simplified from 339 to 122 lines
- 764 lines removed across 9 files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -1,30 +1,19 @@
|
|||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
-- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
|
-- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
|
||||||
-- Architecture :
|
-- Architecture simplifiée (IP/CIDR et ASN uniquement) :
|
||||||
-- anubis_ua_rules (table) → dict_anubis_ua (REGEXP_TREE)
|
-- anubis_ua_rules (table stub) → dict_anubis_ua (REGEXP_TREE, catch-all)
|
||||||
-- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
|
-- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
|
||||||
-- http_logs : +anubis_bot_name, +anubis_bot_action
|
-- anubis_asn_rules (table) → dict_anubis_asn (FLAT)
|
||||||
-- mv_http_logs : reconstruit avec enrichissement Anubis
|
-- http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
||||||
|
-- mv_http_logs : enrichissement Anubis (IP > ASN)
|
||||||
-- view_ai_features_1h : +anubis_bot_name, +anubis_bot_action (via dictGet)
|
-- view_ai_features_1h : +anubis_bot_name, +anubis_bot_action (via dictGet)
|
||||||
-- ml_detected_anomalies : +anubis_bot_name, +anubis_bot_action
|
-- ml_detected_anomalies / ml_all_scores : colonnes Anubis
|
||||||
-- ml_all_scores : +anubis_bot_name, +anubis_bot_action
|
|
||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 1. TABLE SOURCE — règles User-Agent (pour dictionnaire REGEXP_TREE)
|
-- 1. TABLE SOURCE — règles User-Agent (stub REGEXP_TREE)
|
||||||
--
|
-- REGEXP_TREE nécessite ≥1 règle ; le catch-all est injecté à l'init.
|
||||||
-- Format attendu par ClickHouse regexp_tree (v23.5+) :
|
-- Cette table n'est PAS peuplée par fetch_rules.py.
|
||||||
-- id UInt64 : identifiant unique
|
|
||||||
-- parent_id UInt64 : 0 = racine, sinon id du parent (héritage d'attributs)
|
|
||||||
-- regexp String : expression régulière (re2/vectorscan)
|
|
||||||
-- keys Array(String) : noms des attributs, ex. ['bot_name', 'action']
|
|
||||||
-- values Array(String) : valeurs correspondantes
|
|
||||||
--
|
|
||||||
-- Hiérarchie utilisée pour la priorité :
|
|
||||||
-- Règles génériques DENY (parent_id=0) → enfants ALLOW spécifiques
|
|
||||||
-- Exemple : ai-crawlers-training (parent) → openai-gptbot (enfant)
|
|
||||||
-- Quand l'UA correspond à enfant ET parent, c'est le nom de l'enfant qui
|
|
||||||
-- est retourné (l'enfant hérite ET surcharge les attributs du parent).
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
||||||
(
|
(
|
||||||
@ -39,12 +28,7 @@ ORDER BY id;
|
|||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
|
-- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
|
||||||
--
|
-- Peuplée par fetch_rules.py depuis les fichiers YAML Anubis.
|
||||||
-- Colonnes requises par dict_anubis_ip et mv_http_logs :
|
|
||||||
-- rule_id : identifiant de règle, croisé avec dict_anubis_ua pour
|
|
||||||
-- la logique UA+IP (même rule_id → match combiné)
|
|
||||||
-- has_ua : 1 si la règle possède aussi une regex UA (croisement nécessaire)
|
|
||||||
-- category : catégorie Anubis (bots, crawlers, clients, policies…)
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
||||||
(
|
(
|
||||||
@ -59,29 +43,21 @@ ENGINE = ReplacingMergeTree()
|
|||||||
ORDER BY prefix;
|
ORDER BY prefix;
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 3. DICTIONNAIRE UA — REGEXP_TREE
|
-- 3. TABLE SOURCE — règles ASN (pour dictionnaire Flat)
|
||||||
-- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent)
|
-- Peuplée par fetch_rules.py depuis botPolicies.yaml.
|
||||||
--
|
|
||||||
-- Le PRIMARY KEY est 'regexp' (String) — requis par ClickHouse 26.x.
|
|
||||||
-- Connexion interne (HOST localhost PORT 9000) pour éviter deadlock HTTP.
|
|
||||||
-- Remplacer 'admin' et le mot de passe par les credentials ClickHouse.
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua;
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_ua
|
|
||||||
(
|
(
|
||||||
regexp String,
|
asn UInt32,
|
||||||
bot_name String,
|
bot_name LowCardinality(String),
|
||||||
action String
|
action LowCardinality(String),
|
||||||
|
category LowCardinality(String)
|
||||||
)
|
)
|
||||||
PRIMARY KEY regexp
|
ENGINE = ReplacingMergeTree()
|
||||||
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules'))
|
ORDER BY asn;
|
||||||
LAYOUT(REGEXP_TREE)
|
|
||||||
LIFETIME(MIN 300 MAX 600);
|
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 4. DICTIONNAIRE IP — IP_TRIE
|
-- 4. DICTIONNAIRE IP — IP_TRIE (actif)
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
|
|
||||||
-- Connexion interne (HOST localhost PORT 9000) — même raison que dict_anubis_ua.
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip;
|
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip;
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_ip
|
CREATE DICTIONARY ja4_processing.dict_anubis_ip
|
||||||
@ -99,36 +75,7 @@ LAYOUT(IP_TRIE())
|
|||||||
LIFETIME(MIN 300 MAX 600);
|
LIFETIME(MIN 300 MAX 600);
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 5. TABLE SOURCE — règles ASN (pour dictionnaire Flat)
|
-- 5. DICTIONNAIRE ASN — Flat (actif)
|
||||||
-- Alimentée par botPolicies.yaml via fetch_rules.py → insert_asn_rules()
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules
|
|
||||||
(
|
|
||||||
asn UInt32,
|
|
||||||
bot_name LowCardinality(String),
|
|
||||||
action LowCardinality(String),
|
|
||||||
category LowCardinality(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY asn;
|
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
-- 6. TABLE SOURCE — règles pays ISO-3166 (pour dictionnaire Flat)
|
|
||||||
-- Alimentée par botPolicies.yaml via fetch_rules.py → insert_country_rules()
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules
|
|
||||||
(
|
|
||||||
country_code LowCardinality(String),
|
|
||||||
bot_name LowCardinality(String),
|
|
||||||
action LowCardinality(String),
|
|
||||||
category LowCardinality(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY country_code;
|
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
-- 7. DICTIONNAIRE ASN — Flat
|
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '')
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn;
|
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn;
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_asn
|
CREATE DICTIONARY ja4_processing.dict_anubis_asn
|
||||||
@ -144,176 +91,19 @@ LAYOUT(FLAT())
|
|||||||
LIFETIME(MIN 300 MAX 600);
|
LIFETIME(MIN 300 MAX 600);
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 8. DICTIONNAIRE PAYS — Flat
|
-- 6. AJOUT DES COLONNES ANUBIS dans http_logs (idempotent)
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '')
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country;
|
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_country
|
|
||||||
(
|
|
||||||
country_code String,
|
|
||||||
bot_name String,
|
|
||||||
action String,
|
|
||||||
category String
|
|
||||||
)
|
|
||||||
PRIMARY KEY country_code
|
|
||||||
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules'))
|
|
||||||
LAYOUT(FLAT())
|
|
||||||
LIFETIME(MIN 300 MAX 600);
|
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
-- 9. AJOUT DES COLONNES ANUBIS dans http_logs
|
|
||||||
-- Idempotent : ne plante pas si déjà présentes
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
ALTER TABLE ja4_logs.http_logs
|
ALTER TABLE ja4_logs.http_logs
|
||||||
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
||||||
ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '',
|
ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '',
|
||||||
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
-- 10. RECONSTRUCTION DE mv_http_logs avec enrichissement Anubis
|
|
||||||
-- Logique de priorisation :
|
|
||||||
-- 1. UA regex (plus informatif — identifie le bot précis)
|
|
||||||
-- 2. IP/CIDR (fallback — identifie le réseau cloud)
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
DROP VIEW IF EXISTS ja4_logs.mv_http_logs;
|
|
||||||
|
|
||||||
CREATE MATERIALIZED VIEW ja4_logs.mv_http_logs
|
|
||||||
TO ja4_logs.http_logs
|
|
||||||
(
|
|
||||||
`time` DateTime,
|
|
||||||
`log_date` Date,
|
|
||||||
`src_ip` IPv4,
|
|
||||||
`src_port` UInt16,
|
|
||||||
`src_asn` UInt32,
|
|
||||||
`src_country_code` String,
|
|
||||||
`dst_ip` IPv4,
|
|
||||||
`dst_port` UInt16,
|
|
||||||
`src_as_name` String,
|
|
||||||
`src_org` String,
|
|
||||||
`src_domain` String,
|
|
||||||
`method` String,
|
|
||||||
`scheme` String,
|
|
||||||
`host` String,
|
|
||||||
`path` String,
|
|
||||||
`query` String,
|
|
||||||
`http_version` String,
|
|
||||||
`orphan_side` String,
|
|
||||||
`correlated` UInt8,
|
|
||||||
`keepalives` UInt16,
|
|
||||||
`a_timestamp` UInt64,
|
|
||||||
`b_timestamp` UInt64,
|
|
||||||
`conn_id` String,
|
|
||||||
`ip_meta_df` UInt8,
|
|
||||||
`ip_meta_id` UInt16,
|
|
||||||
`ip_meta_total_length` UInt16,
|
|
||||||
`ip_meta_ttl` UInt8,
|
|
||||||
`tcp_meta_options` String,
|
|
||||||
`tcp_meta_window_size` UInt32,
|
|
||||||
`tcp_meta_mss` UInt16,
|
|
||||||
`tcp_meta_window_scale` UInt8,
|
|
||||||
`syn_to_clienthello_ms` Int32,
|
|
||||||
`tls_version` String,
|
|
||||||
`tls_sni` String,
|
|
||||||
`tls_alpn` String,
|
|
||||||
`ja3` String,
|
|
||||||
`ja3_hash` String,
|
|
||||||
`ja4` String,
|
|
||||||
`client_headers` String,
|
|
||||||
`header_user_agent` String,
|
|
||||||
`header_accept` String,
|
|
||||||
`header_accept_encoding` String,
|
|
||||||
`header_accept_language` String,
|
|
||||||
`header_content_type` String,
|
|
||||||
`header_x_request_id` String,
|
|
||||||
`header_x_trace_id` String,
|
|
||||||
`header_x_forwarded_for` String,
|
|
||||||
`header_sec_ch_ua` String,
|
|
||||||
`header_sec_ch_ua_mobile` String,
|
|
||||||
`header_sec_ch_ua_platform` String,
|
|
||||||
`header_sec_fetch_dest` String,
|
|
||||||
`header_sec_fetch_mode` String,
|
|
||||||
`header_sec_fetch_site` String,
|
|
||||||
`anubis_bot_name` String,
|
|
||||||
`anubis_bot_action` String
|
|
||||||
)
|
|
||||||
AS SELECT
|
|
||||||
parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time,
|
|
||||||
toDate(time) AS log_date,
|
|
||||||
toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0')) AS src_ip,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
|
|
||||||
dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'asn', toIPv6(src_ip), toUInt32(0)) AS src_asn,
|
|
||||||
dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', toIPv6(src_ip), '') AS src_country_code,
|
|
||||||
toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
|
|
||||||
dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'name', toIPv6(src_ip), '') AS src_as_name,
|
|
||||||
'' AS src_org,
|
|
||||||
'' AS src_domain,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
|
|
||||||
toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
|
|
||||||
coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
|
|
||||||
coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,
|
|
||||||
toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0)) AS ip_meta_id,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0)) AS ip_meta_total_length,
|
|
||||||
toUInt8(coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0)) AS ip_meta_ttl,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
|
|
||||||
toUInt32(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0)) AS tcp_meta_window_size,
|
|
||||||
toUInt16(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_mss'), 0)) AS tcp_meta_mss,
|
|
||||||
toUInt8(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_scale'), 0)) AS tcp_meta_window_scale,
|
|
||||||
toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'tls_alpn'), '') AS tls_alpn,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'client_headers'), '') AS client_headers,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Content-Type'), '') AS header_content_type,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
|
|
||||||
-- ── Enrichissement Anubis ────────────────────────────────────────────────
|
|
||||||
-- Priorité : UA regex > IP/CIDR (UA identifie précisément le bot)
|
|
||||||
COALESCE(
|
|
||||||
nullIf(dictGet('ja4_processing.dict_anubis_ua', 'bot_name',
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''),
|
|
||||||
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name',
|
|
||||||
toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''),
|
|
||||||
''
|
|
||||||
) AS anubis_bot_name,
|
|
||||||
COALESCE(
|
|
||||||
nullIf(dictGet('ja4_processing.dict_anubis_ua', 'action',
|
|
||||||
coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''),
|
|
||||||
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action',
|
|
||||||
toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''),
|
|
||||||
''
|
|
||||||
) AS anubis_bot_action
|
|
||||||
FROM ja4_logs.http_logs_raw;
|
|
||||||
|
|
||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
-- INTÉGRATION ML — Propagation Anubis vers le pipeline bot_detector
|
-- INTÉGRATION ML — Propagation Anubis vers le pipeline bot_detector
|
||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 11. COLONNES ANUBIS dans ml_detected_anomalies
|
-- 7. COLONNES ANUBIS dans ml_detected_anomalies
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
ALTER TABLE ja4_processing.ml_detected_anomalies
|
ALTER TABLE ja4_processing.ml_detected_anomalies
|
||||||
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
||||||
@ -321,7 +111,7 @@ ALTER TABLE ja4_processing.ml_detected_anomalies
|
|||||||
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 12. COLONNES ANUBIS dans ml_all_scores
|
-- 8. COLONNES ANUBIS dans ml_all_scores
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
ALTER TABLE ja4_processing.ml_all_scores
|
ALTER TABLE ja4_processing.ml_all_scores
|
||||||
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '',
|
||||||
@ -329,11 +119,6 @@ ALTER TABLE ja4_processing.ml_all_scores
|
|||||||
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 13. VIEW view_ai_features_1h — Enrichissement Anubis
|
-- 9. VIEW view_ai_features_1h — Enrichissement Anubis
|
||||||
-- Ajoute anubis_bot_name et anubis_bot_action via dictGet.
|
-- Voir view_ai_features_anubis.sql pour le CREATE OR REPLACE VIEW complet.
|
||||||
-- Priorité : UA regex (first_ua → dict_anubis_ua) > IP/CIDR (src_ip → dict_anubis_ip)
|
|
||||||
-- Voir le fichier complet dans /tmp/update_view_ai_features.sql ou recréer
|
|
||||||
-- avec CREATE OR REPLACE VIEW après avoir appliqué les étapes précédentes.
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- NOTE : Exécuter le contenu de /tmp/update_view_ai_features.sql ici (trop long).
|
|
||||||
-- Ou lancer depuis le repo : psql -f bot_detector/anubis/view_ai_features_anubis.sql
|
|
||||||
|
|||||||
@ -18,7 +18,6 @@ Variables d'environnement :
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.error
|
import urllib.error
|
||||||
@ -57,15 +56,9 @@ DIRECTORIES = [
|
|||||||
("data/bots", "bots"), # Catch-alls larges (ai-robots-txt, ai-catchall) — LAST
|
("data/bots", "bots"), # Catch-alls larges (ai-robots-txt, ai-catchall) — LAST
|
||||||
]
|
]
|
||||||
|
|
||||||
# Fichier de politique principal (règles ASN + pays inline)
|
# Fichier de politique principal (règles ASN inline)
|
||||||
BOT_POLICIES_PATH = "data/botPolicies.yaml"
|
BOT_POLICIES_PATH = "data/botPolicies.yaml"
|
||||||
|
|
||||||
# UA_PARENT_OVERRIDE : mapping nom_règle → nom_parent pour forcer la hiérarchie REGEXP_TREE.
|
|
||||||
# Conservé vide intentionnellement : l'ordre de chargement (spécifique avant catch-all)
|
|
||||||
# garantit la priorité sans hiérarchie parent_id explicite.
|
|
||||||
# Populer ce dict si une règle doit hériter d'une autre via parent_id dans REGEXP_TREE.
|
|
||||||
UA_PARENT_OVERRIDE: dict[str, str] = {}
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
# HTTP helpers
|
# HTTP helpers
|
||||||
@ -111,69 +104,6 @@ def list_yaml_files(api_path: str) -> list[str]:
|
|||||||
return [f["download_url"] for f in files]
|
return [f["download_url"] for f in files]
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
# Extraction des patterns UA depuis les expressions CEL-like
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _extract_ua_from_all(conditions: list) -> str | None:
|
|
||||||
"""Extrait une regex UA depuis une expression 'all' (ex: yandexbot userAgent.matches)."""
|
|
||||||
for cond in conditions:
|
|
||||||
if not isinstance(cond, str):
|
|
||||||
continue
|
|
||||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', cond)
|
|
||||||
if m:
|
|
||||||
return m.group(1).replace("\\\\", "\\")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_ua_from_any(conditions: list) -> str | None:
|
|
||||||
"""
|
|
||||||
Extrait une regex UA depuis une expression 'any' avec userAgent.contains(...)
|
|
||||||
Exemple : aggressive-brazilian-scrapers.yaml
|
|
||||||
Retourne une regex en OR : MSIE|Trident|...
|
|
||||||
"""
|
|
||||||
patterns = []
|
|
||||||
for cond in conditions:
|
|
||||||
if not isinstance(cond, str):
|
|
||||||
continue
|
|
||||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', cond)
|
|
||||||
if m:
|
|
||||||
patterns.append(re.escape(m.group(1)))
|
|
||||||
if patterns:
|
|
||||||
return "|".join(patterns)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def extract_ua_regex(rule: dict) -> str | None:
|
|
||||||
"""Extrait la regex User-Agent depuis toutes les formes possibles."""
|
|
||||||
# Forme directe
|
|
||||||
if ua := rule.get("user_agent_regex"):
|
|
||||||
return ua.strip()
|
|
||||||
|
|
||||||
expr = rule.get("expression")
|
|
||||||
if not expr:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Expression scalaire (CEL string)
|
|
||||||
if isinstance(expr, str):
|
|
||||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', expr)
|
|
||||||
if m:
|
|
||||||
return m.group(1).replace("\\\\", "\\")
|
|
||||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', expr)
|
|
||||||
if m:
|
|
||||||
return re.escape(m.group(1))
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Expression structurée dict
|
|
||||||
if isinstance(expr, dict):
|
|
||||||
if ua := _extract_ua_from_all(expr.get("all", [])):
|
|
||||||
return ua
|
|
||||||
if ua := _extract_ua_from_any(expr.get("any", [])):
|
|
||||||
return ua
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
# Parse des fichiers YAML
|
# Parse des fichiers YAML
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
@ -182,27 +112,22 @@ def parse_file(
|
|||||||
url: str,
|
url: str,
|
||||||
category: str,
|
category: str,
|
||||||
ua_name_to_id: dict,
|
ua_name_to_id: dict,
|
||||||
ua_id_counter_ref: list, # [int] — compteur mutable partagé entre appels
|
ua_id_counter_ref: list,
|
||||||
rule_id_counter_ref: list, # [int] — idem
|
rule_id_counter_ref: list,
|
||||||
) -> tuple[list[dict], list[dict]]:
|
) -> tuple[list[dict], list[dict]]:
|
||||||
"""
|
"""
|
||||||
Parse un fichier YAML Anubis.
|
Parse un fichier YAML Anubis.
|
||||||
Retourne (ua_rules, ip_rules).
|
Retourne ([], ip_rules) — les règles UA ne sont plus collectées.
|
||||||
|
|
||||||
Note : ua_name_to_id est maintenu pour supporter la hiérarchie parent_id dans
|
|
||||||
REGEXP_TREE (via UA_PARENT_OVERRIDE). Tant que UA_PARENT_OVERRIDE est vide,
|
|
||||||
parent_id vaut toujours 0 et ua_name_to_id n'est pas consulté en pratique.
|
|
||||||
"""
|
"""
|
||||||
data = fetch_yaml_url(url)
|
data = fetch_yaml_url(url)
|
||||||
if not data or not isinstance(data, list):
|
if not data or not isinstance(data, list):
|
||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
ua_rules, ip_rules = [], []
|
ip_rules = []
|
||||||
|
|
||||||
for rule in data:
|
for rule in data:
|
||||||
if not isinstance(rule, dict):
|
if not isinstance(rule, dict):
|
||||||
continue
|
continue
|
||||||
# Ignorer les imports (références à d'autres fichiers)
|
|
||||||
if "import" in rule:
|
if "import" in rule:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -212,58 +137,34 @@ def parse_file(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
remote_addrs = [str(c).strip() for c in rule.get("remote_addresses", []) if c]
|
remote_addrs = [str(c).strip() for c in rule.get("remote_addresses", []) if c]
|
||||||
has_ip = bool(remote_addrs)
|
|
||||||
|
|
||||||
rule_id = rule_id_counter_ref[0]
|
rule_id = rule_id_counter_ref[0]
|
||||||
rule_id_counter_ref[0] += 1
|
rule_id_counter_ref[0] += 1
|
||||||
|
|
||||||
# ── User-Agent regex ─────────────────────────────────────────────────
|
|
||||||
ua_regex = extract_ua_regex(rule)
|
|
||||||
if ua_regex:
|
|
||||||
parent_name = UA_PARENT_OVERRIDE.get(name)
|
|
||||||
parent_id = ua_name_to_id.get(parent_name, 0) if parent_name else 0
|
|
||||||
|
|
||||||
uid = ua_id_counter_ref[0]
|
|
||||||
ua_id_counter_ref[0] += 1
|
|
||||||
ua_name_to_id[name] = uid
|
|
||||||
|
|
||||||
ua_rules.append({
|
|
||||||
"id": uid,
|
|
||||||
"parent_id": parent_id,
|
|
||||||
"regexp": ua_regex,
|
|
||||||
"bot_name": name,
|
|
||||||
"action": action,
|
|
||||||
"has_ip": "1" if has_ip else "0",
|
|
||||||
"rule_id": str(rule_id),
|
|
||||||
"category": category,
|
|
||||||
})
|
|
||||||
|
|
||||||
# ── IP/CIDR ranges ───────────────────────────────────────────────────
|
# ── IP/CIDR ranges ───────────────────────────────────────────────────
|
||||||
has_ua = bool(ua_regex)
|
|
||||||
for cidr in remote_addrs:
|
for cidr in remote_addrs:
|
||||||
ip_rules.append({
|
ip_rules.append({
|
||||||
"prefix": cidr,
|
"prefix": cidr,
|
||||||
"bot_name": name,
|
"bot_name": name,
|
||||||
"action": action,
|
"action": action,
|
||||||
"rule_id": rule_id,
|
"rule_id": rule_id,
|
||||||
"has_ua": 1 if has_ua else 0,
|
"has_ua": 0,
|
||||||
"category": category,
|
"category": category,
|
||||||
})
|
})
|
||||||
|
|
||||||
return ua_rules, ip_rules
|
return [], ip_rules
|
||||||
|
|
||||||
|
|
||||||
def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
|
def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
|
||||||
"""
|
"""
|
||||||
Parse botPolicies.yaml pour les règles inline avec geoip.countries et asns.match.
|
Parse botPolicies.yaml pour les règles ASN inline.
|
||||||
Retourne (asn_rules, country_rules).
|
Retourne (asn_rules, []) — les règles pays ne sont plus collectées.
|
||||||
"""
|
"""
|
||||||
data = fetch_yaml_url(url)
|
data = fetch_yaml_url(url)
|
||||||
if not data or not isinstance(data, dict):
|
if not data or not isinstance(data, dict):
|
||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
asn_rules: list[dict] = []
|
asn_rules: list[dict] = []
|
||||||
country_rules: list[dict] = []
|
|
||||||
|
|
||||||
for rule in data.get("bots", []):
|
for rule in data.get("bots", []):
|
||||||
if not isinstance(rule, dict):
|
if not isinstance(rule, dict):
|
||||||
@ -287,31 +188,19 @@ def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
|
|||||||
"category": "policies",
|
"category": "policies",
|
||||||
})
|
})
|
||||||
|
|
||||||
# Country rules
|
return asn_rules, []
|
||||||
geoip = rule.get("geoip", {})
|
|
||||||
if isinstance(geoip, dict):
|
|
||||||
for cc in geoip.get("countries", []):
|
|
||||||
country_rules.append({
|
|
||||||
"country_code": str(cc).upper(),
|
|
||||||
"bot_name": name,
|
|
||||||
"action": action,
|
|
||||||
"category": "policies",
|
|
||||||
})
|
|
||||||
|
|
||||||
return asn_rules, country_rules
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
# Collecte de toutes les règles
|
# Collecte de toutes les règles
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def collect_all_rules() -> tuple[list, list, list, list]:
|
def collect_all_rules() -> tuple[list, list]:
|
||||||
"""Retourne (ua_rules, ip_rules, asn_rules, country_rules)."""
|
"""Retourne (ip_rules, asn_rules). Les règles UA et pays ne sont plus collectées."""
|
||||||
ua_name_to_id: dict[str, int] = {}
|
ua_name_to_id: dict[str, int] = {}
|
||||||
ua_id_counter_ref: list[int] = [1]
|
ua_id_counter_ref: list[int] = [1]
|
||||||
rule_id_counter: list[int] = [1]
|
rule_id_counter: list[int] = [1]
|
||||||
|
|
||||||
all_ua: list[dict] = []
|
|
||||||
all_ip: list[dict] = []
|
all_ip: list[dict] = []
|
||||||
|
|
||||||
for api_path, category in DIRECTORIES:
|
for api_path, category in DIRECTORIES:
|
||||||
@ -319,16 +208,15 @@ def collect_all_rules() -> tuple[list, list, list, list]:
|
|||||||
file_urls = list_yaml_files(api_path)
|
file_urls = list_yaml_files(api_path)
|
||||||
print(f" {len(file_urls)} fichiers trouvés")
|
print(f" {len(file_urls)} fichiers trouvés")
|
||||||
for url in file_urls:
|
for url in file_urls:
|
||||||
ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter)
|
_ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter)
|
||||||
all_ua.extend(ua)
|
|
||||||
all_ip.extend(ip)
|
all_ip.extend(ip)
|
||||||
|
|
||||||
# Règles ASN + pays depuis botPolicies.yaml
|
# Règles ASN depuis botPolicies.yaml
|
||||||
print(f"[INFO] Lecture de botPolicies.yaml…")
|
print(f"[INFO] Lecture de botPolicies.yaml…")
|
||||||
policies_url = f"{GITHUB_RAW}/{BOT_POLICIES_PATH}"
|
policies_url = f"{GITHUB_RAW}/{BOT_POLICIES_PATH}"
|
||||||
asn_rules, country_rules = parse_bot_policies_inline(policies_url)
|
asn_rules, _country_rules = parse_bot_policies_inline(policies_url)
|
||||||
|
|
||||||
return all_ua, all_ip, asn_rules, country_rules
|
return all_ip, asn_rules
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
@ -348,30 +236,6 @@ def get_ch_client():
|
|||||||
DB_PROC = os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing"))
|
DB_PROC = os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing"))
|
||||||
|
|
||||||
|
|
||||||
def insert_ua_rules(client, rules: list[dict]) -> None:
|
|
||||||
"""Tronque et remplace la table anubis_ua_rules avec les règles User-Agent fournies.
|
|
||||||
|
|
||||||
Le format cible est REGEXP_TREE (colonnes id, parent_id, regexp, keys[], values[]).
|
|
||||||
"""
|
|
||||||
if not rules:
|
|
||||||
print("[INFO] Aucune règle UA.")
|
|
||||||
return
|
|
||||||
client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_ua_rules")
|
|
||||||
# REGEXP_TREE format : id, parent_id, regexp, keys[], values[]
|
|
||||||
# keys = ['bot_name', 'action', 'has_ip', 'rule_id', 'category']
|
|
||||||
data = [
|
|
||||||
[
|
|
||||||
r["id"], r["parent_id"], r["regexp"],
|
|
||||||
["bot_name", "action", "has_ip", "rule_id", "category"],
|
|
||||||
[r["bot_name"], r["action"], r["has_ip"], r["rule_id"], r["category"]],
|
|
||||||
]
|
|
||||||
for r in rules
|
|
||||||
]
|
|
||||||
client.insert(f"{DB_PROC}.anubis_ua_rules", data,
|
|
||||||
column_names=["id", "parent_id", "regexp", "keys", "values"])
|
|
||||||
print(f"[OK] {len(rules)} règles UA insérées.")
|
|
||||||
|
|
||||||
|
|
||||||
def insert_ip_rules(client, rules: list[dict]) -> None:
|
def insert_ip_rules(client, rules: list[dict]) -> None:
|
||||||
"""Tronque et remplace la table anubis_ip_rules avec les règles CIDR/IP fournies."""
|
"""Tronque et remplace la table anubis_ip_rules avec les règles CIDR/IP fournies."""
|
||||||
if not rules:
|
if not rules:
|
||||||
@ -400,25 +264,11 @@ def insert_asn_rules(client, rules: list[dict]) -> None:
|
|||||||
print(f"[OK] {len(rules)} règles ASN insérées.")
|
print(f"[OK] {len(rules)} règles ASN insérées.")
|
||||||
|
|
||||||
|
|
||||||
def insert_country_rules(client, rules: list[dict]) -> None:
|
|
||||||
"""Tronque et remplace la table anubis_country_rules avec les règles pays fournies."""
|
|
||||||
if not rules:
|
|
||||||
print("[INFO] Aucune règle pays.")
|
|
||||||
return
|
|
||||||
client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_country_rules")
|
|
||||||
data = [[r["country_code"], r["bot_name"], r["action"], r["category"]] for r in rules]
|
|
||||||
client.insert(f"{DB_PROC}.anubis_country_rules", data,
|
|
||||||
column_names=["country_code", "bot_name", "action", "category"])
|
|
||||||
print(f"[OK] {len(rules)} règles pays insérées.")
|
|
||||||
|
|
||||||
|
|
||||||
def reload_dicts(client) -> None:
|
def reload_dicts(client) -> None:
|
||||||
"""Recharge les quatre dictionnaires ClickHouse Anubis après mise à jour des tables sources."""
|
"""Recharge les dictionnaires ClickHouse Anubis IP et ASN après mise à jour."""
|
||||||
dicts = [
|
dicts = [
|
||||||
f"{DB_PROC}.dict_anubis_ua",
|
|
||||||
f"{DB_PROC}.dict_anubis_ip",
|
f"{DB_PROC}.dict_anubis_ip",
|
||||||
f"{DB_PROC}.dict_anubis_asn",
|
f"{DB_PROC}.dict_anubis_asn",
|
||||||
f"{DB_PROC}.dict_anubis_country",
|
|
||||||
]
|
]
|
||||||
for d in dicts:
|
for d in dicts:
|
||||||
try:
|
try:
|
||||||
@ -432,27 +282,14 @@ def reload_dicts(client) -> None:
|
|||||||
# Rapport
|
# Rapport
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def print_summary(ua_rules, ip_rules, asn_rules, country_rules):
|
def print_summary(ip_rules, asn_rules):
|
||||||
"""Affiche un résumé lisible des règles collectées (UA, IP, ASN, pays) sur la sortie standard."""
|
"""Affiche un résumé lisible des règles collectées (IP, ASN) sur la sortie standard."""
|
||||||
print("\n── Règles UA ──")
|
|
||||||
by_cat: dict[str, list] = {}
|
|
||||||
for r in ua_rules:
|
|
||||||
by_cat.setdefault(r["category"], []).append(r)
|
|
||||||
for cat, rules in sorted(by_cat.items()):
|
|
||||||
print(f" [{cat}] {len(rules)} règle(s)")
|
|
||||||
for r in rules[:5]:
|
|
||||||
has = " [+IP]" if r["has_ip"] == "1" else ""
|
|
||||||
par = f" [parent={r['parent_id']}]" if r["parent_id"] else ""
|
|
||||||
print(f" [{r['action']:9s}] {r['bot_name']}{has}{par}: {r['regexp'][:50]}")
|
|
||||||
if len(rules) > 5:
|
|
||||||
print(f" … et {len(rules) - 5} autres")
|
|
||||||
|
|
||||||
print(f"\n── Règles IP : {len(ip_rules)} CIDRs ──")
|
print(f"\n── Règles IP : {len(ip_rules)} CIDRs ──")
|
||||||
by_bot: dict[str, list] = {}
|
by_bot: dict[str, list] = {}
|
||||||
for r in ip_rules:
|
for r in ip_rules:
|
||||||
by_bot.setdefault(r["bot_name"], []).append(r)
|
by_bot.setdefault(r["bot_name"], []).append(r)
|
||||||
for bot, rs in sorted(by_bot.items())[:15]:
|
for bot, rs in sorted(by_bot.items())[:15]:
|
||||||
print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']}, has_ua={rs[0]['has_ua']})")
|
print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']})")
|
||||||
if len(by_bot) > 15:
|
if len(by_bot) > 15:
|
||||||
print(f" … et {len(by_bot) - 15} autres bots")
|
print(f" … et {len(by_bot) - 15} autres bots")
|
||||||
|
|
||||||
@ -461,38 +298,30 @@ def print_summary(ua_rules, ip_rules, asn_rules, country_rules):
|
|||||||
for r in asn_rules:
|
for r in asn_rules:
|
||||||
print(f" [{r['action']:9s}] ASN {r['asn']}: {r['bot_name']}")
|
print(f" [{r['action']:9s}] ASN {r['asn']}: {r['bot_name']}")
|
||||||
|
|
||||||
if country_rules:
|
|
||||||
print(f"\n── Règles pays : {len(country_rules)} ──")
|
|
||||||
for r in country_rules:
|
|
||||||
print(f" [{r['action']:9s}] {r['country_code']}: {r['bot_name']}")
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
# Main
|
# Main
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
"""Point d'entrée principal : collecte les règles Anubis et les charge dans ClickHouse."""
|
"""Point d'entrée principal : collecte les règles Anubis IP/CIDR et ASN, puis les charge dans ClickHouse."""
|
||||||
print("[INFO] Collecte des règles Anubis depuis GitHub…")
|
print("[INFO] Collecte des règles Anubis depuis GitHub (IP/CIDR + ASN uniquement)…")
|
||||||
ua_rules, ip_rules, asn_rules, country_rules = collect_all_rules()
|
ip_rules, asn_rules = collect_all_rules()
|
||||||
|
|
||||||
total = len(ua_rules) + len(ip_rules) + len(asn_rules) + len(country_rules)
|
total = len(ip_rules) + len(asn_rules)
|
||||||
print(f"\n[INFO] {len(ua_rules)} règles UA, {len(ip_rules)} CIDRs IP, "
|
print(f"\n[INFO] {len(ip_rules)} CIDRs IP, {len(asn_rules)} ASN (total={total})")
|
||||||
f"{len(asn_rules)} ASN, {len(country_rules)} pays (total={total})")
|
|
||||||
|
|
||||||
if total == 0:
|
if total == 0:
|
||||||
print("[ERREUR] Aucune règle récupérée.", file=sys.stderr)
|
print("[ERREUR] Aucune règle récupérée.", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print_summary(ua_rules, ip_rules, asn_rules, country_rules)
|
print_summary(ip_rules, asn_rules)
|
||||||
|
|
||||||
print("\n[INFO] Connexion à ClickHouse…")
|
print("\n[INFO] Connexion à ClickHouse…")
|
||||||
client = get_ch_client()
|
client = get_ch_client()
|
||||||
|
|
||||||
insert_ua_rules(client, ua_rules)
|
|
||||||
insert_ip_rules(client, ip_rules)
|
insert_ip_rules(client, ip_rules)
|
||||||
insert_asn_rules(client, asn_rules)
|
insert_asn_rules(client, asn_rules)
|
||||||
insert_country_rules(client, country_rules)
|
|
||||||
reload_dicts(client)
|
reload_dicts(client)
|
||||||
|
|
||||||
print("\n[OK] Règles Anubis chargées avec succès.")
|
print("\n[OK] Règles Anubis chargées avec succès.")
|
||||||
|
|||||||
@ -61,66 +61,23 @@ SELECT
|
|||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
|
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
|
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
|
||||||
|
|
||||||
-- Anubis enrichment : logique de correspondance combinée UA+IP
|
-- Anubis enrichment: IP/CIDR > ASN (simplified — UA and Country rules removed)
|
||||||
-- Priorité : (1) UA+IP [même rule_id] > (2) UA seul > (3) IP seul > (4) ASN > (5) Pays
|
COALESCE(
|
||||||
CASE
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, ''), ''),
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
''
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
) AS anubis_bot_name,
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_name,
|
|
||||||
|
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
) AS anubis_bot_action,
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_action,
|
|
||||||
|
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
) AS anubis_bot_category
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_category
|
|
||||||
|
|
||||||
FROM ja4_logs.http_logs_raw
|
FROM ja4_logs.http_logs_raw
|
||||||
|
|||||||
@ -12,64 +12,22 @@ WITH base_data AS (
|
|||||||
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
|
nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
|
||||||
''
|
''
|
||||||
) AS bot_name,
|
) AS bot_name,
|
||||||
-- Anubis : logique combinée UA+IP (même rule_id) > UA seul > IP seul > ASN > Pays
|
-- Anubis : IP/CIDR > ASN (simplifié — UA et Pays retirés)
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
) AS anubis_bot_name,
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
COALESCE(
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua)
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''),
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
''
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua)
|
) AS anubis_bot_action,
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
COALESCE(
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''),
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '')
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''),
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
''
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '')
|
) AS anubis_bot_category,
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_name,
|
|
||||||
CASE
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_action,
|
|
||||||
CASE
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_category,
|
|
||||||
a.hits AS hits,
|
a.hits AS hits,
|
||||||
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
|
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
|
||||||
a.correlated AS correlated,
|
a.correlated AS correlated,
|
||||||
|
|||||||
@ -1499,10 +1499,8 @@ _REFLIST_SORT = {
|
|||||||
"browser_ja4": {"ja4", "browser_family", "tls_library"},
|
"browser_ja4": {"ja4", "browser_family", "tls_library"},
|
||||||
"asn_reputation": {"src_asn", "label"},
|
"asn_reputation": {"src_asn", "label"},
|
||||||
"iplocate_asn": {"asn", "country_code", "name", "network"},
|
"iplocate_asn": {"asn", "country_code", "name", "network"},
|
||||||
"anubis_ua_rules": {"id", "regexp", "bot_name", "action", "category"},
|
|
||||||
"anubis_ip_rules": {"prefix", "bot_name", "action", "category"},
|
"anubis_ip_rules": {"prefix", "bot_name", "action", "category"},
|
||||||
"anubis_asn_rules": {"asn", "bot_name", "action", "category"},
|
"anubis_asn_rules": {"asn", "bot_name", "action", "category"},
|
||||||
"anubis_country_rules": {"country_code", "bot_name", "action", "category"},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_REFLIST_SEARCH_COLS: dict[str, list[str]] = {
|
_REFLIST_SEARCH_COLS: dict[str, list[str]] = {
|
||||||
@ -1511,10 +1509,8 @@ _REFLIST_SEARCH_COLS: dict[str, list[str]] = {
|
|||||||
"browser_ja4": ["ja4", "browser_family", "tls_library", "context"],
|
"browser_ja4": ["ja4", "browser_family", "tls_library", "context"],
|
||||||
"asn_reputation": ["toString(src_asn)", "label"],
|
"asn_reputation": ["toString(src_asn)", "label"],
|
||||||
"iplocate_asn": ["network", "toString(asn)", "country_code", "name"],
|
"iplocate_asn": ["network", "toString(asn)", "country_code", "name"],
|
||||||
"anubis_ua_rules": ["regexp", "bot_name", "action", "category"],
|
|
||||||
"anubis_ip_rules": ["prefix", "bot_name", "action", "category"],
|
"anubis_ip_rules": ["prefix", "bot_name", "action", "category"],
|
||||||
"anubis_asn_rules": ["toString(asn)", "bot_name", "action", "category"],
|
"anubis_asn_rules": ["toString(asn)", "bot_name", "action", "category"],
|
||||||
"anubis_country_rules": ["country_code", "bot_name", "action", "category"],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1532,22 +1528,12 @@ _REFLIST_QUERIES: dict[str, str] = {
|
|||||||
f"SELECT network, asn, country_code, name "
|
f"SELECT network, asn, country_code, name "
|
||||||
f"FROM dictionary('{_DB}.dict_iplocate_asn')"
|
f"FROM dictionary('{_DB}.dict_iplocate_asn')"
|
||||||
),
|
),
|
||||||
"anubis_ua_rules": (
|
|
||||||
f"SELECT id, parent_id, regexp, "
|
|
||||||
f" arrayElement(values, indexOf(keys, 'bot_name')) AS bot_name, "
|
|
||||||
f" arrayElement(values, indexOf(keys, 'action')) AS action, "
|
|
||||||
f" arrayElement(values, indexOf(keys, 'category')) AS category "
|
|
||||||
f"FROM {_DB}.anubis_ua_rules"
|
|
||||||
),
|
|
||||||
"anubis_ip_rules": (
|
"anubis_ip_rules": (
|
||||||
f"SELECT prefix, bot_name, action, category FROM {_DB}.anubis_ip_rules"
|
f"SELECT prefix, bot_name, action, category FROM {_DB}.anubis_ip_rules"
|
||||||
),
|
),
|
||||||
"anubis_asn_rules": (
|
"anubis_asn_rules": (
|
||||||
f"SELECT asn, bot_name, action, category FROM {_DB}.anubis_asn_rules"
|
f"SELECT asn, bot_name, action, category FROM {_DB}.anubis_asn_rules"
|
||||||
),
|
),
|
||||||
"anubis_country_rules": (
|
|
||||||
f"SELECT country_code, bot_name, action, category FROM {_DB}.anubis_country_rules"
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,7 @@ Listes de référence
|
|||||||
<tr><td class="pr-2 font-mono text-brand-400">browser_ja4</td><td>Fingerprints JA4 navigateurs → famille, lib TLS</td></tr>
|
<tr><td class="pr-2 font-mono text-brand-400">browser_ja4</td><td>Fingerprints JA4 navigateurs → famille, lib TLS</td></tr>
|
||||||
<tr><td class="pr-2 font-mono text-brand-400">asn_reputation</td><td>ASN → label de réputation (isp, datacenter, cdn…)</td></tr>
|
<tr><td class="pr-2 font-mono text-brand-400">asn_reputation</td><td>ASN → label de réputation (isp, datacenter, cdn…)</td></tr>
|
||||||
<tr><td class="pr-2 font-mono text-brand-400">iplocate_asn</td><td>IP/CIDR → ASN, pays, nom (géolocalisation)</td></tr>
|
<tr><td class="pr-2 font-mono text-brand-400">iplocate_asn</td><td>IP/CIDR → ASN, pays, nom (géolocalisation)</td></tr>
|
||||||
<tr><td class="pr-2 font-mono text-brand-400">anubis_*</td><td>Règles Anubis : UA, IP, ASN, pays (filtrage crawlers)</td></tr>
|
<tr><td class="pr-2 font-mono text-brand-400">anubis_*</td><td>Règles Anubis : IP/CIDR et ASN (filtrage crawlers)</td></tr>
|
||||||
</table>
|
</table>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -55,10 +55,8 @@ Listes de référence
|
|||||||
<button class="tab-btn" data-tab="browser_ja4">🌐 Browser JA4</button>
|
<button class="tab-btn" data-tab="browser_ja4">🌐 Browser JA4</button>
|
||||||
<button class="tab-btn" data-tab="asn_reputation">🏢 ASN Réputation</button>
|
<button class="tab-btn" data-tab="asn_reputation">🏢 ASN Réputation</button>
|
||||||
<button class="tab-btn" data-tab="iplocate_asn">🌍 IPLocate</button>
|
<button class="tab-btn" data-tab="iplocate_asn">🌍 IPLocate</button>
|
||||||
<button class="tab-btn" data-tab="anubis_ua_rules">🕷 Anubis UA</button>
|
|
||||||
<button class="tab-btn" data-tab="anubis_ip_rules">🕷 Anubis IP</button>
|
<button class="tab-btn" data-tab="anubis_ip_rules">🕷 Anubis IP</button>
|
||||||
<button class="tab-btn" data-tab="anubis_asn_rules">🕷 Anubis ASN</button>
|
<button class="tab-btn" data-tab="anubis_asn_rules">🕷 Anubis ASN</button>
|
||||||
<button class="tab-btn" data-tab="anubis_country_rules">🕷 Anubis Pays</button>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Controls row -->
|
<!-- Controls row -->
|
||||||
@ -141,10 +139,8 @@ const LIST_DOC = {
|
|||||||
browser_ja4:'Fingerprints JA4 de navigateurs légitimes avec famille (Chromium, Firefox, Safari…), librairie TLS et contexte. Source : dict_browser_ja4.',
|
browser_ja4:'Fingerprints JA4 de navigateurs légitimes avec famille (Chromium, Firefox, Safari…), librairie TLS et contexte. Source : dict_browser_ja4.',
|
||||||
asn_reputation:'Réputation de chaque ASN (isp, datacenter, cdn, hosting, education…). Source : dict_asn_reputation. Utilisé pour la baseline ISP humaine.',
|
asn_reputation:'Réputation de chaque ASN (isp, datacenter, cdn, hosting, education…). Source : dict_asn_reputation. Utilisé pour la baseline ISP humaine.',
|
||||||
iplocate_asn:'Géolocalisation IP→ASN : réseau, numéro ASN, code pays, nom opérateur. Source : dict_iplocate_asn (IP_TRIE, ~714K entrées).',
|
iplocate_asn:'Géolocalisation IP→ASN : réseau, numéro ASN, code pays, nom opérateur. Source : dict_iplocate_asn (IP_TRIE, ~714K entrées).',
|
||||||
anubis_ua_rules:'Règles Anubis de détection par User-Agent (REGEXP_TREE). Chaque règle associe un pattern regex à un bot_name et une action (ALLOW/DENY/WEIGH).',
|
|
||||||
anubis_ip_rules:'Règles Anubis de détection par plage IP (IP_TRIE). Associe des CIDR à des bots connus avec action de filtrage.',
|
anubis_ip_rules:'Règles Anubis de détection par plage IP (IP_TRIE). Associe des CIDR à des bots connus avec action de filtrage.',
|
||||||
anubis_asn_rules:'Règles Anubis par ASN : certains ASN sont associés à des botnets ou services automatisés connus.',
|
anubis_asn_rules:'Règles Anubis par ASN : certains ASN sont associés à des botnets ou services automatisés connus.',
|
||||||
anubis_country_rules:'Règles Anubis par pays : politique de filtrage par code pays (ex: bloquer le trafic de certaines régions).',
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Colonnes par liste
|
// Colonnes par liste
|
||||||
@ -154,10 +150,8 @@ const COLUMNS = {
|
|||||||
browser_ja4:[{k:'ja4',f:'JA4'},{k:'browser_family',f:'Famille'},{k:'tls_library',f:'Lib TLS'},{k:'context',f:'Contexte'}],
|
browser_ja4:[{k:'ja4',f:'JA4'},{k:'browser_family',f:'Famille'},{k:'tls_library',f:'Lib TLS'},{k:'context',f:'Contexte'}],
|
||||||
asn_reputation:[{k:'src_asn',f:'ASN'},{k:'label',f:'Réputation'}],
|
asn_reputation:[{k:'src_asn',f:'ASN'},{k:'label',f:'Réputation'}],
|
||||||
iplocate_asn:[{k:'network',f:'Réseau'},{k:'asn',f:'ASN'},{k:'country_code',f:'Pays'},{k:'name',f:'Nom'}],
|
iplocate_asn:[{k:'network',f:'Réseau'},{k:'asn',f:'ASN'},{k:'country_code',f:'Pays'},{k:'name',f:'Nom'}],
|
||||||
anubis_ua_rules:[{k:'id',f:'ID'},{k:'regexp',f:'Regex'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
|
||||||
anubis_ip_rules:[{k:'prefix',f:'IP/CIDR'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
anubis_ip_rules:[{k:'prefix',f:'IP/CIDR'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
||||||
anubis_asn_rules:[{k:'asn',f:'ASN'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
anubis_asn_rules:[{k:'asn',f:'ASN'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
||||||
anubis_country_rules:[{k:'country_code',f:'Pays'},{k:'bot_name',f:'Bot'},{k:'action',f:'Action'},{k:'category',f:'Catégorie'}],
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Tab click
|
// Tab click
|
||||||
|
|||||||
@ -1,10 +1,14 @@
|
|||||||
-- =============================================================================
|
-- =============================================================================
|
||||||
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
|
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
|
||||||
-- Items 1–8 from bot_detector/anubis/deploy_schema.sql
|
-- Only IP/CIDR and ASN rules are populated by fetch_rules.py.
|
||||||
|
-- Only the anubis_ua_rules table is kept, as a stub for REGEXP_TREE catch-all
|
||||||
|
-- compatibility; it is never populated with real data. Country rules are removed.
|
||||||
-- =============================================================================
|
-- =============================================================================
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 1. TABLE SOURCE — User-Agent rules (for REGEXP_TREE dictionary)
|
-- 1. TABLE SOURCE — User-Agent rules (REGEXP_TREE stub)
|
||||||
|
-- REGEXP_TREE requires ≥1 rule; the catch-all is seeded at init time.
|
||||||
|
-- This table is NOT populated by fetch_rules.py.
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
||||||
(
|
(
|
||||||
@ -20,6 +24,7 @@ ORDER BY id;
|
|||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
|
-- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
|
||||||
|
-- Populated by fetch_rules.py from Anubis GitHub data.
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
||||||
(
|
(
|
||||||
@ -35,30 +40,23 @@ ORDER BY prefix;
|
|||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 3. DICTIONARY — UA REGEXP_TREE
|
-- 3. TABLE SOURCE — ASN rules (for Flat dictionary)
|
||||||
-- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent)
|
-- Populated by fetch_rules.py from botPolicies.yaml.
|
||||||
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua;
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_ua
|
|
||||||
(
|
(
|
||||||
regexp String,
|
asn UInt32,
|
||||||
bot_name String,
|
bot_name LowCardinality(String),
|
||||||
action String,
|
action LowCardinality(String),
|
||||||
has_ip String,
|
category LowCardinality(String)
|
||||||
rule_id String,
|
|
||||||
category String
|
|
||||||
)
|
)
|
||||||
PRIMARY KEY regexp
|
ENGINE = ReplacingMergeTree()
|
||||||
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules'))
|
ORDER BY asn;
|
||||||
LAYOUT(REGEXP_TREE)
|
|
||||||
LIFETIME(MIN 300 MAX 600);
|
|
||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 4. DICTIONARY — IP IP_TRIE
|
-- 4. DICTIONARY — IP IP_TRIE (active)
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
|
-- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
|
||||||
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip;
|
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip;
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_ip
|
CREATE DICTIONARY ja4_processing.dict_anubis_ip
|
||||||
@ -77,37 +75,8 @@ LIFETIME(MIN 300 MAX 600);
|
|||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 5. TABLE SOURCE — ASN rules (for Flat dictionary)
|
-- 5. DICTIONARY — ASN Flat (active)
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules
|
|
||||||
(
|
|
||||||
asn UInt32,
|
|
||||||
bot_name LowCardinality(String),
|
|
||||||
action LowCardinality(String),
|
|
||||||
category LowCardinality(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY asn;
|
|
||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
-- 6. TABLE SOURCE — Country rules (for Flat dictionary)
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules
|
|
||||||
(
|
|
||||||
country_code LowCardinality(String),
|
|
||||||
bot_name LowCardinality(String),
|
|
||||||
action LowCardinality(String),
|
|
||||||
category LowCardinality(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY country_code;
|
|
||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
-- 7. DICTIONARY — ASN Flat
|
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '')
|
-- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '')
|
||||||
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn;
|
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn;
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_asn
|
CREATE DICTIONARY ja4_processing.dict_anubis_asn
|
||||||
@ -121,22 +90,3 @@ PRIMARY KEY asn
|
|||||||
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_asn_rules'))
|
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_asn_rules'))
|
||||||
LAYOUT(FLAT())
|
LAYOUT(FLAT())
|
||||||
LIFETIME(MIN 300 MAX 600);
|
LIFETIME(MIN 300 MAX 600);
|
||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
-- 8. DICTIONARY — Country COMPLEX_KEY_HASHED
|
|
||||||
-- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '')
|
|
||||||
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country;
|
|
||||||
CREATE DICTIONARY ja4_processing.dict_anubis_country
|
|
||||||
(
|
|
||||||
country_code String,
|
|
||||||
bot_name String,
|
|
||||||
action String,
|
|
||||||
category String
|
|
||||||
)
|
|
||||||
PRIMARY KEY country_code
|
|
||||||
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules'))
|
|
||||||
LAYOUT(COMPLEX_KEY_HASHED())
|
|
||||||
LIFETIME(MIN 300 MAX 600);
|
|
||||||
|
|||||||
@ -175,66 +175,23 @@ SELECT
|
|||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
|
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
|
||||||
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
|
coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
|
||||||
|
|
||||||
-- Anubis enrichment: combined UA+IP priority logic
|
-- Anubis enrichment: IP/CIDR > ASN (simplified — UA and Country rules removed)
|
||||||
-- Priority: (1) UA+IP [same rule_id] > (2) UA only > (3) IP only > (4) ASN > (5) Country
|
COALESCE(
|
||||||
CASE
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, ''), ''),
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
''
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
) AS anubis_bot_name,
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_name,
|
|
||||||
|
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
) AS anubis_bot_action,
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_action,
|
|
||||||
|
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua))
|
) AS anubis_bot_category
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_category
|
|
||||||
|
|
||||||
FROM ja4_logs.http_logs_raw;
|
FROM ja4_logs.http_logs_raw;
|
||||||
|
|||||||
@ -18,64 +18,22 @@ WITH base_data AS (
|
|||||||
''
|
''
|
||||||
) AS bot_name,
|
) AS bot_name,
|
||||||
dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family,
|
dictGetOrDefault('ja4_processing.dict_browser_ja4', 'browser_family', tuple(a.ja4), '') AS browser_family,
|
||||||
-- Anubis: combined UA+IP priority logic > UA only > IP only > ASN > Country
|
-- Anubis: IP/CIDR > ASN (simplified — UA and Country rules removed)
|
||||||
CASE
|
COALESCE(
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), ''), ''),
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
''
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
) AS anubis_bot_name,
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
COALESCE(
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua)
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, ''), ''),
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), ''), ''),
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
''
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua)
|
) AS anubis_bot_action,
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
COALESCE(
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, ''), ''),
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '')
|
nullIf(dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), ''), ''),
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
''
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '')
|
) AS anubis_bot_category,
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_name,
|
|
||||||
CASE
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_action,
|
|
||||||
CASE
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua))
|
|
||||||
= dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua)
|
|
||||||
WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
|
|
||||||
AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != ''
|
|
||||||
THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua)
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
|
|
||||||
AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '')
|
|
||||||
WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
|
|
||||||
THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '')
|
|
||||||
ELSE ''
|
|
||||||
END AS anubis_bot_category,
|
|
||||||
a.hits AS hits,
|
a.hits AS hits,
|
||||||
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
|
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
|
||||||
a.correlated AS correlated,
|
a.correlated AS correlated,
|
||||||
|
|||||||
Reference in New Issue
Block a user