feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized
Services: - ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap) - logcorrelator: JA4 log correlation engine (Go, ClickHouse) - mod_reqin_log: Apache module (C, JSON request logging) - bot_detector: ML bot detection pipeline (Python) - dashboard: FastAPI/Streamlit analytics UI (Python) Shared libraries: - shared/go/ja4common: logger, config, shutdown, ipfilter (Go module) - shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package) - shared/clickhouse/: canonical SQL migrations (10 files) Build & packaging: - Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10) - go.work workspace linking sentinel, correlator, ja4common - Makefile with test-all, build-all, rpm-* targets Fixes applied: - go.work: 1.21 → 1.24.6 (required by sentinel) - correlator Dockerfiles: golang:1.21 → golang:1.24 - replace directives in go.mod for ja4common local path - pyproject.toml: setuptools.backends → setuptools.build_meta - Removed static libpcap linking (unavailable on Rocky 9) - Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32) - Rewrote corrupted test files (logger_test.go × 2) Test coverage: - correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%) - sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse) Documentation: - README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
339
services/bot-detector/anubis/deploy_schema.sql
Normal file
339
services/bot-detector/anubis/deploy_schema.sql
Normal file
@ -0,0 +1,339 @@
|
||||
-- ============================================================================
|
||||
-- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
|
||||
-- Architecture :
|
||||
-- anubis_ua_rules (table) → dict_anubis_ua (REGEXP_TREE)
|
||||
-- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
|
||||
--   http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
||||
-- mv_http_logs : reconstruit avec enrichissement Anubis
|
||||
-- view_ai_features_1h : +anubis_bot_name, +anubis_bot_action (via dictGet)
|
||||
--   ml_detected_anomalies : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
||||
--   ml_all_scores : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
||||
-- ============================================================================
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 1. SOURCE TABLE — User-Agent rules (feeds the REGEXP_TREE dictionary)
--
-- Row layout expected by the ClickHouse regexp_tree dictionary (v23.5+):
--   id         UInt64         unique identifier
--   parent_id  UInt64         0 = root, otherwise the parent's id (attribute inheritance)
--   regexp     String         regular expression (re2/vectorscan)
--   keys       Array(String)  attribute names, e.g. ['bot_name', 'action']
--   values     Array(String)  the matching attribute values
--
-- Hierarchy used for priority:
--   generic DENY rules (parent_id = 0) → specific ALLOW children
--   e.g. ai-crawlers-training (parent) → openai-gptbot (child)
--   When a UA matches both child and parent, the child's name is returned
--   (the child inherits AND overrides the parent's attributes).
-- ----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ua_rules
(
    id        UInt64,
    parent_id UInt64,
    regexp    String,
    keys      Array(String),
    values    Array(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY id;
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 2. SOURCE TABLE — IP/CIDR rules (feeds the IP_TRIE dictionary)
--
-- Columns required by dict_anubis_ip and mv_http_logs:
--   rule_id   rule identifier, cross-referenced with dict_anubis_ua for the
--             combined UA+IP logic (same rule_id → combined match)
--   has_ua    1 when the rule also carries a UA regex (cross-check needed)
--   category  Anubis category (bots, crawlers, clients, policies, …)
-- ----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ip_rules
(
    prefix   String,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    rule_id  UInt64,
    has_ua   UInt8,
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY prefix;
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 3. UA DICTIONARY — REGEXP_TREE
--    Lookup: dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent)
--
-- The PRIMARY KEY is 'regexp' (String) — required by ClickHouse 26.x.
-- Internal connection (HOST localhost PORT 9000) to avoid an HTTP deadlock.
-- Replace 'admin' and the password with the real ClickHouse credentials.
-- ----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ua;

CREATE DICTIONARY mabase_prod.dict_anubis_ua
(
    regexp   String,
    bot_name String,
    action   String
)
PRIMARY KEY regexp
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_ua_rules'
))
LAYOUT(REGEXP_TREE)
LIFETIME(MIN 300 MAX 600);
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 4. IP DICTIONARY — IP_TRIE
--    Lookup: dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
--
-- Internal connection (HOST localhost PORT 9000) — same reason as dict_anubis_ua.
-- ----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip;

CREATE DICTIONARY mabase_prod.dict_anubis_ip
(
    prefix   String,
    bot_name String,
    action   String,
    rule_id  UInt64,
    has_ua   UInt8,
    category String
)
PRIMARY KEY prefix
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_ip_rules'
))
LAYOUT(IP_TRIE())
LIFETIME(MIN 300 MAX 600);
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 5. SOURCE TABLE — ASN rules (source of dict_anubis_asn)
--    Populated from botPolicies.yaml by fetch_rules.py → insert_asn_rules()
-- ----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_asn_rules
(
    asn      UInt32,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY asn;
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 6. SOURCE TABLE — ISO-3166 country rules (source of dict_anubis_country)
--    Populated from botPolicies.yaml by fetch_rules.py → insert_country_rules()
-- ----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_country_rules
(
    country_code LowCardinality(String),
    bot_name     LowCardinality(String),
    action       LowCardinality(String),
    category     LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY country_code;
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 7. ASN DICTIONARY — Hashed
--    Lookup: dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', src_asn, '')
--
-- HASHED is used instead of FLAT: a FLAT dictionary stores keys as array
-- offsets and, by default, refuses to load any key above max_array_size
-- (500,000). AS numbers are 32-bit values (the source column is UInt32), so a
-- single 4-byte ASN in the rules would make the whole dictionary fail to load.
-- HASHED has no such bound and keeps the same dictGet interface.
-- ----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn;

CREATE DICTIONARY mabase_prod.dict_anubis_asn
(
    asn      UInt32,
    bot_name String,
    action   String,
    category String
)
PRIMARY KEY asn
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_asn_rules'))
LAYOUT(HASHED())
LIFETIME(MIN 300 MAX 600);
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 8. COUNTRY DICTIONARY — Complex-key hashed
--    Lookup:
--      dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name',
--                       tuple(src_country_code), '')
--
-- The key is a String (ISO-3166 country code). FLAT only supports a single
-- UInt64 key, so a String-keyed dictionary must use a complex-key layout;
-- COMPLEX_KEY_HASHED is the direct equivalent. Complex-key dictionaries take
-- the key as a tuple, hence tuple(src_country_code) in the lookup above.
-- ----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country;

CREATE DICTIONARY mabase_prod.dict_anubis_country
(
    country_code String,
    bot_name     String,
    action       String,
    category     String
)
PRIMARY KEY country_code
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_country_rules'))
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 600);
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 9. ADD THE ANUBIS COLUMNS to http_logs
--    Idempotent: IF NOT EXISTS makes a re-run a no-op.
-- ----------------------------------------------------------------------------
ALTER TABLE mabase_prod.http_logs
    ADD COLUMN IF NOT EXISTS anubis_bot_name     LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_action   LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 10. REBUILD mv_http_logs with Anubis enrichment
--     Priority logic:
--       1. UA regex (most informative — identifies the precise bot)
--       2. IP/CIDR  (fallback — identifies the cloud network)
--
-- NOTE(review): the view is dropped and re-created, so rows inserted into
-- http_logs_raw between the two statements are not forwarded to http_logs.
-- NOTE(review): only anubis_bot_name / anubis_bot_action are produced here;
-- http_logs.anubis_bot_category keeps its DEFAULT ''.
-- ----------------------------------------------------------------------------
DROP VIEW IF EXISTS mabase_prod.mv_http_logs;

CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs
TO mabase_prod.http_logs
(
    `time`                      DateTime,
    `log_date`                  Date,
    `src_ip`                    IPv4,
    `src_port`                  UInt16,
    `src_asn`                   UInt32,
    `src_country_code`          String,
    `dst_ip`                    IPv4,
    `dst_port`                  UInt16,
    `src_as_name`               String,
    `src_org`                   String,
    `src_domain`                String,
    `method`                    String,
    `scheme`                    String,
    `host`                      String,
    `path`                      String,
    `query`                     String,
    `http_version`              String,
    `orphan_side`               String,
    `correlated`                UInt8,
    `keepalives`                UInt16,
    `a_timestamp`               UInt64,
    `b_timestamp`               UInt64,
    `conn_id`                   String,
    `ip_meta_df`                UInt8,
    `ip_meta_id`                UInt16,
    `ip_meta_total_length`      UInt16,
    `ip_meta_ttl`               UInt8,
    `tcp_meta_options`          String,
    `tcp_meta_window_size`      UInt32,
    `tcp_meta_mss`              UInt16,
    `tcp_meta_window_scale`     UInt8,
    `syn_to_clienthello_ms`     Int32,
    `tls_version`               String,
    `tls_sni`                   String,
    `tls_alpn`                  String,
    `ja3`                       String,
    `ja3_hash`                  String,
    `ja4`                       String,
    `client_headers`            String,
    `header_user_agent`         String,
    `header_accept`             String,
    `header_accept_encoding`    String,
    `header_accept_language`    String,
    `header_content_type`       String,
    `header_x_request_id`       String,
    `header_x_trace_id`         String,
    `header_x_forwarded_for`    String,
    `header_sec_ch_ua`          String,
    `header_sec_ch_ua_mobile`   String,
    `header_sec_ch_ua_platform` String,
    `header_sec_fetch_dest`     String,
    `header_sec_fetch_mode`     String,
    `header_sec_fetch_site`     String,
    `anubis_bot_name`           String,
    `anubis_bot_action`         String
)
AS SELECT
    -- ── Timestamps and endpoints ─────────────────────────────────────────────
    parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time,
    toDate(time) AS log_date,
    toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0')) AS src_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', toIPv6(src_ip), toUInt32(0)) AS src_asn,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', toIPv6(src_ip), '') AS src_country_code,
    toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', toIPv6(src_ip), '') AS src_as_name,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', toIPv6(src_ip), '') AS src_org,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', toIPv6(src_ip), '') AS src_domain,

    -- ── HTTP request line and correlation metadata ───────────────────────────
    coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
    coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
    coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
    coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
    coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
    coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
    coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
    toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
    coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
    coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
    coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,

    -- ── IP / TCP metadata ────────────────────────────────────────────────────
    toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0)) AS ip_meta_id,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0)) AS ip_meta_total_length,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0)) AS ip_meta_ttl,
    coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
    toUInt32(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0)) AS tcp_meta_window_size,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_mss'), 0)) AS tcp_meta_mss,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_scale'), 0)) AS tcp_meta_window_scale,
    toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,

    -- ── TLS fingerprints ─────────────────────────────────────────────────────
    coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
    coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
    coalesce(JSONExtractString(raw_json, 'tls_alpn'), '') AS tls_alpn,
    coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
    coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
    coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,

    -- ── Request headers ──────────────────────────────────────────────────────
    coalesce(JSONExtractString(raw_json, 'client_headers'), '') AS client_headers,
    coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
    coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
    coalesce(JSONExtractString(raw_json, 'header_Content-Type'), '') AS header_content_type,
    coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,

    -- ── Anubis enrichment ────────────────────────────────────────────────────
    -- Priority: UA regex > IP/CIDR (the UA pinpoints the bot).
    -- header_user_agent and src_ip are the aliases computed above; they expand
    -- to the same expressions the lookups would otherwise repeat inline.
    coalesce(
        nullIf(dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent), ''),
        nullIf(dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), ''), ''),
        ''
    ) AS anubis_bot_name,
    coalesce(
        nullIf(dictGet('mabase_prod.dict_anubis_ua', 'action', header_user_agent), ''),
        nullIf(dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', toIPv6(src_ip), ''), ''),
        ''
    ) AS anubis_bot_action
FROM mabase_prod.http_logs_raw;
|
||||
|
||||
-- ============================================================================
|
||||
-- INTÉGRATION ML — Propagation Anubis vers le pipeline bot_detector
|
||||
-- ============================================================================
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 11. ANUBIS COLUMNS on ml_detected_anomalies
--     IF NOT EXISTS keeps the statement safe to re-run.
-- ----------------------------------------------------------------------------
ALTER TABLE mabase_prod.ml_detected_anomalies
    ADD COLUMN IF NOT EXISTS anubis_bot_name     LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_action   LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||
|
||||
-- ----------------------------------------------------------------------------
-- 12. ANUBIS COLUMNS on ml_all_scores
--     IF NOT EXISTS keeps the statement safe to re-run.
-- ----------------------------------------------------------------------------
ALTER TABLE mabase_prod.ml_all_scores
    ADD COLUMN IF NOT EXISTS anubis_bot_name     LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_action   LowCardinality(String) DEFAULT '',
    ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT '';
|
||||
|
||||
-- ----------------------------------------------------------------------------
|
||||
-- 13. VIEW view_ai_features_1h — Enrichissement Anubis
|
||||
-- Ajoute anubis_bot_name et anubis_bot_action via dictGet.
|
||||
-- Priorité : UA regex (first_ua → dict_anubis_ua) > IP/CIDR (src_ip → dict_anubis_ip)
|
||||
-- Voir le fichier complet dans /tmp/update_view_ai_features.sql ou recréer
|
||||
-- avec CREATE OR REPLACE VIEW après avoir appliqué les étapes précédentes.
|
||||
-- ----------------------------------------------------------------------------
|
||||
-- NOTE : Exécuter le contenu de /tmp/update_view_ai_features.sql ici (trop long).
|
||||
-- Ou lancer depuis le repo : clickhouse-client --queries-file bot_detector/anubis/view_ai_features_anubis.sql
|
||||
486
services/bot-detector/anubis/fetch_rules.py
Normal file
486
services/bot-detector/anubis/fetch_rules.py
Normal file
@ -0,0 +1,486 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
fetch_rules.py — Récupère TOUTES les règles Anubis depuis GitHub et les insère dans ClickHouse.
|
||||
|
||||
Sources :
|
||||
- data/bots/**/*.yaml (bots pathologiques, IA, IRC)
|
||||
- data/crawlers/*.yaml (crawlers légitimes et clouds)
|
||||
- data/clients/*.yaml (clients IA agissant pour utilisateurs)
|
||||
- data/common/*.yaml (règles communes : IPs privées, etc.)
|
||||
- data/botPolicies.yaml (règles ASN et pays inline)
|
||||
|
||||
Usage (depuis le container dashboard_web) :
|
||||
python /tmp/fetch_rules.py
|
||||
|
||||
Variables d'environnement :
|
||||
CLICKHOUSE_HOST, CLICKHOUSE_DB, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("[ERREUR] pyyaml manquant.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import clickhouse_connect
|
||||
except ImportError:
|
||||
print("[ERREUR] clickhouse-connect manquant.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Config
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Base URLs of the upstream Anubis repository: the contents API is used to
# list directories, the raw host to download botPolicies.yaml.
GITHUB_API = "https://api.github.com/repos/TecharoHQ/anubis/contents"
GITHUB_RAW = "https://raw.githubusercontent.com/TecharoHQ/anubis/main"

# Directories to walk — the ORDER IS CRITICAL for REGEXP_TREE.
# Per the original author's note: among root-level REGEXP_TREE rules, the rule
# with the lowest ID wins when several match.
#   → SPECIFIC rules must be loaded FIRST (low IDs) to win over catch-alls.
#   → Catch-alls (ai-robots-txt, ai-catchall) must be loaded LAST (high IDs).
#
# Within each directory, list_yaml_files() sorts the files in REVERSE
# ALPHABETICAL order so that catch-all files such as ai.yaml, which sort early
# alphabetically, are processed last and receive the highest IDs.
DIRECTORIES = [
    ("data/clients", "clients"),             # AI client rules with IPs (openai-chatgpt-user, etc.)
    ("data/bots/irc-bots", "bots/irc-bots"), # Specific IRC bots
    ("data/crawlers", "crawlers"),           # Specific crawlers + clouds
    ("data/common", "common"),               # Private IPs, common routes
    ("data/bots", "bots"),                   # Broad catch-alls (ai-robots-txt, ai-catchall) — LAST
]

# Main policy file (inline ASN + country rules)
BOT_POLICIES_PATH = "data/botPolicies.yaml"

# UA_PARENT_OVERRIDE: maps rule_name → parent_name to force a REGEXP_TREE hierarchy.
# Intentionally left empty: the load order (specific before catch-all) provides
# the priority without an explicit parent_id hierarchy.
# Populate this dict when a rule must inherit from another one via parent_id.
UA_PARENT_OVERRIDE: dict[str, str] = {}
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# HTTP helpers
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _fetch_url(url: str, timeout: int = 15) -> str | None:
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
||||
return resp.read().decode("utf-8")
|
||||
except urllib.error.URLError as e:
|
||||
print(f"[WARN] {url}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def fetch_yaml_url(url: str) -> list | dict | None:
    """Download ``url`` and parse its body as YAML.

    Args:
        url: Address of the YAML document.

    Returns:
        The parsed document (whatever ``yaml.safe_load`` produces), or
        ``None`` when the download failed, the body was empty, or the body is
        not valid YAML. A malformed file is reported on stderr and skipped so
        one bad upstream file does not abort the whole import.
    """
    content = _fetch_url(url)
    if not content:
        return None
    try:
        return yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"[WARN] {url}: YAML invalide: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def list_yaml_files(api_path: str) -> list[str]:
    """List the raw download URLs of the YAML files directly under ``api_path``.

    The directory is read through the GitHub contents API (non-recursive).
    Files are returned in REVERSE ALPHABETICAL order of their name, so a file
    such as ``openai-chatgpt-user.yaml`` comes before ``ai.yaml``; callers use
    this order to assign lower IDs to the files processed first.

    Args:
        api_path: Repository-relative directory, e.g. ``"data/crawlers"``.

    Returns:
        Download URLs of the ``.yaml``/``.yml`` files. Empty when the request
        fails, the response is not JSON, or the response is not a directory
        listing (the API returns an object, not an array, for a single file).
    """
    content = _fetch_url(f"{GITHUB_API}/{api_path}")
    if not content:
        return []
    try:
        entries = json.loads(content)
    except json.JSONDecodeError:
        return []
    if not isinstance(entries, list):
        print(f"[WARN] {api_path}: réponse GitHub inattendue (liste attendue)",
              file=sys.stderr)
        return []

    files = [
        entry for entry in entries
        if isinstance(entry, dict)
        and entry.get("type") == "file"
        and str(entry.get("name") or "").endswith((".yaml", ".yml"))
        # download_url may be null in the API payload; such entries cannot be
        # fetched, so they are dropped here rather than failing later.
        and entry.get("download_url")
    ]
    files.sort(key=lambda e: e["name"], reverse=True)
    return [f["download_url"] for f in files]
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Extraction des patterns UA depuis les expressions CEL-like
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_ua_from_all(conditions: list) -> str | None:
|
||||
"""Extrait une regex UA depuis une expression 'all' (ex: yandexbot userAgent.matches)."""
|
||||
for cond in conditions:
|
||||
if not isinstance(cond, str):
|
||||
continue
|
||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', cond)
|
||||
if m:
|
||||
return m.group(1).replace("\\\\", "\\")
|
||||
return None
|
||||
|
||||
|
||||
def _extract_ua_from_any(conditions: list) -> str | None:
|
||||
"""
|
||||
Extrait une regex UA depuis une expression 'any' avec userAgent.contains(...)
|
||||
Exemple : aggressive-brazilian-scrapers.yaml
|
||||
Retourne une regex en OR : MSIE|Trident|...
|
||||
"""
|
||||
patterns = []
|
||||
for cond in conditions:
|
||||
if not isinstance(cond, str):
|
||||
continue
|
||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', cond)
|
||||
if m:
|
||||
patterns.append(re.escape(m.group(1)))
|
||||
if patterns:
|
||||
return "|".join(patterns)
|
||||
return None
|
||||
|
||||
|
||||
def extract_ua_regex(rule: dict) -> str | None:
|
||||
"""Extrait la regex User-Agent depuis toutes les formes possibles."""
|
||||
# Forme directe
|
||||
if ua := rule.get("user_agent_regex"):
|
||||
return ua.strip()
|
||||
|
||||
expr = rule.get("expression")
|
||||
if not expr:
|
||||
return None
|
||||
|
||||
# Expression scalaire (CEL string)
|
||||
if isinstance(expr, str):
|
||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', expr)
|
||||
if m:
|
||||
return m.group(1).replace("\\\\", "\\")
|
||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', expr)
|
||||
if m:
|
||||
return re.escape(m.group(1))
|
||||
return None
|
||||
|
||||
# Expression structurée dict
|
||||
if isinstance(expr, dict):
|
||||
if ua := _extract_ua_from_all(expr.get("all", [])):
|
||||
return ua
|
||||
if ua := _extract_ua_from_any(expr.get("any", [])):
|
||||
return ua
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Parse des fichiers YAML
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_file(
    url: str,
    category: str,
    ua_name_to_id: dict,
    ua_id_counter_ref: list,    # [int] — mutable counter shared across calls
    rule_id_counter_ref: list,  # [int] — same
) -> tuple[list[dict], list[dict]]:
    """Parse one Anubis YAML rule file.

    Args:
        url: Download URL of the YAML file.
        category: Category label copied onto every rule of the file.
        ua_name_to_id: Rule name → UA id map, updated in place. It is only
            read to resolve parents listed in ``UA_PARENT_OVERRIDE``; while
            that mapping is empty, ``parent_id`` is always 0.
        ua_id_counter_ref: One-element list holding the next UA rule id;
            incremented in place for each UA rule produced.
        rule_id_counter_ref: One-element list holding the next rule id;
            incremented in place for each named rule, UA or not.

    Returns:
        ``(ua_rules, ip_rules)``. Both are empty when the file cannot be
        fetched or is not a YAML list. Entries that are not mappings, are
        ``import`` references, or lack a name or an action are skipped.
    """
    data = fetch_yaml_url(url)
    if not data or not isinstance(data, list):
        return [], []

    ua_rules, ip_rules = [], []

    for rule in data:
        if not isinstance(rule, dict):
            continue
        # Skip imports (references to other files)
        if "import" in rule:
            continue

        # `name:` / `action:` with no value parse as None; coerce before
        # .strip() so such a rule is skipped instead of raising.
        name = str(rule.get("name") or "").strip()
        action = str(rule.get("action") or "").strip()
        if not name or not action:
            continue

        # remote_addresses may be absent, null, or a single scalar string.
        # A bare string must not be iterated character by character.
        raw_addrs = rule.get("remote_addresses") or []
        if isinstance(raw_addrs, str):
            raw_addrs = [raw_addrs]
        elif not isinstance(raw_addrs, (list, tuple)):
            raw_addrs = []
        remote_addrs = [
            cidr for cidr in (str(c).strip() for c in raw_addrs if c) if cidr
        ]
        has_ip = bool(remote_addrs)

        rule_id = rule_id_counter_ref[0]
        rule_id_counter_ref[0] += 1

        # ── User-Agent regex ─────────────────────────────────────────────────
        ua_regex = extract_ua_regex(rule)
        if ua_regex:
            parent_name = UA_PARENT_OVERRIDE.get(name)
            parent_id = ua_name_to_id.get(parent_name, 0) if parent_name else 0

            uid = ua_id_counter_ref[0]
            ua_id_counter_ref[0] += 1
            ua_name_to_id[name] = uid

            # has_ip and rule_id are stored as strings: insert_ua_rules puts
            # them in the Array(String) `values` column.
            ua_rules.append({
                "id": uid,
                "parent_id": parent_id,
                "regexp": ua_regex,
                "bot_name": name,
                "action": action,
                "has_ip": "1" if has_ip else "0",
                "rule_id": str(rule_id),
                "category": category,
            })

        # ── IP/CIDR ranges ───────────────────────────────────────────────────
        has_ua = bool(ua_regex)
        for cidr in remote_addrs:
            ip_rules.append({
                "prefix": cidr,
                "bot_name": name,
                "action": action,
                "rule_id": rule_id,
                "has_ua": 1 if has_ua else 0,
                "category": category,
            })

    return ua_rules, ip_rules
|
||||
|
||||
|
||||
def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
    """Parse botPolicies.yaml for inline ``asns.match`` and ``geoip.countries`` rules.

    Args:
        url: Download URL of botPolicies.yaml.

    Returns:
        ``(asn_rules, country_rules)``; every rule carries the category
        ``"policies"``. Both lists are empty when the file cannot be fetched
        or is not a YAML mapping. Entries that are not mappings, are
        ``import`` references, or lack a name or an action are skipped, as are
        individual ASN / country values that cannot be interpreted.
    """
    data = fetch_yaml_url(url)
    if not data or not isinstance(data, dict):
        return [], []

    asn_rules: list[dict] = []
    country_rules: list[dict] = []

    # `bots:` with no value parses as None — treat it like an empty list.
    for rule in data.get("bots") or []:
        if not isinstance(rule, dict):
            continue
        if "import" in rule:
            continue

        name = str(rule.get("name") or "").strip()
        action = str(rule.get("action") or "").strip()
        if not name or not action:
            continue

        # ASN rules
        asns = rule.get("asns")
        if isinstance(asns, dict):
            for asn in asns.get("match") or []:
                try:
                    # Accept both 13335 and "AS13335".
                    number = int(str(asn).strip().upper().removeprefix("AS"))
                except ValueError:
                    print(f"[WARN] {name}: ASN invalide ignoré: {asn!r}",
                          file=sys.stderr)
                    continue
                asn_rules.append({
                    "asn": number,
                    "bot_name": name,
                    "action": action,
                    "category": "policies",
                })

        # Country rules
        geoip = rule.get("geoip")
        if isinstance(geoip, dict):
            for cc in geoip.get("countries") or []:
                if isinstance(cc, bool):
                    # YAML 1.1 reads an unquoted NO (Norway) as boolean False;
                    # restore it. True has no country-code counterpart.
                    if cc:
                        continue
                    cc = "NO"
                code = str(cc).strip().upper()
                if not code:
                    continue
                country_rules.append({
                    "country_code": code,
                    "bot_name": name,
                    "action": action,
                    "category": "policies",
                })

    return asn_rules, country_rules
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Collecte de toutes les règles
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def collect_all_rules() -> tuple[list, list, list, list]:
    """Gather every rule from the upstream repository.

    Walks ``DIRECTORIES`` in order, parsing each YAML file with shared id
    counters (both start at 1), then reads the inline ASN and country rules
    from botPolicies.yaml.

    Returns:
        ``(ua_rules, ip_rules, asn_rules, country_rules)``.
    """
    name_index: dict[str, int] = {}
    next_ua_id: list[int] = [1]
    next_rule_id: list[int] = [1]

    ua_acc: list[dict] = []
    ip_acc: list[dict] = []

    for api_path, category in DIRECTORIES:
        print(f"[INFO] Parcours de {api_path} ({category})…")
        urls = list_yaml_files(api_path)
        print(f" {len(urls)} fichiers trouvés")
        for file_url in urls:
            file_ua, file_ip = parse_file(
                file_url, category, name_index, next_ua_id, next_rule_id
            )
            ua_acc += file_ua
            ip_acc += file_ip

    # ASN + country rules from botPolicies.yaml
    print("[INFO] Lecture de botPolicies.yaml…")
    asn_rules, country_rules = parse_bot_policies_inline(
        f"{GITHUB_RAW}/{BOT_POLICIES_PATH}"
    )

    return ua_acc, ip_acc, asn_rules, country_rules
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# ClickHouse
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_ch_client():
    """Build a ClickHouse client from the ``CLICKHOUSE_*`` environment variables.

    Each setting falls back to a built-in default when its variable is unset:
    host ``clickhouse``, database ``mabase_prod``, user ``admin``, empty password.
    """
    env = os.environ
    connection = {
        "host": env.get("CLICKHOUSE_HOST", "clickhouse"),
        "database": env.get("CLICKHOUSE_DB", "mabase_prod"),
        "username": env.get("CLICKHOUSE_USER", "admin"),
        "password": env.get("CLICKHOUSE_PASSWORD", ""),
    }
    return clickhouse_connect.get_client(**connection)
|
||||
|
||||
|
||||
def insert_ua_rules(client, rules: list[dict]) -> None:
    """Replace the content of ``mabase_prod.anubis_ua_rules`` with ``rules``.

    Each row is written as ``id, parent_id, regexp, keys[], values[]``, where
    ``keys`` is the fixed attribute-name list and ``values`` holds the rule's
    value for each of those attributes, in the same order.

    When ``rules`` is empty only an info line is printed; the table is not
    truncated.
    """
    if not rules:
        print("[INFO] Aucune règle UA.")
        return

    attribute_keys = ("bot_name", "action", "has_ip", "rule_id", "category")

    client.command("TRUNCATE TABLE mabase_prod.anubis_ua_rules")

    rows = []
    for rule in rules:
        rows.append([
            rule["id"],
            rule["parent_id"],
            rule["regexp"],
            list(attribute_keys),
            [rule[key] for key in attribute_keys],
        ])

    client.insert(
        "mabase_prod.anubis_ua_rules",
        rows,
        column_names=["id", "parent_id", "regexp", "keys", "values"],
    )
    print(f"[OK] {len(rules)} règles UA insérées.")
|
||||
|
||||
|
||||
def insert_ip_rules(client, rules: list[dict]) -> None:
    """Replace the content of ``mabase_prod.anubis_ip_rules`` with ``rules``.

    Every rule dict must provide ``prefix``, ``bot_name``, ``action``,
    ``rule_id``, ``has_ua`` and ``category``. When ``rules`` is empty only an
    info line is printed; the table is not truncated.
    """
    if not rules:
        print("[INFO] Aucune règle IP.")
        return

    columns = ("prefix", "bot_name", "action", "rule_id", "has_ua", "category")

    client.command("TRUNCATE TABLE mabase_prod.anubis_ip_rules")

    rows = []
    for rule in rules:
        rows.append([rule[column] for column in columns])

    client.insert("mabase_prod.anubis_ip_rules", rows, column_names=list(columns))
    print(f"[OK] {len(rules)} règles IP insérées.")
|
||||
|
||||
|
||||
def insert_asn_rules(client, rules: list[dict]) -> None:
    """Replace the content of ``mabase_prod.anubis_asn_rules`` with ``rules``.

    Every rule dict must provide ``asn``, ``bot_name``, ``action`` and
    ``category``. When ``rules`` is empty only an info line is printed; the
    table is not truncated.
    """
    if not rules:
        print("[INFO] Aucune règle ASN.")
        return

    columns = ("asn", "bot_name", "action", "category")

    client.command("TRUNCATE TABLE mabase_prod.anubis_asn_rules")

    rows = []
    for rule in rules:
        rows.append([rule[column] for column in columns])

    client.insert("mabase_prod.anubis_asn_rules", rows, column_names=list(columns))
    print(f"[OK] {len(rules)} règles ASN insérées.")
|
||||
|
||||
|
||||
def insert_country_rules(client, rules: list[dict]) -> None:
    """Replace the content of ``mabase_prod.anubis_country_rules`` with ``rules``.

    Every rule dict must provide ``country_code``, ``bot_name``, ``action`` and
    ``category``. When ``rules`` is empty only an info line is printed; the
    table is not truncated.
    """
    if not rules:
        print("[INFO] Aucune règle pays.")
        return

    columns = ("country_code", "bot_name", "action", "category")

    client.command("TRUNCATE TABLE mabase_prod.anubis_country_rules")

    rows = []
    for rule in rules:
        rows.append([rule[column] for column in columns])

    client.insert("mabase_prod.anubis_country_rules", rows, column_names=list(columns))
    print(f"[OK] {len(rules)} règles pays insérées.")
|
||||
|
||||
|
||||
def reload_dicts(client) -> None:
    """Ask ClickHouse to reload the four Anubis dictionaries (ua, ip, asn, country).

    Best effort: a failure on one dictionary is reported on stderr as a
    warning and the remaining dictionaries are still reloaded.
    """
    for suffix in ("ua", "ip", "asn", "country"):
        dict_name = f"mabase_prod.dict_anubis_{suffix}"
        try:
            client.command(f"SYSTEM RELOAD DICTIONARY {dict_name}")
            print(f"[OK] {dict_name} rechargé.")
        except Exception as exc:
            print(f"[WARN] Rechargement {dict_name}: {exc}", file=sys.stderr)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Rapport
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def print_summary(ua_rules, ip_rules, asn_rules, country_rules):
    """Print a human-readable digest of the collected rules to stdout.

    * UA rules are grouped by category; at most 5 rules are shown per category.
    * IP rules are grouped by bot name; at most 15 bots are shown, each
      described by its first CIDR entry.
    * ASN and country rules are listed in full, and only when non-empty.
    """

    def bucket(rows, field):
        # Group rows by row[field], keeping first-seen order inside each group.
        grouped: dict[str, list] = {}
        for row in rows:
            key = row[field]
            if key not in grouped:
                grouped[key] = []
            grouped[key].append(row)
        return grouped

    # UA rules
    print("\n── Règles UA ──")
    for cat, members in sorted(bucket(ua_rules, "category").items()):
        print(f" [{cat}] {len(members)} règle(s)")
        for rule in members[:5]:
            ip_tag = " [+IP]" if rule["has_ip"] == "1" else ""
            parent_tag = f" [parent={rule['parent_id']}]" if rule["parent_id"] else ""
            print(f" [{rule['action']:9s}] {rule['bot_name']}{ip_tag}{parent_tag}: {rule['regexp'][:50]}")
        hidden = len(members) - 5
        if hidden > 0:
            print(f" … et {hidden} autres")

    # IP rules
    print(f"\n── Règles IP : {len(ip_rules)} CIDRs ──")
    per_bot = bucket(ip_rules, "bot_name")
    for bot, cidrs in sorted(per_bot.items())[:15]:
        head = cidrs[0]
        print(f" [{head['action']:9s}] {bot}: {len(cidrs)} CIDRs (cat={head['category']}, has_ua={head['has_ua']})")
    extra_bots = len(per_bot) - 15
    if extra_bots > 0:
        print(f" … et {extra_bots} autres bots")

    # ASN rules
    if asn_rules:
        print(f"\n── Règles ASN : {len(asn_rules)} ──")
        for rule in asn_rules:
            print(f" [{rule['action']:9s}] ASN {rule['asn']}: {rule['bot_name']}")

    # Country rules
    if country_rules:
        print(f"\n── Règles pays : {len(country_rules)} ──")
        for rule in country_rules:
            print(f" [{rule['action']:9s}] {rule['country_code']}: {rule['bot_name']}")
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Collect the Anubis rules, print a summary and load them into ClickHouse.

    Exits with status 1 (message on stderr) when no rule at all was collected;
    in that case ClickHouse is never contacted.
    """
    print("[INFO] Collecte des règles Anubis depuis GitHub…")
    ua_rules, ip_rules, asn_rules, country_rules = collect_all_rules()

    n_ua, n_ip, n_asn, n_cc = (
        len(group) for group in (ua_rules, ip_rules, asn_rules, country_rules)
    )
    total = n_ua + n_ip + n_asn + n_cc
    print(f"\n[INFO] {n_ua} règles UA, {n_ip} CIDRs IP, {n_asn} ASN, {n_cc} pays (total={total})")

    if not total:
        print("[ERREUR] Aucune règle récupérée.", file=sys.stderr)
        sys.exit(1)

    print_summary(ua_rules, ip_rules, asn_rules, country_rules)

    print("\n[INFO] Connexion à ClickHouse…")
    client = get_ch_client()

    # Fill the four rule tables first, then reload the dictionaries.
    loaders = (
        (insert_ua_rules, ua_rules),
        (insert_ip_rules, ip_rules),
        (insert_asn_rules, asn_rules),
        (insert_country_rules, country_rules),
    )
    for loader, rule_set in loaders:
        loader(client, rule_set)
    reload_dicts(client)

    print("\n[OK] Règles Anubis chargées avec succès.")


if __name__ == "__main__":
    main()
|
||||
126
services/bot-detector/anubis/mv_http_logs.sql
Normal file
126
services/bot-detector/anubis/mv_http_logs.sql
Normal file
@ -0,0 +1,126 @@
|
||||
-- Materialized view: turns each raw_json row of mabase_prod.http_logs_raw into
-- a typed row of mabase_prod.http_logs, enriched with
--   * ASN / country / organisation looked up in dict_iplocate_asn, and
--   * an Anubis label (bot name, action, category) from the dict_anubis_* dictionaries.
--
-- JSONExtractString returns '' (never NULL) for a missing key, so a coalesce()
-- default around it is never used. The conversions that cannot parse ''
-- (IP addresses and the timestamp) therefore use the *OrDefault / *OrZero
-- variants: a missing or malformed value becomes 0.0.0.0 / the epoch instead
-- of raising and failing the insert into the raw table.
CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs
TO mabase_prod.http_logs
AS
WITH
    coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS _ua,
    -- The dictionaries below are keyed with the IPv6 form of the source address.
    toIPv6(toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip'))) AS _ip,
    toUInt32(dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', _ip, '') AS _cc
SELECT
    -- Unparseable or missing timestamps fall back to 1970-01-01 00:00:00.
    parseDateTimeBestEffortOrZero(JSONExtractString(raw_json, 'time')) AS time,
    toDate(time) AS log_date,
    toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
    _asn AS src_asn,
    _cc AS src_country_code,
    toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', _ip, '') AS src_as_name,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', _ip, '') AS src_org,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', _ip, '') AS src_domain,
    coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
    coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
    coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
    coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
    coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
    coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
    coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
    toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
    coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
    coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
    coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,
    toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0)) AS ip_meta_id,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0)) AS ip_meta_total_length,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0)) AS ip_meta_ttl,
    coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
    toUInt32(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0)) AS tcp_meta_window_size,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_mss'), 0)) AS tcp_meta_mss,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_scale'), 0)) AS tcp_meta_window_scale,
    toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,
    coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
    coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
    coalesce(JSONExtractString(raw_json, 'tls_alpn'), '') AS tls_alpn,
    coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
    coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
    coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,
    coalesce(JSONExtractString(raw_json, 'client_headers'), '') AS client_headers,
    coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
    coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
    coalesce(JSONExtractString(raw_json, 'header_Content-Type'), '') AS header_content_type,
    coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,

    -- Anubis enrichment: combined UA + IP matching.
    -- Priority: (1) UA+IP with the same rule_id > (2) UA only > (3) IP only > (4) ASN > (5) country.
    -- The three CASE expressions below share the same conditions and differ
    -- only in the attribute they return (bot_name / action / category).
    CASE
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua))
                = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
            THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua)
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua)
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '')
        ELSE ''
    END AS anubis_bot_name,

    CASE
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua))
                = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
            THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua)
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua)
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', _ip, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', _asn, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', _cc, '')
        ELSE ''
    END AS anubis_bot_action,

    CASE
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua))
                = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0))
            THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua)
        WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0'
            AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != ''
            THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua)
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0
            AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', _ip, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', _asn, '')
        WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != ''
            THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', _cc, '')
        ELSE ''
    END AS anubis_bot_category

FROM mabase_prod.http_logs_raw
|
||||
183
services/bot-detector/anubis/view_ai_features_anubis.sql
Normal file
183
services/bot-detector/anubis/view_ai_features_anubis.sql
Normal file
@ -0,0 +1,183 @@
|
||||
-- Per (window_start, src_ip, ja4, host, src_asn) feature rows for the ML
-- pipeline, computed over the last 24 hours of the hourly aggregates
-- agg_host_ip_ja4_1h (a) and agg_header_fingerprint_1h (h).
CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS
WITH base_data AS (
    SELECT
        a.window_start,
        a.src_ip,
        a.ja4,
        a.host,
        toString(a.src_asn) AS asn_number,
        a.src_as_name AS asn_org,
        a.src_org AS asn_detail,
        a.src_domain AS asn_domain,
        a.src_country_code AS country_code,
        dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,

        -- Known bot via IP, then JA4 (pre-existing dictionaries).
        COALESCE(
            nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
            nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
            ''
        ) AS bot_name,

        -- Anubis: combined UA+IP (same rule_id) > UA only > IP only > ASN > country.
        -- The three CASE expressions share their conditions and differ only in
        -- the attribute returned (bot_name / action / category).
        CASE
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
                    = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
                THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua)
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua)
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '')
            ELSE ''
        END AS anubis_bot_name,

        CASE
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
                    = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
                THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua)
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua)
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', a.src_ip, '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', toUInt32(a.src_asn), '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', a.src_country_code, '')
            ELSE ''
        END AS anubis_bot_action,

        CASE
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
                    = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
                THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua)
            WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
                AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
                THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua)
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
                AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', a.src_ip, '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', toUInt32(a.src_asn), '')
            WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
                THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', a.src_country_code, '')
            ELSE ''
        END AS anubis_bot_category,

        a.hits AS hits,
        sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
        a.correlated AS correlated,
        a.tcp_jitter_variance AS tcp_jitter_variance,
        a.true_window_size AS true_window_size,
        a.window_mss_ratio AS window_mss_ratio,
        a.max_keepalives AS max_keepalives,
        h.header_order_hash AS header_order_hash,
        h.header_count AS header_count,
        h.has_accept_language AS has_accept_language,
        h.has_cookie AS has_cookie,
        h.has_referer AS has_referer,
        h.modern_browser_score AS modern_browser_score,
        h.ua_ch_mismatch AS ua_ch_mismatch,

        -- Ratios use "+ 1" in the denominator, which also avoids division by zero.
        a.count_post / (a.hits + 1) AS post_ratio,
        a.uniq_query_params / (a.uniq_paths + 1) AS fuzzing_index,
        a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS hit_velocity,
        a.unique_src_ports / (a.hits + 1) AS port_exhaustion_ratio,
        a.orphan_count / (a.hits + 1) AS orphan_ratio,
        a.ip_id_zero_count / (a.hits + 1) AS ip_id_zero_ratio,
        a.hits / (a.unique_conn_id + 1) AS multiplexing_efficiency,
        IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
        a.request_size_variance AS request_size_variance,
        IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
        IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
        IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
        IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,
        count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
        count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
        a.count_assets / (a.hits + 1) AS asset_ratio,
        a.count_no_referer / (a.hits + 1) AS direct_access_ratio,
        IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
        uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
        (a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1) AS src_port_density,
        sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn)
            / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_asn_concentration,
        sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code)
            / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1) AS ja4_country_concentration,
        IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
        count() OVER (PARTITION BY h.header_order_hash, a.first_ua)
            / (count() OVER (PARTITION BY a.first_ua) + 1) AS header_order_confidence,
        uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
        a.uniq_paths / (a.hits + 1) AS path_diversity_ratio,
        a.url_depth_variance AS url_depth_variance,
        a.count_anomalous_payload / (a.hits + 1) AS anomalous_payload_ratio,
        a.uniq_ja3_val AS uniq_ja3_per_row,
        sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
        a.tls12_count / (a.hits + 1) AS tls12_ratio,
        a.count_head / (a.hits + 1) AS head_ratio,
        a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
        a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
        a.count_http10 / (a.hits + 1) AS http10_ratio,
        a.ip_df_variance AS ip_df_variance,

        -- TTL features (OS fingerprint, L4 -> "complete" model).
        a.avg_ttl_val AS avg_ttl,
        sqrt(a.ttl_variance_val) AS ttl_std,
        IF(a.count_correlated_val > 0, a.count_no_wscale_val / a.count_correlated_val, 0) AS no_window_scale_ratio,

        -- HTTP features (available to both models).
        a.count_no_accept_enc_val / (a.hits + 1) AS missing_accept_enc_ratio,
        a.count_http_scheme_val / (a.hits + 1) AS http_scheme_ratio
    FROM (
        -- a: hourly host/IP/JA4 aggregate, partial states merged per group.
        SELECT
            window_start,
            src_ip,
            ja4,
            host,
            src_asn,
            any(src_country_code) AS src_country_code,
            any(src_as_name) AS src_as_name,
            any(src_org) AS src_org,
            any(src_domain) AS src_domain,
            any(first_ua) AS first_ua,
            sum(hits) AS hits,
            uniqMerge(uniq_paths) AS uniq_paths,
            uniqMerge(uniq_query_params) AS uniq_query_params,
            sum(count_post) AS count_post,
            min(first_seen) AS first_seen,
            max(last_seen) AS last_seen,
            any(tcp_fp_raw) AS tcp_fingerprint,
            varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
            varPopMerge(total_ip_length_var) AS request_size_variance,
            any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
            IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
            any(http_ver_raw) AS http_version,
            any(tls_alpn_raw) AS tls_alpn,
            any(tls_sni_raw) AS tls_sni,
            max(correlated_raw) AS correlated,
            uniqMerge(unique_src_ports) AS unique_src_ports,
            uniqMerge(unique_conn_id) AS unique_conn_id,
            max(max_keepalives) AS max_keepalives,
            sum(orphan_count) AS orphan_count,
            sum(ip_id_zero_count) AS ip_id_zero_count,
            sum(mss_1460_count) AS mss_1460_count,
            sum(count_assets) AS count_assets,
            sum(count_no_referer) AS count_no_referer,
            uniqMerge(uniq_ua) AS unique_ua,
            varPopMerge(url_depth_variance) AS url_depth_variance,
            sum(count_anomalous_payload) AS count_anomalous_payload,
            uniqMerge(uniq_ja3) AS uniq_ja3_val,
            avgMerge(avg_syn_ms) AS avg_syn_ms_val,
            sum(tls12_count) AS tls12_count,
            sum(count_head) AS count_head,
            sum(count_no_sec_fetch) AS count_no_sec_fetch,
            sum(count_generic_accept) AS count_generic_accept,
            sum(count_http10) AS count_http10,
            varPopMerge(ip_df_var) AS ip_df_variance,
            -- TTL fingerprint (L4) + HTTP counters.
            avgIfMerge(avg_ttl) AS avg_ttl_val,
            varPopIfMerge(ttl_var) AS ttl_variance_val,
            sum(count_no_wscale) AS count_no_wscale_val,
            sum(count_correlated) AS count_correlated_val,
            sum(count_no_accept_enc) AS count_no_accept_enc_val,
            sum(count_http_scheme) AS count_http_scheme_val
        FROM mabase_prod.agg_host_ip_ja4_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY
            window_start,
            src_ip,
            ja4,
            host,
            src_asn
    ) AS a
    LEFT JOIN (
        -- h: hourly header fingerprint per (window_start, src_ip).
        SELECT
            window_start,
            src_ip,
            any(header_order_hash) AS header_order_hash,
            max(header_count) AS header_count,
            max(has_accept_language) AS has_accept_language,
            max(has_cookie) AS has_cookie,
            max(has_referer) AS has_referer,
            max(modern_browser_score) AS modern_browser_score,
            max(ua_ch_mismatch) AS ua_ch_mismatch,
            any(sec_fetch_mode) AS sec_fetch_mode,
            any(sec_fetch_dest) AS sec_fetch_dest
        FROM mabase_prod.agg_header_fingerprint_1h
        WHERE window_start >= now() - INTERVAL 24 HOUR
        GROUP BY
            window_start,
            src_ip
    ) AS h
        ON a.src_ip = h.src_ip
        AND a.window_start = h.window_start
)
SELECT
    *,
    -- Entropy-style sum over each IP's rows of its share of the IP's total hits.
    -(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001))
        OVER (PARTITION BY src_ip)) AS temporal_entropy,
    sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
FROM base_data;
|
||||
Reference in New Issue
Block a user