Files
ja4-platform/shared/clickhouse/03_anubis_tables.sql
toto d469e39da7 feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized
Services:
- ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap)
- logcorrelator: JA4 log correlation engine (Go, ClickHouse)
- mod_reqin_log: Apache module (C, JSON request logging)
- bot_detector: ML bot detection pipeline (Python)
- dashboard: FastAPI/Streamlit analytics UI (Python)

Shared libraries:
- shared/go/ja4common: logger, config, shutdown, ipfilter (Go module)
- shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package)
- shared/clickhouse/: canonical SQL migrations (10 files)

Build & packaging:
- Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10)
- go.work workspace linking sentinel, correlator, ja4common
- Makefile with test-all, build-all, rpm-* targets

Fixes applied:
- go.work: 1.21 → 1.24.6 (required by sentinel)
- correlator Dockerfiles: golang:1.21 → golang:1.24
- replace directives in go.mod for ja4common local path
- pyproject.toml: setuptools.backends → setuptools.build_meta
- Removed static libpcap linking (unavailable on Rocky 9)
- Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32)
- Rewrote corrupted test files (logger_test.go × 2)

Test coverage:
- correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%)
- sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse)

Documentation:
- README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-07 16:42:59 +02:00

140 lines
5.1 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- =============================================================================
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
-- Items 1-8 from bot_detector/anubis/deploy_schema.sql
-- =============================================================================
-- -----------------------------------------------------------------------------
-- 1. SOURCE TABLE — User-Agent rules
--    Backing table of the mabase_prod.dict_anubis_ua REGEXP_TREE dictionary.
--    One row per regexp node: `parent_id` refers to another row's `id`, and
--    `keys` / `values` are parallel arrays (attribute name -> attribute value),
--    which is the table shape the regexp_tree layout loads from.
--    ReplacingMergeTree collapses rows sharing the same `id` at merge time.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ua_rules
(
    id        UInt64,
    parent_id UInt64,
    regexp    String,
    keys      Array(String),
    values    Array(String)
)
ENGINE = ReplacingMergeTree
ORDER BY id;
-- -----------------------------------------------------------------------------
-- 2. SOURCE TABLE — IP/CIDR rules
--    Backing table of the mabase_prod.dict_anubis_ip IP_TRIE dictionary.
--    `prefix` is the CIDR string the dictionary is keyed on; the remaining
--    columns are exposed as dictionary attributes.
--    ReplacingMergeTree collapses rows sharing the same `prefix` at merge time.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ip_rules
(
    prefix   String,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    rule_id  UInt64,
    has_ua   UInt8,
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree
ORDER BY prefix;
-- -----------------------------------------------------------------------------
-- 3. DICTIONARY — User-Agent lookup (REGEXP_TREE layout)
--    Loaded from mabase_prod.anubis_ua_rules; exposes `bot_name` and `action`.
--    Usage:
--      dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent)
--    The dictionary is dropped and recreated each time this script runs.
--    LIFETIME: contents are refreshed every 300-600 seconds.
--    NOTE: replace 'CHANGE_ME' with the actual ClickHouse admin password
--    before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ua;

CREATE DICTIONARY mabase_prod.dict_anubis_ua
(
    regexp   String,
    bot_name String,
    action   String
)
PRIMARY KEY regexp
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_ua_rules'
))
LAYOUT(REGEXP_TREE)
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 4. DICTIONARY — IP/CIDR lookup (IP_TRIE layout)
--    Loaded from mabase_prod.anubis_ip_rules, keyed on the CIDR `prefix`.
--    Usage:
--      dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
--    The dictionary is dropped and recreated each time this script runs.
--    LIFETIME: contents are refreshed every 300-600 seconds.
--    NOTE: replace 'CHANGE_ME' with the actual ClickHouse admin password
--    before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip;

CREATE DICTIONARY mabase_prod.dict_anubis_ip
(
    prefix   String,
    bot_name String,
    action   String,
    rule_id  UInt64,
    has_ua   UInt8,
    category String
)
PRIMARY KEY prefix
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_ip_rules'
))
LAYOUT(IP_TRIE())
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 5. SOURCE TABLE — ASN rules
--    Backing table of the mabase_prod.dict_anubis_asn dictionary, keyed on
--    the autonomous system number.
--    ReplacingMergeTree collapses rows sharing the same `asn` at merge time.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_asn_rules
(
    asn      UInt32,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree
ORDER BY asn;
-- -----------------------------------------------------------------------------
-- 6. SOURCE TABLE — Country rules
--    Backing table of the mabase_prod.dict_anubis_country dictionary, keyed
--    on the country code string.
--    ReplacingMergeTree collapses rows sharing the same `country_code` at
--    merge time.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_country_rules
(
    country_code LowCardinality(String),
    bot_name     LowCardinality(String),
    action       LowCardinality(String),
    category     LowCardinality(String)
)
ENGINE = ReplacingMergeTree
ORDER BY country_code;
-- -----------------------------------------------------------------------------
-- 7. DICTIONARY — ASN lookup (HASHED layout)
--    Loaded from mabase_prod.anubis_asn_rules, keyed on `asn`.
--    Usage (unchanged):
--      dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', src_asn, '')
--    Why HASHED and not FLAT: FLAT stores values in an array indexed by key
--    and refuses to load if any key is >= max_array_size (500,000 by
--    default). ASNs are 32-bit, so a single rule for a high ASN (e.g. the
--    private-use range starting at 4200000000) would make the whole
--    dictionary fail to load. HASHED has no such limit on key values.
--    The dictionary is dropped and recreated each time this script runs.
--    LIFETIME: contents are refreshed every 300-600 seconds.
--    NOTE: replace 'CHANGE_ME' with the actual ClickHouse admin password
--    before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn;

CREATE DICTIONARY mabase_prod.dict_anubis_asn
(
    asn      UInt32,
    bot_name String,
    action   String,
    category String
)
PRIMARY KEY asn
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_asn_rules'
))
LAYOUT(HASHED())
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 8. DICTIONARY — Country lookup (COMPLEX_KEY_HASHED layout)
--    Loaded from mabase_prod.anubis_country_rules, keyed on `country_code`.
--    Why COMPLEX_KEY_HASHED and not FLAT: the key is a String, while FLAT
--    (like every simple-key layout) only supports a numeric UInt64 key, so
--    a FLAT dictionary over country codes cannot be loaded or queried.
--    Usage (complex-key dictionaries take the key as a tuple):
--      dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', tuple(src_country_code), '')
--    The dictionary is dropped and recreated each time this script runs.
--    LIFETIME: contents are refreshed every 300-600 seconds.
--    NOTE: replace 'CHANGE_ME' with the actual ClickHouse admin password
--    before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country;

CREATE DICTIONARY mabase_prod.dict_anubis_country
(
    country_code String,
    bot_name     String,
    action       String,
    category     String
)
PRIMARY KEY country_code
SOURCE(CLICKHOUSE(
    HOST 'localhost'
    PORT 9000
    USER 'admin'
    PASSWORD 'CHANGE_ME'
    DB 'mabase_prod'
    TABLE 'anubis_country_rules'
))
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 600);