feat: ja4-platform monorepo — 5 services unified, tests & RPM builds standardized

Services:
- ja4sentinel: TLS/JA4 fingerprint capture daemon (Go, libpcap)
- logcorrelator: JA4 log correlation engine (Go, ClickHouse)
- mod_reqin_log: Apache module (C, JSON request logging)
- bot_detector: ML bot detection pipeline (Python)
- dashboard: FastAPI/Streamlit analytics UI (Python)

Shared libraries:
- shared/go/ja4common: logger, config, shutdown, ipfilter (Go module)
- shared/python/ja4_common: ClickHouseClient, ClickHouseSettings (Python package)
- shared/clickhouse/: canonical SQL migrations (10 files)

Build & packaging:
- Unified 3-stage Dockerfile.package for Go RPMs (el8/el9/el10)
- go.work workspace linking sentinel, correlator, ja4common
- Makefile with test-all, build-all, rpm-* targets

Fixes applied:
- go.work: 1.21 → 1.24.6 (required by sentinel)
- correlator Dockerfiles: golang:1.21 → golang:1.24
- replace directives in go.mod for ja4common local path
- pyproject.toml: setuptools.backends → setuptools.build_meta
- Removed static libpcap linking (unavailable on Rocky 9)
- Fixed data races in output/writers_test.go (sync.Mutex + atomic.Int32)
- Rewrote corrupted test files (logger_test.go × 2)

Test coverage:
- correlator: 67.1% total (unixsocket 80.5%, config 91.7%, app 83.3%, multi 87.7%, stdout 100%)
- sentinel: all 10 packages pass (api, capture, config, fingerprint, ipfilter, logging, output, tlsparse)

Documentation:
- README.md + docs/ (architecture, development, 5 services, shared libs, DB schema & migrations)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-07 16:42:59 +02:00
commit d469e39da7
278 changed files with 1621301 additions and 0 deletions

View File

@ -0,0 +1,4 @@
-- =============================================================================
-- 00_database.sql — Create the production database
-- =============================================================================
-- Idempotent: does nothing when the mabase_prod database already exists.
CREATE DATABASE IF NOT EXISTS
    mabase_prod;

View File

@ -0,0 +1,16 @@
-- =============================================================================
-- 01_raw_tables.sql — Raw ingest table (direct target for logcorrelator inserts)
-- =============================================================================
-- Landing table: one unparsed JSON document per row, stamped at insert time.
-- Partitioned by ingest day; rows expire one day after ingest_time, and the
-- table is configured with ttl_only_drop_parts = 1.
CREATE TABLE IF NOT EXISTS mabase_prod.http_logs_raw
(
    `raw_json`    String   CODEC(ZSTD(3)),
    `ingest_time` DateTime DEFAULT now()
)
ENGINE = MergeTree
PARTITION BY toDate(ingest_time)
ORDER BY ingest_time
TTL ingest_time + INTERVAL 1 DAY
SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1;

View File

@ -0,0 +1,59 @@
-- =============================================================================
-- 02_dictionaries.sql — ASN geo dictionary + bot reference tables
-- =============================================================================
-- -----------------------------------------------------------------------------
-- dict_iplocate_asn: IPLocate network -> ASN / country / organisation lookup.
-- Keyed by `network` with an IP_TRIE layout; reloaded every 3600-7200 seconds.
-- Source CSV (with a header row) must exist at:
--   /var/lib/clickhouse/user_files/iplocate-ip-to-asn.csv
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn;

CREATE DICTIONARY IF NOT EXISTS mabase_prod.dict_iplocate_asn
(
    network      String,
    asn          UInt32,
    country_code String,
    name         String,
    org          String,
    domain       String
)
PRIMARY KEY network
SOURCE(
    FILE(
        path '/var/lib/clickhouse/user_files/iplocate-ip-to-asn.csv'
        format 'CSVWithNames'
    )
)
LAYOUT(IP_TRIE())
LIFETIME(MIN 3600 MAX 7200);
-- -----------------------------------------------------------------------------
-- Bot network CIDR reference table (ReplacingMergeTree for upserts)
-- Rows sharing (network, bot_name) are collapsed on merge, keeping the row
-- with the greatest last_update.
-- `network` holds the CIDR in text form: ClickHouse has IPv4 and IPv6 column
-- types but no CIDR type, so the previous `IPv6CIDR` declaration could not be
-- created. String matches the other CIDR columns in these migrations
-- (dict_iplocate_asn.network, anubis_ip_rules.prefix).
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.ref_bot_networks
(
    network       String,                  -- CIDR text; presumably like '2001:db8::/32' — confirm with the loader
    bot_name      LowCardinality(String),
    is_legitimate UInt8,                   -- presumably a 0/1 flag — confirm with the loader
    last_update   DateTime                 -- version column for ReplacingMergeTree
)
ENGINE = ReplacingMergeTree(last_update)
ORDER BY (network, bot_name);
-- -----------------------------------------------------------------------------
-- bot_ip: flat list of bot IPs, read straight from a CSV file (File engine).
-- Expected file location: /var/lib/clickhouse/user_files/bot_ip.csv
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.bot_ip
(
    ip String
)
ENGINE = File(CSV, 'bot_ip.csv');
-- -----------------------------------------------------------------------------
-- bot_ja4: flat list of bot JA4 fingerprints, read straight from a CSV file
-- (File engine).
-- Expected file location: /var/lib/clickhouse/user_files/bot_ja4.csv
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.bot_ja4
(
    ja4 String
)
ENGINE = File(CSV, 'bot_ja4.csv');

View File

@ -0,0 +1,139 @@
-- =============================================================================
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
-- Items 1-8 from bot_detector/anubis/deploy_schema.sql
-- =============================================================================
-- -----------------------------------------------------------------------------
-- 1. SOURCE TABLE: User-Agent rules backing the REGEXP_TREE dictionary
--    (read by dict_anubis_ua). Rows with the same id are collapsed on merge.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ua_rules
(
    id        UInt64,
    parent_id UInt64,
    regexp    String,
    keys      Array(String),
    values    Array(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY id;
-- -----------------------------------------------------------------------------
-- 2. SOURCE TABLE: IP/CIDR rules backing the IP_TRIE dictionary
--    (read by dict_anubis_ip). Rows with the same prefix are collapsed on merge.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ip_rules
(
    prefix   String,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    rule_id  UInt64,
    has_ua   UInt8,
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY prefix;
-- -----------------------------------------------------------------------------
-- 3. DICTIONARY — UA REGEXP_TREE
-- dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent)
-- Backed by mabase_prod.anubis_ua_rules; reloaded every 300-600 seconds.
-- mv_http_logs reads bot_name, action, category, has_ip and rule_id from this
-- dictionary, so all five must be declared here: a dictGet on an undeclared
-- attribute fails. has_ip and rule_id are String because the view compares
-- has_ip with '1' / '0' and parses rule_id with toUInt64OrZero().
-- Attribute values presumably come from the keys/values arrays of
-- anubis_ua_rules — confirm the rule loader emits all five keys.
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ua;
CREATE DICTIONARY mabase_prod.dict_anubis_ua
(
    regexp   String,
    bot_name String,
    action   String,
    category String,
    has_ip   String,
    rule_id  String
)
PRIMARY KEY regexp
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_ua_rules'))
LAYOUT(REGEXP_TREE)
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 4. DICTIONARY: IP rules, IP_TRIE layout keyed by CIDR prefix.
--    Backed by mabase_prod.anubis_ip_rules; reloaded every 300-600 seconds.
--    Example:
--      dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '')
--    NOTE: replace 'CHANGE_ME' with the real ClickHouse admin password before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip;

CREATE DICTIONARY mabase_prod.dict_anubis_ip
(
    prefix   String,
    bot_name String,
    action   String,
    rule_id  UInt64,
    has_ua   UInt8,
    category String
)
PRIMARY KEY prefix
SOURCE(
    CLICKHOUSE(
        HOST 'localhost'
        PORT 9000
        USER 'admin'
        PASSWORD 'CHANGE_ME'
        DB 'mabase_prod'
        TABLE 'anubis_ip_rules'
    )
)
LAYOUT(IP_TRIE())
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 5. SOURCE TABLE: ASN rules backing the Flat dictionary (read by
--    dict_anubis_asn). Rows with the same asn are collapsed on merge.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_asn_rules
(
    asn      UInt32,
    bot_name LowCardinality(String),
    action   LowCardinality(String),
    category LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY asn;
-- -----------------------------------------------------------------------------
-- 6. SOURCE TABLE: country rules (read by dict_anubis_country).
--    Rows with the same country_code are collapsed on merge.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.anubis_country_rules
(
    country_code LowCardinality(String),
    bot_name     LowCardinality(String),
    action       LowCardinality(String),
    category     LowCardinality(String)
)
ENGINE = ReplacingMergeTree()
ORDER BY country_code;
-- -----------------------------------------------------------------------------
-- 7. DICTIONARY: ASN rules, FLAT layout keyed by asn.
--    Backed by mabase_prod.anubis_asn_rules; reloaded every 300-600 seconds.
--    Example:
--      dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', src_asn, '')
--    NOTE: replace 'CHANGE_ME' with the real ClickHouse admin password before use.
--    NOTE(review): FLAT layouts cap the largest key value (max_array_size);
--    confirm every ASN in anubis_asn_rules stays below that cap.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn;

CREATE DICTIONARY mabase_prod.dict_anubis_asn
(
    asn      UInt32,
    bot_name String,
    action   String,
    category String
)
PRIMARY KEY asn
SOURCE(
    CLICKHOUSE(
        HOST 'localhost'
        PORT 9000
        USER 'admin'
        PASSWORD 'CHANGE_ME'
        DB 'mabase_prod'
        TABLE 'anubis_asn_rules'
    )
)
LAYOUT(FLAT())
LIFETIME(MIN 300 MAX 600);
-- -----------------------------------------------------------------------------
-- 8. DICTIONARY — Country (complex-key hashed)
-- dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', src_country_code, '')
-- Backed by mabase_prod.anubis_country_rules; reloaded every 300-600 seconds.
-- The key is a String country code, so the layout must be a complex-key one:
-- FLAT only supports a numeric UInt64 key and cannot serve string lookups.
-- COMPLEX_KEY_HASHED is the layout dict_bot_ja4 already uses for its String key.
-- NOTE(review): passing a single-column complex key without tuple() relies on
-- the server accepting that form — use tuple(src_country_code) if it does not.
-- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country;
CREATE DICTIONARY mabase_prod.dict_anubis_country
(
    country_code String,
    bot_name     String,
    action       String,
    category     String
)
PRIMARY KEY country_code
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_country_rules'))
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 600);

View File

@ -0,0 +1,234 @@
-- =============================================================================
-- 04_mv_http_logs.sql — Canonical http_logs target table + mv_http_logs MV
--
-- Canonical version — supersedes services/correlator/sql/init.sql base version.
-- This version includes full Anubis enrichment (anubis_bot_name, anubis_bot_action,
-- anubis_bot_category) with combined UA+IP priority logic from
-- bot_detector/anubis/mv_http_logs.sql.
-- =============================================================================
-- -----------------------------------------------------------------------------
-- http_logs: typed destination table, filled by the mv_http_logs view.
-- Carries the Anubis enrichment columns (deploy_schema.sql items 9+).
-- One partition per log_date; rows expire 7 days after log_date, and the table
-- is configured with ttl_only_drop_parts = 1.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.http_logs
(
    -- Temporal
    `time`                      DateTime,
    `log_date`                  Date DEFAULT toDate(time),

    -- Network
    `src_ip`                    IPv4,
    `src_port`                  UInt16,
    `dst_ip`                    IPv4,
    `dst_port`                  UInt16,

    -- IPLocate enrichment
    `src_asn`                   UInt32,
    `src_country_code`          LowCardinality(String),
    `src_as_name`               LowCardinality(String),
    `src_org`                   LowCardinality(String),
    `src_domain`                LowCardinality(String),

    -- HTTP
    `method`                    LowCardinality(String),
    `scheme`                    LowCardinality(String),
    `host`                      LowCardinality(String),
    `path`                      String CODEC(ZSTD(3)),
    `query`                     String CODEC(ZSTD(3)),
    `http_version`              LowCardinality(String),

    -- Correlation
    `orphan_side`               LowCardinality(String),
    `correlated`                UInt8,
    `keepalives`                UInt16,
    `a_timestamp`               UInt64,
    `b_timestamp`               UInt64,
    `conn_id`                   String CODEC(ZSTD(3)),

    -- IP metadata
    `ip_meta_df`                UInt8,
    `ip_meta_id`                UInt16,
    `ip_meta_total_length`      UInt16,
    `ip_meta_ttl`               UInt8,

    -- TCP metadata
    `tcp_meta_options`          LowCardinality(String),
    `tcp_meta_window_size`      UInt32,
    `tcp_meta_mss`              UInt16,
    `tcp_meta_window_scale`     UInt8,
    `syn_to_clienthello_ms`     Int32,

    -- TLS / fingerprints
    `tls_version`               LowCardinality(String),
    `tls_sni`                   LowCardinality(String),
    `tls_alpn`                  LowCardinality(String),
    `ja3`                       String CODEC(ZSTD(3)),
    `ja3_hash`                  String CODEC(ZSTD(3)),
    `ja4`                       String CODEC(ZSTD(3)),

    -- HTTP headers
    `client_headers`            String CODEC(ZSTD(3)),
    `header_user_agent`         String CODEC(ZSTD(3)),
    `header_accept`             String CODEC(ZSTD(3)),
    `header_accept_encoding`    String CODEC(ZSTD(3)),
    `header_accept_language`    String CODEC(ZSTD(3)),
    `header_content_type`       String CODEC(ZSTD(3)),
    `header_x_request_id`       String CODEC(ZSTD(3)),
    `header_x_trace_id`         String CODEC(ZSTD(3)),
    `header_x_forwarded_for`    String CODEC(ZSTD(3)),
    `header_sec_ch_ua`          String CODEC(ZSTD(3)),
    `header_sec_ch_ua_mobile`   String CODEC(ZSTD(3)),
    `header_sec_ch_ua_platform` String CODEC(ZSTD(3)),
    `header_sec_fetch_dest`     String CODEC(ZSTD(3)),
    `header_sec_fetch_mode`     String CODEC(ZSTD(3)),
    `header_sec_fetch_site`     String CODEC(ZSTD(3)),

    -- Anubis enrichment (empty string when no rule matched)
    `anubis_bot_name`           LowCardinality(String) DEFAULT '',
    `anubis_bot_action`         LowCardinality(String) DEFAULT '',
    `anubis_bot_category`       LowCardinality(String) DEFAULT ''
)
ENGINE = MergeTree
PARTITION BY log_date
ORDER BY (time, src_ip, dst_ip, ja4)
TTL log_date + INTERVAL 7 DAY
SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1;
-- -----------------------------------------------------------------------------
-- mv_http_logs — canonical materialized view with full Anubis enrichment
-- Parses every raw_json row inserted into http_logs_raw into the typed columns
-- of http_logs.
-- Priority logic: (1) UA+IP combined [same rule_id] > (2) UA only > (3) IP only
--                 > (4) ASN > (5) Country
-- The WITH clause holds expression aliases (not subqueries). Each Anubis lookup
-- and each rule predicate is written once and shared by the three anubis_*
-- columns, so name / action / category always come from the same rule.
-- JSONExtractString yields '' (not NULL) for a missing key, so a coalesce()
-- default never applies; time, src_ip and dst_ip therefore use the
-- ...OrZero / ...OrDefault parsers and fall back to 1970-01-01 00:00:00 and
-- 0.0.0.0 instead of raising and rejecting the whole insert.
-- -----------------------------------------------------------------------------
DROP VIEW IF EXISTS mabase_prod.mv_http_logs;
CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs
TO mabase_prod.http_logs
AS
WITH
    -- Parsed inputs shared by several lookups.
    coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS _ua,
    toIPv6(toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip'))) AS _ip,
    toUInt32(dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', _ip, '') AS _cc,
    -- Anubis lookups; an empty bot_name is treated as "no rule matched".
    dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) AS _ua_bot_name,
    dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) AS _ua_has_ip,
    toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) AS _ua_rule_id,
    dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') AS _ip_bot_name,
    dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) AS _ip_has_ua,
    dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) AS _ip_rule_id,
    dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') AS _asn_bot_name,
    dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') AS _cc_bot_name,
    -- Rule predicates, in priority order.
    -- (1) UA rule that requires an IP, and the IP rule carries the same rule_id.
    (_ua_has_ip = '1' AND _ua_bot_name != '' AND _ip_bot_name != '' AND _ua_rule_id = _ip_rule_id) AS _hit_ua_ip,
    -- (2) UA rule that does not require an IP.
    (_ua_has_ip = '0' AND _ua_bot_name != '') AS _hit_ua,
    -- (3) IP rule that does not require a UA.
    (_ip_has_ua = 0 AND _ip_bot_name != '') AS _hit_ip,
    -- (4) ASN rule, (5) country rule.
    (_asn_bot_name != '') AS _hit_asn,
    (_cc_bot_name != '') AS _hit_cc
SELECT
    parseDateTimeBestEffortOrZero(JSONExtractString(raw_json, 'time')) AS time,
    toDate(time) AS log_date,
    toIPv4OrDefault(JSONExtractString(raw_json, 'src_ip')) AS src_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port,
    _asn AS src_asn,
    _cc AS src_country_code,
    toIPv4OrDefault(JSONExtractString(raw_json, 'dst_ip')) AS dst_ip,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', _ip, '') AS src_as_name,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', _ip, '') AS src_org,
    dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', _ip, '') AS src_domain,
    coalesce(JSONExtractString(raw_json, 'method'), '') AS method,
    coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme,
    coalesce(JSONExtractString(raw_json, 'host'), '') AS host,
    coalesce(JSONExtractString(raw_json, 'path'), '') AS path,
    coalesce(JSONExtractString(raw_json, 'query'), '') AS query,
    coalesce(JSONExtractString(raw_json, 'http_version'), '') AS http_version,
    coalesce(JSONExtractString(raw_json, 'orphan_side'), '') AS orphan_side,
    toUInt8(coalesce(JSONExtractBool(raw_json, 'correlated'), 0)) AS correlated,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'keepalives'), 0)) AS keepalives,
    coalesce(JSONExtractUInt(raw_json, 'a_timestamp'), 0) AS a_timestamp,
    coalesce(JSONExtractUInt(raw_json, 'b_timestamp'), 0) AS b_timestamp,
    coalesce(JSONExtractString(raw_json, 'conn_id'), '') AS conn_id,
    toUInt8(coalesce(JSONExtractBool(raw_json, 'ip_meta_df'), 0)) AS ip_meta_df,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_id'), 0)) AS ip_meta_id,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'ip_meta_total_length'), 0)) AS ip_meta_total_length,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'ip_meta_ttl'), 0)) AS ip_meta_ttl,
    coalesce(JSONExtractString(raw_json, 'tcp_meta_options'), '') AS tcp_meta_options,
    toUInt32(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_size'), 0)) AS tcp_meta_window_size,
    toUInt16(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_mss'), 0)) AS tcp_meta_mss,
    toUInt8(coalesce(JSONExtractUInt(raw_json, 'tcp_meta_window_scale'), 0)) AS tcp_meta_window_scale,
    toInt32(coalesce(JSONExtractInt(raw_json, 'syn_to_clienthello_ms'), 0)) AS syn_to_clienthello_ms,
    coalesce(JSONExtractString(raw_json, 'tls_version'), '') AS tls_version,
    coalesce(JSONExtractString(raw_json, 'tls_sni'), '') AS tls_sni,
    coalesce(JSONExtractString(raw_json, 'tls_alpn'), '') AS tls_alpn,
    coalesce(JSONExtractString(raw_json, 'ja3'), '') AS ja3,
    coalesce(JSONExtractString(raw_json, 'ja3_hash'), '') AS ja3_hash,
    coalesce(JSONExtractString(raw_json, 'ja4'), '') AS ja4,
    coalesce(JSONExtractString(raw_json, 'client_headers'), '') AS client_headers,
    coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS header_user_agent,
    coalesce(JSONExtractString(raw_json, 'header_Accept'), '') AS header_accept,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Encoding'), '') AS header_accept_encoding,
    coalesce(JSONExtractString(raw_json, 'header_Accept-Language'), '') AS header_accept_language,
    coalesce(JSONExtractString(raw_json, 'header_Content-Type'), '') AS header_content_type,
    coalesce(JSONExtractString(raw_json, 'header_X-Request-Id'), '') AS header_x_request_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Trace-Id'), '') AS header_x_trace_id,
    coalesce(JSONExtractString(raw_json, 'header_X-Forwarded-For'), '') AS header_x_forwarded_for,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA'), '') AS header_sec_ch_ua,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Mobile'), '') AS header_sec_ch_ua_mobile,
    coalesce(JSONExtractString(raw_json, 'header_Sec-CH-UA-Platform'), '') AS header_sec_ch_ua_platform,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Dest'), '') AS header_sec_fetch_dest,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode,
    coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site,
    -- Anubis enrichment: the first predicate that holds selects the rule source.
    CASE
        WHEN _hit_ua_ip THEN _ua_bot_name
        WHEN _hit_ua    THEN _ua_bot_name
        WHEN _hit_ip    THEN _ip_bot_name
        WHEN _hit_asn   THEN _asn_bot_name
        WHEN _hit_cc    THEN _cc_bot_name
        ELSE ''
    END AS anubis_bot_name,
    CASE
        WHEN _hit_ua_ip THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua)
        WHEN _hit_ua    THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua)
        WHEN _hit_ip    THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', _ip, '')
        WHEN _hit_asn   THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', _asn, '')
        WHEN _hit_cc    THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', _cc, '')
        ELSE ''
    END AS anubis_bot_action,
    CASE
        WHEN _hit_ua_ip THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua)
        WHEN _hit_ua    THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua)
        WHEN _hit_ip    THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', _ip, '')
        WHEN _hit_asn   THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', _asn, '')
        WHEN _hit_cc    THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', _cc, '')
        ELSE ''
    END AS anubis_bot_category
FROM mabase_prod.http_logs_raw;

View File

@ -0,0 +1,195 @@
-- =============================================================================
-- 05_aggregation_tables.sql — Behavioral aggregation tables + MVs
-- Source: bot_detector/deploy_views.sql sections 2-5
-- =============================================================================
-- -----------------------------------------------------------------------------
-- Bot reputation dictionaries (in-RAM for fast lookup).
-- Their CSV sources must be present under /var/lib/clickhouse/user_files/
--
-- dict_bot_ip: CIDR prefix -> bot_name, IP_TRIE layout, reloaded every 300 s.
-- NOTE(review): the mabase_prod.bot_ip File table reads the same bot_ip.csv as
-- a single `ip` column; confirm the file layout satisfies both definitions.
-- -----------------------------------------------------------------------------
DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ip;

CREATE DICTIONARY mabase_prod.dict_bot_ip
(
    prefix   String,
    bot_name String
)
PRIMARY KEY prefix
SOURCE(
    FILE(
        path '/var/lib/clickhouse/user_files/bot_ip.csv'
        format 'CSV'
    )
)
LAYOUT(IP_TRIE())
LIFETIME(MIN 300 MAX 300);
-- dict_bot_ja4: JA4 fingerprint -> bot_name, complex-key hashed layout (String
-- key), loaded from bot_ja4.csv and reloaded every 300 s.
DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ja4;

CREATE DICTIONARY mabase_prod.dict_bot_ja4
(
    ja4      String,
    bot_name String
)
PRIMARY KEY ja4
SOURCE(
    FILE(
        path '/var/lib/clickhouse/user_files/bot_ja4.csv'
        format 'CSV'
    )
)
LAYOUT(COMPLEX_KEY_HASHED())
LIFETIME(MIN 300 MAX 300);
-- dict_asn_reputation: ASN -> reputation label, hashed layout, loaded from
-- asn_reputation.csv and reloaded every 300 s.
DROP DICTIONARY IF EXISTS mabase_prod.dict_asn_reputation;

CREATE DICTIONARY mabase_prod.dict_asn_reputation
(
    src_asn UInt64,
    label   String
)
PRIMARY KEY src_asn
SOURCE(
    FILE(
        path '/var/lib/clickhouse/user_files/asn_reputation.csv'
        format 'CSV'
    )
)
LAYOUT(HASHED())
LIFETIME(MIN 300 MAX 300);
-- -----------------------------------------------------------------------------
-- agg_host_ip_ja4_1h: hourly behavioural aggregates (L4/L5/L7) per
-- (window_start, src_ip, ja4, host). Fed by mv_agg_host_ip_ja4_1h.
-- AggregateFunction columns hold partial states: read them with the matching
-- -Merge combinator (uniqMerge, varPopMerge, avgMerge, avgIfMerge, ...).
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.agg_host_ip_ja4_1h
(
    -- Grouping columns
    window_start            DateTime,
    src_ip                  IPv6,
    ja4                     String,
    host                    String,
    src_asn                 UInt32,

    -- IPLocate attributes (any value seen in the window)
    src_country_code        SimpleAggregateFunction(any, String),
    src_as_name             SimpleAggregateFunction(any, String),
    src_org                 SimpleAggregateFunction(any, String),
    src_domain              SimpleAggregateFunction(any, String),

    -- Volume and timing
    first_seen              SimpleAggregateFunction(min, DateTime),
    last_seen               SimpleAggregateFunction(max, DateTime),
    hits                    SimpleAggregateFunction(sum, UInt64),
    count_post              SimpleAggregateFunction(sum, UInt64),
    uniq_paths              AggregateFunction(uniq, String),
    uniq_query_params       AggregateFunction(uniq, String),

    -- TCP / TLS / HTTP samples
    tcp_fp_raw              SimpleAggregateFunction(any, String),
    tcp_jitter_variance     AggregateFunction(varPop, Float64),
    tcp_win_raw             SimpleAggregateFunction(any, UInt32),
    tcp_scale_raw           SimpleAggregateFunction(any, UInt32),
    tcp_mss_raw             SimpleAggregateFunction(any, UInt32),
    tcp_ttl_raw             SimpleAggregateFunction(any, UInt32),
    http_ver_raw            SimpleAggregateFunction(any, String),
    tls_alpn_raw            SimpleAggregateFunction(any, String),
    tls_sni_raw             SimpleAggregateFunction(any, String),
    first_ua                SimpleAggregateFunction(any, String),
    correlated_raw          SimpleAggregateFunction(max, UInt8),

    -- Connection behaviour
    unique_src_ports        AggregateFunction(uniq, UInt16),
    unique_conn_id          AggregateFunction(uniq, String),
    max_keepalives          SimpleAggregateFunction(max, UInt32),
    orphan_count            SimpleAggregateFunction(sum, UInt64),
    ip_id_zero_count        SimpleAggregateFunction(sum, UInt64),
    total_ip_length_var     AggregateFunction(varPop, Float64),
    mss_1460_count          SimpleAggregateFunction(sum, UInt64),

    -- Request behaviour
    count_assets            SimpleAggregateFunction(sum, UInt64),
    count_no_referer        SimpleAggregateFunction(sum, UInt64),
    uniq_ua                 AggregateFunction(uniq, String),
    max_requests_per_sec    SimpleAggregateFunction(max, UInt32),
    url_depth_variance      AggregateFunction(varPop, Float64),
    count_anomalous_payload SimpleAggregateFunction(sum, UInt64),

    -- B features
    uniq_ja3                AggregateFunction(uniq, String),
    avg_syn_ms              AggregateFunction(avg, Float64),
    tls12_count             SimpleAggregateFunction(sum, UInt64),
    count_head              SimpleAggregateFunction(sum, UInt64),
    count_no_sec_fetch      SimpleAggregateFunction(sum, UInt64),
    count_generic_accept    SimpleAggregateFunction(sum, UInt64),
    count_http10            SimpleAggregateFunction(sum, UInt64),
    ip_df_var               AggregateFunction(varPop, Float64),

    -- TTL features (L4 fingerprint / OS)
    avg_ttl                 AggregateFunction(avgIf, Float64, UInt8),
    ttl_var                 AggregateFunction(varPopIf, Float64, UInt8),
    count_no_wscale         SimpleAggregateFunction(sum, UInt64),
    count_correlated        SimpleAggregateFunction(sum, UInt64),

    -- HTTP features
    count_no_accept_enc     SimpleAggregateFunction(sum, UInt64),
    count_http_scheme       SimpleAggregateFunction(sum, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY (window_start, src_ip, ja4, host);
-- -----------------------------------------------------------------------------
-- mv_agg_host_ip_ja4_1h: rolls every insert into http_logs up into hourly
-- rows of agg_host_ip_ja4_1h, grouped by (hour, src_ip, ja4, host, src_asn).
-- -----------------------------------------------------------------------------
DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h;

CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h
TO mabase_prod.agg_host_ip_ja4_1h AS
SELECT
    -- Grouping columns
    toStartOfHour(src.time) AS window_start,
    toIPv6(src.src_ip) AS src_ip,
    src.ja4,
    src.host,
    src.src_asn,

    -- IPLocate attributes
    any(src.src_country_code) AS src_country_code,
    any(src.src_as_name) AS src_as_name,
    any(src.src_org) AS src_org,
    any(src.src_domain) AS src_domain,

    -- Volume and timing
    min(src.time) AS first_seen,
    max(src.time) AS last_seen,
    count() AS hits,
    sum(IF(src.method = 'POST', 1, 0)) AS count_post,
    uniqState(src.path) AS uniq_paths,
    uniqState(src.query) AS uniq_query_params,

    -- TCP fingerprint: hash of window size, MSS, window scale and options
    any(
        toString(
            cityHash64(
                concat(
                    toString(src.tcp_meta_window_size),
                    toString(src.tcp_meta_mss),
                    toString(src.tcp_meta_window_scale),
                    src.tcp_meta_options
                )
            )
        )
    ) AS tcp_fp_raw,
    varPopState(toFloat64(src.syn_to_clienthello_ms)) AS tcp_jitter_variance,
    any(src.tcp_meta_window_size) AS tcp_win_raw,
    any(src.tcp_meta_window_scale) AS tcp_scale_raw,
    any(src.tcp_meta_mss) AS tcp_mss_raw,
    any(src.ip_meta_ttl) AS tcp_ttl_raw,
    any(src.http_version) AS http_ver_raw,
    any(src.tls_alpn) AS tls_alpn_raw,
    any(src.tls_sni) AS tls_sni_raw,
    any(src.header_user_agent) AS first_ua,
    max(toUInt8(src.correlated)) AS correlated_raw,

    -- Connection behaviour
    uniqState(toUInt16(src.src_port)) AS unique_src_ports,
    uniqState(src.conn_id) AS unique_conn_id,
    max(toUInt32(src.keepalives)) AS max_keepalives,
    sum(IF(src.orphan_side = 'A' OR src.correlated = 0, 1, 0)) AS orphan_count,
    sum(IF(src.ip_meta_id = 0, 1, 0)) AS ip_id_zero_count,
    varPopState(toFloat64(src.ip_meta_total_length)) AS total_ip_length_var,
    sum(IF(src.tcp_meta_mss = 1460, 1, 0)) AS mss_1460_count,

    -- Request behaviour
    sum(IF(match(src.path, '(?i)\.(png|jpg|jpeg|gif|css|js|ico|woff2|svg|eot)$'), 1, 0)) AS count_assets,
    sum(IF(position(src.client_headers, 'Referer') = 0, 1, 0)) AS count_no_referer,
    uniqState(src.header_user_agent) AS uniq_ua,
    -- Constant placeholder: this view always writes 0 here.
    0 AS max_requests_per_sec,
    -- Doubling every '/' lengthens the path by its slash count, so the
    -- difference is the number of '/' characters in the path.
    varPopState(toFloat64(length(replaceAll(src.path, '/', '//')) - length(src.path))) AS url_depth_variance,
    sum(IF(src.ip_meta_total_length < 60 OR src.ip_meta_total_length > 1500, 1, 0)) AS count_anomalous_payload,

    -- B features
    uniqState(src.ja3) AS uniq_ja3,
    avgState(toFloat64(src.syn_to_clienthello_ms)) AS avg_syn_ms,
    sum(IF(src.tls_version = '1.2', 1, 0)) AS tls12_count,
    sum(IF(src.method = 'HEAD', 1, 0)) AS count_head,
    sum(IF(length(src.header_sec_fetch_site) = 0, 1, 0)) AS count_no_sec_fetch,
    sum(IF(length(src.header_accept) < 5, 1, 0)) AS count_generic_accept,
    sum(IF(src.http_version = 'HTTP/1.0', 1, 0)) AS count_http10,
    varPopState(toFloat64(src.ip_meta_df)) AS ip_df_var,

    -- TTL features: rows with ip_meta_ttl = 0 are left out of both states
    avgIfState(toFloat64(src.ip_meta_ttl), src.ip_meta_ttl > 0) AS avg_ttl,
    varPopIfState(toFloat64(src.ip_meta_ttl), src.ip_meta_ttl > 0) AS ttl_var,
    sum(IF(src.tcp_meta_window_scale = 0 AND src.correlated = 1, 1, 0)) AS count_no_wscale,
    sum(toUInt64(src.correlated)) AS count_correlated,

    -- HTTP features
    sum(IF(length(src.header_accept_encoding) = 0, 1, 0)) AS count_no_accept_enc,
    sum(IF(src.scheme = 'http', 1, 0)) AS count_http_scheme
FROM mabase_prod.http_logs AS src
GROUP BY
    window_start,
    src_ip,
    ja4,
    host,
    src_asn;
-- -----------------------------------------------------------------------------
-- agg_header_fingerprint_1h: hourly HTTP header fingerprint (L7) per
-- (window_start, src_ip). Fed by mv_agg_header_fingerprint_1h.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS mabase_prod.agg_header_fingerprint_1h
(
    -- Grouping columns
    window_start         DateTime,
    src_ip               IPv6,

    -- Header list shape
    header_order_hash    SimpleAggregateFunction(any, String),
    header_count         SimpleAggregateFunction(max, UInt16),

    -- Presence flags (1 if seen at least once in the window)
    has_accept_language  SimpleAggregateFunction(max, UInt8),
    has_cookie           SimpleAggregateFunction(max, UInt8),
    has_referer          SimpleAggregateFunction(max, UInt8),

    -- Browser plausibility
    modern_browser_score SimpleAggregateFunction(max, UInt8),
    ua_ch_mismatch       SimpleAggregateFunction(max, UInt8),
    sec_fetch_mode       SimpleAggregateFunction(any, String),
    sec_fetch_dest       SimpleAggregateFunction(any, String)
)
ENGINE = AggregatingMergeTree()
ORDER BY (window_start, src_ip);
-- mv_agg_header_fingerprint_1h: rolls every insert into http_logs up into
-- hourly rows of agg_header_fingerprint_1h, grouped by (hour, source IP).
DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h;

CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h
TO mabase_prod.agg_header_fingerprint_1h AS
SELECT
    toStartOfHour(src.time) AS window_start,
    toIPv6(src.src_ip) AS src_ip,
    any(toString(cityHash64(src.client_headers))) AS header_order_hash,
    -- Number of commas in client_headers, plus one.
    -- NOTE(review): presumably client_headers is a comma-separated list of
    -- header names; an empty value still yields 1 here — confirm that is intended.
    max(
        toUInt16(
            length(src.client_headers)
            - length(replaceAll(src.client_headers, ',', ''))
            + 1
        )
    ) AS header_count,
    max(toUInt8(if(position(src.client_headers, 'Accept-Language') > 0, 1, 0))) AS has_accept_language,
    max(toUInt8(if(position(src.client_headers, 'Cookie') > 0, 1, 0))) AS has_cookie,
    max(toUInt8(if(position(src.client_headers, 'Referer') > 0, 1, 0))) AS has_referer,
    -- 100 when Sec-CH-UA is present, else 50 when a User-Agent is present, else 0.
    max(
        toUInt8(
            if(
                length(src.header_sec_ch_ua) > 0,
                100,
                if(length(src.header_user_agent) > 0, 50, 0)
            )
        )
    ) AS modern_browser_score,
    -- 1 when the User-Agent mentions Windows / iPhone but Sec-CH-UA-Platform
    -- does not mention Windows / iOS respectively.
    max(
        toUInt8(
            if(
                (
                    position(src.header_user_agent, 'Windows') > 0
                    AND position(src.header_sec_ch_ua_platform, 'Windows') = 0
                )
                OR (
                    position(src.header_user_agent, 'iPhone') > 0
                    AND position(src.header_sec_ch_ua_platform, 'iOS') = 0
                ),
                1,
                0
            )
        )
    ) AS ua_ch_mismatch,
    any(src.header_sec_fetch_mode) AS sec_fetch_mode,
    any(src.header_sec_fetch_dest) AS sec_fetch_dest
FROM mabase_prod.http_logs AS src
GROUP BY
    window_start,
    src.src_ip;

View File

@ -0,0 +1,90 @@
-- =============================================================================
-- 06_ml_tables.sql — ML detection results tables
-- Source: bot_detector/deploy_views.sql sections 6-6b + deploy_schema.sql items 11-12
-- =============================================================================
-- -----------------------------------------------------------------------------
-- ml_detected_anomalies — anomaly detections above threat threshold
-- -----------------------------------------------------------------------------
-- Stores one detection record per source IP together with the feature values
-- that accompanied it. Rows expire 30 days after detected_at (see TTL below).
-- NOTE(review): ReplacingMergeTree(detected_at) with ORDER BY (src_ip) collapses
-- rows sharing a src_ip to the one with the greatest detected_at during merges,
-- while view_ip_recurrence counts rows per src_ip from this table; confirm that
-- the resulting merge-dependent count is intended.
CREATE TABLE IF NOT EXISTS mabase_prod.ml_detected_anomalies
(
-- Identity and verdict
detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String,
anomaly_score Float32, threat_level String, model_name String, recurrence UInt32,
-- ASN / geo enrichment
asn_number String, asn_org String, asn_detail String, asn_domain String,
country_code String, asn_label String,
-- Feature columns (names mirror the columns exposed by view_ai_features_1h)
hits UInt64, hit_velocity Float32, fuzzing_index Float32, post_ratio Float32,
port_exhaustion_ratio Float32, max_keepalives UInt32, orphan_ratio Float32,
tcp_jitter_variance Float32, tcp_shared_count UInt32, true_window_size UInt64,
window_mss_ratio Float32, alpn_http_mismatch UInt8, is_alpn_missing UInt8,
sni_host_mismatch UInt8, header_count UInt16, has_accept_language UInt8,
has_cookie UInt8, has_referer UInt8, modern_browser_score UInt8, is_headless UInt8,
ua_ch_mismatch UInt8, header_order_shared_count UInt32, ip_id_zero_ratio Float32,
request_size_variance Float32, multiplexing_efficiency Float32,
mss_mobile_mismatch UInt8, correlated UInt8, reason String,
asset_ratio Float32, direct_access_ratio Float32, is_ua_rotating UInt8,
distinct_ja4_count UInt32, src_port_density Float32, ja4_asn_concentration Float32,
ja4_country_concentration Float32, is_rare_ja4 UInt8, header_order_confidence Float32,
distinct_header_orders UInt32, temporal_entropy Float32, path_diversity_ratio Float32,
url_depth_variance Float32, anomalous_payload_ratio Float32,
-- v11 additions
campaign_id Int32 DEFAULT -1,
raw_anomaly_score Float32 DEFAULT 0,
-- Anubis enrichment (deploy_schema.sql item 11)
anubis_bot_name LowCardinality(String) DEFAULT '',
anubis_bot_action LowCardinality(String) DEFAULT '',
anubis_bot_category LowCardinality(String) DEFAULT ''
)
-- detected_at is the version column: the newest row per sorting key wins on merge.
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (src_ip)
TTL detected_at + INTERVAL 30 DAY;
-- -----------------------------------------------------------------------------
-- ml_all_scores — all classifications (no threshold, for observability)
-- -----------------------------------------------------------------------------
-- Every classification result, regardless of threshold. Rows are deduplicated
-- on (window_start, src_ip, ja4, host, model_name), keeping the greatest
-- detected_at, and expire 3 days after window_start.
CREATE TABLE IF NOT EXISTS mabase_prod.ml_all_scores
(
    -- When the score was produced and which window it describes.
    detected_at         DateTime,
    window_start        DateTime,

    -- Scored entity.
    src_ip              IPv6,
    ja4                 String,
    host                String,
    bot_name            String,

    -- Model output.
    anomaly_score       Float32,
    raw_anomaly_score   Float32,
    threat_level        String,
    model_name          String,
    correlated          UInt8,

    -- ASN / geo enrichment.
    asn_number          String,
    asn_org             String,
    country_code        String,
    asn_label           String,

    -- Subset of traffic features kept for observability.
    hits                UInt64,
    hit_velocity        Float32,
    fuzzing_index       Float32,
    post_ratio          Float32,
    campaign_id         Int32,

    -- Anubis enrichment (deploy_schema.sql item 12)
    anubis_bot_name     LowCardinality(String) DEFAULT '',
    anubis_bot_action   LowCardinality(String) DEFAULT '',
    anubis_bot_category LowCardinality(String) DEFAULT ''
)
ENGINE = ReplacingMergeTree(detected_at)
ORDER BY (window_start, src_ip, ja4, host, model_name)
TTL window_start + INTERVAL 3 DAY
SETTINGS index_granularity = 8192;
-- -----------------------------------------------------------------------------
-- view_ip_recurrence — recurrence aggregation over ml_detected_anomalies
-- -----------------------------------------------------------------------------
-- Per-IP summary of the rows currently stored in ml_detected_anomalies.
CREATE OR REPLACE VIEW mabase_prod.view_ip_recurrence AS
SELECT
    src_ip,
    count()                             AS recurrence,          -- rows stored for this IP
    min(detected_at)                    AS first_seen,
    max(detected_at)                    AS last_seen,
    -- The lowest anomaly_score is treated as the worst one
    -- (presumably lower means more anomalous — confirm against the model).
    min(anomaly_score)                  AS worst_score,
    argMin(threat_level, anomaly_score) AS worst_threat_level   -- threat_level of that row
FROM mabase_prod.ml_detected_anomalies
GROUP BY src_ip;

View File

@ -0,0 +1,185 @@
-- =============================================================================
-- 07_ai_features_view.sql — AI feature view with full Anubis enrichment
-- Source: bot_detector/anubis/view_ai_features_anubis.sql
-- Includes combined UA+IP priority logic and Anubis bot_name/action/category.
-- =============================================================================
-- view_ai_features_1h: one row per (window_start, src_ip, ja4, host, src_asn)
-- over the last 24 hours, joining the traffic rollup agg_host_ip_ja4_1h with the
-- header rollup agg_header_fingerprint_1h and adding dictionary-based bot/Anubis
-- labels plus derived ratio and window-function features.
CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS
-- base_data: per-row features; the final SELECT adds features that need base_data columns.
WITH base_data AS (
SELECT
a.window_start, a.src_ip, a.ja4, a.host,
toString(a.src_asn) AS asn_number,
a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain,
a.src_country_code AS country_code,
dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label,
-- bot_name: IP dictionary match first, then JA4 dictionary match, else ''.
COALESCE(
nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''),
nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''),
''
) AS bot_name,
-- Anubis: combined UA+IP priority logic > UA only > IP only > ASN > Country
-- Branch 1 requires has_ip = '1' on the UA rule, a non-empty bot_name in both the
-- UA and IP dictionaries, and the same rule_id in both.
CASE
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
= dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua)
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua)
WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '')
ELSE ''
END AS anubis_bot_name,
-- Same branch conditions as anubis_bot_name, returning the 'action' attribute.
-- The three CASE expressions must stay in sync when a condition changes.
CASE
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
= dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua)
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua)
WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', a.src_ip, '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', toUInt32(a.src_asn), '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', a.src_country_code, '')
ELSE ''
END AS anubis_bot_action,
-- Same branch conditions again, returning the 'category' attribute.
CASE
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua))
= dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0))
THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua)
WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0'
AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != ''
THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua)
WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0
AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', a.src_ip, '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', toUInt32(a.src_asn), '')
WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != ''
THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', a.src_country_code, '')
ELSE ''
END AS anubis_bot_category,
a.hits AS hits,
-- Hits of this IP summed over every row of the 24 h lookback (all windows, JA4s, hosts).
sum(a.hits) OVER (PARTITION BY a.src_ip) AS total_ip_hits,
a.correlated AS correlated,
a.tcp_jitter_variance AS tcp_jitter_variance,
a.true_window_size AS true_window_size,
a.window_mss_ratio AS window_mss_ratio,
a.max_keepalives AS max_keepalives,
-- Header features come from the LEFT JOIN; rows without a header match carry
-- the join's fill values for these columns.
h.header_order_hash AS header_order_hash, h.header_count AS header_count,
h.has_accept_language AS has_accept_language, h.has_cookie AS has_cookie,
h.has_referer AS has_referer, h.modern_browser_score AS modern_browser_score,
h.ua_ch_mismatch AS ua_ch_mismatch,
-- Ratio features: the "+ 1" in each denominator keeps the division defined when the count is 0.
(a.count_post / (a.hits + 1)) AS post_ratio,
(a.uniq_query_params / (a.uniq_paths + 1)) AS fuzzing_index,
(a.hits / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS hit_velocity,
(a.unique_src_ports / (a.hits + 1)) AS port_exhaustion_ratio,
(a.orphan_count / (a.hits + 1)) AS orphan_ratio,
(a.ip_id_zero_count / (a.hits + 1)) AS ip_id_zero_ratio,
(a.hits / (a.unique_conn_id + 1)) AS multiplexing_efficiency,
-- 1 when more than 80% of hits had MSS 1460 and the header score exceeds 70.
IF(a.mss_1460_count > (a.hits * 0.8) AND h.modern_browser_score > 70, 1, 0) AS mss_mobile_mismatch,
a.request_size_variance AS request_size_variance,
-- Protocol consistency flags.
IF(a.tls_alpn = 'h2' AND a.http_version != '2', 1, 0) AS alpn_http_mismatch,
IF(length(a.tls_alpn) = 0 OR a.tls_alpn = '00', 1, 0) AS is_alpn_missing,
IF(length(a.tls_sni) > 0 AND a.tls_sni != a.host, 1, 0) AS sni_host_mismatch,
IF(h.sec_fetch_mode = 'navigate' AND h.sec_fetch_dest != 'document', 1, 0) AS is_fake_navigation,
-- Number of rows in the lookback sharing the same TCP fingerprint / header hash.
count() OVER (PARTITION BY a.tcp_fingerprint) AS tcp_shared_count,
count() OVER (PARTITION BY h.header_order_hash) AS header_order_shared_count,
(a.count_assets / (a.hits + 1)) AS asset_ratio,
(a.count_no_referer / (a.hits + 1)) AS direct_access_ratio,
IF(a.unique_ua > 2, 1, 0) AS is_ua_rotating,
uniqExact(a.ja4) OVER (PARTITION BY a.src_ip) AS distinct_ja4_count,
((a.hits / (a.unique_src_ports + 1)) / (dateDiff('second', a.first_seen, a.last_seen) + 1)) AS src_port_density,
-- Share of a JA4's hits that come from this row's ASN / country.
(sum(a.hits) OVER (PARTITION BY a.ja4, a.src_asn) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_asn_concentration,
(sum(a.hits) OVER (PARTITION BY a.ja4, a.src_country_code) / (sum(a.hits) OVER (PARTITION BY a.ja4) + 1)) AS ja4_country_concentration,
-- A JA4 with fewer than 100 hits in the lookback is flagged as rare.
IF(sum(a.hits) OVER (PARTITION BY a.ja4) < 100, 1, 0) AS is_rare_ja4,
-- Fraction of rows with this first_ua that also share this header hash.
(count() OVER (PARTITION BY h.header_order_hash, a.first_ua) / (count() OVER (PARTITION BY a.first_ua) + 1)) AS header_order_confidence,
uniqExact(h.header_order_hash) OVER (PARTITION BY a.src_ip) AS distinct_header_orders,
(a.uniq_paths / (a.hits + 1)) AS path_diversity_ratio,
a.url_depth_variance AS url_depth_variance,
(a.count_anomalous_payload / (a.hits + 1)) AS anomalous_payload_ratio,
-- Kept per row so the outer SELECT can sum it per IP (ja3_diversity_ratio).
a.uniq_ja3_val AS uniq_ja3_per_row,
-- Standard deviation of jitter divided by the average SYN timing, floored at 1.
sqrt(a.tcp_jitter_variance) / greatest(a.avg_syn_ms_val, 1) AS syn_timing_cv,
a.tls12_count / (a.hits + 1) AS tls12_ratio,
a.count_head / (a.hits + 1) AS head_ratio,
a.count_no_sec_fetch / (a.hits + 1) AS sec_fetch_absence_rate,
a.count_generic_accept / (a.hits + 1) AS generic_accept_ratio,
a.count_http10 / (a.hits + 1) AS http10_ratio,
a.ip_df_variance AS ip_df_variance,
a.avg_ttl_val AS avg_ttl,
sqrt(a.ttl_variance_val) AS ttl_std,
-- Guarded division: 0 when no correlated rows were counted.
IF(a.count_correlated_val > 0, a.count_no_wscale_val / a.count_correlated_val, 0) AS no_window_scale_ratio,
a.count_no_accept_enc_val / (a.hits + 1) AS missing_accept_enc_ratio,
a.count_http_scheme_val / (a.hits + 1) AS http_scheme_ratio
FROM (
-- a: finalize the aggregate states of agg_host_ip_ja4_1h for the last 24 hours
-- (sum for counters, *Merge for aggregate-function states, any/min/max otherwise).
SELECT
window_start, src_ip, ja4, host, src_asn,
any(src_country_code) AS src_country_code, any(src_as_name) AS src_as_name,
any(src_org) AS src_org, any(src_domain) AS src_domain, any(first_ua) AS first_ua,
sum(hits) AS hits, uniqMerge(uniq_paths) AS uniq_paths,
uniqMerge(uniq_query_params) AS uniq_query_params, sum(count_post) AS count_post,
min(first_seen) AS first_seen, max(last_seen) AS last_seen,
any(tcp_fp_raw) AS tcp_fingerprint, varPopMerge(tcp_jitter_variance) AS tcp_jitter_variance,
varPopMerge(total_ip_length_var) AS request_size_variance,
-- Advertised window multiplied by 2^scale.
any(tcp_win_raw * exp2(tcp_scale_raw)) AS true_window_size,
IF(any(tcp_mss_raw) > 0, any(tcp_win_raw) / any(tcp_mss_raw), 0) AS window_mss_ratio,
any(http_ver_raw) AS http_version, any(tls_alpn_raw) AS tls_alpn, any(tls_sni_raw) AS tls_sni,
max(correlated_raw) AS correlated, uniqMerge(unique_src_ports) AS unique_src_ports,
uniqMerge(unique_conn_id) AS unique_conn_id, max(max_keepalives) AS max_keepalives,
sum(orphan_count) AS orphan_count, sum(ip_id_zero_count) AS ip_id_zero_count,
sum(mss_1460_count) AS mss_1460_count,
sum(count_assets) AS count_assets, sum(count_no_referer) AS count_no_referer,
uniqMerge(uniq_ua) AS unique_ua,
varPopMerge(url_depth_variance) AS url_depth_variance,
sum(count_anomalous_payload) AS count_anomalous_payload,
uniqMerge(uniq_ja3) AS uniq_ja3_val,
avgMerge(avg_syn_ms) AS avg_syn_ms_val,
sum(tls12_count) AS tls12_count,
sum(count_head) AS count_head,
sum(count_no_sec_fetch) AS count_no_sec_fetch,
sum(count_generic_accept) AS count_generic_accept,
sum(count_http10) AS count_http10,
varPopMerge(ip_df_var) AS ip_df_variance,
avgIfMerge(avg_ttl) AS avg_ttl_val,
varPopIfMerge(ttl_var) AS ttl_variance_val,
sum(count_no_wscale) AS count_no_wscale_val,
sum(count_correlated) AS count_correlated_val,
sum(count_no_accept_enc) AS count_no_accept_enc_val,
sum(count_http_scheme) AS count_http_scheme_val
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY window_start, src_ip, ja4, host, src_asn
) a
LEFT JOIN (
-- h: header fingerprint per (window_start, src_ip) over the same 24 h lookback.
SELECT
window_start, src_ip, any(header_order_hash) AS header_order_hash,
max(header_count) AS header_count, max(has_accept_language) AS has_accept_language,
max(has_cookie) AS has_cookie, max(has_referer) AS has_referer,
max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch,
any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest
FROM mabase_prod.agg_header_fingerprint_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY window_start, src_ip
) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start
)
-- Final projection: every base_data column plus two per-IP features.
SELECT
*,
-- Entropy-style sum of -p*log2(p) per IP, where p = hits / (total_ip_hits + 1);
-- the 0.000001 term keeps log2 defined when p is 0.
-(sum((hits / (total_ip_hits + 1)) * log2((hits / (total_ip_hits + 1)) + 0.000001)) OVER (PARTITION BY src_ip)) AS temporal_entropy,
-- Sum of per-row JA3 cardinalities for the IP divided by its distinct JA4 count (floored at 1).
sum(uniq_ja3_per_row) OVER (PARTITION BY src_ip) / greatest(distinct_ja4_count, 1) AS ja3_diversity_ratio
FROM base_data;

View File

@ -0,0 +1,20 @@
-- =============================================================================
-- 08_users.sql — ClickHouse users and grants
-- TODO: Replace 'ChangeMe' with strong passwords before production use.
-- Store passwords in a secrets manager (Vault, K8s secrets, etc.).
-- =============================================================================
-- Accounts are created with sha256_password so that the server stores a hash
-- of the password instead of the clear text; clients still authenticate with
-- the password itself. 'ChangeMe' is a placeholder and must be replaced.
CREATE USER IF NOT EXISTS data_writer IDENTIFIED WITH sha256_password BY 'ChangeMe';
CREATE USER IF NOT EXISTS analyst IDENTIFIED WITH sha256_password BY 'ChangeMe';
-- data_writer: INSERT and SELECT on the raw ingest table only (fed by correlator service)
GRANT INSERT ON mabase_prod.http_logs_raw TO data_writer;
GRANT SELECT ON mabase_prod.http_logs_raw TO data_writer;
-- analyst: read access on parsed and derived tables
GRANT SELECT ON mabase_prod.http_logs TO analyst;
GRANT SELECT ON mabase_prod.ml_detected_anomalies TO analyst;
GRANT SELECT ON mabase_prod.ml_all_scores TO analyst;
GRANT SELECT ON mabase_prod.view_ai_features_1h TO analyst;
GRANT SELECT ON mabase_prod.view_ip_recurrence TO analyst;
GRANT SELECT ON mabase_prod.audit_logs TO analyst;

View File

@ -0,0 +1,21 @@
-- =============================================================================
-- 09_audit_table.sql — SOC audit log table for dashboard activity tracking
-- Referenced as mabase_prod.audit_logs in dashboard/backend/routes/audit.py
-- =============================================================================
-- audit_logs: audit trail rows, partitioned by day and kept for 90 days.
CREATE TABLE IF NOT EXISTS mabase_prod.audit_logs
(
    `timestamp`    DateTime DEFAULT now(),
    `user_name`    LowCardinality(String) DEFAULT 'soc_user',
    `action`       LowCardinality(String),
    `entity_type`  LowCardinality(String) DEFAULT '',
    `entity_id`    String DEFAULT '',
    `entity_count` UInt32 DEFAULT 0,
    -- In a ClickHouse column declaration the DEFAULT expression comes before
    -- the CODEC clause.
    `details`      String DEFAULT '' CODEC(ZSTD(3)),
    `client_ip`    String DEFAULT ''
)
ENGINE = MergeTree
PARTITION BY toDate(timestamp)
ORDER BY (timestamp, user_name, action)
TTL toDate(timestamp) + INTERVAL 90 DAY
SETTINGS index_granularity = 8192;

View File

@ -0,0 +1,48 @@
# ClickHouse Migrations — ja4-platform
## Migration Order
Apply these files in numeric order against the ClickHouse server:
```bash
clickhouse-client --multiquery < 00_database.sql
clickhouse-client --multiquery < 01_raw_tables.sql
clickhouse-client --multiquery < 02_dictionaries.sql
clickhouse-client --multiquery < 03_anubis_tables.sql
clickhouse-client --multiquery < 04_mv_http_logs.sql
clickhouse-client --multiquery < 05_aggregation_tables.sql
clickhouse-client --multiquery < 06_ml_tables.sql
clickhouse-client --multiquery < 07_ai_features_view.sql
clickhouse-client --multiquery < 08_users.sql
clickhouse-client --multiquery < 09_audit_table.sql
```
## File Descriptions
| File | Contents |
|------|----------|
| `00_database.sql` | CREATE DATABASE |
| `01_raw_tables.sql` | `http_logs_raw` ingest table |
| `02_dictionaries.sql` | ASN geo dict, bot IP/JA4/network reference tables |
| `03_anubis_tables.sql` | Anubis crawler rule tables and dictionaries (UA, IP, ASN, country) |
| `04_mv_http_logs.sql` | Canonical `http_logs` target table + `mv_http_logs` materialized view with full Anubis enrichment |
| `05_aggregation_tables.sql` | `agg_host_ip_ja4_1h`, `agg_header_fingerprint_1h` + their MVs |
| `06_ml_tables.sql` | `ml_detected_anomalies`, `ml_all_scores`, `view_ip_recurrence` |
| `07_ai_features_view.sql` | `view_ai_features_1h` with Anubis enrichment |
| `08_users.sql` | ClickHouse users and grants |
| `09_audit_table.sql` | `audit_logs` table for SOC dashboard audit trail |
## Prerequisites
Place CSV data files in `/var/lib/clickhouse/user_files/`:
- `iplocate-ip-to-asn.csv` — IP-to-ASN mapping (from IPLocate)
- `bot_ip.csv` — Known bot IP prefixes
- `bot_ja4.csv` — Known bot JA4 fingerprints
- `asn_reputation.csv` — ASN reputation labels
## Notes
- `04_mv_http_logs.sql` is the **canonical** version of the MV, superseding the base version
in `services/correlator/sql/init.sql`. It includes full Anubis enrichment.
- All migrations are idempotent: they use `IF NOT EXISTS`, `IF EXISTS`, or `CREATE OR REPLACE`, so they can be re-applied safely.
- Anubis dictionary passwords in `03_anubis_tables.sql` **must** be changed before production use.

View File

@ -0,0 +1,114 @@
// Package config provides generic YAML config loading with env override support.
package config
import (
"fmt"
"os"
"reflect"
"strconv"
"strings"
"gopkg.in/yaml.v3"
)
// LoadYAML loads the YAML document stored at path and decodes it into a value of type T.
//
// With optional set, an empty path or a file that does not exist yields the
// zero value of T and a nil error. Without it, an empty path is an error.
// Any other read failure, and any YAML decoding failure, is returned wrapped
// together with the offending path.
func LoadYAML[T any](path string, optional bool) (T, error) {
	var empty T

	if path == "" {
		if !optional {
			return empty, fmt.Errorf("config path is empty")
		}
		return empty, nil
	}

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		// Only "file does not exist" is forgiven for optional configs.
		if optional && os.IsNotExist(readErr) {
			return empty, nil
		}
		return empty, fmt.Errorf("reading config file %q: %w", path, readErr)
	}

	var decoded T
	if parseErr := yaml.Unmarshal(raw, &decoded); parseErr != nil {
		return empty, fmt.Errorf("parsing config file %q: %w", path, parseErr)
	}
	return decoded, nil
}
// OverrideFromEnv applies environment variable overrides to a struct using struct tags.
// Tag format: env:"ENV_VAR_NAME"
// Supports field types: string, signed and unsigned integers, bool, []string (comma-separated).
// Nested struct fields are processed recursively with the same prefix.
// envPrefix is prepended to tag values if non-empty (e.g. envPrefix="APP" + tag="PORT" → "APP_PORT").
//
// An error is returned when cfg is nil, when T is not a struct type, or when an
// environment value cannot be parsed for its target field.
func OverrideFromEnv[T any](cfg *T, envPrefix string) error {
	if cfg == nil {
		return fmt.Errorf("config: OverrideFromEnv called with a nil config pointer")
	}
	target := reflect.ValueOf(cfg).Elem()
	// Field iteration is only defined for structs; report other types instead of panicking.
	if target.Kind() != reflect.Struct {
		return fmt.Errorf("config: OverrideFromEnv requires a pointer to a struct, got pointer to %s", target.Kind())
	}
	return overrideStruct(target, envPrefix)
}
func overrideStruct(v reflect.Value, envPrefix string) error {
t := v.Type()
for i := 0; i < t.NumField(); i++ {
field := t.Field(i)
fv := v.Field(i)
if !fv.CanSet() {
continue
}
// Recurse into embedded/nested structs
if fv.Kind() == reflect.Struct {
if err := overrideStruct(fv, envPrefix); err != nil {
return err
}
continue
}
envTag := field.Tag.Get("env")
if envTag == "" {
continue
}
envKey := envTag
if envPrefix != "" {
envKey = envPrefix + "_" + envTag
}
val := os.Getenv(envKey)
if val == "" {
continue
}
switch fv.Kind() {
case reflect.String:
fv.SetString(val)
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
n, err := strconv.ParseInt(val, 10, 64)
if err != nil {
return fmt.Errorf("env %s: cannot parse %q as int: %w", envKey, val, err)
}
fv.SetInt(n)
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
n, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return fmt.Errorf("env %s: cannot parse %q as uint: %w", envKey, val, err)
}
fv.SetUint(n)
case reflect.Bool:
b, err := strconv.ParseBool(val)
if err != nil {
return fmt.Errorf("env %s: cannot parse %q as bool: %w", envKey, val, err)
}
fv.SetBool(b)
case reflect.Slice:
if fv.Type().Elem().Kind() == reflect.String {
parts := strings.Split(val, ",")
for i, p := range parts {
parts[i] = strings.TrimSpace(p)
}
fv.Set(reflect.ValueOf(parts))
}
}
}
return nil
}

View File

@ -0,0 +1,139 @@
package config
import (
"os"
"path/filepath"
"testing"
)
// testConfig is the fixture type used by the tests in this file. Every field
// carries both a yaml tag (exercised through LoadYAML) and an env tag
// (exercised through OverrideFromEnv).
type testConfig struct {
// Host covers the string field path.
Host string `yaml:"host" env:"HOST"`
// Port covers the signed integer field path.
Port int `yaml:"port" env:"PORT"`
// TLS covers the boolean field path.
TLS bool `yaml:"tls" env:"TLS"`
// Tags covers the []string field path (comma-separated in the environment).
Tags []string `yaml:"tags" env:"TAGS"`
}
// TestLoadYAML_Basic writes a small YAML document to a temporary file and
// checks that each field type is decoded into testConfig.
func TestLoadYAML_Basic(t *testing.T) {
	content := `
host: myhost
port: 9000
tls: true
tags:
- a
- b
`
	path := filepath.Join(t.TempDir(), "test.yml")
	if writeErr := os.WriteFile(path, []byte(content), 0600); writeErr != nil {
		t.Fatal(writeErr)
	}

	got, loadErr := LoadYAML[testConfig](path, false)
	if loadErr != nil {
		t.Fatalf("LoadYAML error: %v", loadErr)
	}

	if got.Host != "myhost" {
		t.Errorf("Host = %q, want %q", got.Host, "myhost")
	}
	if got.Port != 9000 {
		t.Errorf("Port = %d, want 9000", got.Port)
	}
	if got.TLS == false {
		t.Error("TLS should be true")
	}
	if n := len(got.Tags); n != 2 {
		t.Errorf("Tags len = %d, want 2", n)
	}
}
// TestLoadYAML_Optional_MissingFile checks that a missing file is tolerated
// when the config is optional and that the zero value is returned.
func TestLoadYAML_Optional_MissingFile(t *testing.T) {
	const missing = "/nonexistent/path.yml"

	got, loadErr := LoadYAML[testConfig](missing, true)
	if loadErr != nil {
		t.Fatalf("optional missing file should not error: %v", loadErr)
	}
	if host := got.Host; host != "" {
		t.Errorf("zero value expected, got host=%q", host)
	}
}
// TestLoadYAML_Required_MissingFile checks that a missing file is an error
// when the config is not optional.
func TestLoadYAML_Required_MissingFile(t *testing.T) {
	const missing = "/nonexistent/path.yml"

	if _, loadErr := LoadYAML[testConfig](missing, false); loadErr == nil {
		t.Error("expected error for missing required file")
	}
}
// TestLoadYAML_EmptyPath_Optional checks that an empty path with optional=true
// returns no error and the zero value of the config type.
func TestLoadYAML_EmptyPath_Optional(t *testing.T) {
	cfg, err := LoadYAML[testConfig]("", true)
	if err != nil {
		t.Fatalf("empty optional path should not error: %v", err)
	}
	// Assert the documented contract instead of discarding the result.
	if cfg.Host != "" || cfg.Port != 0 || cfg.TLS || cfg.Tags != nil {
		t.Errorf("zero value expected, got %+v", cfg)
	}
}
// TestOverrideFromEnv_String checks that a string field is replaced by the
// value of its environment variable.
func TestOverrideFromEnv_String(t *testing.T) {
	t.Setenv("HOST", "envhost")

	target := testConfig{Host: "original"}
	overrideErr := OverrideFromEnv(&target, "")
	if overrideErr != nil {
		t.Fatal(overrideErr)
	}
	if got := target.Host; got != "envhost" {
		t.Errorf("Host = %q, want envhost", got)
	}
}
// TestOverrideFromEnv_Int checks that an integer field is parsed from its
// environment variable.
func TestOverrideFromEnv_Int(t *testing.T) {
	t.Setenv("PORT", "8080")

	target := testConfig{Port: 1234}
	overrideErr := OverrideFromEnv(&target, "")
	if overrideErr != nil {
		t.Fatal(overrideErr)
	}
	if got := target.Port; got != 8080 {
		t.Errorf("Port = %d, want 8080", got)
	}
}
// TestOverrideFromEnv_Bool checks that a boolean field can be switched off
// through its environment variable.
func TestOverrideFromEnv_Bool(t *testing.T) {
	t.Setenv("TLS", "false")

	target := testConfig{TLS: true}
	overrideErr := OverrideFromEnv(&target, "")
	if overrideErr != nil {
		t.Fatal(overrideErr)
	}
	if target.TLS != false {
		t.Error("TLS should be false after env override")
	}
}
// TestOverrideFromEnv_Slice checks that a comma-separated environment value is
// split into a []string field, verifying every element rather than only the first.
func TestOverrideFromEnv_Slice(t *testing.T) {
	t.Setenv("TAGS", "x,y,z")
	cfg := testConfig{}
	if err := OverrideFromEnv(&cfg, ""); err != nil {
		t.Fatal(err)
	}
	want := []string{"x", "y", "z"}
	if len(cfg.Tags) != len(want) {
		t.Fatalf("Tags = %v, want [x y z]", cfg.Tags)
	}
	for i, tag := range want {
		if cfg.Tags[i] != tag {
			t.Errorf("Tags = %v, want [x y z]", cfg.Tags)
			break
		}
	}
}
// TestOverrideFromEnv_WithPrefix checks that the prefix and the tag are joined
// with an underscore to form the environment variable name.
func TestOverrideFromEnv_WithPrefix(t *testing.T) {
	t.Setenv("APP_HOST", "prefixed")

	target := testConfig{Host: "original"}
	overrideErr := OverrideFromEnv(&target, "APP")
	if overrideErr != nil {
		t.Fatal(overrideErr)
	}
	if got := target.Host; got != "prefixed" {
		t.Errorf("Host = %q, want prefixed", got)
	}
}
func TestOverrideFromEnv_NoEnvSet_NoChange(t *testing.T) {
os.Unsetenv("HOST")
os.Unsetenv("PORT")
cfg := testConfig{Host: "keep", Port: 42}
if err := OverrideFromEnv(&cfg, ""); err != nil {
t.Fatal(err)
}
if cfg.Host != "keep" || cfg.Port != 42 {
t.Errorf("unset env should not change values: host=%q port=%d", cfg.Host, cfg.Port)
}
}

View File

@ -0,0 +1,9 @@
module github.com/antitbone/ja4/ja4common
go 1.21
require (
gopkg.in/yaml.v3 v3.0.1
)
require gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect

View File

@ -0,0 +1,84 @@
// Package ipfilter provides IP address and CIDR range matching for ja4-platform services.
package ipfilter
import (
"fmt"
"net"
"sync"
)
// Filter checks if an IP address should be excluded based on a list of IPs or CIDR ranges.
// A Filter is built by New; a nil *Filter is accepted by ShouldExclude and excludes nothing.
type Filter struct {
// mu is read-locked by ShouldExclude and Count while they read the slices below.
mu sync.RWMutex
// networks holds the entries that parsed as CIDR ranges.
networks []*net.IPNet
// ips holds the entries that parsed as single addresses.
ips []net.IP
}
// New builds a Filter from a list of exclusion entries.
//
// Each entry is either a CIDR range ("10.0.0.0/8", "2001:db8::/32") or a single
// address ("192.168.1.1"); CIDR parsing is attempted first. Empty entries are
// skipped. The first entry that is neither a valid CIDR nor a valid IP aborts
// construction with an error.
func New(excludeList []string) (*Filter, error) {
	result := &Filter{
		networks: make([]*net.IPNet, 0),
		ips:      make([]net.IP, 0),
	}

	for _, item := range excludeList {
		if item == "" {
			continue
		}

		// CIDR notation takes precedence over a plain address.
		_, cidr, cidrErr := net.ParseCIDR(item)
		if cidrErr == nil {
			result.networks = append(result.networks, cidr)
			continue
		}

		addr := net.ParseIP(item)
		if addr == nil {
			return nil, fmt.Errorf("invalid IP or CIDR: %s", item)
		}
		result.ips = append(result.ips, addr)
	}

	return result, nil
}
// ShouldExclude reports whether ipStr matches one of the filter's single
// addresses or falls inside one of its CIDR ranges.
//
// A nil receiver and an unparsable ipStr both yield false.
func (f *Filter) ShouldExclude(ipStr string) bool {
	if f == nil {
		return false
	}

	candidate := net.ParseIP(ipStr)
	if candidate == nil {
		return false
	}

	f.mu.RLock()
	defer f.mu.RUnlock()

	// Exact addresses are checked before ranges.
	for i := range f.ips {
		if candidate.Equal(f.ips[i]) {
			return true
		}
	}
	for i := range f.networks {
		if f.networks[i].Contains(candidate) {
			return true
		}
	}
	return false
}
// Count returns the number of loaded filter entries: single addresses first,
// CIDR ranges second. A nil receiver reports (0, 0), matching ShouldExclude,
// which also treats a nil filter as empty.
func (f *Filter) Count() (ips int, networks int) {
	if f == nil {
		return 0, 0
	}
	f.mu.RLock()
	defer f.mu.RUnlock()
	return len(f.ips), len(f.networks)
}

View File

@ -0,0 +1,160 @@
package ipfilter
import (
"testing"
)
// TestFilter_New checks which entry lists New accepts and which it rejects.
func TestFilter_New(t *testing.T) {
	type scenario struct {
		name    string
		list    []string
		wantErr bool
	}

	scenarios := []scenario{
		{"empty list", []string{}, false},
		{"single IP", []string{"192.168.1.1"}, false},
		{"single CIDR", []string{"10.0.0.0/8"}, false},
		{"mixed IPs and CIDRs", []string{"192.168.1.1", "10.0.0.0/8", "172.16.0.0/12"}, false},
		{"invalid IP", []string{"999.999.999.999"}, true},
		{"invalid CIDR", []string{"10.0.0.0/33"}, true},
		{"IPv6 address", []string{"2001:db8::1"}, false},
		{"IPv6 CIDR", []string{"2001:db8::/32"}, false},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			built, err := New(sc.list)
			failed := err != nil
			if failed != sc.wantErr {
				t.Errorf("New() error = %v, wantErr %v", err, sc.wantErr)
				return
			}
			if !failed && built == nil {
				t.Error("New() should return non-nil filter on success")
			}
		})
	}
}
// TestFilter_ShouldExclude checks exact-address matches, CIDR matches,
// non-matches and an unparsable input against one mixed IPv4/IPv6 filter.
func TestFilter_ShouldExclude(t *testing.T) {
	entries := []string{
		"192.168.1.1",
		"10.0.0.0/8",
		"172.16.0.0/12",
		"2001:db8::1",
		"fc00::/7",
	}
	filter, err := New(entries)
	if err != nil {
		t.Fatalf("New() error = %v", err)
	}

	type probe struct {
		name string
		ip   string
		want bool
	}
	probes := []probe{
		// Exact IP matches
		{name: "exact match", ip: "192.168.1.1", want: true},
		{name: "exact IPv6 match", ip: "2001:db8::1", want: true},
		// CIDR matches
		{name: "CIDR match 10.0.0.1", ip: "10.0.0.1", want: true},
		{name: "CIDR match 10.255.255.255", ip: "10.255.255.255", want: true},
		{name: "CIDR match 172.16.0.1", ip: "172.16.0.1", want: true},
		{name: "CIDR match 172.31.255.255", ip: "172.31.255.255", want: true},
		{name: "CIDR IPv6 match", ip: "fc00::1", want: true},
		// No matches
		{name: "no match 192.168.2.1", ip: "192.168.2.1", want: false},
		{name: "no match 11.0.0.1", ip: "11.0.0.1", want: false},
		{name: "no match 172.32.0.1", ip: "172.32.0.1", want: false},
		{name: "no match 8.8.8.8", ip: "8.8.8.8", want: false},
		// Invalid IP
		{name: "invalid IP", ip: "invalid", want: false},
	}

	for _, p := range probes {
		t.Run(p.name, func(t *testing.T) {
			got := filter.ShouldExclude(p.ip)
			if got != p.want {
				t.Errorf("ShouldExclude(%q) = %v, want %v", p.ip, got, p.want)
			}
		})
	}
}
// TestFilter_ShouldExclude_NilFilter checks that a nil *Filter excludes nothing.
func TestFilter_ShouldExclude_NilFilter(t *testing.T) {
	var absent *Filter

	excluded := absent.ShouldExclude("192.168.1.1")
	if excluded {
		t.Error("ShouldExclude on nil filter should return false")
	}
}
// TestFilter_Count checks that single addresses and CIDR ranges are counted separately.
func TestFilter_Count(t *testing.T) {
	entries := []string{
		"192.168.1.1",
		"10.0.0.1",
		"10.0.0.0/8",
		"172.16.0.0/12",
	}
	filter, err := New(entries)
	if err != nil {
		t.Fatalf("New() error = %v", err)
	}

	gotIPs, gotNetworks := filter.Count()
	if gotIPs != 2 {
		t.Errorf("Count() ips = %d, want 2", gotIPs)
	}
	if gotNetworks != 2 {
		t.Errorf("Count() networks = %d, want 2", gotNetworks)
	}
}
// TestFilter_EmptyEntries checks that empty strings in the entry list are
// ignored and do not affect matching of the remaining entry.
func TestFilter_EmptyEntries(t *testing.T) {
	filter, err := New([]string{"", "192.168.1.1", ""})
	if err != nil {
		t.Fatalf("New() error = %v", err)
	}

	if gotIPs, _ := filter.Count(); gotIPs != 1 {
		t.Errorf("Count() ips = %d, want 1 (empty entries should be skipped)", gotIPs)
	}

	if listed := filter.ShouldExclude("192.168.1.1"); !listed {
		t.Error("Should exclude 192.168.1.1")
	}
	if unlisted := filter.ShouldExclude("192.168.1.2"); unlisted {
		t.Error("Should not exclude 192.168.1.2")
	}
}

View File

@ -0,0 +1,263 @@
// Package logger provides unified structured logging for the ja4-platform services.
// It merges the component-based logger from sentinel and the prefix/fields-based
// logger from correlator into a single implementation.
package logger
import (
"fmt"
"log"
"os"
"sort"
"strings"
"sync"
)
// LogLevel represents the severity of a log message.
// Levels are ordered: a message is emitted when its level is greater than or
// equal to the logger's minimum level (see ShouldLog).
type LogLevel int
// Severity levels in increasing order; DEBUG is 0 and ERROR is 3.
const (
DEBUG LogLevel = iota
INFO
WARN
ERROR
)
// ParseLogLevel maps a level name to its LogLevel, ignoring letter case.
// "WARNING" is accepted as an alias of "WARN"; any unrecognized name yields INFO.
func ParseLogLevel(level string) LogLevel {
	name := strings.ToUpper(level)

	if name == "DEBUG" {
		return DEBUG
	}
	if name == "WARN" || name == "WARNING" {
		return WARN
	}
	if name == "ERROR" {
		return ERROR
	}
	// "INFO" and every unknown name share the same result.
	return INFO
}
// String returns the upper-case name of the level.
// Values outside the DEBUG..ERROR range are rendered as "INFO".
func (l LogLevel) String() string {
	names := [...]string{
		DEBUG: "DEBUG",
		INFO:  "INFO",
		WARN:  "WARN",
		ERROR: "ERROR",
	}
	if l < DEBUG || l > ERROR {
		return "INFO"
	}
	return names[l]
}
// Logger provides structured prefix+fields-based logging (correlator style).
type Logger struct {
// mu guards reads and writes of prefix, fields and minLevel.
mu sync.RWMutex
// logger is the underlying standard-library logger; WithFields shares it with derived loggers.
logger *log.Logger
// prefix is written in square brackets before the level when non-empty.
prefix string
// fields are appended to every message as key=value pairs, sorted by key.
fields map[string]any
// minLevel is the lowest level that is emitted.
minLevel LogLevel
}
// New returns a Logger that writes to standard error with date, time and
// microsecond timestamps, carries no fields, and emits INFO and above.
func New(prefix string) *Logger {
	sink := log.New(os.Stderr, "", log.LstdFlags|log.Lmicroseconds)

	created := new(Logger)
	created.logger = sink
	created.prefix = prefix
	created.fields = make(map[string]any)
	created.minLevel = INFO
	return created
}
// NewWithLevel returns a Logger like New, but with the minimum level taken
// from the level name (parsed by ParseLogLevel).
func NewWithLevel(prefix string, level string) *Logger {
	sink := log.New(os.Stderr, "", log.LstdFlags|log.Lmicroseconds)

	created := new(Logger)
	created.logger = sink
	created.prefix = prefix
	created.fields = make(map[string]any)
	created.minLevel = ParseLogLevel(level)
	return created
}
// SetLevel replaces the minimum level with the one named by level
// (parsed by ParseLogLevel). The update is made under the write lock.
func (l *Logger) SetLevel(level string) {
	l.mu.Lock()
	l.minLevel = ParseLogLevel(level)
	l.mu.Unlock()
}
// ShouldLog reports whether a message at the given level passes the logger's
// current minimum level.
func (l *Logger) ShouldLog(level LogLevel) bool {
	l.mu.RLock()
	threshold := l.minLevel
	l.mu.RUnlock()
	return level >= threshold
}
// WithFields derives a new Logger that shares the underlying output, prefix and
// minimum level of l and carries l's fields plus the given ones. On a key
// clash the value from fields wins. The receiver is not modified.
func (l *Logger) WithFields(fields map[string]any) *Logger {
	child := &Logger{logger: l.logger}

	// Snapshot the parent's state under the read lock.
	l.mu.RLock()
	child.minLevel = l.minLevel
	child.prefix = l.prefix
	merged := make(map[string]any, len(l.fields))
	for key, value := range l.fields {
		merged[key] = value
	}
	l.mu.RUnlock()

	for key, value := range fields {
		merged[key] = value
	}
	child.fields = merged
	return child
}
// Info writes msg at INFO level unless INFO is below the logger's threshold.
func (l *Logger) Info(msg string) {
	if !l.ShouldLog(INFO) {
		return
	}
	l.emit("INFO", msg)
}
// Infof writes a fmt.Sprintf-formatted message at INFO level. The message is
// only formatted when INFO passes the logger's threshold.
func (l *Logger) Infof(msg string, args ...any) {
	if !l.ShouldLog(INFO) {
		return
	}
	text := fmt.Sprintf(msg, args...)
	l.emit("INFO", text)
}
// Warn writes msg at WARN level unless WARN is below the logger's threshold.
func (l *Logger) Warn(msg string) {
	if !l.ShouldLog(WARN) {
		return
	}
	l.emit("WARN", msg)
}
// Warnf writes a fmt.Sprintf-formatted message at WARN level. The message is
// only formatted when WARN passes the logger's threshold.
func (l *Logger) Warnf(msg string, args ...any) {
	if !l.ShouldLog(WARN) {
		return
	}
	text := fmt.Sprintf(msg, args...)
	l.emit("WARN", text)
}
// Error writes msg at ERROR level. When err is non-nil its text is appended
// to msg after a single space; a nil err logs msg alone.
func (l *Logger) Error(msg string, err error) {
	if !l.ShouldLog(ERROR) {
		return
	}

	text := msg
	if err != nil {
		text = msg + " " + err.Error()
	}
	l.emit("ERROR", text)
}
// Debug writes msg at DEBUG level unless DEBUG is below the logger's threshold.
func (l *Logger) Debug(msg string) {
	if !l.ShouldLog(DEBUG) {
		return
	}
	l.emit("DEBUG", msg)
}
// Debugf writes a fmt.Sprintf-formatted message at DEBUG level. The message is
// only formatted when DEBUG passes the logger's threshold.
func (l *Logger) Debugf(msg string, args ...any) {
	if !l.ShouldLog(DEBUG) {
		return
	}
	text := fmt.Sprintf(msg, args...)
	l.emit("DEBUG", text)
}
// emit formats one log line and hands it to the underlying log.Logger.
//
// The line layout is "[prefix] LEVEL msg key=value ...": the bracketed prefix
// is omitted when empty, and fields are appended in ascending key order so the
// output is deterministic. emit does not check the level; callers do that.
func (l *Logger) emit(level, msg string) {
	// Copy the state we need under the read lock, then format without it.
	l.mu.RLock()
	prefix := l.prefix
	snapshot := make(map[string]any, len(l.fields))
	names := make([]string, 0, len(l.fields))
	for name, value := range l.fields {
		snapshot[name] = value
		names = append(names, name)
	}
	l.mu.RUnlock()

	sort.Strings(names)

	var line strings.Builder
	if prefix != "" {
		line.WriteString("[" + prefix + "] ")
	}
	line.WriteString(level + " " + msg)
	for _, name := range names {
		fmt.Fprintf(&line, " %s=%v", name, snapshot[name])
	}

	l.logger.Print(line.String())
}
// ComponentLogger wraps Logger to satisfy the sentinel-style component-based Logger interface.
// This allows new services to use ja4common while sentinel's existing api.Logger interface
// is still satisfied.
//
// The embedded *Logger's Debug, Info, Warn and Error methods are shadowed by
// ComponentLogger's own methods of the same name, which take a component and
// a details map. The remaining Logger methods (for example SetLevel,
// ShouldLog, Infof) are promoted unchanged.
type ComponentLogger struct {
*Logger
}
// NewComponentLogger builds a ComponentLogger around a prefix-less Logger whose
// minimum level is parsed from level.
func NewComponentLogger(level string) *ComponentLogger {
	base := NewWithLevel("", level)
	return &ComponentLogger{Logger: base}
}
// Log emits a structured log entry for the given component.
//
// level is resolved with ParseLogLevel, so it is case-insensitive, accepts
// "warning" as an alias for "warn", and treats unknown names as INFO. The
// entry carries a "component" field plus every pair in details; details may
// be nil, and a "component" key in details overrides the component argument.
//
// The level check happens before any field map is built, so calls that are
// filtered out (typically debug logging in production) allocate nothing.
func (c *ComponentLogger) Log(component, level, message string, details map[string]string) {
	lvl := ParseLogLevel(level)
	if !c.Logger.ShouldLog(lvl) {
		return
	}

	fields := make(map[string]any, len(details)+1)
	fields["component"] = component
	for k, v := range details {
		fields[k] = v
	}

	// The level was already checked above, so write directly.
	c.Logger.WithFields(fields).emit(lvl.String(), message)
}
// Debug logs a debug entry for the given component.
// It forwards to Log with level "debug"; details may be nil.
// This method shadows the embedded Logger.Debug(msg).
func (c *ComponentLogger) Debug(component, message string, details map[string]string) {
c.Log(component, "debug", message, details)
}
// Info logs an info entry for the given component.
// It forwards to Log with level "info"; details may be nil.
// This method shadows the embedded Logger.Info(msg).
func (c *ComponentLogger) Info(component, message string, details map[string]string) {
c.Log(component, "info", message, details)
}
// Warn logs a warning entry for the given component.
// It forwards to Log with level "warn"; details may be nil.
// This method shadows the embedded Logger.Warn(msg).
func (c *ComponentLogger) Warn(component, message string, details map[string]string) {
c.Log(component, "warn", message, details)
}
// Error logs an error entry for the given component.
// It forwards to Log with level "error"; details may be nil. Unlike the
// embedded Logger.Error(msg, err), which it shadows, it takes no error value;
// error text has to be passed in message or details.
func (c *ComponentLogger) Error(component, message string, details map[string]string) {
c.Log(component, "error", message, details)
}

View File

@ -0,0 +1,139 @@
package logger
import (
"strings"
"testing"
)
// TestParseLogLevel checks that level names are matched case-insensitively,
// that "warning" aliases WARN, and that unknown or empty names yield INFO.
func TestParseLogLevel(t *testing.T) {
	cases := []struct {
		input string
		want  LogLevel
	}{
		{"debug", DEBUG},
		{"DEBUG", DEBUG},
		{"info", INFO},
		{"INFO", INFO},
		{"warn", WARN},
		{"WARN", WARN},
		{"warning", WARN},
		{"WARNING", WARN},
		{"error", ERROR},
		{"ERROR", ERROR},
		{"invalid", INFO},
		{"", INFO},
	}

	for _, tc := range cases {
		if got := ParseLogLevel(tc.input); got != tc.want {
			t.Errorf("ParseLogLevel(%q) = %v, want %v", tc.input, got, tc.want)
		}
	}
}
// TestLogger_LevelFiltering checks ShouldLog at and just below each threshold.
func TestLogger_LevelFiltering(t *testing.T) {
	cases := []struct {
		loggerLevel string
		logLevel    LogLevel
		shouldLog   bool
	}{
		{"debug", DEBUG, true},
		{"debug", INFO, true},
		{"info", DEBUG, false},
		{"info", INFO, true},
		{"warn", INFO, false},
		{"warn", WARN, true},
		{"error", WARN, false},
		{"error", ERROR, true},
	}

	for _, tc := range cases {
		lg := NewWithLevel("test", tc.loggerLevel)
		if got := lg.ShouldLog(tc.logLevel); got != tc.shouldLog {
			t.Errorf("level=%s ShouldLog(%v)=%v want %v", tc.loggerLevel, tc.logLevel, got, tc.shouldLog)
		}
	}
}
// TestLogger_WithFields checks that WithFields returns a distinct logger
// holding exactly the supplied fields when the parent has none.
func TestLogger_WithFields(t *testing.T) {
	parent := New("test")
	child := parent.WithFields(map[string]any{"key": "value", "n": 42})

	if child == parent {
		t.Error("WithFields should return a new Logger")
	}
	if n := len(child.fields); n != 2 {
		t.Errorf("expected 2 fields, got %d", n)
	}
}
// TestLogger_SetLevel checks the default INFO threshold and that SetLevel
// replaces it.
func TestLogger_SetLevel(t *testing.T) {
	lg := New("test")
	if got := lg.minLevel; got != INFO {
		t.Errorf("default level should be INFO, got %v", got)
	}

	lg.SetLevel("debug")
	if got := lg.minLevel; got != DEBUG {
		t.Errorf("level after SetLevel(debug) should be DEBUG, got %v", got)
	}
}
// TestComponentLogger_Interface is a smoke test: it calls every
// component-style method, with and without details, and passes as long as
// none of them panics. Output goes to the logger's default writer.
func TestComponentLogger_Interface(t *testing.T) {
	logger := NewComponentLogger("debug")

	logger.Debug("component", "debug msg", nil)
	logger.Info("component", "info msg", map[string]string{"key": "val"})
	logger.Warn("component", "warn msg", nil)
	logger.Error("component", "error msg", map[string]string{"err": "test"})
	logger.Log("component", "info", "log msg", nil)
}
// TestComponentLogger_LevelFiltering checks that a ComponentLogger created at
// "warn" drops DEBUG and INFO but lets WARN through.
func TestComponentLogger_LevelFiltering(t *testing.T) {
	base := NewComponentLogger("warn").Logger

	if base.ShouldLog(DEBUG) {
		t.Error("DEBUG should be filtered at warn level")
	}
	if base.ShouldLog(INFO) {
		t.Error("INFO should be filtered at warn level")
	}
	if !base.ShouldLog(WARN) {
		t.Error("WARN should pass at warn level")
	}
}
// TestLogger_LogLevelString checks the canonical name of every declared level.
func TestLogger_LogLevelString(t *testing.T) {
	cases := []struct {
		level LogLevel
		want  string
	}{
		{DEBUG, "DEBUG"},
		{INFO, "INFO"},
		{WARN, "WARN"},
		{ERROR, "ERROR"},
	}

	for _, tc := range cases {
		got := tc.level.String()
		if got != tc.want {
			t.Errorf("%v.String() = %q, want %q", tc.level, got, tc.want)
		}
	}
}
// TestLogger_EmitContainsLevel redirects the logger's output into a buffer and
// checks that each emitted line contains its level name.
func TestLogger_EmitContainsLevel(t *testing.T) {
	var captured strings.Builder

	lg := New("myservice")
	lg.logger.SetOutput(&captured)
	lg.SetLevel("debug")

	lg.Info("hello from info")
	if out := captured.String(); !strings.Contains(out, "INFO") {
		t.Errorf("expected INFO in output, got: %s", out)
	}

	captured.Reset()
	lg.Debug("hello from debug")
	if out := captured.String(); !strings.Contains(out, "DEBUG") {
		t.Errorf("expected DEBUG in output, got: %s", out)
	}
}

View File

@ -0,0 +1,45 @@
// Package shutdown provides graceful shutdown handling for services.
package shutdown
import (
"context"
"os"
"os/signal"
"syscall"
)
// Hook is a cleanup function called during shutdown.
type Hook struct {
// Name identifies the hook in shutdown log messages.
Name string
// Fn performs the cleanup; a non-nil error is logged by Handle.
Fn func() error
}
// simpleLogger is the minimal interface required from a logger.
// Handle uses Info for progress messages and Error for failed hooks.
type simpleLogger interface {
// Info logs an informational message.
Info(string)
// Error logs a message together with the error that caused it.
Error(string, error)
}
// Handle blocks until SIGTERM or SIGINT is received, or until ctx is
// cancelled, whichever happens first. It then calls cancel and runs hooks in
// the order given.
//
// A hook that returns an error is logged and does not prevent the remaining
// hooks from running. A hook whose Fn is nil is logged and skipped, so that
// one misconfigured hook cannot crash the shutdown sequence and leave later
// hooks unrun. A nil cancel is tolerated for the same reason.
func Handle(ctx context.Context, cancel context.CancelFunc, hooks []Hook, logger simpleLogger) {
	// Buffered by one: the signal package does not block when sending.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGTERM, syscall.SIGINT)
	defer signal.Stop(quit)

	select {
	case sig := <-quit:
		logger.Info("shutdown signal received: " + sig.String())
	case <-ctx.Done():
		logger.Info("context cancelled, shutting down")
	}

	if cancel != nil {
		cancel()
	}

	for _, h := range hooks {
		if h.Fn == nil {
			logger.Info("skipping shutdown hook with no function: " + h.Name)
			continue
		}
		logger.Info("running shutdown hook: " + h.Name)
		if err := h.Fn(); err != nil {
			logger.Error("shutdown hook "+h.Name+" failed", err)
		}
	}
}

View File

@ -0,0 +1,133 @@
package shutdown
import (
"context"
"errors"
"sync/atomic"
"syscall"
"testing"
"time"
)
// mockLogger records the messages passed to Info and Error so tests can
// inspect what Handle logged. It does no locking of its own.
type mockLogger struct {
infoMsgs []string
errorMsgs []string
}
// Info appends msg to infoMsgs.
func (m *mockLogger) Info(msg string) { m.infoMsgs = append(m.infoMsgs, msg) }
// Error appends msg to errorMsgs; the error value itself is discarded.
func (m *mockLogger) Error(msg string, _ error) { m.errorMsgs = append(m.errorMsgs, msg) }
// TestHandle_RunsHooks sends SIGTERM to the test process and checks that
// Handle returns and runs the registered hook.
func TestHandle_RunsHooks(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	// Handle normally calls cancel itself; deferring it as well releases the
	// Handle goroutine if the test bails out on the timeout path.
	defer cancel()

	logger := &mockLogger{}
	var hookCalled int32
	hooks := []Hook{
		{
			Name: "test-hook",
			Fn: func() error {
				atomic.StoreInt32(&hookCalled, 1)
				return nil
			},
		},
	}

	done := make(chan struct{})
	go func() {
		Handle(ctx, cancel, hooks, logger)
		close(done)
	}()

	// Give Handle time to register its signal handler before signalling.
	time.Sleep(50 * time.Millisecond)
	if err := syscall.Kill(syscall.Getpid(), syscall.SIGTERM); err != nil {
		t.Fatalf("failed to send SIGTERM to self: %v", err)
	}

	select {
	case <-done:
	case <-time.After(3 * time.Second):
		t.Fatal("Handle did not return within timeout")
	}

	if atomic.LoadInt32(&hookCalled) != 1 {
		t.Error("hook was not called")
	}
}
// TestHandle_HookError_ContinuesOtherHooks checks that a failing hook is
// logged as an error and that hooks after it still run.
func TestHandle_HookError_ContinuesOtherHooks(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	// Handle normally calls cancel itself; deferring it as well releases the
	// Handle goroutine if the test bails out on the timeout path.
	defer cancel()

	logger := &mockLogger{}
	var secondCalled int32
	hooks := []Hook{
		{
			Name: "failing-hook",
			Fn:   func() error { return errors.New("hook error") },
		},
		{
			Name: "second-hook",
			Fn: func() error {
				atomic.StoreInt32(&secondCalled, 1)
				return nil
			},
		},
	}

	done := make(chan struct{})
	go func() {
		Handle(ctx, cancel, hooks, logger)
		close(done)
	}()

	// Give Handle time to register its signal handler before signalling.
	time.Sleep(50 * time.Millisecond)
	if err := syscall.Kill(syscall.Getpid(), syscall.SIGTERM); err != nil {
		t.Fatalf("failed to send SIGTERM to self: %v", err)
	}

	select {
	case <-done:
	case <-time.After(3 * time.Second):
		t.Fatal("Handle did not return within timeout")
	}

	if atomic.LoadInt32(&secondCalled) != 1 {
		t.Error("second hook should still run after first hook error")
	}
	if len(logger.errorMsgs) == 0 {
		t.Error("error should be logged for failing hook")
	}
}
// TestHandle_ContextCancel checks that cancelling the context, with no signal
// involved, makes Handle return and run its hooks.
func TestHandle_ContextCancel(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	recorder := &mockLogger{}

	var ran int32
	markRan := func() error {
		atomic.StoreInt32(&ran, 1)
		return nil
	}
	hooks := []Hook{{Name: "ctx-hook", Fn: markRan}}

	finished := make(chan struct{})
	go func() {
		Handle(ctx, cancel, hooks, recorder)
		close(finished)
	}()

	// Trigger shutdown through the context rather than a signal.
	time.Sleep(50 * time.Millisecond)
	cancel()

	select {
	case <-finished:
	case <-time.After(3 * time.Second):
		t.Fatal("Handle did not return within timeout after context cancel")
	}

	if atomic.LoadInt32(&ran) != 1 {
		t.Error("hook should run on context cancel")
	}
}

View File

@ -0,0 +1,8 @@
# Test image for the ja4_common Python package: installs the package and
# runs its pytest suite.
FROM python:3.11-slim
WORKDIR /app
# Package sources, tests and build metadata are copied into /app.
COPY ja4_common/ ./ja4_common/
COPY tests/ ./tests/
COPY pyproject.toml .
# Editable install of the package, followed by the test tooling.
# NOTE(review): pydantic-settings and clickhouse-connect look like they are
# already declared as package dependencies in pyproject.toml; confirm whether
# the explicit install here is still needed.
RUN pip install --no-cache-dir -e . && \
pip install --no-cache-dir pytest pytest-mock pydantic-settings clickhouse-connect
# Default command: run the whole test suite verbosely.
CMD ["pytest", "tests/", "-v"]

View File

@ -0,0 +1,2 @@
"""JA4 Common — shared utilities for the JA4 security suite."""
# Package version. NOTE(review): pyproject.toml declares the same "0.1.0";
# keep the two in sync when bumping.
__version__ = "0.1.0"

View File

@ -0,0 +1,55 @@
"""Unified singleton ClickHouse client for the JA4 security suite."""
import clickhouse_connect
from typing import Optional
from .settings import settings
class ClickHouseClient:
    """Lazy, self-healing wrapper around a ``clickhouse_connect`` client.

    The underlying driver client is created on first use from the module-level
    ``settings`` and is re-created whenever a health check fails. ``query``,
    ``command`` and ``insert`` delegate to the driver after ensuring a live
    connection.
    """

    def __init__(self):
        # Created lazily by connect(); None means "not connected".
        self._client: Optional[clickhouse_connect.driver.client.Client] = None

    def connect(self) -> "clickhouse_connect.driver.client.Client":
        """Return a live driver client, creating or replacing it if needed.

        The current client is reused when it answers a ping. Otherwise the
        stale client (if any) is closed on a best-effort basis and a new one
        is built from ``settings``.

        Raises:
            Whatever ``clickhouse_connect.get_client`` raises when the server
            cannot be reached or the credentials are rejected.
        """
        if self._client is not None and self._ping():
            return self._client

        self._discard_client()
        self._client = clickhouse_connect.get_client(
            host=settings.CLICKHOUSE_HOST,
            port=settings.CLICKHOUSE_PORT,
            database=settings.CLICKHOUSE_DB,
            user=settings.CLICKHOUSE_USER,
            password=settings.CLICKHOUSE_PASSWORD,
            connect_timeout=10,
        )
        return self._client

    def _ping(self) -> bool:
        """Return True only when a client exists and its ping succeeds.

        The driver's ``ping()`` reports an unreachable server by returning
        ``False`` rather than raising, so its result must be honored; an
        exception is treated as a failed ping as well.
        """
        if self._client is None:
            return False
        try:
            return bool(self._client.ping())
        except Exception:
            # Best effort: any failure means the connection is unusable.
            return False

    def _discard_client(self) -> None:
        """Close and forget the current client, ignoring close errors."""
        stale, self._client = self._client, None
        if stale is None:
            return
        try:
            stale.close()
        except Exception:
            # The connection is already considered broken; nothing to recover.
            pass

    def query(self, query: str, params: Optional[dict] = None):
        """Run a SELECT-style query and return the driver's result object."""
        return self.connect().query(query, params)

    def command(self, query: str, params: Optional[dict] = None):
        """Run a statement through the driver's ``command`` and return its result."""
        return self.connect().command(query, parameters=params)

    def insert(self, table: str, data, column_names=None):
        """Insert ``data`` into ``table`` through the driver's ``insert``."""
        return self.connect().insert(table, data, column_names=column_names)

    def close(self):
        """Close the driver client, if any. Safe to call repeatedly.

        The internal reference is cleared even if the driver's ``close()``
        raises, so the next ``connect()`` always starts from a clean state.
        """
        if self._client is None:
            return
        try:
            self._client.close()
        finally:
            self._client = None
# Process-wide wrapper instance, created on the first get_client() call.
_client: Optional[ClickHouseClient] = None


def get_client() -> ClickHouseClient:
    """Return the shared ClickHouseClient, creating it on first use.

    Only the wrapper is created here; no connection is opened until one of
    its methods needs it.
    """
    global _client
    if _client is not None:
        return _client
    _client = ClickHouseClient()
    return _client

View File

@ -0,0 +1,16 @@
"""Unified ClickHouse settings using pydantic-settings."""
from pydantic_settings import BaseSettings
class ClickHouseSettings(BaseSettings):
    """ClickHouse connection parameters.

    Each field has a default and can be overridden by an environment variable
    of the same name (names are case-sensitive) or by an entry in a local
    ``.env`` file.
    """

    # Server location.
    CLICKHOUSE_HOST: str = "clickhouse"
    CLICKHOUSE_PORT: int = 8123

    # Target database.
    CLICKHOUSE_DB: str = "mabase_prod"

    # Credentials; the password defaults to empty.
    CLICKHOUSE_USER: str = "admin"
    CLICKHOUSE_PASSWORD: str = ""

    model_config = {
        "env_file": ".env",
        "case_sensitive": True,
    }
# Module-level singleton, built once when this module is imported, so it
# reflects the environment at import time. Tests that need different values
# should construct their own ClickHouseSettings() instead of mutating this one.
settings = ClickHouseSettings()

View File

@ -0,0 +1,17 @@
# Build configuration for the ja4-common shared Python package.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
# Distribution metadata and runtime dependencies.
[project]
name = "ja4-common"
version = "0.1.0"
description = "Shared utilities for JA4 security suite"
requires-python = ">=3.11"
dependencies = [
"clickhouse-connect>=0.8.0",
"pydantic-settings>=2.1.0",
]
# Package discovery: only ja4_common and its subpackages are included, so
# sibling directories are left out of the distribution.
[tool.setuptools.packages.find]
where = ["."]
include = ["ja4_common*"]

View File

@ -0,0 +1,110 @@
from unittest.mock import patch, MagicMock
from ja4_common.clickhouse import ClickHouseClient, get_client
import ja4_common.clickhouse as ch_module
def test_get_client_singleton():
    """get_client() hands back one shared wrapper instance."""
    ch_module._client = None  # start from an empty module-level singleton
    with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as factory:
        driver = MagicMock()
        driver.ping.return_value = True
        factory.return_value = driver
        first = get_client()
        second = get_client()
    assert first is second
def test_client_reconnects_on_ping_fail():
    """connect() builds a new driver client when the current one fails its ping."""
    wrapper = ClickHouseClient()
    with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as factory:
        stale = MagicMock()
        stale.ping.side_effect = Exception("connection lost")
        factory.return_value = stale
        # Pretend a connection already exists but has gone bad.
        wrapper._client = stale
        wrapper.connect()
        assert factory.call_count >= 1
def test_get_client_returns_same_instance_on_second_call():
    """get_client() is a singleton and its driver connection is created once.

    get_client() only builds the wrapper; the driver factory is invoked by
    connect(). Connecting through both references must therefore hit the
    factory exactly once, because the second connect() reuses the healthy
    client.
    """
    ch_module._client = None
    try:
        with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as mock_gc:
            mock_inner = MagicMock()
            mock_inner.ping.return_value = True
            mock_gc.return_value = mock_inner
            c1 = get_client()
            c2 = get_client()
            assert c1 is c2
            # No driver client exists until something connects.
            assert mock_gc.call_count == 0
            c1.connect()
            c2.connect()
            assert mock_gc.call_count == 1
    finally:
        # Do not leak a mock-backed singleton into other tests.
        ch_module._client = None
def test_client_query_delegates_to_inner():
    """query() forwards to the driver client and returns its result unchanged."""
    wrapper = ClickHouseClient()
    with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as factory:
        driver = MagicMock()
        driver.ping.return_value = True
        driver.query.return_value = "result"
        factory.return_value = driver
        outcome = wrapper.query("SELECT 1")
    assert outcome == "result"
    driver.query.assert_called_once_with("SELECT 1", None)
def test_client_query_with_params():
    """query() hands the parameter dict through to the driver client."""
    wrapper = ClickHouseClient()
    bound = {"val": 42}
    with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as factory:
        driver = MagicMock()
        driver.ping.return_value = True
        factory.return_value = driver
        wrapper.query("SELECT %(val)s", bound)
    driver.query.assert_called_once_with("SELECT %(val)s", {"val": 42})
def test_client_close_sets_client_to_none():
    """close() drops the internal driver reference after a connection was made."""
    wrapper = ClickHouseClient()
    with patch("ja4_common.clickhouse.clickhouse_connect.get_client") as factory:
        driver = MagicMock()
        driver.ping.return_value = True
        factory.return_value = driver
        wrapper.connect()
        assert wrapper._client is not None
        wrapper.close()
        assert wrapper._client is None
def test_client_close_when_already_none():
    """close() on a wrapper that never connected must not raise."""
    wrapper = ClickHouseClient()
    wrapper._client = None  # make the "no connection" state explicit
    wrapper.close()
def test_ping_returns_false_on_exception():
    """_ping() reports False when the driver's ping raises."""
    wrapper = ClickHouseClient()
    failing = MagicMock()
    failing.ping.side_effect = Exception("conn reset")
    wrapper._client = failing
    assert wrapper._ping() is False
def test_ping_returns_true_on_success():
    """_ping() reports True when the driver's ping succeeds."""
    wrapper = ClickHouseClient()
    healthy = MagicMock()
    healthy.ping.return_value = True
    wrapper._client = healthy
    assert wrapper._ping() is True
def test_ping_returns_false_when_no_client():
    """_ping() reports False when there is no driver client at all."""
    wrapper = ClickHouseClient()
    wrapper._client = None
    assert wrapper._ping() is False

View File

@ -0,0 +1,64 @@
import os
import pytest
from ja4_common.settings import ClickHouseSettings
def test_default_settings(monkeypatch):
    """With no overrides, every field carries its declared default.

    Ambient CLICKHOUSE_* variables are removed and the .env file is disabled
    so the result does not depend on the machine running the tests.
    """
    for name in (
        "CLICKHOUSE_HOST",
        "CLICKHOUSE_PORT",
        "CLICKHOUSE_DB",
        "CLICKHOUSE_USER",
        "CLICKHOUSE_PASSWORD",
    ):
        monkeypatch.delenv(name, raising=False)
    s = ClickHouseSettings(_env_file=None)
    assert s.CLICKHOUSE_HOST == "clickhouse"
    assert s.CLICKHOUSE_PORT == 8123
    assert s.CLICKHOUSE_DB == "mabase_prod"
    assert s.CLICKHOUSE_USER == "admin"
    assert s.CLICKHOUSE_PASSWORD == ""
def test_settings_from_env(monkeypatch):
    """Environment variables override host, port and database defaults."""
    overrides = {
        "CLICKHOUSE_HOST": "myhost",
        "CLICKHOUSE_PORT": "9000",
        "CLICKHOUSE_DB": "testdb",
    }
    for name, value in overrides.items():
        monkeypatch.setenv(name, value)
    loaded = ClickHouseSettings()
    assert loaded.CLICKHOUSE_HOST == "myhost"
    assert loaded.CLICKHOUSE_PORT == 9000
    assert loaded.CLICKHOUSE_DB == "testdb"
def test_settings_password_default_empty(monkeypatch):
    """The password defaults to an empty string when nothing overrides it.

    The ambient variable is removed and the .env file is disabled so a
    configured developer or CI environment cannot change the outcome.
    """
    monkeypatch.delenv("CLICKHOUSE_PASSWORD", raising=False)
    s = ClickHouseSettings(_env_file=None)
    assert s.CLICKHOUSE_PASSWORD == ""
def test_settings_user_default(monkeypatch):
    """The user defaults to "admin" when nothing overrides it.

    The ambient variable is removed and the .env file is disabled so a
    configured developer or CI environment cannot change the outcome.
    """
    monkeypatch.delenv("CLICKHOUSE_USER", raising=False)
    s = ClickHouseSettings(_env_file=None)
    assert s.CLICKHOUSE_USER == "admin"
def test_settings_password_from_env(monkeypatch):
    """CLICKHOUSE_PASSWORD from the environment replaces the empty default."""
    monkeypatch.setenv("CLICKHOUSE_PASSWORD", "secret")
    loaded = ClickHouseSettings()
    assert loaded.CLICKHOUSE_PASSWORD == "secret"
def test_settings_port_is_int():
    """The port field is exposed as an int."""
    port = ClickHouseSettings().CLICKHOUSE_PORT
    assert isinstance(port, int)
def test_settings_port_from_env_string(monkeypatch):
    """A port supplied as an environment string is coerced to int."""
    monkeypatch.setenv("CLICKHOUSE_PORT", "9100")
    loaded = ClickHouseSettings()
    assert loaded.CLICKHOUSE_PORT == 9100
def test_settings_all_fields_overridable(monkeypatch):
    """Every settings field can be overridden through the environment."""
    env = {
        "CLICKHOUSE_HOST": "h1",
        "CLICKHOUSE_PORT": "1234",
        "CLICKHOUSE_DB": "mydb",
        "CLICKHOUSE_USER": "myuser",
        "CLICKHOUSE_PASSWORD": "mypass",
    }
    for name, value in env.items():
        monkeypatch.setenv(name, value)
    loaded = ClickHouseSettings()
    assert loaded.CLICKHOUSE_HOST == "h1"
    assert loaded.CLICKHOUSE_PORT == 1234
    assert loaded.CLICKHOUSE_DB == "mydb"
    assert loaded.CLICKHOUSE_USER == "myuser"
    assert loaded.CLICKHOUSE_PASSWORD == "mypass"