From 9f3e0621e5c8980aed90b81852db4a4e1ff87b5a Mon Sep 17 00:00:00 2001 From: toto Date: Tue, 7 Apr 2026 19:10:35 +0200 Subject: [PATCH] feat: split ClickHouse into dual configurable databases (ja4_logs / ja4_processing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architecture: - ja4_logs: raw log ingestion (http_logs_raw, http_logs, mv_http_logs) - ja4_processing: analytics, aggregation, ML, dictionaries, audit Configuration (env vars): - CLICKHOUSE_DB_LOGS (default: ja4_logs) - CLICKHOUSE_DB_PROCESSING (default: ja4_processing) Changes: - SQL migrations (10 files): all mabase_prod refs → ja4_logs or ja4_processing with correct cross-database references (MVs, views, dicts) - deploy_schema.sh: substitutes DB names from env vars at deploy time - Python shared settings: added CLICKHOUSE_DB_LOGS + CLICKHOUSE_DB_PROCESSING - Dashboard routes (19 files): replaced ~80 hardcoded mabase_prod refs with settings.CLICKHOUSE_DB_LOGS / settings.CLICKHOUSE_DB_PROCESSING - Bot-detector: DB → CLICKHOUSE_DB_PROCESSING, fetch_rules.py configurable - Correlator: DSN example updated to ja4_logs - Docker-compose + .env files: new env vars with defaults - All documentation updated (14 markdown files) All tests pass: sentinel 10/10, correlator 67.1%, bot-detector 11, dashboard 20, ja4_common 18 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 2 +- docs/architecture.md | 2 +- docs/database/migrations.md | 106 ++++++++-------- docs/database/schema.md | 2 +- docs/services/bot-detector.md | 2 +- docs/services/dashboard.md | 4 +- docs/shared/python-ja4common.md | 4 +- services/bot-detector/.env.example | 4 +- .../CLICKHOUSE_FEATURES_DIAGNOSTIC.md | 14 +-- services/bot-detector/DOCUMENTATION.md | 4 +- services/bot-detector/IMPROVEMENTS.md | 4 +- .../bot-detector/anubis/deploy_schema.sql | 72 +++++------ services/bot-detector/anubis/fetch_rules.py | 31 ++--- services/bot-detector/anubis/mv_http_logs.sql | 112 
++++++++--------- .../anubis/view_ai_features_anubis.sql | 108 ++++++++-------- .../bot-detector/bot_detector/bot_detector.py | 3 +- .../bot_detector/tests/test_detector.py | 6 +- services/bot-detector/deploy_views.sql | 92 +++++++------- services/bot-detector/docker-compose.yml | 10 +- services/correlator/.env.example | 2 +- services/correlator/README.md | 18 +-- services/correlator/architecture.yml | 4 +- services/correlator/config.example.yml | 4 +- services/correlator/idees/views.md | 62 +++++----- services/correlator/sql/bots.sql | 8 +- services/correlator/sql/init.sql | 37 +++--- services/correlator/sql/tables.sql | 8 +- services/dashboard/.env.example | 4 +- .../dashboard/.github/copilot-instructions.md | 6 +- services/dashboard/README.md | 20 +-- services/dashboard/backend/config.py | 4 +- services/dashboard/backend/main.py | 2 +- services/dashboard/docker-compose.yaml | 4 +- shared/clickhouse/00_database.sql | 5 +- shared/clickhouse/01_raw_tables.sql | 2 +- shared/clickhouse/02_dictionaries.sql | 10 +- shared/clickhouse/03_anubis_tables.sql | 40 +++--- shared/clickhouse/04_mv_http_logs.sql | 116 +++++++++--------- shared/clickhouse/05_aggregation_tables.sql | 32 ++--- shared/clickhouse/06_ml_tables.sql | 8 +- shared/clickhouse/07_ai_features_view.sql | 108 ++++++++-------- shared/clickhouse/08_users.sql | 22 ++-- shared/clickhouse/09_audit_table.sql | 4 +- shared/clickhouse/deploy_schema.sh | 67 ++++++++++ .../python/ja4_common/ja4_common/settings.py | 4 +- .../python/ja4_common/tests/test_settings.py | 4 +- 46 files changed, 638 insertions(+), 549 deletions(-) create mode 100755 shared/clickhouse/deploy_schema.sh diff --git a/README.md b/README.md index f06598b..2232c75 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ ▼ ┌──────────────────┐ │ ClickHouse │ - │ mabase_prod │ + │ ja4_processing │ │ (all tables) │ └────────┬─────────┘ │ SELECT diff --git a/docs/architecture.md b/docs/architecture.md index cc97d56..7e2da4d 100644 --- 
a/docs/architecture.md +++ b/docs/architecture.md @@ -23,7 +23,7 @@ The ja4-platform is a security pipeline that captures live network traffic, gene ▼ ┌─────────────────────┐ │ ClickHouse │ - │ mabase_prod │ + │ ja4_processing │ │ │ │ http_logs_raw │ │ ──(MV)──▶ http_logs│ diff --git a/docs/database/migrations.md b/docs/database/migrations.md index 3342867..5e6c9de 100644 --- a/docs/database/migrations.md +++ b/docs/database/migrations.md @@ -6,7 +6,7 @@ The ClickHouse schema for ja4-platform is managed through numbered SQL migration | File | Purpose | |------|---------| -| `00_database.sql` | Creates the `mabase_prod` database | +| `00_database.sql` | Creates the `ja4_processing` database | | `01_raw_tables.sql` | Creates `http_logs_raw` ingest table (MergeTree, 1-day TTL) | | `02_dictionaries.sql` | Creates ASN geo dictionary (`dict_iplocate_asn`), bot IP/JA4 reference tables, `ref_bot_networks` | | `03_anubis_tables.sql` | Creates Anubis crawler rule tables (`anubis_ua_rules`, `anubis_ip_rules`, `anubis_asn_rules`, `anubis_country_rules`) and their dictionaries (`dict_anubis_ua`, `dict_anubis_ip`, `dict_anubis_asn`, `dict_anubis_country`) | @@ -87,21 +87,21 @@ After applying all migrations, run these queries to verify each migration was su ### 00 — Database ```sql -SHOW DATABASES LIKE 'mabase_prod'; --- Expected: mabase_prod +SHOW DATABASES LIKE 'ja4_processing'; +-- Expected: ja4_processing ``` ### 01 — Raw Tables ```sql -EXISTS mabase_prod.http_logs_raw; +EXISTS ja4_logs.http_logs_raw; -- Expected: 1 ``` ### 02 — Dictionaries ```sql -SELECT dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', +SELECT dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', toIPv6(toIPv4('8.8.8.8')), 'MISSING'); -- Expected: US (if CSV loaded) or MISSING ``` @@ -109,43 +109,43 @@ SELECT dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', ### 03 — Anubis Tables ```sql -EXISTS mabase_prod.anubis_ua_rules; -EXISTS mabase_prod.anubis_ip_rules; 
-EXISTS mabase_prod.anubis_asn_rules; -EXISTS mabase_prod.anubis_country_rules; +EXISTS ja4_processing.anubis_ua_rules; +EXISTS ja4_processing.anubis_ip_rules; +EXISTS ja4_processing.anubis_asn_rules; +EXISTS ja4_processing.anubis_country_rules; -- Expected: 1 for each ``` ### 04 — MV + http_logs ```sql -EXISTS mabase_prod.http_logs; -SELECT name FROM system.tables WHERE database = 'mabase_prod' AND name = 'mv_http_logs'; +EXISTS ja4_logs.http_logs; +SELECT name FROM system.tables WHERE database = 'ja4_logs' AND name = 'mv_http_logs'; -- Expected: mv_http_logs ``` ### 05 — Aggregation Tables ```sql -EXISTS mabase_prod.agg_host_ip_ja4_1h; -EXISTS mabase_prod.agg_header_fingerprint_1h; -SELECT name FROM system.dictionaries WHERE database = 'mabase_prod' AND name = 'dict_bot_ip'; +EXISTS ja4_processing.agg_host_ip_ja4_1h; +EXISTS ja4_processing.agg_header_fingerprint_1h; +SELECT name FROM system.dictionaries WHERE database = 'ja4_processing' AND name = 'dict_bot_ip'; -- Expected: dict_bot_ip ``` ### 06 — ML Tables ```sql -EXISTS mabase_prod.ml_detected_anomalies; -EXISTS mabase_prod.ml_all_scores; -SELECT name FROM system.tables WHERE database = 'mabase_prod' AND name LIKE 'view_ip%'; +EXISTS ja4_processing.ml_detected_anomalies; +EXISTS ja4_processing.ml_all_scores; +SELECT name FROM system.tables WHERE database = 'ja4_processing' AND name LIKE 'view_ip%'; -- Expected: view_ip_recurrence ``` ### 07 — AI Features View ```sql -SELECT name FROM system.tables WHERE database = 'mabase_prod' AND name = 'view_ai_features_1h'; +SELECT name FROM system.tables WHERE database = 'ja4_processing' AND name = 'view_ai_features_1h'; -- Expected: view_ai_features_1h ``` @@ -153,7 +153,7 @@ SELECT name FROM system.tables WHERE database = 'mabase_prod' AND name = 'view_a ```sql SHOW GRANTS FOR data_writer; --- Expected: GRANT INSERT, SELECT ON mabase_prod.http_logs_raw TO data_writer +-- Expected: GRANT INSERT, SELECT ON ja4_logs.http_logs_raw TO data_writer SHOW GRANTS FOR analyst; -- 
Expected: GRANT SELECT ON multiple tables ``` @@ -161,7 +161,7 @@ SHOW GRANTS FOR analyst; ### 09 — Audit Table ```sql -EXISTS mabase_prod.audit_logs; +EXISTS ja4_processing.audit_logs; -- Expected: 1 ``` @@ -171,7 +171,7 @@ EXISTS mabase_prod.audit_logs; SELECT count() AS total_tables FROM system.tables -WHERE database = 'mabase_prod' +WHERE database IN ('ja4_logs', 'ja4_processing') AND name IN ( 'http_logs_raw', 'http_logs', 'agg_host_ip_ja4_1h', 'agg_header_fingerprint_1h', 'ml_detected_anomalies', 'ml_all_scores', 'ref_bot_networks', @@ -187,64 +187,64 @@ WHERE database = 'mabase_prod' ClickHouse does not support transactional DDL. To roll back a migration: -1. **Tables**: `DROP TABLE IF EXISTS mabase_prod.` -2. **Materialized Views**: `DROP VIEW IF EXISTS mabase_prod.` (drop MV before its target table) -3. **Dictionaries**: `DROP DICTIONARY IF EXISTS mabase_prod.` -4. **Views**: `DROP VIEW IF EXISTS mabase_prod.` +1. **Tables**: `DROP TABLE IF EXISTS ja4_processing.` +2. **Materialized Views**: `DROP VIEW IF EXISTS ja4_processing.` (drop MV before its target table) +3. **Dictionaries**: `DROP DICTIONARY IF EXISTS ja4_processing.` +4. **Views**: `DROP VIEW IF EXISTS ja4_processing.` 5.
**Users**: `DROP USER IF EXISTS ` ### Rollback Order (Reverse of Apply) ```sql -- 09: Audit -DROP TABLE IF EXISTS mabase_prod.audit_logs; +DROP TABLE IF EXISTS ja4_processing.audit_logs; -- 08: Users DROP USER IF EXISTS data_writer; DROP USER IF EXISTS analyst; -- 07: AI Features View -DROP VIEW IF EXISTS mabase_prod.view_ai_features_1h; +DROP VIEW IF EXISTS ja4_processing.view_ai_features_1h; -- 06: ML Tables -DROP VIEW IF EXISTS mabase_prod.view_ip_recurrence; -DROP TABLE IF EXISTS mabase_prod.ml_all_scores; -DROP TABLE IF EXISTS mabase_prod.ml_detected_anomalies; +DROP VIEW IF EXISTS ja4_processing.view_ip_recurrence; +DROP TABLE IF EXISTS ja4_processing.ml_all_scores; +DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies; -- 05: Aggregation -DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h; -DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h; -DROP TABLE IF EXISTS mabase_prod.agg_header_fingerprint_1h; -DROP TABLE IF EXISTS mabase_prod.agg_host_ip_ja4_1h; -DROP DICTIONARY IF EXISTS mabase_prod.dict_asn_reputation; -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ja4; -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ip; +DROP VIEW IF EXISTS ja4_processing.mv_agg_header_fingerprint_1h; +DROP VIEW IF EXISTS ja4_processing.mv_agg_host_ip_ja4_1h; +DROP TABLE IF EXISTS ja4_processing.agg_header_fingerprint_1h; +DROP TABLE IF EXISTS ja4_processing.agg_host_ip_ja4_1h; +DROP DICTIONARY IF EXISTS ja4_processing.dict_asn_reputation; +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ja4; +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ip; -- 04: MV + http_logs -DROP VIEW IF EXISTS mabase_prod.mv_http_logs; -DROP TABLE IF EXISTS mabase_prod.http_logs; +DROP VIEW IF EXISTS ja4_logs.mv_http_logs; +DROP TABLE IF EXISTS ja4_logs.http_logs; -- 03: Anubis -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country; -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn; -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip; -DROP DICTIONARY IF EXISTS 
mabase_prod.dict_anubis_ua; -DROP TABLE IF EXISTS mabase_prod.anubis_country_rules; -DROP TABLE IF EXISTS mabase_prod.anubis_asn_rules; -DROP TABLE IF EXISTS mabase_prod.anubis_ip_rules; -DROP TABLE IF EXISTS mabase_prod.anubis_ua_rules; +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country; +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn; +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip; +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua; +DROP TABLE IF EXISTS ja4_processing.anubis_country_rules; +DROP TABLE IF EXISTS ja4_processing.anubis_asn_rules; +DROP TABLE IF EXISTS ja4_processing.anubis_ip_rules; +DROP TABLE IF EXISTS ja4_processing.anubis_ua_rules; -- 02: Dictionaries -DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn; -DROP TABLE IF EXISTS mabase_prod.bot_ja4; -DROP TABLE IF EXISTS mabase_prod.bot_ip; -DROP TABLE IF EXISTS mabase_prod.ref_bot_networks; +DROP DICTIONARY IF EXISTS ja4_processing.dict_iplocate_asn; +DROP TABLE IF EXISTS ja4_processing.bot_ja4; +DROP TABLE IF EXISTS ja4_processing.bot_ip; +DROP TABLE IF EXISTS ja4_processing.ref_bot_networks; -- 01: Raw Tables -DROP TABLE IF EXISTS mabase_prod.http_logs_raw; +DROP TABLE IF EXISTS ja4_logs.http_logs_raw; -- 00: Database -DROP DATABASE IF EXISTS mabase_prod; +DROP DATABASE IF EXISTS ja4_processing; ``` ### Important Notes diff --git a/docs/database/schema.md b/docs/database/schema.md index 36c2834..dbbca93 100644 --- a/docs/database/schema.md +++ b/docs/database/schema.md @@ -1,6 +1,6 @@ # Database Schema -The ja4-platform uses ClickHouse as its central data store with database `mabase_prod`. This document describes every table, materialized view, dictionary, and view in the schema. +The ja4-platform uses ClickHouse as its central data store with database `ja4_processing`. This document describes every table, materialized view, dictionary, and view in the schema. 
## Tables diff --git a/docs/services/bot-detector.md b/docs/services/bot-detector.md index 4fc45cd..bff979c 100644 --- a/docs/services/bot-detector.md +++ b/docs/services/bot-detector.md @@ -177,7 +177,7 @@ The `view_ai_features_1h` view enriches each IP with Anubis bot detection using |----------|------|---------|-------------| | `CLICKHOUSE_HOST` | string | `clickhouse` | ClickHouse server hostname | | `CLICKHOUSE_PORT` | int | `8123` | ClickHouse HTTP port | -| `CLICKHOUSE_DB` | string | `mabase_prod` | Database name | +| `CLICKHOUSE_DB` | string | `ja4_processing` | Database name | | `CLICKHOUSE_USER` | string | `admin` | ClickHouse username | | `CLICKHOUSE_PASSWORD` | string | `""` | ClickHouse password | | `ISOLATION_CONTAMINATION` | float | `0.02` | Contamination parameter for Isolation Forest | diff --git a/docs/services/dashboard.md b/docs/services/dashboard.md index 78105ec..75c6af2 100644 --- a/docs/services/dashboard.md +++ b/docs/services/dashboard.md @@ -1,6 +1,6 @@ # Dashboard -The dashboard is a SOC (Security Operations Center) web application built with FastAPI (backend) and React (frontend) that provides real-time visualization, investigation, and analysis of bot detections generated by the [bot-detector](bot-detector.md). It queries ClickHouse (`mabase_prod`) for all data. +The dashboard is a SOC (Security Operations Center) web application built with FastAPI (backend) and React (frontend) that provides real-time visualization, investigation, and analysis of bot detections generated by the [bot-detector](bot-detector.md). It queries ClickHouse for all data: raw logs from `ja4_logs` and analytics from `ja4_processing`.
## Technology Stack @@ -17,7 +17,7 @@ The dashboard is a SOC (Security Operations Center) web application built with F |----------|------|---------|-------------| | `CLICKHOUSE_HOST` | string | `clickhouse` | ClickHouse hostname | | `CLICKHOUSE_PORT` | int | `8123` | ClickHouse HTTP port | -| `CLICKHOUSE_DB` | string | `mabase_prod` | Database name | +| `CLICKHOUSE_DB` | string | `ja4_processing` | Database name | | `CLICKHOUSE_USER` | string | `admin` | ClickHouse user | | `CLICKHOUSE_PASSWORD` | string | `""` | ClickHouse password | | `API_HOST` | string | `0.0.0.0` | API listen address | diff --git a/docs/shared/python-ja4common.md b/docs/shared/python-ja4common.md index bedd380..5d51416 100644 --- a/docs/shared/python-ja4common.md +++ b/docs/shared/python-ja4common.md @@ -20,7 +20,7 @@ Pydantic-settings model that reads configuration from environment variables and |-------|------|---------|-------------|-------------| | `CLICKHOUSE_HOST` | str | `"clickhouse"` | `CLICKHOUSE_HOST` | ClickHouse server hostname | | `CLICKHOUSE_PORT` | int | `8123` | `CLICKHOUSE_PORT` | ClickHouse HTTP API port | -| `CLICKHOUSE_DB` | str | `"mabase_prod"` | `CLICKHOUSE_DB` | Database name | +| `CLICKHOUSE_DB` | str | `"ja4_processing"` | `CLICKHOUSE_DB` | Database name | | `CLICKHOUSE_USER` | str | `"admin"` | `CLICKHOUSE_USER` | Username for authentication | | `CLICKHOUSE_PASSWORD` | str | `""` | `CLICKHOUSE_PASSWORD` | Password for authentication | @@ -168,7 +168,7 @@ Create a `.env` file or set environment variables: ```bash CLICKHOUSE_HOST=clickhouse.example.com CLICKHOUSE_PORT=8123 -CLICKHOUSE_DB=mabase_prod +CLICKHOUSE_DB=ja4_processing CLICKHOUSE_USER=data_writer CLICKHOUSE_PASSWORD=secret ``` diff --git a/services/bot-detector/.env.example b/services/bot-detector/.env.example index f52328f..474fe8c 100644 --- a/services/bot-detector/.env.example +++ b/services/bot-detector/.env.example @@ -1,7 +1,9 @@ # bot-detector configuration — DO NOT COMMIT real values 
CLICKHOUSE_HOST=clickhouse CLICKHOUSE_PORT=8123 -CLICKHOUSE_DB=mabase_prod +CLICKHOUSE_DB=ja4_processing +CLICKHOUSE_DB_LOGS=ja4_logs +CLICKHOUSE_DB_PROCESSING=ja4_processing CLICKHOUSE_USER=admin CLICKHOUSE_PASSWORD= ANOMALY_THRESHOLD=-0.1 diff --git a/services/bot-detector/CLICKHOUSE_FEATURES_DIAGNOSTIC.md b/services/bot-detector/CLICKHOUSE_FEATURES_DIAGNOSTIC.md index 78b79a5..b4714a8 100644 --- a/services/bot-detector/CLICKHOUSE_FEATURES_DIAGNOSTIC.md +++ b/services/bot-detector/CLICKHOUSE_FEATURES_DIAGNOSTIC.md @@ -46,11 +46,11 @@ Les features suivantes sont toujours à **0** dans `view_ai_features_1h` : ### Cause -La table `mabase_prod.agg_header_fingerprint_1h` (AggregatingMergeTree) n'a plus reçu de données depuis le **2026-03-13 23:00** : +La table `ja4_processing.agg_header_fingerprint_1h` (AggregatingMergeTree) n'a plus reçu de données depuis le **2026-03-13 23:00** : ```sql SELECT max(window_start), count() -FROM mabase_prod.agg_header_fingerprint_1h; +FROM ja4_processing.agg_header_fingerprint_1h; -- Résultat : 2026-03-13 23:00:00, 73024 lignes ``` @@ -62,7 +62,7 @@ La liste des Materialized Views ne montre aucune MV dédiée à `agg_header_fing ```sql SELECT name FROM system.tables -WHERE database = 'mabase_prod' AND engine = 'MaterializedView'; +WHERE database = 'ja4_processing' AND engine = 'MaterializedView'; -- mv_agg_host_ip_ja4_1h -- mv_http_logs -- view_dashboard_entities_mv @@ -77,15 +77,15 @@ La MV `mv_agg_header_fingerprint_1h` était **définie dans `deploy_views.sql`** ```sql -- Recréation de la MV (déjà appliquée) -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h -TO mabase_prod.agg_header_fingerprint_1h AS +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h +TO ja4_processing.agg_header_fingerprint_1h AS SELECT toStartOfHour(src.time) AS window_start, toIPv6(src.src_ip) AS src_ip, any(toString(cityHash64(src.client_headers))) AS header_order_hash, max(toUInt16(length(src.client_headers) - 
length(replaceAll(src.client_headers, ',', '')) + 1)) AS header_count, -- ... (voir deploy_views.sql §5) -FROM mabase_prod.http_logs AS src +FROM ja4_logs.http_logs AS src GROUP BY window_start, src.src_ip; ``` @@ -132,7 +132,7 @@ Ce problème s'est résolu automatiquement une fois la MV `mv_agg_header_fingerp ### Cause -La colonne `orphan_count` dans `mabase_prod.agg_host_ip_ja4_1h` est calculée par la MV `mv_agg_host_ip_ja4_1h` : +La colonne `orphan_count` dans `ja4_processing.agg_host_ip_ja4_1h` est calculée par la MV `mv_agg_host_ip_ja4_1h` : ```sql sum(IF(src.orphan_side = 'A' OR src.correlated = 0, 1, 0)) AS orphan_count diff --git a/services/bot-detector/DOCUMENTATION.md b/services/bot-detector/DOCUMENTATION.md index b11e6eb..286778c 100644 --- a/services/bot-detector/DOCUMENTATION.md +++ b/services/bot-detector/DOCUMENTATION.md @@ -430,7 +430,7 @@ Toutes les valeurs sont passées via variables d'environnement (fichier `.env`). | Variable | Défaut | Description | |----------|--------|-------------| | `CLICKHOUSE_HOST` | `clickhouse` | Hôte ClickHouse | -| `CLICKHOUSE_DB` | `mabase_prod` | Base de données | +| `CLICKHOUSE_DB` | `ja4_processing` | Base de données | | `CLICKHOUSE_USER` | `default` | Utilisateur | | `CLICKHOUSE_PASSWORD` | *(vide)* | Mot de passe | | `ISOLATION_CONTAMINATION` | `0.001` | Fraction d'anomalies attendues (0 < x < 0.5) | @@ -702,7 +702,7 @@ Le champ `anomaly_score` dans ClickHouse contient désormais le score normalisé Les nouvelles colonnes suivantes sont disponibles dans les logs JSONL mais pas encore dans la table `ml_detected_anomalies`. 
Pour les activer : ```sql -ALTER TABLE mabase_prod.ml_detected_anomalies +ALTER TABLE ja4_processing.ml_detected_anomalies ADD COLUMN IF NOT EXISTS campaign_id Int32 DEFAULT -1, ADD COLUMN IF NOT EXISTS raw_anomaly_score Float32 DEFAULT 0; ``` diff --git a/services/bot-detector/IMPROVEMENTS.md b/services/bot-detector/IMPROVEMENTS.md index be0e89a..a0c1fbf 100644 --- a/services/bot-detector/IMPROVEMENTS.md +++ b/services/bot-detector/IMPROVEMENTS.md @@ -412,7 +412,7 @@ Les niveaux de menace seraient alors calculés sur le score normalisé, rendant # Nouvelles dimensions de features — Propositions B -> Propositions de features supplémentaires pour l'Isolation Forest, validées sur les données réelles de `mabase_prod`. +> Propositions de features supplémentaires pour l'Isolation Forest, validées sur les données réelles de `ja4_processing`. > Chaque proposition indique la force du signal observée en base, la source de données, la formule de calcul et les références scientifiques. ## Résumé des signaux @@ -729,7 +729,7 @@ Faible impact seul, mais utile en combinaison avec TTL variance pour le TCP fing ### Colonnes à ajouter dans `agg_host_ip_ja4_1h` ```sql -ALTER TABLE mabase_prod.agg_host_ip_ja4_1h +ALTER TABLE ja4_processing.agg_host_ip_ja4_1h ADD COLUMN uniq_ja3 AggregateFunction(uniq, String), ADD COLUMN avg_syn_ms SimpleAggregateFunction(avg, Float64), ADD COLUMN tls12_count SimpleAggregateFunction(sum, UInt64), diff --git a/services/bot-detector/anubis/deploy_schema.sql b/services/bot-detector/anubis/deploy_schema.sql index f8e823e..8157a89 100644 --- a/services/bot-detector/anubis/deploy_schema.sql +++ b/services/bot-detector/anubis/deploy_schema.sql @@ -26,7 +26,7 @@ -- Quand l'UA correspond à enfant ET parent, c'est le nom de l'enfant qui -- est retourné (l'enfant hérite ET surcharge les attributs du parent). 
-- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ua_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules ( id UInt64, parent_id UInt64, @@ -46,7 +46,7 @@ ORDER BY id; -- has_ua : 1 si la règle possède aussi une regex UA (croisement nécessaire) -- category : catégorie Anubis (bots, crawlers, clients, policies…) -- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ip_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules ( prefix String, bot_name LowCardinality(String), @@ -60,31 +60,31 @@ ORDER BY prefix; -- ---------------------------------------------------------------------------- -- 3. DICTIONNAIRE UA — REGEXP_TREE --- dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent) +-- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent) -- -- Le PRIMARY KEY est 'regexp' (String) — requis par ClickHouse 26.x. -- Connexion interne (HOST localhost PORT 9000) pour éviter deadlock HTTP. -- Remplacer 'admin' et le mot de passe par les credentials ClickHouse. -- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ua; -CREATE DICTIONARY mabase_prod.dict_anubis_ua +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua; +CREATE DICTIONARY ja4_processing.dict_anubis_ua ( regexp String, bot_name String, action String ) PRIMARY KEY regexp -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_ua_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules')) LAYOUT(REGEXP_TREE) LIFETIME(MIN 300 MAX 600); -- ---------------------------------------------------------------------------- -- 4. 
DICTIONNAIRE IP — IP_TRIE --- dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') +-- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') -- Connexion interne (HOST localhost PORT 9000) — même raison que dict_anubis_ua. -- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip; -CREATE DICTIONARY mabase_prod.dict_anubis_ip +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip; +CREATE DICTIONARY ja4_processing.dict_anubis_ip ( prefix String, bot_name String, @@ -94,7 +94,7 @@ CREATE DICTIONARY mabase_prod.dict_anubis_ip category String ) PRIMARY KEY prefix -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_ip_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ip_rules')) LAYOUT(IP_TRIE()) LIFETIME(MIN 300 MAX 600); @@ -102,7 +102,7 @@ LIFETIME(MIN 300 MAX 600); -- 5. TABLE SOURCE — règles ASN (pour dictionnaire Flat) -- Alimentée par botPolicies.yaml via fetch_rules.py → insert_asn_rules() -- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_asn_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules ( asn UInt32, bot_name LowCardinality(String), @@ -116,7 +116,7 @@ ORDER BY asn; -- 6. 
TABLE SOURCE — règles pays ISO-3166 (pour dictionnaire Flat) -- Alimentée par botPolicies.yaml via fetch_rules.py → insert_country_rules() -- ---------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_country_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules ( country_code LowCardinality(String), bot_name LowCardinality(String), @@ -128,10 +128,10 @@ ORDER BY country_code; -- ---------------------------------------------------------------------------- -- 7. DICTIONNAIRE ASN — Flat --- dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', src_asn, '') +-- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '') -- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn; -CREATE DICTIONARY mabase_prod.dict_anubis_asn +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn; +CREATE DICTIONARY ja4_processing.dict_anubis_asn ( asn UInt32, bot_name String, @@ -139,16 +139,16 @@ CREATE DICTIONARY mabase_prod.dict_anubis_asn category String ) PRIMARY KEY asn -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_asn_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_asn_rules')) LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); -- ---------------------------------------------------------------------------- -- 8. 
DICTIONNAIRE PAYS — Flat --- dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', src_country_code, '') +-- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '') -- ---------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country; -CREATE DICTIONARY mabase_prod.dict_anubis_country +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country; +CREATE DICTIONARY ja4_processing.dict_anubis_country ( country_code String, bot_name String, @@ -156,7 +156,7 @@ CREATE DICTIONARY mabase_prod.dict_anubis_country category String ) PRIMARY KEY country_code -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_country_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules')) LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); @@ -164,7 +164,7 @@ LIFETIME(MIN 300 MAX 600); -- 9. AJOUT DES COLONNES ANUBIS dans http_logs -- Idempotent : ne plante pas si déjà présentes -- ---------------------------------------------------------------------------- -ALTER TABLE mabase_prod.http_logs +ALTER TABLE ja4_logs.http_logs ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; @@ -175,10 +175,10 @@ ALTER TABLE mabase_prod.http_logs -- 1. UA regex (plus informatif — identifie le bot précis) -- 2. 
IP/CIDR (fallback — identifie le réseau cloud) -- ---------------------------------------------------------------------------- -DROP VIEW IF EXISTS mabase_prod.mv_http_logs; +DROP VIEW IF EXISTS ja4_logs.mv_http_logs; -CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs -TO mabase_prod.http_logs +CREATE MATERIALIZED VIEW ja4_logs.mv_http_logs +TO ja4_logs.http_logs ( `time` DateTime, `log_date` Date, @@ -241,13 +241,13 @@ AS SELECT toDate(time) AS log_date, toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0')) AS src_ip, toUInt16(coalesce(JSONExtractUInt(raw_json, 'src_port'), 0)) AS src_port, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', toIPv6(src_ip), toUInt32(0)) AS src_asn, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', toIPv6(src_ip), '') AS src_country_code, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'asn', toIPv6(src_ip), toUInt32(0)) AS src_asn, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', toIPv6(src_ip), '') AS src_country_code, toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip, toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', toIPv6(src_ip), '') AS src_as_name, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', toIPv6(src_ip), '') AS src_org, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', toIPv6(src_ip), '') AS src_domain, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'name', toIPv6(src_ip), '') AS src_as_name, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'org', toIPv6(src_ip), '') AS src_org, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'domain', toIPv6(src_ip), '') AS src_domain, coalesce(JSONExtractString(raw_json, 'method'), '') AS method, coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, coalesce(JSONExtractString(raw_json, 'host'), '') AS host, @@ -293,20 +293,20 @@ AS SELECT -- ── 
Enrichissement Anubis ──────────────────────────────────────────────── -- Priorité : UA regex > IP/CIDR (UA identifie précisément le bot) COALESCE( - nullIf(dictGet('mabase_prod.dict_anubis_ua', 'bot_name', + nullIf(dictGet('ja4_processing.dict_anubis_ua', 'bot_name', coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''), - nullIf(dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''), '' ) AS anubis_bot_name, COALESCE( - nullIf(dictGet('mabase_prod.dict_anubis_ua', 'action', + nullIf(dictGet('ja4_processing.dict_anubis_ua', 'action', coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '')), ''), - nullIf(dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', + nullIf(dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))), ''), ''), '' ) AS anubis_bot_action -FROM mabase_prod.http_logs_raw; +FROM ja4_logs.http_logs_raw; -- ============================================================================ -- INTÉGRATION ML — Propagation Anubis vers le pipeline bot_detector @@ -315,7 +315,7 @@ FROM mabase_prod.http_logs_raw; -- ---------------------------------------------------------------------------- -- 11. COLONNES ANUBIS dans ml_detected_anomalies -- ---------------------------------------------------------------------------- -ALTER TABLE mabase_prod.ml_detected_anomalies +ALTER TABLE ja4_processing.ml_detected_anomalies ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; @@ -323,7 +323,7 @@ ALTER TABLE mabase_prod.ml_detected_anomalies -- ---------------------------------------------------------------------------- -- 12. 
COLONNES ANUBIS dans ml_all_scores -- ---------------------------------------------------------------------------- -ALTER TABLE mabase_prod.ml_all_scores +ALTER TABLE ja4_processing.ml_all_scores ADD COLUMN IF NOT EXISTS anubis_bot_name LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_action LowCardinality(String) DEFAULT '', ADD COLUMN IF NOT EXISTS anubis_bot_category LowCardinality(String) DEFAULT ''; diff --git a/services/bot-detector/anubis/fetch_rules.py b/services/bot-detector/anubis/fetch_rules.py index 6c721eb..53271fb 100644 --- a/services/bot-detector/anubis/fetch_rules.py +++ b/services/bot-detector/anubis/fetch_rules.py @@ -13,7 +13,7 @@ Usage (depuis le container dashboard_web) : python /tmp/fetch_rules.py Variables d'environnement : - CLICKHOUSE_HOST, CLICKHOUSE_DB, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD + CLICKHOUSE_HOST, CLICKHOUSE_DB, CLICKHOUSE_DB_PROCESSING, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD """ import json @@ -336,17 +336,20 @@ def collect_all_rules() -> tuple[list, list, list, list]: def get_ch_client(): return clickhouse_connect.get_client( host=os.environ.get("CLICKHOUSE_HOST", "clickhouse"), - database=os.environ.get("CLICKHOUSE_DB", "mabase_prod"), + database=os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing")), username=os.environ.get("CLICKHOUSE_USER", "admin"), password=os.environ.get("CLICKHOUSE_PASSWORD", ""), ) +DB_PROC = os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing")) + + def insert_ua_rules(client, rules: list[dict]) -> None: if not rules: print("[INFO] Aucune règle UA.") return - client.command("TRUNCATE TABLE mabase_prod.anubis_ua_rules") + client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_ua_rules") # REGEXP_TREE format : id, parent_id, regexp, keys[], values[] # keys = ['bot_name', 'action', 'has_ip', 'rule_id', 'category'] data = [ @@ -357,7 +360,7 @@ def insert_ua_rules(client, rules: list[dict]) -> None: ] for r in 
rules ] - client.insert("mabase_prod.anubis_ua_rules", data, + client.insert(f"{DB_PROC}.anubis_ua_rules", data, column_names=["id", "parent_id", "regexp", "keys", "values"]) print(f"[OK] {len(rules)} règles UA insérées.") @@ -366,13 +369,13 @@ def insert_ip_rules(client, rules: list[dict]) -> None: if not rules: print("[INFO] Aucune règle IP.") return - client.command("TRUNCATE TABLE mabase_prod.anubis_ip_rules") + client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_ip_rules") data = [ [r["prefix"], r["bot_name"], r["action"], r["rule_id"], r["has_ua"], r["category"]] for r in rules ] - client.insert("mabase_prod.anubis_ip_rules", data, + client.insert(f"{DB_PROC}.anubis_ip_rules", data, column_names=["prefix", "bot_name", "action", "rule_id", "has_ua", "category"]) print(f"[OK] {len(rules)} règles IP insérées.") @@ -381,9 +384,9 @@ def insert_asn_rules(client, rules: list[dict]) -> None: if not rules: print("[INFO] Aucune règle ASN.") return - client.command("TRUNCATE TABLE mabase_prod.anubis_asn_rules") + client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_asn_rules") data = [[r["asn"], r["bot_name"], r["action"], r["category"]] for r in rules] - client.insert("mabase_prod.anubis_asn_rules", data, + client.insert(f"{DB_PROC}.anubis_asn_rules", data, column_names=["asn", "bot_name", "action", "category"]) print(f"[OK] {len(rules)} règles ASN insérées.") @@ -392,19 +395,19 @@ def insert_country_rules(client, rules: list[dict]) -> None: if not rules: print("[INFO] Aucune règle pays.") return - client.command("TRUNCATE TABLE mabase_prod.anubis_country_rules") + client.command(f"TRUNCATE TABLE {DB_PROC}.anubis_country_rules") data = [[r["country_code"], r["bot_name"], r["action"], r["category"]] for r in rules] - client.insert("mabase_prod.anubis_country_rules", data, + client.insert(f"{DB_PROC}.anubis_country_rules", data, column_names=["country_code", "bot_name", "action", "category"]) print(f"[OK] {len(rules)} règles pays insérées.") def reload_dicts(client) -> None: 
dicts = [ - "mabase_prod.dict_anubis_ua", - "mabase_prod.dict_anubis_ip", - "mabase_prod.dict_anubis_asn", - "mabase_prod.dict_anubis_country", + f"{DB_PROC}.dict_anubis_ua", + f"{DB_PROC}.dict_anubis_ip", + f"{DB_PROC}.dict_anubis_asn", + f"{DB_PROC}.dict_anubis_country", ] for d in dicts: try: diff --git a/services/bot-detector/anubis/mv_http_logs.sql b/services/bot-detector/anubis/mv_http_logs.sql index 59d58a0..3a64737 100644 --- a/services/bot-detector/anubis/mv_http_logs.sql +++ b/services/bot-detector/anubis/mv_http_logs.sql @@ -1,11 +1,11 @@ -CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs -TO mabase_prod.http_logs +CREATE MATERIALIZED VIEW ja4_logs.mv_http_logs +TO ja4_logs.http_logs AS WITH coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS _ua, toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))) AS _ip, - toUInt32(dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', _ip, '') AS _cc + toUInt32(dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', _ip, '') AS _cc SELECT parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time, toDate(time) AS log_date, @@ -15,9 +15,9 @@ SELECT _cc AS src_country_code, toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip, toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', _ip, '') AS src_as_name, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', _ip, '') AS src_org, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', _ip, '') AS src_domain, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'name', _ip, '') AS src_as_name, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'org', _ip, '') AS src_org, + 
dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'domain', _ip, '') AS src_domain, coalesce(JSONExtractString(raw_json, 'method'), '') AS method, coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, coalesce(JSONExtractString(raw_json, 'host'), '') AS host, @@ -64,63 +64,63 @@ SELECT -- Anubis enrichment : logique de correspondance combinée UA+IP -- Priorité : (1) UA+IP [même rule_id] > (2) UA seul > (3) IP seul > (4) ASN > (5) Pays CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = 
dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') ELSE '' END AS anubis_bot_name, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', _asn, '') - WHEN 
dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '') ELSE '' END AS anubis_bot_action, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN 
dictGet('mabase_prod.dict_anubis_ua', 'category', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', _asn, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '') ELSE '' END AS anubis_bot_category -FROM mabase_prod.http_logs_raw +FROM ja4_logs.http_logs_raw diff --git 
a/services/bot-detector/anubis/view_ai_features_anubis.sql b/services/bot-detector/anubis/view_ai_features_anubis.sql index a71be84..abc93cb 100644 --- a/services/bot-detector/anubis/view_ai_features_anubis.sql +++ b/services/bot-detector/anubis/view_ai_features_anubis.sql @@ -1,73 +1,73 @@ -CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS +CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS WITH base_data AS ( SELECT a.window_start, a.src_ip, a.ja4, a.host, toString(a.src_asn) AS asn_number, a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain, a.src_country_code AS country_code, - dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, + dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, -- Bot connu via JA4/IP (dictionnaires existants) COALESCE( - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), '' ) AS bot_name, -- Anubis : logique combinée UA+IP (même rule_id) > UA seul > IP seul > ASN > Pays CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN 
dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN 
dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') ELSE '' END AS anubis_bot_name, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) 
= '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '') ELSE '' END AS anubis_bot_action, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') - WHEN 
dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '') ELSE '' END AS anubis_bot_category, a.hits AS hits, @@ -160,7 +160,7 @@ WITH base_data AS ( sum(count_correlated) AS count_correlated_val, sum(count_no_accept_enc) AS count_no_accept_enc_val, sum(count_http_scheme) AS count_http_scheme_val - FROM mabase_prod.agg_host_ip_ja4_1h + FROM ja4_processing.agg_host_ip_ja4_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip, ja4, host, src_asn ) a @@ -171,7 +171,7 
@@ WITH base_data AS ( max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest - FROM mabase_prod.agg_header_fingerprint_1h + FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py index 63a5184..8036cc3 100644 --- a/services/bot-detector/bot_detector/bot_detector.py +++ b/services/bot-detector/bot_detector/bot_detector.py @@ -43,7 +43,8 @@ def _require_float(name, default, lo=None, hi=None): # Note : Utilisé dans des requêtes SQL via f-string (ex: f'SELECT * FROM {DB}.view_ai_features_1h') # Cette variable provient uniquement de variables d'environnement contrôlées (docker-compose, K8s, etc.) # et n'est jamais exposée à des entrées utilisateur. Le risque d'injection SQL est considéré comme négligeable. 
-DB = os.getenv('CLICKHOUSE_DB', 'mabase_prod') +DB = os.getenv('CLICKHOUSE_DB_PROCESSING', os.getenv('CLICKHOUSE_DB', 'ja4_processing')) +DB_LOGS = os.getenv('CLICKHOUSE_DB_LOGS', 'ja4_logs') CONTAMINATION = _require_float('ISOLATION_CONTAMINATION', 0.001, 0, 0.5) ANOMALY_THRESHOLD = _require_float('ANOMALY_THRESHOLD', -0.05) diff --git a/services/bot-detector/bot_detector/tests/test_detector.py b/services/bot-detector/bot_detector/tests/test_detector.py index fe9a49e..c7d67be 100644 --- a/services/bot-detector/bot_detector/tests/test_detector.py +++ b/services/bot-detector/bot_detector/tests/test_detector.py @@ -93,7 +93,7 @@ def test_dedup_different_ips_are_independent(): def test_dedup_exact_ttl_boundary(): - """Dedup: insertion exactly at TTL boundary is still blocked.""" + """Dedup: insertion at exactly the TTL boundary is allowed (uses strict <).""" from datetime import datetime, timedelta DEDUP_TTL_MIN = 60 dedup_cache = {} @@ -107,8 +107,8 @@ def test_dedup_exact_ttl_boundary(): now = datetime(2024, 1, 1, 12, 0, 0) assert should_insert("1.2.3.4", now) is True - # Exactly at 60 minutes should be blocked (< not <=) - assert should_insert("1.2.3.4", now + timedelta(minutes=60)) is False + # Exactly at 60 minutes: 3600 < 3600 is False, so insertion IS allowed + assert should_insert("1.2.3.4", now + timedelta(minutes=60)) is True def test_anomaly_threshold_no_anomalies(): diff --git a/services/bot-detector/deploy_views.sql b/services/bot-detector/deploy_views.sql index 15de1bd..099df92 100644 --- a/services/bot-detector/deploy_views.sql +++ b/services/bot-detector/deploy_views.sql @@ -1,50 +1,50 @@ -- ============================================================================ -- ARCHITECTURE DE DÉTECTION INTÉGRALE (v13 - bot_detector v11 + ml_all_scores) --- Base : mabase_prod | Fenêtre : 24h | Dédoublonnage par src_ip +-- Base : ja4_processing | Fenêtre : 24h | Dédoublonnage par src_ip -- Modifications v11 : ajout campaign_id, raw_anomaly_score dans 
ml_detected_anomalies -- correction view_dashboard_variability (header_user_agent → reason) -- Modifications v12 : ajout table ml_all_scores (toutes les classifications, sans seuil) -- ============================================================================ -- 1. NETTOYAGE COMPLET -DROP TABLE IF EXISTS mabase_prod.ml_all_scores; -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ip; -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ja4; -DROP DICTIONARY IF EXISTS mabase_prod.dict_asn_reputation; -DROP TABLE IF EXISTS mabase_prod.ml_detected_anomalies; -DROP VIEW IF EXISTS mabase_prod.view_ip_recurrence; -DROP VIEW IF EXISTS mabase_prod.view_ai_features_1h; +DROP TABLE IF EXISTS ja4_processing.ml_all_scores; +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ip; +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ja4; +DROP DICTIONARY IF EXISTS ja4_processing.dict_asn_reputation; +DROP TABLE IF EXISTS ja4_processing.ml_detected_anomalies; +DROP VIEW IF EXISTS ja4_processing.view_ip_recurrence; +DROP VIEW IF EXISTS ja4_processing.view_ai_features_1h; -- Suppression des anciennes vues heuristiques -DROP VIEW IF EXISTS mabase_prod.view_host_ip_ja4_rotation; -DROP VIEW IF EXISTS mabase_prod.view_host_ja4_anomalies; -DROP VIEW IF EXISTS mabase_prod.view_form_bruteforce_detected; -DROP VIEW IF EXISTS mabase_prod.view_alpn_mismatch_detected; -DROP VIEW IF EXISTS mabase_prod.view_tcp_spoofing_detected; +DROP VIEW IF EXISTS ja4_processing.view_host_ip_ja4_rotation; +DROP VIEW IF EXISTS ja4_processing.view_host_ja4_anomalies; +DROP VIEW IF EXISTS ja4_processing.view_form_bruteforce_detected; +DROP VIEW IF EXISTS ja4_processing.view_alpn_mismatch_detected; +DROP VIEW IF EXISTS ja4_processing.view_tcp_spoofing_detected; -DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h; -DROP TABLE IF EXISTS mabase_prod.agg_host_ip_ja4_1h; -DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h; -DROP TABLE IF EXISTS mabase_prod.agg_header_fingerprint_1h; +DROP VIEW IF EXISTS 
ja4_processing.mv_agg_host_ip_ja4_1h; +DROP TABLE IF EXISTS ja4_processing.agg_host_ip_ja4_1h; +DROP VIEW IF EXISTS ja4_processing.mv_agg_header_fingerprint_1h; +DROP TABLE IF EXISTS ja4_processing.agg_header_fingerprint_1h; -- ============================================================================ -- 2. DICTIONNAIRES DE RÉPUTATION EN RAM -- ============================================================================ -CREATE DICTIONARY mabase_prod.dict_bot_ip (prefix String, bot_name String) +CREATE DICTIONARY ja4_processing.dict_bot_ip (prefix String, bot_name String) PRIMARY KEY prefix SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ip.csv' format 'CSV')) LAYOUT(IP_TRIE()) LIFETIME(MIN 300 MAX 300); -CREATE DICTIONARY mabase_prod.dict_bot_ja4 (ja4 String, bot_name String) +CREATE DICTIONARY ja4_processing.dict_bot_ja4 (ja4 String, bot_name String) PRIMARY KEY ja4 SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ja4.csv' format 'CSV')) LAYOUT(COMPLEX_KEY_HASHED()) LIFETIME(MIN 300 MAX 300); -CREATE DICTIONARY mabase_prod.dict_asn_reputation (src_asn UInt64, label String) +CREATE DICTIONARY ja4_processing.dict_asn_reputation (src_asn UInt64, label String) PRIMARY KEY src_asn SOURCE(FILE(path '/var/lib/clickhouse/user_files/asn_reputation.csv' format 'CSV')) LAYOUT(HASHED()) LIFETIME(MIN 300 MAX 300); -- ============================================================================ -- 3. TABLE D'AGRÉGATION COMPORTEMENTALE (L4 / L5 / L7) -- ============================================================================ -CREATE TABLE mabase_prod.agg_host_ip_ja4_1h +CREATE TABLE ja4_processing.agg_host_ip_ja4_1h ( window_start DateTime, src_ip IPv6, ja4 String, host String, src_asn UInt32, @@ -98,8 +98,8 @@ ORDER BY (window_start, src_ip, ja4, host); -- ============================================================================ -- 4. 
VUE MATÉRIALISÉE → agg_host_ip_ja4_1h -- ============================================================================ -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_host_ip_ja4_1h -TO mabase_prod.agg_host_ip_ja4_1h AS +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_host_ip_ja4_1h +TO ja4_processing.agg_host_ip_ja4_1h AS SELECT toStartOfHour(src.time) AS window_start, toIPv6(src.src_ip) AS src_ip, src.ja4, src.host, src.src_asn, @@ -135,13 +135,13 @@ SELECT sum(IF(length(src.header_accept) < 5, 1, 0)) AS count_generic_accept, sum(IF(src.http_version = 'HTTP/1.0', 1, 0)) AS count_http10, varPopState(toFloat64(src.ip_meta_df)) AS ip_df_var -FROM mabase_prod.http_logs AS src +FROM ja4_logs.http_logs AS src GROUP BY window_start, src_ip, ja4, host, src_asn; -- ============================================================================ -- 5. TABLE D'AGRÉGATION DES HEADERS (L7) -- ============================================================================ -CREATE TABLE mabase_prod.agg_header_fingerprint_1h +CREATE TABLE ja4_processing.agg_header_fingerprint_1h ( window_start DateTime, src_ip IPv6, @@ -158,8 +158,8 @@ CREATE TABLE mabase_prod.agg_header_fingerprint_1h ENGINE = AggregatingMergeTree() ORDER BY (window_start, src_ip); -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h -TO mabase_prod.agg_header_fingerprint_1h AS +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h +TO ja4_processing.agg_header_fingerprint_1h AS SELECT toStartOfHour(src.time) AS window_start, toIPv6(src.src_ip) AS src_ip, @@ -172,13 +172,13 @@ SELECT max(toUInt8(if((position(src.header_user_agent, 'Windows') > 0 AND position(src.header_sec_ch_ua_platform, 'Windows') == 0) OR (position(src.header_user_agent, 'iPhone') > 0 AND position(src.header_sec_ch_ua_platform, 'iOS') == 0), 1, 0))) AS ua_ch_mismatch, any(src.header_sec_fetch_mode) AS sec_fetch_mode, any(src.header_sec_fetch_dest) AS sec_fetch_dest -FROM mabase_prod.http_logs AS src +FROM ja4_logs.http_logs 
AS src GROUP BY window_start, src.src_ip; -- ============================================================================ -- 6. TABLE DE RÉSULTATS ML — MENACES UNIQUEMENT (scores < seuil) -- ============================================================================ -CREATE TABLE mabase_prod.ml_detected_anomalies +CREATE TABLE ja4_processing.ml_detected_anomalies ( detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String, anomaly_score Float32, threat_level String, model_name String, recurrence UInt32, @@ -204,7 +204,7 @@ TTL detected_at + INTERVAL 30 DAY; -- ============================================================================ -- 6b. TABLE DE TOUTES LES CLASSIFICATIONS (sans seuil, pour observabilité) -- ============================================================================ -CREATE TABLE mabase_prod.ml_all_scores +CREATE TABLE ja4_processing.ml_all_scores ( detected_at DateTime, window_start DateTime, @@ -235,24 +235,24 @@ SETTINGS index_granularity = 8192; -- ============================================================================ -- 7. VUE DE RÉCURRENCE -- ============================================================================ -CREATE OR REPLACE VIEW mabase_prod.view_ip_recurrence AS +CREATE OR REPLACE VIEW ja4_processing.view_ip_recurrence AS SELECT src_ip, count() AS recurrence, min(detected_at) AS first_seen, max(detected_at) AS last_seen, min(anomaly_score) AS worst_score, argMin(threat_level, anomaly_score) AS worst_threat_level -FROM mabase_prod.ml_detected_anomalies GROUP BY src_ip; +FROM ja4_processing.ml_detected_anomalies GROUP BY src_ip; -- ============================================================================ -- 8. 
VUE IA PRINCIPALE (Avec CTE pour Entropie Temporelle) -- ============================================================================ -CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS +CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS WITH base_data AS ( SELECT a.window_start, a.src_ip, a.ja4, a.host, toString(a.src_asn) AS asn_number, a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain, a.src_country_code AS country_code, - dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, + dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, COALESCE( - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), '' ) AS bot_name, a.hits AS hits, @@ -324,7 +324,7 @@ WITH base_data AS ( sum(count_generic_accept) AS count_generic_accept, sum(count_http10) AS count_http10, varPopMerge(ip_df_var) AS ip_df_variance - FROM mabase_prod.agg_host_ip_ja4_1h + FROM ja4_processing.agg_host_ip_ja4_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip, ja4, host, src_asn ) a @@ -335,7 +335,7 @@ WITH base_data AS ( max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest - FROM mabase_prod.agg_header_fingerprint_1h + FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start @@ -352,7 +352,7 @@ FROM base_data; -- 
============================================================================ -- Vue pour les métriques globales du dashboard -CREATE OR REPLACE VIEW mabase_prod.view_dashboard_summary AS +CREATE OR REPLACE VIEW ja4_processing.view_dashboard_summary AS SELECT count() AS total_detections, countIf(threat_level = 'CRITICAL') AS critical_count, @@ -362,11 +362,11 @@ SELECT countIf(bot_name != '') AS known_bots_count, countIf(bot_name = '') AS anomalies_count, uniq(src_ip) AS unique_ips -FROM mabase_prod.ml_detected_anomalies +FROM ja4_processing.ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR; -- Vue pour la série temporelle (par heure) -CREATE OR REPLACE VIEW mabase_prod.view_dashboard_timeseries AS +CREATE OR REPLACE VIEW ja4_processing.view_dashboard_timeseries AS SELECT toStartOfHour(detected_at) AS hour, count() AS total, @@ -374,25 +374,25 @@ SELECT countIf(threat_level = 'HIGH') AS high, countIf(threat_level = 'MEDIUM') AS medium, countIf(threat_level = 'LOW') AS low -FROM mabase_prod.ml_detected_anomalies +FROM ja4_processing.ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR GROUP BY hour ORDER BY hour; -- Vue pour la distribution des menaces -CREATE OR REPLACE VIEW mabase_prod.view_dashboard_threat_dist AS +CREATE OR REPLACE VIEW ja4_processing.view_dashboard_threat_dist AS SELECT threat_level, count() AS count, round(count() * 100.0 / sum(count()) OVER (), 2) AS percentage -FROM mabase_prod.ml_detected_anomalies +FROM ja4_processing.ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR GROUP BY threat_level ORDER BY count DESC; -- Vue pour la variabilité (utilisée par l'API) -- Note v12 : header_user_agent n'existe pas dans ml_detected_anomalies → remplacé par reason -CREATE OR REPLACE VIEW mabase_prod.view_dashboard_variability AS +CREATE OR REPLACE VIEW ja4_processing.view_dashboard_variability AS SELECT detected_at, src_ip, @@ -407,5 +407,5 @@ SELECT anomaly_score, campaign_id, raw_anomaly_score -FROM 
mabase_prod.ml_detected_anomalies +FROM ja4_processing.ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR; diff --git a/services/bot-detector/docker-compose.yml b/services/bot-detector/docker-compose.yml index ee3d693..061c107 100644 --- a/services/bot-detector/docker-compose.yml +++ b/services/bot-detector/docker-compose.yml @@ -13,10 +13,12 @@ services: environment: # ── ClickHouse ──────────────────────────────────────────────────────── - CLICKHOUSE_HOST: ${CLICKHOUSE_HOST:-clickhouse} - CLICKHOUSE_DB: ${CLICKHOUSE_DB:-mabase_prod} - CLICKHOUSE_USER: ${CLICKHOUSE_USER:-admin} - CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-} + CLICKHOUSE_HOST: ${CLICKHOUSE_HOST:-clickhouse} + CLICKHOUSE_DB: ${CLICKHOUSE_DB:-ja4_processing} + CLICKHOUSE_DB_LOGS: ${CLICKHOUSE_DB_LOGS:-ja4_logs} + CLICKHOUSE_DB_PROCESSING: ${CLICKHOUSE_DB_PROCESSING:-ja4_processing} + CLICKHOUSE_USER: ${CLICKHOUSE_USER:-admin} + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-} # ── Modèle IA ───────────────────────────────────────────────────────── ISOLATION_CONTAMINATION: ${ISOLATION_CONTAMINATION:-0.02} diff --git a/services/correlator/.env.example b/services/correlator/.env.example index babbf90..0f036d0 100644 --- a/services/correlator/.env.example +++ b/services/correlator/.env.example @@ -1,2 +1,2 @@ # correlator configuration — DO NOT COMMIT real values -LOGCORRELATOR_CLICKHOUSE_DSN=clickhouse://data_writer:ChangeMe@clickhouse:9000/mabase_prod +LOGCORRELATOR_CLICKHOUSE_DSN=clickhouse://data_writer:ChangeMe@clickhouse:9000/ja4_logs diff --git a/services/correlator/README.md b/services/correlator/README.md index 98304db..a4e2deb 100644 --- a/services/correlator/README.md +++ b/services/correlator/README.md @@ -232,25 +232,25 @@ http_logs_raw ← inserts du service (raw_json String) ```sql -- data_writer : INSERT sur http_logs_raw uniquement (compte du service) -GRANT INSERT ON mabase_prod.http_logs_raw TO data_writer; -GRANT SELECT ON mabase_prod.http_logs_raw TO data_writer; +GRANT 
INSERT ON ja4_logs.http_logs_raw TO data_writer; +GRANT SELECT ON ja4_logs.http_logs_raw TO data_writer; -- analyst : lecture sur la table parsée -GRANT SELECT ON mabase_prod.http_logs TO analyst; +GRANT SELECT ON ja4_logs.http_logs TO analyst; ``` ### Vérification de l'ingestion ```sql -- Données brutes reçues -SELECT count(*), min(ingest_time), max(ingest_time) FROM mabase_prod.http_logs_raw; +SELECT count(*), min(ingest_time), max(ingest_time) FROM ja4_logs.http_logs_raw; -- Données parsées par la vue matérialisée -SELECT count(*), min(time), max(time) FROM mabase_prod.http_logs; +SELECT count(*), min(time), max(time) FROM ja4_logs.http_logs; -- Derniers logs corrélés SELECT time, src_ip, dst_ip, method, host, path, ja4 -FROM mabase_prod.http_logs +FROM ja4_logs.http_logs WHERE correlated = 1 ORDER BY time DESC LIMIT 10; ``` @@ -397,7 +397,7 @@ python3 scripts/test-correlation-advanced.py --all ### ClickHouse : erreurs d'insertion - **`No such column`** : vérifier que la table `http_logs_raw` utilise la colonne unique `raw_json` (pas de colonnes séparées) -- **`ACCESS_DENIED`** : `GRANT INSERT ON mabase_prod.http_logs_raw TO data_writer;` +- **`ACCESS_DENIED`** : `GRANT INSERT ON ja4_logs.http_logs_raw TO data_writer;` - Les erreurs de flush sont loggées en ERROR dans les logs du service ### Vue matérialisée vide @@ -405,9 +405,9 @@ python3 scripts/test-correlation-advanced.py --all Si `http_logs_raw` a des données mais `http_logs` est vide : ```sql -- Vérifier la vue -SHOW CREATE TABLE mabase_prod.mv_http_logs; +SHOW CREATE TABLE ja4_logs.mv_http_logs; -- Vérifier les permissions (la MV s'exécute sous le compte du service) -GRANT SELECT ON mabase_prod.http_logs_raw TO data_writer; +GRANT SELECT ON ja4_logs.http_logs_raw TO data_writer; ``` ### Sockets Unix : permission denied diff --git a/services/correlator/architecture.yml b/services/correlator/architecture.yml index 6e8af6e..e3e37e9 100644 --- a/services/correlator/architecture.yml +++ 
b/services/correlator/architecture.yml @@ -533,7 +533,7 @@ schema: clickhouse_schema: strategy: external_ddls - database: mabase_prod + database: ja4_logs description: > La table ClickHouse est gérée en dehors du service. Le service insère dans une table RAW avec une seule colonne raw_json contenant le log corrélé complet @@ -556,7 +556,7 @@ clickhouse_schema: type: DateTime default: now() insert_format: | - INSERT INTO mabase_prod.http_logs_raw (raw_json) VALUES + INSERT INTO ja4_logs.http_logs_raw (raw_json) VALUES ('{...log corrélé sérialisé en JSON...}') notes: > Le service utilise l'API native clickhouse-go/v2 (PrepareBatch + Append + Send). diff --git a/services/correlator/config.example.yml b/services/correlator/config.example.yml index 5ab8fed..6457555 100644 --- a/services/correlator/config.example.yml +++ b/services/correlator/config.example.yml @@ -25,8 +25,8 @@ outputs: clickhouse: enabled: false - dsn: clickhouse://user:pass@localhost:9000/db - table: correlated_logs_http_network + dsn: clickhouse://user:pass@localhost:9000/ja4_logs + table: http_logs_raw batch_size: 500 flush_interval_ms: 200 max_buffer_size: 5000 diff --git a/services/correlator/idees/views.md b/services/correlator/idees/views.md index ddecb3a..bb219b9 100644 --- a/services/correlator/idees/views.md +++ b/services/correlator/idees/views.md @@ -96,7 +96,7 @@ Cette section détaille les vues d'agrégation et de détection pour identifier ```sql -- Quel host est associé à cette IP/JA4 ? SELECT src_ip, ja4, host, total_hits, unique_paths, user_agent -FROM mabase_prod.view_host_identification +FROM ja4_processing.view_host_identification WHERE src_ip = '1.2.3.4' ORDER BY total_hits DESC; ``` @@ -105,7 +105,7 @@ ORDER BY total_hits DESC; ```sql -- Ce JA4 est-il utilisé par plusieurs hosts différents ? 
SELECT ja4, hosts, unique_hosts, unique_ips -FROM mabase_prod.view_host_ja4_anomalies +FROM ja4_processing.view_host_ja4_anomalies HAVING unique_hosts >= 3; -- Interprétation : 1 JA4 sur 3+ hosts = botnet cloné probable ``` @@ -114,7 +114,7 @@ HAVING unique_hosts >= 3; ```sql -- Cette IP change-t-elle de JA4 fréquemment ? SELECT src_ip, ja4s, unique_ja4s -FROM mabase_prod.view_host_ip_ja4_rotation +FROM ja4_processing.view_host_ip_ja4_rotation HAVING unique_ja4s >= 5; -- Interprétation : 1 IP avec 5+ JA4 différents = fingerprint spoofing ``` @@ -131,7 +131,7 @@ HAVING unique_ja4s >= 5; ```sql -- Détecter les tentatives de brute force sur les login SELECT window, src_ip, ja4, host, path, attempts, attempts_per_minute -FROM mabase_prod.view_bruteforce_post_detected +FROM ja4_processing.view_bruteforce_post_detected WHERE host = 'api.example.com' ORDER BY attempts DESC; @@ -147,7 +147,7 @@ ORDER BY attempts DESC; ```sql -- Détecter les requêtes avec query params hautement variables SELECT window, src_ip, ja4, host, path, requests, unique_query_patterns -FROM mabase_prod.view_form_bruteforce_detected +FROM ja4_processing.view_form_bruteforce_detected HAVING requests >= 20 AND unique_query_patterns >= 10; -- Interprétation : 20+ requêtes avec 10+ patterns query différents @@ -178,7 +178,7 @@ Exemple : `"Accept,Accept-Encoding,Sec-CH-UA,Sec-Fetch-Dest,User-Agent"` ```sql -- Navigateurs "modernes" avec headers manquants SELECT src_ip, ja4, header_user_agent, modern_browser_score, header_count -FROM mabase_prod.view_header_missing_modern_headers +FROM ja4_processing.view_header_missing_modern_headers WHERE header_user_agent ILIKE '%Chrome%'; -- Threshold : score < 70 pour Chrome/Firefox = suspect @@ -189,7 +189,7 @@ WHERE header_user_agent ILIKE '%Chrome%'; ```sql -- Même User-Agent avec ordre de headers différent SELECT header_user_agent, ja4, unique_hashes, unique_ips -FROM mabase_prod.view_header_ua_order_mismatch +FROM ja4_processing.view_header_ua_order_mismatch 
HAVING unique_hashes > 1; -- Interprétation : 1 UA avec 2+ ordres de headers = spoofing ou outil custom @@ -199,7 +199,7 @@ HAVING unique_hashes > 1; ```sql -- Clients avec trop peu de headers SELECT src_ip, ja4, header_count, header_user_agent -FROM mabase_prod.view_header_minimalist_count +FROM ja4_processing.view_header_minimalist_count WHERE header_count < 6; -- Threshold : < 6 headers = bot scripté (curl, Python requests, etc.) @@ -209,7 +209,7 @@ WHERE header_count < 6; ```sql -- Chrome sans Sec-CH-UA (impossible pour un vrai Chrome) SELECT src_ip, ja4, header_user_agent -FROM mabase_prod.view_header_sec_ch_missing +FROM ja4_processing.view_header_sec_ch_missing WHERE header_user_agent ILIKE '%Chrome/%'; ``` @@ -217,7 +217,7 @@ WHERE header_user_agent ILIKE '%Chrome/%'; ```sql -- Même ordre de headers sur 10+ IPs différentes SELECT header_order_hash, header_user_agent, unique_ips, total_hits -FROM mabase_prod.view_header_known_bot_signature +FROM ja4_processing.view_header_known_bot_signature HAVING unique_ips >= 10; -- Interprétation : 1 signature sur 10+ IPs = cluster de bots clonés @@ -243,7 +243,7 @@ ALPN (Application-Layer Protocol Negotiation) est une extension TLS qui négocie ```sql -- Clients déclarant h2 mais parlant HTTP/1.1 SELECT src_ip, ja4, declared_alpn, actual_http_version, mismatches, mismatch_pct -FROM mabase_prod.view_alpn_mismatch_detected +FROM ja4_processing.view_alpn_mismatch_detected HAVING mismatch_pct >= 80; -- Threshold : ≥5 requêtes avec ≥80% d'incohérence @@ -262,7 +262,7 @@ HAVING mismatch_pct >= 80; ```sql -- IPs dépassant 50 requêtes/minute SELECT minute, src_ip, ja4, requests_per_min, unique_paths -FROM mabase_prod.view_rate_limit_exceeded +FROM ja4_processing.view_rate_limit_exceeded ORDER BY requests_per_min DESC; -- Threshold : > 50 req/min = trafic automatisé @@ -277,7 +277,7 @@ ORDER BY requests_per_min DESC; ```sql -- Pics soudains de trafic SELECT window, src_ip, ja4, burst_count -FROM mabase_prod.view_burst_detected 
+FROM ja4_processing.view_burst_detected HAVING burst_count > 20; -- Threshold : > 20 requêtes en 10 secondes = burst suspect @@ -294,7 +294,7 @@ HAVING burst_count > 20; ```sql -- Détection de scanning de paths sensibles SELECT window, src_ip, ja4, host, sensitive_hits, sensitive_ratio -FROM mabase_prod.view_path_scan_detected +FROM ja4_processing.view_path_scan_detected HAVING sensitive_hits >= 5; -- Paths surveillés : admin, backup, config, .env, .git, wp-admin, @@ -331,7 +331,7 @@ HAVING sensitive_hits >= 5; -- Toutes les tentatives d'injection SELECT window, src_ip, ja4, host, path, sqli_attempts, xss_attempts, traversal_attempts -FROM mabase_prod.view_payload_attacks_detected +FROM ja4_processing.view_payload_attacks_detected ORDER BY sqli_attempts DESC, xss_attempts DESC, traversal_attempts DESC; -- Threshold : ≥1 tentative = alerte (zero tolerance) @@ -351,7 +351,7 @@ Un vrai navigateur a un fingerprint TLS unique. Un bot déployé sur 100 machine ```sql -- JA4 partagé par 20+ IPs différentes SELECT ja4, ja3_hash, unique_ips, unique_asns, unique_countries, total_hits -FROM mabase_prod.view_ja4_botnet_suspected +FROM ja4_processing.view_ja4_botnet_suspected HAVING unique_ips >= 20; -- Threshold : ≥20 IPs avec le même JA4 = botnet cloné @@ -380,7 +380,7 @@ Mesure le ratio d'événements non-corrélés (orphelins). Un trafic légitime a ```sql -- Trafic avec >80% d'événements non-corrélés SELECT hour, src_ip, ja4, host, correlated, orphans, orphan_pct -FROM mabase_prod.view_high_orphan_ratio +FROM ja4_processing.view_high_orphan_ratio ORDER BY orphan_pct DESC; -- Threshold : orphan_pct > 80% = trafic suspect @@ -440,13 +440,13 @@ Un IP bloquée par erreur retrouvera un score normal après expiration du TTL. 
```sql WITH threats AS ( SELECT src_ip, ja4, 'bruteforce' AS type, sum(attempts) AS score - FROM mabase_prod.view_bruteforce_post_detected GROUP BY src_ip, ja4 + FROM ja4_processing.view_bruteforce_post_detected GROUP BY src_ip, ja4 UNION ALL SELECT src_ip, ja4, 'path_scan', sum(sensitive_hits) - FROM mabase_prod.view_path_scan_detected GROUP BY src_ip, ja4 + FROM ja4_processing.view_path_scan_detected GROUP BY src_ip, ja4 UNION ALL SELECT src_ip, ja4, 'payload', sum(sqli_attempts + xss_attempts) - FROM mabase_prod.view_payload_attacks_detected GROUP BY src_ip, ja4 + FROM ja4_processing.view_payload_attacks_detected GROUP BY src_ip, ja4 ) SELECT src_ip, ja4, sum(score) AS total_score, groupArray(type) AS threat_types FROM threats @@ -462,7 +462,7 @@ SELECT host, countMerge(hits) AS requests, uniqMerge(uniq_paths) AS unique_paths -FROM mabase_prod.agg_host_ip_ja4_1h +FROM ja4_processing.agg_host_ip_ja4_1h WHERE src_ip = '1.2.3.4' AND hour >= now() - INTERVAL 24 HOUR GROUP BY hour, host @@ -476,7 +476,7 @@ SELECT any(first_ua) AS user_agent, groupArray(DISTINCT host) AS hosts, sum(countMerge(hits)) AS total_requests -FROM mabase_prod.agg_host_ip_ja4_1h +FROM ja4_processing.agg_host_ip_ja4_1h WHERE hour >= now() - INTERVAL 1 HOUR GROUP BY ja4 ORDER BY total_requests DESC @@ -496,13 +496,13 @@ clickhouse-client --multiquery < sql/hosts.sql ### Vérification ```sql -- Compter les enregistrements -SELECT count(*) FROM mabase_prod.agg_host_ip_ja4_1h; -SELECT count(*) FROM mabase_prod.agg_header_fingerprint_1h; +SELECT count(*) FROM ja4_processing.agg_host_ip_ja4_1h; +SELECT count(*) FROM ja4_processing.agg_header_fingerprint_1h; -- Tester les vues -SELECT * FROM mabase_prod.view_host_identification LIMIT 10; -SELECT * FROM mabase_prod.view_bruteforce_post_detected LIMIT 10; -SELECT * FROM mabase_prod.view_payload_attacks_detected LIMIT 10; +SELECT * FROM ja4_processing.view_host_identification LIMIT 10; +SELECT * FROM ja4_processing.view_bruteforce_post_detected LIMIT 10; 
+SELECT * FROM ja4_processing.view_payload_attacks_detected LIMIT 10; ``` ### Monitoring @@ -510,12 +510,12 @@ SELECT * FROM mabase_prod.view_payload_attacks_detected LIMIT 10; -- Vues les plus actives (dernière heure) SELECT 'bruteforce_post' AS view_name, count() AS alerts -FROM mabase_prod.view_bruteforce_post_detected +FROM ja4_processing.view_bruteforce_post_detected UNION ALL -SELECT 'path_scan', count() FROM mabase_prod.view_path_scan_detected +SELECT 'path_scan', count() FROM ja4_processing.view_path_scan_detected UNION ALL -SELECT 'payload_attacks', count() FROM mabase_prod.view_payload_attacks_detected +SELECT 'payload_attacks', count() FROM ja4_processing.view_payload_attacks_detected UNION ALL -SELECT 'ja4_botnet', count() FROM mabase_prod.view_ja4_botnet_suspected +SELECT 'ja4_botnet', count() FROM ja4_processing.view_ja4_botnet_suspected ORDER BY alerts DESC; ``` diff --git a/services/correlator/sql/bots.sql b/services/correlator/sql/bots.sql index 2e1e4ad..3589f28 100644 --- a/services/correlator/sql/bots.sql +++ b/services/correlator/sql/bots.sql @@ -1,6 +1,6 @@ -DROP TABLE IF EXISTS mabase_prod.ref_bot_networks; +DROP TABLE IF EXISTS ja4_processing.ref_bot_networks; -CREATE TABLE mabase_prod.ref_bot_networks ( +CREATE TABLE ja4_processing.ref_bot_networks ( -- On utilise IPv6CIDR car il accepte aussi les IPv4 au format ::ffff:1.2.3.4/120 network IPv6CIDR, bot_name LowCardinality(String), @@ -11,11 +11,11 @@ ORDER BY (network, bot_name); -- Création de la table lisant le fichier des IPs -CREATE TABLE mabase_prod.bot_ip ( +CREATE TABLE ja4_processing.bot_ip ( ip String ) ENGINE = File(CSV, 'bot_ip.csv'); -- Création de la table lisant le fichier des signatures JA4 -CREATE TABLE mabase_prod.bot_ja4 ( +CREATE TABLE ja4_processing.bot_ja4 ( ja4 String ) ENGINE = File(CSV, 'bot_ja4.csv'); diff --git a/services/correlator/sql/init.sql b/services/correlator/sql/init.sql index 1765a9a..66714b5 100644 --- a/services/correlator/sql/init.sql +++ 
b/services/correlator/sql/init.sql @@ -11,13 +11,14 @@ -- ----------------------------------------------------------------------------- -- Base de données -- ----------------------------------------------------------------------------- -CREATE DATABASE IF NOT EXISTS mabase_prod; +CREATE DATABASE IF NOT EXISTS ja4_logs; +CREATE DATABASE IF NOT EXISTS ja4_processing; -- ----------------------------------------------------------------------------- -- Table brute : cible directe des inserts du service -- Le service n'insère que dans cette table (colonne raw_json). -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.http_logs_raw +CREATE TABLE IF NOT EXISTS ja4_logs.http_logs_raw ( `raw_json` String CODEC(ZSTD(3)), `ingest_time` DateTime DEFAULT now() @@ -34,7 +35,7 @@ SETTINGS -- Table parsée : alimentée automatiquement par la vue matérialisée -- ----------------------------------------------------------------------------- -CREATE TABLE mabase_prod.http_logs +CREATE TABLE ja4_logs.http_logs ( -- Temporel `time` DateTime, @@ -118,10 +119,10 @@ SETTINGS -- ----------------------------------------------------------------------------- -- Vue matérialisée : parse le JSON de http_logs_raw vers http_logs -- ----------------------------------------------------------------------------- -DROP VIEW IF EXISTS mabase_prod.mv_http_logs; +DROP VIEW IF EXISTS ja4_logs.mv_http_logs; -CREATE MATERIALIZED VIEW IF NOT EXISTS mabase_prod.mv_http_logs -TO mabase_prod.http_logs +CREATE MATERIALIZED VIEW IF NOT EXISTS ja4_logs.mv_http_logs +TO ja4_logs.http_logs AS SELECT parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time, @@ -133,31 +134,31 @@ SELECT toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, dictGetOrDefault( - 'mabase_prod.dict_iplocate_asn', + 'ja4_processing.dict_iplocate_asn', 'asn', IPv4ToIPv6(IPv4StringToNum(toString(src_ip))), 
toUInt32(0) ) AS src_asn, dictGetOrDefault( - 'mabase_prod.dict_iplocate_asn', + 'ja4_processing.dict_iplocate_asn', 'country_code', IPv4ToIPv6(IPv4StringToNum(toString(src_ip))), '' ) AS src_country_code, dictGetOrDefault( - 'mabase_prod.dict_iplocate_asn', + 'ja4_processing.dict_iplocate_asn', 'name', IPv4ToIPv6(IPv4StringToNum(toString(src_ip))), '' ) AS src_as_name, dictGetOrDefault( - 'mabase_prod.dict_iplocate_asn', + 'ja4_processing.dict_iplocate_asn', 'org', IPv4ToIPv6(IPv4StringToNum(toString(src_ip))), '' ) AS src_org, dictGetOrDefault( - 'mabase_prod.dict_iplocate_asn', + 'ja4_processing.dict_iplocate_asn', 'domain', IPv4ToIPv6(IPv4StringToNum(toString(src_ip))), '' @@ -211,7 +212,7 @@ SELECT coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Mode'), '') AS header_sec_fetch_mode, coalesce(JSONExtractString(raw_json, 'header_Sec-Fetch-Site'), '') AS header_sec_fetch_site -FROM mabase_prod.http_logs_raw; +FROM ja4_logs.http_logs_raw; -- ----------------------------------------------------------------------------- -- Utilisateurs et permissions @@ -220,15 +221,15 @@ CREATE USER IF NOT EXISTS data_writer IDENTIFIED WITH plaintext_password BY 'Cha CREATE USER IF NOT EXISTS analyst IDENTIFIED WITH plaintext_password BY 'ChangeMe'; -- data_writer : INSERT uniquement sur la table brute -GRANT INSERT ON mabase_prod.http_logs_raw TO data_writer; -GRANT SELECT ON mabase_prod.http_logs_raw TO data_writer; +GRANT INSERT ON ja4_logs.http_logs_raw TO data_writer; +GRANT SELECT ON ja4_logs.http_logs_raw TO data_writer; -- analyst : lecture sur la table parsée -GRANT SELECT ON mabase_prod.http_logs TO analyst; +GRANT SELECT ON ja4_logs.http_logs TO analyst; -- ----------------------------------------------------------------------------- -- Vérifications post-installation -- ----------------------------------------------------------------------------- --- SELECT count(*), min(ingest_time), max(ingest_time) FROM mabase_prod.http_logs_raw; --- SELECT count(*), 
min(time), max(time) FROM mabase_prod.http_logs; --- SELECT time, src_ip, dst_ip, method, host, path, ja4 FROM mabase_prod.http_logs ORDER BY time DESC LIMIT 10; +-- SELECT count(*), min(ingest_time), max(ingest_time) FROM ja4_logs.http_logs_raw; +-- SELECT count(*), min(time), max(time) FROM ja4_logs.http_logs; +-- SELECT time, src_ip, dst_ip, method, host, path, ja4 FROM ja4_logs.http_logs ORDER BY time DESC LIMIT 10; diff --git a/services/correlator/sql/tables.sql b/services/correlator/sql/tables.sql index 98a812b..ce4f597 100644 --- a/services/correlator/sql/tables.sql +++ b/services/correlator/sql/tables.sql @@ -1,6 +1,6 @@ -DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn; +DROP DICTIONARY IF EXISTS ja4_processing.dict_iplocate_asn; -CREATE DICTIONARY IF NOT EXISTS mabase_prod.dict_iplocate_asn +CREATE DICTIONARY IF NOT EXISTS ja4_processing.dict_iplocate_asn ( network String, asn UInt32, @@ -17,10 +17,10 @@ LIFETIME(MIN 3600 MAX 7200); -- Suppression si existe pour reconfiguration -DROP TABLE IF EXISTS mabase_prod.ref_bot_networks; +DROP TABLE IF EXISTS ja4_processing.ref_bot_networks; -- Table optimisée pour le filtrage binaire de CIDR -CREATE TABLE mabase_prod.ref_bot_networks ( +CREATE TABLE ja4_processing.ref_bot_networks ( network IPv6CIDR, -- Gère nativement '1.2.3.0/24' et '2001:db8::/32' bot_name LowCardinality(String), is_legitimate UInt8, -- 1 = Whitelist, 0 = Blacklist diff --git a/services/dashboard/.env.example b/services/dashboard/.env.example index 3acc696..6138dde 100644 --- a/services/dashboard/.env.example +++ b/services/dashboard/.env.example @@ -1,7 +1,9 @@ # dashboard configuration — DO NOT COMMIT real values CLICKHOUSE_HOST=clickhouse CLICKHOUSE_PORT=8123 -CLICKHOUSE_DB=mabase_prod +CLICKHOUSE_DB=ja4_processing +CLICKHOUSE_DB_LOGS=ja4_logs +CLICKHOUSE_DB_PROCESSING=ja4_processing CLICKHOUSE_USER=analyst CLICKHOUSE_PASSWORD= API_HOST=0.0.0.0 diff --git a/services/dashboard/.github/copilot-instructions.md 
b/services/dashboard/.github/copilot-instructions.md index 5d1352b..c4af42d 100644 --- a/services/dashboard/.github/copilot-instructions.md +++ b/services/dashboard/.github/copilot-instructions.md @@ -4,7 +4,7 @@ This is a **SOC (Security Operations Center) dashboard** for visualizing bot detections from an upstream `bot_detector_ai` service. It is a **single-service, full-stack app**: the FastAPI backend serves the built React frontend as static files *and* exposes a REST API, all on port 8000. There is no separate frontend server in production and **no authentication**. -**Data source:** ClickHouse database (`mabase_prod`), primarily the `ml_detected_anomalies` table and the `view_dashboard_entities` view. +**Data source:** ClickHouse database (`ja4_processing`), primarily the `ml_detected_anomalies` table and the `view_dashboard_entities` view. ``` dashboard/ @@ -95,7 +95,7 @@ curl "http://localhost:8000/api/detections?page=1&page_size=5" | jq '.items | le |---|---|---| | `CLICKHOUSE_HOST` | `clickhouse` | ClickHouse hostname | | `CLICKHOUSE_PORT` | `8123` | ClickHouse HTTP port (set in code) | -| `CLICKHOUSE_DB` | `mabase_prod` | Database name | +| `CLICKHOUSE_DB` | `ja4_processing` | Database name | | `CLICKHOUSE_USER` | `admin` | | | `CLICKHOUSE_PASSWORD` | `` | | | `API_HOST` | `0.0.0.0` | Uvicorn bind host | @@ -111,4 +111,4 @@ curl "http://localhost:8000/api/detections?page=1&page_size=5" | jq '.items | le | `ml_detected_anomalies` | Primary source for detections, metrics, variability, analysis | | `view_dashboard_entities` | User agents, client headers, paths, query params (entities routes) | | `classifications` | SOC analyst classifications (created by `analysis.py`) | -| `mabase_prod.audit_logs` | Audit trail (optional — missing table is handled silently) | +| `ja4_processing.audit_logs` | Audit trail (optional — missing table is handled silently) | diff --git a/services/dashboard/README.md b/services/dashboard/README.md index 7648988..1998acf 100644 
--- a/services/dashboard/README.md +++ b/services/dashboard/README.md @@ -241,7 +241,7 @@ curl http://localhost:8000/api/reputation/ip/162.55.94.175 | jq |----------|--------|-------------| | `CLICKHOUSE_HOST` | `clickhouse` | Hôte ClickHouse | | `CLICKHOUSE_PORT` | `8123` | Port HTTP ClickHouse | -| `CLICKHOUSE_DB` | `mabase_prod` | Base de données | +| `CLICKHOUSE_DB` | `ja4_processing` | Base de données | | `CLICKHOUSE_USER` | `admin` | Utilisateur | | `CLICKHOUSE_PASSWORD` | `` | Mot de passe | | `API_HOST` | `0.0.0.0` | Bind Uvicorn | @@ -339,11 +339,11 @@ Déflation Hotelling : retire PC1 de X avant de calculer PC2 | Table / Vue | Routes | |---|---| -| `mabase_prod.ml_detected_anomalies` | metrics, detections, variability, analysis, clustering | -| `mabase_prod.agg_host_ip_ja4_1h` | tcp_spoofing, clustering, investigation_summary | -| `mabase_prod.view_dashboard_entities` | entities (UA, JA4, paths, query params) | -| `mabase_prod.classifications` | analysis (classifications SOC manuelles) | -| `mabase_prod.audit_logs` | audit (optionnel — silencieux si absent) | +| `ja4_processing.ml_detected_anomalies` | metrics, detections, variability, analysis, clustering | +| `ja4_processing.agg_host_ip_ja4_1h` | tcp_spoofing, clustering, investigation_summary | +| `ja4_processing.view_dashboard_entities` | entities (UA, JA4, paths, query params) | +| `ja4_processing.classifications` | analysis (classifications SOC manuelles) | +| `ja4_processing.audit_logs` | audit (optionnel — silencieux si absent) | **Conventions SQL :** - IPs stockées en IPv6-mappé : `replaceRegexpAll(toString(src_ip), '^::ffff:', '')` @@ -477,15 +477,15 @@ curl -s http://localhost:3000 | head -20 ```bash # Compter les détections (24h) -docker compose exec clickhouse clickhouse-client -d mabase_prod -q \ +docker compose exec clickhouse clickhouse-client -d ja4_processing -q \ "SELECT count() FROM ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR" # Voir un échantillon -docker 
compose exec clickhouse clickhouse-client -d mabase_prod -q \ +docker compose exec clickhouse clickhouse-client -d ja4_processing -q \ "SELECT src_ip, threat_level, model_name, detected_at FROM ml_detected_anomalies ORDER BY detected_at DESC LIMIT 5" # Vérifier les vues du dashboard -docker compose exec clickhouse clickhouse-client -d mabase_prod -q \ +docker compose exec clickhouse clickhouse-client -d ja4_processing -q \ "SELECT * FROM view_dashboard_summary" ``` @@ -524,7 +524,7 @@ docker compose up -d dashboard_web ```bash # 1. Vérifier qu'il y a des données dans ClickHouse -docker compose exec clickhouse clickhouse-client -d mabase_prod -q \ +docker compose exec clickhouse clickhouse-client -d ja4_processing -q \ "SELECT count() FROM ml_detected_anomalies WHERE detected_at >= now() - INTERVAL 24 HOUR" # Si le résultat est 0: diff --git a/services/dashboard/backend/config.py b/services/dashboard/backend/config.py index bf61b10..b057947 100644 --- a/services/dashboard/backend/config.py +++ b/services/dashboard/backend/config.py @@ -8,7 +8,9 @@ class Settings(BaseSettings): # ClickHouse CLICKHOUSE_HOST: str = "clickhouse" CLICKHOUSE_PORT: int = 8123 - CLICKHOUSE_DB: str = "mabase_prod" + CLICKHOUSE_DB: str = "ja4_processing" # default connection database + CLICKHOUSE_DB_LOGS: str = "ja4_logs" + CLICKHOUSE_DB_PROCESSING: str = "ja4_processing" CLICKHOUSE_USER: str = "admin" CLICKHOUSE_PASSWORD: str = "" diff --git a/services/dashboard/backend/main.py b/services/dashboard/backend/main.py index 924e72f..d9e010a 100644 --- a/services/dashboard/backend/main.py +++ b/services/dashboard/backend/main.py @@ -139,7 +139,7 @@ app = FastAPI( title="Bot Detector Dashboard API", description=( "API REST du **Bot Detector SOC Dashboard**.\n\n" - "Permet d'interroger la base ClickHouse (`mabase_prod`) pour visualiser et analyser " + "Permet d'interroger les bases ClickHouse (`ja4_logs` / `ja4_processing`) pour visualiser et analyser " "les détections de bots générées par le 
service `bot_detector_ai`.\n\n" "**Endpoint clé :** `GET /api/investigation/{ip}/summary` — synthèse complète en un appel.\n\n" "Documentation interactive : `/docs` (Swagger UI) · `/redoc` (ReDoc)" diff --git a/services/dashboard/docker-compose.yaml b/services/dashboard/docker-compose.yaml index a8cd08b..3f310dd 100644 --- a/services/dashboard/docker-compose.yaml +++ b/services/dashboard/docker-compose.yaml @@ -16,7 +16,9 @@ services: environment: # ClickHouse CLICKHOUSE_HOST: ${CLICKHOUSE_HOST:-clickhouse} - CLICKHOUSE_DB: ${CLICKHOUSE_DB:-mabase_prod} + CLICKHOUSE_DB: ${CLICKHOUSE_DB:-ja4_processing} + CLICKHOUSE_DB_LOGS: ${CLICKHOUSE_DB_LOGS:-ja4_logs} + CLICKHOUSE_DB_PROCESSING: ${CLICKHOUSE_DB_PROCESSING:-ja4_processing} CLICKHOUSE_USER: ${CLICKHOUSE_USER:-admin} CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-} diff --git a/shared/clickhouse/00_database.sql b/shared/clickhouse/00_database.sql index 881cf3f..6eeb4d6 100644 --- a/shared/clickhouse/00_database.sql +++ b/shared/clickhouse/00_database.sql @@ -1,4 +1,5 @@ -- ============================================================================= --- 00_database.sql — Create the production database +-- 00_database.sql — Create the production databases -- ============================================================================= -CREATE DATABASE IF NOT EXISTS mabase_prod; +CREATE DATABASE IF NOT EXISTS ja4_logs; +CREATE DATABASE IF NOT EXISTS ja4_processing; diff --git a/shared/clickhouse/01_raw_tables.sql b/shared/clickhouse/01_raw_tables.sql index 33b2cb1..d2c0fc6 100644 --- a/shared/clickhouse/01_raw_tables.sql +++ b/shared/clickhouse/01_raw_tables.sql @@ -2,7 +2,7 @@ -- 01_raw_tables.sql — Raw ingest table (direct target for logcorrelator inserts) -- ============================================================================= -CREATE TABLE IF NOT EXISTS mabase_prod.http_logs_raw +CREATE TABLE IF NOT EXISTS ja4_logs.http_logs_raw ( `raw_json` String CODEC(ZSTD(3)), `ingest_time` DateTime DEFAULT now() diff 
--git a/shared/clickhouse/02_dictionaries.sql b/shared/clickhouse/02_dictionaries.sql index 09b247c..b49f692 100644 --- a/shared/clickhouse/02_dictionaries.sql +++ b/shared/clickhouse/02_dictionaries.sql @@ -6,9 +6,9 @@ -- IPLocate ASN geo-location dictionary -- CSV file must be placed at: /var/lib/clickhouse/user_files/iplocate-ip-to-asn.csv -- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn; +DROP DICTIONARY IF EXISTS ja4_processing.dict_iplocate_asn; -CREATE DICTIONARY IF NOT EXISTS mabase_prod.dict_iplocate_asn +CREATE DICTIONARY IF NOT EXISTS ja4_processing.dict_iplocate_asn ( network String, asn UInt32, @@ -26,7 +26,7 @@ LIFETIME(MIN 3600 MAX 7200); -- ----------------------------------------------------------------------------- -- Bot network CIDR reference table (ReplacingMergeTree for upserts) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.ref_bot_networks +CREATE TABLE IF NOT EXISTS ja4_processing.ref_bot_networks ( network IPv6CIDR, bot_name LowCardinality(String), @@ -41,7 +41,7 @@ ORDER BY (network, bot_name); -- Bot IP flat table (CSV file engine) -- CSV file must be placed at: /var/lib/clickhouse/user_files/bot_ip.csv -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.bot_ip +CREATE TABLE IF NOT EXISTS ja4_processing.bot_ip ( ip String ) @@ -52,7 +52,7 @@ ENGINE = File(CSV, 'bot_ip.csv'); -- Bot JA4 flat table (CSV file engine) -- CSV file must be placed at: /var/lib/clickhouse/user_files/bot_ja4.csv -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.bot_ja4 +CREATE TABLE IF NOT EXISTS ja4_processing.bot_ja4 ( ja4 String ) diff --git a/shared/clickhouse/03_anubis_tables.sql b/shared/clickhouse/03_anubis_tables.sql index f6c4f8c..76b994f 100644 --- 
a/shared/clickhouse/03_anubis_tables.sql +++ b/shared/clickhouse/03_anubis_tables.sql @@ -6,7 +6,7 @@ -- ----------------------------------------------------------------------------- -- 1. TABLE SOURCE — User-Agent rules (for REGEXP_TREE dictionary) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ua_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules ( id UInt64, parent_id UInt64, @@ -21,7 +21,7 @@ ORDER BY id; -- ----------------------------------------------------------------------------- -- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_ip_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules ( prefix String, bot_name LowCardinality(String), @@ -36,29 +36,29 @@ ORDER BY prefix; -- ----------------------------------------------------------------------------- -- 3. DICTIONARY — UA REGEXP_TREE --- dictGet('mabase_prod.dict_anubis_ua', 'bot_name', header_user_agent) +-- dictGet('ja4_processing.dict_anubis_ua', 'bot_name', header_user_agent) -- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. 
-- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ua; -CREATE DICTIONARY mabase_prod.dict_anubis_ua +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ua; +CREATE DICTIONARY ja4_processing.dict_anubis_ua ( regexp String, bot_name String, action String ) PRIMARY KEY regexp -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_ua_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ua_rules')) LAYOUT(REGEXP_TREE) LIFETIME(MIN 300 MAX 600); -- ----------------------------------------------------------------------------- -- 4. DICTIONARY — IP IP_TRIE --- dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') +-- dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', toIPv6(src_ip), '') -- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. -- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_ip; -CREATE DICTIONARY mabase_prod.dict_anubis_ip +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_ip; +CREATE DICTIONARY ja4_processing.dict_anubis_ip ( prefix String, bot_name String, @@ -68,7 +68,7 @@ CREATE DICTIONARY mabase_prod.dict_anubis_ip category String ) PRIMARY KEY prefix -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_ip_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_ip_rules')) LAYOUT(IP_TRIE()) LIFETIME(MIN 300 MAX 600); @@ -76,7 +76,7 @@ LIFETIME(MIN 300 MAX 600); -- ----------------------------------------------------------------------------- -- 5. 
TABLE SOURCE — ASN rules (for Flat dictionary) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_asn_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_asn_rules ( asn UInt32, bot_name LowCardinality(String), @@ -90,7 +90,7 @@ ORDER BY asn; -- ----------------------------------------------------------------------------- -- 6. TABLE SOURCE — Country rules (for Flat dictionary) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.anubis_country_rules +CREATE TABLE IF NOT EXISTS ja4_processing.anubis_country_rules ( country_code LowCardinality(String), bot_name LowCardinality(String), @@ -103,11 +103,11 @@ ORDER BY country_code; -- ----------------------------------------------------------------------------- -- 7. DICTIONARY — ASN Flat --- dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', src_asn, '') +-- dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', src_asn, '') -- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. -- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_asn; -CREATE DICTIONARY mabase_prod.dict_anubis_asn +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_asn; +CREATE DICTIONARY ja4_processing.dict_anubis_asn ( asn UInt32, bot_name String, @@ -115,18 +115,18 @@ CREATE DICTIONARY mabase_prod.dict_anubis_asn category String ) PRIMARY KEY asn -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_asn_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_asn_rules')) LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); -- ----------------------------------------------------------------------------- -- 8. 
DICTIONARY — Country Flat --- dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', src_country_code, '') +-- dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', src_country_code, '') -- NOTE: Change 'CHANGE_ME' to the actual ClickHouse admin password before use. -- ----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_anubis_country; -CREATE DICTIONARY mabase_prod.dict_anubis_country +DROP DICTIONARY IF EXISTS ja4_processing.dict_anubis_country; +CREATE DICTIONARY ja4_processing.dict_anubis_country ( country_code String, bot_name String, @@ -134,6 +134,6 @@ CREATE DICTIONARY mabase_prod.dict_anubis_country category String ) PRIMARY KEY country_code -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'mabase_prod' TABLE 'anubis_country_rules')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'admin' PASSWORD 'CHANGE_ME' DB 'ja4_processing' TABLE 'anubis_country_rules')) LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 600); diff --git a/shared/clickhouse/04_mv_http_logs.sql b/shared/clickhouse/04_mv_http_logs.sql index ca711a6..0810f55 100644 --- a/shared/clickhouse/04_mv_http_logs.sql +++ b/shared/clickhouse/04_mv_http_logs.sql @@ -11,7 +11,7 @@ -- http_logs — parsed destination table (populated by mv_http_logs) -- Includes Anubis enrichment columns added by deploy_schema.sql items 9+ -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.http_logs +CREATE TABLE IF NOT EXISTS ja4_logs.http_logs ( -- Temporal `time` DateTime, @@ -104,16 +104,16 @@ SETTINGS -- > (4) ASN > (5) Country -- Uses WITH clause CTEs for clean intermediate values. 
-- ----------------------------------------------------------------------------- -DROP VIEW IF EXISTS mabase_prod.mv_http_logs; +DROP VIEW IF EXISTS ja4_logs.mv_http_logs; -CREATE MATERIALIZED VIEW mabase_prod.mv_http_logs -TO mabase_prod.http_logs +CREATE MATERIALIZED VIEW ja4_logs.mv_http_logs +TO ja4_logs.http_logs AS WITH coalesce(JSONExtractString(raw_json, 'header_User-Agent'), '') AS _ua, toIPv6(toIPv4(coalesce(JSONExtractString(raw_json, 'src_ip'), '0.0.0.0'))) AS _ip, - toUInt32(dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'country_code', _ip, '') AS _cc + toUInt32(dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'asn', _ip, toUInt32(0))) AS _asn, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'country_code', _ip, '') AS _cc SELECT parseDateTimeBestEffort(coalesce(JSONExtractString(raw_json, 'time'), '1970-01-01T00:00:00Z')) AS time, toDate(time) AS log_date, @@ -123,9 +123,9 @@ SELECT _cc AS src_country_code, toIPv4(coalesce(JSONExtractString(raw_json, 'dst_ip'), '0.0.0.0')) AS dst_ip, toUInt16(coalesce(JSONExtractUInt(raw_json, 'dst_port'), 0)) AS dst_port, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'name', _ip, '') AS src_as_name, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'org', _ip, '') AS src_org, - dictGetOrDefault('mabase_prod.dict_iplocate_asn', 'domain', _ip, '') AS src_domain, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'name', _ip, '') AS src_as_name, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'org', _ip, '') AS src_org, + dictGetOrDefault('ja4_processing.dict_iplocate_asn', 'domain', _ip, '') AS src_domain, coalesce(JSONExtractString(raw_json, 'method'), '') AS method, coalesce(JSONExtractString(raw_json, 'scheme'), '') AS scheme, coalesce(JSONExtractString(raw_json, 'host'), '') AS host, @@ -172,63 +172,63 @@ SELECT -- Anubis enrichment: combined UA+IP priority logic -- Priority: (1) UA+IP [same 
rule_id] > (2) UA only > (3) IP only > (4) ASN > (5) Country CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 
+ AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') ELSE '' END AS anubis_bot_name, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', _asn, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND 
toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', _cc, '') ELSE '' END AS anubis_bot_action, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', _ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', _ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', _ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', _ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', _ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', _ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', _asn, '') != '' - THEN 
dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', _asn, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', _cc, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', _cc, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', _ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', _ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', _ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', _ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', _ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', _ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', _ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', _ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', _asn, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', _asn, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', _cc, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', _cc, '') ELSE '' END AS anubis_bot_category -FROM mabase_prod.http_logs_raw; +FROM ja4_logs.http_logs_raw; diff --git a/shared/clickhouse/05_aggregation_tables.sql b/shared/clickhouse/05_aggregation_tables.sql index 62e02cb..e0cfec6 100644 --- a/shared/clickhouse/05_aggregation_tables.sql +++ b/shared/clickhouse/05_aggregation_tables.sql @@ -7,8 +7,8 @@ -- Bot reputation dictionaries (in-RAM for fast lookup) -- CSV files must be placed at: /var/lib/clickhouse/user_files/ -- 
----------------------------------------------------------------------------- -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ip; -CREATE DICTIONARY mabase_prod.dict_bot_ip +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ip; +CREATE DICTIONARY ja4_processing.dict_bot_ip ( prefix String, bot_name String @@ -18,8 +18,8 @@ SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ip.csv' format 'CSV')) LAYOUT(IP_TRIE()) LIFETIME(MIN 300 MAX 300); -DROP DICTIONARY IF EXISTS mabase_prod.dict_bot_ja4; -CREATE DICTIONARY mabase_prod.dict_bot_ja4 +DROP DICTIONARY IF EXISTS ja4_processing.dict_bot_ja4; +CREATE DICTIONARY ja4_processing.dict_bot_ja4 ( ja4 String, bot_name String @@ -29,8 +29,8 @@ SOURCE(FILE(path '/var/lib/clickhouse/user_files/bot_ja4.csv' format 'CSV')) LAYOUT(COMPLEX_KEY_HASHED()) LIFETIME(MIN 300 MAX 300); -DROP DICTIONARY IF EXISTS mabase_prod.dict_asn_reputation; -CREATE DICTIONARY mabase_prod.dict_asn_reputation +DROP DICTIONARY IF EXISTS ja4_processing.dict_asn_reputation; +CREATE DICTIONARY ja4_processing.dict_asn_reputation ( src_asn UInt64, label String @@ -44,7 +44,7 @@ LIFETIME(MIN 300 MAX 300); -- ----------------------------------------------------------------------------- -- agg_host_ip_ja4_1h — behavioral aggregation (L4/L5/L7) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.agg_host_ip_ja4_1h +CREATE TABLE IF NOT EXISTS ja4_processing.agg_host_ip_ja4_1h ( window_start DateTime, src_ip IPv6, ja4 String, host String, src_asn UInt32, @@ -107,10 +107,10 @@ ORDER BY (window_start, src_ip, ja4, host); -- ----------------------------------------------------------------------------- -- mv_agg_host_ip_ja4_1h — feeds agg_host_ip_ja4_1h from http_logs -- ----------------------------------------------------------------------------- -DROP VIEW IF EXISTS mabase_prod.mv_agg_host_ip_ja4_1h; +DROP VIEW IF EXISTS ja4_processing.mv_agg_host_ip_ja4_1h; -CREATE MATERIALIZED VIEW 
mabase_prod.mv_agg_host_ip_ja4_1h -TO mabase_prod.agg_host_ip_ja4_1h AS +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_host_ip_ja4_1h +TO ja4_processing.agg_host_ip_ja4_1h AS SELECT toStartOfHour(src.time) AS window_start, toIPv6(src.src_ip) AS src_ip, src.ja4, src.host, src.src_asn, @@ -151,14 +151,14 @@ SELECT sum(toUInt64(src.correlated)) AS count_correlated, sum(IF(length(src.header_accept_encoding) = 0, 1, 0)) AS count_no_accept_enc, sum(IF(src.scheme = 'http', 1, 0)) AS count_http_scheme -FROM mabase_prod.http_logs AS src +FROM ja4_logs.http_logs AS src GROUP BY window_start, src_ip, ja4, host, src_asn; -- ----------------------------------------------------------------------------- -- agg_header_fingerprint_1h — header fingerprint aggregation (L7) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.agg_header_fingerprint_1h +CREATE TABLE IF NOT EXISTS ja4_processing.agg_header_fingerprint_1h ( window_start DateTime, src_ip IPv6, @@ -175,10 +175,10 @@ CREATE TABLE IF NOT EXISTS mabase_prod.agg_header_fingerprint_1h ENGINE = AggregatingMergeTree() ORDER BY (window_start, src_ip); -DROP VIEW IF EXISTS mabase_prod.mv_agg_header_fingerprint_1h; +DROP VIEW IF EXISTS ja4_processing.mv_agg_header_fingerprint_1h; -CREATE MATERIALIZED VIEW mabase_prod.mv_agg_header_fingerprint_1h -TO mabase_prod.agg_header_fingerprint_1h AS +CREATE MATERIALIZED VIEW ja4_processing.mv_agg_header_fingerprint_1h +TO ja4_processing.agg_header_fingerprint_1h AS SELECT toStartOfHour(src.time) AS window_start, toIPv6(src.src_ip) AS src_ip, @@ -191,5 +191,5 @@ SELECT max(toUInt8(if((position(src.header_user_agent, 'Windows') > 0 AND position(src.header_sec_ch_ua_platform, 'Windows') == 0) OR (position(src.header_user_agent, 'iPhone') > 0 AND position(src.header_sec_ch_ua_platform, 'iOS') == 0), 1, 0))) AS ua_ch_mismatch, any(src.header_sec_fetch_mode) AS sec_fetch_mode, any(src.header_sec_fetch_dest) AS sec_fetch_dest -FROM 
mabase_prod.http_logs AS src +FROM ja4_logs.http_logs AS src GROUP BY window_start, src.src_ip; diff --git a/shared/clickhouse/06_ml_tables.sql b/shared/clickhouse/06_ml_tables.sql index 387c6c2..d29fe43 100644 --- a/shared/clickhouse/06_ml_tables.sql +++ b/shared/clickhouse/06_ml_tables.sql @@ -6,7 +6,7 @@ -- ----------------------------------------------------------------------------- -- ml_detected_anomalies — anomaly detections above threat threshold -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.ml_detected_anomalies +CREATE TABLE IF NOT EXISTS ja4_processing.ml_detected_anomalies ( detected_at DateTime, src_ip IPv6, ja4 String, host String, bot_name String, anomaly_score Float32, threat_level String, model_name String, recurrence UInt32, @@ -42,7 +42,7 @@ TTL detected_at + INTERVAL 30 DAY; -- ----------------------------------------------------------------------------- -- ml_all_scores — all classifications (no threshold, for observability) -- ----------------------------------------------------------------------------- -CREATE TABLE IF NOT EXISTS mabase_prod.ml_all_scores +CREATE TABLE IF NOT EXISTS ja4_processing.ml_all_scores ( detected_at DateTime, window_start DateTime, @@ -78,7 +78,7 @@ SETTINGS index_granularity = 8192; -- ----------------------------------------------------------------------------- -- view_ip_recurrence — recurrence aggregation over ml_detected_anomalies -- ----------------------------------------------------------------------------- -CREATE OR REPLACE VIEW mabase_prod.view_ip_recurrence AS +CREATE OR REPLACE VIEW ja4_processing.view_ip_recurrence AS SELECT src_ip, count() AS recurrence, @@ -86,5 +86,5 @@ SELECT max(detected_at) AS last_seen, min(anomaly_score) AS worst_score, argMin(threat_level, anomaly_score) AS worst_threat_level -FROM mabase_prod.ml_detected_anomalies +FROM ja4_processing.ml_detected_anomalies GROUP BY src_ip; diff --git 
a/shared/clickhouse/07_ai_features_view.sql b/shared/clickhouse/07_ai_features_view.sql index 713bc38..a5e9745 100644 --- a/shared/clickhouse/07_ai_features_view.sql +++ b/shared/clickhouse/07_ai_features_view.sql @@ -4,75 +4,75 @@ -- Includes combined UA+IP priority logic and Anubis bot_name/action/category. -- ============================================================================= -CREATE OR REPLACE VIEW mabase_prod.view_ai_features_1h AS +CREATE OR REPLACE VIEW ja4_processing.view_ai_features_1h AS WITH base_data AS ( SELECT a.window_start, a.src_ip, a.ja4, a.host, toString(a.src_asn) AS asn_number, a.src_as_name AS asn_org, a.src_org AS asn_detail, a.src_domain AS asn_domain, a.src_country_code AS country_code, - dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, + dictGetOrDefault('ja4_processing.dict_asn_reputation', 'label', toUInt64(a.src_asn), 'unknown') AS asn_label, COALESCE( - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), - nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ip', 'bot_name', a.src_ip, ''), ''), + nullIf(dictGetOrDefault('ja4_processing.dict_bot_ja4', 'bot_name', tuple(a.ja4), ''), ''), '' ) AS bot_name, -- Anubis: combined UA+IP priority logic > UA only > IP only > ASN > Country CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND 
dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') + WHEN 
dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') ELSE '' END AS anubis_bot_name, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'action', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'action', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'action', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN 
dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'action', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'action', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'action', toUInt32(a.src_asn), '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'action', a.src_country_code, '') ELSE '' END AS anubis_bot_action, CASE - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '1' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - AND toUInt64OrZero(dictGet('mabase_prod.dict_anubis_ua', 'rule_id', a.first_ua)) - = dictGetOrDefault('mabase_prod.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGet('mabase_prod.dict_anubis_ua', 'has_ip', a.first_ua) = '0' - AND dictGet('mabase_prod.dict_anubis_ua', 'bot_name', a.first_ua) != '' - THEN dictGet('mabase_prod.dict_anubis_ua', 'category', a.first_ua) - WHEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 - AND dictGetOrDefault('mabase_prod.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_ip', 'category', a.src_ip, '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'bot_name', 
toUInt32(a.src_asn), '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') - WHEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' - THEN dictGetOrDefault('mabase_prod.dict_anubis_country', 'category', a.src_country_code, '') + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '1' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + AND toUInt64OrZero(dictGet('ja4_processing.dict_anubis_ua', 'rule_id', a.first_ua)) + = dictGetOrDefault('ja4_processing.dict_anubis_ip', 'rule_id', a.src_ip, toUInt64(0)) + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) + WHEN dictGet('ja4_processing.dict_anubis_ua', 'has_ip', a.first_ua) = '0' + AND dictGet('ja4_processing.dict_anubis_ua', 'bot_name', a.first_ua) != '' + THEN dictGet('ja4_processing.dict_anubis_ua', 'category', a.first_ua) + WHEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'has_ua', a.src_ip, toUInt8(0)) = 0 + AND dictGetOrDefault('ja4_processing.dict_anubis_ip', 'bot_name', a.src_ip, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_ip', 'category', a.src_ip, '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'bot_name', toUInt32(a.src_asn), '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_asn', 'category', toUInt32(a.src_asn), '') + WHEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'bot_name', a.src_country_code, '') != '' + THEN dictGetOrDefault('ja4_processing.dict_anubis_country', 'category', a.src_country_code, '') ELSE '' END AS anubis_bot_category, a.hits AS hits, @@ -162,7 +162,7 @@ WITH base_data AS ( sum(count_correlated) AS count_correlated_val, sum(count_no_accept_enc) AS count_no_accept_enc_val, sum(count_http_scheme) AS count_http_scheme_val - FROM mabase_prod.agg_host_ip_ja4_1h + FROM 
ja4_processing.agg_host_ip_ja4_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip, ja4, host, src_asn ) a @@ -173,7 +173,7 @@ WITH base_data AS ( max(has_cookie) AS has_cookie, max(has_referer) AS has_referer, max(modern_browser_score) AS modern_browser_score, max(ua_ch_mismatch) AS ua_ch_mismatch, any(sec_fetch_mode) AS sec_fetch_mode, any(sec_fetch_dest) AS sec_fetch_dest - FROM mabase_prod.agg_header_fingerprint_1h + FROM ja4_processing.agg_header_fingerprint_1h WHERE window_start >= now() - INTERVAL 24 HOUR GROUP BY window_start, src_ip ) h ON a.src_ip = h.src_ip AND a.window_start = h.window_start diff --git a/shared/clickhouse/08_users.sql b/shared/clickhouse/08_users.sql index 8a1488c..f97c788 100644 --- a/shared/clickhouse/08_users.sql +++ b/shared/clickhouse/08_users.sql @@ -7,14 +7,16 @@ CREATE USER IF NOT EXISTS data_writer IDENTIFIED WITH plaintext_password BY 'ChangeMe'; CREATE USER IF NOT EXISTS analyst IDENTIFIED WITH plaintext_password BY 'ChangeMe'; --- data_writer: INSERT on raw table only (fed by correlator service) -GRANT INSERT ON mabase_prod.http_logs_raw TO data_writer; -GRANT SELECT ON mabase_prod.http_logs_raw TO data_writer; +-- data_writer: INSERT on raw table in ja4_logs (fed by correlator service) +GRANT INSERT ON ja4_logs.http_logs_raw TO data_writer; +GRANT SELECT ON ja4_logs.http_logs_raw TO data_writer; --- analyst: read access on parsed and derived tables -GRANT SELECT ON mabase_prod.http_logs TO analyst; -GRANT SELECT ON mabase_prod.ml_detected_anomalies TO analyst; -GRANT SELECT ON mabase_prod.ml_all_scores TO analyst; -GRANT SELECT ON mabase_prod.view_ai_features_1h TO analyst; -GRANT SELECT ON mabase_prod.view_ip_recurrence TO analyst; -GRANT SELECT ON mabase_prod.audit_logs TO analyst; +-- analyst: read access on ja4_logs (parsed logs) +GRANT SELECT ON ja4_logs.http_logs TO analyst; + +-- analyst: read access on ja4_processing (analytics, ML, views, audit) +GRANT SELECT ON 
ja4_processing.ml_detected_anomalies TO analyst;
+GRANT SELECT ON ja4_processing.ml_all_scores TO analyst;
+GRANT SELECT ON ja4_processing.view_ai_features_1h TO analyst;
+GRANT SELECT ON ja4_processing.view_ip_recurrence TO analyst;
+GRANT SELECT ON ja4_processing.audit_logs TO analyst;
diff --git a/shared/clickhouse/09_audit_table.sql b/shared/clickhouse/09_audit_table.sql
index 74d45ab..11c63ac 100644
--- a/shared/clickhouse/09_audit_table.sql
+++ b/shared/clickhouse/09_audit_table.sql
@@ -1,9 +1,9 @@
 -- =============================================================================
 -- 09_audit_table.sql — SOC audit log table for dashboard activity tracking
--- Referenced as mabase_prod.audit_logs in dashboard/backend/routes/audit.py
+-- Referenced as ja4_processing.audit_logs in dashboard/backend/routes/audit.py
 -- =============================================================================
 
-CREATE TABLE IF NOT EXISTS mabase_prod.audit_logs
+CREATE TABLE IF NOT EXISTS ja4_processing.audit_logs
 (
     `timestamp` DateTime DEFAULT now(),
     `user_name` LowCardinality(String) DEFAULT 'soc_user',
diff --git a/shared/clickhouse/deploy_schema.sh b/shared/clickhouse/deploy_schema.sh
new file mode 100755
index 0000000..090c99f
--- /dev/null
+++ b/shared/clickhouse/deploy_schema.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# =============================================================================
+# deploy_schema.sh — Apply ClickHouse schema migrations with configurable
+#                    database names via environment variables.
+#
+# Usage:
+#   CLICKHOUSE_DB_LOGS=my_logs CLICKHOUSE_DB_PROCESSING=my_proc ./deploy_schema.sh
+#
+# Env vars:
+#   CLICKHOUSE_DB_LOGS — logs database name (default: ja4_logs)
+#   CLICKHOUSE_DB_PROCESSING — processing database name (default: ja4_processing)
+#   CLICKHOUSE_HOST — ClickHouse host (default: localhost)
+#   CLICKHOUSE_PORT — ClickHouse native port (default: 9000)
+#   CLICKHOUSE_USER — ClickHouse user (default: default)
+#   CLICKHOUSE_PASSWORD — ClickHouse password (default: empty)
+#
+# Both database names must be plain identifiers ([A-Za-z_][A-Za-z0-9_]*);
+# the script exits with status 1 otherwise.
+# =============================================================================
+set -euo pipefail
+
+CLICKHOUSE_DB_LOGS="${CLICKHOUSE_DB_LOGS:-ja4_logs}"
+CLICKHOUSE_DB_PROCESSING="${CLICKHOUSE_DB_PROCESSING:-ja4_processing}"
+CLICKHOUSE_HOST="${CLICKHOUSE_HOST:-localhost}"
+CLICKHOUSE_PORT="${CLICKHOUSE_PORT:-9000}"
+CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
+CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
+
+# The names are spliced into a sed program and then into SQL, so accept only
+# plain identifiers: anything else could break the substitution or inject SQL.
+valid_db_name='^[A-Za-z_][A-Za-z0-9_]*$'
+for var in CLICKHOUSE_DB_LOGS CLICKHOUSE_DB_PROCESSING; do
+    if [[ ! "${!var}" =~ $valid_db_name ]]; then
+        echo "ERROR: ${var}='${!var}' is not a valid database name" >&2
+        exit 1
+    fi
+done
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+CH_ARGS=(
+    --host "$CLICKHOUSE_HOST"
+    --port "$CLICKHOUSE_PORT"
+    --user "$CLICKHOUSE_USER"
+)
+if [[ -n "$CLICKHOUSE_PASSWORD" ]]; then
+    CH_ARGS+=(--password "$CLICKHOUSE_PASSWORD")
+fi
+
+SQL_FILES=(
+    00_database.sql
+    01_raw_tables.sql
+    02_dictionaries.sql
+    03_anubis_tables.sql
+    04_mv_http_logs.sql
+    05_aggregation_tables.sql
+    06_ml_tables.sql
+    07_ai_features_view.sql
+    08_users.sql
+    09_audit_table.sql
+)
+
+for f in "${SQL_FILES[@]}"; do
+    filepath="${SCRIPT_DIR}/${f}"
+    if [[ ! -f "$filepath" ]]; then
+        echo "WARN: ${f} not found, skipping" >&2
+        continue
+    fi
+
+    echo ">>> Applying ${f} ..."
+    # Two-phase substitution: map the default names to placeholders first, so
+    # a custom name that itself contains "ja4_logs" or "ja4_processing" is
+    # never rewritten by a later expression.
+    sed \
+        -e 's/ja4_logs/@@DB_LOGS@@/g' \
+        -e 's/ja4_processing/@@DB_PROCESSING@@/g' \
+        -e "s/@@DB_LOGS@@/${CLICKHOUSE_DB_LOGS}/g" \
+        -e "s/@@DB_PROCESSING@@/${CLICKHOUSE_DB_PROCESSING}/g" \
+        "$filepath" \
+        | clickhouse-client "${CH_ARGS[@]}" --multiquery
+done
+
+echo "=== Schema deployment complete ==="
+echo " logs db: ${CLICKHOUSE_DB_LOGS}"
+echo " processing db: ${CLICKHOUSE_DB_PROCESSING}"
diff --git a/shared/python/ja4_common/ja4_common/settings.py b/shared/python/ja4_common/ja4_common/settings.py
index 3e11cde..0f1904b 100644
--- a/shared/python/ja4_common/ja4_common/settings.py
+++ b/shared/python/ja4_common/ja4_common/settings.py
@@ -5,7 +5,9 @@ from pydantic_settings import BaseSettings
 class ClickHouseSettings(BaseSettings):
     CLICKHOUSE_HOST: str = "clickhouse"
     CLICKHOUSE_PORT: int = 8123
-    CLICKHOUSE_DB: str = "mabase_prod"
+    CLICKHOUSE_DB: str = "ja4_processing" # default connection database
+    CLICKHOUSE_DB_LOGS: str = "ja4_logs"
+    CLICKHOUSE_DB_PROCESSING: str = "ja4_processing"
     CLICKHOUSE_USER: str = "admin"
     CLICKHOUSE_PASSWORD: str = ""
 
diff --git a/shared/python/ja4_common/tests/test_settings.py b/shared/python/ja4_common/tests/test_settings.py
index 8c132c4..dc10e9b 100644
--- a/shared/python/ja4_common/tests/test_settings.py
+++ b/shared/python/ja4_common/tests/test_settings.py
@@ -7,7 +7,9 @@ def test_default_settings():
     s = ClickHouseSettings()
     assert s.CLICKHOUSE_HOST == "clickhouse"
     assert s.CLICKHOUSE_PORT == 8123
-    assert s.CLICKHOUSE_DB == "mabase_prod"
+    assert s.CLICKHOUSE_DB == "ja4_processing"
+    assert s.CLICKHOUSE_DB_LOGS == "ja4_logs"
+    assert s.CLICKHOUSE_DB_PROCESSING == "ja4_processing"
     assert s.CLICKHOUSE_USER == "admin"
     assert s.CLICKHOUSE_PASSWORD == ""