feat(scripts): complete stack init + prod data import with date shift

Schema cleanup:
- Remove anubis_ua_rules table stub from 03_anubis_tables.sql
- Remove anubis_ua_rules from bot-detector deploy_schema.sql
- Remove UA seed step from clickhouse-init.sh (no more REGEXP_TREE dependency)
- Drop dict_anubis_ua, dict_anubis_country, anubis_ua_rules, anubis_country_rules

New scripts:
- scripts/init-stack.sh: comprehensive ClickHouse init (13 SQL files + migrations + validation + cleanup of obsolete tables). Supports --reset, --import-prod.
- scripts/import-prod-data.sh: imports pre-exported prod data (Native format) with dynamic date shift (max(time) → now). Supports --shift, --no-truncate.
- scripts/data/prod-export/: directory for cached Native format exports

Makefile targets: init-stack, import-prod-data, init-and-import

Tested:
init-stack.sh passes all 13 SQL + 7 critical tables + 7 dicts
import-prod-data.sh: 3M rows in ~37s with auto date shift
Dashboard: 55 routes OK, bot-detector: 36/36 tests pass

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-09 21:40:05 +02:00
parent d8ca804a55
commit 9ea36ad22e
8 changed files with 437 additions and 54 deletions
--- a/.gitignore
+++ b/.gitignore
@ -33,3 +33,6 @@ bot_detector_logs/
 .vscode/
 .idea/
 *.swp
+
+# Exported prod data (large binary files)
+scripts/data/prod-export/*.native
--- a/Makefile
+++ b/Makefile
@ -12,7 +12,7 @@
        build-bot-detector test-bot-detector \
        build-dashboard test-dashboard \
        test-ja4common-python \
-        reload-prod-logs
+        reload-prod-logs init-stack import-prod-data init-and-import

 # --- Root -------------------------------------------------------------------

@ -143,4 +143,13 @@ test-integration-down:

 # ── Dev data ─────────────────────────────────────────────────────────────────
 reload-prod-logs:
-./scripts/reload-prod-logs.sh
+	./scripts/reload-prod-logs.sh
+
+# Apply the full ClickHouse schema to the dev container.
+init-stack:
+	./scripts/init-stack.sh
+
+# Load the cached prod export into the dev database.
+import-prod-data:
+	./scripts/import-prod-data.sh
+
+# Schema initialisation followed by the prod data import.
+init-and-import:
+	./scripts/init-stack.sh --import-prod
--- a/scripts/data/prod-export/.gitkeep
+++ b/scripts/data/prod-export/.gitkeep
--- a/scripts/import-prod-data.sh
+++ b/scripts/import-prod-data.sh
@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+# =============================================================================
+# import-prod-data.sh — Importe les données prod pré-extraites dans la DB de dev
+# avec un décalage de date dynamique (max(time) → now()).
+#
+# Les données doivent avoir été exportées au format Native dans le répertoire
+# scripts/data/prod-export/ (fichiers *.native). Utilisez reload-prod-logs.sh
+# pour effectuer l'extraction initiale depuis la prod.
+#
+# Usage:
+#   ./scripts/import-prod-data.sh                    # décalage auto
+#   ./scripts/import-prod-data.sh --shift 3600       # décalage manuel (secondes)
+#   ./scripts/import-prod-data.sh --container my-ch  # conteneur spécifique
+#   ./scripts/import-prod-data.sh --no-truncate      # conserver les données existantes
+#   ./scripts/import-prod-data.sh --cron             # mode silencieux
+#
+# Variables d'environnement :
+#   DEV_CONTAINER       Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
+#   DEV_DB_LOGS         Base de données logs (défaut: ja4_logs)
+#   EXPORT_DIR          Répertoire des fichiers Native (défaut: scripts/data/prod-export)
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# ── Configuration ────────────────────────────────────────────────────────────
+DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
+DEV_DB_LOGS="${DEV_DB_LOGS:-ja4_logs}"
+EXPORT_DIR="${EXPORT_DIR:-${REPO_ROOT}/scripts/data/prod-export}"
+SHIFT_SECONDS=""
+NO_TRUNCATE=false
+CRON_MODE=false
+
+# ── Parsing des arguments ────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --container)    DEV_CONTAINER="$2"; shift 2 ;;
+        --shift)        SHIFT_SECONDS="$2"; shift 2 ;;
+        --no-truncate)  NO_TRUNCATE=true;   shift ;;
+        --cron)         CRON_MODE=true;     shift ;;
+        -h|--help)
+            sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
+            exit 0
+            ;;
+        *) echo "Option inconnue : $1"; exit 1 ;;
+    esac
+done
+
+STAGING_TABLE="${DEV_DB_LOGS}._staging_prod_import"
+LOG_PREFIX="[import-prod]"
+
+log() { [ "${CRON_MODE}" = false ] && echo "${LOG_PREFIX} $(date '+%H:%M:%S') $*" || true; }
+err() { echo "${LOG_PREFIX} ERREUR: $*" >&2; exit 1; }
+
+# ── Requêteur CH ─────────────────────────────────────────────────────────────
+ch() {
+    docker exec -i "${DEV_CONTAINER}" clickhouse-client --query "$1"
+}
+
+ch_insert_native() {
+    docker exec -i "${DEV_CONTAINER}" clickhouse-client \
+        --query "INSERT INTO ${STAGING_TABLE} FORMAT Native"
+}
+
+# ── Vérifications ────────────────────────────────────────────────────────────
+log "Vérification du conteneur ${DEV_CONTAINER}…"
+docker exec "${DEV_CONTAINER}" clickhouse-client --query "SELECT 1" > /dev/null 2>&1 \
+    || err "Conteneur ${DEV_CONTAINER} inaccessible"
+
+log "Vérification des fichiers d'export dans ${EXPORT_DIR}…"
+NATIVE_FILES=("${EXPORT_DIR}"/*.native)
+if [[ ! -f "${NATIVE_FILES[0]}" ]]; then
+    err "Aucun fichier *.native trouvé dans ${EXPORT_DIR}. Lancez d'abord reload-prod-logs.sh."
+fi
+FILE_COUNT=${#NATIVE_FILES[@]}
+log "  ${FILE_COUNT} fichier(s) Native trouvé(s)"
+
+# ── Staging table ────────────────────────────────────────────────────────────
+# drop_staging — best-effort removal of the scratch table. It is registered as
+# an EXIT trap so that an import aborting half-way does not leave a large
+# table behind; DROP ... IF EXISTS makes repeated calls harmless.
+drop_staging() {
+    ch "DROP TABLE IF EXISTS ${STAGING_TABLE}" > /dev/null 2>&1 < /dev/null || true
+}
+trap drop_staging EXIT
+
+log "Création de la table de staging…"
+ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
+ch "CREATE TABLE ${STAGING_TABLE} AS ${DEV_DB_LOGS}.http_logs ENGINE = MergeTree() ORDER BY (time, src_ip) SETTINGS index_granularity = 8192"
+
+# ── Native file import ───────────────────────────────────────────────────────
+log "Import des ${FILE_COUNT} fichiers dans le staging…"
+TOTAL_IMPORTED=0
+
+for f in "${NATIVE_FILES[@]}"; do
+    BASENAME="${f##*/}"
+    # GNU stat first, BSD stat second; an unreadable size counts as empty.
+    FILE_SIZE=$(stat -c%s "${f}" 2>/dev/null || stat -f%z "${f}" 2>/dev/null || echo 0)
+
+    if (( FILE_SIZE == 0 )); then
+        log "  ${BASENAME} — vide, ignoré"
+        continue
+    fi
+
+    log "  ${BASENAME} ($(( FILE_SIZE / 1024 / 1024 )) Mo)…"
+    ch_insert_native < "${f}" \
+        || err "Import échoué pour ${BASENAME}"
+
+    # The per-file row count is the growth of the staging table.
+    CURRENT=$(ch "SELECT count() FROM ${STAGING_TABLE}")
+    BATCH_ROWS=$(( CURRENT - TOTAL_IMPORTED ))
+    TOTAL_IMPORTED=${CURRENT}
+    log "    → ${BATCH_ROWS} lignes (total staging: ${TOTAL_IMPORTED})"
+done
+
+log "Staging terminé : ${TOTAL_IMPORTED} lignes."
+
+if (( TOTAL_IMPORTED == 0 )); then
+    # The EXIT trap removes the empty staging table.
+    err "Aucune donnée importée dans le staging"
+fi
+
+# ── Date shift ───────────────────────────────────────────────────────────────
+# When no explicit shift was supplied, pick the offset that moves the newest
+# staged row to the current time.
+if [[ -z "${SHIFT_SECONDS}" ]]; then
+    STAGING_MAX_TS=$(ch "SELECT toUnixTimestamp(max(time)) FROM ${STAGING_TABLE}")
+    NOW_TS=$(date +%s)
+    SHIFT_SECONDS=$(( NOW_TS - STAGING_MAX_TS ))
+    log "Décalage auto : ${SHIFT_SECONDS}s (max(staging) → now)"
+fi
+
+# Human-readable breakdown used only for the log line below.
+SHIFT_HOURS=$(( SHIFT_SECONDS / 3600 ))
+SHIFT_MINUTES=$(( (SHIFT_SECONDS % 3600) / 60 ))
+log "Décalage appliqué : ${SHIFT_SECONDS} secondes (${SHIFT_HOURS}h ${SHIFT_MINUTES}min)"
+
+# ── Target table cleanup ─────────────────────────────────────────────────────
+# Skipped when --no-truncate asked to keep the existing rows.
+if [[ "${NO_TRUNCATE}" == false ]]; then
+    log "Nettoyage de ${DEV_DB_LOGS}.http_logs…"
+    ch "TRUNCATE TABLE ${DEV_DB_LOGS}.http_logs"
+fi
+
+# ── Insert with date shift ───────────────────────────────────────────────────
+# INSERT ... SELECT maps columns by position, so the SELECT aliases are free to
+# differ from the target column names. The two shifted expressions must NOT be
+# aliased `time`: ClickHouse substitutes an alias into sibling expressions of
+# the same SELECT, so an alias named `time` would be expanded inside
+# toDate(time + ...) and log_date would be shifted twice.
+log "Insertion avec décalage dans ${DEV_DB_LOGS}.http_logs…"
+ch "
+INSERT INTO ${DEV_DB_LOGS}.http_logs
+SELECT
+    time + toIntervalSecond(${SHIFT_SECONDS})          AS shifted_time,
+    toDate(time + toIntervalSecond(${SHIFT_SECONDS}))  AS shifted_log_date,
+    src_ip, src_port, dst_ip, dst_port,
+    src_asn, src_country_code, src_as_name, src_org, src_domain,
+    method, scheme, host, path, query, http_version,
+    orphan_side, correlated, keepalives,
+    a_timestamp, b_timestamp, conn_id,
+    ip_meta_df, ip_meta_id, ip_meta_total_length, ip_meta_ttl,
+    tcp_meta_options, tcp_meta_window_size, tcp_meta_mss, tcp_meta_window_scale,
+    syn_to_clienthello_ms,
+    tls_version, tls_sni, tls_alpn, ja3, ja3_hash, ja4,
+    client_headers,
+    header_user_agent, header_accept, header_accept_encoding, header_accept_language,
+    header_content_type, header_x_request_id, header_x_trace_id, header_x_forwarded_for,
+    header_sec_ch_ua, header_sec_ch_ua_mobile, header_sec_ch_ua_platform,
+    header_sec_fetch_dest, header_sec_fetch_mode, header_sec_fetch_site,
+    anubis_bot_name, anubis_bot_action, anubis_bot_category
+FROM ${STAGING_TABLE}
+"
+
+# Figures shown in the closing summary, read back from the target table.
+TARGET_TABLE="${DEV_DB_LOGS}.http_logs"
+FINAL_COUNT=$(ch "SELECT count() FROM ${TARGET_TABLE}")
+FINAL_MIN=$(ch "SELECT min(time) FROM ${TARGET_TABLE}")
+FINAL_MAX=$(ch "SELECT max(time) FROM ${TARGET_TABLE}")
+
+# ── Cleanup ──────────────────────────────────────────────────────────────────
+log "Nettoyage de la table de staging…"
+ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
+
+# ── Result ───────────────────────────────────────────────────────────────────
+SUMMARY_RULE="════════════════════════════════════════════════════"
+for summary_line in \
+    "${SUMMARY_RULE}" \
+    "  Import terminé : ${FINAL_COUNT} lignes" \
+    "  Plage : ${FINAL_MIN} → ${FINAL_MAX}" \
+    "  Décalage : ${SHIFT_SECONDS}s" \
+    "${SUMMARY_RULE}"
+do
+    log "${summary_line}"
+done
--- a/scripts/init-stack.sh
+++ b/scripts/init-stack.sh
@ -0,0 +1,251 @@
+#!/usr/bin/env bash
+# =============================================================================
+# init-stack.sh — Initialisation complète de la stack ClickHouse pour ja4-platform
+#
+# Ce script exécute l'ensemble du schéma SQL, charge les données CSV de
+# référence et vérifie que tous les composants sont opérationnels.
+# Il est utilisé par les tests d'intégration et pour la mise en place de
+# l'environnement de développement.
+#
+# Usage:
+#   ./scripts/init-stack.sh                         # init dev stack
+#   ./scripts/init-stack.sh --container my-ch-1     # conteneur spécifique
+#   ./scripts/init-stack.sh --user admin --pass X    # credentials spécifiques
+#   ./scripts/init-stack.sh --import-prod            # init + import données prod
+#   ./scripts/init-stack.sh --reset                  # DROP databases, recréer tout
+#
+# Variables d'environnement :
+#   DEV_CONTAINER       Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
+#   DEV_USER            Utilisateur ClickHouse (défaut: default)
+#   DEV_PASSWORD        Mot de passe ClickHouse (défaut: vide)
+#   CLICKHOUSE_DB_LOGS  Base de données logs (défaut: ja4_logs)
+#   CLICKHOUSE_DB_PROC  Base de données processing (défaut: ja4_processing)
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# ── Configuration ────────────────────────────────────────────────────────────
+DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
+DEV_USER="${DEV_USER:-default}"
+DEV_PASSWORD="${DEV_PASSWORD:-}"
+DB_LOGS="${CLICKHOUSE_DB_LOGS:-ja4_logs}"
+DB_PROC="${CLICKHOUSE_DB_PROC:-ja4_processing}"
+IMPORT_PROD=false
+RESET=false
+
+# ── Parsing des arguments ────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --container)   DEV_CONTAINER="$2"; shift 2 ;;
+        --user)        DEV_USER="$2";      shift 2 ;;
+        --pass)        DEV_PASSWORD="$2";  shift 2 ;;
+        --import-prod) IMPORT_PROD=true;   shift ;;
+        --reset)       RESET=true;         shift ;;
+        -h|--help)
+            sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
+            exit 0
+            ;;
+        *) echo "Option inconnue : $1"; exit 1 ;;
+    esac
+done
+
+SQL_DIR="${REPO_ROOT}/shared/clickhouse"
+
+# ── Couleurs ─────────────────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+log()  { echo -e "${CYAN}[init]${NC} $(date '+%H:%M:%S') $*"; }
+ok()   { echo -e "${GREEN}  ✓ $*${NC}"; }
+err()  { echo -e "${RED}  ✗ $*${NC}" >&2; exit 1; }
+
+# ── ClickHouse client helpers ────────────────────────────────────────────────
+# ch_client ARGS... — run clickhouse-client inside ${DEV_CONTAINER} with stdin
+# attached, adding the configured credentials. The user is passed whenever it
+# is set, even with an empty password; previously --user was dropped unless a
+# password was given. The password option is only added when non-empty.
+ch_client() {
+    local cmd=(docker exec -i "${DEV_CONTAINER}" clickhouse-client)
+    if [[ -n "${DEV_USER}" ]]; then
+        cmd+=(--user "${DEV_USER}")
+    fi
+    if [[ -n "${DEV_PASSWORD}" ]]; then
+        cmd+=(--password "${DEV_PASSWORD}")
+    fi
+    "${cmd[@]}" "$@"
+}
+
+# ch QUERY — execute a single SQL statement; the result goes to stdout.
+ch() {
+    ch_client --query "$1"
+}
+
+# ch_multiquery SQL — execute a script of several statements, fed on stdin.
+ch_multiquery() {
+    ch_client --multiquery <<< "$1"
+}
+
+# ch_insert_native TABLE — stream Native-format data from stdin into TABLE.
+ch_insert_native() {
+    ch_client --query "INSERT INTO $1 FORMAT Native"
+}
+
+# ── Container check ──────────────────────────────────────────────────────────
+log "Vérification du conteneur ${DEV_CONTAINER}…"
+# Probe through ch so the check uses the same connection settings as every
+# later query; stdin is detached because the probe has nothing to send.
+if ! ch "SELECT 1" > /dev/null 2>&1 < /dev/null; then
+    err "Le conteneur ${DEV_CONTAINER} n'est pas accessible"
+fi
+ok "Conteneur ${DEV_CONTAINER} accessible"
+
+# ── Optional reset ───────────────────────────────────────────────────────────
+if [[ "${RESET}" == true ]]; then
+    log "Reset demandé — suppression des bases de données…"
+    # IF EXISTS already covers the "nothing to drop" case, so any remaining
+    # failure means the reset did not happen and must not be reported as done.
+    for reset_db in "${DB_LOGS}" "${DB_PROC}"; do
+        ch "DROP DATABASE IF EXISTS ${reset_db}" \
+            || err "Impossible de supprimer la base ${reset_db}"
+    done
+    ok "Bases ${DB_LOGS} et ${DB_PROC} supprimées"
+fi
+
+# ── SQL schema files ─────────────────────────────────────────────────────────
+# Applied in this order.
+SQL_FILES=(
+    00_database.sql
+    01_raw_tables.sql
+    02_dictionaries.sql
+    03_anubis_tables.sql
+    04_mv_http_logs.sql
+    05_aggregation_tables.sql
+    06_ml_tables.sql
+    07_ai_features_view.sql
+    08_users.sql
+    09_audit_table.sql
+    10_perf_indexes.sql
+    11_views.sql
+    12_thesis_features.sql
+)
+
+# sed_escape_replacement STRING — escape the characters that are special in
+# the replacement part of a sed `s/…/…/` command (backslash, slash, ampersand)
+# so that user-supplied values are inserted literally.
+sed_escape_replacement() {
+    printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'
+}
+
+DB_LOGS_SED=$(sed_escape_replacement "${DB_LOGS}")
+DB_PROC_SED=$(sed_escape_replacement "${DB_PROC}")
+DEV_USER_SED=$(sed_escape_replacement "${DEV_USER}")
+DEV_PASSWORD_SED=$(sed_escape_replacement "${DEV_PASSWORD}")
+
+# The client's stderr is kept in a scratch file so that a failing file can be
+# diagnosed instead of only being counted.
+SQL_ERR_LOG=$(mktemp)
+trap 'rm -f -- "${SQL_ERR_LOG}"' EXIT
+
+log "Application du schéma SQL (${#SQL_FILES[@]} fichiers)…"
+ERRORS=0
+
+for f in "${SQL_FILES[@]}"; do
+    filepath="${SQL_DIR}/${f}"
+    if [[ ! -f "${filepath}" ]]; then
+        echo "  WARN: ${f} non trouvé, ignoré" >&2
+        continue
+    fi
+
+    # Substitute the database names and the credentials.
+    SQL_PATCHED=$(sed \
+        -e "s/ja4_logs/${DB_LOGS_SED}/g" \
+        -e "s/ja4_processing/${DB_PROC_SED}/g" \
+        -e "s/USER 'admin'/USER '${DEV_USER_SED}'/g" \
+        -e "s/PASSWORD 'CHANGE_ME'/PASSWORD '${DEV_PASSWORD_SED}'/g" \
+        -e "s/PASSWORD 'ChangeMe'/PASSWORD '${DEV_PASSWORD_SED}'/g" \
+        "${filepath}")
+
+    if ch_multiquery "${SQL_PATCHED}" 2>"${SQL_ERR_LOG}"; then
+        ok "${f}"
+    elif [[ "${f}" == 10_* ]]; then
+        # 10_perf_indexes.sql may fail when the indexes already exist.
+        echo "  ⚠ ${f} (erreurs ignorées — index déjà existants)"
+    else
+        echo "  ✗ ${f} — ERREUR" >&2
+        # Show the beginning of the client's error output.
+        # NOTE(review): the error text may quote the failing statement.
+        head -n 20 "${SQL_ERR_LOG}" | sed -e 's/^/      /' >&2 || true
+        ERRORS=$((ERRORS + 1))
+    fi
+done
+
+rm -f -- "${SQL_ERR_LOG}"
+trap - EXIT
+
+if (( ERRORS > 0 )); then
+    err "${ERRORS} fichier(s) SQL en erreur"
+fi
+
+# ── Post-schema migrations (columns missing on an existing DB) ───────────────
+log "Application des migrations post-schéma…"
+MIGRATIONS=()
+for sum_column in count_xff count_unusual_ct count_non_std_port count_login_post; do
+    MIGRATIONS+=("ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS ${sum_column} SimpleAggregateFunction(sum, UInt64)")
+done
+MIGRATIONS+=("ALTER TABLE ${DB_PROC}.agg_header_fingerprint_1h ADD COLUMN IF NOT EXISTS sec_ch_mobile_mismatch SimpleAggregateFunction(max, UInt8)")
+
+# Best effort: a failing statement is ignored and the next one still runs.
+idx=0
+while (( idx < ${#MIGRATIONS[@]} )); do
+    ch "${MIGRATIONS[idx]}" 2>/dev/null || true
+    idx=$(( idx + 1 ))
+done
+ok "Migrations appliquées"
+
+# ── Removal of obsolete Anubis objects (UA, Country) ─────────────────────────
+log "Nettoyage des tables Anubis obsolètes…"
+# Dictionaries are dropped before the tables; failures are ignored.
+OBSOLETE_DROPS=(
+    "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_ua"
+    "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_country"
+    "DROP TABLE IF EXISTS ${DB_PROC}.anubis_ua_rules"
+    "DROP TABLE IF EXISTS ${DB_PROC}.anubis_country_rules"
+)
+for drop_stmt in "${OBSOLETE_DROPS[@]}"; do
+    ch "${drop_stmt}" 2>/dev/null || true
+done
+ok "Tables obsolètes supprimées"
+
+# ── Schema verification ──────────────────────────────────────────────────────
+log "Vérification du schéma…"
+
+# Object counts, shown here and again in the final summary.
+TABLE_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}')")
+DICT_COUNT=$(ch "SELECT count() FROM system.dictionaries WHERE database='${DB_PROC}'")
+VIEW_COUNT=$(ch "SELECT count() FROM system.tables WHERE database='${DB_PROC}' AND engine='View'")
+MV_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}') AND engine='MaterializedView'")
+
+ok "Tables: ${TABLE_COUNT} | Dictionnaires: ${DICT_COUNT} | Vues: ${VIEW_COUNT} | MVs: ${MV_COUNT}"
+
+# Critical tables: the script aborts on the first one that is missing.
+CRITICAL_TABLES=(
+    "${DB_LOGS}.http_logs_raw"
+    "${DB_LOGS}.http_logs"
+    "${DB_PROC}.ml_detected_anomalies"
+    "${DB_PROC}.ml_all_scores"
+    "${DB_PROC}.agg_host_ip_ja4_1h"
+    "${DB_PROC}.anubis_ip_rules"
+    "${DB_PROC}.anubis_asn_rules"
+)
+for t in "${CRITICAL_TABLES[@]}"; do
+    db="${t%%.*}"
+    tbl="${t##*.}"
+    EXISTS=$(ch "SELECT count() FROM system.tables WHERE database='${db}' AND name='${tbl}'" 2>/dev/null || echo "0")
+    if [[ "${EXISTS}" == "1" ]]; then
+        ok "  ${t}"
+    else
+        err "  Table manquante : ${t}"
+    fi
+done
+
+# Critical dictionaries: problems are reported as warnings only.
+CRITICAL_DICTS=(
+    "dict_anubis_ip"
+    "dict_anubis_asn"
+    "dict_iplocate_asn"
+    "dict_bot_ip"
+    "dict_bot_ja4"
+    "dict_browser_ja4"
+    "dict_asn_reputation"
+)
+for d in "${CRITICAL_DICTS[@]}"; do
+    STATUS=$(ch "SELECT status FROM system.dictionaries WHERE database='${DB_PROC}' AND name='${d}'" 2>/dev/null || true)
+    # An absent dictionary makes the query succeed with zero rows, i.e. an
+    # empty string; report it as MISSING just like a failed query.
+    STATUS="${STATUS:-MISSING}"
+    case "${STATUS}" in
+        LOADED|NOT_LOADED)
+            ok "  ${d} (${STATUS})"
+            ;;
+        *)
+            echo "  ⚠ Dictionnaire ${d}: ${STATUS}"
+            ;;
+    esac
+done
+
+# ── Prod data import (optional) ──────────────────────────────────────────────
+if [[ "${IMPORT_PROD}" == true ]]; then
+    IMPORT_SCRIPT="${SCRIPT_DIR}/import-prod-data.sh"
+    # The import was requested explicitly, so a missing script is an error
+    # rather than a warning followed by a success report.
+    if [[ ! -f "${IMPORT_SCRIPT}" ]]; then
+        err "Script d'import non trouvé : ${IMPORT_SCRIPT}"
+    fi
+    log "Lancement de l'import des données prod…"
+    # Run through bash so a lost executable bit does not matter, and forward
+    # the logs database name (DEV_DB_LOGS is the variable the import script
+    # documents). The credentials are exported as well; they are harmless if
+    # the import script does not read them.
+    DEV_DB_LOGS="${DB_LOGS}" DEV_USER="${DEV_USER}" DEV_PASSWORD="${DEV_PASSWORD}" \
+        bash "${IMPORT_SCRIPT}" --container "${DEV_CONTAINER}"
+fi
+
+# ── Result ───────────────────────────────────────────────────────────────────
+log "════════════════════════════════════════════════════"
+log "  Initialisation terminée"
+log "  Bases : ${DB_LOGS}, ${DB_PROC}"
+log "  Tables: ${TABLE_COUNT} | Dicts: ${DICT_COUNT} | MVs: ${MV_COUNT}"
+log "════════════════════════════════════════════════════"
--- a/services/bot-detector/anubis/deploy_schema.sql
+++ b/services/bot-detector/anubis/deploy_schema.sql
@ -1,7 +1,6 @@
 -- ============================================================================
 -- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
 -- Architecture simplifiée (IP/CIDR et ASN uniquement) :
--   anubis_ua_rules (table stub) → dict_anubis_ua (REGEXP_TREE, catch-all)
 --   anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
 --   anubis_asn_rules (table) → dict_anubis_asn (FLAT)
 --   http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
@ -11,23 +10,7 @@
 -- ============================================================================

 -- ----------------------------------------------------------------------------
-- 1. TABLE SOURCE — règles User-Agent (stub REGEXP_TREE)
--    REGEXP_TREE nécessite ≥1 règle ; le catch-all est injecté à l'init.
--    Cette table n'est PAS peuplée par fetch_rules.py.
-- ----------------------------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
-(
-    id        UInt64,
-    parent_id UInt64,
-    regexp    String,
-    keys      Array(String),
-    values    Array(String)
-)
-ENGINE = ReplacingMergeTree()
-ORDER BY id;
-
-- ----------------------------------------------------------------------------
-- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
+-- 1. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
 --    Peuplée par fetch_rules.py depuis les fichiers YAML Anubis.
 -- ----------------------------------------------------------------------------
 CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
--- a/shared/clickhouse/03_anubis_tables.sql
+++ b/shared/clickhouse/03_anubis_tables.sql
@ -1,29 +1,10 @@
 -- =============================================================================
 -- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
-- Only IP/CIDR and ASN rules are populated by fetch_rules.py.
-- UA and Country dictionaries are kept as stubs (required by MV references)
-- but are never populated with real data.
+-- Only IP/CIDR and ASN rules are used. UA and Country have been removed.
 -- =============================================================================

 -- -----------------------------------------------------------------------------
-- 1. TABLE SOURCE — User-Agent rules (REGEXP_TREE stub)
-- REGEXP_TREE requires ≥1 rule; the catch-all is seeded at init time.
-- This table is NOT populated by fetch_rules.py.
-- -----------------------------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
-(
-    id        UInt64,
-    parent_id UInt64,
-    regexp    String,
-    keys      Array(String),
-    values    Array(String)
-)
-ENGINE = ReplacingMergeTree()
-ORDER BY id;
-
-
-- -----------------------------------------------------------------------------
-- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
+-- 1. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
 -- Populated by fetch_rules.py from Anubis GitHub data.
 -- -----------------------------------------------------------------------------
 CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
--- a/tests/integration/platform/clickhouse-init.sh
+++ b/tests/integration/platform/clickhouse-init.sh
@ -34,16 +34,4 @@ for f in "$TMP_DIR"/*.sql; do
    fi
 done

-# ---------------------------------------------------------------------------
-# Seed data required for dictionaries to function
-# REGEXP_TREE dictionaries require at least one rule; without it, any INSERT
-# into http_logs_raw fails because the MV mv_http_logs calls dictGet() on
-# the empty dict.  Insert a catch-all "unknown" rule so the pipeline works.
-# ---------------------------------------------------------------------------
-echo "[init] Seeding anubis_ua_rules (REGEXP_TREE needs ≥1 rule)..."
-clickhouse-client --multiquery <<'SEED'
-INSERT INTO ja4_processing.anubis_ua_rules (id, parent_id, regexp, keys, values) VALUES
-    (1, 0, '.*', ['bot_name','action','has_ip','rule_id','category'], ['','','0','0','']);
-SEED
-
-echo "[init] All SQL files executed and seed data inserted"
+echo "[init] All SQL files executed successfully"