From 9ea36ad22e028598a47b2323b12ad9ea58f9ed96 Mon Sep 17 00:00:00 2001
From: toto
Date: Thu, 9 Apr 2026 21:40:05 +0200
Subject: [PATCH] feat(scripts): complete stack init + prod data import with date shift
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Schema cleanup:
- Remove anubis_ua_rules table stub from 03_anubis_tables.sql
- Remove anubis_ua_rules from bot-detector deploy_schema.sql
- Remove UA seed step from clickhouse-init.sh (no more REGEXP_TREE dependency)
- Drop dict_anubis_ua, dict_anubis_country, anubis_ua_rules, anubis_country_rules

New scripts:
- scripts/init-stack.sh: comprehensive ClickHouse init (13 SQL files + migrations
  + validation + cleanup of obsolete tables). Supports --reset, --import-prod.
- scripts/import-prod-data.sh: imports pre-exported prod data (Native format)
  with dynamic date shift (max(time) → now). Supports --shift, --no-truncate.
- scripts/data/prod-export/: directory for cached Native format exports

Makefile targets: init-stack, import-prod-data, init-and-import

Tested:
  init-stack.sh passes all 13 SQL + 7 critical tables + 7 dicts
  import-prod-data.sh: 3M rows in ~37s with auto date shift
  Dashboard: 55 routes OK, bot-detector: 36/36 tests pass

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .gitignore                                    |   3 +
 Makefile                                      |  11 +-
 scripts/data/prod-export/.gitkeep             |   0
 scripts/import-prod-data.sh                   | 196 ++++++++++++++
 scripts/init-stack.sh                         | 251 ++++++++++++++++++
 .../bot-detector/anubis/deploy_schema.sql     |  19 +-
 shared/clickhouse/03_anubis_tables.sql        |  23 +-
 tests/integration/platform/clickhouse-init.sh |  14 +-
 8 files changed, 464 insertions(+), 53 deletions(-)
 create mode 100644 scripts/data/prod-export/.gitkeep
 create mode 100755 scripts/import-prod-data.sh
 create mode 100755 scripts/init-stack.sh

diff --git a/.gitignore b/.gitignore
index 7bcd09a..2ee7fac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ bot_detector_logs/
 .vscode/
 .idea/
 *.swp
+
+# Exported prod data (large binary files)
+scripts/data/prod-export/*.native
diff --git a/Makefile b/Makefile
index 690042d..610e381 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@
         build-bot-detector test-bot-detector \
         build-dashboard test-dashboard \
         test-ja4common-python \
-        reload-prod-logs
+        reload-prod-logs init-stack import-prod-data init-and-import
 
 # --- Root -------------------------------------------------------------------
 
@@ -143,4 +143,13 @@ test-integration-down:
 
 # ── Dev data ─────────────────────────────────────────────────────────────────
 reload-prod-logs:
 	./scripts/reload-prod-logs.sh
+
+init-stack:
+	./scripts/init-stack.sh
+
+import-prod-data:
+	./scripts/import-prod-data.sh
+
+init-and-import:
+	./scripts/init-stack.sh --import-prod
diff --git a/scripts/data/prod-export/.gitkeep b/scripts/data/prod-export/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/import-prod-data.sh b/scripts/import-prod-data.sh
new file mode 100755
index 0000000..ac013d1
--- /dev/null
+++ b/scripts/import-prod-data.sh
@@ -0,0 +1,196 @@
+#!/usr/bin/env bash
+# =============================================================================
+# import-prod-data.sh — Importe les données prod pré-extraites dans la DB de dev
+# avec un décalage de date dynamique (max(time) → now()).
+#
+# Les données doivent avoir été exportées au format Native dans le répertoire
+# scripts/data/prod-export/ (fichiers *.native). Utilisez reload-prod-logs.sh
+# pour effectuer l'extraction initiale depuis la prod.
+#
+# Usage:
+#   ./scripts/import-prod-data.sh                     # décalage auto
+#   ./scripts/import-prod-data.sh --shift 3600        # décalage manuel (secondes)
+#   ./scripts/import-prod-data.sh --container my-ch   # conteneur spécifique
+#   ./scripts/import-prod-data.sh --no-truncate       # conserver les données existantes
+#   ./scripts/import-prod-data.sh --cron              # mode silencieux
+#
+# Variables d'environnement :
+#   DEV_CONTAINER   Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
+#   DEV_DB_LOGS     Base de données logs (défaut: ja4_logs)
+#   EXPORT_DIR      Répertoire des fichiers Native (défaut: scripts/data/prod-export)
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# ── Configuration ────────────────────────────────────────────────────────────
+DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
+DEV_DB_LOGS="${DEV_DB_LOGS:-ja4_logs}"
+EXPORT_DIR="${EXPORT_DIR:-${REPO_ROOT}/scripts/data/prod-export}"
+SHIFT_SECONDS=""   # empty = compute the shift automatically from the staged data
+NO_TRUNCATE=false
+CRON_MODE=false
+
+# ── Argument parsing ─────────────────────────────────────────────────────────
+# "${2?…}" aborts with a readable message when an option value is missing
+# (a bare "$2" only triggers a cryptic "unbound variable" under set -u).
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --container)   DEV_CONTAINER="${2?--container requiert une valeur}"; shift 2 ;;
+        --shift)       SHIFT_SECONDS="${2?--shift requiert une valeur}"; shift 2 ;;
+        --no-truncate) NO_TRUNCATE=true; shift ;;
+        --cron)        CRON_MODE=true; shift ;;
+        -h|--help)
+            # The header comment above doubles as the help text: keep it user-facing.
+            sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
+            exit 0
+            ;;
+        *) echo "Option inconnue : $1"; exit 1 ;;
+    esac
+done
+
+STAGING_TABLE="${DEV_DB_LOGS}._staging_prod_import"
+LOG_PREFIX="[import-prod]"
+
+# log MSG… — timestamped progress line on stdout; silent in --cron mode.
+log() { [ "${CRON_MODE}" = false ] && echo "${LOG_PREFIX} $(date '+%H:%M:%S') $*" || true; }
+# err MSG… — print an error on stderr and abort the script with status 1.
+err() { echo "${LOG_PREFIX} ERREUR: $*" >&2; exit 1; }
+
+# --shift is interpolated into SQL and shell arithmetic below, so only a plain
+# base-10 integer is accepted (no leading zeros: bash would read them as octal).
+INTEGER_RE='^-?(0|[1-9][0-9]*)$'
+if [[ -n "${SHIFT_SECONDS}" && ! "${SHIFT_SECONDS}" =~ ${INTEGER_RE} ]]; then
+    err "--shift doit être un nombre entier de secondes (reçu : ${SHIFT_SECONDS})"
+fi
+
+# ── ClickHouse helpers ───────────────────────────────────────────────────────
+# ch QUERY — run one query in the dev container and print its result.
+ch() {
+    docker exec -i "${DEV_CONTAINER}" clickhouse-client --query "$1"
+}
+
+# ch_insert_native — stream Native-format data from stdin into the staging table.
+ch_insert_native() {
+    docker exec -i "${DEV_CONTAINER}" clickhouse-client \
+        --query "INSERT INTO ${STAGING_TABLE} FORMAT Native"
+}
+
+# ── Pre-flight checks ────────────────────────────────────────────────────────
+log "Vérification du conteneur ${DEV_CONTAINER}…"
+docker exec "${DEV_CONTAINER}" clickhouse-client --query "SELECT 1" > /dev/null 2>&1 \
+    || err "Conteneur ${DEV_CONTAINER} inaccessible"
+
+log "Vérification des fichiers d'export dans ${EXPORT_DIR}…"
+# Without nullglob an unmatched pattern stays literal, hence the -f test below.
+NATIVE_FILES=("${EXPORT_DIR}"/*.native)
+if [[ ! -f "${NATIVE_FILES[0]}" ]]; then
+    err "Aucun fichier *.native trouvé dans ${EXPORT_DIR}. Lancez d'abord reload-prod-logs.sh."
+fi
+FILE_COUNT=${#NATIVE_FILES[@]}
+log " ${FILE_COUNT} fichier(s) Native trouvé(s)"
+
+# ── Staging table ────────────────────────────────────────────────────────────
+# Same columns as http_logs, but a plain MergeTree with its own sorting key.
+log "Création de la table de staging…"
+ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
+ch "CREATE TABLE ${STAGING_TABLE} AS ${DEV_DB_LOGS}.http_logs ENGINE = MergeTree() ORDER BY (time, src_ip) SETTINGS index_granularity = 8192"
+
+# ── Native file import ───────────────────────────────────────────────────────
+log "Import des ${FILE_COUNT} fichiers dans le staging…"
+TOTAL_IMPORTED=0
+
+for f in "${NATIVE_FILES[@]}"; do
+    BASENAME=$(basename "${f}")
+    # GNU stat first, BSD/macOS stat second, 0 when neither works.
+    FILE_SIZE=$(stat -c%s "${f}" 2>/dev/null || stat -f%z "${f}" 2>/dev/null || echo 0)
+
+    if [ "${FILE_SIZE}" -eq 0 ]; then
+        log " ${BASENAME} — vide, ignoré"
+        continue
+    fi
+
+    log " ${BASENAME} ($(( FILE_SIZE / 1024 / 1024 )) Mo)…"
+    ch_insert_native < "${f}" \
+        || err "Import échoué pour ${BASENAME}"
+
+    # Rows added by this file = staging row count after minus count before.
+    CURRENT=$(ch "SELECT count() FROM ${STAGING_TABLE}")
+    BATCH_ROWS=$(( CURRENT - TOTAL_IMPORTED ))
+    TOTAL_IMPORTED=${CURRENT}
+    log " → ${BATCH_ROWS} lignes (total staging: ${TOTAL_IMPORTED})"
+done
+
+log "Staging terminé : ${TOTAL_IMPORTED} lignes."
+
+if [ "${TOTAL_IMPORTED}" -eq 0 ]; then
+    ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
+    err "Aucune donnée importée dans le staging"
+fi
+
+# ── Date shift ───────────────────────────────────────────────────────────────
+# Auto mode moves the newest staged row to "now" as seen by the host clock.
+if [ -z "${SHIFT_SECONDS}" ]; then
+    STAGING_MAX_TS=$(ch "SELECT toUnixTimestamp(max(time)) FROM ${STAGING_TABLE}")
+    NOW_TS=$(date +%s)
+    SHIFT_SECONDS=$(( NOW_TS - STAGING_MAX_TS ))
+    log "Décalage auto : ${SHIFT_SECONDS}s (max(staging) → now)"
+fi
+
+log "Décalage appliqué : ${SHIFT_SECONDS} secondes ($(( SHIFT_SECONDS / 3600 ))h $(( (SHIFT_SECONDS % 3600) / 60 ))min)"
+
+# ── Target table cleanup ─────────────────────────────────────────────────────
+if [ "${NO_TRUNCATE}" = false ]; then
+    log "Nettoyage de ${DEV_DB_LOGS}.http_logs…"
+    ch "TRUNCATE TABLE ${DEV_DB_LOGS}.http_logs"
+fi
+
+# ── Shifted insert ───────────────────────────────────────────────────────────
+# Columns copied unchanged from staging; time and log_date are recomputed.
+PASSTHROUGH_COLUMNS="
+    src_ip, src_port, dst_ip, dst_port,
+    src_asn, src_country_code, src_as_name, src_org, src_domain,
+    method, scheme, host, path, query, http_version,
+    orphan_side, correlated, keepalives,
+    a_timestamp, b_timestamp, conn_id,
+    ip_meta_df, ip_meta_id, ip_meta_total_length, ip_meta_ttl,
+    tcp_meta_options, tcp_meta_window_size, tcp_meta_mss, tcp_meta_window_scale,
+    syn_to_clienthello_ms,
+    tls_version, tls_sni, tls_alpn, ja3, ja3_hash, ja4,
+    client_headers,
+    header_user_agent, header_accept, header_accept_encoding, header_accept_language,
+    header_content_type, header_x_request_id, header_x_trace_id, header_x_forwarded_for,
+    header_sec_ch_ua, header_sec_ch_ua_mobile, header_sec_ch_ua_platform,
+    header_sec_fetch_dest, header_sec_fetch_mode, header_sec_fetch_site,
+    anubis_bot_name, anubis_bot_action, anubis_bot_category
+"
+
+# Target columns are listed explicitly and the SELECT expressions carry no
+# alias: with "… AS time", ClickHouse by default (prefer_column_name_to_alias = 0)
+# resolves "time" in sibling expressions to the alias, so log_date could end up
+# shifted twice.
+log "Insertion avec décalage dans ${DEV_DB_LOGS}.http_logs…"
+ch "
+INSERT INTO ${DEV_DB_LOGS}.http_logs (time, log_date, ${PASSTHROUGH_COLUMNS})
+SELECT
+    time + toIntervalSecond(${SHIFT_SECONDS}),
+    toDate(time + toIntervalSecond(${SHIFT_SECONDS})),
+    ${PASSTHROUGH_COLUMNS}
+FROM ${STAGING_TABLE}
+"
+
+FINAL_COUNT=$(ch "SELECT count() FROM ${DEV_DB_LOGS}.http_logs")
+FINAL_MIN=$(ch "SELECT min(time) FROM ${DEV_DB_LOGS}.http_logs")
+FINAL_MAX=$(ch "SELECT max(time) FROM ${DEV_DB_LOGS}.http_logs")
+
+# ── Cleanup ──────────────────────────────────────────────────────────────────
+log "Nettoyage de la table de staging…"
+ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
+
+# ── Summary ──────────────────────────────────────────────────────────────────
+log "════════════════════════════════════════════════════"
+log " Import terminé : ${FINAL_COUNT} lignes"
+log " Plage : ${FINAL_MIN} → ${FINAL_MAX}"
+log " Décalage : ${SHIFT_SECONDS}s"
+log "════════════════════════════════════════════════════"
diff --git a/scripts/init-stack.sh b/scripts/init-stack.sh
new file mode 100755
index 0000000..f5bfcdc
--- /dev/null
+++ b/scripts/init-stack.sh
@@ -0,0 +1,251 @@
+#!/usr/bin/env bash
+# =============================================================================
+# init-stack.sh — Initialisation complète de la stack ClickHouse pour ja4-platform
+#
+# Ce script exécute l'ensemble du schéma SQL, charge les données CSV de
+# référence et vérifie que tous les composants sont opérationnels.
+# Il est utilisé par les tests d'intégration et pour la mise en place de
+# l'environnement de développement.
+#
+# Usage:
+#   ./scripts/init-stack.sh                        # init dev stack
+#   ./scripts/init-stack.sh --container my-ch-1    # conteneur spécifique
+#   ./scripts/init-stack.sh --user admin --pass X  # credentials spécifiques
+#   ./scripts/init-stack.sh --import-prod          # init + import données prod
+#   ./scripts/init-stack.sh --reset                # DROP databases, recréer tout
+#
+# Variables d'environnement :
+#   DEV_CONTAINER        Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
+#   DEV_USER             Utilisateur ClickHouse (défaut: default)
+#   DEV_PASSWORD         Mot de passe ClickHouse (défaut: vide)
+#   CLICKHOUSE_DB_LOGS   Base de données logs (défaut: ja4_logs)
+#   CLICKHOUSE_DB_PROC   Base de données processing (défaut: ja4_processing)
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# ── Configuration ────────────────────────────────────────────────────────────
+DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
+DEV_USER="${DEV_USER:-default}"
+DEV_PASSWORD="${DEV_PASSWORD:-}"
+DB_LOGS="${CLICKHOUSE_DB_LOGS:-ja4_logs}"
+DB_PROC="${CLICKHOUSE_DB_PROC:-ja4_processing}"
+IMPORT_PROD=false
+RESET=false
+
+# ── Argument parsing ("${2?…}" reports a missing option value readably) ───────
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --container)   DEV_CONTAINER="${2?--container requiert une valeur}"; shift 2 ;;
+        --user)        DEV_USER="${2?--user requiert une valeur}"; shift 2 ;;
+        --pass)        DEV_PASSWORD="${2?--pass requiert une valeur}"; shift 2 ;;
+        --import-prod) IMPORT_PROD=true; shift ;;
+        --reset)       RESET=true; shift ;;
+        -h|--help)
+            # The header comment above doubles as the help text: keep it user-facing.
+            sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
+            exit 0
+            ;;
+        *) echo "Option inconnue : $1"; exit 1 ;;
+    esac
+done
+
+SQL_DIR="${REPO_ROOT}/shared/clickhouse"
+
+# ── Colours and logging ──────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+# log: timestamped progress line; ok: success line; err: error on stderr, exit 1.
+log() { echo -e "${CYAN}[init]${NC} $(date '+%H:%M:%S') $*"; }
+ok() { echo -e "${GREEN} ✓ $*${NC}"; }
+err() { echo -e "${RED} ✗ $*${NC}" >&2; exit 1; }
+
+# ── ClickHouse helpers ───────────────────────────────────────────────────────
+# ch_client ARGS… — clickhouse-client inside the dev container. --user is always
+# passed (it used to be dropped whenever the password was empty); --password is
+# added only when one is set.
+ch_client() {
+    local auth=("--user" "${DEV_USER}")
+    if [[ -n "${DEV_PASSWORD}" ]]; then
+        auth+=("--password" "${DEV_PASSWORD}")
+    fi
+    docker exec -i "${DEV_CONTAINER}" clickhouse-client "${auth[@]}" "$@"
+}
+
+# ch QUERY — run a single query and print its result.
+ch() { ch_client --query "$1"; }
+
+# ch_multiquery SQL — run a multi-statement SQL text, fed on stdin.
+ch_multiquery() { ch_client --multiquery <<< "$1"; }
+
+# ch_insert_native TABLE — insert Native-format data read from stdin into TABLE.
+ch_insert_native() { ch_client --query "INSERT INTO $1 FORMAT Native"; }
+
+# sed_escape TEXT — escape \, / and & so TEXT is taken literally as the
+# replacement part of a sed s/// command.
+sed_escape() { printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'; }
+
+# ── Container check (authenticated, like every later query) ──────────────────
+log "Vérification du conteneur ${DEV_CONTAINER}…"
+if ! ch "SELECT 1" > /dev/null 2>&1; then
+    err "Le conteneur ${DEV_CONTAINER} n'est pas accessible"
+fi
+ok "Conteneur ${DEV_CONTAINER} accessible"
+
+# ── Optional reset ───────────────────────────────────────────────────────────
+if [ "${RESET}" = true ]; then
+    log "Reset demandé — suppression des bases de données…"
+    ch "DROP DATABASE IF EXISTS ${DB_LOGS}" 2>/dev/null || true
+    ch "DROP DATABASE IF EXISTS ${DB_PROC}" 2>/dev/null || true
+    ok "Bases ${DB_LOGS} et ${DB_PROC} supprimées"
+fi
+
+# ── SQL schema files ─────────────────────────────────────────────────────────
+SQL_FILES=(
+    00_database.sql
+    01_raw_tables.sql
+    02_dictionaries.sql
+    03_anubis_tables.sql
+    04_mv_http_logs.sql
+    05_aggregation_tables.sql
+    06_ml_tables.sql
+    07_ai_features_view.sql
+    08_users.sql
+    09_audit_table.sql
+    10_perf_indexes.sql
+    11_views.sql
+    12_thesis_features.sql
+)
+
+log "Application du schéma SQL (${#SQL_FILES[@]} fichiers)…"
+ERRORS=0
+
+for f in "${SQL_FILES[@]}"; do
+    filepath="${SQL_DIR}/${f}"
+    if [[ ! -f "${filepath}" ]]; then
+        echo " WARN: ${f} non trouvé, ignoré" >&2
+        continue
+    fi
+
+    # Substitute database names and credentials (escaped for sed: / and & are legal in passwords)
+    SQL_PATCHED=$(sed \
+        -e "s/ja4_logs/$(sed_escape "${DB_LOGS}")/g" \
+        -e "s/ja4_processing/$(sed_escape "${DB_PROC}")/g" \
+        -e "s/USER 'admin'/USER '$(sed_escape "${DEV_USER}")'/g" \
+        -e "s/PASSWORD 'CHANGE_ME'/PASSWORD '$(sed_escape "${DEV_PASSWORD}")'/g" \
+        -e "s/PASSWORD 'ChangeMe'/PASSWORD '$(sed_escape "${DEV_PASSWORD}")'/g" \
+        "${filepath}")
+
+    # stderr is captured (stdout still reaches the terminal through fd 3) so a
+    # failure is reported together with the ClickHouse error message.
+    if { SQL_ERR=$(ch_multiquery "${SQL_PATCHED}" 2>&1 >&3); } 3>&1; then
+        ok "${f}"
+    elif [[ "${f}" == 10_* ]]; then
+        # 10_perf_indexes.sql may fail when the indexes already exist
+        echo " ⚠ ${f} (erreurs ignorées — index déjà existants)"
+    else
+        echo " ✗ ${f} — ERREUR" >&2
+        echo "${SQL_ERR}" >&2
+        ERRORS=$((ERRORS + 1))
+    fi
+done
+
+if [ "${ERRORS}" -gt 0 ]; then
+    err "${ERRORS} fichier(s) SQL en erreur"
+fi
+
+# ── Post-schema migrations (columns missing on a pre-existing database) ──────
+log "Application des migrations post-schéma…"
+MIGRATIONS=(
+    "ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_xff SimpleAggregateFunction(sum, UInt64)"
+    "ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_unusual_ct SimpleAggregateFunction(sum, UInt64)"
+    "ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_non_std_port SimpleAggregateFunction(sum, UInt64)"
+    "ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_login_post SimpleAggregateFunction(sum, UInt64)"
+    "ALTER TABLE ${DB_PROC}.agg_header_fingerprint_1h ADD COLUMN IF NOT EXISTS sec_ch_mobile_mismatch SimpleAggregateFunction(max, UInt8)"
+)
+for mig in "${MIGRATIONS[@]}"; do
+    ch "${mig}" 2>/dev/null || true
+done
+ok "Migrations appliquées"
+
+# ── Cleanup of obsolete Anubis objects (UA, Country) ─────────────────────────
+log "Nettoyage des tables Anubis obsolètes…"
+ch "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_ua" 2>/dev/null || true
+ch "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_country" 2>/dev/null || true
+ch "DROP TABLE IF EXISTS ${DB_PROC}.anubis_ua_rules" 2>/dev/null || true
+ch "DROP TABLE IF EXISTS ${DB_PROC}.anubis_country_rules" 2>/dev/null || true
+ok "Tables obsolètes supprimées"
+
+# ── Schema verification ──────────────────────────────────────────────────────
+log "Vérification du schéma…"
+
+TABLE_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}')")
+DICT_COUNT=$(ch "SELECT count() FROM system.dictionaries WHERE database='${DB_PROC}'")
+VIEW_COUNT=$(ch "SELECT count() FROM system.tables WHERE database='${DB_PROC}' AND engine='View'")
+MV_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}') AND engine='MaterializedView'")
+
+ok "Tables: ${TABLE_COUNT} | Dictionnaires: ${DICT_COUNT} | Vues: ${VIEW_COUNT} | MVs: ${MV_COUNT}"
+
+# Critical tables: a missing one aborts the script
+CRITICAL_TABLES=(
+    "${DB_LOGS}.http_logs_raw"
+    "${DB_LOGS}.http_logs"
+    "${DB_PROC}.ml_detected_anomalies"
+    "${DB_PROC}.ml_all_scores"
+    "${DB_PROC}.agg_host_ip_ja4_1h"
+    "${DB_PROC}.anubis_ip_rules"
+    "${DB_PROC}.anubis_asn_rules"
+)
+for t in "${CRITICAL_TABLES[@]}"; do
+    db="${t%%.*}"
+    tbl="${t##*.}"
+    EXISTS=$(ch "SELECT count() FROM system.tables WHERE database='${db}' AND name='${tbl}'" 2>/dev/null || echo "0")
+    if [ "${EXISTS}" = "1" ]; then
+        ok " ${t}"
+    else
+        err " Table manquante : ${t}"
+    fi
+done
+
+# Critical dictionaries: a problem is reported as a warning only
+CRITICAL_DICTS=(
+    "dict_anubis_ip"
+    "dict_anubis_asn"
+    "dict_iplocate_asn"
+    "dict_bot_ip"
+    "dict_bot_ja4"
+    "dict_browser_ja4"
+    "dict_asn_reputation"
+)
+for d in "${CRITICAL_DICTS[@]}"; do
+    # An absent dictionary yields an empty result, not a query failure.
+    STATUS=$(ch "SELECT status FROM system.dictionaries WHERE database='${DB_PROC}' AND name='${d}'" 2>/dev/null || echo "MISSING")
+    if [ "${STATUS}" = "LOADED" ] || [ "${STATUS}" = "NOT_LOADED" ]; then
+        ok " ${d} (${STATUS})"
+    else
+        echo " ⚠ Dictionnaire ${d}: ${STATUS:-MISSING}"
+    fi
+done
+
+# ── Prod data import (optional) ──────────────────────────────────────────────
+if [ "${IMPORT_PROD}" = true ]; then
+    IMPORT_SCRIPT="${SCRIPT_DIR}/import-prod-data.sh"
+    if [[ -x "${IMPORT_SCRIPT}" ]]; then
+        log "Lancement de l'import des données prod…"
+        # import-prod-data.sh takes its logs database from DEV_DB_LOGS.
+        DEV_DB_LOGS="${DB_LOGS}" "${IMPORT_SCRIPT}" --container "${DEV_CONTAINER}"
+    else
+        echo " ⚠ Script d'import non trouvé : ${IMPORT_SCRIPT}"
+    fi
+fi
+
+# ── Summary ──────────────────────────────────────────────────────────────────
+log "════════════════════════════════════════════════════"
+log " Initialisation terminée"
+log " Bases : ${DB_LOGS}, ${DB_PROC}"
+log " Tables: ${TABLE_COUNT} | Dicts: ${DICT_COUNT} | MVs: ${MV_COUNT}"
+log "════════════════════════════════════════════════════"
diff --git a/services/bot-detector/anubis/deploy_schema.sql b/services/bot-detector/anubis/deploy_schema.sql
index ddd52ed..7318fe0 100644
--- a/services/bot-detector/anubis/deploy_schema.sql
+++ b/services/bot-detector/anubis/deploy_schema.sql
@@ -1,7 +1,6 @@
 -- ============================================================================
 -- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
 -- Architecture simplifiée (IP/CIDR et ASN uniquement) :
--- anubis_ua_rules (table stub) → dict_anubis_ua (REGEXP_TREE, catch-all)
 -- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
 -- anubis_asn_rules (table) → dict_anubis_asn (FLAT)
 -- http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
@@ -11,23 +10,7 @@
 -- ============================================================================
 
 -- ----------------------------------------------------------------------------
--- 1. TABLE SOURCE — règles User-Agent (stub REGEXP_TREE)
--- REGEXP_TREE nécessite ≥1 règle ; le catch-all est injecté à l'init.
--- Cette table n'est PAS peuplée par fetch_rules.py.
--- ----------------------------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
-(
-    id UInt64,
-    parent_id UInt64,
-    regexp String,
-    keys Array(String),
-    values Array(String)
-)
-ENGINE = ReplacingMergeTree()
-ORDER BY id;
-
--- ----------------------------------------------------------------------------
--- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
+-- 1. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
 -- Peuplée par fetch_rules.py depuis les fichiers YAML Anubis.
 -- ----------------------------------------------------------------------------
 CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
diff --git a/shared/clickhouse/03_anubis_tables.sql b/shared/clickhouse/03_anubis_tables.sql
index d2b706d..f13cce0 100644
--- a/shared/clickhouse/03_anubis_tables.sql
+++ b/shared/clickhouse/03_anubis_tables.sql
@@ -1,29 +1,10 @@
 -- =============================================================================
 -- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
--- Only IP/CIDR and ASN rules are populated by fetch_rules.py.
--- UA and Country dictionaries are kept as stubs (required by MV references)
--- but are never populated with real data.
+-- Only IP/CIDR and ASN rules are used. UA and Country have been removed.
 -- =============================================================================
 
 -- -----------------------------------------------------------------------------
--- 1. TABLE SOURCE — User-Agent rules (REGEXP_TREE stub)
--- REGEXP_TREE requires ≥1 rule; the catch-all is seeded at init time.
--- This table is NOT populated by fetch_rules.py.
--- -----------------------------------------------------------------------------
-CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
-(
-    id UInt64,
-    parent_id UInt64,
-    regexp String,
-    keys Array(String),
-    values Array(String)
-)
-ENGINE = ReplacingMergeTree()
-ORDER BY id;
-
-
--- -----------------------------------------------------------------------------
--- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
+-- 1. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
 -- Populated by fetch_rules.py from Anubis GitHub data.
 -- -----------------------------------------------------------------------------
 CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
diff --git a/tests/integration/platform/clickhouse-init.sh b/tests/integration/platform/clickhouse-init.sh
index 89a965c..2cc30db 100755
--- a/tests/integration/platform/clickhouse-init.sh
+++ b/tests/integration/platform/clickhouse-init.sh
@@ -34,16 +34,4 @@ for f in "$TMP_DIR"/*.sql; do
     fi
 done
 
-# ---------------------------------------------------------------------------
-# Seed data required for dictionaries to function
-# REGEXP_TREE dictionaries require at least one rule; without it, any INSERT
-# into http_logs_raw fails because the MV mv_http_logs calls dictGet() on
-# the empty dict. Insert a catch-all "unknown" rule so the pipeline works.
-# ---------------------------------------------------------------------------
-echo "[init] Seeding anubis_ua_rules (REGEXP_TREE needs ≥1 rule)..."
-clickhouse-client --multiquery <<'SEED'
-INSERT INTO ja4_processing.anubis_ua_rules (id, parent_id, regexp, keys, values) VALUES
-    (1, 0, '.*', ['bot_name','action','has_ip','rule_id','category'], ['','','0','0','']);
-SEED
-
-echo "[init] All SQL files executed and seed data inserted"
+echo "[init] All SQL files executed successfully"