feat(scripts): complete stack init + prod data import with date shift
Schema cleanup:
- Remove anubis_ua_rules table stub from 03_anubis_tables.sql
- Remove anubis_ua_rules from bot-detector deploy_schema.sql
- Remove UA seed step from clickhouse-init.sh (no more REGEXP_TREE dependency)
- Drop dict_anubis_ua, dict_anubis_country, anubis_ua_rules, anubis_country_rules
New scripts:
- scripts/init-stack.sh: comprehensive ClickHouse init (13 SQL files + migrations
+ validation + cleanup of obsolete tables). Supports --reset, --import-prod.
- scripts/import-prod-data.sh: imports pre-exported prod data (Native format)
with dynamic date shift (max(time) → now). Supports --shift, --no-truncate.
- scripts/data/prod-export/: directory for cached Native format exports
Makefile targets: init-stack, import-prod-data, init-and-import
Tested: init-stack.sh passes all 13 SQL + 7 critical tables + 7 dicts
import-prod-data.sh: 3M rows in ~37s with auto date shift
Dashboard: 55 routes OK, bot-detector: 36/36 tests pass
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -33,3 +33,6 @@ bot_detector_logs/
|
|||||||
.vscode/
|
.vscode/
|
||||||
.idea/
|
.idea/
|
||||||
*.swp
|
*.swp
|
||||||
|
|
||||||
|
# Exported prod data (large binary files)
|
||||||
|
scripts/data/prod-export/*.native
|
||||||
|
|||||||
13
Makefile
13
Makefile
@ -12,7 +12,7 @@
|
|||||||
build-bot-detector test-bot-detector \
|
build-bot-detector test-bot-detector \
|
||||||
build-dashboard test-dashboard \
|
build-dashboard test-dashboard \
|
||||||
test-ja4common-python \
|
test-ja4common-python \
|
||||||
reload-prod-logs
|
reload-prod-logs init-stack import-prod-data init-and-import
|
||||||
|
|
||||||
# --- Root -------------------------------------------------------------------
|
# --- Root -------------------------------------------------------------------
|
||||||
|
|
||||||
@ -143,4 +143,13 @@ test-integration-down:
|
|||||||
|
|
||||||
# ── Dev data ─────────────────────────────────────────────────────────────────
|
# ── Dev data ─────────────────────────────────────────────────────────────────
|
||||||
reload-prod-logs:
|
reload-prod-logs:
|
||||||
./scripts/reload-prod-logs.sh
|
./scripts/reload-prod-logs.sh
|
||||||
|
|
||||||
|
init-stack:
|
||||||
|
./scripts/init-stack.sh
|
||||||
|
|
||||||
|
import-prod-data:
|
||||||
|
./scripts/import-prod-data.sh
|
||||||
|
|
||||||
|
init-and-import:
|
||||||
|
./scripts/init-stack.sh --import-prod
|
||||||
|
|||||||
0
scripts/data/prod-export/.gitkeep
Normal file
0
scripts/data/prod-export/.gitkeep
Normal file
168
scripts/import-prod-data.sh
Executable file
168
scripts/import-prod-data.sh
Executable file
@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# import-prod-data.sh — Importe les données prod pré-extraites dans la DB de dev
|
||||||
|
# avec un décalage de date dynamique (max(time) → now()).
|
||||||
|
#
|
||||||
|
# Les données doivent avoir été exportées au format Native dans le répertoire
|
||||||
|
# scripts/data/prod-export/ (fichiers *.native). Utilisez reload-prod-logs.sh
|
||||||
|
# pour effectuer l'extraction initiale depuis la prod.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ./scripts/import-prod-data.sh # décalage auto
|
||||||
|
# ./scripts/import-prod-data.sh --shift 3600 # décalage manuel (secondes)
|
||||||
|
# ./scripts/import-prod-data.sh --container my-ch # conteneur spécifique
|
||||||
|
# ./scripts/import-prod-data.sh --no-truncate # conserver les données existantes
|
||||||
|
# ./scripts/import-prod-data.sh --cron # mode silencieux
|
||||||
|
#
|
||||||
|
# Variables d'environnement :
|
||||||
|
# DEV_CONTAINER Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
|
||||||
|
# DEV_DB_LOGS Base de données logs (défaut: ja4_logs)
|
||||||
|
# EXPORT_DIR Répertoire des fichiers Native (défaut: scripts/data/prod-export)
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||||
|
|
||||||
|
# ── Configuration ────────────────────────────────────────────────────────────
|
||||||
|
DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
|
||||||
|
DEV_DB_LOGS="${DEV_DB_LOGS:-ja4_logs}"
|
||||||
|
EXPORT_DIR="${EXPORT_DIR:-${REPO_ROOT}/scripts/data/prod-export}"
|
||||||
|
SHIFT_SECONDS=""
|
||||||
|
NO_TRUNCATE=false
|
||||||
|
CRON_MODE=false
|
||||||
|
|
||||||
|
# ── Parsing des arguments ────────────────────────────────────────────────────
|
||||||
|
# Parse command-line options.
# Options that take a value (--container, --shift) are checked for a missing
# argument first: under `set -u` a bare "$2" would otherwise abort the script
# with an opaque "unbound variable" error instead of a usable message.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --container|--shift)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 : valeur manquante" >&2
        exit 1
      fi
      if [[ "$1" == "--container" ]]; then
        DEV_CONTAINER="$2"
      else
        # The shift is later interpolated into shell arithmetic and into the
        # SQL text, so only a (possibly negative) integer is accepted. An
        # empty value is still allowed and keeps the automatic shift.
        if [[ -n "$2" && ! "$2" =~ ^-?[0-9]+$ ]]; then
          echo "Option --shift : nombre entier de secondes attendu, reçu '$2'" >&2
          exit 1
        fi
        SHIFT_SECONDS="$2"
      fi
      shift 2
      ;;
    --no-truncate) NO_TRUNCATE=true; shift ;;
    --cron)        CRON_MODE=true; shift ;;
    -h|--help)
      # Print the header comment of this script as the help text.
      sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
      exit 0
      ;;
    # Usage errors belong on stderr so they are not lost when stdout is piped.
    *) echo "Option inconnue : $1" >&2; exit 1 ;;
  esac
done
|
||||||
|
|
||||||
|
STAGING_TABLE="${DEV_DB_LOGS}._staging_prod_import"
|
||||||
|
LOG_PREFIX="[import-prod]"
|
||||||
|
|
||||||
|
# Print a timestamped progress message on stdout unless CRON_MODE is enabled.
# Always returns 0 so that a suppressed message never trips `set -e`.
log() {
  if [ "${CRON_MODE}" = false ]; then
    echo "${LOG_PREFIX} $(date '+%H:%M:%S') $*" || true
  fi
  return 0
}

# Print an error message on stderr and abort the script with exit status 1.
err() {
  echo "${LOG_PREFIX} ERREUR: $*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# ── Requêteur CH ─────────────────────────────────────────────────────────────
|
||||||
|
# Run one SQL statement through clickhouse-client inside the dev container.
#   $1  SQL text to execute
# The query result is written to stdout; the exit status is docker's.
ch() {
  local sql="$1"
  docker exec -i "${DEV_CONTAINER}" clickhouse-client --query "${sql}"
}
|
||||||
|
|
||||||
|
# Stream Native-format data from stdin into the staging table.
# The target is always ${STAGING_TABLE}; the function takes no arguments.
ch_insert_native() {
  local insert_sql="INSERT INTO ${STAGING_TABLE} FORMAT Native"
  docker exec -i "${DEV_CONTAINER}" clickhouse-client --query "${insert_sql}"
}
|
||||||
|
|
||||||
|
# ── Vérifications ────────────────────────────────────────────────────────────
|
||||||
|
log "Vérification du conteneur ${DEV_CONTAINER}…"
|
||||||
|
docker exec "${DEV_CONTAINER}" clickhouse-client --query "SELECT 1" > /dev/null 2>&1 \
|
||||||
|
|| err "Conteneur ${DEV_CONTAINER} inaccessible"
|
||||||
|
|
||||||
|
log "Vérification des fichiers d'export dans ${EXPORT_DIR}…"
|
||||||
|
NATIVE_FILES=("${EXPORT_DIR}"/*.native)
|
||||||
|
if [[ ! -f "${NATIVE_FILES[0]}" ]]; then
|
||||||
|
err "Aucun fichier *.native trouvé dans ${EXPORT_DIR}. Lancez d'abord reload-prod-logs.sh."
|
||||||
|
fi
|
||||||
|
FILE_COUNT=${#NATIVE_FILES[@]}
|
||||||
|
log " ${FILE_COUNT} fichier(s) Native trouvé(s)"
|
||||||
|
|
||||||
|
# ── Création de la table de staging ──────────────────────────────────────────
|
||||||
|
log "Création de la table de staging…"
|
||||||
|
ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
|
||||||
|
ch "CREATE TABLE ${STAGING_TABLE} AS ${DEV_DB_LOGS}.http_logs ENGINE = MergeTree() ORDER BY (time, src_ip) SETTINGS index_granularity = 8192"
|
||||||
|
|
||||||
|
# ── Import des fichiers Native ───────────────────────────────────────────────
|
||||||
|
log "Import des ${FILE_COUNT} fichiers dans le staging…"
|
||||||
|
TOTAL_IMPORTED=0
|
||||||
|
|
||||||
|
# Load every Native export file into the staging table, one file at a time.
# Empty files are skipped; any failed insert aborts the whole script.
for native_file in "${NATIVE_FILES[@]}"; do
  file_name=$(basename "${native_file}")
  # GNU stat first, BSD stat second; fall back to 0 when neither works.
  size_bytes=$(stat -c%s "${native_file}" 2>/dev/null || stat -f%z "${native_file}" 2>/dev/null || echo 0)

  if [ "${size_bytes}" -eq 0 ]; then
    log " ${file_name} — vide, ignoré"
    continue
  fi

  log " ${file_name} ($(( size_bytes / 1024 / 1024 )) Mo)…"
  if ! ch_insert_native < "${native_file}"; then
    err "Import échoué pour ${file_name}"
  fi

  # Rows contributed by this file = staging row count now minus the running total.
  rows_now=$(ch "SELECT count() FROM ${STAGING_TABLE}")
  rows_added=$(( rows_now - TOTAL_IMPORTED ))
  TOTAL_IMPORTED=${rows_now}
  log " → ${rows_added} lignes (total staging: ${TOTAL_IMPORTED})"
done
|
||||||
|
|
||||||
|
log "Staging terminé : ${TOTAL_IMPORTED} lignes."
|
||||||
|
|
||||||
|
if [ "${TOTAL_IMPORTED}" -eq 0 ]; then
|
||||||
|
ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
|
||||||
|
err "Aucune donnée importée dans le staging"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Calcul du décalage de date ───────────────────────────────────────────────
|
||||||
|
if [ -z "${SHIFT_SECONDS}" ]; then
|
||||||
|
STAGING_MAX_TS=$(ch "SELECT toUnixTimestamp(max(time)) FROM ${STAGING_TABLE}")
|
||||||
|
NOW_TS=$(date +%s)
|
||||||
|
SHIFT_SECONDS=$(( NOW_TS - STAGING_MAX_TS ))
|
||||||
|
log "Décalage auto : ${SHIFT_SECONDS}s (max(staging) → now)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Décalage appliqué : ${SHIFT_SECONDS} secondes ($(( SHIFT_SECONDS / 3600 ))h $(( (SHIFT_SECONDS % 3600) / 60 ))min)"
|
||||||
|
|
||||||
|
# ── Nettoyage de la table cible ──────────────────────────────────────────────
|
||||||
|
if [ "${NO_TRUNCATE}" = false ]; then
|
||||||
|
log "Nettoyage de ${DEV_DB_LOGS}.http_logs…"
|
||||||
|
ch "TRUNCATE TABLE ${DEV_DB_LOGS}.http_logs"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Insertion avec décalage de date ──────────────────────────────────────────
|
||||||
|
log "Insertion avec décalage dans ${DEV_DB_LOGS}.http_logs…"
|
||||||
|
# Copy the staged rows into http_logs, moving `time` (and the date derived
# from it) forward by SHIFT_SECONDS.
#
# The two computed expressions are deliberately left WITHOUT aliases. Writing
# `time + ... AS time` creates an alias that shadows the source column, and
# with ClickHouse's default alias resolution the next expression
# `toDate(time + ...)` would then operate on the already-shifted value,
# shifting log_date twice. The INSERT has no column list, so columns are
# matched by position and the aliases were never needed.
ch "
INSERT INTO ${DEV_DB_LOGS}.http_logs
SELECT
    time + toIntervalSecond(${SHIFT_SECONDS}),
    toDate(time + toIntervalSecond(${SHIFT_SECONDS})),
    src_ip, src_port, dst_ip, dst_port,
    src_asn, src_country_code, src_as_name, src_org, src_domain,
    method, scheme, host, path, query, http_version,
    orphan_side, correlated, keepalives,
    a_timestamp, b_timestamp, conn_id,
    ip_meta_df, ip_meta_id, ip_meta_total_length, ip_meta_ttl,
    tcp_meta_options, tcp_meta_window_size, tcp_meta_mss, tcp_meta_window_scale,
    syn_to_clienthello_ms,
    tls_version, tls_sni, tls_alpn, ja3, ja3_hash, ja4,
    client_headers,
    header_user_agent, header_accept, header_accept_encoding, header_accept_language,
    header_content_type, header_x_request_id, header_x_trace_id, header_x_forwarded_for,
    header_sec_ch_ua, header_sec_ch_ua_mobile, header_sec_ch_ua_platform,
    header_sec_fetch_dest, header_sec_fetch_mode, header_sec_fetch_site,
    anubis_bot_name, anubis_bot_action, anubis_bot_category
FROM ${STAGING_TABLE}
"
|
||||||
|
|
||||||
|
FINAL_COUNT=$(ch "SELECT count() FROM ${DEV_DB_LOGS}.http_logs")
|
||||||
|
FINAL_MIN=$(ch "SELECT min(time) FROM ${DEV_DB_LOGS}.http_logs")
|
||||||
|
FINAL_MAX=$(ch "SELECT max(time) FROM ${DEV_DB_LOGS}.http_logs")
|
||||||
|
|
||||||
|
# ── Nettoyage ────────────────────────────────────────────────────────────────
|
||||||
|
log "Nettoyage de la table de staging…"
|
||||||
|
ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
|
||||||
|
|
||||||
|
# ── Résultat ─────────────────────────────────────────────────────────────────
|
||||||
|
log "════════════════════════════════════════════════════"
|
||||||
|
log " Import terminé : ${FINAL_COUNT} lignes"
|
||||||
|
log " Plage : ${FINAL_MIN} → ${FINAL_MAX}"
|
||||||
|
log " Décalage : ${SHIFT_SECONDS}s"
|
||||||
|
log "════════════════════════════════════════════════════"
|
||||||
251
scripts/init-stack.sh
Executable file
251
scripts/init-stack.sh
Executable file
@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# init-stack.sh — Initialisation complète de la stack ClickHouse pour ja4-platform
|
||||||
|
#
|
||||||
|
# Ce script exécute l'ensemble du schéma SQL, charge les données CSV de
|
||||||
|
# référence et vérifie que tous les composants sont opérationnels.
|
||||||
|
# Il est utilisé par les tests d'intégration et pour la mise en place de
|
||||||
|
# l'environnement de développement.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ./scripts/init-stack.sh # init dev stack
|
||||||
|
# ./scripts/init-stack.sh --container my-ch-1 # conteneur spécifique
|
||||||
|
# ./scripts/init-stack.sh --user admin --pass X # credentials spécifiques
|
||||||
|
# ./scripts/init-stack.sh --import-prod # init + import données prod
|
||||||
|
# ./scripts/init-stack.sh --reset # DROP databases, recréer tout
|
||||||
|
#
|
||||||
|
# Variables d'environnement :
|
||||||
|
# DEV_CONTAINER Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
|
||||||
|
# DEV_USER Utilisateur ClickHouse (défaut: default)
|
||||||
|
# DEV_PASSWORD Mot de passe ClickHouse (défaut: vide)
|
||||||
|
# CLICKHOUSE_DB_LOGS Base de données logs (défaut: ja4_logs)
|
||||||
|
# CLICKHOUSE_DB_PROC Base de données processing (défaut: ja4_processing)
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||||
|
|
||||||
|
# ── Configuration ────────────────────────────────────────────────────────────
|
||||||
|
DEV_CONTAINER="${DEV_CONTAINER:-integration-clickhouse-1}"
|
||||||
|
DEV_USER="${DEV_USER:-default}"
|
||||||
|
DEV_PASSWORD="${DEV_PASSWORD:-}"
|
||||||
|
DB_LOGS="${CLICKHOUSE_DB_LOGS:-ja4_logs}"
|
||||||
|
DB_PROC="${CLICKHOUSE_DB_PROC:-ja4_processing}"
|
||||||
|
IMPORT_PROD=false
|
||||||
|
RESET=false
|
||||||
|
|
||||||
|
# ── Parsing des arguments ────────────────────────────────────────────────────
|
||||||
|
# Parse command-line options.
# Options that take a value are checked for a missing argument first: under
# `set -u` a bare "$2" would otherwise abort the script with an opaque
# "unbound variable" error instead of a usable message.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --container|--user|--pass)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 : valeur manquante" >&2
        exit 1
      fi
      case "$1" in
        --container) DEV_CONTAINER="$2" ;;
        --user)      DEV_USER="$2" ;;
        --pass)      DEV_PASSWORD="$2" ;;
      esac
      shift 2
      ;;
    --import-prod) IMPORT_PROD=true; shift ;;
    --reset)       RESET=true; shift ;;
    -h|--help)
      # Print the header comment of this script as the help text.
      sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
      exit 0
      ;;
    # Usage errors belong on stderr so they are not lost when stdout is piped.
    *) echo "Option inconnue : $1" >&2; exit 1 ;;
  esac
done
|
||||||
|
|
||||||
|
SQL_DIR="${REPO_ROOT}/shared/clickhouse"
|
||||||
|
|
||||||
|
# ── Couleurs ─────────────────────────────────────────────────────────────────
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
CYAN='\033[0;36m'
|
||||||
|
NC='\033[0m'
|
||||||
|
log() { echo -e "${CYAN}[init]${NC} $(date '+%H:%M:%S') $*"; }
|
||||||
|
ok() { echo -e "${GREEN} ✓ $*${NC}"; }
|
||||||
|
err() { echo -e "${RED} ✗ $*${NC}" >&2; exit 1; }
|
||||||
|
|
||||||
|
# ── Requêteur CH ─────────────────────────────────────────────────────────────
|
||||||
|
# Run one SQL statement through clickhouse-client inside the dev container.
#   $1  SQL text to execute
# Credentials (--user / --password) are passed only when DEV_PASSWORD is
# non-empty; otherwise the client connects with its defaults.
ch() {
  local -a client_args=(--query "$1")
  [[ -z "${DEV_PASSWORD}" ]] || client_args+=(--user "${DEV_USER}" --password "${DEV_PASSWORD}")
  docker exec -i "${DEV_CONTAINER}" clickhouse-client "${client_args[@]}"
}
|
||||||
|
|
||||||
|
# Run a multi-statement SQL script through clickhouse-client --multiquery.
#   $1  SQL text, fed to the client on stdin
# Credentials (--user / --password) are passed only when DEV_PASSWORD is
# non-empty; otherwise the client connects with its defaults.
ch_multiquery() {
  local sql_script="$1"
  local -a client_args=(--multiquery)
  [[ -z "${DEV_PASSWORD}" ]] || client_args+=(--user "${DEV_USER}" --password "${DEV_PASSWORD}")
  docker exec -i "${DEV_CONTAINER}" clickhouse-client "${client_args[@]}" <<< "${sql_script}"
}
|
||||||
|
|
||||||
|
# Stream Native-format data from stdin into a table.
#   $1     fully qualified target table
#   stdin  Native-format data
# Credentials (--user / --password) are passed only when DEV_PASSWORD is
# non-empty; otherwise the client connects with its defaults.
ch_insert_native() {
  local -a client_args=(--query "INSERT INTO $1 FORMAT Native")
  [[ -z "${DEV_PASSWORD}" ]] || client_args+=(--user "${DEV_USER}" --password "${DEV_PASSWORD}")
  docker exec -i "${DEV_CONTAINER}" clickhouse-client "${client_args[@]}"
}
|
||||||
|
|
||||||
|
# ── Vérification du conteneur ────────────────────────────────────────────────
|
||||||
|
log "Vérification du conteneur ${DEV_CONTAINER}…"
|
||||||
|
if ! docker exec "${DEV_CONTAINER}" clickhouse-client --query "SELECT 1" > /dev/null 2>&1; then
|
||||||
|
err "Le conteneur ${DEV_CONTAINER} n'est pas accessible"
|
||||||
|
fi
|
||||||
|
ok "Conteneur ${DEV_CONTAINER} accessible"
|
||||||
|
|
||||||
|
# ── Reset optionnel ──────────────────────────────────────────────────────────
|
||||||
|
if [ "${RESET}" = true ]; then
|
||||||
|
log "Reset demandé — suppression des bases de données…"
|
||||||
|
ch "DROP DATABASE IF EXISTS ${DB_LOGS}" 2>/dev/null || true
|
||||||
|
ch "DROP DATABASE IF EXISTS ${DB_PROC}" 2>/dev/null || true
|
||||||
|
ok "Bases ${DB_LOGS} et ${DB_PROC} supprimées"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Exécution des fichiers SQL ───────────────────────────────────────────────
|
||||||
|
SQL_FILES=(
|
||||||
|
00_database.sql
|
||||||
|
01_raw_tables.sql
|
||||||
|
02_dictionaries.sql
|
||||||
|
03_anubis_tables.sql
|
||||||
|
04_mv_http_logs.sql
|
||||||
|
05_aggregation_tables.sql
|
||||||
|
06_ml_tables.sql
|
||||||
|
07_ai_features_view.sql
|
||||||
|
08_users.sql
|
||||||
|
09_audit_table.sql
|
||||||
|
10_perf_indexes.sql
|
||||||
|
11_views.sql
|
||||||
|
12_thesis_features.sql
|
||||||
|
)
|
||||||
|
|
||||||
|
log "Application du schéma SQL (${#SQL_FILES[@]} fichiers)…"
|
||||||
|
ERRORS=0
|
||||||
|
|
||||||
|
# Apply every schema file in order. A file that is missing on disk is skipped
# with a warning. A failure in 10_* is tolerated; any other failure is counted
# in ERRORS.
#
# The client's stderr is captured instead of being thrown away, so that a
# failing file reports the actual ClickHouse error rather than a bare
# "ERREUR". stdout is still passed through untouched via file descriptor 3.
for f in "${SQL_FILES[@]}"; do
  filepath="${SQL_DIR}/${f}"
  if [[ ! -f "${filepath}" ]]; then
    echo " WARN: ${f} non trouvé, ignoré" >&2
    continue
  fi

  # Substitute the database names and the credentials placeholders.
  SQL_PATCHED=$(sed \
    -e "s/ja4_logs/${DB_LOGS}/g" \
    -e "s/ja4_processing/${DB_PROC}/g" \
    -e "s/USER 'admin'/USER '${DEV_USER}'/g" \
    -e "s/PASSWORD 'CHANGE_ME'/PASSWORD '${DEV_PASSWORD}'/g" \
    -e "s/PASSWORD 'ChangeMe'/PASSWORD '${DEV_PASSWORD}'/g" \
    "${filepath}")

  # `2>&1 >&3`: stderr goes into the capture, stdout goes to the real stdout.
  if { SQL_ERROR=$(ch_multiquery "${SQL_PATCHED}" 2>&1 >&3); } 3>&1; then
    ok "${f}"
  elif [[ "${f}" == 10_* ]]; then
    # 10_perf_indexes.sql may fail when the indexes already exist; the error
    # text is still shown so that a different cause does not go unnoticed.
    echo " ⚠ ${f} (erreurs ignorées — index déjà existants)"
    if [[ -n "${SQL_ERROR}" ]]; then
      printf '%s\n' "${SQL_ERROR}" >&2
    fi
  else
    echo " ✗ ${f} — ERREUR" >&2
    if [[ -n "${SQL_ERROR}" ]]; then
      printf '%s\n' "${SQL_ERROR}" >&2
    fi
    ERRORS=$((ERRORS + 1))
  fi
done
|
||||||
|
|
||||||
|
if [ "${ERRORS}" -gt 0 ]; then
|
||||||
|
err "${ERRORS} fichier(s) SQL en erreur"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Migrations post-schéma (colonnes manquantes sur DB existante) ────────────
|
||||||
|
log "Application des migrations post-schéma…"
|
||||||
|
MIGRATIONS=(
|
||||||
|
"ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_xff SimpleAggregateFunction(sum, UInt64)"
|
||||||
|
"ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_unusual_ct SimpleAggregateFunction(sum, UInt64)"
|
||||||
|
"ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_non_std_port SimpleAggregateFunction(sum, UInt64)"
|
||||||
|
"ALTER TABLE ${DB_PROC}.agg_host_ip_ja4_1h ADD COLUMN IF NOT EXISTS count_login_post SimpleAggregateFunction(sum, UInt64)"
|
||||||
|
"ALTER TABLE ${DB_PROC}.agg_header_fingerprint_1h ADD COLUMN IF NOT EXISTS sec_ch_mobile_mismatch SimpleAggregateFunction(max, UInt8)"
|
||||||
|
)
|
||||||
|
for mig in "${MIGRATIONS[@]}"; do
|
||||||
|
ch "${mig}" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
ok "Migrations appliquées"
|
||||||
|
|
||||||
|
# ── Nettoyage des tables Anubis obsolètes (UA, Country) ─────────────────────
|
||||||
|
log "Nettoyage des tables Anubis obsolètes…"
|
||||||
|
ch "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_ua" 2>/dev/null || true
|
||||||
|
ch "DROP DICTIONARY IF EXISTS ${DB_PROC}.dict_anubis_country" 2>/dev/null || true
|
||||||
|
ch "DROP TABLE IF EXISTS ${DB_PROC}.anubis_ua_rules" 2>/dev/null || true
|
||||||
|
ch "DROP TABLE IF EXISTS ${DB_PROC}.anubis_country_rules" 2>/dev/null || true
|
||||||
|
ok "Tables obsolètes supprimées"
|
||||||
|
|
||||||
|
# ── Vérification du schéma ───────────────────────────────────────────────────
|
||||||
|
log "Vérification du schéma…"
|
||||||
|
|
||||||
|
TABLE_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}')")
|
||||||
|
DICT_COUNT=$(ch "SELECT count() FROM system.dictionaries WHERE database='${DB_PROC}'")
|
||||||
|
VIEW_COUNT=$(ch "SELECT count() FROM system.tables WHERE database='${DB_PROC}' AND engine='View'")
|
||||||
|
MV_COUNT=$(ch "SELECT count() FROM system.tables WHERE database IN ('${DB_LOGS}','${DB_PROC}') AND engine='MaterializedView'")
|
||||||
|
|
||||||
|
ok "Tables: ${TABLE_COUNT} | Dictionnaires: ${DICT_COUNT} | Vues: ${VIEW_COUNT} | MVs: ${MV_COUNT}"
|
||||||
|
|
||||||
|
# Vérification des tables critiques
|
||||||
|
CRITICAL_TABLES=(
|
||||||
|
"${DB_LOGS}.http_logs_raw"
|
||||||
|
"${DB_LOGS}.http_logs"
|
||||||
|
"${DB_PROC}.ml_detected_anomalies"
|
||||||
|
"${DB_PROC}.ml_all_scores"
|
||||||
|
"${DB_PROC}.agg_host_ip_ja4_1h"
|
||||||
|
"${DB_PROC}.anubis_ip_rules"
|
||||||
|
"${DB_PROC}.anubis_asn_rules"
|
||||||
|
)
|
||||||
|
# Verify that every critical table exists; abort on the first missing one.
for qualified_name in "${CRITICAL_TABLES[@]}"; do
  # Split "database.table" on the dot.
  db_name="${qualified_name%%.*}"
  table_name="${qualified_name##*.}"
  # A failing query is treated the same as "table not found".
  found=$(ch "SELECT count() FROM system.tables WHERE database='${db_name}' AND name='${table_name}'" 2>/dev/null || echo "0")
  if [ "${found}" != "1" ]; then
    err " Table manquante : ${qualified_name}"
  fi
  ok " ${qualified_name}"
done
|
||||||
|
|
||||||
|
# Vérification des dictionnaires critiques
|
||||||
|
CRITICAL_DICTS=(
|
||||||
|
"dict_anubis_ip"
|
||||||
|
"dict_anubis_asn"
|
||||||
|
"dict_iplocate_asn"
|
||||||
|
"dict_bot_ip"
|
||||||
|
"dict_bot_ja4"
|
||||||
|
"dict_browser_ja4"
|
||||||
|
"dict_asn_reputation"
|
||||||
|
)
|
||||||
|
# Report the status of every critical dictionary. A dictionary whose status
# is LOADED or NOT_LOADED is accepted; anything else only produces a warning.
for d in "${CRITICAL_DICTS[@]}"; do
  STATUS=$(ch "SELECT status FROM system.dictionaries WHERE database='${DB_PROC}' AND name='${d}'" 2>/dev/null || echo "MISSING")
  # When the dictionary does not exist the query succeeds with zero rows, so
  # the `|| echo` fallback above never fires and STATUS is empty. Map that
  # case to MISSING as well so the warning below is meaningful.
  STATUS="${STATUS:-MISSING}"
  if [ "${STATUS}" = "LOADED" ] || [ "${STATUS}" = "NOT_LOADED" ]; then
    ok " ${d} (${STATUS})"
  else
    echo " ⚠ Dictionnaire ${d}: ${STATUS}"
  fi
done
|
||||||
|
|
||||||
|
# ── Import des données prod (optionnel) ──────────────────────────────────────
|
||||||
|
if [ "${IMPORT_PROD}" = true ]; then
|
||||||
|
IMPORT_SCRIPT="${SCRIPT_DIR}/import-prod-data.sh"
|
||||||
|
if [[ -x "${IMPORT_SCRIPT}" ]]; then
|
||||||
|
log "Lancement de l'import des données prod…"
|
||||||
|
"${IMPORT_SCRIPT}" --container "${DEV_CONTAINER}"
|
||||||
|
else
|
||||||
|
echo " ⚠ Script d'import non trouvé : ${IMPORT_SCRIPT}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Résultat ─────────────────────────────────────────────────────────────────
|
||||||
|
log "════════════════════════════════════════════════════"
|
||||||
|
log " Initialisation terminée"
|
||||||
|
log " Bases : ${DB_LOGS}, ${DB_PROC}"
|
||||||
|
log " Tables: ${TABLE_COUNT} | Dicts: ${DICT_COUNT} | MVs: ${MV_COUNT}"
|
||||||
|
log "════════════════════════════════════════════════════"
|
||||||
@ -1,7 +1,6 @@
|
|||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
-- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
|
-- ANUBIS CRAWLER RULES — Labeling des http_logs + pipeline ML
|
||||||
-- Architecture simplifiée (IP/CIDR et ASN uniquement) :
|
-- Architecture simplifiée (IP/CIDR et ASN uniquement) :
|
||||||
-- anubis_ua_rules (table stub) → dict_anubis_ua (REGEXP_TREE, catch-all)
|
|
||||||
-- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
|
-- anubis_ip_rules (table) → dict_anubis_ip (IP_TRIE)
|
||||||
-- anubis_asn_rules (table) → dict_anubis_asn (FLAT)
|
-- anubis_asn_rules (table) → dict_anubis_asn (FLAT)
|
||||||
-- http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
-- http_logs : +anubis_bot_name, +anubis_bot_action, +anubis_bot_category
|
||||||
@ -11,23 +10,7 @@
|
|||||||
-- ============================================================================
|
-- ============================================================================
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
-- 1. TABLE SOURCE — règles User-Agent (stub REGEXP_TREE)
|
-- 1. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
|
||||||
-- REGEXP_TREE nécessite ≥1 règle ; le catch-all est injecté à l'init.
|
|
||||||
-- Cette table n'est PAS peuplée par fetch_rules.py.
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
|
||||||
(
|
|
||||||
id UInt64,
|
|
||||||
parent_id UInt64,
|
|
||||||
regexp String,
|
|
||||||
keys Array(String),
|
|
||||||
values Array(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY id;
|
|
||||||
|
|
||||||
-- ----------------------------------------------------------------------------
|
|
||||||
-- 2. TABLE SOURCE — règles IP/CIDR (pour dictionnaire IP_TRIE)
|
|
||||||
-- Peuplée par fetch_rules.py depuis les fichiers YAML Anubis.
|
-- Peuplée par fetch_rules.py depuis les fichiers YAML Anubis.
|
||||||
-- ----------------------------------------------------------------------------
|
-- ----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
||||||
|
|||||||
@ -1,29 +1,10 @@
|
|||||||
-- =============================================================================
|
-- =============================================================================
|
||||||
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
|
-- 03_anubis_tables.sql — Anubis crawler rule tables and dictionaries
|
||||||
-- Only IP/CIDR and ASN rules are populated by fetch_rules.py.
|
-- Only IP/CIDR and ASN rules are used. UA and Country have been removed.
|
||||||
-- UA and Country dictionaries are kept as stubs (required by MV references)
|
|
||||||
-- but are never populated with real data.
|
|
||||||
-- =============================================================================
|
-- =============================================================================
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
-- 1. TABLE SOURCE — User-Agent rules (REGEXP_TREE stub)
|
-- 1. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
|
||||||
-- REGEXP_TREE requires ≥1 rule; the catch-all is seeded at init time.
|
|
||||||
-- This table is NOT populated by fetch_rules.py.
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ua_rules
|
|
||||||
(
|
|
||||||
id UInt64,
|
|
||||||
parent_id UInt64,
|
|
||||||
regexp String,
|
|
||||||
keys Array(String),
|
|
||||||
values Array(String)
|
|
||||||
)
|
|
||||||
ENGINE = ReplacingMergeTree()
|
|
||||||
ORDER BY id;
|
|
||||||
|
|
||||||
|
|
||||||
-- -----------------------------------------------------------------------------
|
|
||||||
-- 2. TABLE SOURCE — IP/CIDR rules (for IP_TRIE dictionary)
|
|
||||||
-- Populated by fetch_rules.py from Anubis GitHub data.
|
-- Populated by fetch_rules.py from Anubis GitHub data.
|
||||||
-- -----------------------------------------------------------------------------
|
-- -----------------------------------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
CREATE TABLE IF NOT EXISTS ja4_processing.anubis_ip_rules
|
||||||
|
|||||||
@ -34,16 +34,4 @@ for f in "$TMP_DIR"/*.sql; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
echo "[init] All SQL files executed successfully"
|
||||||
# Seed data required for dictionaries to function
|
|
||||||
# REGEXP_TREE dictionaries require at least one rule; without it, any INSERT
|
|
||||||
# into http_logs_raw fails because the MV mv_http_logs calls dictGet() on
|
|
||||||
# the empty dict. Insert a catch-all "unknown" rule so the pipeline works.
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
echo "[init] Seeding anubis_ua_rules (REGEXP_TREE needs ≥1 rule)..."
|
|
||||||
clickhouse-client --multiquery <<'SEED'
|
|
||||||
INSERT INTO ja4_processing.anubis_ua_rules (id, parent_id, regexp, keys, values) VALUES
|
|
||||||
(1, 0, '.*', ['bot_name','action','has_ip','rule_id','category'], ['','','0','0','']);
|
|
||||||
SEED
|
|
||||||
|
|
||||||
echo "[init] All SQL files executed and seed data inserted"
|
|
||||||
|
|||||||
Reference in New Issue
Block a user