feat(scripts): complete stack init + prod data import with date shift

Schema cleanup:
- Remove anubis_ua_rules table stub from 03_anubis_tables.sql
- Remove anubis_ua_rules from bot-detector deploy_schema.sql
- Remove UA seed step from clickhouse-init.sh (no more REGEXP_TREE dependency)
- Drop dict_anubis_ua, dict_anubis_country, anubis_ua_rules, anubis_country_rules

New scripts:
- scripts/init-stack.sh: comprehensive ClickHouse init (13 SQL files + migrations
  + validation + cleanup of obsolete tables). Supports --reset, --import-prod.
- scripts/import-prod-data.sh: imports pre-exported prod data (Native format)
  with dynamic date shift (max(time) → now). Supports --shift, --no-truncate.
- scripts/data/prod-export/: directory for cached Native format exports

Makefile targets: init-stack, import-prod-data, init-and-import

Tested: init-stack.sh passes all 13 SQL + 7 critical tables + 7 dicts
        import-prod-data.sh: 3M rows in ~37s with auto date shift
        Dashboard: 55 routes OK, bot-detector: 36/36 tests pass

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-09 21:40:05 +02:00
parent d8ca804a55
commit 9ea36ad22e
8 changed files with 437 additions and 54 deletions

168
scripts/import-prod-data.sh Executable file
View File

@ -0,0 +1,168 @@
#!/usr/bin/env bash
# =============================================================================
# import-prod-data.sh — Importe les données prod pré-extraites dans la DB de dev
# avec un décalage de date dynamique (max(time) → now()).
#
# Les données doivent avoir été exportées au format Native dans le répertoire
# scripts/data/prod-export/ (fichiers *.native). Utilisez reload-prod-logs.sh
# pour effectuer l'extraction initiale depuis la prod.
#
# Usage:
# ./scripts/import-prod-data.sh # décalage auto
# ./scripts/import-prod-data.sh --shift 3600 # décalage manuel (secondes)
# ./scripts/import-prod-data.sh --container my-ch # conteneur spécifique
# ./scripts/import-prod-data.sh --no-truncate # conserver les données existantes
# ./scripts/import-prod-data.sh --cron # mode silencieux
#
# Variables d'environnement :
# DEV_CONTAINER Nom du conteneur ClickHouse (défaut: integration-clickhouse-1)
# DEV_DB_LOGS Base de données logs (défaut: ja4_logs)
# EXPORT_DIR Répertoire des fichiers Native (défaut: scripts/data/prod-export)
# =============================================================================
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute directory of this script, and the repository root one level above.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# ── Configuration ────────────────────────────────────────────────────────────
# Values already present in the environment win; otherwise the defaults below
# are assigned.
: "${DEV_CONTAINER:=integration-clickhouse-1}"
: "${DEV_DB_LOGS:=ja4_logs}"
: "${EXPORT_DIR:=${REPO_ROOT}/scripts/data/prod-export}"

# Option state, overwritten by the command-line parser.
SHIFT_SECONDS=""   # empty means "compute the shift automatically"
NO_TRUNCATE=false
CRON_MODE=false
# ── Argument parsing ─────────────────────────────────────────────────────────
# parse_args ARG...
#   Applies the command-line options to DEV_CONTAINER, SHIFT_SECONDS,
#   NO_TRUNCATE and CRON_MODE.
#   Exits 1 with a message on stderr when an option is unknown, when
#   --container/--shift is given without a value, or when the --shift value is
#   not a plain decimal integer. Exits 0 after printing the help text.
parse_args() {
  # Optional sign, no leading zeros. The value is later used in shell
  # arithmetic (where a leading 0 means octal) and interpolated into SQL, so
  # anything that is not a plain integer is refused up front.
  local -r integer_re='^[+-]?(0|[1-9][0-9]*)$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --container)
        # Checked explicitly: under `set -u` a missing value would otherwise
        # abort with a bare "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 : valeur manquante" >&2
          exit 1
        fi
        DEV_CONTAINER="$2"
        shift 2
        ;;
      --shift)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 : valeur manquante" >&2
          exit 1
        fi
        # An empty value is still accepted and leaves the automatic shift on.
        if [[ -n "$2" && ! "$2" =~ ${integer_re} ]]; then
          echo "Option --shift : entier attendu (secondes), reçu '$2'" >&2
          exit 1
        fi
        SHIFT_SECONDS="$2"
        shift 2
        ;;
      --no-truncate)
        NO_TRUNCATE=true
        shift
        ;;
      --cron)
        CRON_MODE=true
        shift
        ;;
      -h|--help)
        # Prints the header comment of this file (from line 2 up to the next
        # "# =====" rule) with the leading "# " stripped.
        sed -n '2,/^# =====/{ /^# =====/d; s/^# \?//p; }' "$0"
        exit 0
        ;;
      *)
        echo "Option inconnue : $1" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Fully qualified name of the temporary table that receives the raw export.
STAGING_TABLE="${DEV_DB_LOGS}._staging_prod_import"
# Tag prepended to every message emitted by this script.
LOG_PREFIX="[import-prod]"

# log MESSAGE...
#   Writes a timestamped (HH:MM:SS) progress line to stdout when CRON_MODE is
#   "false"; prints nothing otherwise. Always returns 0.
log() {
  if [ "${CRON_MODE}" = false ]; then
    printf '%s\n' "${LOG_PREFIX} $(date '+%H:%M:%S') $*" || true
  fi
  return 0
}

# err MESSAGE...
#   Writes an error line to stderr and terminates the script with status 1.
err() {
  printf '%s\n' "${LOG_PREFIX} ERREUR: $*" >&2
  exit 1
}
# ── ClickHouse query helpers ─────────────────────────────────────────────────
# ch SQL
#   Runs one SQL statement with clickhouse-client inside DEV_CONTAINER through
#   `docker exec -i`, so the caller's stdin is forwarded to the client.
#   Returns the exit status of the docker command.
ch() {
  local sql="$1"
  local -a client=(docker exec -i "${DEV_CONTAINER}" clickhouse-client)
  "${client[@]}" --query "${sql}"
}

# ch_insert_native
#   Feeds stdin (a Native-format stream) into STAGING_TABLE.
ch_insert_native() {
  ch "INSERT INTO ${STAGING_TABLE} FORMAT Native"
}
# ── Pre-flight checks ────────────────────────────────────────────────────────
# 1. The container must answer a trivial query.
log "Vérification du conteneur ${DEV_CONTAINER}"
if ! docker exec "${DEV_CONTAINER}" clickhouse-client --query "SELECT 1" &> /dev/null; then
  err "Conteneur ${DEV_CONTAINER} inaccessible"
fi

# 2. At least one *.native export must exist. This script does not enable
#    nullglob, so an unmatched pattern stays literal and its first (only)
#    element is then not a regular file.
log "Vérification des fichiers d'export dans ${EXPORT_DIR}"
NATIVE_FILES=("${EXPORT_DIR}"/*.native)
[[ -f "${NATIVE_FILES[0]}" ]] \
  || err "Aucun fichier *.native trouvé dans ${EXPORT_DIR}. Lancez d'abord reload-prod-logs.sh."

FILE_COUNT="${#NATIVE_FILES[@]}"
log " ${FILE_COUNT} fichier(s) Native trouvé(s)"
# ── Staging table creation ───────────────────────────────────────────────────
# drop_staging_on_exit
#   EXIT handler: best-effort removal of the staging table, so that an abort
#   (for instance a failing statement under `set -e`) does not leave a copy of
#   the imported rows behind. Errors are deliberately ignored here; the
#   script's own exit status is what the caller should see.
drop_staging_on_exit() {
  ch "DROP TABLE IF EXISTS ${STAGING_TABLE}" > /dev/null 2>&1 || true
}
trap drop_staging_on_exit EXIT

log "Création de la table de staging…"
# Start from a clean slate in case a previous run left the table behind.
ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
# Same columns as http_logs, but with its own MergeTree engine definition.
ch "CREATE TABLE ${STAGING_TABLE} AS ${DEV_DB_LOGS}.http_logs ENGINE = MergeTree() ORDER BY (time, src_ip) SETTINGS index_granularity = 8192"
# ── Native file import ───────────────────────────────────────────────────────
# Streams every non-empty export file into the staging table. The number of
# rows contributed by a file is derived from how much the staging row count
# grew after loading it.
log "Import des ${FILE_COUNT} fichiers dans le staging…"
TOTAL_IMPORTED=0
for f in "${NATIVE_FILES[@]}"; do
  BASENAME="${f##*/}"
  # Try the GNU stat syntax, then the BSD one; when both fail the size is
  # reported as 0 and the file is skipped like an empty one.
  FILE_SIZE="$(stat -c%s "${f}" 2>/dev/null || stat -f%z "${f}" 2>/dev/null || echo 0)"
  if (( FILE_SIZE == 0 )); then
    log " ${BASENAME} — vide, ignoré"
    continue
  fi

  SIZE_MB=$(( FILE_SIZE / 1048576 ))
  log " ${BASENAME} (${SIZE_MB} Mo)…"
  if ! ch_insert_native < "${f}"; then
    err "Import échoué pour ${BASENAME}"
  fi

  CURRENT="$(ch "SELECT count() FROM ${STAGING_TABLE}")"
  BATCH_ROWS=$(( CURRENT - TOTAL_IMPORTED ))
  TOTAL_IMPORTED="${CURRENT}"
  log "${BATCH_ROWS} lignes (total staging: ${TOTAL_IMPORTED})"
done
log "Staging terminé : ${TOTAL_IMPORTED} lignes."

# Nothing loaded: remove the empty staging table and abort.
if (( TOTAL_IMPORTED == 0 )); then
  ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"
  err "Aucune donnée importée dans le staging"
fi
# ── Date shift computation ───────────────────────────────────────────────────
# Without an explicit --shift value, choose the offset that moves the newest
# staged row onto the current wall-clock time.
if [[ -z "${SHIFT_SECONDS}" ]]; then
  STAGING_MAX_TS="$(ch "SELECT toUnixTimestamp(max(time)) FROM ${STAGING_TABLE}")"
  NOW_TS="$(date +%s)"
  SHIFT_SECONDS=$(( NOW_TS - STAGING_MAX_TS ))
  log "Décalage auto : ${SHIFT_SECONDS}s (max(staging) → now)"
fi

# Human-readable breakdown of the offset, used only in the log line.
SHIFT_HOURS=$(( SHIFT_SECONDS / 3600 ))
SHIFT_MINUTES=$(( (SHIFT_SECONDS % 3600) / 60 ))
log "Décalage appliqué : ${SHIFT_SECONDS} secondes (${SHIFT_HOURS}h ${SHIFT_MINUTES}min)"
# ── Target table cleanup ─────────────────────────────────────────────────────
# The target is emptied only while NO_TRUNCATE still holds its default
# "false"; --no-truncate keeps the existing rows.
case "${NO_TRUNCATE}" in
  false)
    log "Nettoyage de ${DEV_DB_LOGS}.http_logs…"
    ch "TRUNCATE TABLE ${DEV_DB_LOGS}.http_logs"
    ;;
esac
# ── Shifted insert ───────────────────────────────────────────────────────────
# Copies every staging row into http_logs with `time` moved by SHIFT_SECONDS
# and `log_date` recomputed from the moved timestamp. All other columns are
# copied unchanged.
#
# The shifted timestamp carries its own alias (shifted_time) rather than
# reusing the name `time`. INSERT ... SELECT maps columns by position, so the
# alias name does not affect which target column receives the value.
# NOTE(review): with ClickHouse's default alias resolution
# (prefer_column_name_to_alias = 0), a bare `time` in a sibling expression can
# resolve to a `... AS time` alias instead of the source column, which would
# apply the shift to log_date twice. Confirm on the deployed server version;
# the distinct alias is correct in either case.
log "Insertion avec décalage dans ${DEV_DB_LOGS}.http_logs…"
ch "
INSERT INTO ${DEV_DB_LOGS}.http_logs
SELECT
    time + toIntervalSecond(${SHIFT_SECONDS}) AS shifted_time,
    toDate(shifted_time) AS log_date,
    src_ip, src_port, dst_ip, dst_port,
    src_asn, src_country_code, src_as_name, src_org, src_domain,
    method, scheme, host, path, query, http_version,
    orphan_side, correlated, keepalives,
    a_timestamp, b_timestamp, conn_id,
    ip_meta_df, ip_meta_id, ip_meta_total_length, ip_meta_ttl,
    tcp_meta_options, tcp_meta_window_size, tcp_meta_mss, tcp_meta_window_scale,
    syn_to_clienthello_ms,
    tls_version, tls_sni, tls_alpn, ja3, ja3_hash, ja4,
    client_headers,
    header_user_agent, header_accept, header_accept_encoding, header_accept_language,
    header_content_type, header_x_request_id, header_x_trace_id, header_x_forwarded_for,
    header_sec_ch_ua, header_sec_ch_ua_mobile, header_sec_ch_ua_platform,
    header_sec_fetch_dest, header_sec_fetch_mode, header_sec_fetch_site,
    anubis_bot_name, anubis_bot_action, anubis_bot_category
FROM ${STAGING_TABLE}
"

# Figures for the final summary, read back from the target table.
FINAL_COUNT=$(ch "SELECT count() FROM ${DEV_DB_LOGS}.http_logs")
FINAL_MIN=$(ch "SELECT min(time) FROM ${DEV_DB_LOGS}.http_logs")
FINAL_MAX=$(ch "SELECT max(time) FROM ${DEV_DB_LOGS}.http_logs")
# ── Cleanup ──────────────────────────────────────────────────────────────────
# The staging copy is no longer needed once the shifted rows are in place.
log "Nettoyage de la table de staging…"
ch "DROP TABLE IF EXISTS ${STAGING_TABLE}"

# ── Summary ──────────────────────────────────────────────────────────────────
# Row count, resulting time range and applied offset. The range line puts an
# arrow between the two bounds; without a separator the two timestamps would
# be printed as one unreadable string.
log "════════════════════════════════════════════════════"
log " Import terminé : ${FINAL_COUNT} lignes"
log " Plage : ${FINAL_MIN} → ${FINAL_MAX}"
log " Décalage : ${SHIFT_SECONDS}s"
log "════════════════════════════════════════════════════"