Files
ja4-platform/scripts/update-csv-data.sh
toto b6184e6529 feat: CSV generation scripts, API filter params, enriched CSV stubs
- scripts/generate_bot_ip.py: download Tor exit nodes + curate scanner IPs (1353 entries)
- scripts/generate_bot_ja4.py: 31 bot JA4 fingerprints across 16 families
- scripts/generate_asn_data.py: 38 ASNs + 96 IP-to-ASN prefixes
- scripts/update-csv-data.sh: master orchestrator with --install-stubs
- api.py: add asn_org/country_code/ja4/bot_name filters on detections+scores
- pages.py: add /network route
- csv-stubs: enriched with generated data (Tor nodes, scanner IPs, etc.)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 15:05:43 +02:00

68 lines
2.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# update-csv-data.sh — Master orchestrator: downloads and generates every CSV
# reference data file used by the JA4 platform.
#
# Generated files (written to the data/ directory located next to this script):
#   data/bot_ip.csv              — Known bot/scanner IPs + Tor exit nodes
#   data/bot_ja4.csv             — Known bot JA4 TLS fingerprints
#   data/asn_reputation.csv      — ASN→label mapping (human/datacenter/hosting)
#   data/iplocate-ip-to-asn.csv  — CIDR→ASN for dictionary lookup
#
# Usage:
#   ./scripts/update-csv-data.sh                  # generate all
#   ./scripts/update-csv-data.sh --install-stubs  # also copy to test csv-stubs
#
# Requirements: curl, python3 (stdlib only)

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Absolute path of the directory holding this script; every other path is
# derived from it so the script can be launched from any working directory.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
# Destination for the generated CSV files.
DATA_DIR="$SCRIPT_DIR/data"
# Test fixture directory that --install-stubs copies the CSV files into.
STUBS_DIR="$SCRIPT_DIR/../tests/integration/platform/csv-stubs"

# Ensure the output directory exists before anything is written to it.
mkdir -p "${DATA_DIR}"
# Step 1: fetch the current Tor exit node list into DATA_DIR.
# The download is best-effort: when it fails, TOR_TMP is reset to the empty
# string so that later steps can tell no downloaded Tor file is available.
echo "=== [1/4] Downloading Tor exit node list ==="
TOR_URL="https://check.torproject.org/torbulkexitlist"
TOR_TMP="${DATA_DIR}/tor_exit_nodes.txt"
# Remove the scratch download on every exit path (including an abort caused
# by `set -e` in a later step), so a partial or stale file never lingers in
# DATA_DIR. The path is spelled out because TOR_TMP may be cleared below.
trap 'rm -f -- "${DATA_DIR}/tor_exit_nodes.txt"' EXIT
# curl's own stderr is discarded on purpose; the WARNING below reports failure.
if curl -fsSL --connect-timeout 10 --max-time 30 "$TOR_URL" -o "$TOR_TMP" 2>/dev/null; then
  # grep -c always prints a count (even "0") but exits 1 when nothing matched.
  # `|| true` only neutralises that exit status; using `|| echo 0` here would
  # append a second "0" to the captured output and garble the message.
  TOR_COUNT=$(grep -cE '^[0-9]' "$TOR_TMP" || true)
  # grep prints nothing if it could not read the file at all; report 0 then.
  TOR_COUNT=${TOR_COUNT:-0}
  echo " Downloaded ${TOR_COUNT} Tor exit node IPs"
else
  echo " WARNING: Could not download Tor exit list (offline?), using fallback"
  TOR_TMP=""
fi
# Prints the number of lines in the file named by $1, formatted exactly as
# `wc -l` reports it when reading from stdin (no file name in the output).
count_lines() {
  wc -l < "$1"
}

# Step 2: bot/scanner IP list. --tor-file is passed only when TOR_TMP is
# non-empty, i.e. when a downloaded Tor exit list is available.
echo "=== [2/4] Generating bot_ip.csv ==="
bot_ip_csv="${DATA_DIR}/bot_ip.csv"
python3 "${SCRIPT_DIR}/generate_bot_ip.py" --output "$bot_ip_csv" ${TOR_TMP:+--tor-file "$TOR_TMP"}
echo " $(count_lines "$bot_ip_csv") entries"

# Step 3: bot JA4 fingerprint list.
echo "=== [3/4] Generating bot_ja4.csv ==="
bot_ja4_csv="${DATA_DIR}/bot_ja4.csv"
python3 "${SCRIPT_DIR}/generate_bot_ja4.py" --output "$bot_ja4_csv"
echo " $(count_lines "$bot_ja4_csv") entries"

# Step 4: one generator run produces both the ASN reputation file and the
# IP-to-ASN file.
echo "=== [4/4] Generating ASN + IP-to-ASN CSVs ==="
asn_csv="${DATA_DIR}/asn_reputation.csv"
ipasn_csv="${DATA_DIR}/iplocate-ip-to-asn.csv"
python3 "${SCRIPT_DIR}/generate_asn_data.py" --output-asn "$asn_csv" --output-ipasn "$ipasn_csv"
echo " ASN reputation: $(count_lines "$asn_csv") entries"
echo " IP-to-ASN: $(count_lines "$ipasn_csv") entries"
# Optional step: when the first argument is exactly --install-stubs, copy each
# generated CSV from DATA_DIR into the test csv-stubs directory. Files are
# copied one at a time, in a fixed order, so the first failing copy stops the
# run under `set -e`.
case "${1:-}" in
  --install-stubs)
    echo ""
    echo "=== Installing to test csv-stubs ==="
    for csv_name in bot_ip.csv bot_ja4.csv asn_reputation.csv iplocate-ip-to-asn.csv; do
      cp -v "${DATA_DIR}/${csv_name}" "$STUBS_DIR/"
    done
    echo "Done."
    ;;
esac
# Cleanup: remove the scratch Tor download. The literal path is used (not
# TOR_TMP) because TOR_TMP is emptied when the download fails, while curl may
# still have left a partial file behind.
rm -f -- "${DATA_DIR}/tor_exit_nodes.txt"
echo ""
echo "All CSV data generated in ${DATA_DIR}/"
# Only advertise --install-stubs when this run did not already use it;
# printing the hint right after installing the stubs would be misleading.
if [[ "${1:-}" != "--install-stubs" ]]; then
  echo "Run with --install-stubs to copy to test fixtures."
fi