diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml
index 8f29bb1..971e3c5 100644
--- a/tests/integration/docker-compose.yml
+++ b/tests/integration/docker-compose.yml
@@ -157,6 +157,8 @@ services:
depends_on:
platform:
condition: service_healthy
+ volumes:
+ - ../../scripts/data:/app/data:ro
networks:
- ja4net
diff --git a/tests/integration/run-tests.sh b/tests/integration/run-tests.sh
index 65116b8..6d23741 100755
--- a/tests/integration/run-tests.sh
+++ b/tests/integration/run-tests.sh
@@ -162,15 +162,15 @@ log "============================================"
# The seeder inserts directly into http_logs_raw, triggering all MVs:
# http_logs_raw → mv_http_logs → http_logs → mv_agg_host_ip_ja4_1h → agg_host_ip_ja4_1h
# This pre-populates:
-# - 720 human sessions (IPs in residential ASN ranges → asn_label='human')
-# - 150 scanner/anomaly sessions (IPs in datacenter ASN → ML anomaly candidates)
-# - 100 known-bot sessions (IPs/JA4 matching bot_ip.csv / bot_ja4.csv)
-# - 20 brute-force clusters (many POST /login per IP)
+# - ~350,000 rows from 14,000 browser IPs (ISP ASN ranges → asn_label='isp')
+# - ~100,000 rows from 3,000 scanner IPs (datacenter ASN → ML anomaly candidates)
+# - ~30,000 rows from 2,000 legit bot IPs (from bot_ip.csv CIDRs)
+# - ~20,000 rows from 1,000 AI bot IPs (datacenter ranges)
# After seeding, bot_detector has ≥500 human rows → can train and run.
log "Running seed_clickhouse.py..."
if docker compose exec -T traffic-gen python /app/seed_clickhouse.py \
--host clickhouse --port 8123 --user default --password ""; then
- pass "ClickHouse seeded (700+ human + 150 scanner + 100 known-bot rows)"
+ pass "ClickHouse seeded (500K rows: 350K browser + 100K scanner + 30K legit-bot + 20K AI-bot)"
else
warn "Seeder reported errors (pipeline verification will show impact)"
fi
diff --git a/tests/integration/traffic-gen/Dockerfile b/tests/integration/traffic-gen/Dockerfile
index f5adc31..50e568f 100644
--- a/tests/integration/traffic-gen/Dockerfile
+++ b/tests/integration/traffic-gen/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.12-alpine
# No extra deps needed — stdlib only (urllib, ssl, concurrent.futures)
WORKDIR /app
-COPY generate_traffic.py .
+COPY *.py .
# Keep container alive; traffic is triggered via docker compose exec
ENTRYPOINT ["sleep", "infinity"]
diff --git a/tests/integration/traffic-gen/seed_clickhouse.py b/tests/integration/traffic-gen/seed_clickhouse.py
index a4d9590..d1b46ff 100644
--- a/tests/integration/traffic-gen/seed_clickhouse.py
+++ b/tests/integration/traffic-gen/seed_clickhouse.py
@@ -3,26 +3,41 @@
seed_clickhouse.py — Bootstrap ClickHouse with realistic synthetic traffic data.
Inserts directly into ja4_logs.http_logs_raw (triggers all MVs automatically):
- • 700 human sessions — IPs in residential ISP ranges (ASN→'human' via dict)
- • 150 datacenter/scanner sessions — anomalous patterns for ML detection
- • 100 known-bot sessions — IPs/JA4 in bot_ip.csv / bot_ja4.csv
+ • ~350,000 rows from 14,000 legitimate browser IPs (ISP ranges, asn_label='isp')
+ • ~100,000 rows from 3,000 malicious bot/scanner IPs (datacenter ranges)
+ • ~30,000 rows from 2,000 legitimate bot IPs (from bot_ip.csv CIDRs)
+ • ~20,000 rows from 1,000 AI bot IPs (datacenter ranges)
+
+Total: 500,000 rows from 20,000 unique IPs (configurable via --rows / --ips).
+
+Browser JA4 fingerprints are loaded from browser_ja4.csv so they match
+dict_browser_ja4 (LEGITIMATE_BROWSER classification). Bot/scanner JA4s are
+synthetic hashes guaranteed NOT to appear in that dictionary.
+
+Bot IPs are drawn from real CIDRs in bot_ip.csv (Googlebot, Bingbot, etc.).
+ISP and datacenter IPs use hard-coded /24 prefixes from well-known ASNs that
+resolve correctly through iplocate-ip-to-asn.csv → asn_reputation.csv.
This ensures view_ai_features_1h has ≥ 500 human rows for the bot_detector
training threshold (run_semi_supervised_logic requires len(human_baseline) >= 500).
All timestamps are within the last 30 minutes so the 24h window filter catches them.
-No external dependencies — uses Python stdlib urllib only.
+No external dependencies — uses Python stdlib only.
Usage:
python seed_clickhouse.py
python seed_clickhouse.py --host clickhouse --port 8123 --user default --password ""
+ python seed_clickhouse.py --rows 500000 --ips 20000 --seed 42
python seed_clickhouse.py --dry-run
"""
import argparse
+import csv
import hashlib
+import ipaddress
import json
+import os
import random
import time
import urllib.error
@@ -31,104 +46,194 @@ import urllib.request
from datetime import datetime, timedelta, timezone
# ---------------------------------------------------------------------------
-# JA4 fingerprint profiles (must match bot_ja4.csv for bot detection to work)
+# Hard-coded /24 prefixes — guaranteed to resolve via ASN dictionaries
# ---------------------------------------------------------------------------
-# Human browser profiles — realistic TLS 1.3 fingerprints
-HUMAN_JA4S = [
- "t13d1917h2_b0372614b25a_6a77dcf5a8be", # Chrome 120 Windows TLS1.3
- "t13d1817h2_b0372614b25a_0a3e5785d15f", # Firefox 121 TLS1.3
- "t13d1617h2_fc82e8b7e1c0_9dc949149365", # Safari 17 macOS TLS1.3
- "t13d1917h2_fc82e8b7e1c0_6b9b1b2c3d4e", # Edge 120 TLS1.3
- "t13d1817h2_9dc949149365_8c4a9a4b0d01", # Chrome Mobile TLS1.3
- "t12d1706h2_9dc949149365_fc82e8b7e1c0", # Chrome 120 TLS1.2 (older server)
- "t12d1606h2_8c4a9a4b0d01_9dc949149365", # Firefox TLS1.2
-]
+# ISP ranges (asn_label='isp' in asn_reputation.csv)
+ISP_PREFIXES = (
+ # Comcast AS7922 — within 24.0.0.0/12
+ [f"24.{o2}.{o3}" for o2 in range(1, 11) for o3 in range(0, 3)]
+ # Orange AS3215 — within 2.3.0.0/16
+ + [f"2.3.{o3}" for o3 in range(0, 10)]
+ # Deutsche Telekom AS3320 — within 2.160.0.0/12
+ + [f"2.{160 + o2}.{o3}" for o2 in range(0, 5) for o3 in range(0, 2)]
+ # AT&T AS7018 — within 12.0.0.0/10
+ + [f"12.0.{o3}" for o3 in range(4, 14)]
+ # Verizon AS701 — within 63.0.0.0/12
+ + [f"63.{o2}.0" for o2 in range(0, 10)]
+ # BT AS2856 — within 5.80.0.0/15
+ + [f"5.80.{o3}" for o3 in range(0, 8)]
+)  # 78 prefixes × 254 ≈ 19.8K IPs
-# Bot/scanner profiles — intentionally minimal cipher suites, match bot_ja4.csv
-BOT_JA4S = [
- "t13d030500_ffd59bab1b39_6e7f7df63e98", # curl scanner (in bot_ja4.csv)
- "t13d020300_6b9b1b2c3d4e_ffd59bab1b39", # python-requests scanner (in bot_ja4.csv)
- "t10d170000_0a1b2c3d4e5f_1b2c3d4e5f60", # Masscan (in bot_ja4.csv)
- "t12d050700_5a6b7c8d9e0f_1a2b3c4d5e6f", # zgrab (in bot_ja4.csv)
- "t13d010100_aabbccddeeff_0011223344aa", # Headless Chrome automation (in bot_ja4.csv)
+# Datacenter ranges for scanners (asn_label='datacenter')
+DC_SCANNER_PREFIXES = (
+ # DigitalOcean AS14061 — within 5.101.96.0/20
+ [f"5.101.{96 + o3}" for o3 in range(0, 6)]
+ # Hetzner AS24940 — within 5.9.0.0/16
+ + [f"5.9.{o3}" for o3 in range(0, 6)]
+ # OVH AS16276 — within 5.39.0.0/17
+ + [f"5.39.{o3}" for o3 in range(0, 5)]
+) # ~17 prefixes × 254 ≈ 4.3K IPs
+
+# Datacenter ranges for AI bots (separate from scanner ranges)
+DC_AI_PREFIXES = (
+ # DigitalOcean (different /24s)
+ [f"5.101.{102 + o3}" for o3 in range(0, 4)]
+ # Hetzner (different /24s)
+ + [f"5.9.{6 + o3}" for o3 in range(0, 4)]
+) # ~8 prefixes × 254 ≈ 2K IPs
+
+# Fallback /24s for legitimate bot overflow
+DC_LEGIT_BOT_PREFIXES = [f"5.9.{20 + o3}" for o3 in range(0, 8)]
+
+# ---------------------------------------------------------------------------
+# Browser family → User-Agent mapping
+# ---------------------------------------------------------------------------
+
+BROWSER_UAS = {
+ "Chromium": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36",
+ ],
+ "Firefox": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
+ "Gecko/20100101 Firefox/121.0",
+ "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) "
+ "Gecko/20100101 Firefox/120.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:121.0) "
+ "Gecko/20100101 Firefox/121.0",
+ ],
+ "Safari": [
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 "
+ "(KHTML, like Gecko) Version/17.2 Safari/605.1.15",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 "
+ "(KHTML, like Gecko) Version/17.1 Safari/605.1.15",
+ ],
+ "Edge": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
+ ],
+ "Opera": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 OPR/105.0.0.0",
+ ],
+ "Vivaldi": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 "
+ "Vivaldi/6.4.3160.47",
+ ],
+ "Chrome_iOS": [
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) "
+ "AppleWebKit/605.1.15 (KHTML, like Gecko) "
+ "CriOS/120.0.6099.119 Mobile/15E148 Safari/604.1",
+ ],
+ "Chromium_Legacy": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
+ ],
+ "Firefox_Legacy": [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) "
+ "Gecko/20100101 Firefox/78.0",
+ ],
+ "Safari_Legacy": [
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+ "AppleWebKit/605.1.15 (KHTML, like Gecko) "
+ "Version/15.6 Safari/605.1.15",
+ ],
+ "Tor_Browser": [
+ "Mozilla/5.0 (Windows NT 10.0; rv:102.0) "
+ "Gecko/20100101 Firefox/102.0",
+ ],
+}
+
+BROWSER_TCP = {
+ "Chromium": ["windows", "linux", "android"],
+ "Firefox": ["windows", "linux"],
+ "Safari": ["macos"],
+ "Edge": ["windows"],
+ "Opera": ["windows", "linux"],
+ "Vivaldi": ["windows", "linux"],
+ "Chrome_iOS": ["macos"],
+ "Chromium_Legacy": ["windows"],
+ "Firefox_Legacy": ["windows", "linux"],
+ "Safari_Legacy": ["macos"],
+ "Tor_Browser": ["linux"],
+}
+
+CHROMIUM_FAMILIES = {
+ "Chromium", "Edge", "Opera", "Vivaldi",
+ "Chrome_iOS", "Chromium_Legacy",
+}
+
+BROWSER_WEIGHTS = {
+ "Chromium": 55, "Firefox": 15, "Safari": 12, "Edge": 8,
+ "Opera": 3, "Vivaldi": 1, "Chrome_iOS": 3,
+ "Chromium_Legacy": 1, "Firefox_Legacy": 1,
+ "Safari_Legacy": 0.5, "Tor_Browser": 0.5,
+}
+
+SEC_CH_UA = {
+ "Chromium":
+ '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
+ "Edge":
+ '"Chromium";v="120", "Microsoft Edge";v="120", "Not-A.Brand";v="99"',
+ "Opera":
+ '"Chromium";v="119", "Opera";v="105", "Not-A.Brand";v="99"',
+ "Vivaldi":
+ '"Chromium";v="118", "Vivaldi";v="6.4", "Not-A.Brand";v="99"',
+ "Chrome_iOS":
+ '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
+ "Chromium_Legacy":
+ '"Chromium";v="90", "Google Chrome";v="90", "Not-A.Brand";v="99"',
+}
+
+# ---------------------------------------------------------------------------
+# Hosts and Accept-Language pools
+# ---------------------------------------------------------------------------
+
+HOSTS = ["platform", "api.platform", "www.example.com", "shop.example.com"]
+
+ACCEPT_LANGUAGES = [
+ "en-US,en;q=0.9",
+ "en-GB,en;q=0.9",
+ "fr-FR,fr;q=0.9,en;q=0.8",
+ "de-DE,de;q=0.9,en;q=0.8",
+ "es-ES,es;q=0.9,en;q=0.8",
+ "ja-JP,ja;q=0.9,en;q=0.8",
+ "pt-BR,pt;q=0.9,en;q=0.8",
+ "zh-CN,zh;q=0.9,en;q=0.8",
]
# ---------------------------------------------------------------------------
-# IP pools — must match ranges in iplocate-ip-to-asn.csv
+# Path pools
# ---------------------------------------------------------------------------
-# Human residential IPs — OVH FR (ASN 16276) → asn_label='human'
-def _human_ips(n: int) -> list:
- ips = [f"91.121.{o3}.{o4}" for o3 in range(0, 20) for o4 in range(1, 60)]
- random.shuffle(ips)
- return ips[:n]
-
-# Datacenter / scanner IPs — Tor/Contabo/Reg.ru → asn_label='datacenter'/'hosting'
-def _scanner_ips(n: int) -> list:
- ips = (
- [f"185.220.101.{i}" for i in range(1, 101)] # ASN 210644 datacenter
- + [f"45.155.205.{i}" for i in range(1, 51)] # ASN 209083 datacenter
- + [f"193.32.162.{i}" for i in range(1, 31)] # ASN 197695 hosting
- )
- random.shuffle(ips)
- return ips[:n]
-
-# Known bot IPs (subset also in bot_ip.csv → directly labeled)
-BOT_IP_KNOWN = [
- "185.220.101.34", "185.220.101.47", "185.220.101.52",
- "185.220.101.73", "185.220.101.91",
- "45.155.205.233", "45.155.205.220", "45.155.205.205",
- "193.32.162.10", "193.32.162.11",
-]
-
-# ---------------------------------------------------------------------------
-# User-Agent pools per profile
-# ---------------------------------------------------------------------------
-HUMAN_UA = [
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
- "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
- "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36",
- "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1",
-]
-
-SCANNER_UA = [
- "curl/7.88.1",
- "python-requests/2.31.0",
- "Masscan/1.3",
- "zgrab/0.x",
- "Go-http-client/1.1",
- "libwww-perl/6.72",
- "Java/11.0.18",
- "Wget/1.21.3",
- "masscan/1.3 (https://github.com/robertdavidgraham/masscan)",
- "-", # No User-Agent (raw scanner)
-]
-
-BOT_CRAWLER_UA = [
- "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
- "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
- "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
- "Twitterbot/1.0",
- "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
- "Googlebot/2.1 (+http://www.google.com/bot.html)",
-]
-
-# ---------------------------------------------------------------------------
-# Path pools per profile
-# ---------------------------------------------------------------------------
-HUMAN_PATHS = [
+PAGE_PATHS = [
"/", "/index.html", "/about", "/contact", "/products", "/services",
- "/blog", "/blog/post-1", "/blog/post-2", "/faq", "/pricing",
- "/login", "/register", "/profile", "/dashboard",
+ "/blog", "/blog/post-1", "/blog/post-2", "/blog/post-3", "/faq",
+ "/pricing", "/login", "/register", "/profile", "/dashboard",
+ "/docs", "/docs/getting-started", "/docs/api-reference",
+ "/help", "/terms", "/privacy", "/search",
+]
+
+ASSET_PATHS = [
+ "/static/js/app.js", "/static/js/vendor.js", "/static/js/analytics.js",
+ "/static/css/main.css", "/static/css/theme.css",
+ "/images/logo.png", "/images/hero.webp", "/images/banner.jpg",
+ "/favicon.ico", "/fonts/inter-400.woff2", "/fonts/inter-700.woff2",
+]
+
+API_PATHS = [
"/api/v1/users", "/api/v1/status", "/api/v2/metrics",
- "/static/js/app.js", "/static/css/main.css", "/images/logo.png",
- "/favicon.ico", "/robots.txt", "/sitemap.xml",
- "/health", "/search?q=test", "/search?q=product+review",
+ "/api/v1/products", "/api/v1/search", "/api/v2/config",
]
ATTACK_PATHS = [
@@ -137,109 +242,393 @@ ATTACK_PATHS = [
"/phpmyadmin/", "/phpMyAdmin/", "/pma/",
"/admin", "/admin/login", "/administrator/",
"/cgi-bin/test.cgi", "/cgi-bin/../etc/passwd",
- "/download?file=../../../etc/passwd", "/download?file=../../../../etc/shadow",
+ "/download?file=../../../etc/passwd",
+ "/download?file=../../../../etc/shadow",
"/api/search?q=",
"/api/users?id=1+OR+1%3D1",
"/shell.php", "/cmd.php", "/eval.php",
"/.aws/credentials", "/.ssh/id_rsa",
"/etc/passwd", "/proc/self/environ",
+ "/actuator", "/actuator/env", "/actuator/health",
+ "/server-status", "/.svn/entries",
+ "/wp-content/uploads/", "/backup.zip", "/db.sql",
+ "/api/v1/../admin", "/api/debug",
+ "/.htaccess", "/.htpasswd",
+ "/console", "/debug/pprof/",
]
+SCRAPER_PATHS = (
+ [f"/products/page/{i}" for i in range(1, 51)]
+ + [
+ f"/category/{c}/page/{i}"
+ for c in ["electronics", "clothing", "books", "home", "sports"]
+ for i in range(1, 11)
+ ]
+)
+
BOT_PATHS = [
"/robots.txt", "/sitemap.xml", "/", "/index.html",
"/sitemap_index.xml", "/news-sitemap.xml",
"/feed", "/rss.xml", "/atom.xml",
]
+CONTENT_PATHS = PAGE_PATHS + [f"/blog/post-{i}" for i in range(1, 21)] + [
+ f"/products/{s}"
+ for s in ["widget-a", "widget-b", "gadget-x", "tool-pro", "kit-basic"]
+]
+
+# ---------------------------------------------------------------------------
+# Scanner / bot User-Agents
+# ---------------------------------------------------------------------------
+
+SCANNER_UAS = [
+ "curl/7.88.1",
+ "curl/8.1.2",
+ "python-requests/2.31.0",
+ "python-requests/2.28.1",
+ "python-urllib3/2.0.4",
+ "Masscan/1.3",
+ "masscan/1.3 (https://github.com/robertdavidgraham/masscan)",
+ "zgrab/0.x",
+ "Go-http-client/1.1",
+ "Go-http-client/2.0",
+ "libwww-perl/6.72",
+ "Java/11.0.18",
+ "Java/17.0.2",
+ "Wget/1.21.3",
+ "Scrapy/2.11.0",
+ "Apache-HttpClient/4.5.14",
+ "okhttp/4.12.0",
+ "Node-Fetch/1.0",
+ "axios/1.6.2",
+ "-",
+ "",
+]
+
+HEADLESS_UAS = [
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) HeadlessChrome/119.0.0.0 Safari/537.36",
+]
+
+LEGIT_BOT_UAS = {
+ "Googlebot": [
+ "Mozilla/5.0 (compatible; Googlebot/2.1; "
+ "+http://www.google.com/bot.html)",
+ "Googlebot/2.1 (+http://www.google.com/bot.html)",
+ "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/120.0.6099.71 Mobile Safari/537.36 "
+ "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+ ],
+ "Bingbot": [
+ "Mozilla/5.0 (compatible; bingbot/2.0; "
+ "+http://www.bing.com/bingbot.htm)",
+ "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
+ "compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) "
+ "Chrome/116.0.1938.76 Safari/537.36",
+ ],
+ "DuckDuckBot": [
+ "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)",
+ ],
+ "Applebot": [
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+ "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 "
+ "Safari/605.1.15 (Applebot/0.1; "
+ "+http://www.apple.com/go/applebot)",
+ ],
+ "YandexBot": [
+ "Mozilla/5.0 (compatible; YandexBot/3.0; "
+ "+http://yandex.com/bots)",
+ ],
+ "Twitterbot": [
+ "Twitterbot/1.0",
+ ],
+ "FacebookBot": [
+ "facebookexternalhit/1.1 "
+ "(+http://www.facebook.com/externalhit_uatext.php)",
+ ],
+}
+
+AI_BOT_UAS = {
+ "GPTBot": [
+ "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
+ "compatible; GPTBot/1.0; +https://openai.com/gptbot)",
+ ],
+ "PerplexityBot": [
+ "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
+ "compatible; PerplexityBot/1.0; "
+ "+https://docs.perplexity.ai/docs/perplexity-bot)",
+ ],
+ "ClaudeBot": [
+ "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
+ "compatible; ClaudeBot/1.0; "
+ "+https://www.anthropic.com/claude-bot)",
+ ],
+ "CCBot": [
+ "CCBot/2.0 (https://commoncrawl.org/faq/)",
+ ],
+ "Bytespider": [
+ "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Mobile Safari/537.36 "
+ "(compatible; Bytespider; spider-feedback@bytedance.com)",
+ ],
+}
+
# ---------------------------------------------------------------------------
# TCP / TLS metadata helpers
# ---------------------------------------------------------------------------
-# Realistic TCP options fingerprints per OS
TCP_OPTIONS = {
- "linux": "020405b40402080affffffff000000000103030a", # MSS+NOP+SACK+TS+WS=10
- "windows": "020405b40103030801010402", # MSS+NOP+WS+SACK
- "macos": "020405ac0103030601010402", # MSS+NOP+WS+SACK (macOS)
- "scanner": "0204ffff", # Scanner: only MSS, max value
- "minimal": "0204ffd7", # Minimal
+ "linux": "020405b40402080affffffff000000000103030a",
+ "windows": "020405b40103030801010402",
+ "macos": "020405ac0103030601010402",
+ "android": "020405b40402080affffffff000000000103030a",
+ "scanner": "0204ffff",
+ "minimal": "0204ffd7",
}
-def _tcp_meta(profile: str = "linux") -> dict:
- profiles = {
- "linux": {"window_size": 65535, "mss": 1460, "wscale": 10, "ttl": 64, "df": 1},
- "windows": {"window_size": 64240, "mss": 1460, "wscale": 8, "ttl": 128, "df": 1},
- "macos": {"window_size": 65535, "mss": 1460, "wscale": 6, "ttl": 64, "df": 1},
- "android": {"window_size": 65535, "mss": 1420, "wscale": 9, "ttl": 64, "df": 1},
- "scanner": {"window_size": 1024, "mss": 1460, "wscale": 0, "ttl": 48, "df": 0},
- "minimal": {"window_size": 512, "mss": 576, "wscale": 0, "ttl": 60, "df": 0},
- }
- meta = profiles.get(profile, profiles["linux"])
+TCP_PROFILES = {
+ "linux": {"window_size": 65535, "mss": 1460, "wscale": 10,
+ "ttl": 64, "df": 1},
+ "windows": {"window_size": 64240, "mss": 1460, "wscale": 8,
+ "ttl": 128, "df": 1},
+ "macos": {"window_size": 65535, "mss": 1460, "wscale": 6,
+ "ttl": 64, "df": 1},
+ "android": {"window_size": 65535, "mss": 1420, "wscale": 9,
+ "ttl": 64, "df": 1},
+ "scanner": {"window_size": 1024, "mss": 1460, "wscale": 0,
+ "ttl": 48, "df": 0},
+ "minimal": {"window_size": 512, "mss": 576, "wscale": 0,
+ "ttl": 60, "df": 0},
+}
+
+
+def _tcp_meta(profile, rng):
+ meta = TCP_PROFILES.get(profile, TCP_PROFILES["linux"])
return {
- "tcp_meta_window_size": meta["window_size"] + random.randint(-100, 100),
- "tcp_meta_mss": meta["mss"],
+ "tcp_meta_window_size": meta["window_size"] + rng.randint(-100, 100),
+ "tcp_meta_mss": meta["mss"],
"tcp_meta_window_scale": meta["wscale"],
- "tcp_meta_options": TCP_OPTIONS.get(profile, TCP_OPTIONS["linux"]),
- "ip_meta_ttl": meta["ttl"] - random.randint(0, 5),
- "ip_meta_df": meta["df"],
- "ip_meta_id": random.randint(1, 65535),
- "ip_meta_total_length": random.randint(1200, 1500),
+ "tcp_meta_options": TCP_OPTIONS.get(profile, TCP_OPTIONS["linux"]),
+ "ip_meta_ttl": meta["ttl"] - rng.randint(0, 5),
+ "ip_meta_df": meta["df"],
+ "ip_meta_id": rng.randint(1, 65535),
+ "ip_meta_total_length": rng.randint(1200, 1500),
}
-def _syn_ms(profile: str) -> int:
+def _syn_ms(profile, rng):
"""Realistic SYN→ClientHello latency in milliseconds."""
if profile == "scanner":
- return random.randint(0, 3) # Scanners: near-instant
- if profile in ("minimal",):
- return random.randint(1, 5)
- return random.randint(10, 120) # Humans: network RTT
+ return rng.randint(0, 3)
+ if profile == "minimal":
+ return rng.randint(1, 5)
+ return rng.randint(10, 120)
-def _ja3_for_ja4(ja4: str) -> tuple:
- """Generate a plausible JA3 string and its MD5 hash matching the JA4 profile."""
- # These are fake but consistent — just need to be non-empty strings
- if "tls13" in ja4 or ja4.startswith("t13"):
- raw = "771,4866-4867-4865-49196-49200-52393-52392,0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0"
+def _ja3_for_ja4(ja4):
+ """Generate a plausible JA3 string and its MD5 hash."""
+ if ja4.startswith("t13"):
+ raw = ("771,4866-4867-4865-49196-49200-52393-52392,"
+ "0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0")
elif ja4.startswith("t12"):
- raw = "771,49195-49199-49196-49200-52393-52392,0-23-65281-10-11-35-16-5-13,29-23-24,0"
+ raw = ("771,49195-49199-49196-49200-52393-52392,"
+ "0-23-65281-10-11-35-16-5-13,29-23-24,0")
elif ja4.startswith("t10"):
raw = "769,49161-49162-49171-49172,0-10-11,29-23-24,0"
else:
raw = "771,4866-4867-4865,0-23-65281,29-23-24,0"
- md5 = hashlib.md5(raw.encode()).hexdigest()
- return raw, md5
+ return raw, hashlib.md5(raw.encode()).hexdigest()
# ---------------------------------------------------------------------------
-# Row generators
+# CSV loading with fallback
# ---------------------------------------------------------------------------
-def _now_minus(seconds: int) -> str:
- """ISO-8601 UTC timestamp N seconds in the past."""
- t = datetime.now(timezone.utc) - timedelta(seconds=seconds)
+_DATA_SEARCH_PATHS = [
+ "/app/data",
+ os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ "..", "..", "..", "scripts", "data"),
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"),
+]
+
+
+def _find_data_dir(explicit=None):
+    """Locate the directory holding the reference CSV files.
+
+    *explicit* wins whenever it names an existing directory (its contents
+    are not inspected).  Otherwise each entry of ``_DATA_SEARCH_PATHS`` is
+    resolved with ``os.path.realpath`` and the first one that is a
+    directory containing ``browser_ja4.csv`` is returned.  Returns ``None``
+    when nothing qualifies.
+    """
+    if explicit and os.path.isdir(explicit):
+        return explicit
+    resolved = (os.path.realpath(entry) for entry in _DATA_SEARCH_PATHS)
+    return next(
+        (
+            folder for folder in resolved
+            if os.path.isdir(folder)
+            and os.path.isfile(os.path.join(folder, "browser_ja4.csv"))
+        ),
+        None,
+    )
+
+
+def load_browser_ja4s(data_dir):
+    """Read browser_ja4.csv into ``{family: [ja4, …]}``.
+
+    Only fingerprints whose text starts with ``t13`` or ``t12`` are kept;
+    rows with fewer than two columns are ignored.  Both columns are
+    whitespace-stripped.  A falsy *data_dir* or a missing file yields an
+    empty dict.
+    """
+    by_family = {}
+    if not data_dir:
+        return by_family
+    csv_path = os.path.join(data_dir, "browser_ja4.csv")
+    if not os.path.isfile(csv_path):
+        return by_family
+    with open(csv_path, newline="", encoding="utf-8") as handle:
+        for record in csv.reader(handle):
+            if len(record) < 2:
+                continue
+            fingerprint = record[0].strip()
+            if fingerprint.startswith(("t13", "t12")):
+                by_family.setdefault(record[1].strip(), []).append(fingerprint)
+    return by_family
+
+
+def load_bot_ips(data_dir):
+    """Read bot_ip.csv into ``{family: [cidr_str, …]}``.
+
+    Column 0 is taken as the CIDR and column 1 as the family; both are
+    whitespace-stripped and rows with fewer than two columns are ignored.
+    No validation of the CIDR text happens here.  A falsy *data_dir* or a
+    missing file yields an empty dict.
+    """
+    by_family = {}
+    if not data_dir:
+        return by_family
+    csv_path = os.path.join(data_dir, "bot_ip.csv")
+    if not os.path.isfile(csv_path):
+        return by_family
+    with open(csv_path, newline="", encoding="utf-8") as handle:
+        for record in csv.reader(handle):
+            if len(record) < 2:
+                continue
+            cidr, family = record[0].strip(), record[1].strip()
+            by_family.setdefault(family, []).append(cidr)
+    return by_family
+
+
+# ---------------------------------------------------------------------------
+# IP generation helpers
+# ---------------------------------------------------------------------------
+
+def _gen_ips_from_prefixes(prefixes, n, rng):
+    """Generate up to *n* unique IPs from /24 prefixes.
+
+    Args:
+        prefixes: sequence of three-octet strings such as ``"24.1.0"``.
+        n: number of distinct addresses wanted.
+        rng: object providing ``choice`` and ``randint`` (e.g. a
+            ``random.Random`` instance).
+
+    Returns:
+        List of dotted-quad strings in the order they were first drawn.
+        It may hold fewer than *n* entries: sampling gives up after
+        ``n * 5`` draws so an undersized prefix pool cannot loop forever.
+    """
+    # A dict serves as an insertion-ordered set.  Iterating a plain set of
+    # strings follows per-process hash randomisation, so the returned order
+    # would differ between runs even when *rng* is seeded identically.
+    ips = {}
+    attempts = 0
+    while len(ips) < n and attempts < n * 5:
+        prefix = rng.choice(prefixes)
+        octet = rng.randint(1, 254)  # host part; .0 and .255 are never used
+        ips[f"{prefix}.{octet}"] = None
+        attempts += 1
+    return list(ips)
+
+
+def _ips_from_cidrs(cidrs, n, rng):
+    """Generate up to *n* unique IPs from a list of CIDR strings.
+
+    Args:
+        cidrs: iterable of CIDR strings; entries that ``ipaddress`` rejects
+            are skipped.
+        n: number of distinct addresses wanted.
+        rng: object providing ``choices`` and ``randint`` (e.g. a
+            ``random.Random`` instance).
+
+    Returns:
+        List of address strings, at most *n* long.  Single-host entries
+        come first in input order, followed by sampled addresses in draw
+        order.  Sampling stops after ``n * 10`` draws, so the list can be
+        shorter than *n*.
+    """
+    if n <= 0:
+        return []
+    networks = []
+    # Dict used as an insertion-ordered set: iterating a set of strings
+    # depends on per-process hash randomisation, which would make both the
+    # order and the ``[:n]`` truncation vary between identically seeded runs.
+    ips = {}
+    for c in cidrs:
+        try:
+            net = ipaddress.ip_network(c, strict=False)
+        except ValueError:
+            continue
+        if net.num_addresses == 1:
+            # Single host for either family (IPv4 /32, IPv6 /128).  Testing
+            # ``prefixlen == 32`` would misread an IPv6 /32 as one address.
+            ips[str(net.network_address)] = None
+        elif net.num_addresses > 2:
+            networks.append(net)
+        # Two-address networks (/31, /127) have no interior host; skipped.
+    if networks:
+        # Larger networks are picked proportionally more often.
+        weights = [net.num_addresses for net in networks]
+        attempts = 0
+        while len(ips) < n and attempts < n * 10:
+            net = rng.choices(networks, weights=weights, k=1)[0]
+            # Offsets 1 … size-2 exclude the first and last address.
+            host_offset = rng.randint(1, net.num_addresses - 2)
+            ips[str(net.network_address + host_offset)] = None
+            attempts += 1
+    return list(ips)[:n]
+
+
+def _generate_bot_ja4s(browser_ja4_set, rng, n=20):
+    """Build synthetic JA4 strings that are absent from *browser_ja4_set*.
+
+    *n* candidates are produced by cycling through a fixed list of
+    ``t1x`` stems, appending ``h1`` (when ``rng.random() < 0.7``) or
+    ``h2``, and two random 12-hex-digit segments.  Candidates already in
+    *browser_ja4_set* are discarded, so fewer than *n* may come back.
+    If nothing survives, a hard-coded list of five fingerprints is
+    returned instead; that list is not checked against the set.
+    """
+    stems = [
+        "t13d0305", "t13d0203", "t12d0507", "t10d0100", "t13d0101",
+        "t12d0302", "t13d0405", "t12d0204", "t10d0200", "t13d0102",
+    ]
+    generated = []
+    for idx in range(n):
+        stem = stems[idx % len(stems)]
+        # Draw order (segment, segment, ALPN) is kept so a seeded rng
+        # produces the same strings.
+        first = format(rng.randint(0, 0xFFFFFFFFFFFF), "012x")
+        second = format(rng.randint(0, 0xFFFFFFFFFFFF), "012x")
+        alpn = "h1" if rng.random() < 0.7 else "h2"
+        candidate = f"{stem}{alpn}_{first}_{second}"
+        if candidate not in browser_ja4_set:
+            generated.append(candidate)
+    if generated:
+        return generated
+    return [
+        "t13d030500_ffd59bab1b39_6e7f7df63e98",
+        "t13d020300_6b9b1b2c3d4e_ffd59bab1b39",
+        "t10d170000_0a1b2c3d4e5f_1b2c3d4e5f60",
+        "t12d050700_5a6b7c8d9e0f_1a2b3c4d5e6f",
+        "t13d010100_aabbccddeeff_0011223344aa",
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Request distribution
+# ---------------------------------------------------------------------------
+
+def _distribute_requests(n_ips, total_rows, min_req, max_req, rng):
+    """Split *total_rows* into *n_ips* per-IP counts within [min_req, max_req].
+
+    Every count but the last is drawn with ``rng.randint`` from the range
+    that still lets the remaining IPs stay inside the bounds; the last one
+    takes whatever is left, clamped to the bounds.  The list is shuffled
+    before it is returned.  When the bounds cannot accommodate
+    *total_rows*, counts are clamped and their sum will differ from it.
+    """
+    if n_ips == 0:
+        return []
+
+    def _clamp(value):
+        return max(min_req, min(max_req, value))
+
+    budget = total_rows
+    per_ip = []
+    # ``others`` is the number of IPs still to be filled after this one.
+    for others in range(n_ips - 1, 0, -1):
+        low = max(min_req, budget - others * max_req)
+        high = min(max_req, budget - others * min_req)
+        if low > high:
+            # Infeasible window: fall back to the clamped even share.
+            low = high = _clamp(budget // (others + 1))
+        drawn = rng.randint(low, high)
+        per_ip.append(drawn)
+        budget -= drawn
+    if n_ips > 0:
+        per_ip.append(_clamp(budget))
+    rng.shuffle(per_ip)
+    return per_ip
+
+
+# ---------------------------------------------------------------------------
+# Timestamp helper
+# ---------------------------------------------------------------------------
+
+_BASE_TIME = None
+
+
+def _now_minus(seconds):
+ """ISO-8601 UTC timestamp *seconds* in the past."""
+ global _BASE_TIME
+ if _BASE_TIME is None:
+ _BASE_TIME = datetime.now(timezone.utc)
+ t = _BASE_TIME - timedelta(seconds=seconds)
return t.strftime("%Y-%m-%dT%H:%M:%SZ")
+# ---------------------------------------------------------------------------
+# Row builder — identical field set to original
+# ---------------------------------------------------------------------------
+
def _make_row(
- src_ip: str,
- ua: str,
- path: str,
- method: str = "GET",
- ja4: str = None,
- tcp_profile: str = "linux",
- scheme: str = "https",
- host: str = "platform",
- time_offset_s: int = None,
- extra_headers: dict = None,
-) -> dict:
+ src_ip, ua, path, method="GET", ja4=None, tcp_profile="linux",
+ scheme="https", host="platform", time_offset_s=None,
+ extra_headers=None, rng=None,
+):
"""Build a single raw_json dict matching what the correlator produces."""
+ if rng is None:
+ rng = random
if time_offset_s is None:
- time_offset_s = random.randint(0, 1700) # spread over last ~28 min
+ time_offset_s = rng.randint(0, 1700)
if ja4 is None:
- ja4 = random.choice(HUMAN_JA4S)
+ ja4 = "t13d1917h2_b0372614b25a_6a77dcf5a8be"
ja3_raw, ja3_hash = _ja3_for_ja4(ja4)
- tcp = _tcp_meta(tcp_profile)
- syn_ms = _syn_ms(tcp_profile)
+ tcp = _tcp_meta(tcp_profile, rng)
+ syn_ms = _syn_ms(tcp_profile, rng)
client_headers = "Host,User-Agent,Accept,Accept-Language,Accept-Encoding"
if extra_headers:
@@ -248,7 +637,7 @@ def _make_row(
row = {
"time": _now_minus(time_offset_s),
"src_ip": src_ip,
- "src_port": random.randint(1024, 65535),
+ "src_port": rng.randint(1024, 65535),
"dst_ip": "172.20.0.2",
"dst_port": 443 if scheme == "https" else 80,
"method": method,
@@ -256,15 +645,18 @@ def _make_row(
"host": host,
"path": path.split("?")[0] if "?" in path else path,
"query": path.split("?")[1] if "?" in path else "",
- "http_version": "HTTP/2.0" if ja4.endswith("h2") else "HTTP/1.1",
+ "http_version": "HTTP/2.0" if "h2" in ja4 else "HTTP/1.1",
"orphan_side": "",
"correlated": True,
- "keepalives": random.randint(1, 8),
+ "keepalives": rng.randint(1, 8),
"a_timestamp": int(time.time() * 1_000_000),
"b_timestamp": int(time.time() * 1_000_000) + syn_ms * 1000,
- "conn_id": f"seed_{src_ip.replace('.', '_')}_{random.randint(1000,9999)}",
+ "conn_id": f"seed_{src_ip.replace('.', '_')}"
+ f"_{rng.randint(1000, 9999)}",
"syn_to_clienthello_ms": syn_ms,
- "tls_version": "1.3" if ja4.startswith("t13") else ("1.2" if ja4.startswith("t12") else "1.0"),
+ "tls_version": ("1.3" if ja4.startswith("t13")
+ else "1.2" if ja4.startswith("t12")
+ else "1.0"),
"tls_sni": host,
"tls_alpn": "h2" if "h2" in ja4 else "http/1.1",
"ja3": ja3_raw,
@@ -274,14 +666,14 @@ def _make_row(
"header_User-Agent": ua,
"header_Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
"header_Accept-Encoding": "gzip, deflate, br",
- "header_Accept-Language": random.choice(["fr-FR,fr;q=0.9", "en-US,en;q=0.9", "de-DE,de;q=0.8"]),
+ "header_Accept-Language": "",
"header_Content-Type": "",
"header_X-Request-Id": "",
"header_X-Trace-Id": "",
"header_X-Forwarded-For": "",
- "header_Sec-Fetch-Site": "none" if tcp_profile != "scanner" else "",
- "header_Sec-Fetch-Mode": "navigate" if tcp_profile != "scanner" else "",
- "header_Sec-Fetch-Dest": "document" if tcp_profile != "scanner" else "",
+ "header_Sec-Fetch-Site": "",
+ "header_Sec-Fetch-Mode": "",
+ "header_Sec-Fetch-Dest": "",
"header_Sec-CH-UA": "",
"header_Sec-CH-UA-Mobile": "",
"header_Sec-CH-UA-Platform": "",
@@ -292,95 +684,341 @@ def _make_row(
return row
-def generate_human_sessions(n: int = 720) -> list:
- """Generate realistic human browsing sessions.
+# ---------------------------------------------------------------------------
+# Traffic generators
+# ---------------------------------------------------------------------------
- Each IP gets 1–3 requests spread across different paths.
- Distinct (src_ip, ja4, host) → distinct rows in agg_host_ip_ja4_1h.
- We need ≥ 500 human rows for the bot_detector baseline.
+def generate_browser_traffic(n_ips, total_rows, browser_ja4s, rng):
+ """Generate legitimate browser sessions with realistic navigation.
+
+ Each IP gets a consistent browser profile (family, JA4, UA, TCP) and
+ produces page navigations + asset/API requests with Referer chains,
+ cookies, Sec-Fetch headers, and Sec-CH-UA for Chromium browsers.
"""
- ips = _human_ips(n)
rows = []
- for ip in ips:
- # 1–3 requests per IP with the same JA4 (browser stays consistent)
- ja4 = random.choice(HUMAN_JA4S)
- ua = random.choice(HUMAN_UA)
- tcp = random.choice(["linux", "windows", "macos", "android"])
- n_req = random.randint(1, 3)
+ ips = _gen_ips_from_prefixes(ISP_PREFIXES, n_ips, rng)
+ counts = _distribute_requests(len(ips), total_rows, 5, 50, rng)
+
+ families = (list(browser_ja4s.keys())
+ if browser_ja4s else list(BROWSER_UAS.keys()))
+ family_weights = [BROWSER_WEIGHTS.get(f, 1) for f in families]
+
+ platform_map = {
+ "windows": '"Windows"', "linux": '"Linux"',
+ "macos": '"macOS"', "android": '"Android"',
+ }
+
+ for ip, n_req in zip(ips, counts):
+ family = rng.choices(families, weights=family_weights, k=1)[0]
+ ja4_list = browser_ja4s.get(family, [])
+ if not ja4_list:
+ for fb in ("Chromium", "Firefox", "Safari"):
+ ja4_list = browser_ja4s.get(fb, [])
+ if ja4_list:
+ break
+ if not ja4_list:
+ ja4_list = ["t13d1917h2_b0372614b25a_6a77dcf5a8be"]
+ ja4 = rng.choice(ja4_list)
+ ua = rng.choice(BROWSER_UAS.get(family, BROWSER_UAS["Chromium"]))
+ tcp = rng.choice(BROWSER_TCP.get(family, ["linux"]))
+ host = rng.choice(HOSTS)
+ lang = rng.choice(ACCEPT_LANGUAGES)
+ is_chromium = family in CHROMIUM_FAMILIES
+ session_cookie = f"sid={rng.randint(100000, 999999)}"
+ ch_ua = SEC_CH_UA.get(family, "") if is_chromium else ""
+ ch_mobile = "?0" if is_chromium else ""
+ ch_plat = platform_map.get(tcp, "") if is_chromium else ""
+
+ base_offset = rng.randint(60, 1700)
+ current_page = ""
+
+ for req_i in range(n_req):
+ offset = max(1, base_offset - req_i * rng.randint(1, 5))
+
+ if req_i == 0 or rng.random() < 0.25:
+ # Navigate to a new page (always on the first request, 25% afterwards)
+ path = rng.choice(PAGE_PATHS)
+ method = "GET"
+ referer = ("" if req_i == 0
+ else f"https://{host}{current_page}")
+ sec_site = "none" if req_i == 0 else "same-origin"
+ sec_mode = "navigate"
+ sec_dest = "document"
+ current_page = path
+ elif rng.random() < 0.5:
+ # Asset request (37.5% of follow-up requests: 0.75 * 0.5)
+ path = rng.choice(ASSET_PATHS)
+ method = "GET"
+ referer = (f"https://{host}{current_page}"
+ if current_page else "")
+ sec_site = "same-origin"
+ sec_mode = "no-cors"
+ sec_dest = rng.choice(["script", "style", "image", "font"])
+ elif rng.random() < 0.4:
+ # API call (15% of follow-up requests: 0.75 * 0.5 * 0.4)
+ path = rng.choice(API_PATHS)
+ method = rng.choice(["GET", "POST"])
+ referer = (f"https://{host}{current_page}"
+ if current_page else "")
+ sec_site = "same-origin"
+ sec_mode = "cors"
+ sec_dest = "empty"
+ else:
+ # Another page navigation (remaining 22.5% of follow-up requests)
+ path = rng.choice(PAGE_PATHS)
+ method = "GET"
+ referer = (f"https://{host}{current_page}"
+ if current_page else "")
+ sec_site = "same-origin"
+ sec_mode = "navigate"
+ sec_dest = "document"
+ current_page = path
+
+ row = _make_row(
+ src_ip=ip, ua=ua, path=path, method=method,
+ ja4=ja4, tcp_profile=tcp, scheme="https", host=host,
+ time_offset_s=offset, rng=rng,
+ )
+ row["header_Accept-Language"] = lang
+ row["header_Sec-Fetch-Site"] = sec_site
+ row["header_Sec-Fetch-Mode"] = sec_mode
+ row["header_Sec-Fetch-Dest"] = sec_dest
+ row["header_Sec-CH-UA"] = ch_ua
+ row["header_Sec-CH-UA-Mobile"] = ch_mobile
+ row["header_Sec-CH-UA-Platform"] = ch_plat
+ if referer:
+ row["header_Referer"] = referer
+ if "Referer" not in row["client_headers"]:
+ row["client_headers"] += ",Referer"
+ if req_i > 0:
+ row["header_Cookie"] = session_cookie
+ if "Cookie" not in row["client_headers"]:
+ row["client_headers"] += ",Cookie"
+ rows.append(row)
+
+ return rows
+
+
+def generate_scanner_traffic(n_ips, total_rows, bot_ja4s, rng):
+ """Generate malicious bot/scanner traffic in four sub-categories.
+
+ a. Vulnerability scanners (30% of IPs): diverse attack paths
+ b. Credential stuffers (20% of IPs): POST brute-force on one login path per IP
+ c. Content scrapers (30% of IPs): methodical path crawling
+ d. DDoS-like (remaining ~20% of IPs): same path hammered 100+ times
+ """
+ rows = []
+ ips = _gen_ips_from_prefixes(DC_SCANNER_PREFIXES, n_ips, rng)
+
+ n_vuln = int(n_ips * 0.30)
+ n_cred = int(n_ips * 0.20)
+ n_scrap = int(n_ips * 0.30)
+ n_ddos = n_ips - n_vuln - n_cred - n_scrap
+
+ # Budgets = per-IP minimum + surplus share; NOTE: r_scrap is the remainder and drops below min_scrap if total_rows < total_min
+ min_vuln = n_vuln * 10
+ min_cred = n_cred * 15
+ min_scrap = n_scrap * 10
+ min_ddos = n_ddos * 100
+ total_min = min_vuln + min_cred + min_scrap + min_ddos
+ surplus = max(0, total_rows - total_min)
+ r_ddos = min_ddos + int(surplus * 0.40)
+ r_vuln = min_vuln + int(surplus * 0.20)
+ r_cred = min_cred + int(surplus * 0.15)
+ r_scrap = total_rows - r_ddos - r_vuln - r_cred
+
+ idx = 0
+
+ # --- Vulnerability scanners ---
+ vuln_ips = ips[idx:idx + n_vuln]; idx += n_vuln
+ counts = _distribute_requests(len(vuln_ips), r_vuln, 10, 60, rng)
+ for ip, n_req in zip(vuln_ips, counts):
+ ua = rng.choice(SCANNER_UAS)
+ ja4 = rng.choice(bot_ja4s)
+ rotate_ua = rng.random() < 0.3
+ for _ in range(n_req):
+ if rotate_ua:
+ ua = rng.choice(SCANNER_UAS)
+ rows.append(_make_row(
+ src_ip=ip, ua=ua, path=rng.choice(ATTACK_PATHS),
+ method=rng.choice(["GET", "GET", "HEAD"]),
+ ja4=ja4, tcp_profile="scanner", scheme="https",
+ host=rng.choice(HOSTS), rng=rng,
+ ))
+
+ # Up to 100 vuln-scanner IPs also probe 5 random dst_ports each (rows added on top of the per-IP counts)
+ for ip in rng.sample(vuln_ips, min(100, len(vuln_ips))):
+ for port in rng.sample(range(80, 10000), 5):
+ row = _make_row(
+ src_ip=ip, ua="-", path="/",
+ method="GET", ja4=rng.choice(bot_ja4s),
+ tcp_profile="scanner", scheme="https",
+ host=rng.choice(HOSTS), rng=rng,
+ )
+ row["dst_port"] = port
+ rows.append(row)
+
+ # --- Credential stuffers ---
+ login_paths = [
+ "/login", "/admin/login", "/api/auth/login",
+ "/wp-login.php", "/user/signin", "/api/v1/auth",
+ ]
+ cred_ips = ips[idx:idx + n_cred]; idx += n_cred
+ counts = _distribute_requests(len(cred_ips), r_cred, 15, 60, rng)
+ for ip, n_req in zip(cred_ips, counts):
+ ua = rng.choice(SCANNER_UAS + HEADLESS_UAS)
+ ja4 = rng.choice(bot_ja4s)
+ target = rng.choice(login_paths)
for _ in range(n_req):
rows.append(_make_row(
- src_ip=ip, ua=ua,
- path=random.choice(HUMAN_PATHS),
- method=random.choice(["GET", "GET", "GET", "POST"]),
- ja4=ja4, tcp_profile=tcp,
- scheme=random.choice(["https", "https", "http"]),
- ))
- return rows
-
-
-def generate_scanner_sessions(n: int = 150) -> list:
- """Generate scanner/attack traffic — anomalous patterns for ML detection.
-
- Characteristics: minimal TCP options, small window, no Sec-Fetch headers,
- attack paths, scanner UAs, rapid-fire requests (low syn_ms).
- """
- ips = _scanner_ips(n)
- rows = []
- for ip in ips:
- ja4 = random.choice(BOT_JA4S[:3]) # curl/python/masscan profiles
- ua = random.choice(SCANNER_UA)
- # Burst: 5–20 requests per IP (simulates scan / brute-force)
- n_req = random.randint(5, 20)
- for _ in range(n_req):
- rows.append(_make_row(
- src_ip=ip, ua=ua,
- path=random.choice(ATTACK_PATHS + ATTACK_PATHS + HUMAN_PATHS),
- method=random.choice(["GET", "GET", "GET", "HEAD", "POST"]),
- ja4=ja4, tcp_profile="scanner",
- scheme="https",
- extra_headers={"Content-Type": ""} if random.random() < 0.3 else None,
- ))
- return rows
-
-
-def generate_known_bot_sessions(n: int = 100) -> list:
- """Generate sessions from IPs listed in bot_ip.csv (direct bot labeling)."""
- rows = []
- for _ in range(n):
- ip = random.choice(BOT_IP_KNOWN)
- ua = random.choice(BOT_CRAWLER_UA + SCANNER_UA)
- ja4 = random.choice(BOT_JA4S)
- rows.append(_make_row(
- src_ip=ip, ua=ua,
- path=random.choice(BOT_PATHS + ATTACK_PATHS),
- ja4=ja4, tcp_profile="scanner",
- scheme="https",
- ))
- return rows
-
-
-def generate_brute_force_cluster(n_ips: int = 20) -> list:
- """Simulate credential stuffing / brute-force from a small set of IPs.
-
- Same IP → many POST /login requests = high hit count, suspicious pattern.
- """
- ips = _scanner_ips(n_ips)[:n_ips]
- rows = []
- for ip in ips:
- ua = random.choice(SCANNER_UA + BOT_CRAWLER_UA)
- ja4 = random.choice(BOT_JA4S)
- for _ in range(random.randint(20, 50)):
- rows.append(_make_row(
- src_ip=ip, ua=ua,
- path="/login",
- method="POST",
- ja4=ja4, tcp_profile="scanner",
- scheme="https",
+ src_ip=ip, ua=ua, path=target, method="POST",
+ ja4=ja4, tcp_profile="scanner", scheme="https",
+ host=rng.choice(HOSTS[:2]), rng=rng,
extra_headers={
"Content-Type": "application/x-www-form-urlencoded",
- "Content-Length": "32",
+ "Content-Length": str(rng.randint(20, 60)),
},
))
+
+ # --- Content scrapers ---
+ scrap_ips = ips[idx:idx + n_scrap]; idx += n_scrap
+ counts = _distribute_requests(len(scrap_ips), r_scrap, 10, 60, rng)
+ for ip, n_req in zip(scrap_ips, counts):
+ ua = rng.choice(SCANNER_UAS[:8])
+ ja4 = rng.choice(bot_ja4s)
+ for i in range(n_req):
+ rows.append(_make_row(
+ src_ip=ip, ua=ua,
+ path=SCRAPER_PATHS[i % len(SCRAPER_PATHS)],
+ method="GET", ja4=ja4, tcp_profile="scanner",
+ scheme="https", host=rng.choice(HOSTS), rng=rng,
+ ))
+
+ # --- DDoS-like ---
+ ddos_targets = ["/", "/api/v1/search", "/products", "/api/v2/metrics"]
+ ddos_ips = ips[idx:idx + n_ddos]
+ counts = _distribute_requests(len(ddos_ips), r_ddos, 100, 200, rng)
+ for ip, n_req in zip(ddos_ips, counts):
+ ua = rng.choice(SCANNER_UAS)
+ ja4 = rng.choice(bot_ja4s)
+ target = rng.choice(ddos_targets)
+ for _ in range(n_req):
+ rows.append(_make_row(
+ src_ip=ip, ua=ua, path=target, method="GET",
+ ja4=ja4,
+ tcp_profile=rng.choice(["scanner", "minimal"]),
+ scheme="https", host=HOSTS[0],
+ time_offset_s=rng.randint(0, 300), rng=rng,
+ ))
+
+ return rows
+
+
+def generate_legit_bot_traffic(n_ips, total_rows, bot_ips_by_family,
+ bot_ja4s, rng):
+ """Generate traffic from known legitimate bots (Googlebot, Bingbot, …).
+
+ IPs are drawn from bot_ip.csv CIDRs when available so they match
+ the bot_ip dictionary for direct labelling.
+ """
+ rows = []
+ legit_families = list(LEGIT_BOT_UAS.keys())
+
+ all_ips = []
+ ip_family_map = {}
+ per_family = max(1, n_ips // len(legit_families))
+
+ for family in legit_families:
+ cidrs = bot_ips_by_family.get(family, [])
+ if cidrs:
+ fam_ips = _ips_from_cidrs(cidrs, per_family, rng)
+ else:
+ fam_ips = _gen_ips_from_prefixes(
+ DC_LEGIT_BOT_PREFIXES, per_family, rng)
+ for ip in fam_ips:
+ ip_family_map[ip] = family
+ all_ips.extend(fam_ips)
+
+ if len(all_ips) < n_ips:
+ extra = _gen_ips_from_prefixes(
+ DC_LEGIT_BOT_PREFIXES, n_ips - len(all_ips), rng)
+ for ip in extra:
+ ip_family_map[ip] = rng.choice(legit_families)
+ all_ips.extend(extra)
+
+ all_ips = all_ips[:n_ips]
+ counts = _distribute_requests(len(all_ips), total_rows, 5, 30, rng)
+
+ for ip, n_req in zip(all_ips, counts):
+ family = ip_family_map.get(ip, rng.choice(legit_families))
+ ua = rng.choice(
+ LEGIT_BOT_UAS.get(family, LEGIT_BOT_UAS["Googlebot"]))
+ ja4 = rng.choice(bot_ja4s)
+
+ # Bots: /robots.txt, then /sitemap.xml (70%), then content (list order only; each row's time offset is drawn at random)
+ paths = ["/robots.txt"]
+ if rng.random() < 0.7:
+ paths.append("/sitemap.xml")
+ remaining_n = n_req - len(paths)
+ paths.extend(
+ rng.choices(CONTENT_PATHS + BOT_PATHS, k=max(0, remaining_n)))
+ paths = paths[:n_req]
+
+ for path in paths:
+ rows.append(_make_row(
+ src_ip=ip, ua=ua, path=path, method="GET",
+ ja4=ja4,
+ tcp_profile=rng.choice(["linux", "linux", "scanner"]),
+ scheme="https", host=rng.choice(HOSTS), rng=rng,
+ ))
+
+ return rows
+
+
+def generate_ai_bot_traffic(n_ips, total_rows, bot_ips_by_family,
+ bot_ja4s, rng):
+ """Generate aggressive AI bot scraping traffic (GPTBot, ClaudeBot, …)."""
+ rows = []
+ ai_families = list(AI_BOT_UAS.keys())
+
+ all_ips = []
+ ip_family_map = {}
+ per_family = max(1, n_ips // len(ai_families))
+
+ for family in ai_families:
+ cidrs = bot_ips_by_family.get(family, [])
+ if cidrs:
+ fam_ips = _ips_from_cidrs(cidrs, per_family, rng)
+ else:
+ fam_ips = _gen_ips_from_prefixes(
+ DC_AI_PREFIXES, per_family, rng)
+ for ip in fam_ips:
+ ip_family_map[ip] = family
+ all_ips.extend(fam_ips)
+
+ if len(all_ips) < n_ips:
+ extra = _gen_ips_from_prefixes(
+ DC_AI_PREFIXES, n_ips - len(all_ips), rng)
+ for ip in extra:
+ ip_family_map[ip] = rng.choice(ai_families)
+ all_ips.extend(extra)
+
+ all_ips = all_ips[:n_ips]
+ counts = _distribute_requests(len(all_ips), total_rows, 10, 50, rng)
+
+ for ip, n_req in zip(all_ips, counts):
+ family = ip_family_map.get(ip, rng.choice(ai_families))
+ ua = rng.choice(AI_BOT_UAS[family])
+ ja4 = rng.choice(bot_ja4s)
+
+ paths = rng.choices(CONTENT_PATHS, k=n_req)
+ for path in paths:
+ rows.append(_make_row(
+ src_ip=ip, ua=ua, path=path, method="GET",
+ ja4=ja4, tcp_profile="linux", scheme="https",
+ host=rng.choice(HOSTS), rng=rng,
+ ))
+
return rows
@@ -388,8 +1026,8 @@ def generate_brute_force_cluster(n_ips: int = 20) -> list:
# ClickHouse insert
# ---------------------------------------------------------------------------
-def _ch_insert(rows: list, host: str, port: int, user: str, password: str,
- batch_size: int = 200, dry_run: bool = False) -> int:
+def _ch_insert(rows, host, port, user, password,
+ batch_size=2000, dry_run=False):
"""Insert rows into ja4_logs.http_logs_raw via ClickHouse HTTP interface.
Each row is wrapped as {"raw_json": ""} in JSONEachRow format.
@@ -399,19 +1037,20 @@ def _ch_insert(rows: list, host: str, port: int, user: str, password: str,
print("[seed] Sample row:", json.dumps(rows[0], indent=2)[:400])
return len(rows)
+ query = "INSERT INTO ja4_logs.http_logs_raw (raw_json) FORMAT JSONEachRow"
url = (
f"http://{host}:{port}/"
- f"?query={urllib.parse.quote('INSERT INTO ja4_logs.http_logs_raw (raw_json) FORMAT JSONEachRow')}"
+ f"?query={urllib.parse.quote(query)}"
f"&user={urllib.parse.quote(user)}"
f"&password={urllib.parse.quote(password)}"
)
total_inserted = 0
+ n_batches = (len(rows) + batch_size - 1) // batch_size
for i in range(0, len(rows), batch_size):
batch = rows[i:i + batch_size]
body_lines = []
for row in batch:
- # raw_json column holds the entire log as a JSON string
outer = {"raw_json": json.dumps(row, separators=(",", ":"))}
body_lines.append(json.dumps(outer, separators=(",", ":")))
body = "\n".join(body_lines).encode("utf-8")
@@ -426,15 +1065,20 @@ def _ch_insert(rows: list, host: str, port: int, user: str, password: str,
total_inserted += len(batch)
except urllib.error.HTTPError as e:
err_body = e.read(500).decode("utf-8", errors="replace")
- print(f"[seed] ERROR batch {i}–{i+batch_size}: HTTP {e.code}: {err_body}")
+ print(f"[seed] ERROR batch {i}–{i+batch_size}: "
+ f"HTTP {e.code}: {err_body}")
except Exception as e:
print(f"[seed] ERROR batch {i}–{i+batch_size}: {e}")
+ if total_inserted % 10000 < batch_size:
+ batch_num = i // batch_size + 1
+ print(f"[seed] Progress: {total_inserted:,}/{len(rows):,} rows "
+ f"({batch_num}/{n_batches} batches)")
+
return total_inserted
-def _wait_for_clickhouse(host: str, port: int, user: str, password: str,
- timeout_s: int = 60) -> bool:
+def _wait_for_clickhouse(host, port, user, password, timeout_s=60):
"""Wait for ClickHouse to be ready."""
url = (
f"http://{host}:{port}/"
@@ -459,51 +1103,121 @@ def _wait_for_clickhouse(host: str, port: int, user: str, password: str,
# ---------------------------------------------------------------------------
def main():
- parser = argparse.ArgumentParser(description="Seed ClickHouse with synthetic traffic")
+ parser = argparse.ArgumentParser(
+ description="Seed ClickHouse with synthetic traffic")
parser.add_argument("--host", default="clickhouse")
parser.add_argument("--port", type=int, default=8123)
parser.add_argument("--user", default="default")
parser.add_argument("--password", default="")
parser.add_argument("--dry-run", action="store_true",
help="Generate data but do not insert")
+ parser.add_argument("--rows", type=int, default=500_000,
+ help="Total rows to generate (default: 500000)")
+ parser.add_argument("--ips", type=int, default=20_000,
+ help="Total unique IPs (default: 20000)")
+ parser.add_argument("--seed", type=int, default=None,
+ help="Random seed for reproducibility")
+ parser.add_argument("--data-dir", default=None,
+ help="Path to CSV data directory "
+ "(browser_ja4.csv, bot_ip.csv)")
args = parser.parse_args()
+ rng = random.Random(args.seed)
+
if not args.dry_run:
print(f"[seed] Waiting for ClickHouse at {args.host}:{args.port}…")
- if not _wait_for_clickhouse(args.host, args.port, args.user, args.password):
+ if not _wait_for_clickhouse(args.host, args.port,
+ args.user, args.password):
print("[seed] FATAL: ClickHouse not reachable after 60s")
raise SystemExit(1)
print("[seed] ClickHouse ready.")
t0 = time.monotonic()
- # Generate all row sets
- print("[seed] Generating rows…")
- human_rows = generate_human_sessions(720) # ≥ 500 unique (ip,ja4,host) human sessions
- scanner_rows = generate_scanner_sessions(150) # anomalous datacenter traffic
- known_bot = generate_known_bot_sessions(100) # directly labeled by bot_ip.csv
- brute_force = generate_brute_force_cluster(20) # credential stuffing pattern
+ # --- Load CSV data ---
+ data_dir = _find_data_dir(args.data_dir)
+ if data_dir:
+ print(f"[seed] Loading CSV data from {data_dir}")
+ else:
+ print("[seed] WARNING: CSV data directory not found, "
+ "using fallback data")
- all_rows = human_rows + scanner_rows + known_bot + brute_force
- random.shuffle(all_rows)
+ browser_ja4s = load_browser_ja4s(data_dir)
+ bot_ips_by_family = load_bot_ips(data_dir)
- print(f"[seed] Total rows to insert: {len(all_rows)}")
- print(f" • {len(human_rows):<5} human sessions "
- f"(~{len(set(r['src_ip'] for r in human_rows))} unique IPs)")
- print(f" • {len(scanner_rows):<5} scanner/anomaly sessions")
- print(f" • {len(known_bot):<5} known-bot sessions")
- print(f" • {len(brute_force):<5} brute-force rows")
+ browser_ja4_set = set()
+ for ja4_list in browser_ja4s.values():
+ browser_ja4_set.update(ja4_list)
+ if browser_ja4s:
+ total_ja4 = sum(len(v) for v in browser_ja4s.values())
+ print(f"[seed] Loaded {total_ja4} browser JA4s "
+ f"across {len(browser_ja4s)} families")
+ if bot_ips_by_family:
+ total_cidrs = sum(len(v) for v in bot_ips_by_family.values())
+ print(f"[seed] Loaded {total_cidrs} bot CIDRs "
+ f"across {len(bot_ips_by_family)} families")
+
+ bot_ja4s = _generate_bot_ja4s(browser_ja4_set, rng)
+
+ # --- IP and row budgets ---
+ n_browser_ips = int(args.ips * 0.70)
+ n_scanner_ips = int(args.ips * 0.15)
+ n_legit_bot_ips = int(args.ips * 0.10)
+ n_ai_bot_ips = (args.ips - n_browser_ips
+ - n_scanner_ips - n_legit_bot_ips)
+
+ n_browser_rows = int(args.rows * 0.70)
+ n_scanner_rows = int(args.rows * 0.20)
+ n_legit_bot_rows = int(args.rows * 0.06)
+ n_ai_bot_rows = (args.rows - n_browser_rows
+ - n_scanner_rows - n_legit_bot_rows)
+
+ print(f"[seed] Generating {args.rows:,} rows from {args.ips:,} IPs…")
+
+ # --- Generate ---
+ browser_rows = generate_browser_traffic(
+ n_browser_ips, n_browser_rows, browser_ja4s, rng)
+ print(f"[seed] Browser: {len(browser_rows):>7,} rows "
+ f"({len(set(r['src_ip'] for r in browser_rows)):,} IPs)")
+
+ scanner_rows = generate_scanner_traffic(
+ n_scanner_ips, n_scanner_rows, bot_ja4s, rng)
+ print(f"[seed] Scanner: {len(scanner_rows):>7,} rows "
+ f"({len(set(r['src_ip'] for r in scanner_rows)):,} IPs)")
+
+ legit_bot_rows = generate_legit_bot_traffic(
+ n_legit_bot_ips, n_legit_bot_rows,
+ bot_ips_by_family, bot_ja4s, rng)
+ print(f"[seed] Legit bots: {len(legit_bot_rows):>7,} rows "
+ f"({len(set(r['src_ip'] for r in legit_bot_rows)):,} IPs)")
+
+ ai_bot_rows = generate_ai_bot_traffic(
+ n_ai_bot_ips, n_ai_bot_rows,
+ bot_ips_by_family, bot_ja4s, rng)
+ print(f"[seed] AI bots: {len(ai_bot_rows):>7,} rows "
+ f"({len(set(r['src_ip'] for r in ai_bot_rows)):,} IPs)")
+
+ all_rows = browser_rows + scanner_rows + legit_bot_rows + ai_bot_rows
+ rng.shuffle(all_rows)
+
+ gen_elapsed = time.monotonic() - t0
+ print(f"[seed] Generated {len(all_rows):,} total rows "
+ f"in {gen_elapsed:.1f}s")
+
+ # --- Insert ---
inserted = _ch_insert(
all_rows, args.host, args.port, args.user, args.password,
- dry_run=args.dry_run,
+ batch_size=2000, dry_run=args.dry_run,
)
elapsed = time.monotonic() - t0
- print(f"[seed] Done: {inserted}/{len(all_rows)} rows inserted in {elapsed:.1f}s")
+ print(f"[seed] Done: {inserted:,}/{len(all_rows):,} rows inserted "
+ f"in {elapsed:.1f}s")
if inserted < len(all_rows) * 0.9:
- print("[seed] WARNING: fewer than 90% of rows inserted — check errors above")
+ print("[seed] WARNING: fewer than 90% of rows inserted — "
+ "check errors above")
raise SystemExit(1)
print(f"[seed] The bot_detector should now see ≥ 500 human sessions "