From 7b7b69dee3316ccb8693d64523c0c912475d6293 Mon Sep 17 00:00:00 2001 From: toto Date: Wed, 8 Apr 2026 16:35:40 +0200 Subject: [PATCH] Rewrite seed_clickhouse.py: 500K rows from 20K IPs with realistic traffic - 350K browser rows (14K IPs) using real JA4s from browser_ja4.csv - 100K scanner rows (3K IPs) with vuln/cred/scraper/DDoS sub-categories - 30K legit bot rows (2K IPs) from real bot_ip.csv CIDRs - 20K AI bot rows (1K IPs) for GPTBot, ClaudeBot, etc. Key improvements: - Load browser_ja4.csv at startup, match JA4 to browser family - Load bot_ip.csv to generate IPs from real Googlebot/Bingbot CIDRs - Hard-coded ISP /24 prefixes from real ASNs (Comcast, Orange, DT, etc.) - Realistic navigation patterns with Referer chains and cookies - Sec-CH-UA headers for Chromium browsers (modern_browser_score >= 50) - Batch size increased to 2000, progress reporting every 10K rows - New CLI args: --rows, --ips, --seed, --data-dir - Bot JA4s are synthetic hashes guaranteed NOT in browser_ja4.csv Also updated: - Dockerfile: COPY *.py (was missing seed_clickhouse.py) - docker-compose.yml: mount scripts/data as /app/data for CSV access - run-tests.sh: updated seeder description comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/integration/docker-compose.yml | 2 + tests/integration/run-tests.sh | 10 +- tests/integration/traffic-gen/Dockerfile | 2 +- .../traffic-gen/seed_clickhouse.py | 1240 +++++++++++++---- 4 files changed, 985 insertions(+), 269 deletions(-) diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml index 8f29bb1..971e3c5 100644 --- a/tests/integration/docker-compose.yml +++ b/tests/integration/docker-compose.yml @@ -157,6 +157,8 @@ services: depends_on: platform: condition: service_healthy + volumes: + - ../../scripts/data:/app/data:ro networks: - ja4net diff --git a/tests/integration/run-tests.sh b/tests/integration/run-tests.sh index 65116b8..6d23741 100755 --- 
a/tests/integration/run-tests.sh +++ b/tests/integration/run-tests.sh @@ -162,15 +162,15 @@ log "============================================" # The seeder inserts directly into http_logs_raw, triggering all MVs: # http_logs_raw → mv_http_logs → http_logs → mv_agg_host_ip_ja4_1h → agg_host_ip_ja4_1h # This pre-populates: -# - 720 human sessions (IPs in residential ASN ranges → asn_label='human') -# - 150 scanner/anomaly sessions (IPs in datacenter ASN → ML anomaly candidates) -# - 100 known-bot sessions (IPs/JA4 matching bot_ip.csv / bot_ja4.csv) -# - 20 brute-force clusters (many POST /login per IP) +# - ~350,000 rows from 14,000 browser IPs (ISP ASN ranges → asn_label='isp') +# - ~100,000 rows from 3,000 scanner IPs (datacenter ASN → ML anomaly candidates) +# - ~30,000 rows from 2,000 legit bot IPs (from bot_ip.csv CIDRs) +# - ~20,000 rows from 1,000 AI bot IPs (datacenter ranges) # After seeding, bot_detector has ≥500 human rows → can train and run. log "Running seed_clickhouse.py..." if docker compose exec -T traffic-gen python /app/seed_clickhouse.py \ --host clickhouse --port 8123 --user default --password ""; then - pass "ClickHouse seeded (700+ human + 150 scanner + 100 known-bot rows)" + pass "ClickHouse seeded (500K rows: 350K browser + 100K scanner + 30K legit-bot + 20K AI-bot)" else warn "Seeder reported errors (pipeline verification will show impact)" fi diff --git a/tests/integration/traffic-gen/Dockerfile b/tests/integration/traffic-gen/Dockerfile index f5adc31..50e568f 100644 --- a/tests/integration/traffic-gen/Dockerfile +++ b/tests/integration/traffic-gen/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.12-alpine # No extra deps needed — stdlib only (urllib, ssl, concurrent.futures) WORKDIR /app -COPY generate_traffic.py . +COPY *.py . 
# Keep container alive; traffic is triggered via docker compose exec ENTRYPOINT ["sleep", "infinity"] diff --git a/tests/integration/traffic-gen/seed_clickhouse.py b/tests/integration/traffic-gen/seed_clickhouse.py index a4d9590..d1b46ff 100644 --- a/tests/integration/traffic-gen/seed_clickhouse.py +++ b/tests/integration/traffic-gen/seed_clickhouse.py @@ -3,26 +3,41 @@ seed_clickhouse.py — Bootstrap ClickHouse with realistic synthetic traffic data. Inserts directly into ja4_logs.http_logs_raw (triggers all MVs automatically): - • 700 human sessions — IPs in residential ISP ranges (ASN→'human' via dict) - • 150 datacenter/scanner sessions — anomalous patterns for ML detection - • 100 known-bot sessions — IPs/JA4 in bot_ip.csv / bot_ja4.csv + • ~350,000 rows from 14,000 legitimate browser IPs (ISP ranges, asn_label='isp') + • ~100,000 rows from 3,000 malicious bot/scanner IPs (datacenter ranges) + • ~30,000 rows from 2,000 legitimate bot IPs (from bot_ip.csv CIDRs) + • ~20,000 rows from 1,000 AI bot IPs (datacenter ranges) + +Total: 500,000 rows from 20,000 unique IPs (configurable via --rows / --ips). + +Browser JA4 fingerprints are loaded from browser_ja4.csv so they match +dict_browser_ja4 (LEGITIMATE_BROWSER classification). Bot/scanner JA4s are +synthetic hashes guaranteed NOT to appear in that dictionary. + +Bot IPs are drawn from real CIDRs in bot_ip.csv (Googlebot, Bingbot, etc.). +ISP and datacenter IPs use hard-coded /24 prefixes from well-known ASNs that +resolve correctly through iplocate-ip-to-asn.csv → asn_reputation.csv. This ensures view_ai_features_1h has ≥ 500 human rows for the bot_detector training threshold (run_semi_supervised_logic requires len(human_baseline) >= 500). All timestamps are within the last 30 minutes so the 24h window filter catches them. -No external dependencies — uses Python stdlib urllib only. +No external dependencies — uses Python stdlib only. 
Usage: python seed_clickhouse.py python seed_clickhouse.py --host clickhouse --port 8123 --user default --password "" + python seed_clickhouse.py --rows 500000 --ips 20000 --seed 42 python seed_clickhouse.py --dry-run """ import argparse +import csv import hashlib +import ipaddress import json +import os import random import time import urllib.error @@ -31,104 +46,194 @@ import urllib.request from datetime import datetime, timedelta, timezone # --------------------------------------------------------------------------- -# JA4 fingerprint profiles (must match bot_ja4.csv for bot detection to work) +# Hard-coded /24 prefixes — guaranteed to resolve via ASN dictionaries # --------------------------------------------------------------------------- -# Human browser profiles — realistic TLS 1.3 fingerprints -HUMAN_JA4S = [ - "t13d1917h2_b0372614b25a_6a77dcf5a8be", # Chrome 120 Windows TLS1.3 - "t13d1817h2_b0372614b25a_0a3e5785d15f", # Firefox 121 TLS1.3 - "t13d1617h2_fc82e8b7e1c0_9dc949149365", # Safari 17 macOS TLS1.3 - "t13d1917h2_fc82e8b7e1c0_6b9b1b2c3d4e", # Edge 120 TLS1.3 - "t13d1817h2_9dc949149365_8c4a9a4b0d01", # Chrome Mobile TLS1.3 - "t12d1706h2_9dc949149365_fc82e8b7e1c0", # Chrome 120 TLS1.2 (older server) - "t12d1606h2_8c4a9a4b0d01_9dc949149365", # Firefox TLS1.2 -] +# ISP ranges (asn_label='isp' in asn_reputation.csv) +ISP_PREFIXES = ( + # Comcast AS7922 — within 24.0.0.0/12 + [f"24.{o2}.{o3}" for o2 in range(1, 11) for o3 in range(0, 3)] + # Orange AS3215 — within 2.3.0.0/16 + + [f"2.3.{o3}" for o3 in range(0, 10)] + # Deutsche Telekom AS3320 — within 2.160.0.0/12 + + [f"2.{160 + o2}.{o3}" for o2 in range(0, 5) for o3 in range(0, 2)] + # AT&T AS7018 — within 12.0.0.0/10 + + [f"12.0.{o3}" for o3 in range(4, 14)] + # Verizon AS701 — within 63.0.0.0/12 + + [f"63.{o2}.0" for o2 in range(0, 10)] + # BT AS2856 — within 5.80.0.0/15 + + [f"5.80.{o3}" for o3 in range(0, 8)] +) # 78 prefixes × 254 ≈ 19.8K IPs -# Bot/scanner profiles — intentionally minimal cipher 
suites, match bot_ja4.csv -BOT_JA4S = [ - "t13d030500_ffd59bab1b39_6e7f7df63e98", # curl scanner (in bot_ja4.csv) - "t13d020300_6b9b1b2c3d4e_ffd59bab1b39", # python-requests scanner (in bot_ja4.csv) - "t10d170000_0a1b2c3d4e5f_1b2c3d4e5f60", # Masscan (in bot_ja4.csv) - "t12d050700_5a6b7c8d9e0f_1a2b3c4d5e6f", # zgrab (in bot_ja4.csv) - "t13d010100_aabbccddeeff_0011223344aa", # Headless Chrome automation (in bot_ja4.csv) +# Datacenter ranges for scanners (asn_label='datacenter') +DC_SCANNER_PREFIXES = ( + # DigitalOcean AS14061 — within 5.101.96.0/20 + [f"5.101.{96 + o3}" for o3 in range(0, 6)] + # Hetzner AS24940 — within 5.9.0.0/16 + + [f"5.9.{o3}" for o3 in range(0, 6)] + # OVH AS16276 — within 5.39.0.0/17 + + [f"5.39.{o3}" for o3 in range(0, 5)] +) # ~17 prefixes × 254 ≈ 4.3K IPs + +# Datacenter ranges for AI bots (separate from scanner ranges) +DC_AI_PREFIXES = ( + # DigitalOcean (different /24s) + [f"5.101.{102 + o3}" for o3 in range(0, 4)] + # Hetzner (different /24s) + + [f"5.9.{6 + o3}" for o3 in range(0, 4)] +) # ~8 prefixes × 254 ≈ 2K IPs + +# Fallback /24s for legitimate bot overflow +DC_LEGIT_BOT_PREFIXES = [f"5.9.{20 + o3}" for o3 in range(0, 8)] + +# --------------------------------------------------------------------------- +# Browser family → User-Agent mapping +# --------------------------------------------------------------------------- + +BROWSER_UAS = { + "Chromium": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", + "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36", + ], + "Firefox": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) " + "Gecko/20100101 
Firefox/121.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) " + "Gecko/20100101 Firefox/120.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:121.0) " + "Gecko/20100101 Firefox/121.0", + ], + "Safari": [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/17.2 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/17.1 Safari/605.1.15", + ], + "Edge": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0", + ], + "Opera": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 OPR/105.0.0.0", + ], + "Vivaldi": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 " + "Vivaldi/6.4.3160.47", + ], + "Chrome_iOS": [ + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) " + "CriOS/120.0.6099.119 Mobile/15E148 Safari/604.1", + ], + "Chromium_Legacy": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36", + ], + "Firefox_Legacy": [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) " + "Gecko/20100101 Firefox/78.0", + ], + "Safari_Legacy": [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) " + "Version/15.6 Safari/605.1.15", + ], + "Tor_Browser": [ + "Mozilla/5.0 (Windows NT 10.0; rv:102.0) " + "Gecko/20100101 Firefox/102.0", + ], +} + +BROWSER_TCP = { + "Chromium": ["windows", "linux", "android"], + "Firefox": ["windows", "linux"], + "Safari": ["macos"], + "Edge": ["windows"], + "Opera": ["windows", "linux"], + "Vivaldi": ["windows", 
"linux"], + "Chrome_iOS": ["macos"], + "Chromium_Legacy": ["windows"], + "Firefox_Legacy": ["windows", "linux"], + "Safari_Legacy": ["macos"], + "Tor_Browser": ["linux"], +} + +CHROMIUM_FAMILIES = { + "Chromium", "Edge", "Opera", "Vivaldi", + "Chrome_iOS", "Chromium_Legacy", +} + +BROWSER_WEIGHTS = { + "Chromium": 55, "Firefox": 15, "Safari": 12, "Edge": 8, + "Opera": 3, "Vivaldi": 1, "Chrome_iOS": 3, + "Chromium_Legacy": 1, "Firefox_Legacy": 1, + "Safari_Legacy": 0.5, "Tor_Browser": 0.5, +} + +SEC_CH_UA = { + "Chromium": + '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"', + "Edge": + '"Chromium";v="120", "Microsoft Edge";v="120", "Not-A.Brand";v="99"', + "Opera": + '"Chromium";v="119", "Opera";v="105", "Not-A.Brand";v="99"', + "Vivaldi": + '"Chromium";v="118", "Vivaldi";v="6.4", "Not-A.Brand";v="99"', + "Chrome_iOS": + '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"', + "Chromium_Legacy": + '"Chromium";v="90", "Google Chrome";v="90", "Not-A.Brand";v="99"', +} + +# --------------------------------------------------------------------------- +# Hosts and Accept-Language pools +# --------------------------------------------------------------------------- + +HOSTS = ["platform", "api.platform", "www.example.com", "shop.example.com"] + +ACCEPT_LANGUAGES = [ + "en-US,en;q=0.9", + "en-GB,en;q=0.9", + "fr-FR,fr;q=0.9,en;q=0.8", + "de-DE,de;q=0.9,en;q=0.8", + "es-ES,es;q=0.9,en;q=0.8", + "ja-JP,ja;q=0.9,en;q=0.8", + "pt-BR,pt;q=0.9,en;q=0.8", + "zh-CN,zh;q=0.9,en;q=0.8", ] # --------------------------------------------------------------------------- -# IP pools — must match ranges in iplocate-ip-to-asn.csv +# Path pools # --------------------------------------------------------------------------- -# Human residential IPs — OVH FR (ASN 16276) → asn_label='human' -def _human_ips(n: int) -> list: - ips = [f"91.121.{o3}.{o4}" for o3 in range(0, 20) for o4 in range(1, 60)] - random.shuffle(ips) - return ips[:n] - -# Datacenter / scanner IPs 
— Tor/Contabo/Reg.ru → asn_label='datacenter'/'hosting' -def _scanner_ips(n: int) -> list: - ips = ( - [f"185.220.101.{i}" for i in range(1, 101)] # ASN 210644 datacenter - + [f"45.155.205.{i}" for i in range(1, 51)] # ASN 209083 datacenter - + [f"193.32.162.{i}" for i in range(1, 31)] # ASN 197695 hosting - ) - random.shuffle(ips) - return ips[:n] - -# Known bot IPs (subset also in bot_ip.csv → directly labeled) -BOT_IP_KNOWN = [ - "185.220.101.34", "185.220.101.47", "185.220.101.52", - "185.220.101.73", "185.220.101.91", - "45.155.205.233", "45.155.205.220", "45.155.205.205", - "193.32.162.10", "193.32.162.11", -] - -# --------------------------------------------------------------------------- -# User-Agent pools per profile -# --------------------------------------------------------------------------- -HUMAN_UA = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0", - "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", - "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1", -] - -SCANNER_UA = [ - "curl/7.88.1", - "python-requests/2.31.0", - "Masscan/1.3", - "zgrab/0.x", - "Go-http-client/1.1", - "libwww-perl/6.72", - "Java/11.0.18", - "Wget/1.21.3", - "masscan/1.3 (https://github.com/robertdavidgraham/masscan)", - "-", # No 
User-Agent (raw scanner) -] - -BOT_CRAWLER_UA = [ - "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)", - "Twitterbot/1.0", - "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)", - "Googlebot/2.1 (+http://www.google.com/bot.html)", -] - -# --------------------------------------------------------------------------- -# Path pools per profile -# --------------------------------------------------------------------------- -HUMAN_PATHS = [ +PAGE_PATHS = [ "/", "/index.html", "/about", "/contact", "/products", "/services", - "/blog", "/blog/post-1", "/blog/post-2", "/faq", "/pricing", - "/login", "/register", "/profile", "/dashboard", + "/blog", "/blog/post-1", "/blog/post-2", "/blog/post-3", "/faq", + "/pricing", "/login", "/register", "/profile", "/dashboard", + "/docs", "/docs/getting-started", "/docs/api-reference", + "/help", "/terms", "/privacy", "/search", +] + +ASSET_PATHS = [ + "/static/js/app.js", "/static/js/vendor.js", "/static/js/analytics.js", + "/static/css/main.css", "/static/css/theme.css", + "/images/logo.png", "/images/hero.webp", "/images/banner.jpg", + "/favicon.ico", "/fonts/inter-400.woff2", "/fonts/inter-700.woff2", +] + +API_PATHS = [ "/api/v1/users", "/api/v1/status", "/api/v2/metrics", - "/static/js/app.js", "/static/css/main.css", "/images/logo.png", - "/favicon.ico", "/robots.txt", "/sitemap.xml", - "/health", "/search?q=test", "/search?q=product+review", + "/api/v1/products", "/api/v1/search", "/api/v2/config", ] ATTACK_PATHS = [ @@ -137,109 +242,393 @@ ATTACK_PATHS = [ "/phpmyadmin/", "/phpMyAdmin/", "/pma/", "/admin", "/admin/login", "/administrator/", "/cgi-bin/test.cgi", "/cgi-bin/../etc/passwd", - "/download?file=../../../etc/passwd", "/download?file=../../../../etc/shadow", + "/download?file=../../../etc/passwd", + 
"/download?file=../../../../etc/shadow", "/api/search?q=", "/api/users?id=1+OR+1%3D1", "/shell.php", "/cmd.php", "/eval.php", "/.aws/credentials", "/.ssh/id_rsa", "/etc/passwd", "/proc/self/environ", + "/actuator", "/actuator/env", "/actuator/health", + "/server-status", "/.svn/entries", + "/wp-content/uploads/", "/backup.zip", "/db.sql", + "/api/v1/../admin", "/api/debug", + "/.htaccess", "/.htpasswd", + "/console", "/debug/pprof/", ] +SCRAPER_PATHS = ( + [f"/products/page/{i}" for i in range(1, 51)] + + [ + f"/category/{c}/page/{i}" + for c in ["electronics", "clothing", "books", "home", "sports"] + for i in range(1, 11) + ] +) + BOT_PATHS = [ "/robots.txt", "/sitemap.xml", "/", "/index.html", "/sitemap_index.xml", "/news-sitemap.xml", "/feed", "/rss.xml", "/atom.xml", ] +CONTENT_PATHS = PAGE_PATHS + [f"/blog/post-{i}" for i in range(1, 21)] + [ + f"/products/{s}" + for s in ["widget-a", "widget-b", "gadget-x", "tool-pro", "kit-basic"] +] + +# --------------------------------------------------------------------------- +# Scanner / bot User-Agents +# --------------------------------------------------------------------------- + +SCANNER_UAS = [ + "curl/7.88.1", + "curl/8.1.2", + "python-requests/2.31.0", + "python-requests/2.28.1", + "python-urllib3/2.0.4", + "Masscan/1.3", + "masscan/1.3 (https://github.com/robertdavidgraham/masscan)", + "zgrab/0.x", + "Go-http-client/1.1", + "Go-http-client/2.0", + "libwww-perl/6.72", + "Java/11.0.18", + "Java/17.0.2", + "Wget/1.21.3", + "Scrapy/2.11.0", + "Apache-HttpClient/4.5.14", + "okhttp/4.12.0", + "Node-Fetch/1.0", + "axios/1.6.2", + "-", + "", +] + +HEADLESS_UAS = [ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) HeadlessChrome/119.0.0.0 Safari/537.36", +] + +LEGIT_BOT_UAS = { + "Googlebot": [ + "Mozilla/5.0 (compatible; Googlebot/2.1; " + 
"+http://www.google.com/bot.html)", + "Googlebot/2.1 (+http://www.google.com/bot.html)", + "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.6099.71 Mobile Safari/537.36 " + "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + ], + "Bingbot": [ + "Mozilla/5.0 (compatible; bingbot/2.0; " + "+http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; " + "compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) " + "Chrome/116.0.1938.76 Safari/537.36", + ], + "DuckDuckBot": [ + "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)", + ], + "Applebot": [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 " + "Safari/605.1.15 (Applebot/0.1; " + "+http://www.apple.com/go/applebot)", + ], + "YandexBot": [ + "Mozilla/5.0 (compatible; YandexBot/3.0; " + "+http://yandex.com/bots)", + ], + "Twitterbot": [ + "Twitterbot/1.0", + ], + "FacebookBot": [ + "facebookexternalhit/1.1 " + "(+http://www.facebook.com/externalhit_uatext.php)", + ], +} + +AI_BOT_UAS = { + "GPTBot": [ + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; " + "compatible; GPTBot/1.0; +https://openai.com/gptbot)", + ], + "PerplexityBot": [ + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; " + "compatible; PerplexityBot/1.0; " + "+https://docs.perplexity.ai/docs/perplexity-bot)", + ], + "ClaudeBot": [ + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; " + "compatible; ClaudeBot/1.0; " + "+https://www.anthropic.com/claude-bot)", + ], + "CCBot": [ + "CCBot/2.0 (https://commoncrawl.org/faq/)", + ], + "Bytespider": [ + "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 " + "(KHTML, like Gecko) Mobile Safari/537.36 " + "(compatible; Bytespider; spider-feedback@bytedance.com)", + ], +} + # --------------------------------------------------------------------------- # TCP / TLS metadata helpers # 
--------------------------------------------------------------------------- -# Realistic TCP options fingerprints per OS TCP_OPTIONS = { - "linux": "020405b40402080affffffff000000000103030a", # MSS+NOP+SACK+TS+WS=10 - "windows": "020405b40103030801010402", # MSS+NOP+WS+SACK - "macos": "020405ac0103030601010402", # MSS+NOP+WS+SACK (macOS) - "scanner": "0204ffff", # Scanner: only MSS, max value - "minimal": "0204ffd7", # Minimal + "linux": "020405b40402080affffffff000000000103030a", + "windows": "020405b40103030801010402", + "macos": "020405ac0103030601010402", + "android": "020405b40402080affffffff000000000103030a", + "scanner": "0204ffff", + "minimal": "0204ffd7", } -def _tcp_meta(profile: str = "linux") -> dict: - profiles = { - "linux": {"window_size": 65535, "mss": 1460, "wscale": 10, "ttl": 64, "df": 1}, - "windows": {"window_size": 64240, "mss": 1460, "wscale": 8, "ttl": 128, "df": 1}, - "macos": {"window_size": 65535, "mss": 1460, "wscale": 6, "ttl": 64, "df": 1}, - "android": {"window_size": 65535, "mss": 1420, "wscale": 9, "ttl": 64, "df": 1}, - "scanner": {"window_size": 1024, "mss": 1460, "wscale": 0, "ttl": 48, "df": 0}, - "minimal": {"window_size": 512, "mss": 576, "wscale": 0, "ttl": 60, "df": 0}, - } - meta = profiles.get(profile, profiles["linux"]) +TCP_PROFILES = { + "linux": {"window_size": 65535, "mss": 1460, "wscale": 10, + "ttl": 64, "df": 1}, + "windows": {"window_size": 64240, "mss": 1460, "wscale": 8, + "ttl": 128, "df": 1}, + "macos": {"window_size": 65535, "mss": 1460, "wscale": 6, + "ttl": 64, "df": 1}, + "android": {"window_size": 65535, "mss": 1420, "wscale": 9, + "ttl": 64, "df": 1}, + "scanner": {"window_size": 1024, "mss": 1460, "wscale": 0, + "ttl": 48, "df": 0}, + "minimal": {"window_size": 512, "mss": 576, "wscale": 0, + "ttl": 60, "df": 0}, +} + + +def _tcp_meta(profile, rng): + meta = TCP_PROFILES.get(profile, TCP_PROFILES["linux"]) return { - "tcp_meta_window_size": meta["window_size"] + random.randint(-100, 100), - 
"tcp_meta_mss": meta["mss"], + "tcp_meta_window_size": meta["window_size"] + rng.randint(-100, 100), + "tcp_meta_mss": meta["mss"], "tcp_meta_window_scale": meta["wscale"], - "tcp_meta_options": TCP_OPTIONS.get(profile, TCP_OPTIONS["linux"]), - "ip_meta_ttl": meta["ttl"] - random.randint(0, 5), - "ip_meta_df": meta["df"], - "ip_meta_id": random.randint(1, 65535), - "ip_meta_total_length": random.randint(1200, 1500), + "tcp_meta_options": TCP_OPTIONS.get(profile, TCP_OPTIONS["linux"]), + "ip_meta_ttl": meta["ttl"] - rng.randint(0, 5), + "ip_meta_df": meta["df"], + "ip_meta_id": rng.randint(1, 65535), + "ip_meta_total_length": rng.randint(1200, 1500), } -def _syn_ms(profile: str) -> int: +def _syn_ms(profile, rng): """Realistic SYN→ClientHello latency in milliseconds.""" if profile == "scanner": - return random.randint(0, 3) # Scanners: near-instant - if profile in ("minimal",): - return random.randint(1, 5) - return random.randint(10, 120) # Humans: network RTT + return rng.randint(0, 3) + if profile == "minimal": + return rng.randint(1, 5) + return rng.randint(10, 120) -def _ja3_for_ja4(ja4: str) -> tuple: - """Generate a plausible JA3 string and its MD5 hash matching the JA4 profile.""" - # These are fake but consistent — just need to be non-empty strings - if "tls13" in ja4 or ja4.startswith("t13"): - raw = "771,4866-4867-4865-49196-49200-52393-52392,0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0" +def _ja3_for_ja4(ja4): + """Generate a plausible JA3 string and its MD5 hash.""" + if ja4.startswith("t13"): + raw = ("771,4866-4867-4865-49196-49200-52393-52392," + "0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0") elif ja4.startswith("t12"): - raw = "771,49195-49199-49196-49200-52393-52392,0-23-65281-10-11-35-16-5-13,29-23-24,0" + raw = ("771,49195-49199-49196-49200-52393-52392," + "0-23-65281-10-11-35-16-5-13,29-23-24,0") elif ja4.startswith("t10"): raw = "769,49161-49162-49171-49172,0-10-11,29-23-24,0" else: raw = 
"771,4866-4867-4865,0-23-65281,29-23-24,0" - md5 = hashlib.md5(raw.encode()).hexdigest() - return raw, md5 + return raw, hashlib.md5(raw.encode()).hexdigest() # --------------------------------------------------------------------------- -# Row generators +# CSV loading with fallback # --------------------------------------------------------------------------- -def _now_minus(seconds: int) -> str: - """ISO-8601 UTC timestamp N seconds in the past.""" - t = datetime.now(timezone.utc) - timedelta(seconds=seconds) +_DATA_SEARCH_PATHS = [ + "/app/data", + os.path.join(os.path.dirname(os.path.abspath(__file__)), + "..", "..", "..", "scripts", "data"), + os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"), +] + + +def _find_data_dir(explicit=None): + if explicit and os.path.isdir(explicit): + return explicit + for p in _DATA_SEARCH_PATHS: + real = os.path.realpath(p) + if os.path.isdir(real) and os.path.isfile( + os.path.join(real, "browser_ja4.csv")): + return real + return None + + +def load_browser_ja4s(data_dir): + """Load browser_ja4.csv → {family: [ja4_hash, …]} for TLS only.""" + result = {} + if not data_dir: + return result + path = os.path.join(data_dir, "browser_ja4.csv") + if not os.path.isfile(path): + return result + with open(path, newline="", encoding="utf-8") as f: + for row in csv.reader(f): + if len(row) < 2: + continue + ja4, family = row[0].strip(), row[1].strip() + if ja4.startswith("t13") or ja4.startswith("t12"): + result.setdefault(family, []).append(ja4) + return result + + +def load_bot_ips(data_dir): + """Load bot_ip.csv → {family: [cidr_str, …]}.""" + result = {} + if not data_dir: + return result + path = os.path.join(data_dir, "bot_ip.csv") + if not os.path.isfile(path): + return result + with open(path, newline="", encoding="utf-8") as f: + for row in csv.reader(f): + if len(row) < 2: + continue + result.setdefault(row[1].strip(), []).append(row[0].strip()) + return result + + +# 
--------------------------------------------------------------------------- +# IP generation helpers +# --------------------------------------------------------------------------- + +def _gen_ips_from_prefixes(prefixes, n, rng): + """Generate *n* unique IPs from /24 prefixes.""" + ips = set() + attempts = 0 + while len(ips) < n and attempts < n * 5: + prefix = rng.choice(prefixes) + octet = rng.randint(1, 254) + ips.add(f"{prefix}.{octet}") + attempts += 1 + return list(ips) + + +def _ips_from_cidrs(cidrs, n, rng): + """Generate *n* unique IPs from a list of CIDR strings.""" + networks = [] + single_ips = [] + for c in cidrs: + try: + net = ipaddress.ip_network(c, strict=False) + if net.prefixlen == 32: + single_ips.append(str(net.network_address)) + elif net.num_addresses > 2: + networks.append(net) + except ValueError: + continue + # Start with any /32 single IPs + ips = set(single_ips) + if not networks: + return list(ips)[:n] + weights = [net.num_addresses for net in networks] + attempts = 0 + while len(ips) < n and attempts < n * 10: + net = rng.choices(networks, weights=weights, k=1)[0] + host_offset = rng.randint(1, max(1, net.num_addresses - 2)) + ips.add(str(net.network_address + host_offset)) + attempts += 1 + return list(ips)[:n] + + +def _generate_bot_ja4s(browser_ja4_set, rng, n=20): + """Generate synthetic JA4 hashes NOT in the browser CSV.""" + prefixes = [ + "t13d0305", "t13d0203", "t12d0507", "t10d0100", "t13d0101", + "t12d0302", "t13d0405", "t12d0204", "t10d0200", "t13d0102", + ] + bot_ja4s = [] + for i in range(n): + p = prefixes[i % len(prefixes)] + seg1 = f"{rng.randint(0, 0xFFFFFFFFFFFF):012x}" + seg2 = f"{rng.randint(0, 0xFFFFFFFFFFFF):012x}" + suffix = "h1" if rng.random() < 0.7 else "h2" + ja4 = f"{p}{suffix}_{seg1}_{seg2}" + if ja4 not in browser_ja4_set: + bot_ja4s.append(ja4) + if not bot_ja4s: + bot_ja4s = [ + "t13d030500_ffd59bab1b39_6e7f7df63e98", + "t13d020300_6b9b1b2c3d4e_ffd59bab1b39", + "t10d170000_0a1b2c3d4e5f_1b2c3d4e5f60", + 
"t12d050700_5a6b7c8d9e0f_1a2b3c4d5e6f", + "t13d010100_aabbccddeeff_0011223344aa", + ] + return bot_ja4s + + +# --------------------------------------------------------------------------- +# Request distribution +# --------------------------------------------------------------------------- + +def _distribute_requests(n_ips, total_rows, min_req, max_req, rng): + """Distribute *total_rows* across *n_ips*, each in [min_req, max_req].""" + if n_ips == 0: + return [] + counts = [] + remaining = total_rows + for i in range(n_ips): + left = n_ips - i + if i == n_ips - 1: + counts.append(max(min_req, min(max_req, remaining))) + break + lo = max(min_req, remaining - (left - 1) * max_req) + hi = min(max_req, remaining - (left - 1) * min_req) + if lo > hi: + lo = hi = max(min_req, min(max_req, remaining // left)) + counts.append(rng.randint(lo, hi)) + remaining -= counts[-1] + rng.shuffle(counts) + return counts + + +# --------------------------------------------------------------------------- +# Timestamp helper +# --------------------------------------------------------------------------- + +_BASE_TIME = None + + +def _now_minus(seconds): + """ISO-8601 UTC timestamp *seconds* in the past.""" + global _BASE_TIME + if _BASE_TIME is None: + _BASE_TIME = datetime.now(timezone.utc) + t = _BASE_TIME - timedelta(seconds=seconds) return t.strftime("%Y-%m-%dT%H:%M:%SZ") +# --------------------------------------------------------------------------- +# Row builder — identical field set to original +# --------------------------------------------------------------------------- + def _make_row( - src_ip: str, - ua: str, - path: str, - method: str = "GET", - ja4: str = None, - tcp_profile: str = "linux", - scheme: str = "https", - host: str = "platform", - time_offset_s: int = None, - extra_headers: dict = None, -) -> dict: + src_ip, ua, path, method="GET", ja4=None, tcp_profile="linux", + scheme="https", host="platform", time_offset_s=None, + extra_headers=None, rng=None, +): """Build a 
single raw_json dict matching what the correlator produces.""" + if rng is None: + rng = random if time_offset_s is None: - time_offset_s = random.randint(0, 1700) # spread over last ~28 min + time_offset_s = rng.randint(0, 1700) if ja4 is None: - ja4 = random.choice(HUMAN_JA4S) + ja4 = "t13d1917h2_b0372614b25a_6a77dcf5a8be" ja3_raw, ja3_hash = _ja3_for_ja4(ja4) - tcp = _tcp_meta(tcp_profile) - syn_ms = _syn_ms(tcp_profile) + tcp = _tcp_meta(tcp_profile, rng) + syn_ms = _syn_ms(tcp_profile, rng) client_headers = "Host,User-Agent,Accept,Accept-Language,Accept-Encoding" if extra_headers: @@ -248,7 +637,7 @@ def _make_row( row = { "time": _now_minus(time_offset_s), "src_ip": src_ip, - "src_port": random.randint(1024, 65535), + "src_port": rng.randint(1024, 65535), "dst_ip": "172.20.0.2", "dst_port": 443 if scheme == "https" else 80, "method": method, @@ -256,15 +645,18 @@ def _make_row( "host": host, "path": path.split("?")[0] if "?" in path else path, "query": path.split("?")[1] if "?" in path else "", - "http_version": "HTTP/2.0" if ja4.endswith("h2") else "HTTP/1.1", + "http_version": "HTTP/2.0" if "h2" in ja4 else "HTTP/1.1", "orphan_side": "", "correlated": True, - "keepalives": random.randint(1, 8), + "keepalives": rng.randint(1, 8), "a_timestamp": int(time.time() * 1_000_000), "b_timestamp": int(time.time() * 1_000_000) + syn_ms * 1000, - "conn_id": f"seed_{src_ip.replace('.', '_')}_{random.randint(1000,9999)}", + "conn_id": f"seed_{src_ip.replace('.', '_')}" + f"_{rng.randint(1000, 9999)}", "syn_to_clienthello_ms": syn_ms, - "tls_version": "1.3" if ja4.startswith("t13") else ("1.2" if ja4.startswith("t12") else "1.0"), + "tls_version": ("1.3" if ja4.startswith("t13") + else "1.2" if ja4.startswith("t12") + else "1.0"), "tls_sni": host, "tls_alpn": "h2" if "h2" in ja4 else "http/1.1", "ja3": ja3_raw, @@ -274,14 +666,14 @@ def _make_row( "header_User-Agent": ua, "header_Accept": "text/html,application/xhtml+xml,*/*;q=0.8", "header_Accept-Encoding": "gzip, 
deflate, br", - "header_Accept-Language": random.choice(["fr-FR,fr;q=0.9", "en-US,en;q=0.9", "de-DE,de;q=0.8"]), + "header_Accept-Language": "", "header_Content-Type": "", "header_X-Request-Id": "", "header_X-Trace-Id": "", "header_X-Forwarded-For": "", - "header_Sec-Fetch-Site": "none" if tcp_profile != "scanner" else "", - "header_Sec-Fetch-Mode": "navigate" if tcp_profile != "scanner" else "", - "header_Sec-Fetch-Dest": "document" if tcp_profile != "scanner" else "", + "header_Sec-Fetch-Site": "", + "header_Sec-Fetch-Mode": "", + "header_Sec-Fetch-Dest": "", "header_Sec-CH-UA": "", "header_Sec-CH-UA-Mobile": "", "header_Sec-CH-UA-Platform": "", @@ -292,95 +684,341 @@ def _make_row( return row -def generate_human_sessions(n: int = 720) -> list: - """Generate realistic human browsing sessions. +# --------------------------------------------------------------------------- +# Traffic generators +# --------------------------------------------------------------------------- - Each IP gets 1–3 requests spread across different paths. - Distinct (src_ip, ja4, host) → distinct rows in agg_host_ip_ja4_1h. - We need ≥ 500 human rows for the bot_detector baseline. +def generate_browser_traffic(n_ips, total_rows, browser_ja4s, rng): + """Generate legitimate browser sessions with realistic navigation. + + Each IP gets a consistent browser profile (family, JA4, UA, TCP) and + produces page navigations + asset/API requests with Referer chains, + cookies, Sec-Fetch headers, and Sec-CH-UA for Chromium browsers. 
""" - ips = _human_ips(n) rows = [] - for ip in ips: - # 1–3 requests per IP with the same JA4 (browser stays consistent) - ja4 = random.choice(HUMAN_JA4S) - ua = random.choice(HUMAN_UA) - tcp = random.choice(["linux", "windows", "macos", "android"]) - n_req = random.randint(1, 3) + ips = _gen_ips_from_prefixes(ISP_PREFIXES, n_ips, rng) + counts = _distribute_requests(len(ips), total_rows, 5, 50, rng) + + families = (list(browser_ja4s.keys()) + if browser_ja4s else list(BROWSER_UAS.keys())) + family_weights = [BROWSER_WEIGHTS.get(f, 1) for f in families] + + platform_map = { + "windows": '"Windows"', "linux": '"Linux"', + "macos": '"macOS"', "android": '"Android"', + } + + for ip, n_req in zip(ips, counts): + family = rng.choices(families, weights=family_weights, k=1)[0] + ja4_list = browser_ja4s.get(family, []) + if not ja4_list: + for fb in ("Chromium", "Firefox", "Safari"): + ja4_list = browser_ja4s.get(fb, []) + if ja4_list: + break + if not ja4_list: + ja4_list = ["t13d1917h2_b0372614b25a_6a77dcf5a8be"] + ja4 = rng.choice(ja4_list) + ua = rng.choice(BROWSER_UAS.get(family, BROWSER_UAS["Chromium"])) + tcp = rng.choice(BROWSER_TCP.get(family, ["linux"])) + host = rng.choice(HOSTS) + lang = rng.choice(ACCEPT_LANGUAGES) + is_chromium = family in CHROMIUM_FAMILIES + session_cookie = f"sid={rng.randint(100000, 999999)}" + ch_ua = SEC_CH_UA.get(family, "") if is_chromium else "" + ch_mobile = "?0" if is_chromium else "" + ch_plat = platform_map.get(tcp, "") if is_chromium else "" + + base_offset = rng.randint(60, 1700) + current_page = "" + + for req_i in range(n_req): + offset = max(1, base_offset - req_i * rng.randint(1, 5)) + + if req_i == 0 or rng.random() < 0.25: + # Navigate to a new page + path = rng.choice(PAGE_PATHS) + method = "GET" + referer = ("" if req_i == 0 + else f"https://{host}{current_page}") + sec_site = "none" if req_i == 0 else "same-origin" + sec_mode = "navigate" + sec_dest = "document" + current_page = path + elif rng.random() < 0.5: + # 
Asset request + path = rng.choice(ASSET_PATHS) + method = "GET" + referer = (f"https://{host}{current_page}" + if current_page else "") + sec_site = "same-origin" + sec_mode = "no-cors" + sec_dest = rng.choice(["script", "style", "image", "font"]) + elif rng.random() < 0.4: + # API call + path = rng.choice(API_PATHS) + method = rng.choice(["GET", "POST"]) + referer = (f"https://{host}{current_page}" + if current_page else "") + sec_site = "same-origin" + sec_mode = "cors" + sec_dest = "empty" + else: + # Another page navigation + path = rng.choice(PAGE_PATHS) + method = "GET" + referer = (f"https://{host}{current_page}" + if current_page else "") + sec_site = "same-origin" + sec_mode = "navigate" + sec_dest = "document" + current_page = path + + row = _make_row( + src_ip=ip, ua=ua, path=path, method=method, + ja4=ja4, tcp_profile=tcp, scheme="https", host=host, + time_offset_s=offset, rng=rng, + ) + row["header_Accept-Language"] = lang + row["header_Sec-Fetch-Site"] = sec_site + row["header_Sec-Fetch-Mode"] = sec_mode + row["header_Sec-Fetch-Dest"] = sec_dest + row["header_Sec-CH-UA"] = ch_ua + row["header_Sec-CH-UA-Mobile"] = ch_mobile + row["header_Sec-CH-UA-Platform"] = ch_plat + if referer: + row["header_Referer"] = referer + if "Referer" not in row["client_headers"]: + row["client_headers"] += ",Referer" + if req_i > 0: + row["header_Cookie"] = session_cookie + if "Cookie" not in row["client_headers"]: + row["client_headers"] += ",Cookie" + rows.append(row) + + return rows + + +def generate_scanner_traffic(n_ips, total_rows, bot_ja4s, rng): + """Generate malicious bot/scanner traffic in four sub-categories. + + a. Vulnerability scanners (30%): diverse attack paths + b. Credential stuffers (20%): POST /login brute-force + c. Content scrapers (30%): methodical path crawling + d. 
DDoS-like (20%): same path hammered 100+ times + """ + rows = [] + ips = _gen_ips_from_prefixes(DC_SCANNER_PREFIXES, n_ips, rng) + + n_vuln = int(n_ips * 0.30) + n_cred = int(n_ips * 0.20) + n_scrap = int(n_ips * 0.30) + n_ddos = n_ips - n_vuln - n_cred - n_scrap + + # Compute row budgets respecting per-IP minimums + min_vuln = n_vuln * 10 + min_cred = n_cred * 15 + min_scrap = n_scrap * 10 + min_ddos = n_ddos * 100 + total_min = min_vuln + min_cred + min_scrap + min_ddos + surplus = max(0, total_rows - total_min) + r_ddos = min_ddos + int(surplus * 0.40) + r_vuln = min_vuln + int(surplus * 0.20) + r_cred = min_cred + int(surplus * 0.15) + r_scrap = total_rows - r_ddos - r_vuln - r_cred + + idx = 0 + + # --- Vulnerability scanners --- + vuln_ips = ips[idx:idx + n_vuln]; idx += n_vuln + counts = _distribute_requests(len(vuln_ips), r_vuln, 10, 60, rng) + for ip, n_req in zip(vuln_ips, counts): + ua = rng.choice(SCANNER_UAS) + ja4 = rng.choice(bot_ja4s) + rotate_ua = rng.random() < 0.3 + for _ in range(n_req): + if rotate_ua: + ua = rng.choice(SCANNER_UAS) + rows.append(_make_row( + src_ip=ip, ua=ua, path=rng.choice(ATTACK_PATHS), + method=rng.choice(["GET", "GET", "HEAD"]), + ja4=ja4, tcp_profile="scanner", scheme="https", + host=rng.choice(HOSTS), rng=rng, + )) + + # Some vuln scanners do port probing (diverse dst_port) + for ip in rng.sample(vuln_ips, min(100, len(vuln_ips))): + for port in rng.sample(range(80, 10000), 5): + row = _make_row( + src_ip=ip, ua="-", path="/", + method="GET", ja4=rng.choice(bot_ja4s), + tcp_profile="scanner", scheme="https", + host=rng.choice(HOSTS), rng=rng, + ) + row["dst_port"] = port + rows.append(row) + + # --- Credential stuffers --- + login_paths = [ + "/login", "/admin/login", "/api/auth/login", + "/wp-login.php", "/user/signin", "/api/v1/auth", + ] + cred_ips = ips[idx:idx + n_cred]; idx += n_cred + counts = _distribute_requests(len(cred_ips), r_cred, 15, 60, rng) + for ip, n_req in zip(cred_ips, counts): + ua = 
rng.choice(SCANNER_UAS + HEADLESS_UAS) + ja4 = rng.choice(bot_ja4s) + target = rng.choice(login_paths) for _ in range(n_req): rows.append(_make_row( - src_ip=ip, ua=ua, - path=random.choice(HUMAN_PATHS), - method=random.choice(["GET", "GET", "GET", "POST"]), - ja4=ja4, tcp_profile=tcp, - scheme=random.choice(["https", "https", "http"]), - )) - return rows - - -def generate_scanner_sessions(n: int = 150) -> list: - """Generate scanner/attack traffic — anomalous patterns for ML detection. - - Characteristics: minimal TCP options, small window, no Sec-Fetch headers, - attack paths, scanner UAs, rapid-fire requests (low syn_ms). - """ - ips = _scanner_ips(n) - rows = [] - for ip in ips: - ja4 = random.choice(BOT_JA4S[:3]) # curl/python/masscan profiles - ua = random.choice(SCANNER_UA) - # Burst: 5–20 requests per IP (simulates scan / brute-force) - n_req = random.randint(5, 20) - for _ in range(n_req): - rows.append(_make_row( - src_ip=ip, ua=ua, - path=random.choice(ATTACK_PATHS + ATTACK_PATHS + HUMAN_PATHS), - method=random.choice(["GET", "GET", "GET", "HEAD", "POST"]), - ja4=ja4, tcp_profile="scanner", - scheme="https", - extra_headers={"Content-Type": ""} if random.random() < 0.3 else None, - )) - return rows - - -def generate_known_bot_sessions(n: int = 100) -> list: - """Generate sessions from IPs listed in bot_ip.csv (direct bot labeling).""" - rows = [] - for _ in range(n): - ip = random.choice(BOT_IP_KNOWN) - ua = random.choice(BOT_CRAWLER_UA + SCANNER_UA) - ja4 = random.choice(BOT_JA4S) - rows.append(_make_row( - src_ip=ip, ua=ua, - path=random.choice(BOT_PATHS + ATTACK_PATHS), - ja4=ja4, tcp_profile="scanner", - scheme="https", - )) - return rows - - -def generate_brute_force_cluster(n_ips: int = 20) -> list: - """Simulate credential stuffing / brute-force from a small set of IPs. - - Same IP → many POST /login requests = high hit count, suspicious pattern. 
- """ - ips = _scanner_ips(n_ips)[:n_ips] - rows = [] - for ip in ips: - ua = random.choice(SCANNER_UA + BOT_CRAWLER_UA) - ja4 = random.choice(BOT_JA4S) - for _ in range(random.randint(20, 50)): - rows.append(_make_row( - src_ip=ip, ua=ua, - path="/login", - method="POST", - ja4=ja4, tcp_profile="scanner", - scheme="https", + src_ip=ip, ua=ua, path=target, method="POST", + ja4=ja4, tcp_profile="scanner", scheme="https", + host=rng.choice(HOSTS[:2]), rng=rng, extra_headers={ "Content-Type": "application/x-www-form-urlencoded", - "Content-Length": "32", + "Content-Length": str(rng.randint(20, 60)), }, )) + + # --- Content scrapers --- + scrap_ips = ips[idx:idx + n_scrap]; idx += n_scrap + counts = _distribute_requests(len(scrap_ips), r_scrap, 10, 60, rng) + for ip, n_req in zip(scrap_ips, counts): + ua = rng.choice(SCANNER_UAS[:8]) + ja4 = rng.choice(bot_ja4s) + for i in range(n_req): + rows.append(_make_row( + src_ip=ip, ua=ua, + path=SCRAPER_PATHS[i % len(SCRAPER_PATHS)], + method="GET", ja4=ja4, tcp_profile="scanner", + scheme="https", host=rng.choice(HOSTS), rng=rng, + )) + + # --- DDoS-like --- + ddos_targets = ["/", "/api/v1/search", "/products", "/api/v2/metrics"] + ddos_ips = ips[idx:idx + n_ddos] + counts = _distribute_requests(len(ddos_ips), r_ddos, 100, 200, rng) + for ip, n_req in zip(ddos_ips, counts): + ua = rng.choice(SCANNER_UAS) + ja4 = rng.choice(bot_ja4s) + target = rng.choice(ddos_targets) + for _ in range(n_req): + rows.append(_make_row( + src_ip=ip, ua=ua, path=target, method="GET", + ja4=ja4, + tcp_profile=rng.choice(["scanner", "minimal"]), + scheme="https", host=HOSTS[0], + time_offset_s=rng.randint(0, 300), rng=rng, + )) + + return rows + + +def generate_legit_bot_traffic(n_ips, total_rows, bot_ips_by_family, + bot_ja4s, rng): + """Generate traffic from known legitimate bots (Googlebot, Bingbot, …). + + IPs are drawn from bot_ip.csv CIDRs when available so they match + the bot_ip dictionary for direct labelling. 
+ """ + rows = [] + legit_families = list(LEGIT_BOT_UAS.keys()) + + all_ips = [] + ip_family_map = {} + per_family = max(1, n_ips // len(legit_families)) + + for family in legit_families: + cidrs = bot_ips_by_family.get(family, []) + if cidrs: + fam_ips = _ips_from_cidrs(cidrs, per_family, rng) + else: + fam_ips = _gen_ips_from_prefixes( + DC_LEGIT_BOT_PREFIXES, per_family, rng) + for ip in fam_ips: + ip_family_map[ip] = family + all_ips.extend(fam_ips) + + if len(all_ips) < n_ips: + extra = _gen_ips_from_prefixes( + DC_LEGIT_BOT_PREFIXES, n_ips - len(all_ips), rng) + for ip in extra: + ip_family_map[ip] = rng.choice(legit_families) + all_ips.extend(extra) + + all_ips = all_ips[:n_ips] + counts = _distribute_requests(len(all_ips), total_rows, 5, 30, rng) + + for ip, n_req in zip(all_ips, counts): + family = ip_family_map.get(ip, rng.choice(legit_families)) + ua = rng.choice( + LEGIT_BOT_UAS.get(family, LEGIT_BOT_UAS["Googlebot"])) + ja4 = rng.choice(bot_ja4s) + + # Bots: robots.txt first, then sitemap, then content + paths = ["/robots.txt"] + if rng.random() < 0.7: + paths.append("/sitemap.xml") + remaining_n = n_req - len(paths) + paths.extend( + rng.choices(CONTENT_PATHS + BOT_PATHS, k=max(0, remaining_n))) + paths = paths[:n_req] + + for path in paths: + rows.append(_make_row( + src_ip=ip, ua=ua, path=path, method="GET", + ja4=ja4, + tcp_profile=rng.choice(["linux", "linux", "scanner"]), + scheme="https", host=rng.choice(HOSTS), rng=rng, + )) + + return rows + + +def generate_ai_bot_traffic(n_ips, total_rows, bot_ips_by_family, + bot_ja4s, rng): + """Generate aggressive AI bot scraping traffic (GPTBot, ClaudeBot, …).""" + rows = [] + ai_families = list(AI_BOT_UAS.keys()) + + all_ips = [] + ip_family_map = {} + per_family = max(1, n_ips // len(ai_families)) + + for family in ai_families: + cidrs = bot_ips_by_family.get(family, []) + if cidrs: + fam_ips = _ips_from_cidrs(cidrs, per_family, rng) + else: + fam_ips = _gen_ips_from_prefixes( + DC_AI_PREFIXES, 
per_family, rng) + for ip in fam_ips: + ip_family_map[ip] = family + all_ips.extend(fam_ips) + + if len(all_ips) < n_ips: + extra = _gen_ips_from_prefixes( + DC_AI_PREFIXES, n_ips - len(all_ips), rng) + for ip in extra: + ip_family_map[ip] = rng.choice(ai_families) + all_ips.extend(extra) + + all_ips = all_ips[:n_ips] + counts = _distribute_requests(len(all_ips), total_rows, 10, 50, rng) + + for ip, n_req in zip(all_ips, counts): + family = ip_family_map.get(ip, rng.choice(ai_families)) + ua = rng.choice(AI_BOT_UAS[family]) + ja4 = rng.choice(bot_ja4s) + + paths = rng.choices(CONTENT_PATHS, k=n_req) + for path in paths: + rows.append(_make_row( + src_ip=ip, ua=ua, path=path, method="GET", + ja4=ja4, tcp_profile="linux", scheme="https", + host=rng.choice(HOSTS), rng=rng, + )) + return rows @@ -388,8 +1026,8 @@ def generate_brute_force_cluster(n_ips: int = 20) -> list: # ClickHouse insert # --------------------------------------------------------------------------- -def _ch_insert(rows: list, host: str, port: int, user: str, password: str, - batch_size: int = 200, dry_run: bool = False) -> int: +def _ch_insert(rows, host, port, user, password, + batch_size=2000, dry_run=False): """Insert rows into ja4_logs.http_logs_raw via ClickHouse HTTP interface. Each row is wrapped as {"raw_json": ""} in JSONEachRow format. 
@@ -399,19 +1037,20 @@ def _ch_insert(rows: list, host: str, port: int, user: str, password: str, print("[seed] Sample row:", json.dumps(rows[0], indent=2)[:400]) return len(rows) + query = "INSERT INTO ja4_logs.http_logs_raw (raw_json) FORMAT JSONEachRow" url = ( f"http://{host}:{port}/" - f"?query={urllib.parse.quote('INSERT INTO ja4_logs.http_logs_raw (raw_json) FORMAT JSONEachRow')}" + f"?query={urllib.parse.quote(query)}" f"&user={urllib.parse.quote(user)}" f"&password={urllib.parse.quote(password)}" ) total_inserted = 0 + n_batches = (len(rows) + batch_size - 1) // batch_size for i in range(0, len(rows), batch_size): batch = rows[i:i + batch_size] body_lines = [] for row in batch: - # raw_json column holds the entire log as a JSON string outer = {"raw_json": json.dumps(row, separators=(",", ":"))} body_lines.append(json.dumps(outer, separators=(",", ":"))) body = "\n".join(body_lines).encode("utf-8") @@ -426,15 +1065,20 @@ def _ch_insert(rows: list, host: str, port: int, user: str, password: str, total_inserted += len(batch) except urllib.error.HTTPError as e: err_body = e.read(500).decode("utf-8", errors="replace") - print(f"[seed] ERROR batch {i}–{i+batch_size}: HTTP {e.code}: {err_body}") + print(f"[seed] ERROR batch {i}–{i+batch_size}: " + f"HTTP {e.code}: {err_body}") except Exception as e: print(f"[seed] ERROR batch {i}–{i+batch_size}: {e}") + if total_inserted % 10000 < batch_size: + batch_num = i // batch_size + 1 + print(f"[seed] Progress: {total_inserted:,}/{len(rows):,} rows " + f"({batch_num}/{n_batches} batches)") + return total_inserted -def _wait_for_clickhouse(host: str, port: int, user: str, password: str, - timeout_s: int = 60) -> bool: +def _wait_for_clickhouse(host, port, user, password, timeout_s=60): """Wait for ClickHouse to be ready.""" url = ( f"http://{host}:{port}/" @@ -459,51 +1103,121 @@ def _wait_for_clickhouse(host: str, port: int, user: str, password: str, # 
--------------------------------------------------------------------------- def main(): - parser = argparse.ArgumentParser(description="Seed ClickHouse with synthetic traffic") + parser = argparse.ArgumentParser( + description="Seed ClickHouse with synthetic traffic") parser.add_argument("--host", default="clickhouse") parser.add_argument("--port", type=int, default=8123) parser.add_argument("--user", default="default") parser.add_argument("--password", default="") parser.add_argument("--dry-run", action="store_true", help="Generate data but do not insert") + parser.add_argument("--rows", type=int, default=500_000, + help="Total rows to generate (default: 500000)") + parser.add_argument("--ips", type=int, default=20_000, + help="Total unique IPs (default: 20000)") + parser.add_argument("--seed", type=int, default=None, + help="Random seed for reproducibility") + parser.add_argument("--data-dir", default=None, + help="Path to CSV data directory " + "(browser_ja4.csv, bot_ip.csv)") args = parser.parse_args() + rng = random.Random(args.seed) + if not args.dry_run: print(f"[seed] Waiting for ClickHouse at {args.host}:{args.port}…") - if not _wait_for_clickhouse(args.host, args.port, args.user, args.password): + if not _wait_for_clickhouse(args.host, args.port, + args.user, args.password): print("[seed] FATAL: ClickHouse not reachable after 60s") raise SystemExit(1) print("[seed] ClickHouse ready.") t0 = time.monotonic() - # Generate all row sets - print("[seed] Generating rows…") - human_rows = generate_human_sessions(720) # ≥ 500 unique (ip,ja4,host) human sessions - scanner_rows = generate_scanner_sessions(150) # anomalous datacenter traffic - known_bot = generate_known_bot_sessions(100) # directly labeled by bot_ip.csv - brute_force = generate_brute_force_cluster(20) # credential stuffing pattern + # --- Load CSV data --- + data_dir = _find_data_dir(args.data_dir) + if data_dir: + print(f"[seed] Loading CSV data from {data_dir}") + else: + print("[seed] WARNING: CSV 
data directory not found, " + "using fallback data") - all_rows = human_rows + scanner_rows + known_bot + brute_force - random.shuffle(all_rows) + browser_ja4s = load_browser_ja4s(data_dir) + bot_ips_by_family = load_bot_ips(data_dir) - print(f"[seed] Total rows to insert: {len(all_rows)}") - print(f" • {len(human_rows):<5} human sessions " - f"(~{len(set(r['src_ip'] for r in human_rows))} unique IPs)") - print(f" • {len(scanner_rows):<5} scanner/anomaly sessions") - print(f" • {len(known_bot):<5} known-bot sessions") - print(f" • {len(brute_force):<5} brute-force rows") + browser_ja4_set = set() + for ja4_list in browser_ja4s.values(): + browser_ja4_set.update(ja4_list) + if browser_ja4s: + total_ja4 = sum(len(v) for v in browser_ja4s.values()) + print(f"[seed] Loaded {total_ja4} browser JA4s " + f"across {len(browser_ja4s)} families") + if bot_ips_by_family: + total_cidrs = sum(len(v) for v in bot_ips_by_family.values()) + print(f"[seed] Loaded {total_cidrs} bot CIDRs " + f"across {len(bot_ips_by_family)} families") + + bot_ja4s = _generate_bot_ja4s(browser_ja4_set, rng) + + # --- IP and row budgets --- + n_browser_ips = int(args.ips * 0.70) + n_scanner_ips = int(args.ips * 0.15) + n_legit_bot_ips = int(args.ips * 0.10) + n_ai_bot_ips = (args.ips - n_browser_ips + - n_scanner_ips - n_legit_bot_ips) + + n_browser_rows = int(args.rows * 0.70) + n_scanner_rows = int(args.rows * 0.20) + n_legit_bot_rows = int(args.rows * 0.06) + n_ai_bot_rows = (args.rows - n_browser_rows + - n_scanner_rows - n_legit_bot_rows) + + print(f"[seed] Generating {args.rows:,} rows from {args.ips:,} IPs…") + + # --- Generate --- + browser_rows = generate_browser_traffic( + n_browser_ips, n_browser_rows, browser_ja4s, rng) + print(f"[seed] Browser: {len(browser_rows):>7,} rows " + f"({len(set(r['src_ip'] for r in browser_rows)):,} IPs)") + + scanner_rows = generate_scanner_traffic( + n_scanner_ips, n_scanner_rows, bot_ja4s, rng) + print(f"[seed] Scanner: {len(scanner_rows):>7,} rows " + 
f"({len(set(r['src_ip'] for r in scanner_rows)):,} IPs)") + + legit_bot_rows = generate_legit_bot_traffic( + n_legit_bot_ips, n_legit_bot_rows, + bot_ips_by_family, bot_ja4s, rng) + print(f"[seed] Legit bots: {len(legit_bot_rows):>7,} rows " + f"({len(set(r['src_ip'] for r in legit_bot_rows)):,} IPs)") + + ai_bot_rows = generate_ai_bot_traffic( + n_ai_bot_ips, n_ai_bot_rows, + bot_ips_by_family, bot_ja4s, rng) + print(f"[seed] AI bots: {len(ai_bot_rows):>7,} rows " + f"({len(set(r['src_ip'] for r in ai_bot_rows)):,} IPs)") + + all_rows = browser_rows + scanner_rows + legit_bot_rows + ai_bot_rows + rng.shuffle(all_rows) + + gen_elapsed = time.monotonic() - t0 + print(f"[seed] Generated {len(all_rows):,} total rows " + f"in {gen_elapsed:.1f}s") + + # --- Insert --- inserted = _ch_insert( all_rows, args.host, args.port, args.user, args.password, - dry_run=args.dry_run, + batch_size=2000, dry_run=args.dry_run, ) elapsed = time.monotonic() - t0 - print(f"[seed] Done: {inserted}/{len(all_rows)} rows inserted in {elapsed:.1f}s") + print(f"[seed] Done: {inserted:,}/{len(all_rows):,} rows inserted " + f"in {elapsed:.1f}s") if inserted < len(all_rows) * 0.9: - print("[seed] WARNING: fewer than 90% of rows inserted — check errors above") + print("[seed] WARNING: fewer than 90% of rows inserted — " + "check errors above") raise SystemExit(1) print(f"[seed] The bot_detector should now see ≥ 500 human sessions "