- 350K browser rows (14K IPs) using real JA4s from browser_ja4.csv - 100K scanner rows (3K IPs) with vuln/cred/scraper/DDoS sub-categories - 30K legit bot rows (2K IPs) from real bot_ip.csv CIDRs - 20K AI bot rows (1K IPs) for GPTBot, ClaudeBot, etc. Key improvements: - Load browser_ja4.csv at startup, match JA4 to browser family - Load bot_ip.csv to generate IPs from real Googlebot/Bingbot CIDRs - Hard-coded ISP /24 prefixes from real ASNs (Comcast, Orange, DT, etc.) - Realistic navigation patterns with Referer chains and cookies - Sec-CH-UA headers for Chromium browsers (modern_browser_score >= 50) - Batch size increased to 2000, progress reporting every 10K rows - New CLI args: --rows, --ips, --seed, --data-dir - Bot JA4s are synthetic hashes guaranteed NOT in browser_ja4.csv Also updated: - Dockerfile: COPY *.py (was missing seed_clickhouse.py) - docker-compose.yml: mount scripts/data as /app/data for CSV access - run-tests.sh: updated seeder description comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1229 lines
45 KiB
Python
1229 lines
45 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
seed_clickhouse.py — Bootstrap ClickHouse with realistic synthetic traffic data.
|
||
|
||
Inserts directly into ja4_logs.http_logs_raw (triggers all MVs automatically):
|
||
• ~350,000 rows from 14,000 legitimate browser IPs (ISP ranges, asn_label='isp')
|
||
• ~100,000 rows from 3,000 malicious bot/scanner IPs (datacenter ranges)
|
||
• ~30,000 rows from 2,000 legitimate bot IPs (from bot_ip.csv CIDRs)
|
||
• ~20,000 rows from 1,000 AI bot IPs (datacenter ranges)
|
||
|
||
Total: 500,000 rows from 20,000 unique IPs (configurable via --rows / --ips).
|
||
|
||
Browser JA4 fingerprints are loaded from browser_ja4.csv so they match
|
||
dict_browser_ja4 (LEGITIMATE_BROWSER classification). Bot/scanner JA4s are
|
||
synthetic hashes guaranteed NOT to appear in that dictionary.
|
||
|
||
Bot IPs are drawn from real CIDRs in bot_ip.csv (Googlebot, Bingbot, etc.).
|
||
ISP and datacenter IPs use hard-coded /24 prefixes from well-known ASNs that
|
||
resolve correctly through iplocate-ip-to-asn.csv → asn_reputation.csv.
|
||
|
||
This ensures view_ai_features_1h has ≥ 500 human rows for the bot_detector
|
||
training threshold (run_semi_supervised_logic requires len(human_baseline) >= 500).
|
||
|
||
All timestamps are within the last 30 minutes so the 24h window filter catches them.
|
||
|
||
No external dependencies — uses Python stdlib only.
|
||
|
||
Usage:
|
||
python seed_clickhouse.py
|
||
python seed_clickhouse.py --host clickhouse --port 8123 --user default --password ""
|
||
python seed_clickhouse.py --rows 500000 --ips 20000 --seed 42
|
||
python seed_clickhouse.py --dry-run
|
||
"""
|
||
|
||
import argparse
|
||
import csv
|
||
import hashlib
|
||
import ipaddress
|
||
import json
|
||
import os
|
||
import random
|
||
import time
|
||
import urllib.error
|
||
import urllib.parse
|
||
import urllib.request
|
||
from datetime import datetime, timedelta, timezone
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Hard-coded /24 prefixes — guaranteed to resolve via ASN dictionaries
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# ISP ranges (asn_label='isp' in asn_reputation.csv)
|
||
# Each entry is the first three octets of a /24; the host octet is appended
# later by the IP generator.  ASN/CIDR attributions are the author's.
ISP_PREFIXES = [
    # Comcast AS7922 — within 24.0.0.0/12
    *(f"24.{second}.{third}" for second in range(1, 11) for third in range(3)),
    # Orange AS3215 — within 2.3.0.0/16
    *(f"2.3.{third}" for third in range(10)),
    # Deutsche Telekom AS3320 — within 2.160.0.0/12
    *(f"2.{second}.{third}" for second in range(160, 165) for third in range(2)),
    # AT&T AS7018 — within 12.0.0.0/10
    *(f"12.0.{third}" for third in range(4, 14)),
    # Verizon AS701 — within 63.0.0.0/12
    *(f"63.{second}.0" for second in range(10)),
    # BT AS2856 — within 5.80.0.0/15
    *(f"5.80.{third}" for third in range(8)),
]  # 30 + 10 + 10 + 10 + 10 + 8 = 78 prefixes × 254 hosts ≈ 19.8K IPs
|
||
|
||
# Datacenter ranges for scanners (asn_label='datacenter')
|
||
# /24 prefixes (first three octets) that scanner source IPs are drawn from.
DC_SCANNER_PREFIXES = [
    # DigitalOcean AS14061 — within 5.101.96.0/20
    *(f"5.101.{third}" for third in range(96, 102)),
    # Hetzner AS24940 — within 5.9.0.0/16
    *(f"5.9.{third}" for third in range(6)),
    # OVH AS16276 — within 5.39.0.0/17
    *(f"5.39.{third}" for third in range(5)),
]  # 6 + 6 + 5 = 17 prefixes × 254 hosts ≈ 4.3K IPs
|
||
|
||
# Datacenter ranges for AI bots (separate from scanner ranges)
|
||
# /24 prefixes (first three octets) that AI-bot source IPs are drawn from.
DC_AI_PREFIXES = [
    # DigitalOcean (different /24s)
    *(f"5.101.{third}" for third in range(102, 106)),
    # Hetzner (different /24s)
    *(f"5.9.{third}" for third in range(6, 10)),
]  # 4 + 4 = 8 prefixes × 254 hosts ≈ 2K IPs
|
||
|
||
# Fallback /24s for legitimate bot overflow
|
||
# Eight consecutive /24 prefixes, 5.9.20 through 5.9.27.
DC_LEGIT_BOT_PREFIXES = [f"5.9.{third}" for third in range(20, 28)]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Browser family → User-Agent mapping
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Candidate User-Agent strings keyed by browser family name.  Adjacent string
# literals are concatenated by Python, so every list element is one full UA.
# generate_browser_traffic picks one per IP and falls back to the "Chromium"
# list when a family has no entry here.
BROWSER_UAS = {
    "Chromium": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36",
    ],
    "Firefox": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) "
        "Gecko/20100101 Firefox/120.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:121.0) "
        "Gecko/20100101 Firefox/121.0",
    ],
    "Safari": [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    ],
    "Edge": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
    ],
    "Opera": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 OPR/105.0.0.0",
    ],
    "Vivaldi": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 "
        "Vivaldi/6.4.3160.47",
    ],
    "Chrome_iOS": [
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "CriOS/120.0.6099.119 Mobile/15E148 Safari/604.1",
    ],
    "Chromium_Legacy": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    ],
    "Firefox_Legacy": [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) "
        "Gecko/20100101 Firefox/78.0",
    ],
    "Safari_Legacy": [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/15.6 Safari/605.1.15",
    ],
    "Tor_Browser": [
        "Mozilla/5.0 (Windows NT 10.0; rv:102.0) "
        "Gecko/20100101 Firefox/102.0",
    ],
}
|
||
|
||
# TCP/OS profile names each browser family may be paired with; the values are
# keys of TCP_PROFILES / TCP_OPTIONS.  One is chosen per IP, independently of
# which User-Agent string was picked for that IP.
BROWSER_TCP = {
    "Chromium": ["windows", "linux", "android"],
    "Firefox": ["windows", "linux"],
    "Safari": ["macos"],
    "Edge": ["windows"],
    "Opera": ["windows", "linux"],
    "Vivaldi": ["windows", "linux"],
    "Chrome_iOS": ["macos"],
    "Chromium_Legacy": ["windows"],
    "Firefox_Legacy": ["windows", "linux"],
    "Safari_Legacy": ["macos"],
    "Tor_Browser": ["linux"],
}
|
||
|
||
# Families for which generate_browser_traffic fills the Sec-CH-UA* headers.
CHROMIUM_FAMILIES = {
    "Chromium", "Edge", "Opera", "Vivaldi",
    "Chrome_iOS", "Chromium_Legacy",
}
|
||
|
||
# Relative selection weights per family (they sum to 100).  Families missing
# from this table are given weight 1 by generate_browser_traffic.
BROWSER_WEIGHTS = {
    "Chromium": 55, "Firefox": 15, "Safari": 12, "Edge": 8,
    "Opera": 3, "Vivaldi": 1, "Chrome_iOS": 3,
    "Chromium_Legacy": 1, "Firefox_Legacy": 1,
    "Safari_Legacy": 0.5, "Tor_Browser": 0.5,
}
|
||
|
||
# Sec-CH-UA header value emitted for each family in CHROMIUM_FAMILIES.
SEC_CH_UA = {
    "Chromium":
        '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    "Edge":
        '"Chromium";v="120", "Microsoft Edge";v="120", "Not-A.Brand";v="99"',
    "Opera":
        '"Chromium";v="119", "Opera";v="105", "Not-A.Brand";v="99"',
    "Vivaldi":
        '"Chromium";v="118", "Vivaldi";v="6.4", "Not-A.Brand";v="99"',
    "Chrome_iOS":
        '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    "Chromium_Legacy":
        '"Chromium";v="90", "Google Chrome";v="90", "Not-A.Brand";v="99"',
}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Hosts and Accept-Language pools
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Virtual hosts requests are addressed to; _make_row copies the chosen value
# into both the "host" and "tls_sni" fields.
HOSTS = ["platform", "api.platform", "www.example.com", "shop.example.com"]
|
||
|
||
# Accept-Language header values; browser sessions pick one per IP.
ACCEPT_LANGUAGES = [
    "en-US,en;q=0.9",
    "en-GB,en;q=0.9",
    "fr-FR,fr;q=0.9,en;q=0.8",
    "de-DE,de;q=0.9,en;q=0.8",
    "es-ES,es;q=0.9,en;q=0.8",
    "ja-JP,ja;q=0.9,en;q=0.8",
    "pt-BR,pt;q=0.9,en;q=0.8",
    "zh-CN,zh;q=0.9,en;q=0.8",
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Path pools
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Document URLs used for browser page navigations (Sec-Fetch-Dest: document).
PAGE_PATHS = [
    "/", "/index.html", "/about", "/contact", "/products", "/services",
    "/blog", "/blog/post-1", "/blog/post-2", "/blog/post-3", "/faq",
    "/pricing", "/login", "/register", "/profile", "/dashboard",
    "/docs", "/docs/getting-started", "/docs/api-reference",
    "/help", "/terms", "/privacy", "/search",
]
|
||
|
||
# Static sub-resources (scripts, styles, images, fonts) fetched by browsers.
ASSET_PATHS = [
    "/static/js/app.js", "/static/js/vendor.js", "/static/js/analytics.js",
    "/static/css/main.css", "/static/css/theme.css",
    "/images/logo.png", "/images/hero.webp", "/images/banner.jpg",
    "/favicon.ico", "/fonts/inter-400.woff2", "/fonts/inter-700.woff2",
]
|
||
|
||
# API endpoints hit by browser sessions as CORS requests (GET or POST).
API_PATHS = [
    "/api/v1/users", "/api/v1/status", "/api/v2/metrics",
    "/api/v1/products", "/api/v1/search", "/api/v2/config",
]
|
||
|
||
# Probe targets requested by the vulnerability-scanner traffic.  Entries that
# contain "?" are split into path and query by _make_row.
ATTACK_PATHS = [
    "/.env", "/.git/HEAD", "/.git/config",
    "/wp-login.php", "/wp-admin/", "/xmlrpc.php", "/wp-config.php",
    "/phpmyadmin/", "/phpMyAdmin/", "/pma/",
    "/admin", "/admin/login", "/administrator/",
    "/cgi-bin/test.cgi", "/cgi-bin/../etc/passwd",
    "/download?file=../../../etc/passwd",
    "/download?file=../../../../etc/shadow",
    "/api/search?q=<script>alert(1)</script>",
    "/api/users?id=1+OR+1%3D1",
    "/shell.php", "/cmd.php", "/eval.php",
    "/.aws/credentials", "/.ssh/id_rsa",
    "/etc/passwd", "/proc/self/environ",
    "/actuator", "/actuator/env", "/actuator/health",
    "/server-status", "/.svn/entries",
    "/wp-content/uploads/", "/backup.zip", "/db.sql",
    "/api/v1/../admin", "/api/debug",
    "/.htaccess", "/.htpasswd",
    "/console", "/debug/pprof/",
]
|
||
|
||
# Paginated listing URLs: 50 product pages followed by 10 pages for each of
# five categories (100 paths in total).
SCRAPER_PATHS = [
    *(f"/products/page/{page}" for page in range(1, 51)),
    *(
        f"/category/{category}/page/{page}"
        for category in ("electronics", "clothing", "books", "home", "sports")
        for page in range(1, 11)
    ),
]
|
||
|
||
# Crawler-oriented URLs: robots.txt, sitemaps, feeds and the home page.
BOT_PATHS = [
    "/robots.txt", "/sitemap.xml", "/", "/index.html",
    "/sitemap_index.xml", "/news-sitemap.xml",
    "/feed", "/rss.xml", "/atom.xml",
]
|
||
|
||
# All PAGE_PATHS plus 20 blog posts and five product detail pages.
# NOTE(review): PAGE_PATHS already holds /blog/post-1..3, so those three paths
# appear twice here and are drawn twice as often by a uniform choice —
# confirm whether that skew is intended.
CONTENT_PATHS = [
    *PAGE_PATHS,
    *(f"/blog/post-{number}" for number in range(1, 21)),
    *(
        f"/products/{slug}"
        for slug in ("widget-a", "widget-b", "gadget-x", "tool-pro", "kit-basic")
    ),
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Scanner / bot User-Agents
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# User-Agent values used for scanner traffic: HTTP libraries and scan tools,
# plus the placeholder "-" and an empty string.
SCANNER_UAS = [
    "curl/7.88.1",
    "curl/8.1.2",
    "python-requests/2.31.0",
    "python-requests/2.28.1",
    "python-urllib3/2.0.4",
    "Masscan/1.3",
    "masscan/1.3 (https://github.com/robertdavidgraham/masscan)",
    "zgrab/0.x",
    "Go-http-client/1.1",
    "Go-http-client/2.0",
    "libwww-perl/6.72",
    "Java/11.0.18",
    "Java/17.0.2",
    "Wget/1.21.3",
    "Scrapy/2.11.0",
    "Apache-HttpClient/4.5.14",
    "okhttp/4.12.0",
    "Node-Fetch/1.0",
    "axios/1.6.2",
    "-",
    "",
]
|
||
|
||
# User-Agent strings that advertise HeadlessChrome (two adjacent literals are
# concatenated into each entry).
HEADLESS_UAS = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) HeadlessChrome/119.0.0.0 Safari/537.36",
]
|
||
|
||
# User-Agent strings per well-known crawler, keyed by bot name.  Adjacent
# string literals are concatenated, so each list element is one full UA.
# NOTE(review): keys presumably match the family column of bot_ip.csv —
# confirm against the data file.
LEGIT_BOT_UAS = {
    "Googlebot": [
        "Mozilla/5.0 (compatible; Googlebot/2.1; "
        "+http://www.google.com/bot.html)",
        "Googlebot/2.1 (+http://www.google.com/bot.html)",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    ],
    "Bingbot": [
        "Mozilla/5.0 (compatible; bingbot/2.0; "
        "+http://www.bing.com/bingbot.htm)",
        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
        "compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) "
        "Chrome/116.0.1938.76 Safari/537.36",
    ],
    "DuckDuckBot": [
        "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)",
    ],
    "Applebot": [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 "
        "Safari/605.1.15 (Applebot/0.1; "
        "+http://www.apple.com/go/applebot)",
    ],
    "YandexBot": [
        "Mozilla/5.0 (compatible; YandexBot/3.0; "
        "+http://yandex.com/bots)",
    ],
    "Twitterbot": [
        "Twitterbot/1.0",
    ],
    "FacebookBot": [
        "facebookexternalhit/1.1 "
        "(+http://www.facebook.com/externalhit_uatext.php)",
    ],
}
|
||
|
||
# User-Agent strings per AI crawler, keyed by bot name.  Adjacent string
# literals are concatenated, so each list element is one full UA.
AI_BOT_UAS = {
    "GPTBot": [
        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
        "compatible; GPTBot/1.0; +https://openai.com/gptbot)",
    ],
    "PerplexityBot": [
        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
        "compatible; PerplexityBot/1.0; "
        "+https://docs.perplexity.ai/docs/perplexity-bot)",
    ],
    "ClaudeBot": [
        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; "
        "compatible; ClaudeBot/1.0; "
        "+https://www.anthropic.com/claude-bot)",
    ],
    "CCBot": [
        "CCBot/2.0 (https://commoncrawl.org/faq/)",
    ],
    "Bytespider": [
        "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Mobile Safari/537.36 "
        "(compatible; Bytespider; spider-feedback@bytedance.com)",
    ],
}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# TCP / TLS metadata helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Raw TCP option bytes (hex string) reported for each OS/tool profile.
# NOTE(review): several entries do not decode to the values in TCP_PROFILES:
# "macos" carries MSS 0x05ac (1452) vs. 1460, "android" reuses the linux bytes
# (MSS 1460, window scale 10) vs. 1420 / 9, and "scanner" / "minimal" carry
# MSS 0xffff / 0xffd7 vs. 1460 / 576.  Confirm whether that is intentional.
TCP_OPTIONS = {
    "linux": "020405b40402080affffffff000000000103030a",
    "windows": "020405b40103030801010402",
    "macos": "020405ac0103030601010402",
    "android": "020405b40402080affffffff000000000103030a",
    "scanner": "0204ffff",
    "minimal": "0204ffd7",
}

# Baseline SYN-packet characteristics per profile; _tcp_meta adds jitter to
# the window size and TTL.
TCP_PROFILES = {
    "linux": {"window_size": 65535, "mss": 1460, "wscale": 10,
              "ttl": 64, "df": 1},
    "windows": {"window_size": 64240, "mss": 1460, "wscale": 8,
                "ttl": 128, "df": 1},
    "macos": {"window_size": 65535, "mss": 1460, "wscale": 6,
              "ttl": 64, "df": 1},
    "android": {"window_size": 65535, "mss": 1420, "wscale": 9,
                "ttl": 64, "df": 1},
    "scanner": {"window_size": 1024, "mss": 1460, "wscale": 0,
                "ttl": 48, "df": 0},
    "minimal": {"window_size": 512, "mss": 576, "wscale": 0,
                "ttl": 60, "df": 0},
}


def _tcp_meta(profile, rng):
    """Return the tcp_meta_* / ip_meta_* fields for one synthetic connection.

    Args:
        profile: Key of TCP_PROFILES / TCP_OPTIONS; unknown names fall back
            to the "linux" entries.
        rng: Object exposing ``randint(a, b)`` (e.g. ``random.Random``).

    Returns:
        Dict with window size (base ± 100, clamped to the 16-bit range), MSS,
        window scale, raw option bytes, TTL (base minus 0–5), DF flag, a
        random IP ID and a random total length.
    """
    meta = TCP_PROFILES.get(profile, TCP_PROFILES["linux"])
    # The TCP window field is 16 bits wide; without the clamp the jitter
    # pushed profiles whose base is 65535 up to 65635.
    window = max(0, min(0xFFFF, meta["window_size"] + rng.randint(-100, 100)))
    return {
        "tcp_meta_window_size": window,
        "tcp_meta_mss": meta["mss"],
        "tcp_meta_window_scale": meta["wscale"],
        "tcp_meta_options": TCP_OPTIONS.get(profile, TCP_OPTIONS["linux"]),
        "ip_meta_ttl": meta["ttl"] - rng.randint(0, 5),
        "ip_meta_df": meta["df"],
        "ip_meta_id": rng.randint(1, 65535),
        "ip_meta_total_length": rng.randint(1200, 1500),
    }
|
||
|
||
|
||
def _syn_ms(profile, rng):
    """Draw a SYN→ClientHello latency in milliseconds for *profile*.

    "scanner" draws from 0–3 ms, "minimal" from 1–5 ms and every other
    profile from 10–120 ms (bounds inclusive).
    """
    low, high = (
        (0, 3) if profile == "scanner"
        else (1, 5) if profile == "minimal"
        else (10, 120)
    )
    return rng.randint(low, high)
|
||
|
||
|
||
def _ja3_for_ja4(ja4):
    """Return ``(ja3_string, ja3_md5_hex)`` chosen by the JA4's version prefix.

    One fixed JA3 template exists for each of the prefixes "t13", "t12" and
    "t10"; any other JA4 gets a short generic template.
    """
    templates = (
        ("t13", "771,4866-4867-4865-49196-49200-52393-52392,"
                "0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0"),
        ("t12", "771,49195-49199-49196-49200-52393-52392,"
                "0-23-65281-10-11-35-16-5-13,29-23-24,0"),
        ("t10", "769,49161-49162-49171-49172,0-10-11,29-23-24,0"),
    )
    ja3 = next(
        (text for prefix, text in templates if ja4.startswith(prefix)),
        "771,4866-4867-4865,0-23-65281,29-23-24,0",
    )
    return ja3, hashlib.md5(ja3.encode()).hexdigest()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CSV loading with fallback
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Directories probed, in order, for the CSV data files: the fixed /app/data
# path, then two locations relative to this script.
_DATA_SEARCH_PATHS = ["/app/data"] + [
    os.path.join(os.path.dirname(os.path.abspath(__file__)), *parts)
    for parts in (("..", "..", "..", "scripts", "data"), ("data",))
]


def _find_data_dir(explicit=None):
    """Locate the directory holding the CSV data files.

    An existing *explicit* directory is returned unchanged.  Otherwise the
    first entry of _DATA_SEARCH_PATHS that (after symlink resolution) is a
    directory containing ``browser_ja4.csv`` is returned, or ``None`` when
    no entry qualifies.
    """
    if explicit and os.path.isdir(explicit):
        return explicit
    resolved = (os.path.realpath(entry) for entry in _DATA_SEARCH_PATHS)
    return next(
        (
            directory for directory in resolved
            if os.path.isdir(directory)
            and os.path.isfile(os.path.join(directory, "browser_ja4.csv"))
        ),
        None,
    )
|
||
|
||
|
||
def load_browser_ja4s(data_dir):
    """Read ``browser_ja4.csv`` into ``{family: [ja4, …]}``.

    Only rows with at least two columns whose first column starts with
    "t13" or "t12" are kept; both columns are whitespace-stripped.  Returns
    an empty dict when *data_dir* is falsy or the file does not exist.
    """
    by_family = {}
    if not data_dir:
        return by_family
    csv_path = os.path.join(data_dir, "browser_ja4.csv")
    if not os.path.isfile(csv_path):
        return by_family
    with open(csv_path, newline="", encoding="utf-8") as handle:
        for record in csv.reader(handle):
            if len(record) < 2:
                continue
            fingerprint = record[0].strip()
            if not fingerprint.startswith(("t13", "t12")):
                continue
            by_family.setdefault(record[1].strip(), []).append(fingerprint)
    return by_family
|
||
|
||
|
||
def load_bot_ips(data_dir):
    """Read ``bot_ip.csv`` into ``{family: [cidr_str, …]}``.

    Column 0 is the CIDR (or single address) and column 1 the bot family;
    both are whitespace-stripped.  Rows with fewer than two columns, and
    rows whose first column is not a parsable IPv4/IPv6 network, are
    skipped so a header line or malformed entry cannot create a bogus
    family.  Returns an empty dict when *data_dir* is falsy or the file
    does not exist.
    """
    result = {}
    if not data_dir:
        return result
    path = os.path.join(data_dir, "bot_ip.csv")
    if not os.path.isfile(path):
        return result
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue
            cidr, family = row[0].strip(), row[1].strip()
            try:
                # strict=False accepts CIDRs that have host bits set.
                ipaddress.ip_network(cidr, strict=False)
            except ValueError:
                continue  # header line or malformed entry
            result.setdefault(family, []).append(cidr)
    return result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# IP generation helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _gen_ips_from_prefixes(prefixes, n, rng):
    """Generate up to *n* unique IPs from /24 prefixes.

    Args:
        prefixes: Sequence of "a.b.c" strings (first three octets).
        n: Number of distinct addresses wanted.
        rng: ``random.Random``-like object (``choice`` and ``randint``).

    Returns:
        List of distinct "a.b.c.d" strings in first-drawn order, with the
        host octet in 1–254.  Sampling stops after ``n * 5`` draws, so the
        list may be shorter than *n* when the pool is nearly exhausted.
        Empty *prefixes* or a non-positive *n* yields ``[]``.
    """
    if not prefixes or n <= 0:
        return []
    # A dict is used as an insertion-ordered set.  Converting a set of
    # strings to a list gives an order that depends on per-process hash
    # randomisation, which made output differ between runs even with a
    # fixed RNG seed.
    ips = {}
    attempts = 0
    while len(ips) < n and attempts < n * 5:
        prefix = rng.choice(prefixes)
        octet = rng.randint(1, 254)
        ips[f"{prefix}.{octet}"] = None
        attempts += 1
    return list(ips)
|
||
|
||
|
||
def _ips_from_cidrs(cidrs, n, rng):
    """Generate up to *n* unique IPs from a list of CIDR strings.

    Args:
        cidrs: Iterable of IPv4/IPv6 network strings; unparsable entries
            are ignored.
        n: Number of distinct addresses wanted.
        rng: ``random.Random``-like object (``choices`` and ``randint``).

    Returns:
        List of at most *n* distinct address strings.  Single-host networks
        (IPv4 /32, IPv6 /128) come first in input order, followed by
        addresses sampled from the larger networks in first-drawn order.
        Networks are weighted by size and the offset starts at 1, so the
        network address itself is never drawn.  Two-address networks are
        ignored.  Sampling stops after ``n * 10`` draws.
    """
    if n <= 0:
        return []
    networks = []
    # A dict is used as an insertion-ordered set so the result (and the
    # [:n] truncation) is reproducible for a seeded rng; iterating a set of
    # strings depends on per-process hash randomisation.
    ips = {}
    for c in cidrs:
        try:
            net = ipaddress.ip_network(c, strict=False)
        except ValueError:
            continue
        # Test the size, not ``prefixlen == 32``: an IPv6 /32 is a huge
        # network, not a single host.
        if net.num_addresses == 1:
            ips[str(net.network_address)] = None
        elif net.num_addresses > 2:
            networks.append(net)
    if networks:
        weights = [net.num_addresses for net in networks]
        attempts = 0
        while len(ips) < n and attempts < n * 10:
            net = rng.choices(networks, weights=weights, k=1)[0]
            host_offset = rng.randint(1, max(1, net.num_addresses - 2))
            ips[str(net.network_address + host_offset)] = None
            attempts += 1
    return list(ips)[:n]
|
||
|
||
|
||
def _generate_bot_ja4s(browser_ja4_set, rng, n=20):
    """Build synthetic JA4 strings that are absent from *browser_ja4_set*.

    For each of *n* rounds a stem is taken round-robin from a fixed list,
    two random 12-hex-digit segments are drawn, and "h1" (70 %) or "h2" is
    appended to the stem.  Candidates already present in *browser_ja4_set*
    are dropped, so fewer than *n* values may be returned.  If nothing
    survives (including ``n == 0``) a fixed five-entry fallback list is
    returned.
    """
    stems = (
        "t13d0305", "t13d0203", "t12d0507", "t10d0100", "t13d0101",
        "t12d0302", "t13d0405", "t12d0204", "t10d0200", "t13d0102",
    )
    fallback = [
        "t13d030500_ffd59bab1b39_6e7f7df63e98",
        "t13d020300_6b9b1b2c3d4e_ffd59bab1b39",
        "t10d170000_0a1b2c3d4e5f_1b2c3d4e5f60",
        "t12d050700_5a6b7c8d9e0f_1a2b3c4d5e6f",
        "t13d010100_aabbccddeeff_0011223344aa",
    ]
    generated = []
    for round_no in range(n):
        stem = stems[round_no % len(stems)]
        # Draw order (segment, segment, ALPN) must stay fixed so a seeded
        # rng yields the same fingerprints.
        first = format(rng.randint(0, 0xFFFFFFFFFFFF), "012x")
        second = format(rng.randint(0, 0xFFFFFFFFFFFF), "012x")
        alpn = "h1" if rng.random() < 0.7 else "h2"
        candidate = f"{stem}{alpn}_{first}_{second}"
        if candidate in browser_ja4_set:
            continue
        generated.append(candidate)
    return generated or fallback
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Request distribution
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _distribute_requests(n_ips, total_rows, min_req, max_req, rng):
    """Split *total_rows* into *n_ips* per-IP request counts.

    Each count lies in ``[min_req, max_req]``.  While the remaining budget
    can still be met, the random range for the current IP is narrowed so
    the IPs after it can stay within bounds; when the budget is infeasible
    the count is the clamped even share instead.  The last count is the
    clamped remainder, so the sum equals *total_rows* only when
    ``n_ips * min_req <= total_rows <= n_ips * max_req``.  The list is
    shuffled before it is returned.
    """
    if n_ips <= 0:
        return []

    def clamp(value):
        return max(min_req, min(max_req, value))

    allocation = []
    budget = total_rows
    # ``others`` is the number of IPs still to be served after this one.
    for others in range(n_ips - 1, 0, -1):
        low = max(min_req, budget - others * max_req)
        high = min(max_req, budget - others * min_req)
        if low > high:
            low = high = clamp(budget // (others + 1))
        share = rng.randint(low, high)
        allocation.append(share)
        budget -= share
    allocation.append(clamp(budget))
    rng.shuffle(allocation)
    return allocation
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Timestamp helper
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Reference instant shared by every generated timestamp; set on first use.
_BASE_TIME = None


def _now_minus(seconds):
    """Return the UTC time *seconds* before the reference instant.

    The reference instant is captured on the first call and reused after
    that, so all timestamps from one run are relative to the same moment.
    The result is formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    global _BASE_TIME
    if _BASE_TIME is None:
        _BASE_TIME = datetime.now(timezone.utc)
    moment = _BASE_TIME - timedelta(seconds=seconds)
    return f"{moment:%Y-%m-%dT%H:%M:%SZ}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Row builder — identical field set to original
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _make_row(
    src_ip, ua, path, method="GET", ja4=None, tcp_profile="linux",
    scheme="https", host="platform", time_offset_s=None,
    extra_headers=None, rng=None,
):
    """Build a single raw_json dict matching what the correlator produces.

    Args:
        src_ip: Client address string.
        ua: User-Agent header value.
        path: Request target; everything after the first "?" becomes the
            ``query`` field.
        method: HTTP method.
        ja4: JA4 fingerprint; a fixed default is used when ``None``.  The
            TLS version is derived from its "t13"/"t12" prefix, and
            HTTP/2 + ALPN "h2" are reported when it contains "h2".
        tcp_profile: Profile name passed to ``_tcp_meta`` / ``_syn_ms``.
        scheme: "https" selects destination port 443, anything else 80.
        host: Value used for both ``host`` and ``tls_sni``.
        time_offset_s: Seconds before the run's reference time; a random
            value in 0–1700 when ``None``.
        extra_headers: Optional ``{name: value}`` mapping; names are
            appended to ``client_headers`` and each value is stored under
            ``header_<name>``.
        rng: ``random.Random``-like object; the ``random`` module when
            ``None``.

    Returns:
        Dict holding the request, TLS, header and tcp/ip metadata fields.
    """
    if rng is None:
        rng = random
    if time_offset_s is None:
        time_offset_s = rng.randint(0, 1700)
    if ja4 is None:
        ja4 = "t13d1917h2_b0372614b25a_6a77dcf5a8be"
    ja3_raw, ja3_hash = _ja3_for_ja4(ja4)
    tcp = _tcp_meta(tcp_profile, rng)
    syn_ms = _syn_ms(tcp_profile, rng)

    client_headers = "Host,User-Agent,Accept,Accept-Language,Accept-Encoding"
    if extra_headers:
        client_headers += "," + ",".join(extra_headers.keys())

    # partition() keeps a query that itself contains "?" intact;
    # split("?")[1] dropped everything after a second "?".
    path_only, _, query = path.partition("?")
    uses_h2 = "h2" in ja4
    # Read the clock once so that b_timestamp - a_timestamp is exactly the
    # reported SYN→ClientHello latency.  These are wall-clock microseconds
    # at generation time, not derived from the row's "time" field.
    a_timestamp = int(time.time() * 1_000_000)

    row = {
        "time": _now_minus(time_offset_s),
        "src_ip": src_ip,
        "src_port": rng.randint(1024, 65535),
        "dst_ip": "172.20.0.2",
        "dst_port": 443 if scheme == "https" else 80,
        "method": method,
        "scheme": scheme,
        "host": host,
        "path": path_only,
        "query": query,
        "http_version": "HTTP/2.0" if uses_h2 else "HTTP/1.1",
        "orphan_side": "",
        "correlated": True,
        "keepalives": rng.randint(1, 8),
        "a_timestamp": a_timestamp,
        "b_timestamp": a_timestamp + syn_ms * 1000,
        "conn_id": f"seed_{src_ip.replace('.', '_')}"
                   f"_{rng.randint(1000, 9999)}",
        "syn_to_clienthello_ms": syn_ms,
        "tls_version": ("1.3" if ja4.startswith("t13")
                        else "1.2" if ja4.startswith("t12")
                        else "1.0"),
        "tls_sni": host,
        "tls_alpn": "h2" if uses_h2 else "http/1.1",
        "ja3": ja3_raw,
        "ja3_hash": ja3_hash,
        "ja4": ja4,
        "client_headers": client_headers,
        "header_User-Agent": ua,
        "header_Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
        "header_Accept-Encoding": "gzip, deflate, br",
        "header_Accept-Language": "",
        "header_Content-Type": "",
        "header_X-Request-Id": "",
        "header_X-Trace-Id": "",
        "header_X-Forwarded-For": "",
        "header_Sec-Fetch-Site": "",
        "header_Sec-Fetch-Mode": "",
        "header_Sec-Fetch-Dest": "",
        "header_Sec-CH-UA": "",
        "header_Sec-CH-UA-Mobile": "",
        "header_Sec-CH-UA-Platform": "",
        **tcp,
    }
    if extra_headers:
        row.update({f"header_{k}": v for k, v in extra_headers.items()})
    return row
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Traffic generators
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def generate_browser_traffic(n_ips, total_rows, browser_ja4s, rng):
    """Generate legitimate browser sessions with realistic navigation.

    Each IP gets a consistent browser profile (family, JA4, UA, TCP profile,
    host, language, session cookie) and produces page navigations plus
    asset/API requests with Referer chains, cookies, Sec-Fetch headers, and
    Sec-CH-UA headers for families in CHROMIUM_FAMILIES.

    Args:
        n_ips: Number of client IPs to draw from ISP_PREFIXES.
        total_rows: Row budget, split across IPs at 5–50 requests each.
        browser_ja4s: ``{family: [ja4, …]}``; when empty, the families of
            BROWSER_UAS and a built-in default JA4 are used.
        rng: ``random.Random`` instance driving every random choice.

    Returns:
        List of row dicts as produced by ``_make_row``.
    """
    rows = []
    ips = _gen_ips_from_prefixes(ISP_PREFIXES, n_ips, rng)
    counts = _distribute_requests(len(ips), total_rows, 5, 50, rng)

    families = (list(browser_ja4s.keys())
                if browser_ja4s else list(BROWSER_UAS.keys()))
    family_weights = [BROWSER_WEIGHTS.get(f, 1) for f in families]

    platform_map = {
        "windows": '"Windows"', "linux": '"Linux"',
        "macos": '"macOS"', "android": '"Android"',
    }

    for ip, n_req in zip(ips, counts):
        family = rng.choices(families, weights=family_weights, k=1)[0]
        ja4_list = browser_ja4s.get(family, [])
        if not ja4_list:
            # Borrow fingerprints from a mainstream family before using the
            # built-in default.
            for fb in ("Chromium", "Firefox", "Safari"):
                ja4_list = browser_ja4s.get(fb, [])
                if ja4_list:
                    break
        if not ja4_list:
            ja4_list = ["t13d1917h2_b0372614b25a_6a77dcf5a8be"]
        ja4 = rng.choice(ja4_list)
        # NOTE(review): the UA and the TCP profile are drawn independently,
        # so a Windows UA can be paired with a linux/android profile —
        # confirm whether that mismatch is acceptable for "human" traffic.
        ua = rng.choice(BROWSER_UAS.get(family, BROWSER_UAS["Chromium"]))
        tcp = rng.choice(BROWSER_TCP.get(family, ["linux"]))
        host = rng.choice(HOSTS)
        lang = rng.choice(ACCEPT_LANGUAGES)
        is_chromium = family in CHROMIUM_FAMILIES
        session_cookie = f"sid={rng.randint(100000, 999999)}"
        ch_ua = SEC_CH_UA.get(family, "") if is_chromium else ""
        # Keep the mobile hint consistent with the platform hint: an
        # "Android" platform is reported as mobile.
        # NOTE(review): Chrome_iOS still gets "?0" with platform "macOS" —
        # confirm the intended hints for that family.
        ch_mobile = ("?1" if tcp == "android" else "?0") if is_chromium else ""
        ch_plat = platform_map.get(tcp, "") if is_chromium else ""

        base_offset = rng.randint(60, 1700)
        elapsed = 0
        current_page = ""

        for req_i in range(n_req):
            # Accumulate the 1–5 s think time so timestamps move strictly
            # towards "now" in request order (until the 1 s floor).  The old
            # ``req_i * randint(1, 5)`` jumped back and forth, so time order
            # contradicted the Referer chain.  The draw happens on every
            # iteration to keep the rng stream unchanged.
            step = rng.randint(1, 5)
            if req_i:
                elapsed += step
            offset = max(1, base_offset - elapsed)

            # The first request has no previous page and hence no Referer.
            referer = f"https://{host}{current_page}" if current_page else ""
            sec_site = "same-origin" if req_i else "none"
            method = "GET"

            # The first request is always a navigation; later ones are
            # chosen by successive independent draws.
            if req_i == 0 or rng.random() < 0.25:
                kind = "page"
            elif rng.random() < 0.5:
                kind = "asset"
            elif rng.random() < 0.4:
                kind = "api"
            else:
                kind = "page"

            if kind == "page":
                path = rng.choice(PAGE_PATHS)
                sec_mode = "navigate"
                sec_dest = "document"
                current_page = path
            elif kind == "asset":
                path = rng.choice(ASSET_PATHS)
                sec_mode = "no-cors"
                sec_dest = rng.choice(["script", "style", "image", "font"])
            else:
                path = rng.choice(API_PATHS)
                method = rng.choice(["GET", "POST"])
                sec_mode = "cors"
                sec_dest = "empty"

            # Referer / Cookie go through extra_headers so _make_row lists
            # them in client_headers and stores their values.
            extra = {}
            if referer:
                extra["Referer"] = referer
            if req_i > 0:
                extra["Cookie"] = session_cookie

            row = _make_row(
                src_ip=ip, ua=ua, path=path, method=method,
                ja4=ja4, tcp_profile=tcp, scheme="https", host=host,
                time_offset_s=offset, extra_headers=extra or None, rng=rng,
            )
            row["header_Accept-Language"] = lang
            row["header_Sec-Fetch-Site"] = sec_site
            row["header_Sec-Fetch-Mode"] = sec_mode
            row["header_Sec-Fetch-Dest"] = sec_dest
            row["header_Sec-CH-UA"] = ch_ua
            row["header_Sec-CH-UA-Mobile"] = ch_mobile
            row["header_Sec-CH-UA-Platform"] = ch_plat
            rows.append(row)

    return rows
|
||
|
||
|
||
def generate_scanner_traffic(n_ips, total_rows, bot_ja4s, rng):
    """Generate malicious bot/scanner traffic in four sub-categories.

      a. Vulnerability scanners (30%): diverse attack paths
      b. Credential stuffers (20%): POST /login brute-force
      c. Content scrapers (30%): methodical path crawling
      d. DDoS-like (20%): same path hammered 100+ times

    The percentages are shares of ``n_ips``; the DDoS group takes whatever
    is left after the three ``int()``-truncated shares.

    Args:
        n_ips: Number of scanner source IPs to generate.
        total_rows: Row budget split across the four sub-categories.
        bot_ja4s: Sequence of JA4 strings; one is drawn per source IP.
        rng: Random generator providing ``choice``, ``sample``, ``random``
            and ``randint``.

    Returns:
        A list of row dicts built by ``_make_row``. Port-probe rows (5 rows
        for each of up to 100 vulnerability-scanner IPs) are appended on top
        of the ``total_rows`` budget, so the result can be longer than
        ``total_rows``.
    """
    rows = []
    ips = _gen_ips_from_prefixes(DC_SCANNER_PREFIXES, n_ips, rng)

    n_vuln = int(n_ips * 0.30)
    n_cred = int(n_ips * 0.20)
    n_scrap = int(n_ips * 0.30)
    n_ddos = n_ips - n_vuln - n_cred - n_scrap

    # Compute row budgets respecting per-IP minimums
    min_vuln = n_vuln * 10
    min_cred = n_cred * 15
    min_scrap = n_scrap * 10
    min_ddos = n_ddos * 100
    total_min = min_vuln + min_cred + min_scrap + min_ddos
    surplus = max(0, total_rows - total_min)
    r_ddos = min_ddos + int(surplus * 0.40)
    r_vuln = min_vuln + int(surplus * 0.20)
    r_cred = min_cred + int(surplus * 0.15)
    # Scrapers absorb the remainder so the four budgets sum to total_rows.
    # NOTE(review): this can fall below min_scrap (or below zero) when
    # total_rows < total_min — confirm how _distribute_requests handles it.
    r_scrap = total_rows - r_ddos - r_vuln - r_cred

    # Carve the IP list into four consecutive, non-overlapping slices.
    vuln_end = n_vuln
    cred_end = vuln_end + n_cred
    scrap_end = cred_end + n_scrap
    vuln_ips = ips[:vuln_end]
    cred_ips = ips[vuln_end:cred_end]
    scrap_ips = ips[cred_end:scrap_end]
    ddos_ips = ips[scrap_end:scrap_end + n_ddos]

    # Loop-invariant choice pools, built once instead of per IP / per row.
    vuln_methods = ("GET", "GET", "HEAD")  # GET weighted 2:1 over HEAD
    cred_uas = SCANNER_UAS + HEADLESS_UAS
    cred_hosts = HOSTS[:2]
    scraper_uas = SCANNER_UAS[:8]
    ddos_tcp_profiles = ("scanner", "minimal")

    # --- Vulnerability scanners ---
    # presumably _distribute_requests(n, total, lo, hi, rng) yields one
    # request count per IP — confirm against its definition.
    counts = _distribute_requests(len(vuln_ips), r_vuln, 10, 60, rng)
    for ip, n_req in zip(vuln_ips, counts):
        ua = rng.choice(SCANNER_UAS)
        ja4 = rng.choice(bot_ja4s)
        # ~30% of scanners switch User-Agent on every request.
        rotate_ua = rng.random() < 0.3
        for _ in range(n_req):
            if rotate_ua:
                ua = rng.choice(SCANNER_UAS)
            rows.append(_make_row(
                src_ip=ip, ua=ua, path=rng.choice(ATTACK_PATHS),
                method=rng.choice(vuln_methods),
                ja4=ja4, tcp_profile="scanner", scheme="https",
                host=rng.choice(HOSTS), rng=rng,
            ))

    # Some vuln scanners do port probing (diverse dst_port)
    for ip in rng.sample(vuln_ips, min(100, len(vuln_ips))):
        for port in rng.sample(range(80, 10000), 5):
            row = _make_row(
                src_ip=ip, ua="-", path="/",
                method="GET", ja4=rng.choice(bot_ja4s),
                tcp_profile="scanner", scheme="https",
                host=rng.choice(HOSTS), rng=rng,
            )
            row["dst_port"] = port
            rows.append(row)

    # --- Credential stuffers ---
    login_paths = [
        "/login", "/admin/login", "/api/auth/login",
        "/wp-login.php", "/user/signin", "/api/v1/auth",
    ]
    counts = _distribute_requests(len(cred_ips), r_cred, 15, 60, rng)
    for ip, n_req in zip(cred_ips, counts):
        ua = rng.choice(cred_uas)
        ja4 = rng.choice(bot_ja4s)
        # Each stuffer hammers a single login endpoint.
        target = rng.choice(login_paths)
        for _ in range(n_req):
            rows.append(_make_row(
                src_ip=ip, ua=ua, path=target, method="POST",
                ja4=ja4, tcp_profile="scanner", scheme="https",
                host=rng.choice(cred_hosts), rng=rng,
                extra_headers={
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Content-Length": str(rng.randint(20, 60)),
                },
            ))

    # --- Content scrapers ---
    counts = _distribute_requests(len(scrap_ips), r_scrap, 10, 60, rng)
    for ip, n_req in zip(scrap_ips, counts):
        ua = rng.choice(scraper_uas)
        ja4 = rng.choice(bot_ja4s)
        for i in range(n_req):
            # Walk SCRAPER_PATHS in order, wrapping around when exhausted.
            rows.append(_make_row(
                src_ip=ip, ua=ua,
                path=SCRAPER_PATHS[i % len(SCRAPER_PATHS)],
                method="GET", ja4=ja4, tcp_profile="scanner",
                scheme="https", host=rng.choice(HOSTS), rng=rng,
            ))

    # --- DDoS-like ---
    ddos_targets = ["/", "/api/v1/search", "/products", "/api/v2/metrics"]
    counts = _distribute_requests(len(ddos_ips), r_ddos, 100, 200, rng)
    for ip, n_req in zip(ddos_ips, counts):
        ua = rng.choice(SCANNER_UAS)
        ja4 = rng.choice(bot_ja4s)
        target = rng.choice(ddos_targets)
        for _ in range(n_req):
            rows.append(_make_row(
                src_ip=ip, ua=ua, path=target, method="GET",
                ja4=ja4,
                tcp_profile=rng.choice(ddos_tcp_profiles),
                scheme="https", host=HOSTS[0],
                time_offset_s=rng.randint(0, 300), rng=rng,
            ))

    return rows
|
||
|
||
|
||
def generate_legit_bot_traffic(n_ips, total_rows, bot_ips_by_family,
                               bot_ja4s, rng):
    """Generate traffic from known legitimate bots (Googlebot, Bingbot, …).

    IPs are drawn from bot_ip.csv CIDRs when available so they match
    the bot_ip dictionary for direct labelling.

    Args:
        n_ips: Number of bot source IPs to use (the IP list is truncated to
            this length).
        total_rows: Row budget handed to ``_distribute_requests``.
        bot_ips_by_family: Mapping of bot family name to a list of CIDRs;
            families without an entry fall back to ``DC_LEGIT_BOT_PREFIXES``.
        bot_ja4s: Sequence of JA4 strings; one is drawn per source IP.
        rng: Random generator providing ``choice``, ``choices`` and
            ``random``.

    Returns:
        A list of row dicts built by ``_make_row``.
    """
    rows = []
    legit_families = list(LEGIT_BOT_UAS.keys())

    # Loop-invariant choice pools, built once instead of once per IP / row.
    crawl_paths = CONTENT_PATHS + BOT_PATHS
    tcp_profiles = ("linux", "linux", "scanner")  # linux weighted 2:1

    all_ips = []
    ip_family_map = {}
    per_family = max(1, n_ips // len(legit_families))

    for family in legit_families:
        cidrs = bot_ips_by_family.get(family, [])
        if cidrs:
            fam_ips = _ips_from_cidrs(cidrs, per_family, rng)
        else:
            fam_ips = _gen_ips_from_prefixes(
                DC_LEGIT_BOT_PREFIXES, per_family, rng)
        for ip in fam_ips:
            ip_family_map[ip] = family
        all_ips.extend(fam_ips)

    # Integer division may leave the pool short; top it up with fallback
    # prefixes and assign those IPs a random family.
    if len(all_ips) < n_ips:
        extra = _gen_ips_from_prefixes(
            DC_LEGIT_BOT_PREFIXES, n_ips - len(all_ips), rng)
        for ip in extra:
            ip_family_map[ip] = rng.choice(legit_families)
        all_ips.extend(extra)

    all_ips = all_ips[:n_ips]
    counts = _distribute_requests(len(all_ips), total_rows, 5, 30, rng)

    for ip, n_req in zip(all_ips, counts):
        # Every IP in all_ips already has a mapping, so the default is never
        # used; it is still evaluated eagerly, and that rng draw is kept so
        # output for a given seed stays unchanged.
        family = ip_family_map.get(ip, rng.choice(legit_families))
        ua = rng.choice(
            LEGIT_BOT_UAS.get(family, LEGIT_BOT_UAS["Googlebot"]))
        ja4 = rng.choice(bot_ja4s)

        # Bots: robots.txt first, then sitemap, then content
        paths = ["/robots.txt"]
        if rng.random() < 0.7:
            paths.append("/sitemap.xml")
        remaining_n = n_req - len(paths)
        paths.extend(rng.choices(crawl_paths, k=max(0, remaining_n)))
        # Trim in case n_req is smaller than the fixed prefix above.
        paths = paths[:n_req]

        for path in paths:
            rows.append(_make_row(
                src_ip=ip, ua=ua, path=path, method="GET",
                ja4=ja4,
                tcp_profile=rng.choice(tcp_profiles),
                scheme="https", host=rng.choice(HOSTS), rng=rng,
            ))

    return rows
|
||
|
||
|
||
def generate_ai_bot_traffic(n_ips, total_rows, bot_ips_by_family,
                            bot_ja4s, rng):
    """Generate aggressive AI bot scraping traffic (GPTBot, ClaudeBot, …).

    Each family listed in ``AI_BOT_UAS`` gets an equal share of ``n_ips``,
    taken from its CIDRs in ``bot_ips_by_family`` when present and from
    ``DC_AI_PREFIXES`` otherwise. Every IP then issues GET requests for
    paths drawn from ``CONTENT_PATHS``.

    Returns:
        A list of row dicts built by ``_make_row``.
    """
    families = list(AI_BOT_UAS.keys())
    quota = max(1, n_ips // len(families))

    pool = []
    family_of = {}
    for fam in families:
        known_cidrs = bot_ips_by_family.get(fam, [])
        if known_cidrs:
            picked = _ips_from_cidrs(known_cidrs, quota, rng)
        else:
            picked = _gen_ips_from_prefixes(DC_AI_PREFIXES, quota, rng)
        for addr in picked:
            family_of[addr] = fam
        pool.extend(picked)

    # Top up from the fallback prefixes when the per-family quotas
    # did not reach n_ips; these IPs get a randomly chosen family.
    shortfall = n_ips - len(pool)
    if shortfall > 0:
        filler = _gen_ips_from_prefixes(DC_AI_PREFIXES, shortfall, rng)
        for addr in filler:
            family_of[addr] = rng.choice(families)
        pool.extend(filler)

    pool = pool[:n_ips]
    per_ip_counts = _distribute_requests(len(pool), total_rows, 10, 50, rng)

    rows = []
    for addr, hits in zip(pool, per_ip_counts):
        # The fallback draw is evaluated before the lookup, as in the
        # original expression, so the rng sequence stays the same.
        fallback_family = rng.choice(families)
        fam = family_of.get(addr, fallback_family)
        agent = rng.choice(AI_BOT_UAS[fam])
        fingerprint = rng.choice(bot_ja4s)

        for target in rng.choices(CONTENT_PATHS, k=hits):
            vhost = rng.choice(HOSTS)
            rows.append(_make_row(
                src_ip=addr, ua=agent, path=target, method="GET",
                ja4=fingerprint, tcp_profile="linux", scheme="https",
                host=vhost, rng=rng,
            ))

    return rows
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# ClickHouse insert
# ---------------------------------------------------------------------------
|
||
|
||
def _ch_insert(rows, host, port, user, password,
|
||
batch_size=2000, dry_run=False):
|
||
"""Insert rows into ja4_logs.http_logs_raw via ClickHouse HTTP interface.
|
||
|
||
Each row is wrapped as {"raw_json": "<escaped_json>"} in JSONEachRow format.
|
||
"""
|
||
if dry_run:
|
||
print(f"[seed] DRY-RUN — would insert {len(rows)} rows")
|
||
print("[seed] Sample row:", json.dumps(rows[0], indent=2)[:400])
|
||
return len(rows)
|
||
|
||
query = "INSERT INTO ja4_logs.http_logs_raw (raw_json) FORMAT JSONEachRow"
|
||
url = (
|
||
f"http://{host}:{port}/"
|
||
f"?query={urllib.parse.quote(query)}"
|
||
f"&user={urllib.parse.quote(user)}"
|
||
f"&password={urllib.parse.quote(password)}"
|
||
)
|
||
|
||
total_inserted = 0
|
||
n_batches = (len(rows) + batch_size - 1) // batch_size
|
||
for i in range(0, len(rows), batch_size):
|
||
batch = rows[i:i + batch_size]
|
||
body_lines = []
|
||
for row in batch:
|
||
outer = {"raw_json": json.dumps(row, separators=(",", ":"))}
|
||
body_lines.append(json.dumps(outer, separators=(",", ":")))
|
||
body = "\n".join(body_lines).encode("utf-8")
|
||
|
||
req = urllib.request.Request(
|
||
url, data=body, method="POST",
|
||
headers={"Content-Type": "application/x-ndjson; charset=utf-8"},
|
||
)
|
||
try:
|
||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||
resp.read()
|
||
total_inserted += len(batch)
|
||
except urllib.error.HTTPError as e:
|
||
err_body = e.read(500).decode("utf-8", errors="replace")
|
||
print(f"[seed] ERROR batch {i}–{i+batch_size}: "
|
||
f"HTTP {e.code}: {err_body}")
|
||
except Exception as e:
|
||
print(f"[seed] ERROR batch {i}–{i+batch_size}: {e}")
|
||
|
||
if total_inserted % 10000 < batch_size:
|
||
batch_num = i // batch_size + 1
|
||
print(f"[seed] Progress: {total_inserted:,}/{len(rows):,} rows "
|
||
f"({batch_num}/{n_batches} batches)")
|
||
|
||
return total_inserted
|
||
|
||
|
||
def _wait_for_clickhouse(host, port, user, password, timeout_s=60):
|
||
"""Wait for ClickHouse to be ready."""
|
||
url = (
|
||
f"http://{host}:{port}/"
|
||
f"?query=SELECT+1"
|
||
f"&user={urllib.parse.quote(user)}"
|
||
f"&password={urllib.parse.quote(password)}"
|
||
)
|
||
deadline = time.monotonic() + timeout_s
|
||
while time.monotonic() < deadline:
|
||
try:
|
||
with urllib.request.urlopen(url, timeout=5) as r:
|
||
if r.read().strip() == b"1":
|
||
return True
|
||
except Exception:
|
||
pass
|
||
time.sleep(2)
|
||
return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
|
||
|
||
def main():
    """Command-line entry point: generate synthetic traffic and insert it.

    Steps: parse arguments, wait for ClickHouse (skipped with --dry-run),
    load the CSV reference data, generate the four traffic classes
    (browsers, scanners, legitimate bots, AI bots), shuffle them together
    and insert them. Exits with status 1 when ClickHouse is unreachable or
    fewer than 90% of the rows were inserted.
    """
    cli = argparse.ArgumentParser(
        description="Seed ClickHouse with synthetic traffic")
    cli.add_argument("--host", default="clickhouse")
    cli.add_argument("--port", type=int, default=8123)
    cli.add_argument("--user", default="default")
    cli.add_argument("--password", default="")
    cli.add_argument("--dry-run", action="store_true",
                     help="Generate data but do not insert")
    cli.add_argument("--rows", type=int, default=500_000,
                     help="Total rows to generate (default: 500000)")
    cli.add_argument("--ips", type=int, default=20_000,
                     help="Total unique IPs (default: 20000)")
    cli.add_argument("--seed", type=int, default=None,
                     help="Random seed for reproducibility")
    cli.add_argument("--data-dir", default=None,
                     help="Path to CSV data directory "
                          "(browser_ja4.csv, bot_ip.csv)")
    args = cli.parse_args()

    rng = random.Random(args.seed)

    if not args.dry_run:
        print(f"[seed] Waiting for ClickHouse at {args.host}:{args.port}…")
        reachable = _wait_for_clickhouse(
            args.host, args.port, args.user, args.password)
        if not reachable:
            print("[seed] FATAL: ClickHouse not reachable after 60s")
            raise SystemExit(1)
        print("[seed] ClickHouse ready.")

    started = time.monotonic()

    # --- Load CSV data ---
    data_dir = _find_data_dir(args.data_dir)
    if not data_dir:
        print("[seed] WARNING: CSV data directory not found, "
              "using fallback data")
    else:
        print(f"[seed] Loading CSV data from {data_dir}")

    browser_ja4s = load_browser_ja4s(data_dir)
    bot_ips_by_family = load_bot_ips(data_dir)

    # Flatten every family's JA4 list into one set so the synthetic bot
    # fingerprints can be generated against it.
    known_browser_ja4s = {
        ja4 for family_ja4s in browser_ja4s.values() for ja4 in family_ja4s
    }

    if browser_ja4s:
        ja4_total = sum(map(len, browser_ja4s.values()))
        print(f"[seed] Loaded {ja4_total} browser JA4s "
              f"across {len(browser_ja4s)} families")
    if bot_ips_by_family:
        cidr_total = sum(map(len, bot_ips_by_family.values()))
        print(f"[seed] Loaded {cidr_total} bot CIDRs "
              f"across {len(bot_ips_by_family)} families")

    bot_ja4s = _generate_bot_ja4s(known_browser_ja4s, rng)

    # --- IP and row budgets ---
    # The last class in each split takes the remainder so the parts always
    # add up to the requested total despite int() truncation.
    n_browser_ips = int(args.ips * 0.70)
    n_scanner_ips = int(args.ips * 0.15)
    n_legit_bot_ips = int(args.ips * 0.10)
    n_ai_bot_ips = args.ips - (n_browser_ips + n_scanner_ips
                               + n_legit_bot_ips)

    n_browser_rows = int(args.rows * 0.70)
    n_scanner_rows = int(args.rows * 0.20)
    n_legit_bot_rows = int(args.rows * 0.06)
    n_ai_bot_rows = args.rows - (n_browser_rows + n_scanner_rows
                                 + n_legit_bot_rows)

    print(f"[seed] Generating {args.rows:,} rows from {args.ips:,} IPs…")

    def report(label, generated):
        """Print the row count and distinct source-IP count for one class."""
        distinct_ips = len({entry["src_ip"] for entry in generated})
        print(f"[seed] {label} {len(generated):>7,} rows "
              f"({distinct_ips:,} IPs)")

    # --- Generate ---
    browser_rows = generate_browser_traffic(
        n_browser_ips, n_browser_rows, browser_ja4s, rng)
    report("Browser:", browser_rows)

    scanner_rows = generate_scanner_traffic(
        n_scanner_ips, n_scanner_rows, bot_ja4s, rng)
    report("Scanner:", scanner_rows)

    legit_bot_rows = generate_legit_bot_traffic(
        n_legit_bot_ips, n_legit_bot_rows,
        bot_ips_by_family, bot_ja4s, rng)
    report("Legit bots:", legit_bot_rows)

    ai_bot_rows = generate_ai_bot_traffic(
        n_ai_bot_ips, n_ai_bot_rows,
        bot_ips_by_family, bot_ja4s, rng)
    report("AI bots:", ai_bot_rows)

    all_rows = [*browser_rows, *scanner_rows, *legit_bot_rows, *ai_bot_rows]
    rng.shuffle(all_rows)

    gen_elapsed = time.monotonic() - started
    print(f"[seed] Generated {len(all_rows):,} total rows "
          f"in {gen_elapsed:.1f}s")

    # --- Insert ---
    inserted = _ch_insert(
        all_rows, args.host, args.port, args.user, args.password,
        batch_size=2000, dry_run=args.dry_run,
    )

    elapsed = time.monotonic() - started
    print(f"[seed] Done: {inserted:,}/{len(all_rows):,} rows inserted "
          f"in {elapsed:.1f}s")

    if inserted < len(all_rows) * 0.9:
        print("[seed] WARNING: fewer than 90% of rows inserted — "
              "check errors above")
        raise SystemExit(1)

    print("[seed] The bot_detector should now see ≥ 500 human sessions "
          "in view_ai_features_1h (after MV propagation).")
|
||
|
||
|
||
# Run the seeder only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|