Files
ja4-platform/tests/integration/traffic-gen/generate_traffic.py
toto fc882dd3e7 feat(tests): realistic traffic seeder + IP diversity via mod_remoteip
Option A — X-Forwarded-For + mod_remoteip:
- httpd-integration.conf: load mod_remoteip, trust all Docker RFC-1918
  subnets (172/192.168/10). mod_reqin_log uses r->useragent_ip which
  mod_remoteip updates from XFF → each request logged with distinct src_ip
- generate_traffic.py: XFF always set (was 30% only); human scenarios
  use 91.121/78.41/90.x ranges, bot scenarios use 185.220/45.155/193.32;
  pool of 1168 human IPs and 180 bot IPs; default --requests 500

Option D — Direct ClickHouse seeder (seed_clickhouse.py, stdlib only):
- Inserts ~4000 rows into http_logs_raw triggering full MV chain:
    http_logs_raw → mv_http_logs → http_logs
                 → mv_agg_host_ip_ja4_1h → agg_host_ip_ja4_1h
  • 720 human sessions: IPs in OVH/SFR/Orange ASN ranges (16276/15557/3215)
    → dict_asn_reputation maps these to asn_label='human'
    → satisfies bot_detector human_baseline >= 500 threshold
  • 150 scanner sessions: datacenter IPs, attack paths (/.env, wp-login,
    SQLi, path traversal), scanner UAs, minimal TCP fingerprints
  • 100 known-bot sessions: IPs matching bot_ip.csv entries
  • 20 brute-force clusters: 20-50 POST /login per IP
  All TCP/TLS metadata is profile-realistic (window, MSS, TTL, JA4, JA3)

CSV stubs (mounted at /var/lib/clickhouse/user_files/):
- iplocate-ip-to-asn.csv: 13 CIDR→ASN mappings (OVH/SFR/Orange/Tor/Contabo)
- asn_reputation.csv: 13 ASN→label (8 'human', 3 'datacenter'/'hosting')
- bot_ip.csv: 14 known scanner/Tor IPs (Shodan, Censys, Tor exits)
- bot_ja4.csv: 5 bot JA4 fingerprints (curl, python-requests, masscan, zgrab)

run-tests.sh:
- Phase 4a: seeder runs before live traffic (ensures bot_detector baseline)
- Phase 4b: live traffic gen at 500 requests (up from 200)
- Phase 5f: new assertions — agg_host_ip_ja4_1h populated, ≥500 human
  rows in view_ai_features_1h, known-bot labels present
- Phase 7: verifies ml_all_scores populated (bot_detector ran a cycle)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 11:35:34 +02:00

527 lines
19 KiB
Python

#!/usr/bin/env python3
"""
generate_traffic.py — Realistic HTTP/HTTPS traffic generator for integration tests
Simulates varied web traffic including:
- Multiple browser User-Agents (Chrome, Firefox, Safari, Edge)
- Bot / crawler traffic (Googlebot, Bingbot, curl, wget, python-requests)
- Multiple HTTP methods (GET, POST, PUT, DELETE, HEAD, OPTIONS, PATCH)
- Varied paths, query strings, form data, JSON payloads
- Both HTTP (port 80) and HTTPS (port 443)
- Different Accept/Language/Encoding headers
- Optional Cookie / Referer headers; X-Forwarded-For is always set, which ensures
src_ip diversity in ClickHouse via mod_remoteip (r->useragent_ip updated from XFF)
- Multiple SSL contexts to vary TLS ClientHello parameters
Usage:
python generate_traffic.py [--host platform] [--http-port 80] [--https-port 443]
[--requests 500] [--workers 10]
"""
import argparse
import concurrent.futures
import json
import random
import ssl
import time
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from typing import Optional
# ---------------------------------------------------------------------------
# Realistic data pools
# ---------------------------------------------------------------------------
# User-Agent strings of desktop and mobile browsers.
# build_scenarios() draws from this pool for the browser-like scenarios.
BROWSERS = [
    # Chrome 120 Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome 118 Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    # Firefox 121 Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Firefox 120 Linux
    "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    # Safari 17 macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    # Edge 120 Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    # Chrome Android
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.115 Mobile Safari/537.36",
    # Safari iPhone
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1",
]
# User-Agent strings of crawlers, command-line HTTP clients and scanners.
# build_scenarios() draws from this pool for the bot scenarios, and combines
# it with BROWSERS for the mixed POST-form and HEAD scenarios.
BOTS = [
    "Googlebot/2.1 (+http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Twitterbot/1.0",
    "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
    "curl/7.88.1",
    "python-requests/2.31.0",
    "wget/1.21.3",
    "Wget/1.21 (linux-gnu)",
    "Go-http-client/1.1",
    "Java/11.0.18",
    "masscan/1.3 (https://github.com/robertdavidgraham/masscan)",
    "zgrab/0.x",
    "libwww-perl/6.72",
]
# Request paths: ordinary pages, API endpoints, static assets, and probe-style
# targets (/.env, /.git/HEAD, path traversal, XSS payload).
# NOTE: a few entries already embed a query string ("?...").
PATHS = [
    "/",
    "/health",
    "/index.html",
    "/index.php",
    "/login",
    "/api/v1/users",
    "/api/v1/status",
    "/api/v2/metrics",
    "/admin",
    "/admin/login",
    "/.env",
    "/.git/HEAD",
    "/wp-login.php",
    "/wp-admin/",
    "/phpmyadmin/",
    "/xmlrpc.php",
    "/robots.txt",
    "/sitemap.xml",
    "/favicon.ico",
    "/static/js/app.js",
    "/static/css/main.css",
    "/images/logo.png",
    "/api/search?q=test&limit=10",
    "/api/search?q=<script>alert(1)</script>",
    "/api/users?page=1&per_page=20&sort=created_at",
    "/download?file=../../../etc/passwd",
    "/cgi-bin/test.cgi",
]
# Query-string suffixes that build_scenarios() adds to a path for the browser
# GET scenarios. The empty string means "no query string"; every other entry
# starts with "?".
QUERY_PARAMS = [
    "",
    "?id=1",
    "?id=1+OR+1%3D1",
    "?debug=true",
    "?lang=fr",
    "?ref=google",
    "?utm_source=newsletter&utm_medium=email&utm_campaign=spring2024",
    "?token=eyJhbGciOiJIUzI1NiJ9.dGVzdA.abc",
    "?callback=jsonp_callback",
    "?page=1&limit=100&sort=-created_at",
]
# Candidate values for the Accept-Language header (one is picked per request
# by _random_headers()).
ACCEPT_LANGS = [
    "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
    "en-US,en;q=0.9",
    "de-DE,de;q=0.9,en;q=0.8",
    "ja-JP,ja;q=0.9,en-US;q=0.8",
    "zh-CN,zh;q=0.9",
    "es-ES,es;q=0.9,en;q=0.8",
    "*",
]
# Candidate values for the Referer header. _random_headers() omits the header
# entirely when the empty string is drawn.
REFERERS = [
    "",
    "https://www.google.com/search?q=test",
    "https://www.bing.com/search?q=example",
    "https://t.co/abc123",
    "https://www.facebook.com/",
    "https://example.com/page",
]
# Candidate values for the Sec-Fetch-Mode / -Dest / -Site headers, which
# _random_headers() only adds to non-bot requests. The three values are drawn
# independently of each other.
SEC_FETCH_MODES = ["navigate", "cors", "no-cors", "same-origin", "websocket"]
SEC_FETCH_DESTS = ["document", "script", "style", "image", "fetch", "empty"]
SEC_FETCH_SITES = ["none", "same-origin", "same-site", "cross-site"]
# Pre-serialized JSON request bodies for the "post-json-https" scenario.
JSON_BODIES = [
    '{"username":"admin","password":"password123"}',
    '{"query":"SELECT * FROM users","limit":100}',
    '{"email":"test@example.com","action":"subscribe"}',
    '{"data":{"key":"value","nested":{"array":[1,2,3]}}}',
]
# URL-encoded form bodies for the "post-form-http" scenario (POST /login).
FORM_BODIES = [
    "username=admin&password=admin",
    "email=test%40example.com&message=Hello+World",
    "q=test+query&submit=Search",
]
# ---------------------------------------------------------------------------
# IP pools for X-Forwarded-For (mod_remoteip uses this as src_ip in logs)
# Ranges must match iplocate-ip-to-asn.csv entries so ASN lookup succeeds.
#
# HUMAN — residential ISP ranges → asn_label='human' → feeds ML baseline
HUMAN_IPS = []
# OVH FR (ASN 16276) — 91.121.0.0/16
HUMAN_IPS.extend(
    f"91.121.{third}.{fourth}"
    for third in range(12)
    for fourth in range(1, 60)
)
# SFR FR (ASN 15557) — 78.41.0.0/16
HUMAN_IPS.extend(
    f"78.41.{third}.{fourth}"
    for third in range(4)
    for fourth in range(1, 40)
)
# Orange FR (ASN 3215) — 90.x.x.x
HUMAN_IPS.extend(
    f"90.{second}.{third}.{fourth}"
    for second in range(10, 14)
    for third in range(4)
    for fourth in range(1, 20)
)
# Shuffled in place once at import time; the XFF_IPS alias below therefore
# takes 20 randomly ordered human addresses.
random.shuffle(HUMAN_IPS)

# DATACENTER/BOT — scanner/Tor ranges → asn_label='datacenter' → ML scores these
BOT_IPS = [
    f"{prefix}.{last_octet}"
    for prefix, highest in (
        ("185.220.101", 100),  # Tor exits / Accelerated-IT (ASN 210644)
        ("45.155.205", 50),    # Contabo scanner (ASN 209083)
        ("193.32.162", 30),    # Reg.ru (ASN 197695)
    )
    for last_octet in range(1, highest + 1)
]

# Legacy alias (kept for existing code): 20 human + 10 bot addresses.
XFF_IPS = HUMAN_IPS[:20] + BOT_IPS[:10]
# ---------------------------------------------------------------------------
# SSL context variants — different cipher/protocol settings produce different
# TLS ClientHello messages (and thus different JA4/JA3 fingerprints).
# ---------------------------------------------------------------------------
def make_ssl_contexts():
    """Return ``(name, ssl.SSLContext)`` pairs with differing TLS settings.

    Every returned context has hostname checking and certificate
    verification switched off. The "default" variant is always first; the
    "tls12", "tls13" and "few_ciphers" variants are best-effort and are left
    out, without any message, when configuring them raises.
    """

    def _unverified(ctx):
        # check_hostname has to be cleared before verify_mode may be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        return ctx

    def _tls12_only():
        ctx = _unverified(ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT))
        ctx.maximum_version = ssl.TLSVersion.TLSv1_2
        ctx.minimum_version = ssl.TLSVersion.TLSv1_2
        return ctx

    def _tls13_only():
        ctx = _unverified(ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT))
        ctx.minimum_version = ssl.TLSVersion.TLSv1_3
        return ctx

    def _reduced_ciphers():
        ctx = _unverified(ssl.create_default_context())
        ctx.set_ciphers("AES128-GCM-SHA256:AES256-GCM-SHA384")
        return ctx

    # The OS-default context is mandatory: a failure here propagates.
    variants = [("default", _unverified(ssl.create_default_context()))]

    optional_variants = (
        ("tls12", _tls12_only),
        ("tls13", _tls13_only),
        ("few_ciphers", _reduced_ciphers),
    )
    for name, factory in optional_variants:
        try:
            variants.append((name, factory()))
        except Exception:
            # Variant not supported by the local ssl build — skip it.
            continue
    return variants


# Built once at import time and shared by every scenario.
SSL_CONTEXTS = make_ssl_contexts()
# ---------------------------------------------------------------------------
# Request builder
# ---------------------------------------------------------------------------
@dataclass
class RequestScenario:
    """One fully specified HTTP(S) request for the traffic generator to send."""

    # HTTP verb, e.g. "GET" or "POST".
    method: str
    # Absolute URL, including scheme, host and port.
    url: str
    # Header name → value mapping handed to urllib.request.Request.
    headers: dict
    # Encoded request body; None for requests that carry no body.
    body: Optional[bytes] = field(default=None)
    # TLS context handed to urlopen(); the plain-HTTP scenarios leave it None.
    ssl_ctx: Optional[ssl.SSLContext] = field(default=None)
    # Scenario name used for the per-label breakdown printed by run().
    label: str = field(default="")
def _random_headers(ua: str, is_bot: bool = False, xff_ip: Optional[str] = None) -> dict:
    """Build a randomized header dict for one simulated client.

    Args:
        ua: Value for the ``User-Agent`` header.
        is_bot: When true, the browser-only ``Sec-Fetch-*`` headers are never
            added, and a missing ``xff_ip`` is drawn from ``BOT_IPS`` instead
            of ``HUMAN_IPS``.
        xff_ip: Address for ``X-Forwarded-For``. ``None`` or an empty string
            selects a random address from the pool matching ``is_bot``.

    Returns:
        A new dict that always contains User-Agent, Accept, Accept-Encoding,
        Accept-Language, Connection and X-Forwarded-For, plus a random subset
        of Sec-Fetch-*, Referer, Cache-Control and Cookie.
    """
    headers = {
        "User-Agent": ua,
        "Accept": random.choice([
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "application/json, text/plain, */*",
            "*/*",
            "text/html,application/xhtml+xml,*/*;q=0.8",
        ]),
        "Accept-Encoding": random.choice([
            "gzip, deflate, br",
            "gzip, deflate",
            "identity",
            "br;q=1.0, gzip;q=0.8",
        ]),
        "Accept-Language": random.choice(ACCEPT_LANGS),
        "Connection": random.choice(["keep-alive", "close"]),
        # X-Forwarded-For: always set so mod_remoteip gives each request a
        # distinct src_ip in the ClickHouse pipeline (r->useragent_ip).
        "X-Forwarded-For": xff_ip or random.choice(BOT_IPS if is_bot else HUMAN_IPS),
    }

    # Sec-Fetch headers (browsers only, ~70% of requests).
    if not is_bot and random.random() < 0.7:
        headers["Sec-Fetch-Mode"] = random.choice(SEC_FETCH_MODES)
        headers["Sec-Fetch-Dest"] = random.choice(SEC_FETCH_DESTS)
        headers["Sec-Fetch-Site"] = random.choice(SEC_FETCH_SITES)

    # Referer: REFERERS contains "" to represent "no Referer header".
    referer = random.choice(REFERERS)
    if referer:
        headers["Referer"] = referer

    # Cache headers (~40% of requests).
    if random.random() < 0.4:
        headers["Cache-Control"] = random.choice(["no-cache", "max-age=0", "no-store"])

    # Cookie (~20% of requests): random 128-bit session id as 32 hex digits.
    if random.random() < 0.2:
        session_id = f"{random.getrandbits(128):032x}"
        lang = random.choice(['fr', 'en', 'de'])
        headers["Cookie"] = f"session={session_id}; lang={lang}"

    return headers
def _append_query(path: str, query: str) -> str:
    """Append a ``?key=value`` suffix to *path*, keeping a single ``?`` in the result."""
    if not query:
        return path
    if "?" in path:
        # Some PATHS entries already carry a query string; chain with '&'
        # instead of emitting a second '?'.
        return f"{path}&{query.lstrip('?')}"
    return f"{path}{query}"


def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> list:
    """Build a shuffled list of ``count`` varied request scenarios.

    The mix is expressed as fractions of ``count`` (each truncated to an int):
    30% browser HTTPS GET, 10% browser HTTP GET, 15% bot HTTPS GET, 5% bot
    HTTP GET, 15% JSON POST over HTTPS, 5% form POST over HTTP, 5% HEAD,
    5% PUT/PATCH, 2% DELETE and 3% OPTIONS. Whatever is left to reach
    ``count`` is filled with browser HTTPS GETs on ``/health``.

    Args:
        host: Target host name used in every URL.
        http_port: Port for the plain-HTTP scenarios.
        https_port: Port for the HTTPS scenarios.
        count: Total number of scenarios to return.

    Returns:
        A list of ``RequestScenario`` objects in random order, at most
        ``count`` long.
    """
    scenarios = []
    base_http = f"http://{host}:{http_port}"
    base_https = f"https://{host}:{https_port}"

    # --- Browser-like HTTPS GET requests (most common) ---
    for _ in range(int(count * 0.30)):
        ua = random.choice(BROWSERS)
        path = random.choice(PATHS)
        qs = random.choice(QUERY_PARAMS)
        ssl_name, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method="GET",
            url=f"{base_https}{_append_query(path, qs)}",
            headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
            ssl_ctx=ssl_ctx,
            label=f"browser-https-{ssl_name}",
        ))

    # --- Browser-like HTTP GET requests ---
    for _ in range(int(count * 0.10)):
        ua = random.choice(BROWSERS)
        path = random.choice(PATHS)
        qs = random.choice(QUERY_PARAMS)
        scenarios.append(RequestScenario(
            method="GET",
            url=f"{base_http}{_append_query(path, qs)}",
            headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
            label="browser-http",
        ))

    # --- Bot / crawler HTTPS requests ---
    for _ in range(int(count * 0.15)):
        ua = random.choice(BOTS)
        path = random.choice(PATHS)
        ssl_name, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method="GET",
            url=f"{base_https}{path}",
            headers=_random_headers(ua, is_bot=True, xff_ip=random.choice(BOT_IPS)),
            ssl_ctx=ssl_ctx,
            label=f"bot-https-{ssl_name}",
        ))

    # --- Bot HTTP requests ---
    for _ in range(int(count * 0.05)):
        ua = random.choice(BOTS)
        path = random.choice(PATHS)
        scenarios.append(RequestScenario(
            method="GET",
            url=f"{base_http}{path}",
            headers=_random_headers(ua, is_bot=True, xff_ip=random.choice(BOT_IPS)),
            label="bot-http",
        ))

    # --- POST HTTPS with JSON body ---
    for _ in range(int(count * 0.15)):
        ua = random.choice(BROWSERS)
        body = random.choice(JSON_BODIES).encode()
        hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
        hdrs["Content-Type"] = "application/json"
        hdrs["Content-Length"] = str(len(body))
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        target = random.choice(['/login', '/api/v1/users', '/api/v2/metrics', '/health'])
        scenarios.append(RequestScenario(
            method="POST",
            url=f"{base_https}{target}",
            headers=hdrs,
            body=body,
            ssl_ctx=ssl_ctx,
            label="post-json-https",
        ))

    # --- POST HTTP with form data ---
    # NOTE(review): the UA may be a browser or a bot, is_bot stays False and
    # the address always comes from BOT_IPS — confirm this mix is intended.
    for _ in range(int(count * 0.05)):
        ua = random.choice(BROWSERS + BOTS)
        body = random.choice(FORM_BODIES).encode()
        hdrs = _random_headers(ua, xff_ip=random.choice(BOT_IPS))
        hdrs["Content-Type"] = "application/x-www-form-urlencoded"
        hdrs["Content-Length"] = str(len(body))
        scenarios.append(RequestScenario(
            method="POST",
            url=f"{base_http}/login",
            headers=hdrs,
            body=body,
            label="post-form-http",
        ))

    # --- HEAD requests ---
    # NOTE(review): the UA may be a bot UA while the address comes from
    # HUMAN_IPS and is_bot stays False — confirm this mix is intended.
    for _ in range(int(count * 0.05)):
        ua = random.choice(BROWSERS + BOTS)
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method="HEAD",
            url=f"{base_https}{random.choice(PATHS)}",
            headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
            ssl_ctx=ssl_ctx,
            label="head-https",
        ))

    # --- PUT / PATCH ---
    for _ in range(int(count * 0.05)):
        ua = random.choice(BROWSERS)
        body = json.dumps({"id": random.randint(1, 999), "value": "updated"}).encode()
        hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
        hdrs["Content-Type"] = "application/json"
        hdrs["Content-Length"] = str(len(body))
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method=random.choice(["PUT", "PATCH"]),
            url=f"{base_https}/api/v1/users/{random.randint(1, 999)}",
            headers=hdrs,
            body=body,
            ssl_ctx=ssl_ctx,
            label="put-patch-https",
        ))

    # --- DELETE ---
    for _ in range(int(count * 0.02)):
        ua = random.choice(BROWSERS)
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method="DELETE",
            url=f"{base_https}/api/v1/users/{random.randint(1, 999)}",
            headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
            ssl_ctx=ssl_ctx,
            label="delete-https",
        ))

    # --- OPTIONS (CORS preflight) ---
    for _ in range(int(count * 0.03)):
        ua = random.choice(BROWSERS)
        hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
        hdrs["Origin"] = random.choice(["https://app.example.com", "http://localhost:3000"])
        hdrs["Access-Control-Request-Method"] = random.choice(["POST", "PUT", "DELETE"])
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        target = random.choice(['/api/v1/users', '/api/v2/metrics'])
        scenarios.append(RequestScenario(
            method="OPTIONS",
            url=f"{base_https}{target}",
            headers=hdrs,
            ssl_ctx=ssl_ctx,
            label="options-cors",
        ))

    # Fill remaining with browser HTTPS GETs (the fractions above sum to 95%
    # and each is truncated, so there is usually a remainder).
    while len(scenarios) < count:
        ua = random.choice(BROWSERS)
        _, ssl_ctx = random.choice(SSL_CONTEXTS)
        scenarios.append(RequestScenario(
            method="GET",
            url=f"{base_https}/health?filler={random.randint(1, 9999)}",
            headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
            ssl_ctx=ssl_ctx,
            label="filler-https",
        ))

    random.shuffle(scenarios)
    return scenarios[:count]
# ---------------------------------------------------------------------------
# Executor
# ---------------------------------------------------------------------------
# NOTE(review): not read or updated anywhere in the code visible here — run()
# keeps its own local counters. Confirm nothing external uses it before removing.
stats = {"ok": 0, "err": 0, "by_label": {}}
def send_request(scenario: RequestScenario, timeout: float = 5) -> dict:
    """Send one request and describe the outcome; never raises.

    Args:
        scenario: The request to send (method, URL, headers, optional body
            and TLS context).
        timeout: Timeout in seconds passed to ``urlopen``. Defaults to 5.

    Returns:
        ``{"ok": True, "status": <int>, "label": ..., "ms": ...}`` whenever an
        HTTP response was received (4xx/5xx included), otherwise
        ``{"ok": False, "error": <message, max 80 chars>, "label": ..., "ms": ...}``.
        ``ms`` is the elapsed wall time in whole milliseconds.
    """
    started = time.monotonic()

    def _elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    try:
        req = urllib.request.Request(
            url=scenario.url,
            data=scenario.body,
            method=scenario.method,
            headers=scenario.headers,
        )
        with urllib.request.urlopen(req, context=scenario.ssl_ctx, timeout=timeout) as resp:
            resp.read(4096)  # consume partial body
            return {"ok": True, "status": resp.status, "label": scenario.label,
                    "ms": _elapsed_ms()}
    except urllib.error.HTTPError as e:
        # HTTP errors (4xx/5xx) are still valid responses — Apache served them.
        status = e.code
        # HTTPError wraps the underlying response; close it explicitly so the
        # connection is released instead of waiting for garbage collection.
        e.close()
        return {"ok": True, "status": status, "label": scenario.label,
                "ms": _elapsed_ms()}
    except Exception as e:
        # Connection, TLS, timeout or URL errors: report instead of raising so
        # one failed request cannot abort the worker pool.
        return {"ok": False, "error": str(e)[:80], "label": scenario.label,
                "ms": _elapsed_ms()}
def run(host: str, http_port: int, https_port: int, total: int, workers: int):
    """Generate ``total`` scenarios, send them concurrently and print a summary.

    Requests are dispatched on a thread pool of ``workers`` threads; a warning
    line is printed for each request that got no HTTP response.

    Returns:
        True when no request failed, or when more than 80% of them succeeded.
    """
    scenarios = build_scenarios(host, http_port, https_port, total)
    print(f"[traffic-gen] Sending {len(scenarios)} requests to {host} "
          f"(http:{http_port} https:{https_port}) with {workers} workers")

    per_label: dict = {}
    succeeded = 0
    failed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        pending = [executor.submit(send_request, item) for item in scenarios]
        # Handle results in completion order, not submission order.
        for finished in concurrent.futures.as_completed(pending):
            outcome = finished.result()
            name = outcome.get("label", "?")
            per_label[name] = per_label.get(name, 0) + 1
            if not outcome["ok"]:
                failed += 1
                print(f"[traffic-gen] WARN {name}: {outcome.get('error','?')}")
                continue
            succeeded += 1

    print(f"[traffic-gen] Done: {succeeded} OK, {failed} errors")
    print("[traffic-gen] Breakdown by scenario:")
    for name in sorted(per_label):
        print(f" {name:35s} {per_label[name]:4d}")

    if failed == 0:
        return True
    return (succeeded / (succeeded + failed)) > 0.8
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Realistic traffic generator")
    cli.add_argument("--host", default="platform")
    # Every remaining option is an integer; declare them from one table.
    for flag, fallback in (
        ("--http-port", 80),
        ("--https-port", 443),
        ("--requests", 500),
        ("--workers", 10),
    ):
        cli.add_argument(flag, type=int, default=fallback)
    opts = cli.parse_args()

    passed = run(opts.host, opts.http_port, opts.https_port, opts.requests, opts.workers)
    # Exit status 0 when run() reports success, 1 otherwise.
    raise SystemExit(0 if passed else 1)