feat(tests): realistic traffic seeder + IP diversity via mod_remoteip
Option A — X-Forwarded-For + mod_remoteip:
- httpd-integration.conf: load mod_remoteip and trust all Docker RFC-1918
subnets (172.16/12, 192.168/16, 10/8). mod_reqin_log reads r->useragent_ip, which
mod_remoteip rewrites from X-Forwarded-For (XFF) → each request is logged with the src_ip carried in its XFF header
- generate_traffic.py: XFF is now always set (previously on only ~30% of requests); human scenarios
use the 91.121/78.41/90.x ranges, bot scenarios use 185.220/45.155/193.32;
pools of 1168 human IPs and 180 bot IPs; default --requests raised to 500 (from 200)
Option D — Direct ClickHouse seeder (seed_clickhouse.py, stdlib only):
- Inserts ~4000 rows into http_logs_raw triggering full MV chain:
http_logs_raw → mv_http_logs → http_logs
→ mv_agg_host_ip_ja4_1h → agg_host_ip_ja4_1h
• 720 human sessions: IPs in OVH/SFR/Orange ASN ranges (16276/15557/3215)
→ dict_asn_reputation maps these to asn_label='human'
→ satisfies bot_detector human_baseline >= 500 threshold
• 150 scanner sessions: datacenter IPs, attack paths (/.env, wp-login,
SQLi, path traversal), scanner UAs, minimal TCP fingerprints
• 100 known-bot sessions: IPs matching bot_ip.csv entries
• 20 brute-force clusters: 20-50 POST /login per IP
All TCP/TLS metadata is profile-realistic (window, MSS, TTL, JA4, JA3)
CSV stubs (mounted at /var/lib/clickhouse/user_files/):
- iplocate-ip-to-asn.csv: 13 CIDR→ASN mappings (OVH/SFR/Orange/Tor/Contabo)
- asn_reputation.csv: 13 ASN→label (8 'human', 3 'datacenter'/'hosting')
- bot_ip.csv: 14 known scanner/Tor IPs (Shodan, Censys, Tor exits)
- bot_ja4.csv: 5 bot JA4 fingerprints (curl, python-requests, masscan, zgrab)
run-tests.sh:
- Phase 4a: seeder runs before live traffic (ensures bot_detector baseline)
- Phase 4b: live traffic gen at 500 requests (up from 200)
- Phase 5f: new assertions — agg_host_ip_ja4_1h populated, ≥500 human
rows in view_ai_features_1h, known-bot labels present
- Phase 7: verifies ml_all_scores populated (bot_detector ran a cycle)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -9,13 +9,13 @@ Simulates varied web traffic including:
|
||||
- Varied paths, query strings, form data, JSON payloads
|
||||
- Both HTTP (port 80) and HTTPS (port 443)
|
||||
- Different Accept/Language/Encoding headers
|
||||
- Cookie / Referer / X-Forwarded-For variations
|
||||
- Burst mode and sequential scenarios
|
||||
- Cookie / Referer / X-Forwarded-For always set — ensures src_ip diversity
|
||||
in ClickHouse via mod_remoteip (r->useragent_ip updated from XFF)
|
||||
- Multiple SSL contexts to vary TLS ClientHello parameters
|
||||
|
||||
Usage:
|
||||
python generate_traffic.py [--host platform] [--http-port 80] [--https-port 443]
|
||||
[--requests 200] [--workers 10] [--scenario all]
|
||||
[--requests 500] [--workers 10] [--scenario all]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@ -148,14 +148,34 @@ FORM_BODIES = [
|
||||
"q=test+query&submit=Search",
|
||||
]
|
||||
|
||||
XFF_IPS = [
|
||||
"1.2.3.4",
|
||||
"192.168.1.100",
|
||||
"10.0.0.1",
|
||||
"203.0.113.42",
|
||||
"185.220.101.34", # Known Tor exit
|
||||
"45.155.205.233", # Scanning IP
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# IP pools for X-Forwarded-For (mod_remoteip uses this as src_ip in logs)
|
||||
# Ranges must match iplocate-ip-to-asn.csv entries so ASN lookup succeeds.
|
||||
#
|
||||
# HUMAN — residential ISP ranges → asn_label='human' → feeds ML baseline
|
||||
HUMAN_IPS = (
|
||||
# OVH FR (ASN 16276) — 91.121.0.0/16
|
||||
[f"91.121.{o3}.{o4}" for o3 in range(0, 12) for o4 in range(1, 60)]
|
||||
# SFR FR (ASN 15557) — 78.41.0.0/16
|
||||
+ [f"78.41.{o3}.{o4}" for o3 in range(0, 4) for o4 in range(1, 40)]
|
||||
# Orange FR (ASN 3215) — 90.x.x.x
|
||||
+ [f"90.{o2}.{o3}.{o4}" for o2 in range(10, 14) for o3 in range(0, 4) for o4 in range(1, 20)]
|
||||
)
|
||||
random.shuffle(HUMAN_IPS)
|
||||
|
||||
# DATACENTER/BOT — scanner/Tor ranges → asn_label='datacenter' → ML scores these
|
||||
BOT_IPS = (
|
||||
# Tor exits / Accelerated-IT (ASN 210644) — 185.220.101.x
|
||||
[f"185.220.101.{i}" for i in range(1, 101)]
|
||||
# Contabo scanner (ASN 209083) — 45.155.205.x
|
||||
+ [f"45.155.205.{i}" for i in range(1, 51)]
|
||||
# Reg.ru (ASN 197695) — 193.32.162.x
|
||||
+ [f"193.32.162.{i}" for i in range(1, 31)]
|
||||
)
|
||||
|
||||
# Legacy alias (kept for existing code)
|
||||
XFF_IPS = HUMAN_IPS[:20] + BOT_IPS[:10]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -221,7 +241,7 @@ class RequestScenario:
|
||||
label: str = ""
|
||||
|
||||
|
||||
def _random_headers(ua: str, is_bot: bool = False) -> dict:
|
||||
def _random_headers(ua: str, is_bot: bool = False, xff_ip: str = None) -> dict:
|
||||
headers = {
|
||||
"User-Agent": ua,
|
||||
"Accept": random.choice([
|
||||
@ -238,6 +258,11 @@ def _random_headers(ua: str, is_bot: bool = False) -> dict:
|
||||
]),
|
||||
"Accept-Language": random.choice(ACCEPT_LANGS),
|
||||
"Connection": random.choice(["keep-alive", "close"]),
|
||||
# X-Forwarded-For: always set so mod_remoteip gives each request a
|
||||
# distinct src_ip in the ClickHouse pipeline (r->useragent_ip).
|
||||
"X-Forwarded-For": xff_ip or (
|
||||
random.choice(BOT_IPS) if is_bot else random.choice(HUMAN_IPS)
|
||||
),
|
||||
}
|
||||
|
||||
# Sec-Fetch headers (browsers only)
|
||||
@ -251,10 +276,6 @@ def _random_headers(ua: str, is_bot: bool = False) -> dict:
|
||||
if ref:
|
||||
headers["Referer"] = ref
|
||||
|
||||
# X-Forwarded-For sometimes (proxy simulation)
|
||||
if random.random() < 0.3:
|
||||
headers["X-Forwarded-For"] = random.choice(XFF_IPS)
|
||||
|
||||
# Cache headers
|
||||
if random.random() < 0.4:
|
||||
headers["Cache-Control"] = random.choice(["no-cache", "max-age=0", "no-store"])
|
||||
@ -283,7 +304,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="GET",
|
||||
url=f"{base_https}{path}{qs}",
|
||||
headers=_random_headers(ua),
|
||||
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
|
||||
ssl_ctx=ssl_ctx,
|
||||
label=f"browser-https-{ssl_name}",
|
||||
))
|
||||
@ -296,7 +317,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="GET",
|
||||
url=f"{base_http}{path}{qs}",
|
||||
headers=_random_headers(ua),
|
||||
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
|
||||
label="browser-http",
|
||||
))
|
||||
|
||||
@ -308,7 +329,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="GET",
|
||||
url=f"{base_https}{path}",
|
||||
headers=_random_headers(ua, is_bot=True),
|
||||
headers=_random_headers(ua, is_bot=True, xff_ip=random.choice(BOT_IPS)),
|
||||
ssl_ctx=ssl_ctx,
|
||||
label=f"bot-https-{ssl_name}",
|
||||
))
|
||||
@ -320,7 +341,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="GET",
|
||||
url=f"{base_http}{path}",
|
||||
headers=_random_headers(ua, is_bot=True),
|
||||
headers=_random_headers(ua, is_bot=True, xff_ip=random.choice(BOT_IPS)),
|
||||
label="bot-http",
|
||||
))
|
||||
|
||||
@ -329,7 +350,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
ua = random.choice(BROWSERS)
|
||||
body_str = random.choice(JSON_BODIES)
|
||||
body = body_str.encode()
|
||||
hdrs = _random_headers(ua)
|
||||
hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
|
||||
hdrs["Content-Type"] = "application/json"
|
||||
hdrs["Content-Length"] = str(len(body))
|
||||
_, ssl_ctx = random.choice(SSL_CONTEXTS)
|
||||
@ -347,7 +368,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
ua = random.choice(BROWSERS + BOTS)
|
||||
body_str = random.choice(FORM_BODIES)
|
||||
body = body_str.encode()
|
||||
hdrs = _random_headers(ua)
|
||||
hdrs = _random_headers(ua, xff_ip=random.choice(BOT_IPS))
|
||||
hdrs["Content-Type"] = "application/x-www-form-urlencoded"
|
||||
hdrs["Content-Length"] = str(len(body))
|
||||
scenarios.append(RequestScenario(
|
||||
@ -365,7 +386,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="HEAD",
|
||||
url=f"{base_https}{random.choice(PATHS)}",
|
||||
headers=_random_headers(ua),
|
||||
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
|
||||
ssl_ctx=ssl_ctx,
|
||||
label="head-https",
|
||||
))
|
||||
@ -374,7 +395,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
for _ in range(int(count * 0.05)):
|
||||
ua = random.choice(BROWSERS)
|
||||
body = json.dumps({"id": random.randint(1, 999), "value": "updated"}).encode()
|
||||
hdrs = _random_headers(ua)
|
||||
hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
|
||||
hdrs["Content-Type"] = "application/json"
|
||||
hdrs["Content-Length"] = str(len(body))
|
||||
_, ssl_ctx = random.choice(SSL_CONTEXTS)
|
||||
@ -394,7 +415,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="DELETE",
|
||||
url=f"{base_https}/api/v1/users/{random.randint(1,999)}",
|
||||
headers=_random_headers(ua),
|
||||
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
|
||||
ssl_ctx=ssl_ctx,
|
||||
label="delete-https",
|
||||
))
|
||||
@ -402,7 +423,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
# --- OPTIONS (CORS preflight) ---
|
||||
for _ in range(int(count * 0.03)):
|
||||
ua = random.choice(BROWSERS)
|
||||
hdrs = _random_headers(ua)
|
||||
hdrs = _random_headers(ua, xff_ip=random.choice(HUMAN_IPS))
|
||||
hdrs["Origin"] = random.choice(["https://app.example.com", "http://localhost:3000"])
|
||||
hdrs["Access-Control-Request-Method"] = random.choice(["POST", "PUT", "DELETE"])
|
||||
_, ssl_ctx = random.choice(SSL_CONTEXTS)
|
||||
@ -421,7 +442,7 @@ def build_scenarios(host: str, http_port: int, https_port: int, count: int) -> l
|
||||
scenarios.append(RequestScenario(
|
||||
method="GET",
|
||||
url=f"{base_https}/health?filler={random.randint(1,9999)}",
|
||||
headers=_random_headers(ua),
|
||||
headers=_random_headers(ua, xff_ip=random.choice(HUMAN_IPS)),
|
||||
ssl_ctx=ssl_ctx,
|
||||
label="filler-https",
|
||||
))
|
||||
@ -497,7 +518,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--host", default="platform")
|
||||
parser.add_argument("--http-port", type=int, default=80)
|
||||
parser.add_argument("--https-port", type=int, default=443)
|
||||
parser.add_argument("--requests", type=int, default=200)
|
||||
parser.add_argument("--requests", type=int, default=500)
|
||||
parser.add_argument("--workers", type=int, default=10)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user