feat: CSV generation scripts, API filter params, enriched CSV stubs

- scripts/generate_bot_ip.py: download Tor exit nodes + curate scanner IPs (1353 entries)
- scripts/generate_bot_ja4.py: 31 bot JA4 fingerprints across 16 families
- scripts/generate_asn_data.py: 38 ASNs + 96 IP-to-ASN prefixes
- scripts/update-csv-data.sh: master orchestrator with --install-stubs
- api.py: add asn_org/country_code/ja4/bot_name filters on detections+scores
- pages.py: add /network route
- csv-stubs: enriched with generated data (Tor nodes, scanner IPs, etc.)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 15:05:43 +02:00
parent c6ca352db9
commit b6184e6529
14 changed files with 3779 additions and 27 deletions

View File

@ -26,11 +26,13 @@ _DETECTION_SORT_COLS = {
"detected_at", "src_ip", "ja4", "host", "anomaly_score",
"threat_level", "recurrence", "hits", "hit_velocity",
"fuzzing_index", "post_ratio", "campaign_id",
"asn_org", "country_code", "bot_name",
}
_SCORE_SORT_COLS = {
"detected_at", "window_start", "src_ip", "ja4", "host",
"anomaly_score", "raw_anomaly_score", "threat_level",
"hits", "hit_velocity", "xgb_prob", "ae_recon_error",
"asn_org", "country_code",
}
_TRAFFIC_SORT_COLS = {
"time", "src_ip", "method", "host", "path", "http_version",
@ -137,6 +139,10 @@ async def detections(
order: str = Query("DESC"),
threat_level: str | None = Query(None),
search: str | None = Query(None),
asn_org: str | None = Query(None),
country_code: str | None = Query(None),
ja4: str | None = Query(None),
bot_name: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -155,6 +161,22 @@ async def detections(
)
params["search"] = f"%{search}%"
if asn_org:
where_clauses.append("asn_org = {asn_org:String}")
params["asn_org"] = asn_org
if country_code:
where_clauses.append("country_code = {cc:String}")
params["cc"] = country_code
if ja4:
where_clauses.append("ja4 = {ja4:String}")
params["ja4"] = ja4
if bot_name:
where_clauses.append("bot_name = {bn:String}")
params["bn"] = bot_name
where = " AND ".join(where_clauses)
try:
@ -194,6 +216,9 @@ async def scores(
order: str = Query("DESC"),
threat_level: str | None = Query(None),
search: str | None = Query(None),
asn_org: str | None = Query(None),
country_code: str | None = Query(None),
ja4: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -212,6 +237,18 @@ async def scores(
)
params["search"] = f"%{search}%"
if asn_org:
where_clauses.append("asn_org = {asn_org:String}")
params["asn_org"] = asn_org
if country_code:
where_clauses.append("country_code = {cc:String}")
params["cc"] = country_code
if ja4:
where_clauses.append("ja4 = {ja4:String}")
params["ja4"] = ja4
where = " AND ".join(where_clauses)
try:
@ -375,15 +412,22 @@ async def ip_detail(ip: str) -> dict[str, Any]:
# ---------------------------------------------------------------------------
@router.get("/features")
async def features() -> dict[str, Any]:
result: dict[str, Any] = {"ai_features": {}, "thesis_features": {}}
result: dict[str, Any] = {
"ai_features": {}, "thesis_features": {},
"human_profile": {}, "bot_profile": {},
"feature_importance": [],
}
_feat_cols = (
"avg(hits) AS avg_hits, avg(hit_velocity) AS avg_velocity, "
"avg(fuzzing_index) AS avg_fuzz, avg(post_ratio) AS avg_post, "
"avg(asset_ratio) AS avg_asset, avg(direct_access_ratio) AS avg_direct, "
"avg(temporal_entropy) AS avg_entropy, avg(path_diversity_ratio) AS avg_path_div, "
"avg(modern_browser_score) AS avg_browser, avg(header_count) AS avg_headers, "
"avg(src_port_density) AS avg_port_density, avg(distinct_ja4_count) AS avg_ja4_count"
)
try:
ai_stats = query(
f"SELECT count() AS total, "
f"avg(hits) AS avg_hits, "
f"avg(hit_velocity) AS avg_hit_velocity, "
f"avg(fuzzing_index) AS avg_fuzzing_index, "
f"avg(post_ratio) AS avg_post_ratio "
f"FROM {_DB}.view_ai_features_1h"
f"SELECT count() AS total, {_feat_cols} FROM {_DB}.view_ai_features_1h"
)
if ai_stats:
result["ai_features"] = ai_stats[0]
@ -392,21 +436,212 @@ async def features() -> dict[str, Any]:
try:
thesis_stats = query(
f"SELECT count() AS total, "
f"avg(hits) AS avg_hits, "
f"avg(hit_velocity) AS avg_hit_velocity, "
f"avg(fuzzing_index) AS avg_fuzzing_index, "
f"avg(post_ratio) AS avg_post_ratio "
f"FROM {_DB}.view_thesis_features_1h"
f"SELECT count() AS total, {_feat_cols} FROM {_DB}.view_thesis_features_1h"
)
if thesis_stats:
result["thesis_features"] = thesis_stats[0]
except Exception:
logger.debug("view_thesis_features_1h not available")
# Human vs bot feature profiles for radar comparison
try:
human = query(
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
"WHERE asn_label = 'human'"
)
if human:
result["human_profile"] = human[0]
except Exception:
pass
try:
bot = query(
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
"WHERE asn_label IN ('datacenter', 'hosting')"
)
if bot:
result["bot_profile"] = bot[0]
except Exception:
pass
# Feature variance (importance proxy)
try:
variance_rows = query(
f"SELECT "
f"varPop(hit_velocity) AS v_velocity, "
f"varPop(fuzzing_index) AS v_fuzz, "
f"varPop(post_ratio) AS v_post, "
f"varPop(asset_ratio) AS v_asset, "
f"varPop(direct_access_ratio) AS v_direct, "
f"varPop(temporal_entropy) AS v_entropy, "
f"varPop(path_diversity_ratio) AS v_path_div, "
f"varPop(src_port_density) AS v_port_density "
f"FROM {_DB}.view_ai_features_1h"
)
if variance_rows:
row = variance_rows[0]
result["feature_importance"] = [
{"name": k.replace("v_", ""), "variance": v}
for k, v in sorted(row.items(), key=lambda x: -(x[1] or 0))
]
except Exception:
pass
return result
# ---------------------------------------------------------------------------
# GET /api/geo — Geographic & ASN breakdown
# ---------------------------------------------------------------------------
@router.get("/geo")
async def geo() -> dict[str, Any]:
    """Return session/hit aggregates grouped by country and by ASN.

    Both aggregates are read from ``view_ai_features_1h``:

    * ``countries`` - one row per (country_code, asn_label), rows with an
      empty country_code excluded, ordered by session count descending.
    * ``asns`` - one row per (asn_org, asn_label, country_code), rows with an
      empty asn_org excluded, limited to the 50 largest by session count.

    Returns:
        ``{"countries": [...], "asns": [...]}``. If either query raises, the
        error is logged with its traceback and both lists come back empty.
    """
    try:
        countries = query(
            "SELECT country_code, asn_label, "
            "count() AS sessions, sum(hits) AS total_hits "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE country_code != '' "
            "GROUP BY country_code, asn_label ORDER BY sessions DESC"
        )
        asns = query(
            "SELECT asn_org, asn_label, country_code, "
            "count() AS sessions, sum(hits) AS total_hits, "
            "avg(hit_velocity) AS avg_velocity, avg(fuzzing_index) AS avg_fuzz "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE asn_org != '' "
            "GROUP BY asn_org, asn_label, country_code ORDER BY sessions DESC LIMIT 50"
        )
    except Exception:
        # Best-effort endpoint: the page still renders with empty data.
        logger.exception("geo query failed")
        return {"countries": [], "asns": []}
    return {"countries": countries, "asns": asns}
# ---------------------------------------------------------------------------
# GET /api/fingerprints — JA4 fingerprint analysis
# ---------------------------------------------------------------------------
@router.get("/fingerprints")
async def fingerprints() -> dict[str, Any]:
    """Return JA4 fingerprint aggregates from ``view_ai_features_1h``.

    * ``ja4_stats`` - one row per (ja4, asn_label) with session count, total
      hits and averages of hit_velocity, fuzzing_index and
      modern_browser_score; empty ja4 excluded; top 100 by session count.
    * ``bot_ja4`` - session count per (ja4, bot_name) for rows where both
      values are non-empty, ordered by session count descending.

    Returns:
        ``{"ja4_stats": [...], "bot_ja4": [...]}``. If either query raises,
        the error is logged with its traceback and both lists are empty.
    """
    try:
        ja4_stats = query(
            "SELECT ja4, asn_label, "
            "count() AS sessions, sum(hits) AS total_hits, "
            "avg(hit_velocity) AS avg_velocity, "
            "avg(fuzzing_index) AS avg_fuzz, "
            "avg(modern_browser_score) AS avg_browser_score "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE ja4 != '' "
            "GROUP BY ja4, asn_label ORDER BY sessions DESC LIMIT 100"
        )
        bot_ja4 = query(
            "SELECT ja4, bot_name, count() AS sessions "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE bot_name != '' AND ja4 != '' "
            "GROUP BY ja4, bot_name ORDER BY sessions DESC"
        )
    except Exception:
        # Best-effort endpoint: the page still renders with empty data.
        logger.exception("fingerprints query failed")
        return {"ja4_stats": [], "bot_ja4": []}
    return {"ja4_stats": ja4_stats, "bot_ja4": bot_ja4}
# ---------------------------------------------------------------------------
# GET /api/behavior — Feature scatter + distributions
# ---------------------------------------------------------------------------
_BEHAVIOR_FEATURES = [
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
"direct_access_ratio", "temporal_entropy", "path_diversity_ratio",
"modern_browser_score", "header_count", "is_ua_rotating",
"distinct_ja4_count", "src_port_density",
]
@router.get("/behavior")
async def behavior() -> dict[str, Any]:
    """Return scatter-plot rows and per-feature value histograms.

    * ``scatter`` - the 500 rows of ``view_ai_features_1h`` with the most
      hits, carrying ip, asn_label, bot_name, hits and every column listed
      in ``_BEHAVIOR_FEATURES``.
    * ``distributions`` - for each histogram feature, a list of
      ``{bucket, cnt}`` rows where ``bucket`` is the feature value rounded
      to two decimals.

    Returns:
        ``{"scatter": [...], "distributions": {feature: [...]}}``. If any
        query raises, the error is logged with its traceback and both
        values come back empty (no partial result).
    """
    # Column names are interpolated straight into SQL below. They come only
    # from these hard-coded lists, never from request input.
    histogram_features = (
        "hit_velocity", "fuzzing_index", "post_ratio",
        "asset_ratio", "temporal_entropy", "path_diversity_ratio",
    )
    cols = ", ".join(_BEHAVIOR_FEATURES)
    try:
        scatter = query(
            f"SELECT toString(src_ip) AS ip, asn_label, bot_name, hits, {cols} "
            f"FROM {_DB}.view_ai_features_1h "
            "ORDER BY hits DESC LIMIT 500"
        )
        # One query per feature; insertion order follows histogram_features.
        distributions: dict[str, list] = {
            feat: query(
                f"SELECT round({feat}, 2) AS bucket, count() AS cnt "
                f"FROM {_DB}.view_ai_features_1h "
                "GROUP BY bucket ORDER BY bucket"
            )
            for feat in histogram_features
        }
    except Exception:
        # Best-effort endpoint: the page still renders with empty data.
        logger.exception("behavior query failed")
        return {"scatter": [], "distributions": {}}
    return {"scatter": scatter, "distributions": distributions}
# ---------------------------------------------------------------------------
# GET /api/heatmap — Temporal heatmap (hour × day)
# ---------------------------------------------------------------------------
@router.get("/heatmap")
async def heatmap() -> dict[str, Any]:
    """Return request counts per (day-of-week, hour) for the last 7 days.

    Rows are read from the ``http_logs`` table and carry ``dow``
    (``toDayOfWeek(time)``), ``hour`` (``toHour(time)``) and ``cnt``,
    ordered by dow then hour. Combinations with no traffic are simply
    absent from the result.

    Returns:
        ``{"cells": [...]}``. If the query raises, the error is logged with
        its traceback and ``cells`` is empty.
    """
    try:
        cells = query(
            "SELECT toDayOfWeek(time) AS dow, toHour(time) AS hour, count() AS cnt "
            f"FROM {_DB_LOGS}.http_logs "
            "WHERE time >= now() - INTERVAL 7 DAY "
            "GROUP BY dow, hour ORDER BY dow, hour"
        )
    except Exception:
        # Best-effort endpoint: the page still renders with empty data.
        logger.exception("heatmap query failed")
        return {"cells": []}
    return {"cells": cells}
# ---------------------------------------------------------------------------
# GET /api/ip/{ip}/radar — Radar comparison vs human baseline
# ---------------------------------------------------------------------------
_RADAR_FEATURES = [
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
"direct_access_ratio", "temporal_entropy", "path_diversity_ratio",
"modern_browser_score",
]
@router.get("/ip/{ip}/radar")
async def ip_radar(ip: str) -> dict[str, Any]:
    """Return one IP's radar-feature values next to two reference averages.

    Args:
        ip: Address taken from the URL path. Any ``::ffff:`` substring is
            stripped before the value is bound as a query parameter.

    Returns:
        A dict with four keys:

        * ``features`` - the ``_RADAR_FEATURES`` list.
        * ``ip_values`` - one ``view_ai_features_1h`` row for the IP, or
          ``{}`` when none matched.
        * ``human_baseline`` - per-feature average over rows whose
          ``asn_label`` is ``'human'``.
        * ``bot_baseline`` - per-feature average over rows whose
          ``asn_label`` is ``'datacenter'`` or ``'hosting'``.

        If any query raises, the error is logged with its traceback and
        the three data keys are empty dicts.
    """
    clean_ip = ip.replace("::ffff:", "")
    # Feature names are interpolated into SQL; they come only from the
    # hard-coded _RADAR_FEATURES list. The IP itself is passed as a bound
    # parameter, never interpolated.
    cols_avg = ", ".join(f"avg({f}) AS {f}" for f in _RADAR_FEATURES)
    try:
        # NOTE(review): LIMIT 1 without ORDER BY - if the view can hold
        # several rows for one IP, which one is returned is unspecified.
        ip_data = query(
            f"SELECT {', '.join(_RADAR_FEATURES)} "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE src_ip = toIPv6({ip:String}) LIMIT 1",
            {"ip": clean_ip},
        )
        baseline = query(
            f"SELECT {cols_avg} "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE asn_label = 'human'"
        )
        bot_avg = query(
            f"SELECT {cols_avg} "
            f"FROM {_DB}.view_ai_features_1h "
            "WHERE asn_label IN ('datacenter', 'hosting')"
        )
    except Exception:
        # Best-effort endpoint: the page still renders with empty data.
        logger.exception("ip radar query failed for %s", ip)
        return {
            "features": _RADAR_FEATURES,
            "ip_values": {},
            "human_baseline": {},
            "bot_baseline": {},
        }
    return {
        "features": _RADAR_FEATURES,
        "ip_values": ip_data[0] if ip_data else {},
        "human_baseline": baseline[0] if baseline else {},
        "bot_baseline": bot_avg[0] if bot_avg else {},
    }
# ---------------------------------------------------------------------------
# GET /api/models
# ---------------------------------------------------------------------------

View File

@ -51,3 +51,8 @@ async def features(request: Request):
@router.get("/models")
async def models(request: Request):
return templates.TemplateResponse("models.html", _ctx(request, "models"))
@router.get("/network")
async def network(request: Request):
    """Render the ``network.html`` template for the /network page."""
    context = _ctx(request, "network")
    return templates.TemplateResponse("network.html", context)