feat: LEGITIMATE_BROWSER classification from JA4 + behavioral consistency

Add browser legitimacy classification (A9) to the bot detection pipeline:

- New features: is_known_browser (binary) and browser_consistency_score [0..5]
  combining 5 signals: JA4 browser match, modern_browser_score, Accept-Language,
  cookies, Sec-Fetch-* presence
- Post-scoring: sessions with a known browser JA4, a consistency score >= 4 (out
  of 5), and a NORMAL or LOW threat level are reclassified as LEGITIMATE_BROWSER
- Spoofing detection: sessions with a known browser JA4 but a low consistency
  score stay in normal anomaly scoring, which prevents evasion via JA4 spoofing
- XGBoost treats LEGITIMATE_BROWSER as non-threat (negative label)
- ClickHouse: browser_family column added to ml_detected_anomalies and ml_all_scores
- Dashboard: browser_family filter/sort on detections and scores endpoints,
  legitimate_browsers count and browser_stats in overview
- 6 new unit tests covering classification threshold, spoofing, exclusion logic

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 15:46:22 +02:00
parent 7d09c614c3
commit 9a48fb9d29
4 changed files with 215 additions and 7 deletions

View File

@ -26,13 +26,13 @@ _DETECTION_SORT_COLS = {
"detected_at", "src_ip", "ja4", "host", "anomaly_score",
"threat_level", "recurrence", "hits", "hit_velocity",
"fuzzing_index", "post_ratio", "campaign_id",
"asn_org", "country_code", "bot_name",
"asn_org", "country_code", "bot_name", "browser_family",
}
_SCORE_SORT_COLS = {
"detected_at", "window_start", "src_ip", "ja4", "host",
"anomaly_score", "raw_anomaly_score", "threat_level",
"hits", "hit_velocity", "xgb_prob", "ae_recon_error",
"asn_org", "country_code",
"asn_org", "country_code", "browser_family",
}
_TRAFFIC_SORT_COLS = {
"time", "src_ip", "method", "host", "path", "http_version",
@ -111,6 +111,20 @@ async def overview() -> dict[str, Any]:
"GROUP BY model_name"
)
browser_stats = query(
f"SELECT browser_family, count() AS cnt "
f"FROM {_DB}.ml_all_scores "
"WHERE detected_at >= now() - INTERVAL 1 DAY "
"AND browser_family != '' "
"GROUP BY browser_family ORDER BY cnt DESC"
)
legitimate_browsers = query_scalar(
f"SELECT count() FROM {_DB}.ml_all_scores "
"WHERE detected_at >= now() - INTERVAL 1 DAY "
"AND threat_level = 'LEGITIMATE_BROWSER'"
) or 0
return {
"detections_24h": detections_24h,
"scored_24h": scored_24h,
@ -118,6 +132,8 @@ async def overview() -> dict[str, Any]:
"unique_ips": unique_ips,
"critical_count": critical_count,
"high_count": high_count,
"legitimate_browsers": legitimate_browsers,
"browser_stats": browser_stats,
"threat_distribution": threat_distribution,
"top_ips": top_ips,
"timeline": [{"hour": str(r["hour"]), "cnt": r["cnt"]} for r in timeline],
@ -143,6 +159,7 @@ async def detections(
country_code: str | None = Query(None),
ja4: str | None = Query(None),
bot_name: str | None = Query(None),
browser_family: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _DETECTION_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -177,6 +194,10 @@ async def detections(
where_clauses.append("bot_name = {bn:String}")
params["bn"] = bot_name
if browser_family:
where_clauses.append("browser_family = {bf:String}")
params["bf"] = browser_family
where = " AND ".join(where_clauses)
try:
@ -219,6 +240,7 @@ async def scores(
asn_org: str | None = Query(None),
country_code: str | None = Query(None),
ja4: str | None = Query(None),
browser_family: str | None = Query(None),
) -> dict[str, Any]:
sort = _validate_sort(sort, _SCORE_SORT_COLS, "detected_at")
order = _validate_order(order)
@ -249,6 +271,10 @@ async def scores(
where_clauses.append("ja4 = {ja4:String}")
params["ja4"] = ja4
if browser_family:
where_clauses.append("browser_family = {bf:String}")
params["bf"] = browser_family
where = " AND ".join(where_clauses)
try: