feat: rewrite ASN classification with PeeringDB + expanded heuristics
Major improvements to generate_asn_data.py:
- Add PeeringDB network data source (34K networks with info_type)
- Add new categories: education, government, enterprise
- Rename 'human' label to 'isp' across all consumers
- Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, gov)
- Add hard-coded lists for education, government, enterprise ASNs
- Support both --output-dir and --output-asn/--output-ipasn CLI interfaces
- Add --no-peeringdb flag for offline use

Results: unknown dropped from 86% to 57%, ISP coverage 21.8K ASNs, education 3.1K, enterprise 5.7K, government 520.

Updated consumers:
- bot_detector.py: 'human' -> 'isp' for baseline selection
- dashboard api.py: 'human' -> 'isp' in SQL queries
- run-tests.sh: 'human' -> 'isp' in integration test assertions
- update-csv-data.sh: updated label description comment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -268,7 +268,7 @@ fi
|
||||
|
||||
# 5f. Verify seeder data reached agg table and AI features view
|
||||
AGG_COUNT=$(ch_query "SELECT count() FROM ja4_processing.agg_host_ip_ja4_1h")
|
||||
HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='human'")
|
||||
HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='isp'")
|
||||
BOT_LABEL_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE bot_name != ''")
|
||||
UNIQ_SRC_IPS=$(ch_query "SELECT count(DISTINCT src_ip) FROM ja4_processing.view_ai_features_1h")
|
||||
UNIQ_JA4=$(ch_query "SELECT count(DISTINCT ja4) FROM ja4_processing.view_ai_features_1h")
|
||||
@@ -280,11 +280,11 @@ else
|
||||
fi
|
||||
|
||||
if [ "$HUMAN_COUNT" -ge 500 ] 2>/dev/null; then
|
||||
pass "Bot-detector baseline: $HUMAN_COUNT human sessions (≥500 threshold met)"
|
||||
pass "Bot-detector baseline: $HUMAN_COUNT ISP sessions (≥500 threshold met)"
|
||||
elif [ "$HUMAN_COUNT" -gt 0 ] 2>/dev/null; then
|
||||
warn "Human sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)"
|
||||
warn "ISP sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)"
|
||||
else
|
||||
fail "No human sessions in view_ai_features_1h (asn_reputation CSV not loaded?)"
|
||||
fail "No ISP sessions in view_ai_features_1h (asn_reputation CSV not loaded?)"
|
||||
fi
|
||||
|
||||
if [ "$BOT_LABEL_COUNT" -gt 0 ] 2>/dev/null; then
|
||||
|
||||
Reference in New Issue
Block a user