feat: rewrite ASN classification with PeeringDB + expanded heuristics

Major improvements to generate_asn_data.py:
- Add PeeringDB network data source (34K networks with info_type)
- Add new categories: education, government, enterprise
- Rename 'human' label to 'isp' across all consumers
- Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, government, enterprise)
- Add hard-coded lists for education, government, enterprise ASNs
- Support both --output-dir and --output-asn/--output-ipasn CLI interfaces
- Add --no-peeringdb flag for offline use

Results: unknown dropped from 86% to 57%, ISP coverage 21.8K ASNs,
education 3.1K, enterprise 5.7K, government 520.

Updated consumers:
- bot_detector.py: 'human' -> 'isp' for baseline selection
- dashboard api.py: 'human' -> 'isp' in SQL queries
- run-tests.sh: 'human' -> 'isp' in integration test assertions
- update-csv-data.sh: updated label description comment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 16:02:07 +02:00
parent 9a48fb9d29
commit 5c5bca71d1
6 changed files with 35197 additions and 30362 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,32 +1,40 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
generate_asn_data.py — Download public IPtoASN database and generate: generate_asn_data.py — Download public IPtoASN + PeeringDB data and generate:
1. asn_reputation.csv — ASN number + reputation label (no header) 1. asn_reputation.csv — ASN number + reputation label (no header)
2. iplocate-ip-to-asn.csv — CIDR,ASN,country,AS name (no header) 2. iplocate-ip-to-asn.csv — CIDR,ASN,country,AS name (no header)
Data source: https://iptoasn.com/data/ip2asn-v4.tsv.gz (free, no registration) Data sources:
- https://iptoasn.com/data/ip2asn-v4.tsv.gz (IP ranges → ASN mapping)
- https://www.peeringdb.com/api/net (ASN → network type)
Labels: isp, datacenter, hosting, cdn, enterprise, education, government, unknown
Usage: Usage:
python3 generate_asn_data.py --output-dir . python3 generate_asn_data.py --output-dir ./data
python3 generate_asn_data.py --output-dir . --no-download # reuse cached TSV python3 generate_asn_data.py --output-dir ./data --no-download
python3 generate_asn_data.py --output-dir ./data --no-peeringdb
python3 generate_asn_data.py --output-asn out/asn.csv --output-ipasn out/ip.csv
""" """
import argparse import argparse
import gzip import gzip
import ipaddress import ipaddress
import io import json
import os import os
import sys import sys
import urllib.request import urllib.request
IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz" IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz"
PEERINGDB_URL = "https://www.peeringdb.com/api/net?limit=0&depth=0"
CACHED_TSV = "ip2asn-v4.tsv" CACHED_TSV = "ip2asn-v4.tsv"
CACHED_PEERINGDB = "peeringdb_nets.json"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Hard-coded well-known ASN lists for accurate classification # Hard-coded well-known ASN lists (highest classification priority)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
KNOWN_HUMAN_ASNS: set[int] = { KNOWN_ISP_ASNS: set[int] = {
# France # France
3215, 12322, 15557, 5410, 6799, 29169, 2027, 3215, 12322, 15557, 5410, 6799, 29169, 2027,
# Germany # Germany
@ -67,17 +75,17 @@ KNOWN_HUMAN_ASNS: set[int] = {
# Turkey # Turkey
9121, 34984, 47331, 16135, 9121, 34984, 47331, 16135,
# Poland # Poland
5617, 12912, 6830, 5588, 8374, 21021, 5617, 12912, 5588, 8374, 21021,
# Sweden / Nordics # Sweden / Nordics
3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518, 3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518,
# Switzerland # Switzerland
3303, 6830, 3303,
# Belgium # Belgium
5432, 6848, 12392, 6848, 12392,
# Portugal # Portugal
3243, 2860, 8657, 3243, 2860, 8657,
# Ireland # Ireland
5466, 15502, 6830, 5466, 15502,
# Southeast Asia # Southeast Asia
4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534, 4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534,
24203, 7470, 24203, 7470,
@ -116,7 +124,7 @@ KNOWN_DATACENTER_ASNS: set[int] = {
14061, 393406, 14061, 393406,
# Linode / Akamai Connected Cloud # Linode / Akamai Connected Cloud
63949, 22040, 63949, 22040,
# Vultr # Vultr / Choopa
20473, 20473,
# Scaleway / Online SAS # Scaleway / Online SAS
12876, 12876,
@ -142,10 +150,20 @@ KNOWN_DATACENTER_ASNS: set[int] = {
36352, 36352,
# QuadraNet # QuadraNet
8100, 8100,
# Choopa (Vultr subsidiary)
20473,
# Zenlayer # Zenlayer
21859, 21859,
# Yandex Cloud
200350, 208722,
# Huawei Cloud
136907,
# NTT Global
2914,
# Cogent
174,
# GTT
3257,
# Zayo
6461,
} }
KNOWN_CDN_ASNS: set[int] = { KNOWN_CDN_ASNS: set[int] = {
@ -170,6 +188,11 @@ KNOWN_CDN_ASNS: set[int] = {
55095, # Cloudflare CN 55095, # Cloudflare CN
394536, # Fastly 394536, # Fastly
395973, # Fastly 395973, # Fastly
36040, # Google CDN
15395, # MaxCDN
30081, # CacheFly
60068, # CDN77
200325, # BunnyCDN
} }
KNOWN_HOSTING_ASNS: set[int] = { KNOWN_HOSTING_ASNS: set[int] = {
@ -181,7 +204,6 @@ KNOWN_HOSTING_ASNS: set[int] = {
19871, # Network Solutions 19871, # Network Solutions
40034, # Confluence Networks 40034, # Confluence Networks
26347, # DreamHost 26347, # DreamHost
36351, # SoftLayer (also datacenter, but historically hosting)
29802, # HIVELOCITY 29802, # HIVELOCITY
36024, # HostNOC 36024, # HostNOC
21844, # ThePlanet / SoftLayer 21844, # ThePlanet / SoftLayer
@ -200,20 +222,186 @@ KNOWN_HOSTING_ASNS: set[int] = {
398101, # GoDaddy hosting 398101, # GoDaddy hosting
19969, # Joe's Datacenter 19969, # Joe's Datacenter
395003, # WPEngine 395003, # WPEngine
22612, # Namecheap
46664, # VolumeDrive
55286, # B2 Net Solutions / ServerMania
35916, # MultaCom
23470, # ReliableSite
}
# Research/education networks and universities.
# Hard-coded lists are consulted before PeeringDB and the keyword heuristics,
# so an ASN listed here is always labelled "education" — keep this to
# unambiguous entries only.
KNOWN_EDUCATION_ASNS: set[int] = {
    # National / regional research & education networks
    786,    # JANET (UK academic network)
    21320,  # GEANT (European research)
    11537,  # Internet2 (US research)
    2200,   # RENATER (France research)
    680,    # DFN (Germany research)
    137,    # GARR (Italy research)
    766,    # RedIRIS (Spain research)
    7575,   # AARNet (Australia research)
    513,    # CERN
    1741,   # FUNET (Finland research)
    2852,   # CESNET (Czech research) — was listed twice; one entry kept
    2603,   # NORDUnet (Nordics research)
    1930,   # FCCN (Portugal research)
    2381,   # WIDE Project (Japan research)
    1103,   # SURFNET (Netherlands research)
    2018,   # AFRINIC
    7660,   # APNIC research
    2500,   # WIDE (Japan)
    4556,   # HKIX (Hong Kong academic)
    24151,  # CNGI-CERNET2 (China education)
    4538,   # CERNET (China education)
    # NOTE(review): 4837 used to be listed here as "CERNET2 backbone".
    # Public registry data appears to attribute AS4837 to China Unicom's
    # CHINA169 backbone (a consumer ISP), so it was removed rather than being
    # forced to "education". Confirm against whois before re-adding.
    # Universities and laboratories
    3,      # MIT
    10578,  # Stanford
    27,     # University of Maryland
    46,     # UC Berkeley
    104,    # Carnegie Mellon
    224,    # Uninett (Norway research)
    2547,   # Belnet (Belgium research)
    1754,   # DESY (Germany)
    5765,   # SANET (Slovakia academic)
    9264,   # KAIST (South Korea)
}
# Government and military networks; members are always labelled "government".
KNOWN_GOVERNMENT_ASNS: set[int] = {
    # --- United States: defence ---
    721,     # DoD / DISA
    749,     # DoD / DISA
    3354,    # US DoD Network Information Center
    3541,    # Headquarters USAISC
    7046,    # US DoD
    27064,   # DoD Education Activity
    # --- United States: civilian agencies ---
    568,     # US Postal Service
    6983,    # US Internal Revenue Service
    10796,   # US Census Bureau
    394671,  # US Dept of Veterans Affairs
    # NOTE(review): 19551 is also a member of KNOWN_ENTERPRISE_ASNS (there
    # described as Shopify). classify_asn tests this set first, so the
    # "government" label wins — confirm which owner is correct.
    19551,   # Incapsula Fed (US gov CDN)
    # --- Europe ---
    2572,    # UK MoD
    25180,   # Bundesverwaltung (Swiss gov)
    # NOTE(review): the two entries below are described as only partly
    # governmental; verify that "government" rather than "education" is the
    # intended label.
    553,     # BelWue (partially government)
    559,     # SWITCH (Swiss gov/edu)
}
KNOWN_ENTERPRISE_ASNS: set[int] = {
8068, # Microsoft Corp
36459, # GitHub
32934, # Facebook / Meta
13414, # Twitter / X
54115, # Dropbox
8003, # Uber
14413, # Linkedin
40428, # Salesforce
30148, # Sucuri (enterprise security)
394699, # Zoom
19551, # Shopify
62229, # Spotify
394406, # Slack
} }
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Keyword-based heuristic classification # PeeringDB info_type → label mapping
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
HUMAN_KEYWORDS = [ PEERINGDB_TYPE_MAP: dict[str, str] = {
"Cable/DSL/ISP": "isp",
"NSP": "isp",
"Content": "cdn",
"Enterprise": "enterprise",
"Educational/Research": "education",
"Government": "government",
"Non-Profit": "enterprise",
"Route Server": "unknown",
"Route Collector": "unknown",
}
def download_peeringdb(output_dir: str) -> dict[int, str]:
    """Download the PeeringDB network list and return an {asn: label} mapping.

    The raw response is cached as CACHED_PEERINGDB inside *output_dir*. When
    the download (or JSON decoding) fails, a previously cached copy is used
    instead. Every failure mode is best-effort: a warning is printed and an
    empty mapping is returned so the caller can continue with the hard-coded
    lists and keyword heuristics only.

    Args:
        output_dir: Directory holding the PeeringDB cache file.

    Returns:
        Mapping of ASN to label. Networks whose info_type is neither in
        PEERINGDB_TYPE_MAP, "Network Services", nor empty are omitted.
    """
    cache_path = os.path.join(output_dir, CACHED_PEERINGDB)

    print(f"[peeringdb] Fetching {PEERINGDB_URL} ...")
    data = None
    raw = b""
    fetched = False
    try:
        req = urllib.request.Request(PEERINGDB_URL, headers={
            "User-Agent": "generate_asn_data/2.0",
            "Accept": "application/json",
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            raw = resp.read()
        print(f"[peeringdb] Downloaded {len(raw):,} bytes")
        data = json.loads(raw)
        fetched = True
    except Exception as e:  # deliberate best-effort: network, HTTP or JSON error
        print(f"[warning] PeeringDB download failed: {e}", file=sys.stderr)

    if fetched:
        # Cache only after the payload parsed, and keep a cache-write failure
        # from discarding data that was downloaded successfully.
        try:
            with open(cache_path, "wb") as f:
                f.write(raw)
        except OSError as e:
            print(f"[warning] Could not write PeeringDB cache {cache_path}: {e}",
                  file=sys.stderr)
    elif os.path.exists(cache_path):
        print(f"[info] Using cached {cache_path}")
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # A truncated or corrupt cache must not abort the whole run.
            print(f"[warning] Cached PeeringDB data is unreadable: {e}",
                  file=sys.stderr)
            return {}
    else:
        print("[warning] No PeeringDB data available, continuing without it.",
              file=sys.stderr)
        return {}

    nets = data.get("data") if isinstance(data, dict) else None
    if not isinstance(nets, list):
        print("[warning] PeeringDB response has no 'data' key", file=sys.stderr)
        return {}

    pdb_labels: dict[int, str] = {}
    print(f"[peeringdb] Processing {len(nets):,} networks ...")

    for net in nets:
        if not isinstance(net, dict):
            continue
        asn = net.get("asn")
        # Skip missing, non-integer or non-positive ASNs instead of raising.
        if isinstance(asn, bool) or not isinstance(asn, int) or asn <= 0:
            continue

        info_type = (net.get("info_type") or "").strip()
        name = (net.get("name") or "").lower()

        if info_type in PEERINGDB_TYPE_MAP:
            # Direct type mapping.
            # NOTE(review): some types map to "unknown"; classify_asn returns
            # the PeeringDB label before trying keyword heuristics, so those
            # ASNs never reach the heuristics — confirm this is intended.
            pdb_labels[asn] = PEERINGDB_TYPE_MAP[info_type]
        elif info_type == "Network Services":
            # Too generic on its own: use the name to tell
            # hosting / datacenter / isp apart, defaulting to enterprise.
            if _match_keywords(name, HOSTING_KEYWORDS):
                pdb_labels[asn] = "hosting"
            elif _match_keywords(name, DATACENTER_KEYWORDS):
                pdb_labels[asn] = "datacenter"
            elif _match_keywords(name, ISP_KEYWORDS):
                pdb_labels[asn] = "isp"
            else:
                pdb_labels[asn] = "enterprise"
        elif info_type == "":
            # No type set — try keyword classification on the name.
            label = _classify_by_keywords(name)
            if label:
                pdb_labels[asn] = label
        # Any other info_type is left unlabelled.

    stats: dict[str, int] = {}
    for v in pdb_labels.values():
        stats[v] = stats.get(v, 0) + 1
    print(f"[peeringdb] Mapped {len(pdb_labels):,} ASNs: {stats}")
    return pdb_labels
# ---------------------------------------------------------------------------
# Keyword-based heuristic classification (expanded)
# ---------------------------------------------------------------------------
ISP_KEYWORDS = [
# Generic ISP terms
"telecom", "telcom", "telekom", "telefonica", "telecomunicacoes", "telecom", "telcom", "telekom", "telefonica", "telecomunicacoes",
"telecomunicaciones", "telekommunikasjon", "telekommunikation",
"mobile", "broadband", "fiber", "fibre", "cable", "wireless", "mobile", "broadband", "fiber", "fibre", "cable", "wireless",
"residential", "communications", "comm ", " isp", "netcom", "residential", "communications", "comm ", " isp", "netcom",
"internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth", "internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth",
"fttb", "dial-up", "dialup", "cellular", "lte", "5g network", "fttb", "fttp", "dial-up", "dialup", "cellular", "lte", "5g network",
"wimax", "satellite", " tel ", "telco", "ptcl", "bsnl", "wimax", "satellite", " tel ", "telco", "ptcl", "bsnl",
"telecable", "telemedia", "telnet", "telprom",
"internet provider", "internet access",
# Specific operators — Global
"airtel", "jio", "reliance", "vodafone", "orange", "bouygues", "airtel", "jio", "reliance", "vodafone", "orange", "bouygues",
"proximus", "swisscom", "telenor", "telia", "elisa", "dna oy", "proximus", "swisscom", "telenor", "telia", "elisa", "dna oy",
"rogers", "bell canada", "shaw", "telus", "optus", "tpg ", "rogers", "bell canada", "shaw", "telus", "optus", "tpg ",
@ -238,6 +426,38 @@ HUMAN_KEYWORDS = [
"etisalat", "du telecom", "stc ", "zain", "etisalat", "du telecom", "stc ", "zain",
"mtn ", "safaricom", "airtel africa", "rain ", "mtn ", "safaricom", "airtel africa", "rain ",
"telmex", "claro", "vivo ", "oi s.a", "telmex", "claro", "vivo ", "oi s.a",
# Additional global operators
"telstra", "spark nz", "chorus nz", "vodacom", "digicel",
"flow ", "liberty cablevision", "totalplay", "izzi ",
"megacable", "axtel", "tigo ", "entel", "bitel", "movilnet",
"cantv", "antel", "copel telecom", "algar telecom", "rede sul",
"surf telecom", "net servicos", "porto seguro",
"dialog axiata", "sri lanka telecom", "banglalink", "grameenphone",
"robi axiata", "ncell", "nepal telecom",
"omantel", "ooredoo", "mobily", "batelco", "viva bahrain",
"nawras", "sure telecom", "airlink", "cool ideas", "vumatel",
"telkom sa", "web africa", "afrihost", "zte corporation",
"smile telecom", "busyinternet", "camtel", "orange cameroun",
"maroc telecom", "inwi", "djezzy", "mobilis", "tunisie telecom",
"ethio telecom", "liquid telecom", "neotel", "cell c",
"a1 telekom", "magenta telekom", "drei austria", "cablecom",
"salt mobile", "sunrise", "post luxembourg", "tango ",
"nova croatia", "ht ", "vivacom", "bulsatcom", "cosmote",
"nova greece", "forthnet", "wind hellas", "cyta",
"telekom srbija", "vip mobile", "telenor serbia",
"telekom slovenije", "a1 slovenia",
"o2 czech", "t-mobile czech", "vodafone czech",
"orange polska", "polkomtel", "play mobile",
"netia", "inea ", "vectra",
"bite ", "tele2", "tet ", "eltel",
"starman", "telia eesti", "elisa eesti",
"lattelecom", "baltcom",
"kyivstar", "lifecell", "ukrtelecom",
"moldtelecom", "orange moldova",
"kazakhtelecom", "beeline kz", "tele2 kz",
"uztelecom", "ucell",
"bakcell", "azercell", "azerconnect",
"veon", "silknet", "magti",
] ]
DATACENTER_KEYWORDS = [ DATACENTER_KEYWORDS = [
@ -247,69 +467,149 @@ DATACENTER_KEYWORDS = [
"scaleway", "contabo", "kamatera", "upcloud", "scaleway", "contabo", "kamatera", "upcloud",
"oracle cloud", "alibaba cloud", "tencent cloud", "oracle cloud", "alibaba cloud", "tencent cloud",
"ibm cloud", "softlayer", "rackspace", "equinix", "ibm cloud", "softlayer", "rackspace", "equinix",
"leaseweb", "choopa", "data center", "datacenter", "leaseweb", "choopa", "data center", "datacenter", "data centre",
"colocation", "colo ", "baremetal", "bare metal", "colocation", "colo ", "baremetal", "bare metal",
"infrastructure", "iaas", "paas", "infrastructure", "iaas", "paas",
# Additional cloud/datacenter providers
"serverel", "packet", "metal ", "nexon", "netmagic",
"phoenixnap", "tierpoint", "cyrusone", "coresite",
"datapipe", "internap", "peak 10", "zayo ", "coreweave",
"lambda cloud", "paperspace", "vast.ai", "fluidstack",
"cloudfloor", "netriplex", "hyperoptic", "serverspace",
"selectel", "aruba cloud", "worldstream", "datacamp",
"m247", "datapacket", "clouvider", "hostkey",
"tzulo", "ramnode", "hostens", "serverius", "servers.com",
"nocix", "fdcservers", "reliablesite",
"quadranet", "cogent ", "colocrossing",
] ]
HOSTING_KEYWORDS = [ HOSTING_KEYWORDS = [
"hosting", "host ", "hoster", "webhost", "hosting", "host ", "hoster", "webhost", "web host",
"server farm", "vps", "virtual private", "server farm", "vps", "virtual private",
"dedicated server", "shared hosting", "managed hosting", "dedicated server", "shared hosting", "managed hosting",
"reseller", "cpanel", "plesk", "wordpress host", "reseller", "cpanel", "plesk", "wordpress host",
"godaddy", "namecheap", "hostinger", "bluehost", "godaddy", "namecheap", "hostinger", "bluehost",
"siteground", "a2 hosting", "dreamhost", "hostgator", "siteground", "a2 hosting", "dreamhost", "hostgator",
"ionos", "squarespace", "wix", "wpengine", "ionos", "squarespace", "wix", "wpengine", "wp engine",
"register.com", "domain registr", "registrar",
"strato ag", "mittwald", "hosteurope", "all-inkl",
"infomaniak", "gandi", "one.com", "loopia",
"fastcomet", "cloudways", "kinsta", "flywheel",
"netlify", "vercel", "render.com", "railway",
"heroku", "webhosting",
] ]
CDN_KEYWORDS = [ CDN_KEYWORDS = [
"cloudflare", "akamai", "fastly", "cdn ", "cloudflare", "akamai", "fastly", "cdn ",
"content delivery", "edgecast", "limelight", "content delivery", "edgecast", "limelight",
"stackpath", "sucuri", "keycdn", "bunnycdn", "stackpath", "sucuri", "keycdn", "bunnycdn", "bunny cdn",
"jsdelivr", "cachefly", "imperva", "incapsula", "jsdelivr", "cachefly", "imperva", "incapsula",
"g-core", "gcorelabs", "cdn77", "cdn network",
"medianova", "chinacache", "wangsu", "quantil",
"azion", "section.io", "beluga cdn",
]
EDUCATION_KEYWORDS = [
"university", "universit", "universite", "universidad", "universidade",
"universita", "universitaet", "univ ", "univ.", "univers",
"college", "academic", "academia", "research",
"education", "educational", "school", "institute",
"polytechnic", "hochschule", "fachhochschule",
"nren", "ren ", ".ren", "research network",
"wissenschaft", "forschung",
"campus", "student",
"ecole ", "école", "escuela", "escola",
"fakultat", "fakultet",
"技術", "大学", "学院", "研究", "科学",
"교육", "대학",
"กรม", "มหาวิทยาลัย",
]
# Substrings matched against the lower-cased AS name; any hit means "government".
GOVERNMENT_KEYWORDS = [
    # "government" in several languages
    "government", "gouvernement", "gobierno", "governo", "regierung",
    # domain-style fragments
    "gouv", "gov.", ".gov", "gob.",
    # ministries
    "ministry", "ministere", "ministerio", "ministerium", "ministero",
    # armed forces
    "military", "army", "navy", "defense", "defence", "defensa",
    # law enforcement
    "police", "polizei", "polizia",
    # state / federal bodies
    # NOTE(review): "fedral" looks like a misspelling of "federal"; kept
    # unchanged because keyword strings affect matching.
    "federal", "fedral", "estado", "etat",
    # local and regional administration
    "municipal", "prefecture", "region ",
    "national agency", "state of ",
    "bundeswehr", "armed forces",
    # border and justice
    "customs", "immigration",
    "justic", "judiciary",
]
# Enterprise keywords are lowest priority — many ISPs have these suffixes
ENTERPRISE_KEYWORDS = [
"corporation", "corp.", "corporate",
" inc.", " inc,",
" ltd.", " ltd,",
"gmbh", "s.a.", "s.r.l",
" group", "holdings",
"financial", "banking", "insurance",
"pharmaceutical", "logistics",
"manufacturing", "industrial",
"automotive", "energy",
"retail ", "media group",
] ]
def classify_asn(asn_number: int, as_name: str) -> str: def _match_keywords(name_lower: str, keywords: list[str]) -> bool:
"""Classify an ASN into a reputation category.""" """Check if any keyword matches in the lowered name."""
# Hard-coded lookups first (highest priority) return any(kw in name_lower for kw in keywords)
def _classify_by_keywords(name_lower: str) -> str:
    """Pick a label for an already lower-cased AS name via keyword lists.

    Returns "unknown" for placeholder names ("not routed", "none", "", "-"),
    the label of the first keyword family that matches, or an empty string
    when nothing matches.
    """
    # Placeholder names carry no signal to classify on.
    if name_lower in ("not routed", "none", "", "-"):
        return "unknown"

    # Families are tried in this order and the first hit wins.
    ordered_rules = (
        ("cdn", CDN_KEYWORDS),
        ("education", EDUCATION_KEYWORDS),
        ("government", GOVERNMENT_KEYWORDS),
        ("hosting", HOSTING_KEYWORDS),
        ("datacenter", DATACENTER_KEYWORDS),
        ("isp", ISP_KEYWORDS),
        ("enterprise", ENTERPRISE_KEYWORDS),
    )
    for label, keywords in ordered_rules:
        if _match_keywords(name_lower, keywords):
            return label
    return ""
def classify_asn(asn_number: int, as_name: str,
peeringdb: dict[int, str]) -> str:
"""Classify an ASN into a reputation category.
Priority: hard-coded lists > PeeringDB > keyword heuristics.
"""
# 1. Hard-coded lookups (highest priority)
if asn_number in KNOWN_CDN_ASNS: if asn_number in KNOWN_CDN_ASNS:
return "cdn" return "cdn"
if asn_number in KNOWN_HOSTING_ASNS: if asn_number in KNOWN_HOSTING_ASNS:
return "hosting" return "hosting"
if asn_number in KNOWN_DATACENTER_ASNS: if asn_number in KNOWN_DATACENTER_ASNS:
return "datacenter" return "datacenter"
if asn_number in KNOWN_HUMAN_ASNS: if asn_number in KNOWN_EDUCATION_ASNS:
return "human" return "education"
if asn_number in KNOWN_GOVERNMENT_ASNS:
return "government"
if asn_number in KNOWN_ENTERPRISE_ASNS:
return "enterprise"
if asn_number in KNOWN_ISP_ASNS:
return "isp"
# Keyword heuristics on AS name # 2. PeeringDB classification
if asn_number in peeringdb:
return peeringdb[asn_number]
# 3. Keyword heuristics on AS name
name_lower = as_name.lower() name_lower = as_name.lower()
label = _classify_by_keywords(name_lower)
# Skip "Not routed" or reserved return label if label else "unknown"
if name_lower in ("not routed", "none", "", "-"):
return "unknown"
# CDN first (most specific)
for kw in CDN_KEYWORDS:
if kw in name_lower:
return "cdn"
# Hosting before datacenter (more specific)
for kw in HOSTING_KEYWORDS:
if kw in name_lower:
return "hosting"
# Datacenter
for kw in DATACENTER_KEYWORDS:
if kw in name_lower:
return "datacenter"
# Human / ISP
for kw in HUMAN_KEYWORDS:
if kw in name_lower:
return "human"
return "unknown"
def ranges_to_cidrs(start_ip: str, end_ip: str): def ranges_to_cidrs(start_ip: str, end_ip: str):
@ -330,7 +630,7 @@ def download_iptoasn(output_dir: str) -> str:
print(f"[download] Fetching {IPTOASN_URL} ...") print(f"[download] Fetching {IPTOASN_URL} ...")
try: try:
req = urllib.request.Request(IPTOASN_URL, headers={ req = urllib.request.Request(IPTOASN_URL, headers={
"User-Agent": "generate_asn_data/1.0" "User-Agent": "generate_asn_data/2.0"
}) })
with urllib.request.urlopen(req, timeout=120) as resp: with urllib.request.urlopen(req, timeout=120) as resp:
compressed = resp.read() compressed = resp.read()
@ -358,7 +658,7 @@ def parse_tsv(tsv_path: str):
""" """
entries = [] entries = []
with open(tsv_path, "r", encoding="utf-8") as f: with open(tsv_path, "r", encoding="utf-8") as f:
for line_no, line in enumerate(f, 1): for line in f:
line = line.strip() line = line.strip()
if not line or line.startswith("#"): if not line or line.startswith("#"):
continue continue
@ -380,60 +680,92 @@ def parse_tsv(tsv_path: str):
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com data" description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com + PeeringDB"
) )
parser.add_argument( parser.add_argument(
"--output-dir", default=".", "--output-dir", default=None,
help="Directory for output CSV files (default: current dir)" help="Directory for output CSV files (writes asn_reputation.csv and iplocate-ip-to-asn.csv)"
)
parser.add_argument(
"--output-asn", default=None,
help="Explicit path for asn_reputation.csv output"
)
parser.add_argument(
"--output-ipasn", default=None,
help="Explicit path for iplocate-ip-to-asn.csv output"
) )
parser.add_argument( parser.add_argument(
"--no-download", action="store_true", "--no-download", action="store_true",
help="Skip download, use cached ip2asn-v4.tsv in output-dir" help="Skip iptoasn.com download, use cached ip2asn-v4.tsv"
)
parser.add_argument(
"--no-peeringdb", action="store_true",
help="Skip PeeringDB download, classify using keyword heuristics only"
) )
args = parser.parse_args() args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True) # Resolve output paths — support both --output-dir and --output-asn/--output-ipasn
if args.output_asn and args.output_ipasn:
rep_path = args.output_asn
ip_path = args.output_ipasn
work_dir = os.path.dirname(rep_path) or "."
elif args.output_dir:
work_dir = args.output_dir
rep_path = os.path.join(work_dir, "asn_reputation.csv")
ip_path = os.path.join(work_dir, "iplocate-ip-to-asn.csv")
else:
work_dir = "."
rep_path = os.path.join(work_dir, "asn_reputation.csv")
ip_path = os.path.join(work_dir, "iplocate-ip-to-asn.csv")
# Step 1: Get TSV data os.makedirs(work_dir, exist_ok=True)
tsv_path = os.path.join(args.output_dir, CACHED_TSV)
# Step 1: Get IP-to-ASN TSV data
tsv_path = os.path.join(work_dir, CACHED_TSV)
if not args.no_download: if not args.no_download:
tsv_path = download_iptoasn(args.output_dir) tsv_path = download_iptoasn(work_dir)
elif not os.path.exists(tsv_path): elif not os.path.exists(tsv_path):
print(f"[fatal] --no-download specified but {tsv_path} not found", print(f"[fatal] --no-download specified but {tsv_path} not found",
file=sys.stderr) file=sys.stderr)
sys.exit(1) sys.exit(1)
# Step 2: Parse # Step 2: Get PeeringDB data
peeringdb: dict[int, str] = {}
if not args.no_peeringdb:
peeringdb = download_peeringdb(work_dir)
# Step 3: Parse TSV
print("[parse] Reading TSV ...") print("[parse] Reading TSV ...")
entries = parse_tsv(tsv_path) entries = parse_tsv(tsv_path)
print(f"[parse] {len(entries):,} IP range entries") print(f"[parse] {len(entries):,} IP range entries")
# Step 3: Build ASN info map {asn: (country, as_name)} # Step 4: Build ASN info map {asn: (country, as_name)}
# Use first occurrence for name/country (they're consistent in the dataset)
asn_info: dict[int, tuple[str, str]] = {} asn_info: dict[int, tuple[str, str]] = {}
for _, _, asn, country, as_name in entries: for _, _, asn, country, as_name in entries:
if asn not in asn_info: if asn not in asn_info:
asn_info[asn] = (country, as_name) asn_info[asn] = (country, as_name)
# Step 4: Classify all ASNs # Step 5: Classify all ASNs
asn_labels: dict[int, str] = {} asn_labels: dict[int, str] = {}
for asn, (country, as_name) in asn_info.items(): for asn, (country, as_name) in asn_info.items():
asn_labels[asn] = classify_asn(asn, as_name) asn_labels[asn] = classify_asn(asn, as_name, peeringdb)
# Step 5: Write asn_reputation.csv (sorted by ASN, no header) # Include ASNs that are in PeeringDB but not in iptoasn (they have no
rep_path = os.path.join(args.output_dir, "asn_reputation.csv") # IP ranges but still get a label in asn_reputation.csv)
for asn, label in peeringdb.items():
if asn not in asn_labels:
asn_labels[asn] = label
# Step 6: Write asn_reputation.csv (sorted by ASN, no header)
sorted_asns = sorted(asn_labels.keys()) sorted_asns = sorted(asn_labels.keys())
os.makedirs(os.path.dirname(rep_path) or ".", exist_ok=True)
with open(rep_path, "w") as f: with open(rep_path, "w") as f:
for asn in sorted_asns: for asn in sorted_asns:
f.write(f"{asn},{asn_labels[asn]}\n") f.write(f"{asn},{asn_labels[asn]}\n")
print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}") print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}")
# Step 6: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv # Step 7: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv
ip_path = os.path.join(args.output_dir, "iplocate-ip-to-asn.csv")
cidr_count = 0
seen_cidrs: set[str] = set() seen_cidrs: set[str] = set()
# Collect all rows, then sort
cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = [] cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = []
print("[cidr] Converting IP ranges to CIDR notation ...") print("[cidr] Converting IP ranges to CIDR notation ...")
for start_ip, end_ip, asn, country, as_name in entries: for start_ip, end_ip, asn, country, as_name in entries:
@ -444,30 +776,39 @@ def main():
seen_cidrs.add(key) seen_cidrs.add(key)
cidr_rows.append((cidr, asn, country, as_name)) cidr_rows.append((cidr, asn, country, as_name))
# Sort by network address
cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen)) cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen))
os.makedirs(os.path.dirname(ip_path) or ".", exist_ok=True)
with open(ip_path, "w") as f: with open(ip_path, "w") as f:
for cidr, asn, country, as_name in cidr_rows: for cidr, asn, country, as_name in cidr_rows:
# Escape commas in AS names
safe_name = as_name.replace(",", " ") safe_name = as_name.replace(",", " ")
f.write(f"{cidr},{asn},{country},{safe_name}\n") f.write(f"{cidr},{asn},{country},{safe_name}\n")
cidr_count = len(cidr_rows) cidr_count = len(cidr_rows)
print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}") print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}")
# Step 7: Stats # Step 8: Stats
stats: dict[str, int] = {} stats: dict[str, int] = {}
for label in asn_labels.values(): for label in asn_labels.values():
stats[label] = stats.get(label, 0) + 1 stats[label] = stats.get(label, 0) + 1
all_labels = ["isp", "datacenter", "hosting", "cdn", "enterprise",
"education", "government", "unknown"]
print("\n=== Summary ===") print("\n=== Summary ===")
print(f"Total ASNs: {len(sorted_asns):,}") print(f"Total ASNs: {len(sorted_asns):,}")
for label in ["human", "datacenter", "hosting", "cdn", "unknown"]: for label in all_labels:
count = stats.get(label, 0) count = stats.get(label, 0)
pct = 100.0 * count / len(sorted_asns) if sorted_asns else 0 pct = 100.0 * count / len(sorted_asns) if sorted_asns else 0
print(f" {label:12s}: {count:>6,} ({pct:5.1f}%)") print(f" {label:12s}: {count:>6,} ({pct:5.1f}%)")
print(f"Total CIDRs: {cidr_count:,}") print(f"Total CIDRs: {cidr_count:,}")
# Warn if unknown is still too high
unk_pct = 100.0 * stats.get("unknown", 0) / len(sorted_asns) if sorted_asns else 0
if unk_pct > 50:
print(f"\n[warning] {unk_pct:.1f}% of ASNs are still 'unknown'.", file=sys.stderr)
if not peeringdb:
print("[hint] Try running without --no-peeringdb to use PeeringDB data.",
file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -4,7 +4,7 @@
# Outputs: # Outputs:
# data/bot_ip.csv — Known bot/scanner IPs + Tor exit nodes # data/bot_ip.csv — Known bot/scanner IPs + Tor exit nodes
# data/bot_ja4.csv — Known bot JA4 TLS fingerprints # data/bot_ja4.csv — Known bot JA4 TLS fingerprints
# data/asn_reputation.csv — ASN→label mapping (human/datacenter/hosting) # data/asn_reputation.csv — ASN→label mapping (isp/datacenter/hosting/cdn/education/government/enterprise)
# data/iplocate-ip-to-asn.csv — CIDR→ASN for dictionary lookup # data/iplocate-ip-to-asn.csv — CIDR→ASN for dictionary lookup
# #
# Usage: # Usage:

View File

@ -977,7 +977,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH) # - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF. # Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy() unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'human'] human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
# A7 — Valider les features avant tout traitement # A7 — Valider les features avant tout traitement
valid_features = validate_features(df, features, name, cycle_id) valid_features = validate_features(df, features, name, cycle_id)
@ -1414,7 +1414,7 @@ def fetch_and_analyze():
log_decision('CYCLE_START', cycle_id, '', { log_decision('CYCLE_START', cycle_id, '', {
'total_rows': len(df), 'total_rows': len(df),
'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()), 'human_rows': int((df.get('asn_label', pd.Series()) == 'isp').sum()),
'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()), 'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()), 'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()), 'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
@ -1436,12 +1436,12 @@ def fetch_and_analyze():
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')} tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
if fp_ips: if fp_ips:
# Les faux positifs confirmés rejoignent le pool humain # Les faux positifs confirmés rejoignent le pool humain
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human') mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
df.loc[mask_fp, 'asn_label'] = 'human' df.loc[mask_fp, 'asn_label'] = 'isp'
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).") log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
if tp_ips: if tp_ips:
# Les vrais positifs confirmés sont exclus de la baseline humaine # Les vrais positifs confirmés sont exclus de la baseline humaine
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human') mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot' df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).") log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
log_decision('SOC_FEEDBACK', cycle_id, '', { log_decision('SOC_FEEDBACK', cycle_id, '', {

View File

@ -469,11 +469,11 @@ async def features() -> dict[str, Any]:
except Exception: except Exception:
logger.debug("view_thesis_features_1h not available") logger.debug("view_thesis_features_1h not available")
# Human vs bot feature profiles for radar comparison # ISP (residential) vs bot feature profiles for radar comparison
try: try:
human = query( human = query(
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h " f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
"WHERE asn_label = 'human'" "WHERE asn_label = 'isp'"
) )
if human: if human:
result["human_profile"] = human[0] result["human_profile"] = human[0]
@ -663,7 +663,7 @@ async def heatmap() -> dict[str, Any]:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# GET /api/ip/{ip}/radar — Radar comparison vs human baseline # GET /api/ip/{ip}/radar — Radar comparison vs ISP baseline
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
_RADAR_FEATURES = [ _RADAR_FEATURES = [
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio", "hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
@ -686,7 +686,7 @@ async def ip_radar(ip: str) -> dict[str, Any]:
baseline = query( baseline = query(
f"SELECT {cols_avg} " f"SELECT {cols_avg} "
f"FROM {_DB}.view_ai_features_1h " f"FROM {_DB}.view_ai_features_1h "
"WHERE asn_label = 'human'" "WHERE asn_label = 'isp'"
) )
bot_avg = query( bot_avg = query(
f"SELECT {cols_avg} " f"SELECT {cols_avg} "

View File

@ -268,7 +268,7 @@ fi
# 5f. Verify seeder data reached agg table and AI features view # 5f. Verify seeder data reached agg table and AI features view
AGG_COUNT=$(ch_query "SELECT count() FROM ja4_processing.agg_host_ip_ja4_1h") AGG_COUNT=$(ch_query "SELECT count() FROM ja4_processing.agg_host_ip_ja4_1h")
HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='human'") HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='isp'")
BOT_LABEL_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE bot_name != ''") BOT_LABEL_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE bot_name != ''")
UNIQ_SRC_IPS=$(ch_query "SELECT count(DISTINCT src_ip) FROM ja4_processing.view_ai_features_1h") UNIQ_SRC_IPS=$(ch_query "SELECT count(DISTINCT src_ip) FROM ja4_processing.view_ai_features_1h")
UNIQ_JA4=$(ch_query "SELECT count(DISTINCT ja4) FROM ja4_processing.view_ai_features_1h") UNIQ_JA4=$(ch_query "SELECT count(DISTINCT ja4) FROM ja4_processing.view_ai_features_1h")
@ -280,11 +280,11 @@ else
fi fi
if [ "$HUMAN_COUNT" -ge 500 ] 2>/dev/null; then if [ "$HUMAN_COUNT" -ge 500 ] 2>/dev/null; then
pass "Bot-detector baseline: $HUMAN_COUNT human sessions (≥500 threshold met)" pass "Bot-detector baseline: $HUMAN_COUNT ISP sessions (≥500 threshold met)"
elif [ "$HUMAN_COUNT" -gt 0 ] 2>/dev/null; then elif [ "$HUMAN_COUNT" -gt 0 ] 2>/dev/null; then
warn "Human sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)" warn "ISP sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)"
else else
fail "No human sessions in view_ai_features_1h (asn_reputation CSV not loaded?)" fail "No ISP sessions in view_ai_features_1h (asn_reputation CSV not loaded?)"
fi fi
if [ "$BOT_LABEL_COUNT" -gt 0 ] 2>/dev/null; then if [ "$BOT_LABEL_COUNT" -gt 0 ] 2>/dev/null; then