# ja4-platform/scripts/generate_asn_data.py

#!/usr/bin/env python3
"""
generate_asn_data.py — Download public IPtoASN + PeeringDB data and generate:
  1. asn_reputation.csv      — ASN number + reputation label (no header)
  2. iplocate-ip-to-asn.csv  — CIDR,ASN,country,AS name (with header row)

Data sources:
  - https://iptoasn.com/data/ip2asn-v4.tsv.gz  (IP ranges → ASN mapping)
  - https://www.peeringdb.com/api/net             (ASN → network type)

Labels: isp, datacenter, hosting, cdn, enterprise, education, government, unknown

Usage:
  python3 generate_asn_data.py --output-dir ./data
  python3 generate_asn_data.py --output-dir ./data --no-download
  python3 generate_asn_data.py --output-dir ./data --no-peeringdb
  python3 generate_asn_data.py --output-asn out/asn.csv --output-ipasn out/ip.csv
"""

import argparse
import gzip
import ipaddress
import json
import os
import sys
import urllib.request

# Gzip-compressed TSV of IPv4 ranges and their ASNs (fetched by download_iptoasn).
IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz"
# PeeringDB network list (fetched by download_peeringdb, which reads each
# record's "asn", "info_type" and "name" fields).
PEERINGDB_URL = "https://www.peeringdb.com/api/net?limit=0&depth=0"
# File names used to cache the downloaded data inside the working directory.
CACHED_TSV = "ip2asn-v4.tsv"
CACHED_PEERINGDB = "peeringdb_nets.json"

# ---------------------------------------------------------------------------
# Hard-coded well-known ASN lists (highest classification priority)
# ---------------------------------------------------------------------------

# ASNs that classify_asn labels "isp", grouped by country/region.
# classify_asn consults this set after all the other KNOWN_* sets, so an ASN
# that also appears in one of those receives the other label instead.
KNOWN_ISP_ASNS: set[int] = {
    # France
    3215, 12322, 15557, 5410, 6799, 29169, 2027,
    # Germany
    3320, 5432, 6805, 6830, 8422, 31334, 8881, 9145,
    # UK
    2856, 5607, 8913, 6871, 13285, 20712, 25577,
    # Netherlands
    1136, 15542, 33915, 50266, 15435,
    # Spain
    3352, 12357, 12715, 12479, 12338,
    # Italy
    3269, 12874, 30722, 1267, 12797,
    # US — residential / consumer ISPs
    7922, 7018, 701, 20115, 209, 6389, 22773, 10796, 11351, 11427,
    11426, 20001, 22394, 5650, 6128, 10507, 12271, 19108, 26801,
    33363, 33588, 33651, 33652, 33657, 33659, 33660, 33662, 33668,
    7843, 11025, 12083, 20057, 23005, 26827, 33491, 33650, 5769,
    6167, 11404, 14265, 21508, 22561, 30036,
    # Canada
    577, 812, 6327, 852, 855, 6539, 21949, 5645, 6453,
    # Japan
    2516, 4713, 17676, 2519, 2497, 9605, 4685, 7679, 9824,
    # South Korea
    4766, 9318, 3786, 38091, 9848, 4659,
    # Australia
    1221, 4764, 4804, 7545, 9443, 18291, 24429,
    # India
    9829, 45609, 55836, 24560, 17488, 9498, 18101, 45820,
    # Brazil
    4230, 7738, 8167, 16735, 26599, 27699, 28573, 53006, 53089,
    18881, 22085, 28343,
    # Russia
    8359, 12389, 25513, 8402, 3216, 31133, 42610,
    # China
    4134, 4837, 4808, 4812, 9808, 9394, 56040, 56041, 56042,
    # Mexico
    8151, 6503, 11888, 17072, 32098,
    # Turkey
    9121, 34984, 47331, 16135,
    # Poland
    5617, 12912, 5588, 8374, 21021,
    # Sweden / Nordics
    3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518,
    # Switzerland
    3303,
    # Belgium
    6848, 12392,
    # Portugal
    3243, 2860, 8657,
    # Ireland
    5466, 15502,
    # Southeast Asia
    4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534,
    24203, 7470,
    # Middle East
    5384, 8781, 39891, 42961, 12880, 44244, 50710,
    # Africa
    36903, 37105, 36874, 36992, 37453, 29571, 33771, 37492,
    # Argentina
    7303, 10318, 11664, 22927,
    # Colombia
    10620, 13489, 14080, 27831,
    # New Zealand
    9790, 4771, 24127, 9500,
}

# ASNs that classify_asn labels "datacenter", grouped by operator.
# classify_asn checks this set after the CDN and hosting sets and before the
# education, government, enterprise and ISP sets.
KNOWN_DATACENTER_ASNS: set[int] = {
    # AWS
    16509, 14618, 7224,
    # Google Cloud
    396982, 36492, 36384, 15169,
    # Microsoft Azure
    8075, 8068, 8069, 12076,
    # Oracle Cloud
    31898,
    # IBM Cloud / SoftLayer
    36351,
    # Alibaba Cloud
    45102,
    # Tencent Cloud
    132203,
    # OVH / OVHcloud
    16276,
    # Hetzner
    24940, 213230,
    # DigitalOcean
    14061, 393406,
    # Linode / Akamai Connected Cloud
    63949, 22040,
    # Vultr / Choopa
    20473,
    # Scaleway / Online SAS
    12876,
    # Contabo
    209083, 40021,
    # IONOS / 1&1
    8560,
    # Rackspace
    33070, 19994, 27357,
    # Equinix Metal
    54825,
    # Kamatera
    36007,
    # UpCloud
    202053,
    # Cherry Servers
    59642,
    # Leaseweb
    28753, 60781,
    # Psychz Networks
    40676,
    # ColoCrossing
    36352,
    # QuadraNet
    8100,
    # Zenlayer
    21859,
    # Yandex Cloud
    200350, 208722,
    # Huawei Cloud
    136907,
    # NTT Global
    2914,
    # Cogent
    174,
    # GTT
    3257,
    # Zayo
    6461,
}

# ASNs that classify_asn labels "cdn".  This is the first set classify_asn
# checks, so membership here wins over every other KNOWN_* set.
KNOWN_CDN_ASNS: set[int] = {
    13335,  # Cloudflare
    20940,  # Akamai
    54113,  # Fastly
    15133,  # Edgecast / Verizon Digital Media / Edgio
    16625,  # Akamai
    22822,  # Limelight Networks
    30148,  # Sucuri
    209242, # Cloudflare WARP
    132892, # Cloudflare APAC
    202623, # StackPath
    35994,  # Akamai
    23454,  # Akamai
    23455,  # Akamai
    6185,   # Apple CDN (via Akamai)
    714,    # Apple
    2906,   # Netflix Open Connect
    36183,  # Netflix
    40027,  # Netflix
    55095,  # Cloudflare CN
    394536, # Fastly
    395973, # Fastly
    36040,  # Google CDN
    15395,  # MaxCDN
    30081,  # CacheFly
    60068,  # CDN77
    200325, # BunnyCDN
}

# ASNs that classify_asn labels "hosting".  classify_asn checks this set
# right after the CDN set and before the datacenter set.
KNOWN_HOSTING_ASNS: set[int] = {
    26496,  # GoDaddy
    46606,  # Unified Layer / Bluehost
    197695, # Reg.ru
    47583,  # Hostinger
    32244,  # Liquid Web
    19871,  # Network Solutions
    40034,  # Confluence Networks
    26347,  # DreamHost
    29802,  # HIVELOCITY
    36024,  # HostNOC
    21844,  # ThePlanet / SoftLayer
    33182,  # HostDime
    32475,  # SingleHop
    18450,  # WebNX
    53831,  # Squarespace
    15830,  # Telecity / Equinix EU
    25369,  # Hydra Communications
    46475,  # Limestone Networks
    13768,  # Peer 1 Network
    29854,  # Westhost
    36137,  # ServerMania
    20454,  # SecuredServers
    62567,  # DigitalOcean (hosting arm)
    398101, # GoDaddy hosting
    19969,  # Joe's Datacenter
    395003, # WPEngine
    22612,  # Namecheap
    46664,  # VolumeDrive
    55286,  # B2 Net Solutions / ServerMania
    35916,  # MultaCom
    23470,  # ReliableSite
}

# ASNs that classify_asn labels "education" (research networks and
# universities).  classify_asn checks this set before the government,
# enterprise and ISP sets, so an ASN listed here can never be labelled "isp".
#
# 4837 is deliberately absent: it is listed under China in KNOWN_ISP_ASNS and
# is a carrier backbone, but the education check runs first and would have
# labelled all of its address space "education".
KNOWN_EDUCATION_ASNS: set[int] = {
    786,    # JANET (UK academic network)
    21320,  # GEANT (European research)
    11537,  # Internet2 (US research)
    2200,   # RENATER (France research)
    680,    # DFN (Germany research)
    137,    # GARR (Italy research)
    766,    # RedIRIS (Spain research)
    7575,   # AARNet (Australia research)
    513,    # CERN
    1741,   # FUNET (Finland research)
    2852,   # CESNET (Czech research)
    2603,   # NORDUnet (Nordics research)
    1930,   # FCCN (Portugal research)
    2381,   # WIDE Project (Japan research)
    1103,   # SURFNET (Netherlands research)
    2018,   # AFRINIC
    7660,   # APNIC research
    2500,   # WIDE (Japan)
    4556,   # HKIX (Hong Kong academic)
    24151,  # CNGI-CERNET2 (China education)
    4538,   # CERNET (China education)
    3,      # MIT
    10578,  # Stanford
    27,     # University of Maryland
    46,     # UC Berkeley
    104,    # Carnegie Mellon
    224,    # Uninett (Norway research)
    2547,   # Belnet (Belgium research)
    1754,   # DESY (Germany)
    5765,   # SANET (Slovakia academic)
    9264,   # KAIST (South Korea)
}

# ASNs that classify_asn labels "government".  classify_asn checks this set
# before the enterprise and ISP sets.
#
# 10796 is deliberately absent: it is listed among the US consumer ISPs in
# KNOWN_ISP_ASNS, and because the government check runs first it would have
# labelled that ISP's subscribers "government".
KNOWN_GOVERNMENT_ASNS: set[int] = {
    3354,   # US DoD Network Information Center
    27064,  # DoD Education Activity
    721,    # DoD / DISA
    749,    # DoD / DISA
    568,    # US Postal Service
    3541,   # Headquarters USAISC
    6983,   # US Internal Revenue Service
    19551,  # Incapsula Fed (US gov CDN)
    394671, # US Dept of Veterans Affairs
    7046,   # US DoD
    2572,   # UK MoD
    25180,  # Bundesverwaltung (Swiss gov)
    553,    # BelWue (partially government)
    559,    # SWITCH (Swiss gov/edu)
}

# ASNs that classify_asn labels "enterprise".  classify_asn checks this set
# after the CDN, hosting, datacenter, education and government sets.
# NOTE(review): 8068 (also in KNOWN_DATACENTER_ASNS), 30148 (also in
# KNOWN_CDN_ASNS) and 19551 (also in KNOWN_GOVERNMENT_ASNS, with a different
# owner comment) are matched by those earlier checks, so their entries here
# never produce "enterprise" — confirm which label is intended.
KNOWN_ENTERPRISE_ASNS: set[int] = {
    8068,   # Microsoft Corp
    36459,  # GitHub
    32934,  # Facebook / Meta
    13414,  # Twitter / X
    54115,  # Dropbox
    8003,   # Uber
    14413,  # Linkedin
    40428,  # Salesforce
    30148,  # Sucuri (enterprise security)
    394699, # Zoom
    19551,  # Shopify
    62229,  # Spotify
    394406, # Slack
}


# ---------------------------------------------------------------------------
# PeeringDB info_type → label mapping
# ---------------------------------------------------------------------------

# Direct translation from a PeeringDB record's "info_type" string to a
# reputation label, used by download_peeringdb.  The "Network Services" type
# and an empty type are not listed here; download_peeringdb handles them with
# keyword matching on the network name.  Types mapped to "unknown" are still
# stored in the resulting {asn: label} mapping with that label.
PEERINGDB_TYPE_MAP: dict[str, str] = {
    "Cable/DSL/ISP": "isp",
    "NSP":           "isp",
    "Content":       "cdn",
    "Enterprise":    "enterprise",
    "Educational/Research": "education",
    "Government":    "government",
    "Non-Profit":    "enterprise",
    "Route Server":  "unknown",
    "Route Collector": "unknown",
}


def download_peeringdb(output_dir: str) -> dict[int, str]:
    """Download the PeeringDB network list and derive an {asn: label} mapping.

    The raw JSON response is cached as ``CACHED_PEERINGDB`` inside
    *output_dir*.  When the download or its JSON parsing fails, a previously
    cached copy is used instead.  When no usable data is available at all, an
    empty mapping is returned so the caller can continue without PeeringDB.

    Args:
        output_dir: Directory that holds (or will hold) the cache file.

    Returns:
        Mapping of ASN to reputation label.  Records with a non-positive or
        non-integer ASN, and records whose type is neither mapped nor
        keyword-classifiable, are left out.
    """
    cache_path = os.path.join(output_dir, CACHED_PEERINGDB)
    print(f"[peeringdb] Fetching {PEERINGDB_URL} ...")
    data = None
    try:
        req = urllib.request.Request(PEERINGDB_URL, headers={
            "User-Agent": "generate_asn_data/2.0",
            "Accept": "application/json",
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            raw = resp.read()
        print(f"[peeringdb] Downloaded {len(raw):,} bytes")
        data = json.loads(raw)
    except Exception as e:
        # Best-effort boundary: any network/parse failure falls back to cache.
        print(f"[warning] PeeringDB download failed: {e}", file=sys.stderr)
        if os.path.exists(cache_path):
            print(f"[info] Using cached {cache_path}")
            try:
                with open(cache_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as cache_err:
                # A truncated or corrupt cache must not abort the whole run.
                print(f"[warning] Cached PeeringDB data is unreadable: {cache_err}",
                      file=sys.stderr)
                return {}
        else:
            print("[warning] No PeeringDB data available, continuing without it.",
                  file=sys.stderr)
            return {}
    else:
        # Caching is optional: a failed write must not throw away the data
        # that was just downloaded and parsed successfully.
        try:
            with open(cache_path, "wb") as f:
                f.write(raw)
        except OSError as e:
            print(f"[warning] Could not write PeeringDB cache {cache_path}: {e}",
                  file=sys.stderr)

    nets = data.get("data") if isinstance(data, dict) else None
    if not isinstance(nets, list):
        print("[warning] PeeringDB response has no 'data' key", file=sys.stderr)
        return {}

    pdb_labels: dict[int, str] = {}
    print(f"[peeringdb] Processing {len(nets):,} networks ...")

    for net in nets:
        if not isinstance(net, dict):
            continue
        asn = net.get("asn", 0)
        # Guard against null/string ASNs before the numeric comparison.
        if not isinstance(asn, int) or asn <= 0:
            continue
        info_type = (net.get("info_type") or "").strip()
        name = (net.get("name") or "").lower()

        # Direct type mapping
        if info_type in PEERINGDB_TYPE_MAP:
            pdb_labels[asn] = PEERINGDB_TYPE_MAP[info_type]
        elif info_type == "Network Services":
            # Check keywords to distinguish hosting/datacenter/isp
            if _match_keywords(name, HOSTING_KEYWORDS):
                pdb_labels[asn] = "hosting"
            elif _match_keywords(name, DATACENTER_KEYWORDS):
                pdb_labels[asn] = "datacenter"
            elif _match_keywords(name, ISP_KEYWORDS):
                pdb_labels[asn] = "isp"
            else:
                pdb_labels[asn] = "enterprise"
        elif info_type == "":
            # No type set — try keyword classification on name
            # NOTE(review): PeeringDB may report an unset type as
            # "Not Disclosed" rather than "" — confirm and extend if so.
            label = _classify_by_keywords(name)
            if label:
                pdb_labels[asn] = label
        # else: any other info_type value is left unlabelled

    stats: dict[str, int] = {}
    for v in pdb_labels.values():
        stats[v] = stats.get(v, 0) + 1
    print(f"[peeringdb] Mapped {len(pdb_labels):,} ASNs: {stats}")
    return pdb_labels


# ---------------------------------------------------------------------------
# Keyword-based heuristic classification (expanded)
# ---------------------------------------------------------------------------

# Substrings that mark a network name as an ISP.  _match_keywords tests each
# entry with a plain ``in`` check against the lowercased name, so entries
# written with a leading or trailing space (e.g. "ntt ", " isp") only match
# when that space is present in the name.
ISP_KEYWORDS = [
    # Generic ISP terms
    "telecom", "telcom", "telekom", "telefonica", "telecomunicacoes",
    "telecomunicaciones", "telekommunikasjon", "telekommunikation",
    "mobile", "broadband", "fiber", "fibre", "cable", "wireless",
    "residential", "communications", "comm ", " isp", "netcom",
    "internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth",
    "fttb", "fttp", "dial-up", "dialup", "cellular", "lte", "5g network",
    "wimax", "satellite", " tel ", "telco", "ptcl", "bsnl",
    "telecable", "telemedia", "telnet", "telprom",
    "internet provider", "internet access",
    # Specific operators — Global
    "airtel", "jio", "reliance", "vodafone", "orange", "bouygues",
    "proximus", "swisscom", "telenor", "telia", "elisa", "dna oy",
    "rogers", "bell canada", "shaw", "telus", "optus", "tpg ",
    "internode", "comcast", "charter", "spectrum", "cox comm",
    "verizon", "at&t", "centurylink", "lumen", "frontier comm",
    "mediacom", "windstream", "consolidated comm", "t-mobile",
    "sprint", "cricket", "boost mobile", "virgin media",
    "bt group", "sky broadband", "sky uk", "plusnet", "talktalk",
    "ee limited", "three uk", "hutchison", "deutsche telekom",
    "1&1 versatel", "freenet", "unitymedia", "kabel deutschland",
    "kpn", "ziggo", "xs4all", "movistar", "masmovil", "yoigo",
    "tim s.p.a", "fastweb", "iliad", "free sas", "sfr",
    "numericable", "kddi", "ntt ", "softbank bb", "so-net",
    "biglobe", "kt corporation", "sk broadband", "lg uplus",
    "china telecom", "china unicom", "china mobile",
    "chunghwa telecom", "taiwan mobile", "far eastone",
    "pldt", "globe telecom", "true internet", "ais ", "dtac",
    "unifi", "maxis", "singtel", "starhub", "m1 limited",
    "viettel", "vnpt", "indosat", "telkomsel", "xl axiata",
    "turkcell", "turk telekom", "superonline",
    "rostelecom", "mts ", "beeline", "megafon",
    "etisalat", "du telecom", "stc ", "zain",
    "mtn ", "safaricom", "airtel africa", "rain ",
    "telmex", "claro", "vivo ", "oi s.a",
    # Additional global operators
    "telstra", "spark nz", "chorus nz", "vodacom", "digicel",
    "flow ", "liberty cablevision", "totalplay", "izzi ",
    "megacable", "axtel", "tigo ", "entel", "bitel", "movilnet",
    "cantv", "antel", "copel telecom", "algar telecom", "rede sul",
    "surf telecom", "net servicos", "porto seguro",
    "dialog axiata", "sri lanka telecom", "banglalink", "grameenphone",
    "robi axiata", "ncell", "nepal telecom",
    "omantel", "ooredoo", "mobily", "batelco", "viva bahrain",
    "nawras", "sure telecom", "airlink", "cool ideas", "vumatel",
    "telkom sa", "web africa", "afrihost", "zte corporation",
    "smile telecom", "busyinternet", "camtel", "orange cameroun",
    "maroc telecom", "inwi", "djezzy", "mobilis", "tunisie telecom",
    "ethio telecom", "liquid telecom", "neotel", "cell c",
    "a1 telekom", "magenta telekom", "drei austria", "cablecom",
    "salt mobile", "sunrise", "post luxembourg", "tango ",
    "nova croatia", "ht ", "vivacom", "bulsatcom", "cosmote",
    "nova greece", "forthnet", "wind hellas", "cyta",
    "telekom srbija", "vip mobile", "telenor serbia",
    "telekom slovenije", "a1 slovenia",
    "o2 czech", "t-mobile czech", "vodafone czech",
    "orange polska", "polkomtel", "play mobile",
    "netia", "inea ", "vectra",
    "bite ", "tele2", "tet ", "eltel",
    "starman", "telia eesti", "elisa eesti",
    "lattelecom", "baltcom",
    "kyivstar", "lifecell", "ukrtelecom",
    "moldtelecom", "orange moldova",
    "kazakhtelecom", "beeline kz", "tele2 kz",
    "uztelecom", "ucell",
    "bakcell", "azercell", "azerconnect",
    "veon", "silknet", "magti",
]

# Substrings that mark a network name as a cloud/datacenter operator.
# Matched with a plain ``in`` check against the lowercased name (see
# _match_keywords); trailing-space entries such as "colo " require the space.
DATACENTER_KEYWORDS = [
    "cloud", "amazon", "aws", "google cloud", "gcp",
    "microsoft azure", "azure", "digitalocean", "digital ocean",
    "linode", "vultr", "hetzner", "ovhcloud", "ovh sas",
    "scaleway", "contabo", "kamatera", "upcloud",
    "oracle cloud", "alibaba cloud", "tencent cloud",
    "ibm cloud", "softlayer", "rackspace", "equinix",
    "leaseweb", "choopa", "data center", "datacenter", "data centre",
    "colocation", "colo ", "baremetal", "bare metal",
    "infrastructure", "iaas", "paas",
    # Additional cloud/datacenter providers
    "serverel", "packet", "metal ", "nexon", "netmagic",
    "phoenixnap", "tierpoint", "cyrusone", "coresite",
    "datapipe", "internap", "peak 10", "zayo ", "coreweave",
    "lambda cloud", "paperspace", "vast.ai", "fluidstack",
    "cloudfloor", "netriplex", "hyperoptic", "serverspace",
    "selectel", "aruba cloud", "worldstream", "datacamp",
    "m247", "datapacket", "clouvider", "hostkey",
    "tzulo", "ramnode", "hostens", "serverius", "servers.com",
    "nocix", "fdcservers", "reliablesite",
    "quadranet", "cogent ", "colocrossing",
]

# Substrings that mark a network name as a web-hosting provider.  Matched
# with a plain ``in`` check against the lowercased name (see _match_keywords).
# Both download_peeringdb and _classify_by_keywords test this list before
# DATACENTER_KEYWORDS.
HOSTING_KEYWORDS = [
    "hosting", "host ", "hoster", "webhost", "web host",
    "server farm", "vps", "virtual private",
    "dedicated server", "shared hosting", "managed hosting",
    "reseller", "cpanel", "plesk", "wordpress host",
    "godaddy", "namecheap", "hostinger", "bluehost",
    "siteground", "a2 hosting", "dreamhost", "hostgator",
    "ionos", "squarespace", "wix", "wpengine", "wp engine",
    "register.com", "domain registr", "registrar",
    "strato ag", "mittwald", "hosteurope", "all-inkl",
    "infomaniak", "gandi", "one.com", "loopia",
    "fastcomet", "cloudways", "kinsta", "flywheel",
    "netlify", "vercel", "render.com", "railway",
    "heroku", "webhosting",
]

# Substrings that mark a network name as a CDN.  This is the first keyword
# list _classify_by_keywords tests.  Matched with a plain ``in`` check against
# the lowercased name (see _match_keywords).
CDN_KEYWORDS = [
    "cloudflare", "akamai", "fastly", "cdn ",
    "content delivery", "edgecast", "limelight",
    "stackpath", "sucuri", "keycdn", "bunnycdn", "bunny cdn",
    "jsdelivr", "cachefly", "imperva", "incapsula",
    "g-core", "gcorelabs", "cdn77", "cdn network",
    "medianova", "chinacache", "wangsu", "quantil",
    "azion", "section.io", "beluga cdn",
]

# Substrings that mark a network name as an education/research network,
# including several non-English and non-Latin-script terms.  Matched with a
# plain ``in`` check against the lowercased name (see _match_keywords).
EDUCATION_KEYWORDS = [
    "university", "universit", "universite", "universidad", "universidade",
    "universita", "universitaet", "univ ", "univ.", "univers",
    "college", "academic", "academia", "research",
    "education", "educational", "school", "institute",
    "polytechnic", "hochschule", "fachhochschule",
    "nren", "ren ", ".ren", "research network",
    "wissenschaft", "forschung",
    "campus", "student",
    "ecole ", "école", "escuela", "escola",
    "fakultat", "fakultet",
    "技術", "大学", "学院", "研究", "科学",
    "교육", "대학",
    "กรม", "มหาวิทยาลัย",
]

# Substrings that mark a network name as a government or military network.
# Matched with a plain ``in`` check against the lowercased name (see
# _match_keywords), so short entries also match inside longer words.
GOVERNMENT_KEYWORDS = [
    "government", "gouvernement", "gobierno", "governo", "regierung",
    "gouv", "gov.", ".gov", "gob.",
    "ministry", "ministere", "ministerio", "ministerium", "ministero",
    "military", "army", "navy", "defense", "defence", "defensa",
    "police", "polizei", "polizia",
    "federal", "fedral", "estado", "etat",
    "municipal", "prefecture", "region ",
    "national agency", "state of ",
    "bundeswehr", "armed forces",
    "customs", "immigration",
    "justic", "judiciary",
]

# Enterprise keywords are lowest priority — many ISPs have these suffixes.
# _classify_by_keywords therefore tests this list last, after every other
# keyword list has failed to match.
ENTERPRISE_KEYWORDS = [
    "corporation", "corp.", "corporate",
    " inc.", " inc,",
    " ltd.", " ltd,",
    "gmbh", "s.a.", "s.r.l",
    " group", "holdings",
    "financial", "banking", "insurance",
    "pharmaceutical", "logistics",
    "manufacturing", "industrial",
    "automotive", "energy",
    "retail ", "media group",
]


def _match_keywords(name_lower: str, keywords: list[str]) -> bool:
    """Check if any keyword matches in the lowered name."""
    return any(kw in name_lower for kw in keywords)


def _classify_by_keywords(name_lower: str) -> str:
    """Classify by keyword heuristics. Returns label or empty string."""
    if name_lower in ("not routed", "none", "", "-"):
        return "unknown"
    if _match_keywords(name_lower, CDN_KEYWORDS):
        return "cdn"
    if _match_keywords(name_lower, EDUCATION_KEYWORDS):
        return "education"
    if _match_keywords(name_lower, GOVERNMENT_KEYWORDS):
        return "government"
    if _match_keywords(name_lower, HOSTING_KEYWORDS):
        return "hosting"
    if _match_keywords(name_lower, DATACENTER_KEYWORDS):
        return "datacenter"
    if _match_keywords(name_lower, ISP_KEYWORDS):
        return "isp"
    if _match_keywords(name_lower, ENTERPRISE_KEYWORDS):
        return "enterprise"
    return ""


def classify_asn(asn_number: int, as_name: str,
                 peeringdb: dict[int, str]) -> str:
    """Return the reputation label for one ASN.

    Sources are consulted in descending priority: the hard-coded KNOWN_*
    sets, then the PeeringDB mapping, then keyword heuristics on the AS name.
    "unknown" is returned when none of them yields a label.
    """
    # 1. Hard-coded sets; the first set containing the ASN decides.
    hardcoded = (
        (KNOWN_CDN_ASNS, "cdn"),
        (KNOWN_HOSTING_ASNS, "hosting"),
        (KNOWN_DATACENTER_ASNS, "datacenter"),
        (KNOWN_EDUCATION_ASNS, "education"),
        (KNOWN_GOVERNMENT_ASNS, "government"),
        (KNOWN_ENTERPRISE_ASNS, "enterprise"),
        (KNOWN_ISP_ASNS, "isp"),
    )
    for known_asns, category in hardcoded:
        if asn_number in known_asns:
            return category

    # 2. Label derived from PeeringDB, when one exists for this ASN.
    if asn_number in peeringdb:
        return peeringdb[asn_number]

    # 3. Keyword heuristics on the AS name; an empty result means no match.
    return _classify_by_keywords(as_name.lower()) or "unknown"


def ranges_to_cidrs(start_ip: str, end_ip: str):
    """Return the CIDR networks that exactly cover start_ip..end_ip.

    An empty list is returned when either address cannot be parsed as IPv4
    or when the start address lies after the end address.
    """
    rejected = (ipaddress.AddressValueError, ValueError, TypeError)
    try:
        low = ipaddress.IPv4Address(start_ip)
        high = ipaddress.IPv4Address(end_ip)
        if low > high:
            networks = []
        else:
            networks = [*ipaddress.summarize_address_range(low, high)]
    except rejected:
        networks = []
    return networks


def download_iptoasn(output_dir: str) -> str:
    """Download and decompress ip2asn-v4.tsv.gz, return the path to the TSV.

    The decompressed data is stored as ``CACHED_TSV`` inside *output_dir*.
    If anything fails, a previously cached TSV is used when one exists;
    otherwise the process exits with status 1.

    Args:
        output_dir: Directory that holds (or will hold) the cached TSV.

    Returns:
        Path to the TSV file inside *output_dir*.
    """
    tsv_path = os.path.join(output_dir, CACHED_TSV)
    tmp_path = tsv_path + ".tmp"
    print(f"[download] Fetching {IPTOASN_URL} ...")
    try:
        req = urllib.request.Request(IPTOASN_URL, headers={
            "User-Agent": "generate_asn_data/2.0"
        })
        with urllib.request.urlopen(req, timeout=120) as resp:
            compressed = resp.read()
        print(f"[download] Downloaded {len(compressed):,} bytes compressed")
        raw = gzip.decompress(compressed)
        print(f"[download] Decompressed to {len(raw):,} bytes")
        # Write to a temporary file and rename it into place, so a failed or
        # interrupted write can never leave a truncated TSV that the fallback
        # below would then treat as a valid cache.
        with open(tmp_path, "wb") as f:
            f.write(raw)
        os.replace(tmp_path, tsv_path)
        print(f"[download] Saved to {tsv_path}")
    except Exception as e:
        # Best-effort boundary: any failure falls back to the cached copy.
        print(f"[error] Download failed: {e}", file=sys.stderr)
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                # Cleanup only; a leftover temp file is harmless.
                pass
        if os.path.exists(tsv_path):
            print(f"[info] Using cached {tsv_path}")
        else:
            print("[fatal] No cached TSV available, aborting.", file=sys.stderr)
            sys.exit(1)
    return tsv_path


def parse_tsv(tsv_path: str):
    """
    Read ip2asn-v4.tsv and return its usable rows.

    Each row has five tab-separated columns: range start, range end,
    AS number, country code and AS description.  Blank lines, lines starting
    with "#", rows with fewer than five columns, rows whose AS number is not
    an integer and rows with AS number 0 are skipped.

    Returns a list of (start_ip, end_ip, asn, country, as_name) tuples.
    """
    rows = []
    with open(tsv_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if record == "" or record[0] == "#":
                continue
            fields = record.split("\t")
            if len(fields) < 5:
                continue
            range_start, range_end, asn_text, country_code, description = fields[:5]
            try:
                asn_number = int(asn_text)
            except ValueError:
                continue
            if asn_number != 0:
                rows.append(
                    (range_start, range_end, asn_number, country_code, description)
                )
    return rows


def _resolve_output_paths(args: argparse.Namespace) -> tuple[str, str, str]:
    """Return (work_dir, rep_path, ip_path) derived from the parsed options."""
    if args.output_asn and args.output_ipasn:
        # Both explicit paths given: cache files live next to the ASN output.
        work_dir = os.path.dirname(args.output_asn) or "."
    else:
        work_dir = args.output_dir or "."
    # An explicit path always wins, even when only one of the two is given;
    # the other output falls back to its default name inside work_dir.
    rep_path = args.output_asn or os.path.join(work_dir, "asn_reputation.csv")
    ip_path = args.output_ipasn or os.path.join(work_dir, "iplocate-ip-to-asn.csv")
    return work_dir, rep_path, ip_path


def main():
    """Command-line entry point: fetch the data, classify ASNs, write the CSVs.

    Steps: obtain the IP-to-ASN TSV (download or cache), optionally obtain
    PeeringDB labels, classify every ASN, write ``asn_reputation.csv``
    (no header) and ``iplocate-ip-to-asn.csv`` (with header), then print a
    per-label summary.  Exits with status 1 when no TSV data is available.
    """
    parser = argparse.ArgumentParser(
        description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com + PeeringDB"
    )
    parser.add_argument(
        "--output-dir", default=None,
        help="Directory for output CSV files (writes asn_reputation.csv and iplocate-ip-to-asn.csv)"
    )
    parser.add_argument(
        "--output-asn", default=None,
        help="Explicit path for asn_reputation.csv output"
    )
    parser.add_argument(
        "--output-ipasn", default=None,
        help="Explicit path for iplocate-ip-to-asn.csv output"
    )
    parser.add_argument(
        "--no-download", action="store_true",
        help="Skip iptoasn.com download, use cached ip2asn-v4.tsv"
    )
    parser.add_argument(
        "--no-peeringdb", action="store_true",
        help="Skip PeeringDB download, classify using keyword heuristics only"
    )
    args = parser.parse_args()

    # Resolve output paths — support both --output-dir and --output-asn/--output-ipasn
    work_dir, rep_path, ip_path = _resolve_output_paths(args)

    os.makedirs(work_dir, exist_ok=True)

    # Step 1: Get IP-to-ASN TSV data
    tsv_path = os.path.join(work_dir, CACHED_TSV)
    if not args.no_download:
        tsv_path = download_iptoasn(work_dir)
    elif not os.path.exists(tsv_path):
        print(f"[fatal] --no-download specified but {tsv_path} not found",
              file=sys.stderr)
        sys.exit(1)

    # Step 2: Get PeeringDB data
    peeringdb: dict[int, str] = {}
    if not args.no_peeringdb:
        peeringdb = download_peeringdb(work_dir)

    # Step 3: Parse TSV
    print("[parse] Reading TSV ...")
    entries = parse_tsv(tsv_path)
    print(f"[parse] {len(entries):,} IP range entries")

    # Step 4: Build ASN info map {asn: (country, as_name)}; the first row
    # seen for an ASN supplies its country and name.
    asn_info: dict[int, tuple[str, str]] = {}
    for _, _, asn, country, as_name in entries:
        if asn not in asn_info:
            asn_info[asn] = (country, as_name)

    # Step 5: Classify all ASNs
    asn_labels: dict[int, str] = {}
    for asn, (_country, as_name) in asn_info.items():
        asn_labels[asn] = classify_asn(asn, as_name, peeringdb)

    # Include ASNs that are in PeeringDB but not in iptoasn (they have no
    # IP ranges but still get a label in asn_reputation.csv).  They go
    # through classify_asn as well so the hard-coded lists keep priority
    # over the PeeringDB label; with no AS name available, the result is the
    # hard-coded label or, failing that, the PeeringDB one.
    for asn in peeringdb:
        if asn not in asn_labels:
            asn_labels[asn] = classify_asn(asn, "", peeringdb)

    # Step 6: Write asn_reputation.csv (sorted by ASN, no header)
    sorted_asns = sorted(asn_labels)
    os.makedirs(os.path.dirname(rep_path) or ".", exist_ok=True)
    with open(rep_path, "w", encoding="utf-8") as f:
        for asn in sorted_asns:
            f.write(f"{asn},{asn_labels[asn]}\n")
    print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}")

    # Step 7: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv
    seen_cidrs: set[str] = set()
    cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = []
    print("[cidr] Converting IP ranges to CIDR notation ...")
    for start_ip, end_ip, asn, country, as_name in entries:
        for cidr in ranges_to_cidrs(start_ip, end_ip):
            # De-duplicate on the (network, ASN) pair.
            key = f"{cidr},{asn}"
            if key not in seen_cidrs:
                seen_cidrs.add(key)
                cidr_rows.append((cidr, asn, country, as_name))

    cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen))

    os.makedirs(os.path.dirname(ip_path) or ".", exist_ok=True)
    # The TSV is read as UTF-8, so write UTF-8 explicitly: the platform
    # default encoding may be unable to represent some AS names.
    with open(ip_path, "w", encoding="utf-8") as f:
        f.write("network,asn,country_code,name\n")
        for cidr, asn, country, as_name in cidr_rows:
            # Commas would break the unquoted CSV row.
            safe_name = as_name.replace(",", " ")
            f.write(f"{cidr},{asn},{country},{safe_name}\n")
    cidr_count = len(cidr_rows)
    print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}")

    # Step 8: Stats
    stats: dict[str, int] = {}
    for label in asn_labels.values():
        stats[label] = stats.get(label, 0) + 1

    all_labels = ["isp", "datacenter", "hosting", "cdn", "enterprise",
                  "education", "government", "unknown"]
    total_asns = len(sorted_asns)
    print("\n=== Summary ===")
    print(f"Total ASNs:  {total_asns:,}")
    for label in all_labels:
        count = stats.get(label, 0)
        pct = 100.0 * count / total_asns if total_asns else 0
        print(f"  {label:12s}: {count:>6,}  ({pct:5.1f}%)")
    print(f"Total CIDRs: {cidr_count:,}")

    # Warn if unknown is still too high
    unk_pct = 100.0 * stats.get("unknown", 0) / total_asns if total_asns else 0
    if unk_pct > 50:
        print(f"\n[warning] {unk_pct:.1f}% of ASNs are still 'unknown'.", file=sys.stderr)
        if not peeringdb:
            print("[hint] Try running without --no-peeringdb to use PeeringDB data.",
                  file=sys.stderr)


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()