feat: browser JA4 detection, Anubis bot rules, worldwide ASN data
- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...) - Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuck, OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries) - Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs) - Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views - Add /api/browsers dashboard endpoint - Fix CSV quoting for fields containing commas (User-Agent strings) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -1,154 +1,472 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
generate_asn_data.py — Generate ASN reputation + IP-to-ASN lookup CSVs.
|
||||
generate_asn_data.py — Download public IPtoASN database and generate:
|
||||
1. asn_reputation.csv — ASN number + reputation label (no header)
|
||||
2. iplocate-ip-to-asn.csv — CIDR,ASN,country,AS name (no header)
|
||||
|
||||
Sources:
|
||||
• RIPE NCC, ARIN, APNIC ASN registries (well-known allocations)
|
||||
• DataCenter ASN lists from ipinfo.io and bgp.he.net
|
||||
• Manual curation of hosting/cloud/residential ISP ASNs
|
||||
Data source: https://iptoasn.com/data/ip2asn-v4.tsv.gz (free, no registration)
|
||||
|
||||
Outputs:
|
||||
asn_reputation.csv: src_asn,label
|
||||
iplocate-ip-to-asn.csv: network,asn,country_code,name,org,domain
|
||||
Usage:
|
||||
python3 generate_asn_data.py --output-dir .
|
||||
python3 generate_asn_data.py --output-dir . --no-download # reuse cached TSV
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import gzip
|
||||
import ipaddress
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
# --- ASN Classifications ---
|
||||
# Each entry: (asn, label, country, name, org, domain, networks[])
|
||||
ASN_DATABASE = [
|
||||
# ========================= RESIDENTIAL ISPs (human) =========================
|
||||
# Source of the gzip-compressed IPv4 range -> ASN table fetched by download_iptoasn().
IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz"
# File name (inside the output directory) of the decompressed copy kept for reuse.
CACHED_TSV = "ip2asn-v4.tsv"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hard-coded well-known ASN lists for accurate classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ASNs that classify_asn() labels "human" when no higher-priority curated set
# (CDN, hosting, datacenter) contains them. Grouped by country/region for
# readability only; an ASN listed under several headings (e.g. 6830, 5432)
# is harmless because this is a set.
KNOWN_HUMAN_ASNS: set[int] = {
    # France
    3215, 12322, 15557, 5410, 6799, 29169, 2027,
    # Germany
    3320, 5432, 6805, 6830, 8422, 31334, 8881, 9145,
    # UK
    2856, 5607, 8913, 6871, 13285, 20712, 25577,
    # Netherlands
    1136, 15542, 33915, 50266, 15435,
    # Spain
    3352, 12357, 12715, 12479, 12338,
    # Italy
    3269, 12874, 30722, 1267, 12797,
    # US — residential / consumer ISPs
    7922, 7018, 701, 20115, 209, 6389, 22773, 10796, 11351, 11427,
    11426, 20001, 22394, 5650, 6128, 10507, 12271, 19108, 26801,
    33363, 33588, 33651, 33652, 33657, 33659, 33660, 33662, 33668,
    7843, 11025, 12083, 20057, 23005, 26827, 33491, 33650, 5769,
    6167, 11404, 14265, 21508, 22561, 30036,
    # Canada
    577, 812, 6327, 852, 855, 6539, 21949, 5645, 6453,
    # Japan
    2516, 4713, 17676, 2519, 2497, 9605, 4685, 7679, 9824,
    # South Korea
    4766, 9318, 3786, 38091, 9848, 4659,
    # Australia
    1221, 4764, 4804, 7545, 9443, 18291, 24429,
    # India
    9829, 45609, 55836, 24560, 17488, 9498, 18101, 45820,
    # Brazil
    4230, 7738, 8167, 16735, 26599, 27699, 28573, 53006, 53089,
    18881, 22085, 28343,
    # Russia
    8359, 12389, 25513, 8402, 3216, 31133, 42610,
    # China
    4134, 4837, 4808, 4812, 9808, 9394, 56040, 56041, 56042,
    # Mexico
    8151, 6503, 11888, 17072, 32098,
    # Turkey
    9121, 34984, 47331, 16135,
    # Poland
    5617, 12912, 6830, 5588, 8374, 21021,
    # Sweden / Nordics
    3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518,
    # Switzerland
    3303, 6830,
    # Belgium
    5432, 6848, 12392,
    # Portugal
    3243, 2860, 8657,
    # Ireland
    5466, 15502, 6830,
    # Southeast Asia
    4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534,
    24203, 7470,
    # Middle East
    5384, 8781, 39891, 42961, 12880, 44244, 50710,
    # Africa
    36903, 37105, 36874, 36992, 37453, 29571, 33771, 37492,
    # Argentina
    7303, 10318, 11664, 22927,
    # Colombia
    10620, 13489, 14080, 27831,
    # New Zealand
    9790, 4771, 24127, 9500,
}
|
||||
|
||||
# ========================= SEARCH ENGINES (human) =========================
|
||||
(15169, "human", "US", "Google LLC", "Google", "google.com",
|
||||
["66.249.64.0/19", "64.233.160.0/19", "72.14.192.0/18"]),
|
||||
(8075, "human", "US", "Microsoft Corporation", "Bing", "microsoft.com",
|
||||
["157.55.0.0/16", "207.46.0.0/16", "40.76.0.0/14"]),
|
||||
(32934, "human", "US", "Facebook Inc", "Meta", "facebook.com",
|
||||
["69.63.176.0/20", "66.220.144.0/20", "31.13.24.0/21"]),
|
||||
(13414, "human", "US", "Twitter Inc", "Twitter", "twitter.com",
|
||||
["199.59.148.0/22", "199.16.156.0/22"]),
|
||||
# ASNs that classify_asn() labels "datacenter". The CDN and hosting sets are
# consulted first, so an ASN that also appears there (36351 is listed in
# KNOWN_HOSTING_ASNS as well) is never reported as "datacenter".
KNOWN_DATACENTER_ASNS: set[int] = {
    # AWS
    16509, 14618, 7224,
    # Google Cloud
    396982, 36492, 36384, 15169,
    # Microsoft Azure
    8075, 8068, 8069, 12076,
    # Oracle Cloud
    31898,
    # IBM Cloud / SoftLayer
    36351,
    # Alibaba Cloud
    45102,
    # Tencent Cloud
    132203,
    # OVH / OVHcloud
    16276,
    # Hetzner
    24940, 213230,
    # DigitalOcean
    14061, 393406,
    # Linode / Akamai Connected Cloud
    63949, 22040,
    # Vultr
    20473,
    # Scaleway / Online SAS
    12876,
    # Contabo
    209083, 40021,
    # IONOS / 1&1
    8560,
    # Rackspace
    33070, 19994, 27357,
    # Equinix Metal
    54825,
    # Kamatera
    36007,
    # UpCloud
    202053,
    # Cherry Servers
    59642,
    # Leaseweb
    28753, 60781,
    # Psychz Networks
    40676,
    # ColoCrossing
    36352,
    # QuadraNet
    8100,
    # Choopa (Vultr subsidiary) — same ASN as the Vultr entry above; the
    # repetition is harmless in a set.
    20473,
    # Zenlayer
    21859,
}
|
||||
|
||||
# ========================= DATACENTER / SCANNER =========================
|
||||
(210644, "datacenter", "NL", "Accelerated-IT Services", "Tor Project", "tor-project.org",
|
||||
["185.220.100.0/22", "185.220.101.0/24", "185.220.102.0/24"]),
|
||||
(209083, "datacenter", "DE", "Contabo GmbH", "Contabo", "contabo.de",
|
||||
["45.155.205.0/24", "62.171.128.0/17", "5.161.0.0/16"]),
|
||||
(14061, "datacenter", "US", "DigitalOcean LLC", "DigitalOcean", "digitalocean.com",
|
||||
["64.225.0.0/16", "104.131.0.0/16", "138.197.0.0/16", "159.65.0.0/16"]),
|
||||
(16509, "datacenter", "US", "Amazon.com ARIN", "AWS", "amazonaws.com",
|
||||
["3.0.0.0/8", "18.0.0.0/8", "52.0.0.0/8", "54.0.0.0/8"]),
|
||||
(396982, "datacenter", "US", "Google Cloud", "GCP", "cloud.google.com",
|
||||
["34.0.0.0/8", "35.184.0.0/13"]),
|
||||
(8560, "datacenter", "DE", "IONOS SE", "IONOS", "ionos.com",
|
||||
["74.208.0.0/16", "212.227.0.0/16"]),
|
||||
(24940, "datacenter", "DE", "Hetzner Online GmbH", "Hetzner", "hetzner.com",
|
||||
["136.243.0.0/16", "138.201.0.0/16", "144.76.0.0/16", "178.63.0.0/16"]),
|
||||
(20473, "datacenter", "US", "The Constant Company", "Vultr", "vultr.com",
|
||||
["45.32.0.0/16", "64.237.32.0/19", "108.61.0.0/16"]),
|
||||
(63949, "datacenter", "US", "Linode LLC", "Linode", "linode.com",
|
||||
["45.33.0.0/17", "45.56.0.0/16", "50.116.0.0/18"]),
|
||||
(13335, "datacenter", "US", "Cloudflare Inc", "Cloudflare", "cloudflare.com",
|
||||
["104.16.0.0/12", "172.64.0.0/13", "162.158.0.0/15"]),
|
||||
# ASNs that classify_asn() labels "cdn". This set is checked before every
# other curated set, so membership here overrides all other classifications.
KNOWN_CDN_ASNS: set[int] = {
    13335,   # Cloudflare
    20940,   # Akamai
    54113,   # Fastly
    15133,   # Edgecast / Verizon Digital Media / Edgio
    16625,   # Akamai
    22822,   # Limelight Networks
    30148,   # Sucuri
    209242,  # Cloudflare WARP
    132892,  # Cloudflare APAC
    202623,  # StackPath
    35994,   # Akamai
    23454,   # Akamai
    23455,   # Akamai
    6185,    # Apple CDN (via Akamai)
    714,     # Apple
    2906,    # Netflix Open Connect
    36183,   # Netflix
    40027,   # Netflix
    55095,   # Cloudflare CN
    394536,  # Fastly
    395973,  # Fastly
}
|
||||
|
||||
# ========================= HOSTING =========================
|
||||
(197695, "hosting", "RU", "Reg.ru Hosting", "Reg.ru", "reg.ru",
|
||||
["193.32.162.0/24", "194.58.92.0/22"]),
|
||||
(51167, "hosting", "DE", "Contabo GmbH", "Contabo Hosting", "contabo.de",
|
||||
["78.46.0.0/15"]),
|
||||
(46606, "hosting", "US", "Unified Layer", "Bluehost", "bluehost.com",
|
||||
["162.241.0.0/16", "198.57.128.0/17"]),
|
||||
(26496, "hosting", "US", "GoDaddy.com", "GoDaddy", "godaddy.com",
|
||||
["184.168.0.0/16", "198.71.128.0/17"]),
|
||||
# ASNs that classify_asn() labels "hosting". Checked after the CDN set and
# before the datacenter set, so an ASN present in both this set and
# KNOWN_DATACENTER_ASNS (36351) is reported as "hosting".
KNOWN_HOSTING_ASNS: set[int] = {
    26496,   # GoDaddy
    46606,   # Unified Layer / Bluehost
    197695,  # Reg.ru
    47583,   # Hostinger
    32244,   # Liquid Web
    19871,   # Network Solutions
    40034,   # Confluence Networks
    26347,   # DreamHost
    36351,   # SoftLayer (also datacenter, but historically hosting)
    29802,   # HIVELOCITY
    36024,   # HostNOC
    21844,   # ThePlanet / SoftLayer
    33182,   # HostDime
    32475,   # SingleHop
    18450,   # WebNX
    53831,   # Squarespace
    15830,   # Telecity / Equinix EU
    25369,   # Hydra Communications
    46475,   # Limestone Networks
    13768,   # Peer 1 Network
    29854,   # Westhost
    36137,   # ServerMania
    20454,   # SecuredServers
    62567,   # DigitalOcean (hosting arm)
    398101,  # GoDaddy hosting
    19969,   # Joe's Datacenter
    395003,  # WPEngine
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Keyword-based heuristic classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lower-case substrings that classify_asn() searches for in the lower-cased AS
# name to label an ASN "human". This list is consulted last, after the CDN,
# hosting and datacenter keyword lists.
# NOTE(review): entries with a leading/trailing space ("comm ", " isp",
# " tel ") presumably approximate word boundaries — confirm before editing.
HUMAN_KEYWORDS = [
    "telecom", "telcom", "telekom", "telefonica", "telecomunicacoes",
    "mobile", "broadband", "fiber", "fibre", "cable", "wireless",
    "residential", "communications", "comm ", " isp", "netcom",
    "internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth",
    "fttb", "dial-up", "dialup", "cellular", "lte", "5g network",
    "wimax", "satellite", " tel ", "telco", "ptcl", "bsnl",
    "airtel", "jio", "reliance", "vodafone", "orange", "bouygues",
    "proximus", "swisscom", "telenor", "telia", "elisa", "dna oy",
    "rogers", "bell canada", "shaw", "telus", "optus", "tpg ",
    "internode", "comcast", "charter", "spectrum", "cox comm",
    "verizon", "at&t", "centurylink", "lumen", "frontier comm",
    "mediacom", "windstream", "consolidated comm", "t-mobile",
    "sprint", "cricket", "boost mobile", "virgin media",
    "bt group", "sky broadband", "sky uk", "plusnet", "talktalk",
    "ee limited", "three uk", "hutchison", "deutsche telekom",
    "1&1 versatel", "freenet", "unitymedia", "kabel deutschland",
    "kpn", "ziggo", "xs4all", "movistar", "masmovil", "yoigo",
    "tim s.p.a", "fastweb", "iliad", "free sas", "sfr",
    "numericable", "kddi", "ntt ", "softbank bb", "so-net",
    "biglobe", "kt corporation", "sk broadband", "lg uplus",
    "china telecom", "china unicom", "china mobile",
    "chunghwa telecom", "taiwan mobile", "far eastone",
    "pldt", "globe telecom", "true internet", "ais ", "dtac",
    "unifi", "maxis", "singtel", "starhub", "m1 limited",
    "viettel", "vnpt", "indosat", "telkomsel", "xl axiata",
    "turkcell", "turk telekom", "superonline",
    "rostelecom", "mts ", "beeline", "megafon",
    "etisalat", "du telecom", "stc ", "zain",
    "mtn ", "safaricom", "airtel africa", "rain ",
    "telmex", "claro", "vivo ", "oi s.a",
]
|
||||
|
||||
# Lower-case substrings that classify_asn() searches for in the lower-cased AS
# name to label an ASN "datacenter". Consulted after the CDN and hosting
# keyword lists and before HUMAN_KEYWORDS.
DATACENTER_KEYWORDS = [
    "cloud", "amazon", "aws", "google cloud", "gcp",
    "microsoft azure", "azure", "digitalocean", "digital ocean",
    "linode", "vultr", "hetzner", "ovhcloud", "ovh sas",
    "scaleway", "contabo", "kamatera", "upcloud",
    "oracle cloud", "alibaba cloud", "tencent cloud",
    "ibm cloud", "softlayer", "rackspace", "equinix",
    "leaseweb", "choopa", "data center", "datacenter",
    "colocation", "colo ", "baremetal", "bare metal",
    "infrastructure", "iaas", "paas",
]
|
||||
|
||||
# Lower-case substrings that classify_asn() searches for in the lower-cased AS
# name to label an ASN "hosting". Consulted after CDN_KEYWORDS and before
# DATACENTER_KEYWORDS, so a name matching both lists is reported as "hosting".
HOSTING_KEYWORDS = [
    "hosting", "host ", "hoster", "webhost",
    "server farm", "vps", "virtual private",
    "dedicated server", "shared hosting", "managed hosting",
    "reseller", "cpanel", "plesk", "wordpress host",
    "godaddy", "namecheap", "hostinger", "bluehost",
    "siteground", "a2 hosting", "dreamhost", "hostgator",
    "ionos", "squarespace", "wix", "wpengine",
]
|
||||
|
||||
# Lower-case substrings that classify_asn() searches for in the lower-cased AS
# name to label an ASN "cdn". This is the first keyword list consulted.
CDN_KEYWORDS = [
    "cloudflare", "akamai", "fastly", "cdn ",
    "content delivery", "edgecast", "limelight",
    "stackpath", "sucuri", "keycdn", "bunnycdn",
    "jsdelivr", "cachefly", "imperva", "incapsula",
]
|
||||
|
||||
|
||||
def classify_asn(asn_number: int, as_name: str) -> str:
    """Classify an ASN into a reputation category.

    Resolution order:
      1. Curated ASN sets: CDN, then hosting, then datacenter, then human.
      2. Placeholder names ("not routed", "none", "", "-") -> "unknown".
      3. Keyword lists matched as substrings of the lower-cased AS name, in
         the order CDN, hosting, datacenter, human.

    Args:
        asn_number: Autonomous system number.
        as_name: Free-text AS description.

    Returns:
        One of "cdn", "hosting", "datacenter", "human" or "unknown".
    """
    # Hard-coded lookups first (highest priority); earlier sets win when an
    # ASN is listed in more than one of them.
    known_sets = (
        ("cdn", KNOWN_CDN_ASNS),
        ("hosting", KNOWN_HOSTING_ASNS),
        ("datacenter", KNOWN_DATACENTER_ASNS),
        ("human", KNOWN_HUMAN_ASNS),
    )
    for label, members in known_sets:
        if asn_number in members:
            return label

    name_lower = as_name.strip().lower()

    # Skip "Not routed" or reserved placeholders
    if name_lower in ("not routed", "none", "", "-"):
        return "unknown"

    # Several keywords carry a leading/trailing space as a word-boundary
    # marker ("ntt ", " isp", "host "). Pad the name so those keywords can
    # also match a word sitting at the very start or end of the name.
    haystack = f" {name_lower} "

    # Most specific category first: CDN, hosting, datacenter, then ISP.
    keyword_tables = (
        ("cdn", CDN_KEYWORDS),
        ("hosting", HOSTING_KEYWORDS),
        ("datacenter", DATACENTER_KEYWORDS),
        ("human", HUMAN_KEYWORDS),
    )
    for label, keywords in keyword_tables:
        if any(kw in haystack for kw in keywords):
            return label

    return "unknown"
|
||||
|
||||
|
||||
def ranges_to_cidrs(start_ip: str, end_ip: str):
    """Return the CIDR networks that exactly cover start_ip..end_ip (inclusive).

    Both bounds are IPv4 addresses in dotted-quad form. An empty list is
    returned when either bound cannot be parsed or when the range is
    reversed (start_ip greater than end_ip).
    """
    try:
        lo = ipaddress.IPv4Address(start_ip)
        hi = ipaddress.IPv4Address(end_ip)
        if lo <= hi:
            networks = [*ipaddress.summarize_address_range(lo, hi)]
        else:
            # Reversed range: nothing to cover.
            networks = []
    except (ipaddress.AddressValueError, ValueError, TypeError):
        networks = []
    return networks
|
||||
|
||||
|
||||
def download_iptoasn(output_dir: str) -> str:
    """Download and decompress ip2asn-v4.tsv.gz, return path to TSV.

    The decompressed data is first written to a temporary sibling file and
    then moved over the cache with os.replace(), so a failure while writing
    cannot truncate a previously cached copy that the fallback relies on.

    Args:
        output_dir: Directory that holds (or will hold) the cached TSV.

    Returns:
        Path of the TSV inside output_dir: the fresh download, or the
        previously cached copy when the download failed.

    Exits the process with status 1 when the download fails and no cached
    copy exists.
    """
    tsv_path = os.path.join(output_dir, CACHED_TSV)
    tmp_path = tsv_path + ".part"
    print(f"[download] Fetching {IPTOASN_URL} ...")
    try:
        req = urllib.request.Request(IPTOASN_URL, headers={
            "User-Agent": "generate_asn_data/1.0"
        })
        with urllib.request.urlopen(req, timeout=120) as resp:
            compressed = resp.read()
        print(f"[download] Downloaded {len(compressed):,} bytes compressed")
        raw = gzip.decompress(compressed)
        print(f"[download] Decompressed to {len(raw):,} bytes")
        with open(tmp_path, "wb") as f:
            f.write(raw)
        # Swap the complete file into place in one step.
        os.replace(tmp_path, tsv_path)
        print(f"[download] Saved to {tsv_path}")
    except Exception as e:
        # Deliberately broad: any failure (network, gzip, disk) falls back
        # to the cached copy instead of crashing the run.
        print(f"[error] Download failed: {e}", file=sys.stderr)
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup; the partial file may never have been created.
            pass
        if os.path.exists(tsv_path):
            print(f"[info] Using cached {tsv_path}")
        else:
            print("[fatal] No cached TSV available, aborting.", file=sys.stderr)
            sys.exit(1)
    return tsv_path
|
||||
|
||||
|
||||
def parse_tsv(tsv_path: str):
    """
    Parse ip2asn-v4.tsv.

    Format: range_start \\t range_end \\t AS_number \\t country_code \\t AS_description

    Skipped: blank lines, lines starting with '#', rows with fewer than five
    tab-separated columns, rows whose AS number is not an integer, and rows
    whose AS number is 0. A row with an empty trailing AS description is
    kept, with as_name set to "".

    Returns list of (start_ip, end_ip, asn, country, as_name) tuples.
    """
    entries = []
    with open(tsv_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # Remove only the line terminator: stripping all whitespace would
            # also eat trailing tab delimiters and make rows with an empty
            # last column look too short.
            line = raw_line.rstrip("\r\n")
            if not line.strip() or line.lstrip().startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 5:
                continue
            start_ip, end_ip, asn_str, country, as_name = (
                field.strip() for field in parts[:5]
            )
            try:
                asn = int(asn_str)
            except ValueError:
                continue
            if asn == 0:
                continue
            entries.append((start_ip, end_ip, asn, country, as_name))
    return entries
|
||||
|
||||
|
||||
def main():
    """Build asn_reputation.csv and iplocate-ip-to-asn.csv from the iptoasn dump.

    Command-line options:
        --output-dir   directory receiving both CSVs and the cached TSV
                       (default: current directory)
        --no-download  reuse the cached ip2asn-v4.tsv instead of fetching it

    Exits with status 1 when --no-download is given but no cached TSV exists.
    """
    parser = argparse.ArgumentParser(
        description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com data"
    )
    parser.add_argument(
        "--output-dir", default=".",
        help="Directory for output CSV files (default: current dir)"
    )
    parser.add_argument(
        "--no-download", action="store_true",
        help="Skip download, use cached ip2asn-v4.tsv in output-dir"
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Step 1: Get TSV data
    tsv_path = os.path.join(args.output_dir, CACHED_TSV)
    if not args.no_download:
        tsv_path = download_iptoasn(args.output_dir)
    elif not os.path.exists(tsv_path):
        print(f"[fatal] --no-download specified but {tsv_path} not found",
              file=sys.stderr)
        sys.exit(1)

    # Step 2: Parse
    print("[parse] Reading TSV ...")
    entries = parse_tsv(tsv_path)
    print(f"[parse] {len(entries):,} IP range entries")

    # Step 3: Build ASN info map {asn: (country, as_name)}
    # The first occurrence of each ASN supplies its name/country.
    asn_info: dict[int, tuple[str, str]] = {}
    for _, _, asn, country, as_name in entries:
        asn_info.setdefault(asn, (country, as_name))

    # Step 4: Classify all ASNs
    asn_labels: dict[int, str] = {
        asn: classify_asn(asn, as_name)
        for asn, (_, as_name) in asn_info.items()
    }

    # Step 5: Write asn_reputation.csv (sorted by ASN, no header)
    rep_path = os.path.join(args.output_dir, "asn_reputation.csv")
    sorted_asns = sorted(asn_labels)
    # The input is read as UTF-8, so write UTF-8 explicitly instead of
    # depending on the platform's default encoding.
    with open(rep_path, "w", encoding="utf-8") as f:
        f.writelines(f"{asn},{asn_labels[asn]}\n" for asn in sorted_asns)
    print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}")

    # Step 6: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv
    ip_path = os.path.join(args.output_dir, "iplocate-ip-to-asn.csv")
    seen_cidrs: set[tuple[ipaddress.IPv4Network, int]] = set()

    # Collect all rows, then sort
    cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = []
    print("[cidr] Converting IP ranges to CIDR notation ...")
    for start_ip, end_ip, asn, country, as_name in entries:
        for cidr in ranges_to_cidrs(start_ip, end_ip):
            key = (cidr, asn)
            if key not in seen_cidrs:
                seen_cidrs.add(key)
                cidr_rows.append((cidr, asn, country, as_name))

    # Sort by network address, then prefix length
    cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen))

    with open(ip_path, "w", encoding="utf-8") as f:
        for cidr, asn, country, as_name in cidr_rows:
            # The file is written unquoted, so commas inside AS names are
            # replaced by spaces to keep the column count fixed.
            safe_name = as_name.replace(",", " ")
            f.write(f"{cidr},{asn},{country},{safe_name}\n")
    cidr_count = len(cidr_rows)
    print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}")

    # Step 7: Stats
    stats: dict[str, int] = {}
    for label in asn_labels.values():
        stats[label] = stats.get(label, 0) + 1

    print("\n=== Summary ===")
    print(f"Total ASNs: {len(sorted_asns):,}")
    for label in ["human", "datacenter", "hosting", "cdn", "unknown"]:
        count = stats.get(label, 0)
        pct = 100.0 * count / len(sorted_asns) if sorted_asns else 0
        print(f"  {label:12s}: {count:>6,} ({pct:5.1f}%)")
    print(f"Total CIDRs: {cidr_count:,}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user