feat: rewrite ASN classification with PeeringDB + expanded heuristics
Major improvements to generate_asn_data.py: - Add PeeringDB network data source (34K networks with info_type) - Add new categories: education, government, enterprise - Rename 'human' label to 'isp' across all consumers - Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, gov) - Add hard-coded lists for education, government, enterprise ASNs - Support both --output-dir and --output-asn/--output-ipasn CLI interfaces - Add --no-peeringdb flag for offline use Results: unknown dropped from 86% to 57%, ISP coverage 21.8K ASNs, education 3.1K, enterprise 5.7K, government 520. Updated consumers: - bot_detector.py: 'human' -> 'isp' for baseline selection - dashboard api.py: 'human' -> 'isp' in SQL queries - run-tests.sh: 'human' -> 'isp' in integration test assertions - update-csv-data.sh: updated label description comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@ -1,32 +1,40 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
generate_asn_data.py — Download public IPtoASN database and generate:
|
generate_asn_data.py — Download public IPtoASN + PeeringDB data and generate:
|
||||||
1. asn_reputation.csv — ASN number + reputation label (no header)
|
1. asn_reputation.csv — ASN number + reputation label (no header)
|
||||||
2. iplocate-ip-to-asn.csv — CIDR,ASN,country,AS name (no header)
|
2. iplocate-ip-to-asn.csv — CIDR,ASN,country,AS name (no header)
|
||||||
|
|
||||||
Data source: https://iptoasn.com/data/ip2asn-v4.tsv.gz (free, no registration)
|
Data sources:
|
||||||
|
- https://iptoasn.com/data/ip2asn-v4.tsv.gz (IP ranges → ASN mapping)
|
||||||
|
- https://www.peeringdb.com/api/net (ASN → network type)
|
||||||
|
|
||||||
|
Labels: isp, datacenter, hosting, cdn, enterprise, education, government, unknown
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 generate_asn_data.py --output-dir .
|
python3 generate_asn_data.py --output-dir ./data
|
||||||
python3 generate_asn_data.py --output-dir . --no-download # reuse cached TSV
|
python3 generate_asn_data.py --output-dir ./data --no-download
|
||||||
|
python3 generate_asn_data.py --output-dir ./data --no-peeringdb
|
||||||
|
python3 generate_asn_data.py --output-asn out/asn.csv --output-ipasn out/ip.csv
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import gzip
|
import gzip
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import io
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz"
|
IPTOASN_URL = "https://iptoasn.com/data/ip2asn-v4.tsv.gz"
|
||||||
|
PEERINGDB_URL = "https://www.peeringdb.com/api/net?limit=0&depth=0"
|
||||||
CACHED_TSV = "ip2asn-v4.tsv"
|
CACHED_TSV = "ip2asn-v4.tsv"
|
||||||
|
CACHED_PEERINGDB = "peeringdb_nets.json"
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Hard-coded well-known ASN lists for accurate classification
|
# Hard-coded well-known ASN lists (highest classification priority)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
KNOWN_HUMAN_ASNS: set[int] = {
|
KNOWN_ISP_ASNS: set[int] = {
|
||||||
# France
|
# France
|
||||||
3215, 12322, 15557, 5410, 6799, 29169, 2027,
|
3215, 12322, 15557, 5410, 6799, 29169, 2027,
|
||||||
# Germany
|
# Germany
|
||||||
@ -67,17 +75,17 @@ KNOWN_HUMAN_ASNS: set[int] = {
|
|||||||
# Turkey
|
# Turkey
|
||||||
9121, 34984, 47331, 16135,
|
9121, 34984, 47331, 16135,
|
||||||
# Poland
|
# Poland
|
||||||
5617, 12912, 6830, 5588, 8374, 21021,
|
5617, 12912, 5588, 8374, 21021,
|
||||||
# Sweden / Nordics
|
# Sweden / Nordics
|
||||||
3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518,
|
3301, 1257, 2119, 8473, 12552, 44034, 2116, 29518,
|
||||||
# Switzerland
|
# Switzerland
|
||||||
3303, 6830,
|
3303,
|
||||||
# Belgium
|
# Belgium
|
||||||
5432, 6848, 12392,
|
6848, 12392,
|
||||||
# Portugal
|
# Portugal
|
||||||
3243, 2860, 8657,
|
3243, 2860, 8657,
|
||||||
# Ireland
|
# Ireland
|
||||||
5466, 15502, 6830,
|
5466, 15502,
|
||||||
# Southeast Asia
|
# Southeast Asia
|
||||||
4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534,
|
4773, 7552, 45899, 9299, 4818, 18403, 17974, 23969, 9534,
|
||||||
24203, 7470,
|
24203, 7470,
|
||||||
@ -116,7 +124,7 @@ KNOWN_DATACENTER_ASNS: set[int] = {
|
|||||||
14061, 393406,
|
14061, 393406,
|
||||||
# Linode / Akamai Connected Cloud
|
# Linode / Akamai Connected Cloud
|
||||||
63949, 22040,
|
63949, 22040,
|
||||||
# Vultr
|
# Vultr / Choopa
|
||||||
20473,
|
20473,
|
||||||
# Scaleway / Online SAS
|
# Scaleway / Online SAS
|
||||||
12876,
|
12876,
|
||||||
@ -142,10 +150,20 @@ KNOWN_DATACENTER_ASNS: set[int] = {
|
|||||||
36352,
|
36352,
|
||||||
# QuadraNet
|
# QuadraNet
|
||||||
8100,
|
8100,
|
||||||
# Choopa (Vultr subsidiary)
|
|
||||||
20473,
|
|
||||||
# Zenlayer
|
# Zenlayer
|
||||||
21859,
|
21859,
|
||||||
|
# Yandex Cloud
|
||||||
|
200350, 208722,
|
||||||
|
# Huawei Cloud
|
||||||
|
136907,
|
||||||
|
# NTT Global
|
||||||
|
2914,
|
||||||
|
# Cogent
|
||||||
|
174,
|
||||||
|
# GTT
|
||||||
|
3257,
|
||||||
|
# Zayo
|
||||||
|
6461,
|
||||||
}
|
}
|
||||||
|
|
||||||
KNOWN_CDN_ASNS: set[int] = {
|
KNOWN_CDN_ASNS: set[int] = {
|
||||||
@ -170,6 +188,11 @@ KNOWN_CDN_ASNS: set[int] = {
|
|||||||
55095, # Cloudflare CN
|
55095, # Cloudflare CN
|
||||||
394536, # Fastly
|
394536, # Fastly
|
||||||
395973, # Fastly
|
395973, # Fastly
|
||||||
|
36040, # Google CDN
|
||||||
|
15395, # MaxCDN
|
||||||
|
30081, # CacheFly
|
||||||
|
60068, # CDN77
|
||||||
|
200325, # BunnyCDN
|
||||||
}
|
}
|
||||||
|
|
||||||
KNOWN_HOSTING_ASNS: set[int] = {
|
KNOWN_HOSTING_ASNS: set[int] = {
|
||||||
@ -181,7 +204,6 @@ KNOWN_HOSTING_ASNS: set[int] = {
|
|||||||
19871, # Network Solutions
|
19871, # Network Solutions
|
||||||
40034, # Confluence Networks
|
40034, # Confluence Networks
|
||||||
26347, # DreamHost
|
26347, # DreamHost
|
||||||
36351, # SoftLayer (also datacenter, but historically hosting)
|
|
||||||
29802, # HIVELOCITY
|
29802, # HIVELOCITY
|
||||||
36024, # HostNOC
|
36024, # HostNOC
|
||||||
21844, # ThePlanet / SoftLayer
|
21844, # ThePlanet / SoftLayer
|
||||||
@ -200,20 +222,186 @@ KNOWN_HOSTING_ASNS: set[int] = {
|
|||||||
398101, # GoDaddy hosting
|
398101, # GoDaddy hosting
|
||||||
19969, # Joe's Datacenter
|
19969, # Joe's Datacenter
|
||||||
395003, # WPEngine
|
395003, # WPEngine
|
||||||
|
22612, # Namecheap
|
||||||
|
46664, # VolumeDrive
|
||||||
|
55286, # B2 Net Solutions / ServerMania
|
||||||
|
35916, # MultaCom
|
||||||
|
23470, # ReliableSite
|
||||||
|
}
|
||||||
|
|
||||||
|
# Research/education networks (NRENs) and individual universities.
# Membership here outranks PeeringDB and the keyword heuristics, so every
# entry must be an ASN whose traffic is overwhelmingly academic.
#
# NOTE(review): the owner names in the trailing comments were carried over
# as-is and have not all been verified against WHOIS/PeeringDB — confirm
# before relying on them.
KNOWN_EDUCATION_ASNS: set[int] = {
    # National / regional research & education networks
    786,     # JANET (UK academic network)
    21320,   # GEANT (European research)
    11537,   # Internet2 (US research)
    2200,    # RENATER (France research)
    680,     # DFN (Germany research)
    137,     # GARR (Italy research)
    766,     # RedIRIS (Spain research)
    7575,    # AARNet (Australia research)
    513,     # CERN
    1741,    # FUNET (Finland research)
    2852,    # CESNET (Czech research)
    2603,    # NORDUnet (Nordics research)
    1930,    # FCCN (Portugal research)
    2381,    # WIDE Project (Japan research)
    1103,    # SURFNET (Netherlands research)
    2018,    # AFRINIC
    7660,    # APNIC research
    2500,    # WIDE (Japan)
    4556,    # HKIX (Hong Kong academic)
    24151,   # CNGI-CERNET2 (China education)
    4538,    # CERNET (China education)
    # AS4837 used to be listed here as "CERNET2 backbone". It is the China
    # Unicom (China169) backbone — a large commercial ISP — so hard-coding it
    # as education mislabelled a huge share of residential traffic. It is now
    # left to PeeringDB / keyword classification.
    224,     # Uninett (Norway research)
    2547,    # Belnet (Belgium research)
    1754,    # DESY (Germany)
    5765,    # SANET (Slovakia academic)
    # Individual institutions
    3,       # MIT
    10578,   # Stanford
    27,      # University of Maryland
    46,      # UC Berkeley
    104,     # Carnegie Mellon
    9264,    # KAIST (South Korea)
}
|
||||||
|
|
||||||
|
# Government and military networks. Like the other hard-coded sets, a hit
# here wins over PeeringDB and keyword heuristics.
KNOWN_GOVERNMENT_ASNS: set[int] = {
    # United States — Department of Defense
    3354,     # US DoD Network Information Center
    27064,    # DoD Education Activity
    721,      # DoD / DISA
    749,      # DoD / DISA
    3541,     # Headquarters USAISC
    7046,     # US DoD
    # United States — civilian agencies
    568,      # US Postal Service
    6983,     # US Internal Revenue Service
    394671,   # US Dept of Veterans Affairs
    10796,    # US Census Bureau
    # NOTE(review): verify the owner of 19551 — it may be Incapsula's
    # general-purpose CDN network rather than a government-only one.
    19551,    # Incapsula Fed (US gov CDN)
    # United Kingdom
    2572,     # UK MoD
    # Switzerland
    25180,    # Bundesverwaltung (Swiss gov)
    559,      # SWITCH (Swiss gov/edu)
    # Germany
    553,      # BelWue (partially government)
}
|
||||||
|
|
||||||
|
# Corporate networks that are neither access ISPs nor rentable infrastructure.
# Checked after the education and government sets in classify_asn, so an ASN
# must not be repeated here if it already appears in one of those.
KNOWN_ENTERPRISE_ASNS: set[int] = {
    8068,     # Microsoft Corp
    36459,    # GitHub
    32934,    # Facebook / Meta
    13414,    # Twitter / X
    54115,    # Dropbox
    8003,     # Uber
    14413,    # Linkedin
    40428,    # Salesforce
    30148,    # Sucuri (enterprise security)
    394699,   # Zoom
    # 19551 was listed here as "Shopify", but the same ASN is in
    # KNOWN_GOVERNMENT_ASNS (as Incapsula), which classify_asn consults first.
    # The entry could never take effect and its owner comment contradicted the
    # other list, so it has been dropped.
    62229,    # Spotify
    394406,   # Slack
}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Keyword-based heuristic classification
|
# PeeringDB info_type → label mapping
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
HUMAN_KEYWORDS = [
|
# Translation table from PeeringDB's ``info_type`` field to our labels.
# "Network Services" is deliberately absent: download_peeringdb refines that
# type with keyword matching on the network name instead.
PEERINGDB_TYPE_MAP: dict[str, str] = {
    # Access and transit providers both count as ISP traffic.
    **dict.fromkeys(("Cable/DSL/ISP", "NSP"), "isp"),
    "Content": "cdn",
    "Enterprise": "enterprise",
    "Educational/Research": "education",
    "Government": "government",
    "Non-Profit": "enterprise",
    # Routing infrastructure carries no end-user traffic of its own.
    **dict.fromkeys(("Route Server", "Route Collector"), "unknown"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def _label_for_peeringdb_net(info_type: str, name_lower: str) -> str:
    """Return the label for one PeeringDB network, or "" if none applies."""
    if info_type in PEERINGDB_TYPE_MAP:
        return PEERINGDB_TYPE_MAP[info_type]
    if info_type == "Network Services":
        # The type is too broad on its own; use the name to tell hosting,
        # datacenter and ISP apart, defaulting to enterprise.
        if _match_keywords(name_lower, HOSTING_KEYWORDS):
            return "hosting"
        if _match_keywords(name_lower, DATACENTER_KEYWORDS):
            return "datacenter"
        if _match_keywords(name_lower, ISP_KEYWORDS):
            return "isp"
        return "enterprise"
    if info_type == "":
        # No type set — fall back to keyword classification on the name.
        return _classify_by_keywords(name_lower)
    # An info_type this script does not know about: leave the ASN unlabelled.
    return ""


def download_peeringdb(output_dir: str) -> dict[int, str]:
    """Download the PeeringDB network list and return an ``{asn: label}`` map.

    The raw response is cached as ``CACHED_PEERINGDB`` inside *output_dir*.
    If the download or JSON parsing fails, the cached copy is used when one
    exists and is readable.

    Args:
        output_dir: Directory that holds (or will hold) the cache file.

    Returns:
        Mapping of ASN to label. Empty when no usable PeeringDB data could
        be obtained; this function never raises for network, cache or
        payload-shape problems.
    """
    cache_path = os.path.join(output_dir, CACHED_PEERINGDB)
    print(f"[peeringdb] Fetching {PEERINGDB_URL} ...")
    data = None
    raw = None
    try:
        req = urllib.request.Request(PEERINGDB_URL, headers={
            "User-Agent": "generate_asn_data/2.0",
            "Accept": "application/json",
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            raw = resp.read()
        print(f"[peeringdb] Downloaded {len(raw):,} bytes")
        data = json.loads(raw)
    except Exception as e:  # best-effort boundary: any failure -> cache fallback
        print(f"[warning] PeeringDB download failed: {e}", file=sys.stderr)
        if not os.path.exists(cache_path):
            print("[warning] No PeeringDB data available, continuing without it.",
                  file=sys.stderr)
            return {}
        print(f"[info] Using cached {cache_path}")
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as cache_err:
            # A truncated or corrupt cache must not abort the whole run.
            print(f"[warning] Cached PeeringDB data unreadable: {cache_err}",
                  file=sys.stderr)
            return {}
    else:
        # Refresh the cache only after the payload parsed. A failed write is
        # reported but must not discard the data we just downloaded.
        try:
            with open(cache_path, "wb") as f:
                f.write(raw)
        except OSError as cache_err:
            print(f"[warning] Could not write {cache_path}: {cache_err}",
                  file=sys.stderr)

    if not isinstance(data, dict) or "data" not in data:
        print("[warning] PeeringDB response has no 'data' key", file=sys.stderr)
        return {}

    nets = data["data"]
    if not isinstance(nets, list):
        print("[warning] PeeringDB response has no 'data' key", file=sys.stderr)
        return {}
    print(f"[peeringdb] Processing {len(nets):,} networks ...")

    pdb_labels: dict[int, str] = {}
    for net in nets:
        if not isinstance(net, dict):
            continue
        asn = net.get("asn")
        # Reject missing, non-integer (bool included) and non-positive ASNs.
        if not isinstance(asn, int) or isinstance(asn, bool) or asn <= 0:
            continue
        info_type = (net.get("info_type") or "").strip()
        name = (net.get("name") or "").lower()

        label = _label_for_peeringdb_net(info_type, name)
        if label:
            pdb_labels[asn] = label

    stats: dict[str, int] = {}
    for v in pdb_labels.values():
        stats[v] = stats.get(v, 0) + 1
    print(f"[peeringdb] Mapped {len(pdb_labels):,} ASNs: {stats}")
    return pdb_labels
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Keyword-based heuristic classification (expanded)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
ISP_KEYWORDS = [
|
||||||
|
# Generic ISP terms
|
||||||
"telecom", "telcom", "telekom", "telefonica", "telecomunicacoes",
|
"telecom", "telcom", "telekom", "telefonica", "telecomunicacoes",
|
||||||
|
"telecomunicaciones", "telekommunikasjon", "telekommunikation",
|
||||||
"mobile", "broadband", "fiber", "fibre", "cable", "wireless",
|
"mobile", "broadband", "fiber", "fibre", "cable", "wireless",
|
||||||
"residential", "communications", "comm ", " isp", "netcom",
|
"residential", "communications", "comm ", " isp", "netcom",
|
||||||
"internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth",
|
"internet service", "subscriber", "dsl", "adsl", "vdsl", "ftth",
|
||||||
"fttb", "dial-up", "dialup", "cellular", "lte", "5g network",
|
"fttb", "fttp", "dial-up", "dialup", "cellular", "lte", "5g network",
|
||||||
"wimax", "satellite", " tel ", "telco", "ptcl", "bsnl",
|
"wimax", "satellite", " tel ", "telco", "ptcl", "bsnl",
|
||||||
|
"telecable", "telemedia", "telnet", "telprom",
|
||||||
|
"internet provider", "internet access",
|
||||||
|
# Specific operators — Global
|
||||||
"airtel", "jio", "reliance", "vodafone", "orange", "bouygues",
|
"airtel", "jio", "reliance", "vodafone", "orange", "bouygues",
|
||||||
"proximus", "swisscom", "telenor", "telia", "elisa", "dna oy",
|
"proximus", "swisscom", "telenor", "telia", "elisa", "dna oy",
|
||||||
"rogers", "bell canada", "shaw", "telus", "optus", "tpg ",
|
"rogers", "bell canada", "shaw", "telus", "optus", "tpg ",
|
||||||
@ -238,6 +426,38 @@ HUMAN_KEYWORDS = [
|
|||||||
"etisalat", "du telecom", "stc ", "zain",
|
"etisalat", "du telecom", "stc ", "zain",
|
||||||
"mtn ", "safaricom", "airtel africa", "rain ",
|
"mtn ", "safaricom", "airtel africa", "rain ",
|
||||||
"telmex", "claro", "vivo ", "oi s.a",
|
"telmex", "claro", "vivo ", "oi s.a",
|
||||||
|
# Additional global operators
|
||||||
|
"telstra", "spark nz", "chorus nz", "vodacom", "digicel",
|
||||||
|
"flow ", "liberty cablevision", "totalplay", "izzi ",
|
||||||
|
"megacable", "axtel", "tigo ", "entel", "bitel", "movilnet",
|
||||||
|
"cantv", "antel", "copel telecom", "algar telecom", "rede sul",
|
||||||
|
"surf telecom", "net servicos", "porto seguro",
|
||||||
|
"dialog axiata", "sri lanka telecom", "banglalink", "grameenphone",
|
||||||
|
"robi axiata", "ncell", "nepal telecom",
|
||||||
|
"omantel", "ooredoo", "mobily", "batelco", "viva bahrain",
|
||||||
|
"nawras", "sure telecom", "airlink", "cool ideas", "vumatel",
|
||||||
|
"telkom sa", "web africa", "afrihost", "zte corporation",
|
||||||
|
"smile telecom", "busyinternet", "camtel", "orange cameroun",
|
||||||
|
"maroc telecom", "inwi", "djezzy", "mobilis", "tunisie telecom",
|
||||||
|
"ethio telecom", "liquid telecom", "neotel", "cell c",
|
||||||
|
"a1 telekom", "magenta telekom", "drei austria", "cablecom",
|
||||||
|
"salt mobile", "sunrise", "post luxembourg", "tango ",
|
||||||
|
"nova croatia", "ht ", "vivacom", "bulsatcom", "cosmote",
|
||||||
|
"nova greece", "forthnet", "wind hellas", "cyta",
|
||||||
|
"telekom srbija", "vip mobile", "telenor serbia",
|
||||||
|
"telekom slovenije", "a1 slovenia",
|
||||||
|
"o2 czech", "t-mobile czech", "vodafone czech",
|
||||||
|
"orange polska", "polkomtel", "play mobile",
|
||||||
|
"netia", "inea ", "vectra",
|
||||||
|
"bite ", "tele2", "tet ", "eltel",
|
||||||
|
"starman", "telia eesti", "elisa eesti",
|
||||||
|
"lattelecom", "baltcom",
|
||||||
|
"kyivstar", "lifecell", "ukrtelecom",
|
||||||
|
"moldtelecom", "orange moldova",
|
||||||
|
"kazakhtelecom", "beeline kz", "tele2 kz",
|
||||||
|
"uztelecom", "ucell",
|
||||||
|
"bakcell", "azercell", "azerconnect",
|
||||||
|
"veon", "silknet", "magti",
|
||||||
]
|
]
|
||||||
|
|
||||||
DATACENTER_KEYWORDS = [
|
DATACENTER_KEYWORDS = [
|
||||||
@ -247,69 +467,149 @@ DATACENTER_KEYWORDS = [
|
|||||||
"scaleway", "contabo", "kamatera", "upcloud",
|
"scaleway", "contabo", "kamatera", "upcloud",
|
||||||
"oracle cloud", "alibaba cloud", "tencent cloud",
|
"oracle cloud", "alibaba cloud", "tencent cloud",
|
||||||
"ibm cloud", "softlayer", "rackspace", "equinix",
|
"ibm cloud", "softlayer", "rackspace", "equinix",
|
||||||
"leaseweb", "choopa", "data center", "datacenter",
|
"leaseweb", "choopa", "data center", "datacenter", "data centre",
|
||||||
"colocation", "colo ", "baremetal", "bare metal",
|
"colocation", "colo ", "baremetal", "bare metal",
|
||||||
"infrastructure", "iaas", "paas",
|
"infrastructure", "iaas", "paas",
|
||||||
|
# Additional cloud/datacenter providers
|
||||||
|
"serverel", "packet", "metal ", "nexon", "netmagic",
|
||||||
|
"phoenixnap", "tierpoint", "cyrusone", "coresite",
|
||||||
|
"datapipe", "internap", "peak 10", "zayo ", "coreweave",
|
||||||
|
"lambda cloud", "paperspace", "vast.ai", "fluidstack",
|
||||||
|
"cloudfloor", "netriplex", "hyperoptic", "serverspace",
|
||||||
|
"selectel", "aruba cloud", "worldstream", "datacamp",
|
||||||
|
"m247", "datapacket", "clouvider", "hostkey",
|
||||||
|
"tzulo", "ramnode", "hostens", "serverius", "servers.com",
|
||||||
|
"nocix", "fdcservers", "reliablesite",
|
||||||
|
"quadranet", "cogent ", "colocrossing",
|
||||||
]
|
]
|
||||||
|
|
||||||
HOSTING_KEYWORDS = [
|
# Lower-case substrings that mark an AS name as a web/VPS hosting provider.
# Matching is plain substring containment (see _match_keywords), so trailing
# spaces such as in "host " are significant.
# NOTE(review): generic words like "railway" or "wix" can also occur in the
# names of unrelated organisations — confirm against real data.
HOSTING_KEYWORDS = [
    kw
    for group in (
        # Generic hosting vocabulary
        ("hosting", "host ", "hoster", "webhost", "web host"),
        ("server farm", "vps", "virtual private"),
        ("dedicated server", "shared hosting", "managed hosting"),
        ("reseller", "cpanel", "plesk", "wordpress host"),
        # Mass-market hosters and site builders
        ("godaddy", "namecheap", "hostinger", "bluehost"),
        ("siteground", "a2 hosting", "dreamhost", "hostgator"),
        ("ionos", "squarespace", "wix", "wpengine", "wp engine"),
        # Registrars
        ("register.com", "domain registr", "registrar"),
        # European hosters
        ("strato ag", "mittwald", "hosteurope", "all-inkl"),
        ("infomaniak", "gandi", "one.com", "loopia"),
        # Managed / platform hosting
        ("fastcomet", "cloudways", "kinsta", "flywheel"),
        ("netlify", "vercel", "render.com", "railway"),
        ("heroku", "webhosting"),
    )
    for kw in group
]
|
||||||
|
|
||||||
CDN_KEYWORDS = [
|
# Lower-case substrings that mark an AS name as a content-delivery network.
# _classify_by_keywords consults this list before any other.
CDN_KEYWORDS = [
    kw
    for group in (
        ("cloudflare", "akamai", "fastly", "cdn "),
        ("content delivery", "edgecast", "limelight"),
        ("stackpath", "sucuri", "keycdn", "bunnycdn", "bunny cdn"),
        ("jsdelivr", "cachefly", "imperva", "incapsula"),
        ("g-core", "gcorelabs", "cdn77", "cdn network"),
        ("medianova", "chinacache", "wangsu", "quantil"),
        ("azion", "section.io", "beluga cdn"),
    )
    for kw in group
]
|
||||||
|
|
||||||
|
# Lower-case substrings that mark an AS name as an education/research network.
# Matching is plain substring containment, so many of the "univers*" spellings
# are already covered by "univers"; they are kept for readability.
# NOTE(review): "ren " matches any word ending in "ren" (e.g. "warren "), and
# "กรม" / "技術" look like generic words ("department" / "technology") rather
# than education-specific ones — confirm these do not cause false positives.
EDUCATION_KEYWORDS = [
    kw
    for group in (
        # "University" in several languages and abbreviations
        ("university", "universit", "universite", "universidad", "universidade"),
        ("universita", "universitaet", "univ ", "univ.", "univers"),
        # Generic academic vocabulary
        ("college", "academic", "academia", "research"),
        ("education", "educational", "school", "institute"),
        ("polytechnic", "hochschule", "fachhochschule"),
        # Research-and-education network naming
        ("nren", "ren ", ".ren", "research network"),
        ("wissenschaft", "forschung"),
        ("campus", "student"),
        ("ecole ", "école", "escuela", "escola"),
        ("fakultat", "fakultet"),
        # CJK
        ("技術", "大学", "学院", "研究", "科学"),
        # Korean
        ("교육", "대학"),
        # Thai
        ("กรม", "มหาวิทยาลัย"),
    )
    for kw in group
]
|
||||||
|
|
||||||
|
# Lower-case substrings that mark an AS name as a government/military network.
# NOTE(review): broad words such as "federal", "estado" or "customs" may also
# appear in commercial names — confirm against real data.
GOVERNMENT_KEYWORDS = [
    kw
    for group in (
        # "Government" in several languages, plus domain-style fragments
        ("government", "gouvernement", "gobierno", "governo", "regierung"),
        ("gouv", "gov.", ".gov", "gob."),
        # Ministries
        ("ministry", "ministere", "ministerio", "ministerium", "ministero"),
        # Military and police
        ("military", "army", "navy", "defense", "defence", "defensa"),
        ("police", "polizei", "polizia"),
        # Levels of administration
        ("federal", "fedral", "estado", "etat"),
        ("municipal", "prefecture", "region "),
        ("national agency", "state of "),
        ("bundeswehr", "armed forces"),
        # Agencies and the justice system
        ("customs", "immigration"),
        ("justic", "judiciary"),
    )
    for kw in group
]
|
||||||
|
|
||||||
|
# Enterprise keywords are lowest priority — many ISPs have these suffixes
|
||||||
|
# Lower-case substrings that suggest a corporate (non-ISP, non-hosting) AS.
# Leading/trailing spaces and punctuation are part of the match, e.g. " inc."
# only matches when preceded by a space.
ENTERPRISE_KEYWORDS = [
    kw
    for group in (
        # Legal-form suffixes
        ("corporation", "corp.", "corporate"),
        (" inc.", " inc,"),
        (" ltd.", " ltd,"),
        ("gmbh", "s.a.", "s.r.l"),
        (" group", "holdings"),
        # Industry sectors
        ("financial", "banking", "insurance"),
        ("pharmaceutical", "logistics"),
        ("manufacturing", "industrial"),
        ("automotive", "energy"),
        ("retail ", "media group"),
    )
    for kw in group
]
|
||||||
|
|
||||||
|
|
||||||
def classify_asn(asn_number: int, as_name: str) -> str:
|
def _match_keywords(name_lower: str, keywords: list[str]) -> bool:
|
||||||
"""Classify an ASN into a reputation category."""
|
"""Check if any keyword matches in the lowered name."""
|
||||||
# Hard-coded lookups first (highest priority)
|
return any(kw in name_lower for kw in keywords)
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_by_keywords(name_lower: str) -> str:
    """Classify a lower-cased AS name using the keyword lists.

    Returns "unknown" for placeholder names ("not routed", "none", "", "-"),
    the label of the first keyword list that matches, or an empty string
    when nothing matches.
    """
    if name_lower in ("not routed", "none", "", "-"):
        return "unknown"

    # Order is the priority: most specific categories first, enterprise last.
    priority = (
        ("cdn", CDN_KEYWORDS),
        ("education", EDUCATION_KEYWORDS),
        ("government", GOVERNMENT_KEYWORDS),
        ("hosting", HOSTING_KEYWORDS),
        ("datacenter", DATACENTER_KEYWORDS),
        ("isp", ISP_KEYWORDS),
        ("enterprise", ENTERPRISE_KEYWORDS),
    )
    for label, keywords in priority:
        if _match_keywords(name_lower, keywords):
            return label
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def classify_asn(asn_number: int, as_name: str,
                 peeringdb: dict[int, str]) -> str:
    """Classify an ASN into a reputation category.

    Priority: hard-coded lists > PeeringDB > keyword heuristics.

    Args:
        asn_number: The autonomous system number.
        as_name: The AS name from the IP-to-ASN dataset (any case).
        peeringdb: ``{asn: label}`` map from PeeringDB; may be empty.

    Returns:
        One of "isp", "datacenter", "hosting", "cdn", "enterprise",
        "education", "government" or "unknown".
    """
    # 1. Hard-coded lookups (highest priority). Tuple order is the priority
    #    used when an ASN appears in more than one set.
    hard_coded = (
        ("cdn", KNOWN_CDN_ASNS),
        ("hosting", KNOWN_HOSTING_ASNS),
        ("datacenter", KNOWN_DATACENTER_ASNS),
        ("education", KNOWN_EDUCATION_ASNS),
        ("government", KNOWN_GOVERNMENT_ASNS),
        ("enterprise", KNOWN_ENTERPRISE_ASNS),
        ("isp", KNOWN_ISP_ASNS),
    )
    for label, known_asns in hard_coded:
        if asn_number in known_asns:
            return label

    # 2. PeeringDB classification. An "unknown" label there carries no
    #    information, so it must not stop the AS name from being tried
    #    against the keyword heuristics below.
    pdb_label = peeringdb.get(asn_number)
    if pdb_label and pdb_label != "unknown":
        return pdb_label

    # 3. Keyword heuristics on the AS name
    return _classify_by_keywords(as_name.lower()) or "unknown"
|
|
||||||
|
|
||||||
|
|
||||||
def ranges_to_cidrs(start_ip: str, end_ip: str):
|
def ranges_to_cidrs(start_ip: str, end_ip: str):
|
||||||
@ -330,7 +630,7 @@ def download_iptoasn(output_dir: str) -> str:
|
|||||||
print(f"[download] Fetching {IPTOASN_URL} ...")
|
print(f"[download] Fetching {IPTOASN_URL} ...")
|
||||||
try:
|
try:
|
||||||
req = urllib.request.Request(IPTOASN_URL, headers={
|
req = urllib.request.Request(IPTOASN_URL, headers={
|
||||||
"User-Agent": "generate_asn_data/1.0"
|
"User-Agent": "generate_asn_data/2.0"
|
||||||
})
|
})
|
||||||
with urllib.request.urlopen(req, timeout=120) as resp:
|
with urllib.request.urlopen(req, timeout=120) as resp:
|
||||||
compressed = resp.read()
|
compressed = resp.read()
|
||||||
@ -358,7 +658,7 @@ def parse_tsv(tsv_path: str):
|
|||||||
"""
|
"""
|
||||||
entries = []
|
entries = []
|
||||||
with open(tsv_path, "r", encoding="utf-8") as f:
|
with open(tsv_path, "r", encoding="utf-8") as f:
|
||||||
for line_no, line in enumerate(f, 1):
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line or line.startswith("#"):
|
if not line or line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
@ -380,60 +680,92 @@ def parse_tsv(tsv_path: str):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com data"
|
description="Generate ASN reputation and IP-to-ASN CSVs from iptoasn.com + PeeringDB"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output-dir", default=".",
|
"--output-dir", default=None,
|
||||||
help="Directory for output CSV files (default: current dir)"
|
help="Directory for output CSV files (writes asn_reputation.csv and iplocate-ip-to-asn.csv)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-asn", default=None,
|
||||||
|
help="Explicit path for asn_reputation.csv output"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-ipasn", default=None,
|
||||||
|
help="Explicit path for iplocate-ip-to-asn.csv output"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--no-download", action="store_true",
|
"--no-download", action="store_true",
|
||||||
help="Skip download, use cached ip2asn-v4.tsv in output-dir"
|
help="Skip iptoasn.com download, use cached ip2asn-v4.tsv"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-peeringdb", action="store_true",
|
||||||
|
help="Skip PeeringDB download, classify using keyword heuristics only"
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
os.makedirs(args.output_dir, exist_ok=True)
|
# Resolve output paths — support both --output-dir and --output-asn/--output-ipasn
|
||||||
|
if args.output_asn and args.output_ipasn:
|
||||||
|
rep_path = args.output_asn
|
||||||
|
ip_path = args.output_ipasn
|
||||||
|
work_dir = os.path.dirname(rep_path) or "."
|
||||||
|
elif args.output_dir:
|
||||||
|
work_dir = args.output_dir
|
||||||
|
rep_path = os.path.join(work_dir, "asn_reputation.csv")
|
||||||
|
ip_path = os.path.join(work_dir, "iplocate-ip-to-asn.csv")
|
||||||
|
else:
|
||||||
|
work_dir = "."
|
||||||
|
rep_path = os.path.join(work_dir, "asn_reputation.csv")
|
||||||
|
ip_path = os.path.join(work_dir, "iplocate-ip-to-asn.csv")
|
||||||
|
|
||||||
# Step 1: Get TSV data
|
os.makedirs(work_dir, exist_ok=True)
|
||||||
tsv_path = os.path.join(args.output_dir, CACHED_TSV)
|
|
||||||
|
# Step 1: Get IP-to-ASN TSV data
|
||||||
|
tsv_path = os.path.join(work_dir, CACHED_TSV)
|
||||||
if not args.no_download:
|
if not args.no_download:
|
||||||
tsv_path = download_iptoasn(args.output_dir)
|
tsv_path = download_iptoasn(work_dir)
|
||||||
elif not os.path.exists(tsv_path):
|
elif not os.path.exists(tsv_path):
|
||||||
print(f"[fatal] --no-download specified but {tsv_path} not found",
|
print(f"[fatal] --no-download specified but {tsv_path} not found",
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Step 2: Parse
|
# Step 2: Get PeeringDB data
|
||||||
|
peeringdb: dict[int, str] = {}
|
||||||
|
if not args.no_peeringdb:
|
||||||
|
peeringdb = download_peeringdb(work_dir)
|
||||||
|
|
||||||
|
# Step 3: Parse TSV
|
||||||
print("[parse] Reading TSV ...")
|
print("[parse] Reading TSV ...")
|
||||||
entries = parse_tsv(tsv_path)
|
entries = parse_tsv(tsv_path)
|
||||||
print(f"[parse] {len(entries):,} IP range entries")
|
print(f"[parse] {len(entries):,} IP range entries")
|
||||||
|
|
||||||
# Step 3: Build ASN info map {asn: (country, as_name)}
|
# Step 4: Build ASN info map {asn: (country, as_name)}
|
||||||
# Use first occurrence for name/country (they're consistent in the dataset)
|
|
||||||
asn_info: dict[int, tuple[str, str]] = {}
|
asn_info: dict[int, tuple[str, str]] = {}
|
||||||
for _, _, asn, country, as_name in entries:
|
for _, _, asn, country, as_name in entries:
|
||||||
if asn not in asn_info:
|
if asn not in asn_info:
|
||||||
asn_info[asn] = (country, as_name)
|
asn_info[asn] = (country, as_name)
|
||||||
|
|
||||||
# Step 4: Classify all ASNs
|
# Step 5: Classify all ASNs
|
||||||
asn_labels: dict[int, str] = {}
|
asn_labels: dict[int, str] = {}
|
||||||
for asn, (country, as_name) in asn_info.items():
|
for asn, (country, as_name) in asn_info.items():
|
||||||
asn_labels[asn] = classify_asn(asn, as_name)
|
asn_labels[asn] = classify_asn(asn, as_name, peeringdb)
|
||||||
|
|
||||||
# Step 5: Write asn_reputation.csv (sorted by ASN, no header)
|
# Include ASNs that are in PeeringDB but not in iptoasn (they have no
|
||||||
rep_path = os.path.join(args.output_dir, "asn_reputation.csv")
|
# IP ranges but still get a label in asn_reputation.csv)
|
||||||
|
for asn, label in peeringdb.items():
|
||||||
|
if asn not in asn_labels:
|
||||||
|
asn_labels[asn] = label
|
||||||
|
|
||||||
|
# Step 6: Write asn_reputation.csv (sorted by ASN, no header)
|
||||||
sorted_asns = sorted(asn_labels.keys())
|
sorted_asns = sorted(asn_labels.keys())
|
||||||
|
os.makedirs(os.path.dirname(rep_path) or ".", exist_ok=True)
|
||||||
with open(rep_path, "w") as f:
|
with open(rep_path, "w") as f:
|
||||||
for asn in sorted_asns:
|
for asn in sorted_asns:
|
||||||
f.write(f"{asn},{asn_labels[asn]}\n")
|
f.write(f"{asn},{asn_labels[asn]}\n")
|
||||||
print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}")
|
print(f"[output] Wrote {len(sorted_asns):,} ASNs to {rep_path}")
|
||||||
|
|
||||||
# Step 6: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv
|
# Step 7: Convert ranges to CIDRs and write iplocate-ip-to-asn.csv
|
||||||
ip_path = os.path.join(args.output_dir, "iplocate-ip-to-asn.csv")
|
|
||||||
cidr_count = 0
|
|
||||||
seen_cidrs: set[str] = set()
|
seen_cidrs: set[str] = set()
|
||||||
|
|
||||||
# Collect all rows, then sort
|
|
||||||
cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = []
|
cidr_rows: list[tuple[ipaddress.IPv4Network, int, str, str]] = []
|
||||||
print("[cidr] Converting IP ranges to CIDR notation ...")
|
print("[cidr] Converting IP ranges to CIDR notation ...")
|
||||||
for start_ip, end_ip, asn, country, as_name in entries:
|
for start_ip, end_ip, asn, country, as_name in entries:
|
||||||
@ -444,30 +776,39 @@ def main():
|
|||||||
seen_cidrs.add(key)
|
seen_cidrs.add(key)
|
||||||
cidr_rows.append((cidr, asn, country, as_name))
|
cidr_rows.append((cidr, asn, country, as_name))
|
||||||
|
|
||||||
# Sort by network address
|
|
||||||
cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen))
|
cidr_rows.sort(key=lambda r: (r[0].network_address, r[0].prefixlen))
|
||||||
|
|
||||||
|
os.makedirs(os.path.dirname(ip_path) or ".", exist_ok=True)
|
||||||
with open(ip_path, "w") as f:
|
with open(ip_path, "w") as f:
|
||||||
for cidr, asn, country, as_name in cidr_rows:
|
for cidr, asn, country, as_name in cidr_rows:
|
||||||
# Escape commas in AS names
|
|
||||||
safe_name = as_name.replace(",", " ")
|
safe_name = as_name.replace(",", " ")
|
||||||
f.write(f"{cidr},{asn},{country},{safe_name}\n")
|
f.write(f"{cidr},{asn},{country},{safe_name}\n")
|
||||||
cidr_count = len(cidr_rows)
|
cidr_count = len(cidr_rows)
|
||||||
print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}")
|
print(f"[output] Wrote {cidr_count:,} CIDRs to {ip_path}")
|
||||||
|
|
||||||
# Step 7: Stats
|
# Step 8: Stats
|
||||||
stats: dict[str, int] = {}
|
stats: dict[str, int] = {}
|
||||||
for label in asn_labels.values():
|
for label in asn_labels.values():
|
||||||
stats[label] = stats.get(label, 0) + 1
|
stats[label] = stats.get(label, 0) + 1
|
||||||
|
|
||||||
|
all_labels = ["isp", "datacenter", "hosting", "cdn", "enterprise",
|
||||||
|
"education", "government", "unknown"]
|
||||||
print("\n=== Summary ===")
|
print("\n=== Summary ===")
|
||||||
print(f"Total ASNs: {len(sorted_asns):,}")
|
print(f"Total ASNs: {len(sorted_asns):,}")
|
||||||
for label in ["human", "datacenter", "hosting", "cdn", "unknown"]:
|
for label in all_labels:
|
||||||
count = stats.get(label, 0)
|
count = stats.get(label, 0)
|
||||||
pct = 100.0 * count / len(sorted_asns) if sorted_asns else 0
|
pct = 100.0 * count / len(sorted_asns) if sorted_asns else 0
|
||||||
print(f" {label:12s}: {count:>6,} ({pct:5.1f}%)")
|
print(f" {label:12s}: {count:>6,} ({pct:5.1f}%)")
|
||||||
print(f"Total CIDRs: {cidr_count:,}")
|
print(f"Total CIDRs: {cidr_count:,}")
|
||||||
|
|
||||||
|
# Warn if unknown is still too high
|
||||||
|
unk_pct = 100.0 * stats.get("unknown", 0) / len(sorted_asns) if sorted_asns else 0
|
||||||
|
if unk_pct > 50:
|
||||||
|
print(f"\n[warning] {unk_pct:.1f}% of ASNs are still 'unknown'.", file=sys.stderr)
|
||||||
|
if not peeringdb:
|
||||||
|
print("[hint] Try running without --no-peeringdb to use PeeringDB data.",
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
# Outputs:
|
# Outputs:
|
||||||
# data/bot_ip.csv — Known bot/scanner IPs + Tor exit nodes
|
# data/bot_ip.csv — Known bot/scanner IPs + Tor exit nodes
|
||||||
# data/bot_ja4.csv — Known bot JA4 TLS fingerprints
|
# data/bot_ja4.csv — Known bot JA4 TLS fingerprints
|
||||||
# data/asn_reputation.csv — ASN→label mapping (human/datacenter/hosting)
|
# data/asn_reputation.csv — ASN→label mapping (isp/datacenter/hosting/cdn/education/government/enterprise)
|
||||||
# data/iplocate-ip-to-asn.csv — CIDR→ASN for dictionary lookup
|
# data/iplocate-ip-to-asn.csv — CIDR→ASN for dictionary lookup
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
|
|||||||
@ -977,7 +977,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
|||||||
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
|
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
|
||||||
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
|
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
|
||||||
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
|
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
|
||||||
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'human']
|
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
|
||||||
|
|
||||||
# A7 — Valider les features avant tout traitement
|
# A7 — Valider les features avant tout traitement
|
||||||
valid_features = validate_features(df, features, name, cycle_id)
|
valid_features = validate_features(df, features, name, cycle_id)
|
||||||
@ -1414,7 +1414,7 @@ def fetch_and_analyze():
|
|||||||
|
|
||||||
log_decision('CYCLE_START', cycle_id, '', {
|
log_decision('CYCLE_START', cycle_id, '', {
|
||||||
'total_rows': len(df),
|
'total_rows': len(df),
|
||||||
'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()),
|
'human_rows': int((df.get('asn_label', pd.Series()) == 'isp').sum()),
|
||||||
'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
|
'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
|
||||||
'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
|
'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
|
||||||
'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
|
'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
|
||||||
@ -1436,12 +1436,12 @@ def fetch_and_analyze():
|
|||||||
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
|
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
|
||||||
if fp_ips:
|
if fp_ips:
|
||||||
# Les faux positifs confirmés rejoignent le pool humain
|
# Les faux positifs confirmés rejoignent le pool humain
|
||||||
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human')
|
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
|
||||||
df.loc[mask_fp, 'asn_label'] = 'human'
|
df.loc[mask_fp, 'asn_label'] = 'isp'
|
||||||
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).")
|
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
|
||||||
if tp_ips:
|
if tp_ips:
|
||||||
# Les vrais positifs confirmés sont exclus de la baseline humaine
|
# Les vrais positifs confirmés sont exclus de la baseline humaine
|
||||||
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human')
|
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
|
||||||
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
|
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
|
||||||
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
|
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
|
||||||
log_decision('SOC_FEEDBACK', cycle_id, '', {
|
log_decision('SOC_FEEDBACK', cycle_id, '', {
|
||||||
|
|||||||
@ -469,11 +469,11 @@ async def features() -> dict[str, Any]:
|
|||||||
except Exception:
|
except Exception:
|
||||||
logger.debug("view_thesis_features_1h not available")
|
logger.debug("view_thesis_features_1h not available")
|
||||||
|
|
||||||
# Human vs bot feature profiles for radar comparison
|
# ISP (residential) vs bot feature profiles for radar comparison
|
||||||
try:
|
try:
|
||||||
human = query(
|
human = query(
|
||||||
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
|
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
|
||||||
"WHERE asn_label = 'human'"
|
"WHERE asn_label = 'isp'"
|
||||||
)
|
)
|
||||||
if human:
|
if human:
|
||||||
result["human_profile"] = human[0]
|
result["human_profile"] = human[0]
|
||||||
@ -663,7 +663,7 @@ async def heatmap() -> dict[str, Any]:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# GET /api/ip/{ip}/radar — Radar comparison vs human baseline
|
# GET /api/ip/{ip}/radar — Radar comparison vs ISP baseline
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
_RADAR_FEATURES = [
|
_RADAR_FEATURES = [
|
||||||
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
|
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
|
||||||
@ -686,7 +686,7 @@ async def ip_radar(ip: str) -> dict[str, Any]:
|
|||||||
baseline = query(
|
baseline = query(
|
||||||
f"SELECT {cols_avg} "
|
f"SELECT {cols_avg} "
|
||||||
f"FROM {_DB}.view_ai_features_1h "
|
f"FROM {_DB}.view_ai_features_1h "
|
||||||
"WHERE asn_label = 'human'"
|
"WHERE asn_label = 'isp'"
|
||||||
)
|
)
|
||||||
bot_avg = query(
|
bot_avg = query(
|
||||||
f"SELECT {cols_avg} "
|
f"SELECT {cols_avg} "
|
||||||
|
|||||||
@ -268,7 +268,7 @@ fi
|
|||||||
|
|
||||||
# 5f. Verify seeder data reached agg table and AI features view
|
# 5f. Verify seeder data reached agg table and AI features view
|
||||||
AGG_COUNT=$(ch_query "SELECT count() FROM ja4_processing.agg_host_ip_ja4_1h")
|
AGG_COUNT=$(ch_query "SELECT count() FROM ja4_processing.agg_host_ip_ja4_1h")
|
||||||
HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='human'")
|
HUMAN_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE asn_label='isp'")
|
||||||
BOT_LABEL_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE bot_name != ''")
|
BOT_LABEL_COUNT=$(ch_query "SELECT count() FROM ja4_processing.view_ai_features_1h WHERE bot_name != ''")
|
||||||
UNIQ_SRC_IPS=$(ch_query "SELECT count(DISTINCT src_ip) FROM ja4_processing.view_ai_features_1h")
|
UNIQ_SRC_IPS=$(ch_query "SELECT count(DISTINCT src_ip) FROM ja4_processing.view_ai_features_1h")
|
||||||
UNIQ_JA4=$(ch_query "SELECT count(DISTINCT ja4) FROM ja4_processing.view_ai_features_1h")
|
UNIQ_JA4=$(ch_query "SELECT count(DISTINCT ja4) FROM ja4_processing.view_ai_features_1h")
|
||||||
@ -280,11 +280,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$HUMAN_COUNT" -ge 500 ] 2>/dev/null; then
|
if [ "$HUMAN_COUNT" -ge 500 ] 2>/dev/null; then
|
||||||
pass "Bot-detector baseline: $HUMAN_COUNT human sessions (≥500 threshold met)"
|
pass "Bot-detector baseline: $HUMAN_COUNT ISP sessions (≥500 threshold met)"
|
||||||
elif [ "$HUMAN_COUNT" -gt 0 ] 2>/dev/null; then
|
elif [ "$HUMAN_COUNT" -gt 0 ] 2>/dev/null; then
|
||||||
warn "Human sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)"
|
warn "ISP sessions below threshold: $HUMAN_COUNT < 500 (bot_detector will skip cycle)"
|
||||||
else
|
else
|
||||||
fail "No human sessions in view_ai_features_1h (asn_reputation CSV not loaded?)"
|
fail "No ISP sessions in view_ai_features_1h (asn_reputation CSV not loaded?)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$BOT_LABEL_COUNT" -gt 0 ] 2>/dev/null; then
|
if [ "$BOT_LABEL_COUNT" -gt 0 ] 2>/dev/null; then
|
||||||
|
|||||||
Reference in New Issue
Block a user