feat: browser JA4 detection, Anubis bot rules, worldwide ASN data

- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com
  covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...)
- Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuckGo,
  OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries)
- Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs)
- Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views
- Add /api/browsers dashboard endpoint
- Fix CSV quoting for fields containing commas (User-Agent strings)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 15:27:37 +02:00
parent b6184e6529
commit 7d09c614c3
15 changed files with 885900 additions and 3151 deletions

View File

@ -1,25 +1,75 @@
#!/usr/bin/env python3
"""
generate_bot_ip.py — Generate bot_ip.csv from known scanner networks + Tor exit nodes.
generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data, scanner
networks, and Tor exit nodes.
Sources:
Tor exit nodes: downloaded list or hardcoded fallback
• Shodan: known scanner ranges (census.shodan.io, 2024)
• Censys: known scanner ranges (censys.io, 2024)
• Binaryedge, SecurityTrails, ZoomEye, Stretchoid: known ranges
GreyNoise: top mass-scanner IPs (manually curated)
Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
- Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
- AI crawlers: GPTBot, PerplexityBot
- Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
SecurityTrails, ZoomEye, GreyNoise, Shadowserver
• Tor exit nodes: https://check.torproject.org/torbulkexitlist
Anubis UA-only bot references (no IP ranges, documented here for reference):
• ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
Webzio-Extended, cohere-ai
• ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
omgili, PerplexityBot, Timpibot, YouBot
Output format (no header):
<ip_or_cidr>,<bot_name>
ip_or_cidr,bot_name
Usage:
python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
"""
import argparse
import ipaddress
import os
import sys
import urllib.request
import urllib.error
# --- Known scanner networks (public, well-documented) ---
KNOWN_SCANNERS = {
try:
import yaml
HAS_YAML = True
except ImportError:
HAS_YAML = False
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Commit of TecharoHQ/anubis whose data files are fetched.
ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"

# URL prefix of the Anubis ``data/`` directory at that commit.
ANUBIS_BASE = (
    "https://raw.githubusercontent.com/TecharoHQ/anubis/"
    + ANUBIS_COMMIT
    + "/data/"
)

# One tuple per crawler file:
#   (yaml path relative to ``data/``, bot name written to the CSV, category)
ANUBIS_CRAWLERS = [
    # Search engines
    ("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
    ("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
    ("crawlers/applebot.yaml", "Applebot", "search_engine"),
    ("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
    # AI crawlers
    ("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
    ("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
    # Cloud-hosted scrapers
    ("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
]

# Plain-text list of current Tor exit node addresses, one per line.
TOR_URL = "https://check.torproject.org/torbulkexitlist"
# Curated scanner networks (category: scanner)
SCANNER_RANGES = {
# Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
"Shodan_Scanner": [
"Shodan": [
"66.240.192.0/24", "66.240.205.0/24", "66.240.236.0/24",
"71.6.135.0/24", "71.6.146.0/24", "71.6.158.0/24", "71.6.165.0/24",
"80.82.77.0/24", "80.82.78.0/24",
@ -32,45 +82,45 @@ KNOWN_SCANNERS = {
"209.126.110.0/24",
],
# Censys — https://support.censys.io/hc/en-us/articles/360043177092
"Censys_Scanner": [
"Censys": [
"162.142.125.0/24", "167.248.133.0/24", "167.94.138.0/24",
"167.94.145.0/24", "167.94.146.0/24",
"192.35.168.0/23",
],
# BinaryEdge — https://docs.binaryedge.io/
"BinaryEdge_Scanner": [
"BinaryEdge": [
"154.89.5.0/24",
"45.143.200.0/22",
],
# Stretchoid — persistent scanner botnet
"Stretchoid_Scanner": [
"Stretchoid": [
"198.235.24.0/24",
"205.210.31.0/24",
],
# SecurityTrails (Recorded Future) crawlers
"SecurityTrails_Crawler": [
"SecurityTrails": [
"52.250.0.0/16",
],
# ZoomEye (Knownsec)
"ZoomEye_Scanner": [
"ZoomEye": [
"106.75.0.0/16",
],
# GreyNoise known mass-scanners (individual IPs)
"GreyNoise_MassScanner": [
"GreyNoise": [
"45.155.205.233/32", "45.155.205.220/32", "45.155.205.205/32",
"45.155.205.190/32", "45.155.205.175/32", "45.155.205.160/32",
"45.155.205.146/32", "45.155.205.131/32",
"193.32.162.10/32", "193.32.162.11/32", "193.32.162.25/32",
"193.32.162.30/32", "193.32.162.40/32",
],
# Netlab/Shadowserver known sinkholes used by malware
"Shadowserver_Sinkhole": [
# Netlab/Shadowserver known sinkholes
"Shadowserver": [
"74.82.47.0/24",
"184.105.139.0/24", "184.105.247.0/24",
],
}
# Fallback Tor exit nodes when download unavailable
# Fallback Tor exit nodes when download is unavailable
FALLBACK_TOR_IPS = [
"185.220.101.34", "185.220.101.35", "185.220.101.36", "185.220.101.37",
"185.220.101.38", "185.220.101.39", "185.220.101.40", "185.220.101.41",
@ -111,61 +161,192 @@ FALLBACK_TOR_IPS = [
]
def load_tor_ips(tor_file):
"""Load Tor exit node IPs from downloaded file."""
ips = set()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def fetch_url(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8 text.

    Undecodable bytes are replaced rather than raising, so the result is
    always a ``str`` when the download succeeds.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as a string, or ``None`` if the request could not
        be built or completed. A warning is printed to stderr on failure.
    """
    # Local import: only the exception base class is needed here.
    import http.client

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (
        urllib.error.URLError,
        # Protocol-level failures (IncompleteRead, BadStatusLine, ...) are not
        # OSError subclasses and would otherwise escape despite the
        # "returns None on failure" contract.
        http.client.HTTPException,
        OSError,
        ValueError,  # malformed URL / unknown scheme
    ) as exc:
        print(f" [WARN] Failed to download {url}: {exc}", file=sys.stderr)
        return None
def _is_ip_or_cidr(token):
    """Return True if *token* parses as an IPv4/IPv6 address or network."""
    try:
        ipaddress.ip_network(token, strict=False)
    except ValueError:
        return False
    return True


def _unquote(token):
    """Strip surrounding whitespace and single/double quotes from *token*."""
    return token.strip().strip('"').strip("'").strip()


def _split_flow_items(fragment):
    """Split a flow-list fragment such as ``["a", "b"],`` into bare items."""
    cleaned = (_unquote(raw.strip().strip("[]")) for raw in fragment.split(","))
    return [item for item in cleaned if item]


def _strip_inline_comment(line):
    """Drop a YAML comment (``#`` at line start or after a space) from *line*."""
    if line.startswith("#"):
        return ""
    return line.partition(" #")[0].rstrip()


def parse_yaml_remote_addresses(text):
    """Extract ``remote_addresses`` entries from an Anubis crawler YAML file.

    Uses PyYAML when it is importable (``HAS_YAML``); otherwise falls back to
    a line-oriented scanner that understands the two layouts a flat list of
    CIDR strings can take: a block list (``- item`` per line) and a flow list
    (``[ "item", ... ]``, on one line or spread over several).

    Args:
        text: YAML document content.

    Returns:
        A list of address strings in document order. Entries are not
        validated or deduplicated here. A missing, null, or non-list
        ``remote_addresses`` value yields no entries.
    """
    if HAS_YAML:
        loaded = yaml.safe_load(text)
        # The document is either a list of rule mappings or a single mapping.
        rules = loaded if isinstance(loaded, list) else [loaded]
        addrs = []
        for rule in rules:
            if not isinstance(rule, dict):
                continue
            value = rule.get("remote_addresses")
            # Only a real list is iterated: a null value would raise and a
            # scalar string would be split into single characters.
            if isinstance(value, list):
                addrs.extend(a.strip() for a in value if isinstance(a, str))
        return addrs

    # Fallback: manual line-oriented parsing.
    addrs = []
    in_block = False
    for line in text.splitlines():
        stripped = _strip_inline_comment(line.strip())
        if not stripped:
            continue

        if stripped.startswith("remote_addresses:"):
            rest = stripped.split(":", 1)[1].strip()
            if rest.startswith("["):
                # Flow list opened (and possibly closed) on the key's line.
                addrs.extend(_split_flow_items(rest))
                in_block = not rest.endswith("]")
            else:
                in_block = True
            continue

        if not in_block:
            continue

        if stripped.startswith("]"):
            in_block = False
        elif stripped.startswith("- "):
            item = _unquote(stripped[2:])
            if ":" in item and not _is_ip_or_cidr(item):
                # "- name: ..." starts the next rule; it is not an address.
                # IPv6 values also contain ":" and are told apart by parsing.
                in_block = False
            elif item:
                addrs.append(item)
        elif stripped[0] in "\"'[":
            # Flow-list line; may hold several quoted items.
            addrs.extend(_split_flow_items(stripped))
            if stripped.endswith("]"):
                in_block = False
        else:
            # Unquoted content: either bare flow items or a new YAML key.
            bare = [t for t in _split_flow_items(stripped) if _is_ip_or_cidr(t)]
            if bare:
                addrs.extend(bare)
                if stripped.endswith("]"):
                    in_block = False
            elif ":" in stripped:
                in_block = False
    return addrs
def normalize_cidr(addr):
    """Return *addr* in canonical network notation.

    Surrounding whitespace is removed, host bits are masked off, and a bare
    address gains its host prefix (``/32`` or ``/128``). Input that does not
    parse as an IP network is returned stripped but otherwise unchanged.
    """
    candidate = addr.strip()
    try:
        network = ipaddress.ip_network(candidate, strict=False)
    except ValueError:
        # Not an address or network: hand the cleaned text back as-is.
        return candidate
    return str(network)
def sort_key(entry):
    """Build the ordering key for one ``(cidr, bot_name)`` pair.

    Entries order by bot name, then IP version, then numeric network address.
    A CIDR that cannot be parsed is keyed with version 99 and address 0.
    """
    network_text, owner = entry
    try:
        parsed = ipaddress.ip_network(network_text, strict=False)
    except ValueError:
        return (owner, 99, 0)
    return (owner, parsed.version, int(parsed.network_address))
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Build the bot IP CSV from Anubis data, curated scanners, and Tor exits.

    Sources are added in a fixed order (Anubis crawler files, curated scanner
    ranges, Tor exit nodes). The first source to claim a network keeps it, so
    later duplicates are ignored. With ``--no-download`` only the built-in
    scanner ranges and the fallback Tor list are used.

    Output is one ``cidr,bot_name`` line per network, sorted with
    ``sort_key``, followed by a per-category count printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
    )
    parser.add_argument("--output", default="bot_ip.csv",
                        help="Output CSV path (default: bot_ip.csv)")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip all network downloads; use only built-in data")
    args = parser.parse_args()

    # {cidr: bot_name} for dedup
    entries = {}
    # {category: count} for stats
    category_counts = {}

    def add_entry(cidr, bot_name, category):
        """Record *cidr* for *bot_name* unless it is invalid or already taken."""
        cidr = normalize_cidr(cidr)
        try:
            ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            # normalize_cidr returns unparseable input unchanged; keep such
            # text out of the CSV instead of emitting a malformed row.
            print(f" [WARN] Skipping invalid address {cidr!r} ({bot_name})",
                  file=sys.stderr)
            return
        if cidr not in entries:
            entries[cidr] = bot_name
            category_counts[category] = category_counts.get(category, 0) + 1

    # ------------------------------------------------------------------
    # 1. Anubis crawler YAML files
    # ------------------------------------------------------------------
    if not args.no_download:
        print("[bot_ip] Downloading Anubis crawler data ...")
        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
            url = ANUBIS_BASE + yaml_path
            print(f" Fetching {yaml_path} ...", end=" ")
            text = fetch_url(url)
            if text is None:
                print("FAILED")
                continue
            addrs = parse_yaml_remote_addresses(text)
            print(f"{len(addrs)} addresses")
            for addr in addrs:
                add_entry(addr, bot_name, category)

    # ------------------------------------------------------------------
    # 2. Curated scanner networks
    # ------------------------------------------------------------------
    print("[bot_ip] Adding curated scanner networks ...")
    for bot_name, networks in SCANNER_RANGES.items():
        for net in networks:
            add_entry(net, bot_name, "scanner")

    # ------------------------------------------------------------------
    # 3. Tor exit nodes
    # ------------------------------------------------------------------
    tor_ips = set()
    if not args.no_download:
        print("[bot_ip] Downloading Tor exit node list ...")
        text = fetch_url(TOR_URL, timeout=30)
        if text:
            for line in text.splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    ipaddress.ip_address(line)
                except ValueError:
                    continue  # not an address line; ignore it
                tor_ips.add(line)
            print(f" Downloaded {len(tor_ips)} Tor exit nodes")

    if not tor_ips:
        print(" Using fallback Tor exit node list")
        tor_ips = set(FALLBACK_TOR_IPS)

    for ip in tor_ips:
        # Pass the bare address: normalize_cidr appends the host prefix that
        # matches the IP version (/32 or /128). A hard-coded "/32" would turn
        # an IPv6 exit address into an enormous IPv6 /32 network.
        add_entry(ip, "tor_exit_node", "anonymizer")

    # ------------------------------------------------------------------
    # 4. Sort and write (entries are already deduplicated by add_entry)
    # ------------------------------------------------------------------
    sorted_entries = sorted(entries.items(), key=sort_key)

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{cidr},{bot_name}\n" for cidr, bot_name in sorted_entries)

    # ------------------------------------------------------------------
    # 5. Print stats
    # ------------------------------------------------------------------
    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
    print("[bot_ip] Breakdown by category:")
    for category in sorted(category_counts):
        print(f" {category:20s} {category_counts[category]:>6d}")
if __name__ == "__main__":