#!/usr/bin/env python3
"""
generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data,
scanner networks, and Tor exit nodes.

Sources:
  • Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
    - Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
    - AI crawlers: GPTBot, PerplexityBot
    - Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
  • Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
    SecurityTrails, ZoomEye, GreyNoise, Shadowserver
  • Tor exit nodes: https://check.torproject.org/torbulkexitlist

Anubis UA-only bot references (no IP ranges, documented here for reference):
  • ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
    Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
    Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
    Webzio-Extended, cohere-ai
  • ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
    Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
    FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
    omgili, PerplexityBot, Timpibot, YouBot

Output format (no header):
    ip_or_cidr,bot_name

Entries that do not parse as an IP address or network are skipped with a
warning on stderr, so every row of the output is a valid CIDR.

Usage:
    python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
"""

import argparse
import http.client
import ipaddress
import os
import sys
import urllib.error
import urllib.request

try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Pinned commit so the generated list is reproducible.
ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"
ANUBIS_BASE = (
    f"https://raw.githubusercontent.com/TecharoHQ/anubis/{ANUBIS_COMMIT}/data/"
)

# (yaml_path_relative_to_data, bot_name, category)
ANUBIS_CRAWLERS = [
    ("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
    ("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
    ("crawlers/applebot.yaml", "Applebot", "search_engine"),
    ("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
    ("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
    ("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
    ("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
]

TOR_URL = "https://check.torproject.org/torbulkexitlist"

# Curated scanner networks (category: scanner)
SCANNER_RANGES = {
    # Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
    "Shodan": [
        "66.240.192.0/24", "66.240.205.0/24", "66.240.236.0/24",
        "71.6.135.0/24", "71.6.146.0/24", "71.6.158.0/24",
        "71.6.165.0/24", "80.82.77.0/24", "80.82.78.0/24",
        "82.221.105.0/24", "82.221.106.0/24", "85.25.43.0/24",
        "85.25.103.0/24", "93.120.27.0/24", "94.102.49.0/24",
        "188.138.9.0/24", "198.20.69.0/24", "198.20.70.0/24",
        "198.20.87.0/24", "198.20.99.0/24", "209.126.110.0/24",
    ],
    # Censys — https://support.censys.io/hc/en-us/articles/360043177092
    "Censys": [
        "162.142.125.0/24", "167.248.133.0/24", "167.94.138.0/24",
        "167.94.145.0/24", "167.94.146.0/24", "192.35.168.0/23",
    ],
    # BinaryEdge — https://docs.binaryedge.io/
    "BinaryEdge": [
        "154.89.5.0/24", "45.143.200.0/22",
    ],
    # Stretchoid — persistent scanner botnet
    "Stretchoid": [
        "198.235.24.0/24", "205.210.31.0/24",
    ],
    # SecurityTrails (Recorded Future) crawlers
    "SecurityTrails": [
        "52.250.0.0/16",
    ],
    # ZoomEye (Knownsec)
    "ZoomEye": [
        "106.75.0.0/16",
    ],
    # GreyNoise known mass-scanners (individual IPs)
    "GreyNoise": [
        "45.155.205.233/32", "45.155.205.220/32", "45.155.205.205/32",
        "45.155.205.190/32", "45.155.205.175/32", "45.155.205.160/32",
        "45.155.205.146/32", "45.155.205.131/32",
        "193.32.162.10/32", "193.32.162.11/32", "193.32.162.25/32",
        "193.32.162.30/32", "193.32.162.40/32",
    ],
    # Netlab/Shadowserver known sinkholes
    "Shadowserver": [
        "74.82.47.0/24", "184.105.139.0/24", "184.105.247.0/24",
    ],
}

# Fallback Tor exit nodes when download is unavailable
FALLBACK_TOR_IPS = [
    "185.220.101.34", "185.220.101.35", "185.220.101.36", "185.220.101.37",
    "185.220.101.38", "185.220.101.39", "185.220.101.40", "185.220.101.41",
    "185.220.101.42", "185.220.101.43", "185.220.101.44", "185.220.101.45",
    "185.220.101.46", "185.220.101.47", "185.220.101.48", "185.220.101.49",
    "185.220.101.50", "185.220.101.51", "185.220.101.52", "185.220.101.53",
    "185.220.101.54", "185.220.101.55", "185.220.101.56", "185.220.101.57",
    "185.220.101.58", "185.220.101.59", "185.220.101.60", "185.220.101.61",
    "185.220.101.62", "185.220.101.63", "185.220.101.64", "185.220.101.65",
    "185.220.101.66", "185.220.101.67", "185.220.101.68", "185.220.101.69",
    "185.220.101.70", "185.220.101.71", "185.220.101.72", "185.220.101.73",
    "185.220.101.74", "185.220.101.75", "185.220.101.76", "185.220.101.77",
    "185.220.101.78", "185.220.101.79", "185.220.101.80", "185.220.101.81",
    "185.220.101.82", "185.220.101.83", "185.220.101.84", "185.220.101.85",
    "185.220.101.86", "185.220.101.87", "185.220.101.88", "185.220.101.89",
    "185.220.101.90", "185.220.101.91", "185.220.101.92", "185.220.101.93",
    "185.220.101.94", "185.220.101.95", "185.220.101.96", "185.220.101.97",
    "185.220.100.240", "185.220.100.241", "185.220.100.242", "185.220.100.243",
    "185.220.100.244", "185.220.100.245", "185.220.100.246", "185.220.100.247",
    "185.220.100.248", "185.220.100.249", "185.220.100.250", "185.220.100.251",
    "185.220.100.252", "185.220.100.253", "185.220.100.254", "185.220.100.255",
    "178.20.55.16", "178.20.55.18", "178.20.55.182",
    "23.129.64.130", "23.129.64.131", "23.129.64.132", "23.129.64.133",
    "23.129.64.134", "23.129.64.135", "23.129.64.136", "23.129.64.137",
    "23.129.64.138", "23.129.64.139", "23.129.64.140", "23.129.64.141",
    "23.129.64.142", "23.129.64.143", "23.129.64.144", "23.129.64.145",
    "23.129.64.146", "23.129.64.147", "23.129.64.148", "23.129.64.149",
    "23.129.64.150", "23.129.64.151", "23.129.64.152", "23.129.64.153",
    "104.244.76.13", "104.244.76.14", "104.244.76.15", "104.244.76.16",
    "104.244.76.17", "104.244.76.18", "104.244.76.19", "104.244.76.20",
    "199.249.230.64", "199.249.230.65", "199.249.230.66", "199.249.230.67",
    "199.249.230.68", "199.249.230.69", "199.249.230.70", "199.249.230.71",
    "199.249.230.72", "199.249.230.73", "199.249.230.74", "199.249.230.75",
    "199.249.230.76", "199.249.230.77", "199.249.230.78", "199.249.230.79",
    "199.249.230.80", "199.249.230.81", "199.249.230.82", "199.249.230.83",
    "199.249.230.84", "199.249.230.85", "199.249.230.86", "199.249.230.87",
    "199.249.230.88", "199.249.230.89",
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def fetch_url(url, timeout=30):
    """Download URL content as string. Returns None on failure.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes replaced), or
        None if the request or the read failed; a warning is printed to
        stderr in that case.
    """
    try:
        req = urllib.request.Request(
            url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (urllib.error.URLError, http.client.HTTPException,
            OSError, ValueError) as exc:
        # HTTPException covers truncated bodies (IncompleteRead) raised by
        # resp.read(), which are not OSError subclasses.
        print(f" [WARN] Failed to download {url}: {exc}", file=sys.stderr)
        return None


def _strip_yaml_comment(line):
    """Return *line* stripped of surrounding whitespace and any '#' comment."""
    stripped = line.strip()
    if stripped.startswith("#"):
        return ""
    # A '#' only starts a comment when preceded by whitespace.
    head, _, _ = stripped.partition(" #")
    return head.strip()


def _clean_yaml_scalar(token):
    """Strip whitespace, a trailing comma, and surrounding quotes from *token*."""
    return token.strip().rstrip(",").strip().strip("\"'").strip()


def _addresses_from_documents(docs):
    """Collect remote_addresses strings from parsed YAML (a dict or list of dicts)."""
    if isinstance(docs, dict):
        docs = [docs]
    if not isinstance(docs, list):
        return []
    addrs = []
    for doc in docs:
        if not isinstance(doc, dict):
            continue
        value = doc.get("remote_addresses")
        if isinstance(value, str):
            # A bare scalar must not be iterated character by character.
            value = [value]
        if isinstance(value, (list, tuple)):
            addrs.extend(value)
    return [a.strip() for a in addrs if isinstance(a, str)]


def _parse_remote_addresses_text(text):
    """Extract remote_addresses items from YAML text without a YAML library.

    Handles both list styles used for flat CIDR lists: a block sequence of
    ``- item`` lines, and a flow list ``[a, b]`` that may span several lines.
    """
    addrs = []
    # None: outside any list; "block": "- item" lines; "flow": inside [...].
    mode = None

    def consume_flow(fragment):
        """Collect items from part of a flow list; True when ']' closes it."""
        body, closer, _ = fragment.partition("]")
        for token in body.split(","):
            value = _clean_yaml_scalar(token)
            if value:
                addrs.append(value)
        return bool(closer)

    for raw in text.splitlines():
        line = _strip_yaml_comment(raw)
        if not line:
            continue

        if mode == "flow":
            if consume_flow(line):
                mode = None
            continue

        # The key may be the first key of a list item ("- remote_addresses:").
        key_line = line[2:].lstrip() if line.startswith("- ") else line
        if key_line.startswith("remote_addresses:"):
            rest = key_line.split(":", 1)[1].strip()
            if rest.startswith("["):
                mode = None if consume_flow(rest[1:]) else "flow"
            else:
                mode = "block"
            continue

        if mode == "block":
            if line.startswith("["):
                # Flow list opened on the line after the key.
                mode = None if consume_flow(line[1:]) else "flow"
            elif line.startswith("- "):
                value = _clean_yaml_scalar(line[2:])
                if ": " in value or value.endswith(":"):
                    # "- key: value" starts the next mapping, not an address
                    # (IPv6 literals contain ':' but never ': ').
                    mode = None
                elif value:
                    addrs.append(value)
            else:
                # Any other line (a new key) ends the sequence.
                mode = None

    return addrs


def parse_yaml_remote_addresses(text):
    """Extract remote_addresses from an Anubis crawler YAML file.

    Uses PyYAML if available, otherwise falls back to simple text parsing
    since the YAML structure is predictable: remote_addresses is always a
    flat list of CIDR strings. The text parser is also used when PyYAML
    rejects the document, so one malformed file does not abort the run.

    Args:
        text: Contents of the YAML file.

    Returns:
        List of address strings in document order (not validated here).
    """
    if HAS_YAML:
        try:
            docs = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            print(f" [WARN] YAML parse error, using text fallback: {exc}",
                  file=sys.stderr)
        else:
            return _addresses_from_documents(docs)
    return _parse_remote_addresses_text(text)


def parse_tor_exit_list(text):
    """Parse a Tor bulk exit list into a set of single-host CIDR strings.

    Blank lines, '#' comments, and lines that are not IP addresses are
    ignored. Each address is emitted with its full-length prefix (/32 for
    IPv4, /128 for IPv6) so an IPv6 exit never widens into a /32 network.

    Args:
        text: Contents of the exit list, one IP address per line.

    Returns:
        Set of strings such as "1.2.3.4/32" or "2001:db8::1/128".
    """
    hosts = set()
    for line in text.splitlines():
        candidate = line.strip()
        if not candidate or candidate.startswith("#"):
            continue
        try:
            ip = ipaddress.ip_address(candidate)
        except ValueError:
            continue
        hosts.add(f"{ip}/{ip.max_prefixlen}")
    return hosts


def _parse_network(addr):
    """Return the ip_network for *addr* (host bits masked), or None if invalid."""
    try:
        return ipaddress.ip_network(addr.strip(), strict=False)
    except ValueError:
        return None


def normalize_cidr(addr):
    """Normalize an IP or CIDR to consistent string form.

    A bare address becomes a host network ("1.2.3.4" -> "1.2.3.4/32") and
    host bits are masked off ("10.1.2.3/24" -> "10.1.2.0/24"). Input that
    does not parse is returned stripped but otherwise unchanged.
    """
    addr = addr.strip()
    net = _parse_network(addr)
    return addr if net is None else str(net)


def sort_key(entry):
    """Sort entries by (bot_name, ip_version, network_address).

    Args:
        entry: A (cidr, bot_name) tuple.

    Returns:
        A 3-tuple sort key; unparseable CIDRs get version 99 and address 0
        so they sort after valid entries of the same bot.
    """
    cidr, bot_name = entry
    try:
        net = ipaddress.ip_network(cidr, strict=False)
    except ValueError:
        return (bot_name, 99, 0)
    return (bot_name, net.version, int(net.network_address))


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main(argv=None):
    """Build the bot IP list and write it as CSV.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:] when None.
    """
    parser = argparse.ArgumentParser(
        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
    )
    parser.add_argument("--output", default="bot_ip.csv",
                        help="Output CSV path (default: bot_ip.csv)")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip all network downloads; use only built-in data")
    args = parser.parse_args(argv)

    # {cidr: bot_name} for dedup
    entries = {}
    # {category: count} for stats
    category_counts = {}

    def add_entry(cidr, bot_name, category):
        """Record one network; the first bot to claim a CIDR keeps it."""
        network = _parse_network(cidr)
        if network is None:
            # Keep garbage (e.g. a mis-parsed YAML line) out of the CSV.
            print(f" [WARN] Skipping invalid address {cidr!r} for {bot_name}",
                  file=sys.stderr)
            return
        key = str(network)
        if key not in entries:
            entries[key] = bot_name
            category_counts[category] = category_counts.get(category, 0) + 1

    # ------------------------------------------------------------------
    # 1. Anubis crawler YAML files
    # ------------------------------------------------------------------
    if not args.no_download:
        print("[bot_ip] Downloading Anubis crawler data ...")
        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
            url = ANUBIS_BASE + yaml_path
            print(f" Fetching {yaml_path} ...", end=" ")
            text = fetch_url(url)
            if text is None:
                print("FAILED")
                continue
            addrs = parse_yaml_remote_addresses(text)
            print(f"{len(addrs)} addresses")
            for addr in addrs:
                add_entry(addr, bot_name, category)

    # ------------------------------------------------------------------
    # 2. Curated scanner networks
    # ------------------------------------------------------------------
    print("[bot_ip] Adding curated scanner networks ...")
    for bot_name, networks in SCANNER_RANGES.items():
        for net in networks:
            add_entry(net, bot_name, "scanner")

    # ------------------------------------------------------------------
    # 3. Tor exit nodes
    # ------------------------------------------------------------------
    tor_hosts = set()
    if not args.no_download:
        print("[bot_ip] Downloading Tor exit node list ...")
        text = fetch_url(TOR_URL, timeout=30)
        if text:
            tor_hosts = parse_tor_exit_list(text)
            print(f" Downloaded {len(tor_hosts)} Tor exit nodes")

    if not tor_hosts:
        print(" Using fallback Tor exit node list")
        tor_hosts = set(FALLBACK_TOR_IPS)

    # add_entry turns a bare address into a host network (/32 or /128).
    for host in tor_hosts:
        add_entry(host, "tor_exit_node", "anonymizer")

    # ------------------------------------------------------------------
    # 4. Sort, deduplicate, and write
    # ------------------------------------------------------------------
    sorted_entries = sorted(entries.items(), key=sort_key)

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{cidr},{bot_name}\n" for cidr, bot_name in sorted_entries)

    # ------------------------------------------------------------------
    # 5. Print stats
    # ------------------------------------------------------------------
    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
    print("[bot_ip] Breakdown by category:")
    for category in sorted(category_counts):
        print(f" {category:20s} {category_counts[category]:>6d}")


if __name__ == "__main__":
    main()