ja4-platform/scripts/generate_bot_ip.py

#!/usr/bin/env python3
"""
generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data, scanner
networks, and Tor exit nodes.

Sources:
  • Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
    - Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
    - AI crawlers: GPTBot, PerplexityBot
    - Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
  • Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
    SecurityTrails, ZoomEye, GreyNoise, Shadowserver
  • Tor exit nodes: https://check.torproject.org/torbulkexitlist

Anubis UA-only bot references (no IP ranges, documented here for reference):
  • ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
    Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
    Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
    Webzio-Extended, cohere-ai
  • ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
    Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
    FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
    omgili, PerplexityBot, Timpibot, YouBot

Output format (no header):
    ip_or_cidr,bot_name

Usage:
    python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
"""

import argparse
import ipaddress
import os
import sys
import urllib.request
import urllib.error

try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Commit of the TecharoHQ/anubis repository that the crawler YAML files are
# downloaded from. The hash is embedded in ANUBIS_BASE, so every run fetches
# the same revision of the data until this value is changed.
ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"
# Base URL of the repository's data/ directory at that commit; the relative
# paths listed in ANUBIS_CRAWLERS are appended to it.
ANUBIS_BASE = (
    f"https://raw.githubusercontent.com/TecharoHQ/anubis/{ANUBIS_COMMIT}/data/"
)

# Anubis crawler definitions to download, one tuple per file:
#   (yaml_path_relative_to_data, bot_name, category)
# bot_name is the label written to the CSV for every address found in the
# file; category is only used for the per-category summary printed by main().
ANUBIS_CRAWLERS = [
    ("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
    ("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
    ("crawlers/applebot.yaml", "Applebot", "search_engine"),
    ("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
    ("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
    ("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
    ("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
]

# Plain-text list of Tor exit addresses; main() reads it one IP per line and
# ignores blank lines and lines starting with "#".
TOR_URL = "https://check.torproject.org/torbulkexitlist"

# Curated scanner networks (category: scanner)
# Maps a scanner label to the list of CIDR strings attributed to it. main()
# adds every range under the "scanner" category with the dict key as bot_name.
# The list is maintained by hand and is used whether or not downloads are
# enabled.
SCANNER_RANGES = {
    # Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
    "Shodan": [
        "66.240.192.0/24", "66.240.205.0/24", "66.240.236.0/24",
        "71.6.135.0/24", "71.6.146.0/24", "71.6.158.0/24", "71.6.165.0/24",
        "80.82.77.0/24", "80.82.78.0/24",
        "82.221.105.0/24", "82.221.106.0/24",
        "85.25.43.0/24", "85.25.103.0/24",
        "93.120.27.0/24",
        "94.102.49.0/24",
        "188.138.9.0/24",
        "198.20.69.0/24", "198.20.70.0/24", "198.20.87.0/24", "198.20.99.0/24",
        "209.126.110.0/24",
    ],
    # Censys — https://support.censys.io/hc/en-us/articles/360043177092
    "Censys": [
        "162.142.125.0/24", "167.248.133.0/24", "167.94.138.0/24",
        "167.94.145.0/24", "167.94.146.0/24",
        "192.35.168.0/23",
    ],
    # BinaryEdge — https://docs.binaryedge.io/
    "BinaryEdge": [
        "154.89.5.0/24",
        "45.143.200.0/22",
    ],
    # Stretchoid — persistent scanner botnet
    "Stretchoid": [
        "198.235.24.0/24",
        "205.210.31.0/24",
    ],
    # SecurityTrails (Recorded Future) crawlers
    # NOTE(review): a /16 is a very wide match for a single vendor — confirm
    # the whole range is really dedicated to this crawler.
    "SecurityTrails": [
        "52.250.0.0/16",
    ],
    # ZoomEye (Knownsec)
    # NOTE(review): same concern as above — verify this /16 against a source.
    "ZoomEye": [
        "106.75.0.0/16",
    ],
    # GreyNoise known mass-scanners (individual IPs)
    "GreyNoise": [
        "45.155.205.233/32", "45.155.205.220/32", "45.155.205.205/32",
        "45.155.205.190/32", "45.155.205.175/32", "45.155.205.160/32",
        "45.155.205.146/32", "45.155.205.131/32",
        "193.32.162.10/32", "193.32.162.11/32", "193.32.162.25/32",
        "193.32.162.30/32", "193.32.162.40/32",
    ],
    # Netlab/Shadowserver known sinkholes
    "Shadowserver": [
        "74.82.47.0/24",
        "184.105.139.0/24", "184.105.247.0/24",
    ],
}

# Fallback Tor exit nodes when download is unavailable
# main() uses this static snapshot whenever no exit address was obtained from
# TOR_URL: with --no-download, when the download fails, or when the response
# contains no parseable IP. Entries are bare IP addresses (no prefix length).
FALLBACK_TOR_IPS = [
    "185.220.101.34", "185.220.101.35", "185.220.101.36", "185.220.101.37",
    "185.220.101.38", "185.220.101.39", "185.220.101.40", "185.220.101.41",
    "185.220.101.42", "185.220.101.43", "185.220.101.44", "185.220.101.45",
    "185.220.101.46", "185.220.101.47", "185.220.101.48", "185.220.101.49",
    "185.220.101.50", "185.220.101.51", "185.220.101.52", "185.220.101.53",
    "185.220.101.54", "185.220.101.55", "185.220.101.56", "185.220.101.57",
    "185.220.101.58", "185.220.101.59", "185.220.101.60", "185.220.101.61",
    "185.220.101.62", "185.220.101.63", "185.220.101.64", "185.220.101.65",
    "185.220.101.66", "185.220.101.67", "185.220.101.68", "185.220.101.69",
    "185.220.101.70", "185.220.101.71", "185.220.101.72", "185.220.101.73",
    "185.220.101.74", "185.220.101.75", "185.220.101.76", "185.220.101.77",
    "185.220.101.78", "185.220.101.79", "185.220.101.80", "185.220.101.81",
    "185.220.101.82", "185.220.101.83", "185.220.101.84", "185.220.101.85",
    "185.220.101.86", "185.220.101.87", "185.220.101.88", "185.220.101.89",
    "185.220.101.90", "185.220.101.91", "185.220.101.92", "185.220.101.93",
    "185.220.101.94", "185.220.101.95", "185.220.101.96", "185.220.101.97",
    "185.220.100.240", "185.220.100.241", "185.220.100.242", "185.220.100.243",
    "185.220.100.244", "185.220.100.245", "185.220.100.246", "185.220.100.247",
    "185.220.100.248", "185.220.100.249", "185.220.100.250", "185.220.100.251",
    "185.220.100.252", "185.220.100.253", "185.220.100.254", "185.220.100.255",
    "178.20.55.16", "178.20.55.18", "178.20.55.182",
    "23.129.64.130", "23.129.64.131", "23.129.64.132", "23.129.64.133",
    "23.129.64.134", "23.129.64.135", "23.129.64.136", "23.129.64.137",
    "23.129.64.138", "23.129.64.139", "23.129.64.140", "23.129.64.141",
    "23.129.64.142", "23.129.64.143", "23.129.64.144", "23.129.64.145",
    "23.129.64.146", "23.129.64.147", "23.129.64.148", "23.129.64.149",
    "23.129.64.150", "23.129.64.151", "23.129.64.152", "23.129.64.153",
    "104.244.76.13", "104.244.76.14", "104.244.76.15", "104.244.76.16",
    "104.244.76.17", "104.244.76.18", "104.244.76.19", "104.244.76.20",
    "199.249.230.64", "199.249.230.65", "199.249.230.66", "199.249.230.67",
    "199.249.230.68", "199.249.230.69", "199.249.230.70", "199.249.230.71",
    "199.249.230.72", "199.249.230.73", "199.249.230.74", "199.249.230.75",
    "199.249.230.76", "199.249.230.77", "199.249.230.78", "199.249.230.79",
    "199.249.230.80", "199.249.230.81", "199.249.230.82", "199.249.230.83",
    "199.249.230.84", "199.249.230.85", "199.249.230.86", "199.249.230.87",
    "199.249.230.88", "199.249.230.89",
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def fetch_url(url, timeout=30):
    """Download *url* and return its body as text, or ``None`` on failure.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced
        with U+FFFD), or ``None`` when the request could not be completed.
        Failures are reported as a warning on stderr instead of being raised.
    """
    # Local import: urllib.request already loads http.client, this only binds
    # the name so protocol-level errors can be listed in the except clause.
    import http.client

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read()
    except (
        urllib.error.URLError,      # DNS/connect/HTTP status errors
        OSError,                    # socket timeouts, resets
        ValueError,                 # malformed URL
        http.client.HTTPException,  # e.g. IncompleteRead on a truncated body
    ) as exc:
        print(f"  [WARN] Failed to download {url}: {exc}", file=sys.stderr)
        return None
    return body.decode("utf-8", errors="replace")


def _unquote_scalar(token):
    """Strip surrounding whitespace and quote characters from a YAML scalar."""
    return token.strip().strip("\"'").strip()


def _strip_yaml_comment(line):
    """Return *line* without a whole-line or trailing ``# comment``."""
    if line.lstrip().startswith("#"):
        return ""
    # CIDR strings never contain "#", so cutting at the first " #" is safe for
    # the only values this parser cares about.
    return line.partition(" #")[0]


def _consume_flow_fragment(fragment, addrs):
    """Append the items of a flow-sequence fragment (``[a, "b", ...``) to *addrs*.

    Returns True once the closing ``]`` has been seen in *fragment*.
    """
    body, closing, _ = fragment.partition("]")
    for token in body.replace("[", " ").split(","):
        item = _unquote_scalar(token)
        if item:
            addrs.append(item)
    return bool(closing)


def _addresses_from_document(doc):
    """Return the string entries of ``doc["remote_addresses"]`` (or [])."""
    if not isinstance(doc, dict):
        return []
    value = doc.get("remote_addresses")
    if isinstance(value, str):
        # A lone scalar must not be iterated character by character.
        value = [value]
    if not isinstance(value, (list, tuple)):
        # Covers a missing key as well as ``remote_addresses: null``.
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def _scan_remote_addresses(text):
    """Extract ``remote_addresses`` items from YAML text without a YAML parser.

    Understands block lists (``- item`` lines), flow lists on one or several
    lines (``[a, b]``), quoted or bare items, and ``#`` comments.
    """
    addrs = []
    state = None   # None, "block" (bare key seen) or "flow" (inside [...])
    key_col = 0    # column of the most recent remote_addresses key
    for raw_line in text.splitlines():
        line = _strip_yaml_comment(raw_line).rstrip()
        stripped = line.strip()
        if not stripped:
            continue

        # A flow list may open on the line after the key.
        if state == "block" and stripped.startswith("["):
            state = "flow"
        if state == "flow":
            if _consume_flow_fragment(stripped, addrs):
                state = None
            continue

        if state == "block":
            indent = len(line) - len(line.lstrip())
            # Items of this list are indented at least as far as the key; a
            # "- " at a shallower column starts the next entry of the
            # enclosing list (e.g. "- name: ...") and is not an address.
            if stripped.startswith("- ") and indent >= key_col:
                item = _unquote_scalar(stripped[2:])
                if item:
                    addrs.append(item)
                continue
            state = None  # block ended; this line may itself hold the key

        key_line = stripped[2:].lstrip() if stripped.startswith("- ") else stripped
        if not key_line.startswith("remote_addresses:"):
            continue
        key_col = line.index("remote_addresses:")
        rest = key_line.split(":", 1)[1].strip()
        if rest.startswith("["):
            state = None if _consume_flow_fragment(rest, addrs) else "flow"
        elif rest:
            # Single scalar value on the key line.
            scalar = _unquote_scalar(rest)
            if scalar and scalar not in ("null", "~"):
                addrs.append(scalar)
        else:
            state = "block"
    return addrs


def parse_yaml_remote_addresses(text):
    """Extract remote_addresses from an Anubis crawler YAML file.

    Uses PyYAML when it is importable (``HAS_YAML``). The document may be a
    single mapping or a list of mappings; only a top-level
    ``remote_addresses`` key of each mapping is read. Without PyYAML, or when
    PyYAML rejects the text, a line-based scanner is used instead.

    Args:
        text: YAML document as a string.

    Returns:
        List of address strings in document order. Non-string items are
        dropped; the strings are not validated as IP networks here.
    """
    if HAS_YAML:
        try:
            parsed = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            # Best effort: keep going with the text scanner instead of
            # aborting the whole run on one malformed file.
            print(f"  [WARN] YAML parse error, using text scan: {exc}",
                  file=sys.stderr)
        else:
            docs = parsed if isinstance(parsed, list) else [parsed]
            addrs = []
            for doc in docs:
                addrs.extend(_addresses_from_document(doc))
            return addrs

    return _scan_remote_addresses(text)


def normalize_cidr(addr):
    """Return the canonical network string for an IP address or CIDR.

    Surrounding whitespace is removed and host bits are masked off, so a bare
    address becomes a single-host network. Input that cannot be parsed is
    handed back (whitespace-trimmed) rather than rejected.
    """
    cleaned = addr.strip()
    try:
        return str(ipaddress.ip_network(cleaned, strict=False))
    except ValueError:
        return cleaned


def sort_key(entry):
    """Build the ordering key ``(bot_name, ip_version, network_address)``.

    *entry* is a ``(cidr, bot_name)`` pair. Entries whose CIDR cannot be
    parsed get version 99 and address 0, which places them after the valid
    networks of the same bot.
    """
    network_text, owner = entry
    try:
        parsed = ipaddress.ip_network(network_text, strict=False)
    except ValueError:
        return (owner, 99, 0)
    return (owner, parsed.version, int(parsed.network_address))


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Build the bot IP list and write it to the CSV named by ``--output``.

    Steps:
      1. Unless ``--no-download`` is given, fetch each Anubis crawler YAML
         file and collect its remote addresses.
      2. Add the curated ``SCANNER_RANGES``.
      3. Add Tor exit nodes from ``TOR_URL``, or ``FALLBACK_TOR_IPS`` when no
         address could be downloaded.
      4. Sort the de-duplicated entries and write ``cidr,bot_name`` lines.
      5. Print a per-category summary.

    When the same network is reported by several sources, the first one added
    wins (order: Anubis, scanners, Tor). Addresses that are not valid IP
    networks are skipped with a warning on stderr.
    """
    parser = argparse.ArgumentParser(
        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
    )
    parser.add_argument("--output", default="bot_ip.csv",
                        help="Output CSV path (default: bot_ip.csv)")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip all network downloads; use only built-in data")
    args = parser.parse_args()

    # {cidr: bot_name} for dedup
    entries = {}
    # {category: count} for stats
    category_counts = {}

    def add_entry(cidr, bot_name, category):
        """Record *cidr* for *bot_name* unless it is invalid or already known."""
        cidr = normalize_cidr(cidr)
        try:
            # normalize_cidr() returns unparseable input unchanged; such
            # strings must not reach the CSV, whose first column is documented
            # as an IP or CIDR.
            ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            print(f"  [WARN] Skipping invalid address {cidr!r} ({bot_name})",
                  file=sys.stderr)
            return
        if cidr not in entries:
            entries[cidr] = bot_name
            category_counts[category] = category_counts.get(category, 0) + 1

    # ------------------------------------------------------------------
    # 1. Anubis crawler YAML files
    # ------------------------------------------------------------------
    if not args.no_download:
        print("[bot_ip] Downloading Anubis crawler data ...")
        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
            url = ANUBIS_BASE + yaml_path
            print(f"  Fetching {yaml_path} ...", end=" ")
            text = fetch_url(url)
            if text is None:
                print("FAILED")
                continue
            addrs = parse_yaml_remote_addresses(text)
            print(f"{len(addrs)} addresses")
            for addr in addrs:
                add_entry(addr, bot_name, category)

    # ------------------------------------------------------------------
    # 2. Curated scanner networks
    # ------------------------------------------------------------------
    print("[bot_ip] Adding curated scanner networks ...")
    for bot_name, networks in SCANNER_RANGES.items():
        for net in networks:
            add_entry(net, bot_name, "scanner")

    # ------------------------------------------------------------------
    # 3. Tor exit nodes
    # ------------------------------------------------------------------
    tor_ips = set()
    if not args.no_download:
        print("[bot_ip] Downloading Tor exit node list ...")
        text = fetch_url(TOR_URL, timeout=30)
        if text:
            for line in text.splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    ipaddress.ip_address(line)
                except ValueError:
                    # Not an IP address (e.g. an HTML error page); ignore.
                    continue
                tor_ips.add(line)
            print(f"  Downloaded {len(tor_ips)} Tor exit nodes")

    if not tor_ips:
        print("  Using fallback Tor exit node list")
        tor_ips = set(FALLBACK_TOR_IPS)

    for ip in tor_ips:
        # Pass the bare address: normalization turns it into a single-host
        # network (/32 for IPv4, /128 for IPv6). Appending "/32" by hand would
        # widen an IPv6 exit address to an entire /32 network.
        add_entry(ip, "tor_exit_node", "anonymizer")

    # ------------------------------------------------------------------
    # 4. Sort, deduplicate, and write
    # ------------------------------------------------------------------
    sorted_entries = sorted(entries.items(), key=sort_key)

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{cidr},{bot_name}\n" for cidr, bot_name in sorted_entries)

    # ------------------------------------------------------------------
    # 5. Print stats
    # ------------------------------------------------------------------
    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
    print("[bot_ip] Breakdown by category:")
    for category in sorted(category_counts):
        print(f"  {category:20s} {category_counts[category]:>6d}")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()