feat: browser JA4 detection, Anubis bot rules, worldwide ASN data

- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...)
- Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuckGo, OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries)
- Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs)
- Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views
- Add /api/browsers dashboard endpoint
- Fix CSV quoting for fields containing commas (User-Agent strings)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 15:27:37 +02:00
parent b6184e6529
commit 7d09c614c3
15 changed files with 885900 additions and 3151 deletions
--- a/scripts/generate_bot_ip.py
+++ b/scripts/generate_bot_ip.py
@ -1,25 +1,75 @@
 #!/usr/bin/env python3
 """
-generate_bot_ip.py — Generate bot_ip.csv from known scanner networks + Tor exit nodes.
+generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data, scanner
+networks, and Tor exit nodes.

 Sources:
-  • Tor exit nodes: downloaded list or hardcoded fallback
-  • Shodan:   known scanner ranges (census.shodan.io, 2024)
-  • Censys:   known scanner ranges (censys.io, 2024)
-  • Binaryedge, SecurityTrails, ZoomEye, Stretchoid: known ranges
-  • GreyNoise: top mass-scanner IPs (manually curated)
+  • Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
+    - Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
+    - AI crawlers: GPTBot, PerplexityBot
+    - Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
+  • Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
+    SecurityTrails, ZoomEye, GreyNoise, Shadowserver
+  • Tor exit nodes: https://check.torproject.org/torbulkexitlist
+
+Anubis UA-only bot references (no IP ranges, documented here for reference):
+  • ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
+    Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
+    Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
+    Webzio-Extended, cohere-ai
+  • ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
+    Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
+    FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
+    omgili, PerplexityBot, Timpibot, YouBot

 Output format (no header):
-    <ip_or_cidr>,<bot_name>
+    ip_or_cidr,bot_name
+
+Usage:
+    python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
 """
+
 import argparse
 import ipaddress
+import os
 import sys
+import urllib.request
+import urllib.error

-# --- Known scanner networks (public, well-documented) ---
-KNOWN_SCANNERS = {
+try:
+    import yaml
+    HAS_YAML = True
+except ImportError:
+    HAS_YAML = False
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Anubis data files are fetched at a pinned commit hash (not a branch name),
+# so every run reads the same revision of the crawler definitions.
+ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"
+# Base URL of the Anubis "data/" directory at the pinned commit; the relative
+# paths listed in ANUBIS_CRAWLERS are appended to it.
+ANUBIS_BASE = (
+    f"https://raw.githubusercontent.com/TecharoHQ/anubis/{ANUBIS_COMMIT}/data/"
+)
+
+# (yaml_path_relative_to_data, bot_name, category)
+# bot_name is the label written to the CSV; category is only used by main()
+# for the per-category statistics printed at the end of a run.
+ANUBIS_CRAWLERS = [
+    ("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
+    ("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
+    ("crawlers/applebot.yaml", "Applebot", "search_engine"),
+    ("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
+    ("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
+    ("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
+    ("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
+    ("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
+    ("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
+]
+
+# Tor bulk exit list; main() parses the response as one IP address per line
+# and ignores blank lines and lines starting with "#".
+TOR_URL = "https://check.torproject.org/torbulkexitlist"
+
+# Curated scanner networks (category: scanner)
+SCANNER_RANGES = {
    # Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
-    "Shodan_Scanner": [
+    "Shodan": [
        "66.240.192.0/24", "66.240.205.0/24", "66.240.236.0/24",
        "71.6.135.0/24", "71.6.146.0/24", "71.6.158.0/24", "71.6.165.0/24",
        "80.82.77.0/24", "80.82.78.0/24",
@ -32,45 +82,45 @@ KNOWN_SCANNERS = {
        "209.126.110.0/24",
    ],
    # Censys — https://support.censys.io/hc/en-us/articles/360043177092
-    "Censys_Scanner": [
+    "Censys": [
        "162.142.125.0/24", "167.248.133.0/24", "167.94.138.0/24",
        "167.94.145.0/24", "167.94.146.0/24",
        "192.35.168.0/23",
    ],
    # BinaryEdge — https://docs.binaryedge.io/
-    "BinaryEdge_Scanner": [
+    "BinaryEdge": [
        "154.89.5.0/24",
        "45.143.200.0/22",
    ],
    # Stretchoid — persistent scanner botnet
-    "Stretchoid_Scanner": [
+    "Stretchoid": [
        "198.235.24.0/24",
        "205.210.31.0/24",
    ],
    # SecurityTrails (Recorded Future) crawlers
-    "SecurityTrails_Crawler": [
+    "SecurityTrails": [
        "52.250.0.0/16",
    ],
    # ZoomEye (Knownsec)
-    "ZoomEye_Scanner": [
+    "ZoomEye": [
        "106.75.0.0/16",
    ],
    # GreyNoise known mass-scanners (individual IPs)
-    "GreyNoise_MassScanner": [
+    "GreyNoise": [
        "45.155.205.233/32", "45.155.205.220/32", "45.155.205.205/32",
        "45.155.205.190/32", "45.155.205.175/32", "45.155.205.160/32",
        "45.155.205.146/32", "45.155.205.131/32",
        "193.32.162.10/32", "193.32.162.11/32", "193.32.162.25/32",
        "193.32.162.30/32", "193.32.162.40/32",
    ],
-    # Netlab/Shadowserver known sinkholes used by malware
-    "Shadowserver_Sinkhole": [
+    # Netlab/Shadowserver known sinkholes
+    "Shadowserver": [
        "74.82.47.0/24",
        "184.105.139.0/24", "184.105.247.0/24",
    ],
 }

-# Fallback Tor exit nodes when download unavailable
+# Fallback Tor exit nodes when download is unavailable
 FALLBACK_TOR_IPS = [
    "185.220.101.34", "185.220.101.35", "185.220.101.36", "185.220.101.37",
    "185.220.101.38", "185.220.101.39", "185.220.101.40", "185.220.101.41",
@ -111,61 +161,192 @@ FALLBACK_TOR_IPS = [
 ]


-def load_tor_ips(tor_file):
-    """Load Tor exit node IPs from downloaded file."""
-    ips = set()
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def fetch_url(url, timeout=30):
+    """Download ``url`` and return its body decoded as UTF-8 text.
+
+    Undecodable bytes are replaced rather than raising. Any download failure
+    (bad URL, network/HTTP error, timeout, truncated response) is reported as
+    a warning on stderr and signalled to the caller by returning ``None``.
+    """
+    # Imported locally so this helper does not depend on the module's import
+    # block; needed for HTTPException below.
+    import http.client
+
     try:
-        with open(tor_file) as f:
-            for line in f:
+        req = urllib.request.Request(url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.read().decode("utf-8", errors="replace")
+    except (
+        urllib.error.URLError,
+        # resp.read() can raise IncompleteRead on a truncated body; it is an
+        # HTTPException, which is not a subclass of OSError or URLError.
+        http.client.HTTPException,
+        OSError,
+        ValueError,
+    ) as exc:
+        print(f"  [WARN] Failed to download {url}: {exc}", file=sys.stderr)
+        return None
+
+
+def parse_yaml_remote_addresses(text):
+    """Extract the ``remote_addresses`` entries from an Anubis crawler YAML file.
+
+    Uses PyYAML when it is importable (``HAS_YAML``); otherwise falls back to
+    a line-based scan that understands the three layouts handled below: an
+    inline ``[a, b]`` list, a ``- item`` block list, and a bracketed list
+    spread over several lines with quoted items.
+
+    Returns a list of address strings (not validated as IPs/CIDRs). A missing
+    or empty ``remote_addresses`` key yields an empty list.
+    """
+    if HAS_YAML:
+        docs = yaml.safe_load(text)
+        # The file may be a single rule (mapping) or a list of rules.
+        rules = docs if isinstance(docs, list) else [docs]
+        addrs = []
+        for rule in rules:
+            if not isinstance(rule, dict):
+                continue
+            value = rule.get("remote_addresses")
+            # A scalar would otherwise be split into single characters by
+            # extend(); treat it as a one-element list instead.
+            if isinstance(value, str):
+                value = [value]
+            # `remote_addresses:` with no value parses as None: skip it
+            # rather than crashing on extend(None).
+            if isinstance(value, (list, tuple)):
+                addrs.extend(value)
+        # Drop non-string items and entries that are empty after stripping.
+        return [a.strip() for a in addrs if isinstance(a, str) and a.strip()]
+
+    # Fallback: manual parsing
+    addrs = []
+    in_block = False
+    for line in text.splitlines():
+        stripped = line.strip()
+        if stripped.startswith("remote_addresses:"):
+            in_block = True
+            # Inline list on same line: remote_addresses: [...]
+            rest = stripped.split(":", 1)[1].strip()
+            if rest.startswith("["):
+                for token in rest.strip("[]").split(","):
+                    token = token.strip().strip('"').strip("'")
+                    if token:
+                        addrs.append(token)
+                # A closing bracket on the same line ends the list here;
+                # otherwise the list continues on the following lines.
+                if rest.endswith("]"):
+                    in_block = False
+            continue
+        if in_block:
+            if stripped.startswith("- "):
+                # Block-style list item.
+                val = stripped[2:].strip().strip('"').strip("'")
+                if val:
+                    addrs.append(val)
+            elif stripped.startswith(('"', "'")):
+                # Quoted item of a multi-line bracketed list, e.g. "1.2.3.0/24",
+                val = stripped.strip('",').strip("',").strip()
+                if val:
+                    addrs.append(val)
+            elif stripped == "]":
+                in_block = False
+            elif stripped and not stripped.startswith("#"):
+                if stripped.startswith("["):
+                    continue
+                # End of block when hitting a new YAML key. The "/" test keeps
+                # IPv6 CIDRs (which contain ":") from being taken for a key.
+                if ":" in stripped and "/" not in stripped:
+                    in_block = False
+    return addrs
+
+
+def normalize_cidr(addr):
+    """Return the canonical network string for an IP address or CIDR.
+
+    Surrounding whitespace is removed and host bits are masked off, so a bare
+    address becomes a single-host network. Input that does not parse as a
+    network is returned stripped but otherwise unchanged.
+    """
+    candidate = addr.strip()
+    try:
+        return str(ipaddress.ip_network(candidate, strict=False))
+    except ValueError:
+        return candidate
+
+
+def sort_key(entry):
+    """Build the ordering tuple for one ``(cidr, bot_name)`` pair.
+
+    Entries are ordered by bot name, then IP version, then the integer value
+    of the network address.
+    """
+    network_text, label = entry
+    try:
+        parsed = ipaddress.ip_network(network_text, strict=False)
+    except ValueError:
+        # Unparseable entries get version 99 so they sort after IPv4 (4) and
+        # IPv6 (6) entries of the same bot.
+        return (label, 99, 0)
+    return (label, parsed.version, int(parsed.network_address))
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """Generate the bot IP CSV from Anubis data, curated scanners, and Tor.
+
+    Parses ``--output`` and ``--no-download``, collects networks from the
+    three sources in order (the first source to claim a CIDR keeps it),
+    writes them sorted as ``ip_or_cidr,bot_name`` lines without a header,
+    and prints a per-category breakdown.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
+    )
+    parser.add_argument("--output", default="bot_ip.csv",
+                        help="Output CSV path (default: bot_ip.csv)")
+    parser.add_argument("--no-download", action="store_true",
+                        help="Skip all network downloads; use only built-in data")
+    args = parser.parse_args()
+
+    # {cidr: bot_name} for dedup
+    entries = {}
+    # {category: count} for stats
+    category_counts = {}
+
+    def add_entry(cidr, bot_name, category):
+        """Record a normalized CIDR unless an earlier source already has it."""
+        cidr = normalize_cidr(cidr)
+        if cidr not in entries:
+            entries[cidr] = bot_name
+            category_counts[category] = category_counts.get(category, 0) + 1
+
+    # ------------------------------------------------------------------
+    # 1. Anubis crawler YAML files
+    # ------------------------------------------------------------------
+    if not args.no_download:
+        print("[bot_ip] Downloading Anubis crawler data ...")
+        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
+            url = ANUBIS_BASE + yaml_path
+            print(f"  Fetching {yaml_path} ...", end=" ")
+            text = fetch_url(url)
+            if text is None:
+                print("FAILED")
+                continue
+            addrs = parse_yaml_remote_addresses(text)
+            print(f"{len(addrs)} addresses")
+            for addr in addrs:
+                add_entry(addr, bot_name, category)
+
+    # ------------------------------------------------------------------
+    # 2. Curated scanner networks
+    # ------------------------------------------------------------------
+    print("[bot_ip] Adding curated scanner networks ...")
+    for bot_name, networks in SCANNER_RANGES.items():
+        for net in networks:
+            add_entry(net, bot_name, "scanner")
+
+    # ------------------------------------------------------------------
+    # 3. Tor exit nodes
+    # ------------------------------------------------------------------
+    tor_ips = set()
+    if not args.no_download:
+        print("[bot_ip] Downloading Tor exit node list ...")
+        text = fetch_url(TOR_URL, timeout=30)
+        if text:
+            for line in text.splitlines():
                 line = line.strip()
                 if not line or line.startswith("#"):
                     continue
                 try:
                     ipaddress.ip_address(line)
-                    ips.add(line)
+                    tor_ips.add(line)
                 except ValueError:
                     pass
-    except FileNotFoundError:
-        pass
-    return ips
+            print(f"  Downloaded {len(tor_ips)} Tor exit nodes")

-
-def main():
-    parser = argparse.ArgumentParser(description="Generate bot_ip.csv")
-    parser.add_argument("--output", default="bot_ip.csv")
-    parser.add_argument("--tor-file", help="Path to downloaded Tor exit node list")
-    args = parser.parse_args()
-
-    entries = []
-    seen = set()
-
-    # Add known scanner networks
-    for bot_name, networks in KNOWN_SCANNERS.items():
-        for net in networks:
-            key = net
-            if key not in seen:
-                seen.add(key)
-                entries.append((net, bot_name))
-
-    # Add Tor exit nodes
-    if args.tor_file:
-        tor_ips = load_tor_ips(args.tor_file)
-    else:
+    if not tor_ips:
+        print("  Using fallback Tor exit node list")
         tor_ips = set(FALLBACK_TOR_IPS)

-    for ip in sorted(tor_ips, key=lambda x: ipaddress.ip_address(x)):
-        key = f"{ip}/32"
-        if key not in seen:
-            seen.add(key)
-            entries.append((key, "Tor_Exit_Node"))
+    # Pass the bare address: normalize_cidr() turns it into a single-host
+    # network (/32 for IPv4, /128 for IPv6). The list is validated with
+    # ip_address(), which also accepts IPv6, and a hard-coded "/32" would
+    # collapse an IPv6 exit node into an enormous /32 prefix.
+    for ip in tor_ips:
+        add_entry(ip, "tor_exit_node", "anonymizer")

+    # ------------------------------------------------------------------
+    # 4. Sort, deduplicate, and write
+    # ------------------------------------------------------------------
+    sorted_entries = sorted(entries.items(), key=sort_key)
+
+    # abspath() guarantees a non-empty directory even for a bare filename.
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
     with open(args.output, "w") as f:
-        for net, name in entries:
-            f.write(f"{net},{name}\n")
+        for cidr, bot_name in sorted_entries:
+            f.write(f"{cidr},{bot_name}\n")

-    print(f"[bot_ip] Generated {len(entries)} entries "
-          f"({len(tor_ips)} Tor nodes, "
-          f"{len(entries) - len(tor_ips)} scanner ranges)")
+    # ------------------------------------------------------------------
+    # 5. Print stats
+    # ------------------------------------------------------------------
+    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
+    print("[bot_ip] Breakdown by category:")
+    for category in sorted(category_counts):
+        print(f"  {category:20s} {category_counts[category]:>6d}")


 if __name__ == "__main__":