- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...) - Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuck, OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries) - Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs) - Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views - Add /api/browsers dashboard endpoint - Fix CSV quoting for fields containing commas (User-Agent strings) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
354 lines
14 KiB
Python
354 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data, scanner
|
|
networks, and Tor exit nodes.
|
|
|
|
Sources:
|
|
• Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
|
|
- Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
|
|
- AI crawlers: GPTBot, PerplexityBot
|
|
- Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
|
|
• Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
|
|
SecurityTrails, ZoomEye, GreyNoise, Shadowserver
|
|
• Tor exit nodes: https://check.torproject.org/torbulkexitlist
|
|
|
|
Anubis UA-only bot references (no IP ranges, documented here for reference):
|
|
• ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
|
|
Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
|
|
Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
|
|
Webzio-Extended, cohere-ai
|
|
• ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
|
|
Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
|
|
FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
|
|
omgili, PerplexityBot, Timpibot, YouBot
|
|
|
|
Output format (no header):
|
|
ip_or_cidr,bot_name
|
|
|
|
Usage:
|
|
python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
|
|
"""
|
|
|
|
import argparse
import csv
import http.client
import ipaddress
import os
import sys
import urllib.error
import urllib.request

try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Anubis data is fetched at a fixed commit so the generated CSV is
# reproducible from one run to the next.
ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"
ANUBIS_BASE = (
    "https://raw.githubusercontent.com/TecharoHQ/anubis/"
    + ANUBIS_COMMIT
    + "/data/"
)

# Each row: (YAML path relative to ANUBIS_BASE, bot name written to the CSV,
# category used only for the end-of-run statistics).
ANUBIS_CRAWLERS = [
    # Search engines
    ("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
    ("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
    ("crawlers/applebot.yaml", "Applebot", "search_engine"),
    ("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
    # AI crawlers
    ("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
    ("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
    # Cloud-hosted scrapers
    ("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
    ("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
]

# Plain-text list of Tor exit addresses, one per line.
TOR_URL = "https://check.torproject.org/torbulkexitlist"
|
|
|
|
# Curated scanner networks (category: scanner)
|
|
# Maps scanner name -> list of IPv4 CIDR strings. Every entry is emitted
# under the "scanner" category by main().
SCANNER_RANGES = {
    # Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
    "Shodan": [
        "66.240.192.0/24",
        "66.240.205.0/24",
        "66.240.236.0/24",
        "71.6.135.0/24",
        "71.6.146.0/24",
        "71.6.158.0/24",
        "71.6.165.0/24",
        "80.82.77.0/24",
        "80.82.78.0/24",
        "82.221.105.0/24",
        "82.221.106.0/24",
        "85.25.43.0/24",
        "85.25.103.0/24",
        "93.120.27.0/24",
        "94.102.49.0/24",
        "188.138.9.0/24",
        "198.20.69.0/24",
        "198.20.70.0/24",
        "198.20.87.0/24",
        "198.20.99.0/24",
        "209.126.110.0/24",
    ],
    # Censys — https://support.censys.io/hc/en-us/articles/360043177092
    "Censys": [
        "162.142.125.0/24",
        "167.248.133.0/24",
        "167.94.138.0/24",
        "167.94.145.0/24",
        "167.94.146.0/24",
        "192.35.168.0/23",
    ],
    # BinaryEdge — https://docs.binaryedge.io/
    "BinaryEdge": [
        "154.89.5.0/24",
        "45.143.200.0/22",
    ],
    # Stretchoid — persistent scanner botnet
    "Stretchoid": [
        "198.235.24.0/24",
        "205.210.31.0/24",
    ],
    # SecurityTrails (Recorded Future) crawlers
    # NOTE(review): a /16 is a very broad match — confirm the whole range
    # belongs to this scanner before relying on it for classification.
    "SecurityTrails": [
        "52.250.0.0/16",
    ],
    # ZoomEye (Knownsec)
    # NOTE(review): a /16 is a very broad match — confirm the whole range
    # belongs to this scanner before relying on it for classification.
    "ZoomEye": [
        "106.75.0.0/16",
    ],
    # GreyNoise known mass-scanners (individual IPs)
    "GreyNoise": [
        "45.155.205.233/32",
        "45.155.205.220/32",
        "45.155.205.205/32",
        "45.155.205.190/32",
        "45.155.205.175/32",
        "45.155.205.160/32",
        "45.155.205.146/32",
        "45.155.205.131/32",
        "193.32.162.10/32",
        "193.32.162.11/32",
        "193.32.162.25/32",
        "193.32.162.30/32",
        "193.32.162.40/32",
    ],
    # Netlab/Shadowserver known sinkholes
    "Shadowserver": [
        "74.82.47.0/24",
        "184.105.139.0/24",
        "184.105.247.0/24",
    ],
}
|
|
|
|
# Fallback Tor exit nodes when download is unavailable
|
|
# Built-in Tor exit addresses used when the live list cannot be downloaded.
# Contiguous runs are written as host ranges; the resulting list holds the
# same 141 addresses in the same order as the fully spelled-out form.
FALLBACK_TOR_IPS = [
    *[f"185.220.101.{host}" for host in range(34, 98)],    # .34 - .97
    *[f"185.220.100.{host}" for host in range(240, 256)],  # .240 - .255
    "178.20.55.16",
    "178.20.55.18",
    "178.20.55.182",
    *[f"23.129.64.{host}" for host in range(130, 154)],    # .130 - .153
    *[f"104.244.76.{host}" for host in range(13, 21)],     # .13 - .20
    *[f"199.249.230.{host}" for host in range(64, 90)],    # .64 - .89
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_url(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as ``str`` (undecodable bytes are replaced), or
        ``None`` if the request could not be built, sent, or read. Failures
        are reported on stderr and never raised.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (
        urllib.error.URLError,
        # Protocol-level failures such as IncompleteRead are not OSError
        # subclasses; without this a truncated response aborts the script.
        http.client.HTTPException,
        OSError,
        ValueError,  # malformed URL (raised by Request)
    ) as exc:
        print(f" [WARN] Failed to download {url}: {exc}", file=sys.stderr)
        return None
|
|
|
|
|
|
def _clean_yaml_scalar(token):
    """Strip whitespace, a trailing comma, and surrounding quotes from one item."""
    return token.strip().rstrip(",").strip().strip('"').strip("'").strip()


def _collect_remote_addresses(docs):
    """Gather remote_addresses strings from a parsed YAML mapping or list of mappings."""
    addrs = []
    for doc in docs if isinstance(docs, list) else [docs]:
        if not isinstance(doc, dict):
            continue
        value = doc.get("remote_addresses")
        if isinstance(value, str):
            # A lone scalar is one address, not a sequence of characters.
            value = [value]
        if isinstance(value, (list, tuple)):
            addrs.extend(a.strip() for a in value if isinstance(a, str))
    return addrs


def _parse_remote_addresses_text(text):
    """Extract remote_addresses items with a line-based scan (no PyYAML needed)."""
    addrs = []
    mode = None      # None, "block" (dash list) or "flow" (multi-line [...] list)
    key_indent = 0   # column of the most recent "remote_addresses:" key

    def take_flow(fragment):
        """Collect items from part of a [...] list; return True once it closes."""
        body, closer, _ = fragment.partition("]")
        for token in body.split(","):
            val = _clean_yaml_scalar(token)
            if val:
                addrs.append(val)
        return bool(closer)

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        # '#' starts a YAML comment only when preceded by whitespace.
        stripped = stripped.split(" #", 1)[0].rstrip()
        indent = len(line) - len(line.lstrip())

        if mode == "flow":
            if take_flow(stripped):
                mode = None
            continue

        if stripped.startswith("remote_addresses:"):
            key_indent = indent
            rest = stripped.split(":", 1)[1].strip()
            if rest.startswith("["):
                mode = None if take_flow(rest[1:]) else "flow"
            else:
                mode = "block"
            continue

        if mode != "block":
            continue

        if indent < key_indent:
            # Dedent: we left the mapping that owns the list (e.g. the next
            # "- name: ..." rule), so this line is not an address.
            mode = None
        elif stripped.startswith("["):
            # Flow list that begins on the line after the key.
            mode = None if take_flow(stripped[1:]) else "flow"
        elif stripped.startswith("- "):
            val = _clean_yaml_scalar(stripped[2:])
            if val:
                addrs.append(val)
        elif stripped != "-":
            # Any other content at this level is the next key.
            mode = None
    return addrs


def parse_yaml_remote_addresses(text):
    """Extract the ``remote_addresses`` entries from an Anubis crawler YAML file.

    PyYAML is used when it was importable at module load (``HAS_YAML``). If it
    is unavailable, or it rejects the document, a line-based parser handles
    the layouts these files use: inline ``[...]`` lists, multi-line ``[...]``
    lists, and ``- item`` block lists.

    Args:
        text: YAML document as a string.

    Returns:
        List of address strings in document order. Values are not validated
        as IP networks here.
    """
    if HAS_YAML:
        try:
            docs = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            # Best effort: a malformed file should not abort the whole run.
            print(f" [WARN] YAML parse error, using text fallback: {exc}", file=sys.stderr)
        else:
            return _collect_remote_addresses(docs)
    return _parse_remote_addresses_text(text)
|
|
|
|
|
|
def normalize_cidr(addr):
    """Return *addr* in canonical network notation.

    A bare address gains its full-length prefix and host bits are cleared.
    Text that cannot be parsed as an IP network is returned as-is, apart
    from surrounding whitespace being trimmed.
    """
    candidate = addr.strip()
    try:
        # strict=False accepts inputs whose host bits are set.
        return str(ipaddress.ip_network(candidate, strict=False))
    except ValueError:
        return candidate
|
|
|
|
|
|
def sort_key(entry):
    """Build the ordering key for one ``(cidr, bot_name)`` pair.

    The key is ``(bot_name, ip_version, network_address_as_int)``. A pair
    whose CIDR cannot be parsed gets version 99 and address 0, which places
    it after the valid networks of the same bot.
    """
    network_text, owner = entry
    try:
        parsed = ipaddress.ip_network(network_text, strict=False)
    except ValueError:
        return (owner, 99, 0)
    return (owner, parsed.version, int(parsed.network_address))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Build the bot IP CSV from Anubis data, curated scanners, and Tor exits.

    Command-line options:
        --output PATH   Destination CSV (default: bot_ip.csv).
        --no-download   Skip all network fetches and use only built-in data
                        (curated scanner ranges and the fallback Tor list).

    Each output row is ``cidr,bot_name`` with no header. When the same CIDR
    is offered by several sources, the first one added wins. Strings that do
    not parse as an IP network are skipped with a warning on stderr.
    """
    parser = argparse.ArgumentParser(
        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
    )
    parser.add_argument("--output", default="bot_ip.csv",
                        help="Output CSV path (default: bot_ip.csv)")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip all network downloads; use only built-in data")
    args = parser.parse_args()

    # {cidr: bot_name} for dedup
    entries = {}
    # {category: count} for stats
    category_counts = {}

    def add_entry(cidr, bot_name, category):
        """Record one network, keeping the first bot name seen for a CIDR."""
        cidr = normalize_cidr(cidr)
        try:
            ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            # normalize_cidr passes unparseable text through unchanged; keep
            # it out of the CSV instead of emitting a garbage row.
            print(f" [WARN] Skipping invalid address {cidr!r} for {bot_name}",
                  file=sys.stderr)
            return
        if cidr not in entries:
            entries[cidr] = bot_name
            category_counts[category] = category_counts.get(category, 0) + 1

    # ------------------------------------------------------------------
    # 1. Anubis crawler YAML files
    # ------------------------------------------------------------------
    if not args.no_download:
        print("[bot_ip] Downloading Anubis crawler data ...")
        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
            url = ANUBIS_BASE + yaml_path
            print(f" Fetching {yaml_path} ...", end=" ")
            text = fetch_url(url)
            if text is None:
                print("FAILED")
                continue
            addrs = parse_yaml_remote_addresses(text)
            print(f"{len(addrs)} addresses")
            for addr in addrs:
                add_entry(addr, bot_name, category)

    # ------------------------------------------------------------------
    # 2. Curated scanner networks
    # ------------------------------------------------------------------
    print("[bot_ip] Adding curated scanner networks ...")
    for bot_name, networks in SCANNER_RANGES.items():
        for net in networks:
            add_entry(net, bot_name, "scanner")

    # ------------------------------------------------------------------
    # 3. Tor exit nodes
    # ------------------------------------------------------------------
    tor_ips = set()
    if not args.no_download:
        print("[bot_ip] Downloading Tor exit node list ...")
        text = fetch_url(TOR_URL, timeout=30)
        if text:
            for line in text.splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    tor_ips.add(str(ipaddress.ip_address(line)))
                except ValueError:
                    # Not an IP address (e.g. an HTML error page): ignore.
                    continue
            print(f" Downloaded {len(tor_ips)} Tor exit nodes")

    if not tor_ips:
        print(" Using fallback Tor exit node list")
        tor_ips = set(FALLBACK_TOR_IPS)

    for ip in tor_ips:
        # Pass the bare address: normalize_cidr appends /32 for IPv4 and /128
        # for IPv6. A hard-coded "/32" would turn an IPv6 exit address into a
        # /32 network covering 2**96 addresses.
        add_entry(ip, "tor_exit_node", "anonymizer")

    # ------------------------------------------------------------------
    # 4. Sort, deduplicate, and write
    # ------------------------------------------------------------------
    sorted_entries = sorted(entries.items(), key=sort_key)

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    # newline="" plus an explicit "\n" terminator gives identical bytes on
    # every platform; csv.writer quotes any field that contains a comma.
    with open(args.output, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(sorted_entries)

    # ------------------------------------------------------------------
    # 5. Print stats
    # ------------------------------------------------------------------
    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
    print("[bot_ip] Breakdown by category:")
    for category in sorted(category_counts):
        print(f" {category:20s} {category_counts[category]:>6d}")


if __name__ == "__main__":
    main()
|