feat: browser JA4 detection, Anubis bot rules, worldwide ASN data
- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...)
- Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuck, OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries)
- Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs)
- Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views
- Add /api/browsers dashboard endpoint
- Fix CSV quoting for fields containing commas (User-Agent strings)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -1,25 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
generate_bot_ip.py — Generate bot_ip.csv from known scanner networks + Tor exit nodes.
|
||||
generate_bot_ip.py — Generate bot_ip.csv from Anubis crawler data, scanner
|
||||
networks, and Tor exit nodes.
|
||||
|
||||
Sources:
|
||||
• Tor exit nodes: downloaded list or hardcoded fallback
|
||||
• Shodan: known scanner ranges (census.shodan.io, 2024)
|
||||
• Censys: known scanner ranges (censys.io, 2024)
|
||||
• Binaryedge, SecurityTrails, ZoomEye, Stretchoid: known ranges
|
||||
• GreyNoise: top mass-scanner IPs (manually curated)
|
||||
• Anubis crawler YAML files (TecharoHQ/anubis on GitHub)
|
||||
- Search engines: Googlebot, Bingbot, Applebot, DuckDuckBot
|
||||
- AI crawlers: GPTBot, PerplexityBot
|
||||
- Cloud scrapers: Alibaba Cloud, Huawei Cloud, Tencent Cloud
|
||||
• Curated scanner networks: Shodan, Censys, BinaryEdge, Stretchoid,
|
||||
SecurityTrails, ZoomEye, GreyNoise, Shadowserver
|
||||
• Tor exit nodes: https://check.torproject.org/torbulkexitlist
|
||||
|
||||
Anubis UA-only bot references (no IP ranges, documented here for reference):
|
||||
• ai-catchall.yaml patterns: bytespider, CCBot, ChatGLM-Spider, ClaudeBot,
|
||||
Diffbot, FacebookBot, Google-Extended, ImagesiftBot, Kangaroo Bot,
|
||||
Meta-ExternalAgent, PetalBot, Scrapy, Timpibot, VelenPublicWebCrawler,
|
||||
Webzio-Extended, cohere-ai
|
||||
• ai-robots-txt.yaml patterns: Amazonbot, anthropic-ai, Applebot-Extended,
|
||||
Bytespider, CCBot, ChatGPT-User, Claude-Web, cohere-ai, Diffbot,
|
||||
FacebookBot, Google-Extended, GPTBot, Meta-ExternalAgent, OAI-SearchBot,
|
||||
omgili, PerplexityBot, Timpibot, YouBot
|
||||
|
||||
Output format (no header):
|
||||
<ip_or_cidr>,<bot_name>
|
||||
ip_or_cidr,bot_name
|
||||
|
||||
Usage:
|
||||
python3 generate_bot_ip.py --output bot_ip.csv [--no-download]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import ipaddress
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
# --- Known scanner networks (public, well-documented) ---
|
||||
KNOWN_SCANNERS = {
|
||||
try:
|
||||
import yaml
|
||||
HAS_YAML = True
|
||||
except ImportError:
|
||||
HAS_YAML = False
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ANUBIS_COMMIT = "dbd64e0f4f23c37476b40b332781c676ffa413ef"
|
||||
ANUBIS_BASE = (
|
||||
f"https://raw.githubusercontent.com/TecharoHQ/anubis/{ANUBIS_COMMIT}/data/"
|
||||
)
|
||||
|
||||
# (yaml_path_relative_to_data, bot_name, category)
|
||||
ANUBIS_CRAWLERS = [
|
||||
("crawlers/googlebot.yaml", "Googlebot", "search_engine"),
|
||||
("crawlers/bingbot.yaml", "Bingbot", "search_engine"),
|
||||
("crawlers/applebot.yaml", "Applebot", "search_engine"),
|
||||
("crawlers/duckduckbot.yaml", "DuckDuckBot", "search_engine"),
|
||||
("crawlers/openai-gptbot.yaml", "GPTBot", "ai_crawler"),
|
||||
("crawlers/perplexitybot.yaml", "PerplexityBot", "ai_crawler"),
|
||||
("crawlers/alibaba-cloud.yaml", "Alibaba_Cloud_Scraper", "cloud_scraper"),
|
||||
("crawlers/huawei-cloud.yaml", "Huawei_Cloud_Scraper", "cloud_scraper"),
|
||||
("crawlers/tencent-cloud.yaml", "Tencent_Cloud_Scraper", "cloud_scraper"),
|
||||
]
|
||||
|
||||
TOR_URL = "https://check.torproject.org/torbulkexitlist"
|
||||
|
||||
# Curated scanner networks (category: scanner)
|
||||
SCANNER_RANGES = {
|
||||
# Shodan — https://wiki.ipfire.org/dns/public-servers (census.shodan.io)
|
||||
"Shodan_Scanner": [
|
||||
"Shodan": [
|
||||
"66.240.192.0/24", "66.240.205.0/24", "66.240.236.0/24",
|
||||
"71.6.135.0/24", "71.6.146.0/24", "71.6.158.0/24", "71.6.165.0/24",
|
||||
"80.82.77.0/24", "80.82.78.0/24",
|
||||
@ -32,45 +82,45 @@ KNOWN_SCANNERS = {
|
||||
"209.126.110.0/24",
|
||||
],
|
||||
# Censys — https://support.censys.io/hc/en-us/articles/360043177092
|
||||
"Censys_Scanner": [
|
||||
"Censys": [
|
||||
"162.142.125.0/24", "167.248.133.0/24", "167.94.138.0/24",
|
||||
"167.94.145.0/24", "167.94.146.0/24",
|
||||
"192.35.168.0/23",
|
||||
],
|
||||
# BinaryEdge — https://docs.binaryedge.io/
|
||||
"BinaryEdge_Scanner": [
|
||||
"BinaryEdge": [
|
||||
"154.89.5.0/24",
|
||||
"45.143.200.0/22",
|
||||
],
|
||||
# Stretchoid — persistent scanner botnet
|
||||
"Stretchoid_Scanner": [
|
||||
"Stretchoid": [
|
||||
"198.235.24.0/24",
|
||||
"205.210.31.0/24",
|
||||
],
|
||||
# SecurityTrails (Recorded Future) crawlers
|
||||
"SecurityTrails_Crawler": [
|
||||
"SecurityTrails": [
|
||||
"52.250.0.0/16",
|
||||
],
|
||||
# ZoomEye (Knownsec)
|
||||
"ZoomEye_Scanner": [
|
||||
"ZoomEye": [
|
||||
"106.75.0.0/16",
|
||||
],
|
||||
# GreyNoise known mass-scanners (individual IPs)
|
||||
"GreyNoise_MassScanner": [
|
||||
"GreyNoise": [
|
||||
"45.155.205.233/32", "45.155.205.220/32", "45.155.205.205/32",
|
||||
"45.155.205.190/32", "45.155.205.175/32", "45.155.205.160/32",
|
||||
"45.155.205.146/32", "45.155.205.131/32",
|
||||
"193.32.162.10/32", "193.32.162.11/32", "193.32.162.25/32",
|
||||
"193.32.162.30/32", "193.32.162.40/32",
|
||||
],
|
||||
# Netlab/Shadowserver known sinkholes used by malware
|
||||
"Shadowserver_Sinkhole": [
|
||||
# Netlab/Shadowserver known sinkholes
|
||||
"Shadowserver": [
|
||||
"74.82.47.0/24",
|
||||
"184.105.139.0/24", "184.105.247.0/24",
|
||||
],
|
||||
}
|
||||
|
||||
# Fallback Tor exit nodes when download unavailable
|
||||
# Fallback Tor exit nodes when download is unavailable
|
||||
FALLBACK_TOR_IPS = [
|
||||
"185.220.101.34", "185.220.101.35", "185.220.101.36", "185.220.101.37",
|
||||
"185.220.101.38", "185.220.101.39", "185.220.101.40", "185.220.101.41",
|
||||
@ -111,61 +161,192 @@ FALLBACK_TOR_IPS = [
|
||||
]
|
||||
|
||||
|
||||
def load_tor_ips(tor_file):
|
||||
"""Load Tor exit node IPs from downloaded file."""
|
||||
ips = set()
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fetch_url(url, timeout=30):
    """Download *url* and return its body decoded as text.

    Args:
        url: URL to fetch; any scheme supported by ``urllib`` is accepted.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or ``None`` when the request could not be completed. Failures are
        reported on stderr instead of being raised, so callers can treat a
        download as best-effort.
    """
    # Local import: the file's import block does not bring in http.client,
    # and urllib.request has already loaded it, so this costs nothing.
    import http.client

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "ja4-bot-ip-gen/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (
        urllib.error.URLError,
        # e.g. IncompleteRead raised by resp.read(); not an OSError subclass.
        http.client.HTTPException,
        OSError,
        # Raised by Request() for a malformed URL / unknown URL type.
        ValueError,
    ) as exc:
        print(f" [WARN] Failed to download {url}: {exc}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def _strip_yaml_comment(fragment):
    """Return *fragment* without surrounding whitespace or a trailing ``# comment``."""
    fragment = fragment.strip()
    if fragment.startswith("#"):
        return ""
    return fragment.split(" #", 1)[0].rstrip()


def _clean_yaml_scalar(token):
    """Strip an inline comment, whitespace, and surrounding quotes from a scalar."""
    return _strip_yaml_comment(token).strip('"').strip("'")


def _consume_flow_items(fragment, addrs):
    """Append the items found on one line of a ``[...]`` list to *addrs*.

    Returns True when the line contains the closing ``]`` of the list.
    """
    body, closing, _ = _strip_yaml_comment(fragment).partition("]")
    for token in body.split(","):
        token = _clean_yaml_scalar(token)
        if token:
            addrs.append(token)
    return bool(closing)


def _parse_remote_addresses_text(text):
    """Collect ``remote_addresses`` items from YAML text without a YAML parser.

    Handles the two list layouts a flat list of strings can take: a flow
    list (``[a, b]``, possibly spread over several lines) and a block list
    (one ``- item`` per line).
    """
    addrs = []
    mode = None  # None: outside a list, "flow": inside [...], "block": "- item" lines
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        # The key may also be the first key of a list item: "- remote_addresses:".
        key_line = stripped[2:].lstrip() if stripped.startswith("- ") else stripped
        if key_line.startswith("remote_addresses:"):
            rest = key_line.split(":", 1)[1].strip()
            if rest.startswith("["):
                mode = None if _consume_flow_items(rest[1:], addrs) else "flow"
            else:
                mode = "block"
            continue

        if mode == "flow":
            if _consume_flow_items(stripped, addrs):
                mode = None
        elif mode == "block":
            if stripped.startswith("["):
                # Flow list opened on the line after the key.
                mode = None if _consume_flow_items(stripped[1:], addrs) else "flow"
            elif stripped.startswith("- ") or stripped == "-":
                val = _clean_yaml_scalar(stripped[1:])
                # "key: value" cannot be an address (IPv6 never contains
                # colon-space); it is the start of the next rule mapping.
                if ": " in val or val.endswith(":"):
                    mode = None
                elif val:
                    addrs.append(val)
            else:
                # Any other content (a sibling key) ends the list.
                mode = None
    return addrs


def parse_yaml_remote_addresses(text):
    """Extract the ``remote_addresses`` entries from an Anubis crawler YAML file.

    Uses PyYAML when the module-level ``HAS_YAML`` flag is set; otherwise,
    or when PyYAML rejects the document, falls back to a line-based parser
    that relies on ``remote_addresses`` being a flat list of CIDR strings.

    Args:
        text: Contents of the YAML file.

    Returns:
        A list of address strings in document order. Entries are not
        validated as IP networks here.
    """
    if HAS_YAML:
        try:
            docs = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            print(f" [WARN] Invalid YAML, using text fallback: {exc}", file=sys.stderr)
        else:
            # A file holds either a list of rule mappings or a single mapping.
            rules = docs if isinstance(docs, list) else [docs]
            addrs = []
            for rule in rules:
                if not isinstance(rule, dict):
                    continue
                values = rule.get("remote_addresses")
                if isinstance(values, str):
                    # A lone scalar would otherwise be extended char by char.
                    values = [values]
                # An empty "remote_addresses:" key loads as None; skip it.
                if isinstance(values, (list, tuple)):
                    addrs.extend(values)
            return [a.strip() for a in addrs if isinstance(a, str)]

    return _parse_remote_addresses_text(text)
|
||||
|
||||
|
||||
def normalize_cidr(addr):
    """Return *addr* in canonical CIDR notation.

    Surrounding whitespace is removed, a bare address becomes a host
    network (``/32`` or ``/128``), and host bits are cleared
    (``strict=False``), so ``10.1.2.3/8`` becomes ``10.0.0.0/8``.

    Args:
        addr: An IPv4/IPv6 address or network as a string.

    Returns:
        The canonical network string, or the stripped input unchanged when
        it cannot be parsed as an IP network.
    """
    addr = addr.strip()
    try:
        return str(ipaddress.ip_network(addr, strict=False))
    except ValueError:
        # Unparseable input is passed through untouched, not rejected.
        return addr
|
||||
|
||||
|
||||
def sort_key(entry):
    """Build the sort key for a ``(cidr, bot_name)`` entry.

    Entries are ordered by bot name, then IP version, then the numeric
    value of the network address.

    Args:
        entry: A ``(cidr, bot_name)`` pair.

    Returns:
        A ``(bot_name, ip_version, network_address_as_int)`` tuple. A
        ``cidr`` that cannot be parsed gets version ``99`` and address
        ``0``, so it sorts after every valid network of the same bot.
    """
    cidr, bot_name = entry
    try:
        net = ipaddress.ip_network(cidr, strict=False)
    except ValueError:
        return (bot_name, 99, 0)
    return (bot_name, net.version, int(net.network_address))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Build the bot IP CSV from Anubis crawler data, scanner ranges, and Tor.

    Steps:
      1. Unless ``--no-download`` is given, fetch each Anubis crawler YAML
         file and collect its ``remote_addresses``.
      2. Add the curated ``SCANNER_RANGES``.
      3. Add Tor exit nodes, downloaded from ``TOR_URL`` or taken from
         ``FALLBACK_TOR_IPS`` when nothing was downloaded.
      4. Sort the entries and write ``<cidr>,<bot_name>`` lines (no header)
         to ``--output``.
      5. Print a per-category summary.

    The first source to claim a CIDR wins; later duplicates are ignored.
    """
    parser = argparse.ArgumentParser(
        description="Generate bot_ip.csv from Anubis crawler data, scanners, and Tor"
    )
    parser.add_argument("--output", default="bot_ip.csv",
                        help="Output CSV path (default: bot_ip.csv)")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip all network downloads; use only built-in data")
    args = parser.parse_args()

    # {cidr: bot_name} for dedup
    entries = {}
    # {category: count} for stats
    category_counts = {}

    def add_entry(cidr, bot_name, category):
        """Record *cidr* for *bot_name* unless that CIDR is already claimed."""
        cidr = normalize_cidr(cidr)
        if cidr not in entries:
            entries[cidr] = bot_name
            category_counts[category] = category_counts.get(category, 0) + 1

    # ------------------------------------------------------------------
    # 1. Anubis crawler YAML files
    # ------------------------------------------------------------------
    if not args.no_download:
        print("[bot_ip] Downloading Anubis crawler data ...")
        for yaml_path, bot_name, category in ANUBIS_CRAWLERS:
            url = ANUBIS_BASE + yaml_path
            print(f" Fetching {yaml_path} ...", end=" ")
            text = fetch_url(url)
            if text is None:
                # Best-effort: a failed file is skipped, not fatal.
                print("FAILED")
                continue
            addrs = parse_yaml_remote_addresses(text)
            print(f"{len(addrs)} addresses")
            for addr in addrs:
                add_entry(addr, bot_name, category)

    # ------------------------------------------------------------------
    # 2. Curated scanner networks
    # ------------------------------------------------------------------
    print("[bot_ip] Adding curated scanner networks ...")
    for bot_name, networks in SCANNER_RANGES.items():
        for net in networks:
            add_entry(net, bot_name, "scanner")

    # ------------------------------------------------------------------
    # 3. Tor exit nodes
    # ------------------------------------------------------------------
    tor_ips = set()
    if not args.no_download:
        print("[bot_ip] Downloading Tor exit node list ...")
        text = fetch_url(TOR_URL, timeout=30)
        if text:
            for line in text.splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    ipaddress.ip_address(line)
                except ValueError:
                    # Not an IP address; ignore the line.
                    continue
                tor_ips.add(line)
            print(f" Downloaded {len(tor_ips)} Tor exit nodes")

    if not tor_ips:
        print(" Using fallback Tor exit node list")
        tor_ips = set(FALLBACK_TOR_IPS)

    for ip in tor_ips:
        # Use a host-length prefix per address family: "/32" on an IPv6
        # address would be normalised into a whole IPv6 /32 prefix.
        prefix_len = 128 if ":" in ip else 32
        add_entry(f"{ip}/{prefix_len}", "tor_exit_node", "anonymizer")

    # ------------------------------------------------------------------
    # 4. Sort, deduplicate, and write
    # ------------------------------------------------------------------
    sorted_entries = sorted(entries.items(), key=sort_key)

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{cidr},{bot_name}\n" for cidr, bot_name in sorted_entries)

    # ------------------------------------------------------------------
    # 5. Print stats
    # ------------------------------------------------------------------
    print(f"\n[bot_ip] Generated {len(sorted_entries)} entries → {args.output}")
    print("[bot_ip] Breakdown by category:")
    for category in sorted(category_counts):
        print(f" {category:20s} {category_counts[category]:>6d}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user