Files
ja4-platform/scripts/generate_browser_ja4.py
toto 7d09c614c3 feat: browser JA4 detection, Anubis bot rules, worldwide ASN data
- Add generate_browser_ja4.py: 1,186 browser JA4 fingerprints from FoxIO + ja4db.com
  covering 11 families (Chromium, Firefox, Safari, Edge, Tor, Opera, Vivaldi...)
- Rewrite generate_bot_ip.py: Anubis YAML rules (Google, Bing, Apple, DuckDuck,
  OpenAI, Perplexity bots) + Tor exit nodes + cloud scanner IPs (3,555 entries)
- Rewrite generate_asn_data.py: worldwide iptoasn.com data (78,049 ASNs, 714K CIDRs)
- Add dict_browser_ja4 ClickHouse dictionary + browser_family in AI features views
- Add /api/browsers dashboard endpoint
- Fix CSV quoting for fields containing commas (User-Agent strings)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 15:27:37 +02:00

252 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""
generate_browser_ja4.py — Generate browser_ja4.csv with known browser TLS fingerprints.
Sources:
• FoxIO official ja4plus-mapping.csv:
https://github.com/FoxIO-LLC/ja4/blob/main/ja4plus-mapping.csv
• ja4db.com public API: https://ja4db.com/api/read/
• JA4+ specification: https://github.com/FoxIO-LLC/ja4
JA4 fingerprints identify TLS libraries, not individual browser versions.
All Chromium-based browsers (Chrome, Edge, Brave, Opera, Vivaldi, Samsung Internet)
share BoringSSL and produce identical JA4 fingerprints.
Firefox uses NSS, Safari uses Apple SecureTransport/Network.framework.
Output format (no header):
<ja4_fingerprint>,<browser_family>,<tls_library>,<context>
"""
import argparse
import csv
import io
import json
import os
import re
import sys
import urllib.request
# =============================================================================
# Curated browser JA4 fingerprints from FoxIO official mapping
# Each entry is a 4-tuple: (ja4, browser_family, tls_library, context)
#
# main() loads this list first and keeps only the FIRST entry seen for a given
# JA4 value, so the order below decides which family wins when the same
# fingerprint is listed more than once.
# =============================================================================
BROWSER_FINGERPRINTS = [
    # --- Chromium (Chrome, Edge, Brave, Opera, Vivaldi, Samsung Internet) ---
    # BoringSSL: all Chromium-based browsers produce identical JA4
    ("t13d1516h2_8daaf6152771_02713d6af862",
     "Chromium", "BoringSSL", "TCP fresh connection"),
    ("t13d1517h2_8daaf6152771_b0da82dd1658",
     "Chromium", "BoringSSL", "TCP with PSK extension"),
    ("t13d1517h2_8daaf6152771_b1ff8ab2d16f",
     "Chromium", "BoringSSL", "TCP alt extension set"),
    ("t13i1515h2_8daaf6152771_02713d6af862",
     "Chromium", "BoringSSL", "TCP to IP (no SNI)"),
    ("t13i1516h2_8daaf6152771_b0da82dd1658",
     "Chromium", "BoringSSL", "TCP to IP with PSK"),
    ("t13i1516h2_8daaf6152771_b1ff8ab2d16f",
     "Chromium", "BoringSSL", "TCP to IP alt ext"),
    ("q13d0312h3_55b375c5d22e_06cda9e17597",
     "Chromium", "BoringSSL", "QUIC HTTP/3 to domain"),
    ("q13i0311h3_55b375c5d22e_06cda9e17597",
     "Chromium", "BoringSSL", "QUIC HTTP/3 to IP"),

    # --- Mozilla Firefox ---
    # NSS library
    ("t13d1715h2_5b57614c22b0_7121afd63204",
     "Firefox", "NSS", "TCP to domain"),
    ("t13i1714h2_5b57614c22b0_7121afd63204",
     "Firefox", "NSS", "TCP to IP (no SNI)"),
    # Firefox with different extension counts (minor NSS updates)
    ("t13d1715h2_5b57614c22b0_3c5d18be5765",
     "Firefox", "NSS", "TCP domain (NSS variant)"),
    ("t13d1716h2_5b57614c22b0_7121afd63204",
     "Firefox", "NSS", "TCP domain (16 ext)"),

    # --- Apple Safari (macOS + iOS) ---
    # Apple SecureTransport / Network.framework
    ("t13d2014h2_a09f3c656075_14788d8d241b",
     "Safari", "SecureTransport", "TCP to domain"),
    ("t13i2013h2_a09f3c656075_14788d8d241b",
     "Safari", "SecureTransport", "TCP to IP (no SNI)"),
    # Safari variants with different extension hashes
    # NOTE(review): the cipher hash 5b57614c22b0 is labelled "Firefox/NSS" in
    # CIPHER_HASH_FAMILIES while this row says SecureTransport; confirm.
    ("t13d1714h2_5b57614c22b0_14788d8d241b",
     "Safari", "SecureTransport", "Safari/iOS via WebKit (hybrid)"),

    # --- Chrome on iOS ---
    # Uses Apple's TLS stack, not BoringSSL (Apple enforces this on iOS)
    # Same JA4 as the Safari "hybrid" row above, so main() keeps the Safari
    # entry and this row never reaches the output.
    ("t13d1714h2_5b57614c22b0_14788d8d241b",
     "Chrome_iOS", "SecureTransport", "Chrome on iOS (Apple TLS)"),

    # --- Tor Browser ---
    # Based on Firefox ESR but with a hardened NSS configuration
    ("t13d1614h2_5b57614c22b0_7121afd63204",
     "Tor_Browser", "NSS", "TCP (hardened Firefox ESR)"),

    # --- Common mobile browser patterns ---
    # Android WebView uses BoringSSL (same as Chromium)
    # Same JA4 as the first Chromium row, so main() keeps the Chromium entry
    # and this row never reaches the output.
    ("t13d1516h2_8daaf6152771_02713d6af862",
     "Android_WebView", "BoringSSL", "Android WebView (same as Chromium)"),

    # --- Older browser versions (TLS 1.2 fallback) ---
    ("t12d1516h2_8daaf6152771_02713d6af862",
     "Chromium_Legacy", "BoringSSL", "Chromium TLS 1.2 only (old/restricted)"),
    ("t12d1715h2_5b57614c22b0_7121afd63204",
     "Firefox_Legacy", "NSS", "Firefox TLS 1.2 only (old)"),
    ("t12d2014h2_a09f3c656075_14788d8d241b",
     "Safari_Legacy", "SecureTransport", "Safari TLS 1.2 only (old)"),
]
# Cipher-hash segment of a JA4 (the part between the two underscores) mapped
# to the TLS library label that _lib_from_cipher_hash() reports for it.
CIPHER_HASH_FAMILIES = {
    "8daaf6152771": "Chromium/BoringSSL",
    "55b375c5d22e": "Chromium/BoringSSL (QUIC)",
    "5b57614c22b0": "Firefox/NSS",
    "a09f3c656075": "Safari/SecureTransport",
}

# Remote fingerprint sources fetched by the download_* functions.
JA4DB_URL = "https://ja4db.com/api/read/"
FOXIO_URL = "https://raw.githubusercontent.com/FoxIO-LLC/ja4/main/ja4plus-mapping.csv"
# Valid JA4 fingerprint: protocol prefix + hex hashes separated by underscores
# Example: t13d1516h2_8daaf6152771_02713d6af862
#
# Explicit [0-9] classes are used instead of \d because, for str patterns, \d
# also matches non-ASCII Unicode decimal digits. The pattern ends with \Z
# rather than $ because $ would also match just before a trailing newline.
_JA4_RE = re.compile(
    r'^[tdq][0-9]{2}[di][0-9]{4}[a-z0-9]{2}_[0-9a-f]{12}_[0-9a-f]{12}\Z'
)


def _is_valid_ja4(s):
    """Return True if *s* is a well-formed JA4 fingerprint (ASCII only).

    The whole string must match: surrounding whitespace, a trailing newline,
    upper-case hex or non-ASCII digits all make the value invalid.

    Args:
        s: Candidate value. Anything that is not a ``str`` (for example
            ``None`` or a number from a JSON payload) is rejected instead of
            raising.

    Returns:
        bool: True when *s* matches ``_JA4_RE``, False otherwise.
    """
    return isinstance(s, str) and _JA4_RE.match(s) is not None
def download_ja4db():
    """Download the ja4db.com dataset and extract browser fingerprints.

    A record is kept when its ``application`` field contains "browser"
    (case-insensitive). Otherwise its ``user_agent_string`` is classified
    with ``_classify_ua`` and the record is kept if a family is recognised.

    Records are validated one by one: an item that is not a JSON object, or
    whose fingerprint is not a well-formed JA4 string, is counted as rejected
    and skipped, so a single bad record cannot abort the whole import.

    Best-effort: any failure (network, decoding, unexpected payload) is
    reported on stderr and whatever was collected so far is returned.

    Returns:
        list: ``(ja4, browser_family, source)`` tuples, possibly empty.
    """
    entries = []
    rejected = 0
    try:
        req = urllib.request.Request(
            JA4DB_URL, headers={"User-Agent": "ja4-platform/1.0"})
        with urllib.request.urlopen(req, timeout=30) as r:
            data = json.loads(r.read().decode())

        for item in data:
            if not isinstance(item, dict):
                rejected += 1
                continue

            ja4 = item.get("ja4_fingerprint")
            if not ja4:
                # Records without a JA4 are simply not relevant here.
                continue
            if not isinstance(ja4, str) or not _is_valid_ja4(ja4):
                rejected += 1
                continue

            # Non-string field values are treated like missing ones.
            raw_app = item.get("application")
            raw_ua = item.get("user_agent_string")
            app = raw_app.strip() if isinstance(raw_app, str) else ""
            ua = raw_ua.strip() if isinstance(raw_ua, str) else ""

            if app and "browser" in app.lower():
                source = "ja4db.com (verified)" if item.get("verified") else "ja4db.com"
                entries.append((ja4, app, source))
            elif ua:
                family = _classify_ua(ua)
                if family:
                    entries.append((ja4, family, f"ja4db.com UA: {ua[:60]}"))

        print(f" [ja4db.com] Downloaded {len(entries)} browser fingerprints "
              f"(rejected {rejected} malformed)")
    except Exception as e:
        # Deliberately broad: this source is optional and must never stop
        # the generator from producing the curated output.
        print(f" [ja4db.com] Warning: download failed ({e})", file=sys.stderr)
    return entries
def download_foxio_mapping():
    """Download FoxIO official ja4plus-mapping.csv and extract browser rows.

    The file is parsed with the ``csv`` module so that quoted fields
    containing commas do not shift the columns. Column 0 is read as the
    application name, column 1 as the library and column 4 as the JA4
    fingerprint; rows with fewer than five columns or without a well-formed
    JA4 are skipped. A row is kept when its application name contains
    "browser" (case-insensitive) or is exactly "Mozilla Firefox" or "Safari".

    Best-effort: any failure (network, decoding, CSV parsing) is reported on
    stderr and whatever was collected so far is returned.

    Returns:
        list: ``(ja4, application, source)`` tuples, possibly empty.
    """
    entries = []
    try:
        req = urllib.request.Request(
            FOXIO_URL, headers={"User-Agent": "ja4-platform/1.0"})
        with urllib.request.urlopen(req, timeout=15) as r:
            # strip() drops leading blank lines so the header is row one.
            text = r.read().decode().strip()

        # newline="" lets the csv module do its own line-ending handling.
        reader = csv.reader(io.StringIO(text, newline=""))
        next(reader, None)  # Skip header
        for parts in reader:
            if len(parts) < 5:
                continue
            app = parts[0].strip()
            lib = parts[1].strip()
            ja4 = parts[4].strip()
            if not ja4 or not _is_valid_ja4(ja4):
                continue
            if "browser" in app.lower() or app in ("Mozilla Firefox", "Safari"):
                entries.append((ja4, app, f"FoxIO official (lib: {lib})"))

        print(f" [FoxIO] Downloaded {len(entries)} browser fingerprints")
    except Exception as e:
        # Deliberately broad: this source is optional and must never stop
        # the generator from producing the curated output.
        print(f" [FoxIO] Warning: download failed ({e})", file=sys.stderr)
    return entries
def _classify_ua(ua):
    """Map a User-Agent string to a browser family name.

    Matching is case-insensitive and substring based. Rules are tried in
    priority order, most specific first, because many User-Agent strings
    contain several of the tokens at once.

    Args:
        ua: User-Agent string.

    Returns:
        The family name, or None when no rule matches.
    """
    text = ua.lower()

    # (family, tokens): the first rule with any token present wins.
    ordered_rules = (
        ("Chrome_iOS", ("crios/",)),
        ("Firefox_iOS", ("fxios/",)),
        ("Edge", ("edg/", "edge/")),
        ("Opera", ("opr/", "opera")),
        ("Brave", ("brave",)),
        ("Vivaldi", ("vivaldi",)),
        ("Samsung_Internet", ("samsungbrowser",)),
        ("Firefox", ("firefox/",)),
    )
    for family, tokens in ordered_rules:
        if any(token in text for token in tokens):
            return family

    # A "chrome/" token takes precedence over "safari/": Safari is reported
    # only when "safari/" is present without "chrome/".
    if "chrome/" in text:
        return "Chromium"
    if "safari/" in text:
        return "Safari"
    return None
def main():
    """Build the browser JA4 CSV from curated and optional remote data.

    Command-line options:
        --output PATH   Output CSV file path (default: browser_ja4.csv).
        --no-download   Use only the curated list; skip remote sources.

    Fingerprints are merged with "first one wins" semantics in this order:
    the curated ``BROWSER_FINGERPRINTS`` list, then the FoxIO mapping, then
    ja4db.com. For remote entries the TLS library is inferred from the
    cipher-hash segment of the fingerprint.

    The output has no header row; each row is
    ``ja4, browser_family, tls_library, context``, sorted by fingerprint and
    written as UTF-8.
    """
    parser = argparse.ArgumentParser(description="Generate browser_ja4.csv")
    parser.add_argument("--output", default="browser_ja4.csv",
                        help="Output CSV file path")
    parser.add_argument("--no-download", action="store_true",
                        help="Skip downloading from remote sources")
    args = parser.parse_args()

    # ja4 -> (browser_family, tls_library, context)
    seen = {}

    # 1. Start with curated list (highest priority)
    for ja4, family, lib, ctx in BROWSER_FINGERPRINTS:
        seen.setdefault(ja4, (family, lib, ctx))

    # 2. Download from remote sources; earlier sources take precedence.
    if not args.no_download:
        print("[browser_ja4] Downloading from remote sources...")
        for fetch in (download_foxio_mapping, download_ja4db):
            for ja4, family, source in fetch():
                if ja4 not in seen:
                    seen[ja4] = (family, _lib_from_cipher_hash(ja4), source)

    # 3. Write output. The csv module quotes fields containing commas
    #    (User-Agent text in the context column). The encoding is pinned to
    #    UTF-8 so non-ASCII User-Agent text does not depend on the locale.
    with open(args.output, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for ja4 in sorted(seen):
            writer.writerow([ja4, *seen[ja4]])

    families = {family for family, _, _ in seen.values()}
    print(f"[browser_ja4] Generated {len(seen)} unique fingerprints "
          f"covering {len(families)} browser families: {', '.join(sorted(families))}")
def _lib_from_cipher_hash(ja4):
    """Look up the TLS library label for a JA4 fingerprint.

    The label is keyed on the second underscore-separated segment of *ja4*
    (the cipher hash) in ``CIPHER_HASH_FAMILIES``.

    Args:
        ja4: JA4 fingerprint string.

    Returns:
        The mapped label, or "unknown" when the fingerprint has no second
        segment or the cipher hash is not in the table.
    """
    segments = ja4.split("_", 2)
    try:
        cipher_hash = segments[1]
    except IndexError:
        # No underscore at all: there is no cipher-hash segment to look up.
        return "unknown"
    return CIPHER_HASH_FAMILIES.get(cipher_hash, "unknown")
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()