refactor(anubis): simplify to IP/CIDR + ASN only, remove UA and Country rules
- Remove UA regex extraction (extract_ua_regex, _extract_ua_from_all/any)
- Remove Country rule collection from parse_bot_policies_inline
- Simplify fetch_rules.py: collect_all_rules returns (ip_rules, asn_rules)
- Remove insert_ua_rules and insert_country_rules functions
- reload_dicts now only reloads dict_anubis_ip + dict_anubis_asn
- Simplify CASE blocks in 04_mv_http_logs.sql, 07_ai_features_view.sql, view_ai_features_anubis.sql, mv_http_logs.sql: IP > ASN (was 5-level UA+IP > UA > IP > ASN > Country cascade)
- Remove dict_anubis_country + dict_anubis_ua from 03_anubis_tables.sql (UA table kept as stub for REGEXP_TREE catch-all compatibility)
- Remove anubis_country_rules table from schema
- Remove Anubis UA and Country tabs from dashboard reflists page
- Remove anubis_ua_rules/country_rules from API reflist queries
- deploy_schema.sql simplified from 339 to 122 lines
- 764 lines removed across 9 files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -18,7 +18,6 @@ Variables d'environnement :
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
@ -57,15 +56,9 @@ DIRECTORIES = [
|
||||
("data/bots", "bots"), # Catch-alls larges (ai-robots-txt, ai-catchall) — LAST
|
||||
]
|
||||
|
||||
# Fichier de politique principal (règles ASN + pays inline)
|
||||
# Fichier de politique principal (règles ASN inline)
|
||||
BOT_POLICIES_PATH = "data/botPolicies.yaml"
|
||||
|
||||
# UA_PARENT_OVERRIDE : mapping nom_règle → nom_parent pour forcer la hiérarchie REGEXP_TREE.
|
||||
# Conservé vide intentionnellement : l'ordre de chargement (spécifique avant catch-all)
|
||||
# garantit la priorité sans hiérarchie parent_id explicite.
|
||||
# Populer ce dict si une règle doit hériter d'une autre via parent_id dans REGEXP_TREE.
|
||||
UA_PARENT_OVERRIDE: dict[str, str] = {}
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# HTTP helpers
|
||||
@ -111,69 +104,6 @@ def list_yaml_files(api_path: str) -> list[str]:
|
||||
return [f["download_url"] for f in files]
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Extraction des patterns UA depuis les expressions CEL-like
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_ua_from_all(conditions: list) -> str | None:
|
||||
"""Extrait une regex UA depuis une expression 'all' (ex: yandexbot userAgent.matches)."""
|
||||
for cond in conditions:
|
||||
if not isinstance(cond, str):
|
||||
continue
|
||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', cond)
|
||||
if m:
|
||||
return m.group(1).replace("\\\\", "\\")
|
||||
return None
|
||||
|
||||
|
||||
def _extract_ua_from_any(conditions: list) -> str | None:
|
||||
"""
|
||||
Extrait une regex UA depuis une expression 'any' avec userAgent.contains(...)
|
||||
Exemple : aggressive-brazilian-scrapers.yaml
|
||||
Retourne une regex en OR : MSIE|Trident|...
|
||||
"""
|
||||
patterns = []
|
||||
for cond in conditions:
|
||||
if not isinstance(cond, str):
|
||||
continue
|
||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', cond)
|
||||
if m:
|
||||
patterns.append(re.escape(m.group(1)))
|
||||
if patterns:
|
||||
return "|".join(patterns)
|
||||
return None
|
||||
|
||||
|
||||
def extract_ua_regex(rule: dict) -> str | None:
|
||||
"""Extrait la regex User-Agent depuis toutes les formes possibles."""
|
||||
# Forme directe
|
||||
if ua := rule.get("user_agent_regex"):
|
||||
return ua.strip()
|
||||
|
||||
expr = rule.get("expression")
|
||||
if not expr:
|
||||
return None
|
||||
|
||||
# Expression scalaire (CEL string)
|
||||
if isinstance(expr, str):
|
||||
m = re.search(r'userAgent\.matches\("(.+?)"\)', expr)
|
||||
if m:
|
||||
return m.group(1).replace("\\\\", "\\")
|
||||
m = re.search(r'userAgent\.contains\("(.+?)"\)', expr)
|
||||
if m:
|
||||
return re.escape(m.group(1))
|
||||
return None
|
||||
|
||||
# Expression structurée dict
|
||||
if isinstance(expr, dict):
|
||||
if ua := _extract_ua_from_all(expr.get("all", [])):
|
||||
return ua
|
||||
if ua := _extract_ua_from_any(expr.get("any", [])):
|
||||
return ua
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Parse des fichiers YAML
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
@ -182,27 +112,22 @@ def parse_file(
|
||||
url: str,
|
||||
category: str,
|
||||
ua_name_to_id: dict,
|
||||
ua_id_counter_ref: list, # [int] — compteur mutable partagé entre appels
|
||||
rule_id_counter_ref: list, # [int] — idem
|
||||
ua_id_counter_ref: list,
|
||||
rule_id_counter_ref: list,
|
||||
) -> tuple[list[dict], list[dict]]:
|
||||
"""
|
||||
Parse un fichier YAML Anubis.
|
||||
Retourne (ua_rules, ip_rules).
|
||||
|
||||
Note : ua_name_to_id est maintenu pour supporter la hiérarchie parent_id dans
|
||||
REGEXP_TREE (via UA_PARENT_OVERRIDE). Tant que UA_PARENT_OVERRIDE est vide,
|
||||
parent_id vaut toujours 0 et ua_name_to_id n'est pas consulté en pratique.
|
||||
Retourne ([], ip_rules) — les règles UA ne sont plus collectées.
|
||||
"""
|
||||
data = fetch_yaml_url(url)
|
||||
if not data or not isinstance(data, list):
|
||||
return [], []
|
||||
|
||||
ua_rules, ip_rules = [], []
|
||||
ip_rules = []
|
||||
|
||||
for rule in data:
|
||||
if not isinstance(rule, dict):
|
||||
continue
|
||||
# Ignorer les imports (références à d'autres fichiers)
|
||||
if "import" in rule:
|
||||
continue
|
||||
|
||||
@ -212,58 +137,34 @@ def parse_file(
|
||||
continue
|
||||
|
||||
remote_addrs = [str(c).strip() for c in rule.get("remote_addresses", []) if c]
|
||||
has_ip = bool(remote_addrs)
|
||||
|
||||
rule_id = rule_id_counter_ref[0]
|
||||
rule_id_counter_ref[0] += 1
|
||||
|
||||
# ── User-Agent regex ─────────────────────────────────────────────────
|
||||
ua_regex = extract_ua_regex(rule)
|
||||
if ua_regex:
|
||||
parent_name = UA_PARENT_OVERRIDE.get(name)
|
||||
parent_id = ua_name_to_id.get(parent_name, 0) if parent_name else 0
|
||||
|
||||
uid = ua_id_counter_ref[0]
|
||||
ua_id_counter_ref[0] += 1
|
||||
ua_name_to_id[name] = uid
|
||||
|
||||
ua_rules.append({
|
||||
"id": uid,
|
||||
"parent_id": parent_id,
|
||||
"regexp": ua_regex,
|
||||
"bot_name": name,
|
||||
"action": action,
|
||||
"has_ip": "1" if has_ip else "0",
|
||||
"rule_id": str(rule_id),
|
||||
"category": category,
|
||||
})
|
||||
|
||||
# ── IP/CIDR ranges ───────────────────────────────────────────────────
|
||||
has_ua = bool(ua_regex)
|
||||
for cidr in remote_addrs:
|
||||
ip_rules.append({
|
||||
"prefix": cidr,
|
||||
"bot_name": name,
|
||||
"action": action,
|
||||
"rule_id": rule_id,
|
||||
"has_ua": 1 if has_ua else 0,
|
||||
"has_ua": 0,
|
||||
"category": category,
|
||||
})
|
||||
|
||||
return ua_rules, ip_rules
|
||||
return [], ip_rules
|
||||
|
||||
|
||||
def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
|
||||
"""
|
||||
Parse botPolicies.yaml pour les règles inline avec geoip.countries et asns.match.
|
||||
Retourne (asn_rules, country_rules).
|
||||
Parse botPolicies.yaml pour les règles ASN inline.
|
||||
Retourne (asn_rules, []) — les règles pays ne sont plus collectées.
|
||||
"""
|
||||
data = fetch_yaml_url(url)
|
||||
if not data or not isinstance(data, dict):
|
||||
return [], []
|
||||
|
||||
asn_rules: list[dict] = []
|
||||
country_rules: list[dict] = []
|
||||
asn_rules: list[dict] = []
|
||||
|
||||
for rule in data.get("bots", []):
|
||||
if not isinstance(rule, dict):
|
||||
@ -287,31 +188,19 @@ def parse_bot_policies_inline(url: str) -> tuple[list[dict], list[dict]]:
|
||||
"category": "policies",
|
||||
})
|
||||
|
||||
# Country rules
|
||||
geoip = rule.get("geoip", {})
|
||||
if isinstance(geoip, dict):
|
||||
for cc in geoip.get("countries", []):
|
||||
country_rules.append({
|
||||
"country_code": str(cc).upper(),
|
||||
"bot_name": name,
|
||||
"action": action,
|
||||
"category": "policies",
|
||||
})
|
||||
|
||||
return asn_rules, country_rules
|
||||
return asn_rules, []
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Collecte de toutes les règles
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def collect_all_rules() -> tuple[list, list, list, list]:
|
||||
"""Retourne (ua_rules, ip_rules, asn_rules, country_rules)."""
|
||||
def collect_all_rules() -> tuple[list, list]:
|
||||
"""Retourne (ip_rules, asn_rules). Les règles UA et pays ne sont plus collectées."""
|
||||
ua_name_to_id: dict[str, int] = {}
|
||||
ua_id_counter_ref: list[int] = [1]
|
||||
rule_id_counter: list[int] = [1]
|
||||
|
||||
all_ua: list[dict] = []
|
||||
all_ip: list[dict] = []
|
||||
|
||||
for api_path, category in DIRECTORIES:
|
||||
@ -319,16 +208,15 @@ def collect_all_rules() -> tuple[list, list, list, list]:
|
||||
file_urls = list_yaml_files(api_path)
|
||||
print(f" {len(file_urls)} fichiers trouvés")
|
||||
for url in file_urls:
|
||||
ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter)
|
||||
all_ua.extend(ua)
|
||||
_ua, ip = parse_file(url, category, ua_name_to_id, ua_id_counter_ref, rule_id_counter)
|
||||
all_ip.extend(ip)
|
||||
|
||||
# Règles ASN + pays depuis botPolicies.yaml
|
||||
# Règles ASN depuis botPolicies.yaml
|
||||
print(f"[INFO] Lecture de botPolicies.yaml…")
|
||||
policies_url = f"{GITHUB_RAW}/{BOT_POLICIES_PATH}"
|
||||
asn_rules, country_rules = parse_bot_policies_inline(policies_url)
|
||||
asn_rules, _country_rules = parse_bot_policies_inline(policies_url)
|
||||
|
||||
return all_ua, all_ip, asn_rules, country_rules
|
||||
return all_ip, asn_rules
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
@ -348,30 +236,6 @@ def get_ch_client():
|
||||
DB_PROC = os.environ.get("CLICKHOUSE_DB_PROCESSING", os.environ.get("CLICKHOUSE_DB", "ja4_processing"))
|
||||
|
||||
|
||||
def insert_ua_rules(client, rules: list[dict]) -> None:
    """Truncate ``anubis_ua_rules`` and refill it with the given User-Agent rules.

    Rows are written in the REGEXP_TREE layout: ``id``, ``parent_id``,
    ``regexp``, ``keys[]``, ``values[]``, where the keys are ``bot_name``,
    ``action``, ``has_ip``, ``rule_id`` and ``category``.

    When ``rules`` is empty the table is left untouched and only an
    informational message is printed.

    Args:
        client: Object exposing ``command(sql)`` and
            ``insert(table, data, column_names=...)``.
        rules: Rule dicts holding ``id``, ``parent_id``, ``regexp`` and the
            five attribute keys listed above.
    """
    if not rules:
        print("[INFO] Aucune règle UA.")
        return

    table = f"{DB_PROC}.anubis_ua_rules"
    attribute_keys = ["bot_name", "action", "has_ip", "rule_id", "category"]

    client.command(f"TRUNCATE TABLE {table}")

    rows = []
    for rule in rules:
        rows.append([
            rule["id"],
            rule["parent_id"],
            rule["regexp"],
            list(attribute_keys),  # fresh list per row
            [rule[key] for key in attribute_keys],
        ])

    client.insert(
        table,
        rows,
        column_names=["id", "parent_id", "regexp", "keys", "values"],
    )
    print(f"[OK] {len(rules)} règles UA insérées.")
|
||||
|
||||
|
||||
def insert_ip_rules(client, rules: list[dict]) -> None:
|
||||
"""Tronque et remplace la table anubis_ip_rules avec les règles CIDR/IP fournies."""
|
||||
if not rules:
|
||||
@ -400,25 +264,11 @@ def insert_asn_rules(client, rules: list[dict]) -> None:
|
||||
print(f"[OK] {len(rules)} règles ASN insérées.")
|
||||
|
||||
|
||||
def insert_country_rules(client, rules: list[dict]) -> None:
    """Truncate ``anubis_country_rules`` and refill it with the given country rules.

    When ``rules`` is empty the table is left untouched and only an
    informational message is printed.

    Args:
        client: Object exposing ``command(sql)`` and
            ``insert(table, data, column_names=...)``.
        rules: Rule dicts holding ``country_code``, ``bot_name``, ``action``
            and ``category``.
    """
    if not rules:
        print("[INFO] Aucune règle pays.")
        return

    table = f"{DB_PROC}.anubis_country_rules"
    columns = ["country_code", "bot_name", "action", "category"]

    client.command(f"TRUNCATE TABLE {table}")

    # One row per rule, values in the same order as the column names.
    rows = [[rule[column] for column in columns] for rule in rules]
    client.insert(table, rows, column_names=columns)
    print(f"[OK] {len(rules)} règles pays insérées.")
|
||||
|
||||
|
||||
def reload_dicts(client) -> None:
|
||||
"""Recharge les quatre dictionnaires ClickHouse Anubis après mise à jour des tables sources."""
|
||||
"""Recharge les dictionnaires ClickHouse Anubis IP et ASN après mise à jour."""
|
||||
dicts = [
|
||||
f"{DB_PROC}.dict_anubis_ua",
|
||||
f"{DB_PROC}.dict_anubis_ip",
|
||||
f"{DB_PROC}.dict_anubis_asn",
|
||||
f"{DB_PROC}.dict_anubis_country",
|
||||
]
|
||||
for d in dicts:
|
||||
try:
|
||||
@ -432,27 +282,14 @@ def reload_dicts(client) -> None:
|
||||
# Rapport
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def print_summary(ua_rules, ip_rules, asn_rules, country_rules):
|
||||
"""Affiche un résumé lisible des règles collectées (UA, IP, ASN, pays) sur la sortie standard."""
|
||||
print("\n── Règles UA ──")
|
||||
by_cat: dict[str, list] = {}
|
||||
for r in ua_rules:
|
||||
by_cat.setdefault(r["category"], []).append(r)
|
||||
for cat, rules in sorted(by_cat.items()):
|
||||
print(f" [{cat}] {len(rules)} règle(s)")
|
||||
for r in rules[:5]:
|
||||
has = " [+IP]" if r["has_ip"] == "1" else ""
|
||||
par = f" [parent={r['parent_id']}]" if r["parent_id"] else ""
|
||||
print(f" [{r['action']:9s}] {r['bot_name']}{has}{par}: {r['regexp'][:50]}")
|
||||
if len(rules) > 5:
|
||||
print(f" … et {len(rules) - 5} autres")
|
||||
|
||||
def print_summary(ip_rules, asn_rules):
|
||||
"""Affiche un résumé lisible des règles collectées (IP, ASN) sur la sortie standard."""
|
||||
print(f"\n── Règles IP : {len(ip_rules)} CIDRs ──")
|
||||
by_bot: dict[str, list] = {}
|
||||
for r in ip_rules:
|
||||
by_bot.setdefault(r["bot_name"], []).append(r)
|
||||
for bot, rs in sorted(by_bot.items())[:15]:
|
||||
print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']}, has_ua={rs[0]['has_ua']})")
|
||||
print(f" [{rs[0]['action']:9s}] {bot}: {len(rs)} CIDRs (cat={rs[0]['category']})")
|
||||
if len(by_bot) > 15:
|
||||
print(f" … et {len(by_bot) - 15} autres bots")
|
||||
|
||||
@ -461,38 +298,30 @@ def print_summary(ua_rules, ip_rules, asn_rules, country_rules):
|
||||
for r in asn_rules:
|
||||
print(f" [{r['action']:9s}] ASN {r['asn']}: {r['bot_name']}")
|
||||
|
||||
if country_rules:
|
||||
print(f"\n── Règles pays : {len(country_rules)} ──")
|
||||
for r in country_rules:
|
||||
print(f" [{r['action']:9s}] {r['country_code']}: {r['bot_name']}")
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
|
||||
"""Point d'entrée principal : collecte les règles Anubis et les charge dans ClickHouse."""
|
||||
print("[INFO] Collecte des règles Anubis depuis GitHub…")
|
||||
ua_rules, ip_rules, asn_rules, country_rules = collect_all_rules()
|
||||
"""Point d'entrée principal : collecte les règles Anubis IP/CIDR et ASN, puis les charge dans ClickHouse."""
|
||||
print("[INFO] Collecte des règles Anubis depuis GitHub (IP/CIDR + ASN uniquement)…")
|
||||
ip_rules, asn_rules = collect_all_rules()
|
||||
|
||||
total = len(ua_rules) + len(ip_rules) + len(asn_rules) + len(country_rules)
|
||||
print(f"\n[INFO] {len(ua_rules)} règles UA, {len(ip_rules)} CIDRs IP, "
|
||||
f"{len(asn_rules)} ASN, {len(country_rules)} pays (total={total})")
|
||||
total = len(ip_rules) + len(asn_rules)
|
||||
print(f"\n[INFO] {len(ip_rules)} CIDRs IP, {len(asn_rules)} ASN (total={total})")
|
||||
|
||||
if total == 0:
|
||||
print("[ERREUR] Aucune règle récupérée.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
print_summary(ua_rules, ip_rules, asn_rules, country_rules)
|
||||
print_summary(ip_rules, asn_rules)
|
||||
|
||||
print("\n[INFO] Connexion à ClickHouse…")
|
||||
client = get_ch_client()
|
||||
|
||||
insert_ua_rules(client, ua_rules)
|
||||
insert_ip_rules(client, ip_rules)
|
||||
insert_asn_rules(client, asn_rules)
|
||||
insert_country_rules(client, country_rules)
|
||||
reload_dicts(client)
|
||||
|
||||
print("\n[OK] Règles Anubis chargées avec succès.")
|
||||
|
||||
Reference in New Issue
Block a user