view

2026-03-09 08:37:59 +01:00
parent 703b2b5181
commit 5df2fd965b
5 changed files with 232 additions and 96 deletions
--- a/sql/bots.sql
+++ b/sql/bots.sql
@ -0,0 +1,10 @@
+-- Reference list of known bot networks; dropped and re-created on every run.
+DROP TABLE IF EXISTS mabase_prod.ref_bot_networks;
+
+CREATE TABLE mabase_prod.ref_bot_networks (
+    -- CIDR prefix kept as text (e.g. '1.2.3.0/24' or '2001:db8::/32').
+    -- ClickHouse has no IPv6CIDR data type (only IPv4 and IPv6), so the
+    -- previous column type made this CREATE TABLE fail. A String CIDR can be
+    -- passed directly to isIPAddressInRange() or used as an ip_trie key.
+    network String,
+    bot_name LowCardinality(String),
+    is_legitimate UInt8, -- 1 = whitelist, 0 = blacklist
+    last_update DateTime -- version column: the most recent row per sorting key is kept
+) ENGINE = ReplacingMergeTree(last_update)
+ORDER BY (network, bot_name);
--- a/sql/iptoasn.sql
+++ b/sql/iptoasn.sql
@ -1,15 +0,0 @@
-DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn;
-
-CREATE DICTIONARY IF NOT EXISTS mabase_prod.dict_iplocate_asn
-(
-    network String,
-    asn UInt32,
-    country_code String,
-    name String,
-    org String,
-    domain String
-)
-PRIMARY KEY network
-SOURCE(FILE(path '/var/lib/clickhouse/user_files/iplocate-ip-to-asn.csv' format 'CSVWithNames'))
-LAYOUT(IP_TRIE())
-LIFETIME(MIN 3600 MAX 7200);
--- a/sql/mv1.sql
+++ b/sql/mv1.sql
@ -1,11 +1,12 @@
 -- ============================================================================
-- PROJET : Moteur de Détection de Menaces HTTP
-- DESCRIPTION : Reconfiguration des vues d'agrégation, nouveauté et scoring.
+-- PROJET : Moteur de Détection de Menaces HTTP (Full Spectrum)
+-- DESCRIPTION : Configuration complète des tables d'agrégation et du scoring.
+-- COUVRE : Spoofing UA/TLS, TCP Fingerprinting, Anomalies comportementales.
 -- DATE : 2026-03-08
 -- ============================================================================

 -- ----------------------------------------------------------------------------
-- 1. NETTOYAGE DES OBJETS EXISTANTS (Ordre inverse des dépendances)
+-- 1. NETTOYAGE (Ordre inverse des dépendances)
 -- ----------------------------------------------------------------------------
 DROP VIEW IF EXISTS mabase_prod.live_threat_scores;
 DROP VIEW IF EXISTS mabase_prod.mv_baseline_update;
@ -14,13 +15,76 @@ DROP VIEW IF EXISTS mabase_prod.mv_traffic_1d;
 DROP VIEW IF EXISTS mabase_prod.mv_traffic_1h;
 DROP VIEW IF EXISTS mabase_prod.mv_traffic_1m;

+DROP TABLE IF EXISTS mabase_prod.agg_traffic_1d; -- storage tables are dropped after the views that write into them
+DROP TABLE IF EXISTS mabase_prod.agg_traffic_1h; -- NOTE: dropping discards all previously aggregated rows; the tables are re-created empty in section 2
+DROP TABLE IF EXISTS mabase_prod.agg_traffic_1m;
+
 -- ----------------------------------------------------------------------------
-- 2. RECONSTRUCTION DE LA CHAÎNE DE ROLLUP (Aggrégations temporelles)
+-- 2. TABLES DE DESTINATION (STORAGE)
 -- ----------------------------------------------------------------------------

-- MV 1 Minute : Transformation des logs bruts en métriques techniques
-CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1m 
-TO mabase_prod.agg_traffic_1m
+-- Per-minute rollup fed by mv_traffic_1m. Every metric is stored as a partial
+-- aggregate state and must be read back with the matching -Merge combinator.
+CREATE TABLE mabase_prod.agg_traffic_1m (
+    -- Dimensions (these are the GROUP BY columns of mv_traffic_1m)
+    minute DateTime,
+    host LowCardinality(String),
+    src_ip IPv4,
+    src_asn UInt32,
+    src_country_code LowCardinality(String),
+    ja4 String,
+    ja3_hash String,
+    header_user_agent String,
+
+    -- Base metrics
+    hits AggregateFunction(count, UInt64),
+    uniq_paths AggregateFunction(uniq, String),
+
+    -- Layer 4: TCP & handshake
+    avg_syn_to_clienthello_ms AggregateFunction(avg, Int32),
+    var_syn_to_clienthello_ms AggregateFunction(varPop, Int32),
+    tcp_fingerprint AggregateFunction(uniq, UInt64), -- hash of MSS + window size + window scale
+
+    -- Layer 7: HTTP fingerprinting
+    avg_headers_count AggregateFunction(avg, Float64),
+    host_sni_mismatch AggregateFunction(countIf, UInt8),
+
+    -- Spoofing & inconsistency detection
+    spoofing_ua_tls AggregateFunction(countIf, UInt8),
+    spoofing_ua_alpn AggregateFunction(countIf, UInt8),
+    spoofing_os_ttl AggregateFunction(countIf, UInt8),
+    missing_human_headers AggregateFunction(countIf, UInt8),
+
+    -- Behaviour & payloads
+    sensitive_path_hits AggregateFunction(countIf, UInt8),
+    suspicious_methods AggregateFunction(countIf, UInt8),
+    suspicious_queries AggregateFunction(countIf, UInt8)
+) ENGINE = AggregatingMergeTree()
+PARTITION BY toYYYYMM(minute)
+-- AggregatingMergeTree collapses all rows sharing the same sorting key into a
+-- single row and keeps an arbitrary value for non-key, non-aggregate columns.
+-- Every dimension must therefore be part of the key; the original prefix
+-- (host, ja4, src_ip, minute) is kept first so existing filters still prune.
+ORDER BY (host, ja4, src_ip, minute, src_asn, src_country_code, ja3_hash, header_user_agent);
+
+-- 1h and 1d tables (simplified for long-term storage)
+-- Hourly rollup fed by mv_traffic_1h from the per-minute states.
+CREATE TABLE mabase_prod.agg_traffic_1h (
+    hour DateTime,
+    host LowCardinality(String),
+    src_country_code LowCardinality(String),
+    ja4 String,
+    hits AggregateFunction(count, UInt64),
+    uniq_ips AggregateFunction(uniq, IPv4)
+) ENGINE = AggregatingMergeTree()
+-- src_country_code is a grouping dimension of mv_traffic_1h, so it has to be
+-- in the sorting key: otherwise merges fold all countries of a
+-- (host, ja4, hour) into one row carrying an arbitrary country code.
+ORDER BY (host, ja4, hour, src_country_code);
+
+-- Daily rollup: hit count and distinct source IPs per (host, ja4, day),
+-- stored as partial aggregate states.
+CREATE TABLE mabase_prod.agg_traffic_1d
+(
+    day  Date,
+    host LowCardinality(String),
+    ja4  String,
+    hits     AggregateFunction(count, UInt64),
+    uniq_ips AggregateFunction(uniq, IPv4)
+)
+ENGINE = AggregatingMergeTree()
+ORDER BY (host, ja4, day);
+
+-- ----------------------------------------------------------------------------
+-- 3. VUES MATÉRIALISÉES (MOTEUR DE CALCUL)
+-- ----------------------------------------------------------------------------
+
+
+
+CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1m TO mabase_prod.agg_traffic_1m
 AS SELECT
    toStartOfMinute(time) AS minute,
    host, src_ip, src_asn, src_country_code, ja4, ja3_hash, header_user_agent,
@ -28,80 +92,30 @@ AS SELECT
    uniqState(path) AS uniq_paths,
    avgState(syn_to_clienthello_ms) AS avg_syn_to_clienthello_ms,
    varPopState(syn_to_clienthello_ms) AS var_syn_to_clienthello_ms,
-    avgState(toFloat64((length(client_headers) - length(replaceAll(client_headers, ',', ''))) + 1)) AS avg_headers_count,
+    -- TCP Fingerprint Hash
+    uniqState(cityHash64(toString(tcp_meta_mss), toString(tcp_meta_window_size), toString(tcp_meta_window_scale))) AS tcp_fingerprint,
+    -- HTTP Metrics
+    avgState(toFloat64(length(client_headers) - length(replaceAll(client_headers, ',', '')) + 1)) AS avg_headers_count,
+    countIfState(host != tls_sni AND tls_sni != '') AS host_sni_mismatch,
+    -- Spoofing Logic
    countIfState((header_user_agent ILIKE '%Chrome%') AND (ja4 NOT ILIKE 't13d%')) AS spoofing_ua_tls,
    countIfState((header_user_agent ILIKE '%Chrome%') AND (tls_alpn NOT ILIKE '%h2%')) AS spoofing_ua_alpn,
    countIfState((header_user_agent ILIKE '%Windows%') AND (ip_meta_ttl <= 64)) AS spoofing_os_ttl,
-    countIfState((header_accept_language = '') OR (header_sec_ch_ua = '')) AS missing_human_headers,
+    -- Browser-like User-Agent (contains "Mozilla") sent with an empty Sec-CH-UA header.
+    -- NOTE(review): presumably Firefox and Safari also match this rule if they do not send Sec-CH-UA — confirm before weighting it.
+    countIfState((header_user_agent ILIKE '%Mozilla%') AND (header_sec_ch_ua = '')) AS missing_human_headers,
+    -- Behaviour & payloads
+    countIfState(match(path, 'login|auth|admin|password|config|wp-admin|api/v[0-9]/auth')) AS sensitive_path_hits,
    countIfState(method IN ('PUT', 'DELETE', 'OPTIONS', 'TRACE')) AS suspicious_methods,
-    countIfState((length(query) > 200) OR match(query, '(%[0-9A-Fa-f]{2}){5,}')) AS suspicious_queries
+    -- (?i) makes the injection pattern case-insensitive: match() is case-sensitive
+    -- by default, so 'UNION SELECT' or '<SCRIPT' would otherwise go uncounted.
+    -- NOTE(review): if `query` is logged percent-encoded, '<script' will not match '%3Cscript' — confirm.
+    countIfState((length(query) > 250) OR match(query, '(?i)(<script|union|select|etc/passwd|%00)')) AS suspicious_queries
 FROM mabase_prod.http_logs
 GROUP BY minute, host, src_ip, src_asn, src_country_code, ja4, ja3_hash, header_user_agent;

-- MV 1 Heure : Agrégation secondaire (Cascading)
-CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1h 
-TO mabase_prod.agg_traffic_1h
-AS SELECT
-    toStartOfHour(minute) AS hour,
-    host, src_country_code, ja4, ja3_hash, header_user_agent,
-    countMergeState(hits) AS hits,
-    uniqMergeState(uniq_paths) AS uniq_paths,
-    countIfMergeState(missing_human_headers) AS missing_human_headers,
-    uniqState(src_ip) AS uniq_ips
-FROM mabase_prod.agg_traffic_1m
-GROUP BY hour, host, src_country_code, ja4, ja3_hash, header_user_agent;
-
-- MV 1 Jour : Agrégation tertiaire pour archivage et baseline
-CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1d 
-TO mabase_prod.agg_traffic_1d
-AS SELECT
-    toDate(hour) AS day,
-    host, src_country_code, ja4, ja3_hash, header_user_agent,
-    countMergeState(hits) AS hits,
-    uniqMergeState(uniq_ips) AS uniq_ips,
-    uniqMergeState(uniq_paths) AS uniq_paths,
-    countIfMergeState(missing_human_headers) AS missing_human_headers
-FROM mabase_prod.agg_traffic_1h
-GROUP BY day, host, src_country_code, ja4, ja3_hash, header_user_agent;
+-- Cascade 1m -> 1h: every block written to agg_traffic_1m is re-aggregated
+-- per hour. hits is merged from the minute states; uniq_ips is built from the
+-- src_ip dimension of the minute rows.
+CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1h TO mabase_prod.agg_traffic_1h
+AS SELECT
+    toStartOfHour(minute) AS hour,
+    host,
+    src_country_code,
+    ja4,
+    countMergeState(hits) AS hits,
+    uniqState(src_ip) AS uniq_ips
+FROM mabase_prod.agg_traffic_1m
+GROUP BY hour, host, src_country_code, ja4;
+
+-- Cascade 1h -> 1d: agg_traffic_1d is created by this script but had no view
+-- writing into it, so it stayed empty. This view merges the hourly states
+-- into daily ones.
+CREATE MATERIALIZED VIEW mabase_prod.mv_traffic_1d TO mabase_prod.agg_traffic_1d
+AS SELECT
+    toDate(hour) AS day,
+    host,
+    ja4,
+    countMergeState(hits) AS hits,
+    uniqMergeState(uniq_ips) AS uniq_ips
+FROM mabase_prod.agg_traffic_1h
+GROUP BY day, host, ja4;

 -- ----------------------------------------------------------------------------
-- 3. RECONSTRUCTION DES BRIQUES D'INTELLIGENCE (Novelty & Baseline)
-- ----------------------------------------------------------------------------
-
-- MV Novelty : Détection de nouvelles empreintes (HTTP + TLS)
-CREATE MATERIALIZED VIEW mabase_prod.mv_novelty
-TO mabase_prod.agg_novelty
-AS SELECT
-    host,
-    ja4,
-    cityHash64(client_headers) AS http_fp,
-    minState(time) AS first_seen,
-    maxState(time) AS last_seen,
-    countState() AS total_hits
-FROM mabase_prod.http_logs
-GROUP BY host, ja4, http_fp;
-
-- MV Baseline : Calcul statistique du trafic normal par JA4
-CREATE MATERIALIZED VIEW mabase_prod.mv_baseline_update
-TO mabase_prod.tbl_baseline_ja4_7d
-AS SELECT
-    ja4,
-    quantile(0.99)(hourly_hits) AS p99_hits_per_hour,
-    avg(hourly_hits) AS avg_hits_per_hour,
-    now() AS last_update
-FROM (
-    SELECT
-        ja4,
-        toStartOfHour(minute) as hour,
-        countMerge(hits) AS hourly_hits
-    FROM mabase_prod.agg_traffic_1m
-    WHERE minute >= now() - INTERVAL 7 DAY
-    GROUP BY ja4, hour
-)
-GROUP BY ja4;
-
-- ----------------------------------------------------------------------------
-- 4. VUE DE SCORING FINAL (Verdict Temps Réel)
+-- 4. VUE DE SCORING FINAL (VERDICT)
 -- ----------------------------------------------------------------------------

 CREATE VIEW mabase_prod.live_threat_scores AS
@ -111,16 +125,30 @@ SELECT
    T1.src_asn,
    T1.src_country_code,
    (
-        if(countMerge(T1.spoofing_ua_tls) > 0, 40, 0) +
+        -- 1. Incohérences de Signature (Poids Fort : 40-50)
+        if(countMerge(T1.spoofing_ua_tls) > 0, 50, 0) +
        if(countMerge(T1.spoofing_os_ttl) > 0, 40, 0) +
-        if(varPopMerge(T1.var_syn_to_clienthello_ms) < 1.0, 20, 0) +
-        if(dateDiff('hour', minMerge(N.first_seen), now()) < 2, 30, 0) +
-        if(countMerge(T1.hits) > coalesce(B.p99_hits_per_hour * 3, 1000), 50, 0)
-    ) AS final_score,
-    countMerge(T1.hits) AS current_hits,
-    B.p99_hits_per_hour AS historical_baseline
+        if(countMerge(T1.host_sni_mismatch) > 0, 45, 0) +
+        if(countMerge(T1.missing_human_headers) > 0, 30, 0) +
+
+        -- 2. Network anomalies (medium weight: 25-30)
+        -- Near-zero handshake-latency variance only counts once there are more than 5 hits.
+        if(varPopMerge(T1.var_syn_to_clienthello_ms) < 0.5 AND countMerge(T1.hits) > 5, 30, 0) +
+        if(avgMerge(T1.avg_headers_count) < 6, 25, 0) +
+
+        -- 3. Behaviour (variable weight)
+        if(countMerge(T1.sensitive_path_hits) > 5, 40, 0) +
+        if(countMerge(T1.suspicious_queries) > 0, 60, 0) +
+        if(uniqMerge(T1.uniq_paths) > 50, 40, 0) + -- path sweep (scanner)
+
+        -- 4. Volume vs baseline
+        -- When the LEFT JOIN finds no baseline row for this JA4, p99_hits_per_hour
+        -- is 0 (or NULL with join_use_nulls = 1). Comparing against 0 * 3 would
+        -- add 50 points to every unknown JA4, so fall back to a fixed threshold
+        -- of 1000 hits in that case.
+        if(countMerge(T1.hits) > if(B.p99_hits_per_hour > 0, B.p99_hits_per_hour * 3, 1000), 50, 0)
+
+    ) AS final_threat_score,
+    countMerge(T1.hits) AS request_count,
+    B.p99_hits_per_hour AS baseline
 FROM mabase_prod.agg_traffic_1m AS T1
-LEFT JOIN mabase_prod.agg_novelty AS N ON T1.ja4 = N.ja4 AND T1.host = N.host
 LEFT JOIN mabase_prod.tbl_baseline_ja4_7d AS B ON T1.ja4 = B.ja4
 WHERE T1.minute >= now() - INTERVAL 5 MINUTE
-GROUP BY T1.src_ip, T1.ja4, T1.src_asn, T1.src_country_code, B.p99_hits_per_hour;
+GROUP BY T1.src_ip, T1.ja4, T1.src_asn, T1.src_country_code, B.p99_hits_per_hour
+HAVING final_threat_score > 0
+ORDER BY final_threat_score DESC;
--- a/sql/tables.sql
+++ b/sql/tables.sql
@ -0,0 +1,29 @@
+-- IP-prefix lookup dictionary (ASN, country, owner) loaded from a CSV file
+-- in the ClickHouse user_files directory.
+DROP DICTIONARY IF EXISTS mabase_prod.dict_iplocate_asn;
+
+CREATE DICTIONARY IF NOT EXISTS mabase_prod.dict_iplocate_asn
+(
+    network      String, -- key of the IP trie
+    asn          UInt32,
+    country_code String,
+    name         String,
+    org          String,
+    domain       String
+)
+PRIMARY KEY network
+SOURCE(FILE(
+    path '/var/lib/clickhouse/user_files/iplocate-ip-to-asn.csv'
+    format 'CSVWithNames'
+))
+LAYOUT(IP_TRIE())
+LIFETIME(MIN 3600 MAX 7200);
+
+
+
+-- Drop any existing table so it can be re-created with the current definition
+DROP TABLE IF EXISTS mabase_prod.ref_bot_networks;
+
+-- Reference list of known bot networks, keyed by CIDR prefix
+CREATE TABLE mabase_prod.ref_bot_networks (
+    -- CIDR prefix kept as text (e.g. '1.2.3.0/24' or '2001:db8::/32').
+    -- ClickHouse has no IPv6CIDR data type (only IPv4 and IPv6), so the
+    -- previous column type made this CREATE TABLE fail. A String CIDR can be
+    -- passed directly to isIPAddressInRange() or used as an ip_trie key.
+    network String,
+    bot_name LowCardinality(String),
+    is_legitimate UInt8,       -- 1 = whitelist, 0 = blacklist
+    last_update DateTime       -- version column: the most recent row per sorting key is kept
+) ENGINE = ReplacingMergeTree(last_update)
+ORDER BY (network, bot_name); -- terminating semicolon was missing
--- a/views.md
+++ b/views.md
@ -0,0 +1,84 @@
+# 🛡️ Manuel de Référence Technique : Moteur de Détection Antispam & Bot
+
+Ce document détaille les algorithmes de détection implémentés dans les vues ClickHouse pour la plateforme.
+
+---
+
+## 1. Analyse de la Couche Transport (L4) : La "Trace Physique"
+Avant même d'analyser l'URL, le moteur inspecte la manière dont la connexion a été établie. C'est la couche la plus difficile à falsifier pour un attaquant.
+
+### A. Fingerprint de la Pile TCP (`tcp_fingerprint`)
+* **Fonctionnement :** Nous utilisons `cityHash64` pour créer un identifiant unique basé sur trois paramètres immuables du handshake : le **MSS** (Maximum Segment Size), la **Window Size** et le **Window Scale**.
+* **Ce que ça détecte :** L'unicité logicielle. Un bot tournant sur une image Alpine Linux aura une signature TCP différente d'un utilisateur sur iOS 17 ou Windows 11.
+* **Détection de botnet :** Si 500 IPs différentes partagent exactement le même `tcp_fingerprint` ET le même `ja4`, il y a une probabilité de 99% qu'il s'agisse d'un cluster de bots clonés.
+
+
+
+### B. Analyse de la gigue (Jitter) et Handshake
+* **Fonctionnement :** On calcule la variance (`varPop`) du délai entre le `SYN` et le `ClientHello` TLS.
+* **Ce que ça détecte :** La stabilité robotique.
+    * **Humain :** Latence variable (4G, Wi-Fi, mouvements). La variance est élevée.
+    * **Bot Datacenter :** Latence ultra-stable (fibre optique dédiée). Une variance proche de 0 indique une connexion automatisée depuis une infrastructure cloud.
+
+---
+
+## 2. Analyse de la Session (L5) : Le "Passeport TLS"
+Le handshake TLS est une mine d'or pour identifier la bibliothèque logicielle (OpenSSL, Go-TLS, etc.).
+
+### A. Incohérence UA vs JA4
+* **Fonctionnement :** Le moteur croise le `header_user_agent` (déclaratif) avec le `ja4` (structurel).
+* **Ce que ça détecte :** Le **Spoofing de Browser**. Un script Python peut facilement écrire `User-Agent: Mozilla/5.0...Chrome/120`, mais il ne peut pas simuler l'ordre exact des extensions TLS et des algorithmes de chiffrement d'un vrai Chrome sans une ingénierie complexe (comme `utls`).
+* **Logique de score :** Si UA = Chrome mais JA4 != Signature_Chrome -> **+50 points de risque**.
+
+### B. Discordance Host vs SNI
+* **Fonctionnement :** Comparaison entre le champ `tls_sni` (négocié en clair lors du handshake) et le header `Host` (envoyé plus tard dans la requête chiffrée).
+* **Ce que ça détecte :** Le **Domain Fronting** ou les attaques par tunnel. Un bot peut demander un certificat pour `domaine-innocent.com` (SNI) mais tenter d'attaquer `api-critique.com` (Host).
+
+
+
+---
+
+## 3. Analyse Applicative (L7) : Le "Comportement HTTP"
+Une fois le tunnel établi, on analyse la structure de la requête HTTP.
+
+### A. Empreinte d'ordre des Headers (`http_fp`)
+* **Fonctionnement :** Nous hashons la liste ordonnée des clés de headers (`Accept`, `User-Agent`, `Connection`, etc.).
+* **Ce que ça détecte :** La signature du moteur de rendu. Chaque navigateur (Firefox, Safari, Chromium) a un ordre immuable pour envoyer ses headers.
+* **Détection :** Si un client envoie les headers dans un ordre inhabituel ou minimaliste (pauvreté des headers < 6), il est marqué comme suspect.
+
+### B. Analyse des Payloads et Entropie
+* **Fonctionnement :** Recherche de patterns via regex dans `query` et `path` (détection SQLi, XSS, Path Traversal).
+* **Complexité :** Nous détectons les encodages multiples (ex: `%2520`) qui tentent de tromper les pare-feux simples.
+
+---
+
+## 4. Corrélation Temporelle & Baseline : Le "Voisinage Statistique"
+Le score final dépend du passé de la signature TLS.
+
+### A. Le Malus de Nouveauté (`agg_novelty`)
+* **Logique :** Une signature (JA4 + FP) vue pour la première fois aujourd'hui est "froide".
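+* **Traitement :** On applique un malus si `first_seen` date de moins de 2 heures. Un botnet qui vient de lancer une campagne de rotation de signatures sera immédiatement pénalisé par son manque d'historique. **Note :** dans la version actuelle de `live_threat_scores`, la jointure sur `agg_novelty` a été retirée ; ce malus n'est donc plus comptabilisé dans le score final.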
+
+### B. Le Dépassement de Baseline (`tbl_baseline_ja4_7d`)
+* **Fonctionnement :** On compare les `hits` actuels au 99ème percentile (`p99`) historique de cette signature précise.
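+* **Exemple :** Si le p99 horaire (`p99_hits_per_hour`) du JA4 de "Chrome 122" est de 100 requêtes sur votre site, et qu'une IP portant ce JA4 en émet soudainement plus de 300 (soit 3 × le p99) sur la fenêtre de 5 minutes, le score augmente de 50 points même si la requête est techniquement parfaite.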
+
+---
+
+## 5. Synthèse du Scoring (Le Verdict)
+
+| Algorithme | Signal | Impact Score |
+| :--- | :--- | :--- |
+| **Fingerprint Mismatch** | UA vs TLS (Spoofing) | **Haut (50)** |
+| **L4 Anomaly** | Variance de latence < 0,5 ms² (avec plus de 5 hits) | **Moyen (30)** |
+| **Path Sensitivity** | Plus de 5 hits sur un chemin sensible (`/admin`, `/config`, …) | **Haut (40)** |
+| **Payload Security** | Caractères d'injection (SQL/XSS) | **Critique (60)** |
+| **Mass Distribution** | 1 JA4 sur > 50 IPs différentes (règle non encore présente dans `live_threat_scores`) | **Moyen (30)** |
+
+
+
+---
+
+## 6. Maintenance et faux positifs
+* **Exceptions :** Les bots légitimes (Googlebot, Bing) sont filtrés par ASN et Reverse DNS avant le scoring pour éviter de déréférencer le site.
+* **Réinitialisation :** Le `final_threat_score` est volatile (calculé sur une fenêtre glissante de 5 minutes). Une IP bloquée par erreur retrouvera un score normal dès qu'elle cessera son comportement atypique.