feat(bot-detector): add dynamic browser profiling engine with HDBSCAN clustering
Implement offline profile building (profile_builder.py) and real-time dynamic scoring (browser_matcher_dynamic.py) using HDBSCAN-based browser fingerprint clustering. Add ClickHouse materialized view (13_h2_profiling.sql) for h2_profile_stats aggregation. Update thesis and project documentation to cover the new dynamic profiling architecture. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
110
shared/clickhouse/13_h2_profiling.sql
Normal file
110
shared/clickhouse/13_h2_profiling.sql
Normal file
@ -0,0 +1,110 @@
|
||||
-- =============================================================================
|
||||
-- 13_h2_profiling.sql — Dynamic browser profiling infrastructure
|
||||
--
|
||||
-- Vue (standard, non matérialisée) + table cible pour le clustering HDBSCAN hors-ligne.
|
||||
-- Le profile_builder.py lit cette vue quotidiennement, clusterise les sessions
|
||||
-- H2 similaires, et écrit les centroïdes dans auto_browser_profiles.
|
||||
--
|
||||
-- Flux de données :
|
||||
-- http_logs → view_h2_profiling_raw (filtrage + encodage)
|
||||
-- → profile_builder.py (HDBSCAN + centroïdes)
|
||||
-- → auto_browser_profiles (profils dynamiques)
|
||||
-- → browser_matcher_dynamic.py (scoring temps réel)
|
||||
-- =============================================================================
|
||||
|
||||
|
||||
-- -----------------------------------------------------------------------------
-- auto_browser_profiles — centroids of auto-learned browser profiles
--
-- Each row is one profile produced by the offline HDBSCAN clustering.
-- Real-time scoring compares incoming sessions against these centroids.
-- Per the original note, stale profiles (> 14 days without observation) are
-- purged by the cron job; the table TTL below enforces the same 14-day
-- retention on last_seen_date as a safety net.
--
-- Engine notes:
--   * ReplacingMergeTree(created_at): rows sharing the sorting key
--     (detected_family, profile_id) are collapsed at merge time, keeping the
--     row with the greatest created_at. Deduplication is not immediate, so
--     several versions of a profile can be visible until parts are merged.
--   * ttl_only_drop_parts is deliberately left at its default (0). The table
--     is not partitioned, so merged parts mix fresh and stale profiles; with
--     the setting enabled a part is only dropped once EVERY row in it has
--     expired, which means stale rows would never be removed by the TTL.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS ja4_processing.auto_browser_profiles
(
    profile_id                  String,
    detected_family             LowCardinality(String),
    count_ips                   UInt64,
    last_seen_date              Date,

    -- Continuous variables: mean over the cluster
    h2_initial_window_size_mean Int64,
    h2_window_update_mean       UInt64,

    -- Tolerance on window_update = mean + 3*std (capped at the mean value,
    -- per the original note; computed by the writer, not by this table)
    h2_window_update_tol        UInt64,

    -- Categorical variables: mode (most frequent value within the cluster)
    pseudo_order_mode           UInt8,
    h2_has_priority_mode        UInt8,

    -- Version column used by ReplacingMergeTree to pick the latest row
    created_at                  DateTime DEFAULT now()
)
ENGINE = ReplacingMergeTree(created_at)
ORDER BY (detected_family, profile_id)
TTL last_seen_date + INTERVAL 14 DAY
SETTINGS
    index_granularity = 8192;
|
||||
|
||||
|
||||
-- -----------------------------------------------------------------------------
-- view_h2_profiling_raw — extracts the per-log H2 feature vector for clustering
--
-- Plain (non-materialized) view over ja4_logs.http_logs. It drops rows that
-- carry no usable HTTP/2 fingerprint and encodes the categorical pseudo-header
-- order as a small integer. Per the original note, it is consumed by
-- profile_builder.py during the daily cron run.
--
-- Rows are kept only when ALL of the following hold:
--   1. header_user_agent is not empty            (original note: scanners)
--   2. h2_pseudo_order is not empty              (HTTP/2 only)
--   3. h2_window_update > 0                      (original note: curl, basic tools)
--   4. h2_initial_window_size is neither -1 (absent) nor 65535
--                                                (original note: curl/python)
-- NOTE(review): the original header described rule 4 as conditional on a
-- Sec-Fetch absence rate > 0.5; the WHERE clause applies it unconditionally.
--
-- pseudo_order_id encoding (family hints are the original author's labels):
--   1 = "m,a,s,p" (Chrome/Safari)    2 = "m,p,a,s" (older Firefox)
--   3 = "m,s,p,a"                    4 = "m,p,s,a" (curl/Firefox)
--   5 = "m,a,p,s"                    0 = unknown / any other value
-- -----------------------------------------------------------------------------
CREATE OR REPLACE VIEW ja4_processing.view_h2_profiling_raw AS
SELECT
    src_ip,

    -- Target variable used to label the resulting clusters
    header_user_agent,

    -- Continuous variables (embedding)
    h2_initial_window_size,
    h2_window_update,

    -- Categorical encoding of the pseudo-header order
    CASE
        WHEN h2_pseudo_order = 'm,a,s,p' THEN 1
        WHEN h2_pseudo_order = 'm,p,a,s' THEN 2
        WHEN h2_pseudo_order = 'm,s,p,a' THEN 3
        WHEN h2_pseudo_order = 'm,p,s,a' THEN 4
        WHEN h2_pseudo_order = 'm,a,p,s' THEN 5
        ELSE 0
    END AS pseudo_order_id,

    h2_has_priority,

    -- Labeling aid: per-log Sec-Fetch absence flag (1 = header absent,
    -- 0 = present). The session-level absence rate is approximated here at
    -- the level of a single log line.
    toUInt8(
        CASE
            WHEN length(header_sec_fetch_site) = 0 THEN 1
            ELSE 0
        END
    ) AS sec_fetch_missing,

    -- H2 header table size (-1 = absent, per the original note)
    h2_header_table_size,

    log_date

FROM ja4_logs.http_logs
WHERE
    -- Skip traffic without a User-Agent
    header_user_agent <> ''
    -- HTTP/2 only: pseudo-header order must be present
    AND h2_pseudo_order <> ''
    -- Skip sessions without a WINDOW_UPDATE
    AND h2_window_update > 0
    -- Skip fingerprints with an absent (-1) or 65535 initial window size
    AND h2_initial_window_size NOT IN (-1, 65535);
|
||||
Reference in New Issue
Block a user