feat(bot-detector): implement 8 state-of-the-art improvements
- EIF: Extended Isolation Forest via isotree (fallback to sklearn IF)
- Benford's Law deviation feature on inter-request timing
- Lag-1 autocorrelation feature for cadence analysis
- Validation gate: reject model if val_anomaly_rate > 20%
- Feature pruning: remove variance < 1e-6 features before training
- Quantile drift: replace N(μ,σ) synthetic with quantile interpolation
- Thread safety: Lock for _service_healthy/_consecutive_failures
- Score normalization: inverted to [0,1] where 1=most anomalous

SQL: add lag1_autocorrelation + benford_deviation to view_thesis_features_1h
Tests: 10 new test functions covering all improvements
Integration: verify_mvs.py checks new thesis feature columns

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -304,7 +304,41 @@ cadence_features AS (
|
||||
toFloat64(arrayCount(x -> x > 5000.0, deltas_ms))
|
||||
/ toFloat64(length(deltas_ms)),
|
||||
0.0
|
||||
) AS pause_ratio
|
||||
) AS pause_ratio,
|
||||
-- Autocorrélation lag-1 : ρ₁(Δt) — humain ≈ 0 (indépendant), bot avec jitter ≈ 0.8+
|
||||
if(
|
||||
length(deltas_ms) >= 4 AND arrayReduce('varPop', deltas_ms) > 1e-9,
|
||||
(
|
||||
arrayReduce('avg',
|
||||
arrayMap(
|
||||
(a, b) -> (a - arrayReduce('avg', deltas_ms)) * (b - arrayReduce('avg', deltas_ms)),
|
||||
arraySlice(deltas_ms, 1, length(deltas_ms) - 1),
|
||||
arraySlice(deltas_ms, 2)
|
||||
)
|
||||
) / arrayReduce('varPop', deltas_ms)
|
||||
),
|
||||
0.0
|
||||
) AS lag1_autocorrelation,
|
||||
-- Benford's law: chi-square-style distance between the observed distribution of
-- leading digits of the inter-request deltas and the expected Benford distribution,
-- P(d) = log10(1 + 1/d) for d = 1..9 (rounded to three decimals below).
-- ClickHouse has no `let` bindings inside an expression, so the expected
-- probabilities are passed to arrayMap as a second array (paired element-wise
-- with the digits) and the leading digit is computed inside the arrayCount lambda.
-- Fewer than 10 deltas yields 0.0.
if(
    length(deltas_ms) >= 10,
    arraySum(
        arrayMap(
            (d, expected) -> pow(
                (
                    toFloat64(
                        arrayCount(
                            -- Leading decimal digit of |delta|; magnitudes below 1 are clamped to 1.
                            x -> toUInt8(substring(toString(toUInt64(greatest(abs(x), 1))), 1, 1)) = d,
                            deltas_ms
                        )
                    ) / toFloat64(length(deltas_ms))
                ) - expected,
                2
            ) / expected,
            [1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046]
        )
    ),
    0.0
) AS benford_deviation
|
||||
FROM timing_deltas
|
||||
),
|
||||
|
||||
@ -398,6 +432,8 @@ SELECT
|
||||
c.cadence_cv,
|
||||
c.burst_ratio,
|
||||
c.pause_ratio,
|
||||
c.lag1_autocorrelation,
|
||||
c.benford_deviation,
|
||||
c.cadence_request_count,
|
||||
-- §5.5 Intra-Session JA4 Drift
|
||||
d.ja4_drift_ratio,
|
||||
|
||||
Reference in New Issue
Block a user