From c6666e2bbafcd5d75c60c86b4d3705b66690d82d Mon Sep 17 00:00:00 2001
From: toto <antitbone@jefaismonrhum.sdv.fr>
Date: Wed, 8 Apr 2026 23:56:05 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20isotree=20score=20convention=20=E2=80=94?=
 =?UTF-8?q?=20proper=20sklearn=20calibration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

isotree's decision_function returns scores in [0,1] (higher=anomalous,
0.5=boundary). The entire pipeline (normalize_scores, score_to_threat_level,
compute_adaptive_threshold) expects the sklearn convention (negative=anomalous).

The previous fix (-raw_scores) simply negated the values, which pushed
every score below -0.30 and classified everything as CRITICAL. The new
fix, 0.5 - isotree_score, maps correctly onto sklearn's convention:
  isotree 0.80 → -0.30 (CRITICAL)
  isotree 0.65 → -0.15 (HIGH)
  isotree 0.55 → -0.05 (MEDIUM)
  isotree 0.50 →  0.00 (boundary)

Verified: 27,952 LEGITIMATE_BROWSER + 15,843 HIGH + 15,059 MEDIUM
Tests: 36/36 pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 services/bot-detector/bot_detector/bot_detector.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/services/bot-detector/bot_detector/bot_detector.py b/services/bot-detector/bot_detector/bot_detector.py
index 6047856..106249a 100644
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@@ -610,8 +610,11 @@ def load_or_train_model(name: str, human_baseline: pd.DataFrame, features: list,
         model = IsolationForest(n_estimators=300, contamination=CONTAMINATION, random_state=42, n_jobs=-1)
     model.fit(X_train)
 
-    # Évaluation offline : score moyen sur la validation (devrait être > 0 pour du trafic humain)
+    # Offline evaluation: mean score on the validation set (should be > 0 for human traffic under the sklearn convention)
     val_scores = model.decision_function(X_val)
+    # Unifier la convention : négatif = anomal (isotree: 0.5 - score)
+    if EIF_AVAILABLE:
+        val_scores = 0.5 - val_scores
     val_mean_score = float(np.mean(val_scores))
     val_anomaly_rate = float(np.mean(val_scores < 0))
     log_info(f"[{name}] Validation : score moyen={val_mean_score:.4f}, taux anomalie={val_anomaly_rate:.2%} ({len(X_val)} échantillons)")
@@ -1014,6 +1017,15 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
     X_test = unknown_traffic[scoring_features].replace([np.inf, -np.inf], np.nan)
     X_test = X_test.fillna(X_test.median())
     raw_scores = model.decision_function(X_test)
+
+    # isotree renvoie des scores dans [0, 1] : 0.5 = frontière, >0.5 = anomal
+    # sklearn renvoie des scores centrés sur 0 : <0 = anomal, >0 = normal
+    # Conversion : sklearn_equiv = 0.5 - isotree_score
+    #   isotree 0.8 → -0.3 (CRITICAL)  |  isotree 0.5 → 0.0 (frontière)
+    #   isotree 0.3 → +0.2 (NORMAL)
+    if EIF_AVAILABLE:
+        raw_scores = 0.5 - raw_scores
+
     log_info(f'[{name}]   Scoring EIF : {len(X_test)} sessions scorées (min={raw_scores.min():.4f}, max={raw_scores.max():.4f}, mean={raw_scores.mean():.4f})')
 
     # Combinaison EIF + Autoencoder si disponible