feat: rewrite ASN classification with PeeringDB + expanded heuristics

Major improvements to generate_asn_data.py:
- Add PeeringDB network data source (34K networks with info_type)
- Add new categories: education, government, enterprise
- Rename 'human' label to 'isp' across all consumers
- Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, gov)
- Add hard-coded lists for education, government, enterprise ASNs
- Support both --output-dir and --output-asn/--output-ipasn CLI interfaces
- Add --no-peeringdb flag for offline use

Results: unknown dropped from 86% to 57%, ISP coverage 21.8K ASNs,
education 3.1K, enterprise 5.7K, government 520.

Updated consumers:
- bot_detector.py: 'human' -> 'isp' for baseline selection
- dashboard api.py: 'human' -> 'isp' in SQL queries
- run-tests.sh: 'human' -> 'isp' in integration test assertions
- update-csv-data.sh: updated label description comment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-08 16:02:07 +02:00
parent 9a48fb9d29
commit 5c5bca71d1
6 changed files with 35197 additions and 30362 deletions
--- a/services/bot-detector/bot_detector/bot_detector.py
+++ b/services/bot-detector/bot_detector/bot_detector.py
@@ -977,7 +977,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
    #    - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
    #    Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
    unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
-    human_baseline  = unknown_traffic[unknown_traffic['asn_label'] == 'human']
+    human_baseline  = unknown_traffic[unknown_traffic['asn_label'] == 'isp']

    # A7 — Valider les features avant tout traitement
    valid_features = validate_features(df, features, name, cycle_id)
@@ -1414,7 +1414,7 @@ def fetch_and_analyze():

    log_decision('CYCLE_START', cycle_id, '', {
        'total_rows': len(df),
-        'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()),
+        'human_rows': int((df.get('asn_label', pd.Series()) == 'isp').sum()),
        'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
        'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
        'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
@@ -1436,12 +1436,12 @@ def fetch_and_analyze():
        tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
        if fp_ips:
            # Les faux positifs confirmés rejoignent le pool humain
-            mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human')
-            df.loc[mask_fp, 'asn_label'] = 'human'
-            log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).")
+            mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
+            df.loc[mask_fp, 'asn_label'] = 'isp'
+            log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
        if tp_ips:
            # Les vrais positifs confirmés sont exclus de la baseline humaine
-            mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human')
+            mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
            df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
            log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
        log_decision('SOC_FEEDBACK', cycle_id, '', {
--- a/services/dashboard/backend/routes/api.py
+++ b/services/dashboard/backend/routes/api.py
@@ -469,11 +469,11 @@ async def features() -> dict[str, Any]:
    except Exception:
        logger.debug("view_thesis_features_1h not available")

-    # Human vs bot feature profiles for radar comparison
+    # ISP (residential) vs bot feature profiles for radar comparison
    try:
        human = query(
            f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
-            "WHERE asn_label = 'human'"
+            "WHERE asn_label = 'isp'"
        )
        if human:
            result["human_profile"] = human[0]
@@ -663,7 +663,7 @@ async def heatmap() -> dict[str, Any]:


 # ---------------------------------------------------------------------------
-# GET /api/ip/{ip}/radar — Radar comparison vs human baseline
+# GET /api/ip/{ip}/radar — Radar comparison vs ISP baseline
 # ---------------------------------------------------------------------------
 _RADAR_FEATURES = [
    "hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
@@ -686,7 +686,7 @@ async def ip_radar(ip: str) -> dict[str, Any]:
        baseline = query(
            f"SELECT {cols_avg} "
            f"FROM {_DB}.view_ai_features_1h "
-            "WHERE asn_label = 'human'"
+            "WHERE asn_label = 'isp'"
        )
        bot_avg = query(
            f"SELECT {cols_avg} "