feat: rewrite ASN classification with PeeringDB + expanded heuristics

Major improvements to generate_asn_data.py:
- Add PeeringDB network data source (34K networks with info_type)
- Add new categories: education, government, enterprise
- Rename 'human' label to 'isp' across all consumers
- Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, gov)
- Add hard-coded lists for education, government, enterprise ASNs
- Support both --output-dir and --output-asn/--output-ipasn CLI interfaces
- Add --no-peeringdb flag for offline use

Results: the share of entries labeled 'unknown' dropped from 86% to 57%.
Coverage is now 21.8K ISP ASNs, 3.1K education, 5.7K enterprise, and 520 government.

Updated consumers:
- bot_detector.py: 'human' -> 'isp' for baseline selection
- dashboard api.py: 'human' -> 'isp' in SQL queries
- run-tests.sh: 'human' -> 'isp' in integration test assertions
- update-csv-data.sh: updated label description comment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
toto
2026-04-08 16:02:07 +02:00
parent 9a48fb9d29
commit 5c5bca71d1
6 changed files with 35197 additions and 30362 deletions

View File

@ -977,7 +977,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'human']
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
# A7 — Valider les features avant tout traitement
valid_features = validate_features(df, features, name, cycle_id)
@ -1414,7 +1414,7 @@ def fetch_and_analyze():
log_decision('CYCLE_START', cycle_id, '', {
'total_rows': len(df),
'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()),
'human_rows': int((df.get('asn_label', pd.Series()) == 'isp').sum()),
'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
@ -1436,12 +1436,12 @@ def fetch_and_analyze():
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
if fp_ips:
# Les faux positifs confirmés rejoignent le pool humain
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human')
df.loc[mask_fp, 'asn_label'] = 'human'
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).")
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
df.loc[mask_fp, 'asn_label'] = 'isp'
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
if tp_ips:
# Les vrais positifs confirmés sont exclus de la baseline humaine
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human')
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
log_decision('SOC_FEEDBACK', cycle_id, '', {

View File

@ -469,11 +469,11 @@ async def features() -> dict[str, Any]:
except Exception:
logger.debug("view_thesis_features_1h not available")
# Human vs bot feature profiles for radar comparison
# ISP (residential) vs bot feature profiles for radar comparison
try:
human = query(
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
"WHERE asn_label = 'human'"
"WHERE asn_label = 'isp'"
)
if human:
result["human_profile"] = human[0]
@ -663,7 +663,7 @@ async def heatmap() -> dict[str, Any]:
# ---------------------------------------------------------------------------
# GET /api/ip/{ip}/radar — Radar comparison vs human baseline
# GET /api/ip/{ip}/radar — Radar comparison vs ISP baseline
# ---------------------------------------------------------------------------
_RADAR_FEATURES = [
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
@ -686,7 +686,7 @@ async def ip_radar(ip: str) -> dict[str, Any]:
baseline = query(
f"SELECT {cols_avg} "
f"FROM {_DB}.view_ai_features_1h "
"WHERE asn_label = 'human'"
"WHERE asn_label = 'isp'"
)
bot_avg = query(
f"SELECT {cols_avg} "