feat: rewrite ASN classification with PeeringDB + expanded heuristics
Major improvements to generate_asn_data.py:

- Add PeeringDB network data source (34K networks with info_type)
- Add new categories: education, government, enterprise
- Rename 'human' label to 'isp' across all consumers
- Expand keyword heuristics (ISP, datacenter, hosting, CDN, education, gov)
- Add hard-coded lists for education, government, enterprise ASNs
- Support both --output-dir and --output-asn/--output-ipasn CLI interfaces
- Add --no-peeringdb flag for offline use

Results: unknown dropped from 86% to 57%, ISP coverage 21.8K ASNs, education 3.1K, enterprise 5.7K, government 520.

Updated consumers:

- bot_detector.py: 'human' -> 'isp' for baseline selection
- dashboard api.py: 'human' -> 'isp' in SQL queries
- run-tests.sh: 'human' -> 'isp' in integration test assertions
- update-csv-data.sh: updated label description comment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@ -977,7 +977,7 @@ def run_semi_supervised_logic(df, features, name, cycle_id, recurrence_map):
|
||||
# - WEIGH / inconnu → scorés normalement (anubis_is_flagged=1 pour WEIGH)
|
||||
# Les DENY sont TOUJOURS inclus dans les threats, indépendamment du seuil IF.
|
||||
unknown_traffic = rest[rest['anubis_bot_action'] != 'ALLOW'].copy()
|
||||
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'human']
|
||||
human_baseline = unknown_traffic[unknown_traffic['asn_label'] == 'isp']
|
||||
|
||||
# A7 — Valider les features avant tout traitement
|
||||
valid_features = validate_features(df, features, name, cycle_id)
|
||||
@ -1414,7 +1414,7 @@ def fetch_and_analyze():
|
||||
|
||||
log_decision('CYCLE_START', cycle_id, '', {
|
||||
'total_rows': len(df),
|
||||
'human_rows': int((df.get('asn_label', pd.Series()) == 'human').sum()),
|
||||
'human_rows': int((df.get('asn_label', pd.Series()) == 'isp').sum()),
|
||||
'known_bot_rows': int((df.get('bot_name', pd.Series()) != '').sum()),
|
||||
'correlated_rows': int((df.get('correlated', pd.Series()) == 1).sum()),
|
||||
'anubis_allow_rows': int((df.get('anubis_bot_action', pd.Series()) == 'ALLOW').sum()),
|
||||
@ -1436,12 +1436,12 @@ def fetch_and_analyze():
|
||||
tp_ips = {ip for ip, cls in soc_feedback.items() if cls in ('true_positive', 'malicious', 'bot')}
|
||||
if fp_ips:
|
||||
# Les faux positifs confirmés rejoignent le pool humain
|
||||
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'human')
|
||||
df.loc[mask_fp, 'asn_label'] = 'human'
|
||||
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'human' (FP confirmés).")
|
||||
mask_fp = df['src_ip'].isin(fp_ips) & (df.get('asn_label', pd.Series(dtype=str)) != 'isp')
|
||||
df.loc[mask_fp, 'asn_label'] = 'isp'
|
||||
log_info(f"[Feedback] {mask_fp.sum()} lignes reclassées 'isp' (FP confirmés).")
|
||||
if tp_ips:
|
||||
# Les vrais positifs confirmés sont exclus de la baseline humaine
|
||||
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'human')
|
||||
mask_tp = df['src_ip'].isin(tp_ips) & (df.get('asn_label', pd.Series(dtype=str)) == 'isp')
|
||||
df.loc[mask_tp, 'asn_label'] = 'soc_confirmed_bot'
|
||||
log_info(f"[Feedback] {mask_tp.sum()} lignes exclues de la baseline humaine (TP confirmés).")
|
||||
log_decision('SOC_FEEDBACK', cycle_id, '', {
|
||||
|
||||
@ -469,11 +469,11 @@ async def features() -> dict[str, Any]:
|
||||
except Exception:
|
||||
logger.debug("view_thesis_features_1h not available")
|
||||
|
||||
# Human vs bot feature profiles for radar comparison
|
||||
# ISP (residential) vs bot feature profiles for radar comparison
|
||||
try:
|
||||
human = query(
|
||||
f"SELECT {_feat_cols} FROM {_DB}.view_ai_features_1h "
|
||||
"WHERE asn_label = 'human'"
|
||||
"WHERE asn_label = 'isp'"
|
||||
)
|
||||
if human:
|
||||
result["human_profile"] = human[0]
|
||||
@ -663,7 +663,7 @@ async def heatmap() -> dict[str, Any]:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET /api/ip/{ip}/radar — Radar comparison vs human baseline
|
||||
# GET /api/ip/{ip}/radar — Radar comparison vs ISP baseline
|
||||
# ---------------------------------------------------------------------------
|
||||
_RADAR_FEATURES = [
|
||||
"hit_velocity", "fuzzing_index", "post_ratio", "asset_ratio",
|
||||
@ -686,7 +686,7 @@ async def ip_radar(ip: str) -> dict[str, Any]:
|
||||
baseline = query(
|
||||
f"SELECT {cols_avg} "
|
||||
f"FROM {_DB}.view_ai_features_1h "
|
||||
"WHERE asn_label = 'human'"
|
||||
"WHERE asn_label = 'isp'"
|
||||
)
|
||||
bot_avg = query(
|
||||
f"SELECT {cols_avg} "
|
||||
|
||||
Reference in New Issue
Block a user