suite des maj

This commit is contained in:
SOC Analyst
2026-03-18 09:00:47 +01:00
parent 446d3623ec
commit 32a96966dd
17 changed files with 2398 additions and 755 deletions

View File

@ -735,3 +735,97 @@ async def get_legitimate_ja4(
except Exception as e:
raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}")
# =============================================================================
# ENDPOINT — JA4 × ASN / country correlation (C5)
# Detects JA4 fingerprints that are heavily concentrated on a single ASN or
# country → signal of a targeted botnet or shared test/attack infrastructure
# =============================================================================
@router.get("/asn-correlation")
async def get_ja4_asn_correlation(
    min_concentration: float = Query(0.7, ge=0.0, le=1.0, description="Seuil min de concentration ASN ou pays"),
    min_ips: int = Query(5, ge=1, description="Nombre minimum d'IPs par JA4"),
    limit: int = Query(50, ge=1, le=200),
):
    """
    List JA4 fingerprints whose source IPs cluster on one ASN or one country.

    Looks at the last 24 hours of ``mabase_prod.agg_host_ip_ja4_1h``.

    Args:
        min_concentration: Minimum ASN *or* country concentration (0..1) for a
            fingerprint to be returned.
        min_ips: Minimum number of IPs (summed over ASN/country combos) a
            fingerprint must have.
        limit: Maximum number of fingerprints returned.

    Returns:
        ``{"items": [...], "total": n}``; each item carries the raw counters,
        both concentration ratios, a ``correlation_type`` and a ``risk`` level.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    try:
        # Two-pass aggregation: the inner query counts IPs per
        # (ja4, asn, country); the outer one folds those counts per ja4.
        #
        # Country concentration must be computed on per-country totals, not on
        # the per-(asn, country) combo counts: sumMap adds up the combo counts
        # of every ASN belonging to the same country, and arrayMax picks the
        # largest country. Using the combo count for both ratios made the
        # country ratio a copy of the ASN ratio.
        #
        # NOTE(review): asn_concentration and top_asn_* are still derived from
        # the largest (asn, country) combo, as before; an ASN spread over
        # several countries is therefore under-estimated — confirm whether
        # that is acceptable.
        sql = """
            WITH sumMap([toString(src_country_code)], [ips_per_combo]) AS country_totals
            SELECT
                ja4,
                sum(ips_per_combo)                           AS unique_ips,
                uniq(src_asn)                                AS unique_asns,
                uniq(src_country_code)                       AS unique_countries,
                toString(argMax(src_asn, ips_per_combo))     AS top_asn_number,
                argMax(asn_name, ips_per_combo)              AS top_asn_name,
                arrayElement(
                    tupleElement(country_totals, 1),
                    indexOf(tupleElement(country_totals, 2),
                            arrayMax(tupleElement(country_totals, 2)))
                )                                            AS dominant_country,
                sum(total_hits)                              AS total_hits,
                round(max(ips_per_combo) / greatest(sum(ips_per_combo), 1), 3) AS asn_concentration,
                round(arrayMax(tupleElement(country_totals, 2))
                      / greatest(sum(ips_per_combo), 1), 3)  AS country_concentration
            FROM (
                SELECT
                    ja4,
                    src_asn,
                    src_country_code,
                    any(src_as_name) AS asn_name,
                    uniq(src_ip)     AS ips_per_combo,
                    sum(hits)        AS total_hits
                FROM mabase_prod.agg_host_ip_ja4_1h
                WHERE window_start >= now() - INTERVAL 24 HOUR
                  AND ja4 != ''
                GROUP BY ja4, src_asn, src_country_code
            )
            GROUP BY ja4
            HAVING unique_ips >= %(min_ips)s
               AND (asn_concentration >= %(min_conc)s OR country_concentration >= %(min_conc)s)
            ORDER BY asn_concentration DESC, unique_ips DESC
            LIMIT %(limit)s
        """
        result = db.query(sql, {"min_ips": min_ips, "min_conc": min_concentration, "limit": limit})

        items = []
        for row in result.result_rows:
            asn_concentration = float(row[8] or 0)
            country_concentration = float(row[9] or 0)

            # ASN signals take precedence over the geographic one.
            if asn_concentration >= 0.85:
                corr_type, risk = "asn_monopoly", "high"
            elif asn_concentration >= min_concentration:
                corr_type, risk = "asn_dominant", "medium"
            elif country_concentration >= min_concentration:
                corr_type, risk = "geo_targeted", "medium"
            else:
                corr_type, risk = "distributed", "low"

            items.append({
                "ja4": str(row[0]),
                "unique_ips": int(row[1]),
                "unique_asns": int(row[2]),
                "unique_countries": int(row[3]),
                "top_asn_name": str(row[5] or ""),
                "top_asn_number": str(row[4] or ""),
                "dominant_country": str(row[6] or ""),
                "total_hits": int(row[7] or 0),
                "asn_concentration": asn_concentration,
                "country_concentration": country_concentration,
                "correlation_type": corr_type,
                "risk": risk,
            })
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e

View File

@ -23,37 +23,47 @@ def _attack_type(fuzzing_index: float, hit_velocity: float,
@router.get("/top-anomalies")
async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
"""Top IPs anomales déduplicées par IP (max fuzzing_index), triées par fuzzing_index DESC."""
"""Top IPs anomales (24h) — bypass view_ai_features_1h pour éviter les window functions.
Query directe sur agg_host_ip_ja4_1h + LEFT JOIN agg_header_fingerprint_1h.
"""
try:
sql = """
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
any(ja4) AS ja4,
any(host) AS host,
max(hits) AS hits,
max(fuzzing_index) AS max_fuzzing,
max(hit_velocity) AS hit_velocity,
max(temporal_entropy) AS temporal_entropy,
max(is_fake_navigation) AS is_fake_navigation,
max(ua_ch_mismatch) AS ua_ch_mismatch,
max(sni_host_mismatch) AS sni_host_mismatch,
max(is_ua_rotating) AS is_ua_rotating,
max(path_diversity_ratio) AS path_diversity_ratio,
max(anomalous_payload_ratio) AS anomalous_payload_ratio,
any(asn_label) AS asn_label,
any(bot_name) AS bot_name
FROM mabase_prod.view_ai_features_1h
GROUP BY src_ip
ORDER BY 5 DESC
replaceRegexpAll(toString(a.src_ip), '^::ffff:', '') AS ip,
any(a.ja4) AS ja4,
any(a.host) AS host,
sum(a.hits) AS hits,
round(max(uniqMerge(a.uniq_query_params))
/ greatest(max(uniqMerge(a.uniq_paths)), 1), 4) AS fuzzing_index,
round(sum(a.hits)
/ greatest(dateDiff('second', min(a.first_seen), max(a.last_seen)), 1), 2) AS hit_velocity,
round(sum(a.count_head) / greatest(sum(a.hits), 1), 4) AS head_ratio,
round(sum(a.count_no_sec_fetch) / greatest(sum(a.hits), 1), 4) AS sec_fetch_absence,
round(sum(a.tls12_count) / greatest(sum(a.hits), 1), 4) AS tls12_ratio,
round(sum(a.count_generic_accept) / greatest(sum(a.hits), 1), 4) AS generic_accept_ratio,
any(a.src_country_code) AS country,
any(a.src_as_name) AS asn_name,
max(h.ua_ch_mismatch) AS ua_ch_mismatch,
max(h.modern_browser_score) AS browser_score,
dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(any(a.src_asn)), 'unknown') AS asn_label,
coalesce(
nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(any(a.ja4)), ''), ''),
''
) AS bot_name
FROM mabase_prod.agg_host_ip_ja4_1h a
LEFT JOIN mabase_prod.agg_header_fingerprint_1h h
ON a.src_ip = h.src_ip AND a.window_start = h.window_start
WHERE a.window_start >= now() - INTERVAL 24 HOUR
GROUP BY a.src_ip
ORDER BY fuzzing_index DESC
LIMIT %(limit)s
"""
result = db.query(sql, {"limit": limit})
items = []
for row in result.result_rows:
fuzzing = float(row[4] or 0)
velocity = float(row[5] or 0)
fake_nav = int(row[7] or 0)
ua_mm = int(row[8] or 0)
fuzzing = float(row[4] or 0)
velocity = float(row[5] or 0)
ua_mm = int(row[12] or 0)
items.append({
"ip": str(row[0]),
"ja4": str(row[1]),
@ -61,16 +71,17 @@ async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
"hits": int(row[3] or 0),
"fuzzing_index": fuzzing,
"hit_velocity": velocity,
"temporal_entropy": float(row[6] or 0),
"is_fake_navigation": fake_nav,
"head_ratio": float(row[6] or 0),
"sec_fetch_absence": float(row[7] or 0),
"tls12_ratio": float(row[8] or 0),
"generic_accept_ratio": float(row[9] or 0),
"country": str(row[10] or ""),
"asn_name": str(row[11] or ""),
"ua_ch_mismatch": ua_mm,
"sni_host_mismatch": int(row[9] or 0),
"is_ua_rotating": int(row[10] or 0),
"path_diversity_ratio": float(row[11] or 0),
"anomalous_payload_ratio":float(row[12] or 0),
"asn_label": str(row[13] or ""),
"bot_name": str(row[14] or ""),
"attack_type": _attack_type(fuzzing, velocity, fake_nav, ua_mm),
"browser_score": int(row[13] or 0),
"asn_label": str(row[14] or ""),
"bot_name": str(row[15] or ""),
"attack_type": _attack_type(fuzzing, velocity, 0, ua_mm),
})
return {"items": items}
except Exception as e:
@ -93,6 +104,7 @@ async def get_ip_radar(ip: str):
avg(anomalous_payload_ratio) AS anomalous_payload_ratio
FROM mabase_prod.view_ai_features_1h
WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
AND window_start >= now() - INTERVAL 24 HOUR
"""
result = db.query(sql, {"ip": ip})
if not result.result_rows:
@ -119,22 +131,264 @@ async def get_ip_radar(ip: str):
raise HTTPException(status_code=500, detail=str(e))
@router.get("/scatter")
async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
"""Points pour scatter plot (fuzzing_index × hit_velocity), dédupliqués par IP."""
@router.get("/score-distribution")
async def get_score_distribution():
    """
    Distribution of all ML scores stored in ``ml_all_scores`` (last 3 days).

    One query groups the rows by ``(threat_level, model_name)``; the global
    totals are accumulated in Python from those groups, so the table is
    scanned only once.

    Returns:
        ``{"by_model": {model: [per-threat-level stats]}, "totals": {...}}``.
        ``totals`` holds the normal / anomaly / known-bot counts, the real
        ``grand_total`` (0 when there are no rows) and the matching
        percentages.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    try:
        # Single scan: plain GROUP BY, conditional counts carried per group.
        sql = """
            SELECT
                threat_level,
                model_name,
                count()                                               AS total,
                round(avg(anomaly_score), 4)                          AS avg_score,
                round(min(anomaly_score), 4)                          AS min_score,
                countIf(threat_level = 'NORMAL')                      AS normal_count,
                countIf(threat_level NOT IN ('NORMAL','KNOWN_BOT'))   AS anomaly_count,
                countIf(threat_level = 'KNOWN_BOT')                   AS bot_count
            FROM mabase_prod.ml_all_scores
            WHERE detected_at >= now() - INTERVAL 3 DAY
            GROUP BY threat_level, model_name
            ORDER BY model_name, total DESC
        """
        result = db.query(sql)

        by_model: dict = {}
        grand_total = 0
        total_normal = total_anomaly = total_bot = 0
        for row in result.result_rows:
            total = int(row[2])
            grand_total += total
            total_normal += int(row[5] or 0)
            total_anomaly += int(row[6] or 0)
            total_bot += int(row[7] or 0)
            by_model.setdefault(str(row[1]), []).append({
                "threat_level": str(row[0]),
                "total": total,
                "avg_score": float(row[3] or 0),
                "min_score": float(row[4] or 0),
            })

        # Guard only the division: the reported grand_total must stay 0 when
        # the table is empty instead of being silently bumped to 1.
        denominator = max(grand_total, 1)
        return {
            "by_model": by_model,
            "totals": {
                "normal": total_normal,
                "anomaly": total_anomaly,
                "known_bot": total_bot,
                "grand_total": grand_total,
                "normal_pct": round(total_normal / denominator * 100, 1),
                "anomaly_pct": round(total_anomaly / denominator * 100, 1),
                "bot_pct": round(total_bot / denominator * 100, 1),
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/score-trends")
async def get_score_trends(hours: int = Query(72, ge=1, le=168)):
    """
    Hourly evolution of the ML scores stored in ``ml_all_scores``.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        ``{"points": [...], "hours": hours}``; one point per (hour, model)
        with the normal / anomaly / known-bot counts and the mean anomaly
        score of the anomalous rows (0.0 when the bucket has none).

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    import math  # local import: only needed for the NaN guard below

    try:
        # The statement selects from ml_all_scores only; the column list of
        # the former scatter query (view_ai_features_1h) does not belong here.
        sql = """
            SELECT
                toStartOfHour(window_start)                                     AS hour,
                model_name,
                countIf(threat_level = 'NORMAL')                                AS normal_count,
                countIf(threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL'))     AS anomaly_count,
                countIf(threat_level = 'KNOWN_BOT')                             AS bot_count,
                round(avgIf(anomaly_score, threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')), 4) AS avg_anomaly_score
            FROM mabase_prod.ml_all_scores
            WHERE window_start >= now() - INTERVAL %(hours)s HOUR
            GROUP BY hour, model_name
            ORDER BY hour ASC, model_name
        """
        result = db.query(sql, {"hours": hours})

        points = []
        for row in result.result_rows:
            # avgIf yields NaN for a bucket without anomalous rows; NaN is
            # truthy, so `or 0` does not catch it, and it is not valid JSON.
            avg_score = float(row[5] or 0)
            if not math.isfinite(avg_score):
                avg_score = 0.0
            points.append({
                "hour": str(row[0]),
                "model": str(row[1]),
                "normal_count": int(row[2] or 0),
                "anomaly_count": int(row[3] or 0),
                "bot_count": int(row[4] or 0),
                "avg_anomaly_score": avg_score,
            })
        return {"points": points, "hours": hours}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/b-features")
async def get_b_features(limit: int = Query(50, ge=1, le=200)):
    """
    Per-IP HTTP "B-feature" ratios over the last 24 hours.

    Reads ``mabase_prod.agg_host_ip_ja4_1h``, aggregates per source IP and
    exposes ``head_ratio``, ``sec_fetch_absence``, ``tls12_ratio``,
    ``generic_accept_ratio`` and ``http10_ratio`` (each a count divided by the
    IP's total hits). Only IPs exceeding at least one of the thresholds in the
    outer WHERE clause are kept, ordered by the sum of the head, sec-fetch
    and generic-accept ratios.

    Args:
        limit: Maximum number of IPs returned.

    Returns:
        ``{"items": [...], "total": n}``.

    Raises:
        HTTPException: 500 when the query or the row conversion fails.
    """
    query = """
        SELECT ip, ja4, country, asn_name, hits,
               head_ratio, sec_fetch_absence, tls12_ratio, generic_accept_ratio, http10_ratio
        FROM (
            SELECT
                replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
                any(ja4) AS ja4,
                any(src_country_code) AS country,
                any(src_as_name) AS asn_name,
                sum(hits) AS hits,
                round(sum(count_head) / greatest(sum(hits),1), 4) AS head_ratio,
                round(sum(count_no_sec_fetch) / greatest(sum(hits),1), 4) AS sec_fetch_absence,
                round(sum(tls12_count) / greatest(sum(hits),1), 4) AS tls12_ratio,
                round(sum(count_generic_accept) / greatest(sum(hits),1), 4) AS generic_accept_ratio,
                round(sum(count_http10) / greatest(sum(hits),1), 4) AS http10_ratio
            FROM mabase_prod.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
        )
        WHERE sec_fetch_absence > 0.5 OR generic_accept_ratio > 0.3
           OR head_ratio > 0.1 OR tls12_ratio > 0.5
        ORDER BY (head_ratio + sec_fetch_absence + generic_accept_ratio) DESC
        LIMIT %(limit)s
    """
    try:
        rows = db.query(query, {"limit": limit}).result_rows
        # Positional mapping follows the outer SELECT column order.
        items = [
            {
                "ip": str(rec[0]),
                "ja4": str(rec[1] or ""),
                "country": str(rec[2] or ""),
                "asn_name": str(rec[3] or ""),
                "hits": int(rec[4] or 0),
                "head_ratio": float(rec[5] or 0),
                "sec_fetch_absence": float(rec[6] or 0),
                "tls12_ratio": float(rec[7] or 0),
                "generic_accept_ratio": float(rec[8] or 0),
                "http10_ratio": float(rec[9] or 0),
            }
            for rec in rows
        ]
        return {"items": items, "total": len(items)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/campaigns")
async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Query(20, ge=1, le=100)):
    """
    Anomaly groups ("campaigns") found in ``ml_detected_anomalies``.

    Rows with ``campaign_id >= 0`` are grouped per campaign first. When that
    yields nothing, the endpoint falls back to grouping MEDIUM/HIGH/CRITICAL
    detections by /24 subnet and keeps subnets with at least 3 distinct IPs.

    Args:
        hours: Size of the look-back window, in hours.
        limit: Maximum number of groups returned by each query.

    Returns:
        ``{"campaigns": [...], "total": n, "dbscan_active": bool, "hours": hours}``.
        Entries coming from the first query have ``source == "dbscan"``;
        fallback entries have ``source == "subnet_cluster"`` and
        ``campaign_id == -1``.

    Raises:
        HTTPException: 500 when a query or the row conversion fails.
    """
    params = {"hours": hours, "limit": limit}
    try:
        # Step 1: campaigns carrying a real campaign_id.
        # NOTE(review): `any(threat_level)` picks one value per group, not
        # necessarily the most frequent one — confirm "dominant" is intended.
        by_campaign_sql = """
            SELECT
                campaign_id,
                count() AS total_detections,
                uniq(src_ip) AS unique_ips,
                any(threat_level) AS dominant_threat,
                groupUniqArray(3)(threat_level) AS threat_levels,
                groupUniqArray(3)(bot_name) AS bot_names,
                min(detected_at) AS first_seen,
                max(detected_at) AS last_seen
            FROM mabase_prod.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND campaign_id >= 0
            GROUP BY campaign_id
            ORDER BY total_detections DESC
            LIMIT %(limit)s
        """
        campaigns = [
            {
                "id": f"C{rec[0]}",
                "campaign_id": int(rec[0]),
                "total_detections": int(rec[1]),
                "unique_ips": int(rec[2]),
                "dominant_threat": str(rec[3] or ""),
                "threat_levels": list(rec[4] or []),
                "bot_names": list(rec[5] or []),
                "first_seen": str(rec[6]),
                "last_seen": str(rec[7]),
                "source": "dbscan",
            }
            for rec in db.query(by_campaign_sql, params).result_rows
        ]

        # Step 2: only when step 1 found nothing, cluster by /24 subnet.
        if not campaigns:
            by_subnet_sql = """
                SELECT
                    IPv4CIDRToRange(toIPv4(replaceRegexpAll(toString(src_ip),'^::ffff:','')), 24).1 AS subnet,
                    count() AS total_detections,
                    uniq(src_ip) AS unique_ips,
                    groupArray(3)(threat_level) AS threat_levels,
                    any(bot_name) AS bot_name,
                    any(ja4) AS sample_ja4,
                    min(detected_at) AS first_seen,
                    max(detected_at) AS last_seen
                FROM mabase_prod.ml_detected_anomalies
                WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
                  AND threat_level IN ('HIGH','CRITICAL','MEDIUM')
                GROUP BY subnet
                HAVING unique_ips >= 3
                ORDER BY total_detections DESC
                LIMIT %(limit)s
            """
            subnet_rows = db.query(by_subnet_sql, params).result_rows
            for rank, rec in enumerate(subnet_rows, start=1):
                levels = rec[3]
                campaigns.append({
                    "id": f"S{rank:03d}",
                    "campaign_id": -1,
                    "subnet": str(rec[0]) + "/24",
                    "total_detections": int(rec[1]),
                    "unique_ips": int(rec[2]),
                    # First collected level stands in for the dominant one.
                    "dominant_threat": str((levels or [""])[0]),
                    "threat_levels": list(levels or []),
                    "bot_names": [str(rec[4] or "")],
                    "sample_ja4": str(rec[5] or ""),
                    "first_seen": str(rec[6]),
                    "last_seen": str(rec[7]),
                    "source": "subnet_cluster",
                })

        has_dbscan = any(entry["campaign_id"] >= 0 for entry in campaigns)
        return {
            "campaigns": campaigns,
            "total": len(campaigns),
            "dbscan_active": has_dbscan,
            "hours": hours,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/scatter")
async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
"""Points scatter plot (fuzzing_index × hit_velocity) — bypass view_ai_features_1h."""
try:
sql = """
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
any(ja4) AS ja4,
round(max(uniqMerge(uniq_query_params)) / greatest(max(uniqMerge(uniq_paths)), 1), 4) AS fuzzing_index,
round(sum(hits) / greatest(dateDiff('second', min(first_seen), max(last_seen)), 1), 2) AS hit_velocity,
sum(hits) AS hits,
round(sum(count_head) / greatest(sum(hits), 1), 4) AS head_ratio,
max(correlated_raw) AS correlated
FROM mabase_prod.agg_host_ip_ja4_1h
WHERE window_start >= now() - INTERVAL 24 HOUR
GROUP BY src_ip
ORDER BY 3 DESC
ORDER BY fuzzing_index DESC
LIMIT %(limit)s
"""
result = db.query(sql, {"limit": limit})
@ -142,15 +396,13 @@ async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
for row in result.result_rows:
fuzzing = float(row[2] or 0)
velocity = float(row[3] or 0)
fake_nav = int(row[5] or 0)
ua_mm = int(row[6] or 0)
points.append({
"ip": str(row[0]),
"ja4": str(row[1]),
"fuzzing_index":fuzzing,
"hit_velocity": velocity,
"hits": int(row[4] or 0),
"attack_type": _attack_type(fuzzing, velocity, fake_nav, ua_mm),
"attack_type": _attack_type(fuzzing, velocity, 0, 0),
})
return {"points": points}
except Exception as e:

View File

@ -104,40 +104,40 @@ async def get_ip_ja4_history(ip: str):
@router.get("/sophistication")
async def get_sophistication(limit: int = Query(50, ge=1, le=500)):
"""Score de sophistication adversaire par IP (rotation JA4 + récurrence + bruteforce)."""
"""Score de sophistication adversaire par IP (rotation JA4 + récurrence + bruteforce).
Single SQL JOIN query — aucun traitement Python sur 34K entrées.
"""
try:
# Separate queries merged in Python to avoid view JOIN issues
rot_result = db.query("""
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
distinct_ja4_count
FROM mabase_prod.view_host_ip_ja4_rotation
""")
rotation_map = {str(row[0]): int(row[1]) for row in rot_result.result_rows}
rec_result = db.query("""
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
recurrence
FROM mabase_prod.view_ip_recurrence
""")
recurrence_map = {str(row[0]): int(row[1]) for row in rec_result.result_rows}
bf_result = db.query("""
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS ip,
sum(hits) AS total_hits
sql = """
SELECT
replaceRegexpAll(toString(r.src_ip), '^::ffff:', '') AS ip,
r.distinct_ja4_count,
coalesce(rec.recurrence, 0) AS recurrence,
coalesce(bf.bruteforce_hits, 0) AS bruteforce_hits,
round(least(100.0,
r.distinct_ja4_count * 10
+ coalesce(rec.recurrence, 0) * 20
+ least(30.0, log(coalesce(bf.bruteforce_hits, 0) + 1) * 5)
), 1) AS sophistication_score
FROM mabase_prod.view_host_ip_ja4_rotation r
LEFT JOIN (
SELECT src_ip, count() AS recurrence
FROM mabase_prod.ml_detected_anomalies FINAL
GROUP BY src_ip
) rec USING(src_ip)
LEFT JOIN (
SELECT replaceRegexpAll(toString(src_ip),'^::ffff:','') AS src_ip,
sum(hits) AS bruteforce_hits
FROM mabase_prod.view_form_bruteforce_detected
GROUP BY ip
""")
bruteforce_map = {str(row[0]): int(row[1]) for row in bf_result.result_rows}
# Start from IPs that appear in rotation view (most evasive)
GROUP BY src_ip
) bf USING(src_ip)
ORDER BY sophistication_score DESC
LIMIT %(limit)s
"""
result = db.query(sql, {"limit": limit})
items = []
for ip, ja4_count in rotation_map.items():
recurrence = recurrence_map.get(ip, 0)
bf_hits = bruteforce_map.get(ip, 0)
score = min(100.0, ja4_count * 10 + recurrence * 20 + min(30.0, math.log(bf_hits + 1) * 5))
for row in result.result_rows:
score = float(row[4] or 0)
if score > 80:
tier = "APT-like"
elif score > 50:
@ -147,16 +147,13 @@ async def get_sophistication(limit: int = Query(50, ge=1, le=500)):
else:
tier = "Basic"
items.append({
"ip": ip,
"ja4_rotation_count": ja4_count,
"recurrence": recurrence,
"bruteforce_hits": bf_hits,
"sophistication_score": round(score, 1),
"tier": tier,
"ip": str(row[0]),
"ja4_rotation_count": int(row[1] or 0),
"recurrence": int(row[2] or 0),
"bruteforce_hits": int(row[3] or 0),
"sophistication_score":score,
"tier": tier,
})
items.sort(key=lambda x: x["sophistication_score"], reverse=True)
items = items[:limit]
return {"items": items, "total": len(items)}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))