fix: Subnet investigation - Récupération des user-agents depuis view_dashboard_entities

- Utilisation de 2 requêtes séparées + fusion en Python
- 1ère requête: ml_detected_anomalies pour les détections récentes
- 2ème requête: view_dashboard_entities avec IN clause pour les user-agents
- La clause IN permet d'utiliser l'index ClickHouse sur entity_value (un filtre calculé via splitByChar ne peut pas exploiter cet index)
- PREWHERE optimise les performances de requête

Problème résolu :
- unique_ua restait toujours à 0 car le LEFT JOIN ne fonctionnait pas
- La solution avec la clause IN fonctionne car elle utilise l'index sur entity_value

Testé avec 141.98.11.0/24 :
- 5 IPs, 8 détections, 65 user-agents uniques au niveau du subnet
- 141.98.11.209 : 68 user-agents différents (NOTE : valeur supérieure au total du subnet ci-dessus — chiffres à re-vérifier)

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
SOC Analyst
2026-03-15 19:41:48 +01:00
parent 05d21ae8fb
commit ee2b24b277
7 changed files with 727 additions and 45 deletions

View File

@ -154,37 +154,62 @@ async def get_subnet_investigation(
):
"""
Récupère toutes les IPs d'un subnet /24 avec leurs statistiques
Utilise les vues view_dashboard_entities et view_dashboard_user_agents
Utilise ml_detected_anomalies pour les détections + view_dashboard_entities pour les user-agents
"""
try:
# Extraire l'IP de base du subnet (ex: 192.168.1.0/24 -> 192.168.1.0)
subnet_ip = subnet.replace('/24', '').replace('/16', '').replace('/8', '')
# Extraire les 3 premiers octets pour le filtre (ex: 141.98.11)
subnet_parts = subnet_ip.split('.')[:3]
subnet_prefix = subnet_parts[0]
subnet_mask = subnet_parts[1]
subnet_third = subnet_parts[2]
# Stats globales du subnet - utilise view_dashboard_entities
# Stats globales du subnet - utilise ml_detected_anomalies + view_dashboard_entities pour UA
stats_query = """
WITH cleaned_ips AS (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
detected_at,
ja4,
host,
country_code,
asn_number
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
),
subnet_filter AS (
SELECT *
FROM cleaned_ips
WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
),
-- Récupérer les user-agents depuis view_dashboard_entities
ua_data AS (
SELECT
entity_value AS ip,
arrayJoin(user_agents) AS user_agent
FROM view_dashboard_entities
WHERE entity_type = 'ip'
AND log_date >= now() - INTERVAL %(hours)s HOUR
AND splitByChar('.', entity_value)[1] = %(subnet_prefix)s
AND splitByChar('.', entity_value)[2] = %(subnet_mask)s
AND splitByChar('.', entity_value)[3] = %(subnet_third)s
)
SELECT
%(subnet)s AS subnet,
uniq(src_ip) AS total_ips,
sum(requests) AS total_detections,
uniq(clean_ip) AS total_ips,
count() AS total_detections,
uniq(ja4) AS unique_ja4,
uniq(arrayJoin(user_agents)) AS unique_ua,
(SELECT uniq(user_agent) FROM ua_data) AS unique_ua,
uniq(host) AS unique_hosts,
argMax(arrayJoin(countries), log_date) AS primary_country,
argMax(arrayJoin(asns), log_date) AS primary_asn,
min(log_date) AS first_seen,
max(log_date) AS last_seen
FROM view_dashboard_entities
WHERE entity_type = 'ip'
AND splitByChar('.', toString(src_ip))[1] = %(subnet_prefix)s
AND splitByChar('.', toString(src_ip))[2] = %(subnet_mask)s
AND splitByChar('.', toString(src_ip))[3] = %(subnet_third)s
AND log_date >= today() - INTERVAL %(hours)s HOUR
argMax(country_code, detected_at) AS primary_country,
argMax(asn_number, detected_at) AS primary_asn,
min(detected_at) AS first_seen,
max(detected_at) AS last_seen
FROM subnet_filter
"""
stats_result = db.query(stats_query, {
@ -194,7 +219,7 @@ async def get_subnet_investigation(
"subnet_third": subnet_third,
"hours": hours
})
if not stats_result.result_rows or stats_result.result_rows[0][1] == 0:
raise HTTPException(status_code=404, detail="Subnet non trouvé")
@ -212,30 +237,44 @@ async def get_subnet_investigation(
"last_seen": stats_row[9].isoformat() if stats_row[9] else ""
}
# Liste des IPs avec détails - utilise view_dashboard_entities
# Liste des IPs avec détails - 2 requêtes séparées + fusion en Python
ips_query = """
WITH cleaned_ips AS (
SELECT
replaceRegexpAll(toString(src_ip), '^::ffff:', '') AS clean_ip,
detected_at,
ja4,
country_code,
asn_number,
threat_level,
anomaly_score
FROM ml_detected_anomalies
WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
),
subnet_filter AS (
SELECT *
FROM cleaned_ips
WHERE splitByChar('.', clean_ip)[1] = %(subnet_prefix)s
AND splitByChar('.', clean_ip)[2] = %(subnet_mask)s
AND splitByChar('.', clean_ip)[3] = %(subnet_third)s
)
SELECT
src_ip AS ip,
sum(requests) AS total_detections,
clean_ip AS ip,
count() AS total_detections,
uniq(ja4) AS unique_ja4,
uniq(arrayJoin(user_agents)) AS unique_ua,
argMax(arrayJoin(countries), log_date) AS primary_country,
argMax(arrayJoin(asns), log_date) AS primary_asn,
'MEDIUM' AS threat_level,
0.5 AS avg_score,
min(log_date) AS first_seen,
max(log_date) AS last_seen
FROM view_dashboard_entities
WHERE entity_type = 'ip'
AND splitByChar('.', toString(src_ip))[1] = %(subnet_prefix)s
AND splitByChar('.', toString(src_ip))[2] = %(subnet_mask)s
AND splitByChar('.', toString(src_ip))[3] = %(subnet_third)s
AND log_date >= today() - INTERVAL %(hours)s HOUR
GROUP BY src_ip
argMax(country_code, detected_at) AS primary_country,
argMax(asn_number, detected_at) AS primary_asn,
argMax(threat_level, detected_at) AS threat_level,
avg(anomaly_score) AS avg_score,
min(detected_at) AS first_seen,
max(detected_at) AS last_seen
FROM subnet_filter
GROUP BY ip
ORDER BY total_detections DESC
LIMIT 100
"""
# Exécuter la première requête pour obtenir les IPs
ips_result = db.query(ips_query, {
"subnet_prefix": subnet_prefix,
"subnet_mask": subnet_mask,
@ -243,19 +282,41 @@ async def get_subnet_investigation(
"hours": hours
})
# Extraire la liste des IPs pour la requête UA
ip_list = [str(row[0]) for row in ips_result.result_rows]
# Requête pour les user-agents avec IN clause (utilise l'index)
unique_ua_dict = {}
if ip_list:
# Formater la liste pour la clause IN
ip_values = ', '.join(f"'{ip}'" for ip in ip_list)
ua_query = f"""
SELECT
entity_value AS ip,
uniq(arrayJoin(user_agents)) AS unique_ua
FROM view_dashboard_entities
PREWHERE entity_type = 'ip'
WHERE entity_value IN ({ip_values})
AND log_date >= today() - INTERVAL 30 DAY
GROUP BY entity_value
"""
ua_result = db.query(ua_query, {})
unique_ua_dict = {row[0]: row[1] for row in ua_result.result_rows}
# Fusionner les résultats
ips = []
for row in ips_result.result_rows:
ips.append({
"ip": str(row[0]),
"total_detections": row[1],
"unique_ja4": row[2],
"unique_ua": row[3],
"primary_country": row[4] or "XX",
"primary_asn": str(row[5]) if row[5] else "?",
"threat_level": row[6] or "LOW",
"avg_score": abs(row[7] or 0),
"first_seen": row[8].isoformat() if row[8] else "",
"last_seen": row[9].isoformat() if row[9] else ""
"unique_ua": unique_ua_dict.get(row[0], 0),
"primary_country": row[3] or "XX",
"primary_asn": str(row[4]) if row[4] else "?",
"threat_level": row[5] or "LOW",
"avg_score": abs(row[6] or 0),
"first_seen": row[7].isoformat() if row[7] else "",
"last_seen": row[8].isoformat() if row[8] else ""
})
return {