refactor: replace hardcoded mabase_prod DB prefix with configurable settings

Replace all hardcoded 'mabase_prod.' table prefixes in dashboard route SQL queries with configurable database names from settings:

- http_logs, http_logs_raw → settings.CLICKHOUSE_DB_LOGS
- All other tables → settings.CLICKHOUSE_DB_PROCESSING

Also qualify previously unqualified table references (bare FROM/JOIN table_name) with the appropriate database prefix for consistency.

Each route file now imports 'from ..config import settings' and uses f-strings with {settings.CLICKHOUSE_DB_PROCESSING} or {settings.CLICKHOUSE_DB_LOGS} for database-qualified table names.

Files updated: analysis, attributes, audit, botnets, bruteforce, clustering, detections, entities, fingerprints, header_fingerprint, heatmap, incidents, investigation_summary, metrics, ml_features, rotation, search, tcp_spoofing, variability (19 files).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-07 19:03:05 +02:00
parent dba2676fa7
commit b6391afbeb
19 changed files with 225 additions and 206 deletions
--- a/services/dashboard/backend/routes/ml_features.py
+++ b/services/dashboard/backend/routes/ml_features.py
@ -4,6 +4,7 @@ Endpoints pour les features ML / IA (scores d'anomalies, radar, scatter)
 from fastapi import APIRouter, HTTPException, Query

 from ..database import db
+from ..config import settings

 router = APIRouter(prefix="/api/ml", tags=["ml_features"])

@ -27,7 +28,7 @@ async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
    Query directe sur agg_host_ip_ja4_1h + LEFT JOIN agg_header_fingerprint_1h.
    """
    try:
-        sql = """
+        sql = f"""
        SELECT
            replaceRegexpAll(toString(a.src_ip), '^::ffff:', '')                                AS ip,
            any(a.ja4)                                                                           AS ja4,
@ -45,13 +46,13 @@ async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
            any(a.src_as_name)                                                                   AS asn_name,
            max(h.ua_ch_mismatch)                                                                AS ua_ch_mismatch,
            max(h.modern_browser_score)                                                          AS browser_score,
-            dictGetOrDefault('mabase_prod.dict_asn_reputation', 'label', toUInt64(any(a.src_asn)), 'unknown') AS asn_label,
+            dictGetOrDefault('{settings.CLICKHOUSE_DB_PROCESSING}.dict_asn_reputation', 'label', toUInt64(any(a.src_asn)), 'unknown') AS asn_label,
            coalesce(
-                nullIf(dictGetOrDefault('mabase_prod.dict_bot_ja4', 'bot_name', tuple(any(a.ja4)), ''), ''),
+                nullIf(dictGetOrDefault('{settings.CLICKHOUSE_DB_PROCESSING}.dict_bot_ja4', 'bot_name', tuple(any(a.ja4)), ''), ''),
                ''
            ) AS bot_name
-        FROM mabase_prod.agg_host_ip_ja4_1h a
-        LEFT JOIN mabase_prod.agg_header_fingerprint_1h h
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h a
+        LEFT JOIN {settings.CLICKHOUSE_DB_PROCESSING}.agg_header_fingerprint_1h h
              ON a.src_ip = h.src_ip AND a.window_start = h.window_start
        WHERE a.window_start >= now() - INTERVAL 24 HOUR
        GROUP BY a.src_ip
@ -92,7 +93,7 @@ async def get_top_anomalies(limit: int = Query(50, ge=1, le=500)):
 async def get_ip_radar(ip: str):
    """Scores radar pour une IP spécifique (8 dimensions d'anomalie)."""
    try:
-        sql = """
+        sql = f"""
        SELECT
            avg(fuzzing_index)              AS fuzzing_index,
            avg(hit_velocity)               AS hit_velocity,
@ -102,7 +103,7 @@ async def get_ip_radar(ip: str):
            avg(orphan_ratio)               AS orphan_ratio,
            avg(path_diversity_ratio)       AS path_diversity_ratio,
            avg(anomalous_payload_ratio)    AS anomalous_payload_ratio
-        FROM mabase_prod.view_ai_features_1h
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.view_ai_features_1h
        WHERE replaceRegexpAll(toString(src_ip), '^::ffff:', '') = %(ip)s
          AND window_start >= now() - INTERVAL 24 HOUR
        """
@ -139,7 +140,7 @@ async def get_score_distribution():
    """
    try:
        # Single scan — global totals + per-model breakdown via GROUPING SETS
-        sql = """
+        sql = f"""
        SELECT
            threat_level,
            model_name,
@ -149,7 +150,7 @@ async def get_score_distribution():
            countIf(threat_level = 'NORMAL')   AS normal_count,
            countIf(threat_level NOT IN ('NORMAL','KNOWN_BOT')) AS anomaly_count,
            countIf(threat_level = 'KNOWN_BOT') AS bot_count
-        FROM mabase_prod.ml_all_scores
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_all_scores
        WHERE detected_at >= now() - INTERVAL 3 DAY
        GROUP BY threat_level, model_name
        ORDER BY model_name, total DESC
@ -199,7 +200,7 @@ async def get_score_trends(hours: int = Query(72, ge=1, le=168)):
    Retourne le score moyen et les counts par heure et par modèle.
    """
    try:
-        sql = """
+        sql = f"""
        SELECT
            toStartOfHour(window_start)      AS hour,
            model_name,
@ -207,7 +208,7 @@ async def get_score_trends(hours: int = Query(72, ge=1, le=168)):
            countIf(threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')) AS anomaly_count,
            countIf(threat_level = 'KNOWN_BOT') AS bot_count,
            round(avgIf(anomaly_score, threat_level IN ('LOW','MEDIUM','HIGH','CRITICAL')), 4) AS avg_anomaly_score
-        FROM mabase_prod.ml_all_scores
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_all_scores
        WHERE window_start >= now() - INTERVAL %(hours)s HOUR
        GROUP BY hour, model_name
        ORDER BY hour ASC, model_name
@ -237,7 +238,7 @@ async def get_b_features(limit: int = Query(50, ge=1, le=200)):
    Ces features sont calculées dans view_ai_features_1h mais jamais visualisées dans le dashboard.
    """
    try:
-        sql = """
+        sql = f"""
        SELECT ip, ja4, country, asn_name, total_hits AS hits,
               head_ratio, sec_fetch_absence, tls12_ratio, generic_accept_ratio, http10_ratio,
               missing_accept_enc_ratio, http_scheme_ratio
@ -255,7 +256,7 @@ async def get_b_features(limit: int = Query(50, ge=1, le=200)):
                round(sum(count_http10)           / greatest(sum(hits),1), 4) AS http10_ratio,
                round(sum(count_no_accept_enc)    / greatest(sum(hits),1), 4) AS missing_accept_enc_ratio,
                round(sum(count_http_scheme)      / greatest(sum(hits),1), 4) AS http_scheme_ratio
-            FROM mabase_prod.agg_host_ip_ja4_1h
+            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
        )
@ -295,7 +296,7 @@ async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Qu
    """
    try:
        # First: check real campaigns
-        campaign_sql = """
+        campaign_sql = f"""
        SELECT
            campaign_id,
            count()                                                     AS total_detections,
@ -305,7 +306,7 @@ async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Qu
            groupUniqArray(3)(bot_name)                                 AS bot_names,
            min(detected_at)                                            AS first_seen,
            max(detected_at)                                            AS last_seen
-        FROM mabase_prod.ml_detected_anomalies
+        FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
        WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
          AND campaign_id >= 0
        GROUP BY campaign_id
@ -330,7 +331,7 @@ async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Qu

        # Fallback: subnet-based clustering when DBSCAN has no campaigns
        if not campaigns:
-            subnet_sql = """
+            subnet_sql = f"""
            SELECT
                IPv4CIDRToRange(toIPv4(replaceRegexpAll(toString(src_ip),'^::ffff:','')), 24).1 AS subnet,
                count()                             AS total_detections,
@ -340,7 +341,7 @@ async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Qu
                any(ja4)                            AS sample_ja4,
                min(detected_at)                    AS first_seen,
                max(detected_at)                    AS last_seen
-            FROM mabase_prod.ml_detected_anomalies
+            FROM {settings.CLICKHOUSE_DB_PROCESSING}.ml_detected_anomalies
            WHERE detected_at >= now() - INTERVAL %(hours)s HOUR
              AND threat_level IN ('HIGH','CRITICAL','MEDIUM')
            GROUP BY subnet
@ -381,7 +382,7 @@ async def get_ml_campaigns(hours: int = Query(24, ge=1, le=168), limit: int = Qu
 async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
    """Points scatter plot (fuzzing_index × hit_velocity) — bypass view_ai_features_1h."""
    try:
-        sql = """
+        sql = f"""
        SELECT
            ip,
            ja4,
@ -400,7 +401,7 @@ async def get_ml_scatter(limit: int = Query(200, ge=1, le=1000)):
                max(last_seen)                                                                    AS max_last,
                sum(count_head)                                                                   AS total_count_head,
                max(correlated_raw)                                                               AS correlated
-            FROM mabase_prod.agg_host_ip_ja4_1h
+            FROM {settings.CLICKHOUSE_DB_PROCESSING}.agg_host_ip_ja4_1h
            WHERE window_start >= now() - INTERVAL 24 HOUR
            GROUP BY src_ip
        )