Fix: CsI(Tl) non-linear response correction + detector calibration overhaul

Root cause of Am-241 misidentification: the Radiacode 103's CsI(Tl) crystal shifts low-energy peaks upward (59.5 keV → 71.6 keV for Am-241) due to non-proportional scintillation response. The model was trained on theoretical peak positions and couldn't match the shifted real peaks.

Changes:
- Add inverse CsI(Tl) non-linear correction to inference pipeline (radiacode_monitor.py, web/config.py, test_detection.py)
      E_apparent = E_true * (1 + 0.37 * exp(-E_true/100))
  Corrects channel mapping so peaks appear at theoretical energies
- Fix energy calibration: DetectorConfig now uses E = 0.33 + 2.97*ch with 1023 channels, matching the real detector (was energy_min=20, skip_first_channel=True, different channel width)
- Add K-escape peaks for CsI(Tl) iodine X-ray escape (E - 28.5 keV)
- Add asymmetric peak shapes for low-energy tails (< 200 keV)
- Add log1p normalization in dataset and inference (replaces max-norm)
- Add background-subtracted training mode (subtract_background flag)
- Add low-signal augmentation (0.01-5 Bq activities, 30-300s durations)
- Update docker-compose.yml: batch_size=32, duration=30-300s, CSI_NONLINEAR_ALPHA/BETA env vars for detect and web
- Web dashboard: apply CsI correction to displayed spectra
- Various UI fixes (Chart.js width, zoom/pan, isotope lines)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 17:35:22 +02:00
parent 3b4446b181
commit 0847a3fc80
21 changed files with 913 additions and 278 deletions
--- a/train/entrypoint.sh
+++ b/train/entrypoint.sh
@ -5,11 +5,11 @@ DATA_DIR="${DATA_DIR:-/data/synthetic}"
 MODEL_DIR="${MODEL_DIR:-/models}"
 NUM_SAMPLES="${NUM_SAMPLES:-50000}"
 EPOCHS="${EPOCHS:-100}"
-BATCH_SIZE="${BATCH_SIZE:-64}"
+BATCH_SIZE="${BATCH_SIZE:-32}"
 LEARNING_RATE="${LEARNING_RATE:-0.001}"
 DETECTOR="${DETECTOR:-radiacode_103}"
-MIN_DURATION="${MIN_DURATION:-43200}"
-MAX_DURATION="${MAX_DURATION:-86400}"
+MIN_DURATION="${MIN_DURATION:-30}"
+MAX_DURATION="${MAX_DURATION:-300}"
 SEED="${SEED:-42}"
 MEASURED_BACKGROUND_PATH="${MEASURED_BACKGROUND_PATH:-}"

@ -20,7 +20,7 @@ echo "  Data dir    : $DATA_DIR"
 echo "  Model dir   : $MODEL_DIR"
 echo "  Samples     : $NUM_SAMPLES"
 echo "  Detector    : $DETECTOR"
-echo "  Duration     : $MIN_DURATION-$MAX_DURATION s"
+echo "  Duration    : $MIN_DURATION-$MAX_DURATION s"
 echo "  Epochs      : $EPOCHS"
 echo "  Batch size  : $BATCH_SIZE"
 echo "  Learning rate: $LEARNING_RATE"
--- a/train/vega_ml/synthetic_spectra/config.py
+++ b/train/vega_ml/synthetic_spectra/config.py
@ -3,99 +3,77 @@ Detector Configuration Module

 Contains configuration parameters for Radiacode gamma spectrometers
 and other detector settings.
+
+Energy calibration matches the real Radiacode 103:
+    E(keV) = 0.33 + 2.97 * channel_index
+Uses 1023 channels (channel 1023 is overflow, excluded).
 """

-from dataclasses import dataclass, field
-from typing import Dict, Optional
+from dataclasses import dataclass
+from typing import Dict
 import numpy as np


@dataclass
 class DetectorConfig:
    """Configuration for a gamma spectrometer detector."""
-    
-    name: str
-    # Energy range in keV
-    energy_min_kev: float = 20.0
-    energy_max_kev: float = 3000.0
-    
-    # Number of channels
-    num_channels: int = 1024

-    # Some devices/software workflows treat channel 0 as unreliable/noisy.
-    # This project models "usable" channels by skipping the first raw channel.
-    skip_first_channel: bool = True
-    
+    name: str
+    # Energy calibration: E = calibration_offset + calibration_slope * channel
+    # Must match the real detector calibration used in inference.
+    calibration_offset_kev: float = 0.33
+    calibration_slope_kev: float = 2.97
+
+    # Number of usable channels (1023 for Radiacode, channel 1023 is overflow)
+    num_channels: int = 1023
+
    # FWHM at 662 keV (Cs-137 reference) as fraction
    fwhm_at_662: float = 0.084  # 8.4%
    fwhm_uncertainty: float = 0.003  # ±0.3%
-    
+
    # Detector crystal type
    crystal_type: str = "CsI(Tl)"
-    
+
    # Sensitivity: counts per second at 1 μSv/h for Cs-137
    sensitivity_cps_per_usvh: float = 30.0
-    
+
    # Detector volume in cm³
    detector_volume_cm3: float = 1.0
-    
-    def get_channel_width_kev(self) -> float:
-        """Get the width of each channel in keV."""
-        return (self.energy_max_kev - self.energy_min_kev) / self.num_channels
-    
-    def get_energy_bins(self) -> np.ndarray:
-        """Get array of energy bin centers (keV) for the modeled usable channels."""
-        channel_width = self.get_channel_width_kev()

-        # Raw device channels are assumed to be 0..num_channels-1 with centers:
-        #   E_center(k) = E_min + (k + 0.5) * channel_width
-        # If we skip the first raw channel (k=0), we model usable channels k=1..num_channels-1.
-        start_raw_channel = 1 if self.skip_first_channel else 0
-        raw_channels = np.arange(start_raw_channel, self.num_channels, dtype=np.float64)
-        return self.energy_min_kev + (raw_channels + 0.5) * channel_width
-    
+    def get_energy_bins(self) -> np.ndarray:
+        """Get array of energy bin centers (keV) matching the real detector calibration."""
+        channels = np.arange(self.num_channels, dtype=np.float64)
+        return self.calibration_offset_kev + self.calibration_slope_kev * channels
+
    def get_fwhm_at_energy(self, energy_kev: float) -> float:
        """
        Calculate FWHM at a given energy.
-        
+
        For scintillators, FWHM scales approximately as sqrt(E).
-        FWHM(E) = FWHM_662 * sqrt(662/E) * E / 662 = FWHM_662 * sqrt(E/662)
+        FWHM(E) = FWHM_662 * sqrt(E/662)
        """
-        return self.fwhm_at_662 * np.sqrt(662.0 / energy_kev) * energy_kev
-    
+        return self.fwhm_at_662 * np.sqrt(energy_kev / 662.0) * 662.0
+
    def get_sigma_at_energy(self, energy_kev: float) -> float:
-        """
-        Get Gaussian sigma at a given energy.
-        sigma = FWHM / (2 * sqrt(2 * ln(2))) ≈ FWHM / 2.355
-        """
+        """Get Gaussian sigma at a given energy."""
        fwhm = self.get_fwhm_at_energy(energy_kev)
        return fwhm / 2.355
-    
+
    def energy_to_channel(self, energy_kev: float) -> int:
-        """Convert energy in keV to modeled usable channel index."""
-        channel_width = self.get_channel_width_kev()
-        raw_channel = int((energy_kev - self.energy_min_kev) / channel_width)
-        if self.skip_first_channel:
-            channel = raw_channel - 1
-            max_channel = self.num_channels - 2
-        else:
-            channel = raw_channel
-            max_channel = self.num_channels - 1
-        return max(0, min(max_channel, channel))
+        """Convert energy in keV to channel index."""
+        channel = int((energy_kev - self.calibration_offset_kev) / self.calibration_slope_kev)
+        return max(0, min(self.num_channels - 1, channel))

    def channel_to_energy(self, channel: int) -> float:
-        """Convert modeled usable channel index to energy bin center (keV)."""
-        channel_width = self.get_channel_width_kev()
-        raw_channel = channel + (1 if self.skip_first_channel else 0)
-        raw_channel = max(0, min(self.num_channels - 1, int(raw_channel)))
-        return self.energy_min_kev + (raw_channel + 0.5) * channel_width
+        """Convert channel index to energy in keV."""
+        return self.calibration_offset_kev + self.calibration_slope_kev * channel


 # Pre-defined configurations for Radiacode devices
 RADIACODE_CONFIGS: Dict[str, DetectorConfig] = {
    "radiacode_101": DetectorConfig(
        name="Radiacode 101",
-        fwhm_at_662=0.095,  # 9.5% (original model, similar to 102)
+        fwhm_at_662=0.095,  # 9.5%
        fwhm_uncertainty=0.004,
        crystal_type="CsI(Tl)",
        sensitivity_cps_per_usvh=30.0,
@ -119,8 +97,7 @@ RADIACODE_CONFIGS: Dict[str, DetectorConfig] = {
    ),
    "radiacode_103g": DetectorConfig(
        name="Radiacode 103G",
-        energy_min_kev=25.0,  # Tech spec lists 0.025…3 MeV
-        fwhm_at_662=0.074,  # 7.4% (GAGG crystal - better resolution)
+        fwhm_at_662=0.074,  # 7.4% (GAGG crystal)
        fwhm_uncertainty=0.003,
        crystal_type="GAGG(Ce)",
        sensitivity_cps_per_usvh=40.0,
@ -131,12 +108,12 @@ RADIACODE_CONFIGS: Dict[str, DetectorConfig] = {
        fwhm_at_662=0.084,  # 8.4%
        fwhm_uncertainty=0.003,
        crystal_type="CsI(Tl)",
-        sensitivity_cps_per_usvh=77.0,  # Higher sensitivity
-        detector_volume_cm3=2.5,  # Larger crystal
+        sensitivity_cps_per_usvh=77.0,
+        detector_volume_cm3=2.5,
    ),
 }


 def get_default_config() -> DetectorConfig:
    """Get the default detector configuration (Radiacode 103)."""
-    return RADIACODE_CONFIGS["radiacode_103"]
+    return RADIACODE_CONFIGS["radiacode_103"]
--- a/train/vega_ml/synthetic_spectra/generate_spectra.py
+++ b/train/vega_ml/synthetic_spectra/generate_spectra.py
@ -128,19 +128,21 @@ def generate_training_batch(
    num_samples: int,
    output_dir: Path,
    detector_name: str = "radiacode_103",
-    duration_range: tuple = (60, 300),
+    duration_range: tuple = (30, 300),
    activity_range: tuple = (1.0, 100.0),
-    single_isotope_fraction: float = 0.4,
-    dual_isotope_fraction: float = 0.3,
-    multi_isotope_fraction: float = 0.2,
+    single_isotope_fraction: float = 0.3,
+    dual_isotope_fraction: float = 0.2,
+    multi_isotope_fraction: float = 0.15,
    background_only_fraction: float = 0.1,
+    low_signal_fraction: float = 0.15,
+    subtracted_fraction: float = 0.1,
    save_png: bool = False,
    random_seed: int = None,
    measured_background_path: str = None,
 ) -> list:
    """
    Generate a batch of training samples with various configurations.
-    
+
    Args:
        num_samples: Total number of samples to generate
        output_dir: Output directory for spectra and labels
@ -151,9 +153,11 @@ def generate_training_batch(
        dual_isotope_fraction: Fraction of two-isotope samples
        multi_isotope_fraction: Fraction of 3+ isotope samples
        background_only_fraction: Fraction of background-only samples
+        low_signal_fraction: Fraction of low-activity samples (0.01-5 Bq)
+        subtracted_fraction: Fraction of background-subtracted samples
        save_png: Whether to also save PNG images
        random_seed: Random seed for reproducibility
-    
+
    Returns:
        List of generated spectra
    """
@ -181,11 +185,13 @@ def generate_training_batch(
    n_dual = int(num_samples * dual_isotope_fraction)
    n_multi = int(num_samples * multi_isotope_fraction)
    n_background = int(num_samples * background_only_fraction)
-    
+    n_low_signal = int(num_samples * low_signal_fraction)
+    n_subtracted = int(num_samples * subtracted_fraction)
+
    # Adjust to ensure we hit exactly num_samples
-    remaining = num_samples - (n_single + n_dual + n_multi + n_background)
+    remaining = num_samples - (n_single + n_dual + n_multi + n_background + n_low_signal + n_subtracted)
    n_single += remaining
-    
+
    total_generated = 0

    print(f"\nGenerating {num_samples} synthetic spectra:")
@ -193,6 +199,8 @@ def generate_training_batch(
    print(f"  - Dual isotope: {n_dual}")
    print(f"  - Multi isotope (3+): {n_multi}")
    print(f"  - Background only: {n_background}")
+    print(f"  - Low signal (0.01-5 Bq): {n_low_signal}")
+    print(f"  - Background-subtracted: {n_subtracted}")
    print()

    sample_num = 0
@ -314,6 +322,77 @@ def generate_training_batch(

        sample_num += 1

+    # Generate low-signal samples (weak sources, 0.01-5 Bq)
+    print("Generating low-signal samples...")
+    for i in range(n_low_signal):
+        isotope = np.random.choice(isotope_pool)
+        activity = np.random.uniform(0.01, 5.0)
+        duration = np.random.uniform(*duration_range)
+
+        spectrum = generate_single_isotope_sample(
+            generator,
+            isotope,
+            activity,
+            duration,
+            detector_name=detector_name,
+            include_background=True,
+            measured_background_path=measured_background_path,
+        )
+
+        save_spectrum(
+            spectrum,
+            spectra_dir,
+            save_image=True,
+            image_format='npy'
+        )
+        del spectrum
+
+        sample_num += 1
+
+        if sample_num % 100 == 0:
+            print(f"  Generated {sample_num}/{num_samples} samples...")
+
+    # Generate background-subtracted samples (simulates inference pipeline)
+    print("Generating background-subtracted samples...")
+    for i in range(n_subtracted):
+        num_iso = np.random.choice([1, 2, 3], p=[0.5, 0.3, 0.2])
+        isotopes = np.random.choice(isotope_pool, size=num_iso, replace=False)
+        activities = [np.random.uniform(0.1, 50.0) for _ in range(num_iso)]
+        duration = np.random.uniform(*duration_range)
+
+        sources = [
+            IsotopeSource(
+                isotope_name=name,
+                activity_bq=activity,
+                include_daughters=True
+            )
+            for name, activity in zip(isotopes, activities)
+        ]
+
+        config = SpectrumConfig(
+            duration_seconds=duration,
+            sources=sources,
+            include_background=True,
+            subtract_background=True,
+            detector_name=detector_name,
+            measured_background_path=measured_background_path,
+        )
+
+        spectrum = generator.generate_spectrum(config)
+
+        save_spectrum(
+            spectrum,
+            spectra_dir,
+            save_image=True,
+            image_format='npy'
+        )
+        del spectrum
+
+        sample_num += 1
+
+        if sample_num % 100 == 0:
+            print(f"  Generated {sample_num}/{num_samples} samples...")
+
    total_generated = sample_num
    print(f"\nGenerated {total_generated} samples total")

--- a/train/vega_ml/synthetic_spectra/generator.py
+++ b/train/vega_ml/synthetic_spectra/generator.py
@ -49,14 +49,14 @@ class IsotopeSource:
@dataclass
 class SpectrumConfig:
    """Configuration for a single spectrum generation."""
-    
+
    # Time parameters
    duration_seconds: float = 60.0
    time_interval_seconds: float = 1.0  # Each row in the spectrogram
-    
+
    # Sources to include
    sources: List[IsotopeSource] = field(default_factory=list)
-    
+
    # Background options
    include_background: bool = True
    background_cps: float = 5.0
@ -64,18 +64,25 @@ class SpectrumConfig:
    include_radon: bool = True
    include_thorium: bool = True
    measured_background_path: Optional[str] = None
-    
+
+    # Background subtraction simulation
+    # When True, generates a second independent background realization
+    # and subtracts it from the spectrum, then clips negatives to 0.
+    # This simulates what happens at inference time (measured bg subtraction).
+    subtract_background: bool = False
+
    # Detector configuration
    detector_name: str = "radiacode_103"
-    
+
    # Noise options
    apply_poisson: bool = True
    apply_electronic: bool = False
    electronic_noise_sigma: float = 0.5
-    
-    # Normalization
+
+    # Normalization — "log1p" preserves relative signal levels,
+    # works well after background subtraction where many channels are ~0.
    normalize: bool = True
-    normalization_method: str = "max"  # max, sum, log, sqrt
+    normalization_method: str = "log1p"  # max, sum, log, sqrt, log1p


@dataclass
@ -272,7 +279,7 @@ class SpectrumGenerator:
        all_source_isotopes.extend(src_iso)
        all_background_isotopes.extend(bg_iso)

-        # Apply noise
+        # Apply noise before any subtraction (Poisson noise on raw counts)
        if config.apply_poisson:
            spectrum = apply_poisson_noise(spectrum)

@ -282,6 +289,24 @@ class SpectrumGenerator:
                config.electronic_noise_sigma
            )

+        # Simulate background subtraction (matches inference pipeline)
+        if config.subtract_background and config.include_background:
+            # Generate an independent background realization
+            bg_spectrum2, _ = generate_environmental_background(
+                self.energy_bins,
+                config.duration_seconds,
+                background_cps=config.background_cps,
+                include_k40=config.include_k40,
+                include_radon=config.include_radon,
+                include_thorium=config.include_thorium,
+                detector_config=self.detector_config,
+                measured_background_path=config.measured_background_path,
+            )
+            if config.apply_poisson:
+                bg_spectrum2 = apply_poisson_noise(bg_spectrum2)
+            # Subtract and clip — same as inference: net = clip(rate - bg_rate, 0, inf)
+            spectrum = np.maximum(spectrum - bg_spectrum2, 0)
+
        # Normalize if requested
        if config.normalize:
            spectrum = normalize_spectrum(spectrum, config.normalization_method)
--- a/train/vega_ml/synthetic_spectra/physics/spectrum_physics.py
+++ b/train/vega_ml/synthetic_spectra/physics/spectrum_physics.py
@ -184,38 +184,148 @@ def calculate_expected_counts(
    return expected


+def _k_escape_fraction(energy_kev: float, detector_config: Optional[DetectorConfig] = None) -> float:
+    """
+    Calculate K-escape peak fraction for CsI(Tl) detector.
+
+    When a gamma photon is absorbed on the iodine K-shell (binding energy
+    ~33.2 keV), the resulting K X-ray may leave the crystal, moving the
+    event out of the photopeak and into an escape peak at E - E_Ka
+    (~28.5 keV for I K-alpha).
+
+    The fraction is largest just above the K-edge and decreases with
+    energy towards a small residual value:
+
+        fraction = 0.05 + 0.40 * exp(-(E - 33.2) / 80)
+
+    This reproduces the reference points ~35% at 60 keV, ~15% at 150 keV
+    and ~5% at 662 keV.
+
+    Args:
+        energy_kev: Gamma energy in keV
+        detector_config: Detector configuration (currently unused)
+
+    Returns:
+        Fraction of photopeak counts that appear in the K-escape peak:
+        0.0 at or below the K-edge, otherwise a value in [0.0, 0.45]
+    """
+    # K-shell binding energy for iodine
+    k_binding = 33.2  # keV
+
+    # At or below the K-edge the photon cannot ionise the K-shell,
+    # so no K X-ray exists to escape.
+    if energy_kev <= k_binding:
+        return 0.0
+
+    # Decaying model. The previous form, 0.40 * (1 - exp(-(E - Ek) / 80)),
+    # grew with energy (about 11% at 60 keV, about 40% at 662 keV), which is
+    # the opposite of the reference points quoted in the docstring.
+    # NOTE(review): the reference points are taken from the original comment;
+    # confirm them against measured CsI(Tl) data.
+    # NOTE(review): the iodine constants are used whatever
+    # detector_config.crystal_type says; presumably a non-iodine crystal such
+    # as GAGG(Ce) needs different values - confirm.
+    residual_fraction = 0.05
+    edge_excess = 0.40
+    decay_kev = 80.0
+    fraction = residual_fraction + edge_excess * np.exp(-(energy_kev - k_binding) / decay_kev)
+
+    # residual_fraction + edge_excess equals the 0.45 ceiling; the clip
+    # guards against rounding.
+    return float(np.clip(fraction, 0.0, 0.45))
+
+
+def _asymmetric_peak(
+    energy_bins: np.ndarray,
+    peak_energy: float,
+    sigma: float,
+    amplitude: float,
+    tail_fraction: float = 0.0,
+    tail_sigma_ratio: float = 3.0
+) -> np.ndarray:
+    """
+    Build a peak with an optional low-energy tail.
+
+    The shape is the sum of two Gaussians: a core at ``peak_energy`` with
+    width ``sigma``, and a broader tail component centred 2 sigma below
+    the core. The peak area is split between them by ``tail_fraction``.
+
+    Args:
+        energy_bins: Array of energy bin centers (keV)
+        peak_energy: Center energy of the core component (keV)
+        sigma: Gaussian sigma of the core component (keV)
+        amplitude: Total peak area (counts), shared by core and tail
+        tail_fraction: Share of the area placed in the tail (0-0.5);
+            a value <= 0 yields the core component alone
+        tail_sigma_ratio: Tail sigma expressed as a multiple of ``sigma``
+
+    Returns:
+        Array of counts in each bin
+    """
+    # Core component receives whatever area the tail does not take.
+    core_area = amplitude * (1 - tail_fraction)
+    core = gaussian_peak(energy_bins, peak_energy, sigma, core_area)
+
+    # No tail requested: the core is the whole peak.
+    if tail_fraction <= 0:
+        return core
+
+    # Tail component: wider than the core and shifted 2 sigma downwards.
+    tail_center = peak_energy - 2.0 * sigma
+    tail_width = sigma * tail_sigma_ratio
+    tail_area = amplitude * tail_fraction
+    tail = gaussian_peak(energy_bins, tail_center, tail_width, tail_area)
+
+    return core + tail
+
+
 def generate_peak_spectrum(
    energy_bins: np.ndarray,
    peak_params: PeakParameters,
    detector_config: Optional[DetectorConfig] = None
 ) -> np.ndarray:
    """
-    Generate a single gamma peak with detector response.
-    
+    Generate a single gamma peak with realistic CsI(Tl) detector response.
+
+    Includes:
+    - Asymmetric peak shape (low-energy tail from incomplete charge collection)
+    - K-escape peak (Iodine K-shell X-ray escape at E - 28.5 keV)
+    - Energy-dependent resolution
+
+    Note: Peaks are placed at theoretical gamma energies. The non-linear
+    CsI(Tl) response correction is applied in the inference pipeline
+    (radiacode_monitor.py), not here, to keep training data detector-independent.
+
    Args:
-        energy_bins: Array of energy bin centers (keV)
+        energy_bins: Array of energy bin centers (keV) matching detector calibration
        peak_params: Peak parameters
        detector_config: Detector configuration
-    
+
    Returns:
        Array of expected counts in each bin (not yet Poisson sampled)
    """
    if detector_config is None:
        detector_config = get_default_config()
-    
+
    # Calculate expected counts
-    amplitude = calculate_expected_counts(peak_params, detector_config)
-    
-    if amplitude <= 0:
+    total_amplitude = calculate_expected_counts(peak_params, detector_config)
+
+    if total_amplitude <= 0:
        return np.zeros_like(energy_bins)
-    
+
    # Calculate peak width
    fwhm_kev = calculate_fwhm(peak_params.energy_kev, detector_config.fwhm_at_662)
    sigma = fwhm_to_sigma(fwhm_kev)
-    
-    # Generate Gaussian peak
-    peak = gaussian_peak(energy_bins, peak_params.energy_kev, sigma, amplitude)
-    
+
+    # Low-energy tail fraction: increases at lower energies due to
+    # incomplete charge collection in CsI(Tl)
+    if peak_params.energy_kev < 200:
+        tail_frac = 0.15 * (1.0 - peak_params.energy_kev / 200.0)
+    else:
+        tail_frac = 0.0
+
+    # Generate main peak (asymmetric)
+    peak = _asymmetric_peak(
+        energy_bins, peak_params.energy_kev, sigma,
+        total_amplitude, tail_fraction=tail_frac
+    )
+
+    # K-escape peak for CsI(Tl)
+    escape_frac = _k_escape_fraction(peak_params.energy_kev, detector_config)
+    if escape_frac > 0:
+        escape_energy = peak_params.energy_kev - 28.5  # I K-alpha at 28.5 keV
+        if escape_energy > 20:  # Only if above detection threshold
+            escape_amplitude = total_amplitude * escape_frac
+            # Reduce main peak amplitude
+            peak = peak * (1 - escape_frac)
+
+            # Escape peak has slightly broader resolution
+            escape_fwhm = calculate_fwhm(escape_energy, detector_config.fwhm_at_662)
+            escape_sigma = fwhm_to_sigma(escape_fwhm) * 1.3
+
+            escape_peak = _asymmetric_peak(
+                energy_bins, escape_energy, escape_sigma,
+                escape_amplitude, tail_fraction=0.25
+            )
+            peak = peak + escape_peak
+
    return peak


@ -636,11 +746,11 @@ def apply_electronic_noise(

 def normalize_spectrum(
    spectrum: np.ndarray,
-    method: str = "max"
+    method: str = "log1p"
 ) -> np.ndarray:
    """
    Normalize a spectrum for ML training.
-    
+
    Args:
        spectrum: Raw count spectrum
        method: Normalization method
@ -648,7 +758,8 @@ def normalize_spectrum(
            - "sum": Divide by total counts (probability distribution)
            - "log": Log transform then max normalize
            - "sqrt": Square root transform then max normalize
-    
+            - "log1p": log(1+x) then max normalize (best for bg-subtracted spectra)
+
    Returns:
        Normalized spectrum
    """
@ -657,7 +768,7 @@ def normalize_spectrum(
        if max_val > 0:
            return spectrum / max_val
        return spectrum
-    
+
    elif method == "sum":
        total = spectrum.sum()
        if total > 0:
@ -678,6 +789,13 @@ def normalize_spectrum(
        if max_val > 0:
            return sqrt_spec / max_val
        return sqrt_spec
-    
+
+    elif method == "log1p":
+        log_spec = np.log1p(np.maximum(spectrum, 0))
+        max_val = log_spec.max()
+        if max_val > 0:
+            return log_spec / max_val
+        return log_spec
+
    else:
        raise ValueError(f"Unknown normalization method: {method}")
--- a/train/vega_ml/training/vega/init.py
+++ b/train/vega_ml/training/vega/init.py
@ -14,7 +14,12 @@ Features:

 from .model import VegaModel, VegaConfig
 from .dataset import SpectrumDataset, create_data_loaders
-from .train import train_vega, VegaTrainer
+
+def __getattr__(name):
+    """Module-level attribute hook that imports the training API on access.
+
+    ``train_vega`` and ``VegaTrainer`` are pulled from ``.train`` when they
+    are looked up on this package rather than at package import time.
+    Every other name raises ``AttributeError``.
+    """
+    if name not in ('train_vega', 'VegaTrainer'):
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    # Deferred import: only executed for the two lazily exported names.
+    from .train import train_vega, VegaTrainer
+    return train_vega if name == 'train_vega' else VegaTrainer

 __all__ = [
    'VegaModel',
--- a/train/vega_ml/training/vega/dataset.py
+++ b/train/vega_ml/training/vega/dataset.py
@ -31,24 +31,38 @@ class SpectrumSample:
    detector: str


+def normalize_log1p(spectrum: np.ndarray) -> np.ndarray:
+    """Compress a spectrum with log(1 + x) and rescale it to a maximum of 1.
+
+    Negative inputs are floored at 0 before the transform. If the
+    transformed spectrum has no positive value, it is returned without
+    rescaling.
+
+    NOTE(review): SpectrumConfig also defaults to "log1p" normalization at
+    generation time; confirm the arrays loaded here are raw counts,
+    otherwise the transform is applied twice.
+    """
+    floored = np.maximum(spectrum, 0)
+    compressed = np.log1p(floored)
+    ceiling = compressed.max()
+    # Skip the division when there is nothing positive to scale by.
+    if not ceiling > 0:
+        return compressed
+    return compressed / ceiling
+
+
 class SpectrumDataset(Dataset):
    """
    PyTorch Dataset for synthetic gamma spectra.
-    
+
    Loads spectra from numpy files and their labels from JSON files.
    Supports both individual JSON files per sample (efficient for large datasets)
    and combined labels.json (legacy format).
-    
+
    Converts to tensors suitable for the Vega model.
    """
-    
+
    def __init__(
        self,
        data_dir: Path,
        isotope_index: Optional[IsotopeIndex] = None,
        max_activity_bq: float = 1000.0,
        collapse_time: bool = True,
-        transform=None
+        transform=None,
+        normalization: str = "log1p"
    ):
        """
        Initialize the dataset.
@ -66,6 +80,7 @@ class SpectrumDataset(Dataset):
        self.max_activity_bq = max_activity_bq
        self.collapse_time = collapse_time
        self.transform = transform
+        self.normalization = normalization
        
        # Detect label format and load sample list
        self.use_individual_labels = self._detect_label_format()
@ -156,7 +171,15 @@ class SpectrumDataset(Dataset):
        if self.collapse_time and spectrum.ndim == 2:
            # Average across time intervals to get single spectrum
            spectrum = spectrum.mean(axis=0)
-        
+
+        # Normalize spectrum
+        if self.normalization == "log1p":
+            spectrum = normalize_log1p(spectrum)
+        elif self.normalization == "max":
+            max_val = spectrum.max()
+            if max_val > 0:
+                spectrum = spectrum / max_val
+
        # Convert to tensor
        spectrum_tensor = torch.tensor(spectrum, dtype=torch.float32)