Files
radiacode/train/vega_ml/synthetic_spectra/generate_spectra_v2.py
Jacquin Antoine 745a64b342 Pipeline complet Radiacode 103 - identification automatique d'isotopes
- VegaModel CNN-FCNN 34.5M params, 82 isotopes, val acc 99.89%
- Generation 50k spectres synthetiques 1D (12-24h durees)
- Entrainement 100 epochs sur RTX 5060 Ti (CUDA 12.8, Blackwell)
- Detection continue avec soustraction du background
- Capture background 24h avec gestion deconnexion
- Docker Compose : conteneur train (GPU) + detect (CPU/USB)
- Modele entraine inclus (vega_best.pt, 395 Mo)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-19 12:29:56 +02:00

527 lines
18 KiB
Python

"""
Synthetic Spectra Generation Script v2
Improvements over v1:
- Parallel generation using multiprocessing for 10x+ speedup
- Class-balanced isotope sampling to ensure all isotopes are represented
- More variable background noise (intensity, composition)
- Memory efficient - doesn't accumulate spectra in memory
- Progress bar with ETA
Usage:
python -m synthetic_spectra.generate_spectra_v2 --num_samples 100000 --workers 8
"""
import argparse
import sys
from pathlib import Path
import json
from datetime import datetime
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
import time
from typing import List, Tuple, Dict, Optional
import os
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from synthetic_spectra.generator import (
SpectrumGenerator,
SpectrumConfig,
IsotopeSource,
GeneratedSpectrum,
save_spectrum,
)
from synthetic_spectra.config import RADIACODE_CONFIGS
from synthetic_spectra.ground_truth import get_isotope
# =============================================================================
# ISOTOPE POOL WITH CATEGORIES FOR BALANCED SAMPLING
# =============================================================================
# Candidate isotopes grouped by category name. Category order and the order of
# names inside each list are significant: the flat pool built from this table
# keeps first-seen order. "I-131" is listed under both "medical" and
# "reactor_fallout".
ISOTOPE_CATEGORIES: Dict[str, List[str]] = {
    "calibration": [
        "Cs-137", "Co-60", "Am-241", "Ba-133",
        "Eu-152", "Na-22", "Co-57", "Mn-54",
    ],
    "medical": [
        "Tc-99m", "I-131", "I-123", "F-18", "Ga-67",
        "Ga-68", "In-111", "Lu-177", "Tl-201",
    ],
    "industrial": ["Ir-192", "Se-75", "Zn-65", "Co-58", "Cd-109"],
    "natural_background": ["K-40", "Ra-226", "U-235", "U-238", "Th-232"],
    "decay_chain_u238": ["Pb-214", "Bi-214", "Pb-210"],
    "decay_chain_th232": ["Pb-212", "Bi-212", "Tl-208", "Ac-228", "Ra-224"],
    "reactor_fallout": [
        "Cs-134", "I-131", "Sr-90", "Zr-95", "Nb-95",
        "Ru-103", "Ce-141", "Ce-144", "Sb-125",
    ],
}
def get_valid_isotope_pool() -> Tuple[List[str], Dict[str, List[str]]]:
    """
    Collect the usable isotopes from ISOTOPE_CATEGORIES.

    An isotope is kept only when get_isotope() returns a truthy record whose
    gamma_lines sequence is non-empty.

    Returns:
        Tuple of (flat_list, category_dict). flat_list holds every kept name
        once, in first-seen order; category_dict maps each category to its
        kept names (possibly an empty list).
    """
    pool: List[str] = []
    by_category: Dict[str, List[str]] = {}

    for category in ISOTOPE_CATEGORIES:
        usable = []
        for candidate in ISOTOPE_CATEGORIES[category]:
            record = get_isotope(candidate)
            # Skip unknown isotopes and those without any gamma line.
            if not record or len(record.gamma_lines) == 0:
                continue
            usable.append(candidate)
            # A name may appear in several categories; list it once in the pool.
            if candidate not in pool:
                pool.append(candidate)
        by_category[category] = usable

    return pool, by_category
# =============================================================================
# BACKGROUND VARIATION
# =============================================================================
class BackgroundConfig:
    """
    Parameter ranges used to randomize the background of a spectrum.

    Attributes:
        intensity_min / intensity_max: bounds of the intensity multiplier.
        k40_prob / radon_prob / thorium_prob: probability that the matching
            background component is switched on for a sample.
    """

    def __init__(
        self,
        intensity_min: float = 0.3,
        intensity_max: float = 3.0,
        k40_prob: float = 0.95,  # Almost always present
        radon_prob: float = 0.8,  # Usually present indoors
        thorium_prob: float = 0.6,  # Sometimes present
    ):
        self.intensity_min, self.intensity_max = intensity_min, intensity_max
        self.k40_prob = k40_prob
        self.radon_prob = radon_prob
        self.thorium_prob = thorium_prob

    def sample(self, rng: np.random.Generator) -> dict:
        """
        Draw one random background configuration.

        Consumes four values from rng, in this order: the intensity
        multiplier, then one uniform draw each for K-40, radon and thorium.

        Returns:
            Dict with 'background_cps' (multiplier times 5.0) and the boolean
            flags 'include_k40', 'include_radon', 'include_thorium'.
        """
        multiplier = rng.uniform(self.intensity_min, self.intensity_max)
        with_k40 = rng.random() < self.k40_prob
        with_radon = rng.random() < self.radon_prob
        with_thorium = rng.random() < self.thorium_prob
        return {
            'background_cps': multiplier * 5.0,
            'include_k40': with_k40,
            'include_radon': with_radon,
            'include_thorium': with_thorium,
        }
# =============================================================================
# SINGLE SAMPLE GENERATION (for parallel workers)
# =============================================================================
def generate_single_sample(
    args: Tuple[int, dict]
) -> Optional[str]:
    """
    Generate and save one synthetic spectrum. Designed to be called by worker processes.

    Args:
        args: Tuple of (sample_index, config_dict). config_dict must provide
            'base_seed', 'detector_name', 'sample_types', 'isotope_pool',
            'category_pools', 'duration_range', 'activity_range' and
            'output_dir'. 'bg_intensity_min' and 'bg_intensity_max' are
            optional and default to 0.3 and 3.0.

    Returns:
        Sample ID if successful, None if failed. Any exception raised while
        building or saving the sample is printed and turned into None.
    """
    sample_idx, config = args
    try:
        # Create RNG with unique seed per sample. The order of the draws below
        # determines the output for a given seed, so it must not be reordered.
        rng = np.random.default_rng(config['base_seed'] + sample_idx)

        # Initialize generator (each call creates its own)
        detector_config = RADIACODE_CONFIGS.get(config['detector_name'])
        generator = SpectrumGenerator(detector_config=detector_config)

        # The sample type is looked up by index, wrapping around when the
        # list is shorter than the sample index.
        sample_types = config['sample_types']
        sample_type = sample_types[sample_idx % len(sample_types)]

        isotope_pool = config['isotope_pool']
        category_pools = config['category_pools']

        # Sample background configuration
        bg_config = BackgroundConfig(
            intensity_min=config.get('bg_intensity_min', 0.3),
            intensity_max=config.get('bg_intensity_max', 3.0),
        )
        bg_params = bg_config.sample(rng)

        # Random duration
        duration = rng.uniform(*config['duration_range'])

        def make_source(isotope_name):
            """Draw a random activity and wrap the isotope as a source."""
            activity = rng.uniform(*config['activity_range'])
            return IsotopeSource(
                isotope_name=isotope_name,
                activity_bq=activity,
                include_daughters=True
            )

        # Build sources based on sample type
        sources = []
        if sample_type == 'single':
            # For class balance, cycle through isotopes by sample index
            isotope = isotope_pool[sample_idx % len(isotope_pool)]
            sources.append(make_source(isotope))

        elif sample_type == 'dual':
            # Categories are drawn with replacement, so both isotopes may
            # come from the same category.
            categories = list(category_pools.keys())
            cat1, cat2 = rng.choice(categories, size=2, replace=True)
            iso1 = rng.choice(category_pools[cat1]) if category_pools[cat1] else rng.choice(isotope_pool)
            iso2 = rng.choice(category_pools[cat2]) if category_pools[cat2] else rng.choice(isotope_pool)

            # Ensure different isotopes
            if iso2 == iso1:
                # Redrawing can only succeed if the pool holds another name;
                # otherwise the loop below would spin forever and hang the
                # worker. Fail this sample instead (handled by the except).
                if all(name == iso1 for name in isotope_pool):
                    raise ValueError(
                        "dual sample requires at least two distinct isotopes in the pool"
                    )
                while iso2 == iso1:
                    iso2 = rng.choice(isotope_pool)

            for iso in [iso1, iso2]:
                sources.append(make_source(iso))

        elif sample_type == 'multi':
            # 3-5 isotopes from various categories (upper bound is exclusive)
            num_isotopes = rng.integers(3, 6)
            category_names = list(category_pools.keys())
            selected = set()
            for _ in range(num_isotopes):
                cat = rng.choice(category_names)
                pool = category_pools[cat] if category_pools[cat] else isotope_pool
                iso = rng.choice(pool)

                # Avoid duplicates: retry from the full pool a bounded number
                # of times, then give up on this slot.
                attempts = 0
                while iso in selected and attempts < 10:
                    iso = rng.choice(isotope_pool)
                    attempts += 1

                if iso not in selected:
                    selected.add(iso)
                    sources.append(make_source(iso))
        # Any other sample type (e.g. 'background'): sources stays empty

        # Create spectrum config
        spec_config = SpectrumConfig(
            duration_seconds=duration,
            sources=sources,
            include_background=True,
            background_cps=bg_params['background_cps'],
            include_k40=bg_params['include_k40'],
            include_radon=bg_params['include_radon'],
            include_thorium=bg_params['include_thorium'],
            detector_name=config['detector_name'],
        )

        # Generate spectrum
        spectrum = generator.generate_spectrum(spec_config)

        # Save spectrum
        output_dir = Path(config['output_dir']) / "spectra"
        save_spectrum(
            spectrum,
            output_dir,
            save_image=True,
            image_format='npy'  # Skip PNG for speed
        )
        return spectrum.sample_id
    except Exception as e:
        # Worker boundary: report the failure and let the batch continue.
        print(f"Error generating sample {sample_idx}: {e}")
        return None
# =============================================================================
# MAIN BATCH GENERATION
# =============================================================================
def generate_training_batch_parallel(
    num_samples: int,
    output_dir: Path,
    detector_name: str = "radiacode_103",
    duration_range: Tuple[float, float] = (60, 300),
    activity_range: Tuple[float, float] = (1.0, 100.0),
    single_isotope_fraction: float = 0.40,
    dual_isotope_fraction: float = 0.30,
    multi_isotope_fraction: float = 0.20,
    background_only_fraction: float = 0.10,
    bg_intensity_range: Tuple[float, float] = (0.3, 3.0),
    num_workers: Optional[int] = None,
    random_seed: Optional[int] = None,
    chunk_size: int = 100,
) -> int:
    """
    Generate training samples in parallel.

    Args:
        num_samples: Total number of samples to generate
        output_dir: Output directory (spectra go into its "spectra" subfolder)
        detector_name: Detector to simulate
        duration_range: (min, max) duration in seconds
        activity_range: (min, max) activity in Bq
        single_isotope_fraction: Fraction of single-isotope samples
        dual_isotope_fraction: Fraction of dual-isotope samples
        multi_isotope_fraction: Fraction of multi-isotope samples
        background_only_fraction: Fraction of background-only samples
        bg_intensity_range: (min, max) background intensity multiplier
        num_workers: Number of parallel workers (default: CPU count - 1, at least 1)
        random_seed: Base random seed (default: current Unix time)
        chunk_size: Number of samples dispatched per progress update

    Returns:
        Number of successfully generated samples
    """
    if num_workers is None:
        num_workers = max(1, cpu_count() - 1)
    if random_seed is None:
        random_seed = int(time.time())

    # Create output directory
    output_dir = Path(output_dir)
    spectra_dir = output_dir / "spectra"
    spectra_dir.mkdir(parents=True, exist_ok=True)

    # Get isotope pools
    isotope_pool, category_pools = get_valid_isotope_pool()
    print(f"Isotope pool: {len(isotope_pool)} isotopes across {len(category_pools)} categories")

    # Calculate sample counts
    n_single = int(num_samples * single_isotope_fraction)
    n_dual = int(num_samples * dual_isotope_fraction)
    n_multi = int(num_samples * multi_isotope_fraction)
    n_background = int(num_samples * background_only_fraction)
    # Rounding remainder goes to the single-isotope bucket to hit the exact count
    remaining = num_samples - (n_single + n_dual + n_multi + n_background)
    n_single += remaining

    # Create sample type list (shuffled for variety in batches)
    sample_types = (
        ['single'] * n_single +
        ['dual'] * n_dual +
        ['multi'] * n_multi +
        ['background'] * n_background
    )
    # NOTE(review): this reseeds NumPy's process-wide legacy RNG; kept so the
    # shuffle order for a given seed stays unchanged.
    np.random.seed(random_seed)
    np.random.shuffle(sample_types)

    print(f"\nGenerating {num_samples} samples with {num_workers} workers:")
    print(f" - Single isotope: {n_single} ({single_isotope_fraction*100:.0f}%)")
    print(f" - Dual isotope: {n_dual} ({dual_isotope_fraction*100:.0f}%)")
    print(f" - Multi isotope: {n_multi} ({multi_isotope_fraction*100:.0f}%)")
    print(f" - Background only: {n_background} ({background_only_fraction*100:.0f}%)")
    print(f" - Background intensity: {bg_intensity_range[0]:.1f}x - {bg_intensity_range[1]:.1f}x")
    print()

    # Settings that are identical for every task
    shared_config = {
        'detector_name': detector_name,
        'output_dir': str(output_dir),
        'duration_range': duration_range,
        'activity_range': activity_range,
        'bg_intensity_min': bg_intensity_range[0],
        'bg_intensity_max': bg_intensity_range[1],
        'base_seed': random_seed,
        'isotope_pool': isotope_pool,
        'category_pools': category_pools,
    }

    # Generate samples in parallel
    start_time = time.time()
    successful = 0
    processed = 0

    with Pool(processes=num_workers) as pool:
        # Process in chunks and report progress after each one
        for chunk_start in range(0, num_samples, chunk_size):
            chunk_end = min(chunk_start + chunk_size, num_samples)
            # Ship only this sample's type instead of the full num_samples-long
            # list, which would otherwise be pickled with every task batch.
            # The worker indexes 'sample_types' modulo its length, so a
            # one-element list resolves to exactly this entry.
            chunk_args = [
                (
                    idx,
                    {
                        **shared_config,
                        'sample_types': [sample_types[idx % len(sample_types)]],
                    },
                )
                for idx in range(chunk_start, chunk_end)
            ]
            results = pool.map(generate_single_sample, chunk_args)

            processed += len(results)
            successful += sum(1 for r in results if r is not None)

            # Rate and ETA follow the processed count so failed samples do
            # not distort the estimate.
            elapsed = time.time() - start_time
            rate = processed / elapsed if elapsed > 0 else 0
            eta = (num_samples - processed) / rate if rate > 0 else 0
            print(f" Progress: {successful}/{num_samples} ({100*successful/num_samples:.1f}%) | "
                  f"Rate: {rate:.1f} samples/s | ETA: {eta/60:.1f} min")

    total_time = time.time() - start_time
    # Guard against a zero-length interval on coarse clocks / empty runs
    average_rate = successful / total_time if total_time > 0 else 0.0
    print(f"\n{'='*60}")
    print("Generation complete!")
    print(f" Total samples: {successful}/{num_samples}")
    print(f" Total time: {total_time/60:.1f} minutes")
    print(f" Average rate: {average_rate:.1f} samples/second")
    print(f"{'='*60}")
    return successful
def main():
    """
    Command-line entry point.

    Parses the generation options, prints a summary banner and forwards
    everything to generate_training_batch_parallel().
    """
    parser = argparse.ArgumentParser(
        description="Generate synthetic gamma spectra (v2 - parallel, balanced)"
    )
    parser.add_argument(
        "--num_samples", "-n",
        type=int,
        default=100000,
        help="Number of samples to generate (default: 100000)"
    )
    parser.add_argument(
        "--output_dir", "-o",
        type=str,
        default="O:/master_data_collection/isotopev2",
        help="Output directory (default: O:/master_data_collection/isotopev2)"
    )
    parser.add_argument(
        "--detector",
        type=str,
        default="radiacode_103",
        choices=list(RADIACODE_CONFIGS.keys()),
        help="Detector to simulate (default: radiacode_103)"
    )
    parser.add_argument(
        "--workers", "-w",
        type=int,
        default=None,
        help="Number of parallel workers (default: CPU count - 1)"
    )
    parser.add_argument(
        "--min_duration",
        type=float,
        default=60,
        help="Minimum duration in seconds (default: 60)"
    )
    parser.add_argument(
        "--max_duration",
        type=float,
        default=300,
        help="Maximum duration in seconds (default: 300)"
    )
    parser.add_argument(
        "--min_activity",
        type=float,
        default=1.0,
        help="Minimum activity in Bq (default: 1.0)"
    )
    parser.add_argument(
        "--max_activity",
        type=float,
        default=100.0,
        help="Maximum activity in Bq (default: 100.0)"
    )
    parser.add_argument(
        "--bg_min",
        type=float,
        default=0.3,
        help="Minimum background intensity multiplier (default: 0.3)"
    )
    parser.add_argument(
        "--bg_max",
        type=float,
        default=3.0,
        help="Maximum background intensity multiplier (default: 3.0)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducibility"
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        default=100,
        help="Samples per progress update (default: 100)"
    )
    # Sample type fractions
    parser.add_argument("--single_frac", type=float, default=0.40)
    parser.add_argument("--dual_frac", type=float, default=0.30)
    parser.add_argument("--multi_frac", type=float, default=0.20)
    parser.add_argument("--bg_frac", type=float, default=0.10)
    args = parser.parse_args()

    # Show the worker count that will really be used: the batch function
    # falls back to max(1, cpu_count() - 1) only when --workers is omitted.
    if args.workers is None:
        workers_shown = max(1, cpu_count() - 1)
    else:
        workers_shown = args.workers

    print("=" * 60)
    print("Synthetic Gamma Spectra Generator v2")
    print(" - Parallel processing")
    print(" - Class-balanced sampling")
    print(" - Variable background")
    print("=" * 60)
    print(f"Samples: {args.num_samples:,}")
    print(f"Workers: {workers_shown}")
    print(f"Output: {args.output_dir}")
    print(f"Detector: {args.detector}")
    print(f"Duration: {args.min_duration}-{args.max_duration}s")
    print(f"Activity: {args.min_activity}-{args.max_activity} Bq")
    print(f"Background: {args.bg_min}x-{args.bg_max}x")
    print("=" * 60)

    generate_training_batch_parallel(
        num_samples=args.num_samples,
        output_dir=Path(args.output_dir),
        detector_name=args.detector,
        duration_range=(args.min_duration, args.max_duration),
        activity_range=(args.min_activity, args.max_activity),
        single_isotope_fraction=args.single_frac,
        dual_isotope_fraction=args.dual_frac,
        multi_isotope_fraction=args.multi_frac,
        background_only_fraction=args.bg_frac,
        bg_intensity_range=(args.bg_min, args.bg_max),
        num_workers=args.workers,
        random_seed=args.seed,
        chunk_size=args.chunk_size,
    )
# Entry point: run the CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()