Pipeline complet Radiacode 103 - identification automatique d'isotopes

- VegaModel CNN-FCNN 34.5M params, 82 isotopes, val acc 99.89%
- Generation 50k spectres synthetiques 1D (12-24h durees)
- Entrainement 100 epochs sur RTX 5060 Ti (CUDA 12.8, Blackwell)
- Detection continue avec soustraction du background
- Capture background 24h avec gestion deconnexion
- Docker Compose : conteneur train (GPU) + detect (CPU/USB)
- Modele entraite inclus (vega_best.pt, 395 Mo)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jacquin Antoine
2026-05-19 12:29:56 +02:00
commit 745a64b342
52 changed files with 17558 additions and 0 deletions

View File

@ -0,0 +1,614 @@
"""
Training Script for Vega Model
Implements the training loop with:
- Mixed precision training for RTX 5090 efficiency
- Learning rate scheduling
- Early stopping
- Model checkpointing
- Training metrics logging
"""
import os
import sys
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Tuple
from dataclasses import dataclass, asdict
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
# Sklearn metrics for comprehensive evaluation
from sklearn.metrics import (
roc_auc_score,
f1_score,
precision_score,
recall_score,
hamming_loss
)
from .model import VegaModel, VegaConfig, VegaLoss
from .dataset import create_data_loaders, SpectrumDataset
from .isotope_index import IsotopeIndex, get_default_isotope_index
@dataclass
class TrainingConfig:
"""Configuration for training."""
# Data
data_dir: str = "O:/master_data_collection/isotopev2"
# Model save path
model_dir: str = "models"
model_name: str = "vega"
# Training hyperparameters
batch_size: int = 64 # Increased from 32 for better GPU utilization
learning_rate: float = 1e-3
weight_decay: float = 1e-4
num_epochs: int = 100
# Early stopping
patience: int = 10
min_delta: float = 1e-4
# Learning rate scheduling
lr_scheduler_patience: int = 5
lr_scheduler_factor: float = 0.5
min_lr: float = 1e-6
# Mixed precision
use_amp: bool = True
# Data splits
train_split: float = 0.8
val_split: float = 0.1
test_split: float = 0.1
# Workers - parallel data loading for better GPU utilization
num_workers: int = 8 # Parallel data loading workers
prefetch_factor: int = 4 # Batches to prefetch per worker
persistent_workers: bool = True # Keep workers alive between epochs
# Reproducibility
seed: int = 42
# Activity normalization
max_activity_bq: float = 1000.0
class VegaTrainer:
"""
Trainer class for the Vega model.
Handles the training loop, validation, checkpointing, and metrics.
"""
def __init__(
self,
model: VegaModel,
config: TrainingConfig,
device: Optional[torch.device] = None,
force_cpu: bool = False
):
self.model = model
self.config = config
# Device selection - force CPU if requested or if CUDA incompatible
if force_cpu:
self.device = torch.device('cpu')
elif device:
self.device = device
else:
# Try CUDA but fall back to CPU if there are compatibility issues
if torch.cuda.is_available():
try:
# Test if CUDA actually works
test_tensor = torch.zeros(1, device='cuda')
_ = test_tensor + 1
self.device = torch.device('cuda')
except RuntimeError:
print("CUDA device found but not compatible, falling back to CPU")
self.device = torch.device('cpu')
else:
self.device = torch.device('cpu')
# Move model to device
self.model = self.model.to(self.device)
# Setup loss function
self.loss_fn = VegaLoss(
classification_weight=model.config.classification_weight,
regression_weight=model.config.regression_weight
)
# Setup optimizer
self.optimizer = Adam(
self.model.parameters(),
lr=config.learning_rate,
weight_decay=config.weight_decay
)
# Setup learning rate scheduler
self.scheduler = ReduceLROnPlateau(
self.optimizer,
mode='min',
patience=config.lr_scheduler_patience,
factor=config.lr_scheduler_factor,
min_lr=config.min_lr
)
# Setup mixed precision training (only if CUDA is working)
if config.use_amp and self.device.type == 'cuda':
self.scaler = torch.amp.GradScaler('cuda')
else:
self.scaler = None
# Training state
self.current_epoch = 0
self.best_val_loss = float('inf')
self.epochs_without_improvement = 0
self.training_history = []
# Create model directory
self.model_dir = Path(config.model_dir)
self.model_dir.mkdir(parents=True, exist_ok=True)
print(f"Training on device: {self.device}")
if self.device.type == 'cuda':
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Mixed precision: {config.use_amp}")
def train_epoch(self, train_loader) -> Dict[str, float]:
"""Train for one epoch."""
self.model.train()
total_loss = 0.0
cls_loss_sum = 0.0
reg_loss_sum = 0.0
num_batches = 0
# Track accuracy during training
correct_isotopes = 0
total_isotopes = 0
# Timing for profiling - track data loading vs GPU compute
data_time = 0.0
compute_time = 0.0
data_start = time.time()
for batch in train_loader:
# Data loading time (time spent waiting for next batch)
data_time += time.time() - data_start
compute_start = time.time()
# Move to device
spectra = batch['spectrum'].to(self.device)
presence = batch['presence_labels'].to(self.device)
activities = batch['activity_labels'].to(self.device)
# Zero gradients
self.optimizer.zero_grad()
# Forward pass with optional mixed precision
if self.scaler is not None:
with torch.amp.autocast('cuda'):
pred_logits, pred_activities = self.model(spectra)
loss, loss_dict = self.loss_fn(
pred_logits, pred_activities, presence, activities
)
# Backward pass with scaling
self.scaler.scale(loss).backward()
self.scaler.step(self.optimizer)
self.scaler.update()
else:
pred_logits, pred_activities = self.model(spectra)
loss, loss_dict = self.loss_fn(
pred_logits, pred_activities, presence, activities
)
loss.backward()
self.optimizer.step()
total_loss += loss_dict['total']
cls_loss_sum += loss_dict['classification']
reg_loss_sum += loss_dict['regression']
num_batches += 1
# Calculate training accuracy (detach to avoid memory buildup)
with torch.no_grad():
pred_probs = torch.sigmoid(pred_logits)
pred_presence = (pred_probs >= 0.5).float()
correct_isotopes += (pred_presence == presence).sum().item()
total_isotopes += presence.numel()
# Mark compute time and restart timing for data loading
compute_time += time.time() - compute_start
data_start = time.time()
train_accuracy = correct_isotopes / total_isotopes if total_isotopes > 0 else 0.0
return {
'train_loss': total_loss / num_batches,
'train_cls_loss': cls_loss_sum / num_batches,
'train_reg_loss': reg_loss_sum / num_batches,
'train_accuracy': train_accuracy,
'data_time': data_time,
'compute_time': compute_time
}
@torch.no_grad()
def validate(self, val_loader) -> Dict[str, float]:
"""Validate the model with comprehensive metrics."""
if val_loader is None:
return {}
self.model.eval()
total_loss = 0.0
cls_loss_sum = 0.0
reg_loss_sum = 0.0
num_batches = 0
# Collect all predictions and labels for sklearn metrics
all_probs = []
all_preds = []
all_labels = []
for batch in val_loader:
spectra = batch['spectrum'].to(self.device)
presence = batch['presence_labels'].to(self.device)
activities = batch['activity_labels'].to(self.device)
pred_logits, pred_activities = self.model(spectra)
loss, loss_dict = self.loss_fn(
pred_logits, pred_activities, presence, activities
)
total_loss += loss_dict['total']
cls_loss_sum += loss_dict['classification']
reg_loss_sum += loss_dict['regression']
num_batches += 1
# Collect predictions for metrics
pred_probs = torch.sigmoid(pred_logits)
pred_presence = (pred_probs >= 0.5).float()
all_probs.append(pred_probs.cpu().numpy())
all_preds.append(pred_presence.cpu().numpy())
all_labels.append(presence.cpu().numpy())
# Concatenate all batches
all_probs = np.vstack(all_probs)
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)
# Basic accuracy (element-wise)
correct = (all_preds == all_labels).sum()
total = all_labels.size
accuracy = correct / total if total > 0 else 0.0
# Multi-label metrics using sklearn
metrics = {
'val_loss': total_loss / num_batches,
'val_cls_loss': cls_loss_sum / num_batches,
'val_reg_loss': reg_loss_sum / num_batches,
'val_accuracy': accuracy,
}
try:
# ROC-AUC (macro-averaged over isotopes with both classes present)
# Only compute for columns that have both 0s and 1s
valid_cols = []
for i in range(all_labels.shape[1]):
if len(np.unique(all_labels[:, i])) == 2:
valid_cols.append(i)
if valid_cols:
auc_macro = roc_auc_score(
all_labels[:, valid_cols],
all_probs[:, valid_cols],
average='macro'
)
auc_micro = roc_auc_score(
all_labels[:, valid_cols],
all_probs[:, valid_cols],
average='micro'
)
metrics['val_auc_macro'] = auc_macro
metrics['val_auc_micro'] = auc_micro
else:
metrics['val_auc_macro'] = 0.0
metrics['val_auc_micro'] = 0.0
except ValueError:
# Handle case where AUC can't be computed
metrics['val_auc_macro'] = 0.0
metrics['val_auc_micro'] = 0.0
# F1, Precision, Recall (samples-averaged for multi-label)
metrics['val_f1_macro'] = f1_score(all_labels, all_preds, average='macro', zero_division=0)
metrics['val_f1_micro'] = f1_score(all_labels, all_preds, average='micro', zero_division=0)
metrics['val_precision'] = precision_score(all_labels, all_preds, average='micro', zero_division=0)
metrics['val_recall'] = recall_score(all_labels, all_preds, average='micro', zero_division=0)
# Hamming loss (fraction of labels incorrectly predicted)
metrics['val_hamming'] = hamming_loss(all_labels, all_preds)
# Exact match ratio (all isotopes correct for a sample)
exact_matches = (all_preds == all_labels).all(axis=1).sum()
metrics['val_exact_match'] = exact_matches / len(all_labels)
return metrics
def save_checkpoint(self, path: Path, is_best: bool = False):
"""Save a model checkpoint."""
checkpoint = {
'epoch': self.current_epoch,
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'scheduler_state_dict': self.scheduler.state_dict(),
'best_val_loss': self.best_val_loss,
'model_config': asdict(self.model.config),
'training_config': asdict(self.config),
'training_history': self.training_history
}
if self.scaler is not None:
checkpoint['scaler_state_dict'] = self.scaler.state_dict()
torch.save(checkpoint, path)
if is_best:
best_path = path.parent / f"{self.config.model_name}_best.pt"
torch.save(checkpoint, best_path)
def load_checkpoint(self, path: Path):
"""Load a model checkpoint."""
checkpoint = torch.load(path, map_location=self.device)
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
if self.scaler is not None and 'scaler_state_dict' in checkpoint:
self.scaler.load_state_dict(checkpoint['scaler_state_dict'])
self.current_epoch = checkpoint['epoch']
self.best_val_loss = checkpoint['best_val_loss']
self.training_history = checkpoint.get('training_history', [])
print(f"Loaded checkpoint from epoch {self.current_epoch}")
def train(
self,
train_loader,
val_loader,
resume_from: Optional[Path] = None
) -> Dict:
"""
Full training loop.
Args:
train_loader: Training data loader
val_loader: Validation data loader
resume_from: Optional path to checkpoint to resume from
Returns:
Training results dictionary
"""
if resume_from is not None:
self.load_checkpoint(resume_from)
print("\n" + "=" * 60)
print("Starting Vega Training")
print("=" * 60)
print(f"Epochs: {self.config.num_epochs}")
print(f"Batch size: {self.config.batch_size}")
print(f"Learning rate: {self.config.learning_rate}")
print(f"Training samples: {len(train_loader.dataset)}")
if val_loader:
print(f"Validation samples: {len(val_loader.dataset)}")
print("=" * 60 + "\n")
start_time = time.time()
for epoch in range(self.current_epoch, self.config.num_epochs):
self.current_epoch = epoch
epoch_start = time.time()
# Train
train_metrics = self.train_epoch(train_loader)
# Validate
val_metrics = self.validate(val_loader)
# Combine metrics
metrics = {**train_metrics, **val_metrics, 'epoch': epoch}
self.training_history.append(metrics)
# Update learning rate
if val_loader and 'val_loss' in val_metrics:
self.scheduler.step(val_metrics['val_loss'])
else:
self.scheduler.step(train_metrics['train_loss'])
# Check for improvement
val_loss = val_metrics.get('val_loss', train_metrics['train_loss'])
is_best = val_loss < self.best_val_loss - self.config.min_delta
if is_best:
self.best_val_loss = val_loss
self.epochs_without_improvement = 0
else:
self.epochs_without_improvement += 1
# Save checkpoint
checkpoint_path = self.model_dir / f"{self.config.model_name}_epoch_{epoch}.pt"
self.save_checkpoint(checkpoint_path, is_best=is_best)
# Logging
epoch_time = time.time() - epoch_start
lr = self.optimizer.param_groups[0]['lr']
# Primary metrics line
log_str = (
f"Epoch {epoch+1:3d}/{self.config.num_epochs} | "
f"Train Loss: {train_metrics['train_loss']:.4f} | "
f"Train Acc: {train_metrics['train_accuracy']:.4f} | "
)
if val_loader:
log_str += (
f"Val Loss: {val_metrics['val_loss']:.4f} | "
f"Val Acc: {val_metrics['val_accuracy']:.4f} | "
)
log_str += f"LR: {lr:.2e} | Time: {epoch_time:.1f}s"
if is_best:
log_str += " *"
print(log_str)
# Timing breakdown line
data_t = train_metrics.get('data_time', 0)
compute_t = train_metrics.get('compute_time', 0)
if data_t > 0 or compute_t > 0:
data_pct = 100 * data_t / (data_t + compute_t) if (data_t + compute_t) > 0 else 0
print(f" └── Data: {data_t:.1f}s ({data_pct:.0f}%) | Compute: {compute_t:.1f}s ({100-data_pct:.0f}%)")
# Secondary metrics line (detailed classification metrics)
if val_loader and 'val_auc_macro' in val_metrics:
detail_str = (
f" └── AUC: {val_metrics['val_auc_macro']:.4f} | "
f"F1: {val_metrics['val_f1_macro']:.4f} | "
f"Prec: {val_metrics['val_precision']:.4f} | "
f"Recall: {val_metrics['val_recall']:.4f} | "
f"Exact: {val_metrics['val_exact_match']:.4f}"
)
print(detail_str)
# Early stopping
if self.epochs_without_improvement >= self.config.patience:
print(f"\nEarly stopping after {epoch + 1} epochs")
break
total_time = time.time() - start_time
# Save final model
final_path = self.model_dir / f"{self.config.model_name}_final.pt"
self.save_checkpoint(final_path)
# Save training history
history_path = self.model_dir / f"{self.config.model_name}_history.json"
with open(history_path, 'w') as f:
json.dump(self.training_history, f, indent=2)
print("\n" + "=" * 60)
print(f"Training complete!")
print(f"Total time: {total_time / 60:.1f} minutes")
print(f"Best validation loss: {self.best_val_loss:.4f}")
print(f"Model saved to: {final_path}")
print("=" * 60)
return {
'best_val_loss': self.best_val_loss,
'total_epochs': self.current_epoch + 1,
'total_time': total_time,
'history': self.training_history
}
def train_vega(
data_dir: Optional[str] = None,
model_dir: Optional[str] = None,
config: Optional[TrainingConfig] = None,
model_config: Optional[VegaConfig] = None
) -> Tuple[VegaModel, Dict]:
"""
Convenience function to train a Vega model.
Args:
data_dir: Path to data directory
model_dir: Path to save models
config: Training configuration
model_config: Model configuration
Returns:
Tuple of (trained model, training results)
"""
# Setup paths
project_root = Path(__file__).parent.parent.parent
if config is None:
config = TrainingConfig()
if data_dir:
config.data_dir = data_dir
if model_dir:
config.model_dir = model_dir
# Make paths absolute
data_path = Path(config.data_dir)
if not data_path.is_absolute():
data_path = project_root / data_path
model_path = Path(config.model_dir)
if not model_path.is_absolute():
model_path = project_root / model_path
config.model_dir = str(model_path)
# Set random seeds
torch.manual_seed(config.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(config.seed)
# Get isotope index
isotope_index = get_default_isotope_index()
# Create data loaders with parallel loading
train_loader, val_loader, test_loader = create_data_loaders(
data_dir=data_path,
batch_size=config.batch_size,
train_split=config.train_split,
val_split=config.val_split,
test_split=config.test_split,
num_workers=config.num_workers,
prefetch_factor=config.prefetch_factor,
persistent_workers=config.persistent_workers,
isotope_index=isotope_index,
max_activity_bq=config.max_activity_bq,
seed=config.seed
)
# Create model
if model_config is None:
model_config = VegaConfig(
num_isotopes=isotope_index.num_isotopes,
max_activity_bq=config.max_activity_bq
)
model = VegaModel(model_config)
print(model.summary())
# Create trainer
trainer = VegaTrainer(model, config)
# Train
results = trainer.train(train_loader, val_loader)
# Save isotope index with model
index_path = model_path / f"{config.model_name}_isotope_index.txt"
isotope_index.save(index_path)
return model, results
if __name__ == "__main__":
# Quick test training
model, results = train_vega()