- VegaModel CNN-FCNN 34.5M params, 82 isotopes, val acc 99.89% - Generation 50k spectres synthetiques 1D (12-24h durees) - Entrainement 100 epochs sur RTX 5060 Ti (CUDA 12.8, Blackwell) - Detection continue avec soustraction du background - Capture background 24h avec gestion deconnexion - Docker Compose : conteneur train (GPU) + detect (CPU/USB) - Modele entraine inclus (vega_best.pt, 395 Mo) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
947 lines
30 KiB
Python
947 lines
30 KiB
Python
"""
|
|
Training Data Visualization Script
|
|
|
|
Generates an interactive HTML dashboard with Plotly visualizations to explore
|
|
the synthetic training data distribution, isotope combinations, activities,
|
|
durations, and sample spectra.
|
|
|
|
Usage:
|
|
python -m synthetic_spectra.visualize_training_data
|
|
python -m synthetic_spectra.visualize_training_data --data-dir data/synthetic/spectra
|
|
python -m synthetic_spectra.visualize_training_data --output report.html --max-samples 1000
|
|
|
|
Output:
|
|
An interactive HTML file that can be opened in any browser.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import Counter, defaultdict
|
|
from itertools import combinations
|
|
from typing import Dict, List, Tuple, Optional
|
|
import numpy as np
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
try:
|
|
import plotly.graph_objects as go
|
|
import plotly.express as px
|
|
from plotly.subplots import make_subplots
|
|
except ImportError:
|
|
print("Error: Plotly is required. Install it with: pip install plotly")
|
|
sys.exit(1)
|
|
|
|
from synthetic_spectra.ground_truth.isotope_data import (
|
|
ISOTOPE_DATABASE,
|
|
IsotopeCategory,
|
|
get_isotopes_by_category,
|
|
)
|
|
|
|
|
|
def load_all_metadata(data_dir: Path, max_samples: Optional[int] = None) -> List[Dict]:
    """Load the JSON metadata files found directly inside ``data_dir``.

    Files are visited in sorted path order. When ``max_samples`` is given and
    more files than that exist, a reproducible random subset (fixed seed 42)
    of that size is loaded instead, still in sorted order. Every loaded record
    gets an extra ``'_filename'`` key holding the file's stem.

    Files that cannot be read, are not valid JSON, or whose top-level value is
    not a JSON object are reported with a warning and skipped.

    Args:
        data_dir: Directory to scan for ``*.json`` files (non-recursive).
        max_samples: Upper bound on the number of files to load, or ``None``
            to load every file.

    Returns:
        The list of successfully loaded metadata dictionaries.
    """
    json_files = sorted(data_dir.glob("*.json"))

    if max_samples is not None and len(json_files) > max_samples:
        # A private RandomState draws exactly the subset that
        # np.random.seed(42) + np.random.choice would, but leaves the
        # process-wide NumPy generator untouched for other callers.
        rng = np.random.RandomState(42)
        indices = rng.choice(len(json_files), max_samples, replace=False)
        json_files = [json_files[i] for i in sorted(indices)]

    metadata_list = []
    print(f"Loading {len(json_files)} metadata files...")

    for i, json_file in enumerate(json_files):
        try:
            # Explicit UTF-8 so the result does not depend on the platform's
            # default text encoding.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f" Warning: Could not load {json_file}: {e}")
        else:
            if isinstance(data, dict):
                data['_filename'] = json_file.stem
                metadata_list.append(data)
            else:
                # The rest of the pipeline calls .get() on every record, so a
                # JSON array or scalar cannot be used as metadata.
                print(
                    f" Warning: Could not load {json_file}: "
                    f"expected a JSON object, got {type(data).__name__}"
                )

        if (i + 1) % 1000 == 0:
            print(f" Loaded {i + 1}/{len(json_files)} files...")

    print(f"Loaded {len(metadata_list)} samples successfully.")
    return metadata_list
|
|
|
|
|
|
def load_sample_spectra(data_dir: Path, sample_ids: List[str]) -> Dict[str, np.ndarray]:
    """Read the ``<sample_id>.npy`` arrays for the requested samples.

    Samples whose ``.npy`` file does not exist are silently left out; files
    that exist but fail to load produce a warning and are left out as well.

    Args:
        data_dir: Directory holding the ``.npy`` files.
        sample_ids: File stems of the samples to load.

    Returns:
        Mapping from sample id to the array returned by ``np.load``.
    """
    loaded = {}
    for sid in sample_ids:
        path = data_dir / f"{sid}.npy"
        if not path.exists():
            continue
        try:
            loaded[sid] = np.load(path)
        except Exception as err:
            # Best effort: one unreadable spectrum must not abort the report.
            print(f" Warning: Could not load spectrum {path}: {err}")
    return loaded
|
|
|
|
|
|
def compute_statistics(metadata_list: List[Dict]) -> Dict:
    """Aggregate per-sample metadata into dataset-wide statistics.

    Each record is read through the keys ``'isotopes'``,
    ``'source_activities_bq'``, ``'duration_seconds'``, ``'detector'`` (all
    optional) and ``'_filename'`` (required).

    Args:
        metadata_list: Metadata dictionaries, one per sample.

    Returns:
        A dictionary with the keys ``'total_samples'``, ``'isotope_counts'``,
        ``'isotope_cooccurrence'``, ``'num_isotopes_distribution'``,
        ``'durations'``, ``'activities'``, ``'detectors'``,
        ``'category_counts'`` and ``'samples_by_num_isotopes'``.
    """
    isotope_counts = Counter()
    pair_counts = defaultdict(int)
    complexity = Counter()
    all_durations = []
    activities_by_isotope = defaultdict(list)
    detector_counts = Counter()
    category_counts = Counter()
    filenames_by_complexity = defaultdict(list)

    for record in metadata_list:
        present = record.get('isotopes', [])

        # Tally each isotope and, when the database knows it, its category.
        for name in present:
            isotope_counts[name] += 1
            if name in ISOTOPE_DATABASE:
                category_counts[ISOTOPE_DATABASE[name].category.value] += 1

        # Pairs come from the sorted list so (A, B) and (B, A) share one key.
        for pair in combinations(sorted(present), 2):
            pair_counts[pair] += 1

        n_present = len(present)
        complexity[n_present] += 1
        filenames_by_complexity[n_present].append(record['_filename'])

        all_durations.append(record.get('duration_seconds', 0))

        for name, activity in record.get('source_activities_bq', {}).items():
            activities_by_isotope[name].append(activity)

        detector_counts[record.get('detector', 'unknown')] += 1

    return {
        'total_samples': len(metadata_list),
        'isotope_counts': isotope_counts,
        'isotope_cooccurrence': pair_counts,
        'num_isotopes_distribution': complexity,
        'durations': all_durations,
        'activities': activities_by_isotope,
        'detectors': detector_counts,
        'category_counts': category_counts,
        'samples_by_num_isotopes': filenames_by_complexity,
    }
|
|
|
|
|
|
def create_isotope_frequency_chart(stats: Dict) -> go.Figure:
    """Build a bar chart of how many samples contain each isotope.

    Bars are ordered from the most to the least frequent isotope and coloured
    by the isotope's category in ``ISOTOPE_DATABASE``; isotopes or categories
    without a dedicated colour fall back to grey.

    Args:
        stats: Statistics dictionary; only ``stats['isotope_counts']`` is read.

    Returns:
        The Plotly bar figure.
    """
    fallback_color = '#7f8c8d'
    palette = {
        'natural_background': '#2ecc71',
        'primordial': '#27ae60',
        'cosmogenic': '#1abc9c',
        'u238_chain': '#e74c3c',
        'th232_chain': '#c0392b',
        'u235_chain': '#d35400',
        'calibration': '#3498db',
        'industrial': '#9b59b6',
        'medical': '#f1c40f',
        'reactor_fallout': '#e67e22',
        'activation': '#95a5a6',
    }

    def color_for(name):
        """Return the category colour for one isotope name."""
        if name not in ISOTOPE_DATABASE:
            return fallback_color
        return palette.get(ISOTOPE_DATABASE[name].category.value, fallback_color)

    # Most frequent first; the sort is stable, so ties keep insertion order.
    ranked = sorted(
        stats['isotope_counts'].items(),
        key=lambda item: item[1],
        reverse=True,
    )
    names = [name for name, _ in ranked]
    totals = [total for _, total in ranked]

    bars = go.Bar(
        x=names,
        y=totals,
        marker_color=[color_for(name) for name in names],
        hovertemplate="<b>%{x}</b><br>Count: %{y}<extra></extra>",
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Isotope Frequency Distribution",
        xaxis_title="Isotope",
        yaxis_title="Number of Samples",
        xaxis_tickangle=-45,
        height=500,
        showlegend=False,
    )
    return fig
|
|
|
|
|
|
def create_category_pie_chart(stats: Dict) -> go.Figure:
    """Build a donut chart of isotope occurrences grouped by category.

    Args:
        stats: Statistics dictionary; only ``stats['category_counts']`` is read.

    Returns:
        The Plotly pie figure, or an empty figure carrying a "no data"
        annotation when there are no category counts.
    """
    category_counts = stats['category_counts']

    if not category_counts:
        placeholder = go.Figure()
        return placeholder.add_annotation(
            text="No category data available",
            xref="paper", yref="paper", x=0.5, y=0.5,
        )

    # Human-readable names; unknown category keys are shown unchanged.
    display_names = {
        'natural_background': 'Natural Background',
        'primordial': 'Primordial',
        'cosmogenic': 'Cosmogenic',
        'u238_chain': 'U-238 Chain',
        'th232_chain': 'Th-232 Chain',
        'u235_chain': 'U-235 Chain',
        'calibration': 'Calibration',
        'industrial': 'Industrial',
        'medical': 'Medical',
        'reactor_fallout': 'Reactor/Fallout',
        'activation': 'Activation Products',
    }

    slice_labels = []
    slice_values = []
    for key, count in category_counts.items():
        slice_labels.append(display_names.get(key, key))
        slice_values.append(count)

    donut = go.Pie(
        labels=slice_labels,
        values=slice_values,
        hole=0.4,
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>%{percent}<extra></extra>",
    )
    fig = go.Figure(data=[donut])
    fig.update_layout(title="Isotope Categories Distribution", height=450)
    return fig
|
|
|
|
|
|
def create_num_isotopes_histogram(stats: Dict) -> go.Figure:
    """Build a bar chart of how many samples contain N isotopes.

    Each bar is labelled with its share of all samples as a percentage.

    Args:
        stats: Statistics dictionary; only
            ``stats['num_isotopes_distribution']`` is read.

    Returns:
        The Plotly bar figure.
    """
    distribution = stats['num_isotopes_distribution']

    sizes = sorted(distribution.keys())
    sample_counts = [distribution[size] for size in sizes]

    # Share of the dataset represented by each bar, shown on the bar itself.
    grand_total = sum(sample_counts)
    share_labels = []
    for count in sample_counts:
        share_labels.append(f"{(count/grand_total)*100:.1f}%")

    bars = go.Bar(
        x=[str(size) for size in sizes],
        y=sample_counts,
        text=share_labels,
        textposition='auto',
        marker_color='#3498db',
        hovertemplate="<b>%{x} isotopes</b><br>Count: %{y}<br>%{text}<extra></extra>",
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Sample Complexity (Number of Isotopes per Sample)",
        xaxis_title="Number of Source Isotopes",
        yaxis_title="Number of Samples",
        height=400,
    )
    return fig
|
|
|
|
|
|
def create_duration_histogram(stats: Dict) -> go.Figure:
    """Build a 50-bin histogram of the ``duration_seconds`` values.

    A boxed annotation in the top-right corner reports the mean, standard
    deviation and range of the durations.

    Args:
        stats: Statistics dictionary; only ``stats['durations']`` is read.

    Returns:
        The Plotly histogram figure, or an empty figure carrying a "no data"
        annotation when there are no durations.
    """
    durations = stats['durations']

    if not durations:
        placeholder = go.Figure()
        return placeholder.add_annotation(
            text="No duration data available",
            xref="paper", yref="paper", x=0.5, y=0.5,
        )

    histogram = go.Histogram(
        x=durations,
        nbinsx=50,
        marker_color='#9b59b6',
        hovertemplate="Duration: %{x:.1f}s<br>Count: %{y}<extra></extra>",
    )
    fig = go.Figure(data=[histogram])
    fig.update_layout(
        title="Measurement Duration Distribution",
        xaxis_title="Duration (seconds)",
        yaxis_title="Number of Samples",
        height=400,
    )

    # Summary line shown inside the plot area.
    mean_dur = np.mean(durations)
    std_dur = np.std(durations)
    min_dur = np.min(durations)
    max_dur = np.max(durations)
    summary = (
        f"Mean: {mean_dur:.1f}s | Std: {std_dur:.1f}s | "
        f"Range: [{min_dur:.1f}, {max_dur:.1f}]s"
    )
    fig.add_annotation(
        text=summary,
        xref="paper", yref="paper",
        x=0.98, y=0.98,
        xanchor='right', yanchor='top',
        showarrow=False,
        bgcolor="white",
        bordercolor="black",
        borderwidth=1,
        font=dict(size=11),
    )
    return fig
|
|
|
|
|
|
def create_activity_boxplot(stats: Dict) -> go.Figure:
    """Build per-isotope box plots of the recorded source activities.

    Isotopes are ordered by descending median activity and only the first 30
    are drawn.

    Args:
        stats: Statistics dictionary; only ``stats['activities']`` is read.

    Returns:
        The Plotly box-plot figure, or an empty figure carrying a "no data"
        annotation when there are no activities.
    """
    activities = stats['activities']

    if not activities:
        placeholder = go.Figure()
        return placeholder.add_annotation(
            text="No activity data available",
            xref="paper", yref="paper", x=0.5, y=0.5,
        )

    def median_activity(name):
        """Sort key: median of an isotope's activities (0 for an empty list)."""
        values = activities[name]
        return np.median(values) if values else 0

    ranked = sorted(activities.keys(), key=median_activity, reverse=True)

    fig = go.Figure()
    # Cap at 30 boxes so the x axis stays readable.
    for name in ranked[:30]:
        box = go.Box(
            y=activities[name],
            name=name,
            boxpoints='outliers',
            hovertemplate=f"<b>{name}</b><br>Activity: %{{y:.2f}} Bq<extra></extra>",
        )
        fig.add_trace(box)

    fig.update_layout(
        title="Activity Distribution by Isotope (Top 30)",
        xaxis_title="Isotope",
        yaxis_title="Activity (Bq)",
        xaxis_tickangle=-45,
        height=500,
        showlegend=False,
    )
    return fig
|
|
|
|
|
|
def create_cooccurrence_heatmap(stats: Dict, top_n: int = 20) -> go.Figure:
    """Build a symmetric heatmap of how often two isotopes share a sample.

    Only the ``top_n`` most frequent isotopes are shown; the diagonal is left
    at zero.

    Args:
        stats: Statistics dictionary; ``stats['isotope_cooccurrence']`` and
            ``stats['isotope_counts']`` are read.
        top_n: Number of most frequent isotopes to include.

    Returns:
        The Plotly heatmap figure, or an empty figure carrying a "no data"
        annotation when no pair was recorded.
    """
    pair_counts = stats['isotope_cooccurrence']
    frequency = stats['isotope_counts']

    if not pair_counts:
        placeholder = go.Figure()
        return placeholder.add_annotation(
            text="No co-occurrence data (need multi-isotope samples)",
            xref="paper", yref="paper", x=0.5, y=0.5,
        )

    top_isotopes = [name for name, _ in frequency.most_common(top_n)]
    size = len(top_isotopes)
    matrix = np.zeros((size, size))

    # Visit each unordered pair once and mirror it across the diagonal.
    for (row, first), (col, second) in combinations(enumerate(top_isotopes), 2):
        # Keys in the co-occurrence table are alphabetically sorted tuples.
        key = tuple(sorted([first, second]))
        shared = pair_counts.get(key, 0)
        matrix[row, col] = shared
        matrix[col, row] = shared

    heatmap = go.Heatmap(
        z=matrix,
        x=top_isotopes,
        y=top_isotopes,
        colorscale='Blues',
        hovertemplate="<b>%{x}</b> + <b>%{y}</b><br>Co-occurrences: %{z}<extra></extra>",
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=f"Isotope Co-occurrence Matrix (Top {top_n} Isotopes)",
        xaxis_tickangle=-45,
        height=600,
        width=700,
    )
    return fig
|
|
|
|
|
|
def create_activity_vs_duration_scatter(metadata_list: List[Dict]) -> go.Figure:
    """Build a scatter plot of summed source activity against duration.

    Only samples with a positive ``duration_seconds`` and a non-empty
    ``source_activities_bq`` mapping are plotted. Markers are coloured by the
    number of isotopes in the sample.

    Args:
        metadata_list: Metadata dictionaries, one per sample.

    Returns:
        The Plotly scatter figure, or an empty figure carrying a "no data"
        annotation when no sample qualifies.
    """
    points = []
    for record in metadata_list:
        seconds = record.get('duration_seconds', 0)
        per_source = record.get('source_activities_bq', {})
        if not (seconds > 0 and per_source):
            continue
        points.append((
            seconds,
            sum(per_source.values()),
            len(record.get('isotopes', [])),
            record['_filename'],
        ))

    if not points:
        placeholder = go.Figure()
        return placeholder.add_annotation(
            text="No data available",
            xref="paper", yref="paper", x=0.5, y=0.5,
        )

    # Split the collected rows back into one list per plotted quantity.
    durations, total_activities, isotope_totals, sample_ids = (
        list(column) for column in zip(*points)
    )

    marker_style = dict(
        size=6,
        color=isotope_totals,
        colorscale='Viridis',
        colorbar=dict(title="# Isotopes"),
        opacity=0.6,
    )
    scatter = go.Scatter(
        x=durations,
        y=total_activities,
        mode='markers',
        marker=marker_style,
        text=sample_ids,
        hovertemplate="<b>%{text}</b><br>Duration: %{x:.1f}s<br>Total Activity: %{y:.2f} Bq<extra></extra>",
    )
    fig = go.Figure(data=scatter)
    fig.update_layout(
        title="Total Source Activity vs Measurement Duration",
        xaxis_title="Duration (seconds)",
        yaxis_title="Total Activity (Bq)",
        height=500,
    )
    return fig
|
|
|
|
|
|
def create_sample_spectrum_plot(spectra: Dict[str, np.ndarray], metadata_list: List[Dict]) -> go.Figure:
    """Build an overlaid line plot of up to six sample spectra.

    Two-dimensional spectra are summed along axis 0 before plotting; arrays of
    any other rank are plotted as they are. The x axis spans 20 to 3000 and is
    labelled as keV; it is sized from each spectrum's own length, so spectra
    that do not have 1023 channels still get matching x and y arrays.

    Args:
        spectra: Mapping from sample id to spectrum array.
        metadata_list: Metadata dictionaries; ``'_filename'`` is matched
            against the sample ids to look up each sample's ``'isotopes'``
            for the legend label.

    Returns:
        The Plotly line figure, or an empty figure carrying a "no data"
        annotation when ``spectra`` is empty.
    """
    if not spectra:
        return go.Figure().add_annotation(text="No spectrum data loaded",
                                          xref="paper", yref="paper", x=0.5, y=0.5)

    # Create a metadata lookup
    meta_lookup = {m['_filename']: m for m in metadata_list}

    fig = go.Figure()
    colors = px.colors.qualitative.Set2

    for i, (sample_id, spectrum) in enumerate(list(spectra.items())[:6]):
        # Sum across time intervals to get total spectrum
        total_spectrum = spectrum.sum(axis=0) if spectrum.ndim == 2 else spectrum

        # Energy axis (keV) from 20 to 3000, one point per channel of THIS
        # spectrum (previously hard-coded to 1023 points, which misaligned
        # any spectrum of a different length).
        energy = np.linspace(20, 3000, len(total_spectrum))

        # Get isotope info
        meta = meta_lookup.get(sample_id, {})
        isotopes = meta.get('isotopes', ['Unknown'])
        label = f"{sample_id[-6:]}: {', '.join(isotopes)}"

        fig.add_trace(go.Scatter(
            x=energy,
            y=total_spectrum,
            mode='lines',
            name=label,
            line=dict(color=colors[i % len(colors)], width=1),
            hovertemplate=f"<b>{label}</b><br>Energy: %{{x:.1f}} keV<br>Counts: %{{y:.2f}}<extra></extra>"
        ))

    fig.update_layout(
        title="Sample Spectra (Time-Integrated)",
        xaxis_title="Energy (keV)",
        yaxis_title="Normalized Counts",
        height=500,
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
        hovermode='closest'
    )

    return fig
|
|
|
|
|
|
def create_3d_spectrum_surface(spectrum: np.ndarray, sample_id: str) -> go.Figure:
    """Build a 3D surface plot of one 2D spectrum.

    Axis 0 of ``spectrum`` is plotted as the time axis (interval index) and
    axis 1 as the energy axis, spread linearly from 20 to 3000. Large arrays
    are strided down to at most 100 rows and 256 columns before plotting.

    Args:
        spectrum: Two-dimensional array to plot.
        sample_id: Identifier shown in the figure title.

    Returns:
        The Plotly surface figure, or an empty figure carrying an explanatory
        annotation when ``spectrum`` is not two-dimensional.
    """
    if spectrum.ndim != 2:
        return go.Figure().add_annotation(text="Spectrum must be 2D",
                                          xref="paper", yref="paper", x=0.5, y=0.5)

    num_intervals, num_channels = spectrum.shape

    # Create axes
    # NOTE(review): the time axis is the interval index; the "s" unit in the
    # labels below presumes one interval per second - confirm with the generator.
    time_axis = np.arange(num_intervals)
    energy_axis = np.linspace(20, 3000, num_channels)

    # Downsample for performance if needed. Ceiling division is required to
    # actually honour the caps: floor division gave step 3 for 1023 channels
    # (341 columns kept) and step 1 for 101-199 intervals (nothing dropped).
    if num_intervals > 100:
        step = -(-num_intervals // 100)
        spectrum = spectrum[::step, :]
        time_axis = time_axis[::step]

    if num_channels > 256:
        ch_step = -(-num_channels // 256)
        spectrum = spectrum[:, ::ch_step]
        energy_axis = energy_axis[::ch_step]

    fig = go.Figure(data=[
        go.Surface(
            z=spectrum,
            x=energy_axis,
            y=time_axis,
            colorscale='Viridis',
            hovertemplate="Time: %{y}s<br>Energy: %{x:.1f} keV<br>Counts: %{z:.3f}<extra></extra>"
        )
    ])

    fig.update_layout(
        title=f"3D Spectrum View: {sample_id}",
        scene=dict(
            xaxis_title="Energy (keV)",
            yaxis_title="Time (s)",
            zaxis_title="Counts",
        ),
        height=600,
    )

    return fig
|
|
|
|
|
|
def create_summary_table(stats: Dict) -> str:
    """Render the headline dataset figures as an HTML fragment.

    The table lists the total sample count, number of unique isotopes,
    average isotopes per sample, duration range and mean, activity range, and
    the detector names. Missing data is rendered as zero.

    Args:
        stats: Statistics dictionary; the keys ``'total_samples'``,
            ``'isotope_counts'``, ``'num_isotopes_distribution'``,
            ``'durations'``, ``'activities'`` and ``'detectors'`` are read.

    Returns:
        An HTML ``<div>`` containing the summary table.
    """
    # Imported locally under an explicit name; a module-level ``html`` import
    # would invite shadowing by the conventional ``html`` result variable.
    from html import escape

    total = stats['total_samples']
    num_unique_isotopes = len(stats['isotope_counts'])
    avg_isotopes_per_sample = (
        sum(k * v for k, v in stats['num_isotopes_distribution'].items()) / total
        if total else 0
    )

    durations = stats['durations']
    activities_all = [a for acts in stats['activities'].values() for a in acts]

    min_duration = min(durations) if durations else 0
    max_duration = max(durations) if durations else 0
    mean_duration = np.mean(durations) if durations else 0
    min_activity = min(activities_all) if activities_all else 0
    max_activity = max(activities_all) if activities_all else 0

    # Detector names come straight from the metadata files: stringify them
    # (a JSON null would otherwise break str.join) and escape HTML
    # metacharacters so a name cannot corrupt the page markup.
    detector_names = ', '.join(escape(str(name)) for name in stats['detectors'])

    table_html = f"""
<div style="padding: 20px; background: #f8f9fa; border-radius: 10px; margin: 20px 0;">
    <h3 style="margin-top: 0; color: #2c3e50;">📊 Dataset Summary</h3>
    <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Total Samples</strong></td>
            <td style="padding: 8px;">{total:,}</td>
        </tr>
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Unique Isotopes</strong></td>
            <td style="padding: 8px;">{num_unique_isotopes}</td>
        </tr>
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Avg Isotopes per Sample</strong></td>
            <td style="padding: 8px;">{avg_isotopes_per_sample:.2f}</td>
        </tr>
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Duration Range</strong></td>
            <td style="padding: 8px;">{min_duration:.1f}s - {max_duration:.1f}s</td>
        </tr>
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Mean Duration</strong></td>
            <td style="padding: 8px;">{mean_duration:.1f}s</td>
        </tr>
        <tr style="border-bottom: 1px solid #ddd;">
            <td style="padding: 8px;"><strong>Activity Range</strong></td>
            <td style="padding: 8px;">{min_activity:.2f} - {max_activity:.2f} Bq</td>
        </tr>
        <tr>
            <td style="padding: 8px;"><strong>Detectors</strong></td>
            <td style="padding: 8px;">{detector_names}</td>
        </tr>
    </table>
</div>
"""
    return table_html
|
|
|
|
|
|
def create_isotope_database_summary() -> go.Figure:
    """Build a sunburst chart of ``ISOTOPE_DATABASE`` grouped by category.

    The hierarchy has three levels: a single root node, one node per
    category (labelled with its isotope count), and one leaf per isotope.

    Returns:
        The Plotly sunburst figure.
    """
    root = "Isotope Database"
    display_names = {
        'natural_background': 'Natural Background',
        'primordial': 'Primordial',
        'cosmogenic': 'Cosmogenic',
        'u238_chain': 'U-238 Chain',
        'th232_chain': 'Th-232 Chain',
        'u235_chain': 'U-235 Chain',
        'calibration': 'Calibration',
        'industrial': 'Industrial',
        'medical': 'Medical',
        'reactor_fallout': 'Reactor/Fallout',
        'activation': 'Activation',
    }

    # Group isotope names under their category key.
    members_by_category = defaultdict(list)
    for name, isotope in ISOTOPE_DATABASE.items():
        members_by_category[isotope.category.value].append(name)

    # One (id, label, parent, value) row per sunburst node, root first.
    nodes = [(root, root, "", len(ISOTOPE_DATABASE))]
    for category, members in members_by_category.items():
        category_label = display_names.get(category, category)
        nodes.append(
            (category_label, f"{category_label} ({len(members)})", root, len(members))
        )
        for member in members:
            # Leaf ids are prefixed with the category label to keep them unique.
            nodes.append((f"{category_label}/{member}", member, category_label, 1))

    sunburst = go.Sunburst(
        ids=[node[0] for node in nodes],
        labels=[node[1] for node in nodes],
        parents=[node[2] for node in nodes],
        values=[node[3] for node in nodes],
        branchvalues="total",
        hovertemplate="<b>%{label}</b><extra></extra>",
    )
    fig = go.Figure(sunburst)
    fig.update_layout(
        title=f"Isotope Database Structure ({len(ISOTOPE_DATABASE)} isotopes)",
        height=600,
    )
    return fig
|
|
|
|
|
|
def generate_html_report(
    data_dir: Path,
    output_file: Path,
    max_samples: Optional[int] = None
):
    """Generate the complete HTML report.

    Loads the metadata from ``data_dir``, computes the dataset statistics,
    loads the spectra of the first ten loaded samples, builds every figure and
    writes a single self-describing HTML page to ``output_file``. Progress is
    printed to stdout. If no metadata could be loaded, an error is printed and
    nothing is written.

    Args:
        data_dir: Directory containing the ``.json`` / ``.npy`` sample files.
        output_file: Path of the HTML file to write (parent directories are
            created when missing).
        max_samples: Optional cap on the number of metadata files analysed.
    """

    print("=" * 60)
    print("Training Data Visualization Report Generator")
    print("=" * 60)

    # Load all metadata
    metadata_list = load_all_metadata(data_dir, max_samples)

    if not metadata_list:
        print("Error: No metadata files found!")
        return

    # Compute statistics
    print("\nComputing statistics...")
    stats = compute_statistics(metadata_list)

    # Load a few sample spectra
    print("\nLoading sample spectra for visualization...")
    sample_ids = [m['_filename'] for m in metadata_list[:10]]
    spectra = load_sample_spectra(data_dir, sample_ids)

    print("\nGenerating visualizations...")

    # Generate all figures
    figures = {
        'isotope_freq': create_isotope_frequency_chart(stats),
        'category_pie': create_category_pie_chart(stats),
        'num_isotopes': create_num_isotopes_histogram(stats),
        'duration_hist': create_duration_histogram(stats),
        'activity_box': create_activity_boxplot(stats),
        'cooccurrence': create_cooccurrence_heatmap(stats),
        'activity_duration': create_activity_vs_duration_scatter(metadata_list),
        'sample_spectra': create_sample_spectrum_plot(spectra, metadata_list),
        'isotope_db': create_isotope_database_summary(),
    }

    # Add 3D spectrum if we have data
    if spectra:
        first_id = list(spectra.keys())[0]
        figures['spectrum_3d'] = create_3d_spectrum_surface(spectra[first_id], first_id)

    def embed(name: str, load_plotlyjs: bool = False) -> str:
        """Return the <div> markup for one figure.

        With ``load_plotlyjs`` the fragment also carries the CDN <script> tag
        for the plotly.js version that matches the installed plotly package.
        """
        return figures[name].to_html(
            full_html=False,
            include_plotlyjs='cdn' if load_plotlyjs else False,
        )

    # Create HTML
    print("\nBuilding HTML report...")

    html_parts = [
        # The page no longer references "plotly-latest.min.js": that CDN alias
        # is frozen at plotly.js v1.58.5 and may not understand figures
        # produced by a newer plotly package. The first chart embedded below
        # pulls in the version-matched library instead.
        """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Synthetic Training Data Visualization</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            margin: 0;
            padding: 20px;
            background: #ecf0f1;
            color: #2c3e50;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
        }
        h1 {
            text-align: center;
            color: #2c3e50;
            padding: 20px;
            background: white;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h2 {
            color: #34495e;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
            margin-top: 40px;
        }
        .chart-container {
            background: white;
            padding: 20px;
            border-radius: 10px;
            margin: 20px 0;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .row {
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
        }
        .col-6 {
            flex: 1;
            min-width: 400px;
        }
        .col-12 {
            width: 100%;
        }
        .toc {
            background: white;
            padding: 20px;
            border-radius: 10px;
            margin: 20px 0;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .toc ul {
            list-style: none;
            padding-left: 0;
        }
        .toc li {
            margin: 10px 0;
        }
        .toc a {
            color: #3498db;
            text-decoration: none;
        }
        .toc a:hover {
            text-decoration: underline;
        }
        .info-box {
            background: #e8f6ff;
            border-left: 4px solid #3498db;
            padding: 15px;
            margin: 20px 0;
            border-radius: 0 10px 10px 0;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>🔬 Synthetic Gamma Spectra Training Data Analysis</h1>
""",

        create_summary_table(stats),

        """
        <div class="toc">
            <h3>📑 Table of Contents</h3>
            <ul>
                <li><a href="#isotope-distribution">1. Isotope Distribution</a></li>
                <li><a href="#sample-complexity">2. Sample Complexity</a></li>
                <li><a href="#temporal-activity">3. Temporal &amp; Activity Analysis</a></li>
                <li><a href="#cooccurrence">4. Isotope Co-occurrence</a></li>
                <li><a href="#sample-spectra">5. Sample Spectra</a></li>
                <li><a href="#database-overview">6. Isotope Database Overview</a></li>
            </ul>
        </div>

        <h2 id="isotope-distribution">1. Isotope Distribution</h2>
        <div class="info-box">
            <strong>What this shows:</strong> The frequency of each isotope across all training samples.
            Imbalanced distributions may lead to model bias towards common isotopes.
        </div>
        <div class="row">
            <div class="col-6 chart-container">
""",
        # First chart on the page: it also loads plotly.js for all later ones.
        embed('isotope_freq', load_plotlyjs=True),
        """
            </div>
            <div class="col-6 chart-container">
""",
        embed('category_pie'),
        """
            </div>
        </div>

        <h2 id="sample-complexity">2. Sample Complexity</h2>
        <div class="info-box">
            <strong>What this shows:</strong> Distribution of how many source isotopes are present per sample.
            Mix of single and multi-isotope samples helps the model handle real-world complexity.
        </div>
        <div class="chart-container">
""",
        embed('num_isotopes'),
        """
        </div>

        <h2 id="temporal-activity">3. Temporal &amp; Activity Analysis</h2>
        <div class="info-box">
            <strong>What this shows:</strong> Distribution of measurement durations and source activities.
            Varied durations simulate different counting scenarios.
        </div>
        <div class="row">
            <div class="col-6 chart-container">
""",
        embed('duration_hist'),
        """
            </div>
            <div class="col-6 chart-container">
""",
        embed('activity_duration'),
        """
            </div>
        </div>
        <div class="chart-container">
""",
        embed('activity_box'),
        """
        </div>

        <h2 id="cooccurrence">4. Isotope Co-occurrence</h2>
        <div class="info-box">
            <strong>What this shows:</strong> Which isotopes frequently appear together in training samples.
            This helps understand potential confusion pairs and realistic combinations.
        </div>
        <div class="chart-container">
""",
        embed('cooccurrence'),
        """
        </div>

        <h2 id="sample-spectra">5. Sample Spectra Visualization</h2>
        <div class="info-box">
            <strong>What this shows:</strong> Actual spectrum shapes from the training data.
            Each peak corresponds to gamma emission lines from the source isotopes.
        </div>
        <div class="chart-container">
""",
        embed('sample_spectra'),
        """
        </div>
"""
    ]

    # Add 3D spectrum if available
    if 'spectrum_3d' in figures:
        html_parts.append("""
        <div class="chart-container">
            <h3>3D Time-Energy-Counts View</h3>
""")
        html_parts.append(embed('spectrum_3d'))
        html_parts.append("</div>")

    html_parts.append("""
        <h2 id="database-overview">6. Isotope Database Overview</h2>
        <div class="info-box">
            <strong>What this shows:</strong> The complete isotope database structure organized by category.
            Click to explore the hierarchy.
        </div>
        <div class="chart-container">
""")
    html_parts.append(embed('isotope_db'))
    html_parts.append("""
        </div>

        <footer style="text-align: center; padding: 40px; color: #7f8c8d;">
            <p>Generated by ML for Isotope Identification Training Data Analyzer</p>
        </footer>
    </div>
</body>
</html>
""")

    # Write HTML file
    html_content = ''.join(html_parts)

    # Allow --output to point into a directory that does not exist yet.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print("\n✅ Report generated successfully!")
    print(f" Output: {output_file.absolute()}")
    print("\nOpen in your browser to view the interactive visualizations.")
|
|
|
|
|
|
def main():
    """Command-line entry point: parse the options and build the report.

    Exits with status 1 when the data directory does not exist.
    """
    cli = argparse.ArgumentParser(
        description="Generate interactive HTML visualization of training data"
    )
    cli.add_argument(
        '--data-dir',
        type=str,
        default='data/synthetic/spectra',
        help='Directory containing spectrum .json and .npy files',
    )
    cli.add_argument(
        '--output',
        type=str,
        default='training_data_report.html',
        help='Output HTML file name',
    )
    cli.add_argument(
        '--max-samples',
        type=int,
        default=None,
        help='Maximum number of samples to analyze (for faster generation)',
    )
    options = cli.parse_args()

    source_dir = Path(options.data_dir)
    # Fail fast before any loading work is attempted.
    if not source_dir.exists():
        print(f"Error: Data directory not found: {source_dir}")
        sys.exit(1)

    generate_html_report(source_dir, Path(options.output), options.max_samples)


if __name__ == "__main__":
    main()
|