Multi-Dimensional UMAP Export for Web Viewer#
This notebook doubles as a tutorial for exporting deterministic 1D/2D/3D embeddings into the Cellucid WebGL viewer bundles.
This variant is pre-configured for the Braun dataset; tweak the configuration cell if your files live elsewhere.
In this walkthrough you will
configure dataset roots once so the same notebook runs on any machine without editing paths elsewhere.
recompute missing UMAP dimensions only when necessary while reusing a single neighbor graph for perfect alignment across 1D/2D/3D.
hydrate metadata + expressions and feed everything into
cellucid.prepare.validate the generated binary assets and manifests before handing them to the frontend.
Each section calls out why the code exists so you can adapt the pattern to your own datasets.
Environment#
These setup helpers make the notebook location-agnostic: run it from the repo root, from notebooks/, or from VS Code and the imports/paths will still resolve.
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys
import gc
import pickle
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
HERE = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()
def find_project_root(start: Path) -> Path:
for candidate in [start, *start.parents]:
if (candidate / "pyproject.toml").exists():
return candidate
return start
PROJECT_ROOT = find_project_root(HERE)
SRC_DIR = PROJECT_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
sys.path.append(str(SRC_DIR))
sc.settings.verbosity = 3
Configuration#
Keep all project-specific paths and knobs together so rerunning exports becomes a one-cell edit exercise.
# File locations
DATASET_NAME = "braun" # Slug used to name the export directory
DATA_ROOT = PROJECT_ROOT / "data" # Base folder shared across raw/experiment data
RAW_DIR = DATA_ROOT / "raw" # Location of raw AnnData inputs
EXPERIMENT_DIR = DATA_ROOT / "experiments" # Folder containing experiment-specific .h5ad files
EXPORT_DIR = PROJECT_ROOT.parent / "cellucid-datasets" / "exports" / DATASET_NAME # Final viewer export root
# Inputs/outputs
EXPERIMENT_FILE = EXPERIMENT_DIR / Path("braun_developmental_complete_with_3d_umap.h5ad")
COMPLETE_ADATA_FILE = RAW_DIR / "braun_developmental_complete.h5ad"
COMPLETE_ADATA_VAR = RAW_DIR / "dataset_complete_Braun_varnames.pickle"
# Quick peek at the raw AnnData without loading it fully
if COMPLETE_ADATA_FILE.exists():
adata_preview = ad.read_h5ad(COMPLETE_ADATA_FILE, backed="r")
print(adata_preview)
if "LVL0" in adata_preview.obs:
lvl0_counts = adata_preview.obs["LVL0"].value_counts()
print("LVL0 value counts:")
for label, count in lvl0_counts.items():
print(f" {label}: {count}")
else:
print("LVL0 column not found in obs.")
adata_preview.file.close()
del adata_preview
else:
print(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")
# UMAP parameters
n_neighbors = 15
min_dist = 0.5
RANDOM_SEED = 0
# Storage keys for each dimensionality we care about.
UMAP_DIMENSION_KEYS = {
1: "X_umap_1d",
2: "X_umap_2d",
3: "X_umap_3d",
# 4: "X_umap_4d", # Reserved for future expansion
}
def compute_umap_embedding(adata_source, n_components: int, min_dist: float, random_state: int) -> np.ndarray:
"""Compute a UMAP embedding with the provided dimensionality without mutating the source AnnData."""
neighbors_params = adata_source.uns.get("neighbors", {}).get("params", {})
use_rep = neighbors_params.get("use_rep", None)
adata_temp = ad.AnnData(
obs=adata_source.obs[[]],
obsp={
"connectivities": adata_source.obsp["connectivities"],
"distances": adata_source.obsp["distances"],
},
)
if use_rep is not None and use_rep in adata_source.obsm:
adata_temp.obsm[use_rep] = adata_source.obsm[use_rep]
adata_temp.uns["neighbors"] = adata_source.uns["neighbors"].copy()
sc.tl.umap(adata_temp, n_components=n_components, min_dist=min_dist, random_state=random_state)
embedding = adata_temp.obsm["X_umap"].copy()
del adata_temp
return embedding
Deterministic Embedding Strategy#
Stable random seed (
RANDOM_SEED) keeps layouts reproducible between releases, which is critical when comparing viewer builds, spotting regression diffs, or debugging quantization artifacts.Stable kNN graph for all dimensions means
sc.pp.neighborsruns once and every 1D/2D/3D embedding encodes the exact same neighbor relationships; cross-dimensional brushing stays intuitive and centroid statistics stay comparable.Shared latent representation (
scVI) ensures the centroids and connectivities exported later line up with whatever representation was used in training; no silent drift between the viewer and the model.Legacy
adata.obsm['X_umap']alias mirrors the 3D embedding under the historical key so older ingestion scripts and viewer builds continue to work even though the tutorial now emits explicit multi-dimensional files.Backed AnnData checks let us peek into the
.h5adfile without loading it fully and skip recomputation when the embeddings are already up to date.
Tweak the parameters in the previous cell only when you explicitly want to generate alternative deterministic layouts.
def umap_dimensions_present(exp_file: Path, dim_keys: dict[int, str]) -> tuple[bool, list[int]]:
"""Return whether each required UMAP embedding is stored in exp_file plus the missing dimensions."""
if exp_file is None or not exp_file.exists():
return False, list(dim_keys.keys())
backed = ad.read_h5ad(exp_file, backed="r")
try:
available = set(backed.obsm_keys())
finally:
backed.file.close()
missing = [dim for dim, key in dim_keys.items() if key not in available]
return len(missing) == 0, missing
def ensure_umap_embeddings():
"""Compute multi-dimensional UMAP embeddings only when they are absent on disk."""
ready, missing_dims = umap_dimensions_present(EXPERIMENT_FILE, UMAP_DIMENSION_KEYS)
if ready:
print(
f"✓ {EXPERIMENT_FILE.name} already stores "
f"{', '.join(f'{dim}D' for dim in UMAP_DIMENSION_KEYS)} embeddings."
)
return
missing_msg = ", ".join(f"{dim}D" for dim in missing_dims) if missing_dims else "all required"
if EXPERIMENT_FILE.exists():
print(f"Updating {EXPERIMENT_FILE.name}: missing {missing_msg} embeddings.")
else:
print(f"{EXPERIMENT_FILE} does not exist yet. Computing full multi-dimensional embeddings.")
if not COMPLETE_ADATA_FILE.exists():
raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")
adata = ad.read_h5ad(COMPLETE_ADATA_FILE)
print("Computing neighbors once and reusing them for every dimensionality...")
sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state=RANDOM_SEED, use_rep="scvi")
for n_dim, key in UMAP_DIMENSION_KEYS.items():
print(f"Computing {n_dim}D UMAP → {key}")
adata.obsm[key] = compute_umap_embedding(
adata, n_components=n_dim, min_dist=min_dist, random_state=RANDOM_SEED
)
if 3 in UMAP_DIMENSION_KEYS:
# Keep a 3D copy under "X_umap" because legacy viewer builds still expect this key
# even though newer ones read the explicit multi-dimensional files.
adata.obsm["X_umap"] = adata.obsm[UMAP_DIMENSION_KEYS[3]].copy()
print("UMAP embeddings computed:")
for n_dim, key in UMAP_DIMENSION_KEYS.items():
shape = adata.obsm[key].shape
print(f" {key}: {shape}")
adata.write_h5ad(EXPERIMENT_FILE)
print(f"Saved updated embeddings to {EXPERIMENT_FILE}")
del adata
gc.collect()
ensure_umap_embeddings()
Load UMAP run#
The previous step guarantees that the experiment file exists and houses every required UMAP dimension. Load it now and double-check which embeddings are present.
if EXPERIMENT_FILE is None or not EXPERIMENT_FILE.exists():
raise FileNotFoundError(
f"UMAP file not found. Set EXPERIMENT_FILE to a valid .h5ad under {EXPERIMENT_DIR}"
)
adata = ad.read_h5ad(EXPERIMENT_FILE)
if "age" in adata.obs:
adata.obs["age"] = pd.to_numeric(adata.obs["age"], errors="coerce")
drop_columns = [col for col in ("_scvi_batch", "_scvi_labels") if col in adata.obs]
if drop_columns:
adata.obs = adata.obs.drop(columns=drop_columns)
adata.var = pd.read_pickle(COMPLETE_ADATA_VAR)
legacy_umap_key = "X_umap" # 3D default used by legacy exports/viewers
available_umaps = {}
for dim, key in UMAP_DIMENSION_KEYS.items():
dim_label = f"{dim}d"
if key in adata.obsm:
available_umaps[dim_label] = adata.obsm[key]
print(f"✓ Found {key}: shape {adata.obsm[key].shape}")
else:
print(f"✗ Missing {key}")
if not available_umaps:
if legacy_umap_key in adata.obsm:
print(f"Using legacy {legacy_umap_key}")
available_umaps['3d'] = adata.obsm[legacy_umap_key]
else:
raise KeyError(f"No UMAP embeddings found in adata.obsm")
print(f"\nAvailable dimensions: {list(available_umaps.keys())}")
adata
Quick UMAP stats#
Lightweight sanity check on all loaded UMAP embeddings (1D, 2D, 3D).
# Stats for all available UMAP dimensions
umap_stats = {}
for dim, coords in available_umaps.items():
umap_stats[dim] = {
"shape": coords.shape,
"mean": coords.mean(axis=0).tolist(),
"std": coords.std(axis=0).tolist(),
"min": coords.min(axis=0).tolist(),
"max": coords.max(axis=0).tolist(),
}
print(f"UMAP stats for {adata.n_obs} cells:")
for dim, stats in umap_stats.items():
print(f"\n{dim.upper()}:")
print(f" Shape: {stats['shape']}")
print(f" Mean: {[f'{x:.3f}' for x in stats['mean']]}")
print(f" Std: {[f'{x:.3f}' for x in stats['std']]}")
umap_stats
Load full annotated data#
The export step needs the full expression + metadata matrices, not just the subset used for UMAP projection. We therefore:
read the complete AnnData object,
align it to the experiment cell ordering, and
apply a lightweight normalize/log1p transform so the quantized export stays well behaved.
if not COMPLETE_ADATA_FILE.exists():
raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")
adata_complete = ad.read_h5ad(COMPLETE_ADATA_FILE)
# adata_complete = adata_complete[:, adata_complete.var["highly_variable"] == 1]
adata_complete = adata_complete[adata.obs.index].copy()
# Normalize counts to 1e4 per cell and log-transform for export
sc.pp.normalize_total(adata_complete, target_sum=1e4)
sc.pp.log1p(adata_complete)
adata_complete
import numpy as np
row = adata_complete.X[0]
dense_row = row.A.ravel() if hasattr(row, "A") else np.asarray(row).ravel()
non_zero_preview = dense_row[dense_row != 0][:5]
non_zero_preview
Export for web viewer#
cellucid.prepare handles the heavy lifting described in src/cellucid/prepare_data.py:
quantizes continuous obs/var fields and expression matrices to keep payloads small,
auto-picks compact categorical dtypes and gzips the resulting binaries, and
emits dataset manifests (
dataset_identity.json,obs_manifest.json,var_manifest.json) that the WebGL viewer reads at runtime.
The call below wires our multi-dimensional UMAPs plus metadata into that exporter.
from cellucid import prepare
prepare(
# Multi-dimensional UMAP embeddings; missing keys evaluate to None and will be skipped
X_umap_1d=available_umaps.get('1d'),
X_umap_2d=available_umaps.get('2d'),
X_umap_3d=available_umaps.get('3d'),
# X_umap_4d is reserved for future development
# Other data matrices (scVI latent space drives centroids/kNN reuse)
latent_space=adata.obsm["scvi"],
obs=adata.obs,
var=adata.var,
gene_expression=adata.X,
connectivities=adata.obsp['connectivities'],
# Export behavior knobs defined inline for clarity
var_gene_id_column=None, # Use var.index as gene identifiers
gene_identifiers=None, # Export every gene; slice list here if needed
centroid_outlier_quantile=0.90, # Trim cells far from centroid when summarizing categories
centroid_min_points=10, # Require at least this many cells per centroid
force=False,
var_quantization=8,
obs_continuous_quantization=8,
obs_categorical_dtype="auto",
compression=6,
# Dataset identity metadata surfaced in dataset_identity.json
out_dir=EXPORT_DIR,
dataset_name=DATASET_NAME,
dataset_description="Braun developmental atlas",
source_name="Braun et al.",
source_url=""
)
Validate export artifacts#
Spot-check file sizes (MB), manifest stats, and total obs/var directory sizes.
import json
from pathlib import Path
BYTES_IN_MB = 1024 * 1024
def size_mb(path: Path) -> float:
return round(path.stat().st_size / BYTES_IN_MB, 3) if path.exists() else 0
def dir_stats(path: Path) -> dict:
if not path.exists():
return {"size_mb": 0, "files": 0}
total_bytes = 0
file_count = 0
for p in path.rglob("*"):
if p.is_file():
file_count += 1
total_bytes += p.stat().st_size
return {"size_mb": round(total_bytes / BYTES_IN_MB, 3), "files": file_count}
# Multi-dimensional point files
points_files = {
'points_1d': EXPORT_DIR / "points_1d.bin.gz",
'points_2d': EXPORT_DIR / "points_2d.bin.gz",
'points_3d': EXPORT_DIR / "points_3d.bin.gz",
'points (legacy)': EXPORT_DIR / "points.bin.gz",
}
obs_manifest_path = EXPORT_DIR / "obs_manifest.json"
var_manifest_path = EXPORT_DIR / "var_manifest.json"
dataset_identity_path = EXPORT_DIR / "dataset_identity.json"
obs_dir = EXPORT_DIR / "obs"
var_dir = EXPORT_DIR / "var"
obs_manifest = json.loads(obs_manifest_path.read_text()) if obs_manifest_path.exists() else None
var_manifest = json.loads(var_manifest_path.read_text()) if var_manifest_path.exists() else None
dataset_identity = json.loads(dataset_identity_path.read_text()) if dataset_identity_path.exists() else None
# Check which point files exist
point_sizes = {}
for name, path in points_files.items():
if path.exists():
point_sizes[name] = size_mb(path)
print(f"✓ {name}: {point_sizes[name]} MB")
else:
print(f"✗ {name}: not found")
# Show embeddings metadata
if dataset_identity and 'embeddings' in dataset_identity:
embeddings_meta = dataset_identity['embeddings']
print(f"\nEmbeddings metadata:")
print(f" Available dimensions: {embeddings_meta.get('available_dimensions')}")
print(f" Default dimension: {embeddings_meta.get('default_dimension')}D")
{
"paths": {
"export_dir": EXPORT_DIR,
"obs_manifest": obs_manifest_path,
"var_manifest": var_manifest_path,
"dataset_identity": dataset_identity_path,
"obs_dir": obs_dir,
"var_dir": var_dir,
},
"sizes_mb": {
**point_sizes,
"obs_manifest": size_mb(obs_manifest_path),
"var_manifest": size_mb(var_manifest_path),
"dataset_identity": size_mb(dataset_identity_path),
},
"dir_sizes_mb": {
"obs": dir_stats(obs_dir),
"var": dir_stats(var_dir),
},
"manifest_stats": {
"obs": None if obs_manifest is None else {
"n_points": obs_manifest.get("n_points"),
"fields": len(obs_manifest.get("fields", [])),
"centroid_outlier_quantile": obs_manifest.get("centroid_outlier_quantile"),
},
"var": None if var_manifest is None else {
"n_points": var_manifest.get("n_points"),
"fields": len(var_manifest.get("fields", [])),
"var_gene_id_column": var_manifest.get("var_gene_id_column"),
},
"embeddings": None if dataset_identity is None else dataset_identity.get("embeddings"),
},
}
Done. Serve index.html from the repo root to view the exported data.