# 3D UMAP Export for Web Viewer
Export helper to move a parameterized AnnData UMAP run into the web viewer format.
Workflow:
1. Configure the run and export paths.
2. Load the selected UMAP run and the full AnnData that supplies metadata and gene expression.
3. Export viewer assets and sanity-check file sizes.
## Environment
Lightweight setup so the notebook works whether it’s launched from the repo root or the notebooks/ directory.
# IPython magics: enable the autoreload extension in mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys
import scanpy as sc
import anndata as ad
# Anchor directory for locating the project: the folder containing this file
# when run as a script, otherwise (when __file__ is not defined, e.g. in a
# notebook kernel) the current working directory.
if "__file__" in globals():
    HERE = Path(__file__).resolve().parent
else:
    HERE = Path.cwd()
def find_project_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` holding a ``pyproject.toml``.

    ``start`` itself is checked first, then each of its parents from nearest
    to farthest. If no directory in that chain contains the marker file,
    ``start`` is returned unchanged.
    """
    search_order = (start, *start.parents)
    return next(
        (folder for folder in search_order if (folder / "pyproject.toml").exists()),
        start,
    )
# Resolve the repository root from the anchor directory, then expose its
# src/ folder to the import system (only if it exists and is not already listed).
PROJECT_ROOT = find_project_root(HERE)
SRC_DIR = PROJECT_ROOT.joinpath("src")

if str(SRC_DIR) not in sys.path and SRC_DIR.exists():
    sys.path.append(str(SRC_DIR))
## Configuration
Set file locations and export options. Update EXPERIMENT_FILE to the run you want to export.
# --- File locations ----------------------------------------------------------
DATASET_NAME = "apidip"
DATA_ROOT = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_ROOT.joinpath("raw")
EXPERIMENT_DIR = DATA_ROOT.joinpath("experiments", "umap_parameter_sweep")
# Exports are written outside this project, under a sibling directory of the project root.
EXPORT_DIR = PROJECT_ROOT.parent.joinpath("cellucid-datasets", "exports", DATASET_NAME)

# --- Inputs/outputs ----------------------------------------------------------
EXPERIMENT_FILE = EXPERIMENT_DIR / "adata|k__90|min_dist_0_10.h5ad"
COMPLETE_ADATA_FILE = RAW_DIR / "adata_unified_20250925_001_complete.h5ad"

# --- Export options ----------------------------------------------------------
UMAP_KEY = "X_umap"  # 3D layout to send to the viewer
LATENT_KEY = None  # e.g., "X_pca"; None uses adata.X
VAR_GENE_ID_COLUMN = "converted_id"
GENE_IDENTIFIERS = None  # e.g., ["Gene1", "Gene2"] or None for all
# Centroid settings forwarded to the exporter as
# centroid_outlier_quantile / centroid_min_points.
CENTROID_OUTLIER_Q = 0.90
CENTROID_MIN_POINTS = 10
# Quick peek at the raw AnnData without loading it fully.
# The file is opened with backed="r" and only its summary and the obs column
# "LVL0" are inspected here.
if COMPLETE_ADATA_FILE.exists():
    adata_preview = ad.read_h5ad(COMPLETE_ADATA_FILE, backed="r")
    try:
        print(adata_preview)
        if "LVL0" in adata_preview.obs:
            lvl0_counts = adata_preview.obs["LVL0"].value_counts()
            print("LVL0 value counts:")
            for label, count in lvl0_counts.items():
                print(f" {label}: {count}")
        else:
            print("LVL0 column not found in obs.")
    finally:
        # Always release the backing file handle, even if the summary above
        # raises, so the later full read of the same file is not affected.
        adata_preview.file.close()
        del adata_preview
else:
    print(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")
## Load UMAP run
Confirm the experiment file exists, load the selected AnnData object, and verify that the UMAP layout is present.
# pandas is needed for pd.to_numeric below and is not imported anywhere else
# in this file; without this import the "age" branch raises NameError.
import pandas as pd

# Fail early with a actionable message when the configured run is missing.
if EXPERIMENT_FILE is None or not EXPERIMENT_FILE.exists():
    raise FileNotFoundError(
        f"UMAP file not found. Set EXPERIMENT_FILE to a valid .h5ad under {EXPERIMENT_DIR}"
    )

adata = ad.read_h5ad(EXPERIMENT_FILE)

# Coerce "age" to numeric; values that cannot be parsed become NaN.
if "age" in adata.obs:
    adata.obs["age"] = pd.to_numeric(adata.obs["age"], errors="coerce")

# Remove the "_scvi_batch" / "_scvi_labels" columns when present.
drop_columns = [col for col in ("_scvi_batch", "_scvi_labels") if col in adata.obs]
if drop_columns:
    adata.obs = adata.obs.drop(columns=drop_columns)

if UMAP_KEY not in adata.obsm:
    raise KeyError(f"UMAP_KEY {UMAP_KEY!r} not found in adata.obsm")
umap_coords = adata.obsm[UMAP_KEY]

# Last expression of the cell: display the AnnData summary.
adata
## Quick UMAP stats
Lightweight sanity check on the loaded coordinates.
# Per-axis mean and standard deviation of the embedding plus the cell count,
# as plain Python values for display.
umap_stats = dict(
    mean=umap_coords.mean(axis=0).tolist(),
    std=umap_coords.std(axis=0).tolist(),
    n_cells=adata.n_obs,
)
umap_stats
## Load full annotated data
Load the complete AnnData to supply metadata (obs) and gene expression matrices aligned to the UMAP run.
if not COMPLETE_ADATA_FILE.exists():
    raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")

adata_complete = ad.read_h5ad(COMPLETE_ADATA_FILE)
# Optional: restrict to highly variable genes before export.
# adata_complete = adata_complete[:, adata_complete.var["highly_variable"] == 1]

# Select the cells of the UMAP run by obs name, in the run's order, and
# materialise the view as an independent object.
adata_complete = adata_complete[adata.obs.index].copy()

# Normalize counts to 1e4 per cell and log-transform for export
sc.pp.normalize_total(adata_complete, target_sum=1e4)
sc.pp.log1p(adata_complete)

# Peek at the first cell's expression row. `.toarray()` replaces the `.A`
# shorthand, which newer SciPy releases no longer provide on sparse matrices;
# a dense X has no `toarray` and is shown as-is.
first_row = adata_complete.X[0]
first_row.toarray() if hasattr(first_row, "toarray") else first_row
## Export for web viewer
Selects the latent space (uses LATENT_KEY if provided, else adata.X) and writes points.bin plus obs/var manifests for the HTML viewer in index.html.
from cellucid import prepare

# Honour the LATENT_KEY setting: use the named obsm embedding when one is
# configured, otherwise fall back to the expression matrix adata.X.
if LATENT_KEY is None:
    latent_space = adata.X
elif LATENT_KEY in adata.obsm:
    latent_space = adata.obsm[LATENT_KEY]
else:
    raise KeyError(f"LATENT_KEY {LATENT_KEY!r} not found in adata.obsm")

prepare(
    X_umap=umap_coords,
    latent_space=latent_space,
    # Metadata and expression come from the complete AnnData, which was
    # subset to the cells of the UMAP run.
    obs=adata_complete.obs,
    var=adata_complete.var,
    gene_expression=adata_complete.X,
    connectivities=adata.obsp['connectivities'],
    var_gene_id_column=VAR_GENE_ID_COLUMN,
    gene_identifiers=GENE_IDENTIFIERS,
    centroid_outlier_quantile=CENTROID_OUTLIER_Q,
    centroid_min_points=CENTROID_MIN_POINTS,
    force=False,
    var_quantization=8,
    obs_continuous_quantization=8,
    obs_categorical_dtype="auto",
    compression=6,
    # Dataset info
    out_dir=EXPORT_DIR,
    dataset_name=DATASET_NAME,
    dataset_description="Atlas of Pancreatic Islet Differentiation Protocols",
    source_name="Temporary Data Source (Internal / Non-official)",
    source_url="https://drive.google.com/drive/folders/1weTUQ8lDoTESC3aziN7QTGiSwDQd85Jf?usp=sharing",
)
## Validate export artifacts
Spot-check file sizes (MB), manifest stats, and total obs/var directory sizes.
import json
from pathlib import Path
BYTES_IN_MB = 1024 * 1024


def size_mb(path: Path) -> float:
    """Return the size of ``path`` in MiB, rounded to three decimals.

    A path that does not exist yields ``0.0`` (a float, matching the declared
    return type) instead of raising.
    """
    if not path.exists():
        return 0.0
    return round(path.stat().st_size / BYTES_IN_MB, 3)
def dir_stats(path: Path) -> dict:
    """Summarise the regular files found recursively under ``path``.

    Returns a dict with ``"size_mb"`` (combined file size in MiB, rounded to
    three decimals) and ``"files"`` (number of files). A path that does not
    exist yields zero for both.
    """
    if not path.exists():
        return {"size_mb": 0, "files": 0}

    file_sizes = [entry.stat().st_size for entry in path.rglob("*") if entry.is_file()]
    return {
        "size_mb": round(sum(file_sizes) / BYTES_IN_MB, 3),
        "files": len(file_sizes),
    }
# Locations of the artifacts this cell inspects inside the export directory.
points_path = EXPORT_DIR / "points.bin"
obs_manifest_path = EXPORT_DIR / "obs_manifest.json"
var_manifest_path = EXPORT_DIR / "var_manifest.json"
obs_dir = EXPORT_DIR / "obs"
var_dir = EXPORT_DIR / "var"

# Parse each manifest when present. JSON is read as explicit UTF-8 so the
# result does not depend on the platform's default text encoding.
obs_manifest = None
if obs_manifest_path.exists():
    obs_manifest = json.loads(obs_manifest_path.read_text(encoding="utf-8"))

var_manifest = None
if var_manifest_path.exists():
    var_manifest = json.loads(var_manifest_path.read_text(encoding="utf-8"))

# Last expression of the cell: a summary of paths, sizes and manifest stats.
# Missing files report a size of zero; missing manifests report None.
{
    "paths": {
        "export_dir": EXPORT_DIR,
        "points": points_path,
        "obs_manifest": obs_manifest_path,
        "var_manifest": var_manifest_path,
        "obs_dir": obs_dir,
        "var_dir": var_dir,
    },
    "sizes_mb": {
        "points": size_mb(points_path),
        "obs_manifest": size_mb(obs_manifest_path),
        "var_manifest": size_mb(var_manifest_path),
    },
    "dir_sizes_mb": {
        "obs": dir_stats(obs_dir),
        "var": dir_stats(var_dir),
    },
    "manifest_stats": {
        "obs": {
            "n_points": obs_manifest.get("n_points"),
            "fields": len(obs_manifest.get("fields", [])),
            "centroid_outlier_quantile": obs_manifest.get("centroid_outlier_quantile"),
        } if obs_manifest is not None else None,
        "var": {
            "n_points": var_manifest.get("n_points"),
            "fields": len(var_manifest.get("fields", [])),
            "var_gene_id_column": var_manifest.get("var_gene_id_column"),
        } if var_manifest is not None else None,
    },
}
Done. Serve index.html from the repo root to view the exported data.