Skip to content

Commit 4d4af81

Browse files
feat(loss): Implement Z-score normalization for AuxiliaryPathwayLoss
Implemented spatial Z-score normalization and mean-aggregation for biological pathway ground-truth calculation. This ensures that every member gene in a pathway (even lowly-expressed transcription factors) contributes equally to the spatial activation signature, preventing high-count housekeeping genes from dominating the pathway patterns. Changes: - Updated AuxiliaryPathwayLoss to spatially standardize genes before projecting onto the pathway matrix. - Handled normalization across batch (patch-level) and spatial (whole-slide) dimensions with proper masking. - Switched from raw summation to mean-aggregation (averaging by pathway member counts). - Synchronized visualization.py ground-truth logic with the new objective. - Fixed mock tests in test_losses.py to match the normalized targets. Variance analysis on HEST data indicated raw gene variance ratios exceeding 300,000x, necessitating this standardization for biologically relevant pathway supervision.
1 parent 842d2b2 commit 4d4af81

File tree

5 files changed

+190
-14
lines changed

5 files changed

+190
-14
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import os
2+
import argparse
3+
import numpy as np
4+
import h5py
5+
import matplotlib.pyplot as plt
6+
import pandas as pd
7+
import json
8+
9+
10+
def analyze_sample(h5ad_path):
    """Print summary statistics for one .h5ad AnnData file and return key metrics.

    Args:
        h5ad_path: Path to an .h5ad file (HDF5-backed AnnData).

    Returns:
        A dict with keys ``sparsity`` (fraction of zero entries),
        ``var_ratio`` (max/min per-gene variance ratio; ``None`` for sparse
        files, where computing per-gene variance would require densifying),
        and ``max_exp`` (overall maximum expression value), or ``None`` when
        the file has no ``X`` matrix.
    """
    print(f"Analyzing {h5ad_path}...")

    with h5py.File(h5ad_path, "r") as f:
        # Check standard AnnData structure
        if "X" not in f:
            # Not a recognizable AnnData layout; report and bail out instead
            # of raising a NameError on the return below.
            print("Warning: no 'X' matrix found in file")
            return None

        if isinstance(f["X"], h5py.Group):
            # Sparse format (CSR/CSC): only the stored (non-zero) values are
            # available without materializing the dense matrix.
            data_group = f["X"]["data"][:]
            n_cells = (
                f["obs"]["_index"].shape[0]
                if "_index" in f["obs"]
                else len(f["obs"])
            )
            n_genes = (
                f["var"]["_index"].shape[0]
                if "_index" in f["var"]
                else len(f["var"])
            )

            print(f"Data is sparse, shape: ({n_cells}, {n_genes})")
            print(f"Non-zero elements: {len(data_group)}")

            # Analyze non-zero elements
            mean_val = np.mean(data_group)
            max_val = np.max(data_group)
            min_val = np.min(data_group)

            print(f"Non-zero Mean: {mean_val:.4f}")
            print(f"Max Expression: {max_val:.4f}")
            print(f"Min Expression: {min_val:.4f}")

            # Sparsity follows from the stored-element count; per-gene
            # variance is not computed here, so var_ratio stays None.
            total_entries = n_cells * n_genes
            sparsity = (
                1.0 - len(data_group) / total_entries if total_entries else 0.0
            )
            return {
                "sparsity": sparsity,
                "var_ratio": None,
                "max_exp": float(max_val),
            }

        # Dense array
        X = f["X"][:]
        print(f"Data is dense, shape: {X.shape}")

        # Basic stats
        mean_exp = np.mean(X, axis=0)  # per gene mean
        var_exp = np.var(X, axis=0)  # per gene variance
        max_exp = np.max(X, axis=0)

        sparsity = np.sum(X == 0) / X.size
        print(f"Overall Sparsity (zeros): {sparsity:.2%}")

        print(
            f"Gene Mean Range: {np.min(mean_exp):.4f} to {np.max(mean_exp):.4f}"
        )
        print(f"Gene Var Range: {np.min(var_exp):.4f} to {np.max(var_exp):.4f}")
        print(f"Overall Max Expression: {np.max(max_exp):.4f}")

        # Check for extreme differences in variance; epsilon guards against
        # division by zero for constant genes.
        var_ratio = np.max(var_exp) / (np.min(var_exp) + 1e-8)
        print(f"Ratio of max/min gene variance: {var_ratio:.4e}")

        return {
            "sparsity": sparsity,
            "var_ratio": var_ratio,
            "max_exp": np.max(max_exp),
        }
70+
71+
72+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data-dir",
        type=str,
        default="A:\\hest_data",
        help="Path to HEST data directory",
    )
    args = parser.parse_args()

    st_dir = os.path.join(args.data_dir, "st")
    if not os.path.exists(st_dir):
        print(f"Error: Directory not found: {st_dir}")
        # raise SystemExit instead of the site-injected exit() helper, which
        # is unavailable under `python -S`.
        raise SystemExit(1)

    # Get a few random samples
    samples = [f for f in os.listdir(st_dir) if f.endswith(".h5ad")]
    if not samples:
        print(f"No .h5ad files found in {st_dir}")
        # Exit non-zero: previously this fell through and exited with
        # status 0 despite reporting an error.
        raise SystemExit(1)

    # Analyze the first couple of samples
    for sample in samples[:3]:
        analyze_sample(os.path.join(st_dir, sample))
        print("-" * 50)

scripts/run_preset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def make_stf_params(n_layers: int, token_dim: int, n_heads: int, batch_size: int
2222
"token-dim": token_dim,
2323
"n-heads": n_heads,
2424
"batch-size": batch_size,
25-
"vis_sample": 'TENX29',
25+
"vis_sample": "TENX29",
2626
}
2727

2828

@@ -52,9 +52,9 @@ def make_stf_params(n_layers: int, token_dim: int, n_heads: int, batch_size: int
5252
},
5353
# --- SpatialTranscriptFormer Variants ---
5454
"stf_tiny": make_stf_params(n_layers=2, token_dim=256, n_heads=4, batch_size=8),
55-
"stf_small": make_stf_params(n_layers=4, token_dim=384, n_heads=8, batch_size=4),
56-
"stf_medium": make_stf_params(n_layers=6, token_dim=512, n_heads=8, batch_size=2),
57-
"stf_large": make_stf_params(n_layers=12, token_dim=768, n_heads=12, batch_size=1),
55+
"stf_small": make_stf_params(n_layers=4, token_dim=384, n_heads=8, batch_size=8),
56+
"stf_medium": make_stf_params(n_layers=6, token_dim=512, n_heads=8, batch_size=8),
57+
"stf_large": make_stf_params(n_layers=12, token_dim=768, n_heads=12, batch_size=8),
5858
}
5959

6060

src/spatial_transcript_former/training/losses.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,10 +327,51 @@ def forward(self, gene_preds, target_genes, mask=None, pathway_preds=None):
327327
return gene_loss
328328

329329
# Compute pathway ground truth from gene expression
330-
# target_genes: (B, [N,] G), pathway_matrix: (P, G)
331-
# result: (B, [N,] P)
330+
# 1. Spatially standardize (Z-score) the target genes to ensure equal weighting
332331
with torch.no_grad():
333-
target_pathways = torch.matmul(target_genes, self.pathway_matrix.T)
332+
if target_genes.dim() == 2:
333+
# Patch level: (B, G). Normalize across the batch dimension (which acts as spatial context)
334+
if target_genes.shape[0] > 1:
335+
means = target_genes.mean(dim=0, keepdim=True)
336+
stds = target_genes.std(dim=0, keepdim=True).clamp(min=1e-6)
337+
norm_genes = (target_genes - means) / stds
338+
else:
339+
norm_genes = torch.zeros_like(target_genes)
340+
else:
341+
# Whole slide: (B, N, G). Normalize across valid spatial positions N
342+
if mask is not None:
343+
valid_mask = ~mask.unsqueeze(-1) # (B, N, 1)
344+
valid_counts = valid_mask.sum(dim=1, keepdim=True).clamp(
345+
min=1.0
346+
) # (B, 1, 1)
347+
348+
means = (target_genes * valid_mask.float()).sum(
349+
dim=1, keepdim=True
350+
) / valid_counts
351+
352+
# Compute variance explicitly to handle masking correctly
353+
diffs = (target_genes - means) * valid_mask.float()
354+
vars = (diffs**2).sum(dim=1, keepdim=True) / (
355+
valid_counts - 1
356+
).clamp(min=1.0)
357+
stds = torch.sqrt(vars).clamp(min=1e-6)
358+
359+
norm_genes = diffs / stds
360+
norm_genes = norm_genes * valid_mask.float()
361+
else:
362+
means = target_genes.mean(dim=1, keepdim=True)
363+
stds = target_genes.std(dim=1, keepdim=True).clamp(min=1e-6)
364+
norm_genes = (target_genes - means) / stds
365+
366+
# 2. Project normalized genes onto the pathway matrix
367+
# target_pathways: (B, P) or (B, N, P)
368+
target_pathways = torch.matmul(norm_genes, self.pathway_matrix.T)
369+
370+
# 3. Average by the number of genes in each pathway
371+
member_counts = self.pathway_matrix.sum(dim=1, keepdim=True).T.clamp(
372+
min=1.0
373+
)
374+
target_pathways = target_pathways / member_counts
334375

335376
pathway_loss = self.pcc(pathway_preds, target_pathways, mask=mask)
336377

src/spatial_transcript_former/visualization.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,18 @@ def _compute_pathway_truth(gene_truth, gene_names, args):
5555
# Only use hallmarks for periodic visualization to keep it fast
5656
pw_matrix, pw_names = get_pathway_init(gene_names, gmt_urls=urls, verbose=False)
5757
pw_np = pw_matrix.numpy() # (P, G)
58+
59+
# Z-score normalize gene spatial patterns to match AuxiliaryPathwayLoss
60+
gene_truth = gene_truth.astype(np.float64)
61+
means = np.mean(gene_truth, axis=0, keepdims=True)
62+
stds = np.std(gene_truth, axis=0, keepdims=True)
63+
stds[stds < 1e-6] = 1e-6 # prevent division by zero
64+
norm_genes = (gene_truth - means) / stds
65+
5866
member_counts = pw_np.sum(axis=1, keepdims=True).clip(min=1)
59-
# Mean expression of member genes per pathway
60-
pathway_truth = (gene_truth @ pw_np.T) / member_counts.T # (N, P)
61-
return pathway_truth, pw_names
67+
# Mean expression of normalized member genes per pathway
68+
pathway_truth = (norm_genes @ pw_np.T) / member_counts.T # (N, P)
69+
return pathway_truth.astype(np.float32), pw_names
6270
except Exception as e:
6371
print(f"Warning: Could not compute pathway ground truth: {e}")
6472
return None, None

tests/test_losses.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -447,9 +447,34 @@ def test_perfect_match_zero_aux(self, pathway_tensors):
447447
base = MaskedMSELoss()
448448
aux = AuxiliaryPathwayLoss(pw_matrix, base, lambda_pathway=1.0)
449449

450-
# Compute ground truth pathways
450+
# Compute ground truth pathways matching the new AuxiliaryPathwayLoss logic
451451
with torch.no_grad():
452-
target_pathways = torch.matmul(targets, pw_matrix.T)
452+
if targets.dim() == 2:
453+
# Patch level: (B, G). Normalize across the batch dimension
454+
means = targets.mean(dim=0, keepdim=True)
455+
stds = targets.std(dim=0, keepdim=True).clamp(min=1e-6)
456+
norm_genes = (targets - means) / stds
457+
else:
458+
# Whole slide: (B, N, G). Normalize across valid spatial positions N
459+
valid_mask = (
460+
~mask.unsqueeze(-1)
461+
if mask is not None
462+
else torch.ones_like(targets, dtype=torch.bool)
463+
)
464+
valid_counts = valid_mask.sum(dim=1, keepdim=True).clamp(min=1.0)
465+
means = (targets * valid_mask.float()).sum(
466+
dim=1, keepdim=True
467+
) / valid_counts
468+
diffs = (targets - means) * valid_mask.float()
469+
vars = (diffs**2).sum(dim=1, keepdim=True) / (valid_counts - 1).clamp(
470+
min=1.0
471+
)
472+
stds = torch.sqrt(vars).clamp(min=1e-6)
473+
norm_genes = (diffs / stds) * valid_mask.float()
474+
475+
target_pathways = torch.matmul(norm_genes, pw_matrix.T)
476+
member_counts = pw_matrix.sum(dim=1, keepdim=True).T.clamp(min=1.0)
477+
target_pathways = target_pathways / member_counts
453478

454479
gene_loss = base(gene_preds, targets, mask=mask)
455480
# Use target_pathways as pathway_preds
@@ -566,8 +591,15 @@ def test_hallmark_signal_detection(self):
566591
loss_fn = AuxiliaryPathwayLoss(pw_matrix, MaskedMSELoss(), lambda_pathway=1.0)
567592
loss_random = loss_fn(gene_preds, targets, pathway_preds=pw_preds_random)
568593

569-
# Case 2: Pathway preds perfectly match truth (which is targets @ matrix.T)
570-
pw_truth = torch.matmul(targets, pw_matrix.T)
594+
# Case 2: Pathway preds perfectly match truth
595+
with torch.no_grad():
596+
means = targets.mean(dim=1, keepdim=True)
597+
stds = targets.std(dim=1, keepdim=True).clamp(min=1e-6)
598+
norm_genes = (targets - means) / stds
599+
pw_truth = torch.matmul(norm_genes, pw_matrix.T)
600+
member_counts = pw_matrix.sum(dim=1, keepdim=True).T.clamp(min=1.0)
601+
pw_truth = pw_truth / member_counts
602+
571603
loss_perfect = loss_fn(gene_preds, targets, pathway_preds=pw_truth)
572604

573605
# Case 3: Gene expression is specifically high for P0, and pw_preds are high for P0

0 commit comments

Comments
 (0)