@@ -26,7 +26,15 @@ class NormalizableMixin(nn.Module):
    pass through unchanged.
    """

-    def __init__(self, activation_mean: th.Tensor | None = None, activation_std: th.Tensor | None = None, activation_shape: tuple[int, ...] | None = None):
+    def __init__(
+        self,
+        activation_mean: th.Tensor | None = None,
+        activation_std: th.Tensor | None = None,
+        activation_shape: tuple[int, ...] | None = None,
+        *,
+        keep_relative_variance: bool = True,
+        target_rms: float = 1.0,
+    ):
        """
        Initialize the normalization mixin.

@@ -36,26 +44,44 @@ def __init__(self, activation_mean: th.Tensor | None = None, activation_std: th.
            activation_std: Optional std tensor for normalization. If None,
                normalization is a no-op.
            activation_shape: Shape of the activation tensor. Required if activation_mean and activation_std are None for proper initialization and registration of the buffers.
+            keep_relative_variance: If True, applies a single global scaling so that the
+                sum of variances equals target_rms**2 while their relative magnitudes stay unchanged. If False, normalization is neuron-wise.
+            target_rms: Target RMS for input activation normalization.
        """
        super().__init__()
+        self.keep_relative_variance = keep_relative_variance
+        self.register_buffer("target_rms", th.tensor(target_rms))
        if activation_mean is not None and activation_std is not None:
            # Type assertion to help linter understand these are tensors
-            assert isinstance(activation_mean, th.Tensor), "Expected mean to be a tensor"
+            assert isinstance(
+                activation_mean, th.Tensor
+            ), "Expected mean to be a tensor"
            assert isinstance(activation_std, th.Tensor), "Expected std to be a tensor"
            assert not th.isnan(activation_mean).any(), "Expected mean to be non-NaN"
            assert not th.isnan(activation_std).any(), "Expected std to be non-NaN"
            self.register_buffer("activation_mean", activation_mean)
            self.register_buffer("activation_std", activation_std)
        else:
-            assert activation_shape is not None, "activation_shape must be provided if activation_mean and activation_std are None"
+            assert (
+                activation_shape is not None
+            ), "activation_shape must be provided if activation_mean and activation_std are None"
            self.register_buffer("activation_mean", th.nan * th.ones(activation_shape))
            self.register_buffer("activation_std", th.nan * th.ones(activation_shape))

+        if self.keep_relative_variance and self.has_activation_normalizer:
+            total_var = (self.activation_std ** 2).sum()
+            activation_global_scale = self.target_rms / th.sqrt(total_var + 1e-8)
+            self.register_buffer("activation_global_scale", activation_global_scale)
+        else:
+            self.register_buffer("activation_global_scale", th.tensor(1.0))
+
    @property
    def has_activation_normalizer(self) -> bool:
        """Check if activation normalization is enabled."""
-        return (not th.isnan(self.activation_mean).any() and
-                not th.isnan(self.activation_std).any())
+        return (
+            not th.isnan(self.activation_mean).any()
+            and not th.isnan(self.activation_std).any()
+        )

    def normalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tensor:
        """
@@ -74,7 +100,12 @@ def normalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tenso
            # Type assertions for linter
            assert isinstance(self.activation_mean, th.Tensor)
            assert isinstance(self.activation_std, th.Tensor)
-            return (x - self.activation_mean) / (self.activation_std + 1e-8)
+            x = x - self.activation_mean
+
+            if self.keep_relative_variance:
+                return x * self.activation_global_scale
+            else:
+                return x / (self.activation_std + 1e-8)
        return x

    def denormalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tensor:
@@ -94,7 +125,13 @@ def denormalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Ten
            # Type assertions for linter
            assert isinstance(self.activation_mean, th.Tensor)
            assert isinstance(self.activation_std, th.Tensor)
-            return x * (self.activation_std + 1e-8) + self.activation_mean
+
+            if self.keep_relative_variance:
+                x = x / (self.activation_global_scale + 1e-8)
+            else:
+                x = x * (self.activation_std + 1e-8)
+
+            return x + self.activation_mean
        return x

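A self-contained round-trip sketch of the two methods above on the keep_relative_variance path (hypothetical statistics and data; the real code operates on the registered buffers): centering plus one global scale, inverted by the denormalizer up to the 1e-8 epsilon terms.

import torch as th

mean = th.zeros(3)                       # hypothetical per-dimension mean
std = th.tensor([0.5, 2.0, 4.0])         # hypothetical per-dimension std
x = th.randn(10, 3) * std + mean

target_rms = 1.0
scale = target_rms / th.sqrt((std ** 2).sum() + 1e-8)   # activation_global_scale

x_norm = (x - mean) * scale              # normalize_activations
x_back = x_norm / (scale + 1e-8) + mean  # denormalize_activations
assert th.allclose(x_back, x, atol=1e-4)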
@@ -454,6 +491,8 @@ def __init__(
        k: int,
        activation_mean: th.Tensor | None = None,
        activation_std: th.Tensor | None = None,
+        target_rms: float = 1.0,
+        encoder_init_norm: float = 1.0,
    ):
        """
        Initialize the Batch Top-K SAE.
@@ -464,11 +503,17 @@ def __init__(
            k: Number of top features to keep active across the batch
            activation_mean: Optional mean tensor for input activation normalization. If None, no normalization is applied.
            activation_std: Optional std tensor for input activation normalization. If None, no normalization is applied.
+            target_rms: Target RMS for input activation normalization.
+            encoder_init_norm: Scale factor applied to the tied encoder weights at initialization.
        """

-        super().__init__(activation_mean=activation_mean, activation_std=activation_std, activation_shape=(activation_dim,))
-
-
+        super().__init__(
+            activation_mean=activation_mean,
+            activation_std=activation_std,
+            activation_shape=(activation_dim,),
+            target_rms=target_rms,
+        )
+
        self.activation_dim = activation_dim
        self.dict_size = dict_size
@@ -482,7 +527,7 @@ def __init__(
        )

        self.encoder = nn.Linear(activation_dim, dict_size)
-        self.encoder.weight.data = self.decoder.weight.T.clone()
+        self.encoder.weight.data = self.decoder.weight.T.clone() * encoder_init_norm
        self.encoder.bias.data.zero_()
        self.b_dec = nn.Parameter(th.zeros(activation_dim))
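The encoder_init_norm change above only rescales the tied initialization. A hedged sketch of the effect, using a plain nn.Linear as a stand-in for the class's decoder (the value 0.1 is made up; 1.0 reproduces the previous behaviour):

import torch as th
import torch.nn as nn

activation_dim, dict_size = 16, 64
encoder_init_norm = 0.1

decoder = nn.Linear(dict_size, activation_dim, bias=False)
encoder = nn.Linear(activation_dim, dict_size)
# tie the encoder to the decoder transpose, then shrink it by encoder_init_norm
encoder.weight.data = decoder.weight.T.clone() * encoder_init_norm
encoder.bias.data.zero_()

assert th.allclose(encoder.weight, decoder.weight.T * encoder_init_norm)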
@@ -627,10 +672,10 @@ def from_pretrained(
        elif "k" in state_dict and k != state_dict["k"].item():
            raise ValueError(f"k={k} != {state_dict['k'].item()}=state_dict['k']")

-
-
        autoencoder = cls(
-            activation_dim, dict_size, k,
+            activation_dim,
+            dict_size,
+            k,
        )
        autoencoder.load_state_dict(state_dict)
        if device is not None:
@@ -645,6 +690,7 @@ def dtype(self):
    def device(self):
        return self.encoder.weight.device

+
# TODO merge this with AutoEncoder
class AutoEncoderNew(Dictionary, nn.Module):
    """
@@ -994,6 +1040,7 @@ class CrossCoder(Dictionary, NormalizableMixin):
        code_normalization_alpha_cc: Weight for CrossCoder component in MIXED normalization
        activation_mean: Optional mean tensor for input/output activation normalization
        activation_std: Optional std tensor for input/output activation normalization
+        target_rms: Optional target RMS for input/output activation normalization
    """

    def __init__(
@@ -1012,6 +1059,7 @@ def __init__(
        code_normalization_alpha_cc: float | None = 0.1,
        activation_mean: th.Tensor | None = None,
        activation_std: th.Tensor | None = None,
+        target_rms: float | None = None,
    ):
        """
        Initialize a CrossCoder sparse autoencoder.
@@ -1031,11 +1079,16 @@ def __init__(
            code_normalization_alpha_cc: Weight for CrossCoder component in MIXED normalization
            activation_mean: Optional mean tensor for input/output activation normalization
            activation_std: Optional std tensor for input/output activation normalization
+            target_rms: Optional target RMS for input/output activation normalization
        """
        # First initialize the base classes that don't take normalization parameters
-        super().__init__(activation_mean=activation_mean, activation_std=activation_std, activation_shape=(num_layers, activation_dim))
+        super().__init__(
+            activation_mean=activation_mean,
+            activation_std=activation_std,
+            activation_shape=(num_layers, activation_dim),
+            target_rms=target_rms,
+        )

-
        if num_decoder_layers is None:
            num_decoder_layers = num_layers
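Since the CrossCoder passes activation_shape=(num_layers, activation_dim) to the mixin, the statistics buffers are two-dimensional and keep_relative_variance yields a single scalar computed jointly over all layers and dimensions. A small self-contained check of that shape behaviour (all values hypothetical):

import torch as th

num_layers, activation_dim = 2, 4
activation_std = th.rand(num_layers, activation_dim) + 0.1  # hypothetical per-layer stds
target_rms = 1.0

scale = target_rms / th.sqrt((activation_std ** 2).sum() + 1e-8)
assert scale.ndim == 0  # one scalar shared by every layer and dimension
assert th.isclose(((activation_std * scale) ** 2).sum(), th.tensor(target_rms ** 2), atol=1e-4)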
@@ -1306,7 +1359,7 @@ def dtype(self):
    @property
    def device(self):
        return self.encoder.weight.device
-
+
    def resample_neurons(self, deads, activations):
        """
        Resample dead neurons by reinitializing their weights.
@@ -1401,6 +1454,7 @@ def __init__(
            norm_init_scale: Scale factor for weight initialization normalization
            activation_mean: Optional mean tensor for input/output activation normalization
            activation_std: Optional std tensor for input/output activation normalization
+            target_rms: Optional target RMS for input/output activation normalization
            *args: Additional positional arguments passed to parent class
            **kwargs: Additional keyword arguments passed to parent class
        """
@@ -1411,6 +1465,7 @@ def __init__(
            norm_init_scale=norm_init_scale,
            activation_mean=activation_mean,
            activation_std=activation_std,
+            target_rms=target_rms,
            *args,
            **kwargs,
        )
@@ -1687,7 +1742,6 @@ def from_pretrained(
        ), f"k in kwargs ({kwargs['k']}) does not match k in state_dict ({state_dict['k']})"
        kwargs.pop("k")

-
        crosscoder = cls(
            activation_dim,
            dict_size,