
Commit e8db087

Refactor imports and enhance activation normalization handling
This commit includes the following changes:

- Reformatted the import statements in `__init__.py` for improved readability.
- Increased the sleep duration in `ActivationCache` from 1 to 10 seconds to allow more time for save processes to complete.
- Updated the `BatchTopKSAE`, `CrossCoder`, and `BatchTopKCrossCoder` classes to load the `activation_normalizer` from the state dictionary, ensuring that normalization is applied correctly during model initialization.
- Refined the normalization checks in the `CrossCoderEncoder` and `CrossCoderDecoder` classes to ensure that normalization only occurs if an `activation_normalizer` is present.
- Made minor formatting adjustments in the `training.py` file for better code clarity.

These changes aim to enhance the clarity and maintainability of the code while ensuring proper handling of activation normalization across various components.
1 parent 17c0a92 commit e8db087

5 files changed: +76 -21 lines changed


dictionary_learning/__init__.py

Lines changed: 8 additions & 1 deletion
@@ -1,2 +1,9 @@
-from .dictionary import AutoEncoder, GatedAutoEncoder, JumpReluAutoEncoder, CrossCoder, BatchTopKSAE, BatchTopKCrossCoder
+from .dictionary import (
+    AutoEncoder,
+    GatedAutoEncoder,
+    JumpReluAutoEncoder,
+    CrossCoder,
+    BatchTopKSAE,
+    BatchTopKCrossCoder,
+)
 from .buffer import ActivationBuffer

dictionary_learning/cache.py

Lines changed: 1 addition & 1 deletion
@@ -349,7 +349,7 @@ def cleanup_multiprocessing():
             print(
                 f"Waiting for {ActivationCache.__active_processes.value} save processes to finish"
             )
-            time.sleep(1)
+            time.sleep(10)
         ActivationCache.__pool.close()
         ActivationCache.__pool = None
         ActivationCache.__manager.shutdown()

dictionary_learning/dictionary.py

Lines changed: 37 additions & 9 deletions
@@ -36,8 +36,7 @@ def __init__(self, activation_normalizer: ActivationNormalizer | None = None):
         """
         super().__init__()
         self.activation_normalizer = activation_normalizer
-        if self.activation_normalizer is not None:
-            self.activation_normalizer.to(self.device)
+

     def normalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tensor:
         """
@@ -594,7 +593,16 @@ def from_pretrained(
         elif "k" in state_dict and k != state_dict["k"].item():
             raise ValueError(f"k={k} != {state_dict['k'].item()}=state_dict['k']")

-        autoencoder = cls(activation_dim, dict_size, k)
+        # Load activation normalizer if present in kwargs
+        activation_normalizer_mean = state_dict.get("activation_normalizer.mean", None)
+        activation_normalizer_std = state_dict.get("activation_normalizer.std", None)
+        if activation_normalizer_mean is not None and activation_normalizer_std is not None:
+            activation_normalizer = ActivationNormalizer(
+                mean=activation_normalizer_mean, std=activation_normalizer_std
+            )
+        else:
+            activation_normalizer = None
+        autoencoder = cls(activation_dim, dict_size, k, activation_normalizer=activation_normalizer)
         autoencoder.load_state_dict(state_dict)
         if device is not None:
             autoencoder.to(device)
@@ -729,8 +737,6 @@ def __init__(
         self.weight = nn.Parameter(weight)
         self.bias = nn.Parameter(th.zeros(dict_size))
         self.activation_normalizer = activation_normalizer
-        if self.activation_normalizer is not None:
-            self.activation_normalizer.to(self.device)

     def forward(
         self,
@@ -763,7 +769,7 @@ def forward(
         - summed_features: shape (batch_size, dict_size)
         - per_layer_features: shape (batch_size, num_layers, dict_size)
         """
-        if normalize_activations:
+        if normalize_activations and self.activation_normalizer is not None:
             x = self.activation_normalizer.normalize(x, inplace=inplace_normalize)
         x = x[:, self.encoder_layers]
         if select_features is not None:
@@ -836,8 +842,7 @@ def __init__(
             weight = weight / weight.norm(dim=2, keepdim=True) * norm_init_scale
         self.weight = nn.Parameter(weight)
         self.activation_normalizer = activation_normalizer
-        if self.activation_normalizer is not None:
-            self.activation_normalizer.to(self.device)
+

     def forward(
         self,
@@ -873,7 +878,7 @@ def forward(
         x = th.einsum("blf, lfd -> bld", f, w)
         if add_bias:
             x += self.bias
-        if denormalize_activations:
+        if denormalize_activations and self.activation_normalizer is not None:
             x = self.activation_normalizer.denormalize(x, inplace=True)
         return x

@@ -1260,6 +1265,15 @@ def from_pretrained(
             code_normalization.value, dtype=th.int
         )
         num_layers, activation_dim, dict_size = state_dict["encoder.weight"].shape
+        # Load activation normalizer if present in kwargs
+        activation_normalizer_mean = state_dict.get("activation_normalizer.mean", None)
+        activation_normalizer_std = state_dict.get("activation_normalizer.std", None)
+        if activation_normalizer_mean is not None and activation_normalizer_std is not None:
+            activation_normalizer = ActivationNormalizer(
+                mean=activation_normalizer_mean, std=activation_normalizer_std
+            )
+        else:
+            activation_normalizer = None

         crosscoder = cls(
             activation_dim,
@@ -1268,6 +1282,7 @@ def from_pretrained(
             code_normalization=CodeNormalization._value2member_map_[
                 state_dict["code_normalization_id"].item()
             ],
+            activation_normalizer=activation_normalizer,
         )
         crosscoder.load_state_dict(state_dict)

@@ -1650,6 +1665,18 @@ def from_pretrained(
                 state_dict["k"] == kwargs["k"]
             ), f"k in kwargs ({kwargs['k']}) does not match k in state_dict ({state_dict['k']})"
             kwargs.pop("k")
+
+        # Load activation normalizer if present in kwargs
+        activation_normalizer_mean = state_dict.get("activation_normalizer.mean", None)
+        activation_normalizer_std = state_dict.get("activation_normalizer.std", None)
+        if activation_normalizer_mean is not None and activation_normalizer_std is not None:
+            activation_normalizer = ActivationNormalizer(
+                mean=activation_normalizer_mean, std=activation_normalizer_std
+            )
+
+        else:
+            activation_normalizer = None
+
         kwargs.update()

         crosscoder = cls(
@@ -1658,6 +1685,7 @@ def from_pretrained(
             num_layers,
             k=state_dict["k"],
             code_normalization=code_normalization,
+            activation_normalizer=activation_normalizer,
             **kwargs,
         )
         if "code_normalization_id" not in state_dict:

dictionary_learning/trainers/batch_top_k.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def update(self, step, x):
         x = x.to(self.device)
         x = self.ae.normalize_activations(
             x,
-            inplace_normalize=True,  # Normalize inplace to avoid copying the activations during training
+            inplace=True,  # Normalize inplace to avoid copying the activations during training
         )
         loss = self.loss(x, step=step, normalize_activations=False)
         loss.backward()
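The rename matches the signature shown in the dictionary.py diff above, `normalize_activations(self, x, inplace=False)`; `inplace_normalize` is not a parameter of that method. A quick sketch of the corrected call outside the trainer, with illustrative sizes and identity statistics (whether `ActivationNormalizer` accepts 1-D `mean`/`std` tensors of this shape is an assumption):

import torch as th

from dictionary_learning import BatchTopKSAE
from dictionary_learning.dictionary import ActivationNormalizer  # assumed import path

activation_dim, dict_size, k = 512, 4096, 32  # illustrative sizes
normalizer = ActivationNormalizer(
    mean=th.zeros(activation_dim), std=th.ones(activation_dim)  # identity statistics
)
ae = BatchTopKSAE(activation_dim, dict_size, k, activation_normalizer=normalizer)

x = th.randn(8, activation_dim)
x = ae.normalize_activations(x, inplace=True)  # corrected keyword, as in update() above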

dictionary_learning/training.py

Lines changed: 29 additions & 9 deletions
@@ -68,6 +68,7 @@ def get_stats(
     out["frac_variance_explained"] = frac_variance_explained.item()
     return out

+
 def get_model(trainer):
     if hasattr(trainer, "ae"):
         model = trainer.ae
@@ -77,6 +78,7 @@ def get_model(trainer):
         model = model._orig_mod
     return model

+
 def log_stats(
     trainer,
     step: int,
@@ -106,7 +108,10 @@ def log_stats(
     for name, value in trainer_log.items():
         log[f"{stage}/{name}"] = value

-    wandb.log(log, step=step, epoch=epoch_idx_per_step[step] if epoch_idx_per_step is not None else None)
+    if epoch_idx_per_step is not None:
+        log["epoch"] = epoch_idx_per_step[step]
+    wandb.log(log, step=step)
+

 @th.no_grad()
 def run_validation(
@@ -177,7 +182,9 @@ def run_validation(
     ).mean()
     if step is not None:
         log["step"] = step
-    wandb.log(log, step=step, epoch=epoch_idx_per_step[step] if epoch_idx_per_step is not None else None)
+    if epoch_idx_per_step is not None:
+        log["epoch"] = epoch_idx_per_step[step]
+    wandb.log(log, step=step)

     return log

@@ -194,6 +201,7 @@ def trainSAE(
     use_wandb=False,
     wandb_entity="",
     wandb_project="",
+    wandb_group="",
     steps=None,
     save_steps=None,
     save_dir=None,
@@ -212,13 +220,14 @@
 ):
     """
     Train SAE using the given trainer
-
+
     Args:
         data: Training data iterator/dataloader
         trainer_config: Configuration dictionary for the trainer
         use_wandb: Whether to use Weights & Biases logging (default: False)
         wandb_entity: W&B entity name (default: "")
         wandb_project: W&B project name (default: "")
+        wandb_group: W&B group name (default: "")
        steps: Maximum number of training steps (default: None)
         save_steps: Frequency of model checkpointing (default: None)
         save_dir: Directory to save checkpoints and config (default: None)
@@ -234,10 +243,10 @@
         dtype: Training data type (default: torch.float32)
         run_wandb_finish: Whether to call wandb.finish() at end of training (default: True)
         epoch_idx_per_step: Optional mapping of training steps to epoch indices (default: None). Mainly used for logging when the dataset is pre-shuffled and contains multiple epochs.
-
+
     Returns:
         Trained model
-
+
     Raises:
         AssertionError: If validation_data is None but validate_every_n_steps is specified
     """
@@ -256,11 +265,12 @@
         config=wandb_config,
         name=wandb_config["wandb_name"],
         mode="disabled" if not use_wandb else "online",
+        group=wandb_group,
     )

     trainer.model.to(dtype)

-    # make save dir, export config
+    # make save dir, export config
     if save_dir is not None:
         os.makedirs(save_dir, exist_ok=True)
         # save config
@@ -317,7 +327,13 @@
             and (start_of_training_eval or step > 0)
         ):
             print(f"Validating at step {step}")
-            logs = run_validation(trainer, validation_data, step=step, dtype=dtype, epoch_idx_per_step=epoch_idx_per_step)
+            logs = run_validation(
+                trainer,
+                validation_data,
+                step=step,
+                dtype=dtype,
+                epoch_idx_per_step=epoch_idx_per_step,
+            )
             try:
                 os.makedirs(save_dir, exist_ok=True)
                 th.save(logs, os.path.join(save_dir, f"eval_logs_{step}.pt"))
@@ -328,7 +344,11 @@
         end_of_step_logging_fn(trainer, step)
     try:
         last_eval_logs = run_validation(
-            trainer, validation_data, step=step, dtype=dtype, epoch_idx_per_step=epoch_idx_per_step
+            trainer,
+            validation_data,
+            step=step,
+            dtype=dtype,
+            epoch_idx_per_step=epoch_idx_per_step,
         )
         if save_last_eval:
             os.makedirs(save_dir, exist_ok=True)
@@ -343,4 +363,4 @@
     if use_wandb and run_wandb_finish:
         wandb.finish()

-    return get_model(trainer)
+    return get_model(trainer)
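Note that `wandb.log` has no `epoch` keyword, so `log_stats` and `run_validation` now put the epoch index into the metrics dict and log it alongside the other values. A minimal sketch of that pattern in isolation (the helper name `log_with_epoch` is illustrative):

import wandb


def log_with_epoch(log: dict, step: int, epoch_idx_per_step=None):
    # Fold the epoch index into the metrics dict when a step -> epoch mapping exists,
    # then log everything against the training step.
    if epoch_idx_per_step is not None:
        log["epoch"] = epoch_idx_per_step[step]
    wandb.log(log, step=step)


# Example: log_with_epoch({"val/loss": 0.12}, step=100, epoch_idx_per_step={100: 1})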
