
Commit d3dd5e5

Merge pull request #164 from cfifty/master
Implement the rotation trick.
2 parents 5fe30db + a2babde commit d3dd5e5

3 files changed: +58 -7 lines changed

README.md

Lines changed: 25 additions & 0 deletions
@@ -122,6 +122,20 @@ quantized, indices, commit_loss = residual_vq(x)
 # (1, 1024, 256), (1, 1024, 4), (1, 4)
 ```

+## Gradient Computation
+
+VQ-VAEs are traditionally trained with the straight-through estimator (STE): during the backward pass, the gradient flows _around_ the VQ layer rather than _through_ it. The <a href="https://arxiv.org/abs/2410.06424">rotation trick paper</a> proposes transforming the gradient _through_ the VQ layer so that the relative angle and magnitude between the input vector and the quantized output are encoded into the gradient. You can enable or disable this feature with `rotation_trick = True / False` in the `VectorQuantize` class.
+
+```python
+from vector_quantize_pytorch import VectorQuantize
+
+vq_layer = VectorQuantize(
+    dim = 256,
+    codebook_size = 256,
+    rotation_trick = True,  # Set to False to use the STE gradient estimator or True to use the rotation trick.
+)
+```
+
 ## Increasing codebook usage

 This repository will contain a few techniques from various papers to combat "dead" codebook entries, which is a common problem when using vector quantizers.
@@ -699,3 +713,14 @@ assert loss.item() >= 0
     url = {https://api.semanticscholar.org/CorpusID:267301189}
 }
 ```
+
+```bibtex
+@article{Fifty2024Restructuring,
+    title   = {Restructuring Vector Quantization with the Rotation Trick},
+    author  = {Christopher Fifty and Ronald G. Junkins and Dennis Duan and Aniketh Iyengar and Jerry W. Liu and Ehsan Amid and Sebastian Thrun and Christopher Ré},
+    journal = {ArXiv},
+    year    = {2024},
+    volume  = {abs/2410.06424},
+    url     = {https://api.semanticscholar.org/CorpusID:273229218}
+}
+```
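
As a usage sketch (not part of this commit), the new option can be exercised end to end roughly as follows; the input shape and the three return values mirror the existing README examples, and the tensor names are illustrative only:

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq_layer = VectorQuantize(
    dim = 256,
    codebook_size = 256,
    rotation_trick = True   # rotate gradients through the VQ layer instead of copying them around it
)

x = torch.randn(1, 1024, 256, requires_grad = True)

quantized, indices, commit_loss = vq_layer(x)

# the rotation trick only changes how gradients are propagated;
# quantized keeps the input shape (1, 1024, 256)
quantized.sum().backward()
assert x.grad is not None
```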

tests/test_readme.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def test_residual_vq(

     quantized, indices, commit_loss = residual_vq(x, freeze_codebook = train and not implicit_neural_codebook)
     quantized_out = residual_vq.get_output_from_indices(indices)
-    assert torch.allclose(quantized, quantized_out, atol = 1e-6)
+    assert torch.allclose(quantized, quantized_out, atol = 1e-5)

 def test_residual_vq2():
     from vector_quantize_pytorch import ResidualVQ

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 32 additions & 6 deletions
@@ -811,6 +811,7 @@ def __init__(
         stochastic_sample_codes = False,
         sample_codebook_temp = 1.,
         straight_through = False,
+        rotation_trick = True, # Propagate grads through VQ layer w/ rotation trick: https://arxiv.org/abs/2410.06424.
         reinmax = False, # using reinmax for improved straight-through, assuming straight through helps at all
         sync_codebook = None,
         sync_affine_param = False,
@@ -821,7 +822,7 @@ def __init__(
         manual_in_place_optimizer_update = False,
         affine_param = False,
         affine_param_batch_decay = 0.99,
-        affine_param_codebook_decay = 0.9,
+        affine_param_codebook_decay = 0.9,
         sync_update_v = 0., # the v that controls optimistic vs pessimistic update for synchronous update rule (21) https://minyoungg.github.io/vqtorch/assets/draft_050523.pdf
         return_zeros_for_masked_padding = True
     ):
@@ -863,6 +864,9 @@ def __init__(
         self.codebook_diversity_temperature = codebook_diversity_temperature
         self.codebook_diversity_loss_weight = codebook_diversity_loss_weight

+        assert not (straight_through and rotation_trick)
+        self.rotation_trick = rotation_trick
+
         assert not (ema_update and learnable_codebook), 'learnable codebook not compatible with EMA update'

         assert 0 <= sync_update_v <= 1.
@@ -942,6 +946,13 @@ def codebook(self, codes):

         self._codebook.embed.copy_(codes)

+    @staticmethod
+    def rotation_trick_transform(u, q, e):
+        w = ((u + q) / torch.norm(u + q, dim=1, keepdim=True)).detach()
+        e = e - 2 * torch.bmm(torch.bmm(e, w.unsqueeze(-1)), w.unsqueeze(1)) + 2 * torch.bmm(
+            torch.bmm(e, u.unsqueeze(-1).detach()), q.unsqueeze(1).detach())
+        return e
+
     def get_codes_from_indices(self, indices):
         codebook = self.codebook
         is_multiheaded = codebook.ndim > 2
@@ -1090,11 +1101,26 @@ def forward(
         # determine code to use for commitment loss
         maybe_detach = torch.detach if not self.learnable_codebook or freeze_codebook else identity

-        commit_quantize = maybe_detach(quantize)
-
-        # straight through
-
-        quantize = x + (quantize - x).detach()
+        commit_quantize = maybe_detach(quantize)
+
+        # Use the rotation trick (https://arxiv.org/abs/2410.06424) to get gradients through VQ layer.
+        if self.rotation_trick:
+            init_shape = x.shape
+            x = x.reshape(-1, init_shape[-1])
+            quantize = quantize.reshape(-1, init_shape[-1])
+
+            eps = 1e-6 # For numerical stability if any vector is close to 0 norm.
+            rot_quantize = self.rotation_trick_transform(
+                x / (torch.norm(x, dim=1, keepdim=True) + eps),
+                quantize / (torch.norm(quantize, dim=1, keepdim=True) + eps),
+                x.unsqueeze(1)).squeeze()
+            quantize = rot_quantize * (torch.norm(quantize, dim=1, keepdim=True)
+                                       / (torch.norm(x, dim=1, keepdim=True) + 1e-6)).detach()
+
+            x = x.reshape(init_shape)
+            quantize = quantize.reshape(init_shape)
+        else: # Use STE to get gradients through VQ layer.
+            quantize = x + (quantize - x).detach()

         if self.sync_update_v > 0.:
             # (21) in https://minyoungg.github.io/vqtorch/assets/draft_050523.pdf
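
A standalone sketch (assumed shapes and stand-in tensors, not part of the commit) of what `rotation_trick_transform` computes: with `w = (u + q) / ||u + q||` held constant via `detach`, the update `e - 2(e·w)w + 2(e·u)q` rotates the normalized input direction `u` exactly onto the normalized code direction `q`, so after rescaling by the detached norm ratio the forward output matches the quantized vector while the backward pass carries gradients through the input:

```python
import torch

def rotation_trick_transform(u, q, e):
    # mirrors the new static method: w, u, q enter as detached constants,
    # so gradients reach e only through the two batched matrix products
    w = ((u + q) / torch.norm(u + q, dim = 1, keepdim = True)).detach()
    reflect = e - 2 * torch.bmm(torch.bmm(e, w.unsqueeze(-1)), w.unsqueeze(1))
    return reflect + 2 * torch.bmm(torch.bmm(e, u.unsqueeze(-1).detach()), q.unsqueeze(1).detach())

b, d = 4, 8                                   # illustrative batch and feature sizes
x = torch.randn(b, d, requires_grad = True)   # stand-in for flattened encoder outputs
codes = torch.randn(b, d)                     # stand-in for the selected codebook vectors

u = x / torch.norm(x, dim = 1, keepdim = True)
q = codes / torch.norm(codes, dim = 1, keepdim = True)

out = rotation_trick_transform(u, q, x.unsqueeze(1)).squeeze(1)
out = out * (torch.norm(codes, dim = 1, keepdim = True) / torch.norm(x, dim = 1, keepdim = True)).detach()

# forward pass reproduces the quantized vectors up to numerical precision ...
assert torch.allclose(out, codes, atol = 1e-5)

# ... while the backward pass flows through x rather than bypassing the layer
out.sum().backward()
assert x.grad is not None
```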
