
Commit ae5db01

reintroduce orthogonal regularization, due to bug that @kashif found
1 parent d23d1fd commit ae5db01

3 files changed: 72 additions, 2 deletions

README.md

Lines changed: 33 additions & 0 deletions
@@ -182,6 +182,28 @@ x = torch.randn(1, 1024, 256)
 quantized, indices, commit_loss = vq(x)
 ```
 
+### Orthogonal regularization loss
+
+VQ-VAE / VQ-GAN is quickly gaining popularity. A <a href="https://arxiv.org/abs/2112.00384">recent paper</a> proposes that when using vector quantization on images, enforcing the codebook to be orthogonal makes the discretized codes translation-equivariant, which in turn yields large improvements in downstream text-to-image generation tasks.
+
+You can use this feature by simply setting `orthogonal_reg_weight` to a value greater than `0`, in which case the orthogonal regularization will be added to the auxiliary loss returned by the module.
+
+```python
+import torch
+from vector_quantize_pytorch import VectorQuantize
+vq = VectorQuantize(
+    dim = 256,
+    codebook_size = 256,
+    accept_image_fmap = True,                   # set this to True to be able to pass in an image feature map
+    orthogonal_reg_weight = 10,                 # in the paper, a value of 10 is recommended
+    orthogonal_reg_max_codes = 128,             # this would randomly sample from the codebook for the orthogonal regularization loss, to limit memory usage
+    orthogonal_reg_active_codes_only = False    # set this to True if you have a very large codebook, and would only like to enforce the loss on the codes activated in each batch
+)
+img_fmap = torch.randn(1, 256, 32, 32)
+quantized, indices, loss = vq(img_fmap)         # (1, 256, 32, 32), (1, 32, 32), (1,)
+# loss now contains the orthogonal regularization loss with the weight as assigned
+```
+
 ### Multi-headed VQ
 
 There have been a number of papers proposing variants of discrete latent representations with a multi-headed approach (multiple codes per feature). I have decided to offer one variant where the same codebook is used to vector quantize across the input dimension `head` times.
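As an aside, the regularizer this commit wires up is read off the `orthogonal_loss_fn` added further down in this diff (its comment ties it to equation (2) of the linked paper): it penalizes the squared cosine similarity between every pair of L2-normalized codes. A sketch in math form, where h is the number of codebooks and n the number of codes per codebook:

```latex
% orthogonal regularization, as read off orthogonal_loss_fn below
% \tilde{c}^{(k)}_i is the i-th L2-normalized code of codebook k
\mathcal{L}_{\mathrm{ortho}}
  = \frac{1}{h \, n^{2}} \sum_{k=1}^{h} \sum_{i=1}^{n} \sum_{j=1}^{n}
    \left\langle \tilde{c}^{(k)}_{i}, \, \tilde{c}^{(k)}_{j} \right\rangle^{2}
  \; - \; \frac{1}{n}
```

The diagonal terms contribute exactly the 1/n that is subtracted, so the loss bottoms out at zero precisely when the codes are mutually orthogonal.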
@@ -399,3 +421,14 @@ if __name__ == '__main__':
     url = {https://openreview.net/forum?id=oapKSVM2bcj}
 }
 ```
+
+```bibtex
+@misc{shin2021translationequivariant,
+    title = {Translation-equivariant Image Quantizer for Bi-directional Image-Text Generation},
+    author = {Woncheol Shin and Gyubok Lee and Jiyoung Lee and Joonseok Lee and Edward Choi},
+    year = {2021},
+    eprint = {2112.00384},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CV}
+}
+```

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '1.6.7',
+  version = '1.6.9',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   long_description_content_type = 'text/markdown',

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 38 additions & 1 deletion
@@ -211,6 +211,15 @@ def batched_embedding(indices, embeds):
     embeds = repeat(embeds, 'h c d -> h b c d', b = batch)
     return embeds.gather(2, indices)
 
+# regularization losses
+
+def orthogonal_loss_fn(t):
+    # eq (2) from https://arxiv.org/abs/2112.00384
+    h, n = t.shape[:2]
+    normed_codes = l2norm(t)
+    cosine_sim = einsum('h i d, h j d -> h i j', normed_codes, normed_codes)
+    return (cosine_sim ** 2).sum() / (h * n ** 2) - (1 / n)
+
 # distance types
 
 class EuclideanCodebook(nn.Module):
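A quick sanity check of the new `orthogonal_loss_fn`, as a standalone sketch separate from the commit (it re-implements the same formula with plain `torch` and `torch.nn.functional` calls, since the library's `l2norm` and `einsum` helpers are not imported here): the penalty is exactly zero for a perfectly orthogonal codebook and positive otherwise.

```python
import torch
import torch.nn.functional as F

def orthogonal_loss_fn(t):
    # same formula as above: mean squared cosine similarity minus the 1/n diagonal contribution
    h, n = t.shape[:2]
    normed_codes = F.normalize(t, dim = -1)
    cosine_sim = torch.einsum('hid,hjd->hij', normed_codes, normed_codes)
    return (cosine_sim ** 2).sum() / (h * n ** 2) - (1 / n)

orthogonal_codebook = torch.eye(8).unsqueeze(0)   # (1, 8, 8): 8 mutually orthogonal codes
random_codebook = torch.randn(1, 8, 8)            # random codes are generally correlated

print(orthogonal_loss_fn(orthogonal_codebook))    # tensor(0.)
print(orthogonal_loss_fn(random_codebook))        # some positive value
```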
@@ -630,6 +639,9 @@ def __init__(
         accept_image_fmap = False,
         commitment_weight = 1.,
         commitment_use_cross_entropy_loss = False,
+        orthogonal_reg_weight = 0.,
+        orthogonal_reg_active_codes_only = False,
+        orthogonal_reg_max_codes = None,
         stochastic_sample_codes = False,
         sample_codebook_temp = 1.,
         straight_through = False,
@@ -659,6 +671,12 @@ def __init__(
         self.commitment_weight = commitment_weight
         self.commitment_use_cross_entropy_loss = commitment_use_cross_entropy_loss # whether to use cross entropy loss to codebook as commitment loss
 
+        has_codebook_orthogonal_loss = orthogonal_reg_weight > 0
+        self.has_codebook_orthogonal_loss = has_codebook_orthogonal_loss
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+
         assert not (ema_update and learnable_codebook), 'learnable codebook not compatible with EMA update'
 
         assert 0 <= sync_update_v <= 1.
@@ -686,7 +704,7 @@ def __init__(
             eps = eps,
             threshold_ema_dead_code = threshold_ema_dead_code,
             use_ddp = sync_codebook,
-            learnable_codebook = learnable_codebook,
+            learnable_codebook = has_codebook_orthogonal_loss or learnable_codebook,
             sample_codebook_temp = sample_codebook_temp,
             gumbel_sample = gumbel_sample_fn,
             ema_update = ema_update
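The one-line change to `learnable_codebook` matters because a gradient-based penalty can only shape the codebook if the codebook participates in autograd; a pure EMA-updated codebook would never be moved by the orthogonal loss. A simplified, standalone illustration (a toy penalty and plain tensors, not the library's actual classes):

```python
import torch
import torch.nn.functional as F

# stand-in penalty: mean squared cosine similarity between codes (same spirit as orthogonal_loss_fn)
def toy_orthogonal_penalty(codebook):
    normed = F.normalize(codebook, dim = -1)
    return (normed @ normed.t()).pow(2).mean()

frozen_codebook = torch.randn(8, 4)                           # like an EMA-updated buffer: no gradients
learnable_codebook = torch.randn(8, 4, requires_grad = True)  # like a learnable parameter

toy_orthogonal_penalty(learnable_codebook).backward()
print(learnable_codebook.grad.abs().sum() > 0)   # tensor(True): the penalty can push these codes apart
print(frozen_codebook.grad)                      # None: the same penalty could never update this buffer
```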
@@ -854,6 +872,25 @@ def calculate_ce_loss(codes):
 
             loss = loss + commit_loss * self.commitment_weight
 
+            if self.has_codebook_orthogonal_loss:
+                codebook = self._codebook.embed
+
+                # only calculate orthogonal loss for the activated codes for this batch
+
+                if self.orthogonal_reg_active_codes_only:
+                    assert not (is_multiheaded and self.separate_codebook_per_head), 'orthogonal regularization for only active codes not compatible with multi-headed with separate codebooks yet'
+                    unique_code_ids = torch.unique(embed_ind)
+                    codebook = codebook[:, unique_code_ids]
+
+                num_codes = codebook.shape[-2]
+
+                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
+                    rand_ids = torch.randperm(num_codes, device = device)[:self.orthogonal_reg_max_codes]
+                    codebook = codebook[:, rand_ids]
+
+                orthogonal_reg_loss = orthogonal_loss_fn(codebook)
+                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
+
         # handle multi-headed quantized embeddings
 
         if is_multiheaded:
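Finally, a standalone sketch of the two guards in the block above: `orthogonal_reg_active_codes_only` restricts the penalty to codes actually selected in the current batch, and `orthogonal_reg_max_codes` caps the count by random subsampling, which keeps the pairwise cosine-similarity matrix small for large codebooks (the library's `exists` helper is replaced by a plain `is not None` check here).

```python
import torch

codebook = torch.randn(1, 1024, 256)        # (codebooks, num_codes, dim)
embed_ind = torch.randint(0, 1024, (64,))   # code indices used by the current batch
orthogonal_reg_active_codes_only = True
orthogonal_reg_max_codes = 128

if orthogonal_reg_active_codes_only:
    # keep only the codes that were actually selected in this batch
    unique_code_ids = torch.unique(embed_ind)
    codebook = codebook[:, unique_code_ids]

num_codes = codebook.shape[-2]

if orthogonal_reg_max_codes is not None and num_codes > orthogonal_reg_max_codes:
    # cap the number of codes by random subsampling
    rand_ids = torch.randperm(num_codes)[:orthogonal_reg_max_codes]
    codebook = codebook[:, rand_ids]

print(codebook.shape)   # at most (1, 128, 256); the penalty is then computed on this subset only
```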
