
Commit 1786225

add a new technique where one can do cross entropy loss on the distance matrix with the codes, if indices were to be passed in
1 parent e0e073d commit 1786225

5 files changed (+72, -12 lines)
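The gist of the change: every quantizer already computes a distance matrix between the input vectors and the codebook entries in order to pick the nearest code. When target code indices are passed into `forward`, that distance matrix can instead be treated as classification logits and scored with cross entropy against the targets. Below is a minimal, self-contained sketch of the idea only; it is not the library's internal code, and all shapes and tensor names are illustrative.

```python
import torch
import torch.nn.functional as F

batch, seq_len, dim, codebook_size = 2, 16, 64, 512

x = torch.randn(batch, seq_len, dim, requires_grad = True)    # input vectors
codebook = torch.randn(codebook_size, dim)                     # stand-in codebook
targets = torch.randint(0, codebook_size, (batch, seq_len))    # target code indices

# squared euclidean distance from every vector to every code: (batch, seq_len, codebook_size)
dist = torch.cdist(x, codebook.unsqueeze(0).expand(batch, -1, -1)) ** 2

# closer codes should get higher probability, so negate the distances to form logits,
# then move the class (codebook) dimension to dim 1 as F.cross_entropy expects
ce_loss = F.cross_entropy(-dist.transpose(1, 2), targets)
ce_loss.backward()   # gradients flow back into x (and whatever produced it)
```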

README.md

Lines changed: 8 additions & 0 deletions
````diff
@@ -367,3 +367,11 @@ if __name__ == '__main__':
     year    = {2023}
 }
 ```
+
+```bibtex
+@inproceedings{Shen2023NaturalSpeech2L,
+    title   = {NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers},
+    author  = {Kai Shen and Zeqian Ju and Xu Tan and Yanqing Liu and Yichong Leng and Lei He and Tao Qin and Sheng Zhao and Jiang Bian},
+    year    = {2023}
+}
+```
````

setup.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '1.1.6',
+  version = '1.2.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   long_description_content_type = 'text/markdown',
```

vector_quantize_pytorch/random_projection_quantizer.py

Lines changed: 15 additions & 3 deletions
```diff
@@ -5,6 +5,9 @@
 
 from einops import rearrange, repeat, pack, unpack
 
+def exists(val):
+    return val is not None
+
 class RandomProjectionQuantizer(nn.Module):
     """ https://arxiv.org/abs/2202.01855 """
 
@@ -40,15 +43,24 @@ def __init__(
             **kwargs
         )
 
-    @torch.no_grad()
-    def forward(self, x):
+    def forward(
+        self,
+        x,
+        indices = None
+    ):
+        return_loss = exists(indices)
 
         x = self.norm(x)
 
         x = einsum('b n d, h d e -> b n h e', x, self.rand_projs)
         x, ps = pack([x], 'b n *')
 
         self.vq.eval()
-        _, indices, _ = self.vq(x)
+        out = self.vq(x, indices = indices)
+
+        if return_loss:
+            _, ce_loss = out
+            return ce_loss
+
+        _, indices, _ = out
         return indices
```
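For context, a hypothetical usage sketch of the new `forward(x, indices = ...)` path above. The constructor arguments mirror the README example, but the exact values, the index tensor shape, and the idea of reusing a previous pass's indices as targets are illustrative assumptions, not part of this diff.

```python
import torch
from vector_quantize_pytorch import RandomProjectionQuantizer

quantizer = RandomProjectionQuantizer(
    dim = 512,            # input feature dimension (assumed)
    num_codebooks = 16,
    codebook_dim = 256,
    codebook_size = 1024
)

x = torch.randn(1, 1024, 512, requires_grad = True)

# previous behaviour, unchanged: returns code indices, assumed shape (1, 1024, num_codebooks)
indices = quantizer(x)

# new behaviour: pass target indices in (e.g. from a clean / unmasked pass) and
# get back only the cross entropy loss computed on the distance matrix
ce_loss = quantizer(x, indices = indices)
```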

vector_quantize_pytorch/residual_vq.py

Lines changed: 33 additions & 4 deletions
```diff
@@ -9,6 +9,11 @@
 
 from einops import rearrange, repeat, pack, unpack
 
+# helper functions
+
+def exists(val):
+    return val is not None
+
 def round_up_multiple(num, mult):
     return ceil(num / mult) * mult
 
@@ -99,16 +104,21 @@ def get_codes_from_indices(self, indices):
     def forward(
         self,
         x,
+        indices = None,
         return_all_codes = False
     ):
-        num_quant, quant_dropout_multiple_of, device = self.num_quantizers, self.quantize_dropout_multiple_of, x.device
+        num_quant, quant_dropout_multiple_of, return_loss, device = self.num_quantizers, self.quantize_dropout_multiple_of, exists(indices), x.device
+
+        assert not (self.accept_image_fmap and exists(indices))
+
         quantized_out = 0.
         residual = x
 
         all_losses = []
         all_indices = []
+        ce_losses = [] # for cross entropy losses across quantizers, if indices are passed in
 
-        should_quantize_dropout = self.training and self.quantize_dropout
+        should_quantize_dropout = self.training and self.quantize_dropout and not return_loss
 
         # sample a layer index at which to dropout further residual quantization
         # also prepare null indices and loss
@@ -132,13 +142,32 @@ def forward(
                 all_losses.append(null_loss)
                 continue
 
-            quantized, indices, loss = layer(residual)
+            layer_indices = None
+            if return_loss:
+                layer_indices = indices[..., quantizer_index]
+
+            quantized, *rest = layer(residual, indices = layer_indices)
+
             residual = residual - quantized.detach()
             quantized_out = quantized_out + quantized
 
-            all_indices.append(indices)
+            if return_loss:
+                ce_loss = rest[0]
+                ce_losses.append(ce_loss)
+                continue
+
+            embed_indices, loss = rest
+
+            all_indices.append(embed_indices)
             all_losses.append(loss)
 
+        # whether to early return the cross entropy loss
+
+        if return_loss:
+            return quantized_out, sum(ce_losses)
+
+        # stack all losses and indices
+
         all_losses, all_indices = map(partial(torch.stack, dim = -1), (all_losses, all_indices))
 
         ret = (quantized_out, all_indices, all_losses)
```
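A hypothetical usage sketch of the `ResidualVQ` change above; the constructor values and index shapes are assumptions chosen for illustration.

```python
import torch
from vector_quantize_pytorch import ResidualVQ

residual_vq = ResidualVQ(
    dim = 256,
    num_quantizers = 8,
    codebook_size = 1024
)

x = torch.randn(1, 1024, 256)

# standard path: quantized output, per-quantizer indices, commitment losses
quantized, indices, commit_losses = residual_vq(x)   # indices assumed (1, 1024, 8)

# new path: pass the per-quantizer target indices back in; indices[..., q] becomes the
# cross entropy target for quantizer q, and the per-quantizer losses are summed
quantized, ce_loss = residual_vq(x, indices = indices)
```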

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 15 additions & 4 deletions
```diff
@@ -301,7 +301,7 @@ def forward(self, x):
         if needs_codebook_dim:
             quantize, embed_ind = map(lambda t: rearrange(t, '1 ... -> ...'), (quantize, embed_ind))
 
-        return quantize, embed_ind
+        return quantize, embed_ind, dist
 
 class CosineSimCodebook(nn.Module):
     def __init__(
@@ -441,7 +441,7 @@ def forward(self, x):
         if needs_codebook_dim:
             quantize, embed_ind = map(lambda t: rearrange(t, '1 ... -> ...'), (quantize, embed_ind))
 
-        return quantize, embed_ind
+        return quantize, embed_ind, dist
 
 # main class
 
@@ -541,14 +541,15 @@ def get_codes_from_indices(self, indices):
     def forward(
         self,
         x,
+        indices = None,
         mask = None
     ):
         only_one = x.ndim == 2
 
         if only_one:
             x = rearrange(x, 'b d -> b 1 d')
 
-        shape, device, heads, is_multiheaded, codebook_size = x.shape, x.device, self.heads, self.heads > 1, self.codebook_size
+        shape, device, heads, is_multiheaded, codebook_size, return_loss = x.shape, x.device, self.heads, self.heads > 1, self.codebook_size, exists(indices)
 
         need_transpose = not self.channel_last and not self.accept_image_fmap
 
@@ -565,11 +566,21 @@ def forward(
             ein_rhs_eq = 'h b n d' if self.separate_codebook_per_head else '1 (b h) n d'
             x = rearrange(x, f'b n (h d) -> {ein_rhs_eq}', h = heads)
 
-        quantize, embed_ind = self._codebook(x)
+        quantize, embed_ind, distances = self._codebook(x)
 
         if self.training:
             quantize = x + (quantize - x).detach()
 
+        if return_loss:
+            if not is_multiheaded:
+                distances = rearrange(distances, '1 (b n) l -> b l n', b = shape[0])
+            elif self.separate_codebook_per_head:
+                distances = rearrange(distances, 'c (b n) l -> b l n c', b = shape[0])
+            else:
+                distances = rearrange(distances, '1 (b h n) l -> b l n h', b = shape[0], h = heads)
+
+            return quantize, F.cross_entropy(distances, indices, ignore_index = -1)
+
         loss = torch.tensor([0.], device = device, requires_grad = self.training)
 
         if self.training:
```
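A hypothetical usage sketch of the `VectorQuantize` change above; constructor values and shapes are assumptions, and only the call pattern with `indices` comes from this diff.

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq = VectorQuantize(
    dim = 256,
    codebook_size = 512
)

x = torch.randn(1, 1024, 256)

# standard path: quantized output, code indices, commitment loss
quantized, indices, commit_loss = vq(x)

# new path: the codebook's distance matrix is used as logits and scored with
# cross entropy against the passed-in indices (-1 entries are ignored)
quantized, ce_loss = vq(x, indices = indices)
```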
