Commit d28d851

Add kmeans init of codebook, as proposed in the SoundStream paper; also make sure commitment loss is not calculated on eval
1 parent 43595a8 commit d28d851

3 files changed: 76 additions & 8 deletions

README.md

Lines changed: 20 additions & 0 deletions

@@ -48,6 +48,26 @@ quantized, indices, commit_loss = residual_vq(x)
 # (batch, seq, dim), (quantizer, batch, seq), (quantizer, batch)
 ```
 
+## Initialization
+
+The SoundStream paper proposes that the codebook be initialized with the kmeans centroids of the first batch. You can easily turn this feature on with a single flag, `kmeans_init = True`, for either the `VectorQuantize` or `ResidualVQ` class.
+
+```python
+import torch
+from vector_quantize_pytorch import ResidualVQ
+
+residual_vq = ResidualVQ(
+    dim = 256,
+    codebook_size = 256,
+    num_quantizers = 4,
+    kmeans_init = True,   # set to True
+    kmeans_iters = 10     # number of kmeans iterations to calculate the centroids for the codebook on init
+)
+
+x = torch.randn(1, 1024, 256)
+quantized, indices, commit_loss = residual_vq(x)
+```
+
 ## Citations
 
 ```bibtex
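
The README example covers `ResidualVQ` only, while the prose notes the flag also applies to the bare `VectorQuantize` class. Below is a minimal sketch of that usage, restricted to constructor arguments that appear in the `vector_quantize_pytorch.py` diff further down, and assuming `VectorQuantize` is importable from the package top level like `ResidualVQ`:

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq = VectorQuantize(
    dim = 256,
    codebook_size = 512,
    kmeans_init = True,   # codebook starts at zeros and is filled with kmeans centroids on the first forward pass
    kmeans_iters = 10     # iterations used by the kmeans helper when computing those centroids
)

x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)   # (1, 1024, 256), (1, 1024), scalar commitment loss
```

As the diff shows, the initialization is lazy: with `kmeans_init = True` the `initted` buffer starts out false, and the codebook is only seeded once the first batch passes through `forward`.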

setup.py

Lines changed: 2 additions & 1 deletion

@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '0.2.2',
+  version = '0.3.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   author = 'Phil Wang',
@@ -16,6 +16,7 @@
     'quantization'
   ],
   install_requires=[
+    'einops',
     'torch'
   ],
   classifiers=[

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 54 additions & 7 deletions

@@ -1,6 +1,7 @@
 import torch
-from torch import nn
+from torch import nn, einsum
 import torch.nn.functional as F
+from einops import rearrange, repeat
 
 def exists(val):
     return val is not None
@@ -11,9 +12,36 @@ def default(val, d):
 def ema_inplace(moving_avg, new, decay):
     moving_avg.data.mul_(decay).add_(new, alpha = (1 - decay))
 
-def laplace_smoothing(x, n_categories, eps=1e-5):
+def laplace_smoothing(x, n_categories, eps = 1e-5):
     return (x + eps) / (x.sum() + n_categories * eps)
 
+def kmeans(x, num_clusters, num_iters = 10):
+    samples = rearrange(x, '... d -> (...) d')
+    num_samples, dim, dtype, device = *samples.shape, x.dtype, x.device
+
+    if num_samples >= num_clusters:
+        indices = torch.randperm(num_samples, device = device)[:num_clusters]
+    else:
+        indices = torch.randint(0, num_samples, (num_clusters,), device = device)
+
+    means = samples[indices]
+
+    for _ in range(num_iters):
+        diffs = rearrange(samples, 'n d -> n () d') - rearrange(means, 'c d -> () c d')
+        dists = (diffs ** 2).sum(dim = -1)
+        buckets = dists.argmin(dim = -1)
+
+        bins = torch.bincount(buckets, minlength = num_clusters)
+        zero_mask = bins == 0
+        bins = bins.masked_fill(zero_mask, 1)
+
+        new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype)
+        new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples)
+        new_means = new_means / bins[..., None]
+        means = torch.where(zero_mask[..., None], means, new_means)
+
+    return rearrange(means, 'n d -> d n')
+
 class VectorQuantize(nn.Module):
     def __init__(
         self,
@@ -23,6 +51,8 @@ def __init__(
         commitment = 1.,
         eps = 1e-5,
         n_embed = None,
+        kmeans_init = False,
+        kmeans_iters = 10
     ):
         super().__init__()
         n_embed = default(n_embed, codebook_size)
@@ -33,26 +63,42 @@ def __init__(
         self.eps = eps
         self.commitment = commitment
 
-        embed = torch.randn(dim, n_embed)
-        self.register_buffer('embed', embed)
+        init_fn = torch.randn if not kmeans_init else torch.zeros
+        embed = init_fn(dim, n_embed)
+
+        self.kmeans_iters = kmeans_iters
+        self.register_buffer('initted', torch.Tensor([not kmeans_init]))
         self.register_buffer('cluster_size', torch.zeros(n_embed))
+        self.register_buffer('embed', embed)
         self.register_buffer('embed_avg', embed.clone())
 
     @property
     def codebook(self):
         return self.embed.transpose(0, 1)
 
+    def init_embed_(self, data):
+        embed = kmeans(data, self.n_embed, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.initted.data.copy_(torch.Tensor([True]))
+
     def forward(self, input):
+        if not self.initted:
+            self.init_embed_(input)
+
         dtype = input.dtype
         flatten = input.reshape(-1, self.dim)
         dist = (
             flatten.pow(2).sum(1, keepdim=True)
             - 2 * flatten @ self.embed
             + self.embed.pow(2).sum(0, keepdim=True)
         )
+
         _, embed_ind = (-dist).max(1)
         embed_onehot = F.one_hot(embed_ind, self.n_embed).type(dtype)
         embed_ind = embed_ind.view(*input.shape[:-1])
+
+        commit_loss = 0.
         quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))
 
         if self.training:
@@ -63,6 +109,7 @@ def forward(self, input):
             embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
             self.embed.data.copy_(embed_normalized)
 
-        loss = F.mse_loss(quantize.detach(), input) * self.commitment
-        quantize = input + (quantize - input).detach()
-        return quantize, embed_ind, loss
+            commit_loss = F.mse_loss(quantize.detach(), input) * self.commitment
+            quantize = input + (quantize - input).detach()
+
+        return quantize, embed_ind, commit_loss
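
The other half of the commit, not computing the commitment loss on eval, comes from pre-setting `commit_loss = 0.` and moving the `F.mse_loss` term and the straight-through pass inside the `if self.training:` branch. A short sketch of the resulting behavior under the standard PyTorch `train()` / `eval()` switches, using only the API shown in this commit:

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq = VectorQuantize(dim = 256, codebook_size = 512, kmeans_init = True)
x = torch.randn(1, 1024, 256)

vq.train()
quantized, indices, commit_loss = vq(x)
# training mode: the first forward pass seeds the codebook with kmeans centroids,
# EMA codebook updates run, and commit_loss is the scaled MSE between input and codes

vq.eval()
quantized, indices, commit_loss = vq(x)
# eval mode: the branch under `if self.training:` is skipped, so no codebook updates
# happen and commit_loss comes back as the plain float 0.
```

Note that the straight-through estimator line is also inside the training branch now, so in eval mode `quantized` is simply the raw codebook lookup.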
