
Commit 3debe66

Merge pull request #77 from lucidrains/LFQ
begin work on the proposed "lookup free quantization"
2 parents 6fe65cf + 000c3cb commit 3debe66

File tree

6 files changed · +355 -7 lines changed


README.md

Lines changed: 42 additions & 4 deletions
@@ -284,12 +284,39 @@ assert xhat.shape == x.shape
 assert torch.all(xhat == quantizer.indices_to_codes(indices))
 ```
 
+### Lookup Free Quantization
 
-## Todo
+<img src="./lfq.png" width="450px"></img>
 
-- [x] allow for multi-headed codebooks
-- [x] support masking
-- [x] make sure affine param works with (`sync_affine_param` set to `True`)
+The research team behind <a href="https://arxiv.org/abs/2212.05199">MagViT</a> has released new SOTA results for generative video modeling. The core change between v1 and v2 of their architecture is a new type of quantization: essentially <a href="https://arxiv.org/abs/2309.15505">Finite Scalar Quantization</a> with 2 levels (binary latents), of which FSQ is a generalization. In addition, the team uses extra entropy regularizations to promote codebook usage.
+
+Finite scalar quantization and its follow-up papers will likely lead to further game-changing results in generative modeling.
+
+You can use it simply as follows. It will be dogfooded at the <a href="https://github.com/lucidrains/magvit2-pytorch">MagViT2 pytorch port</a>.
+
+```python
+import torch
+from vector_quantize_pytorch import LFQ
+
+# you can specify either dim or codebook_size
+# if both are specified, they will be validated against each other
+
+quantizer = LFQ(
+    dim = 16,                   # the input feature dimension, which is also log2(codebook_size)
+    # codebook_size = 2 ** 16,  # correspondingly, this would be 2 ** dim, since each scalar in the feature dimension is a binary latent
+    entropy_loss_weight = 0.1,  # how much weight to place on the entropy loss
+    diversity_gamma = 1.        # within the entropy loss, how much weight to give to diversity of codes, taken from https://arxiv.org/abs/1911.05894
+)
+
+image_feats = torch.randn(1, 16, 32, 32)
+
+quantized, indices, entropy_aux_loss = quantizer(image_feats)
+
+# (1, 16, 32, 32), (1, 32, 32), (1,)
+
+assert image_feats.shape == quantized.shape
+assert (quantized == quantizer.indices_to_codes(indices)).all()
+```
 
 ## Citations
 
@@ -429,3 +456,14 @@ assert torch.all(xhat == quantizer.indices_to_codes(indices))
     primaryClass = {cs.CV}
 }
 ```
+
+```bibtex
+@misc{yu2023language,
+    title   = {Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation},
+    author  = {Lijun Yu and José Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang},
+    year    = {2023},
+    eprint  = {2310.05737},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CV}
+}
+```
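
The README snippet above describes LFQ as FSQ with two levels: each scalar of the feature vector becomes a binary latent, and the resulting bit pattern doubles as the code index, so no explicit codebook lookup is needed. Here is a minimal standalone sketch of that idea (not part of this commit; the MSB-first bit ordering is an illustrative assumption, and the library's `indices_to_codes` remains the source of truth):

```python
import torch

dim = 4                                    # feature dimension; codebook size is 2 ** dim = 16

x = torch.tensor([0.3, -1.2, 0.7, -0.1])   # one feature vector coming out of an encoder

# each scalar is quantized to its sign, giving one binary latent per dimension
code = torch.where(x > 0, torch.ones_like(x), -torch.ones_like(x))   # tensor([ 1., -1.,  1., -1.])

# read the bits (+1 -> 1, -1 -> 0) as a base-2 integer, MSB first (illustrative ordering)
bits = (code > 0).long()                    # tensor([1, 0, 1, 0])
mask = 2 ** torch.arange(dim - 1, -1, -1)   # tensor([8, 4, 2, 1])
index = (bits * mask).sum()                 # tensor(10) - the code index, no lookup table involved

# and back again: unpack the index into bits and map {0, 1} -> {-1, +1}
recovered = ((index & mask) != 0).float() * 2 - 1

assert torch.equal(recovered, code)
print(index.item())   # 10
```

With `dim = 16` as in the README example, the same mapping yields indices in `[0, 2 ** 16)`.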

examples/autoencoder_lfq.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# FashionMnist VQ experiment with various settings.
# From https://github.com/minyoungg/vqtorch/blob/main/examples/autoencoder.py

from tqdm.auto import trange
from math import log2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from vector_quantize_pytorch import LFQ

lr = 3e-4
train_iter = 10000
seed = 1234
codebook_size = 2 ** 8
diversity_gamma = 10.
device = "cuda" if torch.cuda.is_available() else "cpu"

class LFQAutoEncoder(nn.Module):
    def __init__(
        self,
        codebook_size,
        **vq_kwargs
    ):
        super().__init__()
        assert log2(codebook_size).is_integer()
        quantize_dim = int(log2(codebook_size))

        self.encode = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.GELU(),
            nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=1),
            nn.GELU(),
            nn.Conv2d(16, quantize_dim, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.quantize = LFQ(dim=quantize_dim, **vq_kwargs)

        self.decode = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"),
            nn.Conv2d(quantize_dim, 16, kernel_size=3, stride=1, padding=1),
            nn.GELU(),
            nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=1),
            nn.GELU(),
            nn.Upsample(scale_factor=2, mode="nearest"),
            nn.Conv2d(16, 1, kernel_size=3, stride=1, padding=1)
        )
        return

    def forward(self, x):
        x = self.encode(x)
        x, indices, entropy_aux_loss = self.quantize(x)
        x = self.decode(x)
        return x.clamp(-1, 1), indices, entropy_aux_loss


def train(model, train_loader, train_iterations=1000):
    def iterate_dataset(data_loader):
        data_iter = iter(data_loader)
        while True:
            try:
                x, y = next(data_iter)
            except StopIteration:
                data_iter = iter(data_loader)
                x, y = next(data_iter)
            yield x.to(device), y.to(device)

    for _ in (pbar := trange(train_iterations)):
        opt.zero_grad()
        x, _ = next(iterate_dataset(train_loader))
        out, indices, entropy_aux_loss = model(x)

        rec_loss = F.l1_loss(out, x)
        (rec_loss + entropy_aux_loss).backward()

        opt.step()
        pbar.set_description(
            f"rec loss: {rec_loss.item():.3f} | "
            + f"entropy aux loss: {entropy_aux_loss.item():.3f} | "
            + f"active %: {indices.unique().numel() / codebook_size * 100:.3f}"
        )
    return

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

train_dataset = DataLoader(
    datasets.FashionMNIST(
        root="~/data/fashion_mnist", train=True, download=True, transform=transform
    ),
    batch_size=256,
    shuffle=True,
)

print("baseline")

torch.random.manual_seed(seed)

model = LFQAutoEncoder(
    codebook_size = codebook_size,
    diversity_gamma = diversity_gamma
).to(device)

opt = torch.optim.AdamW(model.parameters(), lr=lr)

train(model, train_dataset, train_iterations=train_iter)

lfq.png

92.8 KB

setup.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '1.8.1',
+  version = '1.9.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   long_description_content_type = 'text/markdown',
@@ -17,7 +17,7 @@
     'quantization'
   ],
   install_requires=[
-    'einops>=0.6.1',
+    'einops>=0.7.0',
    'torch'
  ],
  classifiers=[

vector_quantize_pytorch/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from vector_quantize_pytorch.vector_quantize_pytorch import VectorQuantize
 from vector_quantize_pytorch.residual_vq import ResidualVQ, GroupedResidualVQ
 from vector_quantize_pytorch.random_projection_quantizer import RandomProjectionQuantizer
-from vector_quantize_pytorch.finite_scalar_quantization import FSQ
+from vector_quantize_pytorch.finite_scalar_quantization import FSQ
+from vector_quantize_pytorch.lookup_free_quantization import LFQ
vector_quantize_pytorch/lookup_free_quantization.py

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
"""
Lookup Free Quantization
Proposed in https://arxiv.org/abs/2310.05737

basically a 2-level FSQ (Finite Scalar Quantization) with entropy loss
https://arxiv.org/abs/2309.15505
"""

from math import log2
from collections import namedtuple

import torch
from torch import nn, Tensor
from torch.nn import Module, ModuleList

from einops import rearrange, reduce, pack, unpack

# constants

Return = namedtuple('Return', ['quantized', 'indices', 'entropy_aux_loss'])

# helper functions

def exists(v):
    return v is not None

def default(*args):
    for arg in args:
        if exists(arg):
            return arg
    return None

def pack_one(t, pattern):
    return pack([t], pattern)

def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]

# entropy

def binary_entropy(prob):
    return -prob * log(prob) - (1 - prob) * log(1 - prob)

# tensor helpers

def log(t, eps = 1e-20):
    return t.clamp(min = eps).log()

# convert to bit representations and back

def decimal_to_bits(x, bits):
    device = x.device

    x = x.int()

    mask = 2 ** torch.arange(bits - 1, -1, -1, device = device)
    x = rearrange(x, 'b n -> b n 1')

    bits = ((x & mask) != 0).float()
    bits = rearrange(bits, 'b n d -> b n d')
    return bits * 2 - 1

def bits_to_decimal(x, bits):
    device, dtype = x.device, x.dtype

    x = (x > 0).int()

    mask = 2 ** torch.arange(bits - 1, -1, -1, device = device, dtype = torch.int32)
    dec = reduce(x * mask, 'b n d -> b n', 'sum')
    return dec

# class

class LFQ(Module):
    def __init__(
        self,
        *,
        dim = None,
        codebook_size = None,
        entropy_loss_weight = 0.1,
        diversity_gamma = 2.5
    ):
        super().__init__()

        # some assert validations

        assert exists(dim) or exists(codebook_size)
        assert not exists(codebook_size) or log2(codebook_size).is_integer()

        codebook_size = default(codebook_size, 2 ** dim)
        dim = default(dim, int(log2(codebook_size)))

        assert (2 ** dim) == codebook_size, f'2 ^ dimension ({dim}) must be equal to the codebook size ({codebook_size})'

        self.dim = dim

        # entropy aux loss related weights

        self.diversity_gamma = diversity_gamma
        self.entropy_loss_weight = entropy_loss_weight

        # for no auxiliary loss, during inference

        self.register_buffer('zero', torch.zeros(1,), persistent = False)

    def indices_to_codes(self, indices):
        is_img_or_video = indices.ndim >= 3

        # rearrange if image or video into (batch, seq, dimension)

        if is_img_or_video:
            indices, ps = pack_one(indices, 'b *')

        # indices to codes, which are bits of either -1 or 1

        codes = decimal_to_bits(indices, self.dim)

        # rearrange codes back to original shape

        if is_img_or_video:
            codes = unpack_one(codes, ps, 'b * d')
            codes = rearrange(codes, 'b ... d -> b d ...')

        return codes

    def forward(
        self,
        x,
        inv_temperature = 1.
    ):
        """
        einstein notation
        b - batch
        n - sequence (or flattened spatial dimensions)
        d - feature dimension, which is also log2(codebook size)
        """

        is_img_or_video = x.ndim >= 4

        # rearrange if image or video into (batch, seq, dimension)

        if is_img_or_video:
            x = rearrange(x, 'b d ... -> b ... d')
            x, ps = pack_one(x, 'b * d')

        assert x.shape[-1] == self.dim

        # quantize by eq 3.

        greater_than_zero = x > 0
        ones = torch.ones_like(x)

        quantized = torch.where(greater_than_zero, ones, -ones)

        # use straight-through gradients with tanh if training

        if self.training:
            x = torch.tanh(x * inv_temperature)
            x = x - x.detach() + quantized
        else:
            x = quantized

        # calculate indices

        indices = bits_to_decimal(x, self.dim)

        # entropy aux loss (todo)

        if self.training:
            prob = (x * inv_temperature).sigmoid()

            bit_entropy = binary_entropy(prob).mean()

            avg_prob = reduce(prob, 'b n d -> b d', 'mean')
            codebook_entropy = binary_entropy(avg_prob).mean()

            # 1. entropy will be nudged to be low for each bit, so each scalar commits to one latent binary bit or the other
            # 2. codebook entropy will be nudged to be high, to encourage all codes to be uniformly used

            entropy_aux_loss = bit_entropy - self.diversity_gamma * codebook_entropy
        else:
            # if not training, just return dummy 0
            entropy_aux_loss = self.zero

        entropy_aux_loss = entropy_aux_loss * self.entropy_loss_weight

        # reconstitute image or video dimensions

        if is_img_or_video:
            x = unpack_one(x, ps, 'b * d')
            x = rearrange(x, 'b ... d -> b d ...')

            indices = unpack_one(indices, ps, 'b *')

        # bits to decimal for the codebook indices

        return Return(x, indices, entropy_aux_loss)
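
The comments in `forward` describe two opposing entropy pressures: per-bit entropy is pushed down so each scalar commits to -1 or +1, while the entropy of the averaged bit probabilities is pushed up so codes are, on average, used uniformly. Here is a small standalone sketch of those two terms (not part of this commit; the toy shapes and `diversity_gamma` value are illustrative assumptions):

```python
import torch

def binary_entropy(p, eps = 1e-20):
    # entropy of a Bernoulli variable, where p is the probability of the bit being +1
    p = p.clamp(min = eps, max = 1 - eps)
    return -p * p.log() - (1 - p) * (1 - p).log()

torch.manual_seed(0)

logits = torch.randn(8, 32, 4)   # (batch, sequence, bits) pre-quantization features
prob = logits.sigmoid()          # probability of each bit being +1

# 1. mean per-bit entropy - nudged low so each scalar commits to one binary latent or the other
bit_entropy = binary_entropy(prob).mean()

# 2. entropy of the sequence-averaged bit probabilities - nudged high so all codes get used
avg_prob = prob.mean(dim = 1)    # (batch, bits)
codebook_entropy = binary_entropy(avg_prob).mean()

diversity_gamma = 1.             # illustrative value
entropy_aux_loss = bit_entropy - diversity_gamma * codebook_entropy

print(f'bit entropy: {bit_entropy.item():.3f}')
print(f'codebook entropy: {codebook_entropy.item():.3f}')
print(f'entropy aux loss: {entropy_aux_loss.item():.3f}')
```

A low `bit_entropy` together with a high `codebook_entropy` drives `entropy_aux_loss` down, matching the two numbered comments in the module above.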

0 commit comments
