This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 280d2f7

Lukasz Kaiser authored and copybara-github committed
Trax: split layers into smaller files, move chunked Transformer to research, add tests.
PiperOrigin-RevId: 247111854
1 parent ed6343a commit 280d2f7

15 files changed: +770 -489 lines

tensor2tensor/trax/layers/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -24,5 +24,9 @@
 from tensor2tensor.trax.layers.attention import *
 from tensor2tensor.trax.layers.base import *
 from tensor2tensor.trax.layers.combinators import *
+from tensor2tensor.trax.layers.convolution import *
 from tensor2tensor.trax.layers.core import *
+from tensor2tensor.trax.layers.initializers import *
+from tensor2tensor.trax.layers.normalization import *
+from tensor2tensor.trax.layers.pooling import *
 from tensor2tensor.trax.layers.rnn import *
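
Because __init__.py re-exports every submodule with wildcard imports, code that reaches layers through the package keeps working after the split. A minimal sketch of that usage (not part of the commit), assuming this revision is installed and that LayerNorm now lives in the new normalization module -- its removal from attention.py below plus the new import above suggest this:

import tensor2tensor.trax.layers as tl

# Both the moved and the brand-new symbols stay reachable at the package level.
norm = tl.LayerNorm()       # assumed to come from the new normalization module
conv = tl.Conv(30, (3, 3))  # defined in the new convolution module shown below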

tensor2tensor/trax/layers/attention.py

Lines changed: 0 additions & 127 deletions
@@ -59,25 +59,6 @@ def EncoderDecoderMask(x, **unused_kwargs):
   return padding_mask + np.zeros((1, 1, decoder_input.shape[1], 1))
 
 
-# Layer normalization.
-def _layer_norm_new_params(input_shape, rng, epsilon=1e-6):  # pylint: disable=invalid-name
-  """Helper: create layer norm parameters."""
-  del rng, epsilon
-  features = input_shape[-1]
-  scale = np.ones(features)
-  bias = np.zeros(features)
-  return (scale, bias)
-
-
-@base.layer(new_parameters=_layer_norm_new_params)
-def LayerNorm(x, params, epsilon=1e-6, **unused_kwargs):
-  (scale, bias) = params
-  mean = np.mean(x, axis=-1, keepdims=True)
-  variance = np.mean((x - mean)**2, axis=-1, keepdims=True)
-  norm_inputs = (x - mean) / np.sqrt(variance + epsilon)
-  return norm_inputs * scale + bias
-
-
 # Positional encoding.
 def _positional_encoding_new_params(input_shape, rng, max_len=2048):  # pylint: disable=invalid-name
   """Helper: create positional encoding parameters."""
@@ -271,114 +252,6 @@ def MultiHeadedAttention(
   )
 
 
-# Chunked attention.
-def _chunked_selector_output_shape(  # pylint: disable=invalid-name
-    input_shapes, selector=None, **unused_kwargs):
-  """Helper: calculate output shape for chunked key selector (see below)."""
-  # Read the main function below first, the shape logic just follows the ops.
-  selector = selector or (lambda x: [] if x < 1 else [x-1])
-  triples, _ = zip(*input_shapes)
-  (query_shapes, key_shapes, value_shapes) = zip(*triples)
-  result = []
-  for i in range(len(input_shapes)):
-    selected = selector(i)
-    cur_key_shape, cur_value_shape = key_shapes[i], value_shapes[i]
-    # Since keys and values are [batch, length, depth] we concatenate on axis=1.
-    new_key_len = sum([key_shapes[j][1] for j in selected]) + cur_key_shape[1]
-    new_key_shape = (cur_key_shape[0], new_key_len, cur_key_shape[2])
-    new_value_len = sum(
-        [value_shapes[j][1] for j in selected]) + cur_value_shape[1]
-    new_value_shape = (cur_value_shape[0], new_value_len, cur_value_shape[2])
-    # Masks are (1, query-len, key-len).
-    new_mask_shape = (1, query_shapes[i][1], new_key_len)
-    new_shape = ((query_shapes[i], new_key_shape, new_value_shape),
-                 new_mask_shape)
-    result.append(new_shape)
-  return tuple(result)
-
-
-@base.layer(output_shape=_chunked_selector_output_shape)
-def ChunkedAttentionSelector(x, params, selector=None, **kwargs):
-  """Select which chunks to attend to in chunked attention.
-
-  Args:
-    x: inputs, a list of elements of the form (q, k, v), mask for each chunk.
-    params: parameters (unused).
-    selector: a function from chunk_number -> list of chunk numbers that says
-      which other chunks should be appended to the given one (previous if None).
-    **kwargs: unused other arguments.
-
-  Returns:
-    a list of elements of the form (q, k', v'), mask' where k', v' and mask' are
-    concatenations of k, v and identity-extended masks from selected chunks.
-  """
-  del params, kwargs
-  selector = selector or (lambda x: [] if x < 1 else [x-1])
-  triples, masks = zip(*x)
-  (queries, keys, values) = zip(*triples)
-  result = []
-  for i in range(len(x)):
-    selected = selector(i)
-    # Since keys and values are [batch, length, depth] we concatenate on axis=1.
-    # We also always include the current key or value at the end.
-    new_key_list = [keys[j] for j in selected]
-    new_key = np.concatenate(new_key_list + [keys[i]], axis=1)
-    new_value = np.concatenate(
-        [values[j] for j in selected] + [values[i]], axis=1)
-    # Masks are (1, query-len, key-len) so we concatenate on axis=2.
-    new_mask_shapes = [(1, queries[i].shape[1], key.shape[1])
-                       for key in new_key_list]
-    cur_mask = masks[i]
-    # Masks are all-1 for the added chunks (no masking).
-    new_mask_list = [np.ones(s, dtype=cur_mask.dtype) for s in new_mask_shapes]
-    # We still use the current (often causal) mask for the final chunk.
-    new_mask = np.concatenate(new_mask_list + [cur_mask], axis=2)
-    result.append(((queries[i], new_key, new_value), new_mask))
-  return tuple(result)
-
-
-def ChunkedCausalMultiHeadedAttention(
-    feature_depth, num_heads=8, dropout=0.0, chunk_selector=None, mode='train'):
-  """Transformer-style causal multi-headed attention operating on chunks.
-
-  Accepts inputs that are a list of chunks and applies causal attention.
-
-  Args:
-    feature_depth: int: depth of embedding
-    num_heads: int: number of attention heads
-    dropout: float: dropout rate
-    chunk_selector: a function from chunk number to list of chunks to attend.
-    mode: str: 'train' or 'eval'
-
-  Returns:
-    Multi-headed self-attention layer.
-  """
-  prepare_attention_input = combinators.Serial(
-      combinators.Branch(
-          combinators.Branch(  # q = k = v = first input
-              combinators.Copy(), combinators.Copy(), combinators.Copy()),
-          CausalMask(axis=-2),  # pylint: disable=no-value-for-parameter
-      ),
-      combinators.Parallel(
-          combinators.Parallel(
-              core.Dense(feature_depth),
-              core.Dense(feature_depth),
-              core.Dense(feature_depth),
-          ),
-          combinators.Copy()
-      )
-  )
-  return combinators.Serial(
-      combinators.Map(prepare_attention_input),
-      ChunkedAttentionSelector(selector=chunk_selector),  # pylint: disable=no-value-for-parameter
-      combinators.Map(PureMultiHeadedAttention(  # pylint: disable=no-value-for-parameter
-          feature_depth=feature_depth, num_heads=num_heads,
-          dropout=dropout, mode=mode), check_shapes=False),
-      combinators.Map(combinators.Select(0), check_shapes=False),  # drop masks
-      combinators.Map(core.Dense(feature_depth))
-  )
-
-
 @base.layer()
 def ShiftRight(x, **unused_kwargs):
   """Layer to shift the tensor to the right by padding on axis 1."""
tensor2tensor/trax/layers/convolution.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
# coding=utf-8
# Copyright 2019 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Trax convolution layers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools

from jax import lax

import numpy as onp
from tensor2tensor.trax.layers import base
from tensor2tensor.trax.layers import initializers as init


def PadtypeToPads(in_shape, window_shape, window_strides, padding):
  """Convert padding string to list of pairs of pad values."""
  padding = padding.upper()
  if padding == 'SAME':
    out_shape = onp.ceil(
        onp.true_divide(in_shape, window_strides)).astype(int)
    pad_sizes = [max((out_size - 1) * stride + window_shape - in_size, 0)
                 for out_size, stride, window_shape, in_size
                 in zip(out_shape, window_strides, window_shape, in_shape)]
    return [(pad_size // 2, pad_size - pad_size // 2)
            for pad_size in pad_sizes]
  elif padding == 'VALID':
    return [(0, 0)] * len(in_shape)
  else:
    msg = 'Unknown padding type: {}.'
    raise TypeError(msg.format(padding))


class Conv(base.Layer):
  """Layer constructor function for a general convolution layer."""

  def __init__(self, filters, kernel_size, strides=None, padding='VALID',
               dimension_numbers=('NHWC', 'HWIO', 'NHWC'),
               kernel_initializer=None,
               bias_initializer=init.RandomNormalInitializer(1e-6)):
    super(Conv, self).__init__()
    self._filters = filters
    self._kernel_size = kernel_size
    self._padding = padding
    self._dimension_numbers = dimension_numbers
    self._lhs_spec, self._rhs_spec, self._out_spec = dimension_numbers
    self._one = (1,) * len(kernel_size)
    self._strides = strides or self._one
    self._bias_initializer = bias_initializer
    rhs_spec = self._rhs_spec
    self._kernel_initializer = kernel_initializer
    if kernel_initializer is None:
      self._kernel_initializer = init.GlorotNormalInitializer(
          rhs_spec.index('O'), rhs_spec.index('I'))

  def call(self, x, params=(), **kwargs):
    del kwargs
    w, b = params
    return lax.conv_general_dilated(
        x, w, self._strides, self._padding, self._one, self._one,
        self._dimension_numbers) + b

  def _kernel_shape(self, input_shape):
    """Helper to calculate the kernel shape."""
    kernel_size_iter = iter(self._kernel_size)
    return [self._filters if c == 'O' else
            input_shape[self._lhs_spec.index('C')] if c == 'I' else
            next(kernel_size_iter) for c in self._rhs_spec]

  def _conv_shape_tuple(self, lhs_shape, rhs_shape, strides, pads):
    """Compute the shape of a conv given input shapes in canonical order."""
    if isinstance(pads, str):
      pads = PadtypeToPads(lhs_shape[2:], rhs_shape[2:], strides, pads)
    if len(pads) != len(lhs_shape) - 2:
      msg = 'Wrong number of explicit pads for conv: expected {}, got {}.'
      raise TypeError(msg.format(len(lhs_shape) - 2, len(pads)))
    lhs_padded = onp.add(lhs_shape[2:], onp.add(*zip(*pads)))
    out_space = onp.floor_divide(
        onp.subtract(lhs_padded, rhs_shape[2:]), strides) + 1
    out_space = onp.maximum(0, out_space)
    out_shape = (lhs_shape[0], rhs_shape[0]) + tuple(out_space)
    return tuple(out_shape)

  def _conv_general_permutations(self, dimension_numbers):
    """Utility for convolution dimension permutations relative to Conv HLO."""
    lhs_spec, rhs_spec, out_spec = dimension_numbers
    lhs_char, rhs_char, out_char = ('N', 'C'), ('O', 'I'), ('N', 'C')
    charpairs = (lhs_char, rhs_char, out_char)
    for i, (a, b) in enumerate(charpairs):
      if not (dimension_numbers[i].count(a) == 1 and
              dimension_numbers[i].count(b) == 1):
        msg = ('convolution dimension_numbers[{}] must contain the characters '
               '"{}" and "{}" exactly once, got {}.')
        raise TypeError(msg.format(i, a, b, dimension_numbers[i]))
      if len(dimension_numbers[i]) != len(set(dimension_numbers[i])):
        msg = ('convolution dimension_numbers[{}] cannot have duplicate '
               'characters, got {}.')
        raise TypeError(msg.format(i, dimension_numbers[i]))
    if not (set(lhs_spec) - set(lhs_char) == set(rhs_spec) - set(rhs_char) ==
            set(out_spec) - set(out_char)):
      msg = ('convolution dimension_numbers elements must each have the same '
             'set of spatial characters, got {}.')
      raise TypeError(msg.format(dimension_numbers))

    def GetPerm(spec, charpair):
      spatial = (i for i, c in enumerate(spec) if c not in charpair)
      if spec is not rhs_spec:
        spatial = sorted(spatial, key=lambda i: rhs_spec.index(spec[i]))
      return (spec.index(charpair[0]), spec.index(charpair[1])) + tuple(spatial)

    lhs_perm, rhs_perm, out_perm = map(GetPerm, dimension_numbers, charpairs)
    return lhs_perm, rhs_perm, out_perm

  def _conv_general_shape_tuple(self, lhs_shape, rhs_shape, window_strides,
                                padding, dimension_numbers):
    """Generalized computation of conv shape."""
    lhs_perm, rhs_perm, out_perm = self._conv_general_permutations(
        dimension_numbers)
    lhs_trans = onp.take(lhs_shape, lhs_perm)
    rhs_trans = onp.take(rhs_shape, rhs_perm)
    out_trans = self._conv_shape_tuple(
        lhs_trans, rhs_trans, window_strides, padding)
    return tuple(onp.take(out_trans, onp.argsort(out_perm)))

  def output_shape(self, input_shape):
    kernel_shape = self._kernel_shape(input_shape)
    return self._conv_general_shape_tuple(
        input_shape, kernel_shape,
        self._strides, self._padding, self._dimension_numbers)

  def new_parameters(self, input_shape, rng):
    kernel_shape = self._kernel_shape(input_shape)
    bias_shape = [self._filters if c == 'C' else 1 for c in self._out_spec]
    bias_shape = tuple(itertools.dropwhile(lambda x: x == 1, bias_shape))
    w = self._kernel_initializer(kernel_shape, rng)
    b = self._bias_initializer(bias_shape, rng)
    return (w, b)
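
As a quick sanity check on the shape logic above (an illustrative recomputation, not part of the commit): with 'VALID' padding the output spatial size is floor((in - kernel) / stride) + 1, which gives exactly the shape the new test below expects for a (29, 5, 5, 20) NHWC input and Conv(30, (3, 3)):

def valid_out_size(in_size, kernel, stride=1):
  # Mirrors _conv_shape_tuple with 'VALID' padding: no pads are added.
  return max(0, (in_size - kernel) // stride + 1)

batch, height, width, channels = 29, 5, 5, 20  # NHWC input from the test below
filters, kernel = 30, (3, 3)
out_hw = [valid_out_size(s, k) for s, k in zip((height, width), kernel)]
print((batch,) + tuple(out_hw) + (filters,))   # -> (29, 3, 3, 30)
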
tensor2tensor/trax/layers/convolution_test.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2019 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for convolution layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import absltest
from tensor2tensor.trax.layers import base
from tensor2tensor.trax.layers import convolution


class ConvolutionLayerTest(absltest.TestCase):

  def test_conv(self):
    input_shape = (29, 5, 5, 20)
    result_shape = base.check_shape_agreement(
        convolution.Conv(30, (3, 3)), input_shape)
    self.assertEqual(result_shape, (29, 3, 3, 30))


if __name__ == "__main__":
  absltest.main()
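
A hedged follow-on check (not in the commit): with padding='SAME' and stride 1 the spatial dimensions should be preserved, so the same helper ought to report (29, 5, 5, 30) for the input above, a natural extra case if more convolution tests get added.

from tensor2tensor.trax.layers import base
from tensor2tensor.trax.layers import convolution

# Hypothetical extra check, mirroring test_conv but with 'SAME' padding.
result_shape = base.check_shape_agreement(
    convolution.Conv(30, (3, 3), padding='SAME'), (29, 5, 5, 20))
assert tuple(result_shape) == (29, 5, 5, 30)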
