
Commit 029047a

Merge pull request #104 from lukaszkaiser/push

Release 1.0.11

2 parents (1692515 + b88c13b), commit 029047a

11 files changed: +403 additions, -316 deletions


tensor2tensor/data_generators/generator_utils.py

Lines changed: 10 additions & 7 deletions
@@ -29,7 +29,7 @@
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3

-from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder
+from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.tokenizer import Tokenizer

 import tensorflow as tf
@@ -218,15 +218,18 @@ def gunzip_file(gz_path, new_path):
 ]


-def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
-  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
+def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None):
+  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
   vocab_filepath = os.path.join(tmp_dir, vocab_filename)
   if os.path.exists(vocab_filepath):
-    vocab = SubwordTextEncoder(vocab_filepath)
+    tf.logging.info("Found vocab file: %s", vocab_filepath)
+    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
     return vocab

+  sources = sources or _DATA_FILE_URLS
+  tf.logging.info("Generating vocab from: %s", str(sources))
   tokenizer = Tokenizer()
-  for source in _DATA_FILE_URLS:
+  for source in sources:
     url = source[0]
     filename = os.path.basename(url)
     read_type = "r:gz" if "tgz" in filename else "r"
@@ -259,9 +262,9 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
           break
         line = line.strip()
         file_byte_budget -= len(line)
-        _ = tokenizer.encode(line)
+        _ = tokenizer.encode(text_encoder.native_to_unicode(line))

-  vocab = SubwordTextEncoder.build_to_target_size(
+  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
       vocab_size, tokenizer.token_counts, 1, 1e3)
   vocab.store_to_file(vocab_filepath)
   return vocab
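
The new sources argument lets a caller build the vocabulary from its own download list instead of the module-level _DATA_FILE_URLS. A minimal sketch of how that might be used; the two-element (url, filenames) entry layout, the placeholder URLs, the temp directory, and the vocab size are assumptions for illustration, since the diff only shows that source[0] is read as the URL:

    from tensor2tensor.data_generators import generator_utils

    # Hypothetical corpus list mirroring the _DATA_FILE_URLS layout;
    # entry[0] is the download URL, which is the only field the loop
    # above reads directly. The URLs are placeholders, not real datasets.
    my_sources = [
        ("http://example.com/corpus_a.tgz", ["corpus_a.en", "corpus_a.de"]),
        ("http://example.com/corpus_b.txt", ["corpus_b.txt"]),
    ]

    # Builds (or reloads) a SubwordTextEncoder vocab of roughly 32k
    # subtokens in tmp_dir, reading only the given sources.
    vocab = generator_utils.get_or_generate_vocab(
        "/tmp/t2t_datagen", "my_vocab.subwords", 2**15, sources=my_sources)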

tensor2tensor/data_generators/problem_hparams.py

Lines changed: 11 additions & 1 deletion
@@ -664,8 +664,17 @@ def image_mscoco_tokens(model_hparams, vocab_count):
   }
   p.batch_size_multiplier = 256
   p.max_expected_batch_size_per_shard = 2
+
+
+def img2img_imagenet(unused_model_hparams):
+  """Image 2 Image for imagenet dataset."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": ("image:identity", None)}
+  p.target_modality = ("image:identity", None)
+  p.batch_size_multiplier = 256
+  p.max_expected_batch_size_per_shard = 4
   p.input_space_id = 1
-  p.target_space_id = 3
+  p.target_space_id = 1
   return p


@@ -732,4 +741,5 @@ def image_mscoco_tokens(model_hparams, vocab_count):
     "image_mscoco_tokens_128k_tune": lambda p: image_mscoco_tokens(p, 2**17),
     "image_mscoco_tokens_128k_test": lambda p: image_mscoco_tokens(p, 2**17),
     "image_imagenet": image_imagenet,
+    "img2img_imagenet": img2img_imagenet,
 }
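
The new img2img_imagenet entry wires an image-to-image ImageNet problem into the hparams table above. A rough sketch of what the registered callable produces; the direct call with None and the assertions are illustrative, not part of the commit, but the attribute values match the diff:

    from tensor2tensor.data_generators import problem_hparams

    # The table maps a problem name to a function of the model hparams;
    # img2img_imagenet ignores its argument (unused_model_hparams).
    p = problem_hparams.img2img_imagenet(None)

    # Both sides of the problem are raw images through an identity
    # modality, and input/target share space id 1.
    assert p.input_modality == {"inputs": ("image:identity", None)}
    assert p.target_modality == ("image:identity", None)
    assert p.input_space_id == p.target_space_id == 1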

tensor2tensor/data_generators/text_encoder.py

Lines changed: 10 additions & 7 deletions
@@ -36,10 +36,10 @@


 # Conversion between Unicode and UTF-8, if required (on Python2)
-_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
+native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)


-_unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
+unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)


 # Reserved tokens for things like padding and EOS symbols.
@@ -220,7 +220,7 @@ def encode(self, raw_text):
       a list of integers in the range [0, vocab_size)
     """
     return self._tokens_to_subtokens(self._tokenizer.encode(
-        _native_to_unicode(raw_text)))
+        native_to_unicode(raw_text)))

   def decode(self, subtokens):
     """Converts a sequence of subtoken ids to a native string.
@@ -230,7 +230,7 @@ def decode(self, subtokens):
     Returns:
       a native string
     """
-    return _unicode_to_native(self._tokenizer.decode(
+    return unicode_to_native(self._tokenizer.decode(
         self._subtokens_to_tokens(subtokens)))

   @property
@@ -335,6 +335,9 @@ def bisect(min_val, max_val):
       else:
         other_subtokenizer = bisect(min_val, present_count - 1)

+      if other_subtokenizer is None:
+        return subtokenizer
+
       if (abs(other_subtokenizer.vocab_size - target_size) <
           abs(subtokenizer.vocab_size - target_size)):
         return other_subtokenizer
@@ -449,13 +452,13 @@ def _load_from_file(self, filename):
     subtoken_strings = []
     with tf.gfile.Open(filename) as f:
       for line in f:
-        subtoken_strings.append(_native_to_unicode(line.strip()[1:-1]))
+        subtoken_strings.append(native_to_unicode(line.strip()[1:-1]))
     self._init_from_list(subtoken_strings)

   def store_to_file(self, filename):
     with tf.gfile.Open(filename, "w") as f:
       for subtoken_string in self._all_subtoken_strings:
-        f.write("'" + _unicode_to_native(subtoken_string) + "'\n")
+        f.write("'" + unicode_to_native(subtoken_string) + "'\n")

   def _escape_token(self, token):
     r"""Escape away underscores and OOV characters and append '_'.
@@ -524,7 +527,7 @@ def get_token_counts(cls, text_filepattern, corpus_max_lines):
       with tf.gfile.Open(text_filename) as f:
         for line in f:
           # The tokenizer updates token_counts in encode()
-          tok.encode(_native_to_unicode(line.strip()))
+          tok.encode(native_to_unicode(line.strip()))
           lines_read += 1
           if corpus_max_lines > 0 and lines_read > corpus_max_lines:
             return tok.token_counts
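
Dropping the leading underscores makes native_to_unicode and unicode_to_native part of text_encoder's public surface, which is what lets generator_utils call them above. A small sketch of the round trip they provide; the sample string is illustrative, and the Python 2 vs. 3 behavior follows directly from the lambdas in the diff:

    # -*- coding: utf-8 -*-
    from tensor2tensor.data_generators import text_encoder

    native = "déjà vu"  # str: UTF-8 bytes on Python 2, text on Python 3

    # On Python 2 this decodes UTF-8 bytes to unicode; on Python 3 it is a no-op.
    as_unicode = text_encoder.native_to_unicode(native)

    # unicode_to_native inverts the conversion, so the round trip is lossless.
    assert text_encoder.unicode_to_native(as_unicode) == native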
