
Commit a053196

Merge branch 'conv'
* conv: (39 commits)
  Refactoring. References #105
  SortaGrad resumption optimization and refactoring. References #105
  SortaGrad resumption optimization and refactoring. References #105
  Fixed inference.py
  Inference.py prototype
  Corrected status print format
  Corrected status print format
  Named eval_dir after the evaluation target. closes #103
  Refactoring
  Added SD and mean values
  Runconfig
  Updated mel feature calculation #100
  Layer dimensions added. closes #89
  Reduced samples to evaluate in sd estimation
  train.txt sorted, bucketed, params etc. #97
  Sorted and removed long seq from train.txt #97
  Removed wav_length estimator function
  Renamed utils folder to util
  estimate_bucket_sizes can now remove longer examples from .txt file
  Added README.md for used datasets #97
  ...
2 parents: b1cb25d + f0f3bbf

17 files changed: +709 −344 lines

python/evaluate.py

Lines changed: 7 additions & 6 deletions
@@ -18,7 +18,7 @@
 import python.model as model


-# Which dataset *.txt file to use for evaluation. 'train' or 'validate'.
+# Which dataset *.txt file to use for evaluation. 'test' or 'validate'.
 EVALUATION_TARGET = 'test'


@@ -83,9 +83,9 @@ def evaluate_once(loss_op, mean_ed_op, wer_op, summary_op, summary_writer):
         wer_sum += wer_batch
         step += 1

-        print('{:%Y-%m-%d %H:%M:%S}: Step {:5,d} results: loss={:7.3f}; '
+        print('{:%Y-%m-%d %H:%M:%S}: Step {:,d} of {:,d}; Results: loss={:7.3f}; '
               'mean_edit_distance={:5.3f}; WER={:5.3f}'
-              .format(datetime.now(), step, loss_batch, mean_ed_batch, wer_batch))
+              .format(datetime.now(), step, num_iter, loss_batch, mean_ed_batch, wer_batch))

     # Compute error rates.
     avg_loss = loss_sum / num_iter
@@ -126,11 +126,11 @@ def evaluate(eval_dir):
     with tf.Graph().as_default() as graph:
         # Get evaluation sequences and ground truth.
         with tf.device('/cpu:0'):
-            sequences, seq_length, labels, label_length, originals = model.inputs(
+            sequences, _, labels, label_length, originals = model.inputs(
                 target=EVALUATION_TARGET)

         # Build a graph that computes the logits predictions from the inference model.
-        logits = model.inference(sequences, seq_length, training=False)
+        logits, seq_length = model.inference(sequences, training=False)

         with tf.variable_scope('loss', reuse=tf.AUTO_REUSE):
             # Calculate error rates
@@ -156,7 +156,8 @@ def main(argv=None):
     """TensorFlow starting routine."""

     # Determine evaluation log directory.
-    eval_dir = FLAGS.eval_dir if len(FLAGS.eval_dir) > 0 else '{}_eval'.format(FLAGS.train_dir)
+    eval_dir = FLAGS.eval_dir if len(FLAGS.eval_dir) > 0 else '{}_{}'\
+        .format(FLAGS.train_dir, EVALUATION_TARGET)

     # Delete old evaluation data if requested.
     if tf.gfile.Exists(eval_dir) and FLAGS.delete:
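Note the signature change: `model.inference` now returns the sequence lengths itself instead of taking them from the input pipeline, presumably because the 'conv' branch introduces convolutional layers whose strides shrink the time dimension, so CTC needs the post-convolution lengths. A minimal sketch of that kind of recomputation (the helper `conv_output_length` is hypothetical, not the project's code):

    def conv_output_length(input_length, filter_size, stride, padding='SAME'):
        """Output time dimension of a 1D convolution, per the standard formula."""
        if padding == 'SAME':
            return (input_length + stride - 1) // stride
        # 'VALID' padding discards frames that do not fill a complete window.
        return (input_length - filter_size + stride) // stride

    # Example: a 2048-frame feature sequence through one stride-2 layer
    # leaves 1024 frames for the CTC decoder.
    print(conv_output_length(2048, filter_size=11, stride=2))  # 1024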

python/inference.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+"""Transcribe a given audio file."""
+
+import os
+
+import tensorflow as tf
+
+from python.params import FLAGS, TF_FLOAT
+from python.loader.load_sample import load_sample, NUM_FEATURES
+
+# WarpCTC crashes during evaluation, even if it is only imported and not actually being used.
+if FLAGS.use_warp_ctc:
+    FLAGS.use_warp_ctc = False
+    import python.model as model
+else:
+    import python.model as model
+
+
+# File to transcribe.
+WAV_FILE = '/home/marc/workspace/datasets/speech_data/timit/TIMIT/TRAIN/DR4/FALR0/SA1.WAV'
+
+
+def transcribe_once(logits_op, decoded_op, plaintext_op, sequences, sequences_ph):
+    """Restore the model from the latest checkpoint and run inference for the provided `sequences`.
+
+    Args:
+        logits_op (tf.Tensor):
+            Logits operator.
+        decoded_op (tf.Tensor):
+            Decoded operator.
+        plaintext_op (tf.Tensor):
+            Plaintext operator.
+        sequences (List[np.ndarray]):
+            Python list of 2D numpy arrays, each containing audio features.
+        sequences_ph (tf.Tensor):
+            Placeholder for the input sequences.
+
+    Returns:
+        Nothing.
+    """
+    # Session configuration.
+    session_config = tf.ConfigProto(
+        log_device_placement=False,
+        gpu_options=tf.GPUOptions(allow_growth=True)
+    )
+
+    with tf.Session(config=session_config) as sess:
+        checkpoint = tf.train.get_checkpoint_state(FLAGS.train_dir)
+        if checkpoint and checkpoint.model_checkpoint_path:
+            saver = tf.train.Saver()
+
+            # Restore from checkpoint.
+            saver.restore(sess, checkpoint.model_checkpoint_path)
+            # Extract global step from checkpoint.
+            global_step = checkpoint.model_checkpoint_path.split('/')[-1].split('-')[-1]
+            global_step = str(global_step)
+            print('Loaded global step: {}, from checkpoint: {}'
+                  .format(global_step, FLAGS.train_dir))
+        else:
+            print('No checkpoint file found.')
+            return
+
+        # Start the queue runners.
+        coord = tf.train.Coordinator()
+        threads = []
+        try:
+            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
+                threads.extend(qr.create_threads(sess, coord=coord, daemon=True, start=True))
+
+            if not coord.should_stop():
+                logits, decoded, plaintext = sess.run([logits_op, decoded_op, plaintext_op],
+                                                      feed_dict={sequences_ph: sequences})
+
+                print('Transcriptions {}:\n{}'.format(plaintext.shape, plaintext))
+
+        except Exception as e:
+            print('EXCEPTION:', e, ', type:', type(e))
+            coord.request_stop(e)
+
+        coord.request_stop()
+        coord.join(threads, stop_grace_period_secs=120)
+
+
+def transcribe():
+    """Load an audio file and prepare the TensorFlow graph for inference.
+
+    Returns:
+        Nothing.
+    """
+    assert os.path.isfile(WAV_FILE)
+
+    with tf.Graph().as_default():
+        # Get evaluation sequences and ground truth.
+        with tf.device('/cpu:0'):
+            # Load audio file into tensor.
+            sequences, _ = load_sample(WAV_FILE)
+            sequences = [sequences] * FLAGS.batch_size
+            sequences_ph = tf.placeholder(dtype=TF_FLOAT,
+                                          shape=[FLAGS.batch_size, None, NUM_FEATURES])
+
+        # Build a graph that computes the logits predictions from the inference model.
+        logits_op, seq_length = model.inference(sequences_ph, training=False)
+
+        decoded_op, plaintext_op, _ = model.decode(logits_op, seq_length, originals=None)
+
+        transcribe_once(logits_op, decoded_op, plaintext_op, sequences, sequences_ph)
+
+
+# noinspection PyUnusedLocal
+def main(argv=None):
+    """TensorFlow starting routine."""
+    transcribe()
+
+
+if __name__ == '__main__':
+    main()
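One design note on `transcribe()`: the graph is built for a fixed `FLAGS.batch_size`, so the single utterance is tiled across the whole batch and every batch slot decodes to the same transcription. A toy illustration of the shape handling (the 80-feature width and batch size 8 are assumptions, not necessarily the project's `NUM_FEATURES` or `FLAGS.batch_size`):

    import numpy as np

    features = np.random.rand(520, 80)   # one utterance: (time, num_features)
    batch = np.stack([features] * 8)     # tile to (batch_size, time, num_features)
    assert batch.shape == (8, 520, 80)   # matches the placeholder [batch_size, None, NUM_FEATURES]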

python/loader/audio_sample_info.py

Lines changed: 29 additions & 4 deletions
@@ -3,8 +3,6 @@
 Note that the network does not use `librosa`_ anymore, because it has problems
 with concurrent sample loading. This module has not been updated yet.

-L8ER: Move away from librosa, use python_speech_features.
-
 .. _librosa:
     https://librosa.github.io/librosa/index.html
 """
@@ -16,6 +14,8 @@
 from librosa import display
 from matplotlib import pyplot as plt

+from python.loader import load_sample as ls
+

 DATASETS_PATH = '/home/marc/workspace/datasets/speech_data'

@@ -36,7 +36,6 @@ def display_sample_info(file_path, label=''):
         raise ValueError('{} does not exist.'.format(file_path))

     # By default, all audio is mixed to mono and resampled to 22050 Hz at load time.
-    # y, sr = rosa.load(file_path, sr=None, mono=True)
     y, sr = rosa.load(file_path, sr=None, mono=True)

     # At 16000 Hz, 512 samples ~= 32ms. At 16000 Hz, 200 samples = 12ms. 16 samples = 1ms @ 16kHz.
@@ -141,6 +140,31 @@ def display_sample_info(file_path, label=''):
     plt.colorbar(format='%+2.0f dB')
     plt.title('Mel spectrogram')

+    # Import project used features (python speech features).
+    normalize_features = 'global'
+    mfcc = ls.load_sample(file_path, feature_type='mfcc', normalize_features=normalize_features,
+                          normalize_signal=False)[0]
+    mfcc = np.swapaxes(mfcc, 0, 1)
+
+    mel = ls.load_sample(file_path, feature_type='mel', normalize_features=normalize_features,
+                         normalize_signal=False)[0]
+    mel = np.swapaxes(mel, 0, 1)
+
+    plt.figure(figsize=(12, 8))
+    plt.subplot(2, 1, 1)
+    display.specshow(mfcc, sr=16000, x_axis='time', y_axis='linear', hop_length=ls.WIN_STEP * 16000)
+    # plt.set_cmap('magma')
+    plt.xticks(rotation=295)
+    plt.colorbar(format='%+2.0f')
+    plt.title('MFCC')
+
+    plt.subplot(2, 1, 2)
+    display.specshow(mel, sr=16000, x_axis='time', y_axis='linear', hop_length=ls.WIN_STEP * 16000)
+    # plt.set_cmap('magma')
+    plt.xticks(rotation=295)
+    plt.colorbar(format='%+2.0f')
+    plt.title('Mel')
+
     plt.tight_layout()
     plt.show()

@@ -151,7 +175,8 @@ def display_sample_info(file_path, label=''):
     # Display specific sample info's.
     with open(_test_txt_path, 'r') as f:
         _lines = f.readlines()
-    _line = _lines[0]
+    _line = _lines[len(_lines) // 5]
+    # _line = _lines[1]
     _wav_path, txt = _line.split(' ', 1)
     _wav_path = os.path.join('/home/marc/workspace/datasets/speech_data', _wav_path)
     _txt = txt.strip()
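The added plots compare librosa's spectrograms with the project's own loader output; per the module docstring and the commit message ("Updated mel feature calculation #100"), the loader is based on `python_speech_features`. A hedged sketch of computing comparable mel and MFCC matrices directly with that library; the file name, window sizes, and filter counts below are assumptions, not the values in `python/loader/load_sample.py`:

    import scipy.io.wavfile as wav
    from python_speech_features import mfcc, logfbank

    sample_rate, signal = wav.read('sample.wav')  # hypothetical 16 kHz mono file
    assert sample_rate == 16000

    # 25 ms windows every 10 ms; these mirror common defaults.
    mel_features = logfbank(signal, samplerate=sample_rate,
                            winlen=0.025, winstep=0.010, nfilt=80)
    mfcc_features = mfcc(signal, samplerate=sample_rate,
                         winlen=0.025, winstep=0.010, numcep=13)

    print(mel_features.shape, mfcc_features.shape)  # (time, 80), (time, 13)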

python/loader/bucket_estimator.py

Lines changed: 35 additions & 5 deletions
@@ -9,7 +9,8 @@
 import numpy as np
 from matplotlib import pyplot as plt

-from python.loader.load_sample import wav_length
+from python.loader.load_sample import load_sample
+from python.util.storage import delete_file_if_exists


 # Path to train.txt file.
@@ -18,31 +19,60 @@
 DATASET_PATH = '/home/marc/workspace/datasets/speech_data/'


-def estimate_bucket_sizes(num_buckets=32):
+def estimate_bucket_sizes(num_buckets=32, max_length=1750):
     """Estimate optimal bucket sizes based on the samples in `train.txt` file.
     Results are printed out or plotted.
+    Optional, if `max_length` is greater than `0`, audio examples with feature vectors longer than
+    `max_length` are being removed from the .txt file.

     Args:
         num_buckets (int): Number of buckets.
             Note that TensorFlow bucketing adds a smallest and largest bucket to the list.
+        max_length (int): Maximum feature vector length of a preprocessed audio example.
+            Longer ones are being removed from the .txt file.
+            Set to `0` to disable removal.

     Returns:
         Nothing.
     """
     with open(TRAIN_TXT_PATH, 'r') as f:
         lines = f.readlines()

+    overlength_counter = 0
     lengths = []
+    tmp_lines = []

     # Progressbar
     for line in tqdm(lines, desc='Reading audio files', total=len(lines), file=sys.stdout,
                      unit='files', dynamic_ncols=True):
-        wav_path = line.split(' ', 1)[0]
+        wav_path, label = line.split(' ', 1)
         wav_path = os.path.join(DATASET_PATH, wav_path)
-        sample_len = wav_length(wav_path)
-        lengths.append(sample_len)
+        _, sample_len = load_sample(wav_path, feature_type='mel',
+                                    normalize_features=False, normalize_signal=False)
+
+        if max_length > 0:
+            if sample_len < max_length:
+                lengths.append(sample_len)
+                tmp_lines.append(line)
+            else:
+                overlength_counter += 1
+
+        else:
+            lengths.append(sample_len)
+            tmp_lines.append(line)
+
     print()  # Clear line from tqdm progressbar.

+    # Write reduced data back to .txt file, if selected.
+    if max_length > 0:
+        print('{} examples have a length greater than {} and have been removed from .txt file.'
+              .format(overlength_counter, max_length))
+
+        delete_file_if_exists(TRAIN_TXT_PATH)
+        with open(TRAIN_TXT_PATH, 'w') as f:
+            f.writelines(tmp_lines)
+
+    print('Evaluated {} examples.'.format(len(lengths)))
     lengths = np.array(lengths)
     lengths = np.sort(lengths)
     step = len(lengths) // num_buckets
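For context, boundaries like the ones this script estimates are typically consumed by TensorFlow's queue-based bucketing in a TF 1.x pipeline. A hedged sketch of that hand-off; the tensors and boundary values below are illustrative, not taken from the repository:

    import tensorflow as tf

    # Hypothetical single-example tensors, e.g. produced by a file reader.
    sequence = tf.placeholder(tf.float32, shape=[None, 80])  # (time, num_features)
    label = tf.placeholder(tf.int32, shape=[None])
    seq_len = tf.shape(sequence)[0]

    # Illustrative boundary list; as the docstring above notes, TensorFlow adds
    # an implicit smallest and largest bucket around it.
    bucket_boundaries = [312, 421, 508, 587, 664, 742, 827, 926, 1054, 1250]

    _, (sequences, labels) = tf.contrib.training.bucket_by_sequence_length(
        input_length=seq_len,
        tensors=[sequence, label],
        batch_size=16,
        bucket_boundaries=bucket_boundaries,
        dynamic_pad=True,  # pad each batch to its bucket's longest example
        allow_smaller_final_batch=False)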
