
Commit 77042bd

released 1.2.6
1 parent 0d0bf45 commit 77042bd

8 files changed: +116 -107 lines changed


docs/huggingface-repository.ipynb

Lines changed: 4 additions & 17 deletions
@@ -22,14 +22,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
-    "\n",
-    "**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
+    "Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,20 +36,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'1.2.7'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "malaya_speech.__version__"
    ]
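The merged markdown cell folds the old 1.2.7 note into the 1.2.6 one: the HuggingFace repository is both available and the default backend. A minimal sketch of what the notebook's code cells execute (the printed value assumes this 1.2.6 release is the one installed):

    import malaya_speech

    # With this release, model downloads default to the repository at
    # https://huggingface.co/huseinzol05.
    print(malaya_speech.__version__)  # '1.2.6'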

example/huggingface-repository/huggingface-repository.ipynb

Lines changed: 4 additions & 17 deletions
@@ -22,14 +22,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed.\n",
-    "\n",
-    "**Starting Malaya-Speech 1.2.7, by default Malaya-Speech will use HuggingFace as backend repository**."
+    "Starting Malaya-Speech 1.2.6, you can load Malaya-Speech models from https://huggingface.co/huseinzol05 to get better download speed, and by default Malaya-Speech will use HuggingFace as backend repository."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,20 +36,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'1.2.7'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "malaya_speech.__version__"
    ]

malaya_speech/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -9,7 +9,7 @@
 from malaya_boilerplate.utils import get_home
 
 version = '1.2'
-bump_version = '1.2.7'
+bump_version = '1.2.6'
 __version__ = bump_version
 
 package = 'malaya-speech'
@@ -57,5 +57,6 @@
     padding,
     split,
     subword,
-    tf_featurization)
+    tf_featurization
+)
 from .utils.read import load, resample

malaya_speech/train/model/hubert/model.py

Lines changed: 1 addition & 0 deletions
@@ -279,5 +279,6 @@ def compute_pred(proj_x, target, label_embs):
             "logit_u_list": logit_u_list,
             "padding_mask": padding_mask,
             "features_pen": features_pen,
+            'x': x,
         }
         return result
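This one-line change exposes the encoder's hidden sequence (r['x']) to callers of the forward pass; the speaker-embedding script later in this commit pools its first frame into an utterance-level representation. A self-contained NumPy sketch of that pooling step (shapes are illustrative assumptions, not taken from the model config):

    import numpy as np

    # Toy stand-in for r['x']: batch of 2 sequences, 5 frames, hidden size 4.
    seq = np.random.randn(2, 5, 4).astype('float32')

    # The training script pools the first frame as a CLS-like embedding:
    first_token_tensor = np.squeeze(seq[:, 0:1, :], axis=1)
    print(first_token_tensor.shape)  # (2, 4)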

malaya_speech/utils/text.py

Lines changed: 28 additions & 0 deletions
@@ -16,6 +16,34 @@
 _rejected = '\'():;"'
 _punct = ':;,.?'
 
+PRONUNCIATION = {
+    'A': 'ae',
+    'B': 'bi',
+    'C': 'si',
+    'D': 'di',
+    'E': 'ei',
+    'F': 'ef',
+    'G': 'ji',
+    'H': 'hesh',
+    'I': 'ai',
+    'J': 'jei',
+    'K': 'kei',
+    'L': 'el',
+    'M': 'eim',
+    'N': 'ein',
+    'O': 'ou',
+    'P': 'pi',
+    'Q': 'qeu',
+    'R': 'ar',
+    'S': 'es',
+    'T': 'ti',
+    'U': 'yu',
+    'V': 'vi',
+    'W': 'dablui',
+    'X': 'ex',
+    'Y': 'wai',
+    'Z': 'zed',
+}
 
 TTS_SYMBOLS = (
     [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
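The new PRONUNCIATION table maps each uppercase letter to a Malay-flavoured spoken form, presumably for verbalising letters and acronyms before TTS. A self-contained sketch of applying it (spell_acronym is a hypothetical helper, not part of this commit):

    PRONUNCIATION = {'A': 'ae', 'B': 'bi', 'C': 'si'}  # excerpt of the table above

    def spell_acronym(word):
        # Hypothetical helper: replace each known letter with its spoken form.
        return ' '.join(PRONUNCIATION.get(c, c) for c in word.upper())

    print(spell_acronym('abc'))  # ae bi si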

malaya_speech/vocoder.py

Lines changed: 0 additions & 12 deletions
@@ -42,11 +42,6 @@
         'Quantized Size (MB)': 19.9,
         'Mel loss': 0.4591,
     },
-    'universal-384': {
-        'Size (MB)': 78.4,
-        'Quantized Size (MB)': 19.9,
-        'Mel loss': 0.4591,
-    },
 }
 
 _mbmelgan_availability = {
@@ -83,11 +78,6 @@
         'Quantized Size (MB)': 2.49,
         'Mel loss': 0.5547,
     },
-    'universal-1024': {
-        'Size (MB)': 72.8,
-        'Quantized Size (MB)': 18.5,
-        'Mel loss': 0.3617,
-    },
     'universal-768': {
         'Size (MB)': 72.8,
         'Quantized Size (MB)': 18.5,
@@ -147,7 +137,6 @@ def melgan(model: str = 'universal-1024', quantized: bool = False, **kwargs):
     * ``'female-singlish'`` - MelGAN trained on Female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
     * ``'universal'`` - Universal MelGAN trained on multiple speakers.
     * ``'universal-1024'`` - Universal MelGAN with 1024 filters trained on multiple speakers.
-    * ``'universal-384'`` - Universal MelGAN with 384 filters trained on multiple speakers.
 
     quantized : bool, optional (default=False)
         if True, will load 8-bit quantized model.
@@ -219,7 +208,6 @@ def hifigan(model: str = 'universal-768', quantized: bool = False, **kwargs):
 
     * ``'female'`` - HiFiGAN trained on female voice.
     * ``'male'`` - HiFiGAN trained on male voice.
-    * ``'universal-1024'`` - Universal HiFiGAN with 1024 filters trained on multiple speakers.
     * ``'universal-768'`` - Universal HiFiGAN with 768 filters trained on multiple speakers.
     * ``'universal-512'`` - Universal HiFiGAN with 512 filters trained on multiple speakers.
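With the 'universal-384' MelGAN and 'universal-1024' HiFiGAN entries removed from the availability tables and docstrings, only the documented keys remain loadable. A minimal usage sketch against the signatures shown above (weights download on first call, network access assumed):

    import malaya_speech

    # Defaults match the function signatures in this diff.
    melgan = malaya_speech.vocoder.melgan(model='universal-1024')
    hifigan = malaya_speech.vocoder.hifigan(model='universal-768')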

pretrained-model/speaker-embedding/hubert/hubert-base.py

Lines changed: 75 additions & 58 deletions
@@ -8,8 +8,6 @@
 import malaya_speech.train as train
 from malaya_speech.train.model.conformer.model import Model as ConformerModel
 from malaya_speech.train.model import hubert
-import tensorflow.keras as keras
-import tensorflow.keras.backend as K
 import numpy as np
 import string
 import json
@@ -27,71 +25,89 @@
 test_set = glob('/home/husein/youtube/voxceleb-wav/*.wav')
 
 sr = 16000
-maxlen = 18
-minlen = 3
-weight_decay = 1e-5
+maxlen = 15
+minlen = 2
+kmean = hubert.kmeans.ApplyKmeans_TF('kmean.km')
 
 
 def generate(files):
     while True:
         random.shuffle(files)
         for f in files:
             f = f.decode() if isinstance(f, bytes) else f
-            x, _ = malaya_speech.load(f)
+            wav_data, _ = malaya_speech.load(f)
             label = os.path.split(f)[1].replace('wav-', '').split('-')[1]
             y = int(ids[label])
 
-            len_x = len(x)
+            len_x = len(wav_data) / sr
 
-            if (len_x / sr) < minlen:
+            if len_x < minlen:
                 continue
 
-            if (len_x / sr) > maxlen:
-                x = augmentation.random_sampling(x, sr, random.randint(1000 * minlen, 1000 * maxlen))
+            if len_x > maxlen:
+                wav_data = augmentation.random_sampling(wav_data, sr, random.randint(1000 * minlen, 1000 * maxlen))
 
             yield {
-                'waveforms': x,
-                'waveforms_length': [len(x)],
+                'waveforms': wav_data,
+                'waveforms_length': [len(wav_data)],
                 'Y': [y],
             }
 
 
-def get_dataset(files, batch_size=4, shuffle_size=32, thread_count=24):
+def preprocess_inputs(example):
+    v = featurizer.vectorize(example['waveforms'])
+    deltas = malaya_speech.utils.tf_featurization.deltas(v)
+    ddeltas = malaya_speech.utils.tf_featurization.deltas(deltas)
+    concated = tf.concat([v, deltas, ddeltas], axis=1)
+    s = tf.compat.v1.numpy_function(kmean, [concated], tf.int64)
+    s = tf.cast(s, tf.int32)
+    kmean_tf = tf.reshape(s, (-1,)) + 3
+    example['targets'] = kmean_tf
+    return example
+
+
+def get_dataset(
+    file,
+    batch_size=4,
+    shuffle_size=20,
+    thread_count=24,
+    maxlen_feature=1800,
+):
     def get():
         dataset = tf.data.Dataset.from_generator(
             generate,
-            {
-                'waveforms': tf.float32,
-                'waveforms_length': tf.int32,
-                'Y': tf.int32,
-            },
+            {'waveforms': tf.float32,
+             'waveforms_length': tf.int32,
+             'Y': tf.int32,
+             },
            output_shapes={
                 'waveforms': tf.TensorShape([None]),
                 'waveforms_length': tf.TensorShape([None]),
                 'Y': tf.TensorShape([None]),
             },
-            args=(files,),
+            args=(file,),
         )
-        dataset = dataset.filter(
-            lambda x: tf.less(tf.shape(x['waveforms'])[0] / sr, maxlen)
-        )
-        dataset = dataset.filter(
-            lambda x: tf.greater(tf.shape(x['waveforms'])[0] / sr, minlen)
+        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
+        dataset = dataset.map(
+            preprocess_inputs, num_parallel_calls=thread_count
        )
         dataset = dataset.padded_batch(
-            shuffle_size,
+            batch_size,
             padded_shapes={
                 'waveforms': tf.TensorShape([None]),
                 'waveforms_length': tf.TensorShape([None]),
+                'targets': tf.TensorShape([None]),
                 'Y': tf.TensorShape([None]),
             },
             padding_values={
                 'waveforms': tf.constant(0, dtype=tf.float32),
                 'waveforms_length': tf.constant(0, dtype=tf.int32),
+                'targets': tf.constant(0, dtype=tf.int32),
                 'Y': tf.constant(0, dtype=tf.int32),
             },
         )
         return dataset
+
     return get
 
 
@@ -107,12 +123,6 @@ def __call__(self, x, input_mask, training=True):
 total_steps = 3000000
 
 
-def amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
-    y_pred = y_true * (y_pred - margin) + (1 - y_true) * y_pred
-    y_pred *= scale
-    return K.categorical_crossentropy(y_true, y_pred, from_logits=True)
-
-
 def model_fn(features, labels, mode, params):
     config_conformer = malaya_speech.config.conformer_base_encoder_config
     config_conformer['subsampling']['type'] = 'none'
@@ -130,40 +140,47 @@ def model_fn(features, labels, mode, params):
     model = hubert.Model(cfg, encoder, ['pad', 'eos', 'unk'] + [str(i) for i in range(100)])
     X = features['waveforms']
     X_len = features['waveforms_length'][:, 0]
+    Y = features['targets']
+    r = model(X, padding_mask=X_len, target_list=Y)
+
+    target_m = tf.zeros((tf.shape(r['logit_m_list'])[0],), dtype=tf.int32)
+    target_u = tf.zeros((tf.shape(r['logit_u_list'])[0],), dtype=tf.int32)
+
+    sample_size = tf.cast(tf.shape(target_m)[0], tf.float32)
+    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_m, logits=r['logit_m_list'])
+    entropy_m = tf.reduce_sum(entropy) / sample_size
+
+    sample_size = tf.cast(tf.shape(target_u)[0], tf.float32)
+    entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_u, logits=r['logit_u_list'])
+    entropy_u = tf.reduce_sum(entropy) / sample_size
+
+    seq = r['x']
     Y = features['Y']
-    Y_onehot = tf.one_hot(Y, depth=num_class)
-
-    r = model(X, padding_mask=X_len, features_only=True, mask=False)
-    first_token_tensor = tf.squeeze(r['x'][:, 0:1, :], axis=1)
-    pooled_output = keras.layers.Dense(cfg.final_dim * 2, activation='tanh',
-                                       kernel_initializer='orthogonal',
-                                       use_bias=True, trainable=True,
-                                       kernel_regularizer=keras.regularizers.l2(weight_decay),
-                                       bias_regularizer=keras.regularizers.l2(weight_decay))(first_token_tensor)
-    logits = keras.layers.Dense(num_class,
-                                kernel_initializer='orthogonal',
-                                use_bias=False, trainable=True,
-                                kernel_constraint=keras.constraints.unit_norm(),
-                                kernel_regularizer=keras.regularizers.l2(weight_decay),
-                                bias_regularizer=keras.regularizers.l2(weight_decay),
-                                name='prediction')(pooled_output)
-    loss = tf.reduce_mean(amsoftmax_loss(Y_onehot, logits))
-    accuracy = tf.metrics.accuracy(
-        labels=Y, predictions=tf.argmax(logits, axis=1)
+    first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
+    pooled_output = tf.keras.layers.Dense(embedding_dim, activation='tanh',
+                                          use_bias=True, trainable=True)(first_token_tensor)
+    logits = tf.keras.layers.Dense(num_class, trainable=True,)(pooled_output)
+    entropy_speakers = tf.reduce_mean(
+        tf.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logits, labels=Y
+        )
     )
 
-    tf.identity(accuracy[1], name='train_accuracy')
+    loss = entropy_m * 0.95 + entropy_u * 0.05 + entropy_speakers
 
-    tf.identity(loss, 'train_loss')
+    tf.identity(entropy_m, 'entropy_m')
+    tf.summary.scalar('entropy_m', entropy_m)
 
-    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
-    init_checkpoint = 'hubert-conformer-base-output-3mixed/model.ckpt-2000000'
+    tf.identity(entropy_u, 'entropy_u')
+    tf.summary.scalar('entropy_u', entropy_u)
 
-    assignment_map, initialized_variable_names = train.get_assignment_map_from_checkpoint(
-        variables, init_checkpoint
+    tf.identity(loss, 'train_loss')
+
+    accuracy = tf.metrics.accuracy(
+        labels=Y, predictions=tf.argmax(logits, axis=1)
     )
 
-    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+    tf.identity(accuracy[1], name='train_accuracy')
 
     if mode == tf.estimator.ModeKeys.TRAIN:
         train_op = train.optimizer.adamw.create_optimizer(
@@ -195,7 +212,7 @@ def model_fn(features, labels, mode, params):
 
 train_hooks = [
     tf.train.LoggingTensorHook(
-        ['train_accuracy', 'train_loss'], every_n_iter=1
+        ['entropy_m', 'entropy_u', 'entropy_speakers', 'train_accuracy', 'train_loss'], every_n_iter=1
     )
 ]