
Commit 53718cd

added speechsplit pyworld
1 parent b26cb6d commit 53718cd

File tree

13 files changed (+1406, -16 lines)


README-pypi.rst

Lines changed: 5 additions & 1 deletion
@@ -36,7 +36,8 @@ Features
 - **Speaker overlap**, detect overlap speakers using Finetuned Speaker Vector.
 - **Speaker Vector**, calculate similarity between speakers using Pretrained Speaker Vector.
 - **Speech Enhancement**, enhance voice activities using Waveform UNET.
-- **Speech-to-Text**, End-to-End Speech to Text for Malay and Mixed (Malay and Singlish) using RNN-Transducer.
+- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch using PyWorld and PySPTK.
+- **Speech-to-Text**, End-to-End Speech to Text for Malay and Mixed (Malay and Singlish) using RNN-Transducer and Wav2Vec2 CTC.
 - **Super Resolution**, Super Resolution 4x for Waveform.
 - **Text-to-Speech**, Text to Speech for Malay and Singlish using Tacotron2 and FastSpeech2.
 - **Vocoder**, convert Mel to Waveform using MelGAN, Multiband MelGAN and Universal MelGAN Vocoder.
@@ -71,6 +72,9 @@ Malaya-Speech also released pretrained models, simply check at `malaya-speech/pr
 - **FastVC**, Faster and Accurate Voice Conversion using Transformer, no paper produced.
 - **FastSep**, Faster and Accurate Speech Separation using Transformer, no paper produced.
 - **wav2vec 2.0**, A Framework for Self-Supervised Learning of Speech Representations, https://arxiv.org/abs/2006.11477
+- **FastSpeechSplit**, Unsupervised Speech Decomposition Via Triple Information Bottleneck using Transformer, no paper produced.
+- **Sepformer**, Attention is All You Need in Speech Separation, https://arxiv.org/abs/2010.13154
+- **FastSpeechSplit**, Faster and Accurate Speech Split Conversion using Transformer, no paper produced.

 References
 -----------

README.rst

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,7 @@ Features
 - **Speaker overlap**, detect overlap speakers using Finetuned Speaker Vector.
 - **Speaker Vector**, calculate similarity between speakers using Pretrained Speaker Vector.
 - **Speech Enhancement**, enhance voice activities using Waveform UNET.
-- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch.
+- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch using PyWorld and PySPTK.
 - **Speech-to-Text**, End-to-End Speech to Text for Malay and Mixed (Malay and Singlish) using RNN-Transducer and Wav2Vec2 CTC.
 - **Super Resolution**, Super Resolution 4x for Waveform.
 - **Text-to-Speech**, Text to Speech for Malay and Singlish using Tacotron2 and FastSpeech2.
@@ -93,6 +93,7 @@ Malaya-Speech also released pretrained models, simply check at `malaya-speech/pr
 - **wav2vec 2.0**, A Framework for Self-Supervised Learning of Speech Representations, https://arxiv.org/abs/2006.11477
 - **FastSpeechSplit**, Unsupervised Speech Decomposition Via Triple Information Bottleneck using Transformer, no paper produced.
 - **Sepformer**, Attention is All You Need in Speech Separation, https://arxiv.org/abs/2010.13154
+- **FastSpeechSplit**, Faster and Accurate Speech Split Conversion using Transformer, no paper produced.

 References
 -----------

docs/Api.rst

Lines changed: 18 additions & 0 deletions
@@ -112,6 +112,18 @@ malaya_speech.model.tf.Split_Mel
 .. autoclass:: malaya_speech.model.tf.Split_Mel()
     :members:

+malaya_speech.model.tf.Wav2Vec2_CTC
+-------------------------------------
+
+.. autoclass:: malaya_speech.model.tf.Wav2Vec2_CTC()
+    :members:
+
+malaya_speech.model.tf.FastSpeechSplit
+---------------------------------------
+
+.. autoclass:: malaya_speech.model.tf.FastSpeechSplit()
+    :members:
+
 malaya_speech.model.webrtc.WebRTC
 ----------------------------------

@@ -304,6 +316,12 @@ malaya_speech.speech_enhancement
 .. automodule:: malaya_speech.speech_enhancement
     :members:

+malaya_speech.speechsplit_conversion
+--------------------------------------
+
+.. automodule:: malaya_speech.speechsplit_conversion
+    :members:
+
 malaya_speech.stack
 -----------------------------------

docs/README.rst

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,7 @@ Features
 - **Speaker overlap**, detect overlap speakers using Finetuned Speaker Vector.
 - **Speaker Vector**, calculate similarity between speakers using Pretrained Speaker Vector.
 - **Speech Enhancement**, enhance voice activities using Waveform UNET.
-- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch.
+- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch using PyWorld and PySPTK.
 - **Speech-to-Text**, End-to-End Speech to Text for Malay and Mixed (Malay and Singlish) using RNN-Transducer and Wav2Vec2 CTC.
 - **Super Resolution**, Super Resolution 4x for Waveform.
 - **Text-to-Speech**, Text to Speech for Malay and Singlish using Tacotron2 and FastSpeech2.
@@ -93,6 +93,7 @@ Malaya-Speech also released pretrained models, simply check at `malaya-speech/pr
 - **wav2vec 2.0**, A Framework for Self-Supervised Learning of Speech Representations, https://arxiv.org/abs/2006.11477
 - **FastSpeechSplit**, Unsupervised Speech Decomposition Via Triple Information Bottleneck using Transformer, no paper produced.
 - **Sepformer**, Attention is All You Need in Speech Separation, https://arxiv.org/abs/2010.13154
+- **FastSpeechSplit**, Faster and Accurate Speech Split Conversion using Transformer, no paper produced.

 References
 -----------

docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ Contents:
    :caption: Conversion Module

    load-voice-conversion
+   speechsplit-conversion-pyworld

 .. toctree::
    :maxdepth: 2

docs/load-voice-conversion.ipynb

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### List available Voice Conversion"
+    "### List available Voice Conversion models"
    ]
   },
   {

docs/speechsplit-conversion-pyworld.ipynb

Lines changed: 674 additions & 0 deletions
Large diffs are not rendered by default.

example/speechsplit-conversion-pyworld/speechsplit-conversion-pyworld.ipynb

Lines changed: 674 additions & 0 deletions
Large diffs are not rendered by default.

example/voice-conversion/load-voice-conversion.ipynb

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### List available Voice Conversion"
+    "### List available Voice Conversion models"
    ]
   },
   {

malaya_speech/model/tf.py

Lines changed: 15 additions & 7 deletions
@@ -1415,7 +1415,6 @@ def __init__(
         output_nodes,
         speaker_vector,
         gender_model,
-        magnitude,
         sess,
         model,
         name,
@@ -1424,14 +1423,13 @@ def __init__(
         self._output_nodes = output_nodes
         self._speaker_vector = speaker_vector
         self._gender_model = gender_model
-        self._magnitude = magnitude
         self._sess = sess
         self.__model__ = model
         self.__name__ = name
         self._modes = {'R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'}
         self._freqs = {'female': [100, 600], 'male': [50, 250]}

-    def _get_data(x, sr = 22050, target_sr = 16000):
+    def _get_data(self, x, sr = 22050, target_sr = 16000):
         x_16k = resample(x, sr, target_sr)
         if self._gender_model is not None:
             gender = self._gender_model(x_16k)
@@ -1458,6 +1456,16 @@ def predict(
         ----------
         original_audio: np.array or malaya_speech.model.frame.Frame
         target_audio: np.array or malaya_speech.model.frame.Frame
+        modes: List[str], optional (default = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])
+            R denotes rhythm, F denotes pitch target, U denotes speaker target (vector).
+
+            * ``'R'`` - maintain `original_audio` F and U on `target_audio` R.
+            * ``'F'`` - maintain `original_audio` R and U on `target_audio` F.
+            * ``'U'`` - maintain `original_audio` R and F on `target_audio` U.
+            * ``'RF'`` - maintain `original_audio` U on `target_audio` R and F.
+            * ``'RU'`` - maintain `original_audio` F on `target_audio` R and U.
+            * ``'FU'`` - maintain `original_audio` R on `target_audio` F and U.
+            * ``'RFU'`` - no conversion happened, just do encoder-decoder on `target_audio`

         Returns
         -------
@@ -1475,8 +1483,8 @@ def predict(
         target_audio = (
             input.array if isinstance(target_audio, Frame) else target_audio
         )
-        wav, mel, f0, v = get_speech(original_audio)
-        wav_1, mel_1, f0_1, v_1 = get_speech(target_audio)
+        wav, mel, f0, v = self._get_data(original_audio)
+        wav_1, mel_1, f0_1, v_1 = self._get_data(target_audio)
         mels, mel_lens = padding_sequence_nd(
             [mel, mel_1], dim = 0, return_len = True
         )
@@ -1532,9 +1540,9 @@ def predict(
             x_ = mels[:1]

             r = self._execute(
-                inputs = [uttr_f0_, x_, v_, len(f0s[0])],
+                inputs = [uttr_f0_, x_, [v_], [len(f0s[0])]],
                 input_labels = ['uttr_f0', 'X', 'V', 'len_X'],
-                output_labels = ['f0_target'],
+                output_labels = ['mel_outputs'],
             )
             mel_outputs = r['mel_outputs'][0]
             if 'R' in condition:
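The mode semantics the new docstring enumerates can be restated in a few lines. The helper below is hypothetical (it is not part of the library) and only mirrors the documented `self._modes` behaviour: each letter in a mode names an attribute taken from `target_audio`, while the remaining attributes are kept from `original_audio`:

```python
# Hypothetical helper mirroring the documented FastSpeechSplit.predict modes:
# R = rhythm, F = pitch, U = speaker vector.
MODES = {'R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'}
ATTRS = {'R': 'rhythm', 'F': 'pitch', 'U': 'speaker vector'}

def mode_plan(mode):
    """Report which attributes come from the target vs the original audio."""
    if mode not in MODES:
        raise ValueError(f'mode must be one of {sorted(MODES)}')
    return {
        'from_target': [ATTRS[c] for c in 'RFU' if c in mode],
        'from_original': [ATTRS[c] for c in 'RFU' if c not in mode],
    }

print(mode_plan('RF'))
# {'from_target': ['rhythm', 'pitch'], 'from_original': ['speaker vector']}
```

Note that `'RFU'` leaves nothing from the original, matching the docstring's "no conversion happened, just do encoder-decoder on `target_audio`".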

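One of the tf.py fixes wraps `v_` and `len(f0s[0])` in lists before feeding the session. The reason, sketched below with plain numpy (the shapes are illustrative; the real placeholder dims come from the frozen graph), is that the graph's inputs carry a leading batch axis, so a single vector or scalar must be promoted to a batch of one:

```python
import numpy as np

v_ = np.zeros(512)           # one speaker vector, shape (512,)
feed = np.asarray([v_])      # wrapping in a list, as [v_] does, adds the batch axis
assert v_.shape == (512,)
assert feed.shape == (1, 512)  # rank 2, matching a [batch, dim] placeholder

length = 140                 # a bare scalar sequence length
assert np.asarray([length]).shape == (1,)  # likewise for a [batch] placeholder
```

Without the wrapping, the feed is rank 1 (or rank 0 for the length) and the session raises a shape mismatch.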