Commit fd35209

released 1.1
1 parent 53718cd commit fd35209

27 files changed (+7611, -1103 lines)

docs/index.rst

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ Contents:
 
    load-voice-conversion
    speechsplit-conversion-pyworld
+   speechsplit-conversion-pysptk
 
 .. toctree::
    :maxdepth: 2

docs/load-stt-transducer-model-mixed.ipynb

Lines changed: 153 additions & 12 deletions

@@ -228,7 +228,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 30,
+"execution_count": 9,
 "metadata": {
 "scrolled": true
 },
@@ -251,17 +251,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"WARNING:root:Load quantized model will cause accuracy drop.\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "quantized_small_model = malaya_speech.stt.deep_transducer(model = 'small-conformer-mixed', quantized = True)\n",
 "quantized_model = malaya_speech.stt.deep_transducer(model = 'conformer-mixed', quantized = True)"
@@ -276,7 +268,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -763,6 +755,155 @@
 "source": [
 "**RNNT model beam decoder not able to utilise batch programming, if feed a batch, it will process one by one**."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Predict force alignment\n",
+"\n",
+"We want to know when the speakers speak certain words, so we can use `predict_alignment`,\n",
+"\n",
+"```python\n",
+"def predict_alignment(self, input, combined = True):\n",
+" \"\"\"\n",
+" Transcribe input and get timestamp, only support greedy decoder.\n",
+"\n",
+" Parameters\n",
+" ----------\n",
+" input: np.array\n",
+" np.array or malaya_speech.model.frame.Frame.\n",
+" combined: bool, optional (default=True)\n",
+" If True, will combined subwords to become a word.\n",
+"\n",
+" Returns\n",
+" -------\n",
+" result: List[Dict[text, start, end]]\n",
+" \"\"\"\n",
+"```"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"CPU times: user 4.06 s, sys: 704 ms, total: 4.76 s\n",
+"Wall time: 4.57 s\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"[{'text': 'how', 'start': 0.96, 'end': 0.97},\n",
+" {'text': 'they', 'start': 1.32, 'end': 1.45},\n",
+" {'text': 'royat', 'start': 1.92, 'end': 2.17},\n",
+" {'text': 'in', 'start': 2.36, 'end': 2.37},\n",
+" {'text': 'fail', 'start': 2.64, 'end': 2.77},\n",
+" {'text': 'okay', 'start': 3.64, 'end': 3.85},\n",
+" {'text': 'actually', 'start': 3.96, 'end': 4.25}]"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"%%time\n",
+"\n",
+"small_model.predict_alignment(singlish0)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"CPU times: user 240 ms, sys: 47.2 ms, total: 287 ms\n",
+"Wall time: 97.8 ms\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"[{'text': 'how', 'start': 0.96, 'end': 0.97},\n",
+" {'text': ' ', 'start': 1.08, 'end': 1.09},\n",
+" {'text': 'the', 'start': 1.32, 'end': 1.33},\n",
+" {'text': 'y_', 'start': 1.44, 'end': 1.45},\n",
+" {'text': 'ro', 'start': 1.92, 'end': 1.93},\n",
+" {'text': 'yat', 'start': 2.16, 'end': 2.17},\n",
+" {'text': ' ', 'start': 2.28, 'end': 2.29},\n",
+" {'text': 'and', 'start': 2.36, 'end': 2.37},\n",
+" {'text': ' ', 'start': 2.44, 'end': 2.45},\n",
+" {'text': 'fi', 'start': 2.64, 'end': 2.65},\n",
+" {'text': 'l', 'start': 2.76, 'end': 2.77},\n",
+" {'text': 'm_', 'start': 2.84, 'end': 2.85},\n",
+" {'text': 'oka', 'start': 3.64, 'end': 3.65},\n",
+" {'text': 'y_', 'start': 3.84, 'end': 3.85},\n",
+" {'text': 'act', 'start': 3.96, 'end': 3.97},\n",
+" {'text': 'ual', 'start': 4.08, 'end': 4.09},\n",
+" {'text': 'l', 'start': 4.2, 'end': 4.21},\n",
+" {'text': 'y', 'start': 4.24, 'end': 4.25}]"
+]
+},
+"execution_count": 7,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"%%time\n",
+"\n",
+"small_model.predict_alignment(singlish0, combined = False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"CPU times: user 6.03 s, sys: 1.83 s, total: 7.86 s\n",
+"Wall time: 7.37 s\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"[{'text': 'and', 'start': 0.2, 'end': 0.21},\n",
+" {'text': 'and', 'start': 0.4, 'end': 0.41},\n",
+" {'text': 'see', 'start': 0.6, 'end': 0.61},\n",
+" {'text': 'how', 'start': 0.88, 'end': 0.89},\n",
+" {'text': 'they', 'start': 1.4, 'end': 1.49},\n",
+" {'text': 'brought', 'start': 1.88, 'end': 2.25},\n",
+" {'text': 'and', 'start': 2.4, 'end': 2.41},\n",
+" {'text': 'film', 'start': 2.64, 'end': 2.85},\n",
+" {'text': 'okay', 'start': 3.68, 'end': 3.85},\n",
+" {'text': 'shi', 'start': 4.08, 'end': 4.21}]"
+]
+},
+"execution_count": 8,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"%%time\n",
+"\n",
+"model.predict_alignment(singlish0)"
+]
 }
 ],
 "metadata": {
