|
228 | 228 | },
|
229 | 229 | {
|
230 | 230 | "cell_type": "code",
|
231 |
| - "execution_count": 30, |
| 231 | + "execution_count": 9, |
232 | 232 | "metadata": {
|
233 | 233 | "scrolled": true
|
234 | 234 | },
|
|
251 | 251 | },
|
252 | 252 | {
|
253 | 253 | "cell_type": "code",
|
254 |
| - "execution_count": null, |
| 254 | + "execution_count": 10, |
255 | 255 | "metadata": {},
|
256 |
| - "outputs": [ |
257 |
| - { |
258 |
| - "name": "stderr", |
259 |
| - "output_type": "stream", |
260 |
| - "text": [ |
261 |
| - "WARNING:root:Load quantized model will cause accuracy drop.\n" |
262 |
| - ] |
263 |
| - } |
264 |
| - ], |
| 256 | + "outputs": [], |
265 | 257 | "source": [
|
266 | 258 | "quantized_small_model = malaya_speech.stt.deep_transducer(model = 'small-conformer-mixed', quantized = True)\n",
|
267 | 259 | "quantized_model = malaya_speech.stt.deep_transducer(model = 'conformer-mixed', quantized = True)"
|
|
276 | 268 | },
|
277 | 269 | {
|
278 | 270 | "cell_type": "code",
|
279 |
| - "execution_count": 9, |
| 271 | + "execution_count": 5, |
280 | 272 | "metadata": {},
|
281 | 273 | "outputs": [],
|
282 | 274 | "source": [
|
|
763 | 755 | "source": [
|
764 | 756 | "**The RNNT beam decoder cannot utilise batch processing; if you feed it a batch, it will process the inputs one by one**."
|
765 | 757 | ]
|
| 758 | + }, |
| 759 | + { |
| 760 | + "cell_type": "markdown", |
| 761 | + "metadata": {}, |
| 762 | + "source": [ |
| 763 | + "### Predict force alignment\n", |
| 764 | + "\n", |
| 765 | + "We want to know when the speakers speak certain words, so we can use `predict_alignment`,\n", |
| 766 | + "\n", |
| 767 | + "```python\n", |
| 768 | + "def predict_alignment(self, input, combined = True):\n", |
| 769 | + " \"\"\"\n", |
| 770 | + " Transcribe input and get timestamp, only support greedy decoder.\n", |
| 771 | + "\n", |
| 772 | + " Parameters\n", |
| 773 | + " ----------\n", |
| 774 | + " input: np.array\n", |
| 775 | + " np.array or malaya_speech.model.frame.Frame.\n", |
| 776 | + " combined: bool, optional (default=True)\n", |
| 777 | + " If True, will combine subwords to become a word.\n", |
| 778 | + "\n", |
| 779 | + " Returns\n", |
| 780 | + " -------\n", |
| 781 | + " result: List[Dict[text, start, end]]\n", |
| 782 | + " \"\"\"\n", |
| 783 | + "```" |
| 784 | + ] |
| 785 | + }, |
| 786 | + { |
| 787 | + "cell_type": "code", |
| 788 | + "execution_count": 6, |
| 789 | + "metadata": {}, |
| 790 | + "outputs": [ |
| 791 | + { |
| 792 | + "name": "stdout", |
| 793 | + "output_type": "stream", |
| 794 | + "text": [ |
| 795 | + "CPU times: user 4.06 s, sys: 704 ms, total: 4.76 s\n", |
| 796 | + "Wall time: 4.57 s\n" |
| 797 | + ] |
| 798 | + }, |
| 799 | + { |
| 800 | + "data": { |
| 801 | + "text/plain": [ |
| 802 | + "[{'text': 'how', 'start': 0.96, 'end': 0.97},\n", |
| 803 | + " {'text': 'they', 'start': 1.32, 'end': 1.45},\n", |
| 804 | + " {'text': 'royat', 'start': 1.92, 'end': 2.17},\n", |
| 805 | + " {'text': 'in', 'start': 2.36, 'end': 2.37},\n", |
| 806 | + " {'text': 'fail', 'start': 2.64, 'end': 2.77},\n", |
| 807 | + " {'text': 'okay', 'start': 3.64, 'end': 3.85},\n", |
| 808 | + " {'text': 'actually', 'start': 3.96, 'end': 4.25}]" |
| 809 | + ] |
| 810 | + }, |
| 811 | + "execution_count": 6, |
| 812 | + "metadata": {}, |
| 813 | + "output_type": "execute_result" |
| 814 | + } |
| 815 | + ], |
| 816 | + "source": [ |
| 817 | + "%%time\n", |
| 818 | + "\n", |
| 819 | + "small_model.predict_alignment(singlish0)" |
| 820 | + ] |
| 821 | + }, |
| 822 | + { |
| 823 | + "cell_type": "code", |
| 824 | + "execution_count": 7, |
| 825 | + "metadata": {}, |
| 826 | + "outputs": [ |
| 827 | + { |
| 828 | + "name": "stdout", |
| 829 | + "output_type": "stream", |
| 830 | + "text": [ |
| 831 | + "CPU times: user 240 ms, sys: 47.2 ms, total: 287 ms\n", |
| 832 | + "Wall time: 97.8 ms\n" |
| 833 | + ] |
| 834 | + }, |
| 835 | + { |
| 836 | + "data": { |
| 837 | + "text/plain": [ |
| 838 | + "[{'text': 'how', 'start': 0.96, 'end': 0.97},\n", |
| 839 | + " {'text': ' ', 'start': 1.08, 'end': 1.09},\n", |
| 840 | + " {'text': 'the', 'start': 1.32, 'end': 1.33},\n", |
| 841 | + " {'text': 'y_', 'start': 1.44, 'end': 1.45},\n", |
| 842 | + " {'text': 'ro', 'start': 1.92, 'end': 1.93},\n", |
| 843 | + " {'text': 'yat', 'start': 2.16, 'end': 2.17},\n", |
| 844 | + " {'text': ' ', 'start': 2.28, 'end': 2.29},\n", |
| 845 | + " {'text': 'and', 'start': 2.36, 'end': 2.37},\n", |
| 846 | + " {'text': ' ', 'start': 2.44, 'end': 2.45},\n", |
| 847 | + " {'text': 'fi', 'start': 2.64, 'end': 2.65},\n", |
| 848 | + " {'text': 'l', 'start': 2.76, 'end': 2.77},\n", |
| 849 | + " {'text': 'm_', 'start': 2.84, 'end': 2.85},\n", |
| 850 | + " {'text': 'oka', 'start': 3.64, 'end': 3.65},\n", |
| 851 | + " {'text': 'y_', 'start': 3.84, 'end': 3.85},\n", |
| 852 | + " {'text': 'act', 'start': 3.96, 'end': 3.97},\n", |
| 853 | + " {'text': 'ual', 'start': 4.08, 'end': 4.09},\n", |
| 854 | + " {'text': 'l', 'start': 4.2, 'end': 4.21},\n", |
| 855 | + " {'text': 'y', 'start': 4.24, 'end': 4.25}]" |
| 856 | + ] |
| 857 | + }, |
| 858 | + "execution_count": 7, |
| 859 | + "metadata": {}, |
| 860 | + "output_type": "execute_result" |
| 861 | + } |
| 862 | + ], |
| 863 | + "source": [ |
| 864 | + "%%time\n", |
| 865 | + "\n", |
| 866 | + "small_model.predict_alignment(singlish0, combined = False)" |
| 867 | + ] |
| 868 | + }, |
| 869 | + { |
| 870 | + "cell_type": "code", |
| 871 | + "execution_count": 8, |
| 872 | + "metadata": {}, |
| 873 | + "outputs": [ |
| 874 | + { |
| 875 | + "name": "stdout", |
| 876 | + "output_type": "stream", |
| 877 | + "text": [ |
| 878 | + "CPU times: user 6.03 s, sys: 1.83 s, total: 7.86 s\n", |
| 879 | + "Wall time: 7.37 s\n" |
| 880 | + ] |
| 881 | + }, |
| 882 | + { |
| 883 | + "data": { |
| 884 | + "text/plain": [ |
| 885 | + "[{'text': 'and', 'start': 0.2, 'end': 0.21},\n", |
| 886 | + " {'text': 'and', 'start': 0.4, 'end': 0.41},\n", |
| 887 | + " {'text': 'see', 'start': 0.6, 'end': 0.61},\n", |
| 888 | + " {'text': 'how', 'start': 0.88, 'end': 0.89},\n", |
| 889 | + " {'text': 'they', 'start': 1.4, 'end': 1.49},\n", |
| 890 | + " {'text': 'brought', 'start': 1.88, 'end': 2.25},\n", |
| 891 | + " {'text': 'and', 'start': 2.4, 'end': 2.41},\n", |
| 892 | + " {'text': 'film', 'start': 2.64, 'end': 2.85},\n", |
| 893 | + " {'text': 'okay', 'start': 3.68, 'end': 3.85},\n", |
| 894 | + " {'text': 'shi', 'start': 4.08, 'end': 4.21}]" |
| 895 | + ] |
| 896 | + }, |
| 897 | + "execution_count": 8, |
| 898 | + "metadata": {}, |
| 899 | + "output_type": "execute_result" |
| 900 | + } |
| 901 | + ], |
| 902 | + "source": [ |
| 903 | + "%%time\n", |
| 904 | + "\n", |
| 905 | + "model.predict_alignment(singlish0)" |
| 906 | + ] |
766 | 907 | }
|
767 | 908 | ],
|
768 | 909 | "metadata": {
|
|
0 commit comments