mesolitica
diff --git a/‎docs/index.rst
Lines changed: 1 addition & 0 deletions b/‎docs/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/load-stt-transducer-model-mixed.ipynb
Lines changed: 153 additions & 12 deletions b/‎docs/load-stt-transducer-model-mixed.ipynb
Lines changed: 153 additions & 12 deletions
@@ -61,6 +61,7 @@ Contents:
 
    load-voice-conversion
    speechsplit-conversion-pyworld
+   speechsplit-conversion-pysptk
 
 .. toctree::
    :maxdepth: 2
 
@@ -228,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 9,
    "metadata": {
     "scrolled": true
    },
@@ -251,17 +251,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:root:Load quantized model will cause accuracy drop.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "quantized_small_model = malaya_speech.stt.deep_transducer(model = 'small-conformer-mixed', quantized = True)\n",
     "quantized_model = malaya_speech.stt.deep_transducer(model = 'conformer-mixed', quantized = True)"
@@ -276,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -763,6 +755,155 @@
    "source": [
     "**The RNNT beam decoder is not able to utilise batch processing; if you feed it a batch, it will process the inputs one by one**."
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Predict force alignment\n",
+    "\n",
+    "We want to know when the speakers speak certain words, so we can use `predict_alignment`,\n",
+    "\n",
+    "```python\n",
+    "def predict_alignment(self, input, combined = True):\n",
+    "    \"\"\"\n",
+    "    Transcribe input and get timestamps; only supports the greedy decoder.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    input: np.array\n",
+    "        np.array or malaya_speech.model.frame.Frame.\n",
+    "    combined: bool, optional (default=True)\n",
+    "        If True, will combine subwords into whole words.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    result: List[Dict[text, start, end]]\n",
+    "    \"\"\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 4.06 s, sys: 704 ms, total: 4.76 s\n",
+      "Wall time: 4.57 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'text': 'how', 'start': 0.96, 'end': 0.97},\n",
+       " {'text': 'they', 'start': 1.32, 'end': 1.45},\n",
+       " {'text': 'royat', 'start': 1.92, 'end': 2.17},\n",
+       " {'text': 'in', 'start': 2.36, 'end': 2.37},\n",
+       " {'text': 'fail', 'start': 2.64, 'end': 2.77},\n",
+       " {'text': 'okay', 'start': 3.64, 'end': 3.85},\n",
+       " {'text': 'actually', 'start': 3.96, 'end': 4.25}]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "small_model.predict_alignment(singlish0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 240 ms, sys: 47.2 ms, total: 287 ms\n",
+      "Wall time: 97.8 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'text': 'how', 'start': 0.96, 'end': 0.97},\n",
+       " {'text': ' ', 'start': 1.08, 'end': 1.09},\n",
+       " {'text': 'the', 'start': 1.32, 'end': 1.33},\n",
+       " {'text': 'y_', 'start': 1.44, 'end': 1.45},\n",
+       " {'text': 'ro', 'start': 1.92, 'end': 1.93},\n",
+       " {'text': 'yat', 'start': 2.16, 'end': 2.17},\n",
+       " {'text': ' ', 'start': 2.28, 'end': 2.29},\n",
+       " {'text': 'and', 'start': 2.36, 'end': 2.37},\n",
+       " {'text': ' ', 'start': 2.44, 'end': 2.45},\n",
+       " {'text': 'fi', 'start': 2.64, 'end': 2.65},\n",
+       " {'text': 'l', 'start': 2.76, 'end': 2.77},\n",
+       " {'text': 'm_', 'start': 2.84, 'end': 2.85},\n",
+       " {'text': 'oka', 'start': 3.64, 'end': 3.65},\n",
+       " {'text': 'y_', 'start': 3.84, 'end': 3.85},\n",
+       " {'text': 'act', 'start': 3.96, 'end': 3.97},\n",
+       " {'text': 'ual', 'start': 4.08, 'end': 4.09},\n",
+       " {'text': 'l', 'start': 4.2, 'end': 4.21},\n",
+       " {'text': 'y', 'start': 4.24, 'end': 4.25}]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "small_model.predict_alignment(singlish0, combined = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 6.03 s, sys: 1.83 s, total: 7.86 s\n",
+      "Wall time: 7.37 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'text': 'and', 'start': 0.2, 'end': 0.21},\n",
+       " {'text': 'and', 'start': 0.4, 'end': 0.41},\n",
+       " {'text': 'see', 'start': 0.6, 'end': 0.61},\n",
+       " {'text': 'how', 'start': 0.88, 'end': 0.89},\n",
+       " {'text': 'they', 'start': 1.4, 'end': 1.49},\n",
+       " {'text': 'brought', 'start': 1.88, 'end': 2.25},\n",
+       " {'text': 'and', 'start': 2.4, 'end': 2.41},\n",
+       " {'text': 'film', 'start': 2.64, 'end': 2.85},\n",
+       " {'text': 'okay', 'start': 3.68, 'end': 3.85},\n",
+       " {'text': 'shi', 'start': 4.08, 'end': 4.21}]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "model.predict_alignment(singlish0)"
+   ]
   }
  ],
  "metadata": {