Skip to content

Commit a1a223a

Browse files
authored
Merge pull request #8 from jgw96/try-parcel
fix microsoft/vscode#194414
2 parents a663e84 + 4a7266f commit a1a223a

File tree

13 files changed

+780
-755
lines changed

13 files changed

+780
-755
lines changed

package-lock.json

Lines changed: 604 additions & 349 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "web-ai-toolkit",
3-
"version": "0.1.8",
3+
"version": "0.2.0",
44
"repository": "https://github.com/jgw96/web-ai-toolkit",
55
"keywords": [
66
"ai",
@@ -29,12 +29,12 @@
2929
"author": "",
3030
"license": "ISC",
3131
"devDependencies": {
32-
"typescript": "^5.5.3",
33-
"vite": "^5.3.3",
34-
"vite-plugin-dts": "^3.9.1"
32+
"typescript": "^5.6.2",
33+
"vite": "^5.4.8",
34+
"vite-plugin-dts": "^4.2.2"
3535
},
3636
"dependencies": {
37-
"@huggingface/transformers": "^3.0.0-alpha.14",
37+
"@huggingface/transformers": "^3.0.0-alpha.16",
3838
"@xenova/transformers": "^2.17.2"
3939
}
4040
}

src/index.ts

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
export async function transcribeAudioFile(audioFile: Blob, model: string = "Xenova/whisper-tiny", timestamps: boolean = false, language: string = "en-US") {
22
try {
3-
const { loadTranscriber, doLocalWhisper } = await import("./services/speech-recognition/whisper-ai");
3+
const { loadTranscriber, doLocalWhisper } = await import("./services/speech-recognition/recognition");
44
await loadTranscriber(model, timestamps, language);
55
return doLocalWhisper(audioFile, model);
66
}
@@ -12,9 +12,8 @@ export async function transcribeAudioFile(audioFile: Blob, model: string = "Xeno
1212

1313
export async function textToSpeech(text: string, model: string = "Xenova/mms-tts-eng") {
1414
try {
15-
const { loadTTS, doLocalTTS } = await import("./services/text-to-speech/text-to-speech");
16-
await loadTTS(model);
17-
return doLocalTTS(text);
15+
const { runSynthesizer } = await import("./services/text-to-speech/tts");
16+
return runSynthesizer(text, model);
1817
}
1918
catch (err) {
2019
console.error(err);
@@ -24,9 +23,8 @@ export async function textToSpeech(text: string, model: string = "Xenova/mms-tts
2423

2524
export async function summarize(text: string, model: string = "Xenova/distilbart-cnn-6-6") {
2625
try {
27-
const { loadSummarizer, doLocalSummarize } = await import("./services/summarization/summarization");
28-
await loadSummarizer(model);
29-
return doLocalSummarize(text);
26+
const { runSummarizer } = await import("./services/summarization/summarization");
27+
return runSummarizer(text, model);
3028
}
3129
catch (err) {
3230
console.error(err);
@@ -36,9 +34,8 @@ export async function summarize(text: string, model: string = "Xenova/distilbart
3634

3735
export async function ocr(image: Blob, model: string = "Xenova/trocr-small-printed") {
3836
try {
39-
const { loadOCR, doLocalOCR } = await import("./services/ocr/ocr");
40-
await loadOCR(model);
41-
return doLocalOCR(image);
37+
const { runOCR } = await import("./services/ocr/ocr");
38+
return runOCR(image, model);
4239
}
4340
catch (err) {
4441
console.error(err);

src/services/ocr/ocr-worker.ts

Lines changed: 0 additions & 55 deletions
This file was deleted.

src/services/ocr/ocr.ts

Lines changed: 68 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,80 @@
1-
let ocrWorker: Worker;
1+
// let ocrWorker: Worker;
22

3-
// @ts-ignore
4-
import OCRWorker from './ocr-worker?worker&inline';
3+
// // @ts-ignore
4+
// import OCRWorker from './ocr-worker?worker&inline';
55

6-
export async function loadOCR(model: string): Promise<void> {
7-
return new Promise(async (resolve) => {
8-
if (!ocrWorker) {
9-
ocrWorker = new OCRWorker();
10-
}
6+
// export async function loadOCR(model: string): Promise<void> {
7+
// return new Promise(async (resolve) => {
8+
// if (!ocrWorker) {
9+
// ocrWorker = new OCRWorker();
10+
// }
1111

12-
ocrWorker.onmessage = async (e) => {
13-
if (e.data.type === "loaded") {
14-
resolve();
15-
}
16-
}
12+
// ocrWorker.onmessage = async (e) => {
13+
// if (e.data.type === "loaded") {
14+
// resolve();
15+
// }
16+
// }
17+
18+
// ocrWorker.postMessage({
19+
// type: "load",
20+
// model
21+
// });
22+
// });
23+
// }
24+
25+
// export function doLocalOCR(blob: Blob) {
26+
// return new Promise((resolve, reject) => {
27+
// try {
28+
// ocrWorker.onmessage = async (e) => {
29+
// if (e.data.type === "ocr") {
30+
// resolve(e.data.text);
31+
// }
32+
// else if (e.data.type === "error") {
33+
// reject(e.data.error);
34+
// }
35+
// }
36+
37+
// const dataURL = URL.createObjectURL(blob);
1738

18-
ocrWorker.postMessage({
19-
type: "load",
20-
model
21-
});
39+
// ocrWorker.postMessage({
40+
// type: "ocr",
41+
// blob: dataURL
42+
// });
43+
// }
44+
// catch (err) {
45+
// reject(err);
46+
// }
47+
// });
48+
// }
49+
50+
/* eslint-disable no-async-promise-executor */
51+
import { pipeline, env } from '@huggingface/transformers';
52+
53+
let ocr: any = undefined;
54+
55+
export async function runOCR(image: Blob, model: string = "Xenova/trocr-small-printed") {
56+
return new Promise(async (resolve) => {
57+
if (!ocr) {
58+
await loadOCR(model);
59+
}
60+
const out = await ocr(image);
61+
resolve(out);
2262
});
2363
}
2464

25-
export function doLocalOCR(blob: Blob) {
26-
return new Promise((resolve, reject) => {
27-
try {
28-
ocrWorker.onmessage = async (e) => {
29-
if (e.data.type === "ocr") {
30-
resolve(e.data.text);
31-
}
32-
else if (e.data.type === "error") {
33-
reject(e.data.error);
34-
}
35-
}
36-
37-
const dataURL = URL.createObjectURL(blob);
38-
39-
ocrWorker.postMessage({
40-
type: "ocr",
41-
blob: dataURL
65+
async function loadOCR(model: string): Promise<void> {
66+
return new Promise(async (resolve) => {
67+
if (!ocr) {
68+
env.allowLocalModels = false;
69+
env.useBrowserCache = false;
70+
ocr = await pipeline('image-to-text', model || 'Xenova/trocr-small-printed', {
71+
device: (navigator as any).ml ? "webnn" : "webgpu"
4272
});
73+
console.log("loaded ocr", ocr)
74+
resolve();
4375
}
44-
catch (err) {
45-
reject(err);
76+
else {
77+
resolve();
4678
}
4779
});
4880
}

src/services/speech-recognition/worker.ts renamed to src/services/speech-recognition/recognition.ts

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,49 @@ import { AutomaticSpeechRecognitionPipeline, pipeline, env } from '@huggingface/
33

44
let transcriber: AutomaticSpeechRecognitionPipeline | undefined = undefined;
55

6-
self.onmessage = async (e) => {
7-
if (e.data.type === 'transcribe') {
8-
return new Promise((resolve) => {
9-
console.log("in worker", e.data)
10-
localTranscribe(e.data.blob).then((transcription) => {
11-
console.log("in worker", transcription)
12-
self.postMessage({
13-
type: 'transcribe',
14-
transcription: transcription
6+
export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-tiny") {
7+
return new Promise(async (resolve, reject) => {
8+
try {
9+
if (!transcriber) {
10+
await loadTranscriber(model || 'Xenova/whisper-tiny', false, 'en');
11+
}
12+
13+
const fileReader = new FileReader();
14+
fileReader.onloadend = async () => {
15+
const audioCTX = new AudioContext({
16+
sampleRate: 16000,
1517
});
16-
resolve(transcription);
17-
})
18-
})
19-
}
20-
else if (e.data.type === "load") {
21-
await loadTranscriber(e.data.model || 'Xenova/whisper-tiny', e.data.timestamps, e.data.language);
22-
self.postMessage({
23-
type: 'loaded'
24-
});
25-
return Promise.resolve();
26-
}
27-
else {
28-
return Promise.reject('Unknown message type');
29-
}
18+
const arrayBuffer = fileReader.result as ArrayBuffer;
19+
const audioData = await audioCTX.decodeAudioData(arrayBuffer);
20+
21+
let audio;
22+
if (audioData.numberOfChannels === 2) {
23+
const SCALING_FACTOR = Math.sqrt(2);
24+
25+
const left = audioData.getChannelData(0);
26+
const right = audioData.getChannelData(1);
27+
28+
audio = new Float32Array(left.length);
29+
for (let i = 0; i < audioData.length; ++i) {
30+
audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2;
31+
}
32+
} else {
33+
// If the audio is not stereo, we can just use the first channel:
34+
audio = audioData.getChannelData(0);
35+
}
36+
37+
const output = await localTranscribe(audio);
38+
resolve(output);
39+
40+
41+
42+
};
43+
fileReader.readAsArrayBuffer(audioFile);
44+
}
45+
catch (err) {
46+
reject(err);
47+
}
48+
})
3049
}
3150

3251
export async function loadTranscriber(model: string = "Xenova/whisper-tiny", timestamps: boolean, language: string): Promise<void> {
@@ -49,13 +68,14 @@ export async function loadTranscriber(model: string = "Xenova/whisper-tiny", tim
4968
})
5069
}
5170

52-
export async function localTranscribe(audio: Blob): Promise<string> {
71+
export async function localTranscribe(audio: Float32Array): Promise<string> {
5372
return new Promise(async (resolve, reject) => {
5473
if (transcriber) {
5574
// @ts-ignore
5675
const output = await transcriber(audio, {
5776
chunk_length_s: 30,
5877
stride_length_s: 5,
78+
// @ts-ignore
5979
callback_function: callback_function, // after each generation step
6080
chunk_callback: chunk_callback, // after each chunk is processed
6181
});

0 commit comments

Comments (0)