
Commit d0f99b8

Merge pull request #11 from jgw96/better-tests
v0.3.1
2 parents 3eb56f0 + 1ffe92a commit d0f99b8

File tree

7 files changed: +101 -23 lines changed


README.md

Lines changed: 14 additions & 2 deletions
@@ -18,7 +18,7 @@ the code will attempt to choose an NPU first, then a GPU and finally the CPU if
 
 | Function Name          | Parameter      | Type                   | Default Value | Supported Hardware |
 |------------------------|----------------|------------------------|---------------|--------------------|
-| transcribeAudioFile    | audioFile      | Blob                   | -             | NPU / GPU / CPU    |
+| transcribeAudioFile    | audioFile      | Blob                   | -             | GPU / CPU          |
 | | model | string | "Xenova/whisper-tiny"| |
 | | timestamps | boolean | false | |
 | | language | string | "en-US" | |
@@ -28,10 +28,12 @@ the code will attempt to choose an NPU first, then a GPU and finally the CPU if
 | | model | string | "Xenova/distilbart-cnn-6-6"| |
 | ocr | image | Blob | - | GPU / CPU |
 | | model | string | "Xenova/trocr-small-printed"| |
+| image-classification | image | Blob | - | NPU / GPU / CPU |
+| | model | string | "Xenova/resnet-50"| |
 
 ## Technical Details
 
-The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API. Both of these APIs are used to "hardware accelerate" the AI inferences, with WebNN targeting NPUs and GPUs, and WebGPU strictly targeting GPUs.
+The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API, or even to the CPU with WebAssembly. Choosing the correct hardware to target is handled by the library.
 
 ## Usage
 
@@ -77,6 +79,16 @@ const text = await ocr(image);
 console.log(text);
 ```
 
+### Image Classification
+
+```javascript
+import { classifyImage } from 'web-ai-toolkit';
+
+const image = ...; // Your image Blob
+const text = await classifyImage(image);
+console.log(text);
+```
+
 ## Contribution
 
 We welcome contributions to the Web AI Toolkit. Please fork the repository and submit a pull request with your changes. For major changes, please open an issue first to discuss what you would like to change.
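
Editor's note: the updated Technical Details paragraph describes a WebNN → WebGPU → WebAssembly fallback chain. A minimal sketch of that selection logic, modeled on the ternary used in the new image-classification service later in this commit; the standalone `pickDevice` function is illustrative only, and `webGPUCheck` is assumed to resolve to true when a WebGPU adapter is available:

```typescript
// Sketch of the device-selection chain the README describes; not a verbatim
// copy of library internals. "webnn-npu", "webgpu" and "wasm" are the device
// strings transformers.js accepts for its `device` option.
async function pickDevice(webGPUCheck: () => Promise<boolean>): Promise<string> {
    if ((navigator as any).ml) {
        return "webnn-npu"; // WebNN is exposed: prefer the NPU
    }
    if (await webGPUCheck()) {
        return "webgpu";    // otherwise hardware-accelerate on the GPU
    }
    return "wasm";          // final fallback: CPU via WebAssembly
}
```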

package-lock.json

Lines changed: 14 additions & 14 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 {
   "name": "web-ai-toolkit",
-  "version": "0.2.1",
+  "version": "0.3.1",
   "repository": "https://github.com/jgw96/web-ai-toolkit",
   "keywords": [
     "ai",
@@ -38,7 +38,7 @@
     "vitest": "^2.1.2"
   },
   "dependencies": {
-    "@huggingface/transformers": "^3.0.0-alpha.16",
+    "@huggingface/transformers": "^3.0.0-alpha.22",
     "@xenova/transformers": "^2.17.2"
   }
 }

src/index.ts

Lines changed: 11 additions & 0 deletions
@@ -42,3 +42,14 @@ export async function ocr(image: Blob, model: string = "Xenova/trocr-small-print
         return err;
     }
 }
+
+export async function classifyImage(image: Blob, model: string = "Xenova/resnet-50") {
+    try {
+        const { runClassifier } = await import("./services/image-classification/image-classification");
+        return runClassifier(image, model);
+    }
+    catch (err) {
+        console.error(err);
+        return err;
+    }
+}
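
Editor's note: like the existing exports, `classifyImage` catches errors and returns them rather than re-throwing. A hedged usage sketch (the `imageBlob` variable is a placeholder, not from the commit) showing how a caller could tell results from failures:

```typescript
import { classifyImage } from "web-ai-toolkit";

// imageBlob is assumed to be a Blob obtained elsewhere (e.g. a file input).
declare const imageBlob: Blob;

const result = await classifyImage(imageBlob);
if (result instanceof Error) {
    // the library returned the caught error instead of throwing it
    console.error("classification failed:", result.message);
} else {
    console.log(result); // classification output from the pipeline
}
```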
src/services/image-classification/image-classification.ts

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import { pipeline, env } from '@huggingface/transformers';
+import { webGPUCheck } from '../../utils';
+
+let classifier: any = undefined;
+
+export async function runClassifier(image: Blob | string, model: string = "onnx-community/mobilenetv4s-webnn") {
+    return new Promise(async (resolve, reject) => {
+        try {
+            if (!classifier) {
+                await loadClassifier(model);
+            };
+
+            if (typeof image !== "string") {
+                image = URL.createObjectURL(image);
+            }
+
+            const out = await classifier(image);
+            resolve(out);
+        }
+        catch (err) {
+            reject(err);
+        }
+    });
+}
+
+async function loadClassifier(model: string): Promise<void> {
+    return new Promise(async (resolve) => {
+        if (!classifier) {
+            env.allowLocalModels = false;
+            env.useBrowserCache = false;
+
+            classifier = await pipeline("image-classification", model || "Xenova/resnet-50", {
+                device: (navigator as any).ml ? "webnn-npu" : await webGPUCheck() ? "webgpu" : "wasm"
+            });
+            console.log("loaded classifier", classifier)
+            resolve();
+        }
+        else {
+            resolve();
+        }
+    });
+}
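
Editor's note: the `webGPUCheck` helper imported from `../../utils` is not part of this diff. A plausible minimal reconstruction, assuming it only needs to probe whether a WebGPU adapter can be acquired:

```typescript
// Hypothetical sketch of the utils helper used above; the real
// implementation is not shown in this commit.
export async function webGPUCheck(): Promise<boolean> {
    if (!("gpu" in navigator)) {
        return false; // browser exposes no WebGPU entry point
    }
    try {
        const adapter = await (navigator as any).gpu.requestAdapter();
        return adapter !== null; // null means no suitable adapter
    } catch {
        return false;
    }
}
```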

src/services/speech-recognition/recognition.ts

Lines changed: 4 additions & 5 deletions
@@ -10,7 +10,7 @@ export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-
         if (!transcriber) {
             await loadTranscriber(model || 'Xenova/whisper-tiny', false, 'en');
         }
-
+
         const fileReader = new FileReader();
         fileReader.onloadend = async () => {
             const audioCTX = new AudioContext({
@@ -58,9 +58,11 @@ export async function loadTranscriber(model: string = "Xenova/whisper-tiny", tim
             // @ts-ignore
             return_timestamps: timestamps,
             language,
-            device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? "webgpu" : "wasm"
+            // @ts-ignore
+            device: await webGPUCheck() ? "webgpu" : "wasm"
         });
 
+
         resolve();
     }
     else {
@@ -127,8 +129,6 @@ function callback_function(item: any) {
     // Update tokens of last chunk
     last.tokens = [...item[0].output_token_ids];
 
-    console.log("callback_function", item, last)
-
     // Merge text chunks
     // TODO optimise so we don't have to decode all chunks every time
     // @ts-ignore
@@ -138,7 +138,6 @@ function callback_function(item: any) {
         force_full_sequences: false,
     });
 
-    console.log("callback_function", data);
 
     self.postMessage({
         type: 'transcribe-interim',
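
Editor's note: the first hunk shows the transcription path reading the audio Blob with a FileReader and decoding it through an AudioContext. A compact sketch of that decode step, using Blob.arrayBuffer() instead of FileReader for brevity; the 16 kHz rate is an assumption based on what Whisper models expect, since the commit truncates the AudioContext options:

```typescript
// Sketch only: decode an audio Blob into the Float32Array of mono samples a
// Whisper pipeline consumes. Whisper models expect 16 kHz input, so the
// AudioContext is created at that rate (not shown in this diff).
async function decodeAudio(audioFile: Blob): Promise<Float32Array> {
    const buffer = await audioFile.arrayBuffer();
    const audioCTX = new AudioContext({ sampleRate: 16000 });
    const decoded = await audioCTX.decodeAudioData(buffer);
    return decoded.getChannelData(0); // first channel; assumes mono input
}
```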

test.html

Lines changed: 14 additions & 0 deletions
@@ -22,6 +22,11 @@
         <button id="image-to-text-button">Test Image to Text</button>
     </div>
 
+    <div id="image-classify-block">
+        <input type="file" id="image-classify-file" accept="image/*" />
+        <button id="image-classify-button">Test Image Classification</button>
+    </div>
+
 
     <script type="module">
         document.querySelector("#summarize_button").addEventListener("click", async () => {
@@ -56,6 +61,15 @@
             console.log(text);
             URL.revokeObjectURL(file);
         });
+
+        document.querySelector("#image-classify-button").addEventListener("click", async () => {
+            const { classifyImage } = await import("/dist/index.js");
+
+            const file = document.querySelector("#image-classify-file").files[0];
+            const text = await classifyImage(URL.createObjectURL(file));
+            console.log(text);
+            URL.revokeObjectURL(file);
+        });
     </script>
 </body>
 
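
Editor's note: one quirk worth flagging in the new handler is that `URL.revokeObjectURL(file)` is passed the File object rather than the URL string returned by `createObjectURL`, so the blob URL created for classification is never actually released. A leak-free variant of the same handler body (a sketch, not part of the commit):

```typescript
// Same flow as the test page's click handler, but keeping a reference to the
// object URL so it can be revoked once classification finishes.
const input = document.querySelector("#image-classify-file") as HTMLInputElement;
const file = input.files![0];
const url = URL.createObjectURL(file);
try {
    const { classifyImage } = await import("/dist/index.js");
    console.log(await classifyImage(url));
} finally {
    URL.revokeObjectURL(url); // release the blob URL even if classification fails
}
```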
