Skip to content

Commit 11463d1

Browse files
committed
Properly handle images that appear on multiple pages
1 parent ab3d78a commit 11463d1

File tree

3 files changed

+27
-5
lines changed

3 files changed

+27
-5
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,11 @@ Initial version, contains the **Extract text** and **Extract boxes** operations.
200200
* Add the ability to extract all images from a PDF and process them, in addition to single images
201201
(closes [#4](https://github.com/jreyesr/n8n-nodes-tesseractjs/issues/4))
202202

203+
### v1.4.1
204+
205+
* Add proper handling for images that are repeated across pages (e.g. a logo in the header, a page background,
206+
or an image that was copy-pasted onto several pages), which are stored separately from page-specific images
207+
203208
## Developer info
204209

205210
```bash

nodes/TesseractNode/operations.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,13 @@ async function pdfImageToJpeg(this: IExecuteFunctions, image: PdfJsImage): Promi
5454
data: Buffer.from(pixels_),
5555
width: image.width, height: image.height
5656
})
57-
return await jimp.getBuffer("image/jpeg")
57+
const jpeg = await jimp.getBuffer("image/jpeg")
58+
this.logger.debug("encoded to JPG", {size: jpeg.length})
59+
return jpeg
5860
}
5961

6062
async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, imageFieldName: string): Promise<ImageWithName[]> {
63+
this.logger.debug("Getting images", {itemIndex})
6164
const binaryInfo = this.getInputData()[itemIndex].binary![imageFieldName]
6265
const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, imageFieldName)
6366
if (binaryInfo.mimeType.startsWith("image/")) {
@@ -68,14 +71,21 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
6871

6972
// NOTE: PDF page numbers start at 1!
7073
for (let pageNumber = 1; pageNumber <= pdfDoc.numPages; pageNumber++) {
74+
this.logger.debug("Getting images in page", {pageNumber})
7175
const page = await pdfDoc.getPage(pageNumber)
7276
const operators = await page.getOperatorList()
7377
// operators has two parallel arrays, fnArray and argsArray, equivalent to calling fnArray[i](...argsArray[i]) for each i
7478
for (let i = 0; i < operators.fnArray.length; i++) {
7579
if (operators.fnArray[i] == OPS.paintImageXObject) { // NOTE: You may find references to paintJpegXObject, it's now deprecated
76-
const imgIndex = operators.argsArray[i][0];
80+
const imgIndex: string = operators.argsArray[i][0];
81+
this.logger.debug("Found image in page", {pageNumber, imgIndex})
7782
imageBufferPromises.push(new Promise<ImageWithName>(resolve => {
78-
page.objs.get(imgIndex, async (imgRef: PdfJsImage) => {
83+
// NOTE: images whose IDs start with "g_" indicate that they're cached at the document level
84+
// This happens for images that appear on several pages, at which point PDF.js moves them from the page object store to the doc object store
85+
// Those images need to be accessed from `page.commonObjs` rather than `page.objs`, if you try to access `page.objs` the callback is simply
86+
// never called and this whole function hangs
87+
// See https://github.com/mozilla/pdf.js/issues/13742#issuecomment-881297161
88+
(imgIndex.startsWith("g_") ? page.commonObjs : page.objs).get(imgIndex, async (imgRef: PdfJsImage) => {
7989
resolve({
8090
data: await pdfImageToJpeg.apply(this, [imgRef]),
8191
name: imgIndex.toString() + ".jpg",
@@ -87,7 +97,10 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
8797
}
8898
}
8999

90-
return Promise.all(imageBufferPromises)
100+
return Promise.all(imageBufferPromises).catch(e => {
101+
this.logger.error("ERR", {e});
102+
return [];
103+
})
91104
} else {
92105
throw new NodeOperationError(this.getNode(), {}, {
93106
itemIndex,
@@ -99,7 +112,10 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
99112

100113
export async function performOCR(this: IExecuteFunctions, worker: Worker, item: INodeExecutionData, itemIndex: number, imageFieldName: string, bbox?: BoundingBox, timeout: number = 0): Promise<INodeExecutionData[]> {
101114
const images = await getImagesFromBinary.apply(this, [itemIndex, imageFieldName])
115+
this.logger.debug("images fetched", {num: images.length})
102116
const processImage = async ({data: image, name, mimetype}: ImageWithName) => {
117+
this.logger.debug("Processing image", {name, size: image.length})
118+
103119
const newItem: INodeExecutionData = {
104120
json: {},
105121
binary: {...item.binary}, // clone because otherwise the multiple items of a PDF will step on each other
@@ -112,6 +128,7 @@ export async function performOCR(this: IExecuteFunctions, worker: Worker, item:
112128
async () => {
113129
await worker.terminate()
114130
})
131+
this.logger.debug("Image processed", {name})
115132

116133
newItem.json =
117134
d === "timeout" ?

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "n8n-nodes-tesseractjs",
3-
"version": "1.4.0",
3+
"version": "1.4.1",
44
"description": "A n8n module that exposes Tesseract.js, an OCR library that can detect text on images",
55
"keywords": [
66
"n8n-community-node-package"

0 commit comments

Comments
 (0)