@@ -54,10 +54,13 @@ async function pdfImageToJpeg(this: IExecuteFunctions, image: PdfJsImage): Promi
54
54
data : Buffer . from ( pixels_ ) ,
55
55
width : image . width , height : image . height
56
56
} )
57
- return await jimp . getBuffer ( "image/jpeg" )
57
+ const jpeg = await jimp . getBuffer ( "image/jpeg" )
58
+ this . logger . debug ( "encoded to JPG" , { size : jpeg . length } )
59
+ return jpeg
58
60
}
59
61
60
62
async function getImagesFromBinary ( this : IExecuteFunctions , itemIndex : number , imageFieldName : string ) : Promise < ImageWithName [ ] > {
63
+ this . logger . debug ( "Getting images" , { itemIndex} )
61
64
const binaryInfo = this . getInputData ( ) [ itemIndex ] . binary ! [ imageFieldName ]
62
65
const buffer = await this . helpers . getBinaryDataBuffer ( itemIndex , imageFieldName )
63
66
if ( binaryInfo . mimeType . startsWith ( "image/" ) ) {
@@ -68,14 +71,21 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
68
71
69
72
// NOTE: PDF page numbers start at 1!
70
73
for ( let pageNumber = 1 ; pageNumber <= pdfDoc . numPages ; pageNumber ++ ) {
74
+ this . logger . debug ( "Getting images in page" , { pageNumber} )
71
75
const page = await pdfDoc . getPage ( pageNumber )
72
76
const operators = await page . getOperatorList ( )
73
77
// operators has two parallel arrays, fnArray and argsArray, equivalent to calling fnArray[i](...argsArray[i]) for each i
74
78
for ( let i = 0 ; i < operators . fnArray . length ; i ++ ) {
75
79
if ( operators . fnArray [ i ] == OPS . paintImageXObject ) { // NOTE: You may find references to paintJpegXObject, it's now deprecated
76
- const imgIndex = operators . argsArray [ i ] [ 0 ] ;
80
+ const imgIndex : string = operators . argsArray [ i ] [ 0 ] ;
81
+ this . logger . debug ( "Found image in page" , { pageNumber, imgIndex} )
77
82
imageBufferPromises . push ( new Promise < ImageWithName > ( resolve => {
78
- page . objs . get ( imgIndex , async ( imgRef : PdfJsImage ) => {
83
+ // NOTE: images whose IDs start with "g_" indicate that they're cached at the document level
84
+ // This happens for images that appear on several pages, at which point PDF.js moves them from the page object store to the doc object store
85
+ // Those images need to be accessed from `page.commonObjs` rather than `page.objs`, if you try to access `page.objs` the callback is simply
86
+ // never called and this whole function hangs
87
+ // See https://github.com/mozilla/pdf.js/issues/13742#issuecomment-881297161
88
+ ( imgIndex . startsWith ( "g_" ) ? page . commonObjs : page . objs ) . get ( imgIndex , async ( imgRef : PdfJsImage ) => {
79
89
resolve ( {
80
90
data : await pdfImageToJpeg . apply ( this , [ imgRef ] ) ,
81
91
name : imgIndex . toString ( ) + ".jpg" ,
@@ -87,7 +97,10 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
87
97
}
88
98
}
89
99
90
- return Promise . all ( imageBufferPromises )
100
+ return Promise . all ( imageBufferPromises ) . catch ( e => {
101
+ this . logger . error ( "ERR" , { e} ) ;
102
+ return [ ] ;
103
+ } )
91
104
} else {
92
105
throw new NodeOperationError ( this . getNode ( ) , { } , {
93
106
itemIndex,
@@ -99,7 +112,10 @@ async function getImagesFromBinary(this: IExecuteFunctions, itemIndex: number, i
99
112
100
113
export async function performOCR ( this : IExecuteFunctions , worker : Worker , item : INodeExecutionData , itemIndex : number , imageFieldName : string , bbox ?: BoundingBox , timeout : number = 0 ) : Promise < INodeExecutionData [ ] > {
101
114
const images = await getImagesFromBinary . apply ( this , [ itemIndex , imageFieldName ] )
115
+ this . logger . debug ( "images fetched" , { num : images . length } )
102
116
const processImage = async ( { data : image , name, mimetype} : ImageWithName ) => {
117
+ this . logger . debug ( "Processing image" , { name, size : image . length } )
118
+
103
119
const newItem : INodeExecutionData = {
104
120
json : { } ,
105
121
binary : { ...item . binary } , // clone because otherwise the multiple items of a PDF will step on each other
@@ -112,6 +128,7 @@ export async function performOCR(this: IExecuteFunctions, worker: Worker, item:
112
128
async ( ) => {
113
129
await worker . terminate ( )
114
130
} )
131
+ this . logger . debug ( "Image processed" , { name} )
115
132
116
133
newItem . json =
117
134
d === "timeout" ?
0 commit comments