Skip to content

Commit d911f0f

Browse files
committed
better metadata and chapter name reading
1 parent 9bb1f0a commit d911f0f

File tree

2 files changed

+24
-21
lines changed

2 files changed

+24
-21
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ fileToContent returns an object of type EbookContent, which has the following fi
3131

3232
**images** : Object where keys are image filenames, values are dataURLs (so you can use ```<img :src="data.images[filename]">```)
3333

34-
**metadata** : Object with the following self-explanatory fields: *author, name, publisher, description, language*. Only *name* and *language* are guaranteed to be non-null
34+
**metadata** : Object with all metadata entries. Only *title*, *identifier*, and *language* are guaranteed to be non-null. Typically also includes information like *creator*, *publisher*, *date*, etc. Each value is either a String or an array of Strings; an array is used when a metadata tag is specified multiple times, for example ```metadata.subject == ['Non-fiction', 'Biology']```
3535

3636
Chapters
3737
-----

index.js

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,6 @@ function pFileReader(file, mode){
1616
});
1717
}
1818

19-
function getMetadata(tagName, metadata) {
20-
var match = metadata.match(new RegExp(`<${tagName}[^>]*>[^>]+</${tagName}>`,'i'))
21-
var value = null
22-
if (match) {
23-
value = match.toString()
24-
value = value.substring(value.indexOf('>') + 1,value.length - `</${tagName}>`.length)
25-
}
26-
return value
27-
}
28-
2919
class EbookContent {
3020
/**
3121
* @param {EbookChapter[]} chapters - List of chapters. May contain things like Table of Contents, Preface, etc
@@ -90,7 +80,24 @@ async function fileToContent (file) {
9080
var opfContent = opfFile.explicitOriginalTarget.result
9181
// 4.1 read metadata
9282
var metadata = opfContent.substring(opfContent.indexOf('<metadata'),opfContent.indexOf('</metadata>'))
93-
var bookName = getMetadata('dc:title',metadata)
83+
var metadataObj = {}
84+
var metadataTags = metadata.match(/<dc:[^>]+>[^<]+<\/[^>]+>/gi)
85+
for (const match of metadataTags) {
86+
var matchStr = match.toString()
87+
var key = matchStr.match(/[^(>| )]+/i)
88+
var val = matchStr.match(/>[^<]+/i)
89+
if (key && val) {
90+
key = key.toString().substring(4)
91+
val = val.toString().substring(1)
92+
}
93+
if (!metadataObj[key]) metadataObj[key] = val
94+
else if (typeof metadataObj[key] == 'string') {
95+
console.log(typeof metadataObj[key])
96+
metadataObj[key] = [metadataObj[key],val]
97+
}
98+
else metadataObj[key].push(val)
99+
}
100+
var bookName = metadataObj.name
94101
// 4.2 read manifest (lists all the files contained in the package, has id to filename mapping useful later)
95102
var manifest = opfContent.substring(opfContent.indexOf('<manifest'),opfContent.indexOf('</manifest>'))
96103
var idToHref = {}
@@ -136,14 +143,16 @@ async function fileToContent (file) {
136143
if (head && head.length > 0) {
137144
var title = head[0].getElementsByTagName("title")
138145
// also need to check if <title> is not just book name
139-
if (title && title.length > 0 && title[0].innerText.toLowerCase() != bookName.toLowerCase()) chapterName = title[0].innerText
146+
if (title && title.length > 0 && !bookName.toLowerCase().startsWith(title[0].innerText.toLowerCase())) {
147+
chapterName = title[0].innerText.trim()
148+
}
140149
}
141150
if (!chapterName) {
142151
// try to find heading tags with chapter name
143152
for (var j = 0; j < 3; j++) {
144153
var tags = htmlDoc.getElementsByTagName(["h1","h2"][j])
145154
if (tags && tags.length > 0) {
146-
chapterName = [...tags].map(tag => tag.innerText).join(" ")
155+
chapterName = tags[0].innerText.trim()
147156
break;
148157
}
149158
}
@@ -177,13 +186,7 @@ async function fileToContent (file) {
177186
if (fname.match(imageRegex)) images[fname] = filenameToContent[fname]
178187
else if (fname.match(stylesheetRegex)) stylesheets[fname] = filenameToContent[fname]
179188
}
180-
return new EbookContent(chapters,stylesheets,images,{
181-
author: getMetadata('dc:creator',metadata),
182-
name: bookName,
183-
description: getMetadata('dc:description',metadata),
184-
publisher: getMetadata('dc:publisher',metadata),
185-
language: getMetadata('dc:language',metadata)
186-
})
189+
return new EbookContent(chapters,stylesheets,images,metadataObj)
187190
}
188191
var ebookjs = {}
189192
ebookjs.fileToContent = fileToContent

0 commit comments

Comments
 (0)