mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 06:37:38 +03:00
Add EPUB 3 nav.xhtml TOC support (#197)
## Summary * **What is the goal of this PR?** Add EPUB 3 support by implementing native navigation document (nav.xhtml) parsing with NCX fallback. Fixes: #143. * **What changes are included?** - New `TocNavParser` for parsing EPUB 3 HTML5 navigation documents (`<nav epub:type="toc">`) - Detection of nav documents via the `properties="nav"` attribute in the OPF manifest - Fallback logic: try EPUB 3 nav first, fall back to NCX (EPUB 2) if unavailable - Graceful degradation: books without any TOC now load with a warning instead of failing ## Additional Context * The implementation follows the existing streaming XML parser pattern using Expat to minimize RAM usage on the ESP32-C3 * EPUB 3 books that include both nav.xhtml and toc.ncx will prefer the nav document (per EPUB 3 spec recommendation) * No breaking changes - existing EPUB 2 books continue to work as before * Tested on examples from https://idpf.github.io/epub3-samples/30/samples.html
This commit is contained in:
parent
5790d6f5dc
commit
0332e1103a
@ -25,7 +25,7 @@ This project is **not affiliated with Xteink**; it's built as a community projec
|
||||
|
||||
## Features & Usage
|
||||
|
||||
- [x] EPUB parsing and rendering
|
||||
- [x] EPUB parsing and rendering (EPUB 2 and EPUB 3)
|
||||
- [ ] Image support within EPUB
|
||||
- [x] Saved reading position
|
||||
- [x] File explorer with file picker
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
#include "Epub/parsers/ContainerParser.h"
|
||||
#include "Epub/parsers/ContentOpfParser.h"
|
||||
#include "Epub/parsers/TocNavParser.h"
|
||||
#include "Epub/parsers/TocNcxParser.h"
|
||||
|
||||
bool Epub::findContentOpfFile(std::string* contentOpfFile) const {
|
||||
@ -80,6 +81,10 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
|
||||
tocNcxItem = opfParser.tocNcxPath;
|
||||
}
|
||||
|
||||
if (!opfParser.tocNavPath.empty()) {
|
||||
tocNavItem = opfParser.tocNavPath;
|
||||
}
|
||||
|
||||
Serial.printf("[%lu] [EBP] Successfully parsed content.opf\n", millis());
|
||||
return true;
|
||||
}
|
||||
@ -141,6 +146,60 @@ bool Epub::parseTocNcxFile() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Epub::parseTocNavFile() const {
|
||||
// the nav file should have been specified in the content.opf file (EPUB 3)
|
||||
if (tocNavItem.empty()) {
|
||||
Serial.printf("[%lu] [EBP] No nav file specified\n", millis());
|
||||
return false;
|
||||
}
|
||||
|
||||
Serial.printf("[%lu] [EBP] Parsing toc nav file: %s\n", millis(), tocNavItem.c_str());
|
||||
|
||||
const auto tmpNavPath = getCachePath() + "/toc.nav";
|
||||
FsFile tempNavFile;
|
||||
if (!SdMan.openFileForWrite("EBP", tmpNavPath, tempNavFile)) {
|
||||
return false;
|
||||
}
|
||||
readItemContentsToStream(tocNavItem, tempNavFile, 1024);
|
||||
tempNavFile.close();
|
||||
if (!SdMan.openFileForRead("EBP", tmpNavPath, tempNavFile)) {
|
||||
return false;
|
||||
}
|
||||
const auto navSize = tempNavFile.size();
|
||||
|
||||
TocNavParser navParser(contentBasePath, navSize, bookMetadataCache.get());
|
||||
|
||||
if (!navParser.setup()) {
|
||||
Serial.printf("[%lu] [EBP] Could not setup toc nav parser\n", millis());
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto navBuffer = static_cast<uint8_t*>(malloc(1024));
|
||||
if (!navBuffer) {
|
||||
Serial.printf("[%lu] [EBP] Could not allocate memory for toc nav parser\n", millis());
|
||||
return false;
|
||||
}
|
||||
|
||||
while (tempNavFile.available()) {
|
||||
const auto readSize = tempNavFile.read(navBuffer, 1024);
|
||||
const auto processedSize = navParser.write(navBuffer, readSize);
|
||||
|
||||
if (processedSize != readSize) {
|
||||
Serial.printf("[%lu] [EBP] Could not process all toc nav data\n", millis());
|
||||
free(navBuffer);
|
||||
tempNavFile.close();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
free(navBuffer);
|
||||
tempNavFile.close();
|
||||
SdMan.remove(tmpNavPath.c_str());
|
||||
|
||||
Serial.printf("[%lu] [EBP] Parsed TOC nav items\n", millis());
|
||||
return true;
|
||||
}
|
||||
|
||||
// load in the meta data for the epub file
|
||||
bool Epub::load(const bool buildIfMissing) {
|
||||
Serial.printf("[%lu] [EBP] Loading ePub: %s\n", millis(), filepath.c_str());
|
||||
@ -184,15 +243,31 @@ bool Epub::load(const bool buildIfMissing) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TOC Pass
|
||||
// TOC Pass - try EPUB 3 nav first, fall back to NCX
|
||||
if (!bookMetadataCache->beginTocPass()) {
|
||||
Serial.printf("[%lu] [EBP] Could not begin writing toc pass\n", millis());
|
||||
return false;
|
||||
}
|
||||
if (!parseTocNcxFile()) {
|
||||
Serial.printf("[%lu] [EBP] Could not parse toc\n", millis());
|
||||
return false;
|
||||
|
||||
bool tocParsed = false;
|
||||
|
||||
// Try EPUB 3 nav document first (preferred)
|
||||
if (!tocNavItem.empty()) {
|
||||
Serial.printf("[%lu] [EBP] Attempting to parse EPUB 3 nav document\n", millis());
|
||||
tocParsed = parseTocNavFile();
|
||||
}
|
||||
|
||||
// Fall back to NCX if nav parsing failed or wasn't available
|
||||
if (!tocParsed && !tocNcxItem.empty()) {
|
||||
Serial.printf("[%lu] [EBP] Falling back to NCX TOC\n", millis());
|
||||
tocParsed = parseTocNcxFile();
|
||||
}
|
||||
|
||||
if (!tocParsed) {
|
||||
Serial.printf("[%lu] [EBP] Warning: Could not parse any TOC format\n", millis());
|
||||
// Continue anyway - book will work without TOC
|
||||
}
|
||||
|
||||
if (!bookMetadataCache->endTocPass()) {
|
||||
Serial.printf("[%lu] [EBP] Could not end writing toc pass\n", millis());
|
||||
return false;
|
||||
|
||||
@ -12,8 +12,10 @@
|
||||
class ZipFile;
|
||||
|
||||
class Epub {
|
||||
// the ncx file
|
||||
// the ncx file (EPUB 2)
|
||||
std::string tocNcxItem;
|
||||
// the nav file (EPUB 3)
|
||||
std::string tocNavItem;
|
||||
// where is the EPUBfile?
|
||||
std::string filepath;
|
||||
// the base path for items in the EPUB file
|
||||
@ -26,6 +28,7 @@ class Epub {
|
||||
bool findContentOpfFile(std::string* contentOpfFile) const;
|
||||
bool parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata);
|
||||
bool parseTocNcxFile() const;
|
||||
bool parseTocNavFile() const;
|
||||
|
||||
public:
|
||||
explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
|
||||
|
||||
@ -161,6 +161,7 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
|
||||
std::string itemId;
|
||||
std::string href;
|
||||
std::string mediaType;
|
||||
std::string properties;
|
||||
|
||||
for (int i = 0; atts[i]; i += 2) {
|
||||
if (strcmp(atts[i], "id") == 0) {
|
||||
@ -169,6 +170,8 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
|
||||
href = self->baseContentPath + atts[i + 1];
|
||||
} else if (strcmp(atts[i], "media-type") == 0) {
|
||||
mediaType = atts[i + 1];
|
||||
} else if (strcmp(atts[i], "properties") == 0) {
|
||||
properties = atts[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
@ -188,6 +191,15 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
|
||||
href.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// EPUB 3: Check for nav document (properties contains "nav")
|
||||
if (!properties.empty() && self->tocNavPath.empty()) {
|
||||
// Properties is space-separated, check if "nav" is present as a word
|
||||
if (properties == "nav" || properties.find("nav ") == 0 || properties.find(" nav") != std::string::npos) {
|
||||
self->tocNavPath = href;
|
||||
Serial.printf("[%lu] [COF] Found EPUB 3 nav document: %s\n", millis(), href.c_str());
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -35,6 +35,7 @@ class ContentOpfParser final : public Print {
|
||||
std::string title;
|
||||
std::string author;
|
||||
std::string tocNcxPath;
|
||||
std::string tocNavPath; // EPUB 3 nav document path
|
||||
std::string coverItemHref;
|
||||
std::string textReferenceHref;
|
||||
|
||||
|
||||
184
lib/Epub/Epub/parsers/TocNavParser.cpp
Normal file
184
lib/Epub/Epub/parsers/TocNavParser.cpp
Normal file
@ -0,0 +1,184 @@
|
||||
#include "TocNavParser.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include "../BookMetadataCache.h"
|
||||
|
||||
bool TocNavParser::setup() {
|
||||
parser = XML_ParserCreate(nullptr);
|
||||
if (!parser) {
|
||||
Serial.printf("[%lu] [NAV] Couldn't allocate memory for parser\n", millis());
|
||||
return false;
|
||||
}
|
||||
|
||||
XML_SetUserData(parser, this);
|
||||
XML_SetElementHandler(parser, startElement, endElement);
|
||||
XML_SetCharacterDataHandler(parser, characterData);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Releases the Expat parser, if one is still alive. The handlers are detached
// before the parser is freed.
TocNavParser::~TocNavParser() {
  if (parser == nullptr) {
    return;
  }

  XML_StopParser(parser, XML_FALSE);
  XML_SetElementHandler(parser, nullptr, nullptr);
  XML_SetCharacterDataHandler(parser, nullptr);
  XML_ParserFree(parser);
  parser = nullptr;
}
|
||||
|
||||
// Single-byte overload: forwards to the buffer overload with a length of 1.
size_t TocNavParser::write(const uint8_t data) { return write(&data, 1); }
|
||||
|
||||
size_t TocNavParser::write(const uint8_t* buffer, const size_t size) {
|
||||
if (!parser) return 0;
|
||||
|
||||
const uint8_t* currentBufferPos = buffer;
|
||||
auto remainingInBuffer = size;
|
||||
|
||||
while (remainingInBuffer > 0) {
|
||||
void* const buf = XML_GetBuffer(parser, 1024);
|
||||
if (!buf) {
|
||||
Serial.printf("[%lu] [NAV] Couldn't allocate memory for buffer\n", millis());
|
||||
XML_StopParser(parser, XML_FALSE);
|
||||
XML_SetElementHandler(parser, nullptr, nullptr);
|
||||
XML_SetCharacterDataHandler(parser, nullptr);
|
||||
XML_ParserFree(parser);
|
||||
parser = nullptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
|
||||
memcpy(buf, currentBufferPos, toRead);
|
||||
|
||||
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
|
||||
Serial.printf("[%lu] [NAV] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
|
||||
XML_ErrorString(XML_GetErrorCode(parser)));
|
||||
XML_StopParser(parser, XML_FALSE);
|
||||
XML_SetElementHandler(parser, nullptr, nullptr);
|
||||
XML_SetCharacterDataHandler(parser, nullptr);
|
||||
XML_ParserFree(parser);
|
||||
parser = nullptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
currentBufferPos += toRead;
|
||||
remainingInBuffer -= toRead;
|
||||
remainingSize -= toRead;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
void XMLCALL TocNavParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
|
||||
auto* self = static_cast<TocNavParser*>(userData);
|
||||
|
||||
// Track HTML structure loosely - we mainly care about finding <nav epub:type="toc">
|
||||
if (strcmp(name, "html") == 0) {
|
||||
self->state = IN_HTML;
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_HTML && strcmp(name, "body") == 0) {
|
||||
self->state = IN_BODY;
|
||||
return;
|
||||
}
|
||||
|
||||
// Look for <nav epub:type="toc"> anywhere in body (or nested elements)
|
||||
if (self->state >= IN_BODY && strcmp(name, "nav") == 0) {
|
||||
for (int i = 0; atts[i]; i += 2) {
|
||||
if ((strcmp(atts[i], "epub:type") == 0 || strcmp(atts[i], "type") == 0) && strcmp(atts[i + 1], "toc") == 0) {
|
||||
self->state = IN_NAV_TOC;
|
||||
Serial.printf("[%lu] [NAV] Found nav toc element\n", millis());
|
||||
return;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Only process ol/li/a if we're inside the toc nav
|
||||
if (self->state < IN_NAV_TOC) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (strcmp(name, "ol") == 0) {
|
||||
self->olDepth++;
|
||||
self->state = IN_OL;
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_OL && strcmp(name, "li") == 0) {
|
||||
self->state = IN_LI;
|
||||
self->currentLabel.clear();
|
||||
self->currentHref.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_LI && strcmp(name, "a") == 0) {
|
||||
self->state = IN_ANCHOR;
|
||||
// Get href attribute
|
||||
for (int i = 0; atts[i]; i += 2) {
|
||||
if (strcmp(atts[i], "href") == 0) {
|
||||
self->currentHref = atts[i + 1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Expat text callback: accumulates the visible text of the anchor currently
// being parsed. Text outside an anchor is ignored.
void XMLCALL TocNavParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* const navParser = static_cast<TocNavParser*>(userData);

  if (navParser->state != IN_ANCHOR) {
    return;
  }

  // Expat may deliver one text node in several pieces, hence append.
  navParser->currentLabel.append(s, len);
}
|
||||
|
||||
// Expat end-tag callback. Emits a TOC entry when an anchor closes and unwinds
// the li / ol / nav state as the corresponding elements close.
//
// userData: the owning TocNavParser (registered in setup()).
// name:     element name as reported by Expat.
void XMLCALL TocNavParser::endElement(void* userData, const XML_Char* name) {
  auto* self = static_cast<TocNavParser*>(userData);

  if (strcmp(name, "a") == 0 && self->state == IN_ANCHOR) {
    // Create TOC entry when closing anchor tag (we have all data now)
    if (!self->currentLabel.empty() && !self->currentHref.empty()) {
      std::string href = self->baseContentPath + self->currentHref;
      std::string anchor;

      // Split "file.xhtml#fragment" into the file path and the fragment id.
      const size_t pos = href.find('#');
      if (pos != std::string::npos) {
        anchor = href.substr(pos + 1);
        href = href.substr(0, pos);
      }

      if (self->cache) {
        // olDepth gives us the nesting level (1-based from the outer ol)
        self->cache->createTocEntry(self->currentLabel, href, anchor, self->olDepth);
      }
    }

    // Always reset, even when no entry was emitted: otherwise the href or
    // text of a discarded anchor would leak into the next anchor that appears
    // inside the same <li>.
    self->currentLabel.clear();
    self->currentHref.clear();

    self->state = IN_LI;
    return;
  }

  if (strcmp(name, "li") == 0 && (self->state == IN_LI || self->state == IN_OL)) {
    self->state = IN_OL;
    return;
  }

  if (strcmp(name, "ol") == 0 && self->state >= IN_NAV_TOC) {
    self->olDepth--;
    if (self->olDepth == 0) {
      self->state = IN_NAV_TOC;
    } else {
      self->state = IN_LI;  // Back to parent li
    }
    return;
  }

  if (strcmp(name, "nav") == 0 && self->state >= IN_NAV_TOC) {
    self->state = IN_BODY;
    Serial.printf("[%lu] [NAV] Finished parsing nav toc\n", millis());
    return;
  }
}
|
||||
47
lib/Epub/Epub/parsers/TocNavParser.h
Normal file
47
lib/Epub/Epub/parsers/TocNavParser.h
Normal file
@ -0,0 +1,47 @@
|
||||
#pragma once
|
||||
#include <Print.h>
|
||||
#include <expat.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
class BookMetadataCache;
|
||||
|
||||
// Parser for EPUB 3 nav.xhtml navigation documents
|
||||
// Parses HTML5 nav elements with epub:type="toc" to extract table of contents
|
||||
class TocNavParser final : public Print {
|
||||
enum ParserState {
|
||||
START,
|
||||
IN_HTML,
|
||||
IN_BODY,
|
||||
IN_NAV_TOC, // Inside <nav epub:type="toc">
|
||||
IN_OL, // Inside <ol>
|
||||
IN_LI, // Inside <li>
|
||||
IN_ANCHOR, // Inside <a>
|
||||
};
|
||||
|
||||
const std::string& baseContentPath;
|
||||
size_t remainingSize;
|
||||
XML_Parser parser = nullptr;
|
||||
ParserState state = START;
|
||||
BookMetadataCache* cache;
|
||||
|
||||
// Track nesting depth for <ol> elements to determine TOC depth
|
||||
uint8_t olDepth = 0;
|
||||
// Current entry data being collected
|
||||
std::string currentLabel;
|
||||
std::string currentHref;
|
||||
|
||||
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
|
||||
static void characterData(void* userData, const XML_Char* s, int len);
|
||||
static void endElement(void* userData, const XML_Char* name);
|
||||
|
||||
public:
|
||||
explicit TocNavParser(const std::string& baseContentPath, const size_t xmlSize, BookMetadataCache* cache)
|
||||
: baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
|
||||
~TocNavParser() override;
|
||||
|
||||
bool setup();
|
||||
|
||||
size_t write(uint8_t) override;
|
||||
size_t write(const uint8_t* buffer, size_t size) override;
|
||||
};
|
||||
Loading…
Reference in New Issue
Block a user