mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-05 15:17:37 +03:00
## Summary * **What is the goal of this PR?** - Adds basic CSS parsing to EPUBs and determine the CSS rules when rendering to the screen so that text is styled correctly. Currently supports bold, underline, italics, margin, padding, and text alignment ## Additional Context - My main reason for wanting this is that the book I'm currently reading, Carl's Doomsday Scenario (2nd in the Dungeon Crawler Carl series), relies _a lot_ on styled text for telling parts of the story. When text is bolded, it's supposed to be a message that's rendered "on-screen" in the story. When characters are "chatting" with each other, the text is bolded and their names are underlined. Plus, normal emphasis is provided with italicizing words here and there. So, this greatly improves my experience reading this book on the Xteink, and I figured it was useful enough for others too. - For transparency: I'm a software engineer, but I'm mostly frontend and TypeScript/JavaScript. It's been _years_ since I did any C/C++, so I would not be surprised if I'm doing something dumb along the way in this code. Please don't hesitate to ask for changes if something looks off. I heavily relied on Claude Code for help, and I had a lot of inspiration from how [microreader](https://github.com/CidVonHighwind/microreader) achieves their CSS parsing and styling. I did give this as good of a code review as I could and went through everything, and _it works on my machine_ 😄 ### Before   ### After   --- ### AI Usage Did you use AI tools to help write this code? **YES**, Claude Code
386 lines
13 KiB
C++
386 lines
13 KiB
C++
#include "ContentOpfParser.h"
|
|
|
|
#include <FsHelpers.h>
|
|
#include <HardwareSerial.h>
|
|
#include <Serialization.h>
|
|
|
|
#include "../BookMetadataCache.h"
|
|
|
|
namespace {
|
|
constexpr char MEDIA_TYPE_NCX[] = "application/x-dtbncx+xml";
|
|
constexpr char MEDIA_TYPE_CSS[] = "text/css";
|
|
constexpr char itemCacheFile[] = "/.items.bin";
|
|
} // namespace
|
|
|
|
bool ContentOpfParser::setup() {
|
|
parser = XML_ParserCreate(nullptr);
|
|
if (!parser) {
|
|
Serial.printf("[%lu] [COF] Couldn't allocate memory for parser\n", millis());
|
|
return false;
|
|
}
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, startElement, endElement);
|
|
XML_SetCharacterDataHandler(parser, characterData);
|
|
return true;
|
|
}
|
|
|
|
ContentOpfParser::~ContentOpfParser() {
|
|
if (parser) {
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
parser = nullptr;
|
|
}
|
|
if (tempItemStore) {
|
|
tempItemStore.close();
|
|
}
|
|
if (SdMan.exists((cachePath + itemCacheFile).c_str())) {
|
|
SdMan.remove((cachePath + itemCacheFile).c_str());
|
|
}
|
|
itemIndex.clear();
|
|
itemIndex.shrink_to_fit();
|
|
useItemIndex = false;
|
|
}
|
|
|
|
size_t ContentOpfParser::write(const uint8_t data) { return write(&data, 1); }
|
|
|
|
size_t ContentOpfParser::write(const uint8_t* buffer, const size_t size) {
|
|
if (!parser) return 0;
|
|
|
|
const uint8_t* currentBufferPos = buffer;
|
|
auto remainingInBuffer = size;
|
|
|
|
while (remainingInBuffer > 0) {
|
|
void* const buf = XML_GetBuffer(parser, 1024);
|
|
|
|
if (!buf) {
|
|
Serial.printf("[%lu] [COF] Couldn't allocate memory for buffer\n", millis());
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
parser = nullptr;
|
|
return 0;
|
|
}
|
|
|
|
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
|
|
memcpy(buf, currentBufferPos, toRead);
|
|
|
|
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
|
|
Serial.printf("[%lu] [COF] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
|
|
XML_ErrorString(XML_GetErrorCode(parser)));
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
parser = nullptr;
|
|
return 0;
|
|
}
|
|
|
|
currentBufferPos += toRead;
|
|
remainingInBuffer -= toRead;
|
|
remainingSize -= toRead;
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
|
|
auto* self = static_cast<ContentOpfParser*>(userData);
|
|
(void)atts;
|
|
|
|
if (self->state == START && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) {
|
|
self->state = IN_PACKAGE;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_PACKAGE && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
|
|
self->state = IN_METADATA;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_METADATA && strcmp(name, "dc:title") == 0) {
|
|
self->state = IN_BOOK_TITLE;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_METADATA && strcmp(name, "dc:creator") == 0) {
|
|
self->state = IN_BOOK_AUTHOR;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) {
|
|
self->state = IN_BOOK_LANGUAGE;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
|
|
self->state = IN_MANIFEST;
|
|
if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
|
Serial.printf(
|
|
"[%lu] [COF] Couldn't open temp items file for writing. This is probably going to be a fatal error.\n",
|
|
millis());
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_PACKAGE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
|
|
self->state = IN_SPINE;
|
|
if (!SdMan.openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
|
Serial.printf(
|
|
"[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n",
|
|
millis());
|
|
}
|
|
|
|
// Sort item index for binary search if we have enough items
|
|
if (self->itemIndex.size() >= LARGE_SPINE_THRESHOLD) {
|
|
std::sort(self->itemIndex.begin(), self->itemIndex.end(), [](const ItemIndexEntry& a, const ItemIndexEntry& b) {
|
|
return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
|
|
});
|
|
self->useItemIndex = true;
|
|
Serial.printf("[%lu] [COF] Using fast index for %zu manifest items\n", millis(), self->itemIndex.size());
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_PACKAGE && (strcmp(name, "guide") == 0 || strcmp(name, "opf:guide") == 0)) {
|
|
self->state = IN_GUIDE;
|
|
// TODO Remove print
|
|
Serial.printf("[%lu] [COF] Entering guide state.\n", millis());
|
|
if (!SdMan.openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
|
Serial.printf(
|
|
"[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n",
|
|
millis());
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_METADATA && (strcmp(name, "meta") == 0 || strcmp(name, "opf:meta") == 0)) {
|
|
bool isCover = false;
|
|
std::string coverItemId;
|
|
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "name") == 0 && strcmp(atts[i + 1], "cover") == 0) {
|
|
isCover = true;
|
|
} else if (strcmp(atts[i], "content") == 0) {
|
|
coverItemId = atts[i + 1];
|
|
}
|
|
}
|
|
|
|
if (isCover) {
|
|
self->coverItemId = coverItemId;
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_MANIFEST && (strcmp(name, "item") == 0 || strcmp(name, "opf:item") == 0)) {
|
|
std::string itemId;
|
|
std::string href;
|
|
std::string mediaType;
|
|
std::string properties;
|
|
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "id") == 0) {
|
|
itemId = atts[i + 1];
|
|
} else if (strcmp(atts[i], "href") == 0) {
|
|
href = FsHelpers::normalisePath(self->baseContentPath + atts[i + 1]);
|
|
} else if (strcmp(atts[i], "media-type") == 0) {
|
|
mediaType = atts[i + 1];
|
|
} else if (strcmp(atts[i], "properties") == 0) {
|
|
properties = atts[i + 1];
|
|
}
|
|
}
|
|
|
|
// Record index entry for fast lookup later
|
|
if (self->tempItemStore) {
|
|
ItemIndexEntry entry;
|
|
entry.idHash = fnvHash(itemId);
|
|
entry.idLen = static_cast<uint16_t>(itemId.size());
|
|
entry.fileOffset = static_cast<uint32_t>(self->tempItemStore.position());
|
|
self->itemIndex.push_back(entry);
|
|
}
|
|
|
|
// Write items down to SD card
|
|
serialization::writeString(self->tempItemStore, itemId);
|
|
serialization::writeString(self->tempItemStore, href);
|
|
|
|
if (itemId == self->coverItemId) {
|
|
self->coverItemHref = href;
|
|
}
|
|
|
|
if (mediaType == MEDIA_TYPE_NCX) {
|
|
if (self->tocNcxPath.empty()) {
|
|
self->tocNcxPath = href;
|
|
} else {
|
|
Serial.printf("[%lu] [COF] Warning: Multiple NCX files found in manifest. Ignoring duplicate: %s\n", millis(),
|
|
href.c_str());
|
|
}
|
|
}
|
|
|
|
// Collect CSS files
|
|
if (mediaType == MEDIA_TYPE_CSS) {
|
|
self->cssFiles.push_back(href);
|
|
}
|
|
|
|
// EPUB 3: Check for nav document (properties contains "nav")
|
|
if (!properties.empty() && self->tocNavPath.empty()) {
|
|
// Properties is space-separated, check if "nav" is present as a word
|
|
if (properties == "nav" || properties.find("nav ") == 0 || properties.find(" nav") != std::string::npos) {
|
|
self->tocNavPath = href;
|
|
Serial.printf("[%lu] [COF] Found EPUB 3 nav document: %s\n", millis(), href.c_str());
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
// NOTE: This relies on spine appearing after item manifest (which is pretty safe as it's part of the EPUB spec)
|
|
// Only run the spine parsing if there's a cache to add it to
|
|
if (self->cache) {
|
|
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "idref") == 0) {
|
|
const std::string idref = atts[i + 1];
|
|
std::string href;
|
|
bool found = false;
|
|
|
|
if (self->useItemIndex) {
|
|
// Fast path: binary search
|
|
uint32_t targetHash = fnvHash(idref);
|
|
uint16_t targetLen = static_cast<uint16_t>(idref.size());
|
|
|
|
auto it = std::lower_bound(self->itemIndex.begin(), self->itemIndex.end(),
|
|
ItemIndexEntry{targetHash, targetLen, 0},
|
|
[](const ItemIndexEntry& a, const ItemIndexEntry& b) {
|
|
return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
|
|
});
|
|
|
|
// Check for match (may need to check a few due to hash collisions)
|
|
while (it != self->itemIndex.end() && it->idHash == targetHash) {
|
|
self->tempItemStore.seek(it->fileOffset);
|
|
std::string itemId;
|
|
serialization::readString(self->tempItemStore, itemId);
|
|
if (itemId == idref) {
|
|
serialization::readString(self->tempItemStore, href);
|
|
found = true;
|
|
break;
|
|
}
|
|
++it;
|
|
}
|
|
} else {
|
|
// Slow path: linear scan (for small manifests, keeps original behavior)
|
|
// TODO: This lookup is slow as need to scan through all items each time.
|
|
// It can take up to 200ms per item when getting to 1500 items.
|
|
self->tempItemStore.seek(0);
|
|
std::string itemId;
|
|
while (self->tempItemStore.available()) {
|
|
serialization::readString(self->tempItemStore, itemId);
|
|
serialization::readString(self->tempItemStore, href);
|
|
if (itemId == idref) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (found && self->cache) {
|
|
self->cache->createSpineEntry(href);
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// parse the guide
|
|
if (self->state == IN_GUIDE && (strcmp(name, "reference") == 0 || strcmp(name, "opf:reference") == 0)) {
|
|
std::string type;
|
|
std::string textHref;
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "type") == 0) {
|
|
type = atts[i + 1];
|
|
if (type == "text" || type == "start") {
|
|
continue;
|
|
} else {
|
|
Serial.printf("[%lu] [COF] Skipping non-text reference in guide: %s\n", millis(), type.c_str());
|
|
break;
|
|
}
|
|
} else if (strcmp(atts[i], "href") == 0) {
|
|
textHref = FsHelpers::normalisePath(self->baseContentPath + atts[i + 1]);
|
|
}
|
|
}
|
|
if ((type == "text" || (type == "start" && !self->textReferenceHref.empty())) && (textHref.length() > 0)) {
|
|
Serial.printf("[%lu] [COF] Found %s reference in guide: %s.\n", millis(), type.c_str(), textHref.c_str());
|
|
self->textReferenceHref = textHref;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s, const int len) {
|
|
auto* self = static_cast<ContentOpfParser*>(userData);
|
|
|
|
if (self->state == IN_BOOK_TITLE) {
|
|
self->title.append(s, len);
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_BOOK_AUTHOR) {
|
|
self->author.append(s, len);
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_BOOK_LANGUAGE) {
|
|
self->language.append(s, len);
|
|
return;
|
|
}
|
|
}
|
|
|
|
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
|
|
auto* self = static_cast<ContentOpfParser*>(userData);
|
|
(void)name;
|
|
|
|
if (self->state == IN_SPINE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
|
|
self->state = IN_PACKAGE;
|
|
self->tempItemStore.close();
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_GUIDE && (strcmp(name, "guide") == 0 || strcmp(name, "opf:guide") == 0)) {
|
|
self->state = IN_PACKAGE;
|
|
self->tempItemStore.close();
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_MANIFEST && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
|
|
self->state = IN_PACKAGE;
|
|
self->tempItemStore.close();
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_BOOK_TITLE && strcmp(name, "dc:title") == 0) {
|
|
self->state = IN_METADATA;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_BOOK_AUTHOR && strcmp(name, "dc:creator") == 0) {
|
|
self->state = IN_METADATA;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) {
|
|
self->state = IN_METADATA;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
|
|
self->state = IN_PACKAGE;
|
|
return;
|
|
}
|
|
|
|
if (self->state == IN_PACKAGE && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) {
|
|
self->state = START;
|
|
return;
|
|
}
|
|
}
|