Xteink-X4-crosspoint-reader/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
Dave Allie 712c566664
fix: Correctly render italics on image alt placeholders (#569)
## Summary

* Correctly render italics on image alt placeholders
  * Parser incorrectly handled depth of self-closing tags
  * Self-closing tags immediately call start and end tag

## Additional Context

* Previously, it would incorrectly make the whole chapter bold/italics,
or not italicised the image alt

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? No
2026-01-28 03:33:36 +11:00

400 lines
13 KiB
C++

#include "ChapterHtmlSlimParser.h"
#include <GfxRenderer.h>
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <expat.h>
#include "../Page.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
// Minimum file size (in bytes) to show progress bar - smaller chapters don't benefit from it
constexpr size_t MIN_SIZE_FOR_PROGRESS = 50 * 1024; // 50KB
const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// flush the contents of partWordBuffer to currentTextBlock
void ChapterHtmlSlimParser::flushPartWordBuffer() {
// determine font style
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
if (boldUntilDepth < depth && italicUntilDepth < depth) {
fontStyle = EpdFontFamily::BOLD_ITALIC;
} else if (boldUntilDepth < depth) {
fontStyle = EpdFontFamily::BOLD;
} else if (italicUntilDepth < depth) {
fontStyle = EpdFontFamily::ITALIC;
}
// flush the buffer
partWordBuffer[partWordBufferIndex] = '\0';
currentTextBlock->addWord(partWordBuffer, fontStyle);
partWordBufferIndex = 0;
}
// start a new text block if needed
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->setStyle(style);
return;
}
makePages();
}
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing, hyphenationEnabled));
}
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
self->depth += 1;
return;
}
// Special handling for tables - show placeholder text instead of dropping silently
if (strcmp(name, "table") == 0) {
// Add placeholder text
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
// Advance depth before processing character data (like you would for a element with text)
self->depth += 1;
self->characterData(userData, "[Table omitted]", strlen("[Table omitted]"));
// Skip table contents (skip until parent as we pre-advanced depth above)
self->skipUntilDepth = self->depth - 1;
return;
}
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
// TODO: Start processing image tags
std::string alt = "[Image]";
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "alt") == 0) {
if (strlen(atts[i + 1]) > 0) {
alt = "[Image: " + std::string(atts[i + 1]) + "]";
}
break;
}
}
}
Serial.printf("[%lu] [EHP] Image alt: %s\n", millis(), alt.c_str());
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
// Advance depth before processing character data (like you would for a element with text)
self->depth += 1;
self->characterData(userData, alt.c_str(), alt.length());
// Skip table contents (skip until parent as we pre-advanced depth above)
self->skipUntilDepth = self->depth - 1;
return;
}
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
// Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0 ||
strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0) {
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
}
}
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
self->depth += 1;
return;
}
if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(name, "br") == 0) {
if (self->partWordBufferIndex > 0) {
// flush word preceding <br/> to currentTextBlock before calling startNewTextBlock
self->flushPartWordBuffer();
}
self->startNewTextBlock(self->currentTextBlock->getStyle());
self->depth += 1;
return;
}
self->startNewTextBlock(static_cast<TextBlock::Style>(self->paragraphAlignment));
if (strcmp(name, "li") == 0) {
self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
}
self->depth += 1;
return;
}
if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
self->depth += 1;
return;
}
if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
self->depth += 1;
return;
}
// Unprocessed tag, just increasing depth and continue forward
self->depth += 1;
}
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
return;
}
for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
}
// Skip the whitespace char
continue;
}
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
if (s[i] == FEFF_BYTE_1) {
// Check if the next two bytes complete the 3-byte sequence
if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
// Sequence 0xEF 0xBB 0xBF found!
i += 2; // Skip the next two bytes
continue; // Move to the next iteration
}
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->flushPartWordBuffer();
}
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
}
// If we have > 750 words buffered up, perform the layout and consume out all but the last line
// There should be enough here to build out 1-2 full pages and doing this will free up a lot of
// memory.
// Spotted when reading Intermezzo, there are some really long text blocks in there.
if (self->currentTextBlock->size() > 750) {
Serial.printf("[%lu] [EHP] Text block too long, splitting into multiple pages\n", millis());
self->currentTextBlock->layoutAndExtractLines(
self->renderer, self->fontId, self->viewportWidth,
[self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
}
}
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
if (self->partWordBufferIndex > 0) {
// Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
// We don't want to flush out content when closing inline tags like <span>.
// Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
// text styling needs to be overhauled to fix it.
const bool shouldBreakText =
matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
strcmp(name, "table") == 0 || matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1;
if (shouldBreakText) {
self->flushPartWordBuffer();
}
}
self->depth -= 1;
// Leaving skip
if (self->skipUntilDepth == self->depth) {
self->skipUntilDepth = INT_MAX;
}
// Leaving bold
if (self->boldUntilDepth == self->depth) {
self->boldUntilDepth = INT_MAX;
}
// Leaving italic
if (self->italicUntilDepth == self->depth) {
self->italicUntilDepth = INT_MAX;
}
}
bool ChapterHtmlSlimParser::parseAndBuildPages() {
startNewTextBlock((TextBlock::Style)this->paragraphAlignment);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
return false;
}
FsFile file;
if (!SdMan.openFileForRead("EHP", filepath, file)) {
XML_ParserFree(parser);
return false;
}
// Get file size for progress calculation
const size_t totalSize = file.size();
size_t bytesRead = 0;
int lastProgress = -1;
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
do {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
const size_t len = file.read(buf, 1024);
if (len == 0 && file.available() > 0) {
Serial.printf("[%lu] [EHP] File read error\n", millis());
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
// Update progress (call every 10% change to avoid too frequent updates)
// Only show progress for larger chapters where rendering overhead is worth it
bytesRead += len;
if (progressFn && totalSize >= MIN_SIZE_FOR_PROGRESS) {
const int progress = static_cast<int>((bytesRead * 100) / totalSize);
if (lastProgress / 10 != progress / 10) {
lastProgress = progress;
progressFn(progress);
}
}
done = file.available() == 0;
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
} while (!done);
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
// Process last page if there is still text
if (currentTextBlock) {
makePages();
completePageFn(std::move(currentPage));
currentPage.reset();
currentTextBlock.reset();
}
return true;
}
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
if (currentPageNextY + lineHeight > viewportHeight) {
completePageFn(std::move(currentPage));
currentPage.reset(new Page());
currentPageNextY = 0;
}
currentPage->elements.push_back(std::make_shared<PageLine>(line, 0, currentPageNextY));
currentPageNextY += lineHeight;
}
void ChapterHtmlSlimParser::makePages() {
if (!currentTextBlock) {
Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
return;
}
if (!currentPage) {
currentPage.reset(new Page());
currentPageNextY = 0;
}
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
currentTextBlock->layoutAndExtractLines(
renderer, fontId, viewportWidth,
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
// Extra paragraph spacing if enabled
if (extraParagraphSpacing) {
currentPageNextY += lineHeight / 2;
}
}