#include "ChapterHtmlSlimParser.h"

// NOTE(review): confirm the four library header names below against the project's
// dependencies; they are inferred from the symbols this file uses (renderer, Serial,
// SdMan/FsFile/SDLock, XML_*).
#include <GfxRenderer.h>
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <expat.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <string>

#include "../Page.h"

// Tag tables used by matches() to classify element names.
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);

// Minimum file size (in bytes) to show progress bar - smaller chapters don't benefit from it
constexpr size_t MIN_SIZE_FOR_PROGRESS = 50 * 1024;  // 50KB

const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);

const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);

const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);

const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);

const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);

namespace {

// Number of bytes requested from the file (and from Expat's buffer) per parse iteration.
constexpr int PARSE_CHUNK_SIZE = 1024;

// Picks the font style for text at nesting level `depth`.
// A style is active while its "until depth" marker is strictly below `depth`.
EpdFontFamily::Style resolveFontStyle(const int boldUntilDepth, const int italicUntilDepth, const int depth) {
  const bool bold = boldUntilDepth < depth;
  const bool italic = italicUntilDepth < depth;

  if (bold && italic) {
    return EpdFontFamily::BOLD_ITALIC;
  }
  if (bold) {
    return EpdFontFamily::BOLD;
  }
  if (italic) {
    return EpdFontFamily::ITALIC;
  }
  return EpdFontFamily::REGULAR;
}

}  // namespace

// Returns true for the four ASCII whitespace characters that separate words: space, CR, LF, tab.
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }

// Returns true when tag_name is exactly equal (strcmp) to one of the first
// possible_tag_count entries of possible_tags.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  for (int i = 0; i < possible_tag_count; i++) {
    if (strcmp(tag_name, possible_tags[i]) == 0) {
      return true;
    }
  }
  return false;
}

// Starts a new text block with the given style.
// If the current block exists and is empty it is restyled and reused; otherwise the current
// block (if any) is laid out into pages via makePages() before being replaced.
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
  if (currentTextBlock) {
    // already have a text block running and it is empty - just reuse it
    if (currentTextBlock->isEmpty()) {
      currentTextBlock->setStyle(style);
      return;
    }

    makePages();
  }
  currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing));
}

// Expat start-element callback. userData is the ChapterHtmlSlimParser instance registered with
// XML_SetUserData. Every path through this function increments depth exactly once so that it
// stays balanced with the single decrement performed in endElement().
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  // Serial.printf("startElement: %s\n", name);
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip: only keep track of nesting
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Special handling for tables - show placeholder text instead of dropping silently
  if (strcmp(name, "table") == 0) {
    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    if (self->currentTextBlock) {
      self->currentTextBlock->addWord("[Table omitted]", EpdFontFamily::ITALIC);
    }

    // Skip table contents
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
    // TODO: Start processing image tags
    if (atts == nullptr) {
      // No attributes at all: skip for now
      self->skipUntilDepth = self->depth;
      self->depth += 1;
      return;
    }

    // Attributes arrive as a null-terminated list of name/value pairs.
    std::string alt;
    for (int i = 0; atts[i]; i += 2) {
      if (strcmp(atts[i], "alt") == 0) {
        alt = "[Image: " + std::string(atts[i + 1]) + "]";
      }
    }
    Serial.printf("[%lu] [EHP] Image alt: %s\n", millis(), alt.c_str());

    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
    self->depth += 1;
    self->characterData(userData, alt.c_str(), static_cast<int>(alt.length()));

    // characterData() leaves the last word in partWordBuffer, and endElement() does not flush on
    // an image tag, so emit it here to keep it from merging with whatever text follows.
    if (self->partWordBufferIndex > 0) {
      self->partWordBuffer[self->partWordBufferIndex] = '\0';
      self->currentTextBlock->addWord(
          self->partWordBuffer, resolveFontStyle(self->boldUntilDepth, self->italicUntilDepth, self->depth));
      self->partWordBufferIndex = 0;
    }

    // depth has already been incremented for this element; returning here prevents the
    // second increment at the end of the function that left depth permanently off by one.
    return;
  }

  if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    // start skip
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  // Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      const bool isRolePageBreak = (strcmp(atts[i], "role") == 0) && (strcmp(atts[i + 1], "doc-pagebreak") == 0);
      const bool isEpubPageBreak = (strcmp(atts[i], "epub:type") == 0) && (strcmp(atts[i + 1], "pagebreak") == 0);
      if (isRolePageBreak || isEpubPageBreak) {
        self->skipUntilDepth = self->depth;
        self->depth += 1;
        return;
      }
    }
  }

  if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
  } else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    if (strcmp(name, "br") == 0) {
      // A line break continues with the style of the block it interrupts
      self->startNewTextBlock(self->currentTextBlock->getStyle());
    } else {
      self->startNewTextBlock(static_cast<TextBlock::Style>(self->paragraphAlignment));
      if (strcmp(name, "li") == 0) {
        // UTF-8 encoded bullet (U+2022) in front of list items
        self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
      }
    }
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
  }

  self->depth += 1;
}

// Expat character-data callback. Splits the len bytes at s into whitespace-separated words and
// appends them to the current text block using the style active at the current depth. A word
// still in progress when the buffer ends stays in partWordBuffer for the next call.
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  const EpdFontFamily::Style fontStyle = resolveFontStyle(self->boldUntilDepth, self->italicUntilDepth, self->depth);

  // Soft hyphen (U+00AD) in UTF-8 = 0xC2 0xAD
  const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
  const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
  // Zero Width No-Break Space / BOM (U+FEFF) in UTF-8 = 0xEF 0xBB 0xBF
  const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
  const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
  const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);

  for (int i = 0; i < len; i++) {
    if (isWhitespace(s[i])) {
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->partWordBuffer[self->partWordBufferIndex] = '\0';
        self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
        self->partWordBufferIndex = 0;
      }
      // Skip the whitespace char
      continue;
    }

    // Drop soft hyphens. The i + 1 < len check prevents reading past the end of the buffer.
    if (s[i] == SHY_BYTE_1 && (i + 1 < len) && s[i + 1] == SHY_BYTE_2) {
      i++;  // also consume the 0xAD byte
      continue;
    }

    // Drop U+FEFF. The i + 2 < len check prevents reading past the end of the buffer.
    if (s[i] == FEFF_BYTE_1 && (i + 2 < len) && s[i + 1] == FEFF_BYTE_2 && s[i + 2] == FEFF_BYTE_3) {
      i += 2;  // also consume the two trailing bytes
      continue;
    }

    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      self->partWordBuffer[self->partWordBufferIndex] = '\0';
      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }

    self->partWordBuffer[self->partWordBufferIndex++] = s[i];
  }

  // If we have > 750 words buffered up, perform the layout and consume out all but the last line
  // There should be enough here to build out 1-2 full pages and doing this will free up a lot of
  // memory.
  // Spotted when reading Intermezzo, there are some really long text blocks in there.
  if (self->currentTextBlock->size() > 750) {
    Serial.printf("[%lu] [EHP] Text block too long, splitting into multiple pages\n", millis());
    self->currentTextBlock->layoutAndExtractLines(
        self->renderer, self->fontId, self->viewportWidth,
        [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
  }
}

// Expat end-element callback. Flushes a pending partial word when the closing tag warrants it,
// then decrements depth and clears any skip/bold/italic marker that was set at the new depth.
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  if (self->partWordBufferIndex > 0) {
    // Only flush the pending word when closing a block or header tag, or when depth is 1 before
    // the decrement below. Closing other inline tags must not split a word.
    // Bold and italic tags currently flush as well even though they are inline; text styling
    // needs to be overhauled to fix that.
    const bool shouldBreakText = matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) ||
                                 matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
                                 matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
                                 matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;

    if (shouldBreakText) {
      // Style is resolved before depth changes so the word keeps the style of the closing element
      const EpdFontFamily::Style fontStyle =
          resolveFontStyle(self->boldUntilDepth, self->italicUntilDepth, self->depth);

      self->partWordBuffer[self->partWordBufferIndex] = '\0';
      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }
  }

  self->depth -= 1;

  // Leaving skip
  if (self->skipUntilDepth == self->depth) {
    self->skipUntilDepth = INT_MAX;
  }

  // Leaving bold
  if (self->boldUntilDepth == self->depth) {
    self->boldUntilDepth = INT_MAX;
  }

  // Leaving italic
  if (self->italicUntilDepth == self->depth) {
    self->italicUntilDepth = INT_MAX;
  }
}

// Streams the file at filepath through Expat in PARSE_CHUNK_SIZE-byte chunks, building pages via
// the element/character callbacks above and handing each finished page to completePageFn.
// Reports progress through progressFn (when set) for files of at least MIN_SIZE_FOR_PROGRESS bytes.
// Returns false if the parser or its buffer cannot be allocated, the file cannot be opened or
// read, or Expat reports a parse error; returns true otherwise.
bool ChapterHtmlSlimParser::parseAndBuildPages() {
  SDLock lock;
  Serial.printf("[%lu] [EHP] parseAndBuildPages start. Heap: %u\n", millis(), ESP.getFreeHeap());

  Serial.printf("[%lu] [EHP] Calling startNewTextBlock\n", millis());
  startNewTextBlock(static_cast<TextBlock::Style>(this->paragraphAlignment));
  Serial.printf("[%lu] [EHP] startNewTextBlock returned\n", millis());

  Serial.printf("[%lu] [EHP] Creating XML parser\n", millis());
  const XML_Parser parser = XML_ParserCreate(nullptr);
  if (!parser) {
    Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
    return false;
  }
  Serial.printf("[%lu] [EHP] Parser created\n", millis());

  FsFile file;
  if (!SdMan.openFileForRead("EHP", filepath, file)) {
    XML_ParserFree(parser);
    return false;
  }

  // Shared teardown for every exit path once both the parser and the file are open.
  const auto releaseParserAndFile = [&parser, &file]() {
    XML_StopParser(parser, XML_FALSE);                // Stop any pending processing
    XML_SetElementHandler(parser, nullptr, nullptr);  // Clear callbacks
    XML_SetCharacterDataHandler(parser, nullptr);
    XML_ParserFree(parser);
    file.close();
  };

  // Get file size for progress calculation
  const size_t totalSize = file.size();
  size_t bytesRead = 0;
  int lastProgress = -1;

  XML_SetUserData(parser, this);
  XML_SetElementHandler(parser, startElement, endElement);
  XML_SetCharacterDataHandler(parser, characterData);

  int done = 0;
  do {
    void* const buf = XML_GetBuffer(parser, PARSE_CHUNK_SIZE);
    if (!buf) {
      Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
      releaseParserAndFile();
      return false;
    }

    // Keep the result signed: a negative error value must not be turned into a huge length.
    const int len = file.read(buf, PARSE_CHUNK_SIZE);
    // Serial.printf("[%lu] [EHP] Read %d bytes\n", millis(), len);

    if (len < 0 || (len == 0 && file.available() > 0)) {
      Serial.printf("[%lu] [EHP] File read error\n", millis());
      releaseParserAndFile();
      return false;
    }

    // Update progress (call every 10% change to avoid too frequent updates)
    // Only show progress for larger chapters where rendering overhead is worth it
    bytesRead += static_cast<size_t>(len);
    if (progressFn && totalSize >= MIN_SIZE_FOR_PROGRESS) {
      const int progress = static_cast<int>((bytesRead * 100) / totalSize);
      if (lastProgress / 10 != progress / 10) {
        lastProgress = progress;
        progressFn(progress);
      }
    }

    done = file.available() == 0;

    if (XML_ParseBuffer(parser, len, done) == XML_STATUS_ERROR) {
      Serial.printf("[%lu] [EHP] XML_ParseBuffer returned error\n", millis());
      Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
                    XML_ErrorString(XML_GetErrorCode(parser)));
      releaseParserAndFile();
      return false;
    }
    vTaskDelay(1);
  } while (!done);

  releaseParserAndFile();

  // Process last page if there is still text
  if (currentTextBlock) {
    makePages();
    completePageFn(std::move(currentPage));
    currentPage.reset();
    currentTextBlock.reset();
  }

  return true;
}

// Appends one laid-out line to the current page at currentPageNextY, first handing the page to
// completePageFn and starting a fresh one when the line would not fit in viewportHeight.
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
  const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;

  // characterData() can call this (long text block split) before makePages() has ever created
  // a page, so create one lazily instead of dereferencing a null currentPage.
  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  if (currentPageNextY + lineHeight > viewportHeight) {
    completePageFn(std::move(currentPage));
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  currentPage->elements.push_back(std::make_shared<PageLine>(line, 0, currentPageNextY));
  currentPageNextY += lineHeight;
}

// Lays out the current text block and adds its lines to pages via addLineToPage(), creating the
// first page when none exists. Adds half a line of vertical space afterwards when
// extraParagraphSpacing is enabled. Logs and returns if there is no current text block.
void ChapterHtmlSlimParser::makePages() {
  if (!currentTextBlock) {
    Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
    return;
  }

  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
  currentTextBlock->layoutAndExtractLines(
      renderer, fontId, viewportWidth,
      [this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });

  // Extra paragraph spacing if enabled
  if (extraParagraphSpacing) {
    currentPageNextY += lineHeight / 2;
  }
}