diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 5dd83130..a93cca63 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -55,35 +55,80 @@ std::string replaceHtmlEntities(const char* text) { if (!text) return ""; std::string s(text); - // Replace common entities size_t pos = 0; - while ((pos = s.find("&lt;", pos)) != std::string::npos) { - s.replace(pos, 4, "<"); - pos += 1; - } - pos = 0; - while ((pos = s.find("&gt;", pos)) != std::string::npos) { - s.replace(pos, 4, ">"); - pos += 1; - } - pos = 0; - while ((pos = s.find("&amp;", pos)) != std::string::npos) { - s.replace(pos, 5, "&"); - pos += 1; - } - pos = 0; - while ((pos = s.find("&quot;", pos)) != std::string::npos) { - s.replace(pos, 6, "\""); - pos += 1; - } - pos = 0; - while ((pos = s.find("&apos;", pos)) != std::string::npos) { - s.replace(pos, 6, "'"); - pos += 1; + while (pos < s.length()) { + if (s[pos] == '&') { + bool replaced = false; + const char* ptr = s.c_str() + pos; // Get pointer to current position (no allocation) + + if (pos + 1 < s.length()) { + switch (s[pos + 1]) { + case 'l': // &lt; + if (pos + 3 < s.length() && strncmp(ptr, "&lt;", 4) == 0) { + s.replace(pos, 4, "<"); + replaced = true; + } + break; + + case 'g': // &gt; + if (pos + 3 < s.length() && strncmp(ptr, "&gt;", 4) == 0) { + s.replace(pos, 4, ">"); + replaced = true; + } + break; + + case 'a': // &amp; or &apos; + if (pos + 4 < s.length() && strncmp(ptr, "&amp;", 5) == 0) { + s.replace(pos, 5, "&"); + replaced = true; + } else if (pos + 5 < s.length() && strncmp(ptr, "&apos;", 6) == 0) { + s.replace(pos, 6, "'"); + replaced = true; + } + break; + + case 'q': // &quot; + if (pos + 5 < s.length() && strncmp(ptr, "&quot;", 6) == 0) { + s.replace(pos, 6, "\""); + replaced = true; + } + break; + } + } + + // Don't increment pos if we replaced - allows nested entity handling + // Example: &amp;lt; -> &lt; (iteration 1) -> < (iteration 2) + if (!replaced) { + pos++; 
+ } + } else { + pos++; + } } + return s; } +// Check if href points to internal EPUB location (not external URL) +bool isInternalEpubLink(const char* href) { + if (!href) return false; + + switch (href[0]) { + case 'h': // http/https + if (strncmp(href, "http", 4) == 0) return false; + case 'f': // ftp + if (strncmp(href, "ftp://", 6) == 0) return false; + case 'm': // mailto + if (strncmp(href, "mailto:", 7) == 0) return false; + case 't': // tel + if (strncmp(href, "tel:", 4) == 0) return false; + case 's': // sms + if (strncmp(href, "sms:", 4) == 0) return false; + } + // Everything else is internal (relative paths, anchors, etc.) + return true; +} + EpdFontFamily::Style ChapterHtmlSlimParser::getCurrentFontStyle() const { if (boldUntilDepth < depth && italicUntilDepth < depth) { return EpdFontFamily::BOLD_ITALIC; @@ -240,130 +285,57 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* // Pass 2: Skip the aside (we already have it from Pass 1) Serial.printf("[%lu] [ASIDE] Skipping aside in Pass 2: id=%s\n", millis(), id); - // Find the inline footnote text - for (int i = 0; i < self->inlineFootnoteCount; i++) { - if (strcmp(self->inlineFootnotes[i].id, id) == 0 && self->inlineFootnotes[i].text) { - // Output the footnote text as normal text - const char* text = self->inlineFootnotes[i].text; - int textLen = strlen(text); - - // Process it through characterData - self->characterData(self, text, textLen); - - Serial.printf("[%lu] [ASIDE] Rendered aside text: %.80s...\n", millis(), text); - break; - } - } - - // Skip the aside element itself self->skipUntilDepth = self->depth; } - - self->depth += 1; - return; } - } - // ============================================================================ - // PASS 1: Skip everything else - // ============================================================================ - if (self->isPass1CollectingAsides) { self->depth += 1; return; } // 
============================================================================ - // PASS 2: Skip

(we already have them from Pass 1) + // PASS 2: FOOTNOTE DETECTION + // All tags with internal hrefs are treated as footnotes // ============================================================================ - if (strcmp(name, "p") == 0) { - const char* classAttr = getAttribute(atts, "class"); - - if (classAttr && (strcmp(classAttr, "note") == 0 || strstr(classAttr, "note"))) { - Serial.printf("[%lu] [PNOTE] Skipping paragraph note in Pass 2\n", millis()); - self->skipUntilDepth = self->depth; - self->depth += 1; - return; - } - } - - // ============================================================================ - // PASS 2: Normal parsing - // ============================================================================ - - // Middle of skip - if (self->skipUntilDepth < self->depth) { - self->depth += 1; - return; - } - - // Rest of startElement logic for pass 2... - if (strcmp(name, "sup") == 0) { - self->supDepth = self->depth; - - // Case A: Found inside a normal (which wasn't marked as a note yet) - // Example: * - if (self->anchorDepth != -1 && !self->insideNoteref) { - Serial.printf("[%lu] [NOTEREF] Found inside , promoting to noteref\n", millis()); - - // Flush the current word buffer (text before the sup is normal text) - if (self->partWordBufferIndex > 0) { - self->flushPartWordBuffer(); - } - - // Activate footnote mode - self->insideNoteref = true; - self->currentNoterefTextLen = 0; - self->currentNoterefText[0] = '\0'; - // Note: The href was already saved to currentNoterefHref when the was opened (see below) - } - } - - // === Update the existing A block === - if (strcmp(name, "a") == 0) { - const char* epubType = getAttribute(atts, "epub:type"); + if (!self->isPass1CollectingAsides && strcmp(name, "a") == 0) { const char* href = getAttribute(atts, "href"); - // Save Anchor state - self->anchorDepth = self->depth; + // Flush pending word buffer before starting footnote + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + // Check 
for internal EPUB link + bool isInternalLink = isInternalEpubLink(href); - // Optimistically save the href, in case this becomes a footnote later (via internal ) - if (!self->insideNoteref) { - if (href) { - strncpy(self->currentNoterefHref, href, 127); - self->currentNoterefHref[127] = '\0'; - } else { - self->currentNoterefHref[0] = '\0'; - } + // Special case: javascript:void(0) links with data attributes + // Example: + if (href && strncmp(href, "javascript:", 11) == 0) { + isInternalLink = false; + + // TODO: Parse data-* attributes to extract actual href } - // Footnote detection: via epub:type, rnote pattern, or if we are already inside a - // Case B: Found inside - // Example: 1 - bool isNoteref = (epubType && strcmp(epubType, "noteref") == 0); + // If it's an internal link, treat it as a footnote + if (isInternalLink && href) { + Serial.printf("[%lu] [FOOTNOTE] Found internal link (footnote candidate): href=%s\n", millis(), href); - if (!isNoteref && href && href[0] == '#' && strncmp(href + 1, "rnote", 5) == 0) { - isNoteref = true; + self->insideFootnoteLink = true; + self->footnoteLinkDepth = self->depth; + self->currentFootnoteLinkHref[0] = '\0'; + strncpy(self->currentFootnoteLinkHref, href, 63); + self->currentFootnoteLinkHref[63] = '\0'; + + self->currentFootnoteLinkText[0] = '\0'; + self->currentFootnoteLinkTextLen = 0; } - // New detection: if we are inside SUP, this link is a footnote - if (!isNoteref && self->supDepth != -1) { - isNoteref = true; - Serial.printf("[%lu] [NOTEREF] Found inside , treating as noteref\n", millis()); - } - - if (isNoteref) { - Serial.printf("[%lu] [NOTEREF] Found noteref: href=%s\n", millis(), href ? 
href : "null"); - // Flush word buffer - if (self->partWordBufferIndex > 0) { - self->flushPartWordBuffer(); - } - self->insideNoteref = true; - self->currentNoterefTextLen = 0; - self->currentNoterefText[0] = '\0'; - self->depth += 1; - return; - } + self->depth += 1; + return; } + // ============================================================================ + // Handle other tags + // ============================================================================ // Special handling for tables - show placeholder text instead of dropping silently if (strcmp(name, "table") == 0) { @@ -533,13 +505,13 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char } // Rest of characterData logic for pass 2... - if (self->insideNoteref) { + if (self->insideFootnoteLink) { for (int i = 0; i < len; i++) { unsigned char c = (unsigned char)s[i]; // Skip whitespace and brackets [] - if (!isWhitespace(c) && c != '[' && c != ']' && self->currentNoterefTextLen < 15) { - self->currentNoterefText[self->currentNoterefTextLen++] = c; - self->currentNoterefText[self->currentNoterefTextLen] = '\0'; + if (!isWhitespace(c) && c != '[' && c != ']' && self->currentFootnoteLinkTextLen < 63) { + self->currentFootnoteLinkText[self->currentFootnoteLinkTextLen++] = c; + self->currentFootnoteLinkText[self->currentFootnoteLinkTextLen] = '\0'; } } return; @@ -600,144 +572,95 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) { auto* self = static_cast<ChapterHtmlSlimParser*>(userData); - // Closing paragraph note in Pass 1 - if (strcmp(name, "p") == 0 && self->insideParagraphNote && self->depth - 1 == self->paragraphNoteDepth) { - if (self->isPass1CollectingAsides && self->currentParagraphNoteTextLen > 0 && self->paragraphNoteCount < 32 && - self->currentParagraphNoteId[0] != '\0') { - // Copy ID - strncpy(self->paragraphNotes[self->paragraphNoteCount].id, 
self->currentParagraphNoteId, 15); - self->paragraphNotes[self->paragraphNoteCount].id[15] = '\0'; + // ============================================================================ + // PASS 1: End of