diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 5dd83130..a93cca63 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -55,35 +55,80 @@ std::string replaceHtmlEntities(const char* text) { if (!text) return ""; std::string s(text); - // Replace common entities size_t pos = 0; - while ((pos = s.find("<", pos)) != std::string::npos) { - s.replace(pos, 4, "<"); - pos += 1; - } - pos = 0; - while ((pos = s.find(">", pos)) != std::string::npos) { - s.replace(pos, 4, ">"); - pos += 1; - } - pos = 0; - while ((pos = s.find("&", pos)) != std::string::npos) { - s.replace(pos, 5, "&"); - pos += 1; - } - pos = 0; - while ((pos = s.find(""", pos)) != std::string::npos) { - s.replace(pos, 6, "\""); - pos += 1; - } - pos = 0; - while ((pos = s.find("'", pos)) != std::string::npos) { - s.replace(pos, 6, "'"); - pos += 1; + while (pos < s.length()) { + if (s[pos] == '&') { + bool replaced = false; + const char* ptr = s.c_str() + pos; // Get pointer to current position (no allocation) + + if (pos + 1 < s.length()) { + switch (s[pos + 1]) { + case 'l': // < + if (pos + 3 < s.length() && strncmp(ptr, "<", 4) == 0) { + s.replace(pos, 4, "<"); + replaced = true; + } + break; + + case 'g': // > + if (pos + 3 < s.length() && strncmp(ptr, ">", 4) == 0) { + s.replace(pos, 4, ">"); + replaced = true; + } + break; + + case 'a': // & or ' + if (pos + 4 < s.length() && strncmp(ptr, "&", 5) == 0) { + s.replace(pos, 5, "&"); + replaced = true; + } else if (pos + 5 < s.length() && strncmp(ptr, "'", 6) == 0) { + s.replace(pos, 6, "'"); + replaced = true; + } + break; + + case 'q': // " + if (pos + 5 < s.length() && strncmp(ptr, """, 6) == 0) { + s.replace(pos, 6, "\""); + replaced = true; + } + break; + } + } + + // Don't increment pos if we replaced - allows nested entity handling + // Example: < -> < (iteration 1) -> < (iteration 2) + if (!replaced) { + pos++; + } 
+ } else { + pos++; + } } + return s; } +// Check if href points to internal EPUB location (not external URL) +bool isInternalEpubLink(const char* href) { + if (!href) return false; + + switch (href[0]) { + case 'h': // http/https + if (strncmp(href, "http", 4) == 0) return false; + case 'f': // ftp + if (strncmp(href, "ftp://", 6) == 0) return false; + case 'm': // mailto + if (strncmp(href, "mailto:", 7) == 0) return false; + case 't': // tel + if (strncmp(href, "tel:", 4) == 0) return false; + case 's': // sms + if (strncmp(href, "sms:", 4) == 0) return false; + } + // Everything else is internal (relative paths, anchors, etc.) + return true; +} + EpdFontFamily::Style ChapterHtmlSlimParser::getCurrentFontStyle() const { if (boldUntilDepth < depth && italicUntilDepth < depth) { return EpdFontFamily::BOLD_ITALIC; @@ -240,130 +285,57 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* // Pass 2: Skip the aside (we already have it from Pass 1) Serial.printf("[%lu] [ASIDE] Skipping aside in Pass 2: id=%s\n", millis(), id); - // Find the inline footnote text - for (int i = 0; i < self->inlineFootnoteCount; i++) { - if (strcmp(self->inlineFootnotes[i].id, id) == 0 && self->inlineFootnotes[i].text) { - // Output the footnote text as normal text - const char* text = self->inlineFootnotes[i].text; - int textLen = strlen(text); - - // Process it through characterData - self->characterData(self, text, textLen); - - Serial.printf("[%lu] [ASIDE] Rendered aside text: %.80s...\n", millis(), text); - break; - } - } - - // Skip the aside element itself self->skipUntilDepth = self->depth; } - - self->depth += 1; - return; } - } - // ============================================================================ - // PASS 1: Skip everything else - // ============================================================================ - if (self->isPass1CollectingAsides) { self->depth += 1; return; } // 
============================================================================ - // PASS 2: Skip
(we already have them from Pass 1)
+ // PASS 2: FOOTNOTE DETECTION
+ // All <a> tags with internal hrefs are treated as footnotes
// ============================================================================
- if (strcmp(name, "p") == 0) {
- const char* classAttr = getAttribute(atts, "class");
-
- if (classAttr && (strcmp(classAttr, "note") == 0 || strstr(classAttr, "note"))) {
- Serial.printf("[%lu] [PNOTE] Skipping paragraph note in Pass 2\n", millis());
- self->skipUntilDepth = self->depth;
- self->depth += 1;
- return;
- }
- }
-
- // ============================================================================
- // PASS 2: Normal parsing
- // ============================================================================
-
- // Middle of skip
- if (self->skipUntilDepth < self->depth) {
- self->depth += 1;
- return;
- }
-
- // Rest of startElement logic for pass 2...
- if (strcmp(name, "sup") == 0) {
- self->supDepth = self->depth;
-
- // Case A: Found inside a normal (which wasn't marked as a note yet)
- // Example: *
- if (self->anchorDepth != -1 && !self->insideNoteref) {
- Serial.printf("[%lu] [NOTEREF] Found inside , promoting to noteref\n", millis());
-
- // Flush the current word buffer (text before the sup is normal text)
- if (self->partWordBufferIndex > 0) {
- self->flushPartWordBuffer();
- }
-
- // Activate footnote mode
- self->insideNoteref = true;
- self->currentNoterefTextLen = 0;
- self->currentNoterefText[0] = '\0';
- // Note: The href was already saved to currentNoterefHref when the was opened (see below)
- }
- }
-
- // === Update the existing A block ===
- if (strcmp(name, "a") == 0) {
- const char* epubType = getAttribute(atts, "epub:type");
+ if (!self->isPass1CollectingAsides && strcmp(name, "a") == 0) {
const char* href = getAttribute(atts, "href");
- // Save Anchor state
- self->anchorDepth = self->depth;
+ // Flush pending word buffer before starting footnote
+ if (self->partWordBufferIndex > 0) {
+ self->flushPartWordBuffer();
+ }
+ // Check for internal EPUB link
+ bool isInternalLink = isInternalEpubLink(href);
- // Optimistically save the href, in case this becomes a footnote later (via internal )
- if (!self->insideNoteref) {
- if (href) {
- strncpy(self->currentNoterefHref, href, 127);
- self->currentNoterefHref[127] = '\0';
- } else {
- self->currentNoterefHref[0] = '\0';
- }
+ // Special case: javascript:void(0) links with data attributes
+ // Example: <a href="javascript:void(0)" data-...="..."> (the real target is carried by a data-* attribute)
+ if (href && strncmp(href, "javascript:", 11) == 0) {
+ isInternalLink = false;
+
+ // TODO: Parse data-* attributes to extract actual href
}
- // Footnote detection: via epub:type, rnote pattern, or if we are already inside a
- // Case B: Found inside
- // Example: 1
- bool isNoteref = (epubType && strcmp(epubType, "noteref") == 0);
+ // If it's an internal link, treat it as a footnote
+ if (isInternalLink && href) {
+ Serial.printf("[%lu] [FOOTNOTE] Found internal link (footnote candidate): href=%s\n", millis(), href);
- if (!isNoteref && href && href[0] == '#' && strncmp(href + 1, "rnote", 5) == 0) {
- isNoteref = true;
+ self->insideFootnoteLink = true;
+ self->footnoteLinkDepth = self->depth;
+ self->currentFootnoteLinkHref[0] = '\0';
+ strncpy(self->currentFootnoteLinkHref, href, 63);
+ self->currentFootnoteLinkHref[63] = '\0';
+
+ self->currentFootnoteLinkText[0] = '\0';
+ self->currentFootnoteLinkTextLen = 0;
}
- // New detection: if we are inside SUP, this link is a footnote
- if (!isNoteref && self->supDepth != -1) {
- isNoteref = true;
- Serial.printf("[%lu] [NOTEREF] Found inside , treating as noteref\n", millis());
- }
-
- if (isNoteref) {
- Serial.printf("[%lu] [NOTEREF] Found noteref: href=%s\n", millis(), href ? href : "null");
- // Flush word buffer
- if (self->partWordBufferIndex > 0) {
- self->flushPartWordBuffer();
- }
- self->insideNoteref = true;
- self->currentNoterefTextLen = 0;
- self->currentNoterefText[0] = '\0';
- self->depth += 1;
- return;
- }
+ self->depth += 1;
+ return;
}
+ // ============================================================================
+ // Handle other tags
+ // ============================================================================
// Special handling for tables - show placeholder text instead of dropping silently
if (strcmp(name, "table") == 0) {
@@ -533,13 +505,13 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
}
// Rest of characterData logic for pass 2...
- if (self->insideNoteref) {
+ if (self->insideFootnoteLink) {
for (int i = 0; i < len; i++) {
unsigned char c = (unsigned char)s[i];
// Skip whitespace and brackets []
- if (!isWhitespace(c) && c != '[' && c != ']' && self->currentNoterefTextLen < 15) {
- self->currentNoterefText[self->currentNoterefTextLen++] = c;
- self->currentNoterefText[self->currentNoterefTextLen] = '\0';
+ if (!isWhitespace(c) && c != '[' && c != ']' && self->currentFootnoteLinkTextLen < 63) {
+ self->currentFootnoteLinkText[self->currentFootnoteLinkTextLen++] = c;
+ self->currentFootnoteLinkText[self->currentFootnoteLinkTextLen] = '\0';
}
}
return;
@@ -600,144 +572,95 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast