From b2020f5512f5579a67af33da3e3648e85acf74fc Mon Sep 17 00:00:00 2001 From: Dave Allie Date: Fri, 19 Dec 2025 01:11:03 +1100 Subject: [PATCH] Skip pagebreak blocks when parsing epub file (#58) ## Summary * Skip pagebreak blocks when parsing epub file * These blocks break the flow and often contain the page number in them, which should not interrupt the flow of the content - Attributes sourced from: - https://www.w3.org/TR/epub-ssv-11/#pagebreak - https://www.w3.org/TR/dpub-aria-1.1/#doc-pagebreak --- lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index ea15e1a..d4edc33 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -75,6 +75,18 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* return; } + // Skip blocks with role="doc-pagebreak" or epub:type="pagebreak" + if (atts != nullptr) { + for (int i = 0; atts[i]; i += 2) { + if (strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0 || + strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0) { + self->skipUntilDepth = self->depth; + self->depth += 1; + return; + } + } + } + if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) { self->startNewTextBlock(TextBlock::CENTER_ALIGN); self->boldUntilDepth = min(self->boldUntilDepth, self->depth);