From 9f78fd33e851257288254d36327bae6d425f6e66 Mon Sep 17 00:00:00 2001 From: Jake Kenneally Date: Fri, 6 Feb 2026 03:10:37 -0500 Subject: [PATCH] fix: Remove separations after style changes (#720) Closes #182. Closes #710. Closes #711. ## Summary **What is the goal of this PR?** - A longer-term, more robust fix for the issue with spurious spaces appearing after style changes. Replaces solution from #694. **What changes are included?** - Add continuation flags to determine if to add a space after a word or if the word connects to the previous word. Replaces simple solution that only considered ending punctuation. - Fixed an issue with greedy line-breaking algorithm where punctuation could appear on the next line, separated from the word, if there was a style change between the word and punctuation --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**YES**_, Claude Code --- lib/Epub/Epub/ParsedText.cpp | 132 +++++++++--------- lib/Epub/Epub/ParsedText.h | 13 +- .../Epub/parsers/ChapterHtmlSlimParser.cpp | 32 ++++- lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h | 1 + 4 files changed, 108 insertions(+), 70 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 3af85a1b..82ddaecd 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -19,38 +19,6 @@ namespace { constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD"; constexpr size_t SOFT_HYPHEN_BYTES = 2; -// Known attaching punctuation (including UTF-8 sequences) -const std::vector punctuation = { - ".", - ",", - "!", - "?", - ";", - ":", - "\"", - "'", - "\xE2\x80\x99", // ’ (U+2019 right single quote) - "\xE2\x80\x9D" // ” (U+201D right double quote) -}; - -bool isAttachingPunctuationWord(const std::string& word) { - if (word.empty()) return false; - - size_t pos = 0; - while (pos < word.size()) { - bool matched = false; - for (const auto& p : punctuation) { - if (word.compare(pos, p.size(), p) == 0) { - pos += p.size(); - matched = true; - break; - } - } - if (!matched) return false; - } - return true; -} - bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; } // Removes every soft hyphen in-place so rendered glyphs match measured widths. @@ -81,15 +49,17 @@ uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const s } // namespace -void ParsedText::addWord(std::string word, const EpdFontFamily::Style style, const bool underline) { +void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline, + const bool attachToPrevious) { if (word.empty()) return; words.push_back(std::move(word)); - EpdFontFamily::Style combinedStyle = style; + EpdFontFamily::Style combinedStyle = fontStyle; if (underline) { combinedStyle = static_cast(combinedStyle | EpdFontFamily::UNDERLINE); } wordStyles.push_back(combinedStyle); + wordContinues.push_back(attachToPrevious); } // Consumes data to minimize memory usage @@ -106,17 +76,21 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo const int pageWidth = viewportWidth; const int spaceWidth = renderer.getSpaceWidth(fontId); auto wordWidths = calculateWordWidths(renderer, fontId); + + // Build indexed continues vector from the parallel list for O(1) access during layout + std::vector continuesVec(wordContinues.begin(), wordContinues.end()); + std::vector lineBreakIndices; if (hyphenationEnabled) { // Use greedy layout that can split words mid-loop when a hyphenated prefix fits. - lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); + lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); } else { - lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); + lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec); } const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1; for (size_t i = 0; i < lineCount; ++i) { - extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine); + extractLine(i, pageWidth, spaceWidth, wordWidths, continuesVec, lineBreakIndices, processLine); } } @@ -140,7 +114,8 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere } std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, - const int spaceWidth, std::vector& wordWidths) { + const int spaceWidth, std::vector& wordWidths, + std::vector& continuesVec) { if (words.empty()) { return {}; } @@ -157,7 +132,8 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c // First word needs to fit in reduced width if there's an indent const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; while (wordWidths[i] > effectiveWidth) { - if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) { + if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true, + &continuesVec)) { break; } } @@ -175,20 +151,26 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c ans[totalWordCount - 1] = totalWordCount - 1; for (int i = totalWordCount - 2; i >= 0; --i) { - int currlen = -spaceWidth; + int currlen = 0; dp[i] = MAX_COST; // First line has reduced width due to text-indent const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; for (size_t j = i; j < totalWordCount; ++j) { - // Current line length: previous width + space + current word width - currlen += wordWidths[j] + spaceWidth; + // Add space before word j, unless it's the first word on the line or a continuation + const int gap = j > static_cast(i) && !continuesVec[j] ? spaceWidth : 0; + currlen += wordWidths[j] + gap; if (currlen > effectivePageWidth) { break; } + // Cannot break after word j if the next word attaches to it (continuation group) + if (j + 1 < totalWordCount && continuesVec[j + 1]) { + continue; + } + int cost; if (j == totalWordCount - 1) { cost = 0; // Last line @@ -260,7 +242,8 @@ void ParsedText::applyParagraphIndent() { // Builds break indices while opportunistically splitting the word that would overflow the current line. std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, const int spaceWidth, - std::vector& wordWidths) { + std::vector& wordWidths, + std::vector& continuesVec) { // Calculate first line indent (only for left/justified text without extra paragraph spacing) const int firstLineIndent = blockStyle.textIndent > 0 && !extraParagraphSpacing && @@ -282,7 +265,7 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r // Consume as many words as possible for current line, splitting when prefixes fit while (currentIndex < wordWidths.size()) { const bool isFirstWord = currentIndex == lineStart; - const int spacing = isFirstWord ? 0 : spaceWidth; + const int spacing = isFirstWord || continuesVec[currentIndex] ? 0 : spaceWidth; const int candidateWidth = spacing + wordWidths[currentIndex]; // Word fits on current line @@ -296,8 +279,8 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r const int availableWidth = effectivePageWidth - lineWidth - spacing; const bool allowFallbackBreaks = isFirstWord; // Only for first word on line - if (availableWidth > 0 && - hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) { + if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, + allowFallbackBreaks, &continuesVec)) { // Prefix now fits; append it to this line and move to next line lineWidth += spacing + wordWidths[currentIndex]; ++currentIndex; @@ -312,6 +295,12 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r break; } + // Don't break before a continuation word (e.g., orphaned "?" after "question"). + // Backtrack to the start of the continuation group so the whole group moves to the next line. + while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) { + --currentIndex; + } + lineBreakIndices.push_back(currentIndex); isFirstLine = false; } @@ -323,7 +312,7 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r // available width. bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer, const int fontId, std::vector& wordWidths, - const bool allowFallbackBreaks) { + const bool allowFallbackBreaks, std::vector* continuesVec) { // Guard against invalid indices or zero available width before attempting to split. if (availableWidth <= 0 || wordIndex >= words.size()) { return false; @@ -378,12 +367,28 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl wordIt->push_back('-'); } - // Insert the remainder word (with matching style) directly after the prefix. + // Insert the remainder word (with matching style and continuation flag) directly after the prefix. auto insertWordIt = std::next(wordIt); auto insertStyleIt = std::next(styleIt); words.insert(insertWordIt, remainder); wordStyles.insert(insertStyleIt, style); + // The remainder inherits whatever continuation status the original word had with the word after it. + // Find the continues entry for the original word and insert the remainder's entry after it. + auto continuesIt = wordContinues.begin(); + std::advance(continuesIt, wordIndex); + const bool originalContinuedToNext = *continuesIt; + // The original word (now prefix) does NOT continue to remainder (hyphen separates them) + *continuesIt = false; + const auto insertContinuesIt = std::next(continuesIt); + wordContinues.insert(insertContinuesIt, originalContinuedToNext); + + // Keep the indexed vector in sync if provided + if (continuesVec) { + (*continuesVec)[wordIndex] = false; + continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext); + } + // Update cached widths to reflect the new prefix/remainder pairing. wordWidths[wordIndex] = static_cast(chosenWidth); const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style); @@ -392,7 +397,8 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl } void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth, - const std::vector& wordWidths, const std::vector& lineBreakIndices, + const std::vector& wordWidths, const std::vector& continuesVec, + const std::vector& lineBreakIndices, const std::function)>& processLine) { const size_t lineBreak = lineBreakIndices[breakIndex]; const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0; @@ -407,19 +413,16 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const : 0; // Calculate total word width for this line and count actual word gaps - // (punctuation that attaches to previous word doesn't count as a gap) - // Note: words list starts at the beginning because previous lines were spliced out + // (continuation words attach to previous word with no gap) int lineWordWidthSum = 0; size_t actualGapCount = 0; - auto countWordIt = words.begin(); for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { lineWordWidthSum += wordWidths[lastBreakAt + wordIdx]; - // Count gaps: each word after the first creates a gap, unless it's attaching punctuation - if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) { + // Count gaps: each word after the first creates a gap, unless it's a continuation + if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) { actualGapCount++; } - ++countWordIt; } // Calculate spacing (account for indent reducing effective page width on first line) @@ -443,30 +446,27 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const } // Pre-calculate X positions for words - // Punctuation that attaches to the previous word doesn't get space before it - // Note: words list starts at the beginning because previous lines were spliced out + // Continuation words attach to the previous word with no space before them std::list lineXPos; - auto wordIt = words.begin(); for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx]; lineXPos.push_back(xpos); - // Add spacing after this word, unless the next word is attaching punctuation - auto nextWordIt = wordIt; - ++nextWordIt; - const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt); + // Add spacing after this word, unless the next word is a continuation + const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1]; - xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing); - ++wordIt; + xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing); } // Iterators always start at the beginning as we are moving content with splice below auto wordEndIt = words.begin(); auto wordStyleEndIt = wordStyles.begin(); + auto wordContinuesEndIt = wordContinues.begin(); std::advance(wordEndIt, lineWordCount); std::advance(wordStyleEndIt, lineWordCount); + std::advance(wordContinuesEndIt, lineWordCount); // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** std::list lineWords; @@ -474,6 +474,10 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const std::list lineWordStyles; lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt); + // Consume continues flags (not passed to TextBlock, but must be consumed to stay in sync) + std::list lineContinues; + lineContinues.splice(lineContinues.begin(), wordContinues, wordContinues.begin(), wordContinuesEndIt); + for (auto& word : lineWords) { if (containsSoftHyphen(word)) { stripSoftHyphensInPlace(word); diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index a13d13b5..8dd040cb 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -16,19 +16,22 @@ class GfxRenderer; class ParsedText { std::list words; std::list wordStyles; + std::list wordContinues; // true = word attaches to previous (no space before it) BlockStyle blockStyle; bool extraParagraphSpacing; bool hyphenationEnabled; void applyParagraphIndent(); std::vector computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, - std::vector& wordWidths); + std::vector& wordWidths, std::vector& continuesVec); std::vector computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, - int spaceWidth, std::vector& wordWidths); + int spaceWidth, std::vector& wordWidths, + std::vector& continuesVec); bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId, - std::vector& wordWidths, bool allowFallbackBreaks); + std::vector& wordWidths, bool allowFallbackBreaks, + std::vector* continuesVec = nullptr); void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector& wordWidths, - const std::vector& lineBreakIndices, + const std::vector& continuesVec, const std::vector& lineBreakIndices, const std::function)>& processLine); std::vector calculateWordWidths(const GfxRenderer& renderer, int fontId); @@ -38,7 +41,7 @@ class ParsedText { : blockStyle(blockStyle), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {} ~ParsedText() = default; - void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false); + void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false); void setBlockStyle(const BlockStyle& blockStyle) { this->blockStyle = blockStyle; } BlockStyle& getBlockStyle() { return blockStyle; } size_t size() const { return words.size(); } diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index cb5625c1..043cfca7 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -90,12 +90,14 @@ void ChapterHtmlSlimParser::flushPartWordBuffer() { // flush the buffer partWordBuffer[partWordBufferIndex] = '\0'; - currentTextBlock->addWord(partWordBuffer, fontStyle); + currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues); partWordBufferIndex = 0; + nextWordContinues = false; } // start a new text block if needed void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) { + nextWordContinues = false; // New block = new paragraph, no continuation if (currentTextBlock) { // already have a text block running and it is empty - just reuse it if (currentTextBlock->isEmpty()) { @@ -241,6 +243,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* } } } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) { + // Flush buffer before style change so preceding text gets current style + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + self->nextWordContinues = true; + } self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth); // Push inline style entry for underline tag StyleStackEntry entry; @@ -258,6 +265,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* self->inlineStyleStack.push_back(entry); self->updateEffectiveInlineStyle(); } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) { + // Flush buffer before style change so preceding text gets current style + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + self->nextWordContinues = true; + } self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth); // Push inline style entry for bold tag StyleStackEntry entry; @@ -275,6 +287,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* self->inlineStyleStack.push_back(entry); self->updateEffectiveInlineStyle(); } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) { + // Flush buffer before style change so preceding text gets current style + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + self->nextWordContinues = true; + } self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth); // Push inline style entry for italic tag StyleStackEntry entry; @@ -294,6 +311,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* } else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) { // Handle span and other inline elements for CSS styling if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) { + // Flush buffer before style change so preceding text gets current style + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + self->nextWordContinues = true; + } StyleStackEntry entry; entry.depth = self->depth; // Track depth for matching pop if (cssStyle.hasFontWeight()) { @@ -331,6 +353,8 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char if (self->partWordBufferIndex > 0) { self->flushPartWordBuffer(); } + // Whitespace is a real word boundary — reset continuation state + self->nextWordContinues = false; // Skip the whitespace char continue; } @@ -387,6 +411,8 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n // Flush buffer with current style BEFORE any style changes if (self->partWordBufferIndex > 0) { // Flush if style will change OR if we're closing a block/structural element + const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 && + !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 || @@ -394,6 +420,10 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n if (shouldFlush) { self->flushPartWordBuffer(); + // If closing an inline element, the next word fragment continues the same visual word + if (isInlineTag) { + self->nextWordContinues = true; + } } } diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h index 261d8f4e..909913b1 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h @@ -30,6 +30,7 @@ class ChapterHtmlSlimParser { // leave one char at end for null pointer char partWordBuffer[MAX_WORD_SIZE + 1] = {}; int partWordBufferIndex = 0; + bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary) std::unique_ptr currentTextBlock = nullptr; std::unique_ptr currentPage = nullptr; int16_t currentPageNextY = 0;