fix: Remove separations after style changes (#720)

Closes #182. Closes #710. Closes #711. ## Summary **What is the goal of this PR?** - A longer-term, more robust fix for the issue with spurious spaces appearing after style changes. Replaces solution from #694. **What changes are included?** - Add continuation flags to determine if to add a space after a word or if the word connects to the previous word. Replaces simple solution that only considered ending punctuation. - Fixed an issue with greedy line-breaking algorithm where punctuation could appear on the next line, separated from the word, if there was a style change between the word and punctuation --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**YES**_, Claude Code
2026-02-07 08:07:40 +03:00 · 2026-02-06 03:10:37 -05:00 · 2026-02-06 03:10:37 -05:00 · 9f78fd33e8
commit 9f78fd33e8
parent bd8132a260
4 changed files with 108 additions and 70 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@ -19,38 +19,6 @@ namespace {
 constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
 constexpr size_t SOFT_HYPHEN_BYTES = 2;

-// Known attaching punctuation (including UTF-8 sequences)
-const std::vector<std::string> punctuation = {
-    ".",
-    ",",
-    "!",
-    "?",
-    ";",
-    ":",
-    "\"",
-    "'",
-    "\xE2\x80\x99",  // ’ (U+2019 right single quote)
-    "\xE2\x80\x9D"   // ” (U+201D right double quote)
-};
-
-bool isAttachingPunctuationWord(const std::string& word) {
-  if (word.empty()) return false;
-
-  size_t pos = 0;
-  while (pos < word.size()) {
-    bool matched = false;
-    for (const auto& p : punctuation) {
-      if (word.compare(pos, p.size(), p) == 0) {
-        pos += p.size();
-        matched = true;
-        break;
-      }
-    }
-    if (!matched) return false;
-  }
-  return true;
-}
-
 bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }

 // Removes every soft hyphen in-place so rendered glyphs match measured widths.
@ -81,15 +49,17 @@ uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const s

 }  // namespace

-void ParsedText::addWord(std::string word, const EpdFontFamily::Style style, const bool underline) {
+void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
+                         const bool attachToPrevious) {
  if (word.empty()) return;

  words.push_back(std::move(word));
-  EpdFontFamily::Style combinedStyle = style;
+  EpdFontFamily::Style combinedStyle = fontStyle;
  if (underline) {
    combinedStyle = static_cast<EpdFontFamily::Style>(combinedStyle | EpdFontFamily::UNDERLINE);
  }
  wordStyles.push_back(combinedStyle);
+  wordContinues.push_back(attachToPrevious);
 }

 // Consumes data to minimize memory usage
@ -106,17 +76,21 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
  const int pageWidth = viewportWidth;
  const int spaceWidth = renderer.getSpaceWidth(fontId);
  auto wordWidths = calculateWordWidths(renderer, fontId);
+
+  // Build indexed continues vector from the parallel list for O(1) access during layout
+  std::vector<bool> continuesVec(wordContinues.begin(), wordContinues.end());
+
  std::vector<size_t> lineBreakIndices;
  if (hyphenationEnabled) {
    // Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
-    lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
+    lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec);
  } else {
-    lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
+    lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec);
  }
  const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;

  for (size_t i = 0; i < lineCount; ++i) {
-    extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine);
+    extractLine(i, pageWidth, spaceWidth, wordWidths, continuesVec, lineBreakIndices, processLine);
  }
 }

@ -140,7 +114,8 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
 }

 std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
-                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths) {
+                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths,
+                                                  std::vector<bool>& continuesVec) {
  if (words.empty()) {
    return {};
  }
@ -157,7 +132,8 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
    // First word needs to fit in reduced width if there's an indent
    const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
    while (wordWidths[i] > effectiveWidth) {
-      if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
+      if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true,
+                                &continuesVec)) {
        break;
      }
    }
@ -175,20 +151,26 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
  ans[totalWordCount - 1] = totalWordCount - 1;

  for (int i = totalWordCount - 2; i >= 0; --i) {
-    int currlen = -spaceWidth;
+    int currlen = 0;
    dp[i] = MAX_COST;

    // First line has reduced width due to text-indent
    const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;

    for (size_t j = i; j < totalWordCount; ++j) {
-      // Current line length: previous width + space + current word width
-      currlen += wordWidths[j] + spaceWidth;
+      // Add space before word j, unless it's the first word on the line or a continuation
+      const int gap = j > static_cast<size_t>(i) && !continuesVec[j] ? spaceWidth : 0;
+      currlen += wordWidths[j] + gap;

      if (currlen > effectivePageWidth) {
        break;
      }

+      // Cannot break after word j if the next word attaches to it (continuation group)
+      if (j + 1 < totalWordCount && continuesVec[j + 1]) {
+        continue;
+      }
+
      int cost;
      if (j == totalWordCount - 1) {
        cost = 0;  // Last line
@ -260,7 +242,8 @@ void ParsedText::applyParagraphIndent() {
 // Builds break indices while opportunistically splitting the word that would overflow the current line.
 std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
                                                            const int pageWidth, const int spaceWidth,
-                                                            std::vector<uint16_t>& wordWidths) {
+                                                            std::vector<uint16_t>& wordWidths,
+                                                            std::vector<bool>& continuesVec) {
  // Calculate first line indent (only for left/justified text without extra paragraph spacing)
  const int firstLineIndent =
      blockStyle.textIndent > 0 && !extraParagraphSpacing &&
@ -282,7 +265,7 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
    // Consume as many words as possible for current line, splitting when prefixes fit
    while (currentIndex < wordWidths.size()) {
      const bool isFirstWord = currentIndex == lineStart;
-      const int spacing = isFirstWord ? 0 : spaceWidth;
+      const int spacing = isFirstWord || continuesVec[currentIndex] ? 0 : spaceWidth;
      const int candidateWidth = spacing + wordWidths[currentIndex];

      // Word fits on current line
@ -296,8 +279,8 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
      const int availableWidth = effectivePageWidth - lineWidth - spacing;
      const bool allowFallbackBreaks = isFirstWord;  // Only for first word on line

-      if (availableWidth > 0 &&
-          hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
+      if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths,
+                                                     allowFallbackBreaks, &continuesVec)) {
        // Prefix now fits; append it to this line and move to next line
        lineWidth += spacing + wordWidths[currentIndex];
        ++currentIndex;
@ -312,6 +295,12 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
      break;
    }

+    // Don't break before a continuation word (e.g., orphaned "?" after "question").
+    // Backtrack to the start of the continuation group so the whole group moves to the next line.
+    while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) {
+      --currentIndex;
+    }
+
    lineBreakIndices.push_back(currentIndex);
    isFirstLine = false;
  }
@ -323,7 +312,7 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
 // available width.
 bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
                                      const int fontId, std::vector<uint16_t>& wordWidths,
-                                      const bool allowFallbackBreaks) {
+                                      const bool allowFallbackBreaks, std::vector<bool>* continuesVec) {
  // Guard against invalid indices or zero available width before attempting to split.
  if (availableWidth <= 0 || wordIndex >= words.size()) {
    return false;
@ -378,12 +367,28 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
    wordIt->push_back('-');
  }

-  // Insert the remainder word (with matching style) directly after the prefix.
+  // Insert the remainder word (with matching style and continuation flag) directly after the prefix.
  auto insertWordIt = std::next(wordIt);
  auto insertStyleIt = std::next(styleIt);
  words.insert(insertWordIt, remainder);
  wordStyles.insert(insertStyleIt, style);

+  // The remainder inherits whatever continuation status the original word had with the word after it.
+  // Find the continues entry for the original word and insert the remainder's entry after it.
+  auto continuesIt = wordContinues.begin();
+  std::advance(continuesIt, wordIndex);
+  const bool originalContinuedToNext = *continuesIt;
+  // The original word (now prefix) does NOT continue to remainder (hyphen separates them)
+  *continuesIt = false;
+  const auto insertContinuesIt = std::next(continuesIt);
+  wordContinues.insert(insertContinuesIt, originalContinuedToNext);
+
+  // Keep the indexed vector in sync if provided
+  if (continuesVec) {
+    (*continuesVec)[wordIndex] = false;
+    continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext);
+  }
+
  // Update cached widths to reflect the new prefix/remainder pairing.
  wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
  const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
@ -392,7 +397,8 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
 }

 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
-                             const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
+                             const std::vector<uint16_t>& wordWidths, const std::vector<bool>& continuesVec,
+                             const std::vector<size_t>& lineBreakIndices,
                             const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
  const size_t lineBreak = lineBreakIndices[breakIndex];
  const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0;
@ -407,19 +413,16 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
          : 0;

  // Calculate total word width for this line and count actual word gaps
-  // (punctuation that attaches to previous word doesn't count as a gap)
-  // Note: words list starts at the beginning because previous lines were spliced out
+  // (continuation words attach to previous word with no gap)
  int lineWordWidthSum = 0;
  size_t actualGapCount = 0;
-  auto countWordIt = words.begin();

  for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
    lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
-    // Count gaps: each word after the first creates a gap, unless it's attaching punctuation
-    if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) {
+    // Count gaps: each word after the first creates a gap, unless it's a continuation
+    if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) {
      actualGapCount++;
    }
-    ++countWordIt;
  }

  // Calculate spacing (account for indent reducing effective page width on first line)
@ -443,30 +446,27 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
  }

  // Pre-calculate X positions for words
-  // Punctuation that attaches to the previous word doesn't get space before it
-  // Note: words list starts at the beginning because previous lines were spliced out
+  // Continuation words attach to the previous word with no space before them
  std::list<uint16_t> lineXPos;
-  auto wordIt = words.begin();

  for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
    const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx];

    lineXPos.push_back(xpos);

-    // Add spacing after this word, unless the next word is attaching punctuation
-    auto nextWordIt = wordIt;
-    ++nextWordIt;
-    const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt);
+    // Add spacing after this word, unless the next word is a continuation
+    const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1];

-    xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing);
-    ++wordIt;
+    xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing);
  }

  // Iterators always start at the beginning as we are moving content with splice below
  auto wordEndIt = words.begin();
  auto wordStyleEndIt = wordStyles.begin();
+  auto wordContinuesEndIt = wordContinues.begin();
  std::advance(wordEndIt, lineWordCount);
  std::advance(wordStyleEndIt, lineWordCount);
+  std::advance(wordContinuesEndIt, lineWordCount);

  // *** CRITICAL STEP: CONSUME DATA USING SPLICE ***
  std::list<std::string> lineWords;
@ -474,6 +474,10 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
  std::list<EpdFontFamily::Style> lineWordStyles;
  lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);

+  // Consume continues flags (not passed to TextBlock, but must be consumed to stay in sync)
+  std::list<bool> lineContinues;
+  lineContinues.splice(lineContinues.begin(), wordContinues, wordContinues.begin(), wordContinuesEndIt);
+
  for (auto& word : lineWords) {
    if (containsSoftHyphen(word)) {
      stripSoftHyphensInPlace(word);
--- a/lib/Epub/Epub/ParsedText.h
+++ b/lib/Epub/Epub/ParsedText.h
@ -16,19 +16,22 @@ class GfxRenderer;
 class ParsedText {
  std::list<std::string> words;
  std::list<EpdFontFamily::Style> wordStyles;
+  std::list<bool> wordContinues;  // true = word attaches to previous (no space before it)
  BlockStyle blockStyle;
  bool extraParagraphSpacing;
  bool hyphenationEnabled;

  void applyParagraphIndent();
  std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth,
-                                        std::vector<uint16_t>& wordWidths);
+                                        std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec);
  std::vector<size_t> computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth,
-                                                  int spaceWidth, std::vector<uint16_t>& wordWidths);
+                                                  int spaceWidth, std::vector<uint16_t>& wordWidths,
+                                                  std::vector<bool>& continuesVec);
  bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId,
-                            std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks);
+                            std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks,
+                            std::vector<bool>* continuesVec = nullptr);
  void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths,
-                   const std::vector<size_t>& lineBreakIndices,
+                   const std::vector<bool>& continuesVec, const std::vector<size_t>& lineBreakIndices,
                   const std::function<void(std::shared_ptr<TextBlock>)>& processLine);
  std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId);

@ -38,7 +41,7 @@ class ParsedText {
      : blockStyle(blockStyle), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {}
  ~ParsedText() = default;

-  void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false);
+  void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false);
  void setBlockStyle(const BlockStyle& blockStyle) { this->blockStyle = blockStyle; }
  BlockStyle& getBlockStyle() { return blockStyle; }
  size_t size() const { return words.size(); }
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@ -90,12 +90,14 @@ void ChapterHtmlSlimParser::flushPartWordBuffer() {

  // flush the buffer
  partWordBuffer[partWordBufferIndex] = '\0';
-  currentTextBlock->addWord(partWordBuffer, fontStyle);
+  currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues);
  partWordBufferIndex = 0;
+  nextWordContinues = false;
 }

 // start a new text block if needed
 void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) {
+  nextWordContinues = false;  // New block = new paragraph, no continuation
  if (currentTextBlock) {
    // already have a text block running and it is empty - just reuse it
    if (currentTextBlock->isEmpty()) {
@ -241,6 +243,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
      }
    }
  } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) {
+    // Flush buffer before style change so preceding text gets current style
+    if (self->partWordBufferIndex > 0) {
+      self->flushPartWordBuffer();
+      self->nextWordContinues = true;
+    }
    self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth);
    // Push inline style entry for underline tag
    StyleStackEntry entry;
@ -258,6 +265,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
+    // Flush buffer before style change so preceding text gets current style
+    if (self->partWordBufferIndex > 0) {
+      self->flushPartWordBuffer();
+      self->nextWordContinues = true;
+    }
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
    // Push inline style entry for bold tag
    StyleStackEntry entry;
@ -275,6 +287,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
+    // Flush buffer before style change so preceding text gets current style
+    if (self->partWordBufferIndex > 0) {
+      self->flushPartWordBuffer();
+      self->nextWordContinues = true;
+    }
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
    // Push inline style entry for italic tag
    StyleStackEntry entry;
@ -294,6 +311,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
  } else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) {
    // Handle span and other inline elements for CSS styling
    if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) {
+      // Flush buffer before style change so preceding text gets current style
+      if (self->partWordBufferIndex > 0) {
+        self->flushPartWordBuffer();
+        self->nextWordContinues = true;
+      }
      StyleStackEntry entry;
      entry.depth = self->depth;  // Track depth for matching pop
      if (cssStyle.hasFontWeight()) {
@ -331,6 +353,8 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }
+      // Whitespace is a real word boundary — reset continuation state
+      self->nextWordContinues = false;
      // Skip the whitespace char
      continue;
    }
@ -387,6 +411,8 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
  // Flush buffer with current style BEFORE any style changes
  if (self->partWordBufferIndex > 0) {
    // Flush if style will change OR if we're closing a block/structural element
+    const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 &&
+                             !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1;
    const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
                             matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
                             matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 ||
@ -394,6 +420,10 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n

    if (shouldFlush) {
      self->flushPartWordBuffer();
+      // If closing an inline element, the next word fragment continues the same visual word
+      if (isInlineTag) {
+        self->nextWordContinues = true;
+      }
    }
  }

--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
@ -30,6 +30,7 @@ class ChapterHtmlSlimParser {
  // leave one char at end for null pointer
  char partWordBuffer[MAX_WORD_SIZE + 1] = {};
  int partWordBufferIndex = 0;
+  bool nextWordContinues = false;  // true when next flushed word attaches to previous (inline element boundary)
  std::unique_ptr<ParsedText> currentTextBlock = nullptr;
  std::unique_ptr<Page> currentPage = nullptr;
  int16_t currentPageNextY = 0;