fix: prevent spurious spaces before attaching punctuation (#694)

Fixes issue #182

## Summary

**What is the goal of this PR?**

When inline styles change mid-paragraph, punctuation such as periods, commas, and quotes could end up as separate tokens. The line-layout code, including the justified-text gap calculation, was treating these tokens as regular words and adding a space before them.

**What changes are included?**

The layout code now tracks which words consist entirely of "attaching punctuation" (`.` `,` `!` `?` `;` `:` `"` `'` and the closing smart quotes `’` `”`) and excludes them from gap counting. These punctuation marks attach directly to the preceding word without spacing.

## Additional Context

This is split out from the code in #411 to address this review comment: https://github.com/crosspoint-reader/crosspoint-reader/pull/411#discussion_r2751166631

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
2026-02-06 07:37:37 +03:00 · 2026-02-05 09:55:15 -05:00 · 2026-02-05 09:55:15 -05:00 · cb4d86fec6
commit cb4d86fec6
parent e94f056e8a
1 changed files with 65 additions and 10 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@ -19,6 +19,38 @@ namespace {
 constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
 constexpr size_t SOFT_HYPHEN_BYTES = 2;

+// Known attaching punctuation (including UTF-8 sequences)
+const std::vector<std::string> punctuation = {
+    ".",
+    ",",
+    "!",
+    "?",
+    ";",
+    ":",
+    "\"",
+    "'",
+    "\xE2\x80\x99",  // ’ (U+2019 right single quote)
+    "\xE2\x80\x9D"   // ” (U+201D right double quote)
+};
+
+bool isAttachingPunctuationWord(const std::string& word) {
+  if (word.empty()) return false;
+
+  size_t pos = 0;
+  while (pos < word.size()) {
+    bool matched = false;
+    for (const auto& p : punctuation) {
+      if (word.compare(pos, p.size(), p) == 0) {
+        pos += p.size();
+        matched = true;
+        break;
+      }
+    }
+    if (!matched) return false;
+  }
+  return true;
+}
+
 bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }

 // Removes every soft hyphen in-place so rendered glyphs match measured widths.
@ -374,10 +406,20 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
          ? blockStyle.textIndent
          : 0;

-  // Calculate total word width for this line
+  // Calculate total word width for this line and count actual word gaps
+  // (punctuation that attaches to previous word doesn't count as a gap)
+  // Note: words list starts at the beginning because previous lines were spliced out
  int lineWordWidthSum = 0;
-  for (size_t i = lastBreakAt; i < lineBreak; i++) {
-    lineWordWidthSum += wordWidths[i];
+  size_t actualGapCount = 0;
+  auto countWordIt = words.begin();
+
+  for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
+    lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
+    // Count gaps: each word after the first creates a gap, unless it's attaching punctuation
+    if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) {
+      actualGapCount++;
+    }
+    ++countWordIt;
  }

  // Calculate spacing (account for indent reducing effective page width on first line)
@ -387,24 +429,37 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
  int spacing = spaceWidth;
  const bool isLastLine = breakIndex == lineBreakIndices.size() - 1;

-  if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && lineWordCount >= 2) {
-    spacing = spareSpace / (lineWordCount - 1);
+  // For justified text, calculate spacing based on actual gap count
+  if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && actualGapCount >= 1) {
+    spacing = spareSpace / static_cast<int>(actualGapCount);
  }

  // Calculate initial x position (first line starts at indent for left/justified text)
  auto xpos = static_cast<uint16_t>(firstLineIndent);
  if (blockStyle.alignment == CssTextAlign::Right) {
-    xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
+    xpos = spareSpace - static_cast<int>(actualGapCount) * spaceWidth;
  } else if (blockStyle.alignment == CssTextAlign::Center) {
-    xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
+    xpos = (spareSpace - static_cast<int>(actualGapCount) * spaceWidth) / 2;
  }

  // Pre-calculate X positions for words
+  // Punctuation that attaches to the previous word doesn't get space before it
+  // Note: words list starts at the beginning because previous lines were spliced out
  std::list<uint16_t> lineXPos;
-  for (size_t i = lastBreakAt; i < lineBreak; i++) {
-    const uint16_t currentWordWidth = wordWidths[i];
+  auto wordIt = words.begin();
+
+  for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
+    const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx];
+
    lineXPos.push_back(xpos);
-    xpos += currentWordWidth + spacing;
+
+    // Add spacing after this word, unless the next word is attaching punctuation
+    auto nextWordIt = wordIt;
+    ++nextWordIt;
+    const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt);
+
+    xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing);
+    ++wordIt;
  }

  // Iterators always start at the beginning as we are moving content with splice below