diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index aca85581..3af85a1b 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -19,6 +19,38 @@ namespace { constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD"; constexpr size_t SOFT_HYPHEN_BYTES = 2; +// Known attaching punctuation (including UTF-8 sequences) +const std::vector punctuation = { + ".", + ",", + "!", + "?", + ";", + ":", + "\"", + "'", + "\xE2\x80\x99", // ’ (U+2019 right single quote) + "\xE2\x80\x9D" // ” (U+201D right double quote) +}; + +bool isAttachingPunctuationWord(const std::string& word) { + if (word.empty()) return false; + + size_t pos = 0; + while (pos < word.size()) { + bool matched = false; + for (const auto& p : punctuation) { + if (word.compare(pos, p.size(), p) == 0) { + pos += p.size(); + matched = true; + break; + } + } + if (!matched) return false; + } + return true; +} + bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; } // Removes every soft hyphen in-place so rendered glyphs match measured widths. @@ -374,10 +406,20 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const ? blockStyle.textIndent : 0; - // Calculate total word width for this line + // Calculate total word width for this line and count actual word gaps + // (punctuation that attaches to previous word doesn't count as a gap) + // Note: words list starts at the beginning because previous lines were spliced out int lineWordWidthSum = 0; - for (size_t i = lastBreakAt; i < lineBreak; i++) { - lineWordWidthSum += wordWidths[i]; + size_t actualGapCount = 0; + auto countWordIt = words.begin(); + + for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { + lineWordWidthSum += wordWidths[lastBreakAt + wordIdx]; + // Count gaps: each word after the first creates a gap, unless it's attaching punctuation + if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) { + actualGapCount++; + } + ++countWordIt; } // Calculate spacing (account for indent reducing effective page width on first line) @@ -387,24 +429,37 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spacing = spaceWidth; const bool isLastLine = breakIndex == lineBreakIndices.size() - 1; - if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && lineWordCount >= 2) { - spacing = spareSpace / (lineWordCount - 1); + // For justified text, calculate spacing based on actual gap count + if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && actualGapCount >= 1) { + spacing = spareSpace / static_cast(actualGapCount); } // Calculate initial x position (first line starts at indent for left/justified text) auto xpos = static_cast(firstLineIndent); if (blockStyle.alignment == CssTextAlign::Right) { - xpos = spareSpace - (lineWordCount - 1) * spaceWidth; + xpos = spareSpace - static_cast(actualGapCount) * spaceWidth; } else if (blockStyle.alignment == CssTextAlign::Center) { - xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; + xpos = (spareSpace - static_cast(actualGapCount) * spaceWidth) / 2; } // Pre-calculate X positions for words + // Punctuation that attaches to the previous word doesn't get space before it + // Note: words list starts at the beginning because previous lines were spliced out std::list lineXPos; - for (size_t i = lastBreakAt; i < lineBreak; i++) { - const uint16_t currentWordWidth = wordWidths[i]; + auto wordIt = words.begin(); + + for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { + const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx]; + lineXPos.push_back(xpos); - xpos += currentWordWidth + spacing; + + // Add spacing after this word, unless the next word is attaching punctuation + auto nextWordIt = wordIt; + ++nextWordIt; + const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt); + + xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing); + ++wordIt; } // Iterators always start at the beginning as we are moving content with splice below