fix: prevent spurious spaces before attaching punctuation (#694)

Fixes issue #182

## Summary

**What is the goal of this PR?** 
When inline styles change mid-paragraph, words like periods, commas, and
quotes could end up as separate tokens. The justified text algorithm was
treating these as regular words, adding space before them.

**What changes are included?**

Now tracks which words are "attaching punctuation" (., , ! ? ; : " ' and
smart quotes) and excludes them from gap counting. These punctuation
marks attach directly to the preceding word without spacing.

## Additional Context

This is split out from code in #411 to address this comment
https://github.com/crosspoint-reader/crosspoint-reader/pull/411#discussion_r2751166631

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
This commit is contained in:
Jake Kenneally 2026-02-05 09:55:15 -05:00 committed by GitHub
parent e94f056e8a
commit cb4d86fec6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -19,6 +19,38 @@ namespace {
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
constexpr size_t SOFT_HYPHEN_BYTES = 2;
// Known attaching punctuation (including UTF-8 sequences)
const std::vector<std::string> punctuation = {
".",
",",
"!",
"?",
";",
":",
"\"",
"'",
"\xE2\x80\x99", // (U+2019 right single quote)
"\xE2\x80\x9D" // ” (U+201D right double quote)
};
bool isAttachingPunctuationWord(const std::string& word) {
if (word.empty()) return false;
size_t pos = 0;
while (pos < word.size()) {
bool matched = false;
for (const auto& p : punctuation) {
if (word.compare(pos, p.size(), p) == 0) {
pos += p.size();
matched = true;
break;
}
}
if (!matched) return false;
}
return true;
}
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
// Removes every soft hyphen in-place so rendered glyphs match measured widths.
@ -374,10 +406,20 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
? blockStyle.textIndent
: 0;
// Calculate total word width for this line
// Calculate total word width for this line and count actual word gaps
// (punctuation that attaches to previous word doesn't count as a gap)
// Note: words list starts at the beginning because previous lines were spliced out
int lineWordWidthSum = 0;
for (size_t i = lastBreakAt; i < lineBreak; i++) {
lineWordWidthSum += wordWidths[i];
size_t actualGapCount = 0;
auto countWordIt = words.begin();
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
// Count gaps: each word after the first creates a gap, unless it's attaching punctuation
if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) {
actualGapCount++;
}
++countWordIt;
}
// Calculate spacing (account for indent reducing effective page width on first line)
@ -387,24 +429,37 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
int spacing = spaceWidth;
const bool isLastLine = breakIndex == lineBreakIndices.size() - 1;
if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && lineWordCount >= 2) {
spacing = spareSpace / (lineWordCount - 1);
// For justified text, calculate spacing based on actual gap count
if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && actualGapCount >= 1) {
spacing = spareSpace / static_cast<int>(actualGapCount);
}
// Calculate initial x position (first line starts at indent for left/justified text)
auto xpos = static_cast<uint16_t>(firstLineIndent);
if (blockStyle.alignment == CssTextAlign::Right) {
xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
xpos = spareSpace - static_cast<int>(actualGapCount) * spaceWidth;
} else if (blockStyle.alignment == CssTextAlign::Center) {
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
xpos = (spareSpace - static_cast<int>(actualGapCount) * spaceWidth) / 2;
}
// Pre-calculate X positions for words
// Punctuation that attaches to the previous word doesn't get space before it
// Note: words list starts at the beginning because previous lines were spliced out
std::list<uint16_t> lineXPos;
for (size_t i = lastBreakAt; i < lineBreak; i++) {
const uint16_t currentWordWidth = wordWidths[i];
auto wordIt = words.begin();
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx];
lineXPos.push_back(xpos);
xpos += currentWordWidth + spacing;
// Add spacing after this word, unless the next word is attaching punctuation
auto nextWordIt = wordIt;
++nextWordIt;
const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt);
xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing);
++wordIt;
}
// Iterators always start at the beginning as we are moving content with splice below