fix: Remove separations after style changes (#720)
Some checks are pending
CI (build) / clang-format (push) Waiting to run
CI (build) / cppcheck (push) Waiting to run
CI (build) / build (push) Waiting to run
CI (build) / Test Status (push) Blocked by required conditions

Closes #182. Closes #710. Closes #711.

## Summary

**What is the goal of this PR?**
- A longer-term, more robust fix for the issue with spurious spaces
appearing after style changes. Replaces solution from #694.

**What changes are included?**
- Add continuation flags to determine if to add a space after a word or
if the word connects to the previous word. Replaces simple solution that
only considered ending punctuation.
- Fixed an issue with greedy line-breaking algorithm where punctuation
could appear on the next line, separated from the word, if there was a
style change between the word and punctuation

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
This commit is contained in:
Jake Kenneally 2026-02-06 03:10:37 -05:00 committed by GitHub
parent bd8132a260
commit 9f78fd33e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 108 additions and 70 deletions

View File

@ -19,38 +19,6 @@ namespace {
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
constexpr size_t SOFT_HYPHEN_BYTES = 2;
// Known attaching punctuation (including UTF-8 sequences)
const std::vector<std::string> punctuation = {
".",
",",
"!",
"?",
";",
":",
"\"",
"'",
"\xE2\x80\x99", // (U+2019 right single quote)
"\xE2\x80\x9D" // ” (U+201D right double quote)
};
bool isAttachingPunctuationWord(const std::string& word) {
if (word.empty()) return false;
size_t pos = 0;
while (pos < word.size()) {
bool matched = false;
for (const auto& p : punctuation) {
if (word.compare(pos, p.size(), p) == 0) {
pos += p.size();
matched = true;
break;
}
}
if (!matched) return false;
}
return true;
}
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
// Removes every soft hyphen in-place so rendered glyphs match measured widths.
@ -81,15 +49,17 @@ uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const s
} // namespace
void ParsedText::addWord(std::string word, const EpdFontFamily::Style style, const bool underline) {
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
const bool attachToPrevious) {
if (word.empty()) return;
words.push_back(std::move(word));
EpdFontFamily::Style combinedStyle = style;
EpdFontFamily::Style combinedStyle = fontStyle;
if (underline) {
combinedStyle = static_cast<EpdFontFamily::Style>(combinedStyle | EpdFontFamily::UNDERLINE);
}
wordStyles.push_back(combinedStyle);
wordContinues.push_back(attachToPrevious);
}
// Consumes data to minimize memory usage
@ -106,17 +76,21 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
const int pageWidth = viewportWidth;
const int spaceWidth = renderer.getSpaceWidth(fontId);
auto wordWidths = calculateWordWidths(renderer, fontId);
// Build indexed continues vector from the parallel list for O(1) access during layout
std::vector<bool> continuesVec(wordContinues.begin(), wordContinues.end());
std::vector<size_t> lineBreakIndices;
if (hyphenationEnabled) {
// Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec);
} else {
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, continuesVec);
}
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
for (size_t i = 0; i < lineCount; ++i) {
extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine);
extractLine(i, pageWidth, spaceWidth, wordWidths, continuesVec, lineBreakIndices, processLine);
}
}
@ -140,7 +114,8 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
}
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
const int spaceWidth, std::vector<uint16_t>& wordWidths) {
const int spaceWidth, std::vector<uint16_t>& wordWidths,
std::vector<bool>& continuesVec) {
if (words.empty()) {
return {};
}
@ -157,7 +132,8 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
// First word needs to fit in reduced width if there's an indent
const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
while (wordWidths[i] > effectiveWidth) {
if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true,
&continuesVec)) {
break;
}
}
@ -175,20 +151,26 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
ans[totalWordCount - 1] = totalWordCount - 1;
for (int i = totalWordCount - 2; i >= 0; --i) {
int currlen = -spaceWidth;
int currlen = 0;
dp[i] = MAX_COST;
// First line has reduced width due to text-indent
const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
for (size_t j = i; j < totalWordCount; ++j) {
// Current line length: previous width + space + current word width
currlen += wordWidths[j] + spaceWidth;
// Add space before word j, unless it's the first word on the line or a continuation
const int gap = j > static_cast<size_t>(i) && !continuesVec[j] ? spaceWidth : 0;
currlen += wordWidths[j] + gap;
if (currlen > effectivePageWidth) {
break;
}
// Cannot break after word j if the next word attaches to it (continuation group)
if (j + 1 < totalWordCount && continuesVec[j + 1]) {
continue;
}
int cost;
if (j == totalWordCount - 1) {
cost = 0; // Last line
@ -260,7 +242,8 @@ void ParsedText::applyParagraphIndent() {
// Builds break indices while opportunistically splitting the word that would overflow the current line.
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
const int pageWidth, const int spaceWidth,
std::vector<uint16_t>& wordWidths) {
std::vector<uint16_t>& wordWidths,
std::vector<bool>& continuesVec) {
// Calculate first line indent (only for left/justified text without extra paragraph spacing)
const int firstLineIndent =
blockStyle.textIndent > 0 && !extraParagraphSpacing &&
@ -282,7 +265,7 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
// Consume as many words as possible for current line, splitting when prefixes fit
while (currentIndex < wordWidths.size()) {
const bool isFirstWord = currentIndex == lineStart;
const int spacing = isFirstWord ? 0 : spaceWidth;
const int spacing = isFirstWord || continuesVec[currentIndex] ? 0 : spaceWidth;
const int candidateWidth = spacing + wordWidths[currentIndex];
// Word fits on current line
@ -296,8 +279,8 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
const int availableWidth = effectivePageWidth - lineWidth - spacing;
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
if (availableWidth > 0 &&
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths,
allowFallbackBreaks, &continuesVec)) {
// Prefix now fits; append it to this line and move to next line
lineWidth += spacing + wordWidths[currentIndex];
++currentIndex;
@ -312,6 +295,12 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
break;
}
// Don't break before a continuation word (e.g., orphaned "?" after "question").
// Backtrack to the start of the continuation group so the whole group moves to the next line.
while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) {
--currentIndex;
}
lineBreakIndices.push_back(currentIndex);
isFirstLine = false;
}
@ -323,7 +312,7 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
// available width.
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
const int fontId, std::vector<uint16_t>& wordWidths,
const bool allowFallbackBreaks) {
const bool allowFallbackBreaks, std::vector<bool>* continuesVec) {
// Guard against invalid indices or zero available width before attempting to split.
if (availableWidth <= 0 || wordIndex >= words.size()) {
return false;
@ -378,12 +367,28 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
wordIt->push_back('-');
}
// Insert the remainder word (with matching style) directly after the prefix.
// Insert the remainder word (with matching style and continuation flag) directly after the prefix.
auto insertWordIt = std::next(wordIt);
auto insertStyleIt = std::next(styleIt);
words.insert(insertWordIt, remainder);
wordStyles.insert(insertStyleIt, style);
// The remainder inherits whatever continuation status the original word had with the word after it.
// Find the continues entry for the original word and insert the remainder's entry after it.
auto continuesIt = wordContinues.begin();
std::advance(continuesIt, wordIndex);
const bool originalContinuedToNext = *continuesIt;
// The original word (now prefix) does NOT continue to remainder (hyphen separates them)
*continuesIt = false;
const auto insertContinuesIt = std::next(continuesIt);
wordContinues.insert(insertContinuesIt, originalContinuedToNext);
// Keep the indexed vector in sync if provided
if (continuesVec) {
(*continuesVec)[wordIndex] = false;
continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext);
}
// Update cached widths to reflect the new prefix/remainder pairing.
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
@ -392,7 +397,8 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
}
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
const std::vector<uint16_t>& wordWidths, const std::vector<bool>& continuesVec,
const std::vector<size_t>& lineBreakIndices,
const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
const size_t lineBreak = lineBreakIndices[breakIndex];
const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0;
@ -407,19 +413,16 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
: 0;
// Calculate total word width for this line and count actual word gaps
// (punctuation that attaches to previous word doesn't count as a gap)
// Note: words list starts at the beginning because previous lines were spliced out
// (continuation words attach to previous word with no gap)
int lineWordWidthSum = 0;
size_t actualGapCount = 0;
auto countWordIt = words.begin();
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
// Count gaps: each word after the first creates a gap, unless it's attaching punctuation
if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) {
// Count gaps: each word after the first creates a gap, unless it's a continuation
if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) {
actualGapCount++;
}
++countWordIt;
}
// Calculate spacing (account for indent reducing effective page width on first line)
@ -443,30 +446,27 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
}
// Pre-calculate X positions for words
// Punctuation that attaches to the previous word doesn't get space before it
// Note: words list starts at the beginning because previous lines were spliced out
// Continuation words attach to the previous word with no space before them
std::list<uint16_t> lineXPos;
auto wordIt = words.begin();
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx];
lineXPos.push_back(xpos);
// Add spacing after this word, unless the next word is attaching punctuation
auto nextWordIt = wordIt;
++nextWordIt;
const bool nextIsAttachingPunctuation = wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt);
// Add spacing after this word, unless the next word is a continuation
const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1];
xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing);
++wordIt;
xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing);
}
// Iterators always start at the beginning as we are moving content with splice below
auto wordEndIt = words.begin();
auto wordStyleEndIt = wordStyles.begin();
auto wordContinuesEndIt = wordContinues.begin();
std::advance(wordEndIt, lineWordCount);
std::advance(wordStyleEndIt, lineWordCount);
std::advance(wordContinuesEndIt, lineWordCount);
// *** CRITICAL STEP: CONSUME DATA USING SPLICE ***
std::list<std::string> lineWords;
@ -474,6 +474,10 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
std::list<EpdFontFamily::Style> lineWordStyles;
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
// Consume continues flags (not passed to TextBlock, but must be consumed to stay in sync)
std::list<bool> lineContinues;
lineContinues.splice(lineContinues.begin(), wordContinues, wordContinues.begin(), wordContinuesEndIt);
for (auto& word : lineWords) {
if (containsSoftHyphen(word)) {
stripSoftHyphensInPlace(word);

View File

@ -16,19 +16,22 @@ class GfxRenderer;
class ParsedText {
std::list<std::string> words;
std::list<EpdFontFamily::Style> wordStyles;
std::list<bool> wordContinues; // true = word attaches to previous (no space before it)
BlockStyle blockStyle;
bool extraParagraphSpacing;
bool hyphenationEnabled;
void applyParagraphIndent();
std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth,
std::vector<uint16_t>& wordWidths);
std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec);
std::vector<size_t> computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth,
int spaceWidth, std::vector<uint16_t>& wordWidths);
int spaceWidth, std::vector<uint16_t>& wordWidths,
std::vector<bool>& continuesVec);
bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId,
std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks);
std::vector<uint16_t>& wordWidths, bool allowFallbackBreaks,
std::vector<bool>* continuesVec = nullptr);
void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths,
const std::vector<size_t>& lineBreakIndices,
const std::vector<bool>& continuesVec, const std::vector<size_t>& lineBreakIndices,
const std::function<void(std::shared_ptr<TextBlock>)>& processLine);
std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId);
@ -38,7 +41,7 @@ class ParsedText {
: blockStyle(blockStyle), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {}
~ParsedText() = default;
void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false);
void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false);
void setBlockStyle(const BlockStyle& blockStyle) { this->blockStyle = blockStyle; }
BlockStyle& getBlockStyle() { return blockStyle; }
size_t size() const { return words.size(); }

View File

@ -90,12 +90,14 @@ void ChapterHtmlSlimParser::flushPartWordBuffer() {
// flush the buffer
partWordBuffer[partWordBufferIndex] = '\0';
currentTextBlock->addWord(partWordBuffer, fontStyle);
currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues);
partWordBufferIndex = 0;
nextWordContinues = false;
}
// start a new text block if needed
void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) {
nextWordContinues = false; // New block = new paragraph, no continuation
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
@ -241,6 +243,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
}
}
} else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth);
// Push inline style entry for underline tag
StyleStackEntry entry;
@ -258,6 +265,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
// Push inline style entry for bold tag
StyleStackEntry entry;
@ -275,6 +287,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
} else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
// Push inline style entry for italic tag
StyleStackEntry entry;
@ -294,6 +311,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
} else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) {
// Handle span and other inline elements for CSS styling
if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
StyleStackEntry entry;
entry.depth = self->depth; // Track depth for matching pop
if (cssStyle.hasFontWeight()) {
@ -331,6 +353,8 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
}
// Whitespace is a real word boundary — reset continuation state
self->nextWordContinues = false;
// Skip the whitespace char
continue;
}
@ -387,6 +411,8 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
// Flush buffer with current style BEFORE any style changes
if (self->partWordBufferIndex > 0) {
// Flush if style will change OR if we're closing a block/structural element
const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 &&
!matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1;
const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 ||
@ -394,6 +420,10 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
if (shouldFlush) {
self->flushPartWordBuffer();
// If closing an inline element, the next word fragment continues the same visual word
if (isInlineTag) {
self->nextWordContinues = true;
}
}
}

View File

@ -30,6 +30,7 @@ class ChapterHtmlSlimParser {
// leave one char at end for null pointer
char partWordBuffer[MAX_WORD_SIZE + 1] = {};
int partWordBufferIndex = 0;
bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary)
std::unique_ptr<ParsedText> currentTextBlock = nullptr;
std::unique_ptr<Page> currentPage = nullptr;
int16_t currentPageNextY = 0;