From a6d6e5e770b02c799917a4a812d0f08e1164ffc1 Mon Sep 17 00:00:00 2001 From: Jake Kenneally Date: Sat, 31 Jan 2026 14:46:20 -0500 Subject: [PATCH] fix some styling edge cases: preserving spacing from parent elements, removing stray spaces after italicized text in child elements --- lib/Epub/Epub/ParsedText.cpp | 61 ++++++++++++++++--- .../Epub/parsers/ChapterHtmlSlimParser.cpp | 33 +++++++++- 2 files changed, 83 insertions(+), 11 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 708f90e8..25f7e5ed 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -19,6 +19,23 @@ namespace { constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD"; constexpr size_t SOFT_HYPHEN_BYTES = 2; +// Check if a character is punctuation that should attach to the previous word +// (no space before it). Limited to sentence-ending and clause-separating punctuation +// to avoid false positives with decorative brackets like "[ 1 ]". +bool isAttachingPunctuation(const char c) { + return c == '.' || c == ',' || c == '!' || c == '?' || c == ';' || c == ':'; +} + +// Check if a word consists entirely of punctuation that should attach to the previous word +bool isAttachingPunctuationWord(const std::string& word) { + if (word.empty()) return false; + // Check if word starts with attaching punctuation and is short (to avoid false positives) + if (isAttachingPunctuation(word[0]) && word.size() <= 3) { + return true; + } + return false; +} + bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; } // Removes every soft hyphen in-place so rendered glyphs match measured widths. @@ -369,10 +386,20 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const ? blockStyle.textIndent : 0; - // Calculate total word width for this line + // Calculate total word width for this line and count actual word gaps + // (punctuation that attaches to previous word doesn't count as a gap) + // Note: words list starts at the beginning because previous lines were spliced out int lineWordWidthSum = 0; - for (size_t i = lastBreakAt; i < lineBreak; i++) { - lineWordWidthSum += wordWidths[i]; + size_t actualGapCount = 0; + auto countWordIt = words.begin(); + + for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { + lineWordWidthSum += wordWidths[lastBreakAt + wordIdx]; + // Count gaps: each word after the first creates a gap, unless it's attaching punctuation + if (wordIdx > 0 && !isAttachingPunctuationWord(*countWordIt)) { + actualGapCount++; + } + ++countWordIt; } // Calculate spacing (account for indent reducing effective page width on first line) @@ -382,24 +409,38 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spacing = spaceWidth; const bool isLastLine = breakIndex == lineBreakIndices.size() - 1; - if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) { - spacing = spareSpace / (lineWordCount - 1); + // For justified text, calculate spacing based on actual gap count + if (style == TextBlock::JUSTIFIED && !isLastLine && actualGapCount >= 1) { + spacing = spareSpace / static_cast(actualGapCount); } // Calculate initial x position (first line starts at indent for left/justified text) auto xpos = static_cast(firstLineIndent); if (style == TextBlock::RIGHT_ALIGN) { - xpos = spareSpace - (lineWordCount - 1) * spaceWidth; + xpos = spareSpace - static_cast(actualGapCount) * spaceWidth; } else if (style == TextBlock::CENTER_ALIGN) { - xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; + xpos = (spareSpace - static_cast(actualGapCount) * spaceWidth) / 2; } // Pre-calculate X positions for words + // Punctuation that attaches to the previous word doesn't get space before it + // Note: words list starts at the beginning because previous lines were spliced out std::list lineXPos; - for (size_t i = lastBreakAt; i < lineBreak; i++) { - const uint16_t currentWordWidth = wordWidths[i]; + auto wordIt = words.begin(); + + for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) { + const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx]; + lineXPos.push_back(xpos); - xpos += currentWordWidth + spacing; + + // Add spacing after this word, unless the next word is attaching punctuation + auto nextWordIt = wordIt; + ++nextWordIt; + const bool nextIsAttachingPunctuation = + wordIdx + 1 < lineWordCount && isAttachingPunctuationWord(*nextWordIt); + + xpos += currentWordWidth + (nextIsAttachingPunctuation ? 0 : spacing); + ++wordIt; } // Iterators always start at the beginning as we are moving content with splice below diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 13325df1..991a8c42 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -113,13 +113,44 @@ void ChapterHtmlSlimParser::flushPartWordBuffer() { partWordBufferIndex = 0; } +// Merge block styles for nested block elements +// When a child block element is inside a parent with no direct text content, +// we accumulate their margins so nested containers properly contribute spacing +BlockStyle mergeBlockStyles(const BlockStyle& parent, const BlockStyle& child) { + BlockStyle merged; + // Vertical margins: sum them (nested blocks create additive spacing) + merged.marginTop = static_cast(parent.marginTop + child.marginTop); + merged.marginBottom = static_cast(parent.marginBottom + child.marginBottom); + // Horizontal margins: sum them (nested blocks create cumulative indentation) + merged.marginLeft = static_cast(parent.marginLeft + child.marginLeft); + merged.marginRight = static_cast(parent.marginRight + child.marginRight); + // Padding: sum them + merged.paddingTop = static_cast(parent.paddingTop + child.paddingTop); + merged.paddingBottom = static_cast(parent.paddingBottom + child.paddingBottom); + merged.paddingLeft = static_cast(parent.paddingLeft + child.paddingLeft); + merged.paddingRight = static_cast(parent.paddingRight + child.paddingRight); + // Text indent: use child's if defined, otherwise inherit parent's + if (child.textIndentDefined) { + merged.textIndent = child.textIndent; + merged.textIndentDefined = true; + } else if (parent.textIndentDefined) { + merged.textIndent = parent.textIndent; + merged.textIndentDefined = true; + } + return merged; +} + // start a new text block if needed void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style, const BlockStyle& blockStyle) { if (currentTextBlock) { // already have a text block running and it is empty - just reuse it if (currentTextBlock->isEmpty()) { currentTextBlock->setStyle(style); - currentTextBlock->setBlockStyle(blockStyle); + // Merge with existing block style to accumulate margins from parent block elements + // This handles cases like

text

where the + // div's margin should be preserved even though it has no direct text content + const BlockStyle merged = mergeBlockStyles(currentTextBlock->getBlockStyle(), blockStyle); + currentTextBlock->setBlockStyle(merged); return; }