From e46a271c12c31695f41ad5f241d3cf2a3944be50 Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Wed, 7 Jan 2026 02:14:01 +0500 Subject: [PATCH] Implement hyphenation support and refactor line break calculations --- lib/Epub/Epub/ParsedText.cpp | 532 +++++++++++++++++++---------------- lib/Epub/Epub/ParsedText.h | 18 +- 2 files changed, 296 insertions(+), 254 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 9a6162d7..524cdb9b 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -1,7 +1,7 @@ #include "ParsedText.h" #include -#include +#include "hyphenation/Hyphenator.h" #include #include @@ -10,95 +10,8 @@ #include #include -#include "hyphenation/HyphenationCommon.h" -#include "hyphenation/Hyphenator.h" - constexpr int MAX_COST = std::numeric_limits::max(); -namespace { - -struct HyphenSplitDecision { - size_t byteOffset; - uint16_t prefixWidth; - bool appendHyphen; // true when we must draw an extra hyphen after the prefix glyphs -}; - -// Verifies whether the substring ending at `offset` already contains a literal hyphen glyph, so we can avoid -// drawing a duplicate hyphen when breaking the word. -bool endsWithExplicitHyphen(const std::string& word, const size_t offset) { - if (offset == 0 || offset > word.size()) { - return false; - } - - const unsigned char* base = reinterpret_cast(word.data()); - const unsigned char* ptr = base; - const unsigned char* target = base + offset; - const unsigned char* lastStart = nullptr; - - while (ptr < target) { - lastStart = ptr; - utf8NextCodepoint(&ptr); - if (ptr > target) { - return false; - } - } - - if (!lastStart || ptr != target) { - return false; - } - - const unsigned char* tmp = lastStart; - const uint32_t cp = utf8NextCodepoint(&tmp); // decode the codepoint immediately prior to the break - return isExplicitHyphen(cp); -} - -bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, - const EpdFontFamily::Style style, const int availableWidth, const bool includeFallback, - HyphenSplitDecision* decision) { - if (!decision || availableWidth <= 0) { - return false; - } - - const int hyphenWidth = renderer.getTextWidth(fontId, "-", style); - - auto offsets = Hyphenator::breakOffsets(word, includeFallback); - if (offsets.empty()) { - return false; - } - - size_t chosenOffset = std::numeric_limits::max(); - uint16_t chosenWidth = 0; - bool chosenAppendHyphen = true; - - for (const size_t offset : offsets) { - const bool needsInsertedHyphen = !endsWithExplicitHyphen(word, offset); - const int budget = availableWidth - (needsInsertedHyphen ? hyphenWidth : 0); - if (budget <= 0) { - continue; - } - const std::string prefix = word.substr(0, offset); - const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style); - if (prefixWidth <= budget) { - chosenOffset = offset; - chosenWidth = static_cast(prefixWidth + (needsInsertedHyphen ? hyphenWidth : 0)); - chosenAppendHyphen = needsInsertedHyphen; - } else { - break; - } - } - - if (chosenOffset == std::numeric_limits::max()) { - return false; - } - - decision->byteOffset = chosenOffset; - decision->prefixWidth = chosenWidth; - decision->appendHyphen = chosenAppendHyphen; - return true; -} - -} // namespace - void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) { if (word.empty()) return; @@ -114,11 +27,20 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo return; } + // Apply fixed transforms before any per-line layout work. + applyParagraphIndent(); + ensureHyphenationFitsViewport(renderer, fontId, viewportWidth); + const int pageWidth = viewportWidth; const int spaceWidth = renderer.getSpaceWidth(fontId); - // Pre-split oversized tokens so the DP step always has feasible line candidates. - auto wordWidths = calculateWordWidths(renderer, fontId, pageWidth); - auto lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); + auto wordWidths = calculateWordWidths(renderer, fontId); + std::vector lineBreakIndices; + if (hyphenationEnabled) { + // Use greedy layout that can split words mid-loop when a hyphenated prefix fits. + lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); + } else { + lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths); + } const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1; for (size_t i = 0; i < lineCount; ++i) { @@ -126,52 +48,17 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo } } -std::vector ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId, - const int pageWidth) { +std::vector ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) { const size_t totalWordCount = words.size(); std::vector wordWidths; wordWidths.reserve(totalWordCount); - // add em-space at the beginning of first word in paragraph to indent - if (!extraParagraphSpacing) { - std::string& first_word = words.front(); - first_word.insert(0, "\xe2\x80\x83"); - } - auto wordsIt = words.begin(); auto wordStylesIt = wordStyles.begin(); while (wordsIt != words.end()) { - uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt); - - if (width > pageWidth) { - HyphenSplitDecision decision{}; - if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) { - const std::string originalWord = *wordsIt; - const std::string tail = originalWord.substr(decision.byteOffset); - if (tail.empty()) { - continue; - } - std::string prefix = originalWord.substr(0, decision.byteOffset); - if (decision.appendHyphen) { - prefix += "-"; - } - - *wordsIt = prefix; - auto nextWordIt = words.insert(std::next(wordsIt), tail); - auto nextStyleIt = wordStyles.insert(std::next(wordStylesIt), *wordStylesIt); - // Continue processing the freshly inserted tail so cascading splits still respect the limit. - - wordWidths.push_back(decision.prefixWidth); - - wordsIt = nextWordIt; - wordStylesIt = nextStyleIt; - continue; - } - } - - wordWidths.push_back(width); + wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt)); std::advance(wordsIt, 1); std::advance(wordStylesIt, 1); @@ -180,149 +67,289 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere return wordWidths; } -std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, - const int spaceWidth, std::vector& wordWidths) { - std::vector lineBreakIndices; - if (words.empty()) { - return lineBreakIndices; - } +std::vector ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth, + const std::vector& wordWidths) const { + const size_t totalWordCount = words.size(); - if (wordWidths.empty()) { - return lineBreakIndices; - } + // DP table to store the minimum badness (cost) of lines starting at index i + std::vector dp(totalWordCount); + // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i' + std::vector ans(totalWordCount); - constexpr size_t MAX_LINES = 1000; - const auto appendLineBreak = [&](size_t index) { - const size_t clampedIndex = std::min(index, wordWidths.size()); - lineBreakIndices.push_back(clampedIndex); - return lineBreakIndices.size() < MAX_LINES; - }; + // Base Case + dp[totalWordCount - 1] = 0; + ans[totalWordCount - 1] = totalWordCount - 1; - const auto runDp = [&]() { - lineBreakIndices.clear(); - const size_t totalWordCount = wordWidths.size(); - if (totalWordCount == 0) { - return; - } + for (int i = totalWordCount - 2; i >= 0; --i) { + int currlen = -spaceWidth; + dp[i] = MAX_COST; - std::vector dp(totalWordCount); - std::vector ans(totalWordCount); - dp[totalWordCount - 1] = 0; - ans[totalWordCount - 1] = totalWordCount - 1; + for (size_t j = i; j < totalWordCount; ++j) { + // Current line length: previous width + space + current word width + currlen += wordWidths[j] + spaceWidth; - for (int i = static_cast(totalWordCount) - 2; i >= 0; --i) { - int currlen = -spaceWidth; - dp[i] = MAX_COST; + if (currlen > pageWidth) { + break; + } - for (size_t j = i; j < totalWordCount; ++j) { - currlen += wordWidths[j] + spaceWidth; - if (currlen > pageWidth) { - break; - } + int cost; + if (j == totalWordCount - 1) { + cost = 0; // Last line + } else { + const int remainingSpace = pageWidth - currlen; + // Use long long for the square to prevent overflow + const long long cost_ll = static_cast(remainingSpace) * remainingSpace + dp[j + 1]; - int cost; - if (j == totalWordCount - 1) { - cost = 0; + if (cost_ll > MAX_COST) { + cost = MAX_COST; } else { - const int remainingSpace = pageWidth - currlen; - const long long costLl = static_cast(remainingSpace) * remainingSpace + dp[j + 1]; - cost = costLl > MAX_COST ? MAX_COST : static_cast(costLl); + cost = static_cast(cost_ll); } + } - if (cost < dp[i]) { - dp[i] = cost; - ans[i] = j; - } + if (cost < dp[i]) { + dp[i] = cost; + ans[i] = j; // j is the index of the last word in this optimal line } } - size_t currentWordIndex = 0; - while (currentWordIndex < totalWordCount && lineBreakIndices.size() < MAX_LINES) { - const size_t nextBreakIndex = ans[currentWordIndex] + 1; - if (!appendLineBreak(nextBreakIndex)) { - break; + // Handle oversized word: if no valid configuration found, force single-word line + // This prevents cascade failure where one oversized word breaks all preceding words + if (dp[i] == MAX_COST) { + ans[i] = i; // Just this word on its own line + // Inherit cost from next word to allow subsequent words to find valid configurations + if (i + 1 < static_cast(totalWordCount)) { + dp[i] = dp[i + 1]; + } else { + dp[i] = 0; } - currentWordIndex = nextBreakIndex; } - }; + } - const auto runHyphenatedLayout = [&]() { - size_t currentIndex = 0; - bool continueLayout = true; + // Stores the index of the word that starts the next line (last_word_index + 1) + std::vector lineBreakIndices; + size_t currentWordIndex = 0; - while (currentIndex < wordWidths.size() && continueLayout) { - int lineWidth = 0; - size_t wordsOnLine = 0; + while (currentWordIndex < totalWordCount) { + size_t nextBreakIndex = ans[currentWordIndex] + 1; - while (currentIndex < wordWidths.size()) { - const int interWordSpace = (wordsOnLine == 0) ? 0 : spaceWidth; - const int projectedWidth = lineWidth + interWordSpace + wordWidths[currentIndex]; - - if (projectedWidth <= pageWidth) { - lineWidth = projectedWidth; - ++currentIndex; - ++wordsOnLine; - continue; - } - - auto wordNodeIt = words.begin(); - auto styleNodeIt = wordStyles.begin(); - std::advance(wordNodeIt, currentIndex); - std::advance(styleNodeIt, currentIndex); - if (wordNodeIt == words.end()) { - break; - } - - const int availableWidth = pageWidth - lineWidth - interWordSpace; - HyphenSplitDecision decision{}; - if (!chooseSplitForWidth(renderer, fontId, *wordNodeIt, *styleNodeIt, availableWidth, false, &decision)) { - break; - } - - const std::string originalWord = *wordNodeIt; - const std::string tail = originalWord.substr(decision.byteOffset); - if (tail.empty()) { - break; - } - std::string prefix = originalWord.substr(0, decision.byteOffset); - if (decision.appendHyphen) { - prefix += "-"; - } - - const EpdFontFamily::Style styleForSplit = *styleNodeIt; - *wordNodeIt = tail; - words.insert(wordNodeIt, prefix); - wordStyles.insert(styleNodeIt, styleForSplit); - - const uint16_t tailWidth = renderer.getTextWidth(fontId, tail.c_str(), styleForSplit); - wordWidths.insert(wordWidths.begin() + currentIndex, decision.prefixWidth); - wordWidths[currentIndex + 1] = tailWidth; - - lineWidth += interWordSpace + decision.prefixWidth; - ++currentIndex; - ++wordsOnLine; - break; - } - - if (wordsOnLine == 0) { - ++currentIndex; - } - - continueLayout = appendLineBreak(currentIndex); + // Safety check: prevent infinite loop if nextBreakIndex doesn't advance + if (nextBreakIndex <= currentWordIndex) { + // Force advance by at least one word to avoid infinite loop + nextBreakIndex = currentWordIndex + 1; } - }; - if (hyphenationEnabled) { - // Simple greedy layout with hyphenation - runHyphenatedLayout(); - } else { - // TeX-like optimal layout without hyphenation - runDp(); + lineBreakIndices.push_back(nextBreakIndex); + currentWordIndex = nextBreakIndex; } return lineBreakIndices; } +void ParsedText::applyParagraphIndent() { + if (extraParagraphSpacing || words.empty()) { + return; + } + + words.front().insert(0, "\xe2\x80\x83"); +} + +void ParsedText::ensureHyphenationFitsViewport(const GfxRenderer& renderer, const int fontId, + const int viewportWidth) { + if (!hyphenationEnabled || viewportWidth <= 0) { + return; + } + + auto wordIt = words.begin(); + auto styleIt = wordStyles.begin(); + + while (wordIt != words.end() && styleIt != wordStyles.end()) { + const int wordWidth = renderer.getTextWidth(fontId, wordIt->c_str(), *styleIt); + if (wordWidth <= viewportWidth) { + ++wordIt; + ++styleIt; + continue; + } + + auto chunks = hyphenateWordToFit(*wordIt, renderer, fontId, *styleIt, viewportWidth); + if (chunks.empty()) { + ++wordIt; + ++styleIt; + continue; + } + + *wordIt = chunks.front(); + + auto wordInsertPos = std::next(wordIt); + auto styleInsertPos = std::next(styleIt); + for (size_t idx = 1; idx < chunks.size(); ++idx) { + wordInsertPos = words.insert(wordInsertPos, chunks[idx]); + styleInsertPos = wordStyles.insert(styleInsertPos, *styleIt); + ++wordInsertPos; + ++styleInsertPos; + } + + ++wordIt; + ++styleIt; + } +} + +std::vector ParsedText::hyphenateWordToFit(const std::string& word, const GfxRenderer& renderer, + const int fontId, const EpdFontFamily::Style style, + const int viewportWidth) const { + std::vector chunks; + if (word.empty() || viewportWidth <= 0) { + return chunks; + } + + const int hyphenWidth = renderer.getTextWidth(fontId, "-", style); + std::string remaining = word; + + while (!remaining.empty()) { + const int remainingWidth = renderer.getTextWidth(fontId, remaining.c_str(), style); + if (remainingWidth <= viewportWidth) { + chunks.push_back(remaining); + break; + } + + const auto breakOffsets = Hyphenator::breakOffsets(remaining, true); + if (breakOffsets.empty()) { + return {}; + } + + size_t chosenOffset = 0; + int chosenWidth = -1; + for (const size_t offset : breakOffsets) { + if (offset == 0 || offset >= remaining.size()) { + continue; + } + + const std::string candidate = remaining.substr(0, offset); + const int candidateWidth = renderer.getTextWidth(fontId, candidate.c_str(), style) + hyphenWidth; + if (candidateWidth > viewportWidth) { + continue; + } + + if (candidateWidth > chosenWidth) { + chosenWidth = candidateWidth; + chosenOffset = offset; + } + } + + if (chosenWidth < 0) { + return {}; + } + + std::string chunk = remaining.substr(0, chosenOffset); + chunk.push_back('-'); + chunks.push_back(std::move(chunk)); + remaining.erase(0, chosenOffset); + } + + return chunks; +} + +// Builds break indices while opportunistically splitting the word that would overflow the current line. +std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId, + const int pageWidth, const int spaceWidth, + std::vector& wordWidths) { + std::vector lineBreakIndices; + size_t currentIndex = 0; + + while (currentIndex < wordWidths.size()) { + const size_t lineStart = currentIndex; + int lineWidth = 0; + + while (currentIndex < wordWidths.size()) { + const bool isFirstWord = currentIndex == lineStart; + const int spacing = isFirstWord ? 0 : spaceWidth; + const int candidateWidth = spacing + wordWidths[currentIndex]; + + if (lineWidth + candidateWidth <= pageWidth) { + lineWidth += candidateWidth; + currentIndex += 1; + continue; + } + + const int availableWidth = pageWidth - lineWidth - spacing; + if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths)) { + // Widths updated for the split word; retry with current index + continue; + } + + // Word cannot fit on current line (with or without hyphenation). + if (currentIndex == lineStart) { + // Force place at least one word to prevent stalling. + lineWidth += candidateWidth; + currentIndex += 1; + } + break; + } + + lineBreakIndices.push_back(currentIndex); + } + + return lineBreakIndices; +} + +// Splits words[wordIndex] into prefix+hyphen and remainder when a legal breakpoint fits the available width. +bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer, + const int fontId, std::vector& wordWidths) { + if (availableWidth <= 0 || wordIndex >= words.size()) { + return false; + } + + auto wordIt = words.begin(); + auto styleIt = wordStyles.begin(); + std::advance(wordIt, wordIndex); + std::advance(styleIt, wordIndex); + + const auto breakOffsets = Hyphenator::breakOffsets(*wordIt, true); + if (breakOffsets.empty()) { + return false; + } + + const auto style = *styleIt; + size_t chosenOffset = 0; + int chosenWidth = -1; + + for (const size_t offset : breakOffsets) { + if (offset == 0 || offset >= wordIt->size()) { + continue; + } + + std::string prefix = wordIt->substr(0, offset); + prefix.push_back('-'); + const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style); + if (prefixWidth > availableWidth) { + continue; + } + + if (prefixWidth > chosenWidth) { + chosenWidth = prefixWidth; + chosenOffset = offset; + } + } + + if (chosenWidth < 0) { + return false; + } + + std::string remainder = wordIt->substr(chosenOffset); + wordIt->resize(chosenOffset); + wordIt->push_back('-'); + + auto insertWordIt = std::next(wordIt); + auto insertStyleIt = std::next(styleIt); + words.insert(insertWordIt, remainder); + wordStyles.insert(insertStyleIt, style); + + wordWidths[wordIndex] = static_cast(chosenWidth); + const uint16_t remainderWidth = renderer.getTextWidth(fontId, remainder.c_str(), style); + wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth); + return true; +} + void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth, const std::vector& wordWidths, const std::vector& lineBreakIndices, const std::function)>& processLine) { @@ -330,19 +357,23 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0; const size_t lineWordCount = lineBreak - lastBreakAt; + // Calculate total word width for this line int lineWordWidthSum = 0; - for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) { - lineWordWidthSum += wordWidths[idx]; + for (size_t i = lastBreakAt; i < lineBreak; i++) { + lineWordWidthSum += wordWidths[i]; } - const bool isLastLine = breakIndex == lineBreakIndices.size() - 1; + // Calculate spacing const int spareSpace = pageWidth - lineWordWidthSum; int spacing = spaceWidth; + const bool isLastLine = breakIndex == lineBreakIndices.size() - 1; + if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) { spacing = spareSpace / (lineWordCount - 1); } + // Calculate initial x position uint16_t xpos = 0; if (style == TextBlock::RIGHT_ALIGN) { xpos = spareSpace - (lineWordCount - 1) * spaceWidth; @@ -350,6 +381,7 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; } + // Pre-calculate X positions for words std::list lineXPos; for (size_t i = lastBreakAt; i < lineBreak; i++) { const uint16_t currentWordWidth = wordWidths[i]; @@ -357,11 +389,13 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const xpos += currentWordWidth + spacing; } + // Iterators always start at the beginning as we are moving content with splice below auto wordEndIt = words.begin(); auto wordStyleEndIt = wordStyles.begin(); std::advance(wordEndIt, lineWordCount); std::advance(wordStyleEndIt, lineWordCount); + // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** std::list lineWords; lineWords.splice(lineWords.begin(), words, words.begin(), wordEndIt); std::list lineWordStyles; diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index f19b26cb..13f36a22 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -19,15 +19,23 @@ class ParsedText { bool extraParagraphSpacing; bool hyphenationEnabled; - std::vector computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, - std::vector& wordWidths); + void applyParagraphIndent(); + void ensureHyphenationFitsViewport(const GfxRenderer& renderer, int fontId, int viewportWidth); + std::vector hyphenateWordToFit(const std::string& word, const GfxRenderer& renderer, int fontId, + EpdFontFamily::Style style, int viewportWidth) const; + std::vector computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, + int spaceWidth, std::vector& wordWidths); + bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId, + std::vector& wordWidths); + std::vector computeLineBreaks(int pageWidth, int spaceWidth, const std::vector& wordWidths) const; void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector& wordWidths, const std::vector& lineBreakIndices, const std::function)>& processLine); - std::vector calculateWordWidths(const GfxRenderer& renderer, int fontId, int pageWidth); + std::vector calculateWordWidths(const GfxRenderer& renderer, int fontId); public: - explicit ParsedText(const TextBlock::Style style, const bool extraParagraphSpacing, const bool hyphenationEnabled) + explicit ParsedText(const TextBlock::Style style, const bool extraParagraphSpacing, + const bool hyphenationEnabled = false) : style(style), extraParagraphSpacing(extraParagraphSpacing), hyphenationEnabled(hyphenationEnabled) {} ~ParsedText() = default; @@ -39,4 +47,4 @@ class ParsedText { void layoutAndExtractLines(const GfxRenderer& renderer, int fontId, uint16_t viewportWidth, const std::function)>& processLine, bool includeLastLine = true); -}; +}; \ No newline at end of file