From 10fa0cc060c7ed29acb9bdbec26f6b0450d96b20 Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Wed, 14 Jan 2026 22:56:12 +0500 Subject: [PATCH] use language hyphenator for overfit words --- lib/Epub/Epub/ParsedText.cpp | 28 ++++++++++++++++++++++++---- lib/Epub/Epub/ParsedText.h | 3 ++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 12681915..b08893f8 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -75,7 +75,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo // Use greedy layout that can split words mid-loop when a hyphenated prefix fits. lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); } else { - lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths); + lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths); } const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1; @@ -103,8 +103,25 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere return wordWidths; } -std::vector ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth, - const std::vector& wordWidths) const { +std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth, + const int spaceWidth, std::vector& wordWidths) { + if (words.empty()) { + return {}; + } + + // Ensure any word that would overflow even as the first entry on a line is split at a hyphenation point. + for (size_t i = 0; i < wordWidths.size(); ++i) { + while (wordWidths[i] > pageWidth) { + // Try language-aware hyphenation first; only fall back to heuristics when no dictionary break fits. 
+ if (hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/false)) { + continue; + } + if (!hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) { + break; + } + } + } + const size_t totalWordCount = words.size(); // DP table to store the minimum badness (cost) of lines starting at index i @@ -260,7 +277,10 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl const auto style = *styleIt; // Collect candidate breakpoints (byte offsets and hyphen requirements). - const auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks); + auto breakInfos = Hyphenator::breakOffsets(word, /*allowFallback=*/false); + if (breakInfos.empty() && allowFallbackBreaks) { + breakInfos = Hyphenator::breakOffsets(word, /*allowFallback=*/true); + } if (breakInfos.empty()) { return false; } diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index 1089054e..e72db7ef 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -20,7 +20,8 @@ class ParsedText { bool hyphenationEnabled; void applyParagraphIndent(); - std::vector computeLineBreaks(int pageWidth, int spaceWidth, const std::vector& wordWidths) const; + std::vector computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, + std::vector& wordWidths); std::vector computeHyphenatedLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth, std::vector& wordWidths); bool hyphenateWordAtIndex(size_t wordIndex, int availableWidth, const GfxRenderer& renderer, int fontId,