From 58314e9efdebd47f5704cfcc9f764f0ac937cb3b Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Thu, 8 Jan 2026 03:01:36 +0500 Subject: [PATCH] optimization --- lib/Epub/Epub/ParsedText.cpp | 40 +++++------ .../Epub/hyphenation/HyphenationCommon.cpp | 2 - lib/Epub/Epub/hyphenation/HyphenationCommon.h | 1 - lib/Epub/Epub/hyphenation/Hyphenator.cpp | 70 +++++++------------ 4 files changed, 45 insertions(+), 68 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 8e3d2f30..12681915 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -207,28 +207,29 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r const int spacing = isFirstWord ? 0 : spaceWidth; const int candidateWidth = spacing + wordWidths[currentIndex]; + // Word fits on current line if (lineWidth + candidateWidth <= pageWidth) { lineWidth += candidateWidth; - currentIndex += 1; + ++currentIndex; continue; } // Word would overflow — try to split based on hyphenation points const int availableWidth = pageWidth - lineWidth - spacing; - const bool allowFallbackBreaks = - isFirstWord; // Permit fallback breaks only when first word one the line still overflows + const bool allowFallbackBreaks = isFirstWord; // Only for first word on line + if (availableWidth > 0 && hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) { - // Prefix now fits; append it to this line and immediately move to the next line + // Prefix now fits; append it to this line and move to next line lineWidth += spacing + wordWidths[currentIndex]; - currentIndex += 1; + ++currentIndex; break; } // Could not split: force at least one word per line to avoid infinite loop if (currentIndex == lineStart) { lineWidth += candidateWidth; - currentIndex += 1; + ++currentIndex; } break; } @@ -249,19 +250,21 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl return false; } - // Position iterators at the target word and its style entry. + // Get iterators to target word and style. auto wordIt = words.begin(); auto styleIt = wordStyles.begin(); std::advance(wordIt, wordIndex); std::advance(styleIt, wordIndex); + const std::string& word = *wordIt; + const auto style = *styleIt; + // Collect candidate breakpoints (byte offsets and hyphen requirements). - const auto breakInfos = Hyphenator::breakOffsets(*wordIt, allowFallbackBreaks); + const auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks); if (breakInfos.empty()) { return false; } - const auto style = *styleIt; size_t chosenOffset = 0; int chosenWidth = -1; bool chosenNeedsHyphen = true; @@ -269,22 +272,19 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl // Iterate over each legal breakpoint and retain the widest prefix that still fits. for (const auto& info : breakInfos) { const size_t offset = info.byteOffset; - if (offset == 0 || offset >= wordIt->size()) { + if (offset == 0 || offset >= word.size()) { continue; } const bool needsHyphen = info.requiresInsertedHyphen; - std::string prefix = wordIt->substr(0, offset); - const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen); - if (prefixWidth > availableWidth) { - continue; + const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen); + if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) { + continue; // Skip if too wide or not an improvement } - if (prefixWidth > chosenWidth) { - chosenWidth = prefixWidth; - chosenOffset = offset; - chosenNeedsHyphen = needsHyphen; - } + chosenWidth = prefixWidth; + chosenOffset = offset; + chosenNeedsHyphen = needsHyphen; } if (chosenWidth < 0) { @@ -293,7 +293,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl } // Split the word at the selected breakpoint and append a hyphen if required. - std::string remainder = wordIt->substr(chosenOffset); + std::string remainder = word.substr(chosenOffset); wordIt->resize(chosenOffset); if (chosenNeedsHyphen) { wordIt->push_back('-'); diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index a18361c3..541f6c34 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -59,8 +59,6 @@ bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !is bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); } -bool isVowel(const uint32_t cp) { return isLatinVowel(cp) || isCyrillicVowel(cp); } - bool isPunctuation(const uint32_t cp) { switch (cp) { case '.': diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index 0e68ef04..927ac4c3 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -25,7 +25,6 @@ bool isCyrillicVowel(uint32_t cp); bool isCyrillicConsonant(uint32_t cp); bool isAlphabetic(uint32_t cp); -bool isVowel(uint32_t cp); bool isPunctuation(uint32_t cp); bool isAsciiDigit(uint32_t cp); bool isExplicitHyphen(uint32_t cp); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 4f22cafc..09d0ab66 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include "EnglishHyphenator.h" @@ -32,12 +31,6 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) { return nullptr; } -// Preferred language hint; empty means "auto". -std::string& preferredLanguage() { - static std::string lang; - return lang; -} - // Cached hyphenator instance for the current preferred language. const LanguageHyphenator*& cachedHyphenator() { static const LanguageHyphenator* hyphenator = nullptr; @@ -86,67 +79,54 @@ void trimTrailingFootnoteReference(std::vector& cps) { // Asks the language hyphenator for legal break positions inside the word. std::vector collectBreakIndexes(const std::vector& cps) { - if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { - return {}; - } - if (const auto* hyphenator = cachedHyphenator()) { - auto indexes = hyphenator->breakIndexes(cps); - return indexes; + return hyphenator->breakIndexes(cps); } - return {}; } // Maps a codepoint index back to its byte offset inside the source word. size_t byteOffsetForIndex(const std::vector& cps, const size_t index) { - if (index >= cps.size()) { - return cps.empty() ? 0 : cps.back().byteOffset; - } - return cps[index].byteOffset; + return (index < cps.size()) ? cps[index].byteOffset : (cps.empty() ? 0 : cps.back().byteOffset); } // Builds a vector of break information from explicit hyphen markers in the given codepoints. std::vector buildExplicitBreakInfos(const std::vector& cps) { std::vector breaks; - breaks.reserve(cps.size()); // Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters. - for (size_t i = 0; i < cps.size(); ++i) { + for (size_t i = 1; i + 1 < cps.size(); ++i) { const uint32_t cp = cps[i].value; - if (!isExplicitHyphen(cp) || i == 0 || i + 1 >= cps.size()) { - continue; // Need at least one alphabetic character on both sides. - } - if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) { + if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) { continue; } // Offset points to the next codepoint so rendering starts after the hyphen marker. - breaks.push_back({byteOffsetForIndex(cps, i + 1), isSoftHyphen(cp)}); + breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)}); } if (breaks.empty()) { return breaks; } - // Sort by byte offset so we can deduplicate sequential markers. - // Multiple dash codepoints can point to the same byte offset once punctuation is trimmed; sort before merging. + // Sort by byte offset so we can deduplicate sequential markers in-place. std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) { return lhs.byteOffset < rhs.byteOffset; }); - // Ensure we keep a single entry per break while retaining the "needs hyphen" flag when any marker requested it. - std::vector deduped; - deduped.reserve(breaks.size()); - for (const auto& entry : breaks) { - if (!deduped.empty() && deduped.back().byteOffset == entry.byteOffset) { - // Merge entries so that an explicit hyphen wins over a soft hyphen at the same offset. - deduped.back().requiresInsertedHyphen = deduped.back().requiresInsertedHyphen || entry.requiresInsertedHyphen; + // Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag. + size_t writePos = 0; + for (size_t readPos = 1; readPos < breaks.size(); ++readPos) { + if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) { + // Merge: explicit hyphen wins over soft hyphen at same offset. + breaks[writePos].requiresInsertedHyphen = + breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen; } else { - deduped.push_back(entry); + breaks[++writePos] = breaks[readPos]; } } + breaks.resize(writePos + 1); - return deduped; + return breaks; } } // namespace @@ -170,22 +150,25 @@ std::vector Hyphenator::breakOffsets(const std::string& w return explicitBreakInfos; } - // Ask language hyphenator for legal break points, optionally augment with naive fallback. + // Ask language hyphenator for legal break points. std::vector indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector(); + + // Only add fallback breaks if needed and deduplicate if both language and fallback breaks exist. if (includeFallback) { for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) { indexes.push_back(idx); } + // Only deduplicate if we have both language-specific and fallback breaks. + std::sort(indexes.begin(), indexes.end()); + indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + } else if (indexes.empty()) { + return {}; } if (indexes.empty()) { return {}; } - // Sort/deduplicate break indexes before converting them back to byte offsets. - std::sort(indexes.begin(), indexes.end()); - indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); - std::vector breaks; breaks.reserve(indexes.size()); for (const size_t idx : indexes) { @@ -195,7 +178,4 @@ std::vector Hyphenator::breakOffsets(const std::string& w return breaks; } -void Hyphenator::setPreferredLanguage(const std::string& lang) { - preferredLanguage() = lang; - cachedHyphenator() = hyphenatorForLanguage(lang); -} +void Hyphenator::setPreferredLanguage(const std::string& lang) { cachedHyphenator() = hyphenatorForLanguage(lang); }