From 3ef2448f72e3bb208c06dfda475824c2debdab09 Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Thu, 8 Jan 2026 03:27:27 +0500 Subject: [PATCH] optimization --- .../Epub/hyphenation/EnglishHyphenator.cpp | 19 +++---- .../Epub/hyphenation/RussianHyphenator.cpp | 50 +++++++++---------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp index 3cf64c87..a4ec4822 100644 --- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp @@ -290,14 +290,11 @@ bool nextToApostrophe(const std::vector& cps, const size_t index) // Returns byte indexes where the word may break according to English syllable rules. std::vector englishBreakIndexes(const std::vector& cps) { std::vector indexes; - if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { - return indexes; - } + const size_t wordSize = cps.size(); - const auto lowerWord = lowercaseLatinWord(cps); std::vector vowelPositions; - vowelPositions.reserve(cps.size()); - for (size_t i = 0; i < cps.size(); ++i) { + vowelPositions.reserve(wordSize / 2); + for (size_t i = 0; i < wordSize; ++i) { if (isLatinVowel(cps[i].value)) { vowelPositions.push_back(i); } @@ -321,7 +318,7 @@ std::vector englishBreakIndexes(const std::vector& cps) { const size_t clusterStart = leftVowel + 1; const size_t clusterEnd = rightVowel; const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd); - size_t breakIndex = clusterEnd - onsetLen; + const size_t breakIndex = clusterEnd - onsetLen; if (!englishBreakAllowed(cps, breakIndex)) { continue; @@ -329,10 +326,14 @@ std::vector englishBreakIndexes(const std::vector& cps) { indexes.push_back(breakIndex); } + const auto lowerWord = lowercaseLatinWord(cps); + const size_t preDedupeCount = indexes.size(); appendMorphologyBreaks(cps, lowerWord, indexes); - std::sort(indexes.begin(), indexes.end()); - indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + if (indexes.size() > preDedupeCount) { + std::sort(indexes.begin(), indexes.end()); + indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + } return indexes; } diff --git a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp index d7fd0342..acae84b1 100644 --- a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp @@ -333,68 +333,68 @@ void appendMorphologyBreaks(const std::vector& cps, const std::ve // Produces syllable break indexes tailored to Russian phonotactics. std::vector russianBreakIndexes(const std::vector& cps) { std::vector indexes; - if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { - return indexes; - } - - const auto lowerWord = lowercaseCyrillicWord(cps); + const size_t wordSize = cps.size(); + // Collect vowel positions. std::vector vowelPositions; - vowelPositions.reserve(cps.size()); - for (size_t i = 0; i < cps.size(); ++i) { + vowelPositions.reserve(wordSize / 2); // Typical estimate: ~50% vowels + for (size_t i = 0; i < wordSize; ++i) { if (isCyrillicVowel(cps[i].value)) { vowelPositions.push_back(i); } } + // Need at least 2 vowels to create a syllable break. if (vowelPositions.size() < 2) { return indexes; } + // Process inter-vowel clusters for hyphenation points. for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) { const size_t leftVowel = vowelPositions[v]; const size_t rightVowel = vowelPositions[v + 1]; + const size_t suffixLen = wordSize - rightVowel; + // Adjacent vowels: can break between them if constraints allow. if (rightVowel - leftVowel == 1) { - if (rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) && + if (rightVowel >= MIN_PREFIX_CP && suffixLen >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) && russianBreakAllowed(cps, rightVowel)) { indexes.push_back(rightVowel); } continue; } + // Consonant cluster between vowels: find optimal break point. const size_t clusterStart = leftVowel + 1; const size_t clusterEnd = rightVowel; - size_t breakIndex = std::numeric_limits::max(); - const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd); - if (split != std::numeric_limits::max()) { - breakIndex = split; - } else { + // Try double consonant split first (preferred). + size_t breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd); + + // Fall back to onset-based split. + if (breakIndex == std::numeric_limits::max()) { const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd); breakIndex = clusterEnd - onsetLen; } - if (breakIndex == std::numeric_limits::max()) { + // Validate candidate break point. + if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) || + !russianBreakAllowed(cps, breakIndex)) { continue; } - if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) { - continue; - } - if (nextToSoftSign(cps, breakIndex)) { - continue; - } - if (!russianBreakAllowed(cps, breakIndex)) { - continue; - } indexes.push_back(breakIndex); } + const auto lowerWord = lowercaseCyrillicWord(cps); + const size_t preDedupeCount = indexes.size(); appendMorphologyBreaks(cps, lowerWord, indexes); - std::sort(indexes.begin(), indexes.end()); - indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + if (indexes.size() > preDedupeCount) { + std::sort(indexes.begin(), indexes.end()); + indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + } + return indexes; }