optimization

2026-02-06 15:47:39 +03:00 · 2026-01-08 03:27:27 +05:00 · 2026-01-08 03:27:27 +05:00 · 3ef2448f72
commit 3ef2448f72
parent 58314e9efd
2 changed files with 35 additions and 34 deletions
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@@ -290,14 +290,11 @@ bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index)
 // Returns byte indexes where the word may break according to English syllable rules.
 std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
-  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
+  const size_t wordSize = cps.size();
    return indexes;
  }
  const auto lowerWord = lowercaseLatinWord(cps);
  std::vector<size_t> vowelPositions;
-  vowelPositions.reserve(cps.size());
+  vowelPositions.reserve(wordSize / 2);
-  for (size_t i = 0; i < cps.size(); ++i) {
+  for (size_t i = 0; i < wordSize; ++i) {
    if (isLatinVowel(cps[i].value)) {
      vowelPositions.push_back(i);
    }
@@ -321,7 +318,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    const size_t clusterStart = leftVowel + 1;
    const size_t clusterEnd = rightVowel;
    const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
-    size_t breakIndex = clusterEnd - onsetLen;
+    const size_t breakIndex = clusterEnd - onsetLen;
    if (!englishBreakAllowed(cps, breakIndex)) {
      continue;
@@ -329,10 +326,14 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    indexes.push_back(breakIndex);
  }
  const auto lowerWord = lowercaseLatinWord(cps);
  const size_t preDedupeCount = indexes.size();
  appendMorphologyBreaks(cps, lowerWord, indexes);
  if (indexes.size() > preDedupeCount) {
    std::sort(indexes.begin(), indexes.end());
    indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
  }
  return indexes;
 }
--- a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
@@ -333,68 +333,68 @@ void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::ve
 // Produces syllable break indexes tailored to Russian phonotactics.
 std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
-  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
+  const size_t wordSize = cps.size();
    return indexes;
  }
  const auto lowerWord = lowercaseCyrillicWord(cps);
  // Collect vowel positions.
  std::vector<size_t> vowelPositions;
-  vowelPositions.reserve(cps.size());
+  vowelPositions.reserve(wordSize / 2);  // Typical estimate: ~50% vowels
-  for (size_t i = 0; i < cps.size(); ++i) {
+  for (size_t i = 0; i < wordSize; ++i) {
    if (isCyrillicVowel(cps[i].value)) {
      vowelPositions.push_back(i);
    }
  }
  // Need at least 2 vowels to create a syllable break.
  if (vowelPositions.size() < 2) {
    return indexes;
  }
  // Process inter-vowel clusters for hyphenation points.
  for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
    const size_t leftVowel = vowelPositions[v];
    const size_t rightVowel = vowelPositions[v + 1];
    const size_t suffixLen = wordSize - rightVowel;
    // Adjacent vowels: can break between them if constraints allow.
    if (rightVowel - leftVowel == 1) {
-      if (rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
+      if (rightVowel >= MIN_PREFIX_CP && suffixLen >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
          russianBreakAllowed(cps, rightVowel)) {
        indexes.push_back(rightVowel);
      }
      continue;
    }
    // Consonant cluster between vowels: find optimal break point.
    const size_t clusterStart = leftVowel + 1;
    const size_t clusterEnd = rightVowel;
-    size_t breakIndex = std::numeric_limits<size_t>::max();
+    // Try double consonant split first (preferred).
-    const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
+    size_t breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd);
-    if (split != std::numeric_limits<size_t>::max()) {
+    
-      breakIndex = split;
+    // Fall back to onset-based split.
-    } else {
+    if (breakIndex == std::numeric_limits<size_t>::max()) {
      const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
      breakIndex = clusterEnd - onsetLen;
    }
-    if (breakIndex == std::numeric_limits<size_t>::max()) {
+    // Validate candidate break point.
    if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) ||
        !russianBreakAllowed(cps, breakIndex)) {
      continue;
    }
    if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) {
      continue;
    }
    if (nextToSoftSign(cps, breakIndex)) {
      continue;
    }
    if (!russianBreakAllowed(cps, breakIndex)) {
      continue;
    }
    indexes.push_back(breakIndex);
  }
  const auto lowerWord = lowercaseCyrillicWord(cps);
  const size_t preDedupeCount = indexes.size();
  appendMorphologyBreaks(cps, lowerWord, indexes);
  if (indexes.size() > preDedupeCount) {
    std::sort(indexes.begin(), indexes.end());
    indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
  }
  return indexes;
 }