optimization

2026-02-06 15:47:39 +03:00 · 2026-01-08 03:01:36 +05:00 · 2026-01-08 03:01:36 +05:00 · 58314e9efd
commit 58314e9efd
parent 34d2b81f40
4 changed files with 45 additions and 68 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@ -207,28 +207,29 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
      const int spacing = isFirstWord ? 0 : spaceWidth;
      const int candidateWidth = spacing + wordWidths[currentIndex];

+      // Word fits on current line
      if (lineWidth + candidateWidth <= pageWidth) {
        lineWidth += candidateWidth;
-        currentIndex += 1;
+        ++currentIndex;
        continue;
      }

      // Word would overflow — try to split based on hyphenation points
      const int availableWidth = pageWidth - lineWidth - spacing;
-      const bool allowFallbackBreaks =
-          isFirstWord;  // Permit fallback breaks only when first word one the line still overflows
+      const bool allowFallbackBreaks = isFirstWord;  // Only for first word on line
+
      if (availableWidth > 0 &&
          hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
-        // Prefix now fits; append it to this line and immediately move to the next line
+        // Prefix now fits; append it to this line and move to next line
        lineWidth += spacing + wordWidths[currentIndex];
-        currentIndex += 1;
+        ++currentIndex;
        break;
      }

      // Could not split: force at least one word per line to avoid infinite loop
      if (currentIndex == lineStart) {
        lineWidth += candidateWidth;
-        currentIndex += 1;
+        ++currentIndex;
      }
      break;
    }
@ -249,19 +250,21 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
    return false;
  }

-  // Position iterators at the target word and its style entry.
+  // Get iterators to target word and style.
  auto wordIt = words.begin();
  auto styleIt = wordStyles.begin();
  std::advance(wordIt, wordIndex);
  std::advance(styleIt, wordIndex);

+  const std::string& word = *wordIt;
+  const auto style = *styleIt;
+
  // Collect candidate breakpoints (byte offsets and hyphen requirements).
-  const auto breakInfos = Hyphenator::breakOffsets(*wordIt, allowFallbackBreaks);
+  const auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
  if (breakInfos.empty()) {
    return false;
  }

-  const auto style = *styleIt;
  size_t chosenOffset = 0;
  int chosenWidth = -1;
  bool chosenNeedsHyphen = true;
@ -269,22 +272,19 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
  // Iterate over each legal breakpoint and retain the widest prefix that still fits.
  for (const auto& info : breakInfos) {
    const size_t offset = info.byteOffset;
-    if (offset == 0 || offset >= wordIt->size()) {
+    if (offset == 0 || offset >= word.size()) {
      continue;
    }

    const bool needsHyphen = info.requiresInsertedHyphen;
-    std::string prefix = wordIt->substr(0, offset);
-    const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
-    if (prefixWidth > availableWidth) {
-      continue;
+    const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
+    if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
+      continue;  // Skip if too wide or not an improvement
    }

-    if (prefixWidth > chosenWidth) {
-      chosenWidth = prefixWidth;
-      chosenOffset = offset;
-      chosenNeedsHyphen = needsHyphen;
-    }
+    chosenWidth = prefixWidth;
+    chosenOffset = offset;
+    chosenNeedsHyphen = needsHyphen;
  }

  if (chosenWidth < 0) {
@ -293,7 +293,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
  }

  // Split the word at the selected breakpoint and append a hyphen if required.
-  std::string remainder = wordIt->substr(chosenOffset);
+  std::string remainder = word.substr(chosenOffset);
  wordIt->resize(chosenOffset);
  if (chosenNeedsHyphen) {
    wordIt->push_back('-');
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@ -59,8 +59,6 @@ bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !is

 bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }

-bool isVowel(const uint32_t cp) { return isLatinVowel(cp) || isCyrillicVowel(cp); }
-
 bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    case '.':
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@ -25,7 +25,6 @@ bool isCyrillicVowel(uint32_t cp);
 bool isCyrillicConsonant(uint32_t cp);

 bool isAlphabetic(uint32_t cp);
-bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
 bool isAsciiDigit(uint32_t cp);
 bool isExplicitHyphen(uint32_t cp);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -3,7 +3,6 @@
 #include <Utf8.h>

 #include <algorithm>
-#include <array>
 #include <vector>

 #include "EnglishHyphenator.h"
@ -32,12 +31,6 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
  return nullptr;
 }

-// Preferred language hint; empty means "auto".
-std::string& preferredLanguage() {
-  static std::string lang;
-  return lang;
-}
-
 // Cached hyphenator instance for the current preferred language.
 const LanguageHyphenator*& cachedHyphenator() {
  static const LanguageHyphenator* hyphenator = nullptr;
@ -86,67 +79,54 @@ void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {

 // Asks the language hyphenator for legal break positions inside the word.
 std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
-  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
-    return {};
-  }
-
  if (const auto* hyphenator = cachedHyphenator()) {
-    auto indexes = hyphenator->breakIndexes(cps);
-    return indexes;
+    return hyphenator->breakIndexes(cps);
  }
-
  return {};
 }

 // Maps a codepoint index back to its byte offset inside the source word.
 size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
-  if (index >= cps.size()) {
-    return cps.empty() ? 0 : cps.back().byteOffset;
-  }
-  return cps[index].byteOffset;
+  return (index < cps.size()) ? cps[index].byteOffset : (cps.empty() ? 0 : cps.back().byteOffset);
 }

 // Builds a vector of break information from explicit hyphen markers in the given codepoints.
 std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
  std::vector<Hyphenator::BreakInfo> breaks;
-  breaks.reserve(cps.size());

  // Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
-  for (size_t i = 0; i < cps.size(); ++i) {
+  for (size_t i = 1; i + 1 < cps.size(); ++i) {
    const uint32_t cp = cps[i].value;
-    if (!isExplicitHyphen(cp) || i == 0 || i + 1 >= cps.size()) {
-      continue;  // Need at least one alphabetic character on both sides.
-    }
-    if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
+    if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
      continue;
    }
    // Offset points to the next codepoint so rendering starts after the hyphen marker.
-    breaks.push_back({byteOffsetForIndex(cps, i + 1), isSoftHyphen(cp)});
+    breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
  }

  if (breaks.empty()) {
    return breaks;
  }

-  // Sort by byte offset so we can deduplicate sequential markers.
-  // Multiple dash codepoints can point to the same byte offset once punctuation is trimmed; sort before merging.
+  // Sort by byte offset so we can deduplicate sequential markers in-place.
  std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
    return lhs.byteOffset < rhs.byteOffset;
  });

-  // Ensure we keep a single entry per break while retaining the "needs hyphen" flag when any marker requested it.
-  std::vector<Hyphenator::BreakInfo> deduped;
-  deduped.reserve(breaks.size());
-  for (const auto& entry : breaks) {
-    if (!deduped.empty() && deduped.back().byteOffset == entry.byteOffset) {
-      // Merge entries so that an explicit hyphen wins over a soft hyphen at the same offset.
-      deduped.back().requiresInsertedHyphen = deduped.back().requiresInsertedHyphen || entry.requiresInsertedHyphen;
+  // Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag.
+  size_t writePos = 0;
+  for (size_t readPos = 1; readPos < breaks.size(); ++readPos) {
+    if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) {
+      // Merge: explicit hyphen wins over soft hyphen at same offset.
+      breaks[writePos].requiresInsertedHyphen =
+          breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen;
    } else {
-      deduped.push_back(entry);
+      breaks[++writePos] = breaks[readPos];
    }
  }
+  breaks.resize(writePos + 1);

-  return deduped;
+  return breaks;
 }

 }  // namespace
@ -170,22 +150,25 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
    return explicitBreakInfos;
  }

-  // Ask language hyphenator for legal break points, optionally augment with naive fallback.
+  // Ask language hyphenator for legal break points.
  std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
+
+  // When fallback is requested, append every naive break position, then sort and deduplicate the merged list.
  if (includeFallback) {
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
      indexes.push_back(idx);
    }
+    // Sort and drop duplicates: fallback positions were appended after any language-specific ones.
+    std::sort(indexes.begin(), indexes.end());
+    indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
+  } else if (indexes.empty()) {
+    return {};
  }

  if (indexes.empty()) {
    return {};
  }

-  // Sort/deduplicate break indexes before converting them back to byte offsets.
-  std::sort(indexes.begin(), indexes.end());
-  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
-
  std::vector<Hyphenator::BreakInfo> breaks;
  breaks.reserve(indexes.size());
  for (const size_t idx : indexes) {
@ -195,7 +178,4 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
  return breaks;
 }

-void Hyphenator::setPreferredLanguage(const std::string& lang) {
-  preferredLanguage() = lang;
-  cachedHyphenator() = hyphenatorForLanguage(lang);
-}
+void Hyphenator::setPreferredLanguage(const std::string& lang) { cachedHyphenator() = hyphenatorForLanguage(lang); }