refactor: Remove the sort/dedup pass from buildExplicitBreakInfos; add explanatory comments and a const qualifier in liangBreakIndexes

2026-02-06 15:47:39 +03:00 · 2026-01-14 19:17:29 +05:00 · 2026-01-14 19:17:29 +05:00 · 04a084f6c8
commit 04a084f6c8
parent 5dab3d4fcf
2 changed files with 11 additions and 26 deletions
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -48,28 +48,6 @@ std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<Cod
    breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
  }

-  if (breaks.empty()) {
-    return breaks;
-  }
-
-  // Sort by byte offset so we can deduplicate sequential markers in-place.
-  std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
-    return lhs.byteOffset < rhs.byteOffset;
-  });
-
-  // Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag.
-  size_t writePos = 0;
-  for (size_t readPos = 1; readPos < breaks.size(); ++readPos) {
-    if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) {
-      // Merge: explicit hyphen wins over soft hyphen at same offset.
-      breaks[writePos].requiresInsertedHyphen =
-          breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen;
-    } else {
-      breaks[++writePos] = breaks[readPos];
-    }
-  }
-  breaks.resize(writePos + 1);
-
  return breaks;
 }

--- a/lib/Epub/Epub/hyphenation/LiangHyphenation.cpp
+++ b/lib/Epub/Epub/hyphenation/LiangHyphenation.cpp
@ -354,36 +354,43 @@ std::vector<size_t> liangBreakIndexes(const std::vector<CodepointInfo>& cps,
    return {};
  }

+  // Liang scores: one entry per augmented char (leading/trailing dots included).
  std::vector<uint8_t> scores(augmented.charCount(), 0);

+  // Walk every starting character position and stream bytes through the trie.
  for (size_t charStart = 0; charStart < augmented.charByteOffsets.size(); ++charStart) {
-    size_t byteStart = augmented.charByteOffsets[charStart];
+    const size_t byteStart = augmented.charByteOffsets[charStart];
    AutomatonState state = root;
+
    for (size_t cursor = byteStart; cursor < augmented.bytes.size(); ++cursor) {
      AutomatonState next;
      if (!transition(automaton, state, augmented.bytes[cursor], next)) {
-        break;
+        break;  // No more matches for this prefix.
      }
      state = next;

      if (state.levels && state.levelsLen > 0) {
        size_t offset = 0;
+        // Each packed byte stores the byte-distance delta and the Liang level digit.
        for (size_t i = 0; i < state.levelsLen; ++i) {
          const uint8_t packed = state.levels[i];
          const size_t dist = static_cast<size_t>(packed / 10);
          const uint8_t level = static_cast<uint8_t>(packed % 10);
+
          offset += dist;
          const size_t splitByte = byteStart + offset;
          if (splitByte >= augmented.byteToCharIndex.size()) {
            continue;
          }
+
          const int32_t boundary = augmented.byteToCharIndex[splitByte];
          if (boundary < 0) {
-            continue;
+            continue;  // Mid-codepoint byte, wait for the next one.
          }
          if (boundary < 2 || boundary + 2 > static_cast<int32_t>(augmented.charCount())) {
-            continue;
+            continue;  // Skip splits that land in the leading/trailing sentinels.
          }
+
          const size_t idx = static_cast<size_t>(boundary);
          if (idx >= scores.size()) {
            continue;