From 04a084f6c805c4e7c049809d809c026d98be3936 Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Wed, 14 Jan 2026 19:17:29 +0500 Subject: [PATCH] refactor: Simplify break information handling in buildExplicitBreakInfos and enhance clarity in liangBreakIndexes --- lib/Epub/Epub/hyphenation/Hyphenator.cpp | 22 ------------------- .../Epub/hyphenation/LiangHyphenation.cpp | 15 +++++++++---- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 3436a90b..5ad2bb85 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -48,28 +48,6 @@ std::vector buildExplicitBreakInfos(const std::vector liangBreakIndexes(const std::vector& cps, return {}; } + // Liang scores: one entry per augmented char (leading/trailing dots included). std::vector scores(augmented.charCount(), 0); + // Walk every starting character position and stream bytes through the trie. for (size_t charStart = 0; charStart < augmented.charByteOffsets.size(); ++charStart) { - size_t byteStart = augmented.charByteOffsets[charStart]; + const size_t byteStart = augmented.charByteOffsets[charStart]; AutomatonState state = root; + for (size_t cursor = byteStart; cursor < augmented.bytes.size(); ++cursor) { AutomatonState next; if (!transition(automaton, state, augmented.bytes[cursor], next)) { - break; + break; // No more matches for this prefix. } state = next; if (state.levels && state.levelsLen > 0) { size_t offset = 0; + // Each packed byte stores the byte-distance delta and the Liang level digit. for (size_t i = 0; i < state.levelsLen; ++i) { const uint8_t packed = state.levels[i]; const size_t dist = static_cast(packed / 10); const uint8_t level = static_cast(packed % 10); + offset += dist; const size_t splitByte = byteStart + offset; if (splitByte >= augmented.byteToCharIndex.size()) { continue; } + const int32_t boundary = augmented.byteToCharIndex[splitByte]; if (boundary < 0) { - continue; + continue; // Mid-codepoint byte, wait for the next one. } if (boundary < 2 || boundary + 2 > static_cast(augmented.charCount())) { - continue; + continue; // Skip splits that land in the leading/trailing sentinels. } + const size_t idx = static_cast(boundary); if (idx >= scores.size()) { continue;