Rename trimTrailingPunctuation to trimSurroundingPunctuation and update logic to remove surrounding punctuation; add explicit hyphen handling in breakOffsets function.

format fix
Disable hyphenation feature in CrossPointSettings
2026-02-04 14:47:37 +03:00 · 2025-12-26 06:03:38 +05:00 · 2025-12-26 05:12:26 +05:00 · 2025-12-26 05:11:51 +05:00
7 changed files with 285 additions and 79 deletions
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@ -6,6 +6,8 @@
 #include <string>
 #include <vector>

+#include "HyphenationLiterals.h"
+
 namespace {

 char lowerLatinChar(const uint32_t cp) {
@ -47,10 +49,17 @@ bool isEnglishFricativeChar(const char c) {
  }
 }

-struct LatinLiteral {
-  const char* text;
-  size_t length;
-};
+// Literal type for the ASCII tables below: a pointer to the text plus its length in chars.
+using LatinLiteral = HyphenLiteralT<char>;
+
+// Lowercase English prefixes that appendMorphologyBreaks() hands to appendLiteralBreaks().
+// Each entry is {text, length}; the length has to equal the number of characters in the text
+// because the matcher reads exactly `length` characters from `text`.
+constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {
+    {{"anti", 4},  {"auto", 4}, {"counter", 7}, {"de", 2},    {"dis", 3},   {"hyper", 5}, {"inter", 5},
+     {"micro", 5}, {"mis", 3},  {"mono", 4},    {"multi", 5}, {"non", 3},   {"over", 4},  {"post", 4},
+     {"pre", 3},   {"pro", 3},  {"re", 2},      {"sub", 3},   {"super", 5}, {"trans", 5}}};
+
+// Lowercase English suffixes that appendMorphologyBreaks() hands to appendLiteralBreaks().
+// Entries use the same {text, length} layout as the prefix table.
+constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {
+    {{"able", 4}, {"ible", 4}, {"ing", 3},  {"ings", 4},   {"ed", 2},    {"er", 2},   {"ers", 3},   {"est", 3},
+     {"ful", 3},  {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2},    {"ment", 4}, {"ments", 5}, {"ness", 4},
+     {"ous", 3},  {"tion", 4}, {"sion", 4}, {"ward", 4},   {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};

 bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);

@ -63,21 +72,6 @@ std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
  return lower;
 }

-bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
-  if (!pattern.text || pattern.length == 0) {
-    return false;
-  }
-  if (start + pattern.length > lowerWord.size()) {
-    return false;
-  }
-  for (size_t i = 0; i < pattern.length; ++i) {
-    if (lowerWord[start + i] != pattern.text[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
 bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (start >= end || start >= cps.size()) {
    return false;
@ -91,56 +85,33 @@ bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t
  return false;
 }

+// Decides whether an English word may be split so that the right-hand fragment starts at cps[breakIndex].
+// A split is accepted only when all of the following hold:
+//   - breakIndex lies strictly inside the word,
+//   - the left fragment has at least MIN_PREFIX_CP code points and the right one at least MIN_SUFFIX_CP,
+//   - englishSegmentHasVowel() reports a vowel in each fragment,
+//   - nextToApostrophe() does not flag the position.
+bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t breakIndex) {
+  const size_t total = cps.size();
+  if (breakIndex == 0 || breakIndex >= total) {
+    return false;
+  }
+
+  // breakIndex is the number of code points left of the split; the remainder goes to the right.
+  const bool headLongEnough = breakIndex >= MIN_PREFIX_CP;
+  const bool tailLongEnough = total - breakIndex >= MIN_SUFFIX_CP;
+  if (!(headLongEnough && tailLongEnough)) {
+    return false;
+  }
+
+  const bool bothHaveVowel =
+      englishSegmentHasVowel(cps, 0, breakIndex) && englishSegmentHasVowel(cps, breakIndex, total);
+  return bothHaveVowel && !nextToApostrophe(cps, breakIndex);
+}
+
 void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
                            std::vector<size_t>& indexes) {
-  static constexpr std::array<LatinLiteral, 20> PREFIXES = {
-      {{"anti", 4},  {"auto", 4}, {"counter", 7}, {"de", 2},    {"dis", 3},   {"hyper", 5}, {"inter", 5},
-       {"micro", 5}, {"mis", 3},  {"mono", 4},    {"multi", 5}, {"non", 3},   {"over", 4},  {"post", 4},
-       {"pre", 3},   {"pro", 3},  {"re", 2},      {"sub", 3},   {"super", 5}, {"trans", 5}}};
-
-  static constexpr std::array<LatinLiteral, 24> SUFFIXES = {
-      {{"able", 4}, {"ible", 4}, {"ing", 3},  {"ings", 4},   {"ed", 2},    {"er", 2},   {"ers", 3},   {"est", 3},
-       {"ful", 3},  {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2},    {"ment", 4}, {"ments", 5}, {"ness", 4},
-       {"ous", 3},  {"tion", 4}, {"sion", 4}, {"ward", 4},   {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
-
-  const size_t length = cps.size();
-  if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
-    return;
-  }
-
-  const auto tryPush = [&](const size_t breakIndex) {
-    if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {
-      return;
-    }
-    if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {
-      return;
-    }
-    if (nextToApostrophe(cps, breakIndex)) {
-      return;
-    }
-    indexes.push_back(breakIndex);
-  };
-
-  for (const auto& prefix : PREFIXES) {
-    if (prefix.length == 0 || prefix.length >= length) {
-      continue;
-    }
-    if (!matchesPatternAt(lowerWord, 0, prefix)) {
-      continue;
-    }
-    tryPush(prefix.length);
-  }
-
-  for (const auto& suffix : SUFFIXES) {
-    if (suffix.length == 0 || suffix.length >= length) {
-      continue;
-    }
-    const size_t breakIndex = length - suffix.length;
-    if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
-      continue;
-    }
-    tryPush(breakIndex);
-  }
+  appendLiteralBreaks(
+      lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
+      [&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
 }

 struct CharPair {
@ -341,8 +312,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    const size_t rightVowel = vowelPositions[v + 1];

    if (rightVowel - leftVowel == 1) {
-      if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && rightVowel >= MIN_PREFIX_CP &&
-          cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToApostrophe(cps, rightVowel)) {
+      if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && englishBreakAllowed(cps, rightVowel)) {
        indexes.push_back(rightVowel);
      }
      continue;
@ -353,10 +323,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
    size_t breakIndex = clusterEnd - onsetLen;

-    if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) {
-      continue;
-    }
-    if (nextToApostrophe(cps, breakIndex)) {
+    if (!englishBreakAllowed(cps, breakIndex)) {
      continue;
    }
    indexes.push_back(breakIndex);
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@ -92,7 +92,10 @@ bool isPunctuation(const uint32_t cp) {
  }
 }

-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps) {
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
+  while (!cps.empty() && isPunctuation(cps.front().value)) {
+    cps.erase(cps.begin());
+  }
  while (!cps.empty() && isPunctuation(cps.back().value)) {
    cps.pop_back();
  }
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@ -28,6 +28,6 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps);
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);

 Script detectScript(const std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/HyphenationLiterals.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationLiterals.h
@ -0,0 +1,63 @@
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+// One prefix/suffix literal: `data` points at `length` elements of type T.
+// The struct only stores the pointer it is given; it holds no copy of the characters.
+template <class T>
+struct HyphenLiteral {
+  const T* data;
+  size_t length;
+};
+
+// Alias so call sites can spell the literal type as HyphenLiteralT<char>, HyphenLiteralT<uint32_t>, ...
+template <class T>
+using HyphenLiteralT = HyphenLiteral<T>;
+
+// Returns true when `literal` occurs in `word` starting exactly at position `start`.
+//
+// word    - indexable container with size() and operator[]
+// start   - position in `word` where the comparison begins
+// literal - object exposing `data` (pointer to elements) and `length` (element count)
+//
+// A null or empty literal never matches, and neither does a literal that would run past the end of the word.
+template <typename WordContainer, typename Literal>
+bool matchesLiteralAt(const WordContainer& word, const size_t start, const Literal& literal) {
+  if (literal.data == nullptr || literal.length == 0) {
+    return false;
+  }
+
+  // Compare against the remaining room instead of computing start + length: the sum wraps around for a
+  // very large `start`, which would let an out-of-range read slip past the check.
+  const size_t wordLength = word.size();
+  if (start > wordLength || literal.length > wordLength - start) {
+    return false;
+  }
+
+  for (size_t offset = 0; offset < literal.length; ++offset) {
+    if (word[start + offset] != literal.data[offset]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Appends break positions derived from known prefixes and suffixes to `indexes`.
+//
+// lowerWord    - the word to inspect; literals are compared element by element against it
+// prefixes     - literals tested at the start of the word; a match proposes a break right after the prefix
+// suffixes     - literals tested at the end of the word; a match proposes a break right before the suffix
+// breakAllowed - predicate called with each proposed position; only accepted positions are appended
+// indexes      - output vector; positions are appended in discovery order and may repeat
+//
+// Literals that are empty or not strictly shorter than the word are ignored.
+template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
+void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
+                         const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, std::vector<size_t>& indexes) {
+  const size_t wordLength = lowerWord.size();
+
+  // A literal is usable only when it is non-empty and leaves at least one element of the word uncovered.
+  const auto usable = [wordLength](const size_t literalLength) {
+    return literalLength != 0 && literalLength < wordLength;
+  };
+
+  for (const auto& prefix : prefixes) {
+    if (usable(prefix.length) && matchesLiteralAt(lowerWord, 0, prefix) && breakAllowed(prefix.length)) {
+      indexes.push_back(prefix.length);
+    }
+  }
+
+  for (const auto& suffix : suffixes) {
+    if (!usable(suffix.length)) {
+      continue;
+    }
+    const size_t cut = wordLength - suffix.length;
+    if (matchesLiteralAt(lowerWord, cut, suffix) && breakAllowed(cut)) {
+      indexes.push_back(cut);
+    }
+  }
+}
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -48,6 +48,32 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  return cps;
 }

+// True for the two code points treated as hyphens already present in the text:
+// U+002D ('-') and U+2010.
+bool isExplicitHyphen(const uint32_t cp) {
+  switch (cp) {
+    case 0x002D:  // '-'
+    case 0x2010:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Collects break positions that follow hyphens already present in the word.
+//
+// A hyphen qualifies only when both of its neighbours satisfy isAlphabetic(). For each qualifying hyphen
+// at position i the returned index is i + 1, i.e. the position just past the hyphen. Indexes come out in
+// strictly ascending order because the scan runs left to right and emits at most one index per position.
+std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
+  std::vector<size_t> indexes;
+
+  // A hyphen needs a neighbour on each side, so the first and last positions can never qualify.
+  // Restricting the range here also guarantees 0 < i + 1 < cps.size() for every emitted index.
+  for (size_t i = 1; i + 1 < cps.size(); ++i) {
+    if (!isExplicitHyphen(cps[i].value)) {
+      continue;
+    }
+    if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
+      continue;
+    }
+    indexes.push_back(i + 1);
+  }
+  return indexes;
+}
+
 // Rejects words containing punctuation or digits unless forced.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
@ -93,11 +119,22 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
  }

  auto cps = collectCodepoints(word);
-  trimTrailingPunctuation(cps);
+  trimSurroundingPunctuation(cps);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }

+  if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
+    std::sort(explicitIndexes.begin(), explicitIndexes.end());
+    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
+    std::vector<size_t> byteOffsets;
+    byteOffsets.reserve(explicitIndexes.size());
+    for (const size_t idx : explicitIndexes) {
+      byteOffsets.push_back(byteOffsetForIndex(cps, idx));
+    }
+    return byteOffsets;
+  }
+
  std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
  if (includeFallback) {
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
--- a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
@ -1,11 +1,128 @@
 #include "RussianHyphenator.h"

 #include <algorithm>
+#include <array>
 #include <limits>
 #include <vector>

+#include "HyphenationLiterals.h"
+
 namespace {

+// Literal type for the Cyrillic tables below: a pointer to code points plus their count.
+using CyrillicLiteral = HyphenLiteralT<uint32_t>;
+
+// Lowercase Russian prefixes, spelled out as Unicode code points.
+constexpr uint32_t PFX_BEZ[3] = {0x0431, 0x0435, 0x0437};                          // без
+constexpr uint32_t PFX_RAZ[3] = {0x0440, 0x0430, 0x0437};                          // раз
+constexpr uint32_t PFX_POD[3] = {0x043F, 0x043E, 0x0434};                          // под
+constexpr uint32_t PFX_NAD[3] = {0x043D, 0x0430, 0x0434};                          // над
+constexpr uint32_t PFX_PERE[4] = {0x043F, 0x0435, 0x0440, 0x0435};                 // пере
+constexpr uint32_t PFX_SVERH[5] = {0x0441, 0x0432, 0x0435, 0x0440, 0x0445};        // сверх
+constexpr uint32_t PFX_MEZH[3] = {0x043C, 0x0435, 0x0436};                         // меж
+constexpr uint32_t PFX_SUPER[5] = {0x0441, 0x0443, 0x043F, 0x0435, 0x0440};        // супер
+constexpr uint32_t PFX_PRED[4] = {0x043F, 0x0440, 0x0435, 0x0434};                 // пред
+constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};                 // само
+constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};                          // обо
+constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};  // против
+
+// Prefix table passed to appendLiteralBreaks(); each entry pairs an array above with its element count,
+// which has to match the array's declared size.
+constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
+                                                               {PFX_RAZ, 3},
+                                                               {PFX_POD, 3},
+                                                               {PFX_NAD, 3},
+                                                               {PFX_PERE, 4},
+                                                               {PFX_SVERH, 5},
+                                                               {PFX_MEZH, 3},
+                                                               {PFX_SUPER, 5},
+                                                               {PFX_PRED, 4},
+                                                               {PFX_SAMO, 4},
+                                                               {PFX_OBO, 3},
+                                                               {PFX_PROTIV, 6}}};
+
+// Lowercase Russian suffixes, spelled out as Unicode code points.
+// NOTE(review): SFX_NOST is "ност" without the trailing soft sign (0x044C); "ность" may have been
+// intended, since SFX_OST and SFX_TEL do include it - confirm.
+constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};                 // ност
+constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};                 // ство
+constexpr uint32_t SFX_ENIE[4] = {0x0435, 0x043D, 0x0438, 0x0435};                 // ение
+constexpr uint32_t SFX_ATION[4] = {0x0430, 0x0446, 0x0438, 0x044F};                // ация
+constexpr uint32_t SFX_CHIK[3] = {0x0447, 0x0438, 0x043A};                         // чик
+constexpr uint32_t SFX_NIK[3] = {0x043D, 0x0438, 0x043A};                          // ник
+constexpr uint32_t SFX_TEL[4] = {0x0442, 0x0435, 0x043B, 0x044C};                  // тель
+constexpr uint32_t SFX_SKII[4] = {0x0441, 0x043A, 0x0438, 0x0439};                 // ский
+constexpr uint32_t SFX_AL[6] = {0x0430, 0x043B, 0x044C, 0x043D, 0x044B, 0x0439};   // альный
+constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};                          // изм
+constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};          // ливый
+constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};                  // ость
+
+// Suffix table passed to appendLiteralBreaks(); same {array, element count} layout as the prefix table.
+constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
+                                                               {SFX_STVO, 4},
+                                                               {SFX_ENIE, 4},
+                                                               {SFX_ATION, 4},
+                                                               {SFX_CHIK, 3},
+                                                               {SFX_NIK, 3},
+                                                               {SFX_TEL, 4},
+                                                               {SFX_SKII, 4},
+                                                               {SFX_AL, 6},
+                                                               {SFX_ISM, 3},
+                                                               {SFX_LIV, 5},
+                                                               {SFX_OST, 4}}};
+
+// Builds a code point sequence parallel to `cps` in which every Cyrillic letter has been passed through
+// toLowerCyrillic(); all other code points are copied unchanged.
+std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
+  std::vector<uint32_t> folded;
+  folded.reserve(cps.size());
+  for (size_t i = 0; i < cps.size(); ++i) {
+    const auto cp = cps[i].value;
+    if (isCyrillicLetter(cp)) {
+      folded.push_back(toLowerCyrillic(cp));
+    } else {
+      folded.push_back(cp);
+    }
+  }
+  return folded;
+}
+
+// Reports whether any code point in the half-open range [start, end) of `cps` is a Cyrillic vowel.
+// `end` is clamped to the size of `cps`; an empty or out-of-range segment yields false.
+bool russianSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
+  const size_t total = cps.size();
+  if (start >= total) {
+    return false;
+  }
+
+  const size_t stop = std::min(end, total);
+  size_t pos = start;
+  while (pos < stop) {
+    if (isCyrillicVowel(cps[pos].value)) {
+      return true;
+    }
+    ++pos;
+  }
+  return false;
+}
+
+// Reports whether a break at `index` would put a doubled consonant at the start of the right-hand fragment:
+// cps[index] and cps[index + 1] are the same Cyrillic consonant (compared after toLowerCyrillic), with a
+// vowel directly before the pair and a vowel directly after it.
+bool exposesLeadingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
+  const size_t next = index + 1;
+  if (next >= cps.size()) {
+    return false;
+  }
+
+  const auto head = cps[index].value;
+  const auto follower = cps[next].value;
+  const bool sameConsonantTwice = isCyrillicConsonant(head) && isCyrillicConsonant(follower) &&
+                                  toLowerCyrillic(head) == toLowerCyrillic(follower);
+  if (!sameConsonantTwice) {
+    return false;
+  }
+
+  // The pair only counts when it sits between two vowels.
+  if (index == 0 || !isCyrillicVowel(cps[index - 1].value)) {
+    return false;
+  }
+  return index + 2 < cps.size() && isCyrillicVowel(cps[index + 2].value);
+}
+
+// Reports whether a break at `index` would leave a doubled consonant at the end of the left-hand fragment:
+// cps[index - 2] and cps[index - 1] are the same Cyrillic consonant (compared after toLowerCyrillic), with
+// a vowel directly before the pair and a vowel at cps[index].
+bool exposesTrailingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
+  if (index < 2) {
+    return false;
+  }
+
+  const auto outer = cps[index - 1].value;
+  const auto inner = cps[index - 2].value;
+  const bool sameConsonantTwice =
+      isCyrillicConsonant(outer) && isCyrillicConsonant(inner) && toLowerCyrillic(outer) == toLowerCyrillic(inner);
+  if (!sameConsonantTwice) {
+    return false;
+  }
+
+  // The pair only counts when it sits between two vowels.
+  const bool vowelBefore = index >= 3 && isCyrillicVowel(cps[index - 3].value);
+  if (!vowelBefore) {
+    return false;
+  }
+  return index < cps.size() && isCyrillicVowel(cps[index].value);
+}
+
+// A break at `index` violates the rule when it would leave a doubled consonant intact on either side of
+// the split, as reported by the leading and trailing checks.
+bool violatesDoubleConsonantRule(const std::vector<CodepointInfo>& cps, const size_t index) {
+  if (exposesLeadingDoubleConsonant(cps, index)) {
+    return true;
+  }
+  return exposesTrailingDoubleConsonant(cps, index);
+}
+
 // Checks if the codepoint is the Cyrillic soft sign (ь).
 bool isSoftSign(uint32_t cp) { return toLowerCyrillic(cp) == 0x044C; }

@ -163,10 +280,18 @@ bool russianBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t bre
    return false;
  }

+  if (!russianSegmentHasVowel(cps, 0, breakIndex) || !russianSegmentHasVowel(cps, breakIndex, cps.size())) {
+    return false;
+  }
+
  if (beginsWithForbiddenSuffix(cps, breakIndex)) {
    return false;
  }

+  if (violatesDoubleConsonantRule(cps, breakIndex)) {
+    return false;
+  }
+
  return true;
 }

@ -198,6 +323,13 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
  return isSoftOrHardSign(left) || isSoftOrHardSign(right);
 }

+// Adds prefix/suffix based break positions for a Russian word to `indexes`.
+// `lowerWord` is matched against RUSSIAN_PREFIXES and RUSSIAN_SUFFIXES; each proposed position is kept
+// only when russianBreakAllowed() accepts it for `cps`.
+void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
+                            std::vector<size_t>& indexes) {
+  const auto accepted = [&cps](const size_t candidate) { return russianBreakAllowed(cps, candidate); };
+  appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES, accepted, indexes);
+}
+
 // Produces syllable break indexes tailored to Russian phonotactics.
 std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
@ -205,6 +337,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
    return indexes;
  }

+  const auto lowerWord = lowercaseCyrillicWord(cps);
+
  std::vector<size_t> vowelPositions;
  vowelPositions.reserve(cps.size());
  for (size_t i = 0; i < cps.size(); ++i) {
@ -233,8 +367,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
    const size_t clusterEnd = rightVowel;

    size_t breakIndex = std::numeric_limits<size_t>::max();
-    if (const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
-        split != std::numeric_limits<size_t>::max()) {
+    const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
+    if (split != std::numeric_limits<size_t>::max()) {
      breakIndex = split;
    } else {
      const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
@ -257,6 +391,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
    indexes.push_back(breakIndex);
  }

+  appendMorphologyBreaks(cps, lowerWord, indexes);
+
  std::sort(indexes.begin(), indexes.end());
  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
  return indexes;
--- a/src/CrossPointSettings.h
+++ b/src/CrossPointSettings.h
@ -24,7 +24,7 @@ class CrossPointSettings {
  uint8_t extraParagraphSpacing = 1;
  // Duration of the power button press
  uint8_t shortPwrBtn = 0;
-  uint8_t hyphenationEnabled = 1;
+  uint8_t hyphenationEnabled = 0;

  ~CrossPointSettings() = default;
Author	SHA1	Message	Date
Arthur Tazhitdinov	f6767c857f	Rename trimTrailingPunctuation to trimSurroundingPunctuation and update logic to remove surrounding punctuation; add explicit hyphen handling in breakOffsets function.	2025-12-26 06:03:38 +05:00
Arthur Tazhitdinov	23183a6270	format fix	2025-12-26 05:12:26 +05:00
Arthur Tazhitdinov	3cf52d8bd1	Disable hyphenation feature in CrossPointSettings	2025-12-26 05:11:51 +05:00