Enhance hyphenation logic: add morphology break handling and improve vowel detection

2026-02-04 22:57:50 +03:00 · 2025-12-26 04:36:19 +05:00 · 2025-12-26 04:36:19 +05:00 · 5d00e5ac0f
commit 5d00e5ac0f
parent 0fa50291b9
1 changed files with 104 additions and 0 deletions
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@ -1,7 +1,9 @@
 #include "EnglishHyphenator.h"
 #include <algorithm>
 #include <array>
 #include <initializer_list>
 #include <string>
 #include <vector>
 namespace {
@ -45,6 +47,105 @@ bool isEnglishFricativeChar(const char c) {
  }
 }
 struct LatinLiteral {
  const char* text;
  size_t length;
 };
 bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
 std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
  std::string lower;
  lower.reserve(cps.size());
  for (const auto& info : cps) {
    lower.push_back(lowerLatinChar(info.value));
  }
  return lower;
 }
 bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
  if (!pattern.text || pattern.length == 0) {
    return false;
  }
  if (start + pattern.length > lowerWord.size()) {
    return false;
  }
  for (size_t i = 0; i < pattern.length; ++i) {
    if (lowerWord[start + i] != pattern.text[i]) {
      return false;
    }
  }
  return true;
 }
 bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (start >= end || start >= cps.size()) {
    return false;
  }
  const size_t clampedEnd = std::min(end, cps.size());
  for (size_t i = start; i < clampedEnd; ++i) {
    if (isLatinVowel(cps[i].value)) {
      return true;
    }
  }
  return false;
 }
 void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
                            std::vector<size_t>& indexes) {
  static constexpr std::array<LatinLiteral, 20> PREFIXES = {{{"anti", 4},   {"auto", 4},   {"counter", 7}, {"de", 2},
                                                             {"dis", 3},    {"hyper", 5},  {"inter", 5},   {"micro", 5},
                                                             {"mis", 3},    {"mono", 4},   {"multi", 5},   {"non", 3},
                                                             {"over", 4},   {"post", 4},   {"pre", 3},     {"pro", 3},
                                                             {"re", 2},     {"sub", 3},    {"super", 5},   {"trans", 5}}};
  static constexpr std::array<LatinLiteral, 24> SUFFIXES = {{{"able", 4},   {"ible", 4},   {"ing", 3},    {"ings", 4},
                                                             {"ed", 2},     {"er", 2},     {"ers", 3},    {"est", 3},
                                                             {"ful", 3},    {"hood", 4},   {"less", 4},   {"lessly", 6},
                                                             {"ly", 2},     {"ment", 4},   {"ments", 5},  {"ness", 4},
                                                             {"ous", 3},    {"tion", 4},   {"sion", 4},   {"ward", 4},
                                                             {"wards", 5},  {"ship", 4},   {"ships", 5},  {"y", 1}}};
  const size_t length = cps.size();
  if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return;
  }
  const auto tryPush = [&](const size_t breakIndex) {
    if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {
      return;
    }
    if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {
      return;
    }
    if (nextToApostrophe(cps, breakIndex)) {
      return;
    }
    indexes.push_back(breakIndex);
  };
  for (const auto& prefix : PREFIXES) {
    if (prefix.length == 0 || prefix.length >= length) {
      continue;
    }
    if (!matchesPatternAt(lowerWord, 0, prefix)) {
      continue;
    }
    tryPush(prefix.length);
  }
  for (const auto& suffix : SUFFIXES) {
    if (suffix.length == 0 || suffix.length >= length) {
      continue;
    }
    const size_t breakIndex = length - suffix.length;
    if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
      continue;
    }
    tryPush(breakIndex);
  }
 }
 struct CharPair {
  char first;
  char second;
@ -225,6 +326,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    return indexes;
  }
  const auto lowerWord = lowercaseLatinWord(cps);
  std::vector<size_t> vowelPositions;
  vowelPositions.reserve(cps.size());
  for (size_t i = 0; i < cps.size(); ++i) {
@ -263,6 +365,8 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
    indexes.push_back(breakIndex);
  }
  appendMorphologyBreaks(cps, lowerWord, indexes);
  std::sort(indexes.begin(), indexes.end());
  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
  return indexes;