Enhance hyphenation logic: add morphology break handling and improve vowel detection

This commit is contained in:
Arthur Tazhitdinov 2025-12-26 04:36:19 +05:00
parent 0fa50291b9
commit 5d00e5ac0f

View File

@ -1,7 +1,9 @@
#include "EnglishHyphenator.h" #include "EnglishHyphenator.h"
#include <algorithm> #include <algorithm>
#include <array>
#include <initializer_list> #include <initializer_list>
#include <string>
#include <vector> #include <vector>
namespace { namespace {
@ -45,6 +47,105 @@ bool isEnglishFricativeChar(const char c) {
} }
} }
// A string literal bundled with its character count. Keeping the length next
// to the pointer lets pattern tables be declared constexpr and compared
// without measuring the string at run time.
struct LatinLiteral {
  const char* text;  // First character of the pattern; the struct does not own it.
  size_t length;     // How many characters of `text` belong to the pattern.
};
// Forward declaration so appendMorphologyBreaks can call this helper ahead of
// its definition; appendMorphologyBreaks discards a break position when this
// returns true for it.
// NOTE(review): presumably reports whether position `index` in `cps` sits
// directly beside an apostrophe — confirm against the definition.
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
std::string lower;
lower.reserve(cps.size());
for (const auto& info : cps) {
lower.push_back(lowerLatinChar(info.value));
}
return lower;
}
// Reports whether `pattern` occurs in `lowerWord` beginning at offset `start`.
//
// Returns false when the pattern has a null pointer or zero length, or when
// it would run past the end of the word. The comparison is character-for-
// character with no case folding, so the word must already be in the same
// case as the pattern.
bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
  const bool usablePattern = pattern.text != nullptr && pattern.length != 0;
  if (!usablePattern || start + pattern.length > lowerWord.size()) {
    return false;
  }
  // The range [start, start + length) was bounds-checked above, so the word's
  // buffer can be compared against the pattern directly.
  return std::equal(pattern.text, pattern.text + pattern.length, lowerWord.data() + start);
}
bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
if (start >= end || start >= cps.size()) {
return false;
}
const size_t clampedEnd = std::min(end, cps.size());
for (size_t i = start; i < clampedEnd; ++i) {
if (isLatinVowel(cps[i].value)) {
return true;
}
}
return false;
}
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
std::vector<size_t>& indexes) {
static constexpr std::array<LatinLiteral, 20> PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
static constexpr std::array<LatinLiteral, 24> SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
{"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
{"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
const size_t length = cps.size();
if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
return;
}
const auto tryPush = [&](const size_t breakIndex) {
if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {
return;
}
if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {
return;
}
if (nextToApostrophe(cps, breakIndex)) {
return;
}
indexes.push_back(breakIndex);
};
for (const auto& prefix : PREFIXES) {
if (prefix.length == 0 || prefix.length >= length) {
continue;
}
if (!matchesPatternAt(lowerWord, 0, prefix)) {
continue;
}
tryPush(prefix.length);
}
for (const auto& suffix : SUFFIXES) {
if (suffix.length == 0 || suffix.length >= length) {
continue;
}
const size_t breakIndex = length - suffix.length;
if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
continue;
}
tryPush(breakIndex);
}
}
struct CharPair { struct CharPair {
char first; char first;
char second; char second;
@ -225,6 +326,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
return indexes; return indexes;
} }
const auto lowerWord = lowercaseLatinWord(cps);
std::vector<size_t> vowelPositions; std::vector<size_t> vowelPositions;
vowelPositions.reserve(cps.size()); vowelPositions.reserve(cps.size());
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
@ -263,6 +365,8 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
indexes.push_back(breakIndex); indexes.push_back(breakIndex);
} }
appendMorphologyBreaks(cps, lowerWord, indexes);
std::sort(indexes.begin(), indexes.end()); std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
return indexes; return indexes;