diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp index ca179d8f..bbda2525 100644 --- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp @@ -1,4 +1,5 @@ #include "EnglishHyphenator.h" +#include "HyphenationLiterals.h" #include #include @@ -47,10 +48,20 @@ bool isEnglishFricativeChar(const char c) { } } -struct LatinLiteral { - const char* text; - size_t length; -}; +using LatinLiteral = HyphenLiteralT; + +constexpr std::array ENGLISH_PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, + {"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5}, + {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, + {"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3}, + {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}}; + +constexpr std::array ENGLISH_SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, + {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3}, + {"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, + {"ly", 2}, {"ment", 4}, {"ments", 5},{"ness", 4}, + {"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, + {"wards", 5},{"ship", 4}, {"ships", 5},{"y", 1}}}; bool nextToApostrophe(const std::vector& cps, size_t index); @@ -63,21 +74,6 @@ std::string lowercaseLatinWord(const std::vector& cps) { return lower; } -bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) { - if (!pattern.text || pattern.length == 0) { - return false; - } - if (start + pattern.length > lowerWord.size()) { - return false; - } - for (size_t i = 0; i < pattern.length; ++i) { - if (lowerWord[start + i] != pattern.text[i]) { - return false; - } - } - return true; -} - bool englishSegmentHasVowel(const std::vector& cps, const size_t start, const size_t end) { if (start >= end || start >= cps.size()) { return false; @@ -91,56 +87,32 @@ bool englishSegmentHasVowel(const std::vector& cps, const size_t return false; } +bool englishBreakAllowed(const std::vector& cps, const size_t breakIndex) { + if (breakIndex == 0 || breakIndex >= cps.size()) { + return false; + } + + const size_t prefixLen = breakIndex; + const size_t suffixLen = cps.size() - breakIndex; + if (prefixLen < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP) { + return false; + } + + if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, cps.size())) { + return false; + } + + if (nextToApostrophe(cps, breakIndex)) { + return false; + } + + return true; +} + void appendMorphologyBreaks(const std::vector& cps, const std::string& lowerWord, std::vector& indexes) { - static constexpr std::array PREFIXES = { - {{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5}, - {"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4}, - {"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}}; - - static constexpr std::array SUFFIXES = { - {{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3}, - {"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4}, - {"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}}; - - const size_t length = cps.size(); - if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) { - return; - } - - const auto tryPush = [&](const size_t breakIndex) { - if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) { - return; - } - if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) { - return; - } - if (nextToApostrophe(cps, breakIndex)) { - return; - } - indexes.push_back(breakIndex); - }; - - for (const auto& prefix : PREFIXES) { - if (prefix.length == 0 || prefix.length >= length) { - continue; - } - if (!matchesPatternAt(lowerWord, 0, prefix)) { - continue; - } - tryPush(prefix.length); - } - - for (const auto& suffix : SUFFIXES) { - if (suffix.length == 0 || suffix.length >= length) { - continue; - } - const size_t breakIndex = length - suffix.length; - if (!matchesPatternAt(lowerWord, breakIndex, suffix)) { - continue; - } - tryPush(breakIndex); - } + appendLiteralBreaks(lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES, + [&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes); } struct CharPair { @@ -341,8 +313,8 @@ std::vector englishBreakIndexes(const std::vector& cps) { const size_t rightVowel = vowelPositions[v + 1]; if (rightVowel - leftVowel == 1) { - if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && rightVowel >= MIN_PREFIX_CP && - cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToApostrophe(cps, rightVowel)) { + if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && + englishBreakAllowed(cps, rightVowel)) { indexes.push_back(rightVowel); } continue; @@ -353,10 +325,7 @@ std::vector englishBreakIndexes(const std::vector& cps) { const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd); size_t breakIndex = clusterEnd - onsetLen; - if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) { - continue; - } - if (nextToApostrophe(cps, breakIndex)) { + if (!englishBreakAllowed(cps, breakIndex)) { continue; } indexes.push_back(breakIndex); diff --git a/lib/Epub/Epub/hyphenation/HyphenationLiterals.h b/lib/Epub/Epub/hyphenation/HyphenationLiterals.h new file mode 100644 index 00000000..5273ac8d --- /dev/null +++ b/lib/Epub/Epub/hyphenation/HyphenationLiterals.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include + +template +struct HyphenLiteral { + const T* data; + size_t length; +}; + +template +using HyphenLiteralT = HyphenLiteral; + +template +bool matchesLiteralAt(const WordContainer& word, const size_t start, const Literal& literal) { + if (!literal.data || literal.length == 0) { + return false; + } + if (start + literal.length > word.size()) { + return false; + } + for (size_t i = 0; i < literal.length; ++i) { + if (word[start + i] != literal.data[i]) { + return false; + } + } + return true; +} + +template +void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes, + const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, + std::vector& indexes) { + const size_t length = lowerWord.size(); + + const auto tryPush = [&](const size_t breakIndex) { + if (!breakAllowed(breakIndex)) { + return; + } + indexes.push_back(breakIndex); + }; + + for (const auto& literal : prefixes) { + if (literal.length == 0 || literal.length >= length) { + continue; + } + if (!matchesLiteralAt(lowerWord, 0, literal)) { + continue; + } + tryPush(literal.length); + } + + for (const auto& literal : suffixes) { + if (literal.length == 0 || literal.length >= length) { + continue; + } + const size_t breakIndex = length - literal.length; + if (!matchesLiteralAt(lowerWord, breakIndex, literal)) { + continue; + } + tryPush(breakIndex); + } +} diff --git a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp index 120ad7c0..8807dfe1 100644 --- a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp @@ -1,11 +1,111 @@ #include "RussianHyphenator.h" +#include "HyphenationLiterals.h" #include +#include #include #include namespace { +using CyrillicLiteral = HyphenLiteralT; + +constexpr uint32_t PFX_BEZ[3] = {0x0431, 0x0435, 0x0437}; +constexpr uint32_t PFX_RAZ[3] = {0x0440, 0x0430, 0x0437}; +constexpr uint32_t PFX_POD[3] = {0x043F, 0x043E, 0x0434}; +constexpr uint32_t PFX_NAD[3] = {0x043D, 0x0430, 0x0434}; +constexpr uint32_t PFX_PERE[4] = {0x043F, 0x0435, 0x0440, 0x0435}; +constexpr uint32_t PFX_SVERH[5] = {0x0441, 0x0432, 0x0435, 0x0440, 0x0445}; +constexpr uint32_t PFX_MEZH[3] = {0x043C, 0x0435, 0x0436}; +constexpr uint32_t PFX_SUPER[5] = {0x0441, 0x0443, 0x043F, 0x0435, 0x0440}; +constexpr uint32_t PFX_PRED[4] = {0x043F, 0x0440, 0x0435, 0x0434}; +constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E}; +constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E}; +constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432}; + +constexpr std::array RUSSIAN_PREFIXES = {{{PFX_BEZ, 3}, {PFX_RAZ, 3}, {PFX_POD, 3}, + {PFX_NAD, 3}, {PFX_PERE, 4}, {PFX_SVERH, 5}, + {PFX_MEZH, 3}, {PFX_SUPER, 5},{PFX_PRED, 4}, + {PFX_SAMO, 4}, {PFX_OBO, 3}, {PFX_PROTIV, 6}}}; + +constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442}; +constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E}; +constexpr uint32_t SFX_ENIE[4] = {0x0435, 0x043D, 0x0438, 0x0435}; +constexpr uint32_t SFX_ATION[4] = {0x0430, 0x0446, 0x0438, 0x044F}; +constexpr uint32_t SFX_CHIK[3] = {0x0447, 0x0438, 0x043A}; +constexpr uint32_t SFX_NIK[3] = {0x043D, 0x0438, 0x043A}; +constexpr uint32_t SFX_TEL[4] = {0x0442, 0x0435, 0x043B, 0x044C}; +constexpr uint32_t SFX_SKII[4] = {0x0441, 0x043A, 0x0438, 0x0439}; +constexpr uint32_t SFX_AL[6] = {0x0430, 0x043B, 0x044C, 0x043D, 0x044B, 0x0439}; +constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C}; +constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439}; +constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C}; + +constexpr std::array RUSSIAN_SUFFIXES = {{{SFX_NOST, 4}, {SFX_STVO, 4}, {SFX_ENIE, 4}, + {SFX_ATION, 4}, {SFX_CHIK, 3}, {SFX_NIK, 3}, + {SFX_TEL, 4}, {SFX_SKII, 4}, {SFX_AL, 6}, + {SFX_ISM, 3}, {SFX_LIV, 5}, {SFX_OST, 4}}}; + +std::vector lowercaseCyrillicWord(const std::vector& cps) { + std::vector lower; + lower.reserve(cps.size()); + for (const auto& info : cps) { + lower.push_back(isCyrillicLetter(info.value) ? toLowerCyrillic(info.value) : info.value); + } + return lower; +} + +bool russianSegmentHasVowel(const std::vector& cps, const size_t start, const size_t end) { + if (start >= cps.size()) { + return false; + } + const size_t clampedEnd = std::min(end, cps.size()); + for (size_t i = start; i < clampedEnd; ++i) { + if (isCyrillicVowel(cps[i].value)) { + return true; + } + } + return false; +} + +bool exposesLeadingDoubleConsonant(const std::vector& cps, const size_t index) { + if (index + 1 >= cps.size()) { + return false; + } + const auto first = cps[index].value; + const auto second = cps[index + 1].value; + if (!isCyrillicConsonant(first) || !isCyrillicConsonant(second)) { + return false; + } + if (toLowerCyrillic(first) != toLowerCyrillic(second)) { + return false; + } + const bool hasLeftVowel = index > 0 && isCyrillicVowel(cps[index - 1].value); + const bool hasRightVowel = (index + 2 < cps.size()) && isCyrillicVowel(cps[index + 2].value); + return hasLeftVowel && hasRightVowel; +} + +bool exposesTrailingDoubleConsonant(const std::vector& cps, const size_t index) { + if (index < 2) { + return false; + } + const auto last = cps[index - 1].value; + const auto prev = cps[index - 2].value; + if (!isCyrillicConsonant(last) || !isCyrillicConsonant(prev)) { + return false; + } + if (toLowerCyrillic(last) != toLowerCyrillic(prev)) { + return false; + } + const bool hasLeftVowel = (index >= 3) && isCyrillicVowel(cps[index - 3].value); + const bool hasRightVowel = (index < cps.size()) && isCyrillicVowel(cps[index].value); + return hasLeftVowel && hasRightVowel; +} + +bool violatesDoubleConsonantRule(const std::vector& cps, const size_t index) { + return exposesLeadingDoubleConsonant(cps, index) || exposesTrailingDoubleConsonant(cps, index); +} + // Checks if the codepoint is the Cyrillic soft sign (ь). bool isSoftSign(uint32_t cp) { return toLowerCyrillic(cp) == 0x044C; } @@ -163,10 +263,18 @@ bool russianBreakAllowed(const std::vector& cps, const size_t bre return false; } + if (!russianSegmentHasVowel(cps, 0, breakIndex) || !russianSegmentHasVowel(cps, breakIndex, cps.size())) { + return false; + } + if (beginsWithForbiddenSuffix(cps, breakIndex)) { return false; } + if (violatesDoubleConsonantRule(cps, breakIndex)) { + return false; + } + return true; } @@ -198,6 +306,12 @@ bool nextToSoftSign(const std::vector& cps, const size_t index) { return isSoftOrHardSign(left) || isSoftOrHardSign(right); } +void appendMorphologyBreaks(const std::vector& cps, const std::vector& lowerWord, + std::vector& indexes) { + appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES, + [&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes); +} + // Produces syllable break indexes tailored to Russian phonotactics. std::vector russianBreakIndexes(const std::vector& cps) { std::vector indexes; @@ -205,6 +319,8 @@ std::vector russianBreakIndexes(const std::vector& cps) { return indexes; } + const auto lowerWord = lowercaseCyrillicWord(cps); + std::vector vowelPositions; vowelPositions.reserve(cps.size()); for (size_t i = 0; i < cps.size(); ++i) { @@ -233,8 +349,8 @@ std::vector russianBreakIndexes(const std::vector& cps) { const size_t clusterEnd = rightVowel; size_t breakIndex = std::numeric_limits::max(); - if (const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd); - split != std::numeric_limits::max()) { + const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd); + if (split != std::numeric_limits::max()) { breakIndex = split; } else { const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd); @@ -257,6 +373,8 @@ std::vector russianBreakIndexes(const std::vector& cps) { indexes.push_back(breakIndex); } + appendMorphologyBreaks(cps, lowerWord, indexes); + std::sort(indexes.begin(), indexes.end()); indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); return indexes; diff --git a/src/CrossPointSettings.h b/src/CrossPointSettings.h index 985643b3..1211dc19 100644 --- a/src/CrossPointSettings.h +++ b/src/CrossPointSettings.h @@ -24,7 +24,7 @@ class CrossPointSettings { uint8_t extraParagraphSpacing = 1; // Duration of the power button press uint8_t shortPwrBtn = 0; - uint8_t hyphenationEnabled = 1; + uint8_t hyphenationEnabled = 0; ~CrossPointSettings() = default;