Switch to trie-packed Liang hyphenation dictionaries

Arthur Tazhitdinov 2026-01-09 20:54:31 +05:00
parent c83fd37286
commit 0b3e029484
21 changed files with 15771 additions and 850 deletions

.gitignore vendored
View File

@@ -4,3 +4,5 @@
.vscode
lib/EpdFont/fontsrc
*.generated.h
build
**/__pycache__/

View File

@@ -78,12 +78,6 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
bookMetadata.coverItemHref = opfParser.coverItemHref;
bookMetadata.textReferenceHref = opfParser.textReferenceHref;
if (!bookMetadata.language.empty()) {
Serial.printf("[%lu] [EBP] OPF language: %s\n", millis(), bookMetadata.language.c_str());
} else {
Serial.printf("[%lu] [EBP] OPF language: <none>\n", millis());
}
if (!opfParser.tocNcxPath.empty()) {
tocNcxItem = opfParser.tocNcxPath;
}

View File

@@ -188,7 +188,6 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); },
progressFn);
Hyphenator::setPreferredLanguage(epub->getLanguage());
Serial.printf("[%lu] [SCT] Hyphenation language set to: %s\n", millis(), epub->getLanguage().c_str());
success = visitor.parseAndBuildPages();
SdMan.remove(tmpHtmlPath.c_str());

View File

@@ -1,343 +1,9 @@
#include "EnglishHyphenator.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <string>
#include <vector>
#include "HyphenationLiterals.h"
#include "LiangHyphenation.h"
#include "generated/hyph-en-us.trie.h"
namespace {
char lowerLatinChar(const uint32_t cp) {
if (!isLatinLetter(cp)) {
return 0;
}
return static_cast<char>(toLowerLatin(cp));
}
bool isEnglishApproximantChar(const char c) { return c == 'l' || c == 'r' || c == 'w' || c == 'y'; }
bool isEnglishStopChar(const char c) {
switch (c) {
case 'p':
case 'b':
case 't':
case 'd':
case 'k':
case 'g':
case 'c':
case 'q':
return true;
default:
return false;
}
}
bool isEnglishFricativeChar(const char c) {
switch (c) {
case 'f':
case 'v':
case 's':
case 'z':
case 'h':
case 'x':
return true;
default:
return false;
}
}
using LatinLiteral = HyphenLiteralT<char>;
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {
{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5},
{"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4},
{"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {
{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
std::string lower;
lower.reserve(cps.size());
for (const auto& info : cps) {
lower.push_back(lowerLatinChar(info.value));
}
return lower;
}
bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
if (start >= end || start >= cps.size()) {
return false;
}
const size_t clampedEnd = std::min(end, cps.size());
for (size_t i = start; i < clampedEnd; ++i) {
if (isLatinVowel(cps[i].value)) {
return true;
}
}
return false;
}
bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t breakIndex) {
if (breakIndex == 0 || breakIndex >= cps.size()) {
return false;
}
const size_t prefixLen = breakIndex;
const size_t suffixLen = cps.size() - breakIndex;
if (prefixLen < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP) {
return false;
}
if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, cps.size())) {
return false;
}
if (nextToApostrophe(cps, breakIndex)) {
return false;
}
return true;
}
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
std::vector<size_t>& indexes) {
appendLiteralBreaks(
lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
}
struct CharPair {
char first;
char second;
};
bool matchesDigraph(const char first, const char second, const std::initializer_list<CharPair>& pairs) {
for (const auto& pair : pairs) {
if (pair.first == first && pair.second == second) {
return true;
}
}
return false;
}
bool isEnglishDiphthong(const uint32_t first, const uint32_t second) {
if (!isLatinLetter(first) || !isLatinLetter(second)) {
return false;
}
const auto f = static_cast<char>(toLowerLatin(first));
const auto s = static_cast<char>(toLowerLatin(second));
switch (f) {
case 'a':
return s == 'i' || s == 'y' || s == 'u';
case 'e':
return s == 'a' || s == 'e' || s == 'i' || s == 'o' || s == 'u' || s == 'y';
case 'i':
return s == 'e' || s == 'u' || s == 'a';
case 'o':
return s == 'a' || s == 'e' || s == 'i' || s == 'o' || s == 'u' || s == 'y';
case 'u':
return s == 'i' || s == 'a' || s == 'e';
}
return false;
}
bool isValidEnglishOnsetBigram(const uint32_t firstCp, const uint32_t secondCp) {
const char first = lowerLatinChar(firstCp);
const char second = lowerLatinChar(secondCp);
if (!first || !second) {
return false;
}
if (matchesDigraph(first, second,
{{'c', 'h'},
{'s', 'h'},
{'t', 'h'},
{'p', 'h'},
{'w', 'h'},
{'w', 'r'},
{'k', 'n'},
{'g', 'n'},
{'p', 's'},
{'p', 't'},
{'p', 'n'},
{'r', 'h'}})) {
return true;
}
if (isEnglishStopChar(first) && isEnglishApproximantChar(second)) {
return true;
}
if (isEnglishFricativeChar(first) && isEnglishApproximantChar(second)) {
return true;
}
if (first == 's' && (second == 'p' || second == 't' || second == 'k' || second == 'm' || second == 'n' ||
second == 'f' || second == 'l' || second == 'w' || second == 'c')) {
return true;
}
if (second == 'y' && (first == 'p' || first == 'b' || first == 't' || first == 'd' || first == 'f' || first == 'k' ||
first == 'g' || first == 'h' || first == 'm' || first == 'n' || first == 'l' || first == 's')) {
return true;
}
return false;
}
bool isValidEnglishOnsetTrigram(const uint32_t firstCp, const uint32_t secondCp, const uint32_t thirdCp) {
const char first = lowerLatinChar(firstCp);
const char second = lowerLatinChar(secondCp);
const char third = lowerLatinChar(thirdCp);
if (!first || !second || !third) {
return false;
}
if (first == 's') {
if (second == 'p' && (third == 'l' || third == 'r' || third == 'w')) {
return true;
}
if (second == 't' && (third == 'r' || third == 'w' || third == 'y')) {
return true;
}
if (second == 'k' && (third == 'l' || third == 'r' || third == 'w')) {
return true;
}
if (second == 'c' && (third == 'l' || third == 'r')) {
return true;
}
if (second == 'f' && third == 'r') {
return true;
}
if (second == 'h' && third == 'r') {
return true;
}
}
if (first == 't' && second == 'h' && third == 'r') {
return true;
}
return false;
}
// Verifies that the consonant cluster could begin an English syllable.
bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
if (start >= end) {
return false;
}
for (size_t i = start; i < end; ++i) {
const char ch = lowerLatinChar(cps[i].value);
if (!ch) {
return false;
}
if (!isLatinConsonant(cps[i].value) && ch != 'y') {
return false;
}
}
const size_t len = end - start;
if (len == 1) {
return true;
}
if (len == 2) {
return isValidEnglishOnsetBigram(cps[start].value, cps[start + 1].value);
}
if (len == 3) {
return isValidEnglishOnsetTrigram(cps[start].value, cps[start + 1].value, cps[start + 2].value);
}
return false;
}
// Picks the longest legal onset inside the consonant cluster between vowels.
size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
const size_t clusterLen = clusterEnd - clusterStart;
if (clusterLen == 0) {
return 0;
}
const size_t maxLen = std::min<size_t>(3, clusterLen);
for (size_t len = maxLen; len >= 1; --len) {
const size_t suffixStart = clusterEnd - len;
if (englishClusterIsValidOnset(cps, suffixStart, clusterEnd)) {
return len;
}
}
return 1;
}
// Avoids creating hyphen positions adjacent to apostrophes (e.g., contractions).
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index) {
if (index == 0 || index >= cps.size()) {
return false;
}
const auto left = cps[index - 1].value;
const auto right = cps[index].value;
return left == '\'' || right == '\'';
}
// Returns byte indexes where the word may break according to English syllable rules.
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
std::vector<size_t> indexes;
const size_t wordSize = cps.size();
std::vector<size_t> vowelPositions;
vowelPositions.reserve(wordSize / 2);
for (size_t i = 0; i < wordSize; ++i) {
if (isLatinVowel(cps[i].value)) {
vowelPositions.push_back(i);
}
}
if (vowelPositions.size() < 2) {
return indexes;
}
for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
const size_t leftVowel = vowelPositions[v];
const size_t rightVowel = vowelPositions[v + 1];
if (rightVowel - leftVowel == 1) {
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && englishBreakAllowed(cps, rightVowel)) {
indexes.push_back(rightVowel);
}
continue;
}
const size_t clusterStart = leftVowel + 1;
const size_t clusterEnd = rightVowel;
const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
const size_t breakIndex = clusterEnd - onsetLen;
if (!englishBreakAllowed(cps, breakIndex)) {
continue;
}
indexes.push_back(breakIndex);
}
const auto lowerWord = lowercaseLatinWord(cps);
const size_t preDedupeCount = indexes.size();
appendMorphologyBreaks(cps, lowerWord, indexes);
if (indexes.size() > preDedupeCount) {
std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
}
return indexes;
}
} // namespace
const EnglishHyphenator& EnglishHyphenator::instance() {
static EnglishHyphenator instance;
@@ -345,5 +11,8 @@ const EnglishHyphenator& EnglishHyphenator::instance() {
}
std::vector<size_t> EnglishHyphenator::breakIndexes(const std::vector<CodepointInfo>& cps) const {
return englishBreakIndexes(cps);
// The shared Liang engine needs to know which letters are valid, how to lowercase them, and what
// TeX-style prefix/suffix minima to respect (currently set to lefthyphenmin=2 and righthyphenmin=2)
const LiangWordConfig config(isLatinLetter, toLowerLatin, minPrefix(), minSuffix());
return liangBreakIndexes(cps, en_us_patterns, config);
}

View File

@@ -8,6 +8,9 @@ class EnglishHyphenator final : public LanguageHyphenator {
static const EnglishHyphenator& instance();
std::vector<size_t> breakIndexes(const std::vector<CodepointInfo>& cps) const override;
// Keep both minima at two characters to mirror Pyphen defaults.
size_t minPrefix() const override { return 2; }
size_t minSuffix() const override { return 2; }
private:
EnglishHyphenator() = default;

View File

@@ -0,0 +1,14 @@
#include "GermanHyphenator.h"
#include "LiangHyphenation.h"
#include "generated/hyph-de.trie.h"
const GermanHyphenator& GermanHyphenator::instance() {
static GermanHyphenator instance;
return instance;
}
std::vector<size_t> GermanHyphenator::breakIndexes(const std::vector<CodepointInfo>& cps) const {
const LiangWordConfig config(isLatinLetter, toLowerLatin, minPrefix(), minSuffix());
return liangBreakIndexes(cps, de_patterns, config);
}

View File

@@ -0,0 +1,14 @@
#pragma once
#include "LanguageHyphenator.h"
// Implements Liang hyphenation rules for German (Latin script).
class GermanHyphenator final : public LanguageHyphenator {
public:
static const GermanHyphenator& instance();
std::vector<size_t> breakIndexes(const std::vector<CodepointInfo>& cps) const override;
private:
GermanHyphenator() = default;
};

View File

@@ -2,6 +2,7 @@
namespace {
// Convert Latin uppercase letters (A-Z) to lowercase (a-z)
uint32_t toLowerLatinImpl(const uint32_t cp) {
if (cp >= 'A' && cp <= 'Z') {
return cp - 'A' + 'a';
@@ -9,6 +10,9 @@ uint32_t toLowerLatinImpl(const uint32_t cp) {
return cp;
}
// Convert Cyrillic uppercase letters to lowercase
// Cyrillic uppercase range 0x0410-0x042F maps to lowercase by adding 0x20
// Special case: Cyrillic capital IO (0x0401) maps to lowercase io (0x0451)
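// e.g. U+0416 (Ж) -> U+0436 (ж) via the +0x20 rule, and U+0401 (Ё) -> U+0451 (ё) as the special case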
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
if (cp >= 0x0410 && cp <= 0x042F) {
return cp + 0x20;
@@ -27,36 +31,8 @@ uint32_t toLowerCyrillic(const uint32_t cp) { return toLowerCyrillicImpl(cp); }
bool isLatinLetter(const uint32_t cp) { return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z'); }
bool isLatinVowel(uint32_t cp) {
cp = toLowerLatinImpl(cp);
return cp == 'a' || cp == 'e' || cp == 'i' || cp == 'o' || cp == 'u' || cp == 'y';
}
bool isLatinConsonant(const uint32_t cp) { return isLatinLetter(cp) && !isLatinVowel(cp); }
bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); }
bool isCyrillicVowel(uint32_t cp) {
cp = toLowerCyrillicImpl(cp);
switch (cp) {
case 0x0430: // а
case 0x0435: // е
case 0x0451: // ё
case 0x0438: // и
case 0x043E: // о
case 0x0443: // у
case 0x044B: // ы
case 0x044D: // э
case 0x044E: // ю
case 0x044F: // я
return true;
default:
return false;
}
}
bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !isCyrillicVowel(cp); }
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
bool isPunctuation(const uint32_t cp) {

View File

@@ -9,20 +9,11 @@ struct CodepointInfo {
size_t byteOffset;
};
// Minimum number of codepoints required in prefix and suffix for hyphenation.
constexpr size_t MIN_PREFIX_CP = 2;
constexpr size_t MIN_SUFFIX_CP = 2;
uint32_t toLowerLatin(uint32_t cp);
uint32_t toLowerCyrillic(uint32_t cp);
bool isLatinLetter(uint32_t cp);
bool isLatinVowel(uint32_t cp);
bool isLatinConsonant(uint32_t cp);
bool isCyrillicLetter(uint32_t cp);
bool isCyrillicVowel(uint32_t cp);
bool isCyrillicConsonant(uint32_t cp);
bool isAlphabetic(uint32_t cp);
bool isPunctuation(uint32_t cp);

View File

@@ -1,63 +0,0 @@
#pragma once
#include <cstddef>
#include <vector>
template <typename T>
struct HyphenLiteral {
const T* data;
size_t length;
};
template <typename T>
using HyphenLiteralT = HyphenLiteral<T>;
template <typename WordContainer, typename Literal>
bool matchesLiteralAt(const WordContainer& word, const size_t start, const Literal& literal) {
if (!literal.data || literal.length == 0) {
return false;
}
if (start + literal.length > word.size()) {
return false;
}
for (size_t i = 0; i < literal.length; ++i) {
if (word[start + i] != literal.data[i]) {
return false;
}
}
return true;
}
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, std::vector<size_t>& indexes) {
const size_t length = lowerWord.size();
const auto tryPush = [&](const size_t breakIndex) {
if (!breakAllowed(breakIndex)) {
return;
}
indexes.push_back(breakIndex);
};
for (const auto& literal : prefixes) {
if (literal.length == 0 || literal.length >= length) {
continue;
}
if (!matchesLiteralAt(lowerWord, 0, literal)) {
continue;
}
tryPush(literal.length);
}
for (const auto& literal : suffixes) {
if (literal.length == 0 || literal.length >= length) {
continue;
}
const size_t breakIndex = length - literal.length;
if (!matchesLiteralAt(lowerWord, breakIndex, literal)) {
continue;
}
tryPush(breakIndex);
}
}

View File

@@ -6,6 +6,7 @@
#include <vector>
#include "EnglishHyphenator.h"
#include "GermanHyphenator.h"
#include "HyphenationCommon.h" #include "HyphenationCommon.h"
#include "LanguageHyphenator.h" #include "LanguageHyphenator.h"
#include "RussianHyphenator.h" #include "RussianHyphenator.h"
@ -27,6 +28,7 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
if (primary.empty()) return nullptr; if (primary.empty()) return nullptr;
if (primary == "en") return &EnglishHyphenator::instance(); if (primary == "en") return &EnglishHyphenator::instance();
if (primary == "de") return &GermanHyphenator::instance();
if (primary == "ru") return &RussianHyphenator::instance(); if (primary == "ru") return &RussianHyphenator::instance();
return nullptr; return nullptr;
} }
@ -78,8 +80,8 @@ void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
} }
// Asks the language hyphenator for legal break positions inside the word. // Asks the language hyphenator for legal break positions inside the word.
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) { std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps, const LanguageHyphenator* hyphenator) {
if (const auto* hyphenator = cachedHyphenator()) { if (hyphenator) {
return hyphenator->breakIndexes(cps); return hyphenator->breakIndexes(cps);
} }
return {}; return {};
@ -140,7 +142,10 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
auto cps = collectCodepoints(word); auto cps = collectCodepoints(word);
trimSurroundingPunctuation(cps); trimSurroundingPunctuation(cps);
trimTrailingFootnoteReference(cps); trimTrailingFootnoteReference(cps);
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { const auto* hyphenator = cachedHyphenator();
const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LanguageHyphenator::kDefaultMinPrefix;
const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LanguageHyphenator::kDefaultMinSuffix;
if (cps.size() < minPrefix + minSuffix) {
return {};
}
@@ -151,11 +156,11 @@
}
// Ask language hyphenator for legal break points.
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps, hyphenator) : std::vector<size_t>();
// Only add fallback breaks if needed and deduplicate if both language and fallback breaks exist.
if (includeFallback) {
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
for (size_t idx = minPrefix; idx + minSuffix <= cps.size(); ++idx) {
indexes.push_back(idx);
}
// Only deduplicate if we have both language-specific and fallback breaks.
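// Example of the fallback sweep above with the default 2/2 minima: a six-codepoint word yields
// candidate break indexes 2, 3 and 4, which are then merged (and deduplicated) with any
// language-specific break positions.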

View File

@@ -1,11 +1,17 @@
#pragma once
#include <cstddef>
#include <vector>
#include "HyphenationCommon.h"
class LanguageHyphenator {
public:
static constexpr size_t kDefaultMinPrefix = 2;
static constexpr size_t kDefaultMinSuffix = 2;
virtual ~LanguageHyphenator() = default;
virtual std::vector<size_t> breakIndexes(const std::vector<CodepointInfo>& cps) const = 0;
virtual size_t minPrefix() const { return kDefaultMinPrefix; }
virtual size_t minSuffix() const { return kDefaultMinSuffix; }
};

View File

@@ -0,0 +1,360 @@
#include "LiangHyphenation.h"
#include <algorithm>
#include <limits>
#include <vector>
namespace {
// Holds the dotted, lower-case representation used by Liang along with the original character order
// so we can traverse via Unicode scalars instead of raw UTF-8 bytes.
struct AugmentedWord {
std::vector<uint32_t> chars;
bool empty() const { return chars.empty(); }
size_t charCount() const { return chars.size(); }
};
// Adds a single character to the augmented word.
void appendCharToAugmentedWord(uint32_t cp, AugmentedWord& word) { word.chars.push_back(cp); }
// Produces the dotted ('.' + lowercase word + '.') codepoint sequence required by Liang. Classic TeX
// hyphenation logic prepends/appends '.' sentinels so that patterns like ".ab" may anchor to word
// boundaries. If any character in the candidate word fails the `isLetter` predicate we abort early
// and return an empty structure, signaling the caller to skip hyphenation entirely.
AugmentedWord buildAugmentedWord(const std::vector<CodepointInfo>& cps, const LiangWordConfig& config) {
AugmentedWord word;
if (cps.empty()) {
return word;
}
word.chars.reserve(cps.size() + 2);
appendCharToAugmentedWord('.', word);
for (const auto& info : cps) {
if (!config.isLetter(info.value)) {
word.chars.clear();
return word;
}
appendCharToAugmentedWord(config.toLower(info.value), word);
}
appendCharToAugmentedWord('.', word);
return word;
}
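// Example (illustrative, not tied to any shipped pattern set): for the input word "Cat" with the
// Latin helpers, buildAugmentedWord() yields the codepoints {'.', 'c', 'a', 't', '.'} — the word is
// lowercased and wrapped in sentinel dots, while a word containing, say, a digit comes back empty.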
// Compact header that prefixes every serialized trie blob and lets us locate
// the individual sections without storing pointers in flash.
struct SerializedTrieHeader {
uint32_t letterCount;
uint32_t nodeCount;
uint32_t edgeCount;
uint32_t valueBytes;
};
constexpr size_t kNodeRecordSize = 7;
constexpr uint32_t kNoValueOffset = 0x00FFFFFFu;
// Lightweight view over the packed blob emitted by the generator script.
struct SerializedTrieView {
const uint32_t* letters = nullptr;
const uint8_t* nodes = nullptr;
const uint8_t* edgeChildren = nullptr;
const uint8_t* edgeLetters = nullptr;
const uint8_t* values = nullptr;
uint32_t letterCount = 0;
uint32_t nodeCount = 0;
uint32_t edgeCount = 0;
uint32_t valueBytes = 0;
size_t edgeLetterBytes = 0;
static constexpr size_t kInvalidNodeIndex = std::numeric_limits<size_t>::max();
static constexpr uint32_t kInvalidLetterIndex = std::numeric_limits<uint32_t>::max();
};
// Splits the raw byte array into typed slices. We purposely keep this logic
// very defensive: any malformed blob results in an empty view so the caller can
// bail out quietly.
SerializedTrieView parseSerializedTrie(const SerializedHyphenationPatterns& patterns) {
SerializedTrieView view;
if (!patterns.data || patterns.size < sizeof(SerializedTrieHeader)) {
return view;
}
const auto* header = reinterpret_cast<const SerializedTrieHeader*>(patterns.data);
const uint8_t* cursor = patterns.data + sizeof(SerializedTrieHeader);
const uint8_t* end = patterns.data + patterns.size;
const auto requireBytes = [&](size_t bytes) {
return bytes <= static_cast<size_t>(end - cursor);
};
const size_t lettersBytes = static_cast<size_t>(header->letterCount) * sizeof(uint32_t);
if (!requireBytes(lettersBytes)) {
return SerializedTrieView{};
}
view.letters = reinterpret_cast<const uint32_t*>(cursor);
cursor += lettersBytes;
const size_t nodesBytes = static_cast<size_t>(header->nodeCount) * kNodeRecordSize;
if (!requireBytes(nodesBytes)) {
return SerializedTrieView{};
}
view.nodes = cursor;
cursor += nodesBytes;
const size_t childBytes = static_cast<size_t>(header->edgeCount) * sizeof(uint16_t);
if (!requireBytes(childBytes)) {
return SerializedTrieView{};
}
view.edgeChildren = cursor;
cursor += childBytes;
const size_t letterBits = static_cast<size_t>(header->edgeCount) * 6;
const size_t letterBytes = (letterBits + 7) >> 3;
if (!requireBytes(letterBytes)) {
return SerializedTrieView{};
}
view.edgeLetters = cursor;
view.edgeLetterBytes = letterBytes;
cursor += letterBytes;
if (!requireBytes(header->valueBytes)) {
return SerializedTrieView{};
}
view.values = cursor;
view.valueBytes = header->valueBytes;
view.letterCount = header->letterCount;
view.nodeCount = header->nodeCount;
view.edgeCount = header->edgeCount;
return view;
}
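// A minimal sketch (not part of this commit) of how the section sizes add up, mirroring the slices
// taken by parseSerializedTrie() above; the helper name is hypothetical and exists only to document
// the layout: header, sorted letter table, 7-byte node records, 16-bit child indexes, the 6-bit
// letter bitstream, and finally the nibble-packed value digits.
constexpr size_t expectedBlobSize(const SerializedTrieHeader& h) {
  return sizeof(SerializedTrieHeader) +
         static_cast<size_t>(h.letterCount) * sizeof(uint32_t) +  // sorted codepoint table
         static_cast<size_t>(h.nodeCount) * kNodeRecordSize +     // node records
         static_cast<size_t>(h.edgeCount) * sizeof(uint16_t) +    // edge child indexes
         ((static_cast<size_t>(h.edgeCount) * 6 + 7) >> 3) +      // 6-bit edge letter bitstream
         h.valueBytes;                                            // packed pattern digits
}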
// The serialized blobs live in PROGMEM, so parsing them repeatedly is cheap but
// wasteful. Keep a tiny cache indexed by the descriptor address so every
// language builds its view only once.
const SerializedTrieView& getSerializedTrie(const SerializedHyphenationPatterns& patterns) {
struct CacheEntry {
const SerializedHyphenationPatterns* key;
SerializedTrieView view;
};
static std::vector<CacheEntry> cache;
for (const auto& entry : cache) {
if (entry.key == &patterns) {
return entry.view;
}
}
cache.push_back({&patterns, parseSerializedTrie(patterns)});
return cache.back().view;
}
uint16_t readUint16LE(const uint8_t* ptr) {
return static_cast<uint16_t>(ptr[0]) | static_cast<uint16_t>(static_cast<uint16_t>(ptr[1]) << 8);
}
uint32_t readUint24LE(const uint8_t* ptr) {
return static_cast<uint32_t>(ptr[0]) | (static_cast<uint32_t>(ptr[1]) << 8) |
(static_cast<uint32_t>(ptr[2]) << 16);
}
// Edges store child indexes and letter indexes in separate, compact arrays. We
// read the child from the 16-bit table and decode the 6-bit letter from the
// bitstream, where entries are packed back to back at 6 bits apiece and may straddle byte boundaries.
uint8_t readEdgeLetterIndex(const SerializedTrieView& trie, const size_t edgeIndex) {
if (!trie.edgeLetters) {
return 0xFFu;
}
const size_t bitOffset = edgeIndex * 6;
const size_t byteOffset = bitOffset >> 3;
if (byteOffset >= trie.edgeLetterBytes) {
return 0xFFu;
}
const uint8_t bitShift = static_cast<uint8_t>(bitOffset & 0x07u);
uint32_t chunk = trie.edgeLetters[byteOffset];
if (byteOffset + 1 < trie.edgeLetterBytes) {
chunk |= static_cast<uint32_t>(trie.edgeLetters[byteOffset + 1]) << 8;
}
const uint8_t value = static_cast<uint8_t>((chunk >> bitShift) & 0x3Fu);
return value;
}
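// Worked example of the arithmetic above: edge index 3 occupies bits 18..23 of the stream, so
// byteOffset = 18 >> 3 = 2 and bitShift = 18 & 7 = 2; the 6-bit letter index is read from bits 2..7
// of edgeLetters[2], and an entry that straddled a byte boundary would OR in the next byte as well.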
// Materialized view of a node record so call sites do not repeatedly poke into
// the byte array.
struct NodeFields {
uint16_t firstEdge;
uint8_t childCount;
uint32_t valueOffset;
uint8_t valueLength;
};
NodeFields loadNode(const SerializedTrieView& trie, const size_t nodeIndex) {
NodeFields fields{0, 0, kNoValueOffset, 0};
if (!trie.nodes || nodeIndex >= trie.nodeCount) {
return fields;
}
const uint8_t* entry = trie.nodes + nodeIndex * kNodeRecordSize;
fields.firstEdge = readUint16LE(entry);
fields.childCount = entry[2];
fields.valueOffset = readUint24LE(entry + 3);
fields.valueLength = entry[6];
return fields;
}
// Letter indexes are stored sorted, so a lower_bound gives us O(log n) lookups
// without building auxiliary maps.
uint32_t letterIndexForCodepoint(const SerializedTrieView& trie, const uint32_t cp) {
if (!trie.letters || trie.letterCount == 0) {
return SerializedTrieView::kInvalidLetterIndex;
}
const uint32_t* begin = trie.letters;
const uint32_t* end = begin + trie.letterCount;
const auto it = std::lower_bound(begin, end, cp);
if (it == end || *it != cp) {
return SerializedTrieView::kInvalidLetterIndex;
}
return static_cast<uint32_t>(it - begin);
}
// Walks the child edge slice described by the node record using binary search
// on the inlined letter indexes. Returns kInvalidNodeIndex when the path ends.
size_t findChild(const SerializedTrieView& trie, const size_t nodeIndex, const uint32_t letter) {
const uint32_t letterIndex = letterIndexForCodepoint(trie, letter);
if (letterIndex == SerializedTrieView::kInvalidLetterIndex) {
return SerializedTrieView::kInvalidNodeIndex;
}
if (!trie.edgeChildren || !trie.edgeLetters) {
return SerializedTrieView::kInvalidNodeIndex;
}
const NodeFields node = loadNode(trie, nodeIndex);
size_t low = 0;
size_t high = node.childCount;
while (low < high) {
const size_t mid = low + ((high - low) >> 1);
const size_t edgeIndex = static_cast<size_t>(node.firstEdge) + mid;
if (edgeIndex >= trie.edgeCount) {
return SerializedTrieView::kInvalidNodeIndex;
}
const uint32_t entryLetterIndex = readEdgeLetterIndex(trie, edgeIndex);
if (entryLetterIndex == letterIndex) {
const uint8_t* childPtr = trie.edgeChildren + edgeIndex * sizeof(uint16_t);
return readUint16LE(childPtr);
}
if (entryLetterIndex < letterIndex) {
low = mid + 1;
} else {
high = mid;
}
}
return SerializedTrieView::kInvalidNodeIndex;
}
// Merges the pattern's numeric priorities into the global score array (max per slot).
void applyPatternValues(const SerializedTrieView& trie, const NodeFields& node,
const size_t startCharIndex, std::vector<uint8_t>& scores) {
if (node.valueLength == 0 || node.valueOffset == kNoValueOffset || !trie.values ||
node.valueOffset >= trie.valueBytes) {
return;
}
const size_t availableBytes = trie.valueBytes - node.valueOffset;
const size_t packedBytesNeeded = (static_cast<size_t>(node.valueLength) + 1) >> 1;
const size_t packedBytes = std::min<size_t>(packedBytesNeeded, availableBytes);
const uint8_t* packedValues = trie.values + node.valueOffset;
// Value digits remain nibble-encoded (two per byte) to keep flash usage low;
// expand back to single scores just before applying them.
for (size_t valueIdx = 0; valueIdx < node.valueLength; ++valueIdx) {
const size_t packedIndex = valueIdx >> 1;
if (packedIndex >= packedBytes) {
break;
}
const uint8_t packedByte = packedValues[packedIndex];
const uint8_t value = (valueIdx & 1u) ? static_cast<uint8_t>((packedByte >> 4) & 0x0Fu)
: static_cast<uint8_t>(packedByte & 0x0Fu);
const size_t scoreIdx = startCharIndex + valueIdx;
if (scoreIdx >= scores.size()) {
break;
}
scores[scoreIdx] = std::max(scores[scoreIdx], value);
}
}
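// Worked example of the nibble decoding above (values are illustrative): a node with valueLength 3
// and packed bytes {0x21, 0x03} expands to the digits 1, 2, 3 — byte 0 holds digit 0 in its low
// nibble and digit 1 in its high nibble, byte 1 holds digit 2 in its low nibble — and each digit is
// max-merged into scores[startCharIndex + valueIdx].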
// Converts odd score positions back into codepoint indexes, honoring min prefix/suffix constraints.
// By iterating over codepoint indexes rather than raw byte offsets we naturally support UTF-8 input
// without bookkeeping gymnastics. Each break corresponds to scores[breakIndex + 1] because of the
// leading '.' sentinel emitted in buildAugmentedWord().
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps, const std::vector<uint8_t>& scores,
const size_t minPrefix, const size_t minSuffix) {
std::vector<size_t> indexes;
const size_t cpCount = cps.size();
if (cpCount < 2) {
return indexes;
}
for (size_t breakIndex = 1; breakIndex < cpCount; ++breakIndex) {
if (breakIndex < minPrefix) {
continue;
}
const size_t suffixCount = cpCount - breakIndex;
if (suffixCount < minSuffix) {
continue;
}
const size_t scoreIdx = breakIndex + 1; // Account for leading '.' sentinel.
if (scoreIdx >= scores.size()) {
break;
}
if ((scores[scoreIdx] & 1u) == 0) {
continue;
}
indexes.push_back(breakIndex);
}
return indexes;
}
} // namespace
std::vector<size_t> liangBreakIndexes(const std::vector<CodepointInfo>& cps,
const SerializedHyphenationPatterns& patterns,
const LiangWordConfig& config) {
// Step 1: convert the input word into the dotted, lowercased codepoint stream the Liang algorithm expects. A return
// value of {} means the word contained something outside the language's alphabet and should be left
// untouched by hyphenation.
const auto augmented = buildAugmentedWord(cps, config);
if (augmented.empty()) {
return {};
}
// Step 2: run the augmented word through the trie-backed pattern table so we reuse common prefixes
// instead of rescanning the UTF-8 bytes for every substring.
const SerializedTrieView& trie = getSerializedTrie(patterns);
if (!trie.nodes || trie.nodeCount == 0) {
return {};
}
const size_t charCount = augmented.charCount();
std::vector<uint8_t> scores(charCount + 1, 0);
for (size_t charStart = 0; charStart < charCount; ++charStart) {
size_t currentNode = 0; // Root node.
for (size_t cursor = charStart; cursor < charCount; ++cursor) {
const uint32_t letter = augmented.chars[cursor];
currentNode = findChild(trie, currentNode, letter);
if (currentNode == SerializedTrieView::kInvalidNodeIndex) {
break;
}
const NodeFields node = loadNode(trie, currentNode);
if (node.valueLength > 0 && node.valueOffset != kNoValueOffset) {
applyPatternValues(trie, node, charStart, scores);
}
}
}
// Step 3: translate odd-numbered score positions back into codepoint indexes, enforcing per-language
// prefix/suffix minima so we do not produce visually awkward fragments.
return collectBreakIndexes(cps, scores, config.minPrefix, config.minSuffix);
}
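// End-to-end example with hypothetical patterns (not taken from any of the generated tables):
// suppose the trie holds "i1a" and "a2n" and the word is "liana". The augmented stream is
// {'.', 'l', 'i', 'a', 'n', 'a', '.'}; matching "i1a" at the 'i' writes 1 into the slot before the
// first 'a', and "a2n" writes 2 into the slot before 'n'. Only odd scores permit a break, so with
// the default 2/2 minima the word may split as "li-ana" while "lia-na" stays suppressed.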

View File

@@ -0,0 +1,39 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <vector>
#include "HyphenationCommon.h"
#include "SerializedHyphenationTrie.h"
// Encapsulates every language-specific dial the Liang algorithm needs at runtime. The helpers are
// intentionally represented as bare function pointers because we invoke them inside tight loops and
// want to avoid the overhead of std::function or functors. The minima default to the TeX-recommended
// "2/2" split but individual languages (English, for example) can override them.
struct LiangWordConfig {
static constexpr size_t kDefaultMinPrefix = 2;
static constexpr size_t kDefaultMinSuffix = 2;
// Predicate used to reject non-alphabetic characters before pattern lookup. Returning false causes
// the entire word to be skipped, matching the behavior of classic TeX hyphenation tables.
bool (*isLetter)(uint32_t);
// Language-specific case folding that matches how the TeX patterns were authored (usually lower-case
// ASCII for Latin and lowercase Cyrillic for Russian). Patterns are stored in UTF-8, so this must
// operate on Unicode scalars rather than bytes.
uint32_t (*toLower)(uint32_t);
// Minimum codepoints required on the left/right of any break. These correspond to TeX's
// lefthyphenmin and righthyphenmin knobs.
size_t minPrefix;
size_t minSuffix;
// Lightweight convenience constructor so call sites can declare `const LiangWordConfig config(...)`
// without verbose member assignment boilerplate.
LiangWordConfig(bool (*letterFn)(uint32_t), uint32_t (*lowerFn)(uint32_t),
size_t prefix = kDefaultMinPrefix, size_t suffix = kDefaultMinSuffix)
: isLetter(letterFn), toLower(lowerFn), minPrefix(prefix), minSuffix(suffix) {}
};
// Shared Liang pattern evaluator used by every language-specific hyphenator.
std::vector<size_t> liangBreakIndexes(const std::vector<CodepointInfo>& cps,
const SerializedHyphenationPatterns& patterns,
const LiangWordConfig& config);

View File

@@ -1,404 +1,9 @@
#include "RussianHyphenator.h"
#include <algorithm>
#include <array>
#include <limits>
#include <vector>
#include "HyphenationLiterals.h"
#include "LiangHyphenation.h"
#include "generated/hyph-ru-ru.trie.h"
namespace {
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
constexpr uint32_t PFX_BEZ[3] = {0x0431, 0x0435, 0x0437};
constexpr uint32_t PFX_RAZ[3] = {0x0440, 0x0430, 0x0437};
constexpr uint32_t PFX_POD[3] = {0x043F, 0x043E, 0x0434};
constexpr uint32_t PFX_NAD[3] = {0x043D, 0x0430, 0x0434};
constexpr uint32_t PFX_PERE[4] = {0x043F, 0x0435, 0x0440, 0x0435};
constexpr uint32_t PFX_SVERH[5] = {0x0441, 0x0432, 0x0435, 0x0440, 0x0445};
constexpr uint32_t PFX_MEZH[3] = {0x043C, 0x0435, 0x0436};
constexpr uint32_t PFX_SUPER[5] = {0x0441, 0x0443, 0x043F, 0x0435, 0x0440};
constexpr uint32_t PFX_PRED[4] = {0x043F, 0x0440, 0x0435, 0x0434};
constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
{PFX_RAZ, 3},
{PFX_POD, 3},
{PFX_NAD, 3},
{PFX_PERE, 4},
{PFX_SVERH, 5},
{PFX_MEZH, 3},
{PFX_SUPER, 5},
{PFX_PRED, 4},
{PFX_SAMO, 4},
{PFX_OBO, 3},
{PFX_PROTIV, 6}}};
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
constexpr uint32_t SFX_ENIE[4] = {0x0435, 0x043D, 0x0438, 0x0435};
constexpr uint32_t SFX_ATION[4] = {0x0430, 0x0446, 0x0438, 0x044F};
constexpr uint32_t SFX_CHIK[3] = {0x0447, 0x0438, 0x043A};
constexpr uint32_t SFX_NIK[3] = {0x043D, 0x0438, 0x043A};
constexpr uint32_t SFX_TEL[4] = {0x0442, 0x0435, 0x043B, 0x044C};
constexpr uint32_t SFX_SKII[4] = {0x0441, 0x043A, 0x0438, 0x0439};
constexpr uint32_t SFX_AL[6] = {0x0430, 0x043B, 0x044C, 0x043D, 0x044B, 0x0439};
constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
{SFX_STVO, 4},
{SFX_ENIE, 4},
{SFX_ATION, 4},
{SFX_CHIK, 3},
{SFX_NIK, 3},
{SFX_TEL, 4},
{SFX_SKII, 4},
{SFX_AL, 6},
{SFX_ISM, 3},
{SFX_LIV, 5},
{SFX_OST, 4}}};
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
std::vector<uint32_t> lower;
lower.reserve(cps.size());
for (const auto& info : cps) {
lower.push_back(isCyrillicLetter(info.value) ? toLowerCyrillic(info.value) : info.value);
}
return lower;
}
bool russianSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
if (start >= cps.size()) {
return false;
}
const size_t clampedEnd = std::min(end, cps.size());
for (size_t i = start; i < clampedEnd; ++i) {
if (isCyrillicVowel(cps[i].value)) {
return true;
}
}
return false;
}
bool exposesLeadingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
if (index + 1 >= cps.size()) {
return false;
}
const auto first = cps[index].value;
const auto second = cps[index + 1].value;
if (!isCyrillicConsonant(first) || !isCyrillicConsonant(second)) {
return false;
}
if (toLowerCyrillic(first) != toLowerCyrillic(second)) {
return false;
}
const bool hasLeftVowel = index > 0 && isCyrillicVowel(cps[index - 1].value);
const bool hasRightVowel = (index + 2 < cps.size()) && isCyrillicVowel(cps[index + 2].value);
return hasLeftVowel && hasRightVowel;
}
bool exposesTrailingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
if (index < 2) {
return false;
}
const auto last = cps[index - 1].value;
const auto prev = cps[index - 2].value;
if (!isCyrillicConsonant(last) || !isCyrillicConsonant(prev)) {
return false;
}
if (toLowerCyrillic(last) != toLowerCyrillic(prev)) {
return false;
}
const bool hasLeftVowel = (index >= 3) && isCyrillicVowel(cps[index - 3].value);
const bool hasRightVowel = (index < cps.size()) && isCyrillicVowel(cps[index].value);
return hasLeftVowel && hasRightVowel;
}
bool violatesDoubleConsonantRule(const std::vector<CodepointInfo>& cps, const size_t index) {
return exposesLeadingDoubleConsonant(cps, index) || exposesTrailingDoubleConsonant(cps, index);
}
// Checks if the codepoint is the Cyrillic soft sign (ь).
bool isSoftSign(uint32_t cp) { return toLowerCyrillic(cp) == 0x044C; }
// Checks if the codepoint is the Cyrillic hard sign (ъ).
bool isHardSign(uint32_t cp) { return toLowerCyrillic(cp) == 0x044A; }
// Checks if the codepoint is either the Cyrillic soft sign (ь) or hard sign (ъ).
bool isSoftOrHardSign(uint32_t cp) { return isSoftSign(cp) || isHardSign(cp); }
// Checks if the codepoint is the Cyrillic short i (й).
bool isCyrillicShortI(uint32_t cp) { return toLowerCyrillic(cp) == 0x0439; }
// Checks if the codepoint is the Cyrillic yeru (ы).
bool isCyrillicYeru(uint32_t cp) { return toLowerCyrillic(cp) == 0x044B; }
// Checks if the codepoint is a Russian prefix consonant that can start certain clusters.
bool isRussianPrefixConsonant(uint32_t cp) {
cp = toLowerCyrillic(cp);
return cp == 0x0432 || cp == 0x0437 || cp == 0x0441; // в, з, с
}
// Checks if the codepoint is a Russian sibilant consonant.
bool isRussianSibilant(uint32_t cp) {
cp = toLowerCyrillic(cp);
switch (cp) {
case 0x0437: // з
case 0x0441: // с
case 0x0436: // ж
case 0x0448: // ш
case 0x0449: // щ
case 0x0447: // ч
case 0x0446: // ц
return true;
default:
return false;
}
}
// Checks if the codepoint is a Russian stop consonant.
bool isRussianStop(uint32_t cp) {
cp = toLowerCyrillic(cp);
switch (cp) {
case 0x0431: // б
case 0x0433: // г
case 0x0434: // д
case 0x043F: // п
case 0x0442: // т
case 0x043A: // к
return true;
default:
return false;
}
}
// Checks the sonority rank of a Russian consonant for syllable onset validation.
int russianSonority(uint32_t cp) {
cp = toLowerCyrillic(cp);
switch (cp) {
case 0x043B: // л
case 0x0440: // р
case 0x0439: // й
return 4;
case 0x043C: // м
case 0x043D: // н
return 3;
case 0x0432: // в
case 0x0437: // з
case 0x0436: // ж
return 2;
case 0x0444: // ф
case 0x0441: // с
case 0x0448: // ш
case 0x0449: // щ
case 0x0447: // ч
case 0x0446: // ц
case 0x0445: // х
return 1;
case 0x0431: // б
case 0x0433: // г
case 0x0434: // д
case 0x043F: // п
case 0x0442: // т
case 0x043A: // к
return 0;
default:
return 1;
}
}
// Applies Russian sonority sequencing to ensure the consonant cluster can start a syllable.
bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
if (start >= end) {
return false;
}
for (size_t i = start; i < end; ++i) {
const auto cp = cps[i].value;
if (!isCyrillicConsonant(cp) || isSoftOrHardSign(cp)) {
return false;
}
}
if (end - start == 1) {
return true;
}
for (size_t i = start; i + 1 < end; ++i) {
const uint32_t current = cps[i].value;
const uint32_t next = cps[i + 1].value;
const int currentRank = russianSonority(current);
const int nextRank = russianSonority(next);
if (currentRank > nextRank) {
const bool atClusterStart = (i == start);
const bool prefixAllowance = atClusterStart && isRussianPrefixConsonant(current);
const bool sibilantAllowance = isRussianSibilant(current) && isRussianStop(next);
if (!prefixAllowance && !sibilantAllowance) {
return false;
}
}
}
return true;
}
// Identifies splits within double consonant clusters.
size_t doubleConsonantSplit(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
for (size_t i = clusterStart; i + 1 < clusterEnd; ++i) {
const auto left = cps[i].value;
const auto right = cps[i + 1].value;
if (isCyrillicConsonant(left) && toLowerCyrillic(left) == toLowerCyrillic(right) && !isSoftOrHardSign(right)) {
return i + 1;
}
}
return std::numeric_limits<size_t>::max();
}
// Prevents breaks that would create forbidden suffixes.
bool beginsWithForbiddenSuffix(const std::vector<CodepointInfo>& cps, const size_t index) {
if (index >= cps.size()) {
return true;
}
const auto cp = cps[index].value;
return isSoftOrHardSign(cp) || isCyrillicShortI(cp) || isCyrillicYeru(cp);
}
// Validates whether a hyphenation break is allowed at the specified index.
bool russianBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t breakIndex) {
if (breakIndex == 0 || breakIndex >= cps.size()) {
return false;
}
const size_t prefixLen = breakIndex;
const size_t suffixLen = cps.size() - breakIndex;
if (prefixLen < 2 || suffixLen < 2) {
return false;
}
if (!russianSegmentHasVowel(cps, 0, breakIndex) || !russianSegmentHasVowel(cps, breakIndex, cps.size())) {
return false;
}
if (beginsWithForbiddenSuffix(cps, breakIndex)) {
return false;
}
if (violatesDoubleConsonantRule(cps, breakIndex)) {
return false;
}
return true;
}
// Chooses the longest valid onset contained within the inter-vowel cluster.
size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
const size_t clusterLen = clusterEnd - clusterStart;
if (clusterLen == 0) {
return 0;
}
const size_t maxLen = std::min<size_t>(4, clusterLen);
for (size_t len = maxLen; len >= 1; --len) {
const size_t suffixStart = clusterEnd - len;
if (russianClusterIsValidOnset(cps, suffixStart, clusterEnd)) {
return len;
}
}
return 1;
}
// Prevents hyphenation splits immediately beside ь/ъ characters.
bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
if (index == 0 || index >= cps.size()) {
return false;
}
const auto left = cps[index - 1].value;
const auto right = cps[index].value;
return isSoftOrHardSign(left) || isSoftOrHardSign(right);
}
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
std::vector<size_t>& indexes) {
appendLiteralBreaks(
lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
}
// Produces syllable break indexes tailored to Russian phonotactics.
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
std::vector<size_t> indexes;
const size_t wordSize = cps.size();
// Collect vowel positions.
std::vector<size_t> vowelPositions;
vowelPositions.reserve(wordSize / 2); // Typical estimate: ~50% vowels
for (size_t i = 0; i < wordSize; ++i) {
if (isCyrillicVowel(cps[i].value)) {
vowelPositions.push_back(i);
}
}
// Need at least 2 vowels to create a syllable break.
if (vowelPositions.size() < 2) {
return indexes;
}
// Process inter-vowel clusters for hyphenation points.
for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
const size_t leftVowel = vowelPositions[v];
const size_t rightVowel = vowelPositions[v + 1];
const size_t suffixLen = wordSize - rightVowel;
// Adjacent vowels: can break between them if constraints allow.
if (rightVowel - leftVowel == 1) {
if (rightVowel >= MIN_PREFIX_CP && suffixLen >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
russianBreakAllowed(cps, rightVowel)) {
indexes.push_back(rightVowel);
}
continue;
}
// Consonant cluster between vowels: find optimal break point.
const size_t clusterStart = leftVowel + 1;
const size_t clusterEnd = rightVowel;
// Try double consonant split first (preferred).
size_t breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd);
// Fall back to onset-based split.
if (breakIndex == std::numeric_limits<size_t>::max()) {
const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
breakIndex = clusterEnd - onsetLen;
}
// Validate candidate break point.
if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) ||
!russianBreakAllowed(cps, breakIndex)) {
continue;
}
indexes.push_back(breakIndex);
}
const auto lowerWord = lowercaseCyrillicWord(cps);
const size_t preDedupeCount = indexes.size();
appendMorphologyBreaks(cps, lowerWord, indexes);
if (indexes.size() > preDedupeCount) {
std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
}
return indexes;
}
} // namespace
const RussianHyphenator& RussianHyphenator::instance() {
static RussianHyphenator instance;
@@ -406,5 +11,9 @@ const RussianHyphenator& RussianHyphenator::instance() {
}
std::vector<size_t> RussianHyphenator::breakIndexes(const std::vector<CodepointInfo>& cps) const {
return russianBreakIndexes(cps);
// Russian uses the same Liang runtime but needs Cyrillic-aware helpers plus symmetrical
// lefthyphenmin/righthyphenmin values. Most Russian TeX distributions stick with 2/2, which keeps
// short words readable while still allowing frequent hyphenation opportunities.
const LiangWordConfig config(isCyrillicLetter, toLowerCyrillic, minPrefix(), minSuffix());
return liangBreakIndexes(cps, ru_ru_patterns, config);
}

View File

@@ -8,6 +8,8 @@ class RussianHyphenator final : public LanguageHyphenator {
static const RussianHyphenator& instance();
std::vector<size_t> breakIndexes(const std::vector<CodepointInfo>& cps) const override;
size_t minPrefix() const override { return 2; }
size_t minSuffix() const override { return 2; }
private:
RussianHyphenator() = default;

View File

@@ -0,0 +1,10 @@
#pragma once
#include <cstddef>
#include <cstdint>
// Lightweight descriptor that points at a serialized Liang hyphenation trie stored in flash.
struct SerializedHyphenationPatterns {
const std::uint8_t* data;
size_t size;
};
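// Sketch (not part of this commit) of what each generated *.trie.h is expected to expose. The
// generated files themselves are suppressed in this diff; apart from the descriptor names the
// hyphenators reference (en_us_patterns, de_patterns, ru_ru_patterns), the symbols below are
// hypothetical:
//
//   extern const std::uint8_t kEnUsTrieBlob[];   // packed trie bytes kept in flash
//   extern const std::size_t kEnUsTrieBlobSize;
//   inline const SerializedHyphenationPatterns en_us_patterns{kEnUsTrieBlob, kEnUsTrieBlobSize};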

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,7 +1,9 @@
[platformio]
crosspoint_version = 0.12.0
default_envs = default
[crosspoint]
crosspoint_version = 0.12.0
[base]
platform = espressif32 @ 6.12.0
board = esp32-c3-devkitm-1
@@ -50,10 +52,10 @@ lib_deps =
extends = base
build_flags =
${base.build_flags}
-DCROSSPOINT_VERSION=\"${platformio.crosspoint_version}-dev\"
-DCROSSPOINT_VERSION=\"${crosspoint.crosspoint_version}-dev\"
[env:gh_release]
extends = base
build_flags =
${base.build_flags}
-DCROSSPOINT_VERSION=\"${platformio.crosspoint_version}\"
-DCROSSPOINT_VERSION=\"${crosspoint.crosspoint_version}\"