Xteink-X4-crosspoint-reader/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
Arthur Tazhitdinov 8824c87490
feat: dict based Hyphenation (#305)
## Summary

* Adds (optional) Hyphenation for English, French, German, Russian
languages

## Additional Context

* Included hyphenation dictionaries add approximately 280kb to the flash
usage (German alone takes 200kb)
* Trie encoded dictionaries are adopted from hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
2026-01-19 12:56:26 +00:00

180 lines
4.4 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "HyphenationCommon.h"
#include <Utf8.h>
namespace {
// Convert Latin uppercase letters (ASCII plus Latin-1 supplement) to lowercase
uint32_t toLowerLatinImpl(const uint32_t cp) {
if (cp >= 'A' && cp <= 'Z') {
return cp - 'A' + 'a';
}
if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00DE)) {
return cp + 0x20;
}
switch (cp) {
case 0x0152: // Œ
return 0x0153; // œ
case 0x0178: // Ÿ
return 0x00FF; // ÿ
case 0x1E9E: // ẞ
return 0x00DF; // ß
default:
return cp;
}
}
// Convert Cyrillic uppercase letters to lowercase
// Cyrillic uppercase range 0x0410-0x042F maps to lowercase by adding 0x20
// Special case: Cyrillic capital IO (0x0401) maps to lowercase io (0x0451)
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
if (cp >= 0x0410 && cp <= 0x042F) {
return cp + 0x20;
}
if (cp == 0x0401) {
return 0x0451;
}
return cp;
}
} // namespace
uint32_t toLowerLatin(const uint32_t cp) { return toLowerLatinImpl(cp); }
uint32_t toLowerCyrillic(const uint32_t cp) { return toLowerCyrillicImpl(cp); }
bool isLatinLetter(const uint32_t cp) {
if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) {
return true;
}
if (((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00F6) || (cp >= 0x00F8 && cp <= 0x00FF)) &&
cp != 0x00D7 && cp != 0x00F7) {
return true;
}
switch (cp) {
case 0x0152: // Œ
case 0x0153: // œ
case 0x0178: // Ÿ
case 0x1E9E: // ẞ
return true;
default:
return false;
}
}
bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); }
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
bool isPunctuation(const uint32_t cp) {
switch (cp) {
case '-':
case '.':
case ',':
case '!':
case '?':
case ';':
case ':':
case '"':
case '\'':
case ')':
case '(':
case 0x00AB: // «
case 0x00BB: // »
case 0x2018: //
case 0x2019: //
case 0x201C: // “
case 0x201D: // ”
case 0x00A0: // no-break space
case '{':
case '}':
case '[':
case ']':
case '/':
case 0x203A: //
case 0x2026: // …
return true;
default:
return false;
}
}
bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
bool isExplicitHyphen(const uint32_t cp) {
switch (cp) {
case '-':
case 0x00AD: // soft hyphen
case 0x058A: // Armenian hyphen
case 0x2010: // hyphen
case 0x2011: // non-breaking hyphen
case 0x2012: // figure dash
case 0x2013: // en dash
case 0x2014: // em dash
case 0x2015: // horizontal bar
case 0x2043: // hyphen bullet
case 0x207B: // superscript minus
case 0x208B: // subscript minus
case 0x2212: // minus sign
case 0x2E17: // double oblique hyphen
case 0x2E3A: // two-em dash
case 0x2E3B: // three-em dash
case 0xFE58: // small em dash
case 0xFE63: // small hyphen-minus
case 0xFF0D: // fullwidth hyphen-minus
return true;
default:
return false;
}
}
bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; }
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
if (cps.empty()) {
return;
}
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
if (cps.size() >= 3) {
int end = static_cast<int>(cps.size()) - 1;
while (end >= 0 && isPunctuation(cps[end].value)) {
--end;
}
int pos = end;
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
--pos;
}
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
cps.erase(cps.begin() + pos, cps.end());
}
}
}
while (!cps.empty() && isPunctuation(cps.front().value)) {
cps.erase(cps.begin());
}
while (!cps.empty() && isPunctuation(cps.back().value)) {
cps.pop_back();
}
}
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
std::vector<CodepointInfo> cps;
cps.reserve(word.size());
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
const unsigned char* ptr = base;
while (*ptr != 0) {
const unsigned char* current = ptr;
const uint32_t cp = utf8NextCodepoint(&ptr);
cps.push_back({cp, static_cast<size_t>(current - base)});
}
return cps;
}