mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-05 15:17:37 +03:00
## Summary

* Adds optional hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to the flash usage (German alone takes 200 KB).
* The trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation.

Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
180 lines
4.4 KiB
C++
180 lines
4.4 KiB
C++
#include "HyphenationCommon.h"
|
||
|
||
#include <Utf8.h>
|
||
|
||
namespace {
|
||
|
||
// Lowercase a Latin code point. Handles ASCII A-Z, the Latin-1 Supplement
// capitals, and three capitals outside Latin-1 (Œ, Ÿ, ẞ). Any other code point
// is returned unchanged.
uint32_t toLowerLatinImpl(const uint32_t cp) {
  // Capitals whose lowercase form is not at a fixed offset from the capital.
  if (cp == 0x0152) return 0x0153;  // Œ -> œ
  if (cp == 0x0178) return 0x00FF;  // Ÿ -> ÿ
  if (cp == 0x1E9E) return 0x00DF;  // ẞ -> ß

  const bool asciiUpper = cp >= 'A' && cp <= 'Z';
  // U+00C0..U+00DE, skipping the multiplication sign U+00D7.
  const bool latin1Upper = cp >= 0x00C0 && cp <= 0x00DE && cp != 0x00D7;

  // In both ranges the lowercase letter sits 0x20 above its capital.
  return (asciiUpper || latin1Upper) ? cp + 0x20 : cp;
}
|
||
|
||
// Convert Cyrillic uppercase letters to lowercase.
// U+0410..U+042F (А..Я) map to lowercase by adding 0x20.
// U+0400..U+040F (Ѐ, Ё, Ђ, Є, І, Ї, Ў, Џ, ...) map to U+0450..U+045F by adding
// 0x50; this covers Cyrillic capital IO (U+0401 -> U+0451) as well as the other
// capitals in that row, which isCyrillicLetter already accepts as letters.
// Every other code point is returned unchanged.
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
  if (cp >= 0x0410 && cp <= 0x042F) {
    return cp + 0x20;
  }
  if (cp >= 0x0400 && cp <= 0x040F) {
    return cp + 0x50;
  }
  return cp;
}
|
||
|
||
} // namespace
|
||
|
||
// Public entry point for Latin lowercasing; forwards to the file-local
// implementation and returns its result unchanged.
uint32_t toLowerLatin(const uint32_t cp) {
  const uint32_t lowered = toLowerLatinImpl(cp);
  return lowered;
}
|
||
|
||
// Public entry point for Cyrillic lowercasing; forwards to the file-local
// implementation and returns its result unchanged.
uint32_t toLowerCyrillic(const uint32_t cp) {
  const uint32_t lowered = toLowerCyrillicImpl(cp);
  return lowered;
}
|
||
|
||
// Report whether cp is a Latin letter: ASCII A-Z / a-z, a Latin-1 Supplement
// letter (U+00C0..U+00FF without the × and ÷ signs), or one of Œ, œ, Ÿ, ẞ.
bool isLatinLetter(const uint32_t cp) {
  const bool asciiUpper = cp >= 'A' && cp <= 'Z';
  const bool asciiLower = cp >= 'a' && cp <= 'z';
  if (asciiUpper || asciiLower) {
    return true;
  }

  // Within U+00C0..U+00FF only U+00D7 (×) and U+00F7 (÷) are not letters.
  if (cp >= 0x00C0 && cp <= 0x00FF) {
    return cp != 0x00D7 && cp != 0x00F7;
  }

  return cp == 0x0152     // Œ
         || cp == 0x0153  // œ
         || cp == 0x0178  // Ÿ
         || cp == 0x1E9E; // ẞ
}
|
||
|
||
// Report whether cp lies in U+0400..U+052F, the range this file treats as
// Cyrillic. No finer distinction is made inside that range.
bool isCyrillicLetter(const uint32_t cp) {
  if (cp < 0x0400) {
    return false;
  }
  return cp <= 0x052F;
}
|
||
|
||
// Report whether cp is a letter by either of this file's tests: Latin first,
// then Cyrillic.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
|
||
|
||
// Report whether cp belongs to the fixed set of punctuation code points that
// trimSurroundingPunctuationAndFootnote strips from the ends of a word.
// The set covers ASCII sentence punctuation, brackets, the slash, typographic
// quotation marks, the ellipsis, and the no-break space.
//
// Quotation marks are listed in matching pairs: both single guillemets
// (‹ and ›) and the low-9 opening quotes (‚ and „) are included alongside
// their closing counterparts, so a quoted word loses its opening mark as well.
bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    case '-':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
    case '"':
    case '\'':
    case ')':
    case '(':
    case 0x00AB:  // «
    case 0x00BB:  // »
    case 0x2018:  // ‘
    case 0x2019:  // ’
    case 0x201A:  // ‚ (single low-9 opening quote)
    case 0x201C:  // “
    case 0x201D:  // ”
    case 0x201E:  // „ (double low-9 opening quote)
    case 0x00A0:  // no-break space
    case '{':
    case '}':
    case '[':
    case ']':
    case '/':
    case 0x2039:  // ‹
    case 0x203A:  // ›
    case 0x2026:  // …
      return true;
    default:
      return false;
  }
}
|
||
|
||
// Report whether cp is one of the ASCII digits '0' through '9'.
bool isAsciiDigit(const uint32_t cp) {
  if (cp < '0') {
    return false;
  }
  return cp <= '9';
}
|
||
|
||
// Report whether cp is one of the hyphen, dash, or minus code points this file
// counts as an explicit hyphen (the soft hyphen U+00AD is among them).
bool isExplicitHyphen(const uint32_t cp) {
  static constexpr uint32_t kHyphenLike[] = {
      '-',     // hyphen-minus
      0x00AD,  // soft hyphen
      0x058A,  // Armenian hyphen
      0x2010,  // hyphen
      0x2011,  // non-breaking hyphen
      0x2012,  // figure dash
      0x2013,  // en dash
      0x2014,  // em dash
      0x2015,  // horizontal bar
      0x2043,  // hyphen bullet
      0x207B,  // superscript minus
      0x208B,  // subscript minus
      0x2212,  // minus sign
      0x2E17,  // double oblique hyphen
      0x2E3A,  // two-em dash
      0x2E3B,  // three-em dash
      0xFE58,  // small em dash
      0xFE63,  // small hyphen-minus
      0xFF0D,  // fullwidth hyphen-minus
  };

  for (const uint32_t candidate : kHyphenLike) {
    if (candidate == cp) {
      return true;
    }
  }
  return false;
}
|
||
|
||
// Report whether cp is the soft hyphen, U+00AD.
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t kSoftHyphen = 0x00AD;
  return kSoftHyphen == cp;
}
|
||
|
||
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
|
||
if (cps.empty()) {
|
||
return;
|
||
}
|
||
|
||
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
|
||
if (cps.size() >= 3) {
|
||
int end = static_cast<int>(cps.size()) - 1;
|
||
while (end >= 0 && isPunctuation(cps[end].value)) {
|
||
--end;
|
||
}
|
||
int pos = end;
|
||
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||
--pos;
|
||
}
|
||
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
|
||
cps.erase(cps.begin() + pos, cps.end());
|
||
}
|
||
}
|
||
}
|
||
|
||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||
cps.erase(cps.begin());
|
||
}
|
||
while (!cps.empty() && isPunctuation(cps.back().value)) {
|
||
cps.pop_back();
|
||
}
|
||
}
|
||
|
||
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||
std::vector<CodepointInfo> cps;
|
||
cps.reserve(word.size());
|
||
|
||
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
|
||
const unsigned char* ptr = base;
|
||
while (*ptr != 0) {
|
||
const unsigned char* current = ptr;
|
||
const uint32_t cp = utf8NextCodepoint(&ptr);
|
||
cps.push_back({cp, static_cast<size_t>(current - base)});
|
||
}
|
||
|
||
return cps;
|
||
}
|