mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-05 07:07:38 +03:00
## Summary * Adds (optional) Hyphenation for English, French, German, Russian languages ## Additional Context * Included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb) * Trie encoded dictionaries are adopted from hypher project (https://github.com/typst/hypher) * Soft hyphens (and other explicit hyphens) take precedence over dict-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe it makes more sense on our smaller screen. --------- Co-authored-by: Dave Allie <dave@daveallie.com>
98 lines
3.2 KiB
C++
98 lines
3.2 KiB
C++
#include "Hyphenator.h"
|
|
|
|
#include <vector>
|
|
|
|
#include "HyphenationCommon.h"
|
|
#include "LanguageRegistry.h"
|
|
|
|
const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
|
|
|
|
namespace {
|
|
|
|
// Maps a BCP-47 language tag to a language-specific hyphenator.
|
|
const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
|
if (langTag.empty()) return nullptr;
|
|
|
|
// Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en").
|
|
std::string primary;
|
|
primary.reserve(langTag.size());
|
|
for (char c : langTag) {
|
|
if (c == '-' || c == '_') break;
|
|
if (c >= 'A' && c <= 'Z') c = static_cast<char>(c - 'A' + 'a');
|
|
primary.push_back(c);
|
|
}
|
|
if (primary.empty()) return nullptr;
|
|
|
|
return getLanguageHyphenatorForPrimaryTag(primary);
|
|
}
|
|
|
|
// Maps a codepoint index back to its byte offset inside the source word.
|
|
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
|
|
return (index < cps.size()) ? cps[index].byteOffset : (cps.empty() ? 0 : cps.back().byteOffset);
|
|
}
|
|
|
|
// Builds a vector of break information from explicit hyphen markers in the given codepoints.
|
|
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
|
std::vector<Hyphenator::BreakInfo> breaks;
|
|
|
|
// Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
|
|
for (size_t i = 1; i + 1 < cps.size(); ++i) {
|
|
const uint32_t cp = cps[i].value;
|
|
if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
|
continue;
|
|
}
|
|
// Offset points to the next codepoint so rendering starts after the hyphen marker.
|
|
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
|
}
|
|
|
|
return breaks;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
|
|
if (word.empty()) {
|
|
return {};
|
|
}
|
|
|
|
// Convert to codepoints and normalize word boundaries.
|
|
auto cps = collectCodepoints(word);
|
|
trimSurroundingPunctuationAndFootnote(cps);
|
|
const auto* hyphenator = cachedHyphenator_;
|
|
|
|
// Explicit hyphen markers (soft or hard) take precedence over language breaks.
|
|
auto explicitBreakInfos = buildExplicitBreakInfos(cps);
|
|
if (!explicitBreakInfos.empty()) {
|
|
return explicitBreakInfos;
|
|
}
|
|
|
|
// Ask language hyphenator for legal break points.
|
|
std::vector<size_t> indexes;
|
|
if (hyphenator) {
|
|
indexes = hyphenator->breakIndexes(cps);
|
|
}
|
|
|
|
// Only add fallback breaks if needed
|
|
if (includeFallback && indexes.empty()) {
|
|
const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix;
|
|
const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix;
|
|
for (size_t idx = minPrefix; idx + minSuffix <= cps.size(); ++idx) {
|
|
indexes.push_back(idx);
|
|
}
|
|
}
|
|
|
|
if (indexes.empty()) {
|
|
return {};
|
|
}
|
|
|
|
std::vector<Hyphenator::BreakInfo> breaks;
|
|
breaks.reserve(indexes.size());
|
|
for (const size_t idx : indexes) {
|
|
breaks.push_back({byteOffsetForIndex(cps, idx), true});
|
|
}
|
|
|
|
return breaks;
|
|
}
|
|
|
|
void Hyphenator::setPreferredLanguage(const std::string& lang) { cachedHyphenator_ = hyphenatorForLanguage(lang); }
|