mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 22:57:50 +03:00
Enhance hyphenation logic: add morphology break handling and improve vowel detection
This commit is contained in:
parent
0fa50291b9
commit
5d00e5ac0f
@ -1,7 +1,9 @@
|
|||||||
#include "EnglishHyphenator.h"
|
#include "EnglishHyphenator.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <array>
|
||||||
#include <initializer_list>
|
#include <initializer_list>
|
||||||
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
@ -45,6 +47,105 @@ bool isEnglishFricativeChar(const char c) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct LatinLiteral {
|
||||||
|
const char* text;
|
||||||
|
size_t length;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
|
||||||
|
|
||||||
|
std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
|
||||||
|
std::string lower;
|
||||||
|
lower.reserve(cps.size());
|
||||||
|
for (const auto& info : cps) {
|
||||||
|
lower.push_back(lowerLatinChar(info.value));
|
||||||
|
}
|
||||||
|
return lower;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
|
||||||
|
if (!pattern.text || pattern.length == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (start + pattern.length > lowerWord.size()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < pattern.length; ++i) {
|
||||||
|
if (lowerWord[start + i] != pattern.text[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
||||||
|
if (start >= end || start >= cps.size()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const size_t clampedEnd = std::min(end, cps.size());
|
||||||
|
for (size_t i = start; i < clampedEnd; ++i) {
|
||||||
|
if (isLatinVowel(cps[i].value)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
|
||||||
|
std::vector<size_t>& indexes) {
|
||||||
|
static constexpr std::array<LatinLiteral, 20> PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},
|
||||||
|
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
|
||||||
|
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
|
||||||
|
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
|
||||||
|
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
||||||
|
|
||||||
|
static constexpr std::array<LatinLiteral, 24> SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},
|
||||||
|
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
||||||
|
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
|
||||||
|
{"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
|
||||||
|
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
|
||||||
|
{"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
|
||||||
|
|
||||||
|
const size_t length = cps.size();
|
||||||
|
if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto tryPush = [&](const size_t breakIndex) {
|
||||||
|
if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (nextToApostrophe(cps, breakIndex)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
indexes.push_back(breakIndex);
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto& prefix : PREFIXES) {
|
||||||
|
if (prefix.length == 0 || prefix.length >= length) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!matchesPatternAt(lowerWord, 0, prefix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tryPush(prefix.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto& suffix : SUFFIXES) {
|
||||||
|
if (suffix.length == 0 || suffix.length >= length) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const size_t breakIndex = length - suffix.length;
|
||||||
|
if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tryPush(breakIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct CharPair {
|
struct CharPair {
|
||||||
char first;
|
char first;
|
||||||
char second;
|
char second;
|
||||||
@ -225,6 +326,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const auto lowerWord = lowercaseLatinWord(cps);
|
||||||
std::vector<size_t> vowelPositions;
|
std::vector<size_t> vowelPositions;
|
||||||
vowelPositions.reserve(cps.size());
|
vowelPositions.reserve(cps.size());
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 0; i < cps.size(); ++i) {
|
||||||
@ -263,6 +365,8 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
indexes.push_back(breakIndex);
|
indexes.push_back(breakIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
appendMorphologyBreaks(cps, lowerWord, indexes);
|
||||||
|
|
||||||
std::sort(indexes.begin(), indexes.end());
|
std::sort(indexes.begin(), indexes.end());
|
||||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||||
return indexes;
|
return indexes;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user