mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-06 15:47:39 +03:00
optimization
This commit is contained in:
parent
58314e9efd
commit
3ef2448f72
@ -290,14 +290,11 @@ bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index)
|
|||||||
// Returns byte indexes where the word may break according to English syllable rules.
|
// Returns byte indexes where the word may break according to English syllable rules.
|
||||||
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<size_t> indexes;
|
std::vector<size_t> indexes;
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
const size_t wordSize = cps.size();
|
||||||
return indexes;
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto lowerWord = lowercaseLatinWord(cps);
|
|
||||||
std::vector<size_t> vowelPositions;
|
std::vector<size_t> vowelPositions;
|
||||||
vowelPositions.reserve(cps.size());
|
vowelPositions.reserve(wordSize / 2);
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 0; i < wordSize; ++i) {
|
||||||
if (isLatinVowel(cps[i].value)) {
|
if (isLatinVowel(cps[i].value)) {
|
||||||
vowelPositions.push_back(i);
|
vowelPositions.push_back(i);
|
||||||
}
|
}
|
||||||
@ -321,7 +318,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
const size_t clusterStart = leftVowel + 1;
|
const size_t clusterStart = leftVowel + 1;
|
||||||
const size_t clusterEnd = rightVowel;
|
const size_t clusterEnd = rightVowel;
|
||||||
const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
|
const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
|
||||||
size_t breakIndex = clusterEnd - onsetLen;
|
const size_t breakIndex = clusterEnd - onsetLen;
|
||||||
|
|
||||||
if (!englishBreakAllowed(cps, breakIndex)) {
|
if (!englishBreakAllowed(cps, breakIndex)) {
|
||||||
continue;
|
continue;
|
||||||
@ -329,10 +326,14 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
indexes.push_back(breakIndex);
|
indexes.push_back(breakIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const auto lowerWord = lowercaseLatinWord(cps);
|
||||||
|
const size_t preDedupeCount = indexes.size();
|
||||||
appendMorphologyBreaks(cps, lowerWord, indexes);
|
appendMorphologyBreaks(cps, lowerWord, indexes);
|
||||||
|
|
||||||
|
if (indexes.size() > preDedupeCount) {
|
||||||
std::sort(indexes.begin(), indexes.end());
|
std::sort(indexes.begin(), indexes.end());
|
||||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||||
|
}
|
||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -333,68 +333,68 @@ void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::ve
|
|||||||
// Produces syllable break indexes tailored to Russian phonotactics.
|
// Produces syllable break indexes tailored to Russian phonotactics.
|
||||||
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<size_t> indexes;
|
std::vector<size_t> indexes;
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
const size_t wordSize = cps.size();
|
||||||
return indexes;
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto lowerWord = lowercaseCyrillicWord(cps);
|
|
||||||
|
|
||||||
|
// Collect vowel positions.
|
||||||
std::vector<size_t> vowelPositions;
|
std::vector<size_t> vowelPositions;
|
||||||
vowelPositions.reserve(cps.size());
|
vowelPositions.reserve(wordSize / 2); // Typical estimate: ~50% vowels
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 0; i < wordSize; ++i) {
|
||||||
if (isCyrillicVowel(cps[i].value)) {
|
if (isCyrillicVowel(cps[i].value)) {
|
||||||
vowelPositions.push_back(i);
|
vowelPositions.push_back(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Need at least 2 vowels to create a syllable break.
|
||||||
if (vowelPositions.size() < 2) {
|
if (vowelPositions.size() < 2) {
|
||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process inter-vowel clusters for hyphenation points.
|
||||||
for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
|
for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
|
||||||
const size_t leftVowel = vowelPositions[v];
|
const size_t leftVowel = vowelPositions[v];
|
||||||
const size_t rightVowel = vowelPositions[v + 1];
|
const size_t rightVowel = vowelPositions[v + 1];
|
||||||
|
const size_t suffixLen = wordSize - rightVowel;
|
||||||
|
|
||||||
|
// Adjacent vowels: can break between them if constraints allow.
|
||||||
if (rightVowel - leftVowel == 1) {
|
if (rightVowel - leftVowel == 1) {
|
||||||
if (rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
|
if (rightVowel >= MIN_PREFIX_CP && suffixLen >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
|
||||||
russianBreakAllowed(cps, rightVowel)) {
|
russianBreakAllowed(cps, rightVowel)) {
|
||||||
indexes.push_back(rightVowel);
|
indexes.push_back(rightVowel);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Consonant cluster between vowels: find optimal break point.
|
||||||
const size_t clusterStart = leftVowel + 1;
|
const size_t clusterStart = leftVowel + 1;
|
||||||
const size_t clusterEnd = rightVowel;
|
const size_t clusterEnd = rightVowel;
|
||||||
|
|
||||||
size_t breakIndex = std::numeric_limits<size_t>::max();
|
// Try double consonant split first (preferred).
|
||||||
const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
|
size_t breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd);
|
||||||
if (split != std::numeric_limits<size_t>::max()) {
|
|
||||||
breakIndex = split;
|
// Fall back to onset-based split.
|
||||||
} else {
|
if (breakIndex == std::numeric_limits<size_t>::max()) {
|
||||||
const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
|
const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
|
||||||
breakIndex = clusterEnd - onsetLen;
|
breakIndex = clusterEnd - onsetLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (breakIndex == std::numeric_limits<size_t>::max()) {
|
// Validate candidate break point.
|
||||||
|
if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) ||
|
||||||
|
!russianBreakAllowed(cps, breakIndex)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (nextToSoftSign(cps, breakIndex)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!russianBreakAllowed(cps, breakIndex)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
indexes.push_back(breakIndex);
|
indexes.push_back(breakIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const auto lowerWord = lowercaseCyrillicWord(cps);
|
||||||
|
const size_t preDedupeCount = indexes.size();
|
||||||
appendMorphologyBreaks(cps, lowerWord, indexes);
|
appendMorphologyBreaks(cps, lowerWord, indexes);
|
||||||
|
|
||||||
|
if (indexes.size() > preDedupeCount) {
|
||||||
std::sort(indexes.begin(), indexes.end());
|
std::sort(indexes.begin(), indexes.end());
|
||||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||||
|
}
|
||||||
|
|
||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user