optimization

This commit is contained in:
Arthur Tazhitdinov 2026-01-08 03:27:27 +05:00
parent 58314e9efd
commit 3ef2448f72
2 changed files with 35 additions and 34 deletions

View File

@ -290,14 +290,11 @@ bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index)
// Returns byte indexes where the word may break according to English syllable rules. // Returns byte indexes where the word may break according to English syllable rules.
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) { std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
std::vector<size_t> indexes; std::vector<size_t> indexes;
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { const size_t wordSize = cps.size();
return indexes;
}
const auto lowerWord = lowercaseLatinWord(cps);
std::vector<size_t> vowelPositions; std::vector<size_t> vowelPositions;
vowelPositions.reserve(cps.size()); vowelPositions.reserve(wordSize / 2);
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < wordSize; ++i) {
if (isLatinVowel(cps[i].value)) { if (isLatinVowel(cps[i].value)) {
vowelPositions.push_back(i); vowelPositions.push_back(i);
} }
@ -321,7 +318,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
const size_t clusterStart = leftVowel + 1; const size_t clusterStart = leftVowel + 1;
const size_t clusterEnd = rightVowel; const size_t clusterEnd = rightVowel;
const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd); const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
size_t breakIndex = clusterEnd - onsetLen; const size_t breakIndex = clusterEnd - onsetLen;
if (!englishBreakAllowed(cps, breakIndex)) { if (!englishBreakAllowed(cps, breakIndex)) {
continue; continue;
@ -329,10 +326,14 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
indexes.push_back(breakIndex); indexes.push_back(breakIndex);
} }
const auto lowerWord = lowercaseLatinWord(cps);
const size_t preDedupeCount = indexes.size();
appendMorphologyBreaks(cps, lowerWord, indexes); appendMorphologyBreaks(cps, lowerWord, indexes);
if (indexes.size() > preDedupeCount) {
std::sort(indexes.begin(), indexes.end()); std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
}
return indexes; return indexes;
} }

View File

@ -333,68 +333,68 @@ void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::ve
// Produces syllable break indexes tailored to Russian phonotactics. // Produces syllable break indexes tailored to Russian phonotactics.
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) { std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
std::vector<size_t> indexes; std::vector<size_t> indexes;
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { const size_t wordSize = cps.size();
return indexes;
}
const auto lowerWord = lowercaseCyrillicWord(cps);
// Collect vowel positions.
std::vector<size_t> vowelPositions; std::vector<size_t> vowelPositions;
vowelPositions.reserve(cps.size()); vowelPositions.reserve(wordSize / 2); // Typical estimate: ~50% vowels
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < wordSize; ++i) {
if (isCyrillicVowel(cps[i].value)) { if (isCyrillicVowel(cps[i].value)) {
vowelPositions.push_back(i); vowelPositions.push_back(i);
} }
} }
// Need at least 2 vowels to create a syllable break.
if (vowelPositions.size() < 2) { if (vowelPositions.size() < 2) {
return indexes; return indexes;
} }
// Process inter-vowel clusters for hyphenation points.
for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) { for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
const size_t leftVowel = vowelPositions[v]; const size_t leftVowel = vowelPositions[v];
const size_t rightVowel = vowelPositions[v + 1]; const size_t rightVowel = vowelPositions[v + 1];
const size_t suffixLen = wordSize - rightVowel;
// Adjacent vowels: can break between them if constraints allow.
if (rightVowel - leftVowel == 1) { if (rightVowel - leftVowel == 1) {
if (rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) && if (rightVowel >= MIN_PREFIX_CP && suffixLen >= MIN_SUFFIX_CP && !nextToSoftSign(cps, rightVowel) &&
russianBreakAllowed(cps, rightVowel)) { russianBreakAllowed(cps, rightVowel)) {
indexes.push_back(rightVowel); indexes.push_back(rightVowel);
} }
continue; continue;
} }
// Consonant cluster between vowels: find optimal break point.
const size_t clusterStart = leftVowel + 1; const size_t clusterStart = leftVowel + 1;
const size_t clusterEnd = rightVowel; const size_t clusterEnd = rightVowel;
size_t breakIndex = std::numeric_limits<size_t>::max(); // Try double consonant split first (preferred).
const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd); size_t breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd);
if (split != std::numeric_limits<size_t>::max()) {
breakIndex = split; // Fall back to onset-based split.
} else { if (breakIndex == std::numeric_limits<size_t>::max()) {
const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd); const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
breakIndex = clusterEnd - onsetLen; breakIndex = clusterEnd - onsetLen;
} }
if (breakIndex == std::numeric_limits<size_t>::max()) { // Validate candidate break point.
if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) ||
!russianBreakAllowed(cps, breakIndex)) {
continue; continue;
} }
if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) {
continue;
}
if (nextToSoftSign(cps, breakIndex)) {
continue;
}
if (!russianBreakAllowed(cps, breakIndex)) {
continue;
}
indexes.push_back(breakIndex); indexes.push_back(breakIndex);
} }
const auto lowerWord = lowercaseCyrillicWord(cps);
const size_t preDedupeCount = indexes.size();
appendMorphologyBreaks(cps, lowerWord, indexes); appendMorphologyBreaks(cps, lowerWord, indexes);
if (indexes.size() > preDedupeCount) {
std::sort(indexes.begin(), indexes.end()); std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
}
return indexes; return indexes;
} }