mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-06 15:47:39 +03:00
refactor: Simplify break information handling in buildExplicitBreakInfos and enhance clarity in liangBreakIndexes
This commit is contained in:
parent
5dab3d4fcf
commit
04a084f6c8
@ -48,28 +48,6 @@ std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<Cod
|
||||
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
||||
}
|
||||
|
||||
if (breaks.empty()) {
|
||||
return breaks;
|
||||
}
|
||||
|
||||
// Sort by byte offset so we can deduplicate sequential markers in-place.
|
||||
std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
|
||||
return lhs.byteOffset < rhs.byteOffset;
|
||||
});
|
||||
|
||||
// Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag.
|
||||
size_t writePos = 0;
|
||||
for (size_t readPos = 1; readPos < breaks.size(); ++readPos) {
|
||||
if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) {
|
||||
// Merge: explicit hyphen wins over soft hyphen at same offset.
|
||||
breaks[writePos].requiresInsertedHyphen =
|
||||
breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen;
|
||||
} else {
|
||||
breaks[++writePos] = breaks[readPos];
|
||||
}
|
||||
}
|
||||
breaks.resize(writePos + 1);
|
||||
|
||||
return breaks;
|
||||
}
|
||||
|
||||
|
||||
@ -354,36 +354,43 @@ std::vector<size_t> liangBreakIndexes(const std::vector<CodepointInfo>& cps,
|
||||
return {};
|
||||
}
|
||||
|
||||
// Liang scores: one entry per augmented char (leading/trailing dots included).
|
||||
std::vector<uint8_t> scores(augmented.charCount(), 0);
|
||||
|
||||
// Walk every starting character position and stream bytes through the trie.
|
||||
for (size_t charStart = 0; charStart < augmented.charByteOffsets.size(); ++charStart) {
|
||||
size_t byteStart = augmented.charByteOffsets[charStart];
|
||||
const size_t byteStart = augmented.charByteOffsets[charStart];
|
||||
AutomatonState state = root;
|
||||
|
||||
for (size_t cursor = byteStart; cursor < augmented.bytes.size(); ++cursor) {
|
||||
AutomatonState next;
|
||||
if (!transition(automaton, state, augmented.bytes[cursor], next)) {
|
||||
break;
|
||||
break; // No more matches for this prefix.
|
||||
}
|
||||
state = next;
|
||||
|
||||
if (state.levels && state.levelsLen > 0) {
|
||||
size_t offset = 0;
|
||||
// Each packed byte stores the byte-distance delta and the Liang level digit.
|
||||
for (size_t i = 0; i < state.levelsLen; ++i) {
|
||||
const uint8_t packed = state.levels[i];
|
||||
const size_t dist = static_cast<size_t>(packed / 10);
|
||||
const uint8_t level = static_cast<uint8_t>(packed % 10);
|
||||
|
||||
offset += dist;
|
||||
const size_t splitByte = byteStart + offset;
|
||||
if (splitByte >= augmented.byteToCharIndex.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int32_t boundary = augmented.byteToCharIndex[splitByte];
|
||||
if (boundary < 0) {
|
||||
continue;
|
||||
continue; // Mid-codepoint byte, wait for the next one.
|
||||
}
|
||||
if (boundary < 2 || boundary + 2 > static_cast<int32_t>(augmented.charCount())) {
|
||||
continue;
|
||||
continue; // Skip splits that land in the leading/trailing sentinels.
|
||||
}
|
||||
|
||||
const size_t idx = static_cast<size_t>(boundary);
|
||||
if (idx >= scores.size()) {
|
||||
continue;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user