mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-07 08:07:40 +03:00
refactor: Simplify break information handling in buildExplicitBreakInfos and enhance clarity in liangBreakIndexes
This commit is contained in:
parent
5dab3d4fcf
commit
04a084f6c8
@ -48,28 +48,6 @@ std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<Cod
|
|||||||
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (breaks.empty()) {
|
|
||||||
return breaks;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort by byte offset so we can deduplicate sequential markers in-place.
|
|
||||||
std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
|
|
||||||
return lhs.byteOffset < rhs.byteOffset;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag.
|
|
||||||
size_t writePos = 0;
|
|
||||||
for (size_t readPos = 1; readPos < breaks.size(); ++readPos) {
|
|
||||||
if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) {
|
|
||||||
// Merge: the combined entry requires an inserted hyphen if any marker at this offset does.
|
|
||||||
breaks[writePos].requiresInsertedHyphen =
|
|
||||||
breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen;
|
|
||||||
} else {
|
|
||||||
breaks[++writePos] = breaks[readPos];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
breaks.resize(writePos + 1);
|
|
||||||
|
|
||||||
return breaks;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -354,36 +354,43 @@ std::vector<size_t> liangBreakIndexes(const std::vector<CodepointInfo>& cps,
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Liang scores: one entry per augmented char (leading/trailing dots included).
|
||||||
std::vector<uint8_t> scores(augmented.charCount(), 0);
|
std::vector<uint8_t> scores(augmented.charCount(), 0);
|
||||||
|
|
||||||
|
// Walk every starting character position and stream bytes through the trie.
|
||||||
for (size_t charStart = 0; charStart < augmented.charByteOffsets.size(); ++charStart) {
|
for (size_t charStart = 0; charStart < augmented.charByteOffsets.size(); ++charStart) {
|
||||||
size_t byteStart = augmented.charByteOffsets[charStart];
|
const size_t byteStart = augmented.charByteOffsets[charStart];
|
||||||
AutomatonState state = root;
|
AutomatonState state = root;
|
||||||
|
|
||||||
for (size_t cursor = byteStart; cursor < augmented.bytes.size(); ++cursor) {
|
for (size_t cursor = byteStart; cursor < augmented.bytes.size(); ++cursor) {
|
||||||
AutomatonState next;
|
AutomatonState next;
|
||||||
if (!transition(automaton, state, augmented.bytes[cursor], next)) {
|
if (!transition(automaton, state, augmented.bytes[cursor], next)) {
|
||||||
break;
|
break; // No more matches for this prefix.
|
||||||
}
|
}
|
||||||
state = next;
|
state = next;
|
||||||
|
|
||||||
if (state.levels && state.levelsLen > 0) {
|
if (state.levels && state.levelsLen > 0) {
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
|
// Each packed byte stores the byte-distance delta and the Liang level digit.
|
||||||
for (size_t i = 0; i < state.levelsLen; ++i) {
|
for (size_t i = 0; i < state.levelsLen; ++i) {
|
||||||
const uint8_t packed = state.levels[i];
|
const uint8_t packed = state.levels[i];
|
||||||
const size_t dist = static_cast<size_t>(packed / 10);
|
const size_t dist = static_cast<size_t>(packed / 10);
|
||||||
const uint8_t level = static_cast<uint8_t>(packed % 10);
|
const uint8_t level = static_cast<uint8_t>(packed % 10);
|
||||||
|
|
||||||
offset += dist;
|
offset += dist;
|
||||||
const size_t splitByte = byteStart + offset;
|
const size_t splitByte = byteStart + offset;
|
||||||
if (splitByte >= augmented.byteToCharIndex.size()) {
|
if (splitByte >= augmented.byteToCharIndex.size()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t boundary = augmented.byteToCharIndex[splitByte];
|
const int32_t boundary = augmented.byteToCharIndex[splitByte];
|
||||||
if (boundary < 0) {
|
if (boundary < 0) {
|
||||||
continue;
|
continue; // Mid-codepoint byte, wait for the next one.
|
||||||
}
|
}
|
||||||
if (boundary < 2 || boundary + 2 > static_cast<int32_t>(augmented.charCount())) {
|
if (boundary < 2 || boundary + 2 > static_cast<int32_t>(augmented.charCount())) {
|
||||||
continue;
|
continue; // Skip splits that land in the leading/trailing sentinels.
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t idx = static_cast<size_t>(boundary);
|
const size_t idx = static_cast<size_t>(boundary);
|
||||||
if (idx >= scores.size()) {
|
if (idx >= scores.size()) {
|
||||||
continue;
|
continue;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user