mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-06 15:47:39 +03:00
optimization
This commit is contained in:
parent
34d2b81f40
commit
58314e9efd
@ -207,28 +207,29 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
|
|||||||
const int spacing = isFirstWord ? 0 : spaceWidth;
|
const int spacing = isFirstWord ? 0 : spaceWidth;
|
||||||
const int candidateWidth = spacing + wordWidths[currentIndex];
|
const int candidateWidth = spacing + wordWidths[currentIndex];
|
||||||
|
|
||||||
|
// Word fits on current line
|
||||||
if (lineWidth + candidateWidth <= pageWidth) {
|
if (lineWidth + candidateWidth <= pageWidth) {
|
||||||
lineWidth += candidateWidth;
|
lineWidth += candidateWidth;
|
||||||
currentIndex += 1;
|
++currentIndex;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Word would overflow — try to split based on hyphenation points
|
// Word would overflow — try to split based on hyphenation points
|
||||||
const int availableWidth = pageWidth - lineWidth - spacing;
|
const int availableWidth = pageWidth - lineWidth - spacing;
|
||||||
const bool allowFallbackBreaks =
|
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
|
||||||
isFirstWord; // Permit fallback breaks only when the first word on the line still overflows
|
|
||||||
if (availableWidth > 0 &&
|
if (availableWidth > 0 &&
|
||||||
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
|
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
|
||||||
// Prefix now fits; append it to this line and immediately move to the next line
|
// Prefix now fits; append it to this line and move to next line
|
||||||
lineWidth += spacing + wordWidths[currentIndex];
|
lineWidth += spacing + wordWidths[currentIndex];
|
||||||
currentIndex += 1;
|
++currentIndex;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Could not split: force at least one word per line to avoid infinite loop
|
// Could not split: force at least one word per line to avoid infinite loop
|
||||||
if (currentIndex == lineStart) {
|
if (currentIndex == lineStart) {
|
||||||
lineWidth += candidateWidth;
|
lineWidth += candidateWidth;
|
||||||
currentIndex += 1;
|
++currentIndex;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -249,19 +250,21 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Position iterators at the target word and its style entry.
|
// Get iterators to target word and style.
|
||||||
auto wordIt = words.begin();
|
auto wordIt = words.begin();
|
||||||
auto styleIt = wordStyles.begin();
|
auto styleIt = wordStyles.begin();
|
||||||
std::advance(wordIt, wordIndex);
|
std::advance(wordIt, wordIndex);
|
||||||
std::advance(styleIt, wordIndex);
|
std::advance(styleIt, wordIndex);
|
||||||
|
|
||||||
|
const std::string& word = *wordIt;
|
||||||
|
const auto style = *styleIt;
|
||||||
|
|
||||||
// Collect candidate breakpoints (byte offsets and hyphen requirements).
|
// Collect candidate breakpoints (byte offsets and hyphen requirements).
|
||||||
const auto breakInfos = Hyphenator::breakOffsets(*wordIt, allowFallbackBreaks);
|
const auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
|
||||||
if (breakInfos.empty()) {
|
if (breakInfos.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto style = *styleIt;
|
|
||||||
size_t chosenOffset = 0;
|
size_t chosenOffset = 0;
|
||||||
int chosenWidth = -1;
|
int chosenWidth = -1;
|
||||||
bool chosenNeedsHyphen = true;
|
bool chosenNeedsHyphen = true;
|
||||||
@ -269,22 +272,19 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
|||||||
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
||||||
for (const auto& info : breakInfos) {
|
for (const auto& info : breakInfos) {
|
||||||
const size_t offset = info.byteOffset;
|
const size_t offset = info.byteOffset;
|
||||||
if (offset == 0 || offset >= wordIt->size()) {
|
if (offset == 0 || offset >= word.size()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool needsHyphen = info.requiresInsertedHyphen;
|
const bool needsHyphen = info.requiresInsertedHyphen;
|
||||||
std::string prefix = wordIt->substr(0, offset);
|
const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
|
||||||
const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
|
if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
|
||||||
if (prefixWidth > availableWidth) {
|
continue; // Skip if too wide or not an improvement
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prefixWidth > chosenWidth) {
|
chosenWidth = prefixWidth;
|
||||||
chosenWidth = prefixWidth;
|
chosenOffset = offset;
|
||||||
chosenOffset = offset;
|
chosenNeedsHyphen = needsHyphen;
|
||||||
chosenNeedsHyphen = needsHyphen;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (chosenWidth < 0) {
|
if (chosenWidth < 0) {
|
||||||
@ -293,7 +293,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Split the word at the selected breakpoint and append a hyphen if required.
|
// Split the word at the selected breakpoint and append a hyphen if required.
|
||||||
std::string remainder = wordIt->substr(chosenOffset);
|
std::string remainder = word.substr(chosenOffset);
|
||||||
wordIt->resize(chosenOffset);
|
wordIt->resize(chosenOffset);
|
||||||
if (chosenNeedsHyphen) {
|
if (chosenNeedsHyphen) {
|
||||||
wordIt->push_back('-');
|
wordIt->push_back('-');
|
||||||
|
|||||||
@ -59,8 +59,6 @@ bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !is
|
|||||||
|
|
||||||
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
|
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
|
||||||
|
|
||||||
bool isVowel(const uint32_t cp) { return isLatinVowel(cp) || isCyrillicVowel(cp); }
|
|
||||||
|
|
||||||
bool isPunctuation(const uint32_t cp) {
|
bool isPunctuation(const uint32_t cp) {
|
||||||
switch (cp) {
|
switch (cp) {
|
||||||
case '.':
|
case '.':
|
||||||
|
|||||||
@ -25,7 +25,6 @@ bool isCyrillicVowel(uint32_t cp);
|
|||||||
bool isCyrillicConsonant(uint32_t cp);
|
bool isCyrillicConsonant(uint32_t cp);
|
||||||
|
|
||||||
bool isAlphabetic(uint32_t cp);
|
bool isAlphabetic(uint32_t cp);
|
||||||
bool isVowel(uint32_t cp);
|
|
||||||
bool isPunctuation(uint32_t cp);
|
bool isPunctuation(uint32_t cp);
|
||||||
bool isAsciiDigit(uint32_t cp);
|
bool isAsciiDigit(uint32_t cp);
|
||||||
bool isExplicitHyphen(uint32_t cp);
|
bool isExplicitHyphen(uint32_t cp);
|
||||||
|
|||||||
@ -3,7 +3,6 @@
|
|||||||
#include <Utf8.h>
|
#include <Utf8.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "EnglishHyphenator.h"
|
#include "EnglishHyphenator.h"
|
||||||
@ -32,12 +31,6 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Preferred language hint; empty means "auto".
|
|
||||||
std::string& preferredLanguage() {
|
|
||||||
static std::string lang;
|
|
||||||
return lang;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cached hyphenator instance for the current preferred language.
|
// Cached hyphenator instance for the current preferred language.
|
||||||
const LanguageHyphenator*& cachedHyphenator() {
|
const LanguageHyphenator*& cachedHyphenator() {
|
||||||
static const LanguageHyphenator* hyphenator = nullptr;
|
static const LanguageHyphenator* hyphenator = nullptr;
|
||||||
@ -86,67 +79,54 @@ void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
|||||||
|
|
||||||
// Asks the language hyphenator for legal break positions inside the word.
|
// Asks the language hyphenator for legal break positions inside the word.
|
||||||
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
if (const auto* hyphenator = cachedHyphenator()) {
|
if (const auto* hyphenator = cachedHyphenator()) {
|
||||||
auto indexes = hyphenator->breakIndexes(cps);
|
return hyphenator->breakIndexes(cps);
|
||||||
return indexes;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Maps a codepoint index back to its byte offset inside the source word.
|
// Maps a codepoint index back to its byte offset inside the source word.
|
||||||
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
|
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||||
if (index >= cps.size()) {
|
return (index < cps.size()) ? cps[index].byteOffset : (cps.empty() ? 0 : cps.back().byteOffset);
|
||||||
return cps.empty() ? 0 : cps.back().byteOffset;
|
|
||||||
}
|
|
||||||
return cps[index].byteOffset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Builds a vector of break information from explicit hyphen markers in the given codepoints.
|
// Builds a vector of break information from explicit hyphen markers in the given codepoints.
|
||||||
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<Hyphenator::BreakInfo> breaks;
|
std::vector<Hyphenator::BreakInfo> breaks;
|
||||||
breaks.reserve(cps.size());
|
|
||||||
|
|
||||||
// Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
|
// Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 1; i + 1 < cps.size(); ++i) {
|
||||||
const uint32_t cp = cps[i].value;
|
const uint32_t cp = cps[i].value;
|
||||||
if (!isExplicitHyphen(cp) || i == 0 || i + 1 >= cps.size()) {
|
if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
||||||
continue; // Need at least one alphabetic character on both sides.
|
|
||||||
}
|
|
||||||
if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Offset points to the next codepoint so rendering starts after the hyphen marker.
|
// Offset points to the next codepoint so rendering starts after the hyphen marker.
|
||||||
breaks.push_back({byteOffsetForIndex(cps, i + 1), isSoftHyphen(cp)});
|
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (breaks.empty()) {
|
if (breaks.empty()) {
|
||||||
return breaks;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort by byte offset so we can deduplicate sequential markers.
|
// Sort by byte offset so we can deduplicate sequential markers in-place.
|
||||||
// Multiple dash codepoints can point to the same byte offset once punctuation is trimmed; sort before merging.
|
|
||||||
std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
|
std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
|
||||||
return lhs.byteOffset < rhs.byteOffset;
|
return lhs.byteOffset < rhs.byteOffset;
|
||||||
});
|
});
|
||||||
|
|
||||||
// Ensure we keep a single entry per break while retaining the "needs hyphen" flag when any marker requested it.
|
// Deduplicate in-place: merge entries at same offset while retaining "needs hyphen" flag.
|
||||||
std::vector<Hyphenator::BreakInfo> deduped;
|
size_t writePos = 0;
|
||||||
deduped.reserve(breaks.size());
|
for (size_t readPos = 1; readPos < breaks.size(); ++readPos) {
|
||||||
for (const auto& entry : breaks) {
|
if (breaks[readPos].byteOffset == breaks[writePos].byteOffset) {
|
||||||
if (!deduped.empty() && deduped.back().byteOffset == entry.byteOffset) {
|
// Merge: keep the "needs hyphen" flag set if either entry at this offset requires an inserted hyphen.
|
||||||
// Merge entries so the "needs hyphen" flag stays set if any marker at the same offset requested it.
|
breaks[writePos].requiresInsertedHyphen =
|
||||||
deduped.back().requiresInsertedHyphen = deduped.back().requiresInsertedHyphen || entry.requiresInsertedHyphen;
|
breaks[writePos].requiresInsertedHyphen || breaks[readPos].requiresInsertedHyphen;
|
||||||
} else {
|
} else {
|
||||||
deduped.push_back(entry);
|
breaks[++writePos] = breaks[readPos];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
breaks.resize(writePos + 1);
|
||||||
|
|
||||||
return deduped;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -170,22 +150,25 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
return explicitBreakInfos;
|
return explicitBreakInfos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ask language hyphenator for legal break points, optionally augment with naive fallback.
|
// Ask language hyphenator for legal break points.
|
||||||
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
||||||
|
|
||||||
|
// Only add fallback breaks if needed and deduplicate if both language and fallback breaks exist.
|
||||||
if (includeFallback) {
|
if (includeFallback) {
|
||||||
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
||||||
indexes.push_back(idx);
|
indexes.push_back(idx);
|
||||||
}
|
}
|
||||||
|
// Fallback indexes may duplicate language-specific ones, so sort and deduplicate the combined list.
|
||||||
|
std::sort(indexes.begin(), indexes.end());
|
||||||
|
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||||
|
} else if (indexes.empty()) {
|
||||||
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (indexes.empty()) {
|
if (indexes.empty()) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort/deduplicate break indexes before converting them back to byte offsets.
|
|
||||||
std::sort(indexes.begin(), indexes.end());
|
|
||||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
|
||||||
|
|
||||||
std::vector<Hyphenator::BreakInfo> breaks;
|
std::vector<Hyphenator::BreakInfo> breaks;
|
||||||
breaks.reserve(indexes.size());
|
breaks.reserve(indexes.size());
|
||||||
for (const size_t idx : indexes) {
|
for (const size_t idx : indexes) {
|
||||||
@ -195,7 +178,4 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
return breaks;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Hyphenator::setPreferredLanguage(const std::string& lang) {
|
void Hyphenator::setPreferredLanguage(const std::string& lang) { cachedHyphenator() = hyphenatorForLanguage(lang); }
|
||||||
preferredLanguage() = lang;
|
|
||||||
cachedHyphenator() = hyphenatorForLanguage(lang);
|
|
||||||
}
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user