mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-07 16:17:38 +03:00
support for soft hyphens
This commit is contained in:
parent
8cbf24d12f
commit
4f94cf2c36
@ -13,6 +13,42 @@
|
|||||||
|
|
||||||
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
// Encoded length in bytes, derived from the literal (minus its NUL terminator) so the two cannot drift apart.
constexpr size_t SOFT_HYPHEN_BYTES = sizeof(SOFT_HYPHEN_UTF8) - 1;

// Returns true when `word` contains at least one UTF-8 encoded soft hyphen.
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }

// Removes every soft hyphen in-place so rendered glyphs match measured widths.
// Works as a single left-to-right compaction: bytes that are kept are copied down over the removed
// sequences and the string is truncated once at the end, instead of shifting the tail on every hit.
void stripSoftHyphensInPlace(std::string& word) {
  size_t readPos = word.find(SOFT_HYPHEN_UTF8);
  if (readPos == std::string::npos) {
    return;  // Nothing to strip; leave the string untouched.
  }

  size_t writePos = readPos;
  while (readPos < word.size()) {
    // compare() clamps the length at the end of the string, so a truncated trailing byte never matches.
    if (word.compare(readPos, SOFT_HYPHEN_BYTES, SOFT_HYPHEN_UTF8) == 0) {
      readPos += SOFT_HYPHEN_BYTES;
      continue;
    }
    word[writePos++] = word[readPos++];
  }
  word.resize(writePos);
}
|
||||||
|
|
||||||
|
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
||||||
|
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||||
|
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
||||||
|
const bool hasSoftHyphen = containsSoftHyphen(word);
|
||||||
|
if (!hasSoftHyphen && !appendHyphen) {
|
||||||
|
return renderer.getTextWidth(fontId, word.c_str(), style);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string sanitized = word;
|
||||||
|
if (hasSoftHyphen) {
|
||||||
|
stripSoftHyphensInPlace(sanitized);
|
||||||
|
}
|
||||||
|
if (appendHyphen) {
|
||||||
|
sanitized.push_back('-');
|
||||||
|
}
|
||||||
|
return renderer.getTextWidth(fontId, sanitized.c_str(), style);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
|
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
|
||||||
if (word.empty()) return;
|
if (word.empty()) return;
|
||||||
|
|
||||||
@ -58,7 +94,7 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
|||||||
auto wordStylesIt = wordStyles.begin();
|
auto wordStylesIt = wordStyles.begin();
|
||||||
|
|
||||||
while (wordsIt != words.end()) {
|
while (wordsIt != words.end()) {
|
||||||
wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
|
wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));
|
||||||
|
|
||||||
std::advance(wordsIt, 1);
|
std::advance(wordsIt, 1);
|
||||||
std::advance(wordStylesIt, 1);
|
std::advance(wordStylesIt, 1);
|
||||||
@ -239,10 +275,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
|||||||
|
|
||||||
const bool needsHyphen = info.requiresInsertedHyphen;
|
const bool needsHyphen = info.requiresInsertedHyphen;
|
||||||
std::string prefix = wordIt->substr(0, offset);
|
std::string prefix = wordIt->substr(0, offset);
|
||||||
if (needsHyphen) {
|
const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
|
||||||
prefix.push_back('-');
|
|
||||||
}
|
|
||||||
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
|
||||||
if (prefixWidth > availableWidth) {
|
if (prefixWidth > availableWidth) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -274,7 +307,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
|||||||
|
|
||||||
// Update cached widths to reflect the new prefix/remainder pairing.
|
// Update cached widths to reflect the new prefix/remainder pairing.
|
||||||
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
|
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
|
||||||
const uint16_t remainderWidth = renderer.getTextWidth(fontId, remainder.c_str(), style);
|
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
|
||||||
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
|
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -330,5 +363,11 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
|
|||||||
std::list<EpdFontFamily::Style> lineWordStyles;
|
std::list<EpdFontFamily::Style> lineWordStyles;
|
||||||
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
|
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
|
||||||
|
|
||||||
|
for (auto& word : lineWords) {
|
||||||
|
if (containsSoftHyphen(word)) {
|
||||||
|
stripSoftHyphensInPlace(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
|
processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
|
||||||
}
|
}
|
||||||
@ -90,6 +90,8 @@ bool isPunctuation(const uint32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only for the ASCII decimal digits '0' through '9'; digits from other scripts are rejected.
bool isAsciiDigit(const uint32_t cp) {
  constexpr uint32_t firstDigit = '0';
  constexpr uint32_t lastDigit = '9';
  return firstDigit <= cp && cp <= lastDigit;
}
|
||||||
|
|
||||||
bool isExplicitHyphen(const uint32_t cp) {
|
bool isExplicitHyphen(const uint32_t cp) {
|
||||||
switch (cp) {
|
switch (cp) {
|
||||||
case '-':
|
case '-':
|
||||||
@ -117,6 +119,8 @@ bool isExplicitHyphen(const uint32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only for the codepoint U+00AD (soft hyphen).
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t softHyphenCodepoint = 0x00AD;
  return cp == softHyphenCodepoint;
}
|
||||||
|
|
||||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
||||||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||||||
cps.erase(cps.begin());
|
cps.erase(cps.begin());
|
||||||
@ -126,6 +130,19 @@ void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
||||||
|
if (cps.empty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto& info : cps) {
|
||||||
|
if (!isAlphabetic(info.value)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
Script detectScript(const std::vector<CodepointInfo>& cps) {
|
Script detectScript(const std::vector<CodepointInfo>& cps) {
|
||||||
bool hasLatin = false;
|
bool hasLatin = false;
|
||||||
bool hasCyrillic = false;
|
bool hasCyrillic = false;
|
||||||
|
|||||||
@ -11,6 +11,7 @@ struct CodepointInfo {
|
|||||||
|
|
||||||
enum class Script { Latin, Cyrillic, Mixed };
|
enum class Script { Latin, Cyrillic, Mixed };
|
||||||
|
|
||||||
|
// Minimum number of codepoints required in prefix and suffix for hyphenation.
|
||||||
constexpr size_t MIN_PREFIX_CP = 2;
|
constexpr size_t MIN_PREFIX_CP = 2;
|
||||||
constexpr size_t MIN_SUFFIX_CP = 2;
|
constexpr size_t MIN_SUFFIX_CP = 2;
|
||||||
|
|
||||||
@ -28,7 +29,10 @@ bool isCyrillicConsonant(uint32_t cp);
|
|||||||
bool isAlphabetic(uint32_t cp);
|
bool isAlphabetic(uint32_t cp);
|
||||||
bool isVowel(uint32_t cp);
|
bool isVowel(uint32_t cp);
|
||||||
bool isPunctuation(uint32_t cp);
|
bool isPunctuation(uint32_t cp);
|
||||||
|
bool isAsciiDigit(uint32_t cp);
|
||||||
bool isExplicitHyphen(uint32_t cp);
|
bool isExplicitHyphen(uint32_t cp);
|
||||||
|
bool isSoftHyphen(uint32_t cp);
|
||||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
||||||
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps);
|
||||||
|
|
||||||
Script detectScript(const std::vector<CodepointInfo>& cps);
|
Script detectScript(const std::vector<CodepointInfo>& cps);
|
||||||
|
|||||||
@ -48,32 +48,6 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
|||||||
return cps;
|
return cps;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
|
|
||||||
std::vector<size_t> indexes;
|
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
|
||||||
if (!isExplicitHyphen(cps[i].value)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (i == 0 || i + 1 >= cps.size()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const size_t breakIndex = i + 1;
|
|
||||||
if (breakIndex >= cps.size()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (breakIndex == 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
indexes.push_back(breakIndex);
|
|
||||||
}
|
|
||||||
return indexes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// True only for the ASCII decimal digits '0' through '9'; digits from other scripts are rejected.
bool isAsciiDigit(const uint32_t cp) {
  constexpr uint32_t firstDigit = '0';
  constexpr uint32_t lastDigit = '9';
  return firstDigit <= cp && cp <= lastDigit;
}
|
|
||||||
|
|
||||||
void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.size() < 3) {
|
if (cps.size() < 3) {
|
||||||
return;
|
return;
|
||||||
@ -98,20 +72,6 @@ void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
|||||||
cps.erase(cps.begin() + pos, cps.end());
|
cps.erase(cps.begin() + pos, cps.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rejects words containing punctuation or digits unless forced.
|
|
||||||
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
|
||||||
if (cps.empty()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto& info : cps) {
|
|
||||||
if (!isAlphabetic(info.value)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Asks the language hyphenator for legal break positions inside the word.
|
// Asks the language hyphenator for legal break positions inside the word.
|
||||||
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
@ -135,17 +95,48 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
|
|||||||
return cps[index].byteOffset;
|
return cps[index].byteOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Hyphenator::BreakInfo> buildBreakInfoVector(const std::vector<size_t>& indexes,
|
// Builds break candidates from hyphen markers already present in the word.
// A marker qualifies when isExplicitHyphen() accepts it and the codepoints directly before and after it
// are alphabetic. Each break stores the byte offset of the codepoint following the marker, and
// requiresInsertedHyphen is taken from isSoftHyphen() of the marker. The result is sorted by byte
// offset with equal offsets merged into one entry.
// NOTE(review): a soft hyphen only reaches the push_back if isExplicitHyphen() also accepts U+00AD;
// its full case list is not visible here - confirm.
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
||||||
const std::vector<CodepointInfo>& cps,
|
|
||||||
const bool requiresHyphen) {
|
|
||||||
std::vector<Hyphenator::BreakInfo> breaks;
|
std::vector<Hyphenator::BreakInfo> breaks;
|
||||||
breaks.reserve(indexes.size());
|
breaks.reserve(cps.size());
|
||||||
for (const size_t idx : indexes) {
|
|
||||||
breaks.push_back({byteOffsetForIndex(cps, idx), requiresHyphen});
|
// Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
|
||||||
|
for (size_t i = 0; i < cps.size(); ++i) {
|
||||||
|
const uint32_t cp = cps[i].value;
|
||||||
|
if (!isExplicitHyphen(cp) || i == 0 || i + 1 >= cps.size()) {
|
||||||
|
continue; // Not a marker, or no neighbour on one side (neighbours are checked for letters below).
|
||||||
}
|
}
|
||||||
|
if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Offset points to the next codepoint so rendering starts after the hyphen marker.
|
||||||
|
breaks.push_back({byteOffsetForIndex(cps, i + 1), isSoftHyphen(cp)});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (breaks.empty()) {
|
||||||
return breaks;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sort by byte offset so we can deduplicate sequential markers.
|
||||||
|
// Multiple dash codepoints can point to the same byte offset once punctuation is trimmed; sort before merging.
|
||||||
|
std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
|
||||||
|
return lhs.byteOffset < rhs.byteOffset;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Ensure we keep a single entry per break while retaining the "needs hyphen" flag when any marker requested it.
|
||||||
|
std::vector<Hyphenator::BreakInfo> deduped;
|
||||||
|
deduped.reserve(breaks.size());
|
||||||
|
for (const auto& entry : breaks) {
|
||||||
|
if (!deduped.empty() && deduped.back().byteOffset == entry.byteOffset) {
|
||||||
|
// Merge duplicates: the flag stays set if either entry asked for an inserted hyphen, so a soft hyphen wins here.
|
||||||
|
deduped.back().requiresInsertedHyphen = deduped.back().requiresInsertedHyphen || entry.requiresInsertedHyphen;
|
||||||
|
} else {
|
||||||
|
deduped.push_back(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return deduped;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
|
std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
|
||||||
@ -153,6 +144,7 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert to codepoints and normalize word boundaries.
|
||||||
auto cps = collectCodepoints(word);
|
auto cps = collectCodepoints(word);
|
||||||
trimSurroundingPunctuation(cps);
|
trimSurroundingPunctuation(cps);
|
||||||
trimTrailingFootnoteReference(cps);
|
trimTrailingFootnoteReference(cps);
|
||||||
@ -160,13 +152,13 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
auto explicitIndexes = collectExplicitHyphenIndexes(cps);
|
// Explicit hyphen markers (soft or hard) take precedence over heuristic breaks.
|
||||||
if (!explicitIndexes.empty()) {
|
auto explicitBreakInfos = buildExplicitBreakInfos(cps);
|
||||||
std::sort(explicitIndexes.begin(), explicitIndexes.end());
|
if (!explicitBreakInfos.empty()) {
|
||||||
explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
|
return explicitBreakInfos;
|
||||||
return buildBreakInfoVector(explicitIndexes, cps, false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ask language hyphenator for legal break points, optionally augment with naive fallback.
|
||||||
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
||||||
if (includeFallback) {
|
if (includeFallback) {
|
||||||
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
||||||
@ -178,8 +170,15 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sort/deduplicate break indexes before converting them back to byte offsets.
|
||||||
std::sort(indexes.begin(), indexes.end());
|
std::sort(indexes.begin(), indexes.end());
|
||||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||||
|
|
||||||
return buildBreakInfoVector(indexes, cps, true);
|
std::vector<Hyphenator::BreakInfo> breaks;
|
||||||
|
breaks.reserve(indexes.size());
|
||||||
|
for (const size_t idx : indexes) {
|
||||||
|
breaks.push_back({byteOffsetForIndex(cps, idx), true});
|
||||||
|
}
|
||||||
|
|
||||||
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -136,21 +136,6 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
|
|
||||||
const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
|
|
||||||
const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
|
|
||||||
// 1. Check for the start of the 2-byte Soft Hyphen sequence
|
|
||||||
if (s[i] == SHY_BYTE_1) {
|
|
||||||
// 2. Check if the next byte exists AND if it completes the sequence
|
|
||||||
// We must check i + 1 < len to prevent reading past the end of the buffer.
|
|
||||||
if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
|
|
||||||
// Sequence 0xC2 0xAD found!
|
|
||||||
// Skip the current byte (0xC2) and the next byte (0xAD)
|
|
||||||
i++; // Increment 'i' one more time to skip the 0xAD byte
|
|
||||||
continue; // Skip the rest of the loop and move to the next iteration
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we're about to run out of space, then cut the word off and start a new one
|
// If we're about to run out of space, then cut the word off and start a new one
|
||||||
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
||||||
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user