diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp
index 8aed230d..8e3d2f30 100644
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -13,6 +13,42 @@
 
 constexpr int MAX_COST = std::numeric_limits::max();
 
+namespace {
+
+// UTF-8 byte sequence of the soft hyphen (U+00AD) and its length in bytes.
+constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
+constexpr size_t SOFT_HYPHEN_BYTES = 2;
+
+bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
+
+// Erases every soft hyphen byte pair from `word` in place so the rendered text matches the measured widths.
+void stripSoftHyphensInPlace(std::string& word) {
+  size_t pos = 0;
+  while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
+    word.erase(pos, SOFT_HYPHEN_BYTES);
+  }
+}
+
+// Measures `word` with renderer.getTextWidth after stripping soft hyphens; appendHyphen also measures a trailing '-'.
+uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
+                          const EpdFontFamily::Style style, const bool appendHyphen = false) {
+  const bool hasSoftHyphen = containsSoftHyphen(word);
+  if (!hasSoftHyphen && !appendHyphen) {
+    return renderer.getTextWidth(fontId, word.c_str(), style);
+  }
+
+  std::string sanitized = word;
+  if (hasSoftHyphen) {
+    stripSoftHyphensInPlace(sanitized);
+  }
+  if (appendHyphen) {
+    sanitized.push_back('-');
+  }
+  return renderer.getTextWidth(fontId, sanitized.c_str(), style);
+}
+
+}  // namespace
+
 void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
   if (word.empty()) return;
 
@@ -58,7 +94,7 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere
   auto wordStylesIt = wordStyles.begin();
 
   while (wordsIt != words.end()) {
-    wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
+    wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));
 
     std::advance(wordsIt, 1);
     std::advance(wordStylesIt, 1);
@@ -239,10 +275,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
 
     const bool needsHyphen = info.requiresInsertedHyphen;
     std::string prefix = wordIt->substr(0, offset);
-    if (needsHyphen) {
-      prefix.push_back('-');
-    }
-    const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
+    const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
     if (prefixWidth > availableWidth) {
       continue;
     }
@@ -274,7 +307,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
 
   // Update cached widths to reflect the new prefix/remainder pairing.
   wordWidths[wordIndex] = static_cast(chosenWidth);
-  const uint16_t remainderWidth = renderer.getTextWidth(fontId, remainder.c_str(), style);
+  const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
   wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
   return true;
 }
@@ -330,5 +363,11 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
   std::list lineWordStyles;
   lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
 
+  for (auto& word : lineWords) {
+    if (containsSoftHyphen(word)) {
+      stripSoftHyphensInPlace(word);
+    }
+  }
+
   processLine(std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
 }
\ No newline at end of file
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
index 4e6be5bf..999cb53c 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -90,6 +90,8 @@ bool isPunctuation(const uint32_t cp) {
   }
 }
 
+bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
+
 bool isExplicitHyphen(const uint32_t cp) {
   switch (cp) {
     case '-':
@@ -117,6 +119,8 @@ bool isExplicitHyphen(const uint32_t cp) {
   }
 }
 
+bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; }
+
 void trimSurroundingPunctuation(std::vector& cps) {
   while (!cps.empty() && isPunctuation(cps.front().value)) {
     cps.erase(cps.begin());
@@ -126,6 +130,19 @@ void trimSurroundingPunctuation(std::vector& cps) {
   }
 }
 
+bool hasOnlyAlphabetic(const std::vector& cps) {
+  if (cps.empty()) {
+    return false;
+  }
+
+  for (const auto& info : cps) {
+    if (!isAlphabetic(info.value)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 Script detectScript(const std::vector& cps) {
   bool hasLatin = false;
   bool hasCyrillic = false;
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
index b1f9271c..c28acfa7 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -11,6 +11,7 @@ struct CodepointInfo {
 
 enum class Script { Latin, Cyrillic, Mixed };
 
+// Minimum number of codepoints that must remain in the prefix and in the suffix of a hyphenated word.
 constexpr size_t MIN_PREFIX_CP = 2;
 constexpr size_t MIN_SUFFIX_CP = 2;
 
@@ -28,7 +29,10 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
+bool isAsciiDigit(uint32_t cp);
 bool isExplicitHyphen(uint32_t cp);
+bool isSoftHyphen(uint32_t cp);
 
 void trimSurroundingPunctuation(std::vector& cps);
+bool hasOnlyAlphabetic(const std::vector& cps);
 Script detectScript(const std::vector& cps);
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
index 21d890b7..a06bb283 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -48,32 +48,6 @@ std::vector collectCodepoints(const std::string& word) {
   return cps;
 }
 
-std::vector collectExplicitHyphenIndexes(const std::vector& cps) {
-  std::vector indexes;
-  for (size_t i = 0; i < cps.size(); ++i) {
-    if (!isExplicitHyphen(cps[i].value)) {
-      continue;
-    }
-    if (i == 0 || i + 1 >= cps.size()) {
-      continue;
-    }
-    if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
-      continue;
-    }
-    const size_t breakIndex = i + 1;
-    if (breakIndex >= cps.size()) {
-      continue;
-    }
-    if (breakIndex == 0) {
-      continue;
-    }
-    indexes.push_back(breakIndex);
-  }
-  return indexes;
-}
-
-bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
-
 void trimTrailingFootnoteReference(std::vector& cps) {
   if (cps.size() < 3) {
     return;
@@ -98,20 +72,6 @@ void trimTrailingFootnoteReference(std::vector& cps) {
   cps.erase(cps.begin() + pos, cps.end());
 }
 
-// Rejects words containing punctuation or digits unless forced.
-bool hasOnlyAlphabetic(const std::vector& cps) {
-  if (cps.empty()) {
-    return false;
-  }
-
-  for (const auto& info : cps) {
-    if (!isAlphabetic(info.value)) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Asks the language hyphenator for legal break positions inside the word.
 std::vector collectBreakIndexes(const std::vector& cps) {
   if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
@@ -135,15 +95,46 @@ size_t byteOffsetForIndex(const std::vector& cps, const size_t in
   return cps[index].byteOffset;
 }
 
-std::vector buildBreakInfoVector(const std::vector& indexes,
-                                 const std::vector& cps,
-                                 const bool requiresHyphen) {
+std::vector buildExplicitBreakInfos(const std::vector& cps) {
   std::vector breaks;
-  breaks.reserve(indexes.size());
-  for (const size_t idx : indexes) {
-    breaks.push_back({byteOffsetForIndex(cps, idx), requiresHyphen});
+  breaks.reserve(cps.size());
+
+  // Scan every codepoint for hyphen markers (as accepted by isExplicitHyphen) that have a letter on each side.
+  for (size_t i = 0; i < cps.size(); ++i) {
+    const uint32_t cp = cps[i].value;
+    if (!isExplicitHyphen(cp) || i == 0 || i + 1 >= cps.size()) {
+      continue;  // Skip non-markers and markers sitting at the very start or end of the word.
+    }
+    if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
+      continue;
+    }
+    // Break after the marker; only U+00AD asks for an inserted '-' (assumes isExplicitHyphen accepts it, TODO confirm).
+    breaks.push_back({byteOffsetForIndex(cps, i + 1), isSoftHyphen(cp)});
   }
-  return breaks;
+
+  if (breaks.empty()) {
+    return breaks;
+  }
+
+  // Sort by byte offset so that entries sharing an offset become adjacent for the merge pass below.
+  // NOTE(review): each marker i contributes a distinct index i + 1, so duplicate offsets look impossible here; confirm.
+  std::sort(breaks.begin(), breaks.end(), [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
+    return lhs.byteOffset < rhs.byteOffset;
+  });
+
+  // Ensure we keep a single entry per break while retaining the "needs hyphen" flag when any marker requested it.
+  std::vector deduped;
+  deduped.reserve(breaks.size());
+  for (const auto& entry : breaks) {
+    if (!deduped.empty() && deduped.back().byteOffset == entry.byteOffset) {
+      // Merge duplicates: the flag stays set if either entry asked for an inserted hyphen (logical OR).
+      deduped.back().requiresInsertedHyphen = deduped.back().requiresInsertedHyphen || entry.requiresInsertedHyphen;
+    } else {
+      deduped.push_back(entry);
+    }
+  }
+
+  return deduped;
 }
 
 }  // namespace
@@ -153,6 +144,7 @@ std::vector Hyphenator::breakOffsets(const std::string& w
     return {};
   }
 
+  // Decode the word into codepoints, then trim surrounding punctuation and a trailing footnote reference.
   auto cps = collectCodepoints(word);
   trimSurroundingPunctuation(cps);
   trimTrailingFootnoteReference(cps);
@@ -160,13 +152,13 @@ std::vector Hyphenator::breakOffsets(const std::string& w
     return {};
   }
 
-  auto explicitIndexes = collectExplicitHyphenIndexes(cps);
-  if (!explicitIndexes.empty()) {
-    std::sort(explicitIndexes.begin(), explicitIndexes.end());
-    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
-    return buildBreakInfoVector(explicitIndexes, cps, false);
+  // Hyphen markers already present in the word take precedence: if any qualify, only those breaks are returned.
+  auto explicitBreakInfos = buildExplicitBreakInfos(cps);
+  if (!explicitBreakInfos.empty()) {
+    return explicitBreakInfos;
   }
 
+  // Ask the language hyphenator only for purely alphabetic words; includeFallback then adds further candidates.
   std::vector indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector();
   if (includeFallback) {
     for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
@@ -178,8 +170,15 @@ std::vector Hyphenator::breakOffsets(const std::string& w
     return {};
   }
 
+  // Sort and deduplicate the break indexes; each is then converted to a byte offset that needs an inserted hyphen.
   std::sort(indexes.begin(), indexes.end());
   indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
 
-  return buildBreakInfoVector(indexes, cps, true);
+  std::vector breaks;
+  breaks.reserve(indexes.size());
+  for (const size_t idx : indexes) {
+    breaks.push_back({byteOffsetForIndex(cps, idx), true});
+  }
+
+  return breaks;
 }
diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index 98cc0120..9dd6b1cb 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -136,21 +136,6 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
       continue;
     }
 
-    // Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
-    const XML_Char SHY_BYTE_1 = static_cast(0xC2);
-    const XML_Char SHY_BYTE_2 = static_cast(0xAD);
-    // 1. Check for the start of the 2-byte Soft Hyphen sequence
-    if (s[i] == SHY_BYTE_1) {
-      // 2. Check if the next byte exists AND if it completes the sequence
-      // We must check i + 1 < len to prevent reading past the end of the buffer.
-      if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
-        // Sequence 0xC2 0xAD found!
-        // Skip the current byte (0xC2) and the next byte (0xAD)
-        i++;  // Increment 'i' one more time to skip the 0xAD byte
-        continue;  // Skip the rest of the loop and move to the next iteration
-      }
-    }
-
     // If we're about to run out of space, then cut the word off and start a new one
     if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
       self->partWordBuffer[self->partWordBufferIndex] = '\0';