diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index b3a37d6a..10f9322a 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -1,6 +1,7 @@ #include "ParsedText.h" #include +#include #include #include @@ -9,6 +10,7 @@ #include #include +#include "hyphenation/HyphenationCommon.h" #include "hyphenation/Hyphenator.h" constexpr int MAX_COST = std::numeric_limits::max(); @@ -18,8 +20,38 @@ namespace { struct HyphenSplitDecision { size_t byteOffset; uint16_t prefixWidth; + bool appendHyphen; // true when we must draw an extra hyphen after the prefix glyphs }; +// Verifies whether the substring ending at `offset` already contains a literal hyphen glyph, so we can avoid +// drawing a duplicate hyphen when breaking the word. +bool endsWithExplicitHyphen(const std::string& word, const size_t offset) { + if (offset == 0 || offset > word.size()) { + return false; + } + + const unsigned char* base = reinterpret_cast(word.data()); + const unsigned char* ptr = base; + const unsigned char* target = base + offset; + const unsigned char* lastStart = nullptr; + + while (ptr < target) { + lastStart = ptr; + utf8NextCodepoint(&ptr); + if (ptr > target) { + return false; + } + } + + if (!lastStart || ptr != target) { + return false; + } + + const unsigned char* tmp = lastStart; + const uint32_t cp = utf8NextCodepoint(&tmp); // decode the codepoint immediately prior to the break + return isExplicitHyphen(cp); +} + bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, const EpdFontStyle style, const int availableWidth, const bool includeFallback, HyphenSplitDecision* decision) { @@ -28,10 +60,6 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st } const int hyphenWidth = renderer.getTextWidth(fontId, "-", style); - const int adjustedWidth = availableWidth - hyphenWidth; - if (adjustedWidth <= 0) { - return false; - } auto offsets = Hyphenator::breakOffsets(word, includeFallback); if (offsets.empty()) { @@ -40,13 +68,20 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st size_t chosenOffset = std::numeric_limits::max(); uint16_t chosenWidth = 0; + bool chosenAppendHyphen = true; for (const size_t offset : offsets) { + const bool needsInsertedHyphen = !endsWithExplicitHyphen(word, offset); + const int budget = availableWidth - (needsInsertedHyphen ? hyphenWidth : 0); + if (budget <= 0) { + continue; + } const std::string prefix = word.substr(0, offset); const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style); - if (prefixWidth <= adjustedWidth) { + if (prefixWidth <= budget) { chosenOffset = offset; - chosenWidth = static_cast(prefixWidth + hyphenWidth); + chosenWidth = static_cast(prefixWidth + (needsInsertedHyphen ? hyphenWidth : 0)); + chosenAppendHyphen = needsInsertedHyphen; } else { break; } @@ -58,6 +93,7 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st decision->byteOffset = chosenOffset; decision->prefixWidth = chosenWidth; + decision->appendHyphen = chosenAppendHyphen; return true; } @@ -110,14 +146,17 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt); if (width > pageWidth) { - HyphenSplitDecision decision; + HyphenSplitDecision decision{}; if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) { const std::string originalWord = *wordsIt; const std::string tail = originalWord.substr(decision.byteOffset); if (tail.empty()) { continue; } - const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-"; + std::string prefix = originalWord.substr(0, decision.byteOffset); + if (decision.appendHyphen) { + prefix += "-"; + } *wordsIt = prefix; auto nextWordIt = words.insert(std::next(wordsIt), tail); @@ -235,7 +274,7 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c } const int availableWidth = pageWidth - lineWidth - interWordSpace; - HyphenSplitDecision decision; + HyphenSplitDecision decision{}; if (!chooseSplitForWidth(renderer, fontId, *wordNodeIt, *styleNodeIt, availableWidth, false, &decision)) { break; } @@ -245,7 +284,10 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c if (tail.empty()) { break; } - const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-"; + std::string prefix = originalWord.substr(0, decision.byteOffset); + if (decision.appendHyphen) { + prefix += "-"; + } const EpdFontStyle styleForSplit = *styleNodeIt; *wordNodeIt = tail; diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index c7eeb691..4e6be5bf 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -79,8 +79,6 @@ bool isPunctuation(const uint32_t cp) { case 0x2019: // ’ case 0x201C: // “ case 0x201D: // ” - case '[': - case ']': case '{': case '}': case '/': @@ -92,6 +90,33 @@ bool isPunctuation(const uint32_t cp) { } } +bool isExplicitHyphen(const uint32_t cp) { + switch (cp) { + case '-': + case 0x00AD: // soft hyphen + case 0x058A: // Armenian hyphen + case 0x2010: // hyphen + case 0x2011: // non-breaking hyphen + case 0x2012: // figure dash + case 0x2013: // en dash + case 0x2014: // em dash + case 0x2015: // horizontal bar + case 0x2043: // hyphen bullet + case 0x207B: // superscript minus + case 0x208B: // subscript minus + case 0x2212: // minus sign + case 0x2E17: // double oblique hyphen + case 0x2E3A: // two-em dash + case 0x2E3B: // three-em dash + case 0xFE58: // small em dash + case 0xFE63: // small hyphen-minus + case 0xFF0D: // fullwidth hyphen-minus + return true; + default: + return false; + } +} + void trimSurroundingPunctuation(std::vector& cps) { while (!cps.empty() && isPunctuation(cps.front().value)) { cps.erase(cps.begin()); diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index 9a6b69aa..b1f9271c 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -28,6 +28,7 @@ bool isCyrillicConsonant(uint32_t cp); bool isAlphabetic(uint32_t cp); bool isVowel(uint32_t cp); bool isPunctuation(uint32_t cp); +bool isExplicitHyphen(uint32_t cp); void trimSurroundingPunctuation(std::vector& cps); Script detectScript(const std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 2ac3bf9d..8b0d5138 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -48,8 +48,6 @@ std::vector collectCodepoints(const std::string& word) { return cps; } -bool isExplicitHyphen(const uint32_t cp) { return cp == '-' || cp == 0x2010; } - std::vector collectExplicitHyphenIndexes(const std::vector& cps) { std::vector indexes; for (size_t i = 0; i < cps.size(); ++i) { @@ -74,6 +72,32 @@ std::vector collectExplicitHyphenIndexes(const std::vector= '0' && cp <= '9'; } + +void trimTrailingFootnoteReference(std::vector& cps) { + if (cps.size() < 3) { + return; + } + int closing = static_cast(cps.size()) - 1; + if (cps[closing].value != ']') { + return; + } + int pos = closing - 1; + if (pos < 0 || !isAsciiDigit(cps[pos].value)) { + return; + } + while (pos >= 0 && isAsciiDigit(cps[pos].value)) { + --pos; + } + if (pos < 0 || cps[pos].value != '[') { + return; + } + if (closing - pos <= 1) { + return; + } + cps.erase(cps.begin() + pos, cps.end()); +} + // Rejects words containing punctuation or digits unless forced. bool hasOnlyAlphabetic(const std::vector& cps) { if (cps.empty()) { @@ -120,11 +144,13 @@ std::vector Hyphenator::breakOffsets(const std::string& word, const bool auto cps = collectCodepoints(word); trimSurroundingPunctuation(cps); + trimTrailingFootnoteReference(cps); if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { return {}; } - if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) { + auto explicitIndexes = collectExplicitHyphenIndexes(cps); + if (!explicitIndexes.empty()) { std::sort(explicitIndexes.begin(), explicitIndexes.end()); explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end()); std::vector byteOffsets;