From f02872542f8b2fee6db070e1f5049203e81ad7fb Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Thu, 15 Jan 2026 21:48:32 +0500 Subject: [PATCH] refactor: unify punctuation trimming to handle footnotes in hyphenation logic --- .../Epub/hyphenation/HyphenationCommon.cpp | 70 +++++++++---------- lib/Epub/Epub/hyphenation/HyphenationCommon.h | 3 +- lib/Epub/Epub/hyphenation/Hyphenator.cpp | 12 ++-- .../HyphenationEvaluationTest.cpp | 3 +- 4 files changed, 38 insertions(+), 50 deletions(-) diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index a8b5cfa9..37bfeb1d 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -67,10 +67,16 @@ bool isLatinLetter(const uint32_t cp) { bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); } -bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); } +bool isAlphabetic(const uint32_t cp) { + if (isLatinLetter(cp) || isCyrillicLetter(cp) || isAsciiDigit(cp)) { + return true; + } + return cp > 0x7F && !isPunctuation(cp); +} bool isPunctuation(const uint32_t cp) { switch (cp) { + case '-': case '.': case ',': case '!': @@ -87,8 +93,11 @@ bool isPunctuation(const uint32_t cp) { case 0x2019: // ’ case 0x201C: // “ case 0x201D: // ” + case 0x00A0: // no-break space case '{': case '}': + case '[': + case ']': case '/': case 0x203A: // › case 0x2026: // … @@ -107,18 +116,6 @@ bool isExplicitHyphen(const uint32_t cp) { case 0x058A: // Armenian hyphen case 0x2010: // hyphen case 0x2011: // non-breaking hyphen - case 0x2012: // figure dash - case 0x2013: // en dash - case 0x2014: // em dash - case 0x2015: // horizontal bar - case 0x2043: // hyphen bullet - case 0x207B: // superscript minus - case 0x208B: // subscript minus - case 0x2212: // minus sign - case 0x2E17: // double oblique hyphen - case 0x2E3A: // two-em dash - case 0x2E3B: // three-em dash - case 0xFE58: // small em dash case 0xFE63: // small hyphen-minus case 0xFF0D: // fullwidth hyphen-minus return true; @@ -129,7 +126,28 @@ bool isExplicitHyphen(const uint32_t cp) { bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; } -void trimSurroundingPunctuation(std::vector& cps) { +void trimSurroundingPunctuationAndFootnote(std::vector& cps) { + if (cps.empty()) { + return; + } + + // Remove trailing footnote references like [12], even if punctuation trails after the closing bracket. + if (cps.size() >= 3) { + int end = static_cast(cps.size()) - 1; + while (end >= 0 && isPunctuation(cps[end].value)) { + --end; + } + int pos = end; + if (pos >= 0 && isAsciiDigit(cps[pos].value)) { + while (pos >= 0 && isAsciiDigit(cps[pos].value)) { + --pos; + } + if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) { + cps.erase(cps.begin() + pos, cps.end()); + } + } + } + while (!cps.empty() && isPunctuation(cps.front().value)) { cps.erase(cps.begin()); } @@ -152,27 +170,3 @@ std::vector collectCodepoints(const std::string& word) { return cps; } - -void trimTrailingFootnoteReference(std::vector& cps) { - if (cps.size() < 3) { - return; - } - int closing = static_cast(cps.size()) - 1; - if (cps[closing].value != ']') { - return; - } - int pos = closing - 1; - if (pos < 0 || !isAsciiDigit(cps[pos].value)) { - return; - } - while (pos >= 0 && isAsciiDigit(cps[pos].value)) { - --pos; - } - if (pos < 0 || cps[pos].value != '[') { - return; - } - if (closing - pos <= 1) { - return; - } - cps.erase(cps.begin() + pos, cps.end()); -} diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index ebd49aa0..522a4673 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -21,6 +21,5 @@ bool isPunctuation(uint32_t cp); bool isAsciiDigit(uint32_t cp); bool isExplicitHyphen(uint32_t cp); bool isSoftHyphen(uint32_t cp); -void trimSurroundingPunctuation(std::vector& cps); +void trimSurroundingPunctuationAndFootnote(std::vector& cps); std::vector collectCodepoints(const std::string& word); -void trimTrailingFootnoteReference(std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 0e151be3..e485083f 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -1,8 +1,5 @@ #include "Hyphenator.h" -#include - -#include #include #include "HyphenationCommon.h" @@ -60,13 +57,10 @@ std::vector Hyphenator::breakOffsets(const std::string& w // Convert to codepoints and normalize word boundaries. auto cps = collectCodepoints(word); - trimSurroundingPunctuation(cps); - trimTrailingFootnoteReference(cps); + trimSurroundingPunctuationAndFootnote(cps); const auto* hyphenator = cachedHyphenator_; - const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix; - const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix; - // Explicit hyphen markers (soft or hard) take precedence over heuristic breaks. + // Explicit hyphen markers (soft or hard) take precedence over language breaks. auto explicitBreakInfos = buildExplicitBreakInfos(cps); if (!explicitBreakInfos.empty()) { return explicitBreakInfos; @@ -80,6 +74,8 @@ std::vector Hyphenator::breakOffsets(const std::string& w // Only add fallback breaks if needed if (includeFallback && indexes.empty()) { + const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix; + const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix; for (size_t idx = minPrefix; idx + minSuffix <= cps.size(); ++idx) { indexes.push_back(idx); } diff --git a/test/hyphenation_eval/HyphenationEvaluationTest.cpp b/test/hyphenation_eval/HyphenationEvaluationTest.cpp index ae667f9a..90d17101 100644 --- a/test/hyphenation_eval/HyphenationEvaluationTest.cpp +++ b/test/hyphenation_eval/HyphenationEvaluationTest.cpp @@ -128,8 +128,7 @@ std::string positionsToHyphenated(const std::string& word, const std::vector hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) { auto cps = collectCodepoints(word); - trimSurroundingPunctuation(cps); - trimTrailingFootnoteReference(cps); + trimSurroundingPunctuationAndFootnote(cps); return hyphenator.breakIndexes(cps); }