From f6767c857f85d90ad7017ba416b159bcb2d1907f Mon Sep 17 00:00:00 2001
From: Arthur Tazhitdinov
Date: Fri, 26 Dec 2025 06:03:38 +0500
Subject: [PATCH] Rename trimTrailingPunctuation to trimSurroundingPunctuation
 and update logic to remove surrounding punctuation; add explicit hyphen
 handling in breakOffsets function.

---
Review notes (ignored by git am):
- This copy of the patch had lost its line breaks and everything inside angle
  brackets. Template arguments were restored; the element type name
  CodepointInfo is assumed - confirm against HyphenationCommon.h. The author
  e-mail address on the From: header is still missing.
- Leading punctuation is now removed with one range erase instead of one
  erase per element.
- Unreachable breakIndex checks and the redundant sort/unique pass were
  dropped; explicit hyphen indexes are already ascending and distinct.
- Post-image blob ids on the index lines were not recomputed.

 .../Epub/hyphenation/HyphenationCommon.cpp    | 10 ++++++-
 lib/Epub/Epub/hyphenation/HyphenationCommon.h |  2 +-
 lib/Epub/Epub/hyphenation/Hyphenator.cpp      | 29 ++++++++++++++++++-
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
index fb8f26b1..c7eeb691 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -92,7 +92,15 @@ bool isPunctuation(const uint32_t cp) {
   }
 }
 
-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps) {
+// Strips punctuation code points from both ends of cps in place; punctuation inside the word is kept.
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
+  // Find the first non-punctuation code point, then drop the whole leading run with a single
+  // range erase so the remaining elements are shifted once instead of once per removed element.
+  auto firstKept = cps.begin();
+  while (firstKept != cps.end() && isPunctuation(firstKept->value)) {
+    ++firstKept;
+  }
+  cps.erase(cps.begin(), firstKept);
   while (!cps.empty() && isPunctuation(cps.back().value)) {
     cps.pop_back();
   }
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
index 7e6f4eee..9a6b69aa 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -28,6 +28,6 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps);
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
 
 Script detectScript(const std::vector<CodepointInfo>& cps);
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
index cecdc38a..2ac3bf9d 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -48,6 +48,22 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
   return cps;
 }
 
+// True for the code points treated as author-written hyphens: ASCII hyphen-minus and U+2010 HYPHEN.
+bool isExplicitHyphen(const uint32_t cp) { return cp == '-' || cp == 0x2010; }
+
+// Returns, in ascending order, the code point index directly after every explicit hyphen that has an
+// alphabetic code point on both sides, i.e. where the word may be split right after the hyphen.
+std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
+  std::vector<size_t> indexes;
+  // A usable hyphen needs a neighbour on each side, so the first and last positions are never candidates.
+  for (size_t i = 1; i + 1 < cps.size(); ++i) {
+    if (isExplicitHyphen(cps[i].value) && isAlphabetic(cps[i - 1].value) && isAlphabetic(cps[i + 1].value)) {
+      indexes.push_back(i + 1);
+    }
+  }
+  return indexes;
+}
+
 // Rejects words containing punctuation or digits unless forced.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
   if (cps.empty()) {
@@ -93,11 +109,22 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
   }
 
   auto cps = collectCodepoints(word);
-  trimTrailingPunctuation(cps);
+  trimSurroundingPunctuation(cps);
   if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
     return {};
   }
 
+  // Explicit hyphens take precedence: when the word already contains usable hyphens, break only there.
+  // The indexes come back ascending and distinct, so they map straight to byte offsets.
+  if (const auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
+    std::vector<size_t> byteOffsets;
+    byteOffsets.reserve(explicitIndexes.size());
+    for (const size_t idx : explicitIndexes) {
+      byteOffsets.push_back(byteOffsetForIndex(cps, idx));
+    }
+    return byteOffsets;
+  }
+
   std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
   if (includeFallback) {
     for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {