From 26b84b38a216c66b8c7b26cab23429b0eb042c9c Mon Sep 17 00:00:00 2001 From: Arthur Tazhitdinov Date: Wed, 17 Dec 2025 18:10:58 +0300 Subject: [PATCH] Implement hyphenation support for English and Russian languages - Added EnglishHyphenator and RussianHyphenator classes to handle language-specific hyphenation rules. - Introduced HyphenationCommon for shared utilities and character classification functions. - Updated ParsedText to utilize hyphenation when laying out text. - Enhanced the hyphenation logic to consider word splitting based on available width and character properties. - Refactored existing code to improve readability and maintainability, including the use of iterators and lambda functions for line processing. - Added necessary includes and organized header files for better structure. --- lib/Epub/Epub/ParsedText.cpp | 264 +++++++++-------- .../Epub/hyphenation/EnglishHyphenator.cpp | 269 ++++++++++++++++++ lib/Epub/Epub/hyphenation/EnglishHyphenator.h | 14 + .../Epub/hyphenation/HyphenationCommon.cpp | 84 ++++++ lib/Epub/Epub/hyphenation/HyphenationCommon.h | 31 ++ lib/Epub/Epub/hyphenation/Hyphenator.cpp | 191 +++++++++++++ lib/Epub/Epub/hyphenation/Hyphenator.h | 18 ++ .../Epub/hyphenation/LanguageHyphenator.h | 12 + .../Epub/hyphenation/RussianHyphenator.cpp | 202 +++++++++++++ lib/Epub/Epub/hyphenation/RussianHyphenator.h | 14 + 10 files changed, 979 insertions(+), 120 deletions(-) create mode 100644 lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp create mode 100644 lib/Epub/Epub/hyphenation/EnglishHyphenator.h create mode 100644 lib/Epub/Epub/hyphenation/HyphenationCommon.cpp create mode 100644 lib/Epub/Epub/hyphenation/HyphenationCommon.h create mode 100644 lib/Epub/Epub/hyphenation/Hyphenator.cpp create mode 100644 lib/Epub/Epub/hyphenation/Hyphenator.h create mode 100644 lib/Epub/Epub/hyphenation/LanguageHyphenator.h create mode 100644 lib/Epub/Epub/hyphenation/RussianHyphenator.cpp create mode 100644 lib/Epub/Epub/hyphenation/RussianHyphenator.h diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index b666192..4d6386b 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -3,11 +3,12 @@ #include #include -#include -#include -#include +#include +#include #include +#include "hyphenation/Hyphenator.h" + constexpr int MAX_COST = std::numeric_limits::max(); void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) { @@ -24,148 +25,171 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo return; } - const size_t totalWordCount = words.size(); const int pageWidth = renderer.getScreenWidth() - horizontalMargin; + if (pageWidth <= 0) { + words.clear(); + wordStyles.clear(); + return; + } + const int spaceWidth = renderer.getSpaceWidth(fontId); - // width of 1em to indent first line of paragraph if Extra Spacing is enabled - const int indentWidth = (!extraParagraphSpacing) ? 1 * renderer.getTextWidth(fontId, "m", REGULAR) : 0; + const bool allowIndent = !extraParagraphSpacing && (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN); + const int indentWidth = allowIndent ? renderer.getTextWidth(fontId, "m", REGULAR) : 0; + const int firstLinePageWidth = allowIndent ? std::max(pageWidth - indentWidth, 0) : pageWidth; + auto pageWidthForLine = [&](const bool isFirstLine) -> int { + return isFirstLine ? firstLinePageWidth : pageWidth; + }; - std::vector wordWidths; - wordWidths.reserve(totalWordCount); + auto wordIt = words.begin(); + auto styleIt = wordStyles.begin(); + auto lineStartWordIt = wordIt; + auto lineStartStyleIt = styleIt; - auto wordsIt = words.begin(); - auto wordStylesIt = wordStyles.begin(); + int lineWidthWithSpaces = 0; + int lineWordWidthSum = 0; + size_t lineWordCount = 0; + std::vector lineWordWidths; + lineWordWidths.reserve(16); - while (wordsIt != words.end()) { - wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt)); - - std::advance(wordsIt, 1); - std::advance(wordStylesIt, 1); - } - - // DP table to store the minimum badness (cost) of lines starting at index i - std::vector dp(totalWordCount); - // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i' - std::vector ans(totalWordCount); - - // Base Case - dp[totalWordCount - 1] = 0; - ans[totalWordCount - 1] = totalWordCount - 1; - - for (int i = totalWordCount - 2; i >= 0; --i) { - int currlen = -spaceWidth + indentWidth; - dp[i] = MAX_COST; - - for (size_t j = i; j < totalWordCount; ++j) { - // Current line length: previous width + space + current word width - currlen += wordWidths[j] + spaceWidth; - - if (currlen > pageWidth) { - break; - } - - int cost; - if (j == totalWordCount - 1) { - cost = 0; // Last line - } else { - const int remainingSpace = pageWidth - currlen; - // Use long long for the square to prevent overflow - const long long cost_ll = static_cast(remainingSpace) * remainingSpace + dp[j + 1]; - - if (cost_ll > MAX_COST) { - cost = MAX_COST; - } else { - cost = static_cast(cost_ll); - } - } - - if (cost < dp[i]) { - dp[i] = cost; - ans[i] = j; // j is the index of the last word in this optimal line - } - } - } - - // Stores the index of the word that starts the next line (last_word_index + 1) - std::vector lineBreakIndices; - size_t currentWordIndex = 0; + size_t producedLines = 0; constexpr size_t MAX_LINES = 1000; - while (currentWordIndex < totalWordCount) { - if (lineBreakIndices.size() >= MAX_LINES) { - break; + auto commitLine = [&](const bool isLastLine) { + if (lineWordCount == 0) { + return; } - size_t nextBreakIndex = ans[currentWordIndex] + 1; - lineBreakIndices.push_back(nextBreakIndex); + const bool isFirstLine = producedLines == 0; + const int linePageWidth = pageWidthForLine(isFirstLine); - currentWordIndex = nextBreakIndex; - } + std::list lineWords; + std::list lineStyles; + auto wordEndIt = wordIt; + auto styleEndIt = styleIt; - // Initialize iterators for consumption - auto wordStartIt = words.begin(); - auto wordStyleStartIt = wordStyles.begin(); - size_t wordWidthIndex = 0; + lineWords.splice(lineWords.begin(), words, lineStartWordIt, wordEndIt); + lineStyles.splice(lineStyles.begin(), wordStyles, lineStartStyleIt, styleEndIt); - size_t lastBreakAt = 0; - for (const size_t lineBreak : lineBreakIndices) { - const size_t lineWordCount = lineBreak - lastBreakAt; - - // Calculate end iterators for the range to splice - auto wordEndIt = wordStartIt; - auto wordStyleEndIt = wordStyleStartIt; - std::advance(wordEndIt, lineWordCount); - std::advance(wordStyleEndIt, lineWordCount); - - // Calculate total word width for this line - int lineWordWidthSum = 0; - for (size_t i = 0; i < lineWordCount; ++i) { - lineWordWidthSum += wordWidths[wordWidthIndex + i]; - } - - // Calculate spacing - int spareSpace = pageWidth - lineWordWidthSum; - if (wordWidthIndex == 0) { - spareSpace -= indentWidth; - } + const int gaps = lineWordCount > 0 ? static_cast(lineWordCount - 1) : 0; + const int baseSpaceTotal = spaceWidth * gaps; + const int spaceBudget = linePageWidth - lineWordWidthSum; int spacing = spaceWidth; - const bool isLastLine = lineBreak == totalWordCount; - - if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) { - spacing = spareSpace / (lineWordCount - 1); + int spacingRemainder = 0; + if (style == TextBlock::JUSTIFIED && !isLastLine && gaps > 0) { + const int additional = std::max(0, spaceBudget - baseSpaceTotal); + spacing = spaceWidth + (gaps > 0 ? additional / gaps : 0); + spacingRemainder = (gaps > 0) ? additional % gaps : 0; } - // Calculate initial x position - uint16_t xpos = (wordWidthIndex == 0) ? indentWidth : 0; + int renderedWidth = lineWordWidthSum; + if (gaps > 0) { + renderedWidth += spacing * gaps; + } + uint16_t xpos = 0; if (style == TextBlock::RIGHT_ALIGN) { - xpos = spareSpace - (lineWordCount - 1) * spaceWidth; + xpos = renderedWidth < linePageWidth ? linePageWidth - renderedWidth : 0; } else if (style == TextBlock::CENTER_ALIGN) { - xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; + xpos = renderedWidth < linePageWidth ? (linePageWidth - renderedWidth) / 2 : 0; + } else if (allowIndent && isFirstLine) { + xpos = indentWidth; } - // Pre-calculate X positions for words std::list lineXPos; - for (size_t i = 0; i < lineWordCount; ++i) { - const uint16_t currentWordWidth = wordWidths[wordWidthIndex + i]; + for (size_t idx = 0; idx < lineWordWidths.size(); ++idx) { lineXPos.push_back(xpos); - xpos += currentWordWidth + spacing; + xpos += lineWordWidths[idx]; + if (idx + 1 < lineWordWidths.size()) { + int gap = spacing; + if (spacingRemainder > 0) { + gap += 1; + spacingRemainder--; + } + xpos += gap; + } } - // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** - std::list lineWords; - lineWords.splice(lineWords.begin(), words, wordStartIt, wordEndIt); - std::list lineWordStyles; - lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyleStartIt, wordStyleEndIt); - processLine( - std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style)); + std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineStyles), style)); - // Update pointers/indices for the next line - wordStartIt = wordEndIt; - wordStyleStartIt = wordStyleEndIt; - wordWidthIndex += lineWordCount; - lastBreakAt = lineBreak; + producedLines++; + lineWordWidths.clear(); + lineWordWidthSum = 0; + lineWidthWithSpaces = 0; + lineWordCount = 0; + lineStartWordIt = wordIt; + lineStartStyleIt = styleIt; + }; + + while (wordIt != words.end() && producedLines < MAX_LINES) { + const int currentLinePageWidth = pageWidthForLine(producedLines == 0); + + if (lineWordCount == 0) { + lineStartWordIt = wordIt; + lineStartStyleIt = styleIt; + } + + const int wordWidth = renderer.getTextWidth(fontId, wordIt->c_str(), *styleIt); + const int gapWidth = (lineWordCount > 0) ? spaceWidth : 0; + const int candidateWidth = lineWidthWithSpaces + gapWidth + wordWidth; + + if (candidateWidth <= currentLinePageWidth) { + lineWordWidths.push_back(static_cast(wordWidth)); + lineWordWidthSum += wordWidth; + lineWidthWithSpaces = candidateWidth; + lineWordCount++; + ++wordIt; + ++styleIt; + continue; + } + + const int availableWidth = currentLinePageWidth - lineWidthWithSpaces - gapWidth; + if (lineWordCount > 0 && availableWidth <= 0) { + commitLine(false); + continue; + } + + if (lineWordCount > 0 && availableWidth > 0) { + HyphenationResult split; + if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, availableWidth, &split, false)) { + *wordIt = std::move(split.head); + auto nextWordIt = std::next(wordIt); + auto nextStyleIt = std::next(styleIt); + words.insert(nextWordIt, std::move(split.tail)); + wordStyles.insert(nextStyleIt, *styleIt); + continue; + } + } + + if (lineWordCount == 0) { + HyphenationResult split; + if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, currentLinePageWidth, &split, true)) { + *wordIt = std::move(split.head); + auto nextWordIt = std::next(wordIt); + auto nextStyleIt = std::next(styleIt); + words.insert(nextWordIt, std::move(split.tail)); + wordStyles.insert(nextStyleIt, *styleIt); + continue; + } + + lineWordWidths.push_back(static_cast(wordWidth)); + lineWordWidthSum += wordWidth; + lineWidthWithSpaces = candidateWidth; + lineWordCount = 1; + ++wordIt; + ++styleIt; + commitLine(wordIt == words.end()); + continue; + } + + commitLine(false); } -} + + if (lineWordCount > 0 && producedLines < MAX_LINES) { + commitLine(true); + } + + words.clear(); + wordStyles.clear(); +} \ No newline at end of file diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp new file mode 100644 index 0000000..e4f79d0 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp @@ -0,0 +1,269 @@ +#include "EnglishHyphenator.h" + +#include +#include +#include + +namespace { + +char lowerLatinChar(const uint32_t cp) { + if (!isLatinLetter(cp)) { + return 0; + } + return static_cast(toLowerLatin(cp)); +} + +bool isEnglishApproximantChar(const char c) { return c == 'l' || c == 'r' || c == 'w' || c == 'y'; } + +bool isEnglishStopChar(const char c) { + switch (c) { + case 'p': + case 'b': + case 't': + case 'd': + case 'k': + case 'g': + case 'c': + case 'q': + return true; + default: + return false; + } +} + +bool isEnglishFricativeChar(const char c) { + switch (c) { + case 'f': + case 'v': + case 's': + case 'z': + case 'h': + case 'x': + return true; + default: + return false; + } +} + +struct CharPair { + char first; + char second; +}; + +bool matchesDigraph(const char first, const char second, const std::initializer_list& pairs) { + for (const auto& pair : pairs) { + if (pair.first == first && pair.second == second) { + return true; + } + } + return false; +} + +bool isEnglishDiphthong(const uint32_t first, const uint32_t second) { + if (!isLatinLetter(first) || !isLatinLetter(second)) { + return false; + } + const auto f = static_cast(toLowerLatin(first)); + const auto s = static_cast(toLowerLatin(second)); + switch (f) { + case 'a': + return s == 'i' || s == 'y' || s == 'u'; + case 'e': + return s == 'a' || s == 'e' || s == 'i' || s == 'o' || s == 'u' || s == 'y'; + case 'i': + return s == 'e' || s == 'u' || s == 'a'; + case 'o': + return s == 'a' || s == 'e' || s == 'i' || s == 'o' || s == 'u' || s == 'y'; + case 'u': + return s == 'i' || s == 'a' || s == 'e'; + } + return false; +} + +bool isValidEnglishOnsetBigram(const uint32_t firstCp, const uint32_t secondCp) { + const char first = lowerLatinChar(firstCp); + const char second = lowerLatinChar(secondCp); + if (!first || !second) { + return false; + } + + if (matchesDigraph(first, second, + {{'c', 'h'}, {'s', 'h'}, {'t', 'h'}, {'p', 'h'}, {'w', 'h'}, {'w', 'r'}, {'k', 'n'}, + {'g', 'n'}, {'p', 's'}, {'p', 't'}, {'p', 'n'}, {'r', 'h'}})) { + return true; + } + + if (isEnglishStopChar(first) && isEnglishApproximantChar(second)) { + return true; + } + + if (isEnglishFricativeChar(first) && isEnglishApproximantChar(second)) { + return true; + } + + if (first == 's' && (second == 'p' || second == 't' || second == 'k' || second == 'm' || second == 'n' || + second == 'f' || second == 'l' || second == 'w' || second == 'c')) { + return true; + } + + if (second == 'y' && (first == 'p' || first == 'b' || first == 't' || first == 'd' || first == 'f' || first == 'k' || + first == 'g' || first == 'h' || first == 'm' || first == 'n' || first == 'l' || first == 's')) { + return true; + } + + return false; +} + +bool isValidEnglishOnsetTrigram(const uint32_t firstCp, const uint32_t secondCp, const uint32_t thirdCp) { + const char first = lowerLatinChar(firstCp); + const char second = lowerLatinChar(secondCp); + const char third = lowerLatinChar(thirdCp); + if (!first || !second || !third) { + return false; + } + + if (first == 's') { + if (second == 'p' && (third == 'l' || third == 'r' || third == 'w')) { + return true; + } + if (second == 't' && (third == 'r' || third == 'w' || third == 'y')) { + return true; + } + if (second == 'k' && (third == 'l' || third == 'r' || third == 'w')) { + return true; + } + if (second == 'c' && (third == 'l' || third == 'r')) { + return true; + } + if (second == 'f' && third == 'r') { + return true; + } + if (second == 'h' && third == 'r') { + return true; + } + } + + if (first == 't' && second == 'h' && third == 'r') { + return true; + } + + return false; +} + +bool englishClusterIsValidOnset(const std::vector& cps, const size_t start, const size_t end) { + if (start >= end) { + return false; + } + + for (size_t i = start; i < end; ++i) { + const char ch = lowerLatinChar(cps[i].value); + if (!ch) { + return false; + } + if (!isLatinConsonant(cps[i].value) && ch != 'y') { + return false; + } + } + + const size_t len = end - start; + if (len == 1) { + return true; + } + if (len == 2) { + return isValidEnglishOnsetBigram(cps[start].value, cps[start + 1].value); + } + if (len == 3) { + return isValidEnglishOnsetTrigram(cps[start].value, cps[start + 1].value, cps[start + 2].value); + } + + return false; +} + +size_t englishOnsetLength(const std::vector& cps, const size_t clusterStart, const size_t clusterEnd) { + const size_t clusterLen = clusterEnd - clusterStart; + if (clusterLen == 0) { + return 0; + } + + const size_t maxLen = std::min(3, clusterLen); + for (size_t len = maxLen; len >= 1; --len) { + const size_t suffixStart = clusterEnd - len; + if (englishClusterIsValidOnset(cps, suffixStart, clusterEnd)) { + return len; + } + } + + return 1; +} + +bool nextToApostrophe(const std::vector& cps, const size_t index) { + if (index == 0 || index >= cps.size()) { + return false; + } + const auto left = cps[index - 1].value; + const auto right = cps[index].value; + return left == '\'' || right == '\''; +} + +std::vector englishBreakIndexes(const std::vector& cps) { + std::vector indexes; + if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { + return indexes; + } + + std::vector vowelPositions; + vowelPositions.reserve(cps.size()); + for (size_t i = 0; i < cps.size(); ++i) { + if (isLatinVowel(cps[i].value)) { + vowelPositions.push_back(i); + } + } + + if (vowelPositions.size() < 2) { + return indexes; + } + + for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) { + const size_t leftVowel = vowelPositions[v]; + const size_t rightVowel = vowelPositions[v + 1]; + + if (rightVowel - leftVowel == 1) { + if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && + rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && + !nextToApostrophe(cps, rightVowel)) { + indexes.push_back(rightVowel); + } + continue; + } + + const size_t clusterStart = leftVowel + 1; + const size_t clusterEnd = rightVowel; + const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd); + size_t breakIndex = clusterEnd - onsetLen; + + if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) { + continue; + } + if (nextToApostrophe(cps, breakIndex)) { + continue; + } + indexes.push_back(breakIndex); + } + + std::sort(indexes.begin(), indexes.end()); + indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + return indexes; +} + +} // namespace + +const EnglishHyphenator& EnglishHyphenator::instance() { + static EnglishHyphenator instance; + return instance; +} + +Script EnglishHyphenator::script() const { return Script::Latin; } + +std::vector EnglishHyphenator::breakIndexes(const std::vector& cps) const { + return englishBreakIndexes(cps); +} diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.h b/lib/Epub/Epub/hyphenation/EnglishHyphenator.h new file mode 100644 index 0000000..17c8f87 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.h @@ -0,0 +1,14 @@ +#pragma once + +#include "LanguageHyphenator.h" + +class EnglishHyphenator final : public LanguageHyphenator { + public: + static const EnglishHyphenator& instance(); + + Script script() const override; + std::vector breakIndexes(const std::vector& cps) const override; + + private: + EnglishHyphenator() = default; +}; diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp new file mode 100644 index 0000000..64b11e8 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -0,0 +1,84 @@ +#include "HyphenationCommon.h" + +namespace { + +uint32_t toLowerLatinImpl(const uint32_t cp) { + if (cp >= 'A' && cp <= 'Z') { + return cp - 'A' + 'a'; + } + return cp; +} + +uint32_t toLowerCyrillicImpl(const uint32_t cp) { + if (cp >= 0x0410 && cp <= 0x042F) { + return cp + 0x20; + } + if (cp == 0x0401) { + return 0x0451; + } + return cp; +} + +} // namespace + +uint32_t toLowerLatin(const uint32_t cp) { return toLowerLatinImpl(cp); } + +uint32_t toLowerCyrillic(const uint32_t cp) { return toLowerCyrillicImpl(cp); } + +bool isLatinLetter(const uint32_t cp) { + return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z'); +} + +bool isLatinVowel(uint32_t cp) { + cp = toLowerLatinImpl(cp); + return cp == 'a' || cp == 'e' || cp == 'i' || cp == 'o' || cp == 'u' || cp == 'y'; +} + +bool isLatinConsonant(const uint32_t cp) { return isLatinLetter(cp) && !isLatinVowel(cp); } + +bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); } + +bool isCyrillicVowel(uint32_t cp) { + cp = toLowerCyrillicImpl(cp); + switch (cp) { + case 0x0430: // а + case 0x0435: // е + case 0x0451: // ё + case 0x0438: // и + case 0x043E: // о + case 0x0443: // у + case 0x044B: // ы + case 0x044D: // э + case 0x044E: // ю + case 0x044F: // я + return true; + default: + return false; + } +} + +bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !isCyrillicVowel(cp); } + +bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); } + +bool isVowel(const uint32_t cp) { return isLatinVowel(cp) || isCyrillicVowel(cp); } + +Script detectScript(const std::vector& cps) { + bool hasLatin = false; + bool hasCyrillic = false; + for (const auto& info : cps) { + if (isLatinLetter(info.value)) { + hasLatin = true; + } else if (isCyrillicLetter(info.value)) { + hasCyrillic = true; + } + } + + if (hasLatin && !hasCyrillic) { + return Script::Latin; + } + if (!hasLatin && hasCyrillic) { + return Script::Cyrillic; + } + return Script::Mixed; +} diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h new file mode 100644 index 0000000..d3f95a4 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include + +struct CodepointInfo { + uint32_t value; + size_t byteOffset; +}; + +enum class Script { Latin, Cyrillic, Mixed }; + +constexpr size_t MIN_PREFIX_CP = 3; +constexpr size_t MIN_SUFFIX_CP = 2; + +uint32_t toLowerLatin(uint32_t cp); +uint32_t toLowerCyrillic(uint32_t cp); + +bool isLatinLetter(uint32_t cp); +bool isLatinVowel(uint32_t cp); +bool isLatinConsonant(uint32_t cp); + +bool isCyrillicLetter(uint32_t cp); +bool isCyrillicVowel(uint32_t cp); +bool isCyrillicConsonant(uint32_t cp); + +bool isAlphabetic(uint32_t cp); +bool isVowel(uint32_t cp); + +Script detectScript(const std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp new file mode 100644 index 0000000..ef5231c --- /dev/null +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -0,0 +1,191 @@ +#include "Hyphenator.h" + +#include + +#include +#include +#include +#include + +#include + +#include "EnglishHyphenator.h" +#include "HyphenationCommon.h" +#include "LanguageHyphenator.h" +#include "RussianHyphenator.h" + +namespace { + +const std::array& registeredHyphenators() { + static const std::array hyphenators = { + &EnglishHyphenator::instance(), + &RussianHyphenator::instance(), + }; + return hyphenators; +} + +const LanguageHyphenator* hyphenatorForScript(const Script script) { + for (const auto* hyphenator : registeredHyphenators()) { + if (hyphenator->script() == script) { + return hyphenator; + } + } + return nullptr; +} + +std::vector collectCodepoints(const std::string& word) { + std::vector cps; + cps.reserve(word.size()); + + const unsigned char* base = reinterpret_cast(word.c_str()); + const unsigned char* ptr = base; + while (*ptr != 0) { + const unsigned char* current = ptr; + const uint32_t cp = utf8NextCodepoint(&ptr); + cps.push_back({cp, static_cast(current - base)}); + } + + return cps; +} + +bool hasOnlyAlphabetic(const std::vector& cps) { + if (cps.empty()) { + return false; + } + + for (const auto& info : cps) { + if (!isAlphabetic(info.value)) { + return false; + } + } + return true; +} + +std::vector fallbackBreakIndexes(const std::vector& cps) { + std::vector indexes; + if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { + return indexes; + } + + for (size_t i = MIN_PREFIX_CP; i + MIN_SUFFIX_CP <= cps.size(); ++i) { + const uint32_t prev = cps[i - 1].value; + const uint32_t curr = cps[i].value; + + if (!isAlphabetic(prev) || !isAlphabetic(curr)) { + continue; + } + + const bool prevVowel = isVowel(prev); + const bool currVowel = isVowel(curr); + const bool prevConsonant = !prevVowel; + const bool currConsonant = !currVowel; + + const bool breakable = (prevVowel && currConsonant) || (prevConsonant && currConsonant) || + (prevConsonant && currVowel); + + if (breakable) { + indexes.push_back(i); + } + } + + return indexes; +} + +std::vector collectBreakIndexes(const std::vector& cps) { + if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { + return {}; + } + + const Script script = detectScript(cps); + if (const auto* hyphenator = hyphenatorForScript(script)) { + auto indexes = hyphenator->breakIndexes(cps); + if (!indexes.empty()) { + return indexes; + } + } + + return fallbackBreakIndexes(cps); +} + +size_t byteOffsetForIndex(const std::vector& cps, const size_t index) { + if (index >= cps.size()) { + return cps.empty() ? 0 : cps.back().byteOffset; + } + return cps[index].byteOffset; +} + +std::string slice(const std::string& word, const size_t startByte, const size_t endByte) { + if (startByte >= endByte || startByte >= word.size()) { + return std::string(); + } + const size_t boundedEnd = std::min(endByte, word.size()); + return word.substr(startByte, boundedEnd - startByte); +} + +} // namespace + +bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const std::string& word, + const EpdFontStyle style, const int availableWidth, HyphenationResult* result, + const bool force) { + if (!result || word.empty()) { + return false; + } + + auto cps = collectCodepoints(word); + if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { + return false; + } + + if (!force && !hasOnlyAlphabetic(cps)) { + return false; + } + + const auto breakIndexes = collectBreakIndexes(cps); + const int hyphenWidth = renderer.getTextWidth(fontId, "-", style); + const int adjustedWidth = availableWidth - hyphenWidth; + + size_t chosenIndex = std::numeric_limits::max(); + + if (adjustedWidth > 0) { + for (const size_t idx : breakIndexes) { + const size_t byteOffset = byteOffsetForIndex(cps, idx); + const std::string prefix = word.substr(0, byteOffset); + const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style); + if (prefixWidth <= adjustedWidth) { + chosenIndex = idx; + } else { + break; + } + } + } + + if (chosenIndex == std::numeric_limits::max() && force) { + for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) { + const size_t byteOffset = byteOffsetForIndex(cps, idx); + const std::string prefix = word.substr(0, byteOffset); + const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style); + if (adjustedWidth <= 0 || prefixWidth <= adjustedWidth) { + chosenIndex = idx; + if (adjustedWidth > 0 && prefixWidth > adjustedWidth) { + break; + } + } + } + } + + if (chosenIndex == std::numeric_limits::max()) { + return false; + } + + const size_t splitByte = byteOffsetForIndex(cps, chosenIndex); + const std::string head = word.substr(0, splitByte); + const std::string tail = slice(word, splitByte, word.size()); + + if (head.empty() || tail.empty()) { + return false; + } + + result->head = head + "-"; + result->tail = tail; + return true; +} diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.h b/lib/Epub/Epub/hyphenation/Hyphenator.h new file mode 100644 index 0000000..819bf56 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/Hyphenator.h @@ -0,0 +1,18 @@ +#pragma once + +#include + +#include + +class GfxRenderer; + +struct HyphenationResult { + std::string head; + std::string tail; +}; + +class Hyphenator { + public: + static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style, + int availableWidth, HyphenationResult* result, bool force); +}; \ No newline at end of file diff --git a/lib/Epub/Epub/hyphenation/LanguageHyphenator.h b/lib/Epub/Epub/hyphenation/LanguageHyphenator.h new file mode 100644 index 0000000..e8f7d9c --- /dev/null +++ b/lib/Epub/Epub/hyphenation/LanguageHyphenator.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include "HyphenationCommon.h" + +class LanguageHyphenator { + public: + virtual ~LanguageHyphenator() = default; + virtual Script script() const = 0; + virtual std::vector breakIndexes(const std::vector& cps) const = 0; +}; diff --git a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp new file mode 100644 index 0000000..1af669e --- /dev/null +++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp @@ -0,0 +1,202 @@ +#include "RussianHyphenator.h" + +#include +#include + +namespace { + +bool isSoftOrHardSign(const uint32_t cp) { + return cp == 0x044C || cp == 0x042C || cp == 0x044A || cp == 0x042A; +} + +bool isRussianPrefixConsonant(uint32_t cp) { + cp = toLowerCyrillic(cp); + return cp == 0x0432 || cp == 0x0437 || cp == 0x0441; // в, з, с +} + +bool isRussianSibilant(uint32_t cp) { + cp = toLowerCyrillic(cp); + switch (cp) { + case 0x0437: // з + case 0x0441: // с + case 0x0436: // ж + case 0x0448: // ш + case 0x0449: // щ + case 0x0447: // ч + case 0x0446: // ц + return true; + default: + return false; + } +} + +bool isRussianStop(uint32_t cp) { + cp = toLowerCyrillic(cp); + switch (cp) { + case 0x0431: // б + case 0x0433: // г + case 0x0434: // д + case 0x043F: // п + case 0x0442: // т + case 0x043A: // к + return true; + default: + return false; + } +} + +int russianSonority(uint32_t cp) { + cp = toLowerCyrillic(cp); + switch (cp) { + case 0x043B: // л + case 0x0440: // р + case 0x0439: // й + return 4; + case 0x043C: // м + case 0x043D: // н + return 3; + case 0x0432: // в + case 0x0437: // з + case 0x0436: // ж + return 2; + case 0x0444: // ф + case 0x0441: // с + case 0x0448: // ш + case 0x0449: // щ + case 0x0447: // ч + case 0x0446: // ц + case 0x0445: // х + return 1; + case 0x0431: // б + case 0x0433: // г + case 0x0434: // д + case 0x043F: // п + case 0x0442: // т + case 0x043A: // к + return 0; + default: + return 1; + } +} + +bool russianClusterIsValidOnset(const std::vector& cps, const size_t start, const size_t end) { + if (start >= end) { + return false; + } + + for (size_t i = start; i < end; ++i) { + const auto cp = cps[i].value; + if (!isCyrillicConsonant(cp) || isSoftOrHardSign(cp)) { + return false; + } + } + + if (end - start == 1) { + return true; + } + + for (size_t i = start; i + 1 < end; ++i) { + const uint32_t current = cps[i].value; + const uint32_t next = cps[i + 1].value; + const int currentRank = russianSonority(current); + const int nextRank = russianSonority(next); + if (currentRank > nextRank) { + const bool atClusterStart = (i == start); + const bool prefixAllowance = atClusterStart && isRussianPrefixConsonant(current); + const bool sibilantAllowance = isRussianSibilant(current) && isRussianStop(next); + if (!prefixAllowance && !sibilantAllowance) { + return false; + } + } + } + + return true; +} + +size_t russianOnsetLength(const std::vector& cps, const size_t clusterStart, const size_t clusterEnd) { + const size_t clusterLen = clusterEnd - clusterStart; + if (clusterLen == 0) { + return 0; + } + + const size_t maxLen = std::min(4, clusterLen); + for (size_t len = maxLen; len >= 1; --len) { + const size_t suffixStart = clusterEnd - len; + if (russianClusterIsValidOnset(cps, suffixStart, clusterEnd)) { + return len; + } + } + + return 1; +} + +bool nextToSoftSign(const std::vector& cps, const size_t index) { + if (index == 0 || index >= cps.size()) { + return false; + } + const auto left = cps[index - 1].value; + const auto right = cps[index].value; + return isSoftOrHardSign(left) || isSoftOrHardSign(right); +} + +std::vector russianBreakIndexes(const std::vector& cps) { + std::vector indexes; + if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) { + return indexes; + } + + std::vector vowelPositions; + vowelPositions.reserve(cps.size()); + for (size_t i = 0; i < cps.size(); ++i) { + if (isCyrillicVowel(cps[i].value)) { + vowelPositions.push_back(i); + } + } + + if (vowelPositions.size() < 2) { + return indexes; + } + + for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) { + const size_t leftVowel = vowelPositions[v]; + const size_t rightVowel = vowelPositions[v + 1]; + + if (rightVowel - leftVowel == 1) { + if (rightVowel >= MIN_PREFIX_CP && cps.size() - rightVowel >= MIN_SUFFIX_CP && + !nextToSoftSign(cps, rightVowel)) { + indexes.push_back(rightVowel); + } + continue; + } + + const size_t clusterStart = leftVowel + 1; + const size_t clusterEnd = rightVowel; + const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd); + size_t breakIndex = clusterEnd - onsetLen; + + if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) { + continue; + } + if (nextToSoftSign(cps, breakIndex)) { + continue; + } + indexes.push_back(breakIndex); + } + + std::sort(indexes.begin(), indexes.end()); + indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end()); + return indexes; +} + +} // namespace + +const RussianHyphenator& RussianHyphenator::instance() { + static RussianHyphenator instance; + return instance; +} + +Script RussianHyphenator::script() const { return Script::Cyrillic; } + +std::vector RussianHyphenator::breakIndexes(const std::vector& cps) const { + return russianBreakIndexes(cps); +} diff --git a/lib/Epub/Epub/hyphenation/RussianHyphenator.h b/lib/Epub/Epub/hyphenation/RussianHyphenator.h new file mode 100644 index 0000000..a55ce26 --- /dev/null +++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.h @@ -0,0 +1,14 @@ +#pragma once + +#include "LanguageHyphenator.h" + +class RussianHyphenator final : public LanguageHyphenator { + public: + static const RussianHyphenator& instance(); + + Script script() const override; + std::vector breakIndexes(const std::vector& cps) const override; + + private: + RussianHyphenator() = default; +};