Add explicit hyphen handling and improve hyphenation logic in ParsedText and Hyphenator

2026-02-05 15:17:37 +03:00 · 2026-01-03 15:20:53 +05:00 · 2026-01-03 15:20:53 +05:00 · cb1ecdb505
commit cb1ecdb505
parent f6767c857f
4 changed files with 109 additions and 15 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@ -1,6 +1,7 @@
 #include "ParsedText.h"
 #include <GfxRenderer.h>
 #include <Utf8.h>
 #include <algorithm>
 #include <cmath>
@ -9,6 +10,7 @@
 #include <limits>
 #include <vector>
 #include "hyphenation/HyphenationCommon.h"
 #include "hyphenation/Hyphenator.h"
 constexpr int MAX_COST = std::numeric_limits<int>::max();
@ -18,8 +20,38 @@ namespace {
 // Result of choosing where to break a word so that its prefix fits on the current line.
 struct HyphenSplitDecision {
  // Byte offset into the word where it is split; the prefix is word.substr(0, byteOffset).
  size_t byteOffset;
  // Width of the prefix as measured by the renderer, including the inserted hyphen when appendHyphen is true.
  uint16_t prefixWidth;
  bool appendHyphen;  // true when we must draw an extra hyphen after the prefix glyphs
 };
 // Reports whether the codepoint that ends exactly at byte `offset` of `word` is classified as a hyphen by
 // isExplicitHyphen(), in which case the caller can skip appending its own hyphen at the break.
 // Returns false when `offset` is zero, lies beyond the end of the word, or falls inside a multi-byte
 // UTF-8 sequence instead of on a codepoint boundary.
 // NOTE(review): isExplicitHyphen() also accepts U+00AD (soft hyphen), which normally has no visible glyph -
 // confirm the renderer draws one, otherwise a break right after it would show no hyphen at all.
 bool endsWithExplicitHyphen(const std::string& word, const size_t offset) {
  const bool offsetInRange = offset > 0 && offset <= word.size();
  if (!offsetInRange) {
    return false;
  }
  const unsigned char* cursor = reinterpret_cast<const unsigned char*>(word.data());
  const unsigned char* const breakPos = cursor + offset;
  // offset > 0 guarantees at least one codepoint precedes the break, so decode first and test afterwards.
  uint32_t lastCodepoint = 0;
  do {
    lastCodepoint = utf8NextCodepoint(&cursor);
  } while (cursor < breakPos);
  // Overshooting the break position means `offset` splits a multi-byte sequence.
  if (cursor != breakPos) {
    return false;
  }
  return isExplicitHyphen(lastCodepoint);
 }
 bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
                         const EpdFontStyle style, const int availableWidth, const bool includeFallback,
                         HyphenSplitDecision* decision) {
@ -28,10 +60,6 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
  }
  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
  const int adjustedWidth = availableWidth - hyphenWidth;
  if (adjustedWidth <= 0) {
    return false;
  }
  auto offsets = Hyphenator::breakOffsets(word, includeFallback);
  if (offsets.empty()) {
@ -40,13 +68,20 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
  size_t chosenOffset = std::numeric_limits<size_t>::max();
  uint16_t chosenWidth = 0;
  bool chosenAppendHyphen = true;
  for (const size_t offset : offsets) {
    const bool needsInsertedHyphen = !endsWithExplicitHyphen(word, offset);
    const int budget = availableWidth - (needsInsertedHyphen ? hyphenWidth : 0);
    if (budget <= 0) {
      continue;
    }
    const std::string prefix = word.substr(0, offset);
    const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-    if (prefixWidth <= adjustedWidth) {
+    if (prefixWidth <= budget) {
      chosenOffset = offset;
-      chosenWidth = static_cast<uint16_t>(prefixWidth + hyphenWidth);
+      chosenWidth = static_cast<uint16_t>(prefixWidth + (needsInsertedHyphen ? hyphenWidth : 0));
      chosenAppendHyphen = needsInsertedHyphen;
    } else {
      break;
    }
@ -58,6 +93,7 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
  decision->byteOffset = chosenOffset;
  decision->prefixWidth = chosenWidth;
  decision->appendHyphen = chosenAppendHyphen;
  return true;
 }
@ -110,14 +146,17 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
    uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
    if (width > pageWidth) {
-      HyphenSplitDecision decision;
+      HyphenSplitDecision decision{};
      if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) {
        const std::string originalWord = *wordsIt;
        const std::string tail = originalWord.substr(decision.byteOffset);
        if (tail.empty()) {
          continue;
        }
-        const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
+        std::string prefix = originalWord.substr(0, decision.byteOffset);
        if (decision.appendHyphen) {
          prefix += "-";
        }
        *wordsIt = prefix;
        auto nextWordIt = words.insert(std::next(wordsIt), tail);
@ -235,7 +274,7 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
        }
        const int availableWidth = pageWidth - lineWidth - interWordSpace;
-        HyphenSplitDecision decision;
+        HyphenSplitDecision decision{};
        if (!chooseSplitForWidth(renderer, fontId, *wordNodeIt, *styleNodeIt, availableWidth, false, &decision)) {
          break;
        }
@ -245,7 +284,10 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
        if (tail.empty()) {
          break;
        }
-        const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
+        std::string prefix = originalWord.substr(0, decision.byteOffset);
        if (decision.appendHyphen) {
          prefix += "-";
        }
        const EpdFontStyle styleForSplit = *styleNodeIt;
        *wordNodeIt = tail;
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@ -79,8 +79,6 @@ bool isPunctuation(const uint32_t cp) {
    case 0x2019:  // ’
    case 0x201C:  // “
    case 0x201D:  // ”
    case '[':
    case ']':
    case '{':
    case '}':
    case '/':
@ -92,6 +90,33 @@ bool isPunctuation(const uint32_t cp) {
  }
 }
 // Returns true when `cp` is one of the hyphen/dash-like codepoints recognised by the hyphenation code:
 // ASCII hyphen-minus, soft hyphen, Armenian hyphen, the U+2010..U+2015 hyphen/dash run, hyphen bullet,
 // superscript/subscript/math minus, double oblique hyphen, two-/three-em dash and the small/fullwidth forms.
 bool isExplicitHyphen(const uint32_t cp) {
  // U+2010 hyphen, U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash,
  // U+2015 horizontal bar form one contiguous run.
  if (cp >= 0x2010 && cp <= 0x2015) {
    return true;
  }
  static constexpr uint32_t kIsolatedHyphens[] = {
      '-',     // hyphen-minus
      0x00AD,  // soft hyphen
      0x058A,  // Armenian hyphen
      0x2043,  // hyphen bullet
      0x207B,  // superscript minus
      0x208B,  // subscript minus
      0x2212,  // minus sign
      0x2E17,  // double oblique hyphen
      0x2E3A,  // two-em dash
      0x2E3B,  // three-em dash
      0xFE58,  // small em dash
      0xFE63,  // small hyphen-minus
      0xFF0D,  // fullwidth hyphen-minus
  };
  for (const uint32_t candidate : kIsolatedHyphens) {
    if (cp == candidate) {
      return true;
    }
  }
  return false;
 }
 void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
  while (!cps.empty() && isPunctuation(cps.front().value)) {
    cps.erase(cps.begin());
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@ -28,6 +28,7 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
 bool isExplicitHyphen(uint32_t cp);
 void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
 Script detectScript(const std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -48,8 +48,6 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  return cps;
 }
 // True only for ASCII hyphen-minus ('-') or U+2010 HYPHEN.
 bool isExplicitHyphen(const uint32_t cp) {
  const bool isAsciiHyphenMinus = (cp == '-');
  const bool isUnicodeHyphen = (cp == 0x2010);
  return isAsciiHyphenMinus || isUnicodeHyphen;
 }
 std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
  for (size_t i = 0; i < cps.size(); ++i) {
@ -74,6 +72,32 @@ std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo
  return indexes;
 }
 // True when `cp` is one of the ASCII decimal digits '0'..'9'.
 bool isAsciiDigit(const uint32_t cp) {
  // Unsigned subtraction wraps for anything below '0', so a single comparison covers both bounds.
  return (cp - static_cast<uint32_t>('0')) <= 9u;
 }
 void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
  if (cps.size() < 3) {
    return;
  }
  int closing = static_cast<int>(cps.size()) - 1;
  if (cps[closing].value != ']') {
    return;
  }
  int pos = closing - 1;
  if (pos < 0 || !isAsciiDigit(cps[pos].value)) {
    return;
  }
  while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
    --pos;
  }
  if (pos < 0 || cps[pos].value != '[') {
    return;
  }
  if (closing - pos <= 1) {
    return;
  }
  cps.erase(cps.begin() + pos, cps.end());
 }
 // Rejects words containing punctuation or digits unless forced.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
@ -120,11 +144,13 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
  auto cps = collectCodepoints(word);
  trimSurroundingPunctuation(cps);
  trimTrailingFootnoteReference(cps);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }
-  if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
+  auto explicitIndexes = collectExplicitHyphenIndexes(cps);
  if (!explicitIndexes.empty()) {
    std::sort(explicitIndexes.begin(), explicitIndexes.end());
    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
    std::vector<size_t> byteOffsets;