Refactor hyphenation logic to return detailed break information, enhancing line breaking capabilities

2026-02-06 07:37:37 +03:00 · 2026-01-07 03:54:43 +05:00 · 2026-01-07 03:54:43 +05:00 · 2315513ca1
commit 2315513ca1
parent f998180353
3 changed files with 34 additions and 20 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -199,7 +199,8 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
  return lineBreakIndices;
 }

-// Splits words[wordIndex] into prefix+hyphen and remainder when a legal breakpoint fits the available width.
+// Splits words[wordIndex] into a prefix and a remainder when a legal breakpoint fits the available width. A hyphen is
+// appended to the prefix only when the chosen breakpoint reports requiresInsertedHyphen.
 bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
                                      const int fontId, std::vector<uint16_t>& wordWidths,
                                      const bool allowFallbackBreaks) {
@@ -212,22 +213,27 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
  std::advance(wordIt, wordIndex);
  std::advance(styleIt, wordIndex);

-  const auto breakOffsets = Hyphenator::breakOffsets(*wordIt, allowFallbackBreaks);
-  if (breakOffsets.empty()) {
+  const auto breakInfos = Hyphenator::breakOffsets(*wordIt, allowFallbackBreaks);
+  if (breakInfos.empty()) {
    return false;
  }

  const auto style = *styleIt;
  size_t chosenOffset = 0;
  int chosenWidth = -1;
+  bool chosenNeedsHyphen = true;

-  for (const size_t offset : breakOffsets) {
+  for (const auto& info : breakInfos) {
+    const size_t offset = info.byteOffset;
    if (offset == 0 || offset >= wordIt->size()) {
      continue;
    }

+    const bool needsHyphen = info.requiresInsertedHyphen;
    std::string prefix = wordIt->substr(0, offset);
-    prefix.push_back('-');
+    if (needsHyphen) {
+      prefix.push_back('-');
+    }
    const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
    if (prefixWidth > availableWidth) {
      continue;
@@ -236,6 +242,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
    if (prefixWidth > chosenWidth) {
      chosenWidth = prefixWidth;
      chosenOffset = offset;
+      chosenNeedsHyphen = needsHyphen;
    }
  }

@@ -245,7 +252,9 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl

  std::string remainder = wordIt->substr(chosenOffset);
  wordIt->resize(chosenOffset);
-  wordIt->push_back('-');
+  if (chosenNeedsHyphen) {
+    wordIt->push_back('-');
+  }

  auto insertWordIt = std::next(wordIt);
  auto insertStyleIt = std::next(styleIt);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -135,9 +135,20 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
  return cps[index].byteOffset;
 }

+std::vector<Hyphenator::BreakInfo> buildBreakInfoVector(const std::vector<size_t>& indexes,
+                                                        const std::vector<CodepointInfo>& cps,
+                                                        const bool requiresHyphen) {
+  std::vector<Hyphenator::BreakInfo> breaks;
+  breaks.reserve(indexes.size());
+  for (const size_t idx : indexes) {
+    breaks.push_back({byteOffsetForIndex(cps, idx), requiresHyphen});
+  }
+  return breaks;
+}
+
 }  // namespace

-std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
+std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
  if (word.empty()) {
    return {};
  }
@@ -153,12 +164,7 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
  if (!explicitIndexes.empty()) {
    std::sort(explicitIndexes.begin(), explicitIndexes.end());
    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
-    std::vector<size_t> byteOffsets;
-    byteOffsets.reserve(explicitIndexes.size());
-    for (const size_t idx : explicitIndexes) {
-      byteOffsets.push_back(byteOffsetForIndex(cps, idx));
-    }
-    return byteOffsets;
+    return buildBreakInfoVector(explicitIndexes, cps, false);
  }

  std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
@@ -175,10 +181,5 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
  std::sort(indexes.begin(), indexes.end());
  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());

-  std::vector<size_t> byteOffsets;
-  byteOffsets.reserve(indexes.size());
-  for (const size_t idx : indexes) {
-    byteOffsets.push_back(byteOffsetForIndex(cps, idx));
-  }
-  return byteOffsets;
+  return buildBreakInfoVector(indexes, cps, true);
 }
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -6,7 +6,11 @@

 class Hyphenator {
 public:
+  struct BreakInfo {
+    size_t byteOffset;
+    bool requiresInsertedHyphen;
+  };
  // Returns break points (byte offset + whether a hyphen must be inserted there). When includeFallback is true, all
  // positions obeying the minimum prefix/suffix constraints are returned even if no language-specific rule matches.
-  static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
+  static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);
 };