From f02872542f8b2fee6db070e1f5049203e81ad7fb Mon Sep 17 00:00:00 2001
From: Arthur Tazhitdinov <lisnake@gmail.com>
Date: Thu, 15 Jan 2026 21:48:32 +0500
Subject: [PATCH] refactor: unify punctuation trimming to handle footnotes in
 hyphenation logic

---
 .../Epub/hyphenation/HyphenationCommon.cpp    | 70 +++++++++----------
 lib/Epub/Epub/hyphenation/HyphenationCommon.h |  3 +-
 lib/Epub/Epub/hyphenation/Hyphenator.cpp      | 12 ++--
 .../HyphenationEvaluationTest.cpp             |  3 +-
 4 files changed, 38 insertions(+), 50 deletions(-)
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
index a8b5cfa9..37bfeb1d 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -67,10 +67,16 @@ bool isLatinLetter(const uint32_t cp) {
 
 bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); }
 
-bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
+bool isAlphabetic(const uint32_t cp) {  // True for word characters, which here is broader than letters alone.
+  if (isLatinLetter(cp) || isCyrillicLetter(cp) || isAsciiDigit(cp)) {  // ASCII digits count as word characters too.
+    return true;
+  }
+  return cp > 0x7F && !isPunctuation(cp);  // Any other non-ASCII codepoint counts unless isPunctuation() claims it.
+}
 
 bool isPunctuation(const uint32_t cp) {
   switch (cp) {
+    case '-':
     case '.':
     case ',':
     case '!':
@@ -87,8 +93,11 @@ bool isPunctuation(const uint32_t cp) {
     case 0x2019:  // ’
     case 0x201C:  // “
     case 0x201D:  // ”
+    case 0x00A0:  // no-break space
     case '{':
     case '}':
+    case '[':
+    case ']':
     case '/':
     case 0x203A:  // ›
     case 0x2026:  // …
@@ -107,18 +116,6 @@ bool isExplicitHyphen(const uint32_t cp) {
     case 0x058A:  // Armenian hyphen
     case 0x2010:  // hyphen
     case 0x2011:  // non-breaking hyphen
-    case 0x2012:  // figure dash
-    case 0x2013:  // en dash
-    case 0x2014:  // em dash
-    case 0x2015:  // horizontal bar
-    case 0x2043:  // hyphen bullet
-    case 0x207B:  // superscript minus
-    case 0x208B:  // subscript minus
-    case 0x2212:  // minus sign
-    case 0x2E17:  // double oblique hyphen
-    case 0x2E3A:  // two-em dash
-    case 0x2E3B:  // three-em dash
-    case 0xFE58:  // small em dash
     case 0xFE63:  // small hyphen-minus
     case 0xFF0D:  // fullwidth hyphen-minus
       return true;
@@ -129,7 +126,28 @@ bool isExplicitHyphen(const uint32_t cp) {
 
 bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; }
 
-void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
+void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
+  if (cps.empty()) {
+    return;
+  }
+
+  // Remove trailing footnote references like [1] or [12], even if punctuation trails after the closing bracket.
+  if (cps.size() >= 3) {
+    int end = static_cast<int>(cps.size()) - 1;
+    while (end >= 0 && isPunctuation(cps[end].value)) {
+      --end;  // Skip trailing punctuation; this also steps over the reference's own ']'.
+    }
+    int pos = end;  // `end` now indexes the last digit when the word ends with a reference.
+    if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
+      while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
+        --pos;
+      }
+      if (pos >= 0 && cps[pos].value == '[') {  // One digit is guaranteed here, so [1] is stripped like [12].
+        cps.erase(cps.begin() + pos, cps.end());
+      }
+    }
+  }
+
   while (!cps.empty() && isPunctuation(cps.front().value)) {
     cps.erase(cps.begin());
   }
@@ -152,27 +170,3 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
 
   return cps;
 }
-
-void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
-  if (cps.size() < 3) {
-    return;
-  }
-  int closing = static_cast<int>(cps.size()) - 1;
-  if (cps[closing].value != ']') {
-    return;
-  }
-  int pos = closing - 1;
-  if (pos < 0 || !isAsciiDigit(cps[pos].value)) {
-    return;
-  }
-  while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
-    --pos;
-  }
-  if (pos < 0 || cps[pos].value != '[') {
-    return;
-  }
-  if (closing - pos <= 1) {
-    return;
-  }
-  cps.erase(cps.begin() + pos, cps.end());
-}
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
index ebd49aa0..522a4673 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -21,6 +21,5 @@ bool isPunctuation(uint32_t cp);
 bool isAsciiDigit(uint32_t cp);
 bool isExplicitHyphen(uint32_t cp);
 bool isSoftHyphen(uint32_t cp);
-void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
+void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps);
 std::vector<CodepointInfo> collectCodepoints(const std::string& word);
-void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps);
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
index 0e151be3..e485083f 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,8 +1,5 @@
 #include "Hyphenator.h"
 
-#include <Utf8.h>
-
-#include <algorithm>
 #include <vector>
 
 #include "HyphenationCommon.h"
@@ -60,13 +57,10 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
 
   // Convert to codepoints and normalize word boundaries.
   auto cps = collectCodepoints(word);
-  trimSurroundingPunctuation(cps);
-  trimTrailingFootnoteReference(cps);
+  trimSurroundingPunctuationAndFootnote(cps);
   const auto* hyphenator = cachedHyphenator_;
-  const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix;
-  const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix;
 
-  // Explicit hyphen markers (soft or hard) take precedence over heuristic breaks.
+  // Explicit hyphen markers (soft or hard) take precedence over language breaks.
   auto explicitBreakInfos = buildExplicitBreakInfos(cps);
   if (!explicitBreakInfos.empty()) {
     return explicitBreakInfos;
@@ -80,6 +74,8 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
 
   // Only add fallback breaks if needed
   if (includeFallback && indexes.empty()) {
+    const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix;
+    const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix;
     for (size_t idx = minPrefix; idx + minSuffix <= cps.size(); ++idx) {
       indexes.push_back(idx);
     }
diff --git a/test/hyphenation_eval/HyphenationEvaluationTest.cpp b/test/hyphenation_eval/HyphenationEvaluationTest.cpp
index ae667f9a..90d17101 100644
--- a/test/hyphenation_eval/HyphenationEvaluationTest.cpp
+++ b/test/hyphenation_eval/HyphenationEvaluationTest.cpp
@@ -128,8 +128,7 @@ std::string positionsToHyphenated(const std::string& word, const std::vector<siz
 
 std::vector<size_t> hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) {
   auto cps = collectCodepoints(word);
-  trimSurroundingPunctuation(cps);
-  trimTrailingFootnoteReference(cps);
+  trimSurroundingPunctuationAndFootnote(cps);
 
   return hyphenator.breakIndexes(cps);
 }