From 39080c0e510f824cab8c055f3a20410830d9ce67 Mon Sep 17 00:00:00 2001
From: Jonas Diemer <jonasdiemer@gmail.com>
Date: Fri, 2 Jan 2026 07:54:46 +0100
Subject: [PATCH] Skip soft hyphens. (#195)

For now, skip soft hyphens (U+00AD) while parsing chapter text; later,
they can be handled properly in the layouter. See
https://github.com/daveallie/crosspoint-reader/discussions/17#discussioncomment-15378475
---
 lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index 5cd53293..a2ff485c 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -137,6 +137,21 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
       continue;
     }
 
+    // Skip soft hyphens (U+00AD), which are encoded in UTF-8 as the two bytes 0xC2 0xAD
+    const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
+    const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
+    // 1. Check for the start of the 2-byte Soft Hyphen sequence
+    if (s[i] == SHY_BYTE_1) {
+      // 2. Check if the next byte exists AND if it completes the sequence
+      //    We must check i + 1 < len to prevent reading past the end of the buffer.
+      if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
+        // Sequence 0xC2 0xAD found!
+        // Skip the current byte (0xC2) and the next byte (0xAD)
+        i++;       // Increment 'i' one more time to skip the 0xAD byte
+        continue;  // Skip the rest of the loop and move to the next iteration
+      }
+    }
+
     // If we're about to run out of space, then cut the word off and start a new one
     if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
       self->partWordBuffer[self->partWordBufferIndex] = '\0';