From 1c027ce2cd7370d5d02668dda9e956967e99a49f Mon Sep 17 00:00:00 2001
From: Jonas Diemer <jonasdiemer@gmail.com>
Date: Wed, 14 Jan 2026 12:38:30 +0100
Subject: [PATCH] Skip BOM character (sometimes used in front of em-dashes)
 (#340)

## Summary

Skip the BOM character U+FEFF (sometimes found in front of em-dashes): it is
not part of the glyph set and would otherwise render as `?`.

---

### AI Usage

Did you use AI tools to help write this code? _**YES**_
---
 lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index b96d28f8..b9305b1e 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -151,6 +151,20 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
       }
     }
 
+    // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
+    const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
+    const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
+    const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
+
+    if (s[i] == FEFF_BYTE_1) {
+      // Check if the next two bytes complete the 3-byte sequence
+      if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
+        // Sequence 0xEF 0xBB 0xBF found!
+        i += 2;    // Skip the next two bytes
+        continue;  // Move to the next iteration
+      }
+    }
+
     // If we're about to run out of space, then cut the word off and start a new one
     if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
       self->partWordBuffer[self->partWordBufferIndex] = '\0';