diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index b96d28f8..b9305b1e 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -151,6 +151,21 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
       }
     }
 
+    // Skip Zero Width No-Break Space / BOM (U+FEFF), encoded in UTF-8 as 0xEF 0xBB 0xBF
+    // The casts keep the byte comparisons correct if XML_Char is a signed character type
+    const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
+    const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
+    const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
+
+    if (s[i] == FEFF_BYTE_1) {
+      // Only match when both following bytes are still inside the buffer (i + 2 < len)
+      if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
+        // Sequence 0xEF 0xBB 0xBF found: drop it instead of adding it to the current word
+        i += 2;    // Skip the next two bytes
+        continue;  // Move to the next iteration
+      }
+    }
+
     // If we're about to run out of space, then cut the word off and start a new one
     if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
       self->partWordBuffer[self->partWordBufferIndex] = '\0';