From 39080c0e510f824cab8c055f3a20410830d9ce67 Mon Sep 17 00:00:00 2001 From: Jonas Diemer Date: Fri, 2 Jan 2026 07:54:46 +0100 Subject: [PATCH] Skip soft hyphens. (#195) For now, let's skip the soft hyphens (later, we can treat them in the layouter). See https://github.com/daveallie/crosspoint-reader/discussions/17#discussioncomment-15378475 --- lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 5cd53293..a2ff485c 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -137,6 +137,21 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char continue; } + // Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD + const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2); + const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD); + // 1. Check for the start of the 2-byte Soft Hyphen sequence + if (s[i] == SHY_BYTE_1) { + // 2. Check if the next byte exists AND if it completes the sequence + // We must check i + 1 < len to prevent reading past the end of the buffer. + if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) { + // Sequence 0xC2 0xAD found! + // Skip the current byte (0xC2) and the next byte (0xAD) + i++; // Increment 'i' one more time to skip the 0xAD byte + continue; // Skip the rest of the loop and move to the next iteration + } + } + // If we're about to run out of space, then cut the word off and start a new one if (self->partWordBufferIndex >= MAX_WORD_SIZE) { self->partWordBuffer[self->partWordBufferIndex] = '\0';